1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * alloc.c 4 * 5 * Extent allocs and frees 6 * 7 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 */ 9 10 #include <linux/fs.h> 11 #include <linux/types.h> 12 #include <linux/slab.h> 13 #include <linux/string.h> 14 #include <linux/highmem.h> 15 #include <linux/swap.h> 16 #include <linux/quotaops.h> 17 #include <linux/blkdev.h> 18 #include <linux/sched/signal.h> 19 20 #include <cluster/masklog.h> 21 22 #include "ocfs2.h" 23 24 #include "alloc.h" 25 #include "aops.h" 26 #include "blockcheck.h" 27 #include "dlmglue.h" 28 #include "extent_map.h" 29 #include "inode.h" 30 #include "journal.h" 31 #include "localalloc.h" 32 #include "suballoc.h" 33 #include "sysfile.h" 34 #include "file.h" 35 #include "super.h" 36 #include "uptodate.h" 37 #include "xattr.h" 38 #include "refcounttree.h" 39 #include "ocfs2_trace.h" 40 41 #include "buffer_head_io.h" 42 43 enum ocfs2_contig_type { 44 CONTIG_NONE = 0, 45 CONTIG_LEFT, 46 CONTIG_RIGHT, 47 CONTIG_LEFTRIGHT, 48 }; 49 50 static enum ocfs2_contig_type 51 ocfs2_extent_rec_contig(struct super_block *sb, 52 struct ocfs2_extent_rec *ext, 53 struct ocfs2_extent_rec *insert_rec); 54 /* 55 * Operations for a specific extent tree type. 56 * 57 * To implement an on-disk btree (extent tree) type in ocfs2, add 58 * an ocfs2_extent_tree_operations structure and the matching 59 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it 60 * for the allocation portion of the extent tree. 61 */ 62 struct ocfs2_extent_tree_operations { 63 /* 64 * last_eb_blk is the block number of the right most leaf extent 65 * block. Most on-disk structures containing an extent tree store 66 * this value for fast access. The ->eo_set_last_eb_blk() and 67 * ->eo_get_last_eb_blk() operations access this value. They are 68 * both required. 
69 */ 70 void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, 71 u64 blkno); 72 u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); 73 74 /* 75 * The on-disk structure usually keeps track of how many total 76 * clusters are stored in this extent tree. This function updates 77 * that value. new_clusters is the delta, and must be 78 * added to the total. Required. 79 */ 80 void (*eo_update_clusters)(struct ocfs2_extent_tree *et, 81 u32 new_clusters); 82 83 /* 84 * If this extent tree is supported by an extent map, insert 85 * a record into the map. 86 */ 87 void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, 88 struct ocfs2_extent_rec *rec); 89 90 /* 91 * If this extent tree is supported by an extent map, truncate the 92 * map to clusters, 93 */ 94 void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, 95 u32 clusters); 96 97 /* 98 * If ->eo_insert_check() exists, it is called before rec is 99 * inserted into the extent tree. It is optional. 100 */ 101 int (*eo_insert_check)(struct ocfs2_extent_tree *et, 102 struct ocfs2_extent_rec *rec); 103 int (*eo_sanity_check)(struct ocfs2_extent_tree *et); 104 105 /* 106 * -------------------------------------------------------------- 107 * The remaining are internal to ocfs2_extent_tree and don't have 108 * accessor functions 109 */ 110 111 /* 112 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. 113 * It is required. 114 */ 115 void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); 116 117 /* 118 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if 119 * it exists. If it does not, et->et_max_leaf_clusters is set 120 * to 0 (unlimited). Optional. 121 */ 122 void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); 123 124 /* 125 * ->eo_extent_contig test whether the 2 ocfs2_extent_rec 126 * are contiguous or not. Optional. Don't need to set it if use 127 * ocfs2_extent_rec as the tree leaf. 
128 */ 129 enum ocfs2_contig_type 130 (*eo_extent_contig)(struct ocfs2_extent_tree *et, 131 struct ocfs2_extent_rec *ext, 132 struct ocfs2_extent_rec *insert_rec); 133 }; 134 135 136 /* 137 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check 138 * in the methods. 139 */ 140 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); 141 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 142 u64 blkno); 143 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, 144 u32 clusters); 145 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, 146 struct ocfs2_extent_rec *rec); 147 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, 148 u32 clusters); 149 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, 150 struct ocfs2_extent_rec *rec); 151 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); 152 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 153 154 static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, 155 struct ocfs2_extent_tree *et, 156 struct buffer_head **new_eb_bh, 157 int blk_wanted, int *blk_given); 158 static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); 159 160 static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 161 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 162 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 163 .eo_update_clusters = ocfs2_dinode_update_clusters, 164 .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, 165 .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, 166 .eo_insert_check = ocfs2_dinode_insert_check, 167 .eo_sanity_check = ocfs2_dinode_sanity_check, 168 .eo_fill_root_el = ocfs2_dinode_fill_root_el, 169 }; 170 171 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 172 u64 blkno) 173 { 174 struct ocfs2_dinode *di = et->et_object; 175 176 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 177 di->i_last_eb_blk 
= cpu_to_le64(blkno); 178 } 179 180 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) 181 { 182 struct ocfs2_dinode *di = et->et_object; 183 184 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 185 return le64_to_cpu(di->i_last_eb_blk); 186 } 187 188 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, 189 u32 clusters) 190 { 191 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); 192 struct ocfs2_dinode *di = et->et_object; 193 194 le32_add_cpu(&di->i_clusters, clusters); 195 spin_lock(&oi->ip_lock); 196 oi->ip_clusters = le32_to_cpu(di->i_clusters); 197 spin_unlock(&oi->ip_lock); 198 } 199 200 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, 201 struct ocfs2_extent_rec *rec) 202 { 203 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; 204 205 ocfs2_extent_map_insert_rec(inode, rec); 206 } 207 208 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, 209 u32 clusters) 210 { 211 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; 212 213 ocfs2_extent_map_trunc(inode, clusters); 214 } 215 216 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, 217 struct ocfs2_extent_rec *rec) 218 { 219 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); 220 struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb); 221 222 BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); 223 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 224 (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), 225 "Device %s, asking for sparse allocation: inode %llu, " 226 "cpos %u, clusters %u\n", 227 osb->dev_str, 228 (unsigned long long)oi->ip_blkno, 229 rec->e_cpos, oi->ip_clusters); 230 231 return 0; 232 } 233 234 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) 235 { 236 struct ocfs2_dinode *di = et->et_object; 237 238 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 239 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 240 241 return 0; 242 } 243 244 static void 
ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) 245 { 246 struct ocfs2_dinode *di = et->et_object; 247 248 et->et_root_el = &di->id2.i_list; 249 } 250 251 252 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) 253 { 254 struct ocfs2_xattr_value_buf *vb = et->et_object; 255 256 et->et_root_el = &vb->vb_xv->xr_list; 257 } 258 259 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, 260 u64 blkno) 261 { 262 struct ocfs2_xattr_value_buf *vb = et->et_object; 263 264 vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); 265 } 266 267 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) 268 { 269 struct ocfs2_xattr_value_buf *vb = et->et_object; 270 271 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); 272 } 273 274 static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, 275 u32 clusters) 276 { 277 struct ocfs2_xattr_value_buf *vb = et->et_object; 278 279 le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); 280 } 281 282 static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { 283 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, 284 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, 285 .eo_update_clusters = ocfs2_xattr_value_update_clusters, 286 .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, 287 }; 288 289 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) 290 { 291 struct ocfs2_xattr_block *xb = et->et_object; 292 293 et->et_root_el = &xb->xb_attrs.xb_root.xt_list; 294 } 295 296 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) 297 { 298 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 299 et->et_max_leaf_clusters = 300 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); 301 } 302 303 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 304 u64 blkno) 305 { 306 struct ocfs2_xattr_block *xb = et->et_object; 307 struct ocfs2_xattr_tree_root *xt = 
&xb->xb_attrs.xb_root; 308 309 xt->xt_last_eb_blk = cpu_to_le64(blkno); 310 } 311 312 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) 313 { 314 struct ocfs2_xattr_block *xb = et->et_object; 315 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 316 317 return le64_to_cpu(xt->xt_last_eb_blk); 318 } 319 320 static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, 321 u32 clusters) 322 { 323 struct ocfs2_xattr_block *xb = et->et_object; 324 325 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); 326 } 327 328 static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { 329 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, 330 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, 331 .eo_update_clusters = ocfs2_xattr_tree_update_clusters, 332 .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, 333 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 334 }; 335 336 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, 337 u64 blkno) 338 { 339 struct ocfs2_dx_root_block *dx_root = et->et_object; 340 341 dx_root->dr_last_eb_blk = cpu_to_le64(blkno); 342 } 343 344 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) 345 { 346 struct ocfs2_dx_root_block *dx_root = et->et_object; 347 348 return le64_to_cpu(dx_root->dr_last_eb_blk); 349 } 350 351 static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, 352 u32 clusters) 353 { 354 struct ocfs2_dx_root_block *dx_root = et->et_object; 355 356 le32_add_cpu(&dx_root->dr_clusters, clusters); 357 } 358 359 static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) 360 { 361 struct ocfs2_dx_root_block *dx_root = et->et_object; 362 363 BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); 364 365 return 0; 366 } 367 368 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) 369 { 370 struct ocfs2_dx_root_block *dx_root = et->et_object; 371 372 et->et_root_el = &dx_root->dr_list; 
373 } 374 375 static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { 376 .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, 377 .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, 378 .eo_update_clusters = ocfs2_dx_root_update_clusters, 379 .eo_sanity_check = ocfs2_dx_root_sanity_check, 380 .eo_fill_root_el = ocfs2_dx_root_fill_root_el, 381 }; 382 383 static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) 384 { 385 struct ocfs2_refcount_block *rb = et->et_object; 386 387 et->et_root_el = &rb->rf_list; 388 } 389 390 static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 391 u64 blkno) 392 { 393 struct ocfs2_refcount_block *rb = et->et_object; 394 395 rb->rf_last_eb_blk = cpu_to_le64(blkno); 396 } 397 398 static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) 399 { 400 struct ocfs2_refcount_block *rb = et->et_object; 401 402 return le64_to_cpu(rb->rf_last_eb_blk); 403 } 404 405 static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, 406 u32 clusters) 407 { 408 struct ocfs2_refcount_block *rb = et->et_object; 409 410 le32_add_cpu(&rb->rf_clusters, clusters); 411 } 412 413 static enum ocfs2_contig_type 414 ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, 415 struct ocfs2_extent_rec *ext, 416 struct ocfs2_extent_rec *insert_rec) 417 { 418 return CONTIG_NONE; 419 } 420 421 static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { 422 .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, 423 .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, 424 .eo_update_clusters = ocfs2_refcount_tree_update_clusters, 425 .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, 426 .eo_extent_contig = ocfs2_refcount_tree_extent_contig, 427 }; 428 429 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 430 struct ocfs2_caching_info *ci, 431 struct buffer_head *bh, 432 ocfs2_journal_access_func access, 433 void *obj, 434 
const struct ocfs2_extent_tree_operations *ops) 435 { 436 et->et_ops = ops; 437 et->et_root_bh = bh; 438 et->et_ci = ci; 439 et->et_root_journal_access = access; 440 if (!obj) 441 obj = (void *)bh->b_data; 442 et->et_object = obj; 443 et->et_dealloc = NULL; 444 445 et->et_ops->eo_fill_root_el(et); 446 if (!et->et_ops->eo_fill_max_leaf_clusters) 447 et->et_max_leaf_clusters = 0; 448 else 449 et->et_ops->eo_fill_max_leaf_clusters(et); 450 } 451 452 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 453 struct ocfs2_caching_info *ci, 454 struct buffer_head *bh) 455 { 456 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, 457 NULL, &ocfs2_dinode_et_ops); 458 } 459 460 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 461 struct ocfs2_caching_info *ci, 462 struct buffer_head *bh) 463 { 464 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, 465 NULL, &ocfs2_xattr_tree_et_ops); 466 } 467 468 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 469 struct ocfs2_caching_info *ci, 470 struct ocfs2_xattr_value_buf *vb) 471 { 472 __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb, 473 &ocfs2_xattr_value_et_ops); 474 } 475 476 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 477 struct ocfs2_caching_info *ci, 478 struct buffer_head *bh) 479 { 480 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, 481 NULL, &ocfs2_dx_root_et_ops); 482 } 483 484 void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, 485 struct ocfs2_caching_info *ci, 486 struct buffer_head *bh) 487 { 488 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, 489 NULL, &ocfs2_refcount_tree_et_ops); 490 } 491 492 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 493 u64 new_last_eb_blk) 494 { 495 et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); 496 } 497 498 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) 499 { 500 return 
et->et_ops->eo_get_last_eb_blk(et); 501 } 502 503 static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, 504 u32 clusters) 505 { 506 et->et_ops->eo_update_clusters(et, clusters); 507 } 508 509 static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, 510 struct ocfs2_extent_rec *rec) 511 { 512 if (et->et_ops->eo_extent_map_insert) 513 et->et_ops->eo_extent_map_insert(et, rec); 514 } 515 516 static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, 517 u32 clusters) 518 { 519 if (et->et_ops->eo_extent_map_truncate) 520 et->et_ops->eo_extent_map_truncate(et, clusters); 521 } 522 523 static inline int ocfs2_et_root_journal_access(handle_t *handle, 524 struct ocfs2_extent_tree *et, 525 int type) 526 { 527 return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, 528 type); 529 } 530 531 static inline enum ocfs2_contig_type 532 ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, 533 struct ocfs2_extent_rec *rec, 534 struct ocfs2_extent_rec *insert_rec) 535 { 536 if (et->et_ops->eo_extent_contig) 537 return et->et_ops->eo_extent_contig(et, rec, insert_rec); 538 539 return ocfs2_extent_rec_contig( 540 ocfs2_metadata_cache_get_super(et->et_ci), 541 rec, insert_rec); 542 } 543 544 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, 545 struct ocfs2_extent_rec *rec) 546 { 547 int ret = 0; 548 549 if (et->et_ops->eo_insert_check) 550 ret = et->et_ops->eo_insert_check(et, rec); 551 return ret; 552 } 553 554 static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) 555 { 556 int ret = 0; 557 558 if (et->et_ops->eo_sanity_check) 559 ret = et->et_ops->eo_sanity_check(et); 560 return ret; 561 } 562 563 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 564 struct ocfs2_extent_block *eb); 565 static void ocfs2_adjust_rightmost_records(handle_t *handle, 566 struct ocfs2_extent_tree *et, 567 struct ocfs2_path *path, 568 struct ocfs2_extent_rec *insert_rec); 
/*
 * Reset the actual path elements so that we can reuse the structure
 * to build another path. Generally, this involves freeing the buffer
 * heads.
 */
void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
	int i, start = 0, depth = 0;
	struct ocfs2_path_item *node;

	/* Skip index 0 (the root item) when the caller wants to keep it. */
	if (keep_root)
		start = 1;

	for(i = start; i < path_num_items(path); i++) {
		node = &path->p_node[i];

		/* Drop our reference on each cached extent block. */
		brelse(node->bh);
		node->bh = NULL;
		node->el = NULL;
	}

	/*
	 * Tree depth may change during truncate, or insert. If we're
	 * keeping the root extent list, then make sure that our path
	 * structure reflects the proper depth.
	 */
	if (keep_root)
		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
	else
		path_root_access(path) = NULL;

	path->p_tree_depth = depth;
}

/* Release all buffer_head references held by @path and free it. */
void ocfs2_free_path(struct ocfs2_path *path)
{
	if (path) {
		ocfs2_reinit_path(path, 0);
		kfree(path);
	}
}

/*
 * All the elements of src into dest. After this call, src could be freed
 * without affecting dest.
 *
 * Both paths should have the same root. Any non-root elements of dest
 * will be freed.
 */
static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
	int i;

	BUG_ON(path_root_bh(dest) != path_root_bh(src));
	BUG_ON(path_root_el(dest) != path_root_el(src));
	BUG_ON(path_root_access(dest) != path_root_access(src));

	/* Free dest's non-root items before overwriting them. */
	ocfs2_reinit_path(dest, 1);

	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
		dest->p_node[i].bh = src->p_node[i].bh;
		dest->p_node[i].el = src->p_node[i].el;

		/* dest takes its own reference on each shared bh. */
		if (dest->p_node[i].bh)
			get_bh(dest->p_node[i].bh);
	}
}

/*
 * Make the *dest path the same as src and re-initialize src path to
 * have a root only.
 */
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
	int i;

	BUG_ON(path_root_bh(dest) != path_root_bh(src));
	BUG_ON(path_root_access(dest) != path_root_access(src));

	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
		/* Drop dest's old reference; ownership of src's bh moves
		 * to dest (no get_bh() needed). */
		brelse(dest->p_node[i].bh);

		dest->p_node[i].bh = src->p_node[i].bh;
		dest->p_node[i].el = src->p_node[i].el;

		src->p_node[i].bh = NULL;
		src->p_node[i].el = NULL;
	}
}

/*
 * Insert an extent block at given index.
 *
 * This will not take an additional reference on eb_bh.
 */
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
					struct buffer_head *eb_bh)
{
	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;

	/*
	 * Right now, no root bh is an extent block, so this helps
	 * catch code errors with dinode trees. The assertion can be
	 * safely removed if we ever need to insert extent block
	 * structures at the root.
	 */
	BUG_ON(index == 0);

	path->p_node[index].bh = eb_bh;
	path->p_node[index].el = &eb->h_list;
}

/*
 * Allocate a path rooted at @root_bh/@root_el.  Takes a reference on
 * @root_bh; returns NULL on allocation failure.
 */
static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
					 struct ocfs2_extent_list *root_el,
					 ocfs2_journal_access_func access)
{
	struct ocfs2_path *path;

	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);

	path = kzalloc(sizeof(*path), GFP_NOFS);
	if (path) {
		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
		get_bh(root_bh);
		path_root_bh(path) = root_bh;
		path_root_el(path) = root_el;
		path_root_access(path) = access;
	}

	return path;
}

/* New path sharing the root (bh, el, journal access) of an existing path. */
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
			      path_root_access(path));
}

/* New path rooted at an extent tree's root block. */
struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
{
	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
			      et->et_root_journal_access);
}

/*
 * Journal the buffer at depth idx. All idx>0 are extent_blocks,
 * otherwise it's the root_access function.
 *
 * I don't like the way this function's name looks next to
 * ocfs2_journal_access_path(), but I don't have a better one.
 */
int ocfs2_path_bh_journal_access(handle_t *handle,
				 struct ocfs2_caching_info *ci,
				 struct ocfs2_path *path,
				 int idx)
{
	ocfs2_journal_access_func access = path_root_access(path);

	if (!access)
		access = ocfs2_journal_access;

	if (idx)
		access = ocfs2_journal_access_eb;

	return access(handle, ci, path->p_node[idx].bh,
		      OCFS2_JOURNAL_ACCESS_WRITE);
}

/*
 * Convenience function to journal all components in a path.
739 */ 740 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, 741 handle_t *handle, 742 struct ocfs2_path *path) 743 { 744 int i, ret = 0; 745 746 if (!path) 747 goto out; 748 749 for(i = 0; i < path_num_items(path); i++) { 750 ret = ocfs2_path_bh_journal_access(handle, ci, path, i); 751 if (ret < 0) { 752 mlog_errno(ret); 753 goto out; 754 } 755 } 756 757 out: 758 return ret; 759 } 760 761 /* 762 * Return the index of the extent record which contains cluster #v_cluster. 763 * -1 is returned if it was not found. 764 * 765 * Should work fine on interior and exterior nodes. 766 */ 767 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) 768 { 769 int ret = -1; 770 int i; 771 struct ocfs2_extent_rec *rec; 772 u32 rec_end, rec_start, clusters; 773 774 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 775 rec = &el->l_recs[i]; 776 777 rec_start = le32_to_cpu(rec->e_cpos); 778 clusters = ocfs2_rec_clusters(el, rec); 779 780 rec_end = rec_start + clusters; 781 782 if (v_cluster >= rec_start && v_cluster < rec_end) { 783 ret = i; 784 break; 785 } 786 } 787 788 return ret; 789 } 790 791 /* 792 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 793 * ocfs2_extent_rec_contig only work properly against leaf nodes! 
794 */ 795 static int ocfs2_block_extent_contig(struct super_block *sb, 796 struct ocfs2_extent_rec *ext, 797 u64 blkno) 798 { 799 u64 blk_end = le64_to_cpu(ext->e_blkno); 800 801 blk_end += ocfs2_clusters_to_blocks(sb, 802 le16_to_cpu(ext->e_leaf_clusters)); 803 804 return blkno == blk_end; 805 } 806 807 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 808 struct ocfs2_extent_rec *right) 809 { 810 u32 left_range; 811 812 left_range = le32_to_cpu(left->e_cpos) + 813 le16_to_cpu(left->e_leaf_clusters); 814 815 return (left_range == le32_to_cpu(right->e_cpos)); 816 } 817 818 static enum ocfs2_contig_type 819 ocfs2_extent_rec_contig(struct super_block *sb, 820 struct ocfs2_extent_rec *ext, 821 struct ocfs2_extent_rec *insert_rec) 822 { 823 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 824 825 /* 826 * Refuse to coalesce extent records with different flag 827 * fields - we don't want to mix unwritten extents with user 828 * data. 829 */ 830 if (ext->e_flags != insert_rec->e_flags) 831 return CONTIG_NONE; 832 833 if (ocfs2_extents_adjacent(ext, insert_rec) && 834 ocfs2_block_extent_contig(sb, ext, blkno)) 835 return CONTIG_RIGHT; 836 837 blkno = le64_to_cpu(ext->e_blkno); 838 if (ocfs2_extents_adjacent(insert_rec, ext) && 839 ocfs2_block_extent_contig(sb, insert_rec, blkno)) 840 return CONTIG_LEFT; 841 842 return CONTIG_NONE; 843 } 844 845 /* 846 * NOTE: We can have pretty much any combination of contiguousness and 847 * appending. 848 * 849 * The usefulness of APPEND_TAIL is more in that it lets us know that 850 * we'll have to update the path to that leaf. 
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};

enum ocfs2_split_type {
	SPLIT_NONE = 0,
	SPLIT_LEFT,
	SPLIT_RIGHT,
};

/* Describes how a record will be inserted into the tree. */
struct ocfs2_insert_type {
	enum ocfs2_split_type	ins_split;
	enum ocfs2_append_type	ins_appending;
	enum ocfs2_contig_type	ins_contig;
	int			ins_contig_index;
	int			ins_tree_depth;
};

/* State carried through a record merge. */
struct ocfs2_merge_ctxt {
	enum ocfs2_contig_type	c_contig_type;
	int			c_has_empty_extent;
	int			c_split_covers_rec;
};

/*
 * Read-time validator for extent blocks: checks ECC first (non-fatal),
 * then signature, block number and fs generation (fatal on mismatch).
 */
static int ocfs2_validate_extent_block(struct super_block *sb,
				       struct buffer_head *bh)
{
	int rc;
	struct ocfs2_extent_block *eb =
		(struct ocfs2_extent_block *)bh->b_data;

	trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running. We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
		     (unsigned long long)bh->b_blocknr);
		return rc;
	}

	/*
	 * Errors after here are fatal.
	 */

	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		rc = ocfs2_error(sb,
				 "Extent block #%llu has bad signature %.*s\n",
				 (unsigned long long)bh->b_blocknr, 7,
				 eb->h_signature);
		goto bail;
	}

	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
		rc = ocfs2_error(sb,
				 "Extent block #%llu has an invalid h_blkno of %llu\n",
				 (unsigned long long)bh->b_blocknr,
				 (unsigned long long)le64_to_cpu(eb->h_blkno));
		goto bail;
	}

	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation)
		rc = ocfs2_error(sb,
				 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
				 (unsigned long long)bh->b_blocknr,
				 le32_to_cpu(eb->h_fs_generation));
bail:
	return rc;
}

/*
 * Read the extent block at @eb_blkno through the metadata cache,
 * validating it with ocfs2_validate_extent_block().  *bh may be NULL,
 * in which case the newly read buffer is returned through it.
 */
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
			    struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(ci, eb_blkno, &tmp,
			      ocfs2_validate_extent_block);

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}


/*
 * How many free extents have we got before we need more meta data?
 */
int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
{
	int retval;
	struct ocfs2_extent_list *el = NULL;
	struct ocfs2_extent_block *eb;
	struct buffer_head *eb_bh = NULL;
	u64 last_eb_blk = 0;

	el = et->et_root_el;
	last_eb_blk = ocfs2_et_get_last_eb_blk(et);

	/* If the tree has leaf extent blocks, count free records in the
	 * rightmost leaf rather than in the root list. */
	if (last_eb_blk) {
		retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
						 &eb_bh);
		if (retval < 0) {
			mlog_errno(retval);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	}

	/* The rightmost block must be a leaf (depth 0) or the tree is
	 * corrupt. */
	if (el->l_tree_depth != 0) {
		retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				"Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
				(unsigned long long)last_eb_blk,
				le16_to_cpu(el->l_tree_depth));
		goto bail;
	}

	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
	brelse(eb_bh);

	trace_ocfs2_num_free_extents(retval);
	return retval;
}

/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 */
static int ocfs2_create_new_meta_bhs(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 suballoc_loc, first_blkno;
	struct ocfs2_super *osb =
		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
	struct ocfs2_extent_block *eb;

	/* The suballocator may satisfy the request in several smaller
	 * claims, hence the outer loop. */
	count = 0;
	while (count < wanted) {
		status = ocfs2_claim_metadata(handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_loc,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for(i = count; i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -ENOMEM;
				mlog_errno(status);
				goto bail;
			}
			ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);

			status = ocfs2_journal_access_eb(handle, et->et_ci,
							 bhs[i],
							 OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strscpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
			eb->h_suballoc_slot =
				cpu_to_le16(meta_ac->ac_alloc_slot);
			eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			ocfs2_journal_dirty(handle, bhs[i]);
		}

		count += num_got;
	}

	status = 0;
bail:
	/* On failure, drop every bh we managed to get; unclaimed slots
	 * are NULL (caller zero-allocates the array) so brelse() is safe. */
	if (status < 0) {
		for(i = 0; i < wanted; i++) {
			brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	return status;
}

/*
 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
 *
 * Returns the sum of the rightmost extent rec logical offset and
 * cluster count.
 *
 * ocfs2_add_branch() uses this to determine what logical cluster
 * value should be populated into the leftmost new branch records.
 *
 * ocfs2_shift_tree_depth() uses this to determine the # clusters
 * value for the new topmost tree record.
 */
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
{
	int i;

	i = le16_to_cpu(el->l_next_free_rec) - 1;

	return le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
}

/*
 * Change range of the branches in the right most path according to the leaf
 * extent block's rightmost record.
 */
static int ocfs2_adjust_rightmost_branch(handle_t *handle,
					 struct ocfs2_extent_tree *et)
{
	int status;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	path = ocfs2_new_path_from_et(et);
	if (!path) {
		status = -ENOMEM;
		return status;
	}

	/* UINT_MAX finds the path to the rightmost leaf. */
	status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_extend_trans(handle, path_num_items(path));
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	el = path_leaf_el(path);
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];

	ocfs2_adjust_rightmost_records(handle, et, path, rec);

out:
	ocfs2_free_path(path);
	return status;
}

/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the root
 * structure.
 *
 * last_eb_bh is required as we have to update it's next_leaf pointer
 * for the new last extent block.
 *
 * the new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 */
static int ocfs2_add_branch(handle_t *handle,
			    struct ocfs2_extent_tree *et,
			    struct buffer_head *eb_bh,
			    struct buffer_head **last_eb_bh,
			    struct ocfs2_alloc_context *meta_ac)
{
	int status, new_blocks, i, block_given = 0;
	u64 next_blkno, new_last_eb_blk;
	struct buffer_head *bh;
	struct buffer_head **new_eb_bhs = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *eb_el;
	struct ocfs2_extent_list *el;
	u32 new_cpos, root_end;

	BUG_ON(!last_eb_bh || !*last_eb_bh);

	/* Branch off the passed-in extent block, or the root if none given. */
	if (eb_bh) {
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = et->et_root_el;

	/* we never add a branch to a leaf. */
	BUG_ON(!el->l_tree_depth);

	/* One new extent block per level below the branch point. */
	new_blocks = le16_to_cpu(el->l_tree_depth);

	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);

	/*
	 * If there is a gap before the root end and the real end
	 * of the rightmost leaf block, we need to remove the gap
	 * between new_cpos and root_end first so that the tree
	 * is consistent after we add a new branch(it will start
	 * from new_cpos).
	 */
	if (root_end > new_cpos) {
		trace_ocfs2_adjust_rightmost_branch(
			(unsigned long long)
			ocfs2_metadata_cache_owner(et->et_ci),
			root_end, new_cpos);

		status = ocfs2_adjust_rightmost_branch(handle, et);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* allocate the number of new eb blocks we need */
	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
			     GFP_KERNEL);
	if (!new_eb_bhs) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	/* Firstly, try to reuse dealloc since we have already estimated how
	 * many extent blocks we may use.
	 */
	if (!ocfs2_is_dealloc_empty(et)) {
		status = ocfs2_reuse_blk_from_dealloc(handle, et,
						      new_eb_bhs, new_blocks,
						      &block_given);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	BUG_ON(block_given > new_blocks);

	/* Whatever dealloc could not supply comes from the metadata allocator. */
	if (block_given < new_blocks) {
		BUG_ON(!meta_ac);
		status = ocfs2_create_new_meta_bhs(handle, et,
						   new_blocks - block_given,
						   meta_ac,
						   &new_eb_bhs[block_given]);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
	 * linked with the rest of the tree.
	 * conversely, new_eb_bhs[0] is the new bottommost leaf.
	 *
	 * when we leave the loop, new_last_eb_blk will point to the
	 * newest leaf, and next_blkno will point to the topmost extent
	 * block. */
	next_blkno = new_last_eb_blk = 0;
	for(i = 0; i < new_blocks; i++) {
		bh = new_eb_bhs[i];
		eb = (struct ocfs2_extent_block *) bh->b_data;
		/* ocfs2_create_new_meta_bhs() should create it right! */
		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
		eb_el = &eb->h_list;

		status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
						 OCFS2_JOURNAL_ACCESS_CREATE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb->h_next_leaf_blk = 0;
		eb_el->l_tree_depth = cpu_to_le16(i);
		eb_el->l_next_free_rec = cpu_to_le16(1);
		/*
		 * This actually counts as an empty extent as
		 * c_clusters == 0
		 */
		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
		/* Each level's single record points at the block created below it. */
		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
		/*
		 * eb_el isn't always an interior node, but even leaf
		 * nodes want a zero'd flags and reserved field so
		 * this gets the whole 32 bits regardless of use.
		 */
		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
		if (!eb_el->l_tree_depth)
			new_last_eb_blk = le64_to_cpu(eb->h_blkno);

		ocfs2_journal_dirty(handle, bh);
		next_blkno = le64_to_cpu(eb->h_blkno);
	}

	/* This is a bit hairy. We want to update up to three blocks
	 * here without leaving any of them in an inconsistent state
	 * in case of error. We don't have to worry about
	 * journal_dirty erroring as it won't unless we've aborted the
	 * handle (in which case we would never be here) so reserving
	 * the write with journal_access is all we need to do. */
	status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (eb_bh) {
		status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Link the new branch into the rest of the tree (el will
	 * either be on the root_bh, or the extent block passed in. */
	i = le16_to_cpu(el->l_next_free_rec);
	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
	el->l_recs[i].e_int_clusters = 0;
	le16_add_cpu(&el->l_next_free_rec, 1);

	/* fe needs a new last extent block pointer, as does the
	 * next_leaf on the previously last-extent-block. */
	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);

	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

	ocfs2_journal_dirty(handle, *last_eb_bh);
	ocfs2_journal_dirty(handle, et->et_root_bh);
	if (eb_bh)
		ocfs2_journal_dirty(handle, eb_bh);

	/*
	 * Some callers want to track the rightmost leaf so pass it
	 * back here.
	 *
	 * Ref ownership: we drop the caller's ref on the old rightmost
	 * leaf and hand back an extra ref on the new one; the bail
	 * loop below then releases our own array refs either way.
	 */
	brelse(*last_eb_bh);
	get_bh(new_eb_bhs[0]);
	*last_eb_bh = new_eb_bhs[0];

	status = 0;
bail:
	if (new_eb_bhs) {
		for (i = 0; i < new_blocks; i++)
			brelse(new_eb_bhs[i]);
		kfree(new_eb_bhs);
	}

	return status;
}

/*
 * adds another level to the allocation tree.
 * returns back the new extent block so you can add a branch to it
 * after this call.
 */
static int ocfs2_shift_tree_depth(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i, block_given = 0;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *root_el;
	struct ocfs2_extent_list *eb_el;

	/* Get the one new block: preferably from dealloc, else the allocator. */
	if (!ocfs2_is_dealloc_empty(et)) {
		status = ocfs2_reuse_blk_from_dealloc(handle, et,
						      &new_eb_bh, 1,
						      &block_given);
	} else if (meta_ac) {
		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
						   &new_eb_bh);

	} else {
		BUG();
	}

	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	/* ocfs2_create_new_meta_bhs() should create it right! */
	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));

	eb_el = &eb->h_list;
	root_el = et->et_root_el;

	status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the root extent list data into the new extent block */
	eb_el->l_tree_depth = root_el->l_tree_depth;
	eb_el->l_next_free_rec = root_el->l_next_free_rec;
	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		eb_el->l_recs[i] = root_el->l_recs[i];

	ocfs2_journal_dirty(handle, new_eb_bh);

	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update root_bh now: the root's single record now covers the
	 * whole range and points at the new block one level down. */
	le16_add_cpu(&root_el->l_tree_depth, 1);
	root_el->l_recs[0].e_cpos = 0;
	root_el->l_recs[0].e_blkno = eb->h_blkno;
	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	root_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (root_el->l_tree_depth == cpu_to_le16(1))
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

	ocfs2_journal_dirty(handle, et->et_root_bh);

	/* Transfer our ref on new_eb_bh to the caller. */
	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	brelse(new_eb_bh);

	return status;
}

/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records.
 * There are three
 * valid results of this search:
 *
 * 1) a lowest extent block is found, then we pass it back in
 *    *lowest_eb_bh and return '0'
 *
 * 2) the search fails to find anything, but the root_el has room. We
 *    pass NULL back in *lowest_eb_bh, but still return '0'
 *
 * 3) the search fails to find anything AND the root_el is full, in
 *    which case we return > 0
 *
 * return status < 0 indicates an error.
 */
static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
				    struct buffer_head **target_bh)
{
	int status = 0, i;
	u64 blkno;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;
	struct buffer_head *lowest_bh = NULL;

	*target_bh = NULL;

	el = et->et_root_el;

	/* Walk down the rightmost edge, stopping above the leaves (depth 1). */
	while(le16_to_cpu(el->l_tree_depth) > 1) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
					"Owner %llu has empty extent list (next_free_rec == 0)\n",
					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
			goto bail;
		}
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (!blkno) {
			status = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
					"Owner %llu has extent list where extent # %d has no physical block start\n",
					(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
			goto bail;
		}

		brelse(bh);
		bh = NULL;

		status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;

		/* Remember the lowest block seen with a free record slot. */
		if (le16_to_cpu(el->l_next_free_rec) <
		    le16_to_cpu(el->l_count)) {
			brelse(lowest_bh);
			lowest_bh = bh;
			get_bh(lowest_bh);
		}
	}

	/* If we didn't find one and the fe doesn't have any room,
	 * then return '1' */
	el = et->et_root_el;
	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
		status = 1;

	*target_bh = lowest_bh;
bail:
	brelse(bh);

	return status;
}

/*
 * Grow a b-tree so that it has more records.
 *
 * We might shift the tree depth in which case existing paths should
 * be considered invalid.
 *
 * Tree depth after the grow is returned via *final_depth.
 *
 * *last_eb_bh will be updated by ocfs2_add_branch().
 */
static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
			   int *final_depth, struct buffer_head **last_eb_bh,
			   struct ocfs2_alloc_context *meta_ac)
{
	int ret, shift;
	struct ocfs2_extent_list *el = et->et_root_el;
	int depth = le16_to_cpu(el->l_tree_depth);
	struct buffer_head *bh = NULL;

	/* We must have at least one source of new metadata blocks. */
	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));

	shift = ocfs2_find_branch_target(et, &bh);
	if (shift < 0) {
		ret = shift;
		mlog_errno(ret);
		goto out;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		BUG_ON(bh);
		trace_ocfs2_grow_tree(
			(unsigned long long)
			ocfs2_metadata_cache_owner(et->et_ci),
			depth);

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		depth++;
		if (depth == 1) {
			/*
			 * Special case: we have room now if we shifted from
			 * tree_depth 0, so no more work needs to be done.
			 *
			 * We won't be calling add_branch, so pass
			 * back *last_eb_bh as the new leaf. At depth
			 * zero, it should always be null so there's
			 * no reason to brelse.
			 */
			BUG_ON(*last_eb_bh);
			get_bh(bh);
			*last_eb_bh = bh;
			goto out;
		}
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
			       meta_ac);
	if (ret < 0)
		mlog_errno(ret);

out:
	if (final_depth)
		*final_depth = depth;
	brelse(bh);
	return ret;
}

/*
 * Shift every record one slot to the right, opening up slot 0.
 * This function will discard the rightmost extent record, so the
 * caller must guarantee there is a free slot (next_free < count).
 */
static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
{
	int next_free = le16_to_cpu(el->l_next_free_rec);
	int count = le16_to_cpu(el->l_count);
	unsigned int num_bytes;

	BUG_ON(!next_free);
	/* This will cause us to go off the end of our extent list. */
	BUG_ON(next_free >= count);

	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;

	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
}

/*
 * Insert insert_rec into el at its cpos-sorted position, consuming
 * the leading empty extent if one exists. The caller must have made
 * room (an empty slot or a free record).
 */
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement above to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
				has_empty, next_free,
				le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent, and need to re-increment or
	 * there was no empty extent on a non full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;

}

/*
 * Drop a leading empty extent record, shifting the rest left and
 * zeroing the vacated tail slot. No-op if slot 0 is not empty.
 */
static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
{
	int size, num_recs = le16_to_cpu(el->l_next_free_rec);

	BUG_ON(num_recs == 0);

	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		num_recs--;
		size = num_recs * sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[0], &el->l_recs[1], size);
		memset(&el->l_recs[num_recs], 0,
		       sizeof(struct ocfs2_extent_rec));
		el->l_next_free_rec = cpu_to_le16(num_recs);
	}
}

/*
 * Create an empty extent record .
 *
 * l_next_free_rec may be updated.
 *
 * If an empty extent already exists do nothing.
 */
static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
{
	int next_free = le16_to_cpu(el->l_next_free_rec);

	/* Only valid on leaf lists. */
	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	if (next_free == 0)
		goto set_and_inc;

	if (ocfs2_is_empty_extent(&el->l_recs[0]))
		return;

	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
			"Asked to create an empty extent in a full list:\n"
			"count = %u, tree depth = %u",
			le16_to_cpu(el->l_count),
			le16_to_cpu(el->l_tree_depth));

	ocfs2_shift_records_right(el);

set_and_inc:
	le16_add_cpu(&el->l_next_free_rec, 1);
	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
}

/*
 * For a rotation which involves two leaf nodes, the "root node" is
 * the lowest level tree node which contains a path to both leafs. This
 * resulting set of information can be used to form a complete "subtree"
 *
 * This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves.
It's task is to figure out which path 1750 * index contains the subtree root - this can be the root index itself 1751 * in a worst-case rotation. 1752 * 1753 * The array index of the subtree root is passed back. 1754 */ 1755 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 1756 struct ocfs2_path *left, 1757 struct ocfs2_path *right) 1758 { 1759 int i = 0; 1760 1761 /* 1762 * Check that the caller passed in two paths from the same tree. 1763 */ 1764 BUG_ON(path_root_bh(left) != path_root_bh(right)); 1765 1766 do { 1767 i++; 1768 1769 /* 1770 * The caller didn't pass two adjacent paths. 1771 */ 1772 mlog_bug_on_msg(i > left->p_tree_depth, 1773 "Owner %llu, left depth %u, right depth %u\n" 1774 "left leaf blk %llu, right leaf blk %llu\n", 1775 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 1776 left->p_tree_depth, right->p_tree_depth, 1777 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1778 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1779 } while (left->p_node[i].bh->b_blocknr == 1780 right->p_node[i].bh->b_blocknr); 1781 1782 return i - 1; 1783 } 1784 1785 typedef void (path_insert_t)(void *, struct buffer_head *); 1786 1787 /* 1788 * Traverse a btree path in search of cpos, starting at root_el. 1789 * 1790 * This code can be called with a cpos larger than the tree, in which 1791 * case it will return the rightmost path. 
 */
static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	el = root_el;
	/* Descend one level per iteration until we reach a leaf (depth 0). */
	while (el->l_tree_depth) {
		if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has invalid tree depth %u in extent list\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;
		}
		if (!el->l_next_free_rec || !el->l_count) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has empty extent list at depth %u\n"
				    "(next free=%u count=%u)\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    le16_to_cpu(el->l_tree_depth),
				    le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* Stop before the last record: if nothing matched, we
		 * fall through with i pointing at the rightmost record. */
		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				ocfs2_rec_clusters(el, rec);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
				break;
		}

		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		/* Release the previous level's buffer before reading the next. */
		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_extent_block(ci, blkno, &bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;

		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
				    "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* Callback takes its own reference if it wants to keep bh. */
		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}

/*
 * Given an initialized path (that is, it has a valid root extent
 * list), this function will traverse the btree in search of the path
 * which would contain cpos.
 *
 * The path traveled is recorded in the path structure.
 *
 * Note that this will not do any comparisons on leaf node extent
 * records, so it will work fine in the case that we just added a tree
 * branch.
1896 */ 1897 struct find_path_data { 1898 int index; 1899 struct ocfs2_path *path; 1900 }; 1901 static void find_path_ins(void *data, struct buffer_head *bh) 1902 { 1903 struct find_path_data *fp = data; 1904 1905 get_bh(bh); 1906 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1907 fp->index++; 1908 } 1909 int ocfs2_find_path(struct ocfs2_caching_info *ci, 1910 struct ocfs2_path *path, u32 cpos) 1911 { 1912 struct find_path_data data; 1913 1914 data.index = 1; 1915 data.path = path; 1916 return __ocfs2_find_path(ci, path_root_el(path), cpos, 1917 find_path_ins, &data); 1918 } 1919 1920 static void find_leaf_ins(void *data, struct buffer_head *bh) 1921 { 1922 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1923 struct ocfs2_extent_list *el = &eb->h_list; 1924 struct buffer_head **ret = data; 1925 1926 /* We want to retain only the leaf block. */ 1927 if (le16_to_cpu(el->l_tree_depth) == 0) { 1928 get_bh(bh); 1929 *ret = bh; 1930 } 1931 } 1932 /* 1933 * Find the leaf block in the tree which would contain cpos. No 1934 * checking of the actual leaf is done. 1935 * 1936 * Some paths want to call this instead of allocating a path structure 1937 * and calling ocfs2_find_path(). 1938 * 1939 * This function doesn't handle non btree extent lists. 1940 */ 1941 int ocfs2_find_leaf(struct ocfs2_caching_info *ci, 1942 struct ocfs2_extent_list *root_el, u32 cpos, 1943 struct buffer_head **leaf_bh) 1944 { 1945 int ret; 1946 struct buffer_head *bh = NULL; 1947 1948 ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); 1949 if (ret) { 1950 mlog_errno(ret); 1951 goto out; 1952 } 1953 1954 *leaf_bh = bh; 1955 out: 1956 return ret; 1957 } 1958 1959 /* 1960 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1961 * 1962 * Basically, we've moved stuff around at the bottom of the tree and 1963 * we need to fix up the extent records above the changes to reflect 1964 * the new changes. 1965 * 1966 * left_rec: the record on the left. 
 * right_rec: the record to the right of left_rec
 * right_child_el: is the child list pointed to by right_rec
 *
 * By definition, this only works on interior nodes.
 */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
					  struct ocfs2_extent_rec *right_rec,
					  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
		/* Record 0 is an empty extent (only valid in a leaf);
		 * the real leftmost cpos is in record 1. */
		BUG_ON(right_child_el->l_tree_depth);
		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
	}
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_int_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_int_clusters);

	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_int_clusters = cpu_to_le32(right_end);
}

/*
 * Adjust the adjacent root node records involved in a
 * rotation. left_el_blkno is passed in as a key so that we can easily
 * find its index in the root list.
 */
static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
				      struct ocfs2_extent_list *left_el,
				      struct ocfs2_extent_list *right_el,
				      u64 left_el_blkno)
{
	int i;

	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
	       le16_to_cpu(left_el->l_tree_depth));

	/* Locate the root record pointing at the left child by block number. */
	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
			break;
	}

	/*
	 * The path walking code should have never returned a root and
	 * two paths which are not adjacent.
	 */
	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));

	ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
				      &root_el->l_recs[i + 1], right_el);
}

/*
 * We've changed a leaf block (in right_path) and need to reflect that
 * change back up the subtree.
 *
 * This happens in multiple places:
 *   - When we've moved an extent record from the left path leaf to the right
 *     path leaf to make room for an empty extent in the left path leaf.
 *   - When our insert into the right path leaf is at the leftmost edge
 *     and requires an update of the path immediately to it's left. This
 *     can occur at the end of some types of rotation and appending inserts.
 *   - When we've adjusted the last extent record in the left path leaf and the
 *     1st extent record in the right path leaf during cross extent block merge.
 */
static void ocfs2_complete_edge_insert(handle_t *handle,
				       struct ocfs2_path *left_path,
				       struct ocfs2_path *right_path,
				       int subtree_index)
{
	int i, idx;
	struct ocfs2_extent_list *el, *left_el, *right_el;
	struct ocfs2_extent_rec *left_rec, *right_rec;
	struct buffer_head *root_bh;

	/*
	 * Update the counts and position values within all the
	 * interior nodes to reflect the leaf rotation we just did.
	 *
	 * The root node is handled below the loop.
	 *
	 * We begin the loop with right_el and left_el pointing to the
	 * leaf lists and work our way up.
	 *
	 * NOTE: within this loop, left_el and right_el always refer
	 * to the *child* lists.
	 */
	left_el = path_leaf_el(left_path);
	right_el = path_leaf_el(right_path);
	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
		trace_ocfs2_complete_edge_insert(i);

		/*
		 * One nice property of knowing that all of these
		 * nodes are below the root is that we only deal with
		 * the leftmost right node record and the rightmost
		 * left node record.
		 */
		el = left_path->p_node[i].el;
		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
		left_rec = &el->l_recs[idx];

		el = right_path->p_node[i].el;
		right_rec = &el->l_recs[0];

		ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);

		ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
		ocfs2_journal_dirty(handle, right_path->p_node[i].bh);

		/*
		 * Setup our list pointers now so that the current
		 * parents become children in the next iteration.
		 */
		left_el = left_path->p_node[i].el;
		right_el = right_path->p_node[i].el;
	}

	/*
	 * At the root node, adjust the two adjacent records which
	 * begin our path to the leaves.
	 */

	el = left_path->p_node[subtree_index].el;
	left_el = left_path->p_node[subtree_index + 1].el;
	right_el = right_path->p_node[subtree_index + 1].el;

	ocfs2_adjust_root_records(el, left_el, right_el,
				  left_path->p_node[subtree_index + 1].bh->b_blocknr);

	root_bh = left_path->p_node[subtree_index].bh;

	ocfs2_journal_dirty(handle, root_bh);
}

/*
 * Move the rightmost record of the left leaf into slot 0 of the right
 * leaf, leaving an empty extent at the head of the left leaf, then fix
 * up the interior records via ocfs2_complete_edge_insert(). Both paths
 * from subtree_index down are journaled before any modification.
 */
static int ocfs2_rotate_subtree_right(handle_t *handle,
				      struct ocfs2_extent_tree *et,
				      struct ocfs2_path *left_path,
				      struct ocfs2_path *right_path,
				      int subtree_index)
{
	int ret, i;
	struct buffer_head *right_leaf_bh;
	struct buffer_head *left_leaf_bh = NULL;
	struct buffer_head *root_bh;
	struct ocfs2_extent_list *right_el, *left_el;
	struct ocfs2_extent_rec move_rec;

	left_leaf_bh = path_leaf_bh(left_path);
	left_el = path_leaf_el(left_path);

	if (left_el->l_next_free_rec != left_el->l_count) {
		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
			    "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
			    (unsigned long long)left_leaf_bh->b_blocknr,
			    le16_to_cpu(left_el->l_next_free_rec));
		return -EROFS;
	}

	/*
	 * This extent block may already have an empty record, so we
	 * return early if so.
	 */
	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
		return 0;

	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
					   subtree_index);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Reserve writes on every block below the subtree root, both sides. */
	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   right_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   left_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	right_leaf_bh = path_leaf_bh(right_path);
	right_el = path_leaf_el(right_path);

	/* This is a code error, not a disk corruption. */
	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
			"because rightmost leaf block %llu is empty\n",
			(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
			(unsigned long long)right_leaf_bh->b_blocknr);

	ocfs2_create_empty_extent(right_el);

	ocfs2_journal_dirty(handle, right_leaf_bh);

	/* Do the copy now. */
	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
	move_rec = left_el->l_recs[i];
	right_el->l_recs[0] = move_rec;

	/*
	 * Clear out the record we just copied and shift everything
	 * over, leaving an empty extent in the left leaf.
	 *
	 * We temporarily subtract from next_free_rec so that the
	 * shift will lose the tail record (which is now defunct).
	 */
	le16_add_cpu(&left_el->l_next_free_rec, -1);
	ocfs2_shift_records_right(left_el);
	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&left_el->l_next_free_rec, 1);

	ocfs2_journal_dirty(handle, left_leaf_bh);

	ocfs2_complete_edge_insert(handle, left_path, right_path,
				   subtree_index);

out:
	return ret;
}

/*
 * Given a full path, determine what cpos value would return us a path
 * containing the leaf immediately to the left of the current one.
 *
 * Will return zero if the path passed in is already the leftmost path.
 */
int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
				  struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	BUG_ON(path->p_tree_depth == 0);

	*cpos = 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				/* Last cluster covered by the previous record. */
				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + ocfs2_rec_clusters(el,
							   &el->l_recs[j - 1]);
				*cpos = *cpos - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}

/*
 * Extend the transaction by enough credits to complete the rotation,
 * and still leave at least the original number of credits allocated
 * to this transaction.
 */
static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
					   int op_credits,
					   struct ocfs2_path *path)
{
	int ret = 0;
	/* Two blocks (left + right) per level below the subtree root,
	 * plus the root itself, plus the operation's own credits. */
	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;

	if (jbd2_handle_buffer_credits(handle) < credits)
		ret = ocfs2_extend_trans(handle,
				credits - jbd2_handle_buffer_credits(handle));

	return ret;
}

/*
 * Trap the case where we're inserting into the theoretical range past
 * the _actual_ left leaf range. Otherwise, we'll rotate a record
 * whose cpos is less than ours into the right leaf.
 *
 * It's only necessary to look at the rightmost record of the left
 * leaf because the logic that calls us should ensure that the
 * theoretical ranges in the path components above the leaves are
 * correct.
2321 */ 2322 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 2323 u32 insert_cpos) 2324 { 2325 struct ocfs2_extent_list *left_el; 2326 struct ocfs2_extent_rec *rec; 2327 int next_free; 2328 2329 left_el = path_leaf_el(left_path); 2330 next_free = le16_to_cpu(left_el->l_next_free_rec); 2331 rec = &left_el->l_recs[next_free - 1]; 2332 2333 if (insert_cpos > le32_to_cpu(rec->e_cpos)) 2334 return 1; 2335 return 0; 2336 } 2337 2338 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) 2339 { 2340 int next_free = le16_to_cpu(el->l_next_free_rec); 2341 unsigned int range; 2342 struct ocfs2_extent_rec *rec; 2343 2344 if (next_free == 0) 2345 return 0; 2346 2347 rec = &el->l_recs[0]; 2348 if (ocfs2_is_empty_extent(rec)) { 2349 /* Empty list. */ 2350 if (next_free == 1) 2351 return 0; 2352 rec = &el->l_recs[1]; 2353 } 2354 2355 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2356 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 2357 return 1; 2358 return 0; 2359 } 2360 2361 /* 2362 * Rotate all the records in a btree right one record, starting at insert_cpos. 2363 * 2364 * The path to the rightmost leaf should be passed in. 2365 * 2366 * The array is assumed to be large enough to hold an entire path (tree depth). 2367 * 2368 * Upon successful return from this function: 2369 * 2370 * - The 'right_path' array will contain a path to the leaf block 2371 * whose range contains e_cpos. 2372 * - That leaf block will have a single empty extent in list index 0. 2373 * - In the case that the rotation requires a post-insert update, 2374 * *ret_left_path will contain a valid path which can be passed to 2375 * ocfs2_insert_path(). 
 */
static int ocfs2_rotate_tree_right(handle_t *handle,
				   struct ocfs2_extent_tree *et,
				   enum ocfs2_split_type split,
				   u32 insert_cpos,
				   struct ocfs2_path *right_path,
				   struct ocfs2_path **ret_left_path)
{
	int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
	u32 cpos;
	struct ocfs2_path *left_path = NULL;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);

	*ret_left_path = NULL;

	left_path = ocfs2_new_path_from_path(right_path);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_rotate_tree_right(
		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		insert_cpos, cpos);

	/*
	 * What we want to do here is:
	 *
	 * 1) Start with the rightmost path.
	 *
	 * 2) Determine a path to the leaf block directly to the left
	 *    of that leaf.
	 *
	 * 3) Determine the 'subtree root' - the lowest level tree node
	 *    which contains a path to both leaves.
	 *
	 * 4) Rotate the subtree.
	 *
	 * 5) Find the next subtree by considering the left path to be
	 *    the new right path.
	 *
	 * The check at the top of this while loop also accepts
	 * insert_cpos == cpos because cpos is only a _theoretical_
	 * value to get us the left path - insert_cpos might very well
	 * be filling that hole.
	 *
	 * Stop at a cpos of '0' because we either started at the
	 * leftmost branch (i.e., a tree with one branch and a
	 * rotation inside of it), or we've gone as far as we can in
	 * rotating subtrees.
	 */
	while (cpos && insert_cpos <= cpos) {
		trace_ocfs2_rotate_tree_right(
			(unsigned long long)
			ocfs2_metadata_cache_owner(et->et_ci),
			insert_cpos, cpos);

		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog_bug_on_msg(path_leaf_bh(left_path) ==
				path_leaf_bh(right_path),
				"Owner %llu: error during insert of %u "
				"(left path cpos %u) results in two identical "
				"paths ending at %llu\n",
				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
				insert_cpos, cpos,
				(unsigned long long)
				path_leaf_bh(left_path)->b_blocknr);

		if (split == SPLIT_NONE &&
		    ocfs2_rotate_requires_path_adjustment(left_path,
							  insert_cpos)) {

			/*
			 * We've rotated the tree as much as we
			 * should. The rest is up to
			 * ocfs2_insert_path() to complete, after the
			 * record insertion. We indicate this
			 * situation by returning the left path.
			 *
			 * The reason we don't adjust the records here
			 * before the record insert is that an error
			 * later might break the rule where a parent
			 * record e_cpos will reflect the actual
			 * e_cpos of the 1st nonempty record of the
			 * child list.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		start = ocfs2_find_subtree_root(et, left_path, right_path);

		trace_ocfs2_rotate_subtree(start,
			(unsigned long long)
			right_path->p_node[start].bh->b_blocknr,
			right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, start,
						      orig_credits, right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_right(handle, et, left_path,
						 right_path, start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (split != SPLIT_NONE &&
		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
						insert_cpos)) {
			/*
			 * A rotate moves the rightmost left leaf
			 * record over to the leftmost right leaf
			 * slot. If we're doing an extent split
			 * instead of a real insert, then we have to
			 * check that the extent to be split wasn't
			 * just moved over. If it was, then we can
			 * exit here, passing left_path back -
			 * ocfs2_split_extent() is smart enough to
			 * search both leaves.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		/*
		 * There is no need to re-read the next right path
		 * as we know that it'll be our current left
		 * path. Optimize by copying values instead.
		 */
		ocfs2_mv_path(right_path, left_path);

		ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(left_path);

out_ret_path:
	return ret;
}

/*
 * Refresh e_int_clusters on the rightmost record of every interior
 * node in 'path' so each covers up to the end of the rightmost leaf
 * record.  'path' must be the rightmost path (BUG otherwise).
 */
static int ocfs2_update_edge_lengths(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     struct ocfs2_path *path)
{
	int i, idx, ret;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;
	u32 range;

	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Path should always be rightmost. */
	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	BUG_ON(eb->h_next_leaf_blk != 0ULL);

	el = &eb->h_list;
	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
	idx = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[idx];
	/* End of the range covered by the rightmost leaf record. */
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	for (i = 0; i < path->p_tree_depth; i++) {
		el = path->p_node[i].el;
		idx = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[idx];

		/* e_int_clusters = range - e_cpos, done in le32 math. */
		rec->e_int_clusters = cpu_to_le32(range);
		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));

		ocfs2_journal_dirty(handle, path->p_node[i].bh);
	}
out:
	return ret;
}

/*
 * Zero out and flag for dealloc every extent block in 'path' from
 * node index 'unlink_start' down to the leaf, then drop each from the
 * metadata cache.  Blocks that still hold more than one record are
 * logged as errors and only removed from cache, not freed.
 */
static void ocfs2_unlink_path(handle_t *handle,
			      struct ocfs2_extent_tree *et,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      struct ocfs2_path *path, int unlink_start)
{
	int ret, i;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh;

	for(i = unlink_start; i < path_num_items(path); i++) {
		bh = path->p_node[i].bh;

		eb = (struct ocfs2_extent_block *)bh->b_data;
		/*
		 * Not all nodes might have had their final count
		 * decremented by the caller - handle this here.
		 */
		el = &eb->h_list;
		if (le16_to_cpu(el->l_next_free_rec) > 1) {
			mlog(ML_ERROR,
			     "Inode %llu, attempted to remove extent block "
			     "%llu with %u records\n",
			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
			     (unsigned long long)le64_to_cpu(eb->h_blkno),
			     le16_to_cpu(el->l_next_free_rec));

			ocfs2_journal_dirty(handle, bh);
			ocfs2_remove_from_cache(et->et_ci, bh);
			continue;
		}

		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_journal_dirty(handle, bh);

		ret = ocfs2_cache_extent_block_free(dealloc, eb);
		if (ret)
			mlog_errno(ret);

		ocfs2_remove_from_cache(et->et_ci, bh);
	}
}

/*
 * Remove the record in the subtree root that points at right_path's
 * child, mark the left leaf as the new rightmost leaf
 * (h_next_leaf_blk = 0), and unlink/free every block of right_path
 * below the subtree root.
 */
static void ocfs2_unlink_subtree(handle_t *handle,
				 struct ocfs2_extent_tree *et,
				 struct ocfs2_path *left_path,
				 struct ocfs2_path *right_path,
				 int subtree_index,
				 struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int i;
	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
	struct ocfs2_extent_block *eb;

	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;

	/* Locate the root record pointing at the right child. */
	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
			break;

	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));

	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&root_el->l_next_free_rec, -1);

	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
	eb->h_next_leaf_blk = 0;

	ocfs2_journal_dirty(handle, root_bh);
	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));

	ocfs2_unlink_path(handle, et, dealloc, right_path,
			  subtree_index + 1);
}

/*
 * Rotate the leftmost record of the right leaf over into the left
 * leaf (whose slot 0 must hold an empty extent).  May instead delete
 * the whole right subtree when it would become empty and is the
 * rightmost one - *deleted is set to 1 in that case.  Returns -EAGAIN
 * when the right leaf has an empty extent but is not the rightmost
 * leaf, so the caller can move over and retry.
 */
static int ocfs2_rotate_subtree_left(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     struct ocfs2_path *left_path,
				     struct ocfs2_path *right_path,
				     int subtree_index,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int *deleted)
{
	int ret, i, del_right_subtree = 0, right_has_empty = 0;
	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
	struct ocfs2_extent_block *eb;

	*deleted = 0;

	right_leaf_el = path_leaf_el(right_path);
	left_leaf_el = path_leaf_el(left_path);
	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
		return 0;

	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
		/*
		 * It's legal for us to proceed if the right leaf is
		 * the rightmost one and it has an empty extent. There
		 * are two cases to handle - whether the leaf will be
		 * empty after removal or not. If the leaf isn't empty
		 * then just remove the empty extent up front. The
		 * next block will handle empty leaves by flagging
		 * them for unlink.
		 *
		 * Non rightmost leaves will throw -EAGAIN and the
		 * caller can manually move the subtree and retry.
		 */

		if (eb->h_next_leaf_blk != 0ULL)
			return -EAGAIN;

		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
			ret = ocfs2_journal_access_eb(handle, et->et_ci,
						      path_leaf_bh(right_path),
						      OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ocfs2_remove_empty_extent(right_leaf_el);
		} else
			right_has_empty = 1;
	}

	if (eb->h_next_leaf_blk == 0ULL &&
	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
		/*
		 * We have to update i_last_eb_blk during the meta
		 * data delete.
		 */
		ret = ocfs2_et_root_journal_access(handle, et,
						   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		del_right_subtree = 1;
	}

	/*
	 * Getting here with an empty extent in the right path implies
	 * that it's the rightmost path and will be deleted.
	 */
	BUG_ON(right_has_empty && !del_right_subtree);

	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
					   subtree_index);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Journal every node below the subtree root on both paths. */
	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   right_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   left_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (!right_has_empty) {
		/*
		 * Only do this if we're moving a real
		 * record. Otherwise, the action is delayed until
		 * after removal of the right path in which case we
		 * can do a simple shift to remove the empty extent.
		 */
		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
		memset(&right_leaf_el->l_recs[0], 0,
		       sizeof(struct ocfs2_extent_rec));
	}
	if (eb->h_next_leaf_blk == 0ULL) {
		/*
		 * Move recs over to get rid of empty extent, decrease
		 * next_free. This is allowed to remove the last
		 * extent in our leaf (setting l_next_free_rec to
		 * zero) - the delete code below won't care.
		 */
		ocfs2_remove_empty_extent(right_leaf_el);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
	ocfs2_journal_dirty(handle, path_leaf_bh(right_path));

	if (del_right_subtree) {
		ocfs2_unlink_subtree(handle, et, left_path, right_path,
				     subtree_index, dealloc);
		ret = ocfs2_update_edge_lengths(handle, et, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

		/*
		 * Removal of the extent in the left leaf was skipped
		 * above so we could delete the right path
		 * 1st.
		 */
		if (right_has_empty)
			ocfs2_remove_empty_extent(left_leaf_el);

		ocfs2_journal_dirty(handle, et_root_bh);

		*deleted = 1;
	} else
		ocfs2_complete_edge_insert(handle, left_path, right_path,
					   subtree_index);

out:
	return ret;
}

/*
 * Given a full path, determine what cpos value would return us a path
 * containing the leaf immediately to the right of the current one.
 *
 * Will return zero if the path passed in is already the rightmost path.
 *
 * This looks similar, but is subtly different to
 * ocfs2_find_cpos_for_left_leaf().
 */
int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
				   struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	*cpos = 0;

	/* A depth-0 tree has no leaf to the right by definition. */
	if (path->p_tree_depth == 0)
		return 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		int next_free;

		el = path->p_node[i].el;

		/*
		 * Find the extent record just after the one in our
		 * path.
		 */
		next_free = le16_to_cpu(el->l_next_free_rec);
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == (next_free - 1)) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the rightmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The rightmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}

/*
 * Remove the empty extent at the head of the rightmost leaf, if one
 * exists, shifting the remaining records over.  No-op when slot 0
 * holds a real record.
 */
static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
					    struct ocfs2_extent_tree *et,
					    struct ocfs2_path *path)
{
	int ret;
	struct buffer_head *bh = path_leaf_bh(path);
	struct ocfs2_extent_list *el = path_leaf_el(path);

	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
		return 0;

	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
					   path_num_items(path) - 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_remove_empty_extent(el);
	ocfs2_journal_dirty(handle, bh);

out:
	return ret;
}

/*
 * Walk rightward from 'path', rotating each subtree left, until the
 * rightmost edge is reached or a subtree delete finishes the job.
 * When a right subtree has an empty extent, the rotation stops and
 * the offending path is handed back via *empty_extent_path together
 * with -EAGAIN so the caller can fix it up and restart.
 */
static int __ocfs2_rotate_tree_left(handle_t *handle,
				    struct ocfs2_extent_tree *et,
				    int orig_credits,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc,
				    struct ocfs2_path **empty_extent_path)
{
	int ret, subtree_root, deleted;
	u32 right_cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_path *right_path = NULL;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);

	/* Nothing to rotate without an empty extent at the head. */
	if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
		return 0;

	*empty_extent_path = NULL;

	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	left_path = ocfs2_new_path_from_path(path);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_cp_path(left_path, path);

	right_path = ocfs2_new_path_from_path(path);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	while (right_cpos) {
		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_root = ocfs2_find_subtree_root(et, left_path,
						       right_path);

		trace_ocfs2_rotate_subtree(subtree_root,
		     (unsigned long long)
		     right_path->p_node[subtree_root].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, 0,
						      orig_credits, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Caller might still want to make changes to the
		 * tree root, so re-add it to the journal here.
		 */
		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
						   left_path, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
						right_path, subtree_root,
						dealloc, &deleted);
		if (ret == -EAGAIN) {
			/*
			 * The rotation has to temporarily stop due to
			 * the right subtree having an empty
			 * extent. Pass it back to the caller for a
			 * fixup.
			 */
			*empty_extent_path = right_path;
			right_path = NULL;
			goto out;
		}
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The subtree rotate might have removed records on
		 * the rightmost edge. If so, then rotation is
		 * complete.
		 */
		if (deleted)
			break;

		ocfs2_mv_path(left_path, right_path);

		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
						     &right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(right_path);
	ocfs2_free_path(left_path);

	return ret;
}

/*
 * Delete the rightmost branch of the tree described by 'path'.  If a
 * branch exists to the left it is updated and becomes the new
 * rightmost one; otherwise the root is reverted to in-line extents
 * (depth 0, last_eb_blk 0).
 */
static int ocfs2_remove_rightmost_path(handle_t *handle,
				       struct ocfs2_extent_tree *et,
				       struct ocfs2_path *path,
				       struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, subtree_index;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	ret = ocfs2_et_sanity_check(et);
	if (ret)
		goto out;

	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
					    path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (cpos) {
		/*
		 * We have a path to the left of this one - it needs
		 * an update too.
		 */
		left_path = ocfs2_new_path_from_path(path);
		if (!left_path) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_index = ocfs2_find_subtree_root(et, left_path, path);

		ocfs2_unlink_subtree(handle, et, left_path, path,
				     subtree_index, dealloc);
		ret = ocfs2_update_edge_lengths(handle, et, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
	} else {
		/*
		 * 'path' is also the leftmost path which
		 * means it must be the only one. This gets
		 * handled differently because we want to
		 * revert the root back to having extents
		 * in-line.
		 */
		ocfs2_unlink_path(handle, et, dealloc, path, 1);

		el = et->et_root_el;
		el->l_tree_depth = 0;
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_et_set_last_eb_blk(et, 0);
	}

	ocfs2_journal_dirty(handle, path_root_bh(path));

out:
	ocfs2_free_path(left_path);
	return ret;
}

/*
 * Transaction wrapper around ocfs2_remove_rightmost_path(): starts a
 * handle sized for the path depth, removes the rightmost path, and
 * commits.
 */
static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
				struct ocfs2_extent_tree *et,
				struct ocfs2_path *path,
				struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	handle_t *handle;
	int ret;
	int credits = path->p_tree_depth * 2 + 1;

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		return ret;
	}

	ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
	if (ret)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);
	return ret;
}

/*
 * Left rotation of btree records.
 *
 * In many ways, this is (unsurprisingly) the opposite of right
 * rotation. We start at some non-rightmost path containing an empty
 * extent in the leaf block. The code works its way to the rightmost
 * path by rotating records to the left in every subtree.
 *
 * This is used by any code which reduces the number of extent records
 * in a leaf. After removal, an empty record should be placed in the
 * leftmost list position.
 *
 * This won't handle a length update of the rightmost path records if
 * the rightmost tree leaf record is removed so the caller is
 * responsible for detecting and correcting that.
 */
static int ocfs2_rotate_tree_left(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_path *path,
				  struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, orig_credits = jbd2_handle_buffer_credits(handle);
	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	el = path_leaf_el(path);
	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
		return 0;

	if (path->p_tree_depth == 0) {
rightmost_no_delete:
		/*
		 * Inline extents. This is trivially handled, so do
		 * it up front.
		 */
		ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Handle rightmost branch now. There's several cases:
	 *  1) simple rotation leaving records in there. That's trivial.
	 *  2) rotation requiring a branch delete - there's no more
	 *     records left. Two cases of this:
	 *     a) There are branches to the left.
	 *     b) This is also the leftmost (the only) branch.
	 *
	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
	 *  2a) we need the left branch so that we can update it with the unlink
	 *  2b) we need to bring the root back to inline extents.
	 */

	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	el = &eb->h_list;
	if (eb->h_next_leaf_blk == 0) {
		/*
		 * This gets a bit tricky if we're going to delete the
		 * rightmost path. Get the other cases out of the way
		 * 1st.
		 */
		if (le16_to_cpu(el->l_next_free_rec) > 1)
			goto rightmost_no_delete;

		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ret = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
					  "Owner %llu has empty extent block at %llu\n",
					  (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
					  (unsigned long long)le64_to_cpu(eb->h_blkno));
			goto out;
		}

		/*
		 * XXX: The caller can not trust "path" any more after
		 * this as it will have been deleted. What do we do?
		 *
		 * In theory the rotate-for-merge code will never get
		 * here because it'll always ask for a rotate in a
		 * nonempty list.
		 */

		ret = ocfs2_remove_rightmost_path(handle, et, path,
						  dealloc);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Now we can loop, remembering the path we get from -EAGAIN
	 * and restarting from there.
	 */
try_rotate:
	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
				       dealloc, &restart_path);
	if (ret && ret != -EAGAIN) {
		mlog_errno(ret);
		goto out;
	}

	while (ret == -EAGAIN) {
		tmp_path = restart_path;
		restart_path = NULL;

		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
					       tmp_path, dealloc,
					       &restart_path);
		if (ret && ret != -EAGAIN) {
			mlog_errno(ret);
			goto out;
		}

		ocfs2_free_path(tmp_path);
		tmp_path = NULL;

		if (ret == 0)
			goto try_rotate;
	}

out:
	ocfs2_free_path(tmp_path);
	ocfs2_free_path(restart_path);
	return ret;
}

/*
 * After a merge consumed the record at 'index', restore the list
 * invariant that an empty extent may only live at array index 0:
 * shift the preceding records up by one and zero out slot 0.
 */
static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
				int index)
{
	struct ocfs2_extent_rec *rec = &el->l_recs[index];
	unsigned int size;

	if (rec->e_leaf_clusters == 0) {
		/*
		 * We consumed all of the merged-from record. An empty
		 * extent cannot exist anywhere but the 1st array
		 * position, so move things over if the merged-from
		 * record doesn't occupy that position.
		 *
		 * This creates a new empty extent so the caller
		 * should be smart enough to have removed any existing
		 * ones.
		 */
		if (index > 0) {
			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
			size = index * sizeof(struct ocfs2_extent_rec);
			memmove(&el->l_recs[1], &el->l_recs[0], size);
		}

		/*
		 * Always memset - the caller doesn't check whether it
		 * created an empty extent, so there could be junk in
		 * the other fields.
		 */
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
	}
}

/*
 * Build the path to the leaf immediately to the right of
 * 'left_path'.  Must only be called on trees (depth > 0) whose left
 * leaf is full and not the rightmost leaf.  On success *ret_right_path
 * holds a freshly allocated path the caller must free.
 */
static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
				struct ocfs2_path *left_path,
				struct ocfs2_path **ret_right_path)
{
	int ret;
	u32 right_cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_extent_list *left_el;

	*ret_right_path = NULL;

	/* This function shouldn't be called for non-trees. */
	BUG_ON(left_path->p_tree_depth == 0);

	left_el = path_leaf_el(left_path);
	BUG_ON(left_el->l_next_free_rec != left_el->l_count);

	ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
					     left_path, &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* This function shouldn't be called for the rightmost leaf. */
	BUG_ON(right_cpos == 0);

	right_path = ocfs2_new_path_from_path(left_path);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	*ret_right_path = right_path;
out:
	if (ret)
		ocfs2_free_path(right_path);
	return ret;
}

/*
 * Remove split_rec clusters from the record at index and merge them
 * onto the beginning of the record "next" to it.
 * For index < l_count - 1, the next means the extent rec at index + 1.
 * For index == l_count - 1, the "next" means the 1st extent rec of the
 * next extent block.
 */
static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
				 handle_t *handle,
				 struct ocfs2_extent_tree *et,
				 struct ocfs2_extent_rec *split_rec,
				 int index)
{
	int ret, next_free, i;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *right_el;
	struct ocfs2_path *right_path = NULL;
	int subtree_index = 0;
	struct ocfs2_extent_list *el = path_leaf_el(left_path);
	struct buffer_head *bh = path_leaf_bh(left_path);
	struct buffer_head *root_bh = NULL;

	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
	left_rec = &el->l_recs[index];

	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_right_path(et, left_path, &right_path);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		right_el = path_leaf_el(right_path);
		next_free = le16_to_cpu(right_el->l_next_free_rec);
		BUG_ON(next_free <= 0);
		right_rec = &right_el->l_recs[0];
		if (ocfs2_is_empty_extent(right_rec)) {
			BUG_ON(next_free <= 1);
			right_rec = &right_el->l_recs[1];
		}

		/* The two records must be contiguous across the leaves. */
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(right_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(et, left_path,
							right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
					jbd2_handle_buffer_credits(handle),
					right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
						   subtree_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Journal both paths below the subtree root. */
		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
							   right_path, i);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
							   left_path, i);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

	} else {
		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
		right_rec = &el->l_recs[index + 1];
	}

	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
					   path_num_items(left_path) - 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Shrink the left record and grow the right one leftward. */
	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
					       split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);

	ocfs2_cleanup_merge(el, index);

	ocfs2_journal_dirty(handle, bh);
	if (right_path) {
		ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
		ocfs2_complete_edge_insert(handle, left_path, right_path,
					   subtree_index);
	}
out:
	ocfs2_free_path(right_path);
	return ret;
}

/*
 * Build the path to the leaf immediately to the left of
 * 'right_path'.  Must only be called on trees (depth > 0) whose right
 * leaf is not the leftmost leaf.  On success *ret_left_path holds a
 * freshly allocated path the caller must free.
 */
static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
			       struct ocfs2_path *right_path,
			       struct ocfs2_path **ret_left_path)
{
	int ret;
	u32 left_cpos;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/* This function shouldn't be called for non-trees. */
	BUG_ON(right_path->p_tree_depth == 0);

	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
					    right_path, &left_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* This function shouldn't be called for the leftmost leaf. */
	BUG_ON(left_cpos == 0);

	left_path = ocfs2_new_path_from_path(right_path);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	*ret_left_path = left_path;
out:
	if (ret)
		ocfs2_free_path(left_path);
	return ret;
}

/*
 * Remove split_rec clusters from the record at index and merge them
 * onto the tail of the record "before" it.
 * For index > 0, the "before" means the extent rec at index - 1.
 *
 * For index == 0, the "before" means the last record of the previous
 * extent block.
 * And there is also a situation that we may need to
 * remove the rightmost leaf extent block in the right_path and change
 * the right path to indicate the new rightmost path.
 */
static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
				handle_t *handle,
				struct ocfs2_extent_tree *et,
				struct ocfs2_extent_rec *split_rec,
				struct ocfs2_cached_dealloc_ctxt *dealloc,
				int index)
{
	int ret, i, subtree_index = 0, has_empty_extent = 0;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *el = path_leaf_el(right_path);
	struct buffer_head *bh = path_leaf_bh(right_path);
	struct buffer_head *root_bh = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *left_el;

	BUG_ON(index < 0);

	right_rec = &el->l_recs[index];
	if (index == 0) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_left_path(et, right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		/*
		 * The merge target is the last record of the previous
		 * (completely full) leaf; it must end exactly where
		 * split_rec begins.
		 */
		left_el = path_leaf_el(left_path);
		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
		       le16_to_cpu(left_el->l_count));

		left_rec = &left_el->l_recs[
				le16_to_cpu(left_el->l_next_free_rec) - 1];
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(split_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(et, left_path,
							right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      jbd2_handle_buffer_credits(handle),
						      left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		/*
		 * Journal the shared subtree root and every block of both
		 * paths below it before any of them are modified.
		 */
		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
						   subtree_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
							   right_path, i);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
							   left_path, i);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	} else {
		left_rec = &el->l_recs[index - 1];
		if (ocfs2_is_empty_extent(&el->l_recs[0]))
			has_empty_extent = 1;
	}

	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
					   path_num_items(right_path) - 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (has_empty_extent && index == 1) {
		/*
		 * The easy case - we can just plop the record right in.
		 */
		*left_rec = *split_rec;
	} else
		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);

	/* Shift the front of right_rec forward by split_clusters. */
	le32_add_cpu(&right_rec->e_cpos, split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
					      split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);

	ocfs2_cleanup_merge(el, index);

	ocfs2_journal_dirty(handle, bh);
	if (left_path) {
		ocfs2_journal_dirty(handle, path_leaf_bh(left_path));

		/*
		 * In the situation that the right_rec is empty and the extent
		 * block is empty also, ocfs2_complete_edge_insert can't handle
		 * it and we need to delete the right extent block.
		 */
		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
		    le16_to_cpu(el->l_next_free_rec) == 1) {
			/* extend credit for ocfs2_remove_rightmost_path */
			ret = ocfs2_extend_rotate_transaction(handle, 0,
					jbd2_handle_buffer_credits(handle),
					right_path);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_remove_rightmost_path(handle, et,
							  right_path,
							  dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/* Now the rightmost extent block has been deleted.
			 * So we use the new rightmost path.
			 */
			ocfs2_mv_path(right_path, left_path);
		} else
			ocfs2_complete_edge_insert(handle, left_path,
						   right_path, subtree_index);
	}
out:
	ocfs2_free_path(left_path);
	return ret;
}

/*
 * Merge split_rec into the record(s) adjacent to it at split_index, as
 * directed by ctxt->c_contig_type (which must not be CONTIG_NONE).
 *
 * CONTIG_LEFTRIGHT merges right first, then left, and may leave the
 * tree with empty extents that are rotated away afterwards.  Freed
 * metadata blocks are queued on 'dealloc'.  Returns 0 on success or a
 * negative error code.
 */
static int ocfs2_try_to_merge_extent(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     struct ocfs2_path *path,
				     int split_index,
				     struct ocfs2_extent_rec *split_rec,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     struct ocfs2_merge_ctxt *ctxt)
{
	int ret = 0;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];

	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
		/* extend credit for ocfs2_remove_rightmost_path */
		ret = ocfs2_extend_rotate_transaction(handle, 0,
				jbd2_handle_buffer_credits(handle),
				path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/*
		 * The merge code will need to create an empty
		 * extent to take the place of the newly
		 * emptied slot. Remove any pre-existing empty
		 * extents - having more than one in a leaf is
		 * illegal.
		 */
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/* The rotate shifted our target record down by one. */
		split_index--;
		rec = &el->l_recs[split_index];
	}

	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
		/*
		 * Left-right contig implies this.
		 */
		BUG_ON(!ctxt->c_split_covers_rec);

		/*
		 * Since the leftright insert always covers the entire
		 * extent, this call will delete the insert record
		 * entirely, resulting in an empty extent record added to
		 * the extent block.
		 *
		 * Since the adding of an empty extent shifts
		 * everything back to the right, there's no need to
		 * update split_index here.
		 *
		 * When the split_index is zero, we need to merge it to the
		 * previous extent block. It is more efficient and easier
		 * if we do merge_right first and merge_left later.
		 */
		ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
					    split_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We can only get this from logic error above.
		 */
		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));

		/* extend credit for ocfs2_remove_rightmost_path */
		ret = ocfs2_extend_rotate_transaction(handle, 0,
					jbd2_handle_buffer_credits(handle),
					path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* The merge left us with an empty extent, remove it. */
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		rec = &el->l_recs[split_index];

		/*
		 * Note that we don't pass split_rec here on purpose -
		 * we've merged it into the rec already.
		 */
		ret = ocfs2_merge_rec_left(path, handle, et, rec,
					   dealloc, split_index);

		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* extend credit for ocfs2_remove_rightmost_path */
		ret = ocfs2_extend_rotate_transaction(handle, 0,
				jbd2_handle_buffer_credits(handle),
				path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		/*
		 * Error from this last rotate is not critical, so
		 * print but don't bubble it up.
		 */
		if (ret)
			mlog_errno(ret);
		ret = 0;
	} else {
		/*
		 * Merge a record to the left or right.
		 *
		 * 'contig_type' is relative to the existing record,
		 * so for example, if we're "right contig", it's to
		 * the record on the left (hence the left merge).
		 */
		if (ctxt->c_contig_type == CONTIG_RIGHT) {
			ret = ocfs2_merge_rec_left(path, handle, et,
						   split_rec, dealloc,
						   split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			ret = ocfs2_merge_rec_right(path, handle,
						    et, split_rec,
						    split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		if (ctxt->c_split_covers_rec) {
			/* extend credit for ocfs2_remove_rightmost_path */
			ret = ocfs2_extend_rotate_transaction(handle, 0,
					jbd2_handle_buffer_credits(handle),
					path);
			if (ret) {
				/* Best-effort from here on - don't bubble up. */
				mlog_errno(ret);
				ret = 0;
				goto out;
			}

			/*
			 * The merge may have left an empty extent in
			 * our leaf. Try to rotate it away.
			 */
			ret = ocfs2_rotate_tree_left(handle, et, path,
						     dealloc);
			if (ret)
				mlog_errno(ret);
			ret = 0;
		}
	}

out:
	return ret;
}

/*
 * Trim split_rec's cluster count off one edge of rec: the front for
 * SPLIT_LEFT (advancing e_cpos/e_blkno), the back for SPLIT_RIGHT.
 */
static void ocfs2_subtract_from_rec(struct super_block *sb,
				    enum ocfs2_split_type split,
				    struct ocfs2_extent_rec *rec,
				    struct ocfs2_extent_rec *split_rec)
{
	u64 len_blocks;

	len_blocks = ocfs2_clusters_to_blocks(sb,
				le16_to_cpu(split_rec->e_leaf_clusters));

	if (split == SPLIT_LEFT) {
		/*
		 * Region is on the left edge of the existing
		 * record.
		 */
		le32_add_cpu(&rec->e_cpos,
			     le16_to_cpu(split_rec->e_leaf_clusters));
		le64_add_cpu(&rec->e_blkno, len_blocks);
		le16_add_cpu(&rec->e_leaf_clusters,
			     -le16_to_cpu(split_rec->e_leaf_clusters));
	} else {
		/*
		 * Region is on the right edge of the existing
		 * record.
		 */
		le16_add_cpu(&rec->e_leaf_clusters,
			     -le16_to_cpu(split_rec->e_leaf_clusters));
	}
}

/*
 * Do the final bits of extent record insertion at the target leaf
 * list. If this leaf is part of an allocation tree, it is assumed
 * that the tree above has been prepared.
 */
static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
				 struct ocfs2_extent_rec *insert_rec,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_insert_type *insert)
{
	int i = insert->ins_contig_index;
	unsigned int range;
	struct ocfs2_extent_rec *rec;

	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	if (insert->ins_split != SPLIT_NONE) {
		/* Split insert: trim the existing record, then rotate in. */
		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
		BUG_ON(i == -1);
		rec = &el->l_recs[i];
		ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
					insert->ins_split, rec,
					insert_rec);
		goto rotate;
	}

	/*
	 * Contiguous insert - either left or right.
	 */
	if (insert->ins_contig != CONTIG_NONE) {
		rec = &el->l_recs[i];
		if (insert->ins_contig == CONTIG_LEFT) {
			/* New clusters precede rec - adopt their start. */
			rec->e_blkno = insert_rec->e_blkno;
			rec->e_cpos = insert_rec->e_cpos;
		}
		le16_add_cpu(&rec->e_leaf_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		return;
	}

	/*
	 * Handle insert into an empty leaf.
	 */
	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
		el->l_recs[0] = *insert_rec;
		el->l_next_free_rec = cpu_to_le16(1);
		return;
	}

	/*
	 * Appending insert.
	 */
	if (insert->ins_appending == APPEND_TAIL) {
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];
		range = le32_to_cpu(rec->e_cpos)
			+ le16_to_cpu(rec->e_leaf_clusters);
		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);

		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
				le16_to_cpu(el->l_count),
				"owner %llu, depth %u, count %u, next free %u, "
				"rec.cpos %u, rec.clusters %u, "
				"insert.cpos %u, insert.clusters %u\n",
				ocfs2_metadata_cache_owner(et->et_ci),
				le16_to_cpu(el->l_tree_depth),
				le16_to_cpu(el->l_count),
				le16_to_cpu(el->l_next_free_rec),
				le32_to_cpu(el->l_recs[i].e_cpos),
				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
				le32_to_cpu(insert_rec->e_cpos),
				le16_to_cpu(insert_rec->e_leaf_clusters));
		i++;
		el->l_recs[i] = *insert_rec;
		le16_add_cpu(&el->l_next_free_rec, 1);
		return;
	}

rotate:
	/*
	 * Ok, we have to rotate.
	 *
	 * At this point, it is safe to assume that inserting into an
	 * empty leaf and appending to a leaf have both been handled
	 * above.
	 *
	 * This leaf needs to have space, either by the empty 1st
	 * extent record, or by virtue of an l_next_free_rec < l_count.
	 */
	ocfs2_rotate_leaf(el, insert_rec);
}

/*
 * Stretch the rightmost record of every interior node on 'path' so
 * that its e_int_clusters covers insert_rec's end, dirtying each
 * block.  The caller has already journaled the path.
 */
static void ocfs2_adjust_rightmost_records(handle_t *handle,
					   struct ocfs2_extent_tree *et,
					   struct ocfs2_path *path,
					   struct ocfs2_extent_rec *insert_rec)
{
	int i, next_free;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	/*
	 * Update everything except the leaf block.
	 */
	for (i = 0; i < path->p_tree_depth; i++) {
		bh = path->p_node[i].bh;
		el = path->p_node[i].el;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (next_free == 0) {
			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				    "Owner %llu has a bad extent list\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
			return;
		}

		rec = &el->l_recs[next_free - 1];

		/* e_int_clusters = insert end cpos - rec start cpos. */
		rec->e_int_clusters = insert_rec->e_cpos;
		le32_add_cpu(&rec->e_int_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		le32_add_cpu(&rec->e_int_clusters,
			     -le32_to_cpu(rec->e_cpos));

		ocfs2_journal_dirty(handle, bh);
	}
}

/*
 * Prepare an appending insert into the rightmost leaf: journal
 * right_path and update the interior rightmost records for
 * insert_rec.  If the append lands at the leftmost edge of an (empty)
 * leaf, the left neighbor's path is looked up and handed back in
 * *ret_left_path for ocfs2_insert_path() to fix up afterwards.
 */
static int ocfs2_append_rec_to_path(handle_t *handle,
				    struct ocfs2_extent_tree *et,
				    struct ocfs2_extent_rec *insert_rec,
				    struct ocfs2_path *right_path,
				    struct ocfs2_path **ret_left_path)
{
	int ret, next_free;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/*
	 * This shouldn't happen for non-trees. The extent rec cluster
	 * count manipulation below only works for interior nodes.
	 */
	BUG_ON(right_path->p_tree_depth == 0);

	/*
	 * If our appending insert is at the leftmost edge of a leaf,
	 * then we might need to update the rightmost records of the
	 * neighboring path.
	 */
	el = path_leaf_el(right_path);
	next_free = le16_to_cpu(el->l_next_free_rec);
	if (next_free == 0 ||
	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
		u32 left_cpos;

		ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
						    right_path, &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		trace_ocfs2_append_rec_to_path(
			(unsigned long long)
			ocfs2_metadata_cache_owner(et->et_ci),
			le32_to_cpu(insert_rec->e_cpos),
			left_cpos);

		/*
		 * No need to worry if the append is already in the
		 * leftmost leaf.
		 */
		if (left_cpos) {
			left_path = ocfs2_new_path_from_path(right_path);
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(et->et_ci, left_path,
					      left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * ocfs2_insert_path() will pass the left_path to the
			 * journal for us.
			 */
		}
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);

	*ret_left_path = left_path;
	ret = 0;
out:
	if (ret != 0)
		ocfs2_free_path(left_path);

	return ret;
}

/*
 * Leaf-level work for a SPLIT_LEFT/SPLIT_RIGHT insert: locate the
 * record covering split_rec's cpos (in the right leaf, or the left
 * leaf if rotation moved it), trim it with
 * ocfs2_subtract_from_rec(), and rotate split_rec into the correct
 * leaf.  Both leaves have already been journaled by the caller.
 */
static void ocfs2_split_record(struct ocfs2_extent_tree *et,
			       struct ocfs2_path *left_path,
			       struct ocfs2_path *right_path,
			       struct ocfs2_extent_rec *split_rec,
			       enum ocfs2_split_type split)
{
	int index;
	u32 cpos = le32_to_cpu(split_rec->e_cpos);
	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
	struct ocfs2_extent_rec *rec, *tmprec;

	right_el = path_leaf_el(right_path);
	if (left_path)
		left_el = path_leaf_el(left_path);

	el = right_el;
	insert_el = right_el;
	index = ocfs2_search_extent_list(el, cpos);
	if (index != -1) {
		if (index == 0 && left_path) {
			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));

			/*
			 * This typically means that the record
			 * started in the left path but moved to the
			 * right as a result of rotation. We either
			 * move the existing record to the left, or we
			 * do the later insert there.
			 *
			 * In this case, the left path should always
			 * exist as the rotate code will have passed
			 * it back for a post-insert update.
			 */

			if (split == SPLIT_LEFT) {
				/*
				 * It's a left split. Since we know
				 * that the rotate code gave us an
				 * empty extent in the left path, we
				 * can just do the insert there.
				 */
				insert_el = left_el;
			} else {
				/*
				 * Right split - we have to move the
				 * existing record over to the left
				 * leaf. The insert will be into the
				 * newly created empty extent in the
				 * right leaf.
				 */
				tmprec = &right_el->l_recs[index];
				ocfs2_rotate_leaf(left_el, tmprec);
				el = left_el;

				memset(tmprec, 0, sizeof(*tmprec));
				index = ocfs2_search_extent_list(left_el, cpos);
				BUG_ON(index == -1);
			}
		}
	} else {
		BUG_ON(!left_path);
		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
		/*
		 * Left path is easy - we can just allow the insert to
		 * happen.
		 */
		el = left_el;
		insert_el = left_el;
		index = ocfs2_search_extent_list(el, cpos);
		BUG_ON(index == -1);
	}

	rec = &el->l_recs[index];
	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
				split, rec, split_rec);
	ocfs2_rotate_leaf(insert_el, split_rec);
}

/*
 * This function only does inserts on an allocation b-tree. For tree
 * depth = 0, ocfs2_insert_at_leaf() is called directly.
 *
 * right_path is the path we want to do the actual insert
 * in. left_path should only be passed in if we need to update that
 * portion of the tree after an edge insert.
 */
static int ocfs2_insert_path(handle_t *handle,
			     struct ocfs2_extent_tree *et,
			     struct ocfs2_path *left_path,
			     struct ocfs2_path *right_path,
			     struct ocfs2_extent_rec *insert_rec,
			     struct ocfs2_insert_type *insert)
{
	int ret, subtree_index;
	struct buffer_head *leaf_bh = path_leaf_bh(right_path);

	if (left_path) {
		/*
		 * There's a chance that left_path got passed back to
		 * us without being accounted for in the
		 * journal. Extend our transaction here to be sure we
		 * can change those blocks.
		 */
		ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Pass both paths to the journal. The majority of inserts
	 * will be touching all components anyway.
	 */
	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	if (insert->ins_split != SPLIT_NONE) {
		/*
		 * We could call ocfs2_insert_at_leaf() for some types
		 * of splits, but it's easier to just let one separate
		 * function sort it all out.
		 */
		ocfs2_split_record(et, left_path, right_path,
				   insert_rec, insert->ins_split);

		/*
		 * Split might have modified either leaf and we don't
		 * have a guarantee that the later edge insert will
		 * dirty this for us.
		 */
		if (left_path)
			ocfs2_journal_dirty(handle,
					    path_leaf_bh(left_path));
	} else
		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
				     insert);

	ocfs2_journal_dirty(handle, leaf_bh);

	if (left_path) {
		/*
		 * The rotate code has indicated that we need to fix
		 * up portions of the tree after the insert.
		 *
		 * XXX: Should we extend the transaction here?
		 */
		subtree_index = ocfs2_find_subtree_root(et, left_path,
							right_path);
		ocfs2_complete_edge_insert(handle, left_path, right_path,
					   subtree_index);
	}

	ret = 0;
out:
	return ret;
}

/*
 * Insert insert_rec into the tree, using the strategy already
 * computed in 'type' (contiguity, appending, split).  Handles the
 * depth-0 case directly; otherwise finds the target (or rightmost)
 * path, performs any needed right-rotation or append fixups, and
 * updates the tree's cluster count for non-split inserts.
 */
static int ocfs2_do_insert_extent(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;

	el = et->et_root_el;

	ret = ocfs2_et_root_journal_access(handle, et,
					   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (le16_to_cpu(el->l_tree_depth) == 0) {
		ocfs2_insert_at_leaf(et, insert_rec, el, type);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_path_from_et(et);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree's above them.
	 *
	 * Both might pass back a path immediate to the left of the
	 * one being inserted to. This will be cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * ocfs2_rotate_tree_right() might have extended the
		 * transaction without re-journaling our tree root.
		 */
		ret = ocfs2_et_root_journal_access(handle, et,
						   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(handle, et, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	if (type->ins_split == SPLIT_NONE)
		ocfs2_et_update_clusters(et,
					 le16_to_cpu(insert_rec->e_leaf_clusters));

	ocfs2_journal_dirty(handle, et->et_root_bh);

out:
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}

/*
 * Figure out how split_rec relates to its neighbors at 'index' in
 * leaf 'el' - looking across into the adjacent leaves when index sits
 * at either end of a deep tree - and record the result in
 * ctxt->c_contig_type.  Returns 0, or a negative error code if a
 * neighboring leaf is found to be corrupt.
 */
static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
					  struct ocfs2_path *path,
					  struct ocfs2_extent_list *el, int index,
					  struct ocfs2_extent_rec *split_rec,
					  struct ocfs2_merge_ctxt *ctxt)
{
	int status = 0;
	enum ocfs2_contig_type ret = CONTIG_NONE;
	u32 left_cpos, right_cpos;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_extent_list *new_el;
	struct ocfs2_path *left_path = NULL, *right_path = NULL;
	struct buffer_head *bh;
	struct ocfs2_extent_block *eb;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);

	/* Find the record to split_rec's left, possibly in the prior leaf. */
	if (index > 0) {
		rec = &el->l_recs[index - 1];
	} else if (path->p_tree_depth > 0) {
		status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
		if (status)
			goto exit;

		if (left_cpos != 0) {
			left_path = ocfs2_new_path_from_path(path);
			if (!left_path) {
				status = -ENOMEM;
				mlog_errno(status);
				goto exit;
			}

			status = ocfs2_find_path(et->et_ci, left_path,
						 left_cpos);
			if (status)
				goto free_left_path;

			new_el = path_leaf_el(left_path);

			if (le16_to_cpu(new_el->l_next_free_rec) !=
			    le16_to_cpu(new_el->l_count)) {
				bh = path_leaf_bh(left_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				status = ocfs2_error(sb,
						"Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
						(unsigned long long)le64_to_cpu(eb->h_blkno),
						le16_to_cpu(new_el->l_next_free_rec),
						le16_to_cpu(new_el->l_count));
				goto free_left_path;
			}
			rec = &new_el->l_recs[
				le16_to_cpu(new_el->l_next_free_rec) - 1];
		}
	}

	/*
	 * We're careful to check for an empty extent record here -
	 * the merge code will know what to do if it sees one.
	 */
	if (rec) {
		if (index == 1 && ocfs2_is_empty_extent(rec)) {
			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
				ret = CONTIG_RIGHT;
		} else {
			ret = ocfs2_et_extent_contig(et, rec, split_rec);
		}
	}

	/* Now the record to split_rec's right, possibly in the next leaf. */
	rec = NULL;
	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
		rec = &el->l_recs[index + 1];
	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
		 path->p_tree_depth > 0) {
		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
		if (status)
			goto free_left_path;

		if (right_cpos == 0)
			goto free_left_path;

		right_path = ocfs2_new_path_from_path(path);
		if (!right_path) {
			status = -ENOMEM;
			mlog_errno(status);
			goto free_left_path;
		}

		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
		if (status)
			goto free_right_path;

		new_el = path_leaf_el(right_path);
		rec = &new_el->l_recs[0];
		if (ocfs2_is_empty_extent(rec)) {
			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
				bh = path_leaf_bh(right_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				status = ocfs2_error(sb,
						"Extent block #%llu has an invalid l_next_free_rec of %d\n",
						(unsigned long long)le64_to_cpu(eb->h_blkno),
						le16_to_cpu(new_el->l_next_free_rec));
				goto free_right_path;
			}
			rec = &new_el->l_recs[1];
		}
	}

	if (rec) {
		enum ocfs2_contig_type contig_type;

		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);

		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
			ret = CONTIG_LEFTRIGHT;
		else if (ret == CONTIG_NONE)
			ret = contig_type;
	}

free_right_path:
	ocfs2_free_path(right_path);
free_left_path:
	ocfs2_free_path(left_path);
exit:
	if (status == 0)
		ctxt->c_contig_type = ret;

	return status;
}

/*
 * Scan leaf 'el' for a record contiguous with insert_rec and record
 * the result (and its index) in 'insert'.  Contiguity is discarded if
 * the combined record would exceed et->et_max_leaf_clusters.
 */
static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
				     struct ocfs2_insert_type *insert,
				     struct ocfs2_extent_list *el,
				     struct ocfs2_extent_rec *insert_rec)
{
	int i;
	enum ocfs2_contig_type contig_type = CONTIG_NONE;

	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
						     insert_rec);
		if (contig_type != CONTIG_NONE) {
			insert->ins_contig_index = i;
			break;
		}
	}
	insert->ins_contig = contig_type;

	if (insert->ins_contig != CONTIG_NONE) {
		struct ocfs2_extent_rec *rec =
				&el->l_recs[insert->ins_contig_index];
		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
				   le16_to_cpu(insert_rec->e_leaf_clusters);

		/*
		 * Caller might want us to limit the size of extents, don't
		 * calculate contiguousness if we might exceed that limit.
		 */
		if (et->et_max_leaf_clusters &&
		    (len > et->et_max_leaf_clusters))
			insert->ins_contig = CONTIG_NONE;
	}
}

/*
 * This should only be called against the rightmost leaf extent list.
 *
 * ocfs2_figure_appending_type() will figure out whether we'll have to
 * insert at the tail of the rightmost leaf.
 *
 * This should also work against the root extent list for tree's with 0
 * depth. If we consider the root extent list to be the rightmost leaf node
 * then the logic here makes sense.
 */
static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
					struct ocfs2_extent_list *el,
					struct ocfs2_extent_rec *insert_rec)
{
	int i;
	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	insert->ins_appending = APPEND_NONE;

	/* Only valid against a leaf extent list. */
	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	/* No records yet - trivially a tail append. */
	if (!el->l_next_free_rec)
		goto set_tail_append;

	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		/* Were all records empty? */
		if (le16_to_cpu(el->l_next_free_rec) == 1)
			goto set_tail_append;
	}

	/*
	 * A tail append only when the insert starts at or past the end
	 * of the last used record.
	 */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[i];

	if (cpos >=
	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
		goto set_tail_append;

	return;

set_tail_append:
	insert->ins_appending = APPEND_TAIL;
}

/*
 * Helper function called at the beginning of an insert.
 *
 * This computes a few things that are commonly used in the process of
 * inserting into the btree:
 *   - Whether the new extent is contiguous with an existing one.
 *   - The current tree depth.
 *   - Whether the insert is an appending one.
 *   - The total # of free records in the tree.
 *
 * All of the information is stored on the ocfs2_insert_type
 * structure.
 */
static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
				    struct buffer_head **last_eb_bh,
				    struct ocfs2_extent_rec *insert_rec,
				    int *free_records,
				    struct ocfs2_insert_type *insert)
{
	int ret;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *path = NULL;
	struct buffer_head *bh = NULL;

	insert->ins_split = SPLIT_NONE;

	el = et->et_root_el;
	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);

	if (el->l_tree_depth) {
		/*
		 * If we have tree depth, we read in the
		 * rightmost extent block ahead of time as
		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
		 * may want it later.
		 */
		ret = ocfs2_read_extent_block(et->et_ci,
					      ocfs2_et_get_last_eb_blk(et),
					      &bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
	}

	/*
	 * Unless we have a contiguous insert, we'll need to know if
	 * there is room left in our allocation tree for another
	 * extent record.
	 *
	 * XXX: This test is simplistic, we can search for empty
	 * extent records too.
	 */
	*free_records = le16_to_cpu(el->l_count) -
		le16_to_cpu(el->l_next_free_rec);

	/* Depth 0: the root list is also the (only) leaf. */
	if (!insert->ins_tree_depth) {
		ocfs2_figure_contig_type(et, insert, el, insert_rec);
		ocfs2_figure_appending_type(insert, el, insert_rec);
		return 0;
	}

	path = ocfs2_new_path_from_et(et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * In the case that we're inserting past what the tree
	 * currently accounts for, ocfs2_find_path() will return for
	 * us the rightmost tree path. This is accounted for below in
	 * the appending code.
	 */
	ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);

	/*
	 * Now that we have the path, there's two things we want to determine:
	 * 1) Contiguousness (also set contig_index if this is so)
	 *
	 * 2) Are we doing an append? We can trivially break this up
	 *     into two types of appends: simple record append, or a
	 *     rotate inside the tail leaf.
	 */
	ocfs2_figure_contig_type(et, insert, el, insert_rec);

	/*
	 * The insert code isn't quite ready to deal with all cases of
	 * left contiguousness. Specifically, if it's an insert into
	 * the 1st record in a leaf, it will require the adjustment of
	 * cluster count on the last record of the path directly to its
	 * left. For now, just catch that case and fool the layers
	 * above us. This works just fine for tree_depth == 0, which
	 * is why we allow that above.
	 */
	if (insert->ins_contig == CONTIG_LEFT &&
	    insert->ins_contig_index == 0)
		insert->ins_contig = CONTIG_NONE;

	/*
	 * Ok, so we can simply compare against last_eb to figure out
	 * whether the path doesn't exist. This will only happen in
	 * the case that we're doing a tail append, so maybe we can
	 * take advantage of that information somehow.
	 */
	if (ocfs2_et_get_last_eb_blk(et) ==
	    path_leaf_bh(path)->b_blocknr) {
		/*
		 * Ok, ocfs2_find_path() returned us the rightmost
		 * tree path. This might be an appending insert. There are
		 * two cases:
		 *   1) We're doing a true append at the tail:
		 *	-This might even be off the end of the leaf
		 *   2) We're "appending" by rotating in the tail
		 */
		ocfs2_figure_appending_type(insert, el, insert_rec);
	}

out:
	ocfs2_free_path(path);

	/* On success, hand the pre-read rightmost leaf to the caller. */
	if (ret == 0)
		*last_eb_bh = bh;
	else
		brelse(bh);
	return ret;
}

/*
 * Insert an extent into a btree.
 *
 * The caller needs to update the owning btree's cluster count.
 */
int ocfs2_insert_extent(handle_t *handle,
			struct ocfs2_extent_tree *et,
			u32 cpos,
			u64 start_blk,
			u32 new_clusters,
			u8 flags,
			struct ocfs2_alloc_context *meta_ac)
{
	int status;
	int free_records;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_insert_type insert = {0, };
	struct ocfs2_extent_rec rec;

	trace_ocfs2_insert_extent_start(
		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		cpos, new_clusters);

	/* Build the new on-disk record from the caller's parameters. */
	memset(&rec, 0, sizeof(rec));
	rec.e_cpos = cpu_to_le32(cpos);
	rec.e_blkno = cpu_to_le64(start_blk);
	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
	rec.e_flags = flags;
	status = ocfs2_et_insert_check(et, &rec);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
					  &free_records, &insert);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
				  insert.ins_contig_index, free_records,
				  insert.ins_tree_depth);

	/*
	 * No room for a non-contiguous record: add a branch (possibly a
	 * whole new level) before doing the insert.
	 */
	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
		status = ocfs2_grow_tree(handle, et,
					 &insert.ins_tree_depth, &last_eb_bh,
					 meta_ac);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Finally, we can add clusters. This might rotate the tree for us. */
	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
	if (status < 0)
		mlog_errno(status);
	else
		ocfs2_et_extent_map_insert(et, &rec);

bail:
	brelse(last_eb_bh);

	return status;
}

/*
 * Allocate and add clusters into the extent b-tree.
 * The new clusters(clusters_to_add) will be inserted at logical_offset.
 * The extent b-tree's root is specified by et, and
 * it is not limited to the file storage. Any extent tree can use this
 * function if it implements the proper ocfs2_extent_tree.
 */
int ocfs2_add_clusters_in_btree(handle_t *handle,
				struct ocfs2_extent_tree *et,
				u32 *logical_offset,
				u32 clusters_to_add,
				int mark_unwritten,
				struct ocfs2_alloc_context *data_ac,
				struct ocfs2_alloc_context *meta_ac,
				enum ocfs2_alloc_restarted *reason_ret)
{
	int status = 0, err = 0;
	int need_free = 0;
	int free_extents;
	enum ocfs2_alloc_restarted reason = RESTART_NONE;
	u32 bit_off, num_bits;
	u64 block;
	u8 flags = 0;
	struct ocfs2_super *osb =
		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));

	BUG_ON(!clusters_to_add);

	if (mark_unwritten)
		flags = OCFS2_EXT_UNWRITTEN;

	free_extents = ocfs2_num_free_extents(et);
	if (free_extents < 0) {
		status = free_extents;
		mlog_errno(status);
		goto leave;
	}

	/* there are two cases which could cause us to EAGAIN in the
	 * we-need-more-metadata case:
	 * 1) we haven't reserved *any*
	 * 2) we are so fragmented, we've needed to add metadata too
	 *    many times. */
	if (!free_extents && !meta_ac) {
		err = -1;
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	} else if ((!free_extents)
		   && (ocfs2_alloc_context_bits_left(meta_ac)
		       < ocfs2_extend_meta_needed(et->et_root_el))) {
		err = -2;
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	}

	/* May return fewer bits than asked for (num_bits <= clusters_to_add). */
	status = __ocfs2_claim_clusters(handle, data_ac, 1,
					clusters_to_add, &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	BUG_ON(num_bits > clusters_to_add);

	/* reserve our write early -- insert_extent may update the tree root */
	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		need_free = 1;
		goto bail;
	}

	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_add_clusters_in_btree(
	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
	     bit_off, num_bits);
	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
				     num_bits, flags, meta_ac);
	if (status < 0) {
		mlog_errno(status);
		need_free = 1;
		goto bail;
	}

	ocfs2_journal_dirty(handle, et->et_root_bh);

	clusters_to_add -= num_bits;
	*logical_offset += num_bits;

	/* Partial allocation - tell the caller to restart the transaction. */
	if (clusters_to_add) {
		err = clusters_to_add;
		status = -EAGAIN;
		reason = RESTART_TRANS;
	}

bail:
	/* Give back the clusters we claimed but failed to insert. */
	if (need_free) {
		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
					bit_off, num_bits);
		else
			ocfs2_free_clusters(handle,
					data_ac->ac_inode,
					data_ac->ac_bh,
					ocfs2_clusters_to_blocks(osb->sb, bit_off),
					num_bits);
	}

leave:
	if (reason_ret)
		*reason_ret = reason;
	trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
	return status;
}

/*
 * Set up 'split_rec' to describe the tail of 'rec', from 'cpos' to the
 * end of the record, with e_blkno advanced by the matching number of
 * blocks and the flags carried over.
 */
static void ocfs2_make_right_split_rec(struct super_block *sb,
				       struct ocfs2_extent_rec *split_rec,
				       u32 cpos,
				       struct ocfs2_extent_rec *rec)
{
	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);

	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));

	split_rec->e_cpos = cpu_to_le32(cpos);
	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);

	split_rec->e_blkno = rec->e_blkno;
	le64_add_cpu(&split_rec->e_blkno,
		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));

	split_rec->e_flags = rec->e_flags;
}

/*
 * Insert 'orig_split_rec' by splitting the existing record at
 * 'split_index'. A range in the interior of the record is handled as a
 * right split followed by a second, left-split pass (the 'leftright'
 * loop runs at most twice).
 */
static int ocfs2_split_and_insert(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_path *path,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	rightmost_el = et->et_root_el;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/* A split adds a record - grow the tree first if the leaf is full. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(handle, et,
				      &depth, last_eb_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_tree_depth = depth;

	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	if (split_rec.e_cpos == rec.e_cpos) {
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
					   &tmprec, insert_range, &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		u32 cpos;
		struct ocfs2_extent_list *el;

		do_leftright++;
		split_rec = *orig_split_rec;

		/* The right split moved things - re-find our record. */
		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(et->et_ci, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		if (split_index == -1) {
			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				    "Owner %llu has an extent at cpos %u which can no longer be found\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
				    cpos);
			ret = -EROFS;
			goto out;
		}
		goto leftright;
	}
out:

	return ret;
}

/*
 * Simplest split case: the new record covers the whole of the old one,
 * so overwrite it in place under journal protection.
 */
static int ocfs2_replace_extent_rec(handle_t *handle,
				    struct ocfs2_extent_tree *et,
				    struct ocfs2_path *path,
				    struct ocfs2_extent_list *el,
				    int split_index,
				    struct ocfs2_extent_rec *split_rec)
{
	int ret;

	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
					   path_num_items(path) - 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el->l_recs[split_index] = *split_rec;

	ocfs2_journal_dirty(handle, path_leaf_bh(path));
out:
	return ret;
}

/*
 * Split part or all of the extent record at split_index in the leaf
 * pointed to by path. Merge with the contiguous extent record if needed.
 *
 * Care is taken to handle contiguousness so as to not grow the tree.
 *
 * meta_ac is not strictly necessary - we only truly need it if growth
 * of the tree is required. All other cases will degrade into a less
 * optimal tree layout.
 *
 * last_eb_bh should be the rightmost leaf block for any extent
 * btree. Since a split may grow the tree or a merge might shrink it,
 * the caller cannot trust the contents of that buffer after this call.
 *
 * This code is optimized for readability - several passes might be
 * made over certain portions of the tree. All of those blocks will
 * have been brought into cache (and pinned via the journal), so the
 * extra overhead is not expressed in terms of disk reads.
 */
int ocfs2_split_extent(handle_t *handle,
		       struct ocfs2_extent_tree *et,
		       struct ocfs2_path *path,
		       int split_index,
		       struct ocfs2_extent_rec *split_rec,
		       struct ocfs2_alloc_context *meta_ac,
		       struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret = 0;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
	struct ocfs2_merge_ctxt ctxt;

	/* The split range must lie entirely within the existing record. */
	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
		ret = -EIO;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_figure_merge_contig_type(et, path, el,
					     split_index,
					     split_rec,
					     &ctxt);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * The core merge / split code wants to know how much room is
	 * left in this allocation tree, so we pass the
	 * rightmost extent list.
	 */
	if (path->p_tree_depth) {
		ret = ocfs2_read_extent_block(et->et_ci,
					      ocfs2_et_get_last_eb_blk(et),
					      &last_eb_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (rec->e_cpos == split_rec->e_cpos &&
	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
		ctxt.c_split_covers_rec = 1;
	else
		ctxt.c_split_covers_rec = 0;

	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);

	trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
				 ctxt.c_has_empty_extent,
				 ctxt.c_split_covers_rec);

	/* No neighbour to merge with: replace in place or split-and-insert. */
	if (ctxt.c_contig_type == CONTIG_NONE) {
		if (ctxt.c_split_covers_rec)
			ret = ocfs2_replace_extent_rec(handle, et, path, el,
						       split_index, split_rec);
		else
			ret = ocfs2_split_and_insert(handle, et, path,
						     &last_eb_bh, split_index,
						     split_rec, meta_ac);
		if (ret)
			mlog_errno(ret);
	} else {
		ret = ocfs2_try_to_merge_extent(handle, et, path,
						split_index, split_rec,
						dealloc, &ctxt);
		if (ret)
			mlog_errno(ret);
	}

out:
	brelse(last_eb_bh);
	return ret;
}

/*
 * Change the flags of the already-existing extent at cpos for len clusters.
 *
 * new_flags: the flags we want to set.
 * clear_flags: the flags we want to clear.
 * phys: the new physical offset we want this new extent starts from.
 *
 * If the existing extent is larger than the request, initiate a
 * split. An attempt will be made at merging with adjacent extents.
 *
 * The caller is responsible for passing down meta_ac if we'll need it.
 */
int ocfs2_change_extent_flag(handle_t *handle,
			     struct ocfs2_extent_tree *et,
			     u32 cpos, u32 len, u32 phys,
			     struct ocfs2_alloc_context *meta_ac,
			     struct ocfs2_cached_dealloc_ctxt *dealloc,
			     int new_flags, int clear_flags)
{
	int ret, index;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	left_path = ocfs2_new_path_from_et(et);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	el = path_leaf_el(left_path);

	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1) {
		ocfs2_error(sb,
			    "Owner %llu has an extent at cpos %u which can no longer be found\n",
			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
			    cpos);
		ret = -EROFS;
		goto out;
	}

	/*
	 * Sanity check the request against the current flags: setting a
	 * flag that is already set, or clearing one that isn't, means
	 * our view of the tree is inconsistent.
	 */
	ret = -EIO;
	rec = &el->l_recs[index];
	if (new_flags && (rec->e_flags & new_flags)) {
		mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
		     "extent that already had them\n",
		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		     new_flags);
		goto out;
	}

	if (clear_flags && !(rec->e_flags & clear_flags)) {
		mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
		     "extent that didn't have them\n",
		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		     clear_flags);
		goto out;
	}

	/* Build the replacement record and let the split code install it. */
	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
	split_rec.e_cpos = cpu_to_le32(cpos);
	split_rec.e_leaf_clusters = cpu_to_le16(len);
	split_rec.e_blkno = cpu_to_le64(start_blkno);
	split_rec.e_flags = rec->e_flags;
	if (new_flags)
		split_rec.e_flags |= new_flags;
	if (clear_flags)
		split_rec.e_flags &= ~clear_flags;

	ret = ocfs2_split_extent(handle, et, left_path,
				 index, &split_rec, meta_ac,
				 dealloc);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	return ret;

}

/*
 * Mark the already-existing extent at cpos as written for len clusters.
 * This removes the unwritten extent flag.
 *
 * If the existing extent is larger than the request, initiate a
 * split. An attempt will be made at merging with adjacent extents.
 *
 * The caller is responsible for passing down meta_ac if we'll need it.
 */
int ocfs2_mark_extent_written(struct inode *inode,
			      struct ocfs2_extent_tree *et,
			      handle_t *handle, u32 cpos, u32 len, u32 phys,
			      struct ocfs2_alloc_context *meta_ac,
			      struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;

	trace_ocfs2_mark_extent_written(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		cpos, len, phys);

	/* Unwritten extents on disk require the matching feature bit. */
	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
		ret = -EROFS;
		goto out;
	}

	/*
	 * XXX: This should be fixed up so that we just re-insert the
	 * next extent records.
	 */
	ocfs2_et_extent_map_truncate(et, 0);

	ret = ocfs2_change_extent_flag(handle, et, cpos,
				       len, phys, meta_ac, dealloc,
				       0, OCFS2_EXT_UNWRITTEN);
	if (ret)
		mlog_errno(ret);

out:
	return ret;
}

/*
 * Split the record at 'index' so that [new_range, end of record)
 * becomes an independent record, growing the tree first if the
 * rightmost leaf is already full.
 */
static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
			    struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
				   &split_rec, new_range, rec);

	depth = path->p_tree_depth;
	if (depth > 0) {
		ret = ocfs2_read_extent_block(et->et_ci,
					      ocfs2_et_get_last_eb_blk(et),
					      &last_eb_bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/* Room for journaling the whole path plus any tree growth. */
	credits = path->p_tree_depth +
		  ocfs2_extend_meta_needed(et->et_root_el);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	brelse(last_eb_bh);
	return ret;
}

/*
 * Remove the clusters [cpos, cpos + len) from the record at 'index' in
 * the leaf pointed to by 'path'. The range must share at least one edge
 * with the record - an interior range must be split by the caller first
 * (see the BUG() at the bottom).
 */
static int ocfs2_truncate_rec(handle_t *handle,
			      struct ocfs2_extent_tree *et,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	int is_rightmost_tree_rec = 0;
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		/* extend credit for ocfs2_remove_rightmost_path */
		ret = ocfs2_extend_rotate_transaction(handle, 0,
				jbd2_handle_buffer_credits(handle),
				path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Rotate the empty record out of the way first. */
		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to its left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our btree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path_from_path(path);
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(et->et_ci, left_path,
					      left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      jbd2_handle_buffer_credits(handle),
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		/* Whole-record removal. */
		int next_free;

		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(handle, et, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(handle, et, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n",
		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		int subtree_index;

		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
		ocfs2_complete_edge_insert(handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	return ret;
}

/*
 * Remove the cluster range [cpos, cpos + len) from the extent tree.
 * See the three-case discussion in the body: whole-record removal, an
 * edge shrink, or an interior range that first needs a right split.
 */
int ocfs2_remove_extent(handle_t *handle,
			struct ocfs2_extent_tree *et,
			u32 cpos, u32 len,
			struct ocfs2_alloc_context *meta_ac,
			struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, index;
	u32 rec_range, trunc_range;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *path = NULL;

	/*
	 * XXX: Why are we truncating to 0 instead of wherever this
	 * affects us?
	 */
	ocfs2_et_extent_map_truncate(et, 0);

	path = ocfs2_new_path_from_et(et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(et->et_ci, path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);
	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1) {
		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
			    "Owner %llu has an extent at cpos %u which can no longer be found\n",
			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
			    cpos);
		ret = -EROFS;
		goto out;
	}

	/*
	 * We have 3 cases of extent removal:
	 *   1) Range covers the entire extent rec
	 *   2) Range begins or ends on one edge of the extent rec
	 *   3) Range is in the middle of the extent rec (no shared edges)
	 *
	 * For case 1 we remove the extent rec and left rotate to
	 * fill the hole.
	 *
	 * For case 2 we just shrink the existing extent rec, with a
	 * tree update if the shrinking edge is also the edge of an
	 * extent block.
	 *
	 * For case 3 we do a right split to turn the extent rec into
	 * something case 2 can handle.
	 */
	rec = &el->l_recs[index];
	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);

	trace_ocfs2_remove_extent(
		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
		cpos, len, index, le32_to_cpu(rec->e_cpos),
		ocfs2_rec_clusters(el, rec));

	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
		/* Cases 1 and 2: the range shares an edge with the record. */
		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
					 cpos, len);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else {
		/* Case 3: split first, then truncate the left half. */
		ret = ocfs2_split_tree(handle, et, path, index,
				       trunc_range, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The split could have manipulated the tree enough to
		 * move the record location, so we have to look for it again.
		 */
		ocfs2_reinit_path(path, 1);

		ret = ocfs2_find_path(et->et_ci, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		index = ocfs2_search_extent_list(el, cpos);
		if (index == -1) {
			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				    "Owner %llu: split at cpos %u lost record\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
				    cpos);
			ret = -EROFS;
			goto out;
		}

		/*
		 * Double check our values here. If anything is fishy,
		 * it's easier to catch it at the top level.
		 */
		rec = &el->l_recs[index];
		rec_range = le32_to_cpu(rec->e_cpos) +
			ocfs2_rec_clusters(el, rec);
		if (rec_range != trunc_range) {
			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				    "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
				    cpos, len, le32_to_cpu(rec->e_cpos),
				    ocfs2_rec_clusters(el, rec));
			ret = -EROFS;
			goto out;
		}

		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
					 cpos, len);
		if (ret)
			mlog_errno(ret);
	}

out:
	ocfs2_free_path(path);
	return ret;
}

/*
 * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
 * same as ocfs2_lock_allocators(), except for it accepts a blocks
 * number to reserve some extra blocks, and it only handles meta
 * data allocations.
 *
 * Currently, only ocfs2_remove_btree_range() uses it for truncating
 * and punching holes.
 */
static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
					      struct ocfs2_extent_tree *et,
					      u32 extents_to_split,
					      struct ocfs2_alloc_context **ac,
					      int extra_blocks)
{
	int ret = 0, num_free_extents;
	unsigned int max_recs_needed = 2 * extents_to_split;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* *ac is only set on success; cleaned up below on error. */
	*ac = NULL;

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * If the tree has no free record slots (or, on sparse file
	 * systems, too few for a worst-case split), reserve enough
	 * blocks to grow the tree by one level.
	 */
	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

	if (extra_blocks) {
		ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
		if (ret < 0) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
		}
	}

out:
	if (ret) {
		if (*ac) {
			ocfs2_free_alloc_context(*ac);
			*ac = NULL;
		}
	}

	return ret;
}

/*
 * Remove the clusters at (cpos, len) from the extent tree and release
 * them, either to the truncate log or (for refcounted extents) via the
 * refcount tree. phys_cpos is the physical start cluster; it may be 0
 * for an unwritten/hole-backed range, in which case nothing is freed.
 *
 * Takes the truncate log inode lock for the duration of the transaction.
 * Returns 0 on success or a negative errno.
 */
int ocfs2_remove_btree_range(struct inode *inode,
			     struct ocfs2_extent_tree *et,
			     u32 cpos, u32 phys_cpos, u32 len, int flags,
			     struct ocfs2_cached_dealloc_ctxt *dealloc,
			     u64 refcount_loc, bool refcount_tree_locked)
{
	int ret, credits = 0, extra_blocks = 0;
	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	handle_t *handle;
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_refcount_tree *ref_tree = NULL;

	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
		BUG_ON(!ocfs2_is_refcount_inode(inode));

		/* Caller may already hold the refcount tree locked. */
		if (!refcount_tree_locked) {
			ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
						       &ref_tree, NULL);
			if (ret) {
				mlog_errno(ret);
				goto bail;
			}
		}

		/*
		 * Refcounted extents may need extra journal credits and
		 * metadata blocks for the refcount decrease below.
		 */
		ret = ocfs2_prepare_refcount_change_for_del(inode,
							    refcount_loc,
							    phys_blkno,
							    len,
							    &credits,
							    &extra_blocks);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
	}

	ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
						 extra_blocks);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	inode_lock(tl_inode);

	/* Make room in the truncate log before starting our transaction. */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
			ocfs2_remove_extent_credits(osb->sb) + credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_et_root_journal_access(handle, et,
					   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	dquot_free_space_nodirty(inode,
				 ocfs2_clusters_to_bytes(inode->i_sb, len));

	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ocfs2_et_update_clusters(et, -len);
	ocfs2_update_inode_fsync_trans(handle, inode, 1);

	ocfs2_journal_dirty(handle, et->et_root_bh);

	/* phys_blkno == 0 means there are no clusters to release. */
	if (phys_blkno) {
		if (flags & OCFS2_EXT_REFCOUNTED)
			ret = ocfs2_decrease_refcount(inode, handle,
					ocfs2_blocks_to_clusters(osb->sb,
								 phys_blkno),
					len, meta_ac,
					dealloc, 1);
		else
			ret = ocfs2_truncate_log_append(osb, handle,
							phys_blkno, len);
		if (ret)
			mlog_errno(ret);

	}

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	inode_unlock(tl_inode);
bail:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}

/*
 * Return nonzero when the truncate log is full and must be flushed
 * before another record can be appended.
 */
int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
{
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;

	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
			"slot %d, invalid truncate log parameters: used = "
			"%u, count = %u\n", osb->slot_num,
			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
}

/*
 * Return nonzero when new_start directly follows the last record in
 * the truncate log, in which case the two can be merged into one.
 */
static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
					   unsigned int new_start)
{
	unsigned int tail_index;
	unsigned int current_tail;

	/* No records, nothing to coalesce */
	if (!le16_to_cpu(tl->tl_used))
		return 0;

	tail_index = le16_to_cpu(tl->tl_used) - 1;
	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);

	return current_tail == new_start;
}

/*
 * Record (start_blk, num_clusters) in the truncate log for deferred
 * freeing. Caller must hold tl_inode's lock and must have flushed the
 * log (ocfs2_truncate_log_needs_flush()) if it was full; returns
 * -ENOSPC otherwise. Coalesces with the tail record when contiguous.
 */
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
			      handle_t *handle,
			      u64 start_blk,
			      unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	BUG_ON(inode_trylock(tl_inode));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
	 * by the underlying call to ocfs2_read_inode_block(), so any
	 * corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_truncate_log_append(
		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
		start_cluster, num_clusters);
	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		trace_ocfs2_truncate_log_append(
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			index, le32_to_cpu(tl->tl_recs[index].t_start),
			num_clusters);
	} else {
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	ocfs2_journal_dirty(handle, tl_bh);

	osb->truncated_clusters += num_clusters;
bail:
	return status;
}

/*
 * Walk the truncate log from the tail, freeing each record's clusters
 * back to the global bitmap. Each record gets its own transaction so
 * that a crash mid-replay loses at most the record being processed.
 */
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	handle_t *handle;

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail;
		}

		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			ocfs2_commit_trans(osb, handle);
			mlog_errno(status);
			goto bail;
		}

		/* Shrink the log before freeing, inside the same
		 * transaction, so the record can't be replayed twice. */
		tl->tl_used = cpu_to_le16(i);

		ocfs2_journal_dirty(handle, tl_bh);

		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						     le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			trace_ocfs2_replay_truncate_records(
				(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
				i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				ocfs2_commit_trans(osb, handle);
				mlog_errno(status);
				goto bail;
			}
		}

		ocfs2_commit_trans(osb, handle);
		i--;
	}

	osb->truncated_clusters = 0;

bail:
	return status;
}

/* Expects you to already be holding tl_inode->i_rwsem */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct ocfs2_journal *journal = osb->journal;

	BUG_ON(inode_trylock(tl_inode));

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
	 * by the underlying call to ocfs2_read_inode_block(), so any
	 * corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	num_to_flush = le16_to_cpu(tl->tl_used);
	trace_ocfs2_flush_truncate_log(
		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
		num_to_flush);
	if (!num_to_flush) {
		status = 0;
		goto out;
	}

	/* Appending truncate log(TA) and flushing truncate log(TF) are
	 * two separate transactions. They can both be committed but not
	 * checkpointed. If a crash occurs then, both transactions will
	 * be replayed with several clusters already released to the
	 * global bitmap. Then the truncate log will be replayed,
	 * resulting in cluster double free. Force a full journal flush
	 * first to prevent that.
	 */
	jbd2_journal_lock_updates(journal->j_journal);
	status = jbd2_journal_flush(journal->j_journal, 0);
	jbd2_journal_unlock_updates(journal->j_journal);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	inode_lock(data_alloc_inode);

	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	brelse(data_alloc_bh);
	ocfs2_inode_unlock(data_alloc_inode, 1);

out_mutex:
	inode_unlock(data_alloc_inode);
	iput(data_alloc_inode);

out:
	return status;
}

/* Locking wrapper around __ocfs2_flush_truncate_log(). */
int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = osb->osb_tl_inode;

	inode_lock(tl_inode);
	status = __ocfs2_flush_truncate_log(osb);
	inode_unlock(tl_inode);

	return status;
}

/* Delayed-work callback: flush the truncate log in the background. */
static void ocfs2_truncate_log_worker(struct work_struct *work)
{
	int status;
	struct ocfs2_super *osb =
		container_of(work, struct ocfs2_super,
			     osb_truncate_log_wq.work);

	status = ocfs2_flush_truncate_log(osb);
	if (status < 0)
		mlog_errno(status);
	else
		ocfs2_init_steal_slots(osb);
}

#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
/*
 * (Re)arm the delayed truncate log flush. With cancel set, any
 * pending work is cancelled first, pushing the flush further out.
 * No-op once osb_tl_disable has been set at shutdown.
 */
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
				       int cancel)
{
	if (osb->osb_tl_inode &&
			atomic_read(&osb->osb_tl_disable) == 0) {
		/* We want to push off log flushes while truncates are
		 * still running. */
		if (cancel)
			cancel_delayed_work(&osb->osb_truncate_log_wq);

		queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
	}
}

/*
 * Try to flush truncate logs if we can free enough clusters from it.
 * As for return value, "< 0" means error, "0" no space and "1" means
 * we have freed enough spaces and let the caller try to allocate again.
 */
int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
				   unsigned int needed)
{
	tid_t target;
	int ret = 0;
	unsigned int truncated_clusters;

	inode_lock(osb->osb_tl_inode);
	truncated_clusters = osb->truncated_clusters;
	inode_unlock(osb->osb_tl_inode);

	/*
	 * Check whether we can succeed in allocating if we free
	 * the truncate log.
	 */
	if (truncated_clusters < needed)
		goto out;

	ret = ocfs2_flush_truncate_log(osb);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Wait for the freeing transactions to commit so the space is
	 * actually usable by the caller. */
	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
		jbd2_log_wait_commit(osb->journal->j_journal, target);
		ret = 1;
	}
out:
	return ret;
}

/*
 * Load and sanity-check a slot's truncate log inode and its dinode
 * buffer. On success the caller owns a reference to *tl_inode and
 * *tl_bh. Returns -EFSCORRUPTED when the on-disk record counts are
 * inconsistent.
 */
static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
				       int slot_num,
				       struct inode **tl_inode,
				       struct buffer_head **tl_bh)
{
	int status;
	struct inode *inode = NULL;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	unsigned int tl_count, tl_used;

	inode = ocfs2_get_system_file_inode(osb,
					    TRUNCATE_LOG_SYSTEM_INODE,
					    slot_num);
	if (!inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
		goto bail;
	}

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		iput(inode);
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *)bh->b_data;
	tl = &di->id2.i_dealloc;
	tl_count = le16_to_cpu(tl->tl_count);
	tl_used = le16_to_cpu(tl->tl_used);
	if (unlikely(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
		     tl_count == 0 ||
		     tl_used > tl_count)) {
		status = -EFSCORRUPTED;
		iput(inode);
		brelse(bh);
		mlog_errno(status);
		goto bail;
	}

	*tl_inode = inode;
	*tl_bh = bh;
bail:
	return status;
}

/* called during the 1st stage of node recovery. we stamp a clean
 * truncate log and pass back a copy for processing later. if the
 * truncate log does not require processing, a *tl_copy is set to
 * NULL.
 */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	trace_ocfs2_begin_truncate_log_recovery(slot_num);

	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
	 * validated by the underlying call to ocfs2_read_inode_block(),
	 * so any corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	if (le16_to_cpu(tl->tl_used)) {
		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));

		/*
		 * Assuming the write-out below goes well, this copy will be
		 * passed back to recovery for processing. Caller frees it
		 * (see the error path below and the recovery code).
		 */
		*tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	iput(tl_inode);
	brelse(tl_bh);

	if (status < 0) {
		kfree(*tl_copy);
		*tl_copy = NULL;
		mlog_errno(status);
	}

	return status;
}

/*
 * Second stage of truncate log recovery: replay the records captured
 * by ocfs2_begin_truncate_log_recovery() for a dead node into our own
 * truncate log, flushing ours whenever it fills up. Refuses to
 * "recover" our own log.
 */
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	trace_ocfs2_complete_truncate_log_recovery(
		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
		num_recs);

	inode_lock(tl_inode);
	for(i = 0; i < num_recs; i++) {
		/* Our own log may fill while appending; flush it first. */
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	inode_unlock(tl_inode);

	return status;
}

/*
 * Unmount-time teardown: disable rescheduling, cancel pending delayed
 * work, do a final flush, and drop the truncate log inode/buffer refs
 * taken by ocfs2_truncate_log_init().
 */
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = osb->osb_tl_inode;

	atomic_set(&osb->osb_tl_disable, 1);

	if (tl_inode) {
		cancel_delayed_work(&osb->osb_truncate_log_wq);
		flush_workqueue(osb->ocfs2_wq);

		status = ocfs2_flush_truncate_log(osb);
		if (status < 0)
			mlog_errno(status);

		brelse(osb->osb_tl_bh);
		iput(osb->osb_tl_inode);
	}
}

/*
 * Mount-time setup: load this slot's truncate log and arm the delayed
 * flush machinery. On failure tl_inode/tl_bh stay NULL, which
 * ocfs2_truncate_log_shutdown() keys on.
 */
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	status = ocfs2_get_truncate_log_info(osb,
					     osb->slot_num,
					     &tl_inode,
					     &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	atomic_set(&osb->osb_tl_disable, 0);
	osb->osb_tl_bh    = tl_bh;
	osb->osb_tl_inode = tl_inode;

	return status;
}

/*
 * Delayed de-allocation of suballocator blocks.
 *
 * Some sets of block de-allocations might involve multiple suballocator inodes.
 *
 * The locking for this can get extremely complicated, especially when
 * the suballocator inodes to delete from aren't known until deep
 * within an unrelated codepath.
 *
 * ocfs2_extent_block structures are a good example of this - an inode
 * btree could have been grown by any number of nodes each allocating
 * out of their own suballoc inode.
 *
 * These structures allow the delay of block de-allocation until a
 * later time, when locking of multiple cluster inodes won't cause
 * deadlock.
6396 */ 6397 6398 /* 6399 * Describe a single bit freed from a suballocator. For the block 6400 * suballocators, it represents one block. For the global cluster 6401 * allocator, it represents some clusters and free_bit indicates 6402 * clusters number. 6403 */ 6404 struct ocfs2_cached_block_free { 6405 struct ocfs2_cached_block_free *free_next; 6406 u64 free_bg; 6407 u64 free_blk; 6408 unsigned int free_bit; 6409 }; 6410 6411 struct ocfs2_per_slot_free_list { 6412 struct ocfs2_per_slot_free_list *f_next_suballocator; 6413 int f_inode_type; 6414 int f_slot; 6415 struct ocfs2_cached_block_free *f_first; 6416 }; 6417 6418 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, 6419 int sysfile_type, 6420 int slot, 6421 struct ocfs2_cached_block_free *head) 6422 { 6423 int ret; 6424 u64 bg_blkno; 6425 handle_t *handle; 6426 struct inode *inode; 6427 struct buffer_head *di_bh = NULL; 6428 struct ocfs2_cached_block_free *tmp; 6429 6430 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); 6431 if (!inode) { 6432 ret = -EINVAL; 6433 mlog_errno(ret); 6434 goto out; 6435 } 6436 6437 inode_lock(inode); 6438 6439 ret = ocfs2_inode_lock(inode, &di_bh, 1); 6440 if (ret) { 6441 mlog_errno(ret); 6442 goto out_mutex; 6443 } 6444 6445 while (head) { 6446 if (head->free_bg) 6447 bg_blkno = head->free_bg; 6448 else 6449 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6450 head->free_bit); 6451 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); 6452 if (IS_ERR(handle)) { 6453 ret = PTR_ERR(handle); 6454 mlog_errno(ret); 6455 goto out_unlock; 6456 } 6457 6458 trace_ocfs2_free_cached_blocks( 6459 (unsigned long long)head->free_blk, head->free_bit); 6460 6461 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 6462 head->free_bit, bg_blkno, 1); 6463 if (ret) 6464 mlog_errno(ret); 6465 6466 ocfs2_commit_trans(osb, handle); 6467 6468 tmp = head; 6469 head = head->free_next; 6470 kfree(tmp); 6471 } 6472 6473 out_unlock: 6474 ocfs2_inode_unlock(inode, 1); 6475 
brelse(di_bh); 6476 out_mutex: 6477 inode_unlock(inode); 6478 iput(inode); 6479 out: 6480 while(head) { 6481 /* Premature exit may have left some dangling items. */ 6482 tmp = head; 6483 head = head->free_next; 6484 kfree(tmp); 6485 } 6486 6487 return ret; 6488 } 6489 6490 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6491 u64 blkno, unsigned int bit) 6492 { 6493 int ret = 0; 6494 struct ocfs2_cached_block_free *item; 6495 6496 item = kzalloc(sizeof(*item), GFP_NOFS); 6497 if (item == NULL) { 6498 ret = -ENOMEM; 6499 mlog_errno(ret); 6500 return ret; 6501 } 6502 6503 trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit); 6504 6505 item->free_blk = blkno; 6506 item->free_bit = bit; 6507 item->free_next = ctxt->c_global_allocator; 6508 6509 ctxt->c_global_allocator = item; 6510 return ret; 6511 } 6512 6513 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, 6514 struct ocfs2_cached_block_free *head) 6515 { 6516 struct ocfs2_cached_block_free *tmp; 6517 struct inode *tl_inode = osb->osb_tl_inode; 6518 handle_t *handle; 6519 int ret = 0; 6520 6521 inode_lock(tl_inode); 6522 6523 while (head) { 6524 if (ocfs2_truncate_log_needs_flush(osb)) { 6525 ret = __ocfs2_flush_truncate_log(osb); 6526 if (ret < 0) { 6527 mlog_errno(ret); 6528 break; 6529 } 6530 } 6531 6532 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6533 if (IS_ERR(handle)) { 6534 ret = PTR_ERR(handle); 6535 mlog_errno(ret); 6536 break; 6537 } 6538 6539 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, 6540 head->free_bit); 6541 6542 ocfs2_commit_trans(osb, handle); 6543 tmp = head; 6544 head = head->free_next; 6545 kfree(tmp); 6546 6547 if (ret < 0) { 6548 mlog_errno(ret); 6549 break; 6550 } 6551 } 6552 6553 inode_unlock(tl_inode); 6554 6555 while (head) { 6556 /* Premature exit may have left some dangling items. 
*/ 6557 tmp = head; 6558 head = head->free_next; 6559 kfree(tmp); 6560 } 6561 6562 return ret; 6563 } 6564 6565 int ocfs2_run_deallocs(struct ocfs2_super *osb, 6566 struct ocfs2_cached_dealloc_ctxt *ctxt) 6567 { 6568 int ret = 0, ret2; 6569 struct ocfs2_per_slot_free_list *fl; 6570 6571 if (!ctxt) 6572 return 0; 6573 6574 while (ctxt->c_first_suballocator) { 6575 fl = ctxt->c_first_suballocator; 6576 6577 if (fl->f_first) { 6578 trace_ocfs2_run_deallocs(fl->f_inode_type, 6579 fl->f_slot); 6580 ret2 = ocfs2_free_cached_blocks(osb, 6581 fl->f_inode_type, 6582 fl->f_slot, 6583 fl->f_first); 6584 if (ret2) 6585 mlog_errno(ret2); 6586 if (!ret) 6587 ret = ret2; 6588 } 6589 6590 ctxt->c_first_suballocator = fl->f_next_suballocator; 6591 kfree(fl); 6592 } 6593 6594 if (ctxt->c_global_allocator) { 6595 ret2 = ocfs2_free_cached_clusters(osb, 6596 ctxt->c_global_allocator); 6597 if (ret2) 6598 mlog_errno(ret2); 6599 if (!ret) 6600 ret = ret2; 6601 6602 ctxt->c_global_allocator = NULL; 6603 } 6604 6605 return ret; 6606 } 6607 6608 static struct ocfs2_per_slot_free_list * 6609 ocfs2_find_per_slot_free_list(int type, 6610 int slot, 6611 struct ocfs2_cached_dealloc_ctxt *ctxt) 6612 { 6613 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 6614 6615 while (fl) { 6616 if (fl->f_inode_type == type && fl->f_slot == slot) 6617 return fl; 6618 6619 fl = fl->f_next_suballocator; 6620 } 6621 6622 fl = kmalloc(sizeof(*fl), GFP_NOFS); 6623 if (fl) { 6624 fl->f_inode_type = type; 6625 fl->f_slot = slot; 6626 fl->f_first = NULL; 6627 fl->f_next_suballocator = ctxt->c_first_suballocator; 6628 6629 ctxt->c_first_suballocator = fl; 6630 } 6631 return fl; 6632 } 6633 6634 static struct ocfs2_per_slot_free_list * 6635 ocfs2_find_preferred_free_list(int type, 6636 int preferred_slot, 6637 int *real_slot, 6638 struct ocfs2_cached_dealloc_ctxt *ctxt) 6639 { 6640 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 6641 6642 while (fl) { 6643 if (fl->f_inode_type == type 
&& fl->f_slot == preferred_slot) { 6644 *real_slot = fl->f_slot; 6645 return fl; 6646 } 6647 6648 fl = fl->f_next_suballocator; 6649 } 6650 6651 /* If we can't find any free list matching preferred slot, just use 6652 * the first one. 6653 */ 6654 fl = ctxt->c_first_suballocator; 6655 *real_slot = fl->f_slot; 6656 6657 return fl; 6658 } 6659 6660 /* Return Value 1 indicates empty */ 6661 static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) 6662 { 6663 struct ocfs2_per_slot_free_list *fl = NULL; 6664 6665 if (!et->et_dealloc) 6666 return 1; 6667 6668 fl = et->et_dealloc->c_first_suballocator; 6669 if (!fl) 6670 return 1; 6671 6672 if (!fl->f_first) 6673 return 1; 6674 6675 return 0; 6676 } 6677 6678 /* If extent was deleted from tree due to extent rotation and merging, and 6679 * no metadata is reserved ahead of time. Try to reuse some extents 6680 * just deleted. This is only used to reuse extent blocks. 6681 * It is supposed to find enough extent blocks in dealloc if our estimation 6682 * on metadata is accurate. 6683 */ 6684 static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, 6685 struct ocfs2_extent_tree *et, 6686 struct buffer_head **new_eb_bh, 6687 int blk_wanted, int *blk_given) 6688 { 6689 int i, status = 0, real_slot; 6690 struct ocfs2_cached_dealloc_ctxt *dealloc; 6691 struct ocfs2_per_slot_free_list *fl; 6692 struct ocfs2_cached_block_free *bf; 6693 struct ocfs2_extent_block *eb; 6694 struct ocfs2_super *osb = 6695 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 6696 6697 *blk_given = 0; 6698 6699 /* If extent tree doesn't have a dealloc, this is not faulty. Just 6700 * tell upper caller dealloc can't provide any block and it should 6701 * ask for alloc to claim more space. 
6702 */ 6703 dealloc = et->et_dealloc; 6704 if (!dealloc) 6705 goto bail; 6706 6707 for (i = 0; i < blk_wanted; i++) { 6708 /* Prefer to use local slot */ 6709 fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, 6710 osb->slot_num, &real_slot, 6711 dealloc); 6712 /* If no more block can be reused, we should claim more 6713 * from alloc. Just return here normally. 6714 */ 6715 if (!fl) { 6716 status = 0; 6717 break; 6718 } 6719 6720 bf = fl->f_first; 6721 fl->f_first = bf->free_next; 6722 6723 new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); 6724 if (new_eb_bh[i] == NULL) { 6725 status = -ENOMEM; 6726 mlog_errno(status); 6727 goto bail; 6728 } 6729 6730 mlog(0, "Reusing block(%llu) from " 6731 "dealloc(local slot:%d, real slot:%d)\n", 6732 bf->free_blk, osb->slot_num, real_slot); 6733 6734 ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); 6735 6736 status = ocfs2_journal_access_eb(handle, et->et_ci, 6737 new_eb_bh[i], 6738 OCFS2_JOURNAL_ACCESS_CREATE); 6739 if (status < 0) { 6740 mlog_errno(status); 6741 goto bail; 6742 } 6743 6744 memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); 6745 eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; 6746 6747 /* We can't guarantee that buffer head is still cached, so 6748 * polutlate the extent block again. 6749 */ 6750 strscpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 6751 eb->h_blkno = cpu_to_le64(bf->free_blk); 6752 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 6753 eb->h_suballoc_slot = cpu_to_le16(real_slot); 6754 eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); 6755 eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); 6756 eb->h_list.l_count = 6757 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 6758 6759 /* We'll also be dirtied by the caller, so 6760 * this isn't absolutely necessary. 
 */
		ocfs2_journal_dirty(handle, new_eb_bh[i]);

		/* Unlink the free-list entry once its last block is consumed */
		if (!fl->f_first) {
			dealloc->c_first_suballocator = fl->f_next_suballocator;
			kfree(fl);
		}
		kfree(bf);
	}

	*blk_given = i;

bail:
	if (unlikely(status < 0)) {
		for (i = 0; i < blk_wanted; i++)
			brelse(new_eb_bh[i]);
	}

	return status;
}

/*
 * Queue a suballocator bit for deferred freeing.  The item is prepended
 * to the per-(type, slot) free list inside @ctxt; the actual bitmap free
 * happens later when the cached dealloc context is processed.
 *
 * Returns 0 on success, -ENOMEM if the list or item allocation fails.
 * NOTE(review): on the item-allocation failure path an fl created by
 * ocfs2_find_per_slot_free_list() stays linked into @ctxt — presumably
 * it is reclaimed when the context is run/torn down; verify at callers.
 */
int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
			      int type, int slot, u64 suballoc,
			      u64 blkno, unsigned int bit)
{
	int ret;
	struct ocfs2_per_slot_free_list *fl;
	struct ocfs2_cached_block_free *item;

	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
	if (fl == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	item = kzalloc(sizeof(*item), GFP_NOFS);
	if (item == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_cache_block_dealloc(type, slot,
					(unsigned long long)suballoc,
					(unsigned long long)blkno, bit);

	item->free_bg = suballoc;
	item->free_blk = blkno;
	item->free_bit = bit;
	item->free_next = fl->f_first;

	fl->f_first = item;

	ret = 0;
out:
	return ret;
}

/*
 * Convenience wrapper: queue an extent block for deferred freeing by
 * pulling the suballocator coordinates out of its on-disk header.
 */
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
					 struct ocfs2_extent_block *eb)
{
	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
					 le16_to_cpu(eb->h_suballoc_slot),
					 le64_to_cpu(eb->h_suballoc_loc),
					 le64_to_cpu(eb->h_blkno),
					 le16_to_cpu(eb->h_suballoc_bit));
}

/* walk_page_buffers() callback: mark each buffer uptodate and dirty. */
static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}

/*
 * Map the blocks backing [from, to) of @folio, optionally zero that
 * segment, then dirty the buffers so they get written out.  In ordered
 * mode the byte range is also registered with the journal handle.
 * Map errors are logged but not fatal — zeroing proceeds regardless.
 */
void ocfs2_map_and_dirty_folio(struct inode *inode, handle_t *handle,
		size_t from, size_t to, struct folio *folio, int zero,
		u64 *phys)
{
	int ret, partial = 0;
	loff_t start_byte = folio_pos(folio) + from;
	loff_t length = to - from;

	ret = ocfs2_map_folio_blocks(folio, phys, inode, from, to, 0);
	if (ret)
		mlog_errno(ret);

	if (zero)
		folio_zero_segment(folio, from, to);

	/*
	 * Need to set the buffers we zero'd into uptodate
	 * here if they aren't - ocfs2_map_page_blocks()
	 * might've skipped some
	 */
	ret = walk_page_buffers(handle, folio_buffers(folio),
				from, to, &partial,
				ocfs2_zero_func);
	if (ret < 0)
		mlog_errno(ret);
	else if (ocfs2_should_order_data(inode)) {
		ret = ocfs2_jbd2_inode_add_write(handle, inode,
						 start_byte, length);
		if (ret < 0)
			mlog_errno(ret);
	}

	/* Only mark the whole folio uptodate if every buffer in it is */
	if (!partial)
		folio_mark_uptodate(folio);

	flush_dcache_folio(folio);
}

/*
 * Zero the byte range [start, end) across the locked folios in @folios,
 * then unlock and release them.  Only valid on sparse-file volumes
 * (non-sparse volumes zero at extend time instead — see the BUG_ON).
 */
static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start,
		loff_t end, struct folio **folios, int numfolios,
		u64 phys, handle_t *handle)
{
	int i;
	struct super_block *sb = inode->i_sb;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	if (numfolios == 0)
		goto out;

	for (i = 0; i < numfolios; i++) {
		struct folio *folio = folios[i];
		size_t to = folio_size(folio);
		size_t from = offset_in_folio(folio, start);

		/* Clamp the zero range to @end within the last folio */
		if (to > end - folio_pos(folio))
			to = end - folio_pos(folio);

		ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1,
					  &phys);

		start = folio_next_pos(folio);
	}
out:
	if (folios)
		ocfs2_unlock_and_free_folios(folios, numfolios);
}

/*
 * Grab and lock the folios covering [start, end), creating them in the
 * page cache as needed.  On success *num is the count of locked folios
 * stored in @folios; on failure everything grabbed so far is released
 * and *num is 0.
 */
static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,
			     struct folio **folios, int *num)
{
	int numfolios, ret = 0;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	loff_t last_page_bytes;

	BUG_ON(start > end);

	numfolios = 0;
	last_page_bytes = PAGE_ALIGN(end);
	index = start >>
		PAGE_SHIFT;
	do {
		folios[numfolios] = __filemap_get_folio(mapping, index,
				FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS);
		if (IS_ERR(folios[numfolios])) {
			ret = PTR_ERR(folios[numfolios]);
			mlog_errno(ret);
			folios[numfolios] = NULL;
			goto out;
		}

		index = folio_next_index(folios[numfolios]);
		numfolios++;
	} while (index < (last_page_bytes >> PAGE_SHIFT));

out:
	/* On error, drop everything we managed to grab */
	if (ret != 0) {
		ocfs2_unlock_and_free_folios(folios, numfolios);
		numfolios = 0;
	}

	*num = numfolios;

	return ret;
}

/*
 * Grab the folios for a range that must lie within a single cluster
 * (asserted below) — used for zeroing the tail of the allocation.
 */
static int ocfs2_grab_eof_folios(struct inode *inode, loff_t start, loff_t end,
				 struct folio **folios, int *num)
{
	struct super_block *sb = inode->i_sb;

	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);

	return ocfs2_grab_folios(inode, start, end, folios, num);
}

/*
 * Zero partial cluster for a hole punch or truncate. This avoids exposing
 * nonzero data on subsequent file extends.
 *
 * We need to call this before i_size is updated on the inode because
 * otherwise block_write_full_folio() will skip writeout of pages past
 * i_size.
 */
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
				  u64 range_start, u64 range_end)
{
	int ret = 0, numfolios;
	struct folio **folios = NULL;
	u64 phys;
	unsigned int ext_flags;
	struct super_block *sb = inode->i_sb;

	/*
	 * File systems which don't support sparse files zero on every
	 * extend.
	 */
	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
		return 0;

	/*
	 * Avoid zeroing folios fully beyond current i_size. It is pointless as
	 * underlying blocks of those folios should be already zeroed out and
	 * page writeback will skip them anyway.
	 */
	range_end = min_t(u64, range_end, i_size_read(inode));
	if (range_start >= range_end)
		return 0;

	folios = kcalloc(ocfs2_pages_per_cluster(sb),
			 sizeof(struct folio *), GFP_NOFS);
	if (folios == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_extent_map_get_blocks(inode,
					  range_start >> sb->s_blocksize_bits,
					  &phys, NULL, &ext_flags);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Tail is a hole, or is marked unwritten. In either case, we
	 * can count on read and write to return/push zero's.
	 */
	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
		goto out;

	ret = ocfs2_grab_eof_folios(inode, range_start, range_end, folios,
				    &numfolios);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* This also unlocks and releases the folios for us */
	ocfs2_zero_cluster_folios(inode, range_start, range_end, folios,
				  numfolios, phys, handle);

	/*
	 * Initiate writeout of the folios we zero'd here. We don't
	 * wait on them - the truncate_inode_pages() call later will
	 * do that for us.
	 */
	ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
				       range_end - 1);
	if (ret)
		mlog_errno(ret);

out:
	kfree(folios);

	return ret;
}

/*
 * Zero the id2 union of the dinode, being careful not to clobber any
 * inline xattr data stored at the tail of the inode block.
 */
static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
					     struct ocfs2_dinode *di)
{
	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);

	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
		memset(&di->id2, 0, blocksize -
				    offsetof(struct ocfs2_dinode, id2) -
				    xattrsize);
	else
		memset(&di->id2, 0, blocksize -
				    offsetof(struct ocfs2_dinode, id2));
}

/*
 * Reset the dinode to hold a fresh, empty extent list (depth 0, no
 * records), sized to whatever space the inline xattr area leaves free.
 */
void ocfs2_dinode_new_extent_list(struct inode *inode,
				  struct ocfs2_dinode *di)
{
	ocfs2_zero_dinode_id2_with_xattr(inode, di);
	di->id2.i_list.l_tree_depth = 0;
	di->id2.i_list.l_next_free_rec = 0;
	di->id2.i_list.l_count = cpu_to_le16(
		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
}

/*
 * Switch the inode to inline-data mode: set the feature flag (both
 * in-memory and on-disk) and initialize an empty inline-data area.
 */
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	/*
	 * We clear the entire i_data structure here so that all
	 * fields can be properly initialized.
	 */
	ocfs2_zero_dinode_id2_with_xattr(inode, di);

	idata->id_count = cpu_to_le16(
			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
}

/*
 * Convert an inline-data inode into an extent-based one: copy any
 * existing inline bytes out into a freshly allocated cluster, clear
 * OCFS2_INLINE_DATA_FL, install an empty extent list in the dinode and
 * (if there was data) insert the new cluster as the first extent.
 * Everything happens under a single transaction; on failure after the
 * cluster claim, the cluster is given back via the need_free path.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, has_data, num_folios = 0;
	int need_free = 0;
	u32 bit_off, num;
	handle_t *handle;
	u64 block;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct folio *folio = NULL;
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	has_data = i_size_read(inode) ? 1 : 0;

	/* Only need a data cluster if there are inline bytes to move */
	if (has_data) {
		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
				   ocfs2_inline_to_extents_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		unsigned int page_end = min_t(unsigned, PAGE_SIZE,
					      osb->s_clustersize);
		u64 phys;

		ret = dquot_alloc_space_nodirty(inode,
				       ocfs2_clusters_to_bytes(osb->sb, 1));
		if (ret)
			goto out_commit;
		did_quota = 1;

		data_ac->ac_resv = &oi->ip_la_data_resv;

		ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_folio() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		ret = ocfs2_grab_eof_folios(inode, 0, page_end, &folio,
					    &num_folios);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, folio, di_bh);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_unlock;
		}

		ocfs2_map_and_dirty_folio(inode, handle, 0, page_end, folio, 0,
					  &phys);
	}

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			need_free = 1;
			goto out_unlock;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_unlock:
	if (folio)
		ocfs2_unlock_and_free_folios(&folio, num_folios);

out_commit:
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(inode,
					 ocfs2_clusters_to_bytes(osb->sb, 1));

	/* Give back the cluster claimed above; need_free implies has_data,
	 * so data_ac is valid here. */
	if (need_free) {
		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
					bit_off, num);
		else
			ocfs2_free_clusters(handle,
					data_ac->ac_inode,
					data_ac->ac_bh,
					ocfs2_clusters_to_blocks(osb->sb, bit_off),
					num);
	}

	ocfs2_commit_trans(osb, handle);

out:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	return ret;
}

/*
 * It is expected, that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *di_bh)
{
	int status = 0, i, flags = 0;
	u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
	u64 blkno = 0;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_path *path = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_extent_list *root_el = &(di->id2.i_list);
	u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
	struct ocfs2_extent_tree et;
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct ocfs2_refcount_tree *ref_tree = NULL;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	/* First cluster offset that must NOT survive the truncate */
	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						    i_size_read(inode));

	path = ocfs2_new_path(di_bh, &di->id2.i_list,
			      ocfs2_journal_access_di);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	ocfs2_extent_map_trunc(inode, new_highest_cpos);

start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_commit_truncate(
		(unsigned long long)OCFS2_I(inode)->ip_blkno,
		new_highest_cpos,
		OCFS2_I(inode)->ip_clusters,
		path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	i = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[i];
	flags = rec->e_flags;
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	if (i == 0 && ocfs2_is_empty_extent(rec)) {
		/*
		 * Lower levels depend on this never happening, but it's best
		 * to check it up here before changing the tree.
		 */
		if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
			mlog(ML_ERROR, "Inode %lu has an empty "
			     "extent record, depth %u\n", inode->i_ino,
			     le16_to_cpu(root_el->l_tree_depth));
			status = ocfs2_remove_rightmost_empty_extent(osb,
					&et, path, &dealloc);
			if (status) {
				mlog_errno(status);
				goto bail;
			}

			ocfs2_reinit_path(path, 1);
			goto start;
		} else {
			trunc_cpos = le32_to_cpu(rec->e_cpos);
			trunc_len = 0;
			blkno = 0;
		}
	} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
		/*
		 * Truncate entire record.
		 */
		trunc_cpos = le32_to_cpu(rec->e_cpos);
		trunc_len = ocfs2_rec_clusters(el, rec);
		blkno = le64_to_cpu(rec->e_blkno);
	} else if (range > new_highest_cpos) {
		/*
		 * Partial truncate. it also should be
		 * the last truncate we're doing.
		 */
		trunc_cpos = new_highest_cpos;
		trunc_len = range - new_highest_cpos;
		coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
		blkno = le64_to_cpu(rec->e_blkno) +
			ocfs2_clusters_to_blocks(inode->i_sb, coff);
	} else {
		/*
		 * Truncate completed, leave happily.
		 */
		status = 0;
		goto bail;
	}

	phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);

	/* Refcounted extents need the refcount tree locked before removal;
	 * the lock is taken once and held across iterations. */
	if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
		status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
				&ref_tree, NULL);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
	}

	status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
					  phys_cpos, trunc_len, flags, &dealloc,
					  refcount_loc, true);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	ocfs2_reinit_path(path, 1);

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:
	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	ocfs2_schedule_truncate_log_flush(osb, 1);

	ocfs2_run_deallocs(osb, &dealloc);

	ocfs2_free_path(path);

	return status;
}

/*
 * 'start' is inclusive, 'end' is not.
 */
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
			  unsigned int start, unsigned int end, int trunc)
{
	int ret;
	unsigned int numbytes;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	/* No need to punch hole beyond i_size.
	 */
	if (start >= i_size_read(inode))
		return 0;

	if (end > i_size_read(inode))
		end = i_size_read(inode);

	BUG_ON(start > end);

	/* All three views of the inline-data flag must agree, or the
	 * inode is corrupt and we refuse to touch it. */
	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
	    !ocfs2_supports_inline_data(osb)) {
		ocfs2_error(inode->i_sb,
			    "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    le16_to_cpu(di->i_dyn_features),
			    OCFS2_I(inode)->ip_dyn_features,
			    osb->s_feature_incompat);
		ret = -EROFS;
		goto out;
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Zero the punched/truncated byte range in the inline area */
	numbytes = end - start;
	memset(idata->id_data + start, 0, numbytes);

	/*
	 * No need to worry about the data page here - it's been
	 * truncated already and inline data doesn't need it for
	 * pushing zero's to disk, so we'll let read_folio pick it up
	 * later.
	 */
	if (trunc) {
		i_size_write(inode, start);
		di->i_size = cpu_to_le64(start);
	}

	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

	di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));

	ocfs2_update_inode_fsync_trans(handle, inode, 1);
	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out:
	return ret;
}

/*
 * Issue a discard for @count clusters starting at cluster @start within
 * the cluster group @gd / @group.  Cluster offsets are converted to
 * block numbers before handing off to the block layer.
 */
static int ocfs2_trim_extent(struct super_block *sb,
			     struct ocfs2_group_desc *gd,
			     u64 group, u32 start, u32 count)
{
	u64 discard, bcount;
	struct ocfs2_super *osb = OCFS2_SB(sb);

	bcount = ocfs2_clusters_to_blocks(sb, count);
	discard = ocfs2_clusters_to_blocks(sb, start);

	/*
	 * For the first cluster group, the gd->bg_blkno is not at the start
	 * of the group, but at an offset from the start. If we add it while
	 * calculating discard for first group, we will wrongly start fstrim a
	 * few blocks after the desired start block and the range can cross
	 * over into the next cluster group. So, add it only if this is not
	 * the first cluster group.
	 */
	if (group != osb->first_cluster_group_blkno)
		discard += le64_to_cpu(gd->bg_blkno);

	trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);

	return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
}

/*
 * Walk the free bits of one cluster group in [start, max) and discard
 * each free run of at least @minbits clusters.  Returns the number of
 * clusters trimmed, or a negative errno (-ERESTARTSYS on a fatal
 * signal, or the first discard failure).
 */
static int ocfs2_trim_group(struct super_block *sb,
			    struct ocfs2_group_desc *gd, u64 group,
			    u32 start, u32 max, u32 minbits)
{
	int ret = 0, count = 0, next;
	void *bitmap = gd->bg_bitmap;

	/* Not enough free bits in this group to ever satisfy minbits */
	if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
		return 0;

	trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
			       start, max, minbits);

	while (start < max) {
		start = ocfs2_find_next_zero_bit(bitmap, max, start);
		if (start >= max)
			break;
		next = ocfs2_find_next_bit(bitmap, max, start);

		if ((next - start) >= minbits) {
			ret = ocfs2_trim_extent(sb, gd, group,
						start, next - start);
			if (ret < 0) {
				mlog_errno(ret);
				break;
			}
			count += next - start;
		}
		start = next + 1;

		if (fatal_signal_pending(current)) {
			count = -ERESTARTSYS;
			break;
		}

		/* Remaining untrimmed free bits can't form a minbits run */
		if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
			break;
	}

	if (ret < 0)
		count = ret;

	return count;
}

/*
 * Trim the main allocation bitmap over the byte range described by
 * @range.  One cluster group is processed per iteration; the global
 * bitmap locks are dropped between groups (see the comment near the
 * next_group loop) so other nodes' I/O isn't starved.  On return,
 * range->len is rewritten to the number of bytes actually trimmed.
 */
static
int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
	struct ocfs2_super *osb = OCFS2_SB(sb);
	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
	int ret, cnt;
	u32 first_bit, last_bit, minlen;
	struct buffer_head *main_bm_bh = NULL;
	struct inode *main_bm_inode = NULL;
	struct buffer_head *gd_bh = NULL;
	struct ocfs2_dinode *main_bm;
	struct ocfs2_group_desc *gd = NULL;

	/* Convert the byte-based request into cluster units */
	start = range->start >> osb->s_clustersize_bits;
	len = range->len >> osb->s_clustersize_bits;
	minlen = range->minlen >> osb->s_clustersize_bits;

	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
		return -EINVAL;

	trace_ocfs2_trim_mainbm(start, len, minlen);

next_group:
	main_bm_inode = ocfs2_get_system_file_inode(osb,
						    GLOBAL_BITMAP_SYSTEM_INODE,
						    OCFS2_INVALID_SLOT);
	if (!main_bm_inode) {
		ret = -EIO;
		mlog_errno(ret);
		goto out;
	}

	inode_lock(main_bm_inode);

	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_mutex;
	}
	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;

	/*
	 * Do some checks before trimming the first group.
	 */
	if (!group) {
		if (start >= le32_to_cpu(main_bm->i_clusters)) {
			ret = -EINVAL;
			goto out_unlock;
		}

		if (start + len > le32_to_cpu(main_bm->i_clusters))
			len = le32_to_cpu(main_bm->i_clusters) - start;

		/*
		 * Determine first and last group to examine based on
		 * start and len
		 */
		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
		if (first_group == osb->first_cluster_group_blkno)
			first_bit = start;
		else
			first_bit = start - ocfs2_blocks_to_clusters(sb,
								first_group);
		last_group = ocfs2_which_cluster_group(main_bm_inode,
						       start + len - 1);
		group = first_group;
	}

	/* Single-iteration block: handles exactly one group per lock hold */
	do {
		if (first_bit + len >= osb->bitmap_cpg)
			last_bit = osb->bitmap_cpg;
		else
			last_bit = first_bit + len;

		ret = ocfs2_read_group_descriptor(main_bm_inode,
						  main_bm, group,
						  &gd_bh);
		if (ret < 0) {
			mlog_errno(ret);
			break;
		}

		gd = (struct ocfs2_group_desc *)gd_bh->b_data;
		cnt = ocfs2_trim_group(sb, gd, group,
				       first_bit, last_bit, minlen);
		brelse(gd_bh);
		gd_bh = NULL;
		if (cnt < 0) {
			ret = cnt;
			mlog_errno(ret);
			break;
		}

		trimmed += cnt;
		len -= osb->bitmap_cpg - first_bit;
		first_bit = 0;
		/* Advance to the next group; the first group's descriptor
		 * doesn't sit at the group start, hence the special case. */
		if (group == osb->first_cluster_group_blkno)
			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
		else
			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
	} while (0);

out_unlock:
	ocfs2_inode_unlock(main_bm_inode, 0);
	brelse(main_bm_bh);
	main_bm_bh = NULL;
out_mutex:
	inode_unlock(main_bm_inode);
	iput(main_bm_inode);

	/*
	 * If all the groups trim are not done or failed, but we should release
	 * main_bm related locks for avoiding the current IO starve, then go to
	 * trim the next group
	 */
	if (ret >= 0 && group <= last_group) {
		cond_resched();
		goto next_group;
	}
out:
	range->len = trimmed * osb->s_clustersize;
	return ret;
}

/*
 * Cluster-aware FITRIM entry point.  Takes the cluster-wide trim lock;
 * if another node just completed an identical trim (same start/len/
 * minlen), reuse its result instead of issuing duplicate discards to
 * the shared device.  Otherwise run the trim locally and publish the
 * outcome via the lock's LVB info for other nodes.
 */
int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct ocfs2_trim_fs_info info, *pinfo = NULL;

	ocfs2_trim_fs_lock_res_init(osb);

	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);

	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
	if (ret < 0) {
		if (ret != -EAGAIN) {
			mlog_errno(ret);
			ocfs2_trim_fs_lock_res_uninit(osb);
			return ret;
		}

		/* Another node holds the lock: wait for it and inspect
		 * what it did. */
		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
		     "finish, which is running from another node.\n",
		     osb->dev_str);
		ret = ocfs2_trim_fs_lock(osb, &info, 0);
		if (ret < 0) {
			mlog_errno(ret);
			ocfs2_trim_fs_lock_res_uninit(osb);
			return ret;
		}

		if (info.tf_valid && info.tf_success &&
		    info.tf_start == range->start &&
		    info.tf_len == range->len &&
		    info.tf_minlen == range->minlen) {
			/* Avoid sending duplicated trim to a shared device */
			mlog(ML_NOTICE, "The same trim on device (%s) was "
			     "just done from node (%u), return.\n",
			     osb->dev_str, info.tf_nodenum);
			range->len = info.tf_trimlen;
			goto out;
		}
	}

	info.tf_nodenum = osb->node_num;
	info.tf_start = range->start;
	info.tf_len = range->len;
	info.tf_minlen = range->minlen;

	ret = ocfs2_trim_mainbm(sb, range);

	info.tf_trimlen = range->len;
	info.tf_success = (ret < 0 ? 0 : 1);
	pinfo = &info;
out:
	ocfs2_trim_fs_unlock(osb, pinfo);
	ocfs2_trim_fs_lock_res_uninit(osb);
	return ret;
}