/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * alloc.c
 *
 * Extent allocs and frees
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>

#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
#include "uptodate.h"
#include "xattr.h"
#include "refcounttree.h"

#include "buffer_head_io.h"

enum ocfs2_contig_type {
	CONTIG_NONE = 0,
	CONTIG_LEFT,
	CONTIG_RIGHT,
	CONTIG_LEFTRIGHT,
};

static enum ocfs2_contig_type
	ocfs2_extent_rec_contig(struct super_block *sb,
				struct ocfs2_extent_rec *ext,
				struct ocfs2_extent_rec *insert_rec);
/*
 * Operations for a specific extent tree type.
 *
 * To implement an on-disk btree (extent tree) type in ocfs2, add
 * an ocfs2_extent_tree_operations structure and the matching
 * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
 * for the allocation portion of the extent tree.
 */
struct ocfs2_extent_tree_operations {
	/*
	 * last_eb_blk is the block number of the rightmost leaf extent
	 * block.  Most on-disk structures containing an extent tree store
	 * this value for fast access.  The ->eo_set_last_eb_blk() and
	 * ->eo_get_last_eb_blk() operations access this value.  They are
	 * both required.
	 */
	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
				   u64 blkno);
	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);

	/*
	 * The on-disk structure usually keeps track of how many total
	 * clusters are stored in this extent tree.  This function updates
	 * that value.  new_clusters is the delta, and must be
	 * added to the total.  Required.
	 */
	void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
				   u32 new_clusters);

	/*
	 * If this extent tree is supported by an extent map, insert
	 * a record into the map.
	 */
	void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
				     struct ocfs2_extent_rec *rec);

	/*
	 * If this extent tree is supported by an extent map, truncate the
	 * map to clusters.
	 */
	void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
				       u32 clusters);

	/*
	 * If ->eo_insert_check() exists, it is called before rec is
	 * inserted into the extent tree.  It is optional.
	 */
	int (*eo_insert_check)(struct ocfs2_extent_tree *et,
			       struct ocfs2_extent_rec *rec);
	int (*eo_sanity_check)(struct ocfs2_extent_tree *et);

	/*
	 * --------------------------------------------------------------
	 * The remaining are internal to ocfs2_extent_tree and don't have
	 * accessor functions
	 */

	/*
	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
	 * It is required.
	 */
	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);

	/*
	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
	 * it exists.  If it does not, et->et_max_leaf_clusters is set
	 * to 0 (unlimited).  Optional.
	 */
	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);

	/*
	 * ->eo_extent_contig tests whether two ocfs2_extent_recs are
	 * contiguous.  Optional.  There is no need to set it if
	 * ocfs2_extent_rec is used as the tree leaf.
	 */
	enum ocfs2_contig_type
		(*eo_extent_contig)(struct ocfs2_extent_tree *et,
				    struct ocfs2_extent_rec *ext,
				    struct ocfs2_extent_rec *insert_rec);
};

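/*
 * Illustrative sketch only (not part of the original file): per the
 * comment above ocfs2_extent_tree_operations, a hypothetical new
 * on-disk structure "foo" would supply the required callbacks plus an
 * init function.  Every "foo" name below is made up for the example:
 *
 *	static void ocfs2_foo_fill_root_el(struct ocfs2_extent_tree *et)
 *	{
 *		struct ocfs2_foo_block *fb = et->et_object;
 *
 *		et->et_root_el = &fb->fb_list;
 *	}
 *
 *	static struct ocfs2_extent_tree_operations ocfs2_foo_et_ops = {
 *		.eo_set_last_eb_blk	= ocfs2_foo_set_last_eb_blk,
 *		.eo_get_last_eb_blk	= ocfs2_foo_get_last_eb_blk,
 *		.eo_update_clusters	= ocfs2_foo_update_clusters,
 *		.eo_fill_root_el	= ocfs2_foo_fill_root_el,
 *	};
 *
 * plus a matching ocfs2_init_foo_extent_tree() wrapper like the real
 * ones defined later in this file.
 */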
/*
 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
 * in the methods.
 */
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
					 u64 blkno);
static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
					 u32 clusters);
static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
					   struct ocfs2_extent_rec *rec);
static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
					     u32 clusters);
static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
				     struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_dinode_update_clusters,
	.eo_extent_map_insert	= ocfs2_dinode_extent_map_insert,
	.eo_extent_map_truncate	= ocfs2_dinode_extent_map_truncate,
	.eo_insert_check	= ocfs2_dinode_insert_check,
	.eo_sanity_check	= ocfs2_dinode_sanity_check,
	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
};

static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
					 u64 blkno)
{
	struct ocfs2_dinode *di = et->et_object;

	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
	di->i_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dinode *di = et->et_object;

	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
	return le64_to_cpu(di->i_last_eb_blk);
}

static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
					 u32 clusters)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
	struct ocfs2_dinode *di = et->et_object;

	le32_add_cpu(&di->i_clusters, clusters);
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = le32_to_cpu(di->i_clusters);
	spin_unlock(&oi->ip_lock);
}

static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
					   struct ocfs2_extent_rec *rec)
{
	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;

	ocfs2_extent_map_insert_rec(inode, rec);
}

static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
					     u32 clusters)
{
	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;

	ocfs2_extent_map_trunc(inode, clusters);
}

static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
				     struct ocfs2_extent_rec *rec)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
	struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);

	BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
			(oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
			"Device %s, asking for sparse allocation: inode %llu, "
			"cpos %u, clusters %u\n",
			osb->dev_str,
			(unsigned long long)oi->ip_blkno,
			rec->e_cpos, oi->ip_clusters);

	return 0;
}

static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dinode *di = et->et_object;

	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	return 0;
}

static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dinode *di = et->et_object;

	et->et_root_el = &di->id2.i_list;
}


static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	et->et_root_el = &vb->vb_xv->xr_list;
}

static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
					      u64 blkno)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}

static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
					      u32 clusters)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}

static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
};

static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_block *xb = et->et_object;

	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
}

static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
{
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
	et->et_max_leaf_clusters =
		ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
}

static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
					     u64 blkno)
{
	struct ocfs2_xattr_block *xb = et->et_object;
	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;

	xt->xt_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_block *xb = et->et_object;
	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;

	return le64_to_cpu(xt->xt_last_eb_blk);
}

static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
					     u32 clusters)
{
	struct ocfs2_xattr_block *xb = et->et_object;

	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}

static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
};

static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
					  u64 blkno)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	return le64_to_cpu(dx_root->dr_last_eb_blk);
}

static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
					  u32 clusters)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	le32_add_cpu(&dx_root->dr_clusters, clusters);
}

static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));

	return 0;
}

static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	et->et_root_el = &dx_root->dr_list;
}

static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
};

static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_refcount_block *rb = et->et_object;

	et->et_root_el = &rb->rf_list;
}

static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
						u64 blkno)
{
	struct ocfs2_refcount_block *rb = et->et_object;

	rb->rf_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_refcount_block *rb = et->et_object;

	return le64_to_cpu(rb->rf_last_eb_blk);
}

static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
						u32 clusters)
{
	struct ocfs2_refcount_block *rb = et->et_object;

	le32_add_cpu(&rb->rf_clusters, clusters);
}

static enum ocfs2_contig_type
ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
				  struct ocfs2_extent_rec *ext,
				  struct ocfs2_extent_rec *insert_rec)
{
	return CONTIG_NONE;
}

static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
	.eo_fill_root_el	= ocfs2_refcount_tree_fill_root_el,
	.eo_extent_contig	= ocfs2_refcount_tree_extent_contig,
};

static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *bh,
				     ocfs2_journal_access_func access,
				     void *obj,
				     struct ocfs2_extent_tree_operations *ops)
{
	et->et_ops = ops;
	et->et_root_bh = bh;
	et->et_ci = ci;
	et->et_root_journal_access = access;
	if (!obj)
		obj = (void *)bh->b_data;
	et->et_object = obj;

	et->et_ops->eo_fill_root_el(et);
	if (!et->et_ops->eo_fill_max_leaf_clusters)
		et->et_max_leaf_clusters = 0;
	else
		et->et_ops->eo_fill_max_leaf_clusters(et);
}

void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
				   struct ocfs2_caching_info *ci,
				   struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
				 NULL, &ocfs2_dinode_et_ops);
}

void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
				       struct ocfs2_caching_info *ci,
				       struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
				 NULL, &ocfs2_xattr_tree_et_ops);
}

void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
					struct ocfs2_caching_info *ci,
					struct ocfs2_xattr_value_buf *vb)
{
	__ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
				 &ocfs2_xattr_value_et_ops);
}

void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
				    struct ocfs2_caching_info *ci,
				    struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
				 NULL, &ocfs2_dx_root_et_ops);
}

void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
				 NULL, &ocfs2_refcount_tree_et_ops);
}
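/*
 * Illustrative sketch only (not part of the original file): a typical
 * caller stack-allocates the extent tree and points it at the root
 * buffer before using the generic extent code.  The inode/di_bh/osb
 * variables here are assumptions for the example:
 *
 *	struct ocfs2_extent_tree et;
 *
 *	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 *	free_extents = ocfs2_num_free_extents(osb, &et);
 *
 * No cleanup call is needed; the structure takes no references of its
 * own (the caller keeps di_bh pinned for the duration).
 */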
static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
					    u64 new_last_eb_blk)
{
	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
}

static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	return et->et_ops->eo_get_last_eb_blk(et);
}

static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
					    u32 clusters)
{
	et->et_ops->eo_update_clusters(et, clusters);
}

static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
					      struct ocfs2_extent_rec *rec)
{
	if (et->et_ops->eo_extent_map_insert)
		et->et_ops->eo_extent_map_insert(et, rec);
}

static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
						u32 clusters)
{
	if (et->et_ops->eo_extent_map_truncate)
		et->et_ops->eo_extent_map_truncate(et, clusters);
}

static inline int ocfs2_et_root_journal_access(handle_t *handle,
					       struct ocfs2_extent_tree *et,
					       int type)
{
	return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
					  type);
}

static inline enum ocfs2_contig_type
	ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
			       struct ocfs2_extent_rec *rec,
			       struct ocfs2_extent_rec *insert_rec)
{
	if (et->et_ops->eo_extent_contig)
		return et->et_ops->eo_extent_contig(et, rec, insert_rec);

	return ocfs2_extent_rec_contig(
				ocfs2_metadata_cache_get_super(et->et_ci),
				rec, insert_rec);
}

static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
					struct ocfs2_extent_rec *rec)
{
	int ret = 0;

	if (et->et_ops->eo_insert_check)
		ret = et->et_ops->eo_insert_check(et, rec);
	return ret;
}

static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
{
	int ret = 0;

	if (et->et_ops->eo_sanity_check)
		ret = et->et_ops->eo_sanity_check(et);
	return ret;
}

static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
					 struct ocfs2_extent_block *eb);
static void ocfs2_adjust_rightmost_records(handle_t *handle,
					   struct ocfs2_extent_tree *et,
					   struct ocfs2_path *path,
					   struct ocfs2_extent_rec *insert_rec);
/*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path.  Generally, this involves freeing the buffer
 * heads.
 */
void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
	int i, start = 0, depth = 0;
	struct ocfs2_path_item *node;

	if (keep_root)
		start = 1;

	for(i = start; i < path_num_items(path); i++) {
		node = &path->p_node[i];

		brelse(node->bh);
		node->bh = NULL;
		node->el = NULL;
	}

	/*
	 * Tree depth may change during truncate, or insert.  If we're
	 * keeping the root extent list, then make sure that our path
	 * structure reflects the proper depth.
	 */
	if (keep_root)
		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
	else
		path_root_access(path) = NULL;

	path->p_tree_depth = depth;
}

void ocfs2_free_path(struct ocfs2_path *path)
{
	if (path) {
		ocfs2_reinit_path(path, 0);
		kfree(path);
	}
}
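/*
 * Illustrative sketch only (not part of the original file): the usual
 * lifecycle of a path object, assuming an already-initialized extent
 * tree 'et' and a cluster offset 'cpos':
 *
 *	struct ocfs2_path *path;
 *
 *	path = ocfs2_new_path_from_et(&et);
 *	if (!path)
 *		return -ENOMEM;
 *	ret = ocfs2_find_path(et.et_ci, path, cpos);
 *	...
 *	ocfs2_free_path(path);
 *
 * ocfs2_reinit_path() may be used instead of freeing when the same
 * structure will be walked again.
 */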
680 */ 681 BUG_ON(index == 0); 682 683 path->p_node[index].bh = eb_bh; 684 path->p_node[index].el = &eb->h_list; 685 } 686 687 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 688 struct ocfs2_extent_list *root_el, 689 ocfs2_journal_access_func access) 690 { 691 struct ocfs2_path *path; 692 693 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 694 695 path = kzalloc(sizeof(*path), GFP_NOFS); 696 if (path) { 697 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 698 get_bh(root_bh); 699 path_root_bh(path) = root_bh; 700 path_root_el(path) = root_el; 701 path_root_access(path) = access; 702 } 703 704 return path; 705 } 706 707 struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) 708 { 709 return ocfs2_new_path(path_root_bh(path), path_root_el(path), 710 path_root_access(path)); 711 } 712 713 struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) 714 { 715 return ocfs2_new_path(et->et_root_bh, et->et_root_el, 716 et->et_root_journal_access); 717 } 718 719 /* 720 * Journal the buffer at depth idx. All idx>0 are extent_blocks, 721 * otherwise it's the root_access function. 722 * 723 * I don't like the way this function's name looks next to 724 * ocfs2_journal_access_path(), but I don't have a better one. 725 */ 726 int ocfs2_path_bh_journal_access(handle_t *handle, 727 struct ocfs2_caching_info *ci, 728 struct ocfs2_path *path, 729 int idx) 730 { 731 ocfs2_journal_access_func access = path_root_access(path); 732 733 if (!access) 734 access = ocfs2_journal_access; 735 736 if (idx) 737 access = ocfs2_journal_access_eb; 738 739 return access(handle, ci, path->p_node[idx].bh, 740 OCFS2_JOURNAL_ACCESS_WRITE); 741 } 742 743 /* 744 * Convenience function to journal all components in a path. 745 */ 746 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, 747 handle_t *handle, 748 struct ocfs2_path *path) 749 { 750 int i, ret = 0; 751 752 if (!path) 753 goto out; 754 755 for(i = 0; i < path_num_items(path); i++) { 756 ret = ocfs2_path_bh_journal_access(handle, ci, path, i); 757 if (ret < 0) { 758 mlog_errno(ret); 759 goto out; 760 } 761 } 762 763 out: 764 return ret; 765 } 766 767 /* 768 * Return the index of the extent record which contains cluster #v_cluster. 769 * -1 is returned if it was not found. 770 * 771 * Should work fine on interior and exterior nodes. 772 */ 773 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) 774 { 775 int ret = -1; 776 int i; 777 struct ocfs2_extent_rec *rec; 778 u32 rec_end, rec_start, clusters; 779 780 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 781 rec = &el->l_recs[i]; 782 783 rec_start = le32_to_cpu(rec->e_cpos); 784 clusters = ocfs2_rec_clusters(el, rec); 785 786 rec_end = rec_start + clusters; 787 788 if (v_cluster >= rec_start && v_cluster < rec_end) { 789 ret = i; 790 break; 791 } 792 } 793 794 return ret; 795 } 796 797 /* 798 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 799 * ocfs2_extent_rec_contig only work properly against leaf nodes! 
800 */ 801 static int ocfs2_block_extent_contig(struct super_block *sb, 802 struct ocfs2_extent_rec *ext, 803 u64 blkno) 804 { 805 u64 blk_end = le64_to_cpu(ext->e_blkno); 806 807 blk_end += ocfs2_clusters_to_blocks(sb, 808 le16_to_cpu(ext->e_leaf_clusters)); 809 810 return blkno == blk_end; 811 } 812 813 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 814 struct ocfs2_extent_rec *right) 815 { 816 u32 left_range; 817 818 left_range = le32_to_cpu(left->e_cpos) + 819 le16_to_cpu(left->e_leaf_clusters); 820 821 return (left_range == le32_to_cpu(right->e_cpos)); 822 } 823 824 static enum ocfs2_contig_type 825 ocfs2_extent_rec_contig(struct super_block *sb, 826 struct ocfs2_extent_rec *ext, 827 struct ocfs2_extent_rec *insert_rec) 828 { 829 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 830 831 /* 832 * Refuse to coalesce extent records with different flag 833 * fields - we don't want to mix unwritten extents with user 834 * data. 835 */ 836 if (ext->e_flags != insert_rec->e_flags) 837 return CONTIG_NONE; 838 839 if (ocfs2_extents_adjacent(ext, insert_rec) && 840 ocfs2_block_extent_contig(sb, ext, blkno)) 841 return CONTIG_RIGHT; 842 843 blkno = le64_to_cpu(ext->e_blkno); 844 if (ocfs2_extents_adjacent(insert_rec, ext) && 845 ocfs2_block_extent_contig(sb, insert_rec, blkno)) 846 return CONTIG_LEFT; 847 848 return CONTIG_NONE; 849 } 850 851 /* 852 * NOTE: We can have pretty much any combination of contiguousness and 853 * appending. 854 * 855 * The usefulness of APPEND_TAIL is more in that it lets us know that 856 * we'll have to update the path to that leaf. 857 */ 858 enum ocfs2_append_type { 859 APPEND_NONE = 0, 860 APPEND_TAIL, 861 }; 862 863 enum ocfs2_split_type { 864 SPLIT_NONE = 0, 865 SPLIT_LEFT, 866 SPLIT_RIGHT, 867 }; 868 869 struct ocfs2_insert_type { 870 enum ocfs2_split_type ins_split; 871 enum ocfs2_append_type ins_appending; 872 enum ocfs2_contig_type ins_contig; 873 int ins_contig_index; 874 int ins_tree_depth; 875 }; 876 877 struct ocfs2_merge_ctxt { 878 enum ocfs2_contig_type c_contig_type; 879 int c_has_empty_extent; 880 int c_split_covers_rec; 881 }; 882 883 static int ocfs2_validate_extent_block(struct super_block *sb, 884 struct buffer_head *bh) 885 { 886 int rc; 887 struct ocfs2_extent_block *eb = 888 (struct ocfs2_extent_block *)bh->b_data; 889 890 mlog(0, "Validating extent block %llu\n", 891 (unsigned long long)bh->b_blocknr); 892 893 BUG_ON(!buffer_uptodate(bh)); 894 895 /* 896 * If the ecc fails, we return the error but otherwise 897 * leave the filesystem running. We know any error is 898 * local to this block. 899 */ 900 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); 901 if (rc) { 902 mlog(ML_ERROR, "Checksum failed for extent block %llu\n", 903 (unsigned long long)bh->b_blocknr); 904 return rc; 905 } 906 907 /* 908 * Errors after here are fatal. 
909 */ 910 911 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 912 ocfs2_error(sb, 913 "Extent block #%llu has bad signature %.*s", 914 (unsigned long long)bh->b_blocknr, 7, 915 eb->h_signature); 916 return -EINVAL; 917 } 918 919 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { 920 ocfs2_error(sb, 921 "Extent block #%llu has an invalid h_blkno " 922 "of %llu", 923 (unsigned long long)bh->b_blocknr, 924 (unsigned long long)le64_to_cpu(eb->h_blkno)); 925 return -EINVAL; 926 } 927 928 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { 929 ocfs2_error(sb, 930 "Extent block #%llu has an invalid " 931 "h_fs_generation of #%u", 932 (unsigned long long)bh->b_blocknr, 933 le32_to_cpu(eb->h_fs_generation)); 934 return -EINVAL; 935 } 936 937 return 0; 938 } 939 940 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, 941 struct buffer_head **bh) 942 { 943 int rc; 944 struct buffer_head *tmp = *bh; 945 946 rc = ocfs2_read_block(ci, eb_blkno, &tmp, 947 ocfs2_validate_extent_block); 948 949 /* If ocfs2_read_block() got us a new bh, pass it up. */ 950 if (!rc && !*bh) 951 *bh = tmp; 952 953 return rc; 954 } 955 956 957 /* 958 * How many free extents have we got before we need more meta data? 959 */ 960 int ocfs2_num_free_extents(struct ocfs2_super *osb, 961 struct ocfs2_extent_tree *et) 962 { 963 int retval; 964 struct ocfs2_extent_list *el = NULL; 965 struct ocfs2_extent_block *eb; 966 struct buffer_head *eb_bh = NULL; 967 u64 last_eb_blk = 0; 968 969 mlog_entry_void(); 970 971 el = et->et_root_el; 972 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 973 974 if (last_eb_blk) { 975 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, 976 &eb_bh); 977 if (retval < 0) { 978 mlog_errno(retval); 979 goto bail; 980 } 981 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 982 el = &eb->h_list; 983 } 984 985 BUG_ON(el->l_tree_depth != 0); 986 987 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 988 bail: 989 brelse(eb_bh); 990 991 mlog_exit(retval); 992 return retval; 993 } 994 995 /* expects array to already be allocated 996 * 997 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 998 * l_count for you 999 */ 1000 static int ocfs2_create_new_meta_bhs(handle_t *handle, 1001 struct ocfs2_extent_tree *et, 1002 int wanted, 1003 struct ocfs2_alloc_context *meta_ac, 1004 struct buffer_head *bhs[]) 1005 { 1006 int count, status, i; 1007 u16 suballoc_bit_start; 1008 u32 num_got; 1009 u64 first_blkno; 1010 struct ocfs2_super *osb = 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1012 struct ocfs2_extent_block *eb; 1013 1014 mlog_entry_void(); 1015 1016 count = 0; 1017 while (count < wanted) { 1018 status = ocfs2_claim_metadata(osb, 1019 handle, 1020 meta_ac, 1021 wanted - count, 1022 &suballoc_bit_start, 1023 &num_got, 1024 &first_blkno); 1025 if (status < 0) { 1026 mlog_errno(status); 1027 goto bail; 1028 } 1029 1030 for(i = count; i < (num_got + count); i++) { 1031 bhs[i] = sb_getblk(osb->sb, first_blkno); 1032 if (bhs[i] == NULL) { 1033 status = -EIO; 1034 mlog_errno(status); 1035 goto bail; 1036 } 1037 ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); 1038 1039 status = ocfs2_journal_access_eb(handle, et->et_ci, 1040 bhs[i], 1041 OCFS2_JOURNAL_ACCESS_CREATE); 1042 if (status < 0) { 1043 mlog_errno(status); 1044 goto bail; 1045 } 1046 1047 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 1048 eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 1049 /* Ok, setup the minimal stuff here. 
/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 */
static int ocfs2_create_new_meta_bhs(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;
	struct ocfs2_super *osb =
		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
	struct ocfs2_extent_block *eb;

	mlog_entry_void();

	count = 0;
	while (count < wanted) {
		status = ocfs2_claim_metadata(osb,
					      handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for(i = count; i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);

			status = ocfs2_journal_access_eb(handle, et->et_ci,
							 bhs[i],
							 OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
			eb->h_suballoc_slot =
				cpu_to_le16(meta_ac->ac_alloc_slot);
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			status = ocfs2_journal_dirty(handle, bhs[i]);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}

		count += num_got;
	}

	status = 0;
bail:
	if (status < 0) {
		for(i = 0; i < wanted; i++) {
			brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	mlog_exit(status);
	return status;
}

/*
 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
 *
 * Returns the sum of the rightmost extent rec logical offset and
 * cluster count.
 *
 * ocfs2_add_branch() uses this to determine what logical cluster
 * value should be populated into the leftmost new branch records.
 *
 * ocfs2_shift_tree_depth() uses this to determine the # clusters
 * value for the new topmost tree record.
 */
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
{
	int i;

	i = le16_to_cpu(el->l_next_free_rec) - 1;

	return le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
}
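/*
 * Worked example (added for illustration): if the rightmost record in
 * el has e_cpos == 200 and covers 50 clusters, then
 * ocfs2_sum_rightmost_rec(el) returns 250 - the first logical cluster
 * *past* the current tree, which is exactly where a new branch or a
 * new topmost record must begin.
 */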
1165 */ 1166 static int ocfs2_add_branch(handle_t *handle, 1167 struct ocfs2_extent_tree *et, 1168 struct buffer_head *eb_bh, 1169 struct buffer_head **last_eb_bh, 1170 struct ocfs2_alloc_context *meta_ac) 1171 { 1172 int status, new_blocks, i; 1173 u64 next_blkno, new_last_eb_blk; 1174 struct buffer_head *bh; 1175 struct buffer_head **new_eb_bhs = NULL; 1176 struct ocfs2_extent_block *eb; 1177 struct ocfs2_extent_list *eb_el; 1178 struct ocfs2_extent_list *el; 1179 u32 new_cpos, root_end; 1180 1181 mlog_entry_void(); 1182 1183 BUG_ON(!last_eb_bh || !*last_eb_bh); 1184 1185 if (eb_bh) { 1186 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 1187 el = &eb->h_list; 1188 } else 1189 el = et->et_root_el; 1190 1191 /* we never add a branch to a leaf. */ 1192 BUG_ON(!el->l_tree_depth); 1193 1194 new_blocks = le16_to_cpu(el->l_tree_depth); 1195 1196 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; 1197 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 1198 root_end = ocfs2_sum_rightmost_rec(et->et_root_el); 1199 1200 /* 1201 * If there is a gap before the root end and the real end 1202 * of the righmost leaf block, we need to remove the gap 1203 * between new_cpos and root_end first so that the tree 1204 * is consistent after we add a new branch(it will start 1205 * from new_cpos). 1206 */ 1207 if (root_end > new_cpos) { 1208 mlog(0, "adjust the cluster end from %u to %u\n", 1209 root_end, new_cpos); 1210 status = ocfs2_adjust_rightmost_branch(handle, et); 1211 if (status) { 1212 mlog_errno(status); 1213 goto bail; 1214 } 1215 } 1216 1217 /* allocate the number of new eb blocks we need */ 1218 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 1219 GFP_KERNEL); 1220 if (!new_eb_bhs) { 1221 status = -ENOMEM; 1222 mlog_errno(status); 1223 goto bail; 1224 } 1225 1226 status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, 1227 meta_ac, new_eb_bhs); 1228 if (status < 0) { 1229 mlog_errno(status); 1230 goto bail; 1231 } 1232 1233 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 1234 * linked with the rest of the tree. 1235 * conversly, new_eb_bhs[0] is the new bottommost leaf. 1236 * 1237 * when we leave the loop, new_last_eb_blk will point to the 1238 * newest leaf, and next_blkno will point to the topmost extent 1239 * block. */ 1240 next_blkno = new_last_eb_blk = 0; 1241 for(i = 0; i < new_blocks; i++) { 1242 bh = new_eb_bhs[i]; 1243 eb = (struct ocfs2_extent_block *) bh->b_data; 1244 /* ocfs2_create_new_meta_bhs() should create it right! */ 1245 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 1246 eb_el = &eb->h_list; 1247 1248 status = ocfs2_journal_access_eb(handle, et->et_ci, bh, 1249 OCFS2_JOURNAL_ACCESS_CREATE); 1250 if (status < 0) { 1251 mlog_errno(status); 1252 goto bail; 1253 } 1254 1255 eb->h_next_leaf_blk = 0; 1256 eb_el->l_tree_depth = cpu_to_le16(i); 1257 eb_el->l_next_free_rec = cpu_to_le16(1); 1258 /* 1259 * This actually counts as an empty extent as 1260 * c_clusters == 0 1261 */ 1262 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 1263 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 1264 /* 1265 * eb_el isn't always an interior node, but even leaf 1266 * nodes want a zero'd flags and reserved field so 1267 * this gets the whole 32 bits regardless of use. 
1268 */ 1269 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 1270 if (!eb_el->l_tree_depth) 1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1272 1273 status = ocfs2_journal_dirty(handle, bh); 1274 if (status < 0) { 1275 mlog_errno(status); 1276 goto bail; 1277 } 1278 1279 next_blkno = le64_to_cpu(eb->h_blkno); 1280 } 1281 1282 /* This is a bit hairy. We want to update up to three blocks 1283 * here without leaving any of them in an inconsistent state 1284 * in case of error. We don't have to worry about 1285 * journal_dirty erroring as it won't unless we've aborted the 1286 * handle (in which case we would never be here) so reserving 1287 * the write with journal_access is all we need to do. */ 1288 status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, 1289 OCFS2_JOURNAL_ACCESS_WRITE); 1290 if (status < 0) { 1291 mlog_errno(status); 1292 goto bail; 1293 } 1294 status = ocfs2_et_root_journal_access(handle, et, 1295 OCFS2_JOURNAL_ACCESS_WRITE); 1296 if (status < 0) { 1297 mlog_errno(status); 1298 goto bail; 1299 } 1300 if (eb_bh) { 1301 status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, 1302 OCFS2_JOURNAL_ACCESS_WRITE); 1303 if (status < 0) { 1304 mlog_errno(status); 1305 goto bail; 1306 } 1307 } 1308 1309 /* Link the new branch into the rest of the tree (el will 1310 * either be on the root_bh, or the extent block passed in. */ 1311 i = le16_to_cpu(el->l_next_free_rec); 1312 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 1313 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 1314 el->l_recs[i].e_int_clusters = 0; 1315 le16_add_cpu(&el->l_next_free_rec, 1); 1316 1317 /* fe needs a new last extent block pointer, as does the 1318 * next_leaf on the previously last-extent-block. */ 1319 ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); 1320 1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1323 1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1325 if (status < 0) 1326 mlog_errno(status); 1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1328 if (status < 0) 1329 mlog_errno(status); 1330 if (eb_bh) { 1331 status = ocfs2_journal_dirty(handle, eb_bh); 1332 if (status < 0) 1333 mlog_errno(status); 1334 } 1335 1336 /* 1337 * Some callers want to track the rightmost leaf so pass it 1338 * back here. 1339 */ 1340 brelse(*last_eb_bh); 1341 get_bh(new_eb_bhs[0]); 1342 *last_eb_bh = new_eb_bhs[0]; 1343 1344 status = 0; 1345 bail: 1346 if (new_eb_bhs) { 1347 for (i = 0; i < new_blocks; i++) 1348 brelse(new_eb_bhs[i]); 1349 kfree(new_eb_bhs); 1350 } 1351 1352 mlog_exit(status); 1353 return status; 1354 } 1355 1356 /* 1357 * adds another level to the allocation tree. 1358 * returns back the new extent block so you can add a branch to it 1359 * after this call. 1360 */ 1361 static int ocfs2_shift_tree_depth(handle_t *handle, 1362 struct ocfs2_extent_tree *et, 1363 struct ocfs2_alloc_context *meta_ac, 1364 struct buffer_head **ret_new_eb_bh) 1365 { 1366 int status, i; 1367 u32 new_clusters; 1368 struct buffer_head *new_eb_bh = NULL; 1369 struct ocfs2_extent_block *eb; 1370 struct ocfs2_extent_list *root_el; 1371 struct ocfs2_extent_list *eb_el; 1372 1373 mlog_entry_void(); 1374 1375 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, 1376 &new_eb_bh); 1377 if (status < 0) { 1378 mlog_errno(status); 1379 goto bail; 1380 } 1381 1382 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1383 /* ocfs2_create_new_meta_bhs() should create it right! 
/*
 * adds another level to the allocation tree.
 * returns back the new extent block so you can add a branch to it
 * after this call.
 */
static int ocfs2_shift_tree_depth(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *root_el;
	struct ocfs2_extent_list *eb_el;

	mlog_entry_void();

	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	/* ocfs2_create_new_meta_bhs() should create it right! */
	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));

	eb_el = &eb->h_list;
	root_el = et->et_root_el;

	status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the root extent list data into the new extent block */
	eb_el->l_tree_depth = root_el->l_tree_depth;
	eb_el->l_next_free_rec = root_el->l_next_free_rec;
	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		eb_el->l_recs[i] = root_el->l_recs[i];

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update root_bh now */
	le16_add_cpu(&root_el->l_tree_depth, 1);
	root_el->l_recs[0].e_cpos = 0;
	root_el->l_recs[0].e_blkno = eb->h_blkno;
	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	root_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (root_el->l_tree_depth == cpu_to_le16(1))
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

	status = ocfs2_journal_dirty(handle, et->et_root_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}
1463 */ 1464 static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, 1465 struct buffer_head **target_bh) 1466 { 1467 int status = 0, i; 1468 u64 blkno; 1469 struct ocfs2_extent_block *eb; 1470 struct ocfs2_extent_list *el; 1471 struct buffer_head *bh = NULL; 1472 struct buffer_head *lowest_bh = NULL; 1473 1474 mlog_entry_void(); 1475 1476 *target_bh = NULL; 1477 1478 el = et->et_root_el; 1479 1480 while(le16_to_cpu(el->l_tree_depth) > 1) { 1481 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1482 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 1483 "Owner %llu has empty " 1484 "extent list (next_free_rec == 0)", 1485 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 1486 status = -EIO; 1487 goto bail; 1488 } 1489 i = le16_to_cpu(el->l_next_free_rec) - 1; 1490 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1491 if (!blkno) { 1492 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 1493 "Owner %llu has extent " 1494 "list where extent # %d has no physical " 1495 "block start", 1496 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); 1497 status = -EIO; 1498 goto bail; 1499 } 1500 1501 brelse(bh); 1502 bh = NULL; 1503 1504 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); 1505 if (status < 0) { 1506 mlog_errno(status); 1507 goto bail; 1508 } 1509 1510 eb = (struct ocfs2_extent_block *) bh->b_data; 1511 el = &eb->h_list; 1512 1513 if (le16_to_cpu(el->l_next_free_rec) < 1514 le16_to_cpu(el->l_count)) { 1515 brelse(lowest_bh); 1516 lowest_bh = bh; 1517 get_bh(lowest_bh); 1518 } 1519 } 1520 1521 /* If we didn't find one and the fe doesn't have any room, 1522 * then return '1' */ 1523 el = et->et_root_el; 1524 if (!lowest_bh && (el->l_next_free_rec == el->l_count)) 1525 status = 1; 1526 1527 *target_bh = lowest_bh; 1528 bail: 1529 brelse(bh); 1530 1531 mlog_exit(status); 1532 return status; 1533 } 1534 1535 /* 1536 * Grow a b-tree so that it has more records. 1537 * 1538 * We might shift the tree depth in which case existing paths should 1539 * be considered invalid. 1540 * 1541 * Tree depth after the grow is returned via *final_depth. 1542 * 1543 * *last_eb_bh will be updated by ocfs2_add_branch(). 1544 */ 1545 static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, 1546 int *final_depth, struct buffer_head **last_eb_bh, 1547 struct ocfs2_alloc_context *meta_ac) 1548 { 1549 int ret, shift; 1550 struct ocfs2_extent_list *el = et->et_root_el; 1551 int depth = le16_to_cpu(el->l_tree_depth); 1552 struct buffer_head *bh = NULL; 1553 1554 BUG_ON(meta_ac == NULL); 1555 1556 shift = ocfs2_find_branch_target(et, &bh); 1557 if (shift < 0) { 1558 ret = shift; 1559 mlog_errno(ret); 1560 goto out; 1561 } 1562 1563 /* We traveled all the way to the bottom of the allocation tree 1564 * and didn't find room for any more extents - we need to add 1565 * another tree level */ 1566 if (shift) { 1567 BUG_ON(bh); 1568 mlog(0, "need to shift tree depth (current = %d)\n", depth); 1569 1570 /* ocfs2_shift_tree_depth will return us a buffer with 1571 * the new extent block (so we can pass that to 1572 * ocfs2_add_branch). */ 1573 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); 1574 if (ret < 0) { 1575 mlog_errno(ret); 1576 goto out; 1577 } 1578 depth++; 1579 if (depth == 1) { 1580 /* 1581 * Special case: we have room now if we shifted from 1582 * tree_depth 0, so no more work needs to be done. 1583 * 1584 * We won't be calling add_branch, so pass 1585 * back *last_eb_bh as the new leaf. 
/*
 * Grow a b-tree so that it has more records.
 *
 * We might shift the tree depth in which case existing paths should
 * be considered invalid.
 *
 * Tree depth after the grow is returned via *final_depth.
 *
 * *last_eb_bh will be updated by ocfs2_add_branch().
 */
static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
			   int *final_depth, struct buffer_head **last_eb_bh,
			   struct ocfs2_alloc_context *meta_ac)
{
	int ret, shift;
	struct ocfs2_extent_list *el = et->et_root_el;
	int depth = le16_to_cpu(el->l_tree_depth);
	struct buffer_head *bh = NULL;

	BUG_ON(meta_ac == NULL);

	shift = ocfs2_find_branch_target(et, &bh);
	if (shift < 0) {
		ret = shift;
		mlog_errno(ret);
		goto out;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		BUG_ON(bh);
		mlog(0, "need to shift tree depth (current = %d)\n", depth);

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		depth++;
		if (depth == 1) {
			/*
			 * Special case: we have room now if we shifted from
			 * tree_depth 0, so no more work needs to be done.
			 *
			 * We won't be calling add_branch, so pass
			 * back *last_eb_bh as the new leaf.  At depth
			 * zero, it should always be null so there's
			 * no reason to brelse.
			 */
			BUG_ON(*last_eb_bh);
			get_bh(bh);
			*last_eb_bh = bh;
			goto out;
		}
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	mlog(0, "add branch. bh = %p\n", bh);
	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
			       meta_ac);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out:
	if (final_depth)
		*final_depth = depth;
	brelse(bh);
	return ret;
}
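/*
 * Illustrative sketch only (not part of the original file): the
 * typical "out of room" pattern in an insert path, assuming the
 * caller holds a pinned last_eb_bh and a reserved meta_ac:
 *
 *	if (ocfs2_num_free_extents(osb, et) == 0) {
 *		ret = ocfs2_grow_tree(handle, et, &depth,
 *				      &last_eb_bh, meta_ac);
 *		if (ret)
 *			goto out;
 *	}
 *
 * Note that after a successful grow, any previously built paths may
 * be invalid (the depth can change) and must be rebuilt before use.
 */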
1702 */ 1703 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 1704 1705 el->l_recs[insert_index] = *insert_rec; 1706 1707 } 1708 1709 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) 1710 { 1711 int size, num_recs = le16_to_cpu(el->l_next_free_rec); 1712 1713 BUG_ON(num_recs == 0); 1714 1715 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 1716 num_recs--; 1717 size = num_recs * sizeof(struct ocfs2_extent_rec); 1718 memmove(&el->l_recs[0], &el->l_recs[1], size); 1719 memset(&el->l_recs[num_recs], 0, 1720 sizeof(struct ocfs2_extent_rec)); 1721 el->l_next_free_rec = cpu_to_le16(num_recs); 1722 } 1723 } 1724 1725 /* 1726 * Create an empty extent record . 1727 * 1728 * l_next_free_rec may be updated. 1729 * 1730 * If an empty extent already exists do nothing. 1731 */ 1732 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 1733 { 1734 int next_free = le16_to_cpu(el->l_next_free_rec); 1735 1736 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1737 1738 if (next_free == 0) 1739 goto set_and_inc; 1740 1741 if (ocfs2_is_empty_extent(&el->l_recs[0])) 1742 return; 1743 1744 mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 1745 "Asked to create an empty extent in a full list:\n" 1746 "count = %u, tree depth = %u", 1747 le16_to_cpu(el->l_count), 1748 le16_to_cpu(el->l_tree_depth)); 1749 1750 ocfs2_shift_records_right(el); 1751 1752 set_and_inc: 1753 le16_add_cpu(&el->l_next_free_rec, 1); 1754 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1755 } 1756 1757 /* 1758 * For a rotation which involves two leaf nodes, the "root node" is 1759 * the lowest level tree node which contains a path to both leafs. This 1760 * resulting set of information can be used to form a complete "subtree" 1761 * 1762 * This function is passed two full paths from the dinode down to a 1763 * pair of adjacent leaves. It's task is to figure out which path 1764 * index contains the subtree root - this can be the root index itself 1765 * in a worst-case rotation. 1766 * 1767 * The array index of the subtree root is passed back. 1768 */ 1769 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 1770 struct ocfs2_path *left, 1771 struct ocfs2_path *right) 1772 { 1773 int i = 0; 1774 1775 /* 1776 * Check that the caller passed in two paths from the same tree. 1777 */ 1778 BUG_ON(path_root_bh(left) != path_root_bh(right)); 1779 1780 do { 1781 i++; 1782 1783 /* 1784 * The caller didn't pass two adjacent paths. 1785 */ 1786 mlog_bug_on_msg(i > left->p_tree_depth, 1787 "Owner %llu, left depth %u, right depth %u\n" 1788 "left leaf blk %llu, right leaf blk %llu\n", 1789 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 1790 left->p_tree_depth, right->p_tree_depth, 1791 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1792 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1793 } while (left->p_node[i].bh->b_blocknr == 1794 right->p_node[i].bh->b_blocknr); 1795 1796 return i - 1; 1797 } 1798 1799 typedef void (path_insert_t)(void *, struct buffer_head *); 1800 1801 /* 1802 * Traverse a btree path in search of cpos, starting at root_el. 1803 * 1804 * This code can be called with a cpos larger than the tree, in which 1805 * case it will return the rightmost path. 
1806 */ 1807 static int __ocfs2_find_path(struct ocfs2_caching_info *ci, 1808 struct ocfs2_extent_list *root_el, u32 cpos, 1809 path_insert_t *func, void *data) 1810 { 1811 int i, ret = 0; 1812 u32 range; 1813 u64 blkno; 1814 struct buffer_head *bh = NULL; 1815 struct ocfs2_extent_block *eb; 1816 struct ocfs2_extent_list *el; 1817 struct ocfs2_extent_rec *rec; 1818 1819 el = root_el; 1820 while (el->l_tree_depth) { 1821 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1822 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1823 "Owner %llu has empty extent list at " 1824 "depth %u\n", 1825 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1826 le16_to_cpu(el->l_tree_depth)); 1827 ret = -EROFS; 1828 goto out; 1829 1830 } 1831 1832 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1833 rec = &el->l_recs[i]; 1834 1835 /* 1836 * In the case that cpos is off the allocation 1837 * tree, this should just wind up returning the 1838 * rightmost record. 1839 */ 1840 range = le32_to_cpu(rec->e_cpos) + 1841 ocfs2_rec_clusters(el, rec); 1842 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1843 break; 1844 } 1845 1846 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1847 if (blkno == 0) { 1848 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1849 "Owner %llu has bad blkno in extent list " 1850 "at depth %u (index %d)\n", 1851 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1852 le16_to_cpu(el->l_tree_depth), i); 1853 ret = -EROFS; 1854 goto out; 1855 } 1856 1857 brelse(bh); 1858 bh = NULL; 1859 ret = ocfs2_read_extent_block(ci, blkno, &bh); 1860 if (ret) { 1861 mlog_errno(ret); 1862 goto out; 1863 } 1864 1865 eb = (struct ocfs2_extent_block *) bh->b_data; 1866 el = &eb->h_list; 1867 1868 if (le16_to_cpu(el->l_next_free_rec) > 1869 le16_to_cpu(el->l_count)) { 1870 ocfs2_error(ocfs2_metadata_cache_get_super(ci), 1871 "Owner %llu has bad count in extent list " 1872 "at block %llu (next free=%u, count=%u)\n", 1873 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1874 (unsigned long long)bh->b_blocknr, 1875 le16_to_cpu(el->l_next_free_rec), 1876 le16_to_cpu(el->l_count)); 1877 ret = -EROFS; 1878 goto out; 1879 } 1880 1881 if (func) 1882 func(data, bh); 1883 } 1884 1885 out: 1886 /* 1887 * Catch any trailing bh that the loop didn't handle. 1888 */ 1889 brelse(bh); 1890 1891 return ret; 1892 } 1893 1894 /* 1895 * Given an initialized path (that is, it has a valid root extent 1896 * list), this function will traverse the btree in search of the path 1897 * which would contain cpos. 1898 * 1899 * The path traveled is recorded in the path structure. 1900 * 1901 * Note that this will not do any comparisons on leaf node extent 1902 * records, so it will work fine in the case that we just added a tree 1903 * branch. 
1904 */ 1905 struct find_path_data { 1906 int index; 1907 struct ocfs2_path *path; 1908 }; 1909 static void find_path_ins(void *data, struct buffer_head *bh) 1910 { 1911 struct find_path_data *fp = data; 1912 1913 get_bh(bh); 1914 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1915 fp->index++; 1916 } 1917 int ocfs2_find_path(struct ocfs2_caching_info *ci, 1918 struct ocfs2_path *path, u32 cpos) 1919 { 1920 struct find_path_data data; 1921 1922 data.index = 1; 1923 data.path = path; 1924 return __ocfs2_find_path(ci, path_root_el(path), cpos, 1925 find_path_ins, &data); 1926 } 1927 1928 static void find_leaf_ins(void *data, struct buffer_head *bh) 1929 { 1930 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1931 struct ocfs2_extent_list *el = &eb->h_list; 1932 struct buffer_head **ret = data; 1933 1934 /* We want to retain only the leaf block. */ 1935 if (le16_to_cpu(el->l_tree_depth) == 0) { 1936 get_bh(bh); 1937 *ret = bh; 1938 } 1939 } 1940 /* 1941 * Find the leaf block in the tree which would contain cpos. No 1942 * checking of the actual leaf is done. 1943 * 1944 * Some paths want to call this instead of allocating a path structure 1945 * and calling ocfs2_find_path(). 1946 * 1947 * This function doesn't handle non btree extent lists. 1948 */ 1949 int ocfs2_find_leaf(struct ocfs2_caching_info *ci, 1950 struct ocfs2_extent_list *root_el, u32 cpos, 1951 struct buffer_head **leaf_bh) 1952 { 1953 int ret; 1954 struct buffer_head *bh = NULL; 1955 1956 ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); 1957 if (ret) { 1958 mlog_errno(ret); 1959 goto out; 1960 } 1961 1962 *leaf_bh = bh; 1963 out: 1964 return ret; 1965 } 1966 1967 /* 1968 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1969 * 1970 * Basically, we've moved stuff around at the bottom of the tree and 1971 * we need to fix up the extent records above the changes to reflect 1972 * the new changes. 1973 * 1974 * left_rec: the record on the left. 1975 * left_child_el: is the child list pointed to by left_rec 1976 * right_rec: the record to the right of left_rec 1977 * right_child_el: is the child list pointed to by right_rec 1978 * 1979 * By definition, this only works on interior nodes. 1980 */ 1981 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1982 struct ocfs2_extent_list *left_child_el, 1983 struct ocfs2_extent_rec *right_rec, 1984 struct ocfs2_extent_list *right_child_el) 1985 { 1986 u32 left_clusters, right_end; 1987 1988 /* 1989 * Interior nodes never have holes. Their cpos is the cpos of 1990 * the leftmost record in their child list. Their cluster 1991 * count covers the full theoretical range of their child list 1992 * - the range between their cpos and the cpos of the record 1993 * immediately to their right. 1994 */ 1995 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1996 if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { 1997 BUG_ON(right_child_el->l_tree_depth); 1998 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); 1999 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); 2000 } 2001 left_clusters -= le32_to_cpu(left_rec->e_cpos); 2002 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 2003 2004 /* 2005 * Calculate the rightmost cluster count boundary before 2006 * moving cpos - we will need to adjust clusters after 2007 * updating e_cpos to keep the same highest cluster count. 
2008 	 */
2009 	right_end = le32_to_cpu(right_rec->e_cpos);
2010 	right_end += le32_to_cpu(right_rec->e_int_clusters);
2011 
2012 	right_rec->e_cpos = left_rec->e_cpos;
2013 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
2014 
2015 	right_end -= le32_to_cpu(right_rec->e_cpos);
2016 	right_rec->e_int_clusters = cpu_to_le32(right_end);
2017 }
2018 
2019 /*
2020  * Adjust the adjacent root node records involved in a
2021  * rotation. left_el_blkno is passed in as a key so that we can easily
2022  * find its index in the root list.
2023  */
2024 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
2025 				      struct ocfs2_extent_list *left_el,
2026 				      struct ocfs2_extent_list *right_el,
2027 				      u64 left_el_blkno)
2028 {
2029 	int i;
2030 
2031 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
2032 	       le16_to_cpu(left_el->l_tree_depth));
2033 
2034 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2035 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2036 			break;
2037 	}
2038 
2039 	/*
2040 	 * The path walking code should never have returned a root and
2041 	 * two paths which are not adjacent.
2042 	 */
2043 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2044 
2045 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
2046 				      &root_el->l_recs[i + 1], right_el);
2047 }
2048 
2049 /*
2050  * We've changed a leaf block (in right_path) and need to reflect that
2051  * change back up the subtree.
2052  *
2053  * This happens in multiple places:
2054  * - When we've moved an extent record from the left path leaf to the right
2055  *   path leaf to make room for an empty extent in the left path leaf.
2056  * - When our insert into the right path leaf is at the leftmost edge
2057  *   and requires an update of the path immediately to its left. This
2058  *   can occur at the end of some types of rotation and appending inserts.
2059  * - When we've adjusted the last extent record in the left path leaf and the
2060  *   1st extent record in the right path leaf during cross extent block merge.
2061  */
2062 static void ocfs2_complete_edge_insert(handle_t *handle,
2063 				       struct ocfs2_path *left_path,
2064 				       struct ocfs2_path *right_path,
2065 				       int subtree_index)
2066 {
2067 	int ret, i, idx;
2068 	struct ocfs2_extent_list *el, *left_el, *right_el;
2069 	struct ocfs2_extent_rec *left_rec, *right_rec;
2070 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2071 
2072 	/*
2073 	 * Update the counts and position values within all the
2074 	 * interior nodes to reflect the leaf rotation we just did.
2075 	 *
2076 	 * The root node is handled below the loop.
2077 	 *
2078 	 * We begin the loop with right_el and left_el pointing to the
2079 	 * leaf lists and work our way up.
2080 	 *
2081 	 * NOTE: within this loop, left_el and right_el always refer
2082 	 * to the *child* lists.
2083 	 */
2084 	left_el = path_leaf_el(left_path);
2085 	right_el = path_leaf_el(right_path);
2086 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2087 		mlog(0, "Adjust records at index %u\n", i);
2088 
2089 		/*
2090 		 * One nice property of knowing that all of these
2091 		 * nodes are below the root is that we only deal with
2092 		 * the leftmost right node record and the rightmost
2093 		 * left node record.
2094 		 */
2095 		el = left_path->p_node[i].el;
2096 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2097 		left_rec = &el->l_recs[idx];
2098 
2099 		el = right_path->p_node[i].el;
2100 		right_rec = &el->l_recs[0];
2101 
2102 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 					      right_el);
2104 
2105 		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 		if (ret)
2107 			mlog_errno(ret);
2108 
2109 		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 		if (ret)
2111 			mlog_errno(ret);
2112 
2113 		/*
2114 		 * Set up our list pointers now so that the current
2115 		 * parents become children in the next iteration.
2116 		 */
2117 		left_el = left_path->p_node[i].el;
2118 		right_el = right_path->p_node[i].el;
2119 	}
2120 
2121 	/*
2122 	 * At the root node, adjust the two adjacent records which
2123 	 * begin our path to the leaves.
2124 	 */
2125 
2126 	el = left_path->p_node[subtree_index].el;
2127 	left_el = left_path->p_node[subtree_index + 1].el;
2128 	right_el = right_path->p_node[subtree_index + 1].el;
2129 
2130 	ocfs2_adjust_root_records(el, left_el, right_el,
2131 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2132 
2133 	root_bh = left_path->p_node[subtree_index].bh;
2134 
2135 	ret = ocfs2_journal_dirty(handle, root_bh);
2136 	if (ret)
2137 		mlog_errno(ret);
2138 }
2139 
2140 static int ocfs2_rotate_subtree_right(handle_t *handle,
2141 				      struct ocfs2_extent_tree *et,
2142 				      struct ocfs2_path *left_path,
2143 				      struct ocfs2_path *right_path,
2144 				      int subtree_index)
2145 {
2146 	int ret, i;
2147 	struct buffer_head *right_leaf_bh;
2148 	struct buffer_head *left_leaf_bh = NULL;
2149 	struct buffer_head *root_bh;
2150 	struct ocfs2_extent_list *right_el, *left_el;
2151 	struct ocfs2_extent_rec move_rec;
2152 
2153 	left_leaf_bh = path_leaf_bh(left_path);
2154 	left_el = path_leaf_el(left_path);
2155 
2156 	if (left_el->l_next_free_rec != left_el->l_count) {
2157 		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2158 			    "Inode %llu has non-full interior leaf node %llu "
2159 			    "(next free = %u)",
2160 			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2161 			    (unsigned long long)left_leaf_bh->b_blocknr,
2162 			    le16_to_cpu(left_el->l_next_free_rec));
2163 		return -EROFS;
2164 	}
2165 
2166 	/*
2167 	 * This extent block may already have an empty record, so we
2168 	 * return early if so.
2169 	 */
2170 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2171 		return 0;
2172 
2173 	root_bh = left_path->p_node[subtree_index].bh;
2174 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2175 
2176 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2177 					   subtree_index);
2178 	if (ret) {
2179 		mlog_errno(ret);
2180 		goto out;
2181 	}
2182 
2183 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2184 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2185 						   right_path, i);
2186 		if (ret) {
2187 			mlog_errno(ret);
2188 			goto out;
2189 		}
2190 
2191 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2192 						   left_path, i);
2193 		if (ret) {
2194 			mlog_errno(ret);
2195 			goto out;
2196 		}
2197 	}
2198 
2199 	right_leaf_bh = path_leaf_bh(right_path);
2200 	right_el = path_leaf_el(right_path);
2201 
2202 	/* This is a code error, not a disk corruption.
*/ 2203 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 2204 "because rightmost leaf block %llu is empty\n", 2205 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 2206 (unsigned long long)right_leaf_bh->b_blocknr); 2207 2208 ocfs2_create_empty_extent(right_el); 2209 2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2211 if (ret) { 2212 mlog_errno(ret); 2213 goto out; 2214 } 2215 2216 /* Do the copy now. */ 2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2218 move_rec = left_el->l_recs[i]; 2219 right_el->l_recs[0] = move_rec; 2220 2221 /* 2222 * Clear out the record we just copied and shift everything 2223 * over, leaving an empty extent in the left leaf. 2224 * 2225 * We temporarily subtract from next_free_rec so that the 2226 * shift will lose the tail record (which is now defunct). 2227 */ 2228 le16_add_cpu(&left_el->l_next_free_rec, -1); 2229 ocfs2_shift_records_right(left_el); 2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2232 2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2234 if (ret) { 2235 mlog_errno(ret); 2236 goto out; 2237 } 2238 2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2240 subtree_index); 2241 2242 out: 2243 return ret; 2244 } 2245 2246 /* 2247 * Given a full path, determine what cpos value would return us a path 2248 * containing the leaf immediately to the left of the current one. 2249 * 2250 * Will return zero if the path passed in is already the leftmost path. 2251 */ 2252 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2253 struct ocfs2_path *path, u32 *cpos) 2254 { 2255 int i, j, ret = 0; 2256 u64 blkno; 2257 struct ocfs2_extent_list *el; 2258 2259 BUG_ON(path->p_tree_depth == 0); 2260 2261 *cpos = 0; 2262 2263 blkno = path_leaf_bh(path)->b_blocknr; 2264 2265 /* Start at the tree node just above the leaf and work our way up. */ 2266 i = path->p_tree_depth - 1; 2267 while (i >= 0) { 2268 el = path->p_node[i].el; 2269 2270 /* 2271 * Find the extent record just before the one in our 2272 * path. 2273 */ 2274 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 2275 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 2276 if (j == 0) { 2277 if (i == 0) { 2278 /* 2279 * We've determined that the 2280 * path specified is already 2281 * the leftmost one - return a 2282 * cpos of zero. 2283 */ 2284 goto out; 2285 } 2286 /* 2287 * The leftmost record points to our 2288 * leaf - we need to travel up the 2289 * tree one level. 2290 */ 2291 goto next_node; 2292 } 2293 2294 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 2295 *cpos = *cpos + ocfs2_rec_clusters(el, 2296 &el->l_recs[j - 1]); 2297 *cpos = *cpos - 1; 2298 goto out; 2299 } 2300 } 2301 2302 /* 2303 * If we got here, we never found a valid node where 2304 * the tree indicated one should be. 2305 */ 2306 ocfs2_error(sb, 2307 "Invalid extent tree at extent block %llu\n", 2308 (unsigned long long)blkno); 2309 ret = -EROFS; 2310 goto out; 2311 2312 next_node: 2313 blkno = path->p_node[i].bh->b_blocknr; 2314 i--; 2315 } 2316 2317 out: 2318 return ret; 2319 } 2320 2321 /* 2322 * Extend the transaction by enough credits to complete the rotation, 2323 * and still leave at least the original number of credits allocated 2324 * to this transaction. 
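 *
 * For illustration (hypothetical numbers): with p_tree_depth 4 and
 * subtree_depth 1, a rotation dirties the nodes below the subtree
 * root on both paths plus the subtree root itself, so
 * (4 - 1) * 2 + 1 = 7 credits are needed on top of op_credits.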
2325 */ 2326 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 2327 int op_credits, 2328 struct ocfs2_path *path) 2329 { 2330 int ret; 2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2332 2333 if (handle->h_buffer_credits < credits) { 2334 ret = ocfs2_extend_trans(handle, 2335 credits - handle->h_buffer_credits); 2336 if (ret) 2337 return ret; 2338 2339 if (unlikely(handle->h_buffer_credits < credits)) 2340 return ocfs2_extend_trans(handle, credits); 2341 } 2342 2343 return 0; 2344 } 2345 2346 /* 2347 * Trap the case where we're inserting into the theoretical range past 2348 * the _actual_ left leaf range. Otherwise, we'll rotate a record 2349 * whose cpos is less than ours into the right leaf. 2350 * 2351 * It's only necessary to look at the rightmost record of the left 2352 * leaf because the logic that calls us should ensure that the 2353 * theoretical ranges in the path components above the leaves are 2354 * correct. 2355 */ 2356 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 2357 u32 insert_cpos) 2358 { 2359 struct ocfs2_extent_list *left_el; 2360 struct ocfs2_extent_rec *rec; 2361 int next_free; 2362 2363 left_el = path_leaf_el(left_path); 2364 next_free = le16_to_cpu(left_el->l_next_free_rec); 2365 rec = &left_el->l_recs[next_free - 1]; 2366 2367 if (insert_cpos > le32_to_cpu(rec->e_cpos)) 2368 return 1; 2369 return 0; 2370 } 2371 2372 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) 2373 { 2374 int next_free = le16_to_cpu(el->l_next_free_rec); 2375 unsigned int range; 2376 struct ocfs2_extent_rec *rec; 2377 2378 if (next_free == 0) 2379 return 0; 2380 2381 rec = &el->l_recs[0]; 2382 if (ocfs2_is_empty_extent(rec)) { 2383 /* Empty list. */ 2384 if (next_free == 1) 2385 return 0; 2386 rec = &el->l_recs[1]; 2387 } 2388 2389 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2390 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 2391 return 1; 2392 return 0; 2393 } 2394 2395 /* 2396 * Rotate all the records in a btree right one record, starting at insert_cpos. 2397 * 2398 * The path to the rightmost leaf should be passed in. 2399 * 2400 * The array is assumed to be large enough to hold an entire path (tree depth). 2401 * 2402 * Upon successful return from this function: 2403 * 2404 * - The 'right_path' array will contain a path to the leaf block 2405 * whose range contains e_cpos. 2406 * - That leaf block will have a single empty extent in list index 0. 2407 * - In the case that the rotation requires a post-insert update, 2408 * *ret_left_path will contain a valid path which can be passed to 2409 * ocfs2_insert_path(). 
2410 */ 2411 static int ocfs2_rotate_tree_right(handle_t *handle, 2412 struct ocfs2_extent_tree *et, 2413 enum ocfs2_split_type split, 2414 u32 insert_cpos, 2415 struct ocfs2_path *right_path, 2416 struct ocfs2_path **ret_left_path) 2417 { 2418 int ret, start, orig_credits = handle->h_buffer_credits; 2419 u32 cpos; 2420 struct ocfs2_path *left_path = NULL; 2421 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 2422 2423 *ret_left_path = NULL; 2424 2425 left_path = ocfs2_new_path_from_path(right_path); 2426 if (!left_path) { 2427 ret = -ENOMEM; 2428 mlog_errno(ret); 2429 goto out; 2430 } 2431 2432 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); 2433 if (ret) { 2434 mlog_errno(ret); 2435 goto out; 2436 } 2437 2438 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 2439 2440 /* 2441 * What we want to do here is: 2442 * 2443 * 1) Start with the rightmost path. 2444 * 2445 * 2) Determine a path to the leaf block directly to the left 2446 * of that leaf. 2447 * 2448 * 3) Determine the 'subtree root' - the lowest level tree node 2449 * which contains a path to both leaves. 2450 * 2451 * 4) Rotate the subtree. 2452 * 2453 * 5) Find the next subtree by considering the left path to be 2454 * the new right path. 2455 * 2456 * The check at the top of this while loop also accepts 2457 * insert_cpos == cpos because cpos is only a _theoretical_ 2458 * value to get us the left path - insert_cpos might very well 2459 * be filling that hole. 2460 * 2461 * Stop at a cpos of '0' because we either started at the 2462 * leftmost branch (i.e., a tree with one branch and a 2463 * rotation inside of it), or we've gone as far as we can in 2464 * rotating subtrees. 2465 */ 2466 while (cpos && insert_cpos <= cpos) { 2467 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2468 insert_cpos, cpos); 2469 2470 ret = ocfs2_find_path(et->et_ci, left_path, cpos); 2471 if (ret) { 2472 mlog_errno(ret); 2473 goto out; 2474 } 2475 2476 mlog_bug_on_msg(path_leaf_bh(left_path) == 2477 path_leaf_bh(right_path), 2478 "Owner %llu: error during insert of %u " 2479 "(left path cpos %u) results in two identical " 2480 "paths ending at %llu\n", 2481 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 2482 insert_cpos, cpos, 2483 (unsigned long long) 2484 path_leaf_bh(left_path)->b_blocknr); 2485 2486 if (split == SPLIT_NONE && 2487 ocfs2_rotate_requires_path_adjustment(left_path, 2488 insert_cpos)) { 2489 2490 /* 2491 * We've rotated the tree as much as we 2492 * should. The rest is up to 2493 * ocfs2_insert_path() to complete, after the 2494 * record insertion. We indicate this 2495 * situation by returning the left path. 2496 * 2497 * The reason we don't adjust the records here 2498 * before the record insert is that an error 2499 * later might break the rule where a parent 2500 * record e_cpos will reflect the actual 2501 * e_cpos of the 1st nonempty record of the 2502 * child list. 
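			 *
			 * (Concretely: if we adjusted the parents now
			 * and the insert then failed, a parent's
			 * e_cpos could name a child record that never
			 * actually arrived in the leaf.)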
2503 */ 2504 *ret_left_path = left_path; 2505 goto out_ret_path; 2506 } 2507 2508 start = ocfs2_find_subtree_root(et, left_path, right_path); 2509 2510 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2511 start, 2512 (unsigned long long) right_path->p_node[start].bh->b_blocknr, 2513 right_path->p_tree_depth); 2514 2515 ret = ocfs2_extend_rotate_transaction(handle, start, 2516 orig_credits, right_path); 2517 if (ret) { 2518 mlog_errno(ret); 2519 goto out; 2520 } 2521 2522 ret = ocfs2_rotate_subtree_right(handle, et, left_path, 2523 right_path, start); 2524 if (ret) { 2525 mlog_errno(ret); 2526 goto out; 2527 } 2528 2529 if (split != SPLIT_NONE && 2530 ocfs2_leftmost_rec_contains(path_leaf_el(right_path), 2531 insert_cpos)) { 2532 /* 2533 * A rotate moves the rightmost left leaf 2534 * record over to the leftmost right leaf 2535 * slot. If we're doing an extent split 2536 * instead of a real insert, then we have to 2537 * check that the extent to be split wasn't 2538 * just moved over. If it was, then we can 2539 * exit here, passing left_path back - 2540 * ocfs2_split_extent() is smart enough to 2541 * search both leaves. 2542 */ 2543 *ret_left_path = left_path; 2544 goto out_ret_path; 2545 } 2546 2547 /* 2548 * There is no need to re-read the next right path 2549 * as we know that it'll be our current left 2550 * path. Optimize by copying values instead. 2551 */ 2552 ocfs2_mv_path(right_path, left_path); 2553 2554 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); 2555 if (ret) { 2556 mlog_errno(ret); 2557 goto out; 2558 } 2559 } 2560 2561 out: 2562 ocfs2_free_path(left_path); 2563 2564 out_ret_path: 2565 return ret; 2566 } 2567 2568 static int ocfs2_update_edge_lengths(handle_t *handle, 2569 struct ocfs2_extent_tree *et, 2570 int subtree_index, struct ocfs2_path *path) 2571 { 2572 int i, idx, ret; 2573 struct ocfs2_extent_rec *rec; 2574 struct ocfs2_extent_list *el; 2575 struct ocfs2_extent_block *eb; 2576 u32 range; 2577 2578 /* 2579 * In normal tree rotation process, we will never touch the 2580 * tree branch above subtree_index and ocfs2_extend_rotate_transaction 2581 * doesn't reserve the credits for them either. 2582 * 2583 * But we do have a special case here which will update the rightmost 2584 * records for all the bh in the path. 2585 * So we have to allocate extra credits and access them. 2586 */ 2587 ret = ocfs2_extend_trans(handle, 2588 handle->h_buffer_credits + subtree_index); 2589 if (ret) { 2590 mlog_errno(ret); 2591 goto out; 2592 } 2593 2594 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 2595 if (ret) { 2596 mlog_errno(ret); 2597 goto out; 2598 } 2599 2600 /* Path should always be rightmost. 
*/ 2601 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2602 BUG_ON(eb->h_next_leaf_blk != 0ULL); 2603 2604 el = &eb->h_list; 2605 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 2606 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2607 rec = &el->l_recs[idx]; 2608 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2609 2610 for (i = 0; i < path->p_tree_depth; i++) { 2611 el = path->p_node[i].el; 2612 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2613 rec = &el->l_recs[idx]; 2614 2615 rec->e_int_clusters = cpu_to_le32(range); 2616 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); 2617 2618 ocfs2_journal_dirty(handle, path->p_node[i].bh); 2619 } 2620 out: 2621 return ret; 2622 } 2623 2624 static void ocfs2_unlink_path(handle_t *handle, 2625 struct ocfs2_extent_tree *et, 2626 struct ocfs2_cached_dealloc_ctxt *dealloc, 2627 struct ocfs2_path *path, int unlink_start) 2628 { 2629 int ret, i; 2630 struct ocfs2_extent_block *eb; 2631 struct ocfs2_extent_list *el; 2632 struct buffer_head *bh; 2633 2634 for(i = unlink_start; i < path_num_items(path); i++) { 2635 bh = path->p_node[i].bh; 2636 2637 eb = (struct ocfs2_extent_block *)bh->b_data; 2638 /* 2639 * Not all nodes might have had their final count 2640 * decremented by the caller - handle this here. 2641 */ 2642 el = &eb->h_list; 2643 if (le16_to_cpu(el->l_next_free_rec) > 1) { 2644 mlog(ML_ERROR, 2645 "Inode %llu, attempted to remove extent block " 2646 "%llu with %u records\n", 2647 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 2648 (unsigned long long)le64_to_cpu(eb->h_blkno), 2649 le16_to_cpu(el->l_next_free_rec)); 2650 2651 ocfs2_journal_dirty(handle, bh); 2652 ocfs2_remove_from_cache(et->et_ci, bh); 2653 continue; 2654 } 2655 2656 el->l_next_free_rec = 0; 2657 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2658 2659 ocfs2_journal_dirty(handle, bh); 2660 2661 ret = ocfs2_cache_extent_block_free(dealloc, eb); 2662 if (ret) 2663 mlog_errno(ret); 2664 2665 ocfs2_remove_from_cache(et->et_ci, bh); 2666 } 2667 } 2668 2669 static void ocfs2_unlink_subtree(handle_t *handle, 2670 struct ocfs2_extent_tree *et, 2671 struct ocfs2_path *left_path, 2672 struct ocfs2_path *right_path, 2673 int subtree_index, 2674 struct ocfs2_cached_dealloc_ctxt *dealloc) 2675 { 2676 int i; 2677 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2678 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; 2679 struct ocfs2_extent_list *el; 2680 struct ocfs2_extent_block *eb; 2681 2682 el = path_leaf_el(left_path); 2683 2684 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; 2685 2686 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 2687 if (root_el->l_recs[i].e_blkno == eb->h_blkno) 2688 break; 2689 2690 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); 2691 2692 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 2693 le16_add_cpu(&root_el->l_next_free_rec, -1); 2694 2695 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2696 eb->h_next_leaf_blk = 0; 2697 2698 ocfs2_journal_dirty(handle, root_bh); 2699 ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2700 2701 ocfs2_unlink_path(handle, et, dealloc, right_path, 2702 subtree_index + 1); 2703 } 2704 2705 static int ocfs2_rotate_subtree_left(handle_t *handle, 2706 struct ocfs2_extent_tree *et, 2707 struct ocfs2_path *left_path, 2708 struct ocfs2_path *right_path, 2709 int subtree_index, 2710 struct ocfs2_cached_dealloc_ctxt *dealloc, 2711 int *deleted) 
2712 { 2713 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2714 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); 2715 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2716 struct ocfs2_extent_block *eb; 2717 2718 *deleted = 0; 2719 2720 right_leaf_el = path_leaf_el(right_path); 2721 left_leaf_el = path_leaf_el(left_path); 2722 root_bh = left_path->p_node[subtree_index].bh; 2723 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2724 2725 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) 2726 return 0; 2727 2728 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; 2729 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { 2730 /* 2731 * It's legal for us to proceed if the right leaf is 2732 * the rightmost one and it has an empty extent. There 2733 * are two cases to handle - whether the leaf will be 2734 * empty after removal or not. If the leaf isn't empty 2735 * then just remove the empty extent up front. The 2736 * next block will handle empty leaves by flagging 2737 * them for unlink. 2738 * 2739 * Non rightmost leaves will throw -EAGAIN and the 2740 * caller can manually move the subtree and retry. 2741 */ 2742 2743 if (eb->h_next_leaf_blk != 0ULL) 2744 return -EAGAIN; 2745 2746 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2747 ret = ocfs2_journal_access_eb(handle, et->et_ci, 2748 path_leaf_bh(right_path), 2749 OCFS2_JOURNAL_ACCESS_WRITE); 2750 if (ret) { 2751 mlog_errno(ret); 2752 goto out; 2753 } 2754 2755 ocfs2_remove_empty_extent(right_leaf_el); 2756 } else 2757 right_has_empty = 1; 2758 } 2759 2760 if (eb->h_next_leaf_blk == 0ULL && 2761 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { 2762 /* 2763 * We have to update i_last_eb_blk during the meta 2764 * data delete. 2765 */ 2766 ret = ocfs2_et_root_journal_access(handle, et, 2767 OCFS2_JOURNAL_ACCESS_WRITE); 2768 if (ret) { 2769 mlog_errno(ret); 2770 goto out; 2771 } 2772 2773 del_right_subtree = 1; 2774 } 2775 2776 /* 2777 * Getting here with an empty extent in the right path implies 2778 * that it's the rightmost path and will be deleted. 2779 */ 2780 BUG_ON(right_has_empty && !del_right_subtree); 2781 2782 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, 2783 subtree_index); 2784 if (ret) { 2785 mlog_errno(ret); 2786 goto out; 2787 } 2788 2789 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2790 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 2791 right_path, i); 2792 if (ret) { 2793 mlog_errno(ret); 2794 goto out; 2795 } 2796 2797 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 2798 left_path, i); 2799 if (ret) { 2800 mlog_errno(ret); 2801 goto out; 2802 } 2803 } 2804 2805 if (!right_has_empty) { 2806 /* 2807 * Only do this if we're moving a real 2808 * record. Otherwise, the action is delayed until 2809 * after removal of the right path in which case we 2810 * can do a simple shift to remove the empty extent. 2811 */ 2812 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); 2813 memset(&right_leaf_el->l_recs[0], 0, 2814 sizeof(struct ocfs2_extent_rec)); 2815 } 2816 if (eb->h_next_leaf_blk == 0ULL) { 2817 /* 2818 * Move recs over to get rid of empty extent, decrease 2819 * next_free. This is allowed to remove the last 2820 * extent in our leaf (setting l_next_free_rec to 2821 * zero) - the delete code below won't care. 
2822 */ 2823 ocfs2_remove_empty_extent(right_leaf_el); 2824 } 2825 2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2827 if (ret) 2828 mlog_errno(ret); 2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 2830 if (ret) 2831 mlog_errno(ret); 2832 2833 if (del_right_subtree) { 2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2835 subtree_index, dealloc); 2836 ret = ocfs2_update_edge_lengths(handle, et, subtree_index, 2837 left_path); 2838 if (ret) { 2839 mlog_errno(ret); 2840 goto out; 2841 } 2842 2843 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2844 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2845 2846 /* 2847 * Removal of the extent in the left leaf was skipped 2848 * above so we could delete the right path 2849 * 1st. 2850 */ 2851 if (right_has_empty) 2852 ocfs2_remove_empty_extent(left_leaf_el); 2853 2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2855 if (ret) 2856 mlog_errno(ret); 2857 2858 *deleted = 1; 2859 } else 2860 ocfs2_complete_edge_insert(handle, left_path, right_path, 2861 subtree_index); 2862 2863 out: 2864 return ret; 2865 } 2866 2867 /* 2868 * Given a full path, determine what cpos value would return us a path 2869 * containing the leaf immediately to the right of the current one. 2870 * 2871 * Will return zero if the path passed in is already the rightmost path. 2872 * 2873 * This looks similar, but is subtly different to 2874 * ocfs2_find_cpos_for_left_leaf(). 2875 */ 2876 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2877 struct ocfs2_path *path, u32 *cpos) 2878 { 2879 int i, j, ret = 0; 2880 u64 blkno; 2881 struct ocfs2_extent_list *el; 2882 2883 *cpos = 0; 2884 2885 if (path->p_tree_depth == 0) 2886 return 0; 2887 2888 blkno = path_leaf_bh(path)->b_blocknr; 2889 2890 /* Start at the tree node just above the leaf and work our way up. */ 2891 i = path->p_tree_depth - 1; 2892 while (i >= 0) { 2893 int next_free; 2894 2895 el = path->p_node[i].el; 2896 2897 /* 2898 * Find the extent record just after the one in our 2899 * path. 2900 */ 2901 next_free = le16_to_cpu(el->l_next_free_rec); 2902 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 2903 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 2904 if (j == (next_free - 1)) { 2905 if (i == 0) { 2906 /* 2907 * We've determined that the 2908 * path specified is already 2909 * the rightmost one - return a 2910 * cpos of zero. 2911 */ 2912 goto out; 2913 } 2914 /* 2915 * The rightmost record points to our 2916 * leaf - we need to travel up the 2917 * tree one level. 2918 */ 2919 goto next_node; 2920 } 2921 2922 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); 2923 goto out; 2924 } 2925 } 2926 2927 /* 2928 * If we got here, we never found a valid node where 2929 * the tree indicated one should be. 
2930 */ 2931 ocfs2_error(sb, 2932 "Invalid extent tree at extent block %llu\n", 2933 (unsigned long long)blkno); 2934 ret = -EROFS; 2935 goto out; 2936 2937 next_node: 2938 blkno = path->p_node[i].bh->b_blocknr; 2939 i--; 2940 } 2941 2942 out: 2943 return ret; 2944 } 2945 2946 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, 2947 struct ocfs2_extent_tree *et, 2948 struct ocfs2_path *path) 2949 { 2950 int ret; 2951 struct buffer_head *bh = path_leaf_bh(path); 2952 struct ocfs2_extent_list *el = path_leaf_el(path); 2953 2954 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2955 return 0; 2956 2957 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, 2958 path_num_items(path) - 1); 2959 if (ret) { 2960 mlog_errno(ret); 2961 goto out; 2962 } 2963 2964 ocfs2_remove_empty_extent(el); 2965 2966 ret = ocfs2_journal_dirty(handle, bh); 2967 if (ret) 2968 mlog_errno(ret); 2969 2970 out: 2971 return ret; 2972 } 2973 2974 static int __ocfs2_rotate_tree_left(handle_t *handle, 2975 struct ocfs2_extent_tree *et, 2976 int orig_credits, 2977 struct ocfs2_path *path, 2978 struct ocfs2_cached_dealloc_ctxt *dealloc, 2979 struct ocfs2_path **empty_extent_path) 2980 { 2981 int ret, subtree_root, deleted; 2982 u32 right_cpos; 2983 struct ocfs2_path *left_path = NULL; 2984 struct ocfs2_path *right_path = NULL; 2985 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 2986 2987 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2988 2989 *empty_extent_path = NULL; 2990 2991 ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); 2992 if (ret) { 2993 mlog_errno(ret); 2994 goto out; 2995 } 2996 2997 left_path = ocfs2_new_path_from_path(path); 2998 if (!left_path) { 2999 ret = -ENOMEM; 3000 mlog_errno(ret); 3001 goto out; 3002 } 3003 3004 ocfs2_cp_path(left_path, path); 3005 3006 right_path = ocfs2_new_path_from_path(path); 3007 if (!right_path) { 3008 ret = -ENOMEM; 3009 mlog_errno(ret); 3010 goto out; 3011 } 3012 3013 while (right_cpos) { 3014 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); 3015 if (ret) { 3016 mlog_errno(ret); 3017 goto out; 3018 } 3019 3020 subtree_root = ocfs2_find_subtree_root(et, left_path, 3021 right_path); 3022 3023 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 3024 subtree_root, 3025 (unsigned long long) 3026 right_path->p_node[subtree_root].bh->b_blocknr, 3027 right_path->p_tree_depth); 3028 3029 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 3030 orig_credits, left_path); 3031 if (ret) { 3032 mlog_errno(ret); 3033 goto out; 3034 } 3035 3036 /* 3037 * Caller might still want to make changes to the 3038 * tree root, so re-add it to the journal here. 3039 */ 3040 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 3041 left_path, 0); 3042 if (ret) { 3043 mlog_errno(ret); 3044 goto out; 3045 } 3046 3047 ret = ocfs2_rotate_subtree_left(handle, et, left_path, 3048 right_path, subtree_root, 3049 dealloc, &deleted); 3050 if (ret == -EAGAIN) { 3051 /* 3052 * The rotation has to temporarily stop due to 3053 * the right subtree having an empty 3054 * extent. Pass it back to the caller for a 3055 * fixup. 3056 */ 3057 *empty_extent_path = right_path; 3058 right_path = NULL; 3059 goto out; 3060 } 3061 if (ret) { 3062 mlog_errno(ret); 3063 goto out; 3064 } 3065 3066 /* 3067 * The subtree rotate might have removed records on 3068 * the rightmost edge. If so, then rotation is 3069 * complete. 
3070 		 */
3071 		if (deleted)
3072 			break;
3073 
3074 		ocfs2_mv_path(left_path, right_path);
3075 
3076 		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3077 						     &right_cpos);
3078 		if (ret) {
3079 			mlog_errno(ret);
3080 			goto out;
3081 		}
3082 	}
3083 
3084 out:
3085 	ocfs2_free_path(right_path);
3086 	ocfs2_free_path(left_path);
3087 
3088 	return ret;
3089 }
3090 
3091 static int ocfs2_remove_rightmost_path(handle_t *handle,
3092 				       struct ocfs2_extent_tree *et,
3093 				       struct ocfs2_path *path,
3094 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
3095 {
3096 	int ret, subtree_index;
3097 	u32 cpos;
3098 	struct ocfs2_path *left_path = NULL;
3099 	struct ocfs2_extent_block *eb;
3100 	struct ocfs2_extent_list *el;
3101 
3102 
3103 	ret = ocfs2_et_sanity_check(et);
3104 	if (ret)
3105 		goto out;
3106 	/*
3107 	 * There are two ways we handle this depending on
3108 	 * whether path is the only existing one.
3109 	 */
3110 	ret = ocfs2_extend_rotate_transaction(handle, 0,
3111 					      handle->h_buffer_credits,
3112 					      path);
3113 	if (ret) {
3114 		mlog_errno(ret);
3115 		goto out;
3116 	}
3117 
3118 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3119 	if (ret) {
3120 		mlog_errno(ret);
3121 		goto out;
3122 	}
3123 
3124 	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3125 					    path, &cpos);
3126 	if (ret) {
3127 		mlog_errno(ret);
3128 		goto out;
3129 	}
3130 
3131 	if (cpos) {
3132 		/*
3133 		 * We have a path to the left of this one - it needs
3134 		 * an update too.
3135 		 */
3136 		left_path = ocfs2_new_path_from_path(path);
3137 		if (!left_path) {
3138 			ret = -ENOMEM;
3139 			mlog_errno(ret);
3140 			goto out;
3141 		}
3142 
3143 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3144 		if (ret) {
3145 			mlog_errno(ret);
3146 			goto out;
3147 		}
3148 
3149 		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3150 		if (ret) {
3151 			mlog_errno(ret);
3152 			goto out;
3153 		}
3154 
3155 		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3156 
3157 		ocfs2_unlink_subtree(handle, et, left_path, path,
3158 				     subtree_index, dealloc);
3159 		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3160 						left_path);
3161 		if (ret) {
3162 			mlog_errno(ret);
3163 			goto out;
3164 		}
3165 
3166 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3167 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3168 	} else {
3169 		/*
3170 		 * 'path' is also the leftmost path which
3171 		 * means it must be the only one. This gets
3172 		 * handled differently because we want to
3173 		 * revert the root back to having extents
3174 		 * in-line.
3175 		 */
3176 		ocfs2_unlink_path(handle, et, dealloc, path, 1);
3177 
3178 		el = et->et_root_el;
3179 		el->l_tree_depth = 0;
3180 		el->l_next_free_rec = 0;
3181 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3182 
3183 		ocfs2_et_set_last_eb_blk(et, 0);
3184 	}
3185 
3186 	ocfs2_journal_dirty(handle, path_root_bh(path));
3187 
3188 out:
3189 	ocfs2_free_path(left_path);
3190 	return ret;
3191 }
3192 
3193 /*
3194  * Left rotation of btree records.
3195  *
3196  * In many ways, this is (unsurprisingly) the opposite of right
3197  * rotation. We start at some non-rightmost path containing an empty
3198  * extent in the leaf block. The code works its way to the rightmost
3199  * path by rotating records to the left in every subtree.
3200  *
3201  * This is used by any code which reduces the number of extent records
3202  * in a leaf. After removal, an empty record should be placed in the
3203  * leftmost list position.
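 *
 * One rotation step, sketched with hypothetical records:
 *
 *	left leaf:  [EMPTY][A][B]	right leaf: [C][D]
 *
 * becomes
 *
 *	left leaf:  [A][B][C]		right leaf: [EMPTY][D]
 *
 * and the empty slot keeps moving right until it reaches the
 * rightmost leaf, where it can simply be dropped.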
3204  *
3205  * This won't handle a length update of the rightmost path records if
3206  * the rightmost tree leaf record is removed, so the caller is
3207  * responsible for detecting and correcting that.
3208  */
3209 static int ocfs2_rotate_tree_left(handle_t *handle,
3210 				  struct ocfs2_extent_tree *et,
3211 				  struct ocfs2_path *path,
3212 				  struct ocfs2_cached_dealloc_ctxt *dealloc)
3213 {
3214 	int ret, orig_credits = handle->h_buffer_credits;
3215 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3216 	struct ocfs2_extent_block *eb;
3217 	struct ocfs2_extent_list *el;
3218 
3219 	el = path_leaf_el(path);
3220 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3221 		return 0;
3222 
3223 	if (path->p_tree_depth == 0) {
3224 rightmost_no_delete:
3225 		/*
3226 		 * Inline extents. This is trivially handled, so do
3227 		 * it up front.
3228 		 */
3229 		ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3230 		if (ret)
3231 			mlog_errno(ret);
3232 		goto out;
3233 	}
3234 
3235 	/*
3236 	 * Handle rightmost branch now. There are several cases:
3237 	 *  1) simple rotation leaving records in there. That's trivial.
3238 	 *  2) rotation requiring a branch delete - there are no more
3239 	 *     records left. Two cases of this:
3240 	 *     a) There are branches to the left.
3241 	 *     b) This is also the leftmost (the only) branch.
3242 	 *
3243 	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3244 	 *  2a) we need the left branch so that we can update it with the unlink
3245 	 *  2b) we need to bring the root back to inline extents.
3246 	 */
3247 
3248 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3249 	el = &eb->h_list;
3250 	if (eb->h_next_leaf_blk == 0) {
3251 		/*
3252 		 * This gets a bit tricky if we're going to delete the
3253 		 * rightmost path. Get the other cases out of the way
3254 		 * 1st.
3255 		 */
3256 		if (le16_to_cpu(el->l_next_free_rec) > 1)
3257 			goto rightmost_no_delete;
3258 
3259 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
3260 			ret = -EIO;
3261 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3262 				    "Owner %llu has empty extent block at %llu",
3263 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3264 				    (unsigned long long)le64_to_cpu(eb->h_blkno));
3265 			goto out;
3266 		}
3267 
3268 		/*
3269 		 * XXX: The caller cannot trust "path" any more after
3270 		 * this as it will have been deleted. What do we do?
3271 		 *
3272 		 * In theory the rotate-for-merge code will never get
3273 		 * here because it'll always ask for a rotate in a
3274 		 * nonempty list.
3275 		 */
3276 
3277 		ret = ocfs2_remove_rightmost_path(handle, et, path,
3278 						  dealloc);
3279 		if (ret)
3280 			mlog_errno(ret);
3281 		goto out;
3282 	}
3283 
3284 	/*
3285 	 * Now we can loop, remembering the path we get from -EAGAIN
3286 	 * and restarting from there.
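	 *
	 * To make the restart contract concrete: each -EAGAIN hands
	 * back (in restart_path) the right-hand path whose leaf still
	 * carries an empty extent. We rotate starting from that path
	 * instead, and once a pass fully succeeds we start over from
	 * the original path in case it still holds an empty extent.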
3287 */ 3288 try_rotate: 3289 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, 3290 dealloc, &restart_path); 3291 if (ret && ret != -EAGAIN) { 3292 mlog_errno(ret); 3293 goto out; 3294 } 3295 3296 while (ret == -EAGAIN) { 3297 tmp_path = restart_path; 3298 restart_path = NULL; 3299 3300 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, 3301 tmp_path, dealloc, 3302 &restart_path); 3303 if (ret && ret != -EAGAIN) { 3304 mlog_errno(ret); 3305 goto out; 3306 } 3307 3308 ocfs2_free_path(tmp_path); 3309 tmp_path = NULL; 3310 3311 if (ret == 0) 3312 goto try_rotate; 3313 } 3314 3315 out: 3316 ocfs2_free_path(tmp_path); 3317 ocfs2_free_path(restart_path); 3318 return ret; 3319 } 3320 3321 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, 3322 int index) 3323 { 3324 struct ocfs2_extent_rec *rec = &el->l_recs[index]; 3325 unsigned int size; 3326 3327 if (rec->e_leaf_clusters == 0) { 3328 /* 3329 * We consumed all of the merged-from record. An empty 3330 * extent cannot exist anywhere but the 1st array 3331 * position, so move things over if the merged-from 3332 * record doesn't occupy that position. 3333 * 3334 * This creates a new empty extent so the caller 3335 * should be smart enough to have removed any existing 3336 * ones. 3337 */ 3338 if (index > 0) { 3339 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 3340 size = index * sizeof(struct ocfs2_extent_rec); 3341 memmove(&el->l_recs[1], &el->l_recs[0], size); 3342 } 3343 3344 /* 3345 * Always memset - the caller doesn't check whether it 3346 * created an empty extent, so there could be junk in 3347 * the other fields. 3348 */ 3349 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 3350 } 3351 } 3352 3353 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, 3354 struct ocfs2_path *left_path, 3355 struct ocfs2_path **ret_right_path) 3356 { 3357 int ret; 3358 u32 right_cpos; 3359 struct ocfs2_path *right_path = NULL; 3360 struct ocfs2_extent_list *left_el; 3361 3362 *ret_right_path = NULL; 3363 3364 /* This function shouldn't be called for non-trees. */ 3365 BUG_ON(left_path->p_tree_depth == 0); 3366 3367 left_el = path_leaf_el(left_path); 3368 BUG_ON(left_el->l_next_free_rec != left_el->l_count); 3369 3370 ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), 3371 left_path, &right_cpos); 3372 if (ret) { 3373 mlog_errno(ret); 3374 goto out; 3375 } 3376 3377 /* This function shouldn't be called for the rightmost leaf. */ 3378 BUG_ON(right_cpos == 0); 3379 3380 right_path = ocfs2_new_path_from_path(left_path); 3381 if (!right_path) { 3382 ret = -ENOMEM; 3383 mlog_errno(ret); 3384 goto out; 3385 } 3386 3387 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); 3388 if (ret) { 3389 mlog_errno(ret); 3390 goto out; 3391 } 3392 3393 *ret_right_path = right_path; 3394 out: 3395 if (ret) 3396 ocfs2_free_path(right_path); 3397 return ret; 3398 } 3399 3400 /* 3401 * Remove split_rec clusters from the record at index and merge them 3402 * onto the beginning of the record "next" to it. 3403 * For index < l_count - 1, the next means the extent rec at index + 1. 3404 * For index == l_count - 1, the "next" means the 1st extent rec of the 3405 * next extent block. 
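 *
 * A worked example with hypothetical numbers: if the record at index
 * covers cpos 10 for 8 clusters, its neighbor starts at cpos 18, and
 * split_rec covers the last 3 of those 8 clusters, then after the
 * merge the record at index covers 5 clusters and the neighbor
 * starts at cpos 15, with its e_blkno pulled back by 3 clusters'
 * worth of blocks.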
3406 */ 3407 static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, 3408 handle_t *handle, 3409 struct ocfs2_extent_tree *et, 3410 struct ocfs2_extent_rec *split_rec, 3411 int index) 3412 { 3413 int ret, next_free, i; 3414 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3415 struct ocfs2_extent_rec *left_rec; 3416 struct ocfs2_extent_rec *right_rec; 3417 struct ocfs2_extent_list *right_el; 3418 struct ocfs2_path *right_path = NULL; 3419 int subtree_index = 0; 3420 struct ocfs2_extent_list *el = path_leaf_el(left_path); 3421 struct buffer_head *bh = path_leaf_bh(left_path); 3422 struct buffer_head *root_bh = NULL; 3423 3424 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 3425 left_rec = &el->l_recs[index]; 3426 3427 if (index == le16_to_cpu(el->l_next_free_rec) - 1 && 3428 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { 3429 /* we meet with a cross extent block merge. */ 3430 ret = ocfs2_get_right_path(et, left_path, &right_path); 3431 if (ret) { 3432 mlog_errno(ret); 3433 goto out; 3434 } 3435 3436 right_el = path_leaf_el(right_path); 3437 next_free = le16_to_cpu(right_el->l_next_free_rec); 3438 BUG_ON(next_free <= 0); 3439 right_rec = &right_el->l_recs[0]; 3440 if (ocfs2_is_empty_extent(right_rec)) { 3441 BUG_ON(next_free <= 1); 3442 right_rec = &right_el->l_recs[1]; 3443 } 3444 3445 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3446 le16_to_cpu(left_rec->e_leaf_clusters) != 3447 le32_to_cpu(right_rec->e_cpos)); 3448 3449 subtree_index = ocfs2_find_subtree_root(et, left_path, 3450 right_path); 3451 3452 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3453 handle->h_buffer_credits, 3454 right_path); 3455 if (ret) { 3456 mlog_errno(ret); 3457 goto out; 3458 } 3459 3460 root_bh = left_path->p_node[subtree_index].bh; 3461 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3462 3463 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, 3464 subtree_index); 3465 if (ret) { 3466 mlog_errno(ret); 3467 goto out; 3468 } 3469 3470 for (i = subtree_index + 1; 3471 i < path_num_items(right_path); i++) { 3472 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 3473 right_path, i); 3474 if (ret) { 3475 mlog_errno(ret); 3476 goto out; 3477 } 3478 3479 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 3480 left_path, i); 3481 if (ret) { 3482 mlog_errno(ret); 3483 goto out; 3484 } 3485 } 3486 3487 } else { 3488 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); 3489 right_rec = &el->l_recs[index + 1]; 3490 } 3491 3492 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, 3493 path_num_items(left_path) - 1); 3494 if (ret) { 3495 mlog_errno(ret); 3496 goto out; 3497 } 3498 3499 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); 3500 3501 le32_add_cpu(&right_rec->e_cpos, -split_clusters); 3502 le64_add_cpu(&right_rec->e_blkno, 3503 -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), 3504 split_clusters)); 3505 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); 3506 3507 ocfs2_cleanup_merge(el, index); 3508 3509 ret = ocfs2_journal_dirty(handle, bh); 3510 if (ret) 3511 mlog_errno(ret); 3512 3513 if (right_path) { 3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3515 if (ret) 3516 mlog_errno(ret); 3517 3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3519 subtree_index); 3520 } 3521 out: 3522 if (right_path) 3523 ocfs2_free_path(right_path); 3524 return ret; 3525 } 3526 3527 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, 3528 struct 
ocfs2_path *right_path, 3529 struct ocfs2_path **ret_left_path) 3530 { 3531 int ret; 3532 u32 left_cpos; 3533 struct ocfs2_path *left_path = NULL; 3534 3535 *ret_left_path = NULL; 3536 3537 /* This function shouldn't be called for non-trees. */ 3538 BUG_ON(right_path->p_tree_depth == 0); 3539 3540 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), 3541 right_path, &left_cpos); 3542 if (ret) { 3543 mlog_errno(ret); 3544 goto out; 3545 } 3546 3547 /* This function shouldn't be called for the leftmost leaf. */ 3548 BUG_ON(left_cpos == 0); 3549 3550 left_path = ocfs2_new_path_from_path(right_path); 3551 if (!left_path) { 3552 ret = -ENOMEM; 3553 mlog_errno(ret); 3554 goto out; 3555 } 3556 3557 ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); 3558 if (ret) { 3559 mlog_errno(ret); 3560 goto out; 3561 } 3562 3563 *ret_left_path = left_path; 3564 out: 3565 if (ret) 3566 ocfs2_free_path(left_path); 3567 return ret; 3568 } 3569 3570 /* 3571 * Remove split_rec clusters from the record at index and merge them 3572 * onto the tail of the record "before" it. 3573 * For index > 0, the "before" means the extent rec at index - 1. 3574 * 3575 * For index == 0, the "before" means the last record of the previous 3576 * extent block. And there is also a situation that we may need to 3577 * remove the rightmost leaf extent block in the right_path and change 3578 * the right path to indicate the new rightmost path. 3579 */ 3580 static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, 3581 handle_t *handle, 3582 struct ocfs2_extent_tree *et, 3583 struct ocfs2_extent_rec *split_rec, 3584 struct ocfs2_cached_dealloc_ctxt *dealloc, 3585 int index) 3586 { 3587 int ret, i, subtree_index = 0, has_empty_extent = 0; 3588 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3589 struct ocfs2_extent_rec *left_rec; 3590 struct ocfs2_extent_rec *right_rec; 3591 struct ocfs2_extent_list *el = path_leaf_el(right_path); 3592 struct buffer_head *bh = path_leaf_bh(right_path); 3593 struct buffer_head *root_bh = NULL; 3594 struct ocfs2_path *left_path = NULL; 3595 struct ocfs2_extent_list *left_el; 3596 3597 BUG_ON(index < 0); 3598 3599 right_rec = &el->l_recs[index]; 3600 if (index == 0) { 3601 /* we meet with a cross extent block merge. 
*/ 3602 ret = ocfs2_get_left_path(et, right_path, &left_path); 3603 if (ret) { 3604 mlog_errno(ret); 3605 goto out; 3606 } 3607 3608 left_el = path_leaf_el(left_path); 3609 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != 3610 le16_to_cpu(left_el->l_count)); 3611 3612 left_rec = &left_el->l_recs[ 3613 le16_to_cpu(left_el->l_next_free_rec) - 1]; 3614 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3615 le16_to_cpu(left_rec->e_leaf_clusters) != 3616 le32_to_cpu(split_rec->e_cpos)); 3617 3618 subtree_index = ocfs2_find_subtree_root(et, left_path, 3619 right_path); 3620 3621 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3622 handle->h_buffer_credits, 3623 left_path); 3624 if (ret) { 3625 mlog_errno(ret); 3626 goto out; 3627 } 3628 3629 root_bh = left_path->p_node[subtree_index].bh; 3630 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3631 3632 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, 3633 subtree_index); 3634 if (ret) { 3635 mlog_errno(ret); 3636 goto out; 3637 } 3638 3639 for (i = subtree_index + 1; 3640 i < path_num_items(right_path); i++) { 3641 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 3642 right_path, i); 3643 if (ret) { 3644 mlog_errno(ret); 3645 goto out; 3646 } 3647 3648 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, 3649 left_path, i); 3650 if (ret) { 3651 mlog_errno(ret); 3652 goto out; 3653 } 3654 } 3655 } else { 3656 left_rec = &el->l_recs[index - 1]; 3657 if (ocfs2_is_empty_extent(&el->l_recs[0])) 3658 has_empty_extent = 1; 3659 } 3660 3661 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, 3662 path_num_items(right_path) - 1); 3663 if (ret) { 3664 mlog_errno(ret); 3665 goto out; 3666 } 3667 3668 if (has_empty_extent && index == 1) { 3669 /* 3670 * The easy case - we can just plop the record right in. 3671 */ 3672 *left_rec = *split_rec; 3673 3674 has_empty_extent = 0; 3675 } else 3676 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3677 3678 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3679 le64_add_cpu(&right_rec->e_blkno, 3680 ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), 3681 split_clusters)); 3682 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); 3683 3684 ocfs2_cleanup_merge(el, index); 3685 3686 ret = ocfs2_journal_dirty(handle, bh); 3687 if (ret) 3688 mlog_errno(ret); 3689 3690 if (left_path) { 3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3692 if (ret) 3693 mlog_errno(ret); 3694 3695 /* 3696 * In the situation that the right_rec is empty and the extent 3697 * block is empty also, ocfs2_complete_edge_insert can't handle 3698 * it and we need to delete the right extent block. 3699 */ 3700 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3701 le16_to_cpu(el->l_next_free_rec) == 1) { 3702 3703 ret = ocfs2_remove_rightmost_path(handle, et, 3704 right_path, 3705 dealloc); 3706 if (ret) { 3707 mlog_errno(ret); 3708 goto out; 3709 } 3710 3711 /* Now the rightmost extent block has been deleted. 3712 * So we use the new rightmost path. 
3713 */ 3714 ocfs2_mv_path(right_path, left_path); 3715 left_path = NULL; 3716 } else 3717 ocfs2_complete_edge_insert(handle, left_path, 3718 right_path, subtree_index); 3719 } 3720 out: 3721 if (left_path) 3722 ocfs2_free_path(left_path); 3723 return ret; 3724 } 3725 3726 static int ocfs2_try_to_merge_extent(handle_t *handle, 3727 struct ocfs2_extent_tree *et, 3728 struct ocfs2_path *path, 3729 int split_index, 3730 struct ocfs2_extent_rec *split_rec, 3731 struct ocfs2_cached_dealloc_ctxt *dealloc, 3732 struct ocfs2_merge_ctxt *ctxt) 3733 { 3734 int ret = 0; 3735 struct ocfs2_extent_list *el = path_leaf_el(path); 3736 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3737 3738 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3739 3740 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3741 /* 3742 * The merge code will need to create an empty 3743 * extent to take the place of the newly 3744 * emptied slot. Remove any pre-existing empty 3745 * extents - having more than one in a leaf is 3746 * illegal. 3747 */ 3748 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3749 if (ret) { 3750 mlog_errno(ret); 3751 goto out; 3752 } 3753 split_index--; 3754 rec = &el->l_recs[split_index]; 3755 } 3756 3757 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { 3758 /* 3759 * Left-right contig implies this. 3760 */ 3761 BUG_ON(!ctxt->c_split_covers_rec); 3762 3763 /* 3764 * Since the leftright insert always covers the entire 3765 * extent, this call will delete the insert record 3766 * entirely, resulting in an empty extent record added to 3767 * the extent block. 3768 * 3769 * Since the adding of an empty extent shifts 3770 * everything back to the right, there's no need to 3771 * update split_index here. 3772 * 3773 * When the split_index is zero, we need to merge it to the 3774 * prevoius extent block. It is more efficient and easier 3775 * if we do merge_right first and merge_left later. 3776 */ 3777 ret = ocfs2_merge_rec_right(path, handle, et, split_rec, 3778 split_index); 3779 if (ret) { 3780 mlog_errno(ret); 3781 goto out; 3782 } 3783 3784 /* 3785 * We can only get this from logic error above. 3786 */ 3787 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3788 3789 /* The merge left us with an empty extent, remove it. */ 3790 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3791 if (ret) { 3792 mlog_errno(ret); 3793 goto out; 3794 } 3795 3796 rec = &el->l_recs[split_index]; 3797 3798 /* 3799 * Note that we don't pass split_rec here on purpose - 3800 * we've merged it into the rec already. 3801 */ 3802 ret = ocfs2_merge_rec_left(path, handle, et, rec, 3803 dealloc, split_index); 3804 3805 if (ret) { 3806 mlog_errno(ret); 3807 goto out; 3808 } 3809 3810 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 3811 /* 3812 * Error from this last rotate is not critical, so 3813 * print but don't bubble it up. 3814 */ 3815 if (ret) 3816 mlog_errno(ret); 3817 ret = 0; 3818 } else { 3819 /* 3820 * Merge a record to the left or right. 3821 * 3822 * 'contig_type' is relative to the existing record, 3823 * so for example, if we're "right contig", it's to 3824 * the record on the left (hence the left merge). 
3825 */ 3826 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3827 ret = ocfs2_merge_rec_left(path, handle, et, 3828 split_rec, dealloc, 3829 split_index); 3830 if (ret) { 3831 mlog_errno(ret); 3832 goto out; 3833 } 3834 } else { 3835 ret = ocfs2_merge_rec_right(path, handle, 3836 et, split_rec, 3837 split_index); 3838 if (ret) { 3839 mlog_errno(ret); 3840 goto out; 3841 } 3842 } 3843 3844 if (ctxt->c_split_covers_rec) { 3845 /* 3846 * The merge may have left an empty extent in 3847 * our leaf. Try to rotate it away. 3848 */ 3849 ret = ocfs2_rotate_tree_left(handle, et, path, 3850 dealloc); 3851 if (ret) 3852 mlog_errno(ret); 3853 ret = 0; 3854 } 3855 } 3856 3857 out: 3858 return ret; 3859 } 3860 3861 static void ocfs2_subtract_from_rec(struct super_block *sb, 3862 enum ocfs2_split_type split, 3863 struct ocfs2_extent_rec *rec, 3864 struct ocfs2_extent_rec *split_rec) 3865 { 3866 u64 len_blocks; 3867 3868 len_blocks = ocfs2_clusters_to_blocks(sb, 3869 le16_to_cpu(split_rec->e_leaf_clusters)); 3870 3871 if (split == SPLIT_LEFT) { 3872 /* 3873 * Region is on the left edge of the existing 3874 * record. 3875 */ 3876 le32_add_cpu(&rec->e_cpos, 3877 le16_to_cpu(split_rec->e_leaf_clusters)); 3878 le64_add_cpu(&rec->e_blkno, len_blocks); 3879 le16_add_cpu(&rec->e_leaf_clusters, 3880 -le16_to_cpu(split_rec->e_leaf_clusters)); 3881 } else { 3882 /* 3883 * Region is on the right edge of the existing 3884 * record. 3885 */ 3886 le16_add_cpu(&rec->e_leaf_clusters, 3887 -le16_to_cpu(split_rec->e_leaf_clusters)); 3888 } 3889 } 3890 3891 /* 3892 * Do the final bits of extent record insertion at the target leaf 3893 * list. If this leaf is part of an allocation tree, it is assumed 3894 * that the tree above has been prepared. 3895 */ 3896 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, 3897 struct ocfs2_extent_rec *insert_rec, 3898 struct ocfs2_extent_list *el, 3899 struct ocfs2_insert_type *insert) 3900 { 3901 int i = insert->ins_contig_index; 3902 unsigned int range; 3903 struct ocfs2_extent_rec *rec; 3904 3905 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 3906 3907 if (insert->ins_split != SPLIT_NONE) { 3908 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); 3909 BUG_ON(i == -1); 3910 rec = &el->l_recs[i]; 3911 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), 3912 insert->ins_split, rec, 3913 insert_rec); 3914 goto rotate; 3915 } 3916 3917 /* 3918 * Contiguous insert - either left or right. 3919 */ 3920 if (insert->ins_contig != CONTIG_NONE) { 3921 rec = &el->l_recs[i]; 3922 if (insert->ins_contig == CONTIG_LEFT) { 3923 rec->e_blkno = insert_rec->e_blkno; 3924 rec->e_cpos = insert_rec->e_cpos; 3925 } 3926 le16_add_cpu(&rec->e_leaf_clusters, 3927 le16_to_cpu(insert_rec->e_leaf_clusters)); 3928 return; 3929 } 3930 3931 /* 3932 * Handle insert into an empty leaf. 3933 */ 3934 if (le16_to_cpu(el->l_next_free_rec) == 0 || 3935 ((le16_to_cpu(el->l_next_free_rec) == 1) && 3936 ocfs2_is_empty_extent(&el->l_recs[0]))) { 3937 el->l_recs[0] = *insert_rec; 3938 el->l_next_free_rec = cpu_to_le16(1); 3939 return; 3940 } 3941 3942 /* 3943 * Appending insert. 
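	 *
	 * The new record lands right after the current tail. With a
	 * hypothetical tail record covering cpos 40 for 10 clusters,
	 * only an insert_rec with e_cpos >= 50 is a legal append (the
	 * BUG_ON below enforces this).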
3944 	 */
3945 	if (insert->ins_appending == APPEND_TAIL) {
3946 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3947 		rec = &el->l_recs[i];
3948 		range = le32_to_cpu(rec->e_cpos)
3949 			+ le16_to_cpu(rec->e_leaf_clusters);
3950 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3951 
3952 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3953 				le16_to_cpu(el->l_count),
3954 				"owner %llu, depth %u, count %u, next free %u, "
3955 				"rec.cpos %u, rec.clusters %u, "
3956 				"insert.cpos %u, insert.clusters %u\n",
3957 				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3958 				le16_to_cpu(el->l_tree_depth),
3959 				le16_to_cpu(el->l_count),
3960 				le16_to_cpu(el->l_next_free_rec),
3961 				le32_to_cpu(el->l_recs[i].e_cpos),
3962 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3963 				le32_to_cpu(insert_rec->e_cpos),
3964 				le16_to_cpu(insert_rec->e_leaf_clusters));
3965 		i++;
3966 		el->l_recs[i] = *insert_rec;
3967 		le16_add_cpu(&el->l_next_free_rec, 1);
3968 		return;
3969 	}
3970 
3971 rotate:
3972 	/*
3973 	 * Ok, we have to rotate.
3974 	 *
3975 	 * At this point, it is safe to assume that inserting into an
3976 	 * empty leaf and appending to a leaf have both been handled
3977 	 * above.
3978 	 *
3979 	 * This leaf needs to have space, either by the empty 1st
3980 	 * extent record, or by virtue of an l_next_free_rec < l_count.
3981 	 */
3982 	ocfs2_rotate_leaf(el, insert_rec);
3983 }
3984 
3985 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3986 					   struct ocfs2_extent_tree *et,
3987 					   struct ocfs2_path *path,
3988 					   struct ocfs2_extent_rec *insert_rec)
3989 {
3990 	int ret, i, next_free;
3991 	struct buffer_head *bh;
3992 	struct ocfs2_extent_list *el;
3993 	struct ocfs2_extent_rec *rec;
3994 
3995 	/*
3996 	 * Update everything except the leaf block.
3997 	 */
3998 	for (i = 0; i < path->p_tree_depth; i++) {
3999 		bh = path->p_node[i].bh;
4000 		el = path->p_node[i].el;
4001 
4002 		next_free = le16_to_cpu(el->l_next_free_rec);
4003 		if (next_free == 0) {
4004 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4005 				    "Owner %llu has a bad extent list",
4006 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4007 			ret = -EIO;
4008 			return;
4009 		}
4010 
4011 		rec = &el->l_recs[next_free - 1];
4012 
4013 		rec->e_int_clusters = insert_rec->e_cpos;
4014 		le32_add_cpu(&rec->e_int_clusters,
4015 			     le16_to_cpu(insert_rec->e_leaf_clusters));
4016 		le32_add_cpu(&rec->e_int_clusters,
4017 			     -le32_to_cpu(rec->e_cpos));
4018 
4019 		ret = ocfs2_journal_dirty(handle, bh);
4020 		if (ret)
4021 			mlog_errno(ret);
4022 
4023 	}
4024 }
4025 
4026 static int ocfs2_append_rec_to_path(handle_t *handle,
4027 				    struct ocfs2_extent_tree *et,
4028 				    struct ocfs2_extent_rec *insert_rec,
4029 				    struct ocfs2_path *right_path,
4030 				    struct ocfs2_path **ret_left_path)
4031 {
4032 	int ret, next_free;
4033 	struct ocfs2_extent_list *el;
4034 	struct ocfs2_path *left_path = NULL;
4035 
4036 	*ret_left_path = NULL;
4037 
4038 	/*
4039 	 * This shouldn't happen for non-trees. The extent rec cluster
4040 	 * count manipulation below only works for interior nodes.
4041 	 */
4042 	BUG_ON(right_path->p_tree_depth == 0);
4043 
4044 	/*
4045 	 * If our appending insert is at the leftmost edge of a leaf,
4046 	 * then we might need to update the rightmost records of the
4047 	 * neighboring path.
4048 */ 4049 el = path_leaf_el(right_path); 4050 next_free = le16_to_cpu(el->l_next_free_rec); 4051 if (next_free == 0 || 4052 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 4053 u32 left_cpos; 4054 4055 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), 4056 right_path, &left_cpos); 4057 if (ret) { 4058 mlog_errno(ret); 4059 goto out; 4060 } 4061 4062 mlog(0, "Append may need a left path update. cpos: %u, " 4063 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 4064 left_cpos); 4065 4066 /* 4067 * No need to worry if the append is already in the 4068 * leftmost leaf. 4069 */ 4070 if (left_cpos) { 4071 left_path = ocfs2_new_path_from_path(right_path); 4072 if (!left_path) { 4073 ret = -ENOMEM; 4074 mlog_errno(ret); 4075 goto out; 4076 } 4077 4078 ret = ocfs2_find_path(et->et_ci, left_path, 4079 left_cpos); 4080 if (ret) { 4081 mlog_errno(ret); 4082 goto out; 4083 } 4084 4085 /* 4086 * ocfs2_insert_path() will pass the left_path to the 4087 * journal for us. 4088 */ 4089 } 4090 } 4091 4092 ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); 4093 if (ret) { 4094 mlog_errno(ret); 4095 goto out; 4096 } 4097 4098 ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); 4099 4100 *ret_left_path = left_path; 4101 ret = 0; 4102 out: 4103 if (ret != 0) 4104 ocfs2_free_path(left_path); 4105 4106 return ret; 4107 } 4108 4109 static void ocfs2_split_record(struct ocfs2_extent_tree *et, 4110 struct ocfs2_path *left_path, 4111 struct ocfs2_path *right_path, 4112 struct ocfs2_extent_rec *split_rec, 4113 enum ocfs2_split_type split) 4114 { 4115 int index; 4116 u32 cpos = le32_to_cpu(split_rec->e_cpos); 4117 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 4118 struct ocfs2_extent_rec *rec, *tmprec; 4119 4120 right_el = path_leaf_el(right_path); 4121 if (left_path) 4122 left_el = path_leaf_el(left_path); 4123 4124 el = right_el; 4125 insert_el = right_el; 4126 index = ocfs2_search_extent_list(el, cpos); 4127 if (index != -1) { 4128 if (index == 0 && left_path) { 4129 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 4130 4131 /* 4132 * This typically means that the record 4133 * started in the left path but moved to the 4134 * right as a result of rotation. We either 4135 * move the existing record to the left, or we 4136 * do the later insert there. 4137 * 4138 * In this case, the left path should always 4139 * exist as the rotate code will have passed 4140 * it back for a post-insert update. 4141 */ 4142 4143 if (split == SPLIT_LEFT) { 4144 /* 4145 * It's a left split. Since we know 4146 * that the rotate code gave us an 4147 * empty extent in the left path, we 4148 * can just do the insert there. 4149 */ 4150 insert_el = left_el; 4151 } else { 4152 /* 4153 * Right split - we have to move the 4154 * existing record over to the left 4155 * leaf. The insert will be into the 4156 * newly created empty extent in the 4157 * right leaf. 4158 */ 4159 tmprec = &right_el->l_recs[index]; 4160 ocfs2_rotate_leaf(left_el, tmprec); 4161 el = left_el; 4162 4163 memset(tmprec, 0, sizeof(*tmprec)); 4164 index = ocfs2_search_extent_list(left_el, cpos); 4165 BUG_ON(index == -1); 4166 } 4167 } 4168 } else { 4169 BUG_ON(!left_path); 4170 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); 4171 /* 4172 * Left path is easy - we can just allow the insert to 4173 * happen. 
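 *
 * A sketch with hypothetical values: splitting rec = (cpos 100,
 * 10 clusters) with split_rec = (cpos 100, 4 clusters) is a
 * SPLIT_LEFT; ocfs2_subtract_from_rec() below shrinks rec to
 * (cpos 104, 6 clusters) and ocfs2_rotate_leaf() then slots the
 * (cpos 100, 4 clusters) record into the empty extent.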
4174 */ 4175 el = left_el; 4176 insert_el = left_el; 4177 index = ocfs2_search_extent_list(el, cpos); 4178 BUG_ON(index == -1); 4179 } 4180 4181 rec = &el->l_recs[index]; 4182 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), 4183 split, rec, split_rec); 4184 ocfs2_rotate_leaf(insert_el, split_rec); 4185 } 4186 4187 /* 4188 * This function only does inserts on an allocation b-tree. For tree 4189 * depth = 0, ocfs2_insert_at_leaf() is called directly. 4190 * 4191 * right_path is the path we want to do the actual insert 4192 * in. left_path should only be passed in if we need to update that 4193 * portion of the tree after an edge insert. 4194 */ 4195 static int ocfs2_insert_path(handle_t *handle, 4196 struct ocfs2_extent_tree *et, 4197 struct ocfs2_path *left_path, 4198 struct ocfs2_path *right_path, 4199 struct ocfs2_extent_rec *insert_rec, 4200 struct ocfs2_insert_type *insert) 4201 { 4202 int ret, subtree_index; 4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4204 4205 if (left_path) { 4206 int credits = handle->h_buffer_credits; 4207 4208 /* 4209 * There's a chance that left_path got passed back to 4210 * us without being accounted for in the 4211 * journal. Extend our transaction here to be sure we 4212 * can change those blocks. 4213 */ 4214 credits += left_path->p_tree_depth; 4215 4216 ret = ocfs2_extend_trans(handle, credits); 4217 if (ret < 0) { 4218 mlog_errno(ret); 4219 goto out; 4220 } 4221 4222 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); 4223 if (ret < 0) { 4224 mlog_errno(ret); 4225 goto out; 4226 } 4227 } 4228 4229 /* 4230 * Pass both paths to the journal. The majority of inserts 4231 * will be touching all components anyway. 4232 */ 4233 ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); 4234 if (ret < 0) { 4235 mlog_errno(ret); 4236 goto out; 4237 } 4238 4239 if (insert->ins_split != SPLIT_NONE) { 4240 /* 4241 * We could call ocfs2_insert_at_leaf() for some types 4242 * of splits, but it's easier to just let one separate 4243 * function sort it all out. 4244 */ 4245 ocfs2_split_record(et, left_path, right_path, 4246 insert_rec, insert->ins_split); 4247 4248 /* 4249 * Split might have modified either leaf and we don't 4250 * have a guarantee that the later edge insert will 4251 * dirty this for us. 4252 */ 4253 if (left_path) 4254 ret = ocfs2_journal_dirty(handle, 4255 path_leaf_bh(left_path)); 4256 if (ret) 4257 mlog_errno(ret); 4258 } else 4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4260 insert); 4261 4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4263 if (ret) 4264 mlog_errno(ret); 4265 4266 if (left_path) { 4267 /* 4268 * The rotate code has indicated that we need to fix 4269 * up portions of the tree after the insert. 4270 * 4271 * XXX: Should we extend the transaction here? 
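 *
 * (For scale, assuming a hypothetical depth of 3: the
 * ocfs2_extend_trans() call above reserved three extra credits,
 * one per node of left_path, which is intended to cover the
 * blocks the edge fixup below re-dirties.)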
4272 */ 4273 subtree_index = ocfs2_find_subtree_root(et, left_path, 4274 right_path); 4275 ocfs2_complete_edge_insert(handle, left_path, right_path, 4276 subtree_index); 4277 } 4278 4279 ret = 0; 4280 out: 4281 return ret; 4282 } 4283 4284 static int ocfs2_do_insert_extent(handle_t *handle, 4285 struct ocfs2_extent_tree *et, 4286 struct ocfs2_extent_rec *insert_rec, 4287 struct ocfs2_insert_type *type) 4288 { 4289 int ret, rotate = 0; 4290 u32 cpos; 4291 struct ocfs2_path *right_path = NULL; 4292 struct ocfs2_path *left_path = NULL; 4293 struct ocfs2_extent_list *el; 4294 4295 el = et->et_root_el; 4296 4297 ret = ocfs2_et_root_journal_access(handle, et, 4298 OCFS2_JOURNAL_ACCESS_WRITE); 4299 if (ret) { 4300 mlog_errno(ret); 4301 goto out; 4302 } 4303 4304 if (le16_to_cpu(el->l_tree_depth) == 0) { 4305 ocfs2_insert_at_leaf(et, insert_rec, el, type); 4306 goto out_update_clusters; 4307 } 4308 4309 right_path = ocfs2_new_path_from_et(et); 4310 if (!right_path) { 4311 ret = -ENOMEM; 4312 mlog_errno(ret); 4313 goto out; 4314 } 4315 4316 /* 4317 * Determine the path to start with. Rotations need the 4318 * rightmost path; everything else can go directly to the 4319 * target leaf. 4320 */ 4321 cpos = le32_to_cpu(insert_rec->e_cpos); 4322 if (type->ins_appending == APPEND_NONE && 4323 type->ins_contig == CONTIG_NONE) { 4324 rotate = 1; 4325 cpos = UINT_MAX; 4326 } 4327 4328 ret = ocfs2_find_path(et->et_ci, right_path, cpos); 4329 if (ret) { 4330 mlog_errno(ret); 4331 goto out; 4332 } 4333 4334 /* 4335 * Rotations and appends need special treatment - they modify 4336 * parts of the tree above them. 4337 * 4338 * Both might pass back a path immediately to the left of the 4339 * one being inserted to. This will cause 4340 * ocfs2_insert_path() to modify the rightmost records of 4341 * left_path to account for an edge insert. 4342 * 4343 * XXX: When modifying this code, keep in mind that an insert 4344 * can wind up skipping both of these two special cases... 4345 */ 4346 if (rotate) { 4347 ret = ocfs2_rotate_tree_right(handle, et, type->ins_split, 4348 le32_to_cpu(insert_rec->e_cpos), 4349 right_path, &left_path); 4350 if (ret) { 4351 mlog_errno(ret); 4352 goto out; 4353 } 4354 4355 /* 4356 * ocfs2_rotate_tree_right() might have extended the 4357 * transaction without re-journaling our tree root.
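 *
 * (ocfs2_extend_trans() may restart the handle when it cannot
 * extend it, which commits buffers we had already declared
 * access to; re-asking for write access below keeps our root
 * journal access valid in that case.)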
4358 */ 4359 ret = ocfs2_et_root_journal_access(handle, et, 4360 OCFS2_JOURNAL_ACCESS_WRITE); 4361 if (ret) { 4362 mlog_errno(ret); 4363 goto out; 4364 } 4365 } else if (type->ins_appending == APPEND_TAIL 4366 && type->ins_contig != CONTIG_LEFT) { 4367 ret = ocfs2_append_rec_to_path(handle, et, insert_rec, 4368 right_path, &left_path); 4369 if (ret) { 4370 mlog_errno(ret); 4371 goto out; 4372 } 4373 } 4374 4375 ret = ocfs2_insert_path(handle, et, left_path, right_path, 4376 insert_rec, type); 4377 if (ret) { 4378 mlog_errno(ret); 4379 goto out; 4380 } 4381 4382 out_update_clusters: 4383 if (type->ins_split == SPLIT_NONE) 4384 ocfs2_et_update_clusters(et, 4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4386 4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4388 if (ret) 4389 mlog_errno(ret); 4390 4391 out: 4392 ocfs2_free_path(left_path); 4393 ocfs2_free_path(right_path); 4394 4395 return ret; 4396 } 4397 4398 static enum ocfs2_contig_type 4399 ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, 4400 struct ocfs2_path *path, 4401 struct ocfs2_extent_list *el, int index, 4402 struct ocfs2_extent_rec *split_rec) 4403 { 4404 int status; 4405 enum ocfs2_contig_type ret = CONTIG_NONE; 4406 u32 left_cpos, right_cpos; 4407 struct ocfs2_extent_rec *rec = NULL; 4408 struct ocfs2_extent_list *new_el; 4409 struct ocfs2_path *left_path = NULL, *right_path = NULL; 4410 struct buffer_head *bh; 4411 struct ocfs2_extent_block *eb; 4412 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 4413 4414 if (index > 0) { 4415 rec = &el->l_recs[index - 1]; 4416 } else if (path->p_tree_depth > 0) { 4417 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); 4418 if (status) 4419 goto out; 4420 4421 if (left_cpos != 0) { 4422 left_path = ocfs2_new_path_from_path(path); 4423 if (!left_path) 4424 goto out; 4425 4426 status = ocfs2_find_path(et->et_ci, left_path, 4427 left_cpos); 4428 if (status) 4429 goto out; 4430 4431 new_el = path_leaf_el(left_path); 4432 4433 if (le16_to_cpu(new_el->l_next_free_rec) != 4434 le16_to_cpu(new_el->l_count)) { 4435 bh = path_leaf_bh(left_path); 4436 eb = (struct ocfs2_extent_block *)bh->b_data; 4437 ocfs2_error(sb, 4438 "Extent block #%llu has an " 4439 "invalid l_next_free_rec of " 4440 "%d. It should have " 4441 "matched the l_count of %d", 4442 (unsigned long long)le64_to_cpu(eb->h_blkno), 4443 le16_to_cpu(new_el->l_next_free_rec), 4444 le16_to_cpu(new_el->l_count)); 4445 status = -EINVAL; 4446 goto out; 4447 } 4448 rec = &new_el->l_recs[ 4449 le16_to_cpu(new_el->l_next_free_rec) - 1]; 4450 } 4451 } 4452 4453 /* 4454 * We're careful to check for an empty extent record here - 4455 * the merge code will know what to do if it sees one. 
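 *
 * As an example (hypothetical leaf): with the split at index 1,
 * l_recs[0] empty, and split_rec->e_cpos equal to
 * l_recs[1].e_cpos, the branch below reports CONTIG_RIGHT, which
 * ocfs2_try_to_merge_extent() services with
 * ocfs2_merge_rec_left().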
4456 */ 4457 if (rec) { 4458 if (index == 1 && ocfs2_is_empty_extent(rec)) { 4459 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 4460 ret = CONTIG_RIGHT; 4461 } else { 4462 ret = ocfs2_et_extent_contig(et, rec, split_rec); 4463 } 4464 } 4465 4466 rec = NULL; 4467 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) 4468 rec = &el->l_recs[index + 1]; 4469 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4470 path->p_tree_depth > 0) { 4471 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); 4472 if (status) 4473 goto out; 4474 4475 if (right_cpos == 0) 4476 goto out; 4477 4478 right_path = ocfs2_new_path_from_path(path); 4479 if (!right_path) 4480 goto out; 4481 4482 status = ocfs2_find_path(et->et_ci, right_path, right_cpos); 4483 if (status) 4484 goto out; 4485 4486 new_el = path_leaf_el(right_path); 4487 rec = &new_el->l_recs[0]; 4488 if (ocfs2_is_empty_extent(rec)) { 4489 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4490 bh = path_leaf_bh(right_path); 4491 eb = (struct ocfs2_extent_block *)bh->b_data; 4492 ocfs2_error(sb, 4493 "Extent block #%llu has an " 4494 "invalid l_next_free_rec of %d", 4495 (unsigned long long)le64_to_cpu(eb->h_blkno), 4496 le16_to_cpu(new_el->l_next_free_rec)); 4497 status = -EINVAL; 4498 goto out; 4499 } 4500 rec = &new_el->l_recs[1]; 4501 } 4502 } 4503 4504 if (rec) { 4505 enum ocfs2_contig_type contig_type; 4506 4507 contig_type = ocfs2_et_extent_contig(et, rec, split_rec); 4508 4509 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4510 ret = CONTIG_LEFTRIGHT; 4511 else if (ret == CONTIG_NONE) 4512 ret = contig_type; 4513 } 4514 4515 out: 4516 if (left_path) 4517 ocfs2_free_path(left_path); 4518 if (right_path) 4519 ocfs2_free_path(right_path); 4520 4521 return ret; 4522 } 4523 4524 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, 4525 struct ocfs2_insert_type *insert, 4526 struct ocfs2_extent_list *el, 4527 struct ocfs2_extent_rec *insert_rec) 4528 { 4529 int i; 4530 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4531 4532 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4533 4534 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4535 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i], 4536 insert_rec); 4537 if (contig_type != CONTIG_NONE) { 4538 insert->ins_contig_index = i; 4539 break; 4540 } 4541 } 4542 insert->ins_contig = contig_type; 4543 4544 if (insert->ins_contig != CONTIG_NONE) { 4545 struct ocfs2_extent_rec *rec = 4546 &el->l_recs[insert->ins_contig_index]; 4547 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + 4548 le16_to_cpu(insert_rec->e_leaf_clusters); 4549 4550 /* 4551 * Caller might want us to limit the size of extents; don't 4552 * calculate contiguousness if we might exceed that limit. 4553 */ 4554 if (et->et_max_leaf_clusters && 4555 (len > et->et_max_leaf_clusters)) 4556 insert->ins_contig = CONTIG_NONE; 4557 } 4558 } 4559 4560 /* 4561 * This should only be called against the rightmost leaf extent list. 4562 * 4563 * ocfs2_figure_appending_type() will figure out whether we'll have to 4564 * insert at the tail of the rightmost leaf. 4565 * 4566 * This should also work against the root extent list for trees with 0 4567 * depth. If we consider the root extent list to be the rightmost leaf node 4568 * then the logic here makes sense.
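 *
 * Example with hypothetical values: if the last used record is
 * (e_cpos 50, e_leaf_clusters 10), an insert_rec at cpos 60 or
 * beyond yields APPEND_TAIL, while one at cpos 55 leaves
 * ins_appending at APPEND_NONE.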
4569 */ 4570 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4571 struct ocfs2_extent_list *el, 4572 struct ocfs2_extent_rec *insert_rec) 4573 { 4574 int i; 4575 u32 cpos = le32_to_cpu(insert_rec->e_cpos); 4576 struct ocfs2_extent_rec *rec; 4577 4578 insert->ins_appending = APPEND_NONE; 4579 4580 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4581 4582 if (!el->l_next_free_rec) 4583 goto set_tail_append; 4584 4585 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 4586 /* Were all records empty? */ 4587 if (le16_to_cpu(el->l_next_free_rec) == 1) 4588 goto set_tail_append; 4589 } 4590 4591 i = le16_to_cpu(el->l_next_free_rec) - 1; 4592 rec = &el->l_recs[i]; 4593 4594 if (cpos >= 4595 (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 4596 goto set_tail_append; 4597 4598 return; 4599 4600 set_tail_append: 4601 insert->ins_appending = APPEND_TAIL; 4602 } 4603 4604 /* 4605 * Helper function called at the beginning of an insert. 4606 * 4607 * This computes a few things that are commonly used in the process of 4608 * inserting into the btree: 4609 * - Whether the new extent is contiguous with an existing one. 4610 * - The current tree depth. 4611 * - Whether the insert is an appending one. 4612 * - The total # of free records in the tree. 4613 * 4614 * All of the information is stored on the ocfs2_insert_type 4615 * structure. 4616 */ 4617 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, 4618 struct buffer_head **last_eb_bh, 4619 struct ocfs2_extent_rec *insert_rec, 4620 int *free_records, 4621 struct ocfs2_insert_type *insert) 4622 { 4623 int ret; 4624 struct ocfs2_extent_block *eb; 4625 struct ocfs2_extent_list *el; 4626 struct ocfs2_path *path = NULL; 4627 struct buffer_head *bh = NULL; 4628 4629 insert->ins_split = SPLIT_NONE; 4630 4631 el = et->et_root_el; 4632 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4633 4634 if (el->l_tree_depth) { 4635 /* 4636 * If we have tree depth, we read in the 4637 * rightmost extent block ahead of time as 4638 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4639 * may want it later. 4640 */ 4641 ret = ocfs2_read_extent_block(et->et_ci, 4642 ocfs2_et_get_last_eb_blk(et), 4643 &bh); 4644 if (ret) { 4645 mlog_errno(ret); 4646 goto out; 4647 } 4648 eb = (struct ocfs2_extent_block *) bh->b_data; 4649 el = &eb->h_list; 4650 } 4651 4652 /* 4653 * Unless we have a contiguous insert, we'll need to know if 4654 * there is room left in our allocation tree for another 4655 * extent record. 4656 * 4657 * XXX: This test is simplistic; we can search for empty 4658 * extent records too. 4659 */ 4660 *free_records = le16_to_cpu(el->l_count) - 4661 le16_to_cpu(el->l_next_free_rec); 4662 4663 if (!insert->ins_tree_depth) { 4664 ocfs2_figure_contig_type(et, insert, el, insert_rec); 4665 ocfs2_figure_appending_type(insert, el, insert_rec); 4666 return 0; 4667 } 4668 4669 path = ocfs2_new_path_from_et(et); 4670 if (!path) { 4671 ret = -ENOMEM; 4672 mlog_errno(ret); 4673 goto out; 4674 } 4675 4676 /* 4677 * In the case that we're inserting past what the tree 4678 * currently accounts for, ocfs2_find_path() will return for 4679 * us the rightmost tree path. This is accounted for below in 4680 * the appending code.
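 *
 * (A cpos beyond every existing record compares past each
 * interior range, so the descent just follows the last record at
 * every level and lands in the rightmost leaf - exactly what the
 * last_eb comparison further down detects.)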
4681 */ 4682 ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); 4683 if (ret) { 4684 mlog_errno(ret); 4685 goto out; 4686 } 4687 4688 el = path_leaf_el(path); 4689 4690 /* 4691 * Now that we have the path, there are two things we want to determine: 4692 * 1) Contiguousness (also set contig_index if this is so) 4693 * 4694 * 2) Are we doing an append? We can trivially break this up 4695 * into two types of appends: simple record append, or a 4696 * rotate inside the tail leaf. 4697 */ 4698 ocfs2_figure_contig_type(et, insert, el, insert_rec); 4699 4700 /* 4701 * The insert code isn't quite ready to deal with all cases of 4702 * left contiguousness. Specifically, if it's an insert into 4703 * the 1st record in a leaf, it will require the adjustment of 4704 * cluster count on the last record of the path directly to its 4705 * left. For now, just catch that case and fool the layers 4706 * above us. This works just fine for tree_depth == 0, which 4707 * is why we allow that above. 4708 */ 4709 if (insert->ins_contig == CONTIG_LEFT && 4710 insert->ins_contig_index == 0) 4711 insert->ins_contig = CONTIG_NONE; 4712 4713 /* 4714 * Ok, so we can simply compare against last_eb to figure out 4715 * whether this cpos lies past every existing leaf. This will only 4716 * happen in the case that we're doing a tail append, so maybe we can 4717 * take advantage of that information somehow. 4718 */ 4719 if (ocfs2_et_get_last_eb_blk(et) == 4720 path_leaf_bh(path)->b_blocknr) { 4721 /* 4722 * Ok, ocfs2_find_path() returned us the rightmost 4723 * tree path. This might be an appending insert. There are 4724 * two cases: 4725 * 1) We're doing a true append at the tail: 4726 * - This might even be off the end of the leaf 4727 * 2) We're "appending" by rotating in the tail 4728 */ 4729 ocfs2_figure_appending_type(insert, el, insert_rec); 4730 } 4731 4732 out: 4733 ocfs2_free_path(path); 4734 4735 if (ret == 0) 4736 *last_eb_bh = bh; 4737 else 4738 brelse(bh); 4739 return ret; 4740 } 4741 4742 /* 4743 * Insert an extent into a btree. 4744 * 4745 * The caller needs to update the owning btree's cluster count.
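 *
 * A minimal caller-side sketch (argument names are illustrative;
 * see ocfs2_add_clusters_in_btree() below for a real caller):
 *
 *	status = ocfs2_insert_extent(handle, et, logical_cpos,
 *				     blkno, num_clusters, 0, meta_ac);
 *	if (status < 0)
 *		mlog_errno(status);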
4746 */ 4747 int ocfs2_insert_extent(handle_t *handle, 4748 struct ocfs2_extent_tree *et, 4749 u32 cpos, 4750 u64 start_blk, 4751 u32 new_clusters, 4752 u8 flags, 4753 struct ocfs2_alloc_context *meta_ac) 4754 { 4755 int status; 4756 int uninitialized_var(free_records); 4757 struct buffer_head *last_eb_bh = NULL; 4758 struct ocfs2_insert_type insert = {0, }; 4759 struct ocfs2_extent_rec rec; 4760 4761 mlog(0, "add %u clusters at position %u to owner %llu\n", 4762 new_clusters, cpos, 4763 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 4764 4765 memset(&rec, 0, sizeof(rec)); 4766 rec.e_cpos = cpu_to_le32(cpos); 4767 rec.e_blkno = cpu_to_le64(start_blk); 4768 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4769 rec.e_flags = flags; 4770 status = ocfs2_et_insert_check(et, &rec); 4771 if (status) { 4772 mlog_errno(status); 4773 goto bail; 4774 } 4775 4776 status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec, 4777 &free_records, &insert); 4778 if (status < 0) { 4779 mlog_errno(status); 4780 goto bail; 4781 } 4782 4783 mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 4784 "Insert.contig_index: %d, Insert.free_records: %d, " 4785 "Insert.tree_depth: %d\n", 4786 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 4787 free_records, insert.ins_tree_depth); 4788 4789 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4790 status = ocfs2_grow_tree(handle, et, 4791 &insert.ins_tree_depth, &last_eb_bh, 4792 meta_ac); 4793 if (status) { 4794 mlog_errno(status); 4795 goto bail; 4796 } 4797 } 4798 4799 /* Finally, we can add clusters. This might rotate the tree for us. */ 4800 status = ocfs2_do_insert_extent(handle, et, &rec, &insert); 4801 if (status < 0) 4802 mlog_errno(status); 4803 else 4804 ocfs2_et_extent_map_insert(et, &rec); 4805 4806 bail: 4807 brelse(last_eb_bh); 4808 4809 mlog_exit(status); 4810 return status; 4811 } 4812 4813 /* 4814 * Allocate and add clusters into the extent b-tree. 4815 * The new clusters (clusters_to_add) will be inserted at logical_offset. 4816 * The extent b-tree's root is specified by et, and 4817 * it is not limited to file storage. Any extent tree can use this 4818 * function if it implements the proper ocfs2_extent_tree. 4819 */ 4820 int ocfs2_add_clusters_in_btree(handle_t *handle, 4821 struct ocfs2_extent_tree *et, 4822 u32 *logical_offset, 4823 u32 clusters_to_add, 4824 int mark_unwritten, 4825 struct ocfs2_alloc_context *data_ac, 4826 struct ocfs2_alloc_context *meta_ac, 4827 enum ocfs2_alloc_restarted *reason_ret) 4828 { 4829 int status = 0; 4830 int free_extents; 4831 enum ocfs2_alloc_restarted reason = RESTART_NONE; 4832 u32 bit_off, num_bits; 4833 u64 block; 4834 u8 flags = 0; 4835 struct ocfs2_super *osb = 4836 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 4837 4838 BUG_ON(!clusters_to_add); 4839 4840 if (mark_unwritten) 4841 flags = OCFS2_EXT_UNWRITTEN; 4842 4843 free_extents = ocfs2_num_free_extents(osb, et); 4844 if (free_extents < 0) { 4845 status = free_extents; 4846 mlog_errno(status); 4847 goto leave; 4848 } 4849 4850 /* there are two cases which could cause us to EAGAIN in the 4851 * we-need-more-metadata case: 4852 * 1) we haven't reserved *any* 4853 * 2) we are so fragmented, we've needed to add metadata too 4854 * many times.
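 *
 * (Both cases surface below as -EAGAIN with reason set to
 * RESTART_META, telling the caller to reserve more metadata and
 * retry rather than treating the condition as a hard failure.)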
*/ 4855 if (!free_extents && !meta_ac) { 4856 mlog(0, "we haven't reserved any metadata!\n"); 4857 status = -EAGAIN; 4858 reason = RESTART_META; 4859 goto leave; 4860 } else if ((!free_extents) 4861 && (ocfs2_alloc_context_bits_left(meta_ac) 4862 < ocfs2_extend_meta_needed(et->et_root_el))) { 4863 mlog(0, "filesystem is really fragmented...\n"); 4864 status = -EAGAIN; 4865 reason = RESTART_META; 4866 goto leave; 4867 } 4868 4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4870 clusters_to_add, &bit_off, &num_bits); 4871 if (status < 0) { 4872 if (status != -ENOSPC) 4873 mlog_errno(status); 4874 goto leave; 4875 } 4876 4877 BUG_ON(num_bits > clusters_to_add); 4878 4879 /* reserve our write early -- insert_extent may update the tree root */ 4880 status = ocfs2_et_root_journal_access(handle, et, 4881 OCFS2_JOURNAL_ACCESS_WRITE); 4882 if (status < 0) { 4883 mlog_errno(status); 4884 goto leave; 4885 } 4886 4887 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4888 mlog(0, "Allocating %u clusters at block %u for owner %llu\n", 4889 num_bits, bit_off, 4890 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 4891 status = ocfs2_insert_extent(handle, et, *logical_offset, block, 4892 num_bits, flags, meta_ac); 4893 if (status < 0) { 4894 mlog_errno(status); 4895 goto leave; 4896 } 4897 4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4899 if (status < 0) { 4900 mlog_errno(status); 4901 goto leave; 4902 } 4903 4904 clusters_to_add -= num_bits; 4905 *logical_offset += num_bits; 4906 4907 if (clusters_to_add) { 4908 mlog(0, "need to alloc once more, wanted = %u\n", 4909 clusters_to_add); 4910 status = -EAGAIN; 4911 reason = RESTART_TRANS; 4912 } 4913 4914 leave: 4915 mlog_exit(status); 4916 if (reason_ret) 4917 *reason_ret = reason; 4918 return status; 4919 } 4920 4921 static void ocfs2_make_right_split_rec(struct super_block *sb, 4922 struct ocfs2_extent_rec *split_rec, 4923 u32 cpos, 4924 struct ocfs2_extent_rec *rec) 4925 { 4926 u32 rec_cpos = le32_to_cpu(rec->e_cpos); 4927 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); 4928 4929 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4930 4931 split_rec->e_cpos = cpu_to_le32(cpos); 4932 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); 4933 4934 split_rec->e_blkno = rec->e_blkno; 4935 le64_add_cpu(&split_rec->e_blkno, 4936 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); 4937 4938 split_rec->e_flags = rec->e_flags; 4939 } 4940 4941 static int ocfs2_split_and_insert(handle_t *handle, 4942 struct ocfs2_extent_tree *et, 4943 struct ocfs2_path *path, 4944 struct buffer_head **last_eb_bh, 4945 int split_index, 4946 struct ocfs2_extent_rec *orig_split_rec, 4947 struct ocfs2_alloc_context *meta_ac) 4948 { 4949 int ret = 0, depth; 4950 unsigned int insert_range, rec_range, do_leftright = 0; 4951 struct ocfs2_extent_rec tmprec; 4952 struct ocfs2_extent_list *rightmost_el; 4953 struct ocfs2_extent_rec rec; 4954 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4955 struct ocfs2_insert_type insert; 4956 struct ocfs2_extent_block *eb; 4957 4958 leftright: 4959 /* 4960 * Store a copy of the record on the stack - it might move 4961 * around as the tree is manipulated below. 
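 *
 * (The copy matters: path_leaf_el(path) can point into a
 * different block after ocfs2_grow_tree(), and the second
 * leftright pass below re-finds the path from scratch.)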
4962 */ 4963 rec = path_leaf_el(path)->l_recs[split_index]; 4964 4965 rightmost_el = et->et_root_el; 4966 4967 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4968 if (depth) { 4969 BUG_ON(!(*last_eb_bh)); 4970 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 4971 rightmost_el = &eb->h_list; 4972 } 4973 4974 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4975 le16_to_cpu(rightmost_el->l_count)) { 4976 ret = ocfs2_grow_tree(handle, et, 4977 &depth, last_eb_bh, meta_ac); 4978 if (ret) { 4979 mlog_errno(ret); 4980 goto out; 4981 } 4982 } 4983 4984 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4985 insert.ins_appending = APPEND_NONE; 4986 insert.ins_contig = CONTIG_NONE; 4987 insert.ins_tree_depth = depth; 4988 4989 insert_range = le32_to_cpu(split_rec.e_cpos) + 4990 le16_to_cpu(split_rec.e_leaf_clusters); 4991 rec_range = le32_to_cpu(rec.e_cpos) + 4992 le16_to_cpu(rec.e_leaf_clusters); 4993 4994 if (split_rec.e_cpos == rec.e_cpos) { 4995 insert.ins_split = SPLIT_LEFT; 4996 } else if (insert_range == rec_range) { 4997 insert.ins_split = SPLIT_RIGHT; 4998 } else { 4999 /* 5000 * Left/right split. We fake this as a right split 5001 * first and then make a second pass as a left split. 5002 */ 5003 insert.ins_split = SPLIT_RIGHT; 5004 5005 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), 5006 &tmprec, insert_range, &rec); 5007 5008 split_rec = tmprec; 5009 5010 BUG_ON(do_leftright); 5011 do_leftright = 1; 5012 } 5013 5014 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); 5015 if (ret) { 5016 mlog_errno(ret); 5017 goto out; 5018 } 5019 5020 if (do_leftright == 1) { 5021 u32 cpos; 5022 struct ocfs2_extent_list *el; 5023 5024 do_leftright++; 5025 split_rec = *orig_split_rec; 5026 5027 ocfs2_reinit_path(path, 1); 5028 5029 cpos = le32_to_cpu(split_rec.e_cpos); 5030 ret = ocfs2_find_path(et->et_ci, path, cpos); 5031 if (ret) { 5032 mlog_errno(ret); 5033 goto out; 5034 } 5035 5036 el = path_leaf_el(path); 5037 split_index = ocfs2_search_extent_list(el, cpos); 5038 goto leftright; 5039 } 5040 out: 5041 5042 return ret; 5043 } 5044 5045 static int ocfs2_replace_extent_rec(handle_t *handle, 5046 struct ocfs2_extent_tree *et, 5047 struct ocfs2_path *path, 5048 struct ocfs2_extent_list *el, 5049 int split_index, 5050 struct ocfs2_extent_rec *split_rec) 5051 { 5052 int ret; 5053 5054 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, 5055 path_num_items(path) - 1); 5056 if (ret) { 5057 mlog_errno(ret); 5058 goto out; 5059 } 5060 5061 el->l_recs[split_index] = *split_rec; 5062 5063 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5064 out: 5065 return ret; 5066 } 5067 5068 /* 5069 * Split part or all of the extent record at split_index in the leaf 5070 * pointed to by path. Merge with the contiguous extent record if needed. 5071 * 5072 * Care is taken to handle contiguousness so as to not grow the tree. 5073 * 5074 * meta_ac is not strictly necessary - we only truly need it if growth 5075 * of the tree is required. All other cases will degrade into a less 5076 * optimal tree layout. 5077 * 5078 * last_eb_bh should be the rightmost leaf block for any extent 5079 * btree. Since a split may grow the tree or a merge might shrink it, 5080 * the caller cannot trust the contents of that buffer after this call. 5081 * 5082 * This code is optimized for readability - several passes might be 5083 * made over certain portions of the tree. 
All of those blocks will 5084 * have been brought into cache (and pinned via the journal), so the 5085 * extra overhead is not expressed in terms of disk reads. 5086 */ 5087 int ocfs2_split_extent(handle_t *handle, 5088 struct ocfs2_extent_tree *et, 5089 struct ocfs2_path *path, 5090 int split_index, 5091 struct ocfs2_extent_rec *split_rec, 5092 struct ocfs2_alloc_context *meta_ac, 5093 struct ocfs2_cached_dealloc_ctxt *dealloc) 5094 { 5095 int ret = 0; 5096 struct ocfs2_extent_list *el = path_leaf_el(path); 5097 struct buffer_head *last_eb_bh = NULL; 5098 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 5099 struct ocfs2_merge_ctxt ctxt; 5100 struct ocfs2_extent_list *rightmost_el; 5101 5102 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 5103 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 5104 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 5105 ret = -EIO; 5106 mlog_errno(ret); 5107 goto out; 5108 } 5109 5110 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, 5111 split_index, 5112 split_rec); 5113 5114 /* 5115 * The core merge / split code wants to know how much room is 5116 * left in this allocation tree, so we pass the 5117 * rightmost extent list. 5118 */ 5119 if (path->p_tree_depth) { 5120 struct ocfs2_extent_block *eb; 5121 5122 ret = ocfs2_read_extent_block(et->et_ci, 5123 ocfs2_et_get_last_eb_blk(et), 5124 &last_eb_bh); 5125 if (ret) { 5126 mlog_errno(ret); 5127 goto out; 5128 } 5129 5130 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 5131 rightmost_el = &eb->h_list; 5132 } else 5133 rightmost_el = path_root_el(path); 5134 5135 if (rec->e_cpos == split_rec->e_cpos && 5136 rec->e_leaf_clusters == split_rec->e_leaf_clusters) 5137 ctxt.c_split_covers_rec = 1; 5138 else 5139 ctxt.c_split_covers_rec = 0; 5140 5141 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 5142 5143 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", 5144 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, 5145 ctxt.c_split_covers_rec); 5146 5147 if (ctxt.c_contig_type == CONTIG_NONE) { 5148 if (ctxt.c_split_covers_rec) 5149 ret = ocfs2_replace_extent_rec(handle, et, path, el, 5150 split_index, split_rec); 5151 else 5152 ret = ocfs2_split_and_insert(handle, et, path, 5153 &last_eb_bh, split_index, 5154 split_rec, meta_ac); 5155 if (ret) 5156 mlog_errno(ret); 5157 } else { 5158 ret = ocfs2_try_to_merge_extent(handle, et, path, 5159 split_index, split_rec, 5160 dealloc, &ctxt); 5161 if (ret) 5162 mlog_errno(ret); 5163 } 5164 5165 out: 5166 brelse(last_eb_bh); 5167 return ret; 5168 } 5169 5170 /* 5171 * Change the flags of the already-existing extent at cpos for len clusters. 5172 * 5173 * new_flags: the flags we want to set. 5174 * clear_flags: the flags we want to clear. 5175 * phys: the new physical offset we want this new extent to start at. 5176 * 5177 * If the existing extent is larger than the request, initiate a 5178 * split. An attempt will be made at merging with adjacent extents. 5179 * 5180 * The caller is responsible for passing down meta_ac if we'll need it.
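 *
 * For a concrete caller, see ocfs2_mark_extent_written() below,
 * which clears the unwritten bit via:
 *
 *	ret = ocfs2_change_extent_flag(handle, et, cpos, len, phys,
 *				       meta_ac, dealloc, 0,
 *				       OCFS2_EXT_UNWRITTEN);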
5181 */ 5182 int ocfs2_change_extent_flag(handle_t *handle, 5183 struct ocfs2_extent_tree *et, 5184 u32 cpos, u32 len, u32 phys, 5185 struct ocfs2_alloc_context *meta_ac, 5186 struct ocfs2_cached_dealloc_ctxt *dealloc, 5187 int new_flags, int clear_flags) 5188 { 5189 int ret, index; 5190 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 5191 u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); 5192 struct ocfs2_extent_rec split_rec; 5193 struct ocfs2_path *left_path = NULL; 5194 struct ocfs2_extent_list *el; 5195 struct ocfs2_extent_rec *rec; 5196 5197 left_path = ocfs2_new_path_from_et(et); 5198 if (!left_path) { 5199 ret = -ENOMEM; 5200 mlog_errno(ret); 5201 goto out; 5202 } 5203 5204 ret = ocfs2_find_path(et->et_ci, left_path, cpos); 5205 if (ret) { 5206 mlog_errno(ret); 5207 goto out; 5208 } 5209 el = path_leaf_el(left_path); 5210 5211 index = ocfs2_search_extent_list(el, cpos); 5212 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5213 ocfs2_error(sb, 5214 "Owner %llu has an extent at cpos %u which can no " 5215 "longer be found.\n", 5216 (unsigned long long) 5217 ocfs2_metadata_cache_owner(et->et_ci), cpos); 5218 ret = -EROFS; 5219 goto out; 5220 } 5221 5222 ret = -EIO; 5223 rec = &el->l_recs[index]; 5224 if (new_flags && (rec->e_flags & new_flags)) { 5225 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " 5226 "extent that already had them", 5227 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5228 new_flags); 5229 goto out; 5230 } 5231 5232 if (clear_flags && !(rec->e_flags & clear_flags)) { 5233 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " 5234 "extent that didn't have them", 5235 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5236 clear_flags); 5237 goto out; 5238 } 5239 5240 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 5241 split_rec.e_cpos = cpu_to_le32(cpos); 5242 split_rec.e_leaf_clusters = cpu_to_le16(len); 5243 split_rec.e_blkno = cpu_to_le64(start_blkno); 5244 split_rec.e_flags = rec->e_flags; 5245 if (new_flags) 5246 split_rec.e_flags |= new_flags; 5247 if (clear_flags) 5248 split_rec.e_flags &= ~clear_flags; 5249 5250 ret = ocfs2_split_extent(handle, et, left_path, 5251 index, &split_rec, meta_ac, 5252 dealloc); 5253 if (ret) 5254 mlog_errno(ret); 5255 5256 out: 5257 ocfs2_free_path(left_path); 5258 return ret; 5259 5260 } 5261 5262 /* 5263 * Mark the already-existing extent at cpos as written for len clusters. 5264 * This removes the unwritten extent flag. 5265 * 5266 * If the existing extent is larger than the request, initiate a 5267 * split. An attempt will be made at merging with adjacent extents. 5268 * 5269 * The caller is responsible for passing down meta_ac if we'll need it. 5270 */ 5271 int ocfs2_mark_extent_written(struct inode *inode, 5272 struct ocfs2_extent_tree *et, 5273 handle_t *handle, u32 cpos, u32 len, u32 phys, 5274 struct ocfs2_alloc_context *meta_ac, 5275 struct ocfs2_cached_dealloc_ctxt *dealloc) 5276 { 5277 int ret; 5278 5279 mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", 5280 inode->i_ino, cpos, len, phys); 5281 5282 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 5283 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 5284 "that are being written to, but the feature bit " 5285 "is not set in the super block.", 5286 (unsigned long long)OCFS2_I(inode)->ip_blkno); 5287 ret = -EROFS; 5288 goto out; 5289 } 5290 5291 /* 5292 * XXX: This should be fixed up so that we just re-insert the 5293 * next extent records. 
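 *
 * (Truncating to 0 below throws away the entire cached extent
 * map; that is correct, since later lookups simply re-read from
 * the tree, but heavier than invalidating just the range at and
 * past cpos.)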
5294 */ 5295 ocfs2_et_extent_map_truncate(et, 0); 5296 5297 ret = ocfs2_change_extent_flag(handle, et, cpos, 5298 len, phys, meta_ac, dealloc, 5299 0, OCFS2_EXT_UNWRITTEN); 5300 if (ret) 5301 mlog_errno(ret); 5302 5303 out: 5304 return ret; 5305 } 5306 5307 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, 5308 struct ocfs2_path *path, 5309 int index, u32 new_range, 5310 struct ocfs2_alloc_context *meta_ac) 5311 { 5312 int ret, depth, credits = handle->h_buffer_credits; 5313 struct buffer_head *last_eb_bh = NULL; 5314 struct ocfs2_extent_block *eb; 5315 struct ocfs2_extent_list *rightmost_el, *el; 5316 struct ocfs2_extent_rec split_rec; 5317 struct ocfs2_extent_rec *rec; 5318 struct ocfs2_insert_type insert; 5319 5320 /* 5321 * Setup the record to split before we grow the tree. 5322 */ 5323 el = path_leaf_el(path); 5324 rec = &el->l_recs[index]; 5325 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), 5326 &split_rec, new_range, rec); 5327 5328 depth = path->p_tree_depth; 5329 if (depth > 0) { 5330 ret = ocfs2_read_extent_block(et->et_ci, 5331 ocfs2_et_get_last_eb_blk(et), 5332 &last_eb_bh); 5333 if (ret < 0) { 5334 mlog_errno(ret); 5335 goto out; 5336 } 5337 5338 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 5339 rightmost_el = &eb->h_list; 5340 } else 5341 rightmost_el = path_leaf_el(path); 5342 5343 credits += path->p_tree_depth + 5344 ocfs2_extend_meta_needed(et->et_root_el); 5345 ret = ocfs2_extend_trans(handle, credits); 5346 if (ret) { 5347 mlog_errno(ret); 5348 goto out; 5349 } 5350 5351 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 5352 le16_to_cpu(rightmost_el->l_count)) { 5353 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, 5354 meta_ac); 5355 if (ret) { 5356 mlog_errno(ret); 5357 goto out; 5358 } 5359 } 5360 5361 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 5362 insert.ins_appending = APPEND_NONE; 5363 insert.ins_contig = CONTIG_NONE; 5364 insert.ins_split = SPLIT_RIGHT; 5365 insert.ins_tree_depth = depth; 5366 5367 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); 5368 if (ret) 5369 mlog_errno(ret); 5370 5371 out: 5372 brelse(last_eb_bh); 5373 return ret; 5374 } 5375 5376 static int ocfs2_truncate_rec(handle_t *handle, 5377 struct ocfs2_extent_tree *et, 5378 struct ocfs2_path *path, int index, 5379 struct ocfs2_cached_dealloc_ctxt *dealloc, 5380 u32 cpos, u32 len) 5381 { 5382 int ret; 5383 u32 left_cpos, rec_range, trunc_range; 5384 int wants_rotate = 0, is_rightmost_tree_rec = 0; 5385 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 5386 struct ocfs2_path *left_path = NULL; 5387 struct ocfs2_extent_list *el = path_leaf_el(path); 5388 struct ocfs2_extent_rec *rec; 5389 struct ocfs2_extent_block *eb; 5390 5391 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5392 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5393 if (ret) { 5394 mlog_errno(ret); 5395 goto out; 5396 } 5397 5398 index--; 5399 } 5400 5401 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && 5402 path->p_tree_depth) { 5403 /* 5404 * Check whether this is the rightmost tree record. If 5405 * we remove all of this record or part of its right 5406 * edge then an update of the record lengths above it 5407 * will be required. 
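 *
 * (The rightmost leaf is the one whose extent block has
 * h_next_leaf_blk == 0, which is what the check below tests;
 * only then do the interior e_int_clusters values need to be
 * shrunk after the truncate.)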
5408 */ 5409 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 5410 if (eb->h_next_leaf_blk == 0) 5411 is_rightmost_tree_rec = 1; 5412 } 5413 5414 rec = &el->l_recs[index]; 5415 if (index == 0 && path->p_tree_depth && 5416 le32_to_cpu(rec->e_cpos) == cpos) { 5417 /* 5418 * Changing the leftmost offset (via partial or whole 5419 * record truncate) of an interior (or rightmost) path 5420 * means we have to update the subtree that is formed 5421 * by this leaf and the one to its left. 5422 * 5423 * There are two cases we can skip: 5424 * 1) Path is the leftmost one in our btree. 5425 * 2) The leaf is rightmost and will be empty after 5426 * we remove the extent record - the rotate code 5427 * knows how to update the newly formed edge. 5428 */ 5429 5430 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); 5431 if (ret) { 5432 mlog_errno(ret); 5433 goto out; 5434 } 5435 5436 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5437 left_path = ocfs2_new_path_from_path(path); 5438 if (!left_path) { 5439 ret = -ENOMEM; 5440 mlog_errno(ret); 5441 goto out; 5442 } 5443 5444 ret = ocfs2_find_path(et->et_ci, left_path, 5445 left_cpos); 5446 if (ret) { 5447 mlog_errno(ret); 5448 goto out; 5449 } 5450 } 5451 } 5452 5453 ret = ocfs2_extend_rotate_transaction(handle, 0, 5454 handle->h_buffer_credits, 5455 path); 5456 if (ret) { 5457 mlog_errno(ret); 5458 goto out; 5459 } 5460 5461 ret = ocfs2_journal_access_path(et->et_ci, handle, path); 5462 if (ret) { 5463 mlog_errno(ret); 5464 goto out; 5465 } 5466 5467 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); 5468 if (ret) { 5469 mlog_errno(ret); 5470 goto out; 5471 } 5472 5473 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5474 trunc_range = cpos + len; 5475 5476 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { 5477 int next_free; 5478 5479 memset(rec, 0, sizeof(*rec)); 5480 ocfs2_cleanup_merge(el, index); 5481 wants_rotate = 1; 5482 5483 next_free = le16_to_cpu(el->l_next_free_rec); 5484 if (is_rightmost_tree_rec && next_free > 1) { 5485 /* 5486 * We skip the edge update if this path will 5487 * be deleted by the rotate code. 5488 */ 5489 rec = &el->l_recs[next_free - 1]; 5490 ocfs2_adjust_rightmost_records(handle, et, path, 5491 rec); 5492 } 5493 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5494 /* Remove leftmost portion of the record. */ 5495 le32_add_cpu(&rec->e_cpos, len); 5496 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); 5497 le16_add_cpu(&rec->e_leaf_clusters, -len); 5498 } else if (rec_range == trunc_range) { 5499 /* Remove rightmost portion of the record */ 5500 le16_add_cpu(&rec->e_leaf_clusters, -len); 5501 if (is_rightmost_tree_rec) 5502 ocfs2_adjust_rightmost_records(handle, et, path, rec); 5503 } else { 5504 /* Caller should have trapped this.
*/ 5505 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " 5506 "(%u, %u)\n", 5507 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5508 le32_to_cpu(rec->e_cpos), 5509 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5510 BUG(); 5511 } 5512 5513 if (left_path) { 5514 int subtree_index; 5515 5516 subtree_index = ocfs2_find_subtree_root(et, left_path, path); 5517 ocfs2_complete_edge_insert(handle, left_path, path, 5518 subtree_index); 5519 } 5520 5521 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5522 5523 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); 5524 if (ret) { 5525 mlog_errno(ret); 5526 goto out; 5527 } 5528 5529 out: 5530 ocfs2_free_path(left_path); 5531 return ret; 5532 } 5533 5534 int ocfs2_remove_extent(handle_t *handle, 5535 struct ocfs2_extent_tree *et, 5536 u32 cpos, u32 len, 5537 struct ocfs2_alloc_context *meta_ac, 5538 struct ocfs2_cached_dealloc_ctxt *dealloc) 5539 { 5540 int ret, index; 5541 u32 rec_range, trunc_range; 5542 struct ocfs2_extent_rec *rec; 5543 struct ocfs2_extent_list *el; 5544 struct ocfs2_path *path = NULL; 5545 5546 /* 5547 * XXX: Why are we truncating to 0 instead of wherever this 5548 * affects us? 5549 */ 5550 ocfs2_et_extent_map_truncate(et, 0); 5551 5552 path = ocfs2_new_path_from_et(et); 5553 if (!path) { 5554 ret = -ENOMEM; 5555 mlog_errno(ret); 5556 goto out; 5557 } 5558 5559 ret = ocfs2_find_path(et->et_ci, path, cpos); 5560 if (ret) { 5561 mlog_errno(ret); 5562 goto out; 5563 } 5564 5565 el = path_leaf_el(path); 5566 index = ocfs2_search_extent_list(el, cpos); 5567 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5568 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5569 "Owner %llu has an extent at cpos %u which can no " 5570 "longer be found.\n", 5571 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5572 cpos); 5573 ret = -EROFS; 5574 goto out; 5575 } 5576 5577 /* 5578 * We have 3 cases of extent removal: 5579 * 1) Range covers the entire extent rec 5580 * 2) Range begins or ends on one edge of the extent rec 5581 * 3) Range is in the middle of the extent rec (no shared edges) 5582 * 5583 * For case 1 we remove the extent rec and left rotate to 5584 * fill the hole. 5585 * 5586 * For case 2 we just shrink the existing extent rec, with a 5587 * tree update if the shrinking edge is also the edge of an 5588 * extent block. 5589 * 5590 * For case 3 we do a right split to turn the extent rec into 5591 * something case 2 can handle. 5592 */ 5593 rec = &el->l_recs[index]; 5594 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5595 trunc_range = cpos + len; 5596 5597 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5598 5599 mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d " 5600 "(cpos %u, len %u)\n", 5601 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5602 cpos, len, index, 5603 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5604 5605 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5606 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, 5607 cpos, len); 5608 if (ret) { 5609 mlog_errno(ret); 5610 goto out; 5611 } 5612 } else { 5613 ret = ocfs2_split_tree(handle, et, path, index, 5614 trunc_range, meta_ac); 5615 if (ret) { 5616 mlog_errno(ret); 5617 goto out; 5618 } 5619 5620 /* 5621 * The split could have manipulated the tree enough to 5622 * move the record location, so we have to look for it again. 
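 *
 * Worked example (hypothetical record): removing (cpos 10, len 4)
 * from a record covering (0, 20) split off (14, 6) above, so the
 * re-search below should find (0, 14); rec_range == trunc_range
 * then holds and ocfs2_truncate_rec() shrinks it to (0, 10).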
5623 */ 5624 ocfs2_reinit_path(path, 1); 5625 5626 ret = ocfs2_find_path(et->et_ci, path, cpos); 5627 if (ret) { 5628 mlog_errno(ret); 5629 goto out; 5630 } 5631 5632 el = path_leaf_el(path); 5633 index = ocfs2_search_extent_list(el, cpos); 5634 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5635 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5636 "Owner %llu: split at cpos %u lost record.", 5637 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5638 cpos); 5639 ret = -EROFS; 5640 goto out; 5641 } 5642 5643 /* 5644 * Double check our values here. If anything is fishy, 5645 * it's easier to catch it at the top level. 5646 */ 5647 rec = &el->l_recs[index]; 5648 rec_range = le32_to_cpu(rec->e_cpos) + 5649 ocfs2_rec_clusters(el, rec); 5650 if (rec_range != trunc_range) { 5651 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5652 "Owner %llu: error after split at cpos %u, " 5653 "trunc len %u, existing record is (%u,%u)", 5654 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5655 cpos, len, le32_to_cpu(rec->e_cpos), 5656 ocfs2_rec_clusters(el, rec)); 5657 ret = -EROFS; 5658 goto out; 5659 } 5660 5661 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, 5662 cpos, len); 5663 if (ret) { 5664 mlog_errno(ret); 5665 goto out; 5666 } 5667 } 5668 5669 out: 5670 ocfs2_free_path(path); 5671 return ret; 5672 } 5673 5674 int ocfs2_remove_btree_range(struct inode *inode, 5675 struct ocfs2_extent_tree *et, 5676 u32 cpos, u32 phys_cpos, u32 len, 5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5678 { 5679 int ret; 5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5682 struct inode *tl_inode = osb->osb_tl_inode; 5683 handle_t *handle; 5684 struct ocfs2_alloc_context *meta_ac = NULL; 5685 5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5687 if (ret) { 5688 mlog_errno(ret); 5689 return ret; 5690 } 5691 5692 mutex_lock(&tl_inode->i_mutex); 5693 5694 if (ocfs2_truncate_log_needs_flush(osb)) { 5695 ret = __ocfs2_flush_truncate_log(osb); 5696 if (ret < 0) { 5697 mlog_errno(ret); 5698 goto out; 5699 } 5700 } 5701 5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5703 if (IS_ERR(handle)) { 5704 ret = PTR_ERR(handle); 5705 mlog_errno(ret); 5706 goto out; 5707 } 5708 5709 ret = ocfs2_et_root_journal_access(handle, et, 5710 OCFS2_JOURNAL_ACCESS_WRITE); 5711 if (ret) { 5712 mlog_errno(ret); 5713 goto out; 5714 } 5715 5716 dquot_free_space_nodirty(inode, 5717 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5718 5719 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); 5720 if (ret) { 5721 mlog_errno(ret); 5722 goto out_commit; 5723 } 5724 5725 ocfs2_et_update_clusters(et, -len); 5726 5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5728 if (ret) { 5729 mlog_errno(ret); 5730 goto out_commit; 5731 } 5732 5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5734 if (ret) 5735 mlog_errno(ret); 5736 5737 out_commit: 5738 ocfs2_commit_trans(osb, handle); 5739 out: 5740 mutex_unlock(&tl_inode->i_mutex); 5741 5742 if (meta_ac) 5743 ocfs2_free_alloc_context(meta_ac); 5744 5745 return ret; 5746 } 5747 5748 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5749 { 5750 struct buffer_head *tl_bh = osb->osb_tl_bh; 5751 struct ocfs2_dinode *di; 5752 struct ocfs2_truncate_log *tl; 5753 5754 di = (struct ocfs2_dinode *) tl_bh->b_data; 5755 tl = &di->id2.i_dealloc; 5756 5757 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) >
le16_to_cpu(tl->tl_count), 5758 "slot %d, invalid truncate log parameters: used = " 5759 "%u, count = %u\n", osb->slot_num, 5760 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 5761 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 5762 } 5763 5764 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 5765 unsigned int new_start) 5766 { 5767 unsigned int tail_index; 5768 unsigned int current_tail; 5769 5770 /* No records, nothing to coalesce */ 5771 if (!le16_to_cpu(tl->tl_used)) 5772 return 0; 5773 5774 tail_index = le16_to_cpu(tl->tl_used) - 1; 5775 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 5776 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 5777 5778 return current_tail == new_start; 5779 } 5780 5781 int ocfs2_truncate_log_append(struct ocfs2_super *osb, 5782 handle_t *handle, 5783 u64 start_blk, 5784 unsigned int num_clusters) 5785 { 5786 int status, index; 5787 unsigned int start_cluster, tl_count; 5788 struct inode *tl_inode = osb->osb_tl_inode; 5789 struct buffer_head *tl_bh = osb->osb_tl_bh; 5790 struct ocfs2_dinode *di; 5791 struct ocfs2_truncate_log *tl; 5792 5793 mlog_entry("start_blk = %llu, num_clusters = %u\n", 5794 (unsigned long long)start_blk, num_clusters); 5795 5796 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5797 5798 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5799 5800 di = (struct ocfs2_dinode *) tl_bh->b_data; 5801 5802 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated 5803 * by the underlying call to ocfs2_read_inode_block(), so any 5804 * corruption is a code bug */ 5805 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 5806 5807 tl = &di->id2.i_dealloc; 5808 tl_count = le16_to_cpu(tl->tl_count); 5809 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5810 tl_count == 0, 5811 "Truncate record count on #%llu invalid: " 5812 "wanted %u, actual %u\n", 5813 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 5814 ocfs2_truncate_recs_per_inode(osb->sb), 5815 le16_to_cpu(tl->tl_count)); 5816 5817 /* Caller should have known to flush before calling us. */ 5818 index = le16_to_cpu(tl->tl_used); 5819 if (index >= tl_count) { 5820 status = -ENOSPC; 5821 mlog_errno(status); 5822 goto bail; 5823 } 5824 5825 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, 5826 OCFS2_JOURNAL_ACCESS_WRITE); 5827 if (status < 0) { 5828 mlog_errno(status); 5829 goto bail; 5830 } 5831 5832 mlog(0, "Log truncate of %u clusters starting at cluster %u to " 5833 "%llu (index = %d)\n", num_clusters, start_cluster, 5834 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 5835 5836 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 5837 /* 5838 * Move index back to the record we are coalescing with.
5839 * ocfs2_truncate_log_can_coalesce() guarantees nonzero 5840 */ 5841 index--; 5842 5843 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 5844 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 5845 index, le32_to_cpu(tl->tl_recs[index].t_start), 5846 num_clusters); 5847 } else { 5848 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 5849 tl->tl_used = cpu_to_le16(index + 1); 5850 } 5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5852 5853 status = ocfs2_journal_dirty(handle, tl_bh); 5854 if (status < 0) { 5855 mlog_errno(status); 5856 goto bail; 5857 } 5858 5859 bail: 5860 mlog_exit(status); 5861 return status; 5862 } 5863 5864 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 5865 handle_t *handle, 5866 struct inode *data_alloc_inode, 5867 struct buffer_head *data_alloc_bh) 5868 { 5869 int status = 0; 5870 int i; 5871 unsigned int num_clusters; 5872 u64 start_blk; 5873 struct ocfs2_truncate_rec rec; 5874 struct ocfs2_dinode *di; 5875 struct ocfs2_truncate_log *tl; 5876 struct inode *tl_inode = osb->osb_tl_inode; 5877 struct buffer_head *tl_bh = osb->osb_tl_bh; 5878 5879 mlog_entry_void(); 5880 5881 di = (struct ocfs2_dinode *) tl_bh->b_data; 5882 tl = &di->id2.i_dealloc; 5883 i = le16_to_cpu(tl->tl_used) - 1; 5884 while (i >= 0) { 5885 /* Caller has given us at least enough credits to 5886 * update the truncate log dinode */ 5887 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, 5888 OCFS2_JOURNAL_ACCESS_WRITE); 5889 if (status < 0) { 5890 mlog_errno(status); 5891 goto bail; 5892 } 5893 5894 tl->tl_used = cpu_to_le16(i); 5895 5896 status = ocfs2_journal_dirty(handle, tl_bh); 5897 if (status < 0) { 5898 mlog_errno(status); 5899 goto bail; 5900 } 5901 5902 /* TODO: Perhaps we can calculate the bulk of the 5903 * credits up front rather than extending like 5904 * this. */ 5905 status = ocfs2_extend_trans(handle, 5906 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 5907 if (status < 0) { 5908 mlog_errno(status); 5909 goto bail; 5910 } 5911 5912 rec = tl->tl_recs[i]; 5913 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5914 le32_to_cpu(rec.t_start)); 5915 num_clusters = le32_to_cpu(rec.t_clusters); 5916 5917 /* if start_blk is not set, we ignore the record as 5918 * invalid. */ 5919 if (start_blk) { 5920 mlog(0, "free record %d, start = %u, clusters = %u\n", 5921 i, le32_to_cpu(rec.t_start), num_clusters); 5922 5923 status = ocfs2_free_clusters(handle, data_alloc_inode, 5924 data_alloc_bh, start_blk, 5925 num_clusters); 5926 if (status < 0) { 5927 mlog_errno(status); 5928 goto bail; 5929 } 5930 } 5931 i--; 5932 } 5933 5934 bail: 5935 mlog_exit(status); 5936 return status; 5937 } 5938 5939 /* Expects you to already be holding tl_inode->i_mutex */ 5940 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5941 { 5942 int status; 5943 unsigned int num_to_flush; 5944 handle_t *handle; 5945 struct inode *tl_inode = osb->osb_tl_inode; 5946 struct inode *data_alloc_inode = NULL; 5947 struct buffer_head *tl_bh = osb->osb_tl_bh; 5948 struct buffer_head *data_alloc_bh = NULL; 5949 struct ocfs2_dinode *di; 5950 struct ocfs2_truncate_log *tl; 5951 5952 mlog_entry_void(); 5953 5954 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5955 5956 di = (struct ocfs2_dinode *) tl_bh->b_data; 5957 5958 /* tl_bh is loaded from ocfs2_truncate_log_init(). 
It's validated 5959 * by the underlying call to ocfs2_read_inode_block(), so any 5960 * corruption is a code bug */ 5961 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 5962 5963 tl = &di->id2.i_dealloc; 5964 num_to_flush = le16_to_cpu(tl->tl_used); 5965 mlog(0, "Flush %u records from truncate log #%llu\n", 5966 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5967 if (!num_to_flush) { 5968 status = 0; 5969 goto out; 5970 } 5971 5972 data_alloc_inode = ocfs2_get_system_file_inode(osb, 5973 GLOBAL_BITMAP_SYSTEM_INODE, 5974 OCFS2_INVALID_SLOT); 5975 if (!data_alloc_inode) { 5976 status = -EINVAL; 5977 mlog(ML_ERROR, "Could not get bitmap inode!\n"); 5978 goto out; 5979 } 5980 5981 mutex_lock(&data_alloc_inode->i_mutex); 5982 5983 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); 5984 if (status < 0) { 5985 mlog_errno(status); 5986 goto out_mutex; 5987 } 5988 5989 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5990 if (IS_ERR(handle)) { 5991 status = PTR_ERR(handle); 5992 mlog_errno(status); 5993 goto out_unlock; 5994 } 5995 5996 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 5997 data_alloc_bh); 5998 if (status < 0) 5999 mlog_errno(status); 6000 6001 ocfs2_commit_trans(osb, handle); 6002 6003 out_unlock: 6004 brelse(data_alloc_bh); 6005 ocfs2_inode_unlock(data_alloc_inode, 1); 6006 6007 out_mutex: 6008 mutex_unlock(&data_alloc_inode->i_mutex); 6009 iput(data_alloc_inode); 6010 6011 out: 6012 mlog_exit(status); 6013 return status; 6014 } 6015 6016 int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 6017 { 6018 int status; 6019 struct inode *tl_inode = osb->osb_tl_inode; 6020 6021 mutex_lock(&tl_inode->i_mutex); 6022 status = __ocfs2_flush_truncate_log(osb); 6023 mutex_unlock(&tl_inode->i_mutex); 6024 6025 return status; 6026 } 6027 6028 static void ocfs2_truncate_log_worker(struct work_struct *work) 6029 { 6030 int status; 6031 struct ocfs2_super *osb = 6032 container_of(work, struct ocfs2_super, 6033 osb_truncate_log_wq.work); 6034 6035 mlog_entry_void(); 6036 6037 status = ocfs2_flush_truncate_log(osb); 6038 if (status < 0) 6039 mlog_errno(status); 6040 else 6041 ocfs2_init_steal_slots(osb); 6042 6043 mlog_exit(status); 6044 } 6045 6046 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 6047 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 6048 int cancel) 6049 { 6050 if (osb->osb_tl_inode) { 6051 /* We want to push off log flushes while truncates are 6052 * still running. */ 6053 if (cancel) 6054 cancel_delayed_work(&osb->osb_truncate_log_wq); 6055 6056 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 6057 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 6058 } 6059 } 6060 6061 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 6062 int slot_num, 6063 struct inode **tl_inode, 6064 struct buffer_head **tl_bh) 6065 { 6066 int status; 6067 struct inode *inode = NULL; 6068 struct buffer_head *bh = NULL; 6069 6070 inode = ocfs2_get_system_file_inode(osb, 6071 TRUNCATE_LOG_SYSTEM_INODE, 6072 slot_num); 6073 if (!inode) { 6074 status = -EINVAL; 6075 mlog(ML_ERROR, "Could not load truncate log inode!\n"); 6076 goto bail; 6077 } 6078 6079 status = ocfs2_read_inode_block(inode, &bh); 6080 if (status < 0) { 6081 iput(inode); 6082 mlog_errno(status); 6083 goto bail; 6084 } 6085 6086 *tl_inode = inode; 6087 *tl_bh = bh; 6088 bail: 6089 mlog_exit(status); 6090 return status; 6091 } 6092 6093 /* called during the 1st stage of node recovery. we stamp a clean 6094 * truncate log and pass back a copy for processing later.
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	mlog(0, "recover truncate log from slot %d\n", slot_num);

	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
	 * validated by the underlying call to ocfs2_read_inode_block(),
	 * so any corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	if (le16_to_cpu(tl->tl_used)) {
		mlog(0, "We'll have %u logs to recover\n",
		     le16_to_cpu(tl->tl_used));

		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Assuming the write-out below goes well, this copy
		 * will be passed back to recovery for processing. */
		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (tl_inode)
		iput(tl_inode);
	brelse(tl_bh);

	if (status < 0 && (*tl_copy)) {
		kfree(*tl_copy);
		*tl_copy = NULL;
	}

	mlog_exit(status);
	return status;
}

int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));

	mutex_lock(&tl_inode->i_mutex);
	for(i = 0; i < num_recs; i++) {
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}
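
/*
 * Sketch of how the two recovery stages above fit together, assuming
 * the usual recovery driver (illustrative only; the real caller lives
 * in the recovery code in journal.c, and error handling is omitted):
 *
 *	struct ocfs2_dinode *tl_copy = NULL;
 *
 *	status = ocfs2_begin_truncate_log_recovery(osb, slot_num,
 *						   &tl_copy);
 *	...
 *	if (tl_copy) {
 *		status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		kfree(tl_copy);
 *	}
 */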

void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = osb->osb_tl_inode;

	mlog_entry_void();

	if (tl_inode) {
		cancel_delayed_work(&osb->osb_truncate_log_wq);
		flush_workqueue(ocfs2_wq);

		status = ocfs2_flush_truncate_log(osb);
		if (status < 0)
			mlog_errno(status);

		brelse(osb->osb_tl_bh);
		iput(osb->osb_tl_inode);
	}

	mlog_exit_void();
}

int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	mlog_entry_void();

	status = ocfs2_get_truncate_log_info(osb,
					     osb->slot_num,
					     &tl_inode,
					     &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	osb->osb_tl_bh = tl_bh;
	osb->osb_tl_inode = tl_inode;

	mlog_exit(status);
	return status;
}
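
/*
 * Lifecycle sketch (illustrative, under the assumption that the mount
 * and unmount paths behave as in the rest of the tree):
 * ocfs2_truncate_log_init() runs at mount time and only publishes
 * osb->osb_tl_inode once the log dinode is readable;
 * ocfs2_schedule_truncate_log_flush() arms the delayed worker as
 * records accumulate; ocfs2_truncate_log_shutdown() cancels the
 * worker and performs a final synchronous flush at unmount.
 */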

/*
 * Delayed de-allocation of suballocator blocks.
 *
 * Some sets of block de-allocations might involve multiple suballocator inodes.
 *
 * The locking for this can get extremely complicated, especially when
 * the suballocator inodes to delete from aren't known until deep
 * within an unrelated codepath.
 *
 * ocfs2_extent_block structures are a good example of this - an inode
 * btree could have been grown by any number of nodes each allocating
 * out of their own suballoc inode.
 *
 * These structures allow the delay of block de-allocation until a
 * later time, when locking of multiple cluster inodes won't cause
 * deadlock.
 */

/*
 * Describe a single bit freed from a suballocator.  For the block
 * suballocators, it represents one block.  For the global cluster
 * allocator, it represents some clusters and free_bit indicates the
 * number of clusters.
 */
struct ocfs2_cached_block_free {
	struct ocfs2_cached_block_free *free_next;
	u64 free_blk;
	unsigned int free_bit;
};

struct ocfs2_per_slot_free_list {
	struct ocfs2_per_slot_free_list *f_next_suballocator;
	int f_inode_type;
	int f_slot;
	struct ocfs2_cached_block_free *f_first;
};
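
/*
 * Typical use of the dealloc context, as a condensed sketch (the
 * names are real functions from this file and alloc.h; the flow
 * mirrors callers elsewhere in the tree, with error handling
 * omitted):
 *
 *	struct ocfs2_cached_dealloc_ctxt dealloc;
 *
 *	ocfs2_init_dealloc_ctxt(&dealloc);
 *
 *	... while holding whatever cluster locks the codepath needs,
 *	    queue frees instead of performing them ...
 *	ret = ocfs2_cache_block_dealloc(&dealloc, EXTENT_ALLOC_SYSTEM_INODE,
 *					slot, blkno, bit);
 *
 *	... once those locks have been dropped ...
 *	ret = ocfs2_run_deallocs(osb, &dealloc);
 */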

static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
				    int sysfile_type,
				    int slot,
				    struct ocfs2_cached_block_free *head)
{
	int ret;
	u64 bg_blkno;
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_cached_block_free *tmp;

	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&inode->i_mutex);

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	while (head) {
		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
						      head->free_bit);
		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
		     head->free_bit, (unsigned long long)head->free_blk);

		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
					       head->free_bit, bg_blkno, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

out_journal:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(inode, 1);
	brelse(di_bh);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	iput(inode);
out:
	while(head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}

int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
				u64 blkno, unsigned int bit)
{
	int ret = 0;
	struct ocfs2_cached_block_free *item;

	item = kmalloc(sizeof(*item), GFP_NOFS);
	if (item == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		return ret;
	}

	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
	     bit, (unsigned long long)blkno);

	item->free_blk = blkno;
	item->free_bit = bit;
	item->free_next = ctxt->c_global_allocator;

	ctxt->c_global_allocator = item;
	return ret;
}

static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
				      struct ocfs2_cached_block_free *head)
{
	struct ocfs2_cached_block_free *tmp;
	struct inode *tl_inode = osb->osb_tl_inode;
	handle_t *handle;
	int ret = 0;

	mutex_lock(&tl_inode->i_mutex);

	while (head) {
		if (ocfs2_truncate_log_needs_flush(osb)) {
			ret = __ocfs2_flush_truncate_log(osb);
			if (ret < 0) {
				mlog_errno(ret);
				break;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			mlog_errno(ret);
			break;
		}

		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
						head->free_bit);

		ocfs2_commit_trans(osb, handle);
		tmp = head;
		head = head->free_next;
		kfree(tmp);

		if (ret < 0) {
			mlog_errno(ret);
			break;
		}
	}

	mutex_unlock(&tl_inode->i_mutex);

	while (head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}

int ocfs2_run_deallocs(struct ocfs2_super *osb,
		       struct ocfs2_cached_dealloc_ctxt *ctxt)
{
	int ret = 0, ret2;
	struct ocfs2_per_slot_free_list *fl;

	if (!ctxt)
		return 0;

	while (ctxt->c_first_suballocator) {
		fl = ctxt->c_first_suballocator;

		if (fl->f_first) {
			mlog(0, "Free items: (type %u, slot %d)\n",
			     fl->f_inode_type, fl->f_slot);
			ret2 = ocfs2_free_cached_blocks(osb,
							fl->f_inode_type,
							fl->f_slot,
							fl->f_first);
			if (ret2)
				mlog_errno(ret2);
			if (!ret)
				ret = ret2;
		}

		ctxt->c_first_suballocator = fl->f_next_suballocator;
		kfree(fl);
	}

	if (ctxt->c_global_allocator) {
		ret2 = ocfs2_free_cached_clusters(osb,
						  ctxt->c_global_allocator);
		if (ret2)
			mlog_errno(ret2);
		if (!ret)
			ret = ret2;

		ctxt->c_global_allocator = NULL;
	}

	return ret;
}

static struct ocfs2_per_slot_free_list *
ocfs2_find_per_slot_free_list(int type,
			      int slot,
			      struct ocfs2_cached_dealloc_ctxt *ctxt)
{
	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;

	while (fl) {
		if (fl->f_inode_type == type && fl->f_slot == slot)
			return fl;

		fl = fl->f_next_suballocator;
	}

	fl = kmalloc(sizeof(*fl), GFP_NOFS);
	if (fl) {
		fl->f_inode_type = type;
		fl->f_slot = slot;
		fl->f_first = NULL;
		fl->f_next_suballocator = ctxt->c_first_suballocator;

		ctxt->c_first_suballocator = fl;
	}
	return fl;
}

int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
			      int type, int slot, u64 blkno,
			      unsigned int bit)
{
	int ret;
	struct ocfs2_per_slot_free_list *fl;
	struct ocfs2_cached_block_free *item;

	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
	if (fl == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	item = kmalloc(sizeof(*item), GFP_NOFS);
	if (item == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
	     type, slot, bit, (unsigned long long)blkno);

	item->free_blk = blkno;
	item->free_bit = bit;
	item->free_next = fl->f_first;

	fl->f_first = item;

	ret = 0;
out:
	return ret;
}

static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
					 struct ocfs2_extent_block *eb)
{
	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
					 le16_to_cpu(eb->h_suballoc_slot),
					 le64_to_cpu(eb->h_blkno),
					 le16_to_cpu(eb->h_suballoc_bit));
}

/* This function will figure out whether the currently last extent
 * block will be deleted, and if it will, what the new last extent
 * block will be so we can update its h_next_leaf_blk field, as well
 * as the dinode's i_last_eb_blk */
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
				       unsigned int clusters_to_del,
				       struct ocfs2_path *path,
				       struct buffer_head **new_last_eb)
{
	int next_free, ret = 0;
	u32 cpos;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;

	*new_last_eb = NULL;

	/* we have no tree, so of course, no last_eb. */
	if (!path->p_tree_depth)
		goto out;

	/* trunc to zero special case - this makes tree_depth = 0
	 * regardless of what it is. */
	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
		goto out;

	el = path_leaf_el(path);
	BUG_ON(!el->l_next_free_rec);

	/*
	 * Make sure that this extent list will actually be empty
	 * after we clear away the data. We can shortcut out if
	 * there's more than one non-empty extent in the
	 * list. Otherwise, a check of the remaining extent is
	 * necessary.
	 */
	next_free = le16_to_cpu(el->l_next_free_rec);
	rec = NULL;
	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		if (next_free > 2)
			goto out;

		/* We may have a valid extent in index 1, check it. */
		if (next_free == 2)
			rec = &el->l_recs[1];

		/*
		 * Fall through - no more nonempty extents, so we want
		 * to delete this leaf.
		 */
	} else {
		if (next_free > 1)
			goto out;

		rec = &el->l_recs[0];
	}

	if (rec) {
		/*
		 * Check if we'll only be trimming off the end of this
		 * cluster.
		 */
		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
			goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	eb = (struct ocfs2_extent_block *) bh->b_data;
	el = &eb->h_list;

	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
	 * Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));

	*new_last_eb = bh;
	get_bh(*new_last_eb);
	mlog(0, "returning block %llu, (cpos: %u)\n",
	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
out:
	brelse(bh);

	return ret;
}

/*
 * Trim some clusters off the rightmost edge of a tree. Only called
 * during truncate.
 *
 * The caller needs to:
 *   - start journaling of each path component.
 *   - compute and fully set up any new last ext block
 */
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
			   handle_t *handle, struct ocfs2_truncate_context *tc,
			   u32 clusters_to_del, u64 *delete_start, u8 *flags)
{
	int ret, i, index = path->p_tree_depth;
	u32 new_edge = 0;
	u64 deleted_eb = 0;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	*delete_start = 0;
	*flags = 0;

	while (index >= 0) {
		bh = path->p_node[index].bh;
		el = path->p_node[index].el;

		mlog(0, "traveling tree (index = %d, block = %llu)\n",
		     index, (unsigned long long)bh->b_blocknr);

		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);

		if (index !=
		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
			ocfs2_error(inode->i_sb,
				    "Inode %lu has invalid ext. block %llu",
				    inode->i_ino,
				    (unsigned long long)bh->b_blocknr);
			ret = -EROFS;
			goto out;
		}

find_tail_record:
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];

		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
		     ocfs2_rec_clusters(el, rec),
		     (unsigned long long)le64_to_cpu(rec->e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);

		if (le16_to_cpu(el->l_tree_depth) == 0) {
			/*
			 * If the leaf block contains a single empty
			 * extent and no records, we can just remove
			 * the block.
			 */
			if (i == 0 && ocfs2_is_empty_extent(rec)) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				el->l_next_free_rec = cpu_to_le16(0);

				goto delete;
			}

			/*
			 * Remove any empty extents by shifting things
			 * left. That should make life much easier on
			 * the code below. This condition is rare
			 * enough that we shouldn't see a performance
			 * hit.
			 */
			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
				le16_add_cpu(&el->l_next_free_rec, -1);

				for(i = 0;
				    i < le16_to_cpu(el->l_next_free_rec); i++)
					el->l_recs[i] = el->l_recs[i + 1];

				memset(&el->l_recs[i], 0,
				       sizeof(struct ocfs2_extent_rec));

				/*
				 * We've modified our extent list. The
				 * simplest way to handle this change
				 * is to begin the search from the
				 * start again.
				 */
				goto find_tail_record;
			}

			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);

			/*
			 * We'll use "new_edge" on our way back up the
			 * tree to know what our rightmost cpos is.
			 */
			new_edge = le16_to_cpu(rec->e_leaf_clusters);
			new_edge += le32_to_cpu(rec->e_cpos);

			/*
			 * The caller will use this to delete data blocks.
			 */
			*delete_start = le64_to_cpu(rec->e_blkno)
				+ ocfs2_clusters_to_blocks(inode->i_sb,
					le16_to_cpu(rec->e_leaf_clusters));
			*flags = rec->e_flags;

			/*
			 * If it's now empty, remove this record.
			 */
			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				le16_add_cpu(&el->l_next_free_rec, -1);
			}
		} else {
			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				le16_add_cpu(&el->l_next_free_rec, -1);

				goto delete;
			}

			/* Can this actually happen? */
			if (le16_to_cpu(el->l_next_free_rec) == 0)
				goto delete;

			/*
			 * We never actually deleted any clusters
			 * because our leaf was empty. There's no
			 * reason to adjust the rightmost edge then.
			 */
			if (new_edge == 0)
				goto delete;

			rec->e_int_clusters = cpu_to_le32(new_edge);
			le32_add_cpu(&rec->e_int_clusters,
				     -le32_to_cpu(rec->e_cpos));

			/*
			 * A deleted child record should have been
			 * caught above.
			 */
			BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
		}

delete:
		ret = ocfs2_journal_dirty(handle, bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "extent list container %llu, after: record %d: "
		     "(%u, %u, %llu), next = %u.\n",
		     (unsigned long long)bh->b_blocknr, i,
		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
		     (unsigned long long)le64_to_cpu(rec->e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		/*
		 * We must be careful to only attempt delete of an
		 * extent block (and not the root inode block).
		 */
		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
			struct ocfs2_extent_block *eb =
				(struct ocfs2_extent_block *)bh->b_data;

			/*
			 * Save this for use when processing the
			 * parent block.
			 */
			deleted_eb = le64_to_cpu(eb->h_blkno);

			mlog(0, "deleting this extent block.\n");

			ocfs2_remove_from_cache(INODE_CACHE(inode), bh);

			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));

			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
			/* An error here is not fatal. */
			if (ret < 0)
				mlog_errno(ret);
		} else {
			deleted_eb = 0;
		}

		index--;
	}

	ret = 0;
out:
	return ret;
}
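
/*
 * Worked example for ocfs2_trim_tree() (illustrative numbers): given
 * a tail leaf record with e_cpos == 100 and e_leaf_clusters == 8, and
 * clusters_to_del == 2, the record shrinks to 6 leaf clusters,
 * new_edge becomes 100 + 6 == 106, and *delete_start points at the
 * first block past the 6 remaining clusters, which is the range the
 * caller typically hands to the truncate log.
 */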

static int ocfs2_do_truncate(struct ocfs2_super *osb,
			     unsigned int clusters_to_del,
			     struct inode *inode,
			     struct buffer_head *fe_bh,
			     handle_t *handle,
			     struct ocfs2_truncate_context *tc,
			     struct ocfs2_path *path,
			     struct ocfs2_alloc_context *meta_ac)
{
	int status;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *last_eb = NULL;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh = NULL;
	u64 delete_blk = 0;
	u8 rec_flags;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
					     path, &last_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Each component will be touched, so we might as well journal
	 * here to avoid having to handle errors later.
	 */
	status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb_bh) {
		status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	}

	el = &(fe->id2.i_list);

	/*
	 * Lower levels depend on this never happening, but it's best
	 * to check it up here before changing the tree.
	 */
	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %lu has an empty extent record, depth %u\n",
			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
		status = -EROFS;
		goto bail;
	}

	dquot_free_space_nodirty(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
				      clusters_to_del;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
	inode->i_blocks = ocfs2_inode_sector_count(inode);

	status = ocfs2_trim_tree(inode, path, handle, tc,
				 clusters_to_del, &delete_blk, &rec_flags);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	if (le32_to_cpu(fe->i_clusters) == 0) {
		/* trunc to zero is a special case. */
		el->l_tree_depth = 0;
		fe->i_last_eb_blk = 0;
	} else if (last_eb)
		fe->i_last_eb_blk = last_eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb) {
		/* If there will be a new last extent block, then by
		 * definition, there cannot be any leaves to the right
		 * of it. */
		last_eb->h_next_leaf_blk = 0;
		status = ocfs2_journal_dirty(handle, last_eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (delete_blk) {
		if (rec_flags & OCFS2_EXT_REFCOUNTED)
			status = ocfs2_decrease_refcount(inode, handle,
					ocfs2_blocks_to_clusters(osb->sb,
								 delete_blk),
					clusters_to_del, meta_ac,
					&tc->tc_dealloc, 1);
		else
			status = ocfs2_truncate_log_append(osb, handle,
							   delete_blk,
							   clusters_to_del);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}
	status = 0;
bail:
	brelse(last_eb_bh);
	mlog_exit(status);
	return status;
}

static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}

void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
			      unsigned int from, unsigned int to,
			      struct page *page, int zero, u64 *phys)
{
	int ret, partial = 0;

	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
	if (ret)
		mlog_errno(ret);

	if (zero)
		zero_user_segment(page, from, to);

	/*
	 * Need to set the buffers we zero'd into uptodate
	 * here if they aren't - ocfs2_map_page_blocks()
	 * might've skipped some
	 */
	ret = walk_page_buffers(handle, page_buffers(page),
				from, to, &partial,
				ocfs2_zero_func);
	if (ret < 0)
		mlog_errno(ret);
	else if (ocfs2_should_order_data(inode)) {
		ret = ocfs2_jbd2_file_inode(handle, inode);
		if (ret < 0)
			mlog_errno(ret);
	}

	if (!partial)
		SetPageUptodate(page);

	flush_dcache_page(page);
}

static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
				     loff_t end, struct page **pages,
				     int numpages, u64 phys, handle_t *handle)
{
	int i;
	struct page *page;
	unsigned int from, to = PAGE_CACHE_SIZE;
	struct super_block *sb = inode->i_sb;

	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));

	if (numpages == 0)
		goto out;

	to = PAGE_CACHE_SIZE;
	for(i = 0; i < numpages; i++) {
		page = pages[i];

		from = start & (PAGE_CACHE_SIZE - 1);
		if ((end >> PAGE_CACHE_SHIFT) == page->index)
			to = end & (PAGE_CACHE_SIZE - 1);

		BUG_ON(from > PAGE_CACHE_SIZE);
		BUG_ON(to > PAGE_CACHE_SIZE);

		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
					 &phys);

		start = (page->index + 1) << PAGE_CACHE_SHIFT;
	}
out:
	if (pages)
		ocfs2_unlock_and_free_pages(pages, numpages);
}

int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
		     struct page **pages, int *num)
{
	int numpages, ret = 0;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	loff_t last_page_bytes;

	BUG_ON(start > end);

	numpages = 0;
	last_page_bytes = PAGE_ALIGN(end);
	index = start >> PAGE_CACHE_SHIFT;
	do {
		pages[numpages] = grab_cache_page(mapping, index);
		if (!pages[numpages]) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		numpages++;
		index++;
	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));

out:
	if (ret != 0) {
		if (pages)
			ocfs2_unlock_and_free_pages(pages, numpages);
		numpages = 0;
	}

	*num = numpages;

	return ret;
}
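
/*
 * Example of the index arithmetic above (illustrative, assuming 4k
 * pages): for start == 5000 and end == 9000, index starts at
 * 5000 >> 12 == 1 and last_page_bytes == PAGE_ALIGN(9000) == 12288,
 * so the loop grabs pages 1 and 2, i.e. every page touching the byte
 * range [5000, 9000).
 */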

static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
				struct page **pages, int *num)
{
	struct super_block *sb = inode->i_sb;

	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);

	return ocfs2_grab_pages(inode, start, end, pages, num);
}

/*
 * Zero the area past i_size but still within an allocated
 * cluster. This avoids exposing nonzero data on subsequent file
 * extends.
 *
 * We need to call this before i_size is updated on the inode because
 * otherwise block_write_full_page() will skip writeout of pages past
 * i_size; hence the explicit range_start and range_end parameters.
 */
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
				  u64 range_start, u64 range_end)
{
	int ret = 0, numpages;
	struct page **pages = NULL;
	u64 phys;
	unsigned int ext_flags;
	struct super_block *sb = inode->i_sb;

	/*
	 * File systems which don't support sparse files zero on every
	 * extend.
	 */
	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
		return 0;

	pages = kcalloc(ocfs2_pages_per_cluster(sb),
			sizeof(struct page *), GFP_NOFS);
	if (pages == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	if (range_start == range_end)
		goto out;

	ret = ocfs2_extent_map_get_blocks(inode,
					  range_start >> sb->s_blocksize_bits,
					  &phys, NULL, &ext_flags);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Tail is a hole, or is marked unwritten. In either case, we
	 * can count on read and write to return/push zeros.
	 */
	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
		goto out;

	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
				   &numpages);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
				 numpages, phys, handle);

	/*
	 * Initiate writeout of the pages we zero'd here. We don't
	 * wait on them - the truncate_inode_pages() call later will
	 * do that for us.
	 */
	ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
				       range_end - 1);
	if (ret)
		mlog_errno(ret);

out:
	if (pages)
		kfree(pages);

	return ret;
}

static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
					     struct ocfs2_dinode *di)
{
	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);

	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
		memset(&di->id2, 0, blocksize -
				    offsetof(struct ocfs2_dinode, id2) -
				    xattrsize);
	else
		memset(&di->id2, 0, blocksize -
				    offsetof(struct ocfs2_dinode, id2));
}

void ocfs2_dinode_new_extent_list(struct inode *inode,
				  struct ocfs2_dinode *di)
{
	ocfs2_zero_dinode_id2_with_xattr(inode, di);
	di->id2.i_list.l_tree_depth = 0;
	di->id2.i_list.l_next_free_rec = 0;
	di->id2.i_list.l_count = cpu_to_le16(
		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
}

void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	/*
	 * We clear the entire i_data structure here so that all
	 * fields can be properly initialized.
	 */
	ocfs2_zero_dinode_id2_with_xattr(inode, di);

	idata->id_count = cpu_to_le16(
			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
}

int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, i, has_data, num_pages = 0;
	handle_t *handle;
	u64 uninitialized_var(block);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page **pages = NULL;
	loff_t end = osb->s_clustersize;
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
				sizeof(struct page *), GFP_NOFS);
		if (pages == NULL) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
				   ocfs2_inline_to_extents_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		u32 bit_off, num;
		unsigned int page_end;
		u64 phys;

		ret = dquot_alloc_space_nodirty(inode,
				ocfs2_clusters_to_bytes(osb->sb, 1));
		if (ret)
			goto out_commit;
		did_quota = 1;

		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * Non sparse file systems zero on extend, so no need
		 * to do that now.
		 */
		if (!ocfs2_sparse_alloc(osb) &&
		    PAGE_CACHE_SIZE < osb->s_clustersize)
			end = PAGE_CACHE_SIZE;

		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		page_end = PAGE_CACHE_SIZE;
		if (PAGE_CACHE_SIZE > osb->s_clustersize)
			page_end = osb->s_clustersize;

		for (i = 0; i < num_pages; i++)
			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
						 pages[i], i > 0, &phys);
	}

	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_commit:
	if (ret < 0 && did_quota)
		dquot_free_space_nodirty(inode,
					 ocfs2_clusters_to_bytes(osb->sb, 1));

	ocfs2_commit_trans(osb, handle);

out_unlock:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

out:
	if (pages) {
		ocfs2_unlock_and_free_pages(pages, num_pages);
		kfree(pages);
	}

	return ret;
}
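
/*
 * The conversion above is the one-way "inline data no longer fits"
 * path.  A plausible trigger, sketched for orientation only (the real
 * caller sits in the write path in aops.c; treat the condition below
 * as an assumption, not a quote):
 *
 *	if ((oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
 *	    !ocfs2_size_fits_inline_data(di_bh, pos + len))
 *		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
 */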

/*
 * It is expected that, by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc)
{
	int status, i, credits, tl_sem = 0;
	u32 clusters_to_del, new_highest_cpos, range;
	u64 blkno = 0;
	struct ocfs2_extent_list *el;
	handle_t *handle = NULL;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_path *path = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_refcount_tree *ref_tree = NULL;

	mlog_entry_void();

	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						    i_size_read(inode));

	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
			      ocfs2_journal_access_di);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	ocfs2_extent_map_trunc(inode, new_highest_cpos);

start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	credits = 0;

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	i = le16_to_cpu(el->l_next_free_rec) - 1;
	range = le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
		clusters_to_del = 0;
	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
	} else if (range > new_highest_cpos) {
		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
				   le32_to_cpu(el->l_recs[i].e_cpos)) -
				  new_highest_cpos;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
			ocfs2_clusters_to_blocks(inode->i_sb,
				ocfs2_rec_clusters(el, &el->l_recs[i]) -
				clusters_to_del);
	} else {
		status = 0;
		goto bail;
	}
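
	/*
	 * Worked example for the cases above (illustrative numbers):
	 * with new_highest_cpos == 10 and a tail record covering cpos
	 * 8..15 (e_cpos == 8, 8 clusters), range == 16, so the third
	 * case applies: clusters_to_del == 16 - 10 == 6 and blkno is
	 * advanced to the block backing cpos 10, the first cluster
	 * being removed.
	 */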

	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);

	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
			 OCFS2_HAS_REFCOUNT_FL));

		status = ocfs2_lock_refcount_tree(osb,
						  le64_to_cpu(di->i_refcount_loc),
						  1, &ref_tree, NULL);
		if (status) {
			mlog_errno(status);
			goto bail;
		}

		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
							       blkno,
							       clusters_to_del,
							       &credits,
							       &meta_ac);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	mutex_lock(&tl_inode->i_mutex);
	tl_sem = 1;
	/* ocfs2_truncate_log_needs_flush guarantees us at least one
	 * record is free for use. If there isn't any, we flush to get
	 * an empty truncate log. */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		status = __ocfs2_flush_truncate_log(osb);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
					(struct ocfs2_dinode *)fe_bh->b_data,
					el);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
				   tc, path, meta_ac);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mutex_unlock(&tl_inode->i_mutex);
	tl_sem = 0;

	ocfs2_commit_trans(osb, handle);
	handle = NULL;

	ocfs2_reinit_path(path, 1);

	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}

	if (ref_tree) {
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
		ref_tree = NULL;
	}

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:

	ocfs2_schedule_truncate_log_flush(osb, 1);

	if (tl_sem)
		mutex_unlock(&tl_inode->i_mutex);

	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);

	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	ocfs2_run_deallocs(osb, &tc->tc_dealloc);

	ocfs2_free_path(path);

	/* This will drop the ext_alloc cluster lock for us */
	ocfs2_free_truncate_context(tc);

	mlog_exit(status);
	return status;
}
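
/*
 * Sketch of the expected calling sequence for the truncate pair
 * defined here (condensed from the truncate callers elsewhere in the
 * tree; error handling omitted):
 *
 *	struct ocfs2_truncate_context *tc = NULL;
 *
 *	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
 *	if (!status)
 *		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
 */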

/*
 * Expects the inode to already be locked.
 */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc)
{
	int status;
	unsigned int new_i_clusters;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct buffer_head *last_eb_bh = NULL;

	mlog_entry_void();

	*tc = NULL;

	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
						  i_size_read(inode));
	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
	     (unsigned long long)le64_to_cpu(fe->i_size));

	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
	if (!(*tc)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);

	if (fe->id2.i_list.l_tree_depth) {
		status = ocfs2_read_extent_block(INODE_CACHE(inode),
						 le64_to_cpu(fe->i_last_eb_blk),
						 &last_eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	}

	(*tc)->tc_last_eb_bh = last_eb_bh;

	status = 0;
bail:
	if (status < 0) {
		if (*tc)
			ocfs2_free_truncate_context(*tc);
		*tc = NULL;
	}
	mlog_exit_void();
	return status;
}

/*
 * 'start' is inclusive, 'end' is not.
 */
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
			  unsigned int start, unsigned int end, int trunc)
{
	int ret;
	unsigned int numbytes;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	if (end > i_size_read(inode))
		end = i_size_read(inode);

	BUG_ON(start >= end);

	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
	    !ocfs2_supports_inline_data(osb)) {
		ocfs2_error(inode->i_sb,
			    "Inline data flags for inode %llu don't agree! "
			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    le16_to_cpu(di->i_dyn_features),
			    OCFS2_I(inode)->ip_dyn_features,
			    osb->s_feature_incompat);
		ret = -EROFS;
		goto out;
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	numbytes = end - start;
	memset(idata->id_data + start, 0, numbytes);

	/*
	 * No need to worry about the data page here - it's been
	 * truncated already and inline data doesn't need it for
	 * pushing zeros to disk, so we'll let readpage pick it up
	 * later.
	 */
	if (trunc) {
		i_size_write(inode, start);
		di->i_size = cpu_to_le64(start);
	}

	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out:
	return ret;
}

static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
{
	/*
	 * The caller is responsible for completing deallocation
	 * before freeing the context.
	 */
	if (tc->tc_dealloc.c_first_suballocator != NULL)
		mlog(ML_NOTICE,
		     "Truncate completion has non-empty dealloc context\n");

	brelse(tc->tc_last_eb_bh);

	kfree(tc);
}