1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_inode.h" 15 #include "xfs_trans.h" 16 #include "xfs_buf_item.h" 17 #include "xfs_btree.h" 18 #include "xfs_errortag.h" 19 #include "xfs_error.h" 20 #include "xfs_trace.h" 21 #include "xfs_alloc.h" 22 #include "xfs_log.h" 23 #include "xfs_btree_staging.h" 24 #include "xfs_ag.h" 25 #include "xfs_alloc_btree.h" 26 #include "xfs_ialloc_btree.h" 27 #include "xfs_bmap_btree.h" 28 #include "xfs_rmap_btree.h" 29 #include "xfs_refcount_btree.h" 30 #include "xfs_health.h" 31 #include "xfs_buf_mem.h" 32 #include "xfs_btree_mem.h" 33 #include "xfs_rtrmap_btree.h" 34 #include "xfs_bmap.h" 35 #include "xfs_rmap.h" 36 #include "xfs_quota.h" 37 #include "xfs_metafile.h" 38 #include "xfs_rtrefcount_btree.h" 39 40 /* 41 * Btree magic numbers. 42 */ 43 uint32_t 44 xfs_btree_magic( 45 struct xfs_mount *mp, 46 const struct xfs_btree_ops *ops) 47 { 48 int idx = xfs_has_crc(mp) ? 1 : 0; 49 __be32 magic = ops->buf_ops->magic[idx]; 50 51 /* Ensure we asked for crc for crc-only magics. */ 52 ASSERT(magic != 0); 53 return be32_to_cpu(magic); 54 } 55 56 /* 57 * These sibling pointer checks are optimised for null sibling pointers. This 58 * happens a lot, and we don't need to byte swap at runtime if the sibling 59 * pointer is NULL. 60 * 61 * These are explicitly marked at inline because the cost of calling them as 62 * functions instead of inlining them is about 36 bytes extra code per call site 63 * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these 64 * two sibling check functions reduces the compiled code size by over 300 65 * bytes. 66 */ 67 static inline xfs_failaddr_t 68 xfs_btree_check_fsblock_siblings( 69 struct xfs_mount *mp, 70 xfs_fsblock_t fsb, 71 __be64 dsibling) 72 { 73 xfs_fsblock_t sibling; 74 75 if (dsibling == cpu_to_be64(NULLFSBLOCK)) 76 return NULL; 77 78 sibling = be64_to_cpu(dsibling); 79 if (sibling == fsb) 80 return __this_address; 81 if (!xfs_verify_fsbno(mp, sibling)) 82 return __this_address; 83 return NULL; 84 } 85 86 static inline xfs_failaddr_t 87 xfs_btree_check_memblock_siblings( 88 struct xfs_buftarg *btp, 89 xfbno_t bno, 90 __be64 dsibling) 91 { 92 xfbno_t sibling; 93 94 if (dsibling == cpu_to_be64(NULLFSBLOCK)) 95 return NULL; 96 97 sibling = be64_to_cpu(dsibling); 98 if (sibling == bno) 99 return __this_address; 100 if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) 101 return __this_address; 102 return NULL; 103 } 104 105 static inline xfs_failaddr_t 106 xfs_btree_check_agblock_siblings( 107 struct xfs_perag *pag, 108 xfs_agblock_t agbno, 109 __be32 dsibling) 110 { 111 xfs_agblock_t sibling; 112 113 if (dsibling == cpu_to_be32(NULLAGBLOCK)) 114 return NULL; 115 116 sibling = be32_to_cpu(dsibling); 117 if (sibling == agbno) 118 return __this_address; 119 if (!xfs_verify_agbno(pag, sibling)) 120 return __this_address; 121 return NULL; 122 } 123 124 static xfs_failaddr_t 125 __xfs_btree_check_lblock_hdr( 126 struct xfs_btree_cur *cur, 127 struct xfs_btree_block *block, 128 int level, 129 struct xfs_buf *bp) 130 { 131 struct xfs_mount *mp = cur->bc_mp; 132 133 if (xfs_has_crc(mp)) { 134 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 135 return __this_address; 136 if (block->bb_u.l.bb_blkno != 137 cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) 138 return __this_address; 139 if (block->bb_u.l.bb_pad != cpu_to_be32(0)) 140 return __this_address; 141 } 142 143 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) 144 return __this_address; 145 if (be16_to_cpu(block->bb_level) != level) 146 return __this_address; 147 if (be16_to_cpu(block->bb_numrecs) > 148 cur->bc_ops->get_maxrecs(cur, level)) 149 return __this_address; 150 151 return NULL; 152 } 153 154 /* 155 * Check a long btree block header. Return the address of the failing check, 156 * or NULL if everything is ok. 157 */ 158 static xfs_failaddr_t 159 __xfs_btree_check_fsblock( 160 struct xfs_btree_cur *cur, 161 struct xfs_btree_block *block, 162 int level, 163 struct xfs_buf *bp) 164 { 165 struct xfs_mount *mp = cur->bc_mp; 166 xfs_failaddr_t fa; 167 xfs_fsblock_t fsb; 168 169 fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); 170 if (fa) 171 return fa; 172 173 /* 174 * For inode-rooted btrees, the root block sits in the inode fork. In 175 * that case bp is NULL, and the block must not have any siblings. 176 */ 177 if (!bp) { 178 if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) 179 return __this_address; 180 if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) 181 return __this_address; 182 return NULL; 183 } 184 185 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 186 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 187 block->bb_u.l.bb_leftsib); 188 if (!fa) 189 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 190 block->bb_u.l.bb_rightsib); 191 return fa; 192 } 193 194 /* 195 * Check an in-memory btree block header. Return the address of the failing 196 * check, or NULL if everything is ok. 197 */ 198 static xfs_failaddr_t 199 __xfs_btree_check_memblock( 200 struct xfs_btree_cur *cur, 201 struct xfs_btree_block *block, 202 int level, 203 struct xfs_buf *bp) 204 { 205 struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; 206 xfs_failaddr_t fa; 207 xfbno_t bno; 208 209 fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); 210 if (fa) 211 return fa; 212 213 bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); 214 fa = xfs_btree_check_memblock_siblings(btp, bno, 215 block->bb_u.l.bb_leftsib); 216 if (!fa) 217 fa = xfs_btree_check_memblock_siblings(btp, bno, 218 block->bb_u.l.bb_rightsib); 219 return fa; 220 } 221 222 /* 223 * Check a short btree block header. Return the address of the failing check, 224 * or NULL if everything is ok. 225 */ 226 static xfs_failaddr_t 227 __xfs_btree_check_agblock( 228 struct xfs_btree_cur *cur, 229 struct xfs_btree_block *block, 230 int level, 231 struct xfs_buf *bp) 232 { 233 struct xfs_mount *mp = cur->bc_mp; 234 struct xfs_perag *pag = to_perag(cur->bc_group); 235 xfs_failaddr_t fa; 236 xfs_agblock_t agbno; 237 238 if (xfs_has_crc(mp)) { 239 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 240 return __this_address; 241 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 242 return __this_address; 243 } 244 245 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) 246 return __this_address; 247 if (be16_to_cpu(block->bb_level) != level) 248 return __this_address; 249 if (be16_to_cpu(block->bb_numrecs) > 250 cur->bc_ops->get_maxrecs(cur, level)) 251 return __this_address; 252 253 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 254 fa = xfs_btree_check_agblock_siblings(pag, agbno, 255 block->bb_u.s.bb_leftsib); 256 if (!fa) 257 fa = xfs_btree_check_agblock_siblings(pag, agbno, 258 block->bb_u.s.bb_rightsib); 259 return fa; 260 } 261 262 /* 263 * Internal btree block check. 264 * 265 * Return NULL if the block is ok or the address of the failed check otherwise. 266 */ 267 xfs_failaddr_t 268 __xfs_btree_check_block( 269 struct xfs_btree_cur *cur, 270 struct xfs_btree_block *block, 271 int level, 272 struct xfs_buf *bp) 273 { 274 switch (cur->bc_ops->type) { 275 case XFS_BTREE_TYPE_MEM: 276 return __xfs_btree_check_memblock(cur, block, level, bp); 277 case XFS_BTREE_TYPE_AG: 278 return __xfs_btree_check_agblock(cur, block, level, bp); 279 case XFS_BTREE_TYPE_INODE: 280 return __xfs_btree_check_fsblock(cur, block, level, bp); 281 default: 282 ASSERT(0); 283 return __this_address; 284 } 285 } 286 287 static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) 288 { 289 if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) 290 return XFS_ERRTAG_BTREE_CHECK_SBLOCK; 291 return XFS_ERRTAG_BTREE_CHECK_LBLOCK; 292 } 293 294 /* 295 * Debug routine: check that block header is ok. 296 */ 297 int 298 xfs_btree_check_block( 299 struct xfs_btree_cur *cur, /* btree cursor */ 300 struct xfs_btree_block *block, /* generic btree block pointer */ 301 int level, /* level of the btree block */ 302 struct xfs_buf *bp) /* buffer containing block, if any */ 303 { 304 struct xfs_mount *mp = cur->bc_mp; 305 xfs_failaddr_t fa; 306 307 fa = __xfs_btree_check_block(cur, block, level, bp); 308 if (XFS_IS_CORRUPT(mp, fa != NULL) || 309 XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { 310 if (bp) 311 trace_xfs_btree_corrupt(bp, _RET_IP_); 312 xfs_btree_mark_sick(cur); 313 return -EFSCORRUPTED; 314 } 315 return 0; 316 } 317 318 int 319 __xfs_btree_check_ptr( 320 struct xfs_btree_cur *cur, 321 const union xfs_btree_ptr *ptr, 322 int index, 323 int level) 324 { 325 if (level <= 0) 326 return -EFSCORRUPTED; 327 328 switch (cur->bc_ops->type) { 329 case XFS_BTREE_TYPE_MEM: 330 if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, 331 be64_to_cpu((&ptr->l)[index]))) 332 return -EFSCORRUPTED; 333 break; 334 case XFS_BTREE_TYPE_INODE: 335 if (!xfs_verify_fsbno(cur->bc_mp, 336 be64_to_cpu((&ptr->l)[index]))) 337 return -EFSCORRUPTED; 338 break; 339 case XFS_BTREE_TYPE_AG: 340 if (!xfs_verify_agbno(to_perag(cur->bc_group), 341 be32_to_cpu((&ptr->s)[index]))) 342 return -EFSCORRUPTED; 343 break; 344 } 345 346 return 0; 347 } 348 349 /* 350 * Check that a given (indexed) btree pointer at a certain level of a 351 * btree is valid and doesn't point past where it should. 352 */ 353 static int 354 xfs_btree_check_ptr( 355 struct xfs_btree_cur *cur, 356 const union xfs_btree_ptr *ptr, 357 int index, 358 int level) 359 { 360 int error; 361 362 error = __xfs_btree_check_ptr(cur, ptr, index, level); 363 if (error) { 364 switch (cur->bc_ops->type) { 365 case XFS_BTREE_TYPE_MEM: 366 xfs_err(cur->bc_mp, 367 "In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", 368 cur->bc_ops->name, cur->bc_flags, level, index, 369 __this_address); 370 break; 371 case XFS_BTREE_TYPE_INODE: 372 xfs_err(cur->bc_mp, 373 "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", 374 cur->bc_ino.ip->i_ino, 375 cur->bc_ino.whichfork, cur->bc_ops->name, 376 level, index); 377 break; 378 case XFS_BTREE_TYPE_AG: 379 xfs_err(cur->bc_mp, 380 "AG %u: Corrupt %sbt pointer at level %d index %d.", 381 cur->bc_group->xg_gno, cur->bc_ops->name, 382 level, index); 383 break; 384 } 385 xfs_btree_mark_sick(cur); 386 } 387 388 return error; 389 } 390 391 #ifdef DEBUG 392 # define xfs_btree_debug_check_ptr xfs_btree_check_ptr 393 #else 394 # define xfs_btree_debug_check_ptr(...) (0) 395 #endif 396 397 /* 398 * Calculate CRC on the whole btree block and stuff it into the 399 * long-form btree header. 400 * 401 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 402 * it into the buffer so recovery knows what the last modification was that made 403 * it to disk. 404 */ 405 void 406 xfs_btree_fsblock_calc_crc( 407 struct xfs_buf *bp) 408 { 409 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 410 struct xfs_buf_log_item *bip = bp->b_log_item; 411 412 if (!xfs_has_crc(bp->b_mount)) 413 return; 414 if (bip) 415 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 416 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 417 } 418 419 bool 420 xfs_btree_fsblock_verify_crc( 421 struct xfs_buf *bp) 422 { 423 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 424 struct xfs_mount *mp = bp->b_mount; 425 426 if (xfs_has_crc(mp)) { 427 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) 428 return false; 429 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 430 } 431 432 return true; 433 } 434 435 /* 436 * Calculate CRC on the whole btree block and stuff it into the 437 * short-form btree header. 438 * 439 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 440 * it into the buffer so recovery knows what the last modification was that made 441 * it to disk. 442 */ 443 void 444 xfs_btree_agblock_calc_crc( 445 struct xfs_buf *bp) 446 { 447 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 448 struct xfs_buf_log_item *bip = bp->b_log_item; 449 450 if (!xfs_has_crc(bp->b_mount)) 451 return; 452 if (bip) 453 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 454 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 455 } 456 457 bool 458 xfs_btree_agblock_verify_crc( 459 struct xfs_buf *bp) 460 { 461 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 462 struct xfs_mount *mp = bp->b_mount; 463 464 if (xfs_has_crc(mp)) { 465 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) 466 return false; 467 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 468 } 469 470 return true; 471 } 472 473 static int 474 xfs_btree_free_block( 475 struct xfs_btree_cur *cur, 476 struct xfs_buf *bp) 477 { 478 int error; 479 480 trace_xfs_btree_free_block(cur, bp); 481 482 /* 483 * Don't allow block freeing for a staging cursor, because staging 484 * cursors do not support regular btree modifications. 485 */ 486 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 487 ASSERT(0); 488 return -EFSCORRUPTED; 489 } 490 491 error = cur->bc_ops->free_block(cur, bp); 492 if (!error) { 493 xfs_trans_binval(cur->bc_tp, bp); 494 XFS_BTREE_STATS_INC(cur, free); 495 } 496 return error; 497 } 498 499 /* 500 * Delete the btree cursor. 501 */ 502 void 503 xfs_btree_del_cursor( 504 struct xfs_btree_cur *cur, /* btree cursor */ 505 int error) /* del because of error */ 506 { 507 int i; /* btree level */ 508 509 /* 510 * Clear the buffer pointers and release the buffers. If we're doing 511 * this because of an error, inspect all of the entries in the bc_bufs 512 * array for buffers to be unlocked. This is because some of the btree 513 * code works from level n down to 0, and if we get an error along the 514 * way we won't have initialized all the entries down to 0. 515 */ 516 for (i = 0; i < cur->bc_nlevels; i++) { 517 if (cur->bc_levels[i].bp) 518 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); 519 else if (!error) 520 break; 521 } 522 523 /* 524 * If we are doing a BMBT update, the number of unaccounted blocks 525 * allocated during this cursor life time should be zero. If it's not 526 * zero, then we should be shut down or on our way to shutdown due to 527 * cancelling a dirty transaction on error. 528 */ 529 ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || 530 xfs_is_shutdown(cur->bc_mp) || error != 0); 531 532 if (cur->bc_group) 533 xfs_group_put(cur->bc_group); 534 kmem_cache_free(cur->bc_cache, cur); 535 } 536 537 /* Return the buffer target for this btree's buffer. */ 538 static inline struct xfs_buftarg * 539 xfs_btree_buftarg( 540 struct xfs_btree_cur *cur) 541 { 542 if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) 543 return cur->bc_mem.xfbtree->target; 544 return cur->bc_mp->m_ddev_targp; 545 } 546 547 /* Return the block size (in units of 512b sectors) for this btree. */ 548 static inline unsigned int 549 xfs_btree_bbsize( 550 struct xfs_btree_cur *cur) 551 { 552 if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) 553 return XFBNO_BBSIZE; 554 return cur->bc_mp->m_bsize; 555 } 556 557 /* 558 * Duplicate the btree cursor. 559 * Allocate a new one, copy the record, re-get the buffers. 560 */ 561 int /* error */ 562 xfs_btree_dup_cursor( 563 struct xfs_btree_cur *cur, /* input cursor */ 564 struct xfs_btree_cur **ncur) /* output cursor */ 565 { 566 struct xfs_mount *mp = cur->bc_mp; 567 struct xfs_trans *tp = cur->bc_tp; 568 struct xfs_buf *bp; 569 struct xfs_btree_cur *new; 570 int error; 571 int i; 572 573 /* 574 * Don't allow staging cursors to be duplicated because they're supposed 575 * to be kept private to a single thread. 576 */ 577 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 578 ASSERT(0); 579 return -EFSCORRUPTED; 580 } 581 582 /* 583 * Allocate a new cursor like the old one. 584 */ 585 new = cur->bc_ops->dup_cursor(cur); 586 587 /* 588 * Copy the record currently in the cursor. 589 */ 590 new->bc_rec = cur->bc_rec; 591 592 /* 593 * For each level current, re-get the buffer and copy the ptr value. 594 */ 595 for (i = 0; i < new->bc_nlevels; i++) { 596 new->bc_levels[i].ptr = cur->bc_levels[i].ptr; 597 new->bc_levels[i].ra = cur->bc_levels[i].ra; 598 bp = cur->bc_levels[i].bp; 599 if (bp) { 600 error = xfs_trans_read_buf(mp, tp, 601 xfs_btree_buftarg(cur), 602 xfs_buf_daddr(bp), 603 xfs_btree_bbsize(cur), 0, &bp, 604 cur->bc_ops->buf_ops); 605 if (xfs_metadata_is_sick(error)) 606 xfs_btree_mark_sick(new); 607 if (error) { 608 xfs_btree_del_cursor(new, error); 609 *ncur = NULL; 610 return error; 611 } 612 } 613 new->bc_levels[i].bp = bp; 614 } 615 *ncur = new; 616 return 0; 617 } 618 619 /* 620 * XFS btree block layout and addressing: 621 * 622 * There are two types of blocks in the btree: leaf and non-leaf blocks. 623 * 624 * The leaf record start with a header then followed by records containing 625 * the values. A non-leaf block also starts with the same header, and 626 * then first contains lookup keys followed by an equal number of pointers 627 * to the btree blocks at the previous level. 628 * 629 * +--------+-------+-------+-------+-------+-------+-------+ 630 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | 631 * +--------+-------+-------+-------+-------+-------+-------+ 632 * 633 * +--------+-------+-------+-------+-------+-------+-------+ 634 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | 635 * +--------+-------+-------+-------+-------+-------+-------+ 636 * 637 * The header is called struct xfs_btree_block for reasons better left unknown 638 * and comes in different versions for short (32bit) and long (64bit) block 639 * pointers. The record and key structures are defined by the btree instances 640 * and opaque to the btree core. The block pointers are simple disk endian 641 * integers, available in a short (32bit) and long (64bit) variant. 642 * 643 * The helpers below calculate the offset of a given record, key or pointer 644 * into a btree block (xfs_btree_*_offset) or return a pointer to the given 645 * record, key or pointer (xfs_btree_*_addr). Note that all addressing 646 * inside the btree block is done using indices starting at one, not zero! 647 * 648 * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing 649 * overlapping intervals. In such a tree, records are still sorted lowest to 650 * highest and indexed by the smallest key value that refers to the record. 651 * However, nodes are different: each pointer has two associated keys -- one 652 * indexing the lowest key available in the block(s) below (the same behavior 653 * as the key in a regular btree) and another indexing the highest key 654 * available in the block(s) below. Because records are /not/ sorted by the 655 * highest key, all leaf block updates require us to compute the highest key 656 * that matches any record in the leaf and to recursively update the high keys 657 * in the nodes going further up in the tree, if necessary. Nodes look like 658 * this: 659 * 660 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 661 * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | 662 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 663 * 664 * To perform an interval query on an overlapped tree, perform the usual 665 * depth-first search and use the low and high keys to decide if we can skip 666 * that particular node. If a leaf node is reached, return the records that 667 * intersect the interval. Note that an interval query may return numerous 668 * entries. For a non-overlapped tree, simply search for the record associated 669 * with the lowest key and iterate forward until a non-matching record is 670 * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by 671 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in 672 * more detail. 673 * 674 * Why do we care about overlapping intervals? Let's say you have a bunch of 675 * reverse mapping records on a reflink filesystem: 676 * 677 * 1: +- file A startblock B offset C length D -----------+ 678 * 2: +- file E startblock F offset G length H --------------+ 679 * 3: +- file I startblock F offset J length K --+ 680 * 4: +- file L... --+ 681 * 682 * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, 683 * we'd simply increment the length of record 1. But how do we find the record 684 * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return 685 * record 3 because the keys are ordered first by startblock. An interval 686 * query would return records 1 and 2 because they both overlap (B+D-1), and 687 * from that we can pick out record 1 as the appropriate left neighbor. 688 * 689 * In the non-overlapped case you can do a LE lookup and decrement the cursor 690 * because a record's interval must end before the next record. 691 */ 692 693 /* 694 * Return size of the btree block header for this btree instance. 695 */ 696 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) 697 { 698 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 699 if (xfs_has_crc(cur->bc_mp)) 700 return XFS_BTREE_LBLOCK_CRC_LEN; 701 return XFS_BTREE_LBLOCK_LEN; 702 } 703 if (xfs_has_crc(cur->bc_mp)) 704 return XFS_BTREE_SBLOCK_CRC_LEN; 705 return XFS_BTREE_SBLOCK_LEN; 706 } 707 708 /* 709 * Calculate offset of the n-th record in a btree block. 710 */ 711 STATIC size_t 712 xfs_btree_rec_offset( 713 struct xfs_btree_cur *cur, 714 int n) 715 { 716 return xfs_btree_block_len(cur) + 717 (n - 1) * cur->bc_ops->rec_len; 718 } 719 720 /* 721 * Calculate offset of the n-th key in a btree block. 722 */ 723 STATIC size_t 724 xfs_btree_key_offset( 725 struct xfs_btree_cur *cur, 726 int n) 727 { 728 return xfs_btree_block_len(cur) + 729 (n - 1) * cur->bc_ops->key_len; 730 } 731 732 /* 733 * Calculate offset of the n-th high key in a btree block. 734 */ 735 STATIC size_t 736 xfs_btree_high_key_offset( 737 struct xfs_btree_cur *cur, 738 int n) 739 { 740 return xfs_btree_block_len(cur) + 741 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); 742 } 743 744 /* 745 * Calculate offset of the n-th block pointer in a btree block. 746 */ 747 STATIC size_t 748 xfs_btree_ptr_offset( 749 struct xfs_btree_cur *cur, 750 int n, 751 int level) 752 { 753 return xfs_btree_block_len(cur) + 754 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + 755 (n - 1) * cur->bc_ops->ptr_len; 756 } 757 758 /* 759 * Return a pointer to the n-th record in the btree block. 760 */ 761 union xfs_btree_rec * 762 xfs_btree_rec_addr( 763 struct xfs_btree_cur *cur, 764 int n, 765 struct xfs_btree_block *block) 766 { 767 return (union xfs_btree_rec *) 768 ((char *)block + xfs_btree_rec_offset(cur, n)); 769 } 770 771 /* 772 * Return a pointer to the n-th key in the btree block. 773 */ 774 union xfs_btree_key * 775 xfs_btree_key_addr( 776 struct xfs_btree_cur *cur, 777 int n, 778 struct xfs_btree_block *block) 779 { 780 return (union xfs_btree_key *) 781 ((char *)block + xfs_btree_key_offset(cur, n)); 782 } 783 784 /* 785 * Return a pointer to the n-th high key in the btree block. 786 */ 787 union xfs_btree_key * 788 xfs_btree_high_key_addr( 789 struct xfs_btree_cur *cur, 790 int n, 791 struct xfs_btree_block *block) 792 { 793 return (union xfs_btree_key *) 794 ((char *)block + xfs_btree_high_key_offset(cur, n)); 795 } 796 797 /* 798 * Return a pointer to the n-th block pointer in the btree block. 799 */ 800 union xfs_btree_ptr * 801 xfs_btree_ptr_addr( 802 struct xfs_btree_cur *cur, 803 int n, 804 struct xfs_btree_block *block) 805 { 806 int level = xfs_btree_get_level(block); 807 808 ASSERT(block->bb_level != 0); 809 810 return (union xfs_btree_ptr *) 811 ((char *)block + xfs_btree_ptr_offset(cur, n, level)); 812 } 813 814 struct xfs_ifork * 815 xfs_btree_ifork_ptr( 816 struct xfs_btree_cur *cur) 817 { 818 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 819 820 if (cur->bc_flags & XFS_BTREE_STAGING) 821 return cur->bc_ino.ifake->if_fork; 822 return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); 823 } 824 825 /* 826 * Get the root block which is stored in the inode. 827 * 828 * For now this btree implementation assumes the btree root is always 829 * stored in the if_broot field of an inode fork. 830 */ 831 STATIC struct xfs_btree_block * 832 xfs_btree_get_iroot( 833 struct xfs_btree_cur *cur) 834 { 835 struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 836 837 return (struct xfs_btree_block *)ifp->if_broot; 838 } 839 840 /* 841 * Retrieve the block pointer from the cursor at the given level. 842 * This may be an inode btree root or from a buffer. 843 */ 844 struct xfs_btree_block * /* generic btree block pointer */ 845 xfs_btree_get_block( 846 struct xfs_btree_cur *cur, /* btree cursor */ 847 int level, /* level in btree */ 848 struct xfs_buf **bpp) /* buffer containing the block */ 849 { 850 if (xfs_btree_at_iroot(cur, level)) { 851 *bpp = NULL; 852 return xfs_btree_get_iroot(cur); 853 } 854 855 *bpp = cur->bc_levels[level].bp; 856 return XFS_BUF_TO_BLOCK(*bpp); 857 } 858 859 /* 860 * Change the cursor to point to the first record at the given level. 861 * Other levels are unaffected. 862 */ 863 STATIC int /* success=1, failure=0 */ 864 xfs_btree_firstrec( 865 struct xfs_btree_cur *cur, /* btree cursor */ 866 int level) /* level to change */ 867 { 868 struct xfs_btree_block *block; /* generic btree block pointer */ 869 struct xfs_buf *bp; /* buffer containing block */ 870 871 /* 872 * Get the block pointer for this level. 873 */ 874 block = xfs_btree_get_block(cur, level, &bp); 875 if (xfs_btree_check_block(cur, block, level, bp)) 876 return 0; 877 /* 878 * It's empty, there is no such record. 879 */ 880 if (!block->bb_numrecs) 881 return 0; 882 /* 883 * Set the ptr value to 1, that's the first record/key. 884 */ 885 cur->bc_levels[level].ptr = 1; 886 return 1; 887 } 888 889 /* 890 * Change the cursor to point to the last record in the current block 891 * at the given level. Other levels are unaffected. 892 */ 893 STATIC int /* success=1, failure=0 */ 894 xfs_btree_lastrec( 895 struct xfs_btree_cur *cur, /* btree cursor */ 896 int level) /* level to change */ 897 { 898 struct xfs_btree_block *block; /* generic btree block pointer */ 899 struct xfs_buf *bp; /* buffer containing block */ 900 901 /* 902 * Get the block pointer for this level. 903 */ 904 block = xfs_btree_get_block(cur, level, &bp); 905 if (xfs_btree_check_block(cur, block, level, bp)) 906 return 0; 907 /* 908 * It's empty, there is no such record. 909 */ 910 if (!block->bb_numrecs) 911 return 0; 912 /* 913 * Set the ptr value to numrecs, that's the last record/key. 914 */ 915 cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); 916 return 1; 917 } 918 919 /* 920 * Compute first and last byte offsets for the fields given. 921 * Interprets the offsets table, which contains struct field offsets. 922 */ 923 void 924 xfs_btree_offsets( 925 uint32_t fields, /* bitmask of fields */ 926 const short *offsets, /* table of field offsets */ 927 int nbits, /* number of bits to inspect */ 928 int *first, /* output: first byte offset */ 929 int *last) /* output: last byte offset */ 930 { 931 int i; /* current bit number */ 932 uint32_t imask; /* mask for current bit number */ 933 934 ASSERT(fields != 0); 935 /* 936 * Find the lowest bit, so the first byte offset. 937 */ 938 for (i = 0, imask = 1u; ; i++, imask <<= 1) { 939 if (imask & fields) { 940 *first = offsets[i]; 941 break; 942 } 943 } 944 /* 945 * Find the highest bit, so the last byte offset. 946 */ 947 for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { 948 if (imask & fields) { 949 *last = offsets[i + 1] - 1; 950 break; 951 } 952 } 953 } 954 955 STATIC int 956 xfs_btree_readahead_fsblock( 957 struct xfs_btree_cur *cur, 958 int lr, 959 struct xfs_btree_block *block) 960 { 961 struct xfs_mount *mp = cur->bc_mp; 962 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 963 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 964 int rval = 0; 965 966 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 967 xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), 968 mp->m_bsize, cur->bc_ops->buf_ops); 969 rval++; 970 } 971 972 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 973 xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), 974 mp->m_bsize, cur->bc_ops->buf_ops); 975 rval++; 976 } 977 978 return rval; 979 } 980 981 STATIC int 982 xfs_btree_readahead_memblock( 983 struct xfs_btree_cur *cur, 984 int lr, 985 struct xfs_btree_block *block) 986 { 987 struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; 988 xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 989 xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 990 int rval = 0; 991 992 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 993 xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, 994 cur->bc_ops->buf_ops); 995 rval++; 996 } 997 998 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 999 xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, 1000 cur->bc_ops->buf_ops); 1001 rval++; 1002 } 1003 1004 return rval; 1005 } 1006 1007 STATIC int 1008 xfs_btree_readahead_agblock( 1009 struct xfs_btree_cur *cur, 1010 int lr, 1011 struct xfs_btree_block *block) 1012 { 1013 struct xfs_mount *mp = cur->bc_mp; 1014 struct xfs_perag *pag = to_perag(cur->bc_group); 1015 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); 1016 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); 1017 int rval = 0; 1018 1019 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 1020 xfs_buf_readahead(mp->m_ddev_targp, 1021 xfs_agbno_to_daddr(pag, left), mp->m_bsize, 1022 cur->bc_ops->buf_ops); 1023 rval++; 1024 } 1025 1026 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 1027 xfs_buf_readahead(mp->m_ddev_targp, 1028 xfs_agbno_to_daddr(pag, right), mp->m_bsize, 1029 cur->bc_ops->buf_ops); 1030 rval++; 1031 } 1032 1033 return rval; 1034 } 1035 1036 /* 1037 * Read-ahead btree blocks, at the given level. 1038 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 1039 */ 1040 STATIC int 1041 xfs_btree_readahead( 1042 struct xfs_btree_cur *cur, /* btree cursor */ 1043 int lev, /* level in btree */ 1044 int lr) /* left/right bits */ 1045 { 1046 struct xfs_btree_block *block; 1047 1048 /* 1049 * No readahead needed if we are at the root level and the 1050 * btree root is stored in the inode. 1051 */ 1052 if (xfs_btree_at_iroot(cur, lev)) 1053 return 0; 1054 1055 if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) 1056 return 0; 1057 1058 cur->bc_levels[lev].ra |= lr; 1059 block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); 1060 1061 switch (cur->bc_ops->type) { 1062 case XFS_BTREE_TYPE_AG: 1063 return xfs_btree_readahead_agblock(cur, lr, block); 1064 case XFS_BTREE_TYPE_INODE: 1065 return xfs_btree_readahead_fsblock(cur, lr, block); 1066 case XFS_BTREE_TYPE_MEM: 1067 return xfs_btree_readahead_memblock(cur, lr, block); 1068 default: 1069 ASSERT(0); 1070 return 0; 1071 } 1072 } 1073 1074 STATIC int 1075 xfs_btree_ptr_to_daddr( 1076 struct xfs_btree_cur *cur, 1077 const union xfs_btree_ptr *ptr, 1078 xfs_daddr_t *daddr) 1079 { 1080 int error; 1081 1082 error = xfs_btree_check_ptr(cur, ptr, 0, 1); 1083 if (error) 1084 return error; 1085 1086 switch (cur->bc_ops->type) { 1087 case XFS_BTREE_TYPE_AG: 1088 *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), 1089 be32_to_cpu(ptr->s)); 1090 break; 1091 case XFS_BTREE_TYPE_INODE: 1092 *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 1093 break; 1094 case XFS_BTREE_TYPE_MEM: 1095 *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); 1096 break; 1097 } 1098 return 0; 1099 } 1100 1101 /* 1102 * Readahead @count btree blocks at the given @ptr location. 1103 * 1104 * We don't need to care about long or short form btrees here as we have a 1105 * method of converting the ptr directly to a daddr available to us. 1106 */ 1107 STATIC void 1108 xfs_btree_readahead_ptr( 1109 struct xfs_btree_cur *cur, 1110 union xfs_btree_ptr *ptr, 1111 xfs_extlen_t count) 1112 { 1113 xfs_daddr_t daddr; 1114 1115 if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) 1116 return; 1117 xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, 1118 xfs_btree_bbsize(cur) * count, 1119 cur->bc_ops->buf_ops); 1120 } 1121 1122 /* 1123 * Set the buffer for level "lev" in the cursor to bp, releasing 1124 * any previous buffer. 1125 */ 1126 STATIC void 1127 xfs_btree_setbuf( 1128 struct xfs_btree_cur *cur, /* btree cursor */ 1129 int lev, /* level in btree */ 1130 struct xfs_buf *bp) /* new buffer to set */ 1131 { 1132 struct xfs_btree_block *b; /* btree block */ 1133 1134 if (cur->bc_levels[lev].bp) 1135 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); 1136 cur->bc_levels[lev].bp = bp; 1137 cur->bc_levels[lev].ra = 0; 1138 1139 b = XFS_BUF_TO_BLOCK(bp); 1140 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1141 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) 1142 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1143 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) 1144 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1145 } else { 1146 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 1147 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1148 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) 1149 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1150 } 1151 } 1152 1153 bool 1154 xfs_btree_ptr_is_null( 1155 struct xfs_btree_cur *cur, 1156 const union xfs_btree_ptr *ptr) 1157 { 1158 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 1159 return ptr->l == cpu_to_be64(NULLFSBLOCK); 1160 else 1161 return ptr->s == cpu_to_be32(NULLAGBLOCK); 1162 } 1163 1164 void 1165 xfs_btree_set_ptr_null( 1166 struct xfs_btree_cur *cur, 1167 union xfs_btree_ptr *ptr) 1168 { 1169 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 1170 ptr->l = cpu_to_be64(NULLFSBLOCK); 1171 else 1172 ptr->s = cpu_to_be32(NULLAGBLOCK); 1173 } 1174 1175 static inline bool 1176 xfs_btree_ptrs_equal( 1177 struct xfs_btree_cur *cur, 1178 union xfs_btree_ptr *ptr1, 1179 union xfs_btree_ptr *ptr2) 1180 { 1181 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 1182 return ptr1->l == ptr2->l; 1183 return ptr1->s == ptr2->s; 1184 } 1185 1186 /* 1187 * Get/set/init sibling pointers 1188 */ 1189 void 1190 xfs_btree_get_sibling( 1191 struct xfs_btree_cur *cur, 1192 struct xfs_btree_block *block, 1193 union xfs_btree_ptr *ptr, 1194 int lr) 1195 { 1196 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1197 1198 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1199 if (lr == XFS_BB_RIGHTSIB) 1200 ptr->l = block->bb_u.l.bb_rightsib; 1201 else 1202 ptr->l = block->bb_u.l.bb_leftsib; 1203 } else { 1204 if (lr == XFS_BB_RIGHTSIB) 1205 ptr->s = block->bb_u.s.bb_rightsib; 1206 else 1207 ptr->s = block->bb_u.s.bb_leftsib; 1208 } 1209 } 1210 1211 void 1212 xfs_btree_set_sibling( 1213 struct xfs_btree_cur *cur, 1214 struct xfs_btree_block *block, 1215 const union xfs_btree_ptr *ptr, 1216 int lr) 1217 { 1218 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1219 1220 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1221 if (lr == XFS_BB_RIGHTSIB) 1222 block->bb_u.l.bb_rightsib = ptr->l; 1223 else 1224 block->bb_u.l.bb_leftsib = ptr->l; 1225 } else { 1226 if (lr == XFS_BB_RIGHTSIB) 1227 block->bb_u.s.bb_rightsib = ptr->s; 1228 else 1229 block->bb_u.s.bb_leftsib = ptr->s; 1230 } 1231 } 1232 1233 static void 1234 __xfs_btree_init_block( 1235 struct xfs_mount *mp, 1236 struct xfs_btree_block *buf, 1237 const struct xfs_btree_ops *ops, 1238 xfs_daddr_t blkno, 1239 __u16 level, 1240 __u16 numrecs, 1241 __u64 owner) 1242 { 1243 bool crc = xfs_has_crc(mp); 1244 __u32 magic = xfs_btree_magic(mp, ops); 1245 1246 buf->bb_magic = cpu_to_be32(magic); 1247 buf->bb_level = cpu_to_be16(level); 1248 buf->bb_numrecs = cpu_to_be16(numrecs); 1249 1250 if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1251 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 1252 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 1253 if (crc) { 1254 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1255 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1256 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); 1257 buf->bb_u.l.bb_pad = 0; 1258 buf->bb_u.l.bb_lsn = 0; 1259 } 1260 } else { 1261 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 1262 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 1263 if (crc) { 1264 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 1265 /* owner is a 32 bit value on short blocks */ 1266 buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); 1267 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); 1268 buf->bb_u.s.bb_lsn = 0; 1269 } 1270 } 1271 } 1272 1273 void 1274 xfs_btree_init_block( 1275 struct xfs_mount *mp, 1276 struct xfs_btree_block *block, 1277 const struct xfs_btree_ops *ops, 1278 __u16 level, 1279 __u16 numrecs, 1280 __u64 owner) 1281 { 1282 __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, 1283 numrecs, owner); 1284 } 1285 1286 void 1287 xfs_btree_init_buf( 1288 struct xfs_mount *mp, 1289 struct xfs_buf *bp, 1290 const struct xfs_btree_ops *ops, 1291 __u16 level, 1292 __u16 numrecs, 1293 __u64 owner) 1294 { 1295 __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, 1296 xfs_buf_daddr(bp), level, numrecs, owner); 1297 bp->b_ops = ops->buf_ops; 1298 } 1299 1300 static inline __u64 1301 xfs_btree_owner( 1302 struct xfs_btree_cur *cur) 1303 { 1304 switch (cur->bc_ops->type) { 1305 case XFS_BTREE_TYPE_MEM: 1306 return cur->bc_mem.xfbtree->owner; 1307 case XFS_BTREE_TYPE_INODE: 1308 return cur->bc_ino.ip->i_ino; 1309 case XFS_BTREE_TYPE_AG: 1310 return cur->bc_group->xg_gno; 1311 default: 1312 ASSERT(0); 1313 return 0; 1314 } 1315 } 1316 1317 void 1318 xfs_btree_init_block_cur( 1319 struct xfs_btree_cur *cur, 1320 struct xfs_buf *bp, 1321 int level, 1322 int numrecs) 1323 { 1324 xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, 1325 xfs_btree_owner(cur)); 1326 } 1327 1328 STATIC void 1329 xfs_btree_buf_to_ptr( 1330 struct xfs_btree_cur *cur, 1331 struct xfs_buf *bp, 1332 union xfs_btree_ptr *ptr) 1333 { 1334 switch (cur->bc_ops->type) { 1335 case XFS_BTREE_TYPE_AG: 1336 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, 1337 xfs_buf_daddr(bp))); 1338 break; 1339 case XFS_BTREE_TYPE_INODE: 1340 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 1341 xfs_buf_daddr(bp))); 1342 break; 1343 case XFS_BTREE_TYPE_MEM: 1344 ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); 1345 break; 1346 } 1347 } 1348 1349 static inline void 1350 xfs_btree_set_refs( 1351 struct xfs_btree_cur *cur, 1352 struct xfs_buf *bp) 1353 { 1354 xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); 1355 } 1356 1357 int 1358 xfs_btree_get_buf_block( 1359 struct xfs_btree_cur *cur, 1360 const union xfs_btree_ptr *ptr, 1361 struct xfs_btree_block **block, 1362 struct xfs_buf **bpp) 1363 { 1364 xfs_daddr_t d; 1365 int error; 1366 1367 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1368 if (error) 1369 return error; 1370 error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, 1371 xfs_btree_bbsize(cur), 0, bpp); 1372 if (error) 1373 return error; 1374 1375 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1376 *block = XFS_BUF_TO_BLOCK(*bpp); 1377 return 0; 1378 } 1379 1380 /* 1381 * Read in the buffer at the given ptr and return the buffer and 1382 * the block pointer within the buffer. 1383 */ 1384 int 1385 xfs_btree_read_buf_block( 1386 struct xfs_btree_cur *cur, 1387 const union xfs_btree_ptr *ptr, 1388 int flags, 1389 struct xfs_btree_block **block, 1390 struct xfs_buf **bpp) 1391 { 1392 struct xfs_mount *mp = cur->bc_mp; 1393 xfs_daddr_t d; 1394 int error; 1395 1396 /* need to sort out how callers deal with failures first */ 1397 ASSERT(!(flags & XBF_TRYLOCK)); 1398 1399 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1400 if (error) 1401 return error; 1402 error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, 1403 xfs_btree_bbsize(cur), flags, bpp, 1404 cur->bc_ops->buf_ops); 1405 if (xfs_metadata_is_sick(error)) 1406 xfs_btree_mark_sick(cur); 1407 if (error) 1408 return error; 1409 1410 xfs_btree_set_refs(cur, *bpp); 1411 *block = XFS_BUF_TO_BLOCK(*bpp); 1412 return 0; 1413 } 1414 1415 /* 1416 * Copy keys from one btree block to another. 1417 */ 1418 void 1419 xfs_btree_copy_keys( 1420 struct xfs_btree_cur *cur, 1421 union xfs_btree_key *dst_key, 1422 const union xfs_btree_key *src_key, 1423 int numkeys) 1424 { 1425 ASSERT(numkeys >= 0); 1426 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); 1427 } 1428 1429 /* 1430 * Copy records from one btree block to another. 1431 */ 1432 STATIC void 1433 xfs_btree_copy_recs( 1434 struct xfs_btree_cur *cur, 1435 union xfs_btree_rec *dst_rec, 1436 union xfs_btree_rec *src_rec, 1437 int numrecs) 1438 { 1439 ASSERT(numrecs >= 0); 1440 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); 1441 } 1442 1443 /* 1444 * Copy block pointers from one btree block to another. 1445 */ 1446 void 1447 xfs_btree_copy_ptrs( 1448 struct xfs_btree_cur *cur, 1449 union xfs_btree_ptr *dst_ptr, 1450 const union xfs_btree_ptr *src_ptr, 1451 int numptrs) 1452 { 1453 ASSERT(numptrs >= 0); 1454 memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); 1455 } 1456 1457 /* 1458 * Shift keys one index left/right inside a single btree block. 1459 */ 1460 STATIC void 1461 xfs_btree_shift_keys( 1462 struct xfs_btree_cur *cur, 1463 union xfs_btree_key *key, 1464 int dir, 1465 int numkeys) 1466 { 1467 char *dst_key; 1468 1469 ASSERT(numkeys >= 0); 1470 ASSERT(dir == 1 || dir == -1); 1471 1472 dst_key = (char *)key + (dir * cur->bc_ops->key_len); 1473 memmove(dst_key, key, numkeys * cur->bc_ops->key_len); 1474 } 1475 1476 /* 1477 * Shift records one index left/right inside a single btree block. 1478 */ 1479 STATIC void 1480 xfs_btree_shift_recs( 1481 struct xfs_btree_cur *cur, 1482 union xfs_btree_rec *rec, 1483 int dir, 1484 int numrecs) 1485 { 1486 char *dst_rec; 1487 1488 ASSERT(numrecs >= 0); 1489 ASSERT(dir == 1 || dir == -1); 1490 1491 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); 1492 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); 1493 } 1494 1495 /* 1496 * Shift block pointers one index left/right inside a single btree block. 1497 */ 1498 STATIC void 1499 xfs_btree_shift_ptrs( 1500 struct xfs_btree_cur *cur, 1501 union xfs_btree_ptr *ptr, 1502 int dir, 1503 int numptrs) 1504 { 1505 char *dst_ptr; 1506 1507 ASSERT(numptrs >= 0); 1508 ASSERT(dir == 1 || dir == -1); 1509 1510 dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); 1511 memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); 1512 } 1513 1514 /* 1515 * Log key values from the btree block. 1516 */ 1517 STATIC void 1518 xfs_btree_log_keys( 1519 struct xfs_btree_cur *cur, 1520 struct xfs_buf *bp, 1521 int first, 1522 int last) 1523 { 1524 1525 if (bp) { 1526 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1527 xfs_trans_log_buf(cur->bc_tp, bp, 1528 xfs_btree_key_offset(cur, first), 1529 xfs_btree_key_offset(cur, last + 1) - 1); 1530 } else { 1531 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1532 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1533 } 1534 } 1535 1536 /* 1537 * Log record values from the btree block. 1538 */ 1539 void 1540 xfs_btree_log_recs( 1541 struct xfs_btree_cur *cur, 1542 struct xfs_buf *bp, 1543 int first, 1544 int last) 1545 { 1546 if (!bp) { 1547 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1548 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1549 return; 1550 } 1551 1552 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1553 xfs_trans_log_buf(cur->bc_tp, bp, 1554 xfs_btree_rec_offset(cur, first), 1555 xfs_btree_rec_offset(cur, last + 1) - 1); 1556 } 1557 1558 /* 1559 * Log block pointer fields from a btree block (nonleaf). 1560 */ 1561 STATIC void 1562 xfs_btree_log_ptrs( 1563 struct xfs_btree_cur *cur, /* btree cursor */ 1564 struct xfs_buf *bp, /* buffer containing btree block */ 1565 int first, /* index of first pointer to log */ 1566 int last) /* index of last pointer to log */ 1567 { 1568 1569 if (bp) { 1570 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 1571 int level = xfs_btree_get_level(block); 1572 1573 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1574 xfs_trans_log_buf(cur->bc_tp, bp, 1575 xfs_btree_ptr_offset(cur, first, level), 1576 xfs_btree_ptr_offset(cur, last + 1, level) - 1); 1577 } else { 1578 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1579 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1580 } 1581 1582 } 1583 1584 /* 1585 * Log fields from a btree block header. 1586 */ 1587 void 1588 xfs_btree_log_block( 1589 struct xfs_btree_cur *cur, /* btree cursor */ 1590 struct xfs_buf *bp, /* buffer containing btree block */ 1591 uint32_t fields) /* mask of fields: XFS_BB_... */ 1592 { 1593 int first; /* first byte offset logged */ 1594 int last; /* last byte offset logged */ 1595 static const short soffsets[] = { /* table of offsets (short) */ 1596 offsetof(struct xfs_btree_block, bb_magic), 1597 offsetof(struct xfs_btree_block, bb_level), 1598 offsetof(struct xfs_btree_block, bb_numrecs), 1599 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), 1600 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), 1601 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), 1602 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), 1603 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), 1604 offsetof(struct xfs_btree_block, bb_u.s.bb_owner), 1605 offsetof(struct xfs_btree_block, bb_u.s.bb_crc), 1606 XFS_BTREE_SBLOCK_CRC_LEN 1607 }; 1608 static const short loffsets[] = { /* table of offsets (long) */ 1609 offsetof(struct xfs_btree_block, bb_magic), 1610 offsetof(struct xfs_btree_block, bb_level), 1611 offsetof(struct xfs_btree_block, bb_numrecs), 1612 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), 1613 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), 1614 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), 1615 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), 1616 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), 1617 offsetof(struct xfs_btree_block, bb_u.l.bb_owner), 1618 offsetof(struct xfs_btree_block, bb_u.l.bb_crc), 1619 offsetof(struct xfs_btree_block, bb_u.l.bb_pad), 1620 XFS_BTREE_LBLOCK_CRC_LEN 1621 }; 1622 1623 if (bp) { 1624 int nbits; 1625 1626 if (xfs_has_crc(cur->bc_mp)) { 1627 /* 1628 * We don't log the CRC when updating a btree 1629 * block but instead recreate it during log 1630 * recovery. As the log buffers have checksums 1631 * of their own this is safe and avoids logging a crc 1632 * update in a lot of places. 1633 */ 1634 if (fields == XFS_BB_ALL_BITS) 1635 fields = XFS_BB_ALL_BITS_CRC; 1636 nbits = XFS_BB_NUM_BITS_CRC; 1637 } else { 1638 nbits = XFS_BB_NUM_BITS; 1639 } 1640 xfs_btree_offsets(fields, 1641 (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? 1642 loffsets : soffsets, 1643 nbits, &first, &last); 1644 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1645 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 1646 } else { 1647 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1648 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1649 } 1650 } 1651 1652 /* 1653 * Increment cursor by one record at the level. 1654 * For nonzero levels the leaf-ward information is untouched. 1655 */ 1656 int /* error */ 1657 xfs_btree_increment( 1658 struct xfs_btree_cur *cur, 1659 int level, 1660 int *stat) /* success/failure */ 1661 { 1662 struct xfs_btree_block *block; 1663 union xfs_btree_ptr ptr; 1664 struct xfs_buf *bp; 1665 int error; /* error return value */ 1666 int lev; 1667 1668 ASSERT(level < cur->bc_nlevels); 1669 1670 /* Read-ahead to the right at this level. */ 1671 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 1672 1673 /* Get a pointer to the btree block. */ 1674 block = xfs_btree_get_block(cur, level, &bp); 1675 1676 #ifdef DEBUG 1677 error = xfs_btree_check_block(cur, block, level, bp); 1678 if (error) 1679 goto error0; 1680 #endif 1681 1682 /* We're done if we remain in the block after the increment. */ 1683 if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) 1684 goto out1; 1685 1686 /* Fail if we just went off the right edge of the tree. */ 1687 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1688 if (xfs_btree_ptr_is_null(cur, &ptr)) 1689 goto out0; 1690 1691 XFS_BTREE_STATS_INC(cur, increment); 1692 1693 /* 1694 * March up the tree incrementing pointers. 1695 * Stop when we don't go off the right edge of a block. 1696 */ 1697 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1698 block = xfs_btree_get_block(cur, lev, &bp); 1699 1700 #ifdef DEBUG 1701 error = xfs_btree_check_block(cur, block, lev, bp); 1702 if (error) 1703 goto error0; 1704 #endif 1705 1706 if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) 1707 break; 1708 1709 /* Read-ahead the right block for the next loop. */ 1710 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); 1711 } 1712 1713 /* 1714 * If we went off the root then we are either seriously 1715 * confused or have the tree root in an inode. 1716 */ 1717 if (lev == cur->bc_nlevels) { 1718 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) 1719 goto out0; 1720 ASSERT(0); 1721 xfs_btree_mark_sick(cur); 1722 error = -EFSCORRUPTED; 1723 goto error0; 1724 } 1725 ASSERT(lev < cur->bc_nlevels); 1726 1727 /* 1728 * Now walk back down the tree, fixing up the cursor's buffer 1729 * pointers and key numbers. 1730 */ 1731 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1732 union xfs_btree_ptr *ptrp; 1733 1734 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1735 --lev; 1736 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1737 if (error) 1738 goto error0; 1739 1740 xfs_btree_setbuf(cur, lev, bp); 1741 cur->bc_levels[lev].ptr = 1; 1742 } 1743 out1: 1744 *stat = 1; 1745 return 0; 1746 1747 out0: 1748 *stat = 0; 1749 return 0; 1750 1751 error0: 1752 return error; 1753 } 1754 1755 /* 1756 * Decrement cursor by one record at the level. 1757 * For nonzero levels the leaf-ward information is untouched. 1758 */ 1759 int /* error */ 1760 xfs_btree_decrement( 1761 struct xfs_btree_cur *cur, 1762 int level, 1763 int *stat) /* success/failure */ 1764 { 1765 struct xfs_btree_block *block; 1766 struct xfs_buf *bp; 1767 int error; /* error return value */ 1768 int lev; 1769 union xfs_btree_ptr ptr; 1770 1771 ASSERT(level < cur->bc_nlevels); 1772 1773 /* Read-ahead to the left at this level. */ 1774 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); 1775 1776 /* We're done if we remain in the block after the decrement. */ 1777 if (--cur->bc_levels[level].ptr > 0) 1778 goto out1; 1779 1780 /* Get a pointer to the btree block. */ 1781 block = xfs_btree_get_block(cur, level, &bp); 1782 1783 #ifdef DEBUG 1784 error = xfs_btree_check_block(cur, block, level, bp); 1785 if (error) 1786 goto error0; 1787 #endif 1788 1789 /* Fail if we just went off the left edge of the tree. */ 1790 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 1791 if (xfs_btree_ptr_is_null(cur, &ptr)) 1792 goto out0; 1793 1794 XFS_BTREE_STATS_INC(cur, decrement); 1795 1796 /* 1797 * March up the tree decrementing pointers. 1798 * Stop when we don't go off the left edge of a block. 1799 */ 1800 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1801 if (--cur->bc_levels[lev].ptr > 0) 1802 break; 1803 /* Read-ahead the left block for the next loop. */ 1804 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); 1805 } 1806 1807 /* 1808 * If we went off the root then we are seriously confused. 1809 * or the root of the tree is in an inode. 1810 */ 1811 if (lev == cur->bc_nlevels) { 1812 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) 1813 goto out0; 1814 ASSERT(0); 1815 xfs_btree_mark_sick(cur); 1816 error = -EFSCORRUPTED; 1817 goto error0; 1818 } 1819 ASSERT(lev < cur->bc_nlevels); 1820 1821 /* 1822 * Now walk back down the tree, fixing up the cursor's buffer 1823 * pointers and key numbers. 1824 */ 1825 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1826 union xfs_btree_ptr *ptrp; 1827 1828 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1829 --lev; 1830 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1831 if (error) 1832 goto error0; 1833 xfs_btree_setbuf(cur, lev, bp); 1834 cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); 1835 } 1836 out1: 1837 *stat = 1; 1838 return 0; 1839 1840 out0: 1841 *stat = 0; 1842 return 0; 1843 1844 error0: 1845 return error; 1846 } 1847 1848 /* 1849 * Check the btree block owner now that we have the context to know who the 1850 * real owner is. 1851 */ 1852 static inline xfs_failaddr_t 1853 xfs_btree_check_block_owner( 1854 struct xfs_btree_cur *cur, 1855 struct xfs_btree_block *block) 1856 { 1857 __u64 owner; 1858 1859 if (!xfs_has_crc(cur->bc_mp) || 1860 (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) 1861 return NULL; 1862 1863 owner = xfs_btree_owner(cur); 1864 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1865 if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) 1866 return __this_address; 1867 } else { 1868 if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) 1869 return __this_address; 1870 } 1871 1872 return NULL; 1873 } 1874 1875 int 1876 xfs_btree_lookup_get_block( 1877 struct xfs_btree_cur *cur, /* btree cursor */ 1878 int level, /* level in the btree */ 1879 const union xfs_btree_ptr *pp, /* ptr to btree block */ 1880 struct xfs_btree_block **blkp) /* return btree block */ 1881 { 1882 struct xfs_buf *bp; /* buffer pointer for btree block */ 1883 xfs_daddr_t daddr; 1884 int error = 0; 1885 1886 /* special case the root block if in an inode */ 1887 if (xfs_btree_at_iroot(cur, level)) { 1888 *blkp = xfs_btree_get_iroot(cur); 1889 return 0; 1890 } 1891 1892 /* 1893 * If the old buffer at this level for the disk address we are 1894 * looking for re-use it. 1895 * 1896 * Otherwise throw it away and get a new one. 1897 */ 1898 bp = cur->bc_levels[level].bp; 1899 error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); 1900 if (error) 1901 return error; 1902 if (bp && xfs_buf_daddr(bp) == daddr) { 1903 *blkp = XFS_BUF_TO_BLOCK(bp); 1904 return 0; 1905 } 1906 1907 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); 1908 if (error) 1909 return error; 1910 1911 /* Check the inode owner since the verifiers don't. */ 1912 if (xfs_btree_check_block_owner(cur, *blkp) != NULL) 1913 goto out_bad; 1914 1915 /* Did we get the level we were looking for? */ 1916 if (be16_to_cpu((*blkp)->bb_level) != level) 1917 goto out_bad; 1918 1919 /* Check that internal nodes have at least one record. */ 1920 if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) 1921 goto out_bad; 1922 1923 xfs_btree_setbuf(cur, level, bp); 1924 return 0; 1925 1926 out_bad: 1927 *blkp = NULL; 1928 xfs_buf_mark_corrupt(bp); 1929 xfs_trans_brelse(cur->bc_tp, bp); 1930 xfs_btree_mark_sick(cur); 1931 return -EFSCORRUPTED; 1932 } 1933 1934 /* 1935 * Get current search key. For level 0 we don't actually have a key 1936 * structure so we make one up from the record. For all other levels 1937 * we just return the right key. 1938 */ 1939 STATIC union xfs_btree_key * 1940 xfs_lookup_get_search_key( 1941 struct xfs_btree_cur *cur, 1942 int level, 1943 int keyno, 1944 struct xfs_btree_block *block, 1945 union xfs_btree_key *kp) 1946 { 1947 if (level == 0) { 1948 cur->bc_ops->init_key_from_rec(kp, 1949 xfs_btree_rec_addr(cur, keyno, block)); 1950 return kp; 1951 } 1952 1953 return xfs_btree_key_addr(cur, keyno, block); 1954 } 1955 1956 /* 1957 * Initialize a pointer to the root block. 1958 */ 1959 void 1960 xfs_btree_init_ptr_from_cur( 1961 struct xfs_btree_cur *cur, 1962 union xfs_btree_ptr *ptr) 1963 { 1964 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 1965 /* 1966 * Inode-rooted btrees call xfs_btree_get_iroot to find the root 1967 * in xfs_btree_lookup_get_block and don't need a pointer here. 1968 */ 1969 ptr->l = 0; 1970 } else if (cur->bc_flags & XFS_BTREE_STAGING) { 1971 ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); 1972 } else { 1973 cur->bc_ops->init_ptr_from_cur(cur, ptr); 1974 } 1975 } 1976 1977 /* 1978 * Lookup the record. The cursor is made to point to it, based on dir. 1979 * stat is set to 0 if can't find any such record, 1 for success. 1980 */ 1981 int /* error */ 1982 xfs_btree_lookup( 1983 struct xfs_btree_cur *cur, /* btree cursor */ 1984 xfs_lookup_t dir, /* <=, ==, or >= */ 1985 int *stat) /* success/failure */ 1986 { 1987 struct xfs_btree_block *block; /* current btree block */ 1988 int cmp_r; /* current key comparison result */ 1989 int error; /* error return value */ 1990 int keyno; /* current key number */ 1991 int level; /* level in the btree */ 1992 union xfs_btree_ptr *pp; /* ptr to btree block */ 1993 union xfs_btree_ptr ptr; /* ptr to btree block */ 1994 1995 XFS_BTREE_STATS_INC(cur, lookup); 1996 1997 /* No such thing as a zero-level tree. */ 1998 if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { 1999 xfs_btree_mark_sick(cur); 2000 return -EFSCORRUPTED; 2001 } 2002 2003 block = NULL; 2004 keyno = 0; 2005 2006 /* initialise start pointer from cursor */ 2007 xfs_btree_init_ptr_from_cur(cur, &ptr); 2008 pp = &ptr; 2009 2010 /* 2011 * Iterate over each level in the btree, starting at the root. 2012 * For each level above the leaves, find the key we need, based 2013 * on the lookup record, then follow the corresponding block 2014 * pointer down to the next level. 2015 */ 2016 for (level = cur->bc_nlevels - 1, cmp_r = 1; level >= 0; level--) { 2017 /* Get the block we need to do the lookup on. */ 2018 error = xfs_btree_lookup_get_block(cur, level, pp, &block); 2019 if (error) 2020 goto error0; 2021 2022 if (cmp_r == 0) { 2023 /* 2024 * If we already had a key match at a higher level, we 2025 * know we need to use the first entry in this block. 2026 */ 2027 keyno = 1; 2028 } else { 2029 /* Otherwise search this block. Do a binary search. */ 2030 2031 int high; /* high entry number */ 2032 int low; /* low entry number */ 2033 2034 /* Set low and high entry numbers, 1-based. */ 2035 low = 1; 2036 high = xfs_btree_get_numrecs(block); 2037 if (!high) { 2038 /* Block is empty, must be an empty leaf. */ 2039 if (level != 0 || cur->bc_nlevels != 1) { 2040 XFS_CORRUPTION_ERROR(__func__, 2041 XFS_ERRLEVEL_LOW, 2042 cur->bc_mp, block, 2043 sizeof(*block)); 2044 xfs_btree_mark_sick(cur); 2045 return -EFSCORRUPTED; 2046 } 2047 2048 cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; 2049 *stat = 0; 2050 return 0; 2051 } 2052 2053 /* Binary search the block. */ 2054 while (low <= high) { 2055 union xfs_btree_key key; 2056 union xfs_btree_key *kp; 2057 2058 XFS_BTREE_STATS_INC(cur, compare); 2059 2060 /* keyno is average of low and high. */ 2061 keyno = (low + high) >> 1; 2062 2063 /* Get current search key */ 2064 kp = xfs_lookup_get_search_key(cur, level, 2065 keyno, block, &key); 2066 2067 /* 2068 * Compute comparison result to get next 2069 * direction: 2070 * - less than, move right 2071 * - greater than, move left 2072 * - equal, we're done 2073 */ 2074 cmp_r = cur->bc_ops->cmp_key_with_cur(cur, kp); 2075 if (cmp_r < 0) 2076 low = keyno + 1; 2077 else if (cmp_r > 0) 2078 high = keyno - 1; 2079 else 2080 break; 2081 } 2082 } 2083 2084 /* 2085 * If there are more levels, set up for the next level 2086 * by getting the block number and filling in the cursor. 2087 */ 2088 if (level > 0) { 2089 /* 2090 * If we moved left, need the previous key number, 2091 * unless there isn't one. 2092 */ 2093 if (cmp_r > 0 && --keyno < 1) 2094 keyno = 1; 2095 pp = xfs_btree_ptr_addr(cur, keyno, block); 2096 2097 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 2098 if (error) 2099 goto error0; 2100 2101 cur->bc_levels[level].ptr = keyno; 2102 } 2103 } 2104 2105 /* Done with the search. See if we need to adjust the results. */ 2106 if (dir != XFS_LOOKUP_LE && cmp_r < 0) { 2107 keyno++; 2108 /* 2109 * If ge search and we went off the end of the block, but it's 2110 * not the last block, we're in the wrong block. 2111 */ 2112 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2113 if (dir == XFS_LOOKUP_GE && 2114 keyno > xfs_btree_get_numrecs(block) && 2115 !xfs_btree_ptr_is_null(cur, &ptr)) { 2116 int i; 2117 2118 cur->bc_levels[0].ptr = keyno; 2119 error = xfs_btree_increment(cur, 0, &i); 2120 if (error) 2121 goto error0; 2122 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 2123 xfs_btree_mark_sick(cur); 2124 return -EFSCORRUPTED; 2125 } 2126 *stat = 1; 2127 return 0; 2128 } 2129 } else if (dir == XFS_LOOKUP_LE && cmp_r > 0) 2130 keyno--; 2131 cur->bc_levels[0].ptr = keyno; 2132 2133 /* Return if we succeeded or not. */ 2134 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2135 *stat = 0; 2136 else if (dir != XFS_LOOKUP_EQ || cmp_r == 0) 2137 *stat = 1; 2138 else 2139 *stat = 0; 2140 return 0; 2141 2142 error0: 2143 return error; 2144 } 2145 2146 /* Find the high key storage area from a regular key. */ 2147 union xfs_btree_key * 2148 xfs_btree_high_key_from_key( 2149 struct xfs_btree_cur *cur, 2150 union xfs_btree_key *key) 2151 { 2152 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2153 return (union xfs_btree_key *)((char *)key + 2154 (cur->bc_ops->key_len / 2)); 2155 } 2156 2157 /* Determine the low (and high if overlapped) keys of a leaf block */ 2158 STATIC void 2159 xfs_btree_get_leaf_keys( 2160 struct xfs_btree_cur *cur, 2161 struct xfs_btree_block *block, 2162 union xfs_btree_key *key) 2163 { 2164 union xfs_btree_key max_hkey; 2165 union xfs_btree_key hkey; 2166 union xfs_btree_rec *rec; 2167 union xfs_btree_key *high; 2168 int n; 2169 2170 rec = xfs_btree_rec_addr(cur, 1, block); 2171 cur->bc_ops->init_key_from_rec(key, rec); 2172 2173 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2174 2175 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2176 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2177 rec = xfs_btree_rec_addr(cur, n, block); 2178 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2179 if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) 2180 max_hkey = hkey; 2181 } 2182 2183 high = xfs_btree_high_key_from_key(cur, key); 2184 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2185 } 2186 } 2187 2188 /* Determine the low (and high if overlapped) keys of a node block */ 2189 STATIC void 2190 xfs_btree_get_node_keys( 2191 struct xfs_btree_cur *cur, 2192 struct xfs_btree_block *block, 2193 union xfs_btree_key *key) 2194 { 2195 union xfs_btree_key *hkey; 2196 union xfs_btree_key *max_hkey; 2197 union xfs_btree_key *high; 2198 int n; 2199 2200 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2201 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2202 cur->bc_ops->key_len / 2); 2203 2204 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2205 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2206 hkey = xfs_btree_high_key_addr(cur, n, block); 2207 if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) 2208 max_hkey = hkey; 2209 } 2210 2211 high = xfs_btree_high_key_from_key(cur, key); 2212 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2213 } else { 2214 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2215 cur->bc_ops->key_len); 2216 } 2217 } 2218 2219 /* Derive the keys for any btree block. */ 2220 void 2221 xfs_btree_get_keys( 2222 struct xfs_btree_cur *cur, 2223 struct xfs_btree_block *block, 2224 union xfs_btree_key *key) 2225 { 2226 if (be16_to_cpu(block->bb_level) == 0) 2227 xfs_btree_get_leaf_keys(cur, block, key); 2228 else 2229 xfs_btree_get_node_keys(cur, block, key); 2230 } 2231 2232 /* 2233 * Decide if we need to update the parent keys of a btree block. For 2234 * a standard btree this is only necessary if we're updating the first 2235 * record/key. For an overlapping btree, we must always update the 2236 * keys because the highest key can be in any of the records or keys 2237 * in the block. 2238 */ 2239 static inline bool 2240 xfs_btree_needs_key_update( 2241 struct xfs_btree_cur *cur, 2242 int ptr) 2243 { 2244 return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; 2245 } 2246 2247 /* 2248 * Update the low and high parent keys of the given level, progressing 2249 * towards the root. If force_all is false, stop if the keys for a given 2250 * level do not need updating. 2251 */ 2252 STATIC int 2253 __xfs_btree_updkeys( 2254 struct xfs_btree_cur *cur, 2255 int level, 2256 struct xfs_btree_block *block, 2257 struct xfs_buf *bp0, 2258 bool force_all) 2259 { 2260 union xfs_btree_key key; /* keys from current level */ 2261 union xfs_btree_key *lkey; /* keys from the next level up */ 2262 union xfs_btree_key *hkey; 2263 union xfs_btree_key *nlkey; /* keys from the next level up */ 2264 union xfs_btree_key *nhkey; 2265 struct xfs_buf *bp; 2266 int ptr; 2267 2268 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2269 2270 /* Exit if there aren't any parent levels to update. */ 2271 if (level + 1 >= cur->bc_nlevels) 2272 return 0; 2273 2274 trace_xfs_btree_updkeys(cur, level, bp0); 2275 2276 lkey = &key; 2277 hkey = xfs_btree_high_key_from_key(cur, lkey); 2278 xfs_btree_get_keys(cur, block, lkey); 2279 for (level++; level < cur->bc_nlevels; level++) { 2280 #ifdef DEBUG 2281 int error; 2282 #endif 2283 block = xfs_btree_get_block(cur, level, &bp); 2284 trace_xfs_btree_updkeys(cur, level, bp); 2285 #ifdef DEBUG 2286 error = xfs_btree_check_block(cur, block, level, bp); 2287 if (error) 2288 return error; 2289 #endif 2290 ptr = cur->bc_levels[level].ptr; 2291 nlkey = xfs_btree_key_addr(cur, ptr, block); 2292 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2293 if (!force_all && 2294 xfs_btree_keycmp_eq(cur, nlkey, lkey) && 2295 xfs_btree_keycmp_eq(cur, nhkey, hkey)) 2296 break; 2297 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2298 xfs_btree_log_keys(cur, bp, ptr, ptr); 2299 if (level + 1 >= cur->bc_nlevels) 2300 break; 2301 xfs_btree_get_node_keys(cur, block, lkey); 2302 } 2303 2304 return 0; 2305 } 2306 2307 /* Update all the keys from some level in cursor back to the root. */ 2308 STATIC int 2309 xfs_btree_updkeys_force( 2310 struct xfs_btree_cur *cur, 2311 int level) 2312 { 2313 struct xfs_buf *bp; 2314 struct xfs_btree_block *block; 2315 2316 block = xfs_btree_get_block(cur, level, &bp); 2317 return __xfs_btree_updkeys(cur, level, block, bp, true); 2318 } 2319 2320 /* 2321 * Update the parent keys of the given level, progressing towards the root. 2322 */ 2323 STATIC int 2324 xfs_btree_update_keys( 2325 struct xfs_btree_cur *cur, 2326 int level) 2327 { 2328 struct xfs_btree_block *block; 2329 struct xfs_buf *bp; 2330 union xfs_btree_key *kp; 2331 union xfs_btree_key key; 2332 int ptr; 2333 2334 ASSERT(level >= 0); 2335 2336 block = xfs_btree_get_block(cur, level, &bp); 2337 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) 2338 return __xfs_btree_updkeys(cur, level, block, bp, false); 2339 2340 /* 2341 * Go up the tree from this level toward the root. 2342 * At each level, update the key value to the value input. 2343 * Stop when we reach a level where the cursor isn't pointing 2344 * at the first entry in the block. 2345 */ 2346 xfs_btree_get_keys(cur, block, &key); 2347 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2348 #ifdef DEBUG 2349 int error; 2350 #endif 2351 block = xfs_btree_get_block(cur, level, &bp); 2352 #ifdef DEBUG 2353 error = xfs_btree_check_block(cur, block, level, bp); 2354 if (error) 2355 return error; 2356 #endif 2357 ptr = cur->bc_levels[level].ptr; 2358 kp = xfs_btree_key_addr(cur, ptr, block); 2359 xfs_btree_copy_keys(cur, kp, &key, 1); 2360 xfs_btree_log_keys(cur, bp, ptr, ptr); 2361 } 2362 2363 return 0; 2364 } 2365 2366 /* 2367 * Update the record referred to by cur to the value in the 2368 * given record. This either works (return 0) or gets an 2369 * EFSCORRUPTED error. 2370 */ 2371 int 2372 xfs_btree_update( 2373 struct xfs_btree_cur *cur, 2374 union xfs_btree_rec *rec) 2375 { 2376 struct xfs_btree_block *block; 2377 struct xfs_buf *bp; 2378 int error; 2379 int ptr; 2380 union xfs_btree_rec *rp; 2381 2382 /* Pick up the current block. */ 2383 block = xfs_btree_get_block(cur, 0, &bp); 2384 2385 #ifdef DEBUG 2386 error = xfs_btree_check_block(cur, block, 0, bp); 2387 if (error) 2388 goto error0; 2389 #endif 2390 /* Get the address of the rec to be updated. */ 2391 ptr = cur->bc_levels[0].ptr; 2392 rp = xfs_btree_rec_addr(cur, ptr, block); 2393 2394 /* Fill in the new contents and log them. */ 2395 xfs_btree_copy_recs(cur, rp, rec, 1); 2396 xfs_btree_log_recs(cur, bp, ptr, ptr); 2397 2398 /* Pass new key value up to our parent. */ 2399 if (xfs_btree_needs_key_update(cur, ptr)) { 2400 error = xfs_btree_update_keys(cur, 0); 2401 if (error) 2402 goto error0; 2403 } 2404 2405 return 0; 2406 2407 error0: 2408 return error; 2409 } 2410 2411 /* 2412 * Move 1 record left from cur/level if possible. 2413 * Update cur to reflect the new path. 2414 */ 2415 STATIC int /* error */ 2416 xfs_btree_lshift( 2417 struct xfs_btree_cur *cur, 2418 int level, 2419 int *stat) /* success/failure */ 2420 { 2421 struct xfs_buf *lbp; /* left buffer pointer */ 2422 struct xfs_btree_block *left; /* left btree block */ 2423 int lrecs; /* left record count */ 2424 struct xfs_buf *rbp; /* right buffer pointer */ 2425 struct xfs_btree_block *right; /* right btree block */ 2426 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2427 int rrecs; /* right record count */ 2428 union xfs_btree_ptr lptr; /* left btree pointer */ 2429 union xfs_btree_key *rkp = NULL; /* right btree key */ 2430 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2431 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2432 int error; /* error return value */ 2433 int i; 2434 2435 if (xfs_btree_at_iroot(cur, level)) 2436 goto out0; 2437 2438 /* Set up variables for this block as "right". */ 2439 right = xfs_btree_get_block(cur, level, &rbp); 2440 2441 #ifdef DEBUG 2442 error = xfs_btree_check_block(cur, right, level, rbp); 2443 if (error) 2444 goto error0; 2445 #endif 2446 2447 /* If we've got no left sibling then we can't shift an entry left. */ 2448 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2449 if (xfs_btree_ptr_is_null(cur, &lptr)) 2450 goto out0; 2451 2452 /* 2453 * If the cursor entry is the one that would be moved, don't 2454 * do it... it's too complicated. 2455 */ 2456 if (cur->bc_levels[level].ptr <= 1) 2457 goto out0; 2458 2459 /* Set up the left neighbor as "left". */ 2460 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2461 if (error) 2462 goto error0; 2463 2464 /* If it's full, it can't take another entry. */ 2465 lrecs = xfs_btree_get_numrecs(left); 2466 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2467 goto out0; 2468 2469 rrecs = xfs_btree_get_numrecs(right); 2470 2471 /* 2472 * We add one entry to the left side and remove one for the right side. 2473 * Account for it here, the changes will be updated on disk and logged 2474 * later. 2475 */ 2476 lrecs++; 2477 rrecs--; 2478 2479 XFS_BTREE_STATS_INC(cur, lshift); 2480 XFS_BTREE_STATS_ADD(cur, moves, 1); 2481 2482 /* 2483 * If non-leaf, copy a key and a ptr to the left block. 2484 * Log the changes to the left block. 2485 */ 2486 if (level > 0) { 2487 /* It's a non-leaf. Move keys and pointers. */ 2488 union xfs_btree_key *lkp; /* left btree key */ 2489 union xfs_btree_ptr *lpp; /* left address pointer */ 2490 2491 lkp = xfs_btree_key_addr(cur, lrecs, left); 2492 rkp = xfs_btree_key_addr(cur, 1, right); 2493 2494 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2495 rpp = xfs_btree_ptr_addr(cur, 1, right); 2496 2497 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2498 if (error) 2499 goto error0; 2500 2501 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2502 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2503 2504 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2505 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2506 2507 ASSERT(cur->bc_ops->keys_inorder(cur, 2508 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2509 } else { 2510 /* It's a leaf. Move records. */ 2511 union xfs_btree_rec *lrp; /* left record pointer */ 2512 2513 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2514 rrp = xfs_btree_rec_addr(cur, 1, right); 2515 2516 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2517 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2518 2519 ASSERT(cur->bc_ops->recs_inorder(cur, 2520 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2521 } 2522 2523 xfs_btree_set_numrecs(left, lrecs); 2524 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2525 2526 xfs_btree_set_numrecs(right, rrecs); 2527 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2528 2529 /* 2530 * Slide the contents of right down one entry. 2531 */ 2532 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2533 if (level > 0) { 2534 /* It's a nonleaf. operate on keys and ptrs */ 2535 for (i = 0; i < rrecs; i++) { 2536 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2537 if (error) 2538 goto error0; 2539 } 2540 2541 xfs_btree_shift_keys(cur, 2542 xfs_btree_key_addr(cur, 2, right), 2543 -1, rrecs); 2544 xfs_btree_shift_ptrs(cur, 2545 xfs_btree_ptr_addr(cur, 2, right), 2546 -1, rrecs); 2547 2548 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2549 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2550 } else { 2551 /* It's a leaf. operate on records */ 2552 xfs_btree_shift_recs(cur, 2553 xfs_btree_rec_addr(cur, 2, right), 2554 -1, rrecs); 2555 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2556 } 2557 2558 /* 2559 * Using a temporary cursor, update the parent key values of the 2560 * block on the left. 2561 */ 2562 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2563 error = xfs_btree_dup_cursor(cur, &tcur); 2564 if (error) 2565 goto error0; 2566 i = xfs_btree_firstrec(tcur, level); 2567 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2568 xfs_btree_mark_sick(cur); 2569 error = -EFSCORRUPTED; 2570 goto error0; 2571 } 2572 2573 error = xfs_btree_decrement(tcur, level, &i); 2574 if (error) 2575 goto error1; 2576 2577 /* Update the parent high keys of the left block, if needed. */ 2578 error = xfs_btree_update_keys(tcur, level); 2579 if (error) 2580 goto error1; 2581 2582 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2583 } 2584 2585 /* Update the parent keys of the right block. */ 2586 error = xfs_btree_update_keys(cur, level); 2587 if (error) 2588 goto error0; 2589 2590 /* Slide the cursor value left one. */ 2591 cur->bc_levels[level].ptr--; 2592 2593 *stat = 1; 2594 return 0; 2595 2596 out0: 2597 *stat = 0; 2598 return 0; 2599 2600 error0: 2601 return error; 2602 2603 error1: 2604 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2605 return error; 2606 } 2607 2608 /* 2609 * Move 1 record right from cur/level if possible. 2610 * Update cur to reflect the new path. 2611 */ 2612 STATIC int /* error */ 2613 xfs_btree_rshift( 2614 struct xfs_btree_cur *cur, 2615 int level, 2616 int *stat) /* success/failure */ 2617 { 2618 struct xfs_buf *lbp; /* left buffer pointer */ 2619 struct xfs_btree_block *left; /* left btree block */ 2620 struct xfs_buf *rbp; /* right buffer pointer */ 2621 struct xfs_btree_block *right; /* right btree block */ 2622 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2623 union xfs_btree_ptr rptr; /* right block pointer */ 2624 union xfs_btree_key *rkp; /* right btree key */ 2625 int rrecs; /* right record count */ 2626 int lrecs; /* left record count */ 2627 int error; /* error return value */ 2628 int i; /* loop counter */ 2629 2630 if (xfs_btree_at_iroot(cur, level)) 2631 goto out0; 2632 2633 /* Set up variables for this block as "left". */ 2634 left = xfs_btree_get_block(cur, level, &lbp); 2635 2636 #ifdef DEBUG 2637 error = xfs_btree_check_block(cur, left, level, lbp); 2638 if (error) 2639 goto error0; 2640 #endif 2641 2642 /* If we've got no right sibling then we can't shift an entry right. */ 2643 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2644 if (xfs_btree_ptr_is_null(cur, &rptr)) 2645 goto out0; 2646 2647 /* 2648 * If the cursor entry is the one that would be moved, don't 2649 * do it... it's too complicated. 2650 */ 2651 lrecs = xfs_btree_get_numrecs(left); 2652 if (cur->bc_levels[level].ptr >= lrecs) 2653 goto out0; 2654 2655 /* Set up the right neighbor as "right". */ 2656 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2657 if (error) 2658 goto error0; 2659 2660 /* If it's full, it can't take another entry. */ 2661 rrecs = xfs_btree_get_numrecs(right); 2662 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2663 goto out0; 2664 2665 XFS_BTREE_STATS_INC(cur, rshift); 2666 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2667 2668 /* 2669 * Make a hole at the start of the right neighbor block, then 2670 * copy the last left block entry to the hole. 2671 */ 2672 if (level > 0) { 2673 /* It's a nonleaf. make a hole in the keys and ptrs */ 2674 union xfs_btree_key *lkp; 2675 union xfs_btree_ptr *lpp; 2676 union xfs_btree_ptr *rpp; 2677 2678 lkp = xfs_btree_key_addr(cur, lrecs, left); 2679 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2680 rkp = xfs_btree_key_addr(cur, 1, right); 2681 rpp = xfs_btree_ptr_addr(cur, 1, right); 2682 2683 for (i = rrecs - 1; i >= 0; i--) { 2684 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2685 if (error) 2686 goto error0; 2687 } 2688 2689 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2690 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2691 2692 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2693 if (error) 2694 goto error0; 2695 2696 /* Now put the new data in, and log it. */ 2697 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2698 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2699 2700 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2701 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2702 2703 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2704 xfs_btree_key_addr(cur, 2, right))); 2705 } else { 2706 /* It's a leaf. make a hole in the records */ 2707 union xfs_btree_rec *lrp; 2708 union xfs_btree_rec *rrp; 2709 2710 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2711 rrp = xfs_btree_rec_addr(cur, 1, right); 2712 2713 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2714 2715 /* Now put the new data in, and log it. */ 2716 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2717 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2718 } 2719 2720 /* 2721 * Decrement and log left's numrecs, bump and log right's numrecs. 2722 */ 2723 xfs_btree_set_numrecs(left, --lrecs); 2724 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2725 2726 xfs_btree_set_numrecs(right, ++rrecs); 2727 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2728 2729 /* 2730 * Using a temporary cursor, update the parent key values of the 2731 * block on the right. 2732 */ 2733 error = xfs_btree_dup_cursor(cur, &tcur); 2734 if (error) 2735 goto error0; 2736 i = xfs_btree_lastrec(tcur, level); 2737 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2738 xfs_btree_mark_sick(cur); 2739 error = -EFSCORRUPTED; 2740 goto error0; 2741 } 2742 2743 error = xfs_btree_increment(tcur, level, &i); 2744 if (error) 2745 goto error1; 2746 2747 /* Update the parent high keys of the left block, if needed. */ 2748 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2749 error = xfs_btree_update_keys(cur, level); 2750 if (error) 2751 goto error1; 2752 } 2753 2754 /* Update the parent keys of the right block. */ 2755 error = xfs_btree_update_keys(tcur, level); 2756 if (error) 2757 goto error1; 2758 2759 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2760 2761 *stat = 1; 2762 return 0; 2763 2764 out0: 2765 *stat = 0; 2766 return 0; 2767 2768 error0: 2769 return error; 2770 2771 error1: 2772 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2773 return error; 2774 } 2775 2776 static inline int 2777 xfs_btree_alloc_block( 2778 struct xfs_btree_cur *cur, 2779 const union xfs_btree_ptr *hint_block, 2780 union xfs_btree_ptr *new_block, 2781 int *stat) 2782 { 2783 int error; 2784 2785 /* 2786 * Don't allow block allocation for a staging cursor, because staging 2787 * cursors do not support regular btree modifications. 2788 * 2789 * Bulk loading uses a separate callback to obtain new blocks from a 2790 * preallocated list, which prevents ENOSPC failures during loading. 2791 */ 2792 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 2793 ASSERT(0); 2794 return -EFSCORRUPTED; 2795 } 2796 2797 error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); 2798 trace_xfs_btree_alloc_block(cur, new_block, *stat, error); 2799 return error; 2800 } 2801 2802 /* 2803 * Split cur/level block in half. 2804 * Return new block number and the key to its first 2805 * record (to be inserted into parent). 2806 */ 2807 STATIC int /* error */ 2808 __xfs_btree_split( 2809 struct xfs_btree_cur *cur, 2810 int level, 2811 union xfs_btree_ptr *ptrp, 2812 union xfs_btree_key *key, 2813 struct xfs_btree_cur **curp, 2814 int *stat) /* success/failure */ 2815 { 2816 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2817 struct xfs_buf *lbp; /* left buffer pointer */ 2818 struct xfs_btree_block *left; /* left btree block */ 2819 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2820 struct xfs_buf *rbp; /* right buffer pointer */ 2821 struct xfs_btree_block *right; /* right btree block */ 2822 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2823 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2824 struct xfs_btree_block *rrblock; /* right-right btree block */ 2825 int lrecs; 2826 int rrecs; 2827 int src_index; 2828 int error; /* error return value */ 2829 int i; 2830 2831 XFS_BTREE_STATS_INC(cur, split); 2832 2833 /* Set up left block (current one). */ 2834 left = xfs_btree_get_block(cur, level, &lbp); 2835 2836 #ifdef DEBUG 2837 error = xfs_btree_check_block(cur, left, level, lbp); 2838 if (error) 2839 goto error0; 2840 #endif 2841 2842 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2843 2844 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2845 error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); 2846 if (error) 2847 goto error0; 2848 if (*stat == 0) 2849 goto out0; 2850 XFS_BTREE_STATS_INC(cur, alloc); 2851 2852 /* Set up the new block as "right". */ 2853 error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); 2854 if (error) 2855 goto error0; 2856 2857 /* Fill in the btree header for the new right block. */ 2858 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2859 2860 /* 2861 * Split the entries between the old and the new block evenly. 2862 * Make sure that if there's an odd number of entries now, that 2863 * each new block will have the same number of entries. 2864 */ 2865 lrecs = xfs_btree_get_numrecs(left); 2866 rrecs = lrecs / 2; 2867 if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) 2868 rrecs++; 2869 src_index = (lrecs - rrecs + 1); 2870 2871 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2872 2873 /* Adjust numrecs for the later get_*_keys() calls. */ 2874 lrecs -= rrecs; 2875 xfs_btree_set_numrecs(left, lrecs); 2876 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2877 2878 /* 2879 * Copy btree block entries from the left block over to the 2880 * new block, the right. Update the right block and log the 2881 * changes. 2882 */ 2883 if (level > 0) { 2884 /* It's a non-leaf. Move keys and pointers. */ 2885 union xfs_btree_key *lkp; /* left btree key */ 2886 union xfs_btree_ptr *lpp; /* left address pointer */ 2887 union xfs_btree_key *rkp; /* right btree key */ 2888 union xfs_btree_ptr *rpp; /* right address pointer */ 2889 2890 lkp = xfs_btree_key_addr(cur, src_index, left); 2891 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2892 rkp = xfs_btree_key_addr(cur, 1, right); 2893 rpp = xfs_btree_ptr_addr(cur, 1, right); 2894 2895 for (i = src_index; i < rrecs; i++) { 2896 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 2897 if (error) 2898 goto error0; 2899 } 2900 2901 /* Copy the keys & pointers to the new block. */ 2902 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2903 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2904 2905 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2906 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2907 2908 /* Stash the keys of the new block for later insertion. */ 2909 xfs_btree_get_node_keys(cur, right, key); 2910 } else { 2911 /* It's a leaf. Move records. */ 2912 union xfs_btree_rec *lrp; /* left record pointer */ 2913 union xfs_btree_rec *rrp; /* right record pointer */ 2914 2915 lrp = xfs_btree_rec_addr(cur, src_index, left); 2916 rrp = xfs_btree_rec_addr(cur, 1, right); 2917 2918 /* Copy records to the new block. */ 2919 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2920 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2921 2922 /* Stash the keys of the new block for later insertion. */ 2923 xfs_btree_get_leaf_keys(cur, right, key); 2924 } 2925 2926 /* 2927 * Find the left block number by looking in the buffer. 2928 * Adjust sibling pointers. 2929 */ 2930 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2931 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2932 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2933 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2934 2935 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2936 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2937 2938 /* 2939 * If there's a block to the new block's right, make that block 2940 * point back to right instead of to left. 2941 */ 2942 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2943 error = xfs_btree_read_buf_block(cur, &rrptr, 2944 0, &rrblock, &rrbp); 2945 if (error) 2946 goto error0; 2947 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2948 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2949 } 2950 2951 /* Update the parent high keys of the left block, if needed. */ 2952 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2953 error = xfs_btree_update_keys(cur, level); 2954 if (error) 2955 goto error0; 2956 } 2957 2958 /* 2959 * If the cursor is really in the right block, move it there. 2960 * If it's just pointing past the last entry in left, then we'll 2961 * insert there, so don't change anything in that case. 2962 */ 2963 if (cur->bc_levels[level].ptr > lrecs + 1) { 2964 xfs_btree_setbuf(cur, level, rbp); 2965 cur->bc_levels[level].ptr -= lrecs; 2966 } 2967 /* 2968 * If there are more levels, we'll need another cursor which refers 2969 * the right block, no matter where this cursor was. 2970 */ 2971 if (level + 1 < cur->bc_nlevels) { 2972 error = xfs_btree_dup_cursor(cur, curp); 2973 if (error) 2974 goto error0; 2975 (*curp)->bc_levels[level + 1].ptr++; 2976 } 2977 *ptrp = rptr; 2978 *stat = 1; 2979 return 0; 2980 out0: 2981 *stat = 0; 2982 return 0; 2983 2984 error0: 2985 return error; 2986 } 2987 2988 #ifdef __KERNEL__ 2989 struct xfs_btree_split_args { 2990 struct xfs_btree_cur *cur; 2991 int level; 2992 union xfs_btree_ptr *ptrp; 2993 union xfs_btree_key *key; 2994 struct xfs_btree_cur **curp; 2995 int *stat; /* success/failure */ 2996 int result; 2997 bool kswapd; /* allocation in kswapd context */ 2998 struct completion *done; 2999 struct work_struct work; 3000 }; 3001 3002 /* 3003 * Stack switching interfaces for allocation 3004 */ 3005 static void 3006 xfs_btree_split_worker( 3007 struct work_struct *work) 3008 { 3009 struct xfs_btree_split_args *args = container_of(work, 3010 struct xfs_btree_split_args, work); 3011 unsigned long pflags; 3012 unsigned long new_pflags = 0; 3013 3014 /* 3015 * we are in a transaction context here, but may also be doing work 3016 * in kswapd context, and hence we may need to inherit that state 3017 * temporarily to ensure that we don't block waiting for memory reclaim 3018 * in any way. 3019 */ 3020 if (args->kswapd) 3021 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 3022 3023 current_set_flags_nested(&pflags, new_pflags); 3024 xfs_trans_set_context(args->cur->bc_tp); 3025 3026 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 3027 args->key, args->curp, args->stat); 3028 3029 xfs_trans_clear_context(args->cur->bc_tp); 3030 current_restore_flags_nested(&pflags, new_pflags); 3031 3032 /* 3033 * Do not access args after complete() has run here. We don't own args 3034 * and the owner may run and free args before we return here. 3035 */ 3036 complete(args->done); 3037 3038 } 3039 3040 /* 3041 * BMBT split requests often come in with little stack to work on so we push 3042 * them off to a worker thread so there is lots of stack to use. For the other 3043 * btree types, just call directly to avoid the context switch overhead here. 3044 * 3045 * Care must be taken here - the work queue rescuer thread introduces potential 3046 * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new 3047 * AGFs to allocate blocks. A task being run by the rescuer could attempt to 3048 * lock an AGF that is already locked by a task queued to run by the rescuer, 3049 * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to 3050 * release it until the current thread it is running gains the lock. 3051 * 3052 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF 3053 * already locked to allocate from. The only place that doesn't hold an AGF 3054 * locked is unwritten extent conversion at IO completion, but that has already 3055 * been offloaded to a worker thread and hence has no stack consumption issues 3056 * we have to worry about. 3057 */ 3058 STATIC int /* error */ 3059 xfs_btree_split( 3060 struct xfs_btree_cur *cur, 3061 int level, 3062 union xfs_btree_ptr *ptrp, 3063 union xfs_btree_key *key, 3064 struct xfs_btree_cur **curp, 3065 int *stat) /* success/failure */ 3066 { 3067 struct xfs_btree_split_args args; 3068 DECLARE_COMPLETION_ONSTACK(done); 3069 3070 if (!xfs_btree_is_bmap(cur->bc_ops) || 3071 cur->bc_tp->t_highest_agno == NULLAGNUMBER) 3072 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 3073 3074 args.cur = cur; 3075 args.level = level; 3076 args.ptrp = ptrp; 3077 args.key = key; 3078 args.curp = curp; 3079 args.stat = stat; 3080 args.done = &done; 3081 args.kswapd = current_is_kswapd(); 3082 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 3083 queue_work(xfs_alloc_wq, &args.work); 3084 wait_for_completion(&done); 3085 destroy_work_on_stack(&args.work); 3086 return args.result; 3087 } 3088 #else 3089 #define xfs_btree_split __xfs_btree_split 3090 #endif /* __KERNEL__ */ 3091 3092 /* Move the records from a root leaf block to a separate block. */ 3093 STATIC void 3094 xfs_btree_promote_leaf_iroot( 3095 struct xfs_btree_cur *cur, 3096 struct xfs_btree_block *block, 3097 struct xfs_buf *cbp, 3098 union xfs_btree_ptr *cptr, 3099 struct xfs_btree_block *cblock) 3100 { 3101 union xfs_btree_rec *rp; 3102 union xfs_btree_rec *crp; 3103 union xfs_btree_key *kp; 3104 union xfs_btree_ptr *pp; 3105 struct xfs_btree_block *broot; 3106 int numrecs = xfs_btree_get_numrecs(block); 3107 3108 /* Copy the records from the leaf broot into the new child block. */ 3109 rp = xfs_btree_rec_addr(cur, 1, block); 3110 crp = xfs_btree_rec_addr(cur, 1, cblock); 3111 xfs_btree_copy_recs(cur, crp, rp, numrecs); 3112 3113 /* 3114 * Increment the tree height. 3115 * 3116 * Trickery here: The amount of memory that we need per record for the 3117 * ifork's btree root block may change when we convert the broot from a 3118 * leaf to a node block. Free the existing leaf broot so that nobody 3119 * thinks we need to migrate node pointers when we realloc the broot 3120 * buffer after bumping nlevels. 3121 */ 3122 cur->bc_ops->broot_realloc(cur, 0); 3123 cur->bc_nlevels++; 3124 cur->bc_levels[1].ptr = 1; 3125 3126 /* 3127 * Allocate a new node broot and initialize it to point to the new 3128 * child block. 3129 */ 3130 broot = cur->bc_ops->broot_realloc(cur, 1); 3131 xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 3132 cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); 3133 3134 pp = xfs_btree_ptr_addr(cur, 1, broot); 3135 kp = xfs_btree_key_addr(cur, 1, broot); 3136 xfs_btree_copy_ptrs(cur, pp, cptr, 1); 3137 xfs_btree_get_keys(cur, cblock, kp); 3138 3139 /* Attach the new block to the cursor and log it. */ 3140 xfs_btree_setbuf(cur, 0, cbp); 3141 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3142 xfs_btree_log_recs(cur, cbp, 1, numrecs); 3143 } 3144 3145 /* 3146 * Move the keys and pointers from a root block to a separate block. 3147 * 3148 * Since the keyptr size does not change, all we have to do is increase the 3149 * tree height, copy the keyptrs to the new internal node (cblock), shrink 3150 * the root, and copy the pointers there. 3151 */ 3152 STATIC int 3153 xfs_btree_promote_node_iroot( 3154 struct xfs_btree_cur *cur, 3155 struct xfs_btree_block *block, 3156 int level, 3157 struct xfs_buf *cbp, 3158 union xfs_btree_ptr *cptr, 3159 struct xfs_btree_block *cblock) 3160 { 3161 union xfs_btree_key *ckp; 3162 union xfs_btree_key *kp; 3163 union xfs_btree_ptr *cpp; 3164 union xfs_btree_ptr *pp; 3165 int i; 3166 int error; 3167 int numrecs = xfs_btree_get_numrecs(block); 3168 3169 /* 3170 * Increase tree height, adjusting the root block level to match. 3171 * We cannot change the root btree node size until we've copied the 3172 * block contents to the new child block. 3173 */ 3174 be16_add_cpu(&block->bb_level, 1); 3175 cur->bc_nlevels++; 3176 cur->bc_levels[level + 1].ptr = 1; 3177 3178 /* 3179 * Adjust the root btree record count, then copy the keys from the old 3180 * root to the new child block. 3181 */ 3182 xfs_btree_set_numrecs(block, 1); 3183 kp = xfs_btree_key_addr(cur, 1, block); 3184 ckp = xfs_btree_key_addr(cur, 1, cblock); 3185 xfs_btree_copy_keys(cur, ckp, kp, numrecs); 3186 3187 /* Check the pointers and copy them to the new child block. */ 3188 pp = xfs_btree_ptr_addr(cur, 1, block); 3189 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3190 for (i = 0; i < numrecs; i++) { 3191 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3192 if (error) 3193 return error; 3194 } 3195 xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); 3196 3197 /* 3198 * Set the first keyptr to point to the new child block, then shrink 3199 * the memory buffer for the root block. 3200 */ 3201 error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); 3202 if (error) 3203 return error; 3204 xfs_btree_copy_ptrs(cur, pp, cptr, 1); 3205 xfs_btree_get_keys(cur, cblock, kp); 3206 3207 cur->bc_ops->broot_realloc(cur, 1); 3208 3209 /* Attach the new block to the cursor and log it. */ 3210 xfs_btree_setbuf(cur, level, cbp); 3211 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3212 xfs_btree_log_keys(cur, cbp, 1, numrecs); 3213 xfs_btree_log_ptrs(cur, cbp, 1, numrecs); 3214 return 0; 3215 } 3216 3217 /* 3218 * Copy the old inode root contents into a real block and make the 3219 * broot point to it. 3220 */ 3221 int /* error */ 3222 xfs_btree_new_iroot( 3223 struct xfs_btree_cur *cur, /* btree cursor */ 3224 int *logflags, /* logging flags for inode */ 3225 int *stat) /* return status - 0 fail */ 3226 { 3227 struct xfs_buf *cbp; /* buffer for cblock */ 3228 struct xfs_btree_block *block; /* btree block */ 3229 struct xfs_btree_block *cblock; /* child btree block */ 3230 union xfs_btree_ptr aptr; 3231 union xfs_btree_ptr nptr; /* new block addr */ 3232 int level; /* btree level */ 3233 int error; /* error return code */ 3234 3235 XFS_BTREE_STATS_INC(cur, newroot); 3236 3237 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3238 3239 level = cur->bc_nlevels - 1; 3240 3241 block = xfs_btree_get_iroot(cur); 3242 ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); 3243 if (level > 0) 3244 aptr = *xfs_btree_ptr_addr(cur, 1, block); 3245 else 3246 aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, 3247 cur->bc_ino.ip->i_ino)); 3248 3249 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3250 error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); 3251 if (error) 3252 goto error0; 3253 if (*stat == 0) 3254 return 0; 3255 3256 XFS_BTREE_STATS_INC(cur, alloc); 3257 3258 /* Copy the root into a real block. */ 3259 error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); 3260 if (error) 3261 goto error0; 3262 3263 /* 3264 * we can't just memcpy() the root in for CRC enabled btree blocks. 3265 * In that case have to also ensure the blkno remains correct 3266 */ 3267 memcpy(cblock, block, xfs_btree_block_len(cur)); 3268 if (xfs_has_crc(cur->bc_mp)) { 3269 __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); 3270 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 3271 cblock->bb_u.l.bb_blkno = bno; 3272 else 3273 cblock->bb_u.s.bb_blkno = bno; 3274 } 3275 3276 if (level > 0) { 3277 error = xfs_btree_promote_node_iroot(cur, block, level, cbp, 3278 &nptr, cblock); 3279 if (error) 3280 goto error0; 3281 } else { 3282 xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); 3283 } 3284 3285 *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3286 *stat = 1; 3287 return 0; 3288 error0: 3289 return error; 3290 } 3291 3292 static void 3293 xfs_btree_set_root( 3294 struct xfs_btree_cur *cur, 3295 const union xfs_btree_ptr *ptr, 3296 int inc) 3297 { 3298 if (cur->bc_flags & XFS_BTREE_STAGING) { 3299 /* Update the btree root information for a per-AG fake root. */ 3300 cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); 3301 cur->bc_ag.afake->af_levels += inc; 3302 } else { 3303 cur->bc_ops->set_root(cur, ptr, inc); 3304 } 3305 } 3306 3307 /* 3308 * Allocate a new root block, fill it in. 3309 */ 3310 STATIC int /* error */ 3311 xfs_btree_new_root( 3312 struct xfs_btree_cur *cur, /* btree cursor */ 3313 int *stat) /* success/failure */ 3314 { 3315 struct xfs_btree_block *block; /* one half of the old root block */ 3316 struct xfs_buf *bp; /* buffer containing block */ 3317 int error; /* error return value */ 3318 struct xfs_buf *lbp; /* left buffer pointer */ 3319 struct xfs_btree_block *left; /* left btree block */ 3320 struct xfs_buf *nbp; /* new (root) buffer */ 3321 struct xfs_btree_block *new; /* new (root) btree block */ 3322 int nptr; /* new value for key index, 1 or 2 */ 3323 struct xfs_buf *rbp; /* right buffer pointer */ 3324 struct xfs_btree_block *right; /* right btree block */ 3325 union xfs_btree_ptr rptr; 3326 union xfs_btree_ptr lptr; 3327 3328 XFS_BTREE_STATS_INC(cur, newroot); 3329 3330 /* initialise our start point from the cursor */ 3331 xfs_btree_init_ptr_from_cur(cur, &rptr); 3332 3333 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3334 error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); 3335 if (error) 3336 goto error0; 3337 if (*stat == 0) 3338 goto out0; 3339 XFS_BTREE_STATS_INC(cur, alloc); 3340 3341 /* Set up the new block. */ 3342 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3343 if (error) 3344 goto error0; 3345 3346 /* Set the root in the holding structure increasing the level by 1. */ 3347 xfs_btree_set_root(cur, &lptr, 1); 3348 3349 /* 3350 * At the previous root level there are now two blocks: the old root, 3351 * and the new block generated when it was split. We don't know which 3352 * one the cursor is pointing at, so we set up variables "left" and 3353 * "right" for each case. 3354 */ 3355 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3356 3357 #ifdef DEBUG 3358 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3359 if (error) 3360 goto error0; 3361 #endif 3362 3363 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3364 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3365 /* Our block is left, pick up the right block. */ 3366 lbp = bp; 3367 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3368 left = block; 3369 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3370 if (error) 3371 goto error0; 3372 bp = rbp; 3373 nptr = 1; 3374 } else { 3375 /* Our block is right, pick up the left block. */ 3376 rbp = bp; 3377 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3378 right = block; 3379 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3380 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3381 if (error) 3382 goto error0; 3383 bp = lbp; 3384 nptr = 2; 3385 } 3386 3387 /* Fill in the new block's btree header and log it. */ 3388 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3389 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3390 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3391 !xfs_btree_ptr_is_null(cur, &rptr)); 3392 3393 /* Fill in the key data in the new root. */ 3394 if (xfs_btree_get_level(left) > 0) { 3395 /* 3396 * Get the keys for the left block's keys and put them directly 3397 * in the parent block. Do the same for the right block. 3398 */ 3399 xfs_btree_get_node_keys(cur, left, 3400 xfs_btree_key_addr(cur, 1, new)); 3401 xfs_btree_get_node_keys(cur, right, 3402 xfs_btree_key_addr(cur, 2, new)); 3403 } else { 3404 /* 3405 * Get the keys for the left block's records and put them 3406 * directly in the parent block. Do the same for the right 3407 * block. 3408 */ 3409 xfs_btree_get_leaf_keys(cur, left, 3410 xfs_btree_key_addr(cur, 1, new)); 3411 xfs_btree_get_leaf_keys(cur, right, 3412 xfs_btree_key_addr(cur, 2, new)); 3413 } 3414 xfs_btree_log_keys(cur, nbp, 1, 2); 3415 3416 /* Fill in the pointer data in the new root. */ 3417 xfs_btree_copy_ptrs(cur, 3418 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3419 xfs_btree_copy_ptrs(cur, 3420 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3421 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3422 3423 /* Fix up the cursor. */ 3424 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3425 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3426 cur->bc_nlevels++; 3427 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3428 *stat = 1; 3429 return 0; 3430 error0: 3431 return error; 3432 out0: 3433 *stat = 0; 3434 return 0; 3435 } 3436 3437 STATIC int 3438 xfs_btree_make_block_unfull( 3439 struct xfs_btree_cur *cur, /* btree cursor */ 3440 int level, /* btree level */ 3441 int numrecs,/* # of recs in block */ 3442 int *oindex,/* old tree index */ 3443 int *index, /* new tree index */ 3444 union xfs_btree_ptr *nptr, /* new btree ptr */ 3445 struct xfs_btree_cur **ncur, /* new btree cursor */ 3446 union xfs_btree_key *key, /* key of new block */ 3447 int *stat) 3448 { 3449 int error = 0; 3450 3451 if (xfs_btree_at_iroot(cur, level)) { 3452 struct xfs_inode *ip = cur->bc_ino.ip; 3453 3454 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3455 /* A root block that can be made bigger. */ 3456 cur->bc_ops->broot_realloc(cur, numrecs + 1); 3457 *stat = 1; 3458 } else { 3459 /* A root block that needs replacing */ 3460 int logflags = 0; 3461 3462 error = xfs_btree_new_iroot(cur, &logflags, stat); 3463 if (error || *stat == 0) 3464 return error; 3465 3466 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3467 } 3468 3469 return 0; 3470 } 3471 3472 /* First, try shifting an entry to the right neighbor. */ 3473 error = xfs_btree_rshift(cur, level, stat); 3474 if (error || *stat) 3475 return error; 3476 3477 /* Next, try shifting an entry to the left neighbor. */ 3478 error = xfs_btree_lshift(cur, level, stat); 3479 if (error) 3480 return error; 3481 3482 if (*stat) { 3483 *oindex = *index = cur->bc_levels[level].ptr; 3484 return 0; 3485 } 3486 3487 /* 3488 * Next, try splitting the current block in half. 3489 * 3490 * If this works we have to re-set our variables because we 3491 * could be in a different block now. 3492 */ 3493 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3494 if (error || *stat == 0) 3495 return error; 3496 3497 3498 *index = cur->bc_levels[level].ptr; 3499 return 0; 3500 } 3501 3502 /* 3503 * Insert one record/level. Return information to the caller 3504 * allowing the next level up to proceed if necessary. 3505 */ 3506 STATIC int 3507 xfs_btree_insrec( 3508 struct xfs_btree_cur *cur, /* btree cursor */ 3509 int level, /* level to insert record at */ 3510 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3511 union xfs_btree_rec *rec, /* record to insert */ 3512 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3513 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3514 int *stat) /* success/failure */ 3515 { 3516 struct xfs_btree_block *block; /* btree block */ 3517 struct xfs_buf *bp; /* buffer for block */ 3518 union xfs_btree_ptr nptr; /* new block ptr */ 3519 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3520 union xfs_btree_key nkey; /* new block key */ 3521 union xfs_btree_key *lkey; 3522 int optr; /* old key/record index */ 3523 int ptr; /* key/record index */ 3524 int numrecs;/* number of records */ 3525 int error; /* error return value */ 3526 int i; 3527 xfs_daddr_t old_bn; 3528 3529 ncur = NULL; 3530 lkey = &nkey; 3531 3532 /* 3533 * If we have an external root pointer, and we've made it to the 3534 * root level, allocate a new root block and we're done. 3535 */ 3536 if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && 3537 level >= cur->bc_nlevels) { 3538 error = xfs_btree_new_root(cur, stat); 3539 xfs_btree_set_ptr_null(cur, ptrp); 3540 3541 return error; 3542 } 3543 3544 /* If we're off the left edge, return failure. */ 3545 ptr = cur->bc_levels[level].ptr; 3546 if (ptr == 0) { 3547 *stat = 0; 3548 return 0; 3549 } 3550 3551 optr = ptr; 3552 3553 XFS_BTREE_STATS_INC(cur, insrec); 3554 3555 /* Get pointers to the btree buffer and block. */ 3556 block = xfs_btree_get_block(cur, level, &bp); 3557 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3558 numrecs = xfs_btree_get_numrecs(block); 3559 3560 #ifdef DEBUG 3561 error = xfs_btree_check_block(cur, block, level, bp); 3562 if (error) 3563 goto error0; 3564 3565 /* Check that the new entry is being inserted in the right place. */ 3566 if (ptr <= numrecs) { 3567 if (level == 0) { 3568 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3569 xfs_btree_rec_addr(cur, ptr, block))); 3570 } else { 3571 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3572 xfs_btree_key_addr(cur, ptr, block))); 3573 } 3574 } 3575 #endif 3576 3577 /* 3578 * If the block is full, we can't insert the new entry until we 3579 * make the block un-full. 3580 */ 3581 xfs_btree_set_ptr_null(cur, &nptr); 3582 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3583 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3584 &optr, &ptr, &nptr, &ncur, lkey, stat); 3585 if (error || *stat == 0) 3586 goto error0; 3587 } 3588 3589 /* 3590 * The current block may have changed if the block was 3591 * previously full and we have just made space in it. 3592 */ 3593 block = xfs_btree_get_block(cur, level, &bp); 3594 numrecs = xfs_btree_get_numrecs(block); 3595 3596 #ifdef DEBUG 3597 error = xfs_btree_check_block(cur, block, level, bp); 3598 if (error) 3599 goto error0; 3600 #endif 3601 3602 /* 3603 * At this point we know there's room for our new entry in the block 3604 * we're pointing at. 3605 */ 3606 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3607 3608 if (level > 0) { 3609 /* It's a nonleaf. make a hole in the keys and ptrs */ 3610 union xfs_btree_key *kp; 3611 union xfs_btree_ptr *pp; 3612 3613 kp = xfs_btree_key_addr(cur, ptr, block); 3614 pp = xfs_btree_ptr_addr(cur, ptr, block); 3615 3616 for (i = numrecs - ptr; i >= 0; i--) { 3617 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3618 if (error) 3619 goto error0; 3620 } 3621 3622 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3623 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3624 3625 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3626 if (error) 3627 goto error0; 3628 3629 /* Now put the new data in, bump numrecs and log it. */ 3630 xfs_btree_copy_keys(cur, kp, key, 1); 3631 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3632 numrecs++; 3633 xfs_btree_set_numrecs(block, numrecs); 3634 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3635 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3636 #ifdef DEBUG 3637 if (ptr < numrecs) { 3638 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3639 xfs_btree_key_addr(cur, ptr + 1, block))); 3640 } 3641 #endif 3642 } else { 3643 /* It's a leaf. make a hole in the records */ 3644 union xfs_btree_rec *rp; 3645 3646 rp = xfs_btree_rec_addr(cur, ptr, block); 3647 3648 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3649 3650 /* Now put the new data in, bump numrecs and log it. */ 3651 xfs_btree_copy_recs(cur, rp, rec, 1); 3652 xfs_btree_set_numrecs(block, ++numrecs); 3653 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3654 #ifdef DEBUG 3655 if (ptr < numrecs) { 3656 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3657 xfs_btree_rec_addr(cur, ptr + 1, block))); 3658 } 3659 #endif 3660 } 3661 3662 /* Log the new number of records in the btree header. */ 3663 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3664 3665 /* 3666 * Update btree keys to reflect the newly added record or keyptr. 3667 * There are three cases here to be aware of. Normally, all we have to 3668 * do is walk towards the root, updating keys as necessary. 3669 * 3670 * If the caller had us target a full block for the insertion, we dealt 3671 * with that by calling the _make_block_unfull function. If the 3672 * "make unfull" function splits the block, it'll hand us back the key 3673 * and pointer of the new block. We haven't yet added the new block to 3674 * the next level up, so if we decide to add the new record to the new 3675 * block (bp->b_bn != old_bn), we have to update the caller's pointer 3676 * so that the caller adds the new block with the correct key. 3677 * 3678 * However, there is a third possibility-- if the selected block is the 3679 * root block of an inode-rooted btree and cannot be expanded further, 3680 * the "make unfull" function moves the root block contents to a new 3681 * block and updates the root block to point to the new block. In this 3682 * case, no block pointer is passed back because the block has already 3683 * been added to the btree. In this case, we need to use the regular 3684 * key update function, just like the first case. This is critical for 3685 * overlapping btrees, because the high key must be updated to reflect 3686 * the entire tree, not just the subtree accessible through the first 3687 * child of the root (which is now two levels down from the root). 3688 */ 3689 if (!xfs_btree_ptr_is_null(cur, &nptr) && 3690 bp && xfs_buf_daddr(bp) != old_bn) { 3691 xfs_btree_get_keys(cur, block, lkey); 3692 } else if (xfs_btree_needs_key_update(cur, optr)) { 3693 error = xfs_btree_update_keys(cur, level); 3694 if (error) 3695 goto error0; 3696 } 3697 3698 /* 3699 * Return the new block number, if any. 3700 * If there is one, give back a record value and a cursor too. 3701 */ 3702 *ptrp = nptr; 3703 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3704 xfs_btree_copy_keys(cur, key, lkey, 1); 3705 *curp = ncur; 3706 } 3707 3708 *stat = 1; 3709 return 0; 3710 3711 error0: 3712 if (ncur) 3713 xfs_btree_del_cursor(ncur, error); 3714 return error; 3715 } 3716 3717 /* 3718 * Insert the record at the point referenced by cur. 3719 * 3720 * A multi-level split of the tree on insert will invalidate the original 3721 * cursor. All callers of this function should assume that the cursor is 3722 * no longer valid and revalidate it. 3723 */ 3724 int 3725 xfs_btree_insert( 3726 struct xfs_btree_cur *cur, 3727 int *stat) 3728 { 3729 int error; /* error return value */ 3730 int i; /* result value, 0 for failure */ 3731 int level; /* current level number in btree */ 3732 union xfs_btree_ptr nptr; /* new block number (split result) */ 3733 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3734 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3735 union xfs_btree_key bkey; /* key of block to insert */ 3736 union xfs_btree_key *key; 3737 union xfs_btree_rec rec; /* record to insert */ 3738 3739 level = 0; 3740 ncur = NULL; 3741 pcur = cur; 3742 key = &bkey; 3743 3744 xfs_btree_set_ptr_null(cur, &nptr); 3745 3746 /* Make a key out of the record data to be inserted, and save it. */ 3747 cur->bc_ops->init_rec_from_cur(cur, &rec); 3748 cur->bc_ops->init_key_from_rec(key, &rec); 3749 3750 /* 3751 * Loop going up the tree, starting at the leaf level. 3752 * Stop when we don't get a split block, that must mean that 3753 * the insert is finished with this level. 3754 */ 3755 do { 3756 /* 3757 * Insert nrec/nptr into this level of the tree. 3758 * Note if we fail, nptr will be null. 3759 */ 3760 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3761 &ncur, &i); 3762 if (error) { 3763 if (pcur != cur) 3764 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3765 goto error0; 3766 } 3767 3768 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3769 xfs_btree_mark_sick(cur); 3770 error = -EFSCORRUPTED; 3771 goto error0; 3772 } 3773 level++; 3774 3775 /* 3776 * See if the cursor we just used is trash. 3777 * Can't trash the caller's cursor, but otherwise we should 3778 * if ncur is a new cursor or we're about to be done. 3779 */ 3780 if (pcur != cur && 3781 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3782 /* Save the state from the cursor before we trash it */ 3783 if (cur->bc_ops->update_cursor && 3784 !(cur->bc_flags & XFS_BTREE_STAGING)) 3785 cur->bc_ops->update_cursor(pcur, cur); 3786 cur->bc_nlevels = pcur->bc_nlevels; 3787 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3788 } 3789 /* If we got a new cursor, switch to it. */ 3790 if (ncur) { 3791 pcur = ncur; 3792 ncur = NULL; 3793 } 3794 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3795 3796 *stat = i; 3797 return 0; 3798 error0: 3799 return error; 3800 } 3801 3802 /* Move the records from a child leaf block to the root block. */ 3803 STATIC void 3804 xfs_btree_demote_leaf_child( 3805 struct xfs_btree_cur *cur, 3806 struct xfs_btree_block *cblock, 3807 int numrecs) 3808 { 3809 union xfs_btree_rec *rp; 3810 union xfs_btree_rec *crp; 3811 struct xfs_btree_block *broot; 3812 3813 /* 3814 * Decrease the tree height. 3815 * 3816 * Trickery here: The amount of memory that we need per record for the 3817 * ifork's btree root block may change when we convert the broot from a 3818 * node to a leaf. Free the old node broot so that we can get a fresh 3819 * leaf broot. 3820 */ 3821 cur->bc_ops->broot_realloc(cur, 0); 3822 cur->bc_nlevels--; 3823 3824 /* 3825 * Allocate a new leaf broot and copy the records from the old child. 3826 * Detach the old child from the cursor. 3827 */ 3828 broot = cur->bc_ops->broot_realloc(cur, numrecs); 3829 xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, 3830 cur->bc_ino.ip->i_ino); 3831 3832 rp = xfs_btree_rec_addr(cur, 1, broot); 3833 crp = xfs_btree_rec_addr(cur, 1, cblock); 3834 xfs_btree_copy_recs(cur, rp, crp, numrecs); 3835 3836 cur->bc_levels[0].bp = NULL; 3837 } 3838 3839 /* 3840 * Move the keyptrs from a child node block to the root block. 3841 * 3842 * Since the keyptr size does not change, all we have to do is increase the 3843 * tree height, copy the keyptrs to the new internal node (cblock), shrink 3844 * the root, and copy the pointers there. 3845 */ 3846 STATIC int 3847 xfs_btree_demote_node_child( 3848 struct xfs_btree_cur *cur, 3849 struct xfs_btree_block *cblock, 3850 int level, 3851 int numrecs) 3852 { 3853 struct xfs_btree_block *block; 3854 union xfs_btree_key *ckp; 3855 union xfs_btree_key *kp; 3856 union xfs_btree_ptr *cpp; 3857 union xfs_btree_ptr *pp; 3858 int i; 3859 int error; 3860 3861 /* 3862 * Adjust the root btree node size and the record count to match the 3863 * doomed child so that we can copy the keyptrs ahead of changing the 3864 * tree shape. 3865 */ 3866 block = cur->bc_ops->broot_realloc(cur, numrecs); 3867 3868 xfs_btree_set_numrecs(block, numrecs); 3869 ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3870 3871 /* Copy keys from the doomed block. */ 3872 kp = xfs_btree_key_addr(cur, 1, block); 3873 ckp = xfs_btree_key_addr(cur, 1, cblock); 3874 xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3875 3876 /* Copy pointers from the doomed block. */ 3877 pp = xfs_btree_ptr_addr(cur, 1, block); 3878 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3879 for (i = 0; i < numrecs; i++) { 3880 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3881 if (error) 3882 return error; 3883 } 3884 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3885 3886 /* Decrease tree height, adjusting the root block level to match. */ 3887 cur->bc_levels[level - 1].bp = NULL; 3888 be16_add_cpu(&block->bb_level, -1); 3889 cur->bc_nlevels--; 3890 return 0; 3891 } 3892 3893 /* 3894 * Try to merge a non-leaf block back into the inode root. 3895 * 3896 * Note: the killroot names comes from the fact that we're effectively 3897 * killing the old root block. But because we can't just delete the 3898 * inode we have to copy the single block it was pointing to into the 3899 * inode. 3900 */ 3901 STATIC int 3902 xfs_btree_kill_iroot( 3903 struct xfs_btree_cur *cur) 3904 { 3905 struct xfs_inode *ip = cur->bc_ino.ip; 3906 struct xfs_btree_block *block; 3907 struct xfs_btree_block *cblock; 3908 struct xfs_buf *cbp; 3909 int level; 3910 int numrecs; 3911 int error; 3912 #ifdef DEBUG 3913 union xfs_btree_ptr ptr; 3914 #endif 3915 3916 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3917 ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) || 3918 cur->bc_nlevels > 1); 3919 3920 /* 3921 * Don't deal with the root block needs to be a leaf case. 3922 * We're just going to turn the thing back into extents anyway. 3923 */ 3924 level = cur->bc_nlevels - 1; 3925 if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) 3926 goto out0; 3927 3928 /* If we're already a leaf, jump out. */ 3929 if (level == 0) 3930 goto out0; 3931 3932 /* 3933 * Give up if the root has multiple children. 3934 */ 3935 block = xfs_btree_get_iroot(cur); 3936 if (xfs_btree_get_numrecs(block) != 1) 3937 goto out0; 3938 3939 cblock = xfs_btree_get_block(cur, level - 1, &cbp); 3940 numrecs = xfs_btree_get_numrecs(cblock); 3941 3942 /* 3943 * Only do this if the next level will fit. 3944 * Then the data must be copied up to the inode, 3945 * instead of freeing the root you free the next level. 3946 */ 3947 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) 3948 goto out0; 3949 3950 XFS_BTREE_STATS_INC(cur, killroot); 3951 3952 #ifdef DEBUG 3953 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 3954 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3955 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 3956 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3957 #endif 3958 3959 if (level > 1) { 3960 error = xfs_btree_demote_node_child(cur, cblock, level, 3961 numrecs); 3962 if (error) 3963 return error; 3964 } else 3965 xfs_btree_demote_leaf_child(cur, cblock, numrecs); 3966 3967 error = xfs_btree_free_block(cur, cbp); 3968 if (error) 3969 return error; 3970 3971 xfs_trans_log_inode(cur->bc_tp, ip, 3972 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); 3973 out0: 3974 return 0; 3975 } 3976 3977 /* 3978 * Kill the current root node, and replace it with it's only child node. 3979 */ 3980 STATIC int 3981 xfs_btree_kill_root( 3982 struct xfs_btree_cur *cur, 3983 struct xfs_buf *bp, 3984 int level, 3985 union xfs_btree_ptr *newroot) 3986 { 3987 int error; 3988 3989 XFS_BTREE_STATS_INC(cur, killroot); 3990 3991 /* 3992 * Update the root pointer, decreasing the level by 1 and then 3993 * free the old root. 3994 */ 3995 xfs_btree_set_root(cur, newroot, -1); 3996 3997 error = xfs_btree_free_block(cur, bp); 3998 if (error) 3999 return error; 4000 4001 cur->bc_levels[level].bp = NULL; 4002 cur->bc_levels[level].ra = 0; 4003 cur->bc_nlevels--; 4004 4005 return 0; 4006 } 4007 4008 STATIC int 4009 xfs_btree_dec_cursor( 4010 struct xfs_btree_cur *cur, 4011 int level, 4012 int *stat) 4013 { 4014 int error; 4015 int i; 4016 4017 if (level > 0) { 4018 error = xfs_btree_decrement(cur, level, &i); 4019 if (error) 4020 return error; 4021 } 4022 4023 *stat = 1; 4024 return 0; 4025 } 4026 4027 /* 4028 * Single level of the btree record deletion routine. 4029 * Delete record pointed to by cur/level. 4030 * Remove the record from its block then rebalance the tree. 4031 * Return 0 for error, 1 for done, 2 to go on to the next level. 4032 */ 4033 STATIC int /* error */ 4034 xfs_btree_delrec( 4035 struct xfs_btree_cur *cur, /* btree cursor */ 4036 int level, /* level removing record from */ 4037 int *stat) /* fail/done/go-on */ 4038 { 4039 struct xfs_btree_block *block; /* btree block */ 4040 union xfs_btree_ptr cptr; /* current block ptr */ 4041 struct xfs_buf *bp; /* buffer for block */ 4042 int error; /* error return value */ 4043 int i; /* loop counter */ 4044 union xfs_btree_ptr lptr; /* left sibling block ptr */ 4045 struct xfs_buf *lbp; /* left buffer pointer */ 4046 struct xfs_btree_block *left; /* left btree block */ 4047 int lrecs = 0; /* left record count */ 4048 int ptr; /* key/record index */ 4049 union xfs_btree_ptr rptr; /* right sibling block ptr */ 4050 struct xfs_buf *rbp; /* right buffer pointer */ 4051 struct xfs_btree_block *right; /* right btree block */ 4052 struct xfs_btree_block *rrblock; /* right-right btree block */ 4053 struct xfs_buf *rrbp; /* right-right buffer pointer */ 4054 int rrecs = 0; /* right record count */ 4055 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 4056 int numrecs; /* temporary numrec count */ 4057 4058 tcur = NULL; 4059 4060 /* Get the index of the entry being deleted, check for nothing there. */ 4061 ptr = cur->bc_levels[level].ptr; 4062 if (ptr == 0) { 4063 *stat = 0; 4064 return 0; 4065 } 4066 4067 /* Get the buffer & block containing the record or key/ptr. */ 4068 block = xfs_btree_get_block(cur, level, &bp); 4069 numrecs = xfs_btree_get_numrecs(block); 4070 4071 #ifdef DEBUG 4072 error = xfs_btree_check_block(cur, block, level, bp); 4073 if (error) 4074 goto error0; 4075 #endif 4076 4077 /* Fail if we're off the end of the block. */ 4078 if (ptr > numrecs) { 4079 *stat = 0; 4080 return 0; 4081 } 4082 4083 XFS_BTREE_STATS_INC(cur, delrec); 4084 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 4085 4086 /* Excise the entries being deleted. */ 4087 if (level > 0) { 4088 /* It's a nonleaf. operate on keys and ptrs */ 4089 union xfs_btree_key *lkp; 4090 union xfs_btree_ptr *lpp; 4091 4092 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 4093 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 4094 4095 for (i = 0; i < numrecs - ptr; i++) { 4096 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 4097 if (error) 4098 goto error0; 4099 } 4100 4101 if (ptr < numrecs) { 4102 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 4103 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 4104 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 4105 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 4106 } 4107 } else { 4108 /* It's a leaf. operate on records */ 4109 if (ptr < numrecs) { 4110 xfs_btree_shift_recs(cur, 4111 xfs_btree_rec_addr(cur, ptr + 1, block), 4112 -1, numrecs - ptr); 4113 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 4114 } 4115 } 4116 4117 /* 4118 * Decrement and log the number of entries in the block. 4119 */ 4120 xfs_btree_set_numrecs(block, --numrecs); 4121 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 4122 4123 /* 4124 * We're at the root level. First, shrink the root block in-memory. 4125 * Try to get rid of the next level down. If we can't then there's 4126 * nothing left to do. numrecs was decremented above. 4127 */ 4128 if (xfs_btree_at_iroot(cur, level)) { 4129 cur->bc_ops->broot_realloc(cur, numrecs); 4130 4131 error = xfs_btree_kill_iroot(cur); 4132 if (error) 4133 goto error0; 4134 4135 error = xfs_btree_dec_cursor(cur, level, stat); 4136 if (error) 4137 goto error0; 4138 *stat = 1; 4139 return 0; 4140 } 4141 4142 /* 4143 * If this is the root level, and there's only one entry left, and it's 4144 * NOT the leaf level, then we can get rid of this level. 4145 */ 4146 if (level == cur->bc_nlevels - 1) { 4147 if (numrecs == 1 && level > 0) { 4148 union xfs_btree_ptr *pp; 4149 /* 4150 * pp is still set to the first pointer in the block. 4151 * Make it the new root of the btree. 4152 */ 4153 pp = xfs_btree_ptr_addr(cur, 1, block); 4154 error = xfs_btree_kill_root(cur, bp, level, pp); 4155 if (error) 4156 goto error0; 4157 } else if (level > 0) { 4158 error = xfs_btree_dec_cursor(cur, level, stat); 4159 if (error) 4160 goto error0; 4161 } 4162 *stat = 1; 4163 return 0; 4164 } 4165 4166 /* 4167 * If we deleted the leftmost entry in the block, update the 4168 * key values above us in the tree. 4169 */ 4170 if (xfs_btree_needs_key_update(cur, ptr)) { 4171 error = xfs_btree_update_keys(cur, level); 4172 if (error) 4173 goto error0; 4174 } 4175 4176 /* 4177 * If the number of records remaining in the block is at least 4178 * the minimum, we're done. 4179 */ 4180 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 4181 error = xfs_btree_dec_cursor(cur, level, stat); 4182 if (error) 4183 goto error0; 4184 return 0; 4185 } 4186 4187 /* 4188 * Otherwise, we have to move some records around to keep the 4189 * tree balanced. Look at the left and right sibling blocks to 4190 * see if we can re-balance by moving only one record. 4191 */ 4192 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4193 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 4194 4195 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 4196 /* 4197 * One child of root, need to get a chance to copy its contents 4198 * into the root and delete it. Can't go up to next level, 4199 * there's nothing to delete there. 4200 */ 4201 if (xfs_btree_ptr_is_null(cur, &rptr) && 4202 xfs_btree_ptr_is_null(cur, &lptr) && 4203 level == cur->bc_nlevels - 2) { 4204 error = xfs_btree_kill_iroot(cur); 4205 if (!error) 4206 error = xfs_btree_dec_cursor(cur, level, stat); 4207 if (error) 4208 goto error0; 4209 return 0; 4210 } 4211 } 4212 4213 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 4214 !xfs_btree_ptr_is_null(cur, &lptr)); 4215 4216 /* 4217 * Duplicate the cursor so our btree manipulations here won't 4218 * disrupt the next level up. 4219 */ 4220 error = xfs_btree_dup_cursor(cur, &tcur); 4221 if (error) 4222 goto error0; 4223 4224 /* 4225 * If there's a right sibling, see if it's ok to shift an entry 4226 * out of it. 4227 */ 4228 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 4229 /* 4230 * Move the temp cursor to the last entry in the next block. 4231 * Actually any entry but the first would suffice. 4232 */ 4233 i = xfs_btree_lastrec(tcur, level); 4234 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4235 xfs_btree_mark_sick(cur); 4236 error = -EFSCORRUPTED; 4237 goto error0; 4238 } 4239 4240 error = xfs_btree_increment(tcur, level, &i); 4241 if (error) 4242 goto error0; 4243 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4244 xfs_btree_mark_sick(cur); 4245 error = -EFSCORRUPTED; 4246 goto error0; 4247 } 4248 4249 i = xfs_btree_lastrec(tcur, level); 4250 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4251 xfs_btree_mark_sick(cur); 4252 error = -EFSCORRUPTED; 4253 goto error0; 4254 } 4255 4256 /* Grab a pointer to the block. */ 4257 right = xfs_btree_get_block(tcur, level, &rbp); 4258 #ifdef DEBUG 4259 error = xfs_btree_check_block(tcur, right, level, rbp); 4260 if (error) 4261 goto error0; 4262 #endif 4263 /* Grab the current block number, for future use. */ 4264 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 4265 4266 /* 4267 * If right block is full enough so that removing one entry 4268 * won't make it too empty, and left-shifting an entry out 4269 * of right to us works, we're done. 4270 */ 4271 if (xfs_btree_get_numrecs(right) - 1 >= 4272 cur->bc_ops->get_minrecs(tcur, level)) { 4273 error = xfs_btree_lshift(tcur, level, &i); 4274 if (error) 4275 goto error0; 4276 if (i) { 4277 ASSERT(xfs_btree_get_numrecs(block) >= 4278 cur->bc_ops->get_minrecs(tcur, level)); 4279 4280 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4281 tcur = NULL; 4282 4283 error = xfs_btree_dec_cursor(cur, level, stat); 4284 if (error) 4285 goto error0; 4286 return 0; 4287 } 4288 } 4289 4290 /* 4291 * Otherwise, grab the number of records in right for 4292 * future reference, and fix up the temp cursor to point 4293 * to our block again (last record). 4294 */ 4295 rrecs = xfs_btree_get_numrecs(right); 4296 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4297 i = xfs_btree_firstrec(tcur, level); 4298 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4299 xfs_btree_mark_sick(cur); 4300 error = -EFSCORRUPTED; 4301 goto error0; 4302 } 4303 4304 error = xfs_btree_decrement(tcur, level, &i); 4305 if (error) 4306 goto error0; 4307 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4308 xfs_btree_mark_sick(cur); 4309 error = -EFSCORRUPTED; 4310 goto error0; 4311 } 4312 } 4313 } 4314 4315 /* 4316 * If there's a left sibling, see if it's ok to shift an entry 4317 * out of it. 4318 */ 4319 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4320 /* 4321 * Move the temp cursor to the first entry in the 4322 * previous block. 4323 */ 4324 i = xfs_btree_firstrec(tcur, level); 4325 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4326 xfs_btree_mark_sick(cur); 4327 error = -EFSCORRUPTED; 4328 goto error0; 4329 } 4330 4331 error = xfs_btree_decrement(tcur, level, &i); 4332 if (error) 4333 goto error0; 4334 i = xfs_btree_firstrec(tcur, level); 4335 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4336 xfs_btree_mark_sick(cur); 4337 error = -EFSCORRUPTED; 4338 goto error0; 4339 } 4340 4341 /* Grab a pointer to the block. */ 4342 left = xfs_btree_get_block(tcur, level, &lbp); 4343 #ifdef DEBUG 4344 error = xfs_btree_check_block(cur, left, level, lbp); 4345 if (error) 4346 goto error0; 4347 #endif 4348 /* Grab the current block number, for future use. */ 4349 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4350 4351 /* 4352 * If left block is full enough so that removing one entry 4353 * won't make it too empty, and right-shifting an entry out 4354 * of left to us works, we're done. 4355 */ 4356 if (xfs_btree_get_numrecs(left) - 1 >= 4357 cur->bc_ops->get_minrecs(tcur, level)) { 4358 error = xfs_btree_rshift(tcur, level, &i); 4359 if (error) 4360 goto error0; 4361 if (i) { 4362 ASSERT(xfs_btree_get_numrecs(block) >= 4363 cur->bc_ops->get_minrecs(tcur, level)); 4364 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4365 tcur = NULL; 4366 if (level == 0) 4367 cur->bc_levels[0].ptr++; 4368 4369 *stat = 1; 4370 return 0; 4371 } 4372 } 4373 4374 /* 4375 * Otherwise, grab the number of records in right for 4376 * future reference. 4377 */ 4378 lrecs = xfs_btree_get_numrecs(left); 4379 } 4380 4381 /* Delete the temp cursor, we're done with it. */ 4382 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4383 tcur = NULL; 4384 4385 /* If here, we need to do a join to keep the tree balanced. */ 4386 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4387 4388 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4389 lrecs + xfs_btree_get_numrecs(block) <= 4390 cur->bc_ops->get_maxrecs(cur, level)) { 4391 /* 4392 * Set "right" to be the starting block, 4393 * "left" to be the left neighbor. 4394 */ 4395 rptr = cptr; 4396 right = block; 4397 rbp = bp; 4398 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4399 if (error) 4400 goto error0; 4401 4402 /* 4403 * If that won't work, see if we can join with the right neighbor block. 4404 */ 4405 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4406 rrecs + xfs_btree_get_numrecs(block) <= 4407 cur->bc_ops->get_maxrecs(cur, level)) { 4408 /* 4409 * Set "left" to be the starting block, 4410 * "right" to be the right neighbor. 4411 */ 4412 lptr = cptr; 4413 left = block; 4414 lbp = bp; 4415 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4416 if (error) 4417 goto error0; 4418 4419 /* 4420 * Otherwise, we can't fix the imbalance. 4421 * Just return. This is probably a logic error, but it's not fatal. 4422 */ 4423 } else { 4424 error = xfs_btree_dec_cursor(cur, level, stat); 4425 if (error) 4426 goto error0; 4427 return 0; 4428 } 4429 4430 rrecs = xfs_btree_get_numrecs(right); 4431 lrecs = xfs_btree_get_numrecs(left); 4432 4433 /* 4434 * We're now going to join "left" and "right" by moving all the stuff 4435 * in "right" to "left" and deleting "right". 4436 */ 4437 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4438 if (level > 0) { 4439 /* It's a non-leaf. Move keys and pointers. */ 4440 union xfs_btree_key *lkp; /* left btree key */ 4441 union xfs_btree_ptr *lpp; /* left address pointer */ 4442 union xfs_btree_key *rkp; /* right btree key */ 4443 union xfs_btree_ptr *rpp; /* right address pointer */ 4444 4445 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4446 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4447 rkp = xfs_btree_key_addr(cur, 1, right); 4448 rpp = xfs_btree_ptr_addr(cur, 1, right); 4449 4450 for (i = 1; i < rrecs; i++) { 4451 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 4452 if (error) 4453 goto error0; 4454 } 4455 4456 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4457 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4458 4459 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4460 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4461 } else { 4462 /* It's a leaf. Move records. */ 4463 union xfs_btree_rec *lrp; /* left record pointer */ 4464 union xfs_btree_rec *rrp; /* right record pointer */ 4465 4466 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4467 rrp = xfs_btree_rec_addr(cur, 1, right); 4468 4469 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4470 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4471 } 4472 4473 XFS_BTREE_STATS_INC(cur, join); 4474 4475 /* 4476 * Fix up the number of records and right block pointer in the 4477 * surviving block, and log it. 4478 */ 4479 xfs_btree_set_numrecs(left, lrecs + rrecs); 4480 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); 4481 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4482 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4483 4484 /* If there is a right sibling, point it to the remaining block. */ 4485 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4486 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4487 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4488 if (error) 4489 goto error0; 4490 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4491 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4492 } 4493 4494 /* Free the deleted block. */ 4495 error = xfs_btree_free_block(cur, rbp); 4496 if (error) 4497 goto error0; 4498 4499 /* 4500 * If we joined with the left neighbor, set the buffer in the 4501 * cursor to the left block, and fix up the index. 4502 */ 4503 if (bp != lbp) { 4504 cur->bc_levels[level].bp = lbp; 4505 cur->bc_levels[level].ptr += lrecs; 4506 cur->bc_levels[level].ra = 0; 4507 } 4508 /* 4509 * If we joined with the right neighbor and there's a level above 4510 * us, increment the cursor at that level. 4511 */ 4512 else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || 4513 level + 1 < cur->bc_nlevels) { 4514 error = xfs_btree_increment(cur, level + 1, &i); 4515 if (error) 4516 goto error0; 4517 } 4518 4519 /* 4520 * Readjust the ptr at this level if it's not a leaf, since it's 4521 * still pointing at the deletion point, which makes the cursor 4522 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4523 * We can't use decrement because it would change the next level up. 4524 */ 4525 if (level > 0) 4526 cur->bc_levels[level].ptr--; 4527 4528 /* 4529 * We combined blocks, so we have to update the parent keys if the 4530 * btree supports overlapped intervals. However, 4531 * bc_levels[level + 1].ptr points to the old block so that the caller 4532 * knows which record to delete. Therefore, the caller must be savvy 4533 * enough to call updkeys for us if we return stat == 2. The other 4534 * exit points from this function don't require deletions further up 4535 * the tree, so they can call updkeys directly. 4536 */ 4537 4538 /* Return value means the next level up has something to do. */ 4539 *stat = 2; 4540 return 0; 4541 4542 error0: 4543 if (tcur) 4544 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4545 return error; 4546 } 4547 4548 /* 4549 * Delete the record pointed to by cur. 4550 * The cursor refers to the place where the record was (could be inserted) 4551 * when the operation returns. 4552 */ 4553 int /* error */ 4554 xfs_btree_delete( 4555 struct xfs_btree_cur *cur, 4556 int *stat) /* success/failure */ 4557 { 4558 int error; /* error return value */ 4559 int level; 4560 int i; 4561 bool joined = false; 4562 4563 /* 4564 * Go up the tree, starting at leaf level. 4565 * 4566 * If 2 is returned then a join was done; go to the next level. 4567 * Otherwise we are done. 4568 */ 4569 for (level = 0, i = 2; i == 2; level++) { 4570 error = xfs_btree_delrec(cur, level, &i); 4571 if (error) 4572 goto error0; 4573 if (i == 2) 4574 joined = true; 4575 } 4576 4577 /* 4578 * If we combined blocks as part of deleting the record, delrec won't 4579 * have updated the parent high keys so we have to do that here. 4580 */ 4581 if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { 4582 error = xfs_btree_updkeys_force(cur, 0); 4583 if (error) 4584 goto error0; 4585 } 4586 4587 if (i == 0) { 4588 for (level = 1; level < cur->bc_nlevels; level++) { 4589 if (cur->bc_levels[level].ptr == 0) { 4590 error = xfs_btree_decrement(cur, level, &i); 4591 if (error) 4592 goto error0; 4593 break; 4594 } 4595 } 4596 } 4597 4598 *stat = i; 4599 return 0; 4600 error0: 4601 return error; 4602 } 4603 4604 /* 4605 * Get the data from the pointed-to record. 4606 */ 4607 int /* error */ 4608 xfs_btree_get_rec( 4609 struct xfs_btree_cur *cur, /* btree cursor */ 4610 union xfs_btree_rec **recp, /* output: btree record */ 4611 int *stat) /* output: success/failure */ 4612 { 4613 struct xfs_btree_block *block; /* btree block */ 4614 struct xfs_buf *bp; /* buffer pointer */ 4615 int ptr; /* record number */ 4616 #ifdef DEBUG 4617 int error; /* error return value */ 4618 #endif 4619 4620 ptr = cur->bc_levels[0].ptr; 4621 block = xfs_btree_get_block(cur, 0, &bp); 4622 4623 #ifdef DEBUG 4624 error = xfs_btree_check_block(cur, block, 0, bp); 4625 if (error) 4626 return error; 4627 #endif 4628 4629 /* 4630 * Off the right end or left end, return failure. 4631 */ 4632 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4633 *stat = 0; 4634 return 0; 4635 } 4636 4637 /* 4638 * Point to the record and extract its data. 4639 */ 4640 *recp = xfs_btree_rec_addr(cur, ptr, block); 4641 *stat = 1; 4642 return 0; 4643 } 4644 4645 /* Visit a block in a btree. */ 4646 STATIC int 4647 xfs_btree_visit_block( 4648 struct xfs_btree_cur *cur, 4649 int level, 4650 xfs_btree_visit_blocks_fn fn, 4651 void *data) 4652 { 4653 struct xfs_btree_block *block; 4654 struct xfs_buf *bp; 4655 union xfs_btree_ptr rptr, bufptr; 4656 int error; 4657 4658 /* do right sibling readahead */ 4659 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 4660 block = xfs_btree_get_block(cur, level, &bp); 4661 4662 /* process the block */ 4663 error = fn(cur, level, data); 4664 if (error) 4665 return error; 4666 4667 /* now read rh sibling block for next iteration */ 4668 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4669 if (xfs_btree_ptr_is_null(cur, &rptr)) 4670 return -ENOENT; 4671 4672 /* 4673 * We only visit blocks once in this walk, so we have to avoid the 4674 * internal xfs_btree_lookup_get_block() optimisation where it will 4675 * return the same block without checking if the right sibling points 4676 * back to us and creates a cyclic reference in the btree. 4677 */ 4678 xfs_btree_buf_to_ptr(cur, bp, &bufptr); 4679 if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { 4680 xfs_btree_mark_sick(cur); 4681 return -EFSCORRUPTED; 4682 } 4683 4684 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4685 } 4686 4687 4688 /* Visit every block in a btree. */ 4689 int 4690 xfs_btree_visit_blocks( 4691 struct xfs_btree_cur *cur, 4692 xfs_btree_visit_blocks_fn fn, 4693 unsigned int flags, 4694 void *data) 4695 { 4696 union xfs_btree_ptr lptr; 4697 int level; 4698 struct xfs_btree_block *block = NULL; 4699 int error = 0; 4700 4701 xfs_btree_init_ptr_from_cur(cur, &lptr); 4702 4703 /* for each level */ 4704 for (level = cur->bc_nlevels - 1; level >= 0; level--) { 4705 /* grab the left hand block */ 4706 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); 4707 if (error) 4708 return error; 4709 4710 /* readahead the left most block for the next level down */ 4711 if (level > 0) { 4712 union xfs_btree_ptr *ptr; 4713 4714 ptr = xfs_btree_ptr_addr(cur, 1, block); 4715 xfs_btree_readahead_ptr(cur, ptr, 1); 4716 4717 /* save for the next iteration of the loop */ 4718 xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); 4719 4720 if (!(flags & XFS_BTREE_VISIT_LEAVES)) 4721 continue; 4722 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { 4723 continue; 4724 } 4725 4726 /* for each buffer in the level */ 4727 do { 4728 error = xfs_btree_visit_block(cur, level, fn, data); 4729 } while (!error); 4730 4731 if (error != -ENOENT) 4732 return error; 4733 } 4734 4735 return 0; 4736 } 4737 4738 /* 4739 * Change the owner of a btree. 4740 * 4741 * The mechanism we use here is ordered buffer logging. Because we don't know 4742 * how many buffers were are going to need to modify, we don't really want to 4743 * have to make transaction reservations for the worst case of every buffer in a 4744 * full size btree as that may be more space that we can fit in the log.... 4745 * 4746 * We do the btree walk in the most optimal manner possible - we have sibling 4747 * pointers so we can just walk all the blocks on each level from left to right 4748 * in a single pass, and then move to the next level and do the same. We can 4749 * also do readahead on the sibling pointers to get IO moving more quickly, 4750 * though for slow disks this is unlikely to make much difference to performance 4751 * as the amount of CPU work we have to do before moving to the next block is 4752 * relatively small. 4753 * 4754 * For each btree block that we load, modify the owner appropriately, set the 4755 * buffer as an ordered buffer and log it appropriately. We need to ensure that 4756 * we mark the region we change dirty so that if the buffer is relogged in 4757 * a subsequent transaction the changes we make here as an ordered buffer are 4758 * correctly relogged in that transaction. If we are in recovery context, then 4759 * just queue the modified buffer as delayed write buffer so the transaction 4760 * recovery completion writes the changes to disk. 4761 */ 4762 struct xfs_btree_block_change_owner_info { 4763 uint64_t new_owner; 4764 struct list_head *buffer_list; 4765 }; 4766 4767 static int 4768 xfs_btree_block_change_owner( 4769 struct xfs_btree_cur *cur, 4770 int level, 4771 void *data) 4772 { 4773 struct xfs_btree_block_change_owner_info *bbcoi = data; 4774 struct xfs_btree_block *block; 4775 struct xfs_buf *bp; 4776 4777 /* modify the owner */ 4778 block = xfs_btree_get_block(cur, level, &bp); 4779 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 4780 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) 4781 return 0; 4782 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4783 } else { 4784 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) 4785 return 0; 4786 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4787 } 4788 4789 /* 4790 * If the block is a root block hosted in an inode, we might not have a 4791 * buffer pointer here and we shouldn't attempt to log the change as the 4792 * information is already held in the inode and discarded when the root 4793 * block is formatted into the on-disk inode fork. We still change it, 4794 * though, so everything is consistent in memory. 4795 */ 4796 if (!bp) { 4797 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 4798 ASSERT(level == cur->bc_nlevels - 1); 4799 return 0; 4800 } 4801 4802 if (cur->bc_tp) { 4803 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { 4804 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4805 return -EAGAIN; 4806 } 4807 } else { 4808 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4809 } 4810 4811 return 0; 4812 } 4813 4814 int 4815 xfs_btree_change_owner( 4816 struct xfs_btree_cur *cur, 4817 uint64_t new_owner, 4818 struct list_head *buffer_list) 4819 { 4820 struct xfs_btree_block_change_owner_info bbcoi; 4821 4822 bbcoi.new_owner = new_owner; 4823 bbcoi.buffer_list = buffer_list; 4824 4825 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4826 XFS_BTREE_VISIT_ALL, &bbcoi); 4827 } 4828 4829 /* Verify the v5 fields of a long-format btree block. */ 4830 xfs_failaddr_t 4831 xfs_btree_fsblock_v5hdr_verify( 4832 struct xfs_buf *bp, 4833 uint64_t owner) 4834 { 4835 struct xfs_mount *mp = bp->b_mount; 4836 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4837 4838 if (!xfs_has_crc(mp)) 4839 return __this_address; 4840 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4841 return __this_address; 4842 if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4843 return __this_address; 4844 if (owner != XFS_RMAP_OWN_UNKNOWN && 4845 be64_to_cpu(block->bb_u.l.bb_owner) != owner) 4846 return __this_address; 4847 return NULL; 4848 } 4849 4850 /* Verify a long-format btree block. */ 4851 xfs_failaddr_t 4852 xfs_btree_fsblock_verify( 4853 struct xfs_buf *bp, 4854 unsigned int max_recs) 4855 { 4856 struct xfs_mount *mp = bp->b_mount; 4857 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4858 xfs_fsblock_t fsb; 4859 xfs_failaddr_t fa; 4860 4861 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4862 4863 /* numrecs verification */ 4864 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4865 return __this_address; 4866 4867 /* sibling pointer verification */ 4868 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4869 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4870 block->bb_u.l.bb_leftsib); 4871 if (!fa) 4872 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4873 block->bb_u.l.bb_rightsib); 4874 return fa; 4875 } 4876 4877 /* Verify an in-memory btree block. */ 4878 xfs_failaddr_t 4879 xfs_btree_memblock_verify( 4880 struct xfs_buf *bp, 4881 unsigned int max_recs) 4882 { 4883 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4884 struct xfs_buftarg *btp = bp->b_target; 4885 xfs_failaddr_t fa; 4886 xfbno_t bno; 4887 4888 ASSERT(xfs_buftarg_is_mem(bp->b_target)); 4889 4890 /* numrecs verification */ 4891 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4892 return __this_address; 4893 4894 /* sibling pointer verification */ 4895 bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); 4896 fa = xfs_btree_check_memblock_siblings(btp, bno, 4897 block->bb_u.l.bb_leftsib); 4898 if (fa) 4899 return fa; 4900 fa = xfs_btree_check_memblock_siblings(btp, bno, 4901 block->bb_u.l.bb_rightsib); 4902 if (fa) 4903 return fa; 4904 4905 return NULL; 4906 } 4907 /** 4908 * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format 4909 * btree block 4910 * 4911 * @bp: buffer containing the btree block 4912 */ 4913 xfs_failaddr_t 4914 xfs_btree_agblock_v5hdr_verify( 4915 struct xfs_buf *bp) 4916 { 4917 struct xfs_mount *mp = bp->b_mount; 4918 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4919 struct xfs_perag *pag = bp->b_pag; 4920 4921 if (!xfs_has_crc(mp)) 4922 return __this_address; 4923 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4924 return __this_address; 4925 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4926 return __this_address; 4927 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) 4928 return __this_address; 4929 return NULL; 4930 } 4931 4932 /** 4933 * xfs_btree_agblock_verify() -- verify a short-format btree block 4934 * 4935 * @bp: buffer containing the btree block 4936 * @max_recs: maximum records allowed in this btree node 4937 */ 4938 xfs_failaddr_t 4939 xfs_btree_agblock_verify( 4940 struct xfs_buf *bp, 4941 unsigned int max_recs) 4942 { 4943 struct xfs_mount *mp = bp->b_mount; 4944 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4945 xfs_agblock_t agbno; 4946 xfs_failaddr_t fa; 4947 4948 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4949 4950 /* numrecs verification */ 4951 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4952 return __this_address; 4953 4954 /* sibling pointer verification */ 4955 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4956 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4957 block->bb_u.s.bb_leftsib); 4958 if (!fa) 4959 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4960 block->bb_u.s.bb_rightsib); 4961 return fa; 4962 } 4963 4964 /* 4965 * For the given limits on leaf and keyptr records per block, calculate the 4966 * height of the tree needed to index the number of leaf records. 4967 */ 4968 unsigned int 4969 xfs_btree_compute_maxlevels( 4970 const unsigned int *limits, 4971 unsigned long long records) 4972 { 4973 unsigned long long level_blocks = howmany_64(records, limits[0]); 4974 unsigned int height = 1; 4975 4976 while (level_blocks > 1) { 4977 level_blocks = howmany_64(level_blocks, limits[1]); 4978 height++; 4979 } 4980 4981 return height; 4982 } 4983 4984 /* 4985 * For the given limits on leaf and keyptr records per block, calculate the 4986 * number of blocks needed to index the given number of leaf records. 4987 */ 4988 unsigned long long 4989 xfs_btree_calc_size( 4990 const unsigned int *limits, 4991 unsigned long long records) 4992 { 4993 unsigned long long level_blocks = howmany_64(records, limits[0]); 4994 unsigned long long blocks = level_blocks; 4995 4996 while (level_blocks > 1) { 4997 level_blocks = howmany_64(level_blocks, limits[1]); 4998 blocks += level_blocks; 4999 } 5000 5001 return blocks; 5002 } 5003 5004 /* 5005 * Given a number of available blocks for the btree to consume with records and 5006 * pointers, calculate the height of the tree needed to index all the records 5007 * that space can hold based on the number of pointers each interior node 5008 * holds. 5009 * 5010 * We start by assuming a single level tree consumes a single block, then track 5011 * the number of blocks each node level consumes until we no longer have space 5012 * to store the next node level. At this point, we are indexing all the leaf 5013 * blocks in the space, and there's no more free space to split the tree any 5014 * further. That's our maximum btree height. 5015 */ 5016 unsigned int 5017 xfs_btree_space_to_height( 5018 const unsigned int *limits, 5019 unsigned long long leaf_blocks) 5020 { 5021 /* 5022 * The root btree block can have fewer than minrecs pointers in it 5023 * because the tree might not be big enough to require that amount of 5024 * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. 5025 */ 5026 unsigned long long node_blocks = 2; 5027 unsigned long long blocks_left = leaf_blocks - 1; 5028 unsigned int height = 1; 5029 5030 if (leaf_blocks < 1) 5031 return 0; 5032 5033 while (node_blocks < blocks_left) { 5034 blocks_left -= node_blocks; 5035 node_blocks *= limits[1]; 5036 height++; 5037 } 5038 5039 return height; 5040 } 5041 5042 /* 5043 * Query a regular btree for all records overlapping a given interval. 5044 * Start with a LE lookup of the key of low_rec and return all records 5045 * until we find a record with a key greater than the key of high_rec. 5046 */ 5047 STATIC int 5048 xfs_btree_simple_query_range( 5049 struct xfs_btree_cur *cur, 5050 const union xfs_btree_key *low_key, 5051 const union xfs_btree_key *high_key, 5052 xfs_btree_query_range_fn fn, 5053 void *priv) 5054 { 5055 union xfs_btree_rec *recp; 5056 union xfs_btree_key rec_key; 5057 int stat; 5058 bool firstrec = true; 5059 int error; 5060 5061 ASSERT(cur->bc_ops->init_high_key_from_rec); 5062 ASSERT(cur->bc_ops->cmp_two_keys); 5063 5064 /* 5065 * Find the leftmost record. The btree cursor must be set 5066 * to the low record used to generate low_key. 5067 */ 5068 stat = 0; 5069 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 5070 if (error) 5071 goto out; 5072 5073 /* Nothing? See if there's anything to the right. */ 5074 if (!stat) { 5075 error = xfs_btree_increment(cur, 0, &stat); 5076 if (error) 5077 goto out; 5078 } 5079 5080 while (stat) { 5081 /* Find the record. */ 5082 error = xfs_btree_get_rec(cur, &recp, &stat); 5083 if (error || !stat) 5084 break; 5085 5086 /* Skip if low_key > high_key(rec). */ 5087 if (firstrec) { 5088 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 5089 firstrec = false; 5090 if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) 5091 goto advloop; 5092 } 5093 5094 /* Stop if low_key(rec) > high_key. */ 5095 cur->bc_ops->init_key_from_rec(&rec_key, recp); 5096 if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) 5097 break; 5098 5099 /* Callback */ 5100 error = fn(cur, recp, priv); 5101 if (error) 5102 break; 5103 5104 advloop: 5105 /* Move on to the next record. */ 5106 error = xfs_btree_increment(cur, 0, &stat); 5107 if (error) 5108 break; 5109 } 5110 5111 out: 5112 return error; 5113 } 5114 5115 /* 5116 * Query an overlapped interval btree for all records overlapping a given 5117 * interval. This function roughly follows the algorithm given in 5118 * "Interval Trees" of _Introduction to Algorithms_, which is section 5119 * 14.3 in the 2nd and 3rd editions. 5120 * 5121 * First, generate keys for the low and high records passed in. 5122 * 5123 * For any leaf node, generate the high and low keys for the record. 5124 * If the record keys overlap with the query low/high keys, pass the 5125 * record to the function iterator. 5126 * 5127 * For any internal node, compare the low and high keys of each 5128 * pointer against the query low/high keys. If there's an overlap, 5129 * follow the pointer. 5130 * 5131 * As an optimization, we stop scanning a block when we find a low key 5132 * that is greater than the query's high key. 5133 */ 5134 STATIC int 5135 xfs_btree_overlapped_query_range( 5136 struct xfs_btree_cur *cur, 5137 const union xfs_btree_key *low_key, 5138 const union xfs_btree_key *high_key, 5139 xfs_btree_query_range_fn fn, 5140 void *priv) 5141 { 5142 union xfs_btree_ptr ptr; 5143 union xfs_btree_ptr *pp; 5144 union xfs_btree_key rec_key; 5145 union xfs_btree_key rec_hkey; 5146 union xfs_btree_key *lkp; 5147 union xfs_btree_key *hkp; 5148 union xfs_btree_rec *recp; 5149 struct xfs_btree_block *block; 5150 int level; 5151 struct xfs_buf *bp; 5152 int i; 5153 int error; 5154 5155 /* Load the root of the btree. */ 5156 level = cur->bc_nlevels - 1; 5157 xfs_btree_init_ptr_from_cur(cur, &ptr); 5158 error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); 5159 if (error) 5160 return error; 5161 xfs_btree_get_block(cur, level, &bp); 5162 trace_xfs_btree_overlapped_query_range(cur, level, bp); 5163 #ifdef DEBUG 5164 error = xfs_btree_check_block(cur, block, level, bp); 5165 if (error) 5166 goto out; 5167 #endif 5168 cur->bc_levels[level].ptr = 1; 5169 5170 while (level < cur->bc_nlevels) { 5171 block = xfs_btree_get_block(cur, level, &bp); 5172 5173 /* End of node, pop back towards the root. */ 5174 if (cur->bc_levels[level].ptr > 5175 be16_to_cpu(block->bb_numrecs)) { 5176 pop_up: 5177 if (level < cur->bc_nlevels - 1) 5178 cur->bc_levels[level + 1].ptr++; 5179 level++; 5180 continue; 5181 } 5182 5183 if (level == 0) { 5184 /* Handle a leaf node. */ 5185 recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, 5186 block); 5187 5188 cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); 5189 cur->bc_ops->init_key_from_rec(&rec_key, recp); 5190 5191 /* 5192 * If (query's high key < record's low key), then there 5193 * are no more interesting records in this block. Pop 5194 * up to the leaf level to find more record blocks. 5195 * 5196 * If (record's high key >= query's low key) and 5197 * (query's high key >= record's low key), then 5198 * this record overlaps the query range; callback. 5199 */ 5200 if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) 5201 goto pop_up; 5202 if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { 5203 error = fn(cur, recp, priv); 5204 if (error) 5205 break; 5206 } 5207 cur->bc_levels[level].ptr++; 5208 continue; 5209 } 5210 5211 /* Handle an internal node. */ 5212 lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); 5213 hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, 5214 block); 5215 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); 5216 5217 /* 5218 * If (query's high key < pointer's low key), then there are no 5219 * more interesting keys in this block. Pop up one leaf level 5220 * to continue looking for records. 5221 * 5222 * If (pointer's high key >= query's low key) and 5223 * (query's high key >= pointer's low key), then 5224 * this record overlaps the query range; follow pointer. 5225 */ 5226 if (xfs_btree_keycmp_lt(cur, high_key, lkp)) 5227 goto pop_up; 5228 if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { 5229 level--; 5230 error = xfs_btree_lookup_get_block(cur, level, pp, 5231 &block); 5232 if (error) 5233 goto out; 5234 xfs_btree_get_block(cur, level, &bp); 5235 trace_xfs_btree_overlapped_query_range(cur, level, bp); 5236 #ifdef DEBUG 5237 error = xfs_btree_check_block(cur, block, level, bp); 5238 if (error) 5239 goto out; 5240 #endif 5241 cur->bc_levels[level].ptr = 1; 5242 continue; 5243 } 5244 cur->bc_levels[level].ptr++; 5245 } 5246 5247 out: 5248 /* 5249 * If we don't end this function with the cursor pointing at a record 5250 * block, a subsequent non-error cursor deletion will not release 5251 * node-level buffers, causing a buffer leak. This is quite possible 5252 * with a zero-results range query, so release the buffers if we 5253 * failed to return any results. 5254 */ 5255 if (cur->bc_levels[0].bp == NULL) { 5256 for (i = 0; i < cur->bc_nlevels; i++) { 5257 if (cur->bc_levels[i].bp) { 5258 xfs_trans_brelse(cur->bc_tp, 5259 cur->bc_levels[i].bp); 5260 cur->bc_levels[i].bp = NULL; 5261 cur->bc_levels[i].ptr = 0; 5262 cur->bc_levels[i].ra = 0; 5263 } 5264 } 5265 } 5266 5267 return error; 5268 } 5269 5270 static inline void 5271 xfs_btree_key_from_irec( 5272 struct xfs_btree_cur *cur, 5273 union xfs_btree_key *key, 5274 const union xfs_btree_irec *irec) 5275 { 5276 union xfs_btree_rec rec; 5277 5278 cur->bc_rec = *irec; 5279 cur->bc_ops->init_rec_from_cur(cur, &rec); 5280 cur->bc_ops->init_key_from_rec(key, &rec); 5281 } 5282 5283 /* 5284 * Query a btree for all records overlapping a given interval of keys. The 5285 * supplied function will be called with each record found; return one of the 5286 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 5287 * code. This function returns -ECANCELED, zero, or a negative error code. 5288 */ 5289 int 5290 xfs_btree_query_range( 5291 struct xfs_btree_cur *cur, 5292 const union xfs_btree_irec *low_rec, 5293 const union xfs_btree_irec *high_rec, 5294 xfs_btree_query_range_fn fn, 5295 void *priv) 5296 { 5297 union xfs_btree_key low_key; 5298 union xfs_btree_key high_key; 5299 5300 /* Find the keys of both ends of the interval. */ 5301 xfs_btree_key_from_irec(cur, &high_key, high_rec); 5302 xfs_btree_key_from_irec(cur, &low_key, low_rec); 5303 5304 /* Enforce low key <= high key. */ 5305 if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) 5306 return -EINVAL; 5307 5308 if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) 5309 return xfs_btree_simple_query_range(cur, &low_key, 5310 &high_key, fn, priv); 5311 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 5312 fn, priv); 5313 } 5314 5315 /* Query a btree for all records. */ 5316 int 5317 xfs_btree_query_all( 5318 struct xfs_btree_cur *cur, 5319 xfs_btree_query_range_fn fn, 5320 void *priv) 5321 { 5322 union xfs_btree_key low_key; 5323 union xfs_btree_key high_key; 5324 5325 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5326 memset(&low_key, 0, sizeof(low_key)); 5327 memset(&high_key, 0xFF, sizeof(high_key)); 5328 5329 return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); 5330 } 5331 5332 static int 5333 xfs_btree_count_blocks_helper( 5334 struct xfs_btree_cur *cur, 5335 int level, 5336 void *data) 5337 { 5338 xfs_filblks_t *blocks = data; 5339 (*blocks)++; 5340 5341 return 0; 5342 } 5343 5344 /* Count the blocks in a btree and return the result in *blocks. */ 5345 int 5346 xfs_btree_count_blocks( 5347 struct xfs_btree_cur *cur, 5348 xfs_filblks_t *blocks) 5349 { 5350 *blocks = 0; 5351 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, 5352 XFS_BTREE_VISIT_ALL, blocks); 5353 } 5354 5355 /* Compare two btree pointers. */ 5356 int 5357 xfs_btree_cmp_two_ptrs( 5358 struct xfs_btree_cur *cur, 5359 const union xfs_btree_ptr *a, 5360 const union xfs_btree_ptr *b) 5361 { 5362 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5363 return cmp_int(be64_to_cpu(a->l), be64_to_cpu(b->l)); 5364 return cmp_int(be32_to_cpu(a->s), be32_to_cpu(b->s)); 5365 } 5366 5367 struct xfs_btree_has_records { 5368 /* Keys for the start and end of the range we want to know about. */ 5369 union xfs_btree_key start_key; 5370 union xfs_btree_key end_key; 5371 5372 /* Mask for key comparisons, if desired. */ 5373 const union xfs_btree_key *key_mask; 5374 5375 /* Highest record key we've seen so far. */ 5376 union xfs_btree_key high_key; 5377 5378 enum xbtree_recpacking outcome; 5379 }; 5380 5381 STATIC int 5382 xfs_btree_has_records_helper( 5383 struct xfs_btree_cur *cur, 5384 const union xfs_btree_rec *rec, 5385 void *priv) 5386 { 5387 union xfs_btree_key rec_key; 5388 union xfs_btree_key rec_high_key; 5389 struct xfs_btree_has_records *info = priv; 5390 enum xbtree_key_contig key_contig; 5391 5392 cur->bc_ops->init_key_from_rec(&rec_key, rec); 5393 5394 if (info->outcome == XBTREE_RECPACKING_EMPTY) { 5395 info->outcome = XBTREE_RECPACKING_SPARSE; 5396 5397 /* 5398 * If the first record we find does not overlap the start key, 5399 * then there is a hole at the start of the search range. 5400 * Classify this as sparse and stop immediately. 5401 */ 5402 if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, 5403 info->key_mask)) 5404 return -ECANCELED; 5405 } else { 5406 /* 5407 * If a subsequent record does not overlap with the any record 5408 * we've seen so far, there is a hole in the middle of the 5409 * search range. Classify this as sparse and stop. 5410 * If the keys overlap and this btree does not allow overlap, 5411 * signal corruption. 5412 */ 5413 key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, 5414 &rec_key, info->key_mask); 5415 if (key_contig == XBTREE_KEY_OVERLAP && 5416 !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) 5417 return -EFSCORRUPTED; 5418 if (key_contig == XBTREE_KEY_GAP) 5419 return -ECANCELED; 5420 } 5421 5422 /* 5423 * If high_key(rec) is larger than any other high key we've seen, 5424 * remember it for later. 5425 */ 5426 cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); 5427 if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, 5428 info->key_mask)) 5429 info->high_key = rec_high_key; /* struct copy */ 5430 5431 return 0; 5432 } 5433 5434 /* 5435 * Scan part of the keyspace of a btree and tell us if that keyspace does not 5436 * map to any records; is fully mapped to records; or is partially mapped to 5437 * records. This is the btree record equivalent to determining if a file is 5438 * sparse. 5439 * 5440 * For most btree types, the record scan should use all available btree key 5441 * fields to compare the keys encountered. These callers should pass NULL for 5442 * @mask. However, some callers (e.g. scanning physical space in the rmapbt) 5443 * want to ignore some part of the btree record keyspace when performing the 5444 * comparison. These callers should pass in a union xfs_btree_key object with 5445 * the fields that *should* be a part of the comparison set to any nonzero 5446 * value, and the rest zeroed. 5447 */ 5448 int 5449 xfs_btree_has_records( 5450 struct xfs_btree_cur *cur, 5451 const union xfs_btree_irec *low, 5452 const union xfs_btree_irec *high, 5453 const union xfs_btree_key *mask, 5454 enum xbtree_recpacking *outcome) 5455 { 5456 struct xfs_btree_has_records info = { 5457 .outcome = XBTREE_RECPACKING_EMPTY, 5458 .key_mask = mask, 5459 }; 5460 int error; 5461 5462 /* Not all btrees support this operation. */ 5463 if (!cur->bc_ops->keys_contiguous) { 5464 ASSERT(0); 5465 return -EOPNOTSUPP; 5466 } 5467 5468 xfs_btree_key_from_irec(cur, &info.start_key, low); 5469 xfs_btree_key_from_irec(cur, &info.end_key, high); 5470 5471 error = xfs_btree_query_range(cur, low, high, 5472 xfs_btree_has_records_helper, &info); 5473 if (error == -ECANCELED) 5474 goto out; 5475 if (error) 5476 return error; 5477 5478 if (info.outcome == XBTREE_RECPACKING_EMPTY) 5479 goto out; 5480 5481 /* 5482 * If the largest high_key(rec) we saw during the walk is greater than 5483 * the end of the search range, classify this as full. Otherwise, 5484 * there is a hole at the end of the search range. 5485 */ 5486 if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, 5487 mask)) 5488 info.outcome = XBTREE_RECPACKING_FULL; 5489 5490 out: 5491 *outcome = info.outcome; 5492 return 0; 5493 } 5494 5495 /* Are there more records in this btree? */ 5496 bool 5497 xfs_btree_has_more_records( 5498 struct xfs_btree_cur *cur) 5499 { 5500 struct xfs_btree_block *block; 5501 struct xfs_buf *bp; 5502 5503 block = xfs_btree_get_block(cur, 0, &bp); 5504 5505 /* There are still records in this block. */ 5506 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5507 return true; 5508 5509 /* There are more record blocks. */ 5510 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5511 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5512 else 5513 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5514 } 5515 5516 /* Set up all the btree cursor caches. */ 5517 int __init 5518 xfs_btree_init_cur_caches(void) 5519 { 5520 int error; 5521 5522 error = xfs_allocbt_init_cur_cache(); 5523 if (error) 5524 return error; 5525 error = xfs_inobt_init_cur_cache(); 5526 if (error) 5527 goto err; 5528 error = xfs_bmbt_init_cur_cache(); 5529 if (error) 5530 goto err; 5531 error = xfs_rmapbt_init_cur_cache(); 5532 if (error) 5533 goto err; 5534 error = xfs_refcountbt_init_cur_cache(); 5535 if (error) 5536 goto err; 5537 error = xfs_rtrmapbt_init_cur_cache(); 5538 if (error) 5539 goto err; 5540 error = xfs_rtrefcountbt_init_cur_cache(); 5541 if (error) 5542 goto err; 5543 5544 return 0; 5545 err: 5546 xfs_btree_destroy_cur_caches(); 5547 return error; 5548 } 5549 5550 /* Destroy all the btree cursor caches, if they've been allocated. */ 5551 void 5552 xfs_btree_destroy_cur_caches(void) 5553 { 5554 xfs_allocbt_destroy_cur_cache(); 5555 xfs_inobt_destroy_cur_cache(); 5556 xfs_bmbt_destroy_cur_cache(); 5557 xfs_rmapbt_destroy_cur_cache(); 5558 xfs_refcountbt_destroy_cur_cache(); 5559 xfs_rtrmapbt_destroy_cur_cache(); 5560 xfs_rtrefcountbt_destroy_cur_cache(); 5561 } 5562 5563 /* Move the btree cursor before the first record. */ 5564 int 5565 xfs_btree_goto_left_edge( 5566 struct xfs_btree_cur *cur) 5567 { 5568 int stat = 0; 5569 int error; 5570 5571 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5572 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 5573 if (error) 5574 return error; 5575 if (!stat) 5576 return 0; 5577 5578 error = xfs_btree_decrement(cur, 0, &stat); 5579 if (error) 5580 return error; 5581 if (stat != 0) { 5582 ASSERT(0); 5583 xfs_btree_mark_sick(cur); 5584 return -EFSCORRUPTED; 5585 } 5586 5587 return 0; 5588 } 5589 5590 /* Allocate a block for an inode-rooted metadata btree. */ 5591 int 5592 xfs_btree_alloc_metafile_block( 5593 struct xfs_btree_cur *cur, 5594 const union xfs_btree_ptr *start, 5595 union xfs_btree_ptr *new, 5596 int *stat) 5597 { 5598 struct xfs_alloc_arg args = { 5599 .mp = cur->bc_mp, 5600 .tp = cur->bc_tp, 5601 .resv = XFS_AG_RESV_METAFILE, 5602 .minlen = 1, 5603 .maxlen = 1, 5604 .prod = 1, 5605 }; 5606 struct xfs_inode *ip = cur->bc_ino.ip; 5607 int error; 5608 5609 ASSERT(xfs_is_metadir_inode(ip)); 5610 5611 xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); 5612 error = xfs_alloc_vextent_start_ag(&args, 5613 XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); 5614 if (error) 5615 return error; 5616 if (args.fsbno == NULLFSBLOCK) { 5617 *stat = 0; 5618 return 0; 5619 } 5620 ASSERT(args.len == 1); 5621 5622 xfs_metafile_resv_alloc_space(ip, &args); 5623 5624 new->l = cpu_to_be64(args.fsbno); 5625 *stat = 1; 5626 return 0; 5627 } 5628 5629 /* Free a block from an inode-rooted metadata btree. */ 5630 int 5631 xfs_btree_free_metafile_block( 5632 struct xfs_btree_cur *cur, 5633 struct xfs_buf *bp) 5634 { 5635 struct xfs_owner_info oinfo; 5636 struct xfs_mount *mp = cur->bc_mp; 5637 struct xfs_inode *ip = cur->bc_ino.ip; 5638 struct xfs_trans *tp = cur->bc_tp; 5639 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 5640 int error; 5641 5642 ASSERT(xfs_is_metadir_inode(ip)); 5643 5644 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); 5645 error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, 5646 0); 5647 if (error) 5648 return error; 5649 5650 xfs_metafile_resv_free_space(ip, tp, 1); 5651 return 0; 5652 } 5653