// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_rmap.h"
#include "xfs_quota.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"

/*
 * Btree magic numbers.
 */
uint32_t
xfs_btree_magic(
	struct xfs_mount		*mp,
	const struct xfs_btree_ops	*ops)
{
	int				idx = xfs_has_crc(mp) ? 1 : 0;
	__be32				magic = ops->buf_ops->magic[idx];

	/* Ensure we asked for crc for crc-only magics. */
	ASSERT(magic != 0);
	return be32_to_cpu(magic);
}
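
/*
 * Illustrative sketch (an assumption, not code from this file): each set of
 * buffer ops stores the non-CRC magic in slot 0 and the CRC magic in slot 1,
 * so a v5 (CRC) filesystem selects the second slot.  For a hypothetical
 * "foo" btree, the table might look like:
 *
 *	const struct xfs_buf_ops xfs_foobt_buf_ops = {
 *		.magic = { cpu_to_be32(XFS_FOO_MAGIC),
 *			   cpu_to_be32(XFS_FOO_CRC_MAGIC) },
 *		...
 *	};
 *
 * A zero in slot 0 marks a CRC-only btree, which is what the ASSERT above
 * catches if a caller asks for the non-CRC magic.
 */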

/*
 * These sibling pointer checks are optimised for null sibling pointers. This
 * happens a lot, and we don't need to byte swap at runtime if the sibling
 * pointer is NULL.
 *
 * These are explicitly marked as inline because the cost of calling them as
 * functions instead of inlining them is about 36 bytes extra code per call
 * site on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of
 * these two sibling check functions reduces the compiled code size by over
 * 300 bytes.
 */
static inline xfs_failaddr_t
xfs_btree_check_fsblock_siblings(
	struct xfs_mount	*mp,
	xfs_fsblock_t		fsb,
	__be64			dsibling)
{
	xfs_fsblock_t		sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == fsb)
		return __this_address;
	if (!xfs_verify_fsbno(mp, sibling))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_memblock_siblings(
	struct xfs_buftarg	*btp,
	xfbno_t			bno,
	__be64			dsibling)
{
	xfbno_t			sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == bno)
		return __this_address;
	if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_agblock_siblings(
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	__be32			dsibling)
{
	xfs_agblock_t		sibling;

	if (dsibling == cpu_to_be32(NULLAGBLOCK))
		return NULL;

	sibling = be32_to_cpu(dsibling);
	if (sibling == agbno)
		return __this_address;
	if (!xfs_verify_agbno(pag, sibling))
		return __this_address;
	return NULL;
}

static xfs_failaddr_t
__xfs_btree_check_lblock_hdr(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.l.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	return NULL;
}

/*
 * Check a long btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_fsblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;
	xfs_fsblock_t		fsb;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	/*
	 * For inode-rooted btrees, the root block sits in the inode fork.  In
	 * that case bp is NULL, and the block must not have any siblings.
	 */
	if (!bp) {
		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		return NULL;
	}

	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}
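
/*
 * Illustrative sketch (an assumption, not code from this file): callers treat
 * the xfs_failaddr_t result as pass/fail, where NULL means the block passed
 * and anything else is the address of the failing check, ready to be fed into
 * the error reporting machinery:
 *
 *	xfs_failaddr_t	fa;
 *
 *	fa = __xfs_btree_check_fsblock(cur, block, level, bp);
 *	if (fa)
 *		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 */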

/*
 * Check an in-memory btree block header.  Return the address of the failing
 * check, or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_memblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_memblock_siblings(btp, bno,
				block->bb_u.l.bb_rightsib);
	return fa;
}

/*
 * Check a short btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_agblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_failaddr_t		fa;
	xfs_agblock_t		agbno;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}

/*
 * Internal btree block check.
 *
 * Return NULL if the block is ok or the address of the failed check otherwise.
 */
xfs_failaddr_t
__xfs_btree_check_block(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return __xfs_btree_check_memblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_AG:
		return __xfs_btree_check_agblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_INODE:
		return __xfs_btree_check_fsblock(cur, block, level, bp);
	default:
		ASSERT(0);
		return __this_address;
	}
}

static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
		return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
	return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}

/*
 * Debug routine: check that block header is ok.
 */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block, if any */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_block(cur, block, level, bp);
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	return 0;
}

int
__xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	if (level <= 0)
		return -EFSCORRUPTED;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_INODE:
		if (!xfs_verify_fsbno(cur->bc_mp,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_AG:
		if (!xfs_verify_agbno(to_perag(cur->bc_group),
				be32_to_cpu((&ptr->s)[index])))
			return -EFSCORRUPTED;
		break;
	}

	return 0;
}

/*
 * Check that a given (indexed) btree pointer at a certain level of a
 * btree is valid and doesn't point past where it should.
 */
static int
xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	int				error;

	error = __xfs_btree_check_ptr(cur, ptr, index, level);
	if (error) {
		switch (cur->bc_ops->type) {
		case XFS_BTREE_TYPE_MEM:
			xfs_err(cur->bc_mp,
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
				cur->bc_ops->name, cur->bc_flags, level, index,
				__this_address);
			break;
		case XFS_BTREE_TYPE_INODE:
			xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_ino.ip->i_ino,
				cur->bc_ino.whichfork, cur->bc_ops->name,
				level, index);
			break;
		case XFS_BTREE_TYPE_AG:
			xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_group->xg_gno, cur->bc_ops->name,
				level, index);
			break;
		}
		xfs_btree_mark_sick(cur);
	}

	return error;
}

#ifdef DEBUG
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * long-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_fsblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}

bool
xfs_btree_fsblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
	}

	return true;
}

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * short-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_agblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}

bool
xfs_btree_agblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
	}

	return true;
}
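
/*
 * A minimal sketch (an assumption using a hypothetical "foo" btree, not code
 * from this file) of how a btree's buffer verifiers might pair these helpers:
 * validate the checksum first on the way in from disk, and stamp the LSN and
 * recompute the checksum last on the way out:
 *
 *	static void
 *	xfs_foobt_read_verify(struct xfs_buf *bp)
 *	{
 *		if (!xfs_btree_agblock_verify_crc(bp))
 *			xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 *		... structural checks follow ...
 *	}
 *
 *	static void
 *	xfs_foobt_write_verify(struct xfs_buf *bp)
 *	{
 *		... structural checks first ...
 *		xfs_btree_agblock_calc_crc(bp);
 *	}
 */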

static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	trace_xfs_btree_free_block(cur, bp);

	/*
	 * Don't allow block freeing for a staging cursor, because staging
	 * cursors do not support regular btree modifications.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}

/*
 * Delete the btree cursor.
 */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			error)		/* del because of error */
{
	int			i;		/* btree level */

	/*
	 * Clear the buffer pointers and release the buffers.  If we're doing
	 * this because of an error, inspect all of the entries in the
	 * bc_levels array for buffers to be unlocked.  This is because some
	 * of the btree code works from level n down to 0, and if we get an
	 * error along the way we won't have initialized all the entries down
	 * to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_levels[i].bp)
			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
		else if (!error)
			break;
	}

	/*
	 * If we are doing a BMBT update, the number of unaccounted blocks
	 * allocated during this cursor lifetime should be zero.  If it's not
	 * zero, then we should be shut down or on our way to shutdown due to
	 * cancelling a dirty transaction on error.
	 */
	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
	       xfs_is_shutdown(cur->bc_mp) || error != 0);

	if (cur->bc_group)
		xfs_group_put(cur->bc_group);
	kmem_cache_free(cur->bc_cache, cur);
}

/* Return the buffer target for this btree's buffer. */
static inline struct xfs_buftarg *
xfs_btree_buftarg(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return cur->bc_mem.xfbtree->target;
	return cur->bc_mp->m_ddev_targp;
}

/* Return the block size (in units of 512b sectors) for this btree. */
static inline unsigned int
xfs_btree_bbsize(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return XFBNO_BBSIZE;
	return cur->bc_mp->m_bsize;
}

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 */
int						/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,		/* input cursor */
	struct xfs_btree_cur	**ncur)		/* output cursor */
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_trans	*tp = cur->bc_tp;
	struct xfs_buf		*bp;
	struct xfs_btree_cur	*new;
	int			error;
	int			i;

	/*
	 * Don't allow staging cursors to be duplicated because they're
	 * supposed to be kept private to a single thread.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level current, re-get the buffer and copy the ptr value.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
		new->bc_levels[i].ra = cur->bc_levels[i].ra;
		bp = cur->bc_levels[i].bp;
		if (bp) {
			error = xfs_trans_read_buf(mp, tp,
					xfs_btree_buftarg(cur),
					xfs_buf_daddr(bp),
					xfs_btree_bbsize(cur), 0, &bp,
					cur->bc_ops->buf_ops);
			if (xfs_metadata_is_sick(error))
				xfs_btree_mark_sick(new);
			if (error) {
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_levels[i].bp = bp;
	}
	*ncur = new;
	return 0;
}
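
/*
 * Illustrative sketch (an assumption, not code from this file): a caller that
 * wants to probe elsewhere in the tree without losing its current position
 * duplicates the cursor, works with the copy, and then deletes it:
 *
 *	struct xfs_btree_cur	*tcur;
 *	int			error;
 *
 *	error = xfs_btree_dup_cursor(cur, &tcur);
 *	if (error)
 *		return error;
 *	... move tcur around, read records ...
 *	xfs_btree_del_cursor(tcur, error);
 */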

/*
 * XFS btree block layout and addressing:
 *
 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 *
 * Leaf blocks start with a header followed by records containing the values.
 * A non-leaf block also starts with the same header, and then first contains
 * lookup keys followed by an equal number of pointers to the btree blocks at
 * the previous level.
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 * The header is called struct xfs_btree_block for reasons better left unknown
 * and comes in different versions for short (32bit) and long (64bit) block
 * pointers.  The record and key structures are defined by the btree instances
 * and opaque to the btree core.  The block pointers are simple disk endian
 * integers, available in a short (32bit) and long (64bit) variant.
 *
 * The helpers below calculate the offset of a given record, key or pointer
 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 * inside the btree block is done using indices starting at one, not zero!
 *
 * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
 * overlapping intervals.  In such a tree, records are still sorted lowest to
 * highest and indexed by the smallest key value that refers to the record.
 * However, nodes are different: each pointer has two associated keys -- one
 * indexing the lowest key available in the block(s) below (the same behavior
 * as the key in a regular btree) and another indexing the highest key
 * available in the block(s) below.  Because records are /not/ sorted by the
 * highest key, all leaf block updates require us to compute the highest key
 * that matches any record in the leaf and to recursively update the high keys
 * in the nodes going further up in the tree, if necessary.  Nodes look like
 * this:
 *
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 *
 * To perform an interval query on an overlapped tree, perform the usual
 * depth-first search and use the low and high keys to decide if we can skip
 * that particular node.  If a leaf node is reached, return the records that
 * intersect the interval.  Note that an interval query may return numerous
 * entries.  For a non-overlapped tree, simply search for the record
 * associated with the lowest key and iterate forward until a non-matching
 * record is found.  Section 14.3 ("Interval Trees") of _Introduction to
 * Algorithms_ by Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only)
 * discusses this in more detail.
 *
 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 * reverse mapping records on a reflink filesystem:
 *
 * 1: +- file A startblock B offset C length D -----------+
 * 2:      +- file E startblock F offset G length H --------------+
 * 3:      +- file I startblock F offset J length K --+
 * 4:                                                        +- file L... --+
 *
 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 * we'd simply increment the length of record 1.  But how do we find the
 * record that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would
 * return record 3 because the keys are ordered first by startblock.  An
 * interval query would return records 1 and 2 because they both overlap
 * (B+D-1), and from that we can pick out record 1 as the appropriate left
 * neighbor.
 *
 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 * because a record's interval must end before the next record.
 */
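
/*
 * A worked sketch of the overlapped node key layout (sizes assumed for
 * illustration, not taken from this file): with key_len = 16, each key slot
 * holds the low key in its first 8 bytes and the high key in its last 8,
 * which is why the high-key helpers below add key_len / 2:
 *
 *	lo(n) = xfs_btree_block_len(cur) + (n - 1) * 16
 *	hi(n) = lo(n) + 8
 *
 * so for a CRC-enabled long-pointer block (72-byte header), key 1 would have
 * its low key at byte 72 and its high key at byte 80.
 */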

/*
 * Return size of the btree block header for this btree instance.
 */
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (xfs_has_crc(cur->bc_mp))
			return XFS_BTREE_LBLOCK_CRC_LEN;
		return XFS_BTREE_LBLOCK_LEN;
	}
	if (xfs_has_crc(cur->bc_mp))
		return XFS_BTREE_SBLOCK_CRC_LEN;
	return XFS_BTREE_SBLOCK_LEN;
}

/*
 * Calculate offset of the n-th record in a btree block.
 */
STATIC size_t
xfs_btree_rec_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->rec_len;
}

/*
 * Calculate offset of the n-th key in a btree block.
 */
STATIC size_t
xfs_btree_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len;
}

/*
 * Calculate offset of the n-th high key in a btree block.
 */
STATIC size_t
xfs_btree_high_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
}

/*
 * Calculate offset of the n-th block pointer in a btree block.
 */
STATIC size_t
xfs_btree_ptr_offset(
	struct xfs_btree_cur	*cur,
	int			n,
	int			level)
{
	return xfs_btree_block_len(cur) +
		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
		(n - 1) * cur->bc_ops->ptr_len;
}

/*
 * Return a pointer to the n-th record in the btree block.
 */
union xfs_btree_rec *
xfs_btree_rec_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_rec *)
		((char *)block + xfs_btree_rec_offset(cur, n));
}

/*
 * Return a pointer to the n-th key in the btree block.
 */
union xfs_btree_key *
xfs_btree_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th high key in the btree block.
 */
union xfs_btree_key *
xfs_btree_high_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_high_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th block pointer in the btree block.
 */
union xfs_btree_ptr *
xfs_btree_ptr_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	int			level = xfs_btree_get_level(block);

	ASSERT(block->bb_level != 0);

	return (union xfs_btree_ptr *)
		((char *)block + xfs_btree_ptr_offset(cur, n, level));
}

struct xfs_ifork *
xfs_btree_ifork_ptr(
	struct xfs_btree_cur	*cur)
{
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);

	if (cur->bc_flags & XFS_BTREE_STAGING)
		return cur->bc_ino.ifake->if_fork;
	return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
}

/*
 * Get the root block which is stored in the inode.
 *
 * For now this btree implementation assumes the btree root is always
 * stored in the if_broot field of an inode fork.
 */
STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
	struct xfs_btree_cur	*cur)
{
	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

	return (struct xfs_btree_block *)ifp->if_broot;
}

/*
 * Retrieve the block pointer from the cursor at the given level.
 * This may be an inode btree root or from a buffer.
 */
struct xfs_btree_block *		/* generic btree block pointer */
xfs_btree_get_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level in btree */
	struct xfs_buf		**bpp)	/* buffer containing the block */
{
	if (xfs_btree_at_iroot(cur, level)) {
		*bpp = NULL;
		return xfs_btree_get_iroot(cur);
	}

	*bpp = cur->bc_levels[level].bp;
	return XFS_BUF_TO_BLOCK(*bpp);
}

/*
 * Change the cursor to point to the first record at the given level.
 * Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_firstrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to 1, that's the first record/key.
	 */
	cur->bc_levels[level].ptr = 1;
	return 1;
}

/*
 * Change the cursor to point to the last record in the current block
 * at the given level.  Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_lastrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to numrecs, that's the last record/key.
	 */
	cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
	return 1;
}

/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 */
void
xfs_btree_offsets(
	uint32_t	fields,		/* bitmask of fields */
	const short	*offsets,	/* table of field offsets */
	int		nbits,		/* number of bits to inspect */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	int		i;		/* current bit number */
	uint32_t	imask;		/* mask for current bit number */

	ASSERT(fields != 0);
	/*
	 * Find the lowest bit, so the first byte offset.
	 */
	for (i = 0, imask = 1u; ; i++, imask <<= 1) {
		if (imask & fields) {
			*first = offsets[i];
			break;
		}
	}
	/*
	 * Find the highest bit, so the last byte offset.
	 */
	for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
		if (imask & fields) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}
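
/*
 * A worked example (field offsets restated from the short-form block layout,
 * table names as defined in xfs_btree_log_block later in this file): logging
 * bb_level and bb_numrecs together passes fields = XFS_BB_LEVEL |
 * XFS_BB_NUMRECS.  The lowest set bit selects offsets[1] (bb_level, byte 4)
 * for *first, and the highest set bit selects offsets[2 + 1] - 1 (the byte
 * before bb_leftsib, byte 7) for *last, so bytes 4-7 get logged:
 *
 *	int	first, last;
 *
 *	xfs_btree_offsets(XFS_BB_LEVEL | XFS_BB_NUMRECS, soffsets,
 *			XFS_BB_NUM_BITS, &first, &last);
 *	now first == 4 and last == 7
 */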

STATIC int
xfs_btree_readahead_fsblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_memblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfbno_t			left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfbno_t			right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_agblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(pag, left), mp->m_bsize,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(pag, right), mp->m_bsize,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

/*
 * Read-ahead btree blocks, at the given level.
 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 */
STATIC int
xfs_btree_readahead(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	int			lr)	/* left/right bits */
{
	struct xfs_btree_block	*block;

	/*
	 * No readahead needed if we are at the root level and the
	 * btree root is stored in the inode.
	 */
	if (xfs_btree_at_iroot(cur, lev))
		return 0;

	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
		return 0;

	cur->bc_levels[lev].ra |= lr;
	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		return xfs_btree_readahead_agblock(cur, lr, block);
	case XFS_BTREE_TYPE_INODE:
		return xfs_btree_readahead_fsblock(cur, lr, block);
	case XFS_BTREE_TYPE_MEM:
		return xfs_btree_readahead_memblock(cur, lr, block);
	default:
		ASSERT(0);
		return 0;
	}
}

STATIC int
xfs_btree_ptr_to_daddr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	xfs_daddr_t			*daddr)
{
	int				error;

	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
	if (error)
		return error;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		*daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
				be32_to_cpu(ptr->s));
		break;
	case XFS_BTREE_TYPE_INODE:
		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
		break;
	case XFS_BTREE_TYPE_MEM:
		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
		break;
	}
	return 0;
}

/*
 * Readahead @count btree blocks at the given @ptr location.
 *
 * We don't need to care about long or short form btrees here as we have a
 * method of converting the ptr directly to a daddr available to us.
 */
STATIC void
xfs_btree_readahead_ptr(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	xfs_extlen_t		count)
{
	xfs_daddr_t		daddr;

	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
		return;
	xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
			xfs_btree_bbsize(cur) * count,
			cur->bc_ops->buf_ops);
}
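
/*
 * Illustrative sketch (an assumption, not code from this file): cursor
 * movement primes sibling readahead before it walks, e.g. incrementing at a
 * level kicks off right-side readahead first:
 *
 *	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
 *
 * The (ra | lr) == ra test in xfs_btree_readahead above returns early when
 * every requested direction has already been issued for the current buffer,
 * so each sibling is only read ahead once per block.
 */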

/*
 * Set the buffer for level "lev" in the cursor to bp, releasing
 * any previous buffer.
 */
STATIC void
xfs_btree_setbuf(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	struct xfs_buf		*bp)	/* new buffer to set */
{
	struct xfs_btree_block	*b;	/* btree block */

	if (cur->bc_levels[lev].bp)
		xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
	cur->bc_levels[lev].bp = bp;
	cur->bc_levels[lev].ra = 0;

	b = XFS_BUF_TO_BLOCK(bp);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	} else {
		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	}
}

bool
xfs_btree_ptr_is_null(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr->l == cpu_to_be64(NULLFSBLOCK);
	else
		return ptr->s == cpu_to_be32(NULLAGBLOCK);
}

void
xfs_btree_set_ptr_null(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(NULLFSBLOCK);
	else
		ptr->s = cpu_to_be32(NULLAGBLOCK);
}

static inline bool
xfs_btree_ptrs_equal(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr1,
	union xfs_btree_ptr	*ptr2)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr1->l == ptr2->l;
	return ptr1->s == ptr2->s;
}

/*
 * Get/set/init sibling pointers
 */
void
xfs_btree_get_sibling(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_ptr	*ptr,
	int			lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->l = block->bb_u.l.bb_rightsib;
		else
			ptr->l = block->bb_u.l.bb_leftsib;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->s = block->bb_u.s.bb_rightsib;
		else
			ptr->s = block->bb_u.s.bb_leftsib;
	}
}

void
xfs_btree_set_sibling(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_block		*block,
	const union xfs_btree_ptr	*ptr,
	int				lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.l.bb_rightsib = ptr->l;
		else
			block->bb_u.l.bb_leftsib = ptr->l;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.s.bb_rightsib = ptr->s;
		else
			block->bb_u.s.bb_leftsib = ptr->s;
	}
}

static void
__xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*buf,
	const struct xfs_btree_ops *ops,
	xfs_daddr_t		blkno,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	bool			crc = xfs_has_crc(mp);
	__u32			magic = xfs_btree_magic(mp, ops);

	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (crc) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (crc) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			/* owner is a 32 bit value on short blocks */
			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}

void
xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*block,
	const struct xfs_btree_ops *ops,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
			numrecs, owner);
}

void
xfs_btree_init_buf(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	const struct xfs_btree_ops *ops,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
			xfs_buf_daddr(bp), level, numrecs, owner);
	bp->b_ops = ops->buf_ops;
}

static inline __u64
xfs_btree_owner(
	struct xfs_btree_cur	*cur)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return cur->bc_mem.xfbtree->owner;
	case XFS_BTREE_TYPE_INODE:
		return cur->bc_ino.ip->i_ino;
	case XFS_BTREE_TYPE_AG:
		return cur->bc_group->xg_gno;
	default:
		ASSERT(0);
		return 0;
	}
}

void
xfs_btree_init_block_cur(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	int			numrecs)
{
	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
			xfs_btree_owner(cur));
}

STATIC void
xfs_btree_buf_to_ptr(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	union xfs_btree_ptr	*ptr)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
					xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_INODE:
		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
					xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_MEM:
		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
		break;
	}
}

static inline void
xfs_btree_set_refs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}

int
xfs_btree_get_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	xfs_daddr_t			d;
	int				error;

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), 0, bpp);
	if (error)
		return error;

	(*bpp)->b_ops = cur->bc_ops->buf_ops;
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}
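
/*
 * Illustrative sketch (an assumption, not code from this file) of the split
 * between the two block getters: xfs_btree_get_buf_block grabs an
 * uninitialized buffer for a freshly allocated block, which the caller then
 * initializes, while xfs_btree_read_buf_block below reads and verifies an
 * existing block from disk:
 *
 *	error = xfs_btree_get_buf_block(cur, &ptr, &new_block, &new_bp);
 *	if (error)
 *		return error;
 *	xfs_btree_init_block_cur(cur, new_bp, level, 0);
 */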

/*
 * Read in the buffer at the given ptr and return the buffer and
 * the block pointer within the buffer.
 */
int
xfs_btree_read_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				flags,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;
	int			error;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), flags, bpp,
			cur->bc_ops->buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_btree_mark_sick(cur);
	if (error)
		return error;

	xfs_btree_set_refs(cur, *bpp);
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}

/*
 * Copy keys from one btree block to another.
 */
void
xfs_btree_copy_keys(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*dst_key,
	const union xfs_btree_key	*src_key,
	int				numkeys)
{
	ASSERT(numkeys >= 0);
	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
}

/*
 * Copy records from one btree block to another.
 */
STATIC void
xfs_btree_copy_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*dst_rec,
	union xfs_btree_rec	*src_rec,
	int			numrecs)
{
	ASSERT(numrecs >= 0);
	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Copy block pointers from one btree block to another.
 */
void
xfs_btree_copy_ptrs(
	struct xfs_btree_cur		*cur,
	union xfs_btree_ptr		*dst_ptr,
	const union xfs_btree_ptr	*src_ptr,
	int				numptrs)
{
	ASSERT(numptrs >= 0);
	memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}

/*
 * Shift keys one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_keys(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key,
	int			dir,
	int			numkeys)
{
	char			*dst_key;

	ASSERT(numkeys >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
}

/*
 * Shift records one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec,
	int			dir,
	int			numrecs)
{
	char			*dst_rec;

	ASSERT(numrecs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Shift block pointers one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_ptrs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			dir,
	int			numptrs)
{
	char			*dst_ptr;

	ASSERT(numptrs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
	memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}

/*
 * Log key values from the btree block.
 */
STATIC void
xfs_btree_log_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{
	if (bp) {
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_key_offset(cur, first),
				xfs_btree_key_offset(cur, last + 1) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}

/*
 * Log record values from the btree block.
 */
void
xfs_btree_log_recs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{
	if (!bp) {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
		return;
	}

	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(cur->bc_tp, bp,
			xfs_btree_rec_offset(cur, first),
			xfs_btree_rec_offset(cur, last + 1) - 1);
}

/*
 * Log block pointer fields from a btree block (nonleaf).
 */
STATIC void
xfs_btree_log_ptrs(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			first,	/* index of first pointer to log */
	int			last)	/* index of last pointer to log */
{
	if (bp) {
		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
		int			level = xfs_btree_get_level(block);

		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_ptr_offset(cur, first, level),
				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}
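
/*
 * A worked sketch of the byte-range arithmetic above (an illustration, not a
 * new API): logging the inclusive record range [first, last] covers
 * everything from the start of record "first" up to the byte before where
 * record "last + 1" would begin:
 *
 *	xfs_btree_log_recs(cur, bp, 1, numrecs);
 *	logs bytes [xfs_btree_rec_offset(cur, 1),
 *		    xfs_btree_rec_offset(cur, numrecs + 1) - 1]
 */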

/*
 * Log fields from a btree block header.
 */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	uint32_t		fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
		XFS_BTREE_SBLOCK_CRC_LEN
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
		XFS_BTREE_LBLOCK_CRC_LEN
	};

	if (bp) {
		int nbits;

		if (xfs_has_crc(cur->bc_mp)) {
			/*
			 * We don't log the CRC when updating a btree
			 * block but instead recreate it during log
			 * recovery.  As the log buffers have checksums
			 * of their own this is safe and avoids logging a crc
			 * update in a lot of places.
			 */
			if (fields == XFS_BB_ALL_BITS)
				fields = XFS_BB_ALL_BITS_CRC;
			nbits = XFS_BB_NUM_BITS_CRC;
		} else {
			nbits = XFS_BB_NUM_BITS;
		}
		xfs_btree_offsets(fields,
				(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
					loffsets : soffsets,
				nbits, &first, &last);
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}

/*
 * Increment cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_increment(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the right at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* We're done if we remain in the block after the increment. */
	if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
		goto out1;

	/* Fail if we just went off the right edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, increment);

	/*
	 * March up the tree incrementing pointers.
	 * Stop when we don't go off the right edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, lev, bp);
		if (error)
			goto error0;
#endif

		if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
			break;

		/* Read-ahead the right block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;

		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = 1;
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}
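
/*
 * Illustrative sketch (an assumption, not code from this file): the classic
 * full-tree record walk pairs a lookup with an increment loop, stopping when
 * the cursor runs off the right edge of the tree (*stat == 0):
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &stat);
 *	while (!error && stat) {
 *		... read the record, e.g. xfs_btree_get_rec(), process it ...
 *		error = xfs_btree_increment(cur, 0, &stat);
 *	}
 */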

/*
 * Decrement cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_decrement(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;
	union xfs_btree_ptr	ptr;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the left at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

	/* We're done if we remain in the block after the decrement. */
	if (--cur->bc_levels[level].ptr > 0)
		goto out1;

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we just went off the left edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, decrement);

	/*
	 * March up the tree decrementing pointers.
	 * Stop when we don't go off the left edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_levels[lev].ptr > 0)
			break;
		/* Read-ahead the left block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;
		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}

/*
 * Check the btree block owner now that we have the context to know who the
 * real owner is.
 */
static inline xfs_failaddr_t
xfs_btree_check_block_owner(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block)
{
	__u64			owner;

	if (!xfs_has_crc(cur->bc_mp) ||
	    (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
		return NULL;

	owner = xfs_btree_owner(cur);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
			return __this_address;
	} else {
		if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
			return __this_address;
	}

	return NULL;
}

int
xfs_btree_lookup_get_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level in the btree */
	const union xfs_btree_ptr *pp,	/* ptr to btree block */
	struct xfs_btree_block	**blkp)	/* return btree block */
{
	struct xfs_buf		*bp;	/* buffer pointer for btree block */
	xfs_daddr_t		daddr;
	int			error = 0;

	/* special case the root block if in an inode */
	if (xfs_btree_at_iroot(cur, level)) {
		*blkp = xfs_btree_get_iroot(cur);
		return 0;
	}

	/*
	 * If the old buffer at this level is for the disk address we are
	 * looking for, re-use it.
	 *
	 * Otherwise throw it away and get a new one.
	 */
	bp = cur->bc_levels[level].bp;
	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
	if (error)
		return error;
	if (bp && xfs_buf_daddr(bp) == daddr) {
		*blkp = XFS_BUF_TO_BLOCK(bp);
		return 0;
	}

	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
	if (error)
		return error;

	/* Check the inode owner since the verifiers don't. */
	if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
		goto out_bad;

	/* Did we get the level we were looking for? */
	if (be16_to_cpu((*blkp)->bb_level) != level)
		goto out_bad;

	/* Check that internal nodes have at least one record. */
	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
		goto out_bad;

	xfs_btree_setbuf(cur, level, bp);
	return 0;

out_bad:
	*blkp = NULL;
	xfs_buf_mark_corrupt(bp);
	xfs_trans_brelse(cur->bc_tp, bp);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}

/*
 * Get current search key.  For level 0 we don't actually have a key
 * structure so we make one up from the record.  For all other levels
 * we just return the right key.
 */
STATIC union xfs_btree_key *
xfs_lookup_get_search_key(
	struct xfs_btree_cur	*cur,
	int			level,
	int			keyno,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*kp)
{
	if (level == 0) {
		cur->bc_ops->init_key_from_rec(kp,
				xfs_btree_rec_addr(cur, keyno, block));
		return kp;
	}

	return xfs_btree_key_addr(cur, keyno, block);
}

/*
 * Initialize a pointer to the root block.
 */
void
xfs_btree_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		/*
		 * Inode-rooted btrees call xfs_btree_get_iroot to find the
		 * root in xfs_btree_lookup_get_block and don't need a pointer
		 * here.
		 */
		ptr->l = 0;
	} else if (cur->bc_flags & XFS_BTREE_STAGING) {
		ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
	} else {
		cur->bc_ops->init_ptr_from_cur(cur, ptr);
	}
}

/*
 * Lookup the record.  The cursor is made to point to it, based on dir.
 * stat is set to 0 if we can't find any such record, 1 for success.
 */
int					/* error */
xfs_btree_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_lookup_t		dir,	/* <=, ==, or >= */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* current btree block */
	int64_t			diff;	/* difference for the current key */
	int			error;	/* error return value */
	int			keyno;	/* current key number */
	int			level;	/* level in the btree */
	union xfs_btree_ptr	*pp;	/* ptr to btree block */
	union xfs_btree_ptr	ptr;	/* ptr to btree block */

	XFS_BTREE_STATS_INC(cur, lookup);

	/* No such thing as a zero-level tree. */
	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	block = NULL;
	keyno = 0;

	/* initialise start pointer from cursor */
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	pp = &ptr;

	/*
	 * Iterate over each level in the btree, starting at the root.
	 * For each level above the leaves, find the key we need, based
	 * on the lookup record, then follow the corresponding block
	 * pointer down to the next level.
	 */
	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
		/* Get the block we need to do the lookup on. */
		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
		if (error)
			goto error0;

		if (diff == 0) {
			/*
			 * If we already had a key match at a higher level, we
			 * know we need to use the first entry in this block.
			 */
			keyno = 1;
		} else {
			/* Otherwise search this block. Do a binary search. */

			int	high;	/* high entry number */
			int	low;	/* low entry number */

			/* Set low and high entry numbers, 1-based. */
			low = 1;
			high = xfs_btree_get_numrecs(block);
			if (!high) {
				/* Block is empty, must be an empty leaf. */
				if (level != 0 || cur->bc_nlevels != 1) {
					XFS_CORRUPTION_ERROR(__func__,
							XFS_ERRLEVEL_LOW,
							cur->bc_mp, block,
							sizeof(*block));
					xfs_btree_mark_sick(cur);
					return -EFSCORRUPTED;
				}

				cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
				*stat = 0;
				return 0;
			}

			/* Binary search the block. */
*/ 2054 while (low <= high) { 2055 union xfs_btree_key key; 2056 union xfs_btree_key *kp; 2057 2058 XFS_BTREE_STATS_INC(cur, compare); 2059 2060 /* keyno is average of low and high. */ 2061 keyno = (low + high) >> 1; 2062 2063 /* Get current search key */ 2064 kp = xfs_lookup_get_search_key(cur, level, 2065 keyno, block, &key); 2066 2067 /* 2068 * Compute difference to get next direction: 2069 * - less than, move right 2070 * - greater than, move left 2071 * - equal, we're done 2072 */ 2073 diff = cur->bc_ops->key_diff(cur, kp); 2074 if (diff < 0) 2075 low = keyno + 1; 2076 else if (diff > 0) 2077 high = keyno - 1; 2078 else 2079 break; 2080 } 2081 } 2082 2083 /* 2084 * If there are more levels, set up for the next level 2085 * by getting the block number and filling in the cursor. 2086 */ 2087 if (level > 0) { 2088 /* 2089 * If we moved left, need the previous key number, 2090 * unless there isn't one. 2091 */ 2092 if (diff > 0 && --keyno < 1) 2093 keyno = 1; 2094 pp = xfs_btree_ptr_addr(cur, keyno, block); 2095 2096 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 2097 if (error) 2098 goto error0; 2099 2100 cur->bc_levels[level].ptr = keyno; 2101 } 2102 } 2103 2104 /* Done with the search. See if we need to adjust the results. */ 2105 if (dir != XFS_LOOKUP_LE && diff < 0) { 2106 keyno++; 2107 /* 2108 * If ge search and we went off the end of the block, but it's 2109 * not the last block, we're in the wrong block. 2110 */ 2111 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2112 if (dir == XFS_LOOKUP_GE && 2113 keyno > xfs_btree_get_numrecs(block) && 2114 !xfs_btree_ptr_is_null(cur, &ptr)) { 2115 int i; 2116 2117 cur->bc_levels[0].ptr = keyno; 2118 error = xfs_btree_increment(cur, 0, &i); 2119 if (error) 2120 goto error0; 2121 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 2122 xfs_btree_mark_sick(cur); 2123 return -EFSCORRUPTED; 2124 } 2125 *stat = 1; 2126 return 0; 2127 } 2128 } else if (dir == XFS_LOOKUP_LE && diff > 0) 2129 keyno--; 2130 cur->bc_levels[0].ptr = keyno; 2131 2132 /* Return if we succeeded or not. */ 2133 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2134 *stat = 0; 2135 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2136 *stat = 1; 2137 else 2138 *stat = 0; 2139 return 0; 2140 2141 error0: 2142 return error; 2143 } 2144 2145 /* Find the high key storage area from a regular key. 
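 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): a
 * common caller pattern for xfs_btree_lookup() as implemented above is a
 * GE lookup followed by xfs_btree_increment() to walk records in order.
 * The wrapper name and the elided record handling are hypothetical.
 */
static int
example_walk_records(
	struct xfs_btree_cur	*cur)
{
	int			stat = 0;
	int			error;

	/* Position the cursor at the first record >= cur->bc_rec. */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &stat);
	while (!error && stat == 1) {
		/* ... examine the record under the cursor here ... */
		error = xfs_btree_increment(cur, 0, &stat);
	}
	return error;
}

/*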
*/ 2146 union xfs_btree_key * 2147 xfs_btree_high_key_from_key( 2148 struct xfs_btree_cur *cur, 2149 union xfs_btree_key *key) 2150 { 2151 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2152 return (union xfs_btree_key *)((char *)key + 2153 (cur->bc_ops->key_len / 2)); 2154 } 2155 2156 /* Determine the low (and high if overlapped) keys of a leaf block */ 2157 STATIC void 2158 xfs_btree_get_leaf_keys( 2159 struct xfs_btree_cur *cur, 2160 struct xfs_btree_block *block, 2161 union xfs_btree_key *key) 2162 { 2163 union xfs_btree_key max_hkey; 2164 union xfs_btree_key hkey; 2165 union xfs_btree_rec *rec; 2166 union xfs_btree_key *high; 2167 int n; 2168 2169 rec = xfs_btree_rec_addr(cur, 1, block); 2170 cur->bc_ops->init_key_from_rec(key, rec); 2171 2172 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2173 2174 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2175 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2176 rec = xfs_btree_rec_addr(cur, n, block); 2177 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2178 if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) 2179 max_hkey = hkey; 2180 } 2181 2182 high = xfs_btree_high_key_from_key(cur, key); 2183 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2184 } 2185 } 2186 2187 /* Determine the low (and high if overlapped) keys of a node block */ 2188 STATIC void 2189 xfs_btree_get_node_keys( 2190 struct xfs_btree_cur *cur, 2191 struct xfs_btree_block *block, 2192 union xfs_btree_key *key) 2193 { 2194 union xfs_btree_key *hkey; 2195 union xfs_btree_key *max_hkey; 2196 union xfs_btree_key *high; 2197 int n; 2198 2199 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2200 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2201 cur->bc_ops->key_len / 2); 2202 2203 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2204 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2205 hkey = xfs_btree_high_key_addr(cur, n, block); 2206 if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) 2207 max_hkey = hkey; 2208 } 2209 2210 high = xfs_btree_high_key_from_key(cur, key); 2211 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2212 } else { 2213 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2214 cur->bc_ops->key_len); 2215 } 2216 } 2217 2218 /* Derive the keys for any btree block. */ 2219 void 2220 xfs_btree_get_keys( 2221 struct xfs_btree_cur *cur, 2222 struct xfs_btree_block *block, 2223 union xfs_btree_key *key) 2224 { 2225 if (be16_to_cpu(block->bb_level) == 0) 2226 xfs_btree_get_leaf_keys(cur, block, key); 2227 else 2228 xfs_btree_get_node_keys(cur, block, key); 2229 } 2230 2231 /* 2232 * Decide if we need to update the parent keys of a btree block. For 2233 * a standard btree this is only necessary if we're updating the first 2234 * record/key. For an overlapping btree, we must always update the 2235 * keys because the highest key can be in any of the records or keys 2236 * in the block. 2237 */ 2238 static inline bool 2239 xfs_btree_needs_key_update( 2240 struct xfs_btree_cur *cur, 2241 int ptr) 2242 { 2243 return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; 2244 } 2245 2246 /* 2247 * Update the low and high parent keys of the given level, progressing 2248 * towards the root. If force_all is false, stop if the keys for a given 2249 * level do not need updating. 
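 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): in an
 * overlapping btree the low key of a block is the first record's start,
 * but the high key is the maximum end among *all* records, which is why
 * xfs_btree_get_leaf_keys() above must scan every record. A plain-integer
 * analogue with hypothetical interval records, 1-based like the code:
 */
struct example_ival {
	int	start;
	int	end;
};

static void
example_leaf_keys(
	const struct example_ival	*recs,	/* recs[1..nr], sorted */
	int				nr,
	int				*low,
	int				*high)
{
	int				n;

	*low = recs[1].start;		/* low key: first record */
	*high = recs[1].end;		/* high key: max of all ends */
	for (n = 2; n <= nr; n++)
		if (recs[n].end > *high)
			*high = recs[n].end;
}

/*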
2250 */ 2251 STATIC int 2252 __xfs_btree_updkeys( 2253 struct xfs_btree_cur *cur, 2254 int level, 2255 struct xfs_btree_block *block, 2256 struct xfs_buf *bp0, 2257 bool force_all) 2258 { 2259 union xfs_btree_key key; /* keys from current level */ 2260 union xfs_btree_key *lkey; /* keys from the next level up */ 2261 union xfs_btree_key *hkey; 2262 union xfs_btree_key *nlkey; /* keys from the next level up */ 2263 union xfs_btree_key *nhkey; 2264 struct xfs_buf *bp; 2265 int ptr; 2266 2267 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2268 2269 /* Exit if there aren't any parent levels to update. */ 2270 if (level + 1 >= cur->bc_nlevels) 2271 return 0; 2272 2273 trace_xfs_btree_updkeys(cur, level, bp0); 2274 2275 lkey = &key; 2276 hkey = xfs_btree_high_key_from_key(cur, lkey); 2277 xfs_btree_get_keys(cur, block, lkey); 2278 for (level++; level < cur->bc_nlevels; level++) { 2279 #ifdef DEBUG 2280 int error; 2281 #endif 2282 block = xfs_btree_get_block(cur, level, &bp); 2283 trace_xfs_btree_updkeys(cur, level, bp); 2284 #ifdef DEBUG 2285 error = xfs_btree_check_block(cur, block, level, bp); 2286 if (error) 2287 return error; 2288 #endif 2289 ptr = cur->bc_levels[level].ptr; 2290 nlkey = xfs_btree_key_addr(cur, ptr, block); 2291 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2292 if (!force_all && 2293 xfs_btree_keycmp_eq(cur, nlkey, lkey) && 2294 xfs_btree_keycmp_eq(cur, nhkey, hkey)) 2295 break; 2296 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2297 xfs_btree_log_keys(cur, bp, ptr, ptr); 2298 if (level + 1 >= cur->bc_nlevels) 2299 break; 2300 xfs_btree_get_node_keys(cur, block, lkey); 2301 } 2302 2303 return 0; 2304 } 2305 2306 /* Update all the keys from some level in cursor back to the root. */ 2307 STATIC int 2308 xfs_btree_updkeys_force( 2309 struct xfs_btree_cur *cur, 2310 int level) 2311 { 2312 struct xfs_buf *bp; 2313 struct xfs_btree_block *block; 2314 2315 block = xfs_btree_get_block(cur, level, &bp); 2316 return __xfs_btree_updkeys(cur, level, block, bp, true); 2317 } 2318 2319 /* 2320 * Update the parent keys of the given level, progressing towards the root. 2321 */ 2322 STATIC int 2323 xfs_btree_update_keys( 2324 struct xfs_btree_cur *cur, 2325 int level) 2326 { 2327 struct xfs_btree_block *block; 2328 struct xfs_buf *bp; 2329 union xfs_btree_key *kp; 2330 union xfs_btree_key key; 2331 int ptr; 2332 2333 ASSERT(level >= 0); 2334 2335 block = xfs_btree_get_block(cur, level, &bp); 2336 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) 2337 return __xfs_btree_updkeys(cur, level, block, bp, false); 2338 2339 /* 2340 * Go up the tree from this level toward the root. 2341 * At each level, update the key value to the value input. 2342 * Stop when we reach a level where the cursor isn't pointing 2343 * at the first entry in the block. 2344 */ 2345 xfs_btree_get_keys(cur, block, &key); 2346 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2347 #ifdef DEBUG 2348 int error; 2349 #endif 2350 block = xfs_btree_get_block(cur, level, &bp); 2351 #ifdef DEBUG 2352 error = xfs_btree_check_block(cur, block, level, bp); 2353 if (error) 2354 return error; 2355 #endif 2356 ptr = cur->bc_levels[level].ptr; 2357 kp = xfs_btree_key_addr(cur, ptr, block); 2358 xfs_btree_copy_keys(cur, kp, &key, 1); 2359 xfs_btree_log_keys(cur, bp, ptr, ptr); 2360 } 2361 2362 return 0; 2363 } 2364 2365 /* 2366 * Update the record referred to by cur to the value in the 2367 * given record. This either works (return 0) or gets an 2368 * EFSCORRUPTED error. 
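 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c):
 * xfs_btree_update() below assumes the cursor already points at the
 * record, so the usual caller pattern is an EQ lookup first. The wrapper
 * and its error policy are hypothetical.
 */
static int
example_overwrite_rec(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*new_rec)
{
	int			stat = 0;
	int			error;

	/* cur->bc_rec must already describe the record to find. */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
	if (error)
		return error;
	if (stat != 1)
		return -ENOENT;		/* sketch only: no such record */

	return xfs_btree_update(cur, new_rec);
}

/*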
2369 */ 2370 int 2371 xfs_btree_update( 2372 struct xfs_btree_cur *cur, 2373 union xfs_btree_rec *rec) 2374 { 2375 struct xfs_btree_block *block; 2376 struct xfs_buf *bp; 2377 int error; 2378 int ptr; 2379 union xfs_btree_rec *rp; 2380 2381 /* Pick up the current block. */ 2382 block = xfs_btree_get_block(cur, 0, &bp); 2383 2384 #ifdef DEBUG 2385 error = xfs_btree_check_block(cur, block, 0, bp); 2386 if (error) 2387 goto error0; 2388 #endif 2389 /* Get the address of the rec to be updated. */ 2390 ptr = cur->bc_levels[0].ptr; 2391 rp = xfs_btree_rec_addr(cur, ptr, block); 2392 2393 /* Fill in the new contents and log them. */ 2394 xfs_btree_copy_recs(cur, rp, rec, 1); 2395 xfs_btree_log_recs(cur, bp, ptr, ptr); 2396 2397 /* Pass new key value up to our parent. */ 2398 if (xfs_btree_needs_key_update(cur, ptr)) { 2399 error = xfs_btree_update_keys(cur, 0); 2400 if (error) 2401 goto error0; 2402 } 2403 2404 return 0; 2405 2406 error0: 2407 return error; 2408 } 2409 2410 /* 2411 * Move 1 record left from cur/level if possible. 2412 * Update cur to reflect the new path. 2413 */ 2414 STATIC int /* error */ 2415 xfs_btree_lshift( 2416 struct xfs_btree_cur *cur, 2417 int level, 2418 int *stat) /* success/failure */ 2419 { 2420 struct xfs_buf *lbp; /* left buffer pointer */ 2421 struct xfs_btree_block *left; /* left btree block */ 2422 int lrecs; /* left record count */ 2423 struct xfs_buf *rbp; /* right buffer pointer */ 2424 struct xfs_btree_block *right; /* right btree block */ 2425 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2426 int rrecs; /* right record count */ 2427 union xfs_btree_ptr lptr; /* left btree pointer */ 2428 union xfs_btree_key *rkp = NULL; /* right btree key */ 2429 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2430 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2431 int error; /* error return value */ 2432 int i; 2433 2434 if (xfs_btree_at_iroot(cur, level)) 2435 goto out0; 2436 2437 /* Set up variables for this block as "right". */ 2438 right = xfs_btree_get_block(cur, level, &rbp); 2439 2440 #ifdef DEBUG 2441 error = xfs_btree_check_block(cur, right, level, rbp); 2442 if (error) 2443 goto error0; 2444 #endif 2445 2446 /* If we've got no left sibling then we can't shift an entry left. */ 2447 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2448 if (xfs_btree_ptr_is_null(cur, &lptr)) 2449 goto out0; 2450 2451 /* 2452 * If the cursor entry is the one that would be moved, don't 2453 * do it... it's too complicated. 2454 */ 2455 if (cur->bc_levels[level].ptr <= 1) 2456 goto out0; 2457 2458 /* Set up the left neighbor as "left". */ 2459 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2460 if (error) 2461 goto error0; 2462 2463 /* If it's full, it can't take another entry. */ 2464 lrecs = xfs_btree_get_numrecs(left); 2465 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2466 goto out0; 2467 2468 rrecs = xfs_btree_get_numrecs(right); 2469 2470 /* 2471 * We add one entry to the left side and remove one for the right side. 2472 * Account for it here, the changes will be updated on disk and logged 2473 * later. 2474 */ 2475 lrecs++; 2476 rrecs--; 2477 2478 XFS_BTREE_STATS_INC(cur, lshift); 2479 XFS_BTREE_STATS_ADD(cur, moves, 1); 2480 2481 /* 2482 * If non-leaf, copy a key and a ptr to the left block. 2483 * Log the changes to the left block. 2484 */ 2485 if (level > 0) { 2486 /* It's a non-leaf. Move keys and pointers. 
*/ 2487 union xfs_btree_key *lkp; /* left btree key */ 2488 union xfs_btree_ptr *lpp; /* left address pointer */ 2489 2490 lkp = xfs_btree_key_addr(cur, lrecs, left); 2491 rkp = xfs_btree_key_addr(cur, 1, right); 2492 2493 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2494 rpp = xfs_btree_ptr_addr(cur, 1, right); 2495 2496 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2497 if (error) 2498 goto error0; 2499 2500 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2501 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2502 2503 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2504 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2505 2506 ASSERT(cur->bc_ops->keys_inorder(cur, 2507 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2508 } else { 2509 /* It's a leaf. Move records. */ 2510 union xfs_btree_rec *lrp; /* left record pointer */ 2511 2512 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2513 rrp = xfs_btree_rec_addr(cur, 1, right); 2514 2515 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2516 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2517 2518 ASSERT(cur->bc_ops->recs_inorder(cur, 2519 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2520 } 2521 2522 xfs_btree_set_numrecs(left, lrecs); 2523 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2524 2525 xfs_btree_set_numrecs(right, rrecs); 2526 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2527 2528 /* 2529 * Slide the contents of right down one entry. 2530 */ 2531 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2532 if (level > 0) { 2533 /* It's a nonleaf. operate on keys and ptrs */ 2534 for (i = 0; i < rrecs; i++) { 2535 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2536 if (error) 2537 goto error0; 2538 } 2539 2540 xfs_btree_shift_keys(cur, 2541 xfs_btree_key_addr(cur, 2, right), 2542 -1, rrecs); 2543 xfs_btree_shift_ptrs(cur, 2544 xfs_btree_ptr_addr(cur, 2, right), 2545 -1, rrecs); 2546 2547 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2548 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2549 } else { 2550 /* It's a leaf. operate on records */ 2551 xfs_btree_shift_recs(cur, 2552 xfs_btree_rec_addr(cur, 2, right), 2553 -1, rrecs); 2554 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2555 } 2556 2557 /* 2558 * Using a temporary cursor, update the parent key values of the 2559 * block on the left. 2560 */ 2561 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2562 error = xfs_btree_dup_cursor(cur, &tcur); 2563 if (error) 2564 goto error0; 2565 i = xfs_btree_firstrec(tcur, level); 2566 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2567 xfs_btree_mark_sick(cur); 2568 error = -EFSCORRUPTED; 2569 goto error0; 2570 } 2571 2572 error = xfs_btree_decrement(tcur, level, &i); 2573 if (error) 2574 goto error1; 2575 2576 /* Update the parent high keys of the left block, if needed. */ 2577 error = xfs_btree_update_keys(tcur, level); 2578 if (error) 2579 goto error1; 2580 2581 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2582 } 2583 2584 /* Update the parent keys of the right block. */ 2585 error = xfs_btree_update_keys(cur, level); 2586 if (error) 2587 goto error0; 2588 2589 /* Slide the cursor value left one. */ 2590 cur->bc_levels[level].ptr--; 2591 2592 *stat = 1; 2593 return 0; 2594 2595 out0: 2596 *stat = 0; 2597 return 0; 2598 2599 error0: 2600 return error; 2601 2602 error1: 2603 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2604 return error; 2605 } 2606 2607 /* 2608 * Move 1 record right from cur/level if possible. 2609 * Update cur to reflect the new path. 
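 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): the
 * lshift code above uses a duplicated cursor to touch a sibling block
 * without disturbing the caller's position. The shape of that pattern,
 * with hypothetical naming:
 */
static int
example_update_left_sibling_keys(
	struct xfs_btree_cur	*cur,
	int			level)
{
	struct xfs_btree_cur	*tcur;
	int			stat = 0;
	int			error;

	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		return error;

	/* Step the duplicate left; the original cursor is untouched. */
	error = xfs_btree_decrement(tcur, level, &stat);
	if (!error && stat == 1)
		error = xfs_btree_update_keys(tcur, level);

	xfs_btree_del_cursor(tcur, error ? XFS_BTREE_ERROR :
					   XFS_BTREE_NOERROR);
	return error;
}

/*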
2610 */ 2611 STATIC int /* error */ 2612 xfs_btree_rshift( 2613 struct xfs_btree_cur *cur, 2614 int level, 2615 int *stat) /* success/failure */ 2616 { 2617 struct xfs_buf *lbp; /* left buffer pointer */ 2618 struct xfs_btree_block *left; /* left btree block */ 2619 struct xfs_buf *rbp; /* right buffer pointer */ 2620 struct xfs_btree_block *right; /* right btree block */ 2621 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2622 union xfs_btree_ptr rptr; /* right block pointer */ 2623 union xfs_btree_key *rkp; /* right btree key */ 2624 int rrecs; /* right record count */ 2625 int lrecs; /* left record count */ 2626 int error; /* error return value */ 2627 int i; /* loop counter */ 2628 2629 if (xfs_btree_at_iroot(cur, level)) 2630 goto out0; 2631 2632 /* Set up variables for this block as "left". */ 2633 left = xfs_btree_get_block(cur, level, &lbp); 2634 2635 #ifdef DEBUG 2636 error = xfs_btree_check_block(cur, left, level, lbp); 2637 if (error) 2638 goto error0; 2639 #endif 2640 2641 /* If we've got no right sibling then we can't shift an entry right. */ 2642 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2643 if (xfs_btree_ptr_is_null(cur, &rptr)) 2644 goto out0; 2645 2646 /* 2647 * If the cursor entry is the one that would be moved, don't 2648 * do it... it's too complicated. 2649 */ 2650 lrecs = xfs_btree_get_numrecs(left); 2651 if (cur->bc_levels[level].ptr >= lrecs) 2652 goto out0; 2653 2654 /* Set up the right neighbor as "right". */ 2655 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2656 if (error) 2657 goto error0; 2658 2659 /* If it's full, it can't take another entry. */ 2660 rrecs = xfs_btree_get_numrecs(right); 2661 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2662 goto out0; 2663 2664 XFS_BTREE_STATS_INC(cur, rshift); 2665 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2666 2667 /* 2668 * Make a hole at the start of the right neighbor block, then 2669 * copy the last left block entry to the hole. 2670 */ 2671 if (level > 0) { 2672 /* It's a nonleaf. make a hole in the keys and ptrs */ 2673 union xfs_btree_key *lkp; 2674 union xfs_btree_ptr *lpp; 2675 union xfs_btree_ptr *rpp; 2676 2677 lkp = xfs_btree_key_addr(cur, lrecs, left); 2678 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2679 rkp = xfs_btree_key_addr(cur, 1, right); 2680 rpp = xfs_btree_ptr_addr(cur, 1, right); 2681 2682 for (i = rrecs - 1; i >= 0; i--) { 2683 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2684 if (error) 2685 goto error0; 2686 } 2687 2688 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2689 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2690 2691 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2692 if (error) 2693 goto error0; 2694 2695 /* Now put the new data in, and log it. */ 2696 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2697 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2698 2699 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2700 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2701 2702 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2703 xfs_btree_key_addr(cur, 2, right))); 2704 } else { 2705 /* It's a leaf. make a hole in the records */ 2706 union xfs_btree_rec *lrp; 2707 union xfs_btree_rec *rrp; 2708 2709 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2710 rrp = xfs_btree_rec_addr(cur, 1, right); 2711 2712 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2713 2714 /* Now put the new data in, and log it. 
*/ 2715 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2716 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2717 } 2718 2719 /* 2720 * Decrement and log left's numrecs, bump and log right's numrecs. 2721 */ 2722 xfs_btree_set_numrecs(left, --lrecs); 2723 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2724 2725 xfs_btree_set_numrecs(right, ++rrecs); 2726 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2727 2728 /* 2729 * Using a temporary cursor, update the parent key values of the 2730 * block on the right. 2731 */ 2732 error = xfs_btree_dup_cursor(cur, &tcur); 2733 if (error) 2734 goto error0; 2735 i = xfs_btree_lastrec(tcur, level); 2736 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2737 xfs_btree_mark_sick(cur); 2738 error = -EFSCORRUPTED; 2739 goto error0; 2740 } 2741 2742 error = xfs_btree_increment(tcur, level, &i); 2743 if (error) 2744 goto error1; 2745 2746 /* Update the parent high keys of the left block, if needed. */ 2747 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2748 error = xfs_btree_update_keys(cur, level); 2749 if (error) 2750 goto error1; 2751 } 2752 2753 /* Update the parent keys of the right block. */ 2754 error = xfs_btree_update_keys(tcur, level); 2755 if (error) 2756 goto error1; 2757 2758 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2759 2760 *stat = 1; 2761 return 0; 2762 2763 out0: 2764 *stat = 0; 2765 return 0; 2766 2767 error0: 2768 return error; 2769 2770 error1: 2771 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2772 return error; 2773 } 2774 2775 static inline int 2776 xfs_btree_alloc_block( 2777 struct xfs_btree_cur *cur, 2778 const union xfs_btree_ptr *hint_block, 2779 union xfs_btree_ptr *new_block, 2780 int *stat) 2781 { 2782 int error; 2783 2784 /* 2785 * Don't allow block allocation for a staging cursor, because staging 2786 * cursors do not support regular btree modifications. 2787 * 2788 * Bulk loading uses a separate callback to obtain new blocks from a 2789 * preallocated list, which prevents ENOSPC failures during loading. 2790 */ 2791 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 2792 ASSERT(0); 2793 return -EFSCORRUPTED; 2794 } 2795 2796 error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); 2797 trace_xfs_btree_alloc_block(cur, new_block, *stat, error); 2798 return error; 2799 } 2800 2801 /* 2802 * Split cur/level block in half. 2803 * Return new block number and the key to its first 2804 * record (to be inserted into parent). 2805 */ 2806 STATIC int /* error */ 2807 __xfs_btree_split( 2808 struct xfs_btree_cur *cur, 2809 int level, 2810 union xfs_btree_ptr *ptrp, 2811 union xfs_btree_key *key, 2812 struct xfs_btree_cur **curp, 2813 int *stat) /* success/failure */ 2814 { 2815 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2816 struct xfs_buf *lbp; /* left buffer pointer */ 2817 struct xfs_btree_block *left; /* left btree block */ 2818 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2819 struct xfs_buf *rbp; /* right buffer pointer */ 2820 struct xfs_btree_block *right; /* right btree block */ 2821 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2822 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2823 struct xfs_btree_block *rrblock; /* right-right btree block */ 2824 int lrecs; 2825 int rrecs; 2826 int src_index; 2827 int error; /* error return value */ 2828 int i; 2829 2830 XFS_BTREE_STATS_INC(cur, split); 2831 2832 /* Set up left block (current one). 
*/ 2833 left = xfs_btree_get_block(cur, level, &lbp); 2834 2835 #ifdef DEBUG 2836 error = xfs_btree_check_block(cur, left, level, lbp); 2837 if (error) 2838 goto error0; 2839 #endif 2840 2841 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2842 2843 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2844 error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); 2845 if (error) 2846 goto error0; 2847 if (*stat == 0) 2848 goto out0; 2849 XFS_BTREE_STATS_INC(cur, alloc); 2850 2851 /* Set up the new block as "right". */ 2852 error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); 2853 if (error) 2854 goto error0; 2855 2856 /* Fill in the btree header for the new right block. */ 2857 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2858 2859 /* 2860 * Split the entries between the old and the new block evenly. If 2861 * there's an odd number of entries now, bias the split so that after 2862 * the pending insert both blocks will hold the same number of entries. 2863 */ 2864 lrecs = xfs_btree_get_numrecs(left); 2865 rrecs = lrecs / 2; 2866 if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) 2867 rrecs++; 2868 src_index = (lrecs - rrecs + 1); 2869 2870 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2871 2872 /* Adjust numrecs for the later get_*_keys() calls. */ 2873 lrecs -= rrecs; 2874 xfs_btree_set_numrecs(left, lrecs); 2875 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2876 2877 /* 2878 * Copy btree block entries from the left block over to the 2879 * new block, the right. Update the right block and log the 2880 * changes. 2881 */ 2882 if (level > 0) { 2883 /* It's a non-leaf. Move keys and pointers. */ 2884 union xfs_btree_key *lkp; /* left btree key */ 2885 union xfs_btree_ptr *lpp; /* left address pointer */ 2886 union xfs_btree_key *rkp; /* right btree key */ 2887 union xfs_btree_ptr *rpp; /* right address pointer */ 2888 2889 lkp = xfs_btree_key_addr(cur, src_index, left); 2890 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2891 rkp = xfs_btree_key_addr(cur, 1, right); 2892 rpp = xfs_btree_ptr_addr(cur, 1, right); 2893 2894 for (i = src_index; i < rrecs; i++) { 2895 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 2896 if (error) 2897 goto error0; 2898 } 2899 2900 /* Copy the keys & pointers to the new block. */ 2901 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2902 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2903 2904 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2905 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2906 2907 /* Stash the keys of the new block for later insertion. */ 2908 xfs_btree_get_node_keys(cur, right, key); 2909 } else { 2910 /* It's a leaf. Move records. */ 2911 union xfs_btree_rec *lrp; /* left record pointer */ 2912 union xfs_btree_rec *rrp; /* right record pointer */ 2913 2914 lrp = xfs_btree_rec_addr(cur, src_index, left); 2915 rrp = xfs_btree_rec_addr(cur, 1, right); 2916 2917 /* Copy records to the new block. */ 2918 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2919 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2920 2921 /* Stash the keys of the new block for later insertion. */ 2922 xfs_btree_get_leaf_keys(cur, right, key); 2923 } 2924 2925 /* 2926 * Find the left block number by looking in the buffer. 2927 * Adjust sibling pointers.
2928 */ 2929 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2930 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2931 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2932 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2933 2934 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2935 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2936 2937 /* 2938 * If there's a block to the new block's right, make that block 2939 * point back to right instead of to left. 2940 */ 2941 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2942 error = xfs_btree_read_buf_block(cur, &rrptr, 2943 0, &rrblock, &rrbp); 2944 if (error) 2945 goto error0; 2946 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2947 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2948 } 2949 2950 /* Update the parent high keys of the left block, if needed. */ 2951 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2952 error = xfs_btree_update_keys(cur, level); 2953 if (error) 2954 goto error0; 2955 } 2956 2957 /* 2958 * If the cursor is really in the right block, move it there. 2959 * If it's just pointing past the last entry in left, then we'll 2960 * insert there, so don't change anything in that case. 2961 */ 2962 if (cur->bc_levels[level].ptr > lrecs + 1) { 2963 xfs_btree_setbuf(cur, level, rbp); 2964 cur->bc_levels[level].ptr -= lrecs; 2965 } 2966 /* 2967 * If there are more levels, we'll need another cursor which refers 2968 * the right block, no matter where this cursor was. 2969 */ 2970 if (level + 1 < cur->bc_nlevels) { 2971 error = xfs_btree_dup_cursor(cur, curp); 2972 if (error) 2973 goto error0; 2974 (*curp)->bc_levels[level + 1].ptr++; 2975 } 2976 *ptrp = rptr; 2977 *stat = 1; 2978 return 0; 2979 out0: 2980 *stat = 0; 2981 return 0; 2982 2983 error0: 2984 return error; 2985 } 2986 2987 #ifdef __KERNEL__ 2988 struct xfs_btree_split_args { 2989 struct xfs_btree_cur *cur; 2990 int level; 2991 union xfs_btree_ptr *ptrp; 2992 union xfs_btree_key *key; 2993 struct xfs_btree_cur **curp; 2994 int *stat; /* success/failure */ 2995 int result; 2996 bool kswapd; /* allocation in kswapd context */ 2997 struct completion *done; 2998 struct work_struct work; 2999 }; 3000 3001 /* 3002 * Stack switching interfaces for allocation 3003 */ 3004 static void 3005 xfs_btree_split_worker( 3006 struct work_struct *work) 3007 { 3008 struct xfs_btree_split_args *args = container_of(work, 3009 struct xfs_btree_split_args, work); 3010 unsigned long pflags; 3011 unsigned long new_pflags = 0; 3012 3013 /* 3014 * we are in a transaction context here, but may also be doing work 3015 * in kswapd context, and hence we may need to inherit that state 3016 * temporarily to ensure that we don't block waiting for memory reclaim 3017 * in any way. 3018 */ 3019 if (args->kswapd) 3020 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 3021 3022 current_set_flags_nested(&pflags, new_pflags); 3023 xfs_trans_set_context(args->cur->bc_tp); 3024 3025 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 3026 args->key, args->curp, args->stat); 3027 3028 xfs_trans_clear_context(args->cur->bc_tp); 3029 current_restore_flags_nested(&pflags, new_pflags); 3030 3031 /* 3032 * Do not access args after complete() has run here. We don't own args 3033 * and the owner may run and free args before we return here. 
3034 */ 3035 complete(args->done); 3036 3037 } 3038 3039 /* 3040 * BMBT split requests often come in with little stack to work on so we push 3041 * them off to a worker thread so there is lots of stack to use. For the other 3042 * btree types, just call directly to avoid the context switch overhead here. 3043 * 3044 * Care must be taken here - the work queue rescuer thread introduces potential 3045 * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new 3046 * AGFs to allocate blocks. A task being run by the rescuer could attempt to 3047 * lock an AGF that is already locked by a task queued to run by the rescuer, 3048 * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to 3049 * release it until the current thread it is running gains the lock. 3050 * 3051 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF 3052 * already locked to allocate from. The only place that doesn't hold an AGF 3053 * locked is unwritten extent conversion at IO completion, but that has already 3054 * been offloaded to a worker thread and hence has no stack consumption issues 3055 * we have to worry about. 3056 */ 3057 STATIC int /* error */ 3058 xfs_btree_split( 3059 struct xfs_btree_cur *cur, 3060 int level, 3061 union xfs_btree_ptr *ptrp, 3062 union xfs_btree_key *key, 3063 struct xfs_btree_cur **curp, 3064 int *stat) /* success/failure */ 3065 { 3066 struct xfs_btree_split_args args; 3067 DECLARE_COMPLETION_ONSTACK(done); 3068 3069 if (!xfs_btree_is_bmap(cur->bc_ops) || 3070 cur->bc_tp->t_highest_agno == NULLAGNUMBER) 3071 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 3072 3073 args.cur = cur; 3074 args.level = level; 3075 args.ptrp = ptrp; 3076 args.key = key; 3077 args.curp = curp; 3078 args.stat = stat; 3079 args.done = &done; 3080 args.kswapd = current_is_kswapd(); 3081 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 3082 queue_work(xfs_alloc_wq, &args.work); 3083 wait_for_completion(&done); 3084 destroy_work_on_stack(&args.work); 3085 return args.result; 3086 } 3087 #else 3088 #define xfs_btree_split __xfs_btree_split 3089 #endif /* __KERNEL__ */ 3090 3091 /* Move the records from a root leaf block to a separate block. */ 3092 STATIC void 3093 xfs_btree_promote_leaf_iroot( 3094 struct xfs_btree_cur *cur, 3095 struct xfs_btree_block *block, 3096 struct xfs_buf *cbp, 3097 union xfs_btree_ptr *cptr, 3098 struct xfs_btree_block *cblock) 3099 { 3100 union xfs_btree_rec *rp; 3101 union xfs_btree_rec *crp; 3102 union xfs_btree_key *kp; 3103 union xfs_btree_ptr *pp; 3104 struct xfs_btree_block *broot; 3105 int numrecs = xfs_btree_get_numrecs(block); 3106 3107 /* Copy the records from the leaf broot into the new child block. */ 3108 rp = xfs_btree_rec_addr(cur, 1, block); 3109 crp = xfs_btree_rec_addr(cur, 1, cblock); 3110 xfs_btree_copy_recs(cur, crp, rp, numrecs); 3111 3112 /* 3113 * Increment the tree height. 3114 * 3115 * Trickery here: The amount of memory that we need per record for the 3116 * ifork's btree root block may change when we convert the broot from a 3117 * leaf to a node block. Free the existing leaf broot so that nobody 3118 * thinks we need to migrate node pointers when we realloc the broot 3119 * buffer after bumping nlevels. 3120 */ 3121 cur->bc_ops->broot_realloc(cur, 0); 3122 cur->bc_nlevels++; 3123 cur->bc_levels[1].ptr = 1; 3124 3125 /* 3126 * Allocate a new node broot and initialize it to point to the new 3127 * child block. 
3128 */ 3129 broot = cur->bc_ops->broot_realloc(cur, 1); 3130 xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 3131 cur->bc_nlevels - 1, 1, cur->bc_ino.ip->i_ino); 3132 3133 pp = xfs_btree_ptr_addr(cur, 1, broot); 3134 kp = xfs_btree_key_addr(cur, 1, broot); 3135 xfs_btree_copy_ptrs(cur, pp, cptr, 1); 3136 xfs_btree_get_keys(cur, cblock, kp); 3137 3138 /* Attach the new block to the cursor and log it. */ 3139 xfs_btree_setbuf(cur, 0, cbp); 3140 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3141 xfs_btree_log_recs(cur, cbp, 1, numrecs); 3142 } 3143 3144 /* 3145 * Move the keys and pointers from a root block to a separate block. 3146 * 3147 * Since the keyptr size does not change, all we have to do is increase the 3148 * tree height, copy the keyptrs to the new internal node (cblock), shrink 3149 * the root, and copy the pointers there. 3150 */ 3151 STATIC int 3152 xfs_btree_promote_node_iroot( 3153 struct xfs_btree_cur *cur, 3154 struct xfs_btree_block *block, 3155 int level, 3156 struct xfs_buf *cbp, 3157 union xfs_btree_ptr *cptr, 3158 struct xfs_btree_block *cblock) 3159 { 3160 union xfs_btree_key *ckp; 3161 union xfs_btree_key *kp; 3162 union xfs_btree_ptr *cpp; 3163 union xfs_btree_ptr *pp; 3164 int i; 3165 int error; 3166 int numrecs = xfs_btree_get_numrecs(block); 3167 3168 /* 3169 * Increase tree height, adjusting the root block level to match. 3170 * We cannot change the root btree node size until we've copied the 3171 * block contents to the new child block. 3172 */ 3173 be16_add_cpu(&block->bb_level, 1); 3174 cur->bc_nlevels++; 3175 cur->bc_levels[level + 1].ptr = 1; 3176 3177 /* 3178 * Adjust the root btree record count, then copy the keys from the old 3179 * root to the new child block. 3180 */ 3181 xfs_btree_set_numrecs(block, 1); 3182 kp = xfs_btree_key_addr(cur, 1, block); 3183 ckp = xfs_btree_key_addr(cur, 1, cblock); 3184 xfs_btree_copy_keys(cur, ckp, kp, numrecs); 3185 3186 /* Check the pointers and copy them to the new child block. */ 3187 pp = xfs_btree_ptr_addr(cur, 1, block); 3188 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3189 for (i = 0; i < numrecs; i++) { 3190 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3191 if (error) 3192 return error; 3193 } 3194 xfs_btree_copy_ptrs(cur, cpp, pp, numrecs); 3195 3196 /* 3197 * Set the first keyptr to point to the new child block, then shrink 3198 * the memory buffer for the root block. 3199 */ 3200 error = xfs_btree_debug_check_ptr(cur, cptr, 0, level); 3201 if (error) 3202 return error; 3203 xfs_btree_copy_ptrs(cur, pp, cptr, 1); 3204 xfs_btree_get_keys(cur, cblock, kp); 3205 3206 cur->bc_ops->broot_realloc(cur, 1); 3207 3208 /* Attach the new block to the cursor and log it. */ 3209 xfs_btree_setbuf(cur, level, cbp); 3210 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3211 xfs_btree_log_keys(cur, cbp, 1, numrecs); 3212 xfs_btree_log_ptrs(cur, cbp, 1, numrecs); 3213 return 0; 3214 } 3215 3216 /* 3217 * Copy the old inode root contents into a real block and make the 3218 * broot point to it. 
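 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): after
 * either promotion helper above runs, the inode root is a one-entry block
 * whose only child carries everything the root used to hold. A
 * hypothetical end-state check:
 */
static bool
example_promotion_ok(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*cblock,
	int			old_numrecs)
{
	struct xfs_btree_block	*broot = xfs_btree_get_iroot(cur);

	/* One keyptr in the root, everything else in the new child. */
	return xfs_btree_get_numrecs(broot) == 1 &&
	       xfs_btree_get_numrecs(cblock) == old_numrecs;
}

/*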
3219 */ 3220 int /* error */ 3221 xfs_btree_new_iroot( 3222 struct xfs_btree_cur *cur, /* btree cursor */ 3223 int *logflags, /* logging flags for inode */ 3224 int *stat) /* return status - 0 fail */ 3225 { 3226 struct xfs_buf *cbp; /* buffer for cblock */ 3227 struct xfs_btree_block *block; /* btree block */ 3228 struct xfs_btree_block *cblock; /* child btree block */ 3229 union xfs_btree_ptr aptr; 3230 union xfs_btree_ptr nptr; /* new block addr */ 3231 int level; /* btree level */ 3232 int error; /* error return code */ 3233 3234 XFS_BTREE_STATS_INC(cur, newroot); 3235 3236 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3237 3238 level = cur->bc_nlevels - 1; 3239 3240 block = xfs_btree_get_iroot(cur); 3241 ASSERT(level > 0 || (cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)); 3242 if (level > 0) 3243 aptr = *xfs_btree_ptr_addr(cur, 1, block); 3244 else 3245 aptr.l = cpu_to_be64(XFS_INO_TO_FSB(cur->bc_mp, 3246 cur->bc_ino.ip->i_ino)); 3247 3248 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3249 error = xfs_btree_alloc_block(cur, &aptr, &nptr, stat); 3250 if (error) 3251 goto error0; 3252 if (*stat == 0) 3253 return 0; 3254 3255 XFS_BTREE_STATS_INC(cur, alloc); 3256 3257 /* Copy the root into a real block. */ 3258 error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); 3259 if (error) 3260 goto error0; 3261 3262 /* 3263 * We can't just memcpy() the root in for CRC enabled btree blocks; 3264 * in that case we also have to ensure that the blkno remains correct. 3265 */ 3266 memcpy(cblock, block, xfs_btree_block_len(cur)); 3267 if (xfs_has_crc(cur->bc_mp)) { 3268 __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); 3269 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 3270 cblock->bb_u.l.bb_blkno = bno; 3271 else 3272 cblock->bb_u.s.bb_blkno = bno; 3273 } 3274 3275 if (level > 0) { 3276 error = xfs_btree_promote_node_iroot(cur, block, level, cbp, 3277 &nptr, cblock); 3278 if (error) 3279 goto error0; 3280 } else { 3281 xfs_btree_promote_leaf_iroot(cur, block, cbp, &nptr, cblock); 3282 } 3283 3284 *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3285 *stat = 1; 3286 return 0; 3287 error0: 3288 return error; 3289 } 3290 3291 static void 3292 xfs_btree_set_root( 3293 struct xfs_btree_cur *cur, 3294 const union xfs_btree_ptr *ptr, 3295 int inc) 3296 { 3297 if (cur->bc_flags & XFS_BTREE_STAGING) { 3298 /* Update the btree root information for a per-AG fake root. */ 3299 cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); 3300 cur->bc_ag.afake->af_levels += inc; 3301 } else { 3302 cur->bc_ops->set_root(cur, ptr, inc); 3303 } 3304 } 3305 3306 /* 3307 * Allocate a new root block and fill it in.
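 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): v5
 * (CRC) btree blocks are self-describing, so a block copied to a new
 * disk address, as in xfs_btree_new_iroot() above, must have bb_blkno
 * rewritten or the verifiers will reject it. The helper is hypothetical.
 */
static void
example_stamp_blkno(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	struct xfs_buf		*bp)
{
	__be64			bno = cpu_to_be64(xfs_buf_daddr(bp));

	/* Long and short format blocks keep bb_blkno in different spots. */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		block->bb_u.l.bb_blkno = bno;
	else
		block->bb_u.s.bb_blkno = bno;
}

/*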
3308 */ 3309 STATIC int /* error */ 3310 xfs_btree_new_root( 3311 struct xfs_btree_cur *cur, /* btree cursor */ 3312 int *stat) /* success/failure */ 3313 { 3314 struct xfs_btree_block *block; /* one half of the old root block */ 3315 struct xfs_buf *bp; /* buffer containing block */ 3316 int error; /* error return value */ 3317 struct xfs_buf *lbp; /* left buffer pointer */ 3318 struct xfs_btree_block *left; /* left btree block */ 3319 struct xfs_buf *nbp; /* new (root) buffer */ 3320 struct xfs_btree_block *new; /* new (root) btree block */ 3321 int nptr; /* new value for key index, 1 or 2 */ 3322 struct xfs_buf *rbp; /* right buffer pointer */ 3323 struct xfs_btree_block *right; /* right btree block */ 3324 union xfs_btree_ptr rptr; 3325 union xfs_btree_ptr lptr; 3326 3327 XFS_BTREE_STATS_INC(cur, newroot); 3328 3329 /* initialise our start point from the cursor */ 3330 xfs_btree_init_ptr_from_cur(cur, &rptr); 3331 3332 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3333 error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); 3334 if (error) 3335 goto error0; 3336 if (*stat == 0) 3337 goto out0; 3338 XFS_BTREE_STATS_INC(cur, alloc); 3339 3340 /* Set up the new block. */ 3341 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3342 if (error) 3343 goto error0; 3344 3345 /* Set the root in the holding structure increasing the level by 1. */ 3346 xfs_btree_set_root(cur, &lptr, 1); 3347 3348 /* 3349 * At the previous root level there are now two blocks: the old root, 3350 * and the new block generated when it was split. We don't know which 3351 * one the cursor is pointing at, so we set up variables "left" and 3352 * "right" for each case. 3353 */ 3354 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3355 3356 #ifdef DEBUG 3357 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3358 if (error) 3359 goto error0; 3360 #endif 3361 3362 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3363 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3364 /* Our block is left, pick up the right block. */ 3365 lbp = bp; 3366 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3367 left = block; 3368 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3369 if (error) 3370 goto error0; 3371 bp = rbp; 3372 nptr = 1; 3373 } else { 3374 /* Our block is right, pick up the left block. */ 3375 rbp = bp; 3376 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3377 right = block; 3378 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3379 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3380 if (error) 3381 goto error0; 3382 bp = lbp; 3383 nptr = 2; 3384 } 3385 3386 /* Fill in the new block's btree header and log it. */ 3387 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3388 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3389 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3390 !xfs_btree_ptr_is_null(cur, &rptr)); 3391 3392 /* Fill in the key data in the new root. */ 3393 if (xfs_btree_get_level(left) > 0) { 3394 /* 3395 * Get the keys for the left block's keys and put them directly 3396 * in the parent block. Do the same for the right block. 3397 */ 3398 xfs_btree_get_node_keys(cur, left, 3399 xfs_btree_key_addr(cur, 1, new)); 3400 xfs_btree_get_node_keys(cur, right, 3401 xfs_btree_key_addr(cur, 2, new)); 3402 } else { 3403 /* 3404 * Get the keys for the left block's records and put them 3405 * directly in the parent block. Do the same for the right 3406 * block. 
3407 */ 3408 xfs_btree_get_leaf_keys(cur, left, 3409 xfs_btree_key_addr(cur, 1, new)); 3410 xfs_btree_get_leaf_keys(cur, right, 3411 xfs_btree_key_addr(cur, 2, new)); 3412 } 3413 xfs_btree_log_keys(cur, nbp, 1, 2); 3414 3415 /* Fill in the pointer data in the new root. */ 3416 xfs_btree_copy_ptrs(cur, 3417 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3418 xfs_btree_copy_ptrs(cur, 3419 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3420 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3421 3422 /* Fix up the cursor. */ 3423 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3424 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3425 cur->bc_nlevels++; 3426 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3427 *stat = 1; 3428 return 0; 3429 error0: 3430 return error; 3431 out0: 3432 *stat = 0; 3433 return 0; 3434 } 3435 3436 STATIC int 3437 xfs_btree_make_block_unfull( 3438 struct xfs_btree_cur *cur, /* btree cursor */ 3439 int level, /* btree level */ 3440 int numrecs,/* # of recs in block */ 3441 int *oindex,/* old tree index */ 3442 int *index, /* new tree index */ 3443 union xfs_btree_ptr *nptr, /* new btree ptr */ 3444 struct xfs_btree_cur **ncur, /* new btree cursor */ 3445 union xfs_btree_key *key, /* key of new block */ 3446 int *stat) 3447 { 3448 int error = 0; 3449 3450 if (xfs_btree_at_iroot(cur, level)) { 3451 struct xfs_inode *ip = cur->bc_ino.ip; 3452 3453 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3454 /* A root block that can be made bigger. */ 3455 cur->bc_ops->broot_realloc(cur, numrecs + 1); 3456 *stat = 1; 3457 } else { 3458 /* A root block that needs replacing */ 3459 int logflags = 0; 3460 3461 error = xfs_btree_new_iroot(cur, &logflags, stat); 3462 if (error || *stat == 0) 3463 return error; 3464 3465 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3466 } 3467 3468 return 0; 3469 } 3470 3471 /* First, try shifting an entry to the right neighbor. */ 3472 error = xfs_btree_rshift(cur, level, stat); 3473 if (error || *stat) 3474 return error; 3475 3476 /* Next, try shifting an entry to the left neighbor. */ 3477 error = xfs_btree_lshift(cur, level, stat); 3478 if (error) 3479 return error; 3480 3481 if (*stat) { 3482 *oindex = *index = cur->bc_levels[level].ptr; 3483 return 0; 3484 } 3485 3486 /* 3487 * Next, try splitting the current block in half. 3488 * 3489 * If this works we have to re-set our variables because we 3490 * could be in a different block now. 3491 */ 3492 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3493 if (error || *stat == 0) 3494 return error; 3495 3496 3497 *index = cur->bc_levels[level].ptr; 3498 return 0; 3499 } 3500 3501 /* 3502 * Insert one record/level. Return information to the caller 3503 * allowing the next level up to proceed if necessary. 
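 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): the
 * preference order in xfs_btree_make_block_unfull() above, cheapest
 * remedy first, restated as a hypothetical decision helper:
 */
static const char *
example_unfull_strategy(
	bool	root_in_inode,
	bool	right_has_room,
	bool	left_has_room)
{
	if (root_in_inode)
		return "grow or replace the inode root";
	if (right_has_room)
		return "shift one entry to the right sibling";
	if (left_has_room)
		return "shift one entry to the left sibling";
	return "split the block in half";
}

/*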
3504 */ 3505 STATIC int 3506 xfs_btree_insrec( 3507 struct xfs_btree_cur *cur, /* btree cursor */ 3508 int level, /* level to insert record at */ 3509 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3510 union xfs_btree_rec *rec, /* record to insert */ 3511 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3512 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3513 int *stat) /* success/failure */ 3514 { 3515 struct xfs_btree_block *block; /* btree block */ 3516 struct xfs_buf *bp; /* buffer for block */ 3517 union xfs_btree_ptr nptr; /* new block ptr */ 3518 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3519 union xfs_btree_key nkey; /* new block key */ 3520 union xfs_btree_key *lkey; 3521 int optr; /* old key/record index */ 3522 int ptr; /* key/record index */ 3523 int numrecs;/* number of records */ 3524 int error; /* error return value */ 3525 int i; 3526 xfs_daddr_t old_bn; 3527 3528 ncur = NULL; 3529 lkey = &nkey; 3530 3531 /* 3532 * If we have an external root pointer, and we've made it to the 3533 * root level, allocate a new root block and we're done. 3534 */ 3535 if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && 3536 level >= cur->bc_nlevels) { 3537 error = xfs_btree_new_root(cur, stat); 3538 xfs_btree_set_ptr_null(cur, ptrp); 3539 3540 return error; 3541 } 3542 3543 /* If we're off the left edge, return failure. */ 3544 ptr = cur->bc_levels[level].ptr; 3545 if (ptr == 0) { 3546 *stat = 0; 3547 return 0; 3548 } 3549 3550 optr = ptr; 3551 3552 XFS_BTREE_STATS_INC(cur, insrec); 3553 3554 /* Get pointers to the btree buffer and block. */ 3555 block = xfs_btree_get_block(cur, level, &bp); 3556 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3557 numrecs = xfs_btree_get_numrecs(block); 3558 3559 #ifdef DEBUG 3560 error = xfs_btree_check_block(cur, block, level, bp); 3561 if (error) 3562 goto error0; 3563 3564 /* Check that the new entry is being inserted in the right place. */ 3565 if (ptr <= numrecs) { 3566 if (level == 0) { 3567 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3568 xfs_btree_rec_addr(cur, ptr, block))); 3569 } else { 3570 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3571 xfs_btree_key_addr(cur, ptr, block))); 3572 } 3573 } 3574 #endif 3575 3576 /* 3577 * If the block is full, we can't insert the new entry until we 3578 * make the block un-full. 3579 */ 3580 xfs_btree_set_ptr_null(cur, &nptr); 3581 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3582 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3583 &optr, &ptr, &nptr, &ncur, lkey, stat); 3584 if (error || *stat == 0) 3585 goto error0; 3586 } 3587 3588 /* 3589 * The current block may have changed if the block was 3590 * previously full and we have just made space in it. 3591 */ 3592 block = xfs_btree_get_block(cur, level, &bp); 3593 numrecs = xfs_btree_get_numrecs(block); 3594 3595 #ifdef DEBUG 3596 error = xfs_btree_check_block(cur, block, level, bp); 3597 if (error) 3598 goto error0; 3599 #endif 3600 3601 /* 3602 * At this point we know there's room for our new entry in the block 3603 * we're pointing at. 3604 */ 3605 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3606 3607 if (level > 0) { 3608 /* It's a nonleaf. 
make a hole in the keys and ptrs */ 3609 union xfs_btree_key *kp; 3610 union xfs_btree_ptr *pp; 3611 3612 kp = xfs_btree_key_addr(cur, ptr, block); 3613 pp = xfs_btree_ptr_addr(cur, ptr, block); 3614 3615 for (i = numrecs - ptr; i >= 0; i--) { 3616 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3617 if (error) 3618 goto error0; 3619 } 3620 3621 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3622 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3623 3624 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3625 if (error) 3626 goto error0; 3627 3628 /* Now put the new data in, bump numrecs and log it. */ 3629 xfs_btree_copy_keys(cur, kp, key, 1); 3630 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3631 numrecs++; 3632 xfs_btree_set_numrecs(block, numrecs); 3633 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3634 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3635 #ifdef DEBUG 3636 if (ptr < numrecs) { 3637 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3638 xfs_btree_key_addr(cur, ptr + 1, block))); 3639 } 3640 #endif 3641 } else { 3642 /* It's a leaf. make a hole in the records */ 3643 union xfs_btree_rec *rp; 3644 3645 rp = xfs_btree_rec_addr(cur, ptr, block); 3646 3647 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3648 3649 /* Now put the new data in, bump numrecs and log it. */ 3650 xfs_btree_copy_recs(cur, rp, rec, 1); 3651 xfs_btree_set_numrecs(block, ++numrecs); 3652 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3653 #ifdef DEBUG 3654 if (ptr < numrecs) { 3655 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3656 xfs_btree_rec_addr(cur, ptr + 1, block))); 3657 } 3658 #endif 3659 } 3660 3661 /* Log the new number of records in the btree header. */ 3662 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3663 3664 /* 3665 * Update btree keys to reflect the newly added record or keyptr. 3666 * There are three cases here to be aware of. Normally, all we have to 3667 * do is walk towards the root, updating keys as necessary. 3668 * 3669 * If the caller had us target a full block for the insertion, we dealt 3670 * with that by calling the _make_block_unfull function. If the 3671 * "make unfull" function splits the block, it'll hand us back the key 3672 * and pointer of the new block. We haven't yet added the new block to 3673 * the next level up, so if we decide to add the new record to the new 3674 * block (bp->b_bn != old_bn), we have to update the caller's pointer 3675 * so that the caller adds the new block with the correct key. 3676 * 3677 * However, there is a third possibility-- if the selected block is the 3678 * root block of an inode-rooted btree and cannot be expanded further, 3679 * the "make unfull" function moves the root block contents to a new 3680 * block and updates the root block to point to the new block. In this 3681 * case, no block pointer is passed back because the block has already 3682 * been added to the btree. In this case, we need to use the regular 3683 * key update function, just like the first case. This is critical for 3684 * overlapping btrees, because the high key must be updated to reflect 3685 * the entire tree, not just the subtree accessible through the first 3686 * child of the root (which is now two levels down from the root). 
3687 */ 3688 if (!xfs_btree_ptr_is_null(cur, &nptr) && 3689 bp && xfs_buf_daddr(bp) != old_bn) { 3690 xfs_btree_get_keys(cur, block, lkey); 3691 } else if (xfs_btree_needs_key_update(cur, optr)) { 3692 error = xfs_btree_update_keys(cur, level); 3693 if (error) 3694 goto error0; 3695 } 3696 3697 /* 3698 * Return the new block number, if any. 3699 * If there is one, give back a record value and a cursor too. 3700 */ 3701 *ptrp = nptr; 3702 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3703 xfs_btree_copy_keys(cur, key, lkey, 1); 3704 *curp = ncur; 3705 } 3706 3707 *stat = 1; 3708 return 0; 3709 3710 error0: 3711 if (ncur) 3712 xfs_btree_del_cursor(ncur, error); 3713 return error; 3714 } 3715 3716 /* 3717 * Insert the record at the point referenced by cur. 3718 * 3719 * A multi-level split of the tree on insert will invalidate the original 3720 * cursor. All callers of this function should assume that the cursor is 3721 * no longer valid and revalidate it. 3722 */ 3723 int 3724 xfs_btree_insert( 3725 struct xfs_btree_cur *cur, 3726 int *stat) 3727 { 3728 int error; /* error return value */ 3729 int i; /* result value, 0 for failure */ 3730 int level; /* current level number in btree */ 3731 union xfs_btree_ptr nptr; /* new block number (split result) */ 3732 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3733 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3734 union xfs_btree_key bkey; /* key of block to insert */ 3735 union xfs_btree_key *key; 3736 union xfs_btree_rec rec; /* record to insert */ 3737 3738 level = 0; 3739 ncur = NULL; 3740 pcur = cur; 3741 key = &bkey; 3742 3743 xfs_btree_set_ptr_null(cur, &nptr); 3744 3745 /* Make a key out of the record data to be inserted, and save it. */ 3746 cur->bc_ops->init_rec_from_cur(cur, &rec); 3747 cur->bc_ops->init_key_from_rec(key, &rec); 3748 3749 /* 3750 * Loop going up the tree, starting at the leaf level. 3751 * Stop when we don't get a split block, that must mean that 3752 * the insert is finished with this level. 3753 */ 3754 do { 3755 /* 3756 * Insert nrec/nptr into this level of the tree. 3757 * Note if we fail, nptr will be null. 3758 */ 3759 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3760 &ncur, &i); 3761 if (error) { 3762 if (pcur != cur) 3763 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3764 goto error0; 3765 } 3766 3767 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3768 xfs_btree_mark_sick(cur); 3769 error = -EFSCORRUPTED; 3770 goto error0; 3771 } 3772 level++; 3773 3774 /* 3775 * See if the cursor we just used is trash. 3776 * Can't trash the caller's cursor, but otherwise we should 3777 * if ncur is a new cursor or we're about to be done. 3778 */ 3779 if (pcur != cur && 3780 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3781 /* Save the state from the cursor before we trash it */ 3782 if (cur->bc_ops->update_cursor && 3783 !(cur->bc_flags & XFS_BTREE_STAGING)) 3784 cur->bc_ops->update_cursor(pcur, cur); 3785 cur->bc_nlevels = pcur->bc_nlevels; 3786 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3787 } 3788 /* If we got a new cursor, switch to it. */ 3789 if (ncur) { 3790 pcur = ncur; 3791 ncur = NULL; 3792 } 3793 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3794 3795 *stat = i; 3796 return 0; 3797 error0: 3798 return error; 3799 } 3800 3801 /* Move the records from a child leaf block to the root block. 
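 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c):
 * callers describe the new record through the cursor so that
 * ->init_rec_from_cur() can pack it inside xfs_btree_insert(). The
 * allocation-btree record fields are real; the wrapper and its error
 * policy are hypothetical.
 */
static int
example_insert_alloc_rec(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	int			stat = 0;
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_insert(cur, &stat);
	if (error)
		return error;

	/* stat == 0 would mean the insert could not find its slot. */
	return stat == 1 ? 0 : -EFSCORRUPTED;
}

/*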
*/ 3802 STATIC void 3803 xfs_btree_demote_leaf_child( 3804 struct xfs_btree_cur *cur, 3805 struct xfs_btree_block *cblock, 3806 int numrecs) 3807 { 3808 union xfs_btree_rec *rp; 3809 union xfs_btree_rec *crp; 3810 struct xfs_btree_block *broot; 3811 3812 /* 3813 * Decrease the tree height. 3814 * 3815 * Trickery here: The amount of memory that we need per record for the 3816 * ifork's btree root block may change when we convert the broot from a 3817 * node to a leaf. Free the old node broot so that we can get a fresh 3818 * leaf broot. 3819 */ 3820 cur->bc_ops->broot_realloc(cur, 0); 3821 cur->bc_nlevels--; 3822 3823 /* 3824 * Allocate a new leaf broot and copy the records from the old child. 3825 * Detach the old child from the cursor. 3826 */ 3827 broot = cur->bc_ops->broot_realloc(cur, numrecs); 3828 xfs_btree_init_block(cur->bc_mp, broot, cur->bc_ops, 0, numrecs, 3829 cur->bc_ino.ip->i_ino); 3830 3831 rp = xfs_btree_rec_addr(cur, 1, broot); 3832 crp = xfs_btree_rec_addr(cur, 1, cblock); 3833 xfs_btree_copy_recs(cur, rp, crp, numrecs); 3834 3835 cur->bc_levels[0].bp = NULL; 3836 } 3837 3838 /* 3839 * Move the keyptrs from a child node block to the root block. 3840 * 3841 * Since the keyptr size does not change, all we have to do is grow the 3842 * root to fit the doomed child's keyptrs, copy them from the child block 3843 * (cblock) into the root, and decrease the tree height. 3844 */ 3845 STATIC int 3846 xfs_btree_demote_node_child( 3847 struct xfs_btree_cur *cur, 3848 struct xfs_btree_block *cblock, 3849 int level, 3850 int numrecs) 3851 { 3852 struct xfs_btree_block *block; 3853 union xfs_btree_key *ckp; 3854 union xfs_btree_key *kp; 3855 union xfs_btree_ptr *cpp; 3856 union xfs_btree_ptr *pp; 3857 int i; 3858 int error; 3859 3860 /* 3861 * Adjust the root btree node size and the record count to match the 3862 * doomed child so that we can copy the keyptrs ahead of changing the 3863 * tree shape. 3864 */ 3865 block = cur->bc_ops->broot_realloc(cur, numrecs); 3866 3867 xfs_btree_set_numrecs(block, numrecs); 3868 ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3869 3870 /* Copy keys from the doomed block. */ 3871 kp = xfs_btree_key_addr(cur, 1, block); 3872 ckp = xfs_btree_key_addr(cur, 1, cblock); 3873 xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3874 3875 /* Copy pointers from the doomed block. */ 3876 pp = xfs_btree_ptr_addr(cur, 1, block); 3877 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3878 for (i = 0; i < numrecs; i++) { 3879 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3880 if (error) 3881 return error; 3882 } 3883 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3884 3885 /* Decrease tree height, adjusting the root block level to match. */ 3886 cur->bc_levels[level - 1].bp = NULL; 3887 be16_add_cpu(&block->bb_level, -1); 3888 cur->bc_nlevels--; 3889 return 0; 3890 } 3891 3892 /* 3893 * Try to merge a non-leaf block back into the inode root. 3894 * 3895 * Note: the killroot name comes from the fact that we're effectively 3896 * killing the old root block. But because we can't just delete the 3897 * inode we have to copy the single block it was pointing to into the 3898 * inode.
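 */

/*
 * EDITOR'S EXAMPLE (illustrative sketch, not part of xfs_btree.c): the
 * early-exit tests of xfs_btree_kill_iroot() below, condensed into one
 * hypothetical predicate:
 */
static bool
example_can_kill_iroot(
	struct xfs_btree_cur	*cur,
	int			child_numrecs)
{
	int			level = cur->bc_nlevels - 1;

	if (level == 0)		/* already a leaf */
		return false;
	if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS))
		return false;	/* will be converted back to extents */
	if (xfs_btree_get_numrecs(xfs_btree_get_iroot(cur)) != 1)
		return false;	/* root still has several children */
	return child_numrecs <= cur->bc_ops->get_dmaxrecs(cur, level);
}

/*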
 */
STATIC int
xfs_btree_kill_iroot(
	struct xfs_btree_cur	*cur)
{
	struct xfs_inode	*ip = cur->bc_ino.ip;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*cblock;
	struct xfs_buf		*cbp;
	int			level;
	int			numrecs;
	int			error;
#ifdef DEBUG
	union xfs_btree_ptr	ptr;
#endif

	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT((cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS) ||
	       cur->bc_nlevels > 1);

	/*
	 * Don't deal with the case where the root block needs to be a leaf.
	 * We're just going to turn the thing back into extents anyway.
	 */
	level = cur->bc_nlevels - 1;
	if (level == 1 && !(cur->bc_ops->geom_flags & XFS_BTGEO_IROOT_RECORDS))
		goto out0;

	/* If we're already a leaf, jump out. */
	if (level == 0)
		goto out0;

	/*
	 * Give up if the root has multiple children.
	 */
	block = xfs_btree_get_iroot(cur);
	if (xfs_btree_get_numrecs(block) != 1)
		goto out0;

	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
	numrecs = xfs_btree_get_numrecs(cblock);

	/*
	 * Only do this if the next level down will fit in the inode root.
	 * Then the child's data is copied up into the inode, and instead of
	 * freeing the root we free the next level down.
	 */
	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, killroot);

#ifdef DEBUG
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif

	if (level > 1) {
		error = xfs_btree_demote_node_child(cur, cblock, level,
				numrecs);
		if (error)
			return error;
	} else
		xfs_btree_demote_leaf_child(cur, cblock, numrecs);

	error = xfs_btree_free_block(cur, cbp);
	if (error)
		return error;

	xfs_trans_log_inode(cur->bc_tp, ip,
			XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
out0:
	return 0;
}

/*
 * Kill the current root node, and replace it with its only child node.
 */
STATIC int
xfs_btree_kill_root(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	union xfs_btree_ptr	*newroot)
{
	int			error;

	XFS_BTREE_STATS_INC(cur, killroot);

	/*
	 * Update the root pointer, decreasing the level by 1, and then
	 * free the old root.
	 */
	xfs_btree_set_root(cur, newroot, -1);

	error = xfs_btree_free_block(cur, bp);
	if (error)
		return error;

	cur->bc_levels[level].bp = NULL;
	cur->bc_levels[level].ra = 0;
	cur->bc_nlevels--;

	return 0;
}

STATIC int
xfs_btree_dec_cursor(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)
{
	int			error;
	int			i;

	if (level > 0) {
		error = xfs_btree_decrement(cur, level, &i);
		if (error)
			return error;
	}

	*stat = 1;
	return 0;
}

/*
 * Single level of the btree record deletion routine.
 * Delete record pointed to by cur/level.
 * Remove the record from its block then rebalance the tree.
 * *stat is 0 for failure, 1 for done, 2 to go on to the next level.
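 *
 * A hedged sketch of the caller protocol (this is what xfs_btree_delete
 * below does; error handling trimmed):
 *
 *	for (level = 0, i = 2; i == 2; level++)
 *		error = xfs_btree_delrec(cur, level, &i);
 *
 * i.e. *stat == 2 after a join sends the caller up one level to remove
 * the keyptr of the block that just went away.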
 */
STATIC int				/* error */
xfs_btree_delrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level removing record from */
	int			*stat)	/* fail/done/go-on */
{
	struct xfs_btree_block	*block;	/* btree block */
	union xfs_btree_ptr	cptr;	/* current block ptr */
	struct xfs_buf		*bp;	/* buffer for block */
	int			error;	/* error return value */
	int			i;	/* loop counter */
	union xfs_btree_ptr	lptr;	/* left sibling block ptr */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	int			lrecs = 0;	/* left record count */
	int			ptr;	/* key/record index */
	union xfs_btree_ptr	rptr;	/* right sibling block ptr */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	struct xfs_btree_block	*rrblock;	/* right-right btree block */
	struct xfs_buf		*rrbp;	/* right-right buffer pointer */
	int			rrecs = 0;	/* right record count */
	struct xfs_btree_cur	*tcur;	/* temporary btree cursor */
	int			numrecs;	/* temporary numrec count */

	tcur = NULL;

	/* Get the index of the entry being deleted, check for nothing there. */
	ptr = cur->bc_levels[level].ptr;
	if (ptr == 0) {
		*stat = 0;
		return 0;
	}

	/* Get the buffer & block containing the record or key/ptr. */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we're off the end of the block. */
	if (ptr > numrecs) {
		*stat = 0;
		return 0;
	}

	XFS_BTREE_STATS_INC(cur, delrec);
	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);

	/* Excise the entries being deleted. */
	if (level > 0) {
		/* It's a non-leaf; operate on keys and ptrs. */
		union xfs_btree_key	*lkp;
		union xfs_btree_ptr	*lpp;

		lkp = xfs_btree_key_addr(cur, ptr + 1, block);
		lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);

		for (i = 0; i < numrecs - ptr; i++) {
			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
			if (error)
				goto error0;
		}

		if (ptr < numrecs) {
			xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
			xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
			xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
			xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
		}
	} else {
		/* It's a leaf; operate on records. */
		if (ptr < numrecs) {
			xfs_btree_shift_recs(cur,
				xfs_btree_rec_addr(cur, ptr + 1, block),
				-1, numrecs - ptr);
			xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
		}
	}

	/*
	 * Decrement and log the number of entries in the block.
	 */
	xfs_btree_set_numrecs(block, --numrecs);
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);

	/*
	 * We're at the root level.  First, shrink the root block in-memory.
	 * Try to get rid of the next level down.  If we can't then there's
	 * nothing left to do.  numrecs was decremented above.
4126 */ 4127 if (xfs_btree_at_iroot(cur, level)) { 4128 cur->bc_ops->broot_realloc(cur, numrecs); 4129 4130 error = xfs_btree_kill_iroot(cur); 4131 if (error) 4132 goto error0; 4133 4134 error = xfs_btree_dec_cursor(cur, level, stat); 4135 if (error) 4136 goto error0; 4137 *stat = 1; 4138 return 0; 4139 } 4140 4141 /* 4142 * If this is the root level, and there's only one entry left, and it's 4143 * NOT the leaf level, then we can get rid of this level. 4144 */ 4145 if (level == cur->bc_nlevels - 1) { 4146 if (numrecs == 1 && level > 0) { 4147 union xfs_btree_ptr *pp; 4148 /* 4149 * pp is still set to the first pointer in the block. 4150 * Make it the new root of the btree. 4151 */ 4152 pp = xfs_btree_ptr_addr(cur, 1, block); 4153 error = xfs_btree_kill_root(cur, bp, level, pp); 4154 if (error) 4155 goto error0; 4156 } else if (level > 0) { 4157 error = xfs_btree_dec_cursor(cur, level, stat); 4158 if (error) 4159 goto error0; 4160 } 4161 *stat = 1; 4162 return 0; 4163 } 4164 4165 /* 4166 * If we deleted the leftmost entry in the block, update the 4167 * key values above us in the tree. 4168 */ 4169 if (xfs_btree_needs_key_update(cur, ptr)) { 4170 error = xfs_btree_update_keys(cur, level); 4171 if (error) 4172 goto error0; 4173 } 4174 4175 /* 4176 * If the number of records remaining in the block is at least 4177 * the minimum, we're done. 4178 */ 4179 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 4180 error = xfs_btree_dec_cursor(cur, level, stat); 4181 if (error) 4182 goto error0; 4183 return 0; 4184 } 4185 4186 /* 4187 * Otherwise, we have to move some records around to keep the 4188 * tree balanced. Look at the left and right sibling blocks to 4189 * see if we can re-balance by moving only one record. 4190 */ 4191 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4192 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 4193 4194 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 4195 /* 4196 * One child of root, need to get a chance to copy its contents 4197 * into the root and delete it. Can't go up to next level, 4198 * there's nothing to delete there. 4199 */ 4200 if (xfs_btree_ptr_is_null(cur, &rptr) && 4201 xfs_btree_ptr_is_null(cur, &lptr) && 4202 level == cur->bc_nlevels - 2) { 4203 error = xfs_btree_kill_iroot(cur); 4204 if (!error) 4205 error = xfs_btree_dec_cursor(cur, level, stat); 4206 if (error) 4207 goto error0; 4208 return 0; 4209 } 4210 } 4211 4212 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 4213 !xfs_btree_ptr_is_null(cur, &lptr)); 4214 4215 /* 4216 * Duplicate the cursor so our btree manipulations here won't 4217 * disrupt the next level up. 4218 */ 4219 error = xfs_btree_dup_cursor(cur, &tcur); 4220 if (error) 4221 goto error0; 4222 4223 /* 4224 * If there's a right sibling, see if it's ok to shift an entry 4225 * out of it. 4226 */ 4227 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 4228 /* 4229 * Move the temp cursor to the last entry in the next block. 4230 * Actually any entry but the first would suffice. 
4231 */ 4232 i = xfs_btree_lastrec(tcur, level); 4233 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4234 xfs_btree_mark_sick(cur); 4235 error = -EFSCORRUPTED; 4236 goto error0; 4237 } 4238 4239 error = xfs_btree_increment(tcur, level, &i); 4240 if (error) 4241 goto error0; 4242 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4243 xfs_btree_mark_sick(cur); 4244 error = -EFSCORRUPTED; 4245 goto error0; 4246 } 4247 4248 i = xfs_btree_lastrec(tcur, level); 4249 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4250 xfs_btree_mark_sick(cur); 4251 error = -EFSCORRUPTED; 4252 goto error0; 4253 } 4254 4255 /* Grab a pointer to the block. */ 4256 right = xfs_btree_get_block(tcur, level, &rbp); 4257 #ifdef DEBUG 4258 error = xfs_btree_check_block(tcur, right, level, rbp); 4259 if (error) 4260 goto error0; 4261 #endif 4262 /* Grab the current block number, for future use. */ 4263 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 4264 4265 /* 4266 * If right block is full enough so that removing one entry 4267 * won't make it too empty, and left-shifting an entry out 4268 * of right to us works, we're done. 4269 */ 4270 if (xfs_btree_get_numrecs(right) - 1 >= 4271 cur->bc_ops->get_minrecs(tcur, level)) { 4272 error = xfs_btree_lshift(tcur, level, &i); 4273 if (error) 4274 goto error0; 4275 if (i) { 4276 ASSERT(xfs_btree_get_numrecs(block) >= 4277 cur->bc_ops->get_minrecs(tcur, level)); 4278 4279 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4280 tcur = NULL; 4281 4282 error = xfs_btree_dec_cursor(cur, level, stat); 4283 if (error) 4284 goto error0; 4285 return 0; 4286 } 4287 } 4288 4289 /* 4290 * Otherwise, grab the number of records in right for 4291 * future reference, and fix up the temp cursor to point 4292 * to our block again (last record). 4293 */ 4294 rrecs = xfs_btree_get_numrecs(right); 4295 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4296 i = xfs_btree_firstrec(tcur, level); 4297 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4298 xfs_btree_mark_sick(cur); 4299 error = -EFSCORRUPTED; 4300 goto error0; 4301 } 4302 4303 error = xfs_btree_decrement(tcur, level, &i); 4304 if (error) 4305 goto error0; 4306 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4307 xfs_btree_mark_sick(cur); 4308 error = -EFSCORRUPTED; 4309 goto error0; 4310 } 4311 } 4312 } 4313 4314 /* 4315 * If there's a left sibling, see if it's ok to shift an entry 4316 * out of it. 4317 */ 4318 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4319 /* 4320 * Move the temp cursor to the first entry in the 4321 * previous block. 4322 */ 4323 i = xfs_btree_firstrec(tcur, level); 4324 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4325 xfs_btree_mark_sick(cur); 4326 error = -EFSCORRUPTED; 4327 goto error0; 4328 } 4329 4330 error = xfs_btree_decrement(tcur, level, &i); 4331 if (error) 4332 goto error0; 4333 i = xfs_btree_firstrec(tcur, level); 4334 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4335 xfs_btree_mark_sick(cur); 4336 error = -EFSCORRUPTED; 4337 goto error0; 4338 } 4339 4340 /* Grab a pointer to the block. */ 4341 left = xfs_btree_get_block(tcur, level, &lbp); 4342 #ifdef DEBUG 4343 error = xfs_btree_check_block(cur, left, level, lbp); 4344 if (error) 4345 goto error0; 4346 #endif 4347 /* Grab the current block number, for future use. */ 4348 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4349 4350 /* 4351 * If left block is full enough so that removing one entry 4352 * won't make it too empty, and right-shifting an entry out 4353 * of left to us works, we're done. 
4354 */ 4355 if (xfs_btree_get_numrecs(left) - 1 >= 4356 cur->bc_ops->get_minrecs(tcur, level)) { 4357 error = xfs_btree_rshift(tcur, level, &i); 4358 if (error) 4359 goto error0; 4360 if (i) { 4361 ASSERT(xfs_btree_get_numrecs(block) >= 4362 cur->bc_ops->get_minrecs(tcur, level)); 4363 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4364 tcur = NULL; 4365 if (level == 0) 4366 cur->bc_levels[0].ptr++; 4367 4368 *stat = 1; 4369 return 0; 4370 } 4371 } 4372 4373 /* 4374 * Otherwise, grab the number of records in right for 4375 * future reference. 4376 */ 4377 lrecs = xfs_btree_get_numrecs(left); 4378 } 4379 4380 /* Delete the temp cursor, we're done with it. */ 4381 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4382 tcur = NULL; 4383 4384 /* If here, we need to do a join to keep the tree balanced. */ 4385 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4386 4387 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4388 lrecs + xfs_btree_get_numrecs(block) <= 4389 cur->bc_ops->get_maxrecs(cur, level)) { 4390 /* 4391 * Set "right" to be the starting block, 4392 * "left" to be the left neighbor. 4393 */ 4394 rptr = cptr; 4395 right = block; 4396 rbp = bp; 4397 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4398 if (error) 4399 goto error0; 4400 4401 /* 4402 * If that won't work, see if we can join with the right neighbor block. 4403 */ 4404 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4405 rrecs + xfs_btree_get_numrecs(block) <= 4406 cur->bc_ops->get_maxrecs(cur, level)) { 4407 /* 4408 * Set "left" to be the starting block, 4409 * "right" to be the right neighbor. 4410 */ 4411 lptr = cptr; 4412 left = block; 4413 lbp = bp; 4414 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4415 if (error) 4416 goto error0; 4417 4418 /* 4419 * Otherwise, we can't fix the imbalance. 4420 * Just return. This is probably a logic error, but it's not fatal. 4421 */ 4422 } else { 4423 error = xfs_btree_dec_cursor(cur, level, stat); 4424 if (error) 4425 goto error0; 4426 return 0; 4427 } 4428 4429 rrecs = xfs_btree_get_numrecs(right); 4430 lrecs = xfs_btree_get_numrecs(left); 4431 4432 /* 4433 * We're now going to join "left" and "right" by moving all the stuff 4434 * in "right" to "left" and deleting "right". 4435 */ 4436 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4437 if (level > 0) { 4438 /* It's a non-leaf. Move keys and pointers. */ 4439 union xfs_btree_key *lkp; /* left btree key */ 4440 union xfs_btree_ptr *lpp; /* left address pointer */ 4441 union xfs_btree_key *rkp; /* right btree key */ 4442 union xfs_btree_ptr *rpp; /* right address pointer */ 4443 4444 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4445 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4446 rkp = xfs_btree_key_addr(cur, 1, right); 4447 rpp = xfs_btree_ptr_addr(cur, 1, right); 4448 4449 for (i = 1; i < rrecs; i++) { 4450 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 4451 if (error) 4452 goto error0; 4453 } 4454 4455 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4456 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4457 4458 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4459 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4460 } else { 4461 /* It's a leaf. Move records. 
*/ 4462 union xfs_btree_rec *lrp; /* left record pointer */ 4463 union xfs_btree_rec *rrp; /* right record pointer */ 4464 4465 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4466 rrp = xfs_btree_rec_addr(cur, 1, right); 4467 4468 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4469 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4470 } 4471 4472 XFS_BTREE_STATS_INC(cur, join); 4473 4474 /* 4475 * Fix up the number of records and right block pointer in the 4476 * surviving block, and log it. 4477 */ 4478 xfs_btree_set_numrecs(left, lrecs + rrecs); 4479 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); 4480 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4481 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4482 4483 /* If there is a right sibling, point it to the remaining block. */ 4484 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4485 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4486 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4487 if (error) 4488 goto error0; 4489 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4490 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4491 } 4492 4493 /* Free the deleted block. */ 4494 error = xfs_btree_free_block(cur, rbp); 4495 if (error) 4496 goto error0; 4497 4498 /* 4499 * If we joined with the left neighbor, set the buffer in the 4500 * cursor to the left block, and fix up the index. 4501 */ 4502 if (bp != lbp) { 4503 cur->bc_levels[level].bp = lbp; 4504 cur->bc_levels[level].ptr += lrecs; 4505 cur->bc_levels[level].ra = 0; 4506 } 4507 /* 4508 * If we joined with the right neighbor and there's a level above 4509 * us, increment the cursor at that level. 4510 */ 4511 else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || 4512 level + 1 < cur->bc_nlevels) { 4513 error = xfs_btree_increment(cur, level + 1, &i); 4514 if (error) 4515 goto error0; 4516 } 4517 4518 /* 4519 * Readjust the ptr at this level if it's not a leaf, since it's 4520 * still pointing at the deletion point, which makes the cursor 4521 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4522 * We can't use decrement because it would change the next level up. 4523 */ 4524 if (level > 0) 4525 cur->bc_levels[level].ptr--; 4526 4527 /* 4528 * We combined blocks, so we have to update the parent keys if the 4529 * btree supports overlapped intervals. However, 4530 * bc_levels[level + 1].ptr points to the old block so that the caller 4531 * knows which record to delete. Therefore, the caller must be savvy 4532 * enough to call updkeys for us if we return stat == 2. The other 4533 * exit points from this function don't require deletions further up 4534 * the tree, so they can call updkeys directly. 4535 */ 4536 4537 /* Return value means the next level up has something to do. */ 4538 *stat = 2; 4539 return 0; 4540 4541 error0: 4542 if (tcur) 4543 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4544 return error; 4545 } 4546 4547 /* 4548 * Delete the record pointed to by cur. 4549 * The cursor refers to the place where the record was (could be inserted) 4550 * when the operation returns. 4551 */ 4552 int /* error */ 4553 xfs_btree_delete( 4554 struct xfs_btree_cur *cur, 4555 int *stat) /* success/failure */ 4556 { 4557 int error; /* error return value */ 4558 int level; 4559 int i; 4560 bool joined = false; 4561 4562 /* 4563 * Go up the tree, starting at leaf level. 4564 * 4565 * If 2 is returned then a join was done; go to the next level. 4566 * Otherwise we are done. 
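	 *
	 * For example, deleting a record that leaves a leaf too empty forces
	 * a join; delrec then returns 2 at level 0, so we run it again at
	 * level 1 to remove the keyptr of the freed block, and so on up.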
	 */
	for (level = 0, i = 2; i == 2; level++) {
		error = xfs_btree_delrec(cur, level, &i);
		if (error)
			goto error0;
		if (i == 2)
			joined = true;
	}

	/*
	 * If we combined blocks as part of deleting the record, delrec won't
	 * have updated the parent high keys so we have to do that here.
	 */
	if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
		error = xfs_btree_updkeys_force(cur, 0);
		if (error)
			goto error0;
	}

	if (i == 0) {
		for (level = 1; level < cur->bc_nlevels; level++) {
			if (cur->bc_levels[level].ptr == 0) {
				error = xfs_btree_decrement(cur, level, &i);
				if (error)
					goto error0;
				break;
			}
		}
	}

	*stat = i;
	return 0;
error0:
	return error;
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_btree_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	union xfs_btree_rec	**recp,	/* output: btree record */
	int			*stat)	/* output: success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer pointer */
	int			ptr;	/* record number */
#ifdef DEBUG
	int			error;	/* error return value */
#endif

	ptr = cur->bc_levels[0].ptr;
	block = xfs_btree_get_block(cur, 0, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, 0, bp);
	if (error)
		return error;
#endif

	/*
	 * Off the right end or left end, return failure.
	 */
	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
		*stat = 0;
		return 0;
	}

	/*
	 * Point to the record and extract its data.
	 */
	*recp = xfs_btree_rec_addr(cur, ptr, block);
	*stat = 1;
	return 0;
}

/* Visit a block in a btree. */
STATIC int
xfs_btree_visit_block(
	struct xfs_btree_cur		*cur,
	int				level,
	xfs_btree_visit_blocks_fn	fn,
	void				*data)
{
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	union xfs_btree_ptr		rptr, bufptr;
	int				error;

	/* do right sibling readahead */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
	block = xfs_btree_get_block(cur, level, &bp);

	/* process the block */
	error = fn(cur, level, data);
	if (error)
		return error;

	/* now read the right sibling block for the next iteration */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &rptr))
		return -ENOENT;

	/*
	 * We only visit blocks once in this walk, so we have to avoid the
	 * internal xfs_btree_lookup_get_block() optimisation where it will
	 * return the same block without checking whether the right sibling
	 * points back to us, which would create a cyclic reference in the
	 * btree.
	 */
	xfs_btree_buf_to_ptr(cur, bp, &bufptr);
	if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}

/* Visit every block in a btree.
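 *
 * A hedged usage sketch; my_fn is illustrative, not a kernel function:
 *
 *	static int my_fn(struct xfs_btree_cur *cur, int level, void *data)
 *	{
 *		return 0;
 *	}
 *
 *	error = xfs_btree_visit_blocks(cur, my_fn, XFS_BTREE_VISIT_ALL, NULL);
 *
 * xfs_btree_count_blocks() below is a minimal real caller.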
 */
int
xfs_btree_visit_blocks(
	struct xfs_btree_cur		*cur,
	xfs_btree_visit_blocks_fn	fn,
	unsigned int			flags,
	void				*data)
{
	union xfs_btree_ptr		lptr;
	int				level;
	struct xfs_btree_block		*block = NULL;
	int				error = 0;

	xfs_btree_init_ptr_from_cur(cur, &lptr);

	/* for each level */
	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
		/* grab the left hand block */
		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
		if (error)
			return error;

		/* readahead the left most block for the next level down */
		if (level > 0) {
			union xfs_btree_ptr	*ptr;

			ptr = xfs_btree_ptr_addr(cur, 1, block);
			xfs_btree_readahead_ptr(cur, ptr, 1);

			/* save for the next iteration of the loop */
			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);

			if (!(flags & XFS_BTREE_VISIT_LEAVES))
				continue;
		} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
			continue;
		}

		/* for each buffer in the level */
		do {
			error = xfs_btree_visit_block(cur, level, fn, data);
		} while (!error);

		if (error != -ENOENT)
			return error;
	}

	return 0;
}

/*
 * Change the owner of a btree.
 *
 * The mechanism we use here is ordered buffer logging.  Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer in
 * a full size btree as that may be more space than we can fit in the log....
 *
 * We do the btree walk as efficiently as possible - we have sibling pointers
 * so we can just walk all the blocks on each level from left to right in a
 * single pass, and then move to the next level and do the same.  We can also
 * do readahead on the sibling pointers to get IO moving more quickly, though
 * for slow disks this is unlikely to make much difference to performance as
 * the amount of CPU work we have to do before moving to the next block is
 * relatively small.
 *
 * For each btree block that we load, modify the owner appropriately, set the
 * buffer as an ordered buffer and log it appropriately.  We need to ensure
 * that we mark the region we change dirty so that if the buffer is relogged
 * in a subsequent transaction the changes we make here as an ordered buffer
 * are correctly relogged in that transaction.  If we are in recovery context,
 * then just queue the modified buffer as a delayed write buffer so the
 * transaction recovery completion writes the changes to disk.
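 *
 * A hedged example of a real caller: xfs_bmbt_change_owner() drives this
 * walk to reassign bmbt block ownership when extents move between inodes.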
4760 */ 4761 struct xfs_btree_block_change_owner_info { 4762 uint64_t new_owner; 4763 struct list_head *buffer_list; 4764 }; 4765 4766 static int 4767 xfs_btree_block_change_owner( 4768 struct xfs_btree_cur *cur, 4769 int level, 4770 void *data) 4771 { 4772 struct xfs_btree_block_change_owner_info *bbcoi = data; 4773 struct xfs_btree_block *block; 4774 struct xfs_buf *bp; 4775 4776 /* modify the owner */ 4777 block = xfs_btree_get_block(cur, level, &bp); 4778 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 4779 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) 4780 return 0; 4781 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4782 } else { 4783 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) 4784 return 0; 4785 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4786 } 4787 4788 /* 4789 * If the block is a root block hosted in an inode, we might not have a 4790 * buffer pointer here and we shouldn't attempt to log the change as the 4791 * information is already held in the inode and discarded when the root 4792 * block is formatted into the on-disk inode fork. We still change it, 4793 * though, so everything is consistent in memory. 4794 */ 4795 if (!bp) { 4796 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 4797 ASSERT(level == cur->bc_nlevels - 1); 4798 return 0; 4799 } 4800 4801 if (cur->bc_tp) { 4802 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { 4803 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4804 return -EAGAIN; 4805 } 4806 } else { 4807 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4808 } 4809 4810 return 0; 4811 } 4812 4813 int 4814 xfs_btree_change_owner( 4815 struct xfs_btree_cur *cur, 4816 uint64_t new_owner, 4817 struct list_head *buffer_list) 4818 { 4819 struct xfs_btree_block_change_owner_info bbcoi; 4820 4821 bbcoi.new_owner = new_owner; 4822 bbcoi.buffer_list = buffer_list; 4823 4824 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4825 XFS_BTREE_VISIT_ALL, &bbcoi); 4826 } 4827 4828 /* Verify the v5 fields of a long-format btree block. */ 4829 xfs_failaddr_t 4830 xfs_btree_fsblock_v5hdr_verify( 4831 struct xfs_buf *bp, 4832 uint64_t owner) 4833 { 4834 struct xfs_mount *mp = bp->b_mount; 4835 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4836 4837 if (!xfs_has_crc(mp)) 4838 return __this_address; 4839 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4840 return __this_address; 4841 if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4842 return __this_address; 4843 if (owner != XFS_RMAP_OWN_UNKNOWN && 4844 be64_to_cpu(block->bb_u.l.bb_owner) != owner) 4845 return __this_address; 4846 return NULL; 4847 } 4848 4849 /* Verify a long-format btree block. */ 4850 xfs_failaddr_t 4851 xfs_btree_fsblock_verify( 4852 struct xfs_buf *bp, 4853 unsigned int max_recs) 4854 { 4855 struct xfs_mount *mp = bp->b_mount; 4856 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4857 xfs_fsblock_t fsb; 4858 xfs_failaddr_t fa; 4859 4860 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4861 4862 /* numrecs verification */ 4863 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4864 return __this_address; 4865 4866 /* sibling pointer verification */ 4867 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4868 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4869 block->bb_u.l.bb_leftsib); 4870 if (!fa) 4871 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4872 block->bb_u.l.bb_rightsib); 4873 return fa; 4874 } 4875 4876 /* Verify an in-memory btree block. 
*/ 4877 xfs_failaddr_t 4878 xfs_btree_memblock_verify( 4879 struct xfs_buf *bp, 4880 unsigned int max_recs) 4881 { 4882 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4883 struct xfs_buftarg *btp = bp->b_target; 4884 xfs_failaddr_t fa; 4885 xfbno_t bno; 4886 4887 ASSERT(xfs_buftarg_is_mem(bp->b_target)); 4888 4889 /* numrecs verification */ 4890 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4891 return __this_address; 4892 4893 /* sibling pointer verification */ 4894 bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); 4895 fa = xfs_btree_check_memblock_siblings(btp, bno, 4896 block->bb_u.l.bb_leftsib); 4897 if (fa) 4898 return fa; 4899 fa = xfs_btree_check_memblock_siblings(btp, bno, 4900 block->bb_u.l.bb_rightsib); 4901 if (fa) 4902 return fa; 4903 4904 return NULL; 4905 } 4906 /** 4907 * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format 4908 * btree block 4909 * 4910 * @bp: buffer containing the btree block 4911 */ 4912 xfs_failaddr_t 4913 xfs_btree_agblock_v5hdr_verify( 4914 struct xfs_buf *bp) 4915 { 4916 struct xfs_mount *mp = bp->b_mount; 4917 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4918 struct xfs_perag *pag = bp->b_pag; 4919 4920 if (!xfs_has_crc(mp)) 4921 return __this_address; 4922 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4923 return __this_address; 4924 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4925 return __this_address; 4926 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) 4927 return __this_address; 4928 return NULL; 4929 } 4930 4931 /** 4932 * xfs_btree_agblock_verify() -- verify a short-format btree block 4933 * 4934 * @bp: buffer containing the btree block 4935 * @max_recs: maximum records allowed in this btree node 4936 */ 4937 xfs_failaddr_t 4938 xfs_btree_agblock_verify( 4939 struct xfs_buf *bp, 4940 unsigned int max_recs) 4941 { 4942 struct xfs_mount *mp = bp->b_mount; 4943 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4944 xfs_agblock_t agbno; 4945 xfs_failaddr_t fa; 4946 4947 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4948 4949 /* numrecs verification */ 4950 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4951 return __this_address; 4952 4953 /* sibling pointer verification */ 4954 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4955 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4956 block->bb_u.s.bb_leftsib); 4957 if (!fa) 4958 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4959 block->bb_u.s.bb_rightsib); 4960 return fa; 4961 } 4962 4963 /* 4964 * For the given limits on leaf and keyptr records per block, calculate the 4965 * height of the tree needed to index the number of leaf records. 4966 */ 4967 unsigned int 4968 xfs_btree_compute_maxlevels( 4969 const unsigned int *limits, 4970 unsigned long long records) 4971 { 4972 unsigned long long level_blocks = howmany_64(records, limits[0]); 4973 unsigned int height = 1; 4974 4975 while (level_blocks > 1) { 4976 level_blocks = howmany_64(level_blocks, limits[1]); 4977 height++; 4978 } 4979 4980 return height; 4981 } 4982 4983 /* 4984 * For the given limits on leaf and keyptr records per block, calculate the 4985 * number of blocks needed to index the given number of leaf records. 
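 *
 * Worked example with made-up limits: for limits = {4, 4} and 100 records,
 * the levels need howmany(100, 4) = 25, then 7, 2, and 1 blocks, so this
 * returns 25 + 7 + 2 + 1 = 35, while xfs_btree_compute_maxlevels() above
 * would return a height of 4.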
4986 */ 4987 unsigned long long 4988 xfs_btree_calc_size( 4989 const unsigned int *limits, 4990 unsigned long long records) 4991 { 4992 unsigned long long level_blocks = howmany_64(records, limits[0]); 4993 unsigned long long blocks = level_blocks; 4994 4995 while (level_blocks > 1) { 4996 level_blocks = howmany_64(level_blocks, limits[1]); 4997 blocks += level_blocks; 4998 } 4999 5000 return blocks; 5001 } 5002 5003 /* 5004 * Given a number of available blocks for the btree to consume with records and 5005 * pointers, calculate the height of the tree needed to index all the records 5006 * that space can hold based on the number of pointers each interior node 5007 * holds. 5008 * 5009 * We start by assuming a single level tree consumes a single block, then track 5010 * the number of blocks each node level consumes until we no longer have space 5011 * to store the next node level. At this point, we are indexing all the leaf 5012 * blocks in the space, and there's no more free space to split the tree any 5013 * further. That's our maximum btree height. 5014 */ 5015 unsigned int 5016 xfs_btree_space_to_height( 5017 const unsigned int *limits, 5018 unsigned long long leaf_blocks) 5019 { 5020 /* 5021 * The root btree block can have fewer than minrecs pointers in it 5022 * because the tree might not be big enough to require that amount of 5023 * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. 5024 */ 5025 unsigned long long node_blocks = 2; 5026 unsigned long long blocks_left = leaf_blocks - 1; 5027 unsigned int height = 1; 5028 5029 if (leaf_blocks < 1) 5030 return 0; 5031 5032 while (node_blocks < blocks_left) { 5033 blocks_left -= node_blocks; 5034 node_blocks *= limits[1]; 5035 height++; 5036 } 5037 5038 return height; 5039 } 5040 5041 /* 5042 * Query a regular btree for all records overlapping a given interval. 5043 * Start with a LE lookup of the key of low_rec and return all records 5044 * until we find a record with a key greater than the key of high_rec. 5045 */ 5046 STATIC int 5047 xfs_btree_simple_query_range( 5048 struct xfs_btree_cur *cur, 5049 const union xfs_btree_key *low_key, 5050 const union xfs_btree_key *high_key, 5051 xfs_btree_query_range_fn fn, 5052 void *priv) 5053 { 5054 union xfs_btree_rec *recp; 5055 union xfs_btree_key rec_key; 5056 int stat; 5057 bool firstrec = true; 5058 int error; 5059 5060 ASSERT(cur->bc_ops->init_high_key_from_rec); 5061 ASSERT(cur->bc_ops->diff_two_keys); 5062 5063 /* 5064 * Find the leftmost record. The btree cursor must be set 5065 * to the low record used to generate low_key. 5066 */ 5067 stat = 0; 5068 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 5069 if (error) 5070 goto out; 5071 5072 /* Nothing? See if there's anything to the right. */ 5073 if (!stat) { 5074 error = xfs_btree_increment(cur, 0, &stat); 5075 if (error) 5076 goto out; 5077 } 5078 5079 while (stat) { 5080 /* Find the record. */ 5081 error = xfs_btree_get_rec(cur, &recp, &stat); 5082 if (error || !stat) 5083 break; 5084 5085 /* Skip if low_key > high_key(rec). */ 5086 if (firstrec) { 5087 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 5088 firstrec = false; 5089 if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) 5090 goto advloop; 5091 } 5092 5093 /* Stop if low_key(rec) > high_key. */ 5094 cur->bc_ops->init_key_from_rec(&rec_key, recp); 5095 if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) 5096 break; 5097 5098 /* Callback */ 5099 error = fn(cur, recp, priv); 5100 if (error) 5101 break; 5102 5103 advloop: 5104 /* Move on to the next record. 
		 */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			break;
	}

out:
	return error;
}

/*
 * Query an overlapped interval btree for all records overlapping a given
 * interval.  This function roughly follows the algorithm given in
 * "Interval Trees" of _Introduction to Algorithms_, which is section
 * 14.3 in the 2nd and 3rd editions.
 *
 * First, generate keys for the low and high records passed in.
 *
 * For any leaf node, generate the high and low keys for the record.
 * If the record keys overlap with the query low/high keys, pass the
 * record to the function iterator.
 *
 * For any internal node, compare the low and high keys of each
 * pointer against the query low/high keys.  If there's an overlap,
 * follow the pointer.
 *
 * As an optimization, we stop scanning a block when we find a low key
 * that is greater than the query's high key.
 */
STATIC int
xfs_btree_overlapped_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_ptr		ptr;
	union xfs_btree_ptr		*pp;
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_hkey;
	union xfs_btree_key		*lkp;
	union xfs_btree_key		*hkp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	int				level;
	struct xfs_buf			*bp;
	int				i;
	int				error;

	/* Load the root of the btree. */
	level = cur->bc_nlevels - 1;
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
	if (error)
		return error;
	xfs_btree_get_block(cur, level, &bp);
	trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto out;
#endif
	cur->bc_levels[level].ptr = 1;

	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
		    be16_to_cpu(block->bb_numrecs)) {
pop_up:
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		if (level == 0) {
			/* Handle a leaf node. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);

			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
			cur->bc_ops->init_key_from_rec(&rec_key, recp);

			/*
			 * If (query's high key < record's low key), then there
			 * are no more interesting records in this block.  Pop
			 * back up a level to find more record blocks.
			 *
			 * If (record's high key >= query's low key) and
			 *    (query's high key >= record's low key), then
			 *    this record overlaps the query range; callback.
			 */
			if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
				goto pop_up;
			if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
				error = fn(cur, recp, priv);
				if (error)
					break;
			}
			cur->bc_levels[level].ptr++;
			continue;
		}

		/* Handle an internal node.
		 */
		lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
		hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
				block);
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);

		/*
		 * If (query's high key < pointer's low key), then there are no
		 * more interesting keys in this block.  Pop up one level to
		 * continue looking for records.
		 *
		 * If (pointer's high key >= query's low key) and
		 *    (query's high key >= pointer's low key), then
		 *    this record overlaps the query range; follow pointer.
		 */
		if (xfs_btree_keycmp_lt(cur, high_key, lkp))
			goto pop_up;
		if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
			level--;
			error = xfs_btree_lookup_get_block(cur, level, pp,
					&block);
			if (error)
				goto out;
			xfs_btree_get_block(cur, level, &bp);
			trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
			error = xfs_btree_check_block(cur, block, level, bp);
			if (error)
				goto out;
#endif
			cur->bc_levels[level].ptr = 1;
			continue;
		}
		cur->bc_levels[level].ptr++;
	}

out:
	/*
	 * If we don't end this function with the cursor pointing at a record
	 * block, a subsequent non-error cursor deletion will not release
	 * node-level buffers, causing a buffer leak.  This is quite possible
	 * with a zero-results range query, so release the buffers if we
	 * failed to return any results.
	 */
	if (cur->bc_levels[0].bp == NULL) {
		for (i = 0; i < cur->bc_nlevels; i++) {
			if (cur->bc_levels[i].bp) {
				xfs_trans_brelse(cur->bc_tp,
						cur->bc_levels[i].bp);
				cur->bc_levels[i].bp = NULL;
				cur->bc_levels[i].ptr = 0;
				cur->bc_levels[i].ra = 0;
			}
		}
	}

	return error;
}

static inline void
xfs_btree_key_from_irec(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*key,
	const union xfs_btree_irec	*irec)
{
	union xfs_btree_rec		rec;

	cur->bc_rec = *irec;
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);
}

/*
 * Query a btree for all records overlapping a given interval of keys.  The
 * supplied function will be called with each record found; return one of the
 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
 * code.  This function returns -ECANCELED, zero, or a negative error code.
 */
int
xfs_btree_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low_rec,
	const union xfs_btree_irec	*high_rec,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	/* Find the keys of both ends of the interval. */
	xfs_btree_key_from_irec(cur, &high_key, high_rec);
	xfs_btree_key_from_irec(cur, &low_key, low_rec);

	/* Enforce low key <= high key. */
	if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
		return -EINVAL;

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return xfs_btree_simple_query_range(cur, &low_key,
				&high_key, fn, priv);
	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
			fn, priv);
}

/* Query a btree for all records.
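 *
 * A hedged usage sketch; my_rec_fn is illustrative, not a kernel function:
 *
 *	static int my_rec_fn(struct xfs_btree_cur *cur,
 *			const union xfs_btree_rec *rec, void *priv)
 *	{
 *		return 0;
 *	}
 *
 *	error = xfs_btree_query_all(cur, my_rec_fn, NULL);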
 */
int
xfs_btree_query_all(
	struct xfs_btree_cur		*cur,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	memset(&low_key, 0, sizeof(low_key));
	memset(&high_key, 0xFF, sizeof(high_key));

	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}

static int
xfs_btree_count_blocks_helper(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	xfs_filblks_t		*blocks = data;

	(*blocks)++;

	return 0;
}

/* Count the blocks in a btree and return the result in *blocks. */
int
xfs_btree_count_blocks(
	struct xfs_btree_cur	*cur,
	xfs_filblks_t		*blocks)
{
	*blocks = 0;
	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
			XFS_BTREE_VISIT_ALL, blocks);
}

/* Compare two btree pointers. */
int64_t
xfs_btree_diff_two_ptrs(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*a,
	const union xfs_btree_ptr	*b)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}

struct xfs_btree_has_records {
	/* Keys for the start and end of the range we want to know about. */
	union xfs_btree_key		start_key;
	union xfs_btree_key		end_key;

	/* Mask for key comparisons, if desired. */
	const union xfs_btree_key	*key_mask;

	/* Highest record key we've seen so far. */
	union xfs_btree_key		high_key;

	enum xbtree_recpacking		outcome;
};

STATIC int
xfs_btree_has_records_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_high_key;
	struct xfs_btree_has_records	*info = priv;
	enum xbtree_key_contig		key_contig;

	cur->bc_ops->init_key_from_rec(&rec_key, rec);

	if (info->outcome == XBTREE_RECPACKING_EMPTY) {
		info->outcome = XBTREE_RECPACKING_SPARSE;

		/*
		 * If the first record we find does not overlap the start key,
		 * then there is a hole at the start of the search range.
		 * Classify this as sparse and stop immediately.
		 */
		if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
				info->key_mask))
			return -ECANCELED;
	} else {
		/*
		 * If a subsequent record does not overlap with any record
		 * we've seen so far, there is a hole in the middle of the
		 * search range.  Classify this as sparse and stop.
		 * If the keys overlap and this btree does not allow overlap,
		 * signal corruption.
		 */
		key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
					&rec_key, info->key_mask);
		if (key_contig == XBTREE_KEY_OVERLAP &&
		    !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
			return -EFSCORRUPTED;
		if (key_contig == XBTREE_KEY_GAP)
			return -ECANCELED;
	}

	/*
	 * If high_key(rec) is larger than any other high key we've seen,
	 * remember it for later.
5424 */ 5425 cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); 5426 if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, 5427 info->key_mask)) 5428 info->high_key = rec_high_key; /* struct copy */ 5429 5430 return 0; 5431 } 5432 5433 /* 5434 * Scan part of the keyspace of a btree and tell us if that keyspace does not 5435 * map to any records; is fully mapped to records; or is partially mapped to 5436 * records. This is the btree record equivalent to determining if a file is 5437 * sparse. 5438 * 5439 * For most btree types, the record scan should use all available btree key 5440 * fields to compare the keys encountered. These callers should pass NULL for 5441 * @mask. However, some callers (e.g. scanning physical space in the rmapbt) 5442 * want to ignore some part of the btree record keyspace when performing the 5443 * comparison. These callers should pass in a union xfs_btree_key object with 5444 * the fields that *should* be a part of the comparison set to any nonzero 5445 * value, and the rest zeroed. 5446 */ 5447 int 5448 xfs_btree_has_records( 5449 struct xfs_btree_cur *cur, 5450 const union xfs_btree_irec *low, 5451 const union xfs_btree_irec *high, 5452 const union xfs_btree_key *mask, 5453 enum xbtree_recpacking *outcome) 5454 { 5455 struct xfs_btree_has_records info = { 5456 .outcome = XBTREE_RECPACKING_EMPTY, 5457 .key_mask = mask, 5458 }; 5459 int error; 5460 5461 /* Not all btrees support this operation. */ 5462 if (!cur->bc_ops->keys_contiguous) { 5463 ASSERT(0); 5464 return -EOPNOTSUPP; 5465 } 5466 5467 xfs_btree_key_from_irec(cur, &info.start_key, low); 5468 xfs_btree_key_from_irec(cur, &info.end_key, high); 5469 5470 error = xfs_btree_query_range(cur, low, high, 5471 xfs_btree_has_records_helper, &info); 5472 if (error == -ECANCELED) 5473 goto out; 5474 if (error) 5475 return error; 5476 5477 if (info.outcome == XBTREE_RECPACKING_EMPTY) 5478 goto out; 5479 5480 /* 5481 * If the largest high_key(rec) we saw during the walk is greater than 5482 * the end of the search range, classify this as full. Otherwise, 5483 * there is a hole at the end of the search range. 5484 */ 5485 if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, 5486 mask)) 5487 info.outcome = XBTREE_RECPACKING_FULL; 5488 5489 out: 5490 *outcome = info.outcome; 5491 return 0; 5492 } 5493 5494 /* Are there more records in this btree? */ 5495 bool 5496 xfs_btree_has_more_records( 5497 struct xfs_btree_cur *cur) 5498 { 5499 struct xfs_btree_block *block; 5500 struct xfs_buf *bp; 5501 5502 block = xfs_btree_get_block(cur, 0, &bp); 5503 5504 /* There are still records in this block. */ 5505 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5506 return true; 5507 5508 /* There are more record blocks. */ 5509 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5510 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5511 else 5512 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5513 } 5514 5515 /* Set up all the btree cursor caches. 
*/ 5516 int __init 5517 xfs_btree_init_cur_caches(void) 5518 { 5519 int error; 5520 5521 error = xfs_allocbt_init_cur_cache(); 5522 if (error) 5523 return error; 5524 error = xfs_inobt_init_cur_cache(); 5525 if (error) 5526 goto err; 5527 error = xfs_bmbt_init_cur_cache(); 5528 if (error) 5529 goto err; 5530 error = xfs_rmapbt_init_cur_cache(); 5531 if (error) 5532 goto err; 5533 error = xfs_refcountbt_init_cur_cache(); 5534 if (error) 5535 goto err; 5536 error = xfs_rtrmapbt_init_cur_cache(); 5537 if (error) 5538 goto err; 5539 error = xfs_rtrefcountbt_init_cur_cache(); 5540 if (error) 5541 goto err; 5542 5543 return 0; 5544 err: 5545 xfs_btree_destroy_cur_caches(); 5546 return error; 5547 } 5548 5549 /* Destroy all the btree cursor caches, if they've been allocated. */ 5550 void 5551 xfs_btree_destroy_cur_caches(void) 5552 { 5553 xfs_allocbt_destroy_cur_cache(); 5554 xfs_inobt_destroy_cur_cache(); 5555 xfs_bmbt_destroy_cur_cache(); 5556 xfs_rmapbt_destroy_cur_cache(); 5557 xfs_refcountbt_destroy_cur_cache(); 5558 xfs_rtrmapbt_destroy_cur_cache(); 5559 xfs_rtrefcountbt_destroy_cur_cache(); 5560 } 5561 5562 /* Move the btree cursor before the first record. */ 5563 int 5564 xfs_btree_goto_left_edge( 5565 struct xfs_btree_cur *cur) 5566 { 5567 int stat = 0; 5568 int error; 5569 5570 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5571 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 5572 if (error) 5573 return error; 5574 if (!stat) 5575 return 0; 5576 5577 error = xfs_btree_decrement(cur, 0, &stat); 5578 if (error) 5579 return error; 5580 if (stat != 0) { 5581 ASSERT(0); 5582 xfs_btree_mark_sick(cur); 5583 return -EFSCORRUPTED; 5584 } 5585 5586 return 0; 5587 } 5588 5589 /* Allocate a block for an inode-rooted metadata btree. */ 5590 int 5591 xfs_btree_alloc_metafile_block( 5592 struct xfs_btree_cur *cur, 5593 const union xfs_btree_ptr *start, 5594 union xfs_btree_ptr *new, 5595 int *stat) 5596 { 5597 struct xfs_alloc_arg args = { 5598 .mp = cur->bc_mp, 5599 .tp = cur->bc_tp, 5600 .resv = XFS_AG_RESV_METAFILE, 5601 .minlen = 1, 5602 .maxlen = 1, 5603 .prod = 1, 5604 }; 5605 struct xfs_inode *ip = cur->bc_ino.ip; 5606 int error; 5607 5608 ASSERT(xfs_is_metadir_inode(ip)); 5609 5610 xfs_rmap_ino_bmbt_owner(&args.oinfo, ip->i_ino, cur->bc_ino.whichfork); 5611 error = xfs_alloc_vextent_start_ag(&args, 5612 XFS_INO_TO_FSB(cur->bc_mp, ip->i_ino)); 5613 if (error) 5614 return error; 5615 if (args.fsbno == NULLFSBLOCK) { 5616 *stat = 0; 5617 return 0; 5618 } 5619 ASSERT(args.len == 1); 5620 5621 xfs_metafile_resv_alloc_space(ip, &args); 5622 5623 new->l = cpu_to_be64(args.fsbno); 5624 *stat = 1; 5625 return 0; 5626 } 5627 5628 /* Free a block from an inode-rooted metadata btree. */ 5629 int 5630 xfs_btree_free_metafile_block( 5631 struct xfs_btree_cur *cur, 5632 struct xfs_buf *bp) 5633 { 5634 struct xfs_owner_info oinfo; 5635 struct xfs_mount *mp = cur->bc_mp; 5636 struct xfs_inode *ip = cur->bc_ino.ip; 5637 struct xfs_trans *tp = cur->bc_tp; 5638 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 5639 int error; 5640 5641 ASSERT(xfs_is_metadir_inode(ip)); 5642 5643 xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); 5644 error = xfs_free_extent_later(tp, fsbno, 1, &oinfo, XFS_AG_RESV_METAFILE, 5645 0); 5646 if (error) 5647 return error; 5648 5649 xfs_metafile_resv_free_space(ip, tp, 1); 5650 return 0; 5651 } 5652