1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_inode.h" 15 #include "xfs_trans.h" 16 #include "xfs_buf_item.h" 17 #include "xfs_btree.h" 18 #include "xfs_errortag.h" 19 #include "xfs_error.h" 20 #include "xfs_trace.h" 21 #include "xfs_alloc.h" 22 #include "xfs_log.h" 23 #include "xfs_btree_staging.h" 24 #include "xfs_ag.h" 25 #include "xfs_alloc_btree.h" 26 #include "xfs_ialloc_btree.h" 27 #include "xfs_bmap_btree.h" 28 #include "xfs_rmap_btree.h" 29 #include "xfs_refcount_btree.h" 30 #include "xfs_health.h" 31 #include "xfs_buf_mem.h" 32 #include "xfs_btree_mem.h" 33 34 /* 35 * Btree magic numbers. 36 */ 37 uint32_t 38 xfs_btree_magic( 39 struct xfs_mount *mp, 40 const struct xfs_btree_ops *ops) 41 { 42 int idx = xfs_has_crc(mp) ? 1 : 0; 43 __be32 magic = ops->buf_ops->magic[idx]; 44 45 /* Ensure we asked for crc for crc-only magics. */ 46 ASSERT(magic != 0); 47 return be32_to_cpu(magic); 48 } 49 50 /* 51 * These sibling pointer checks are optimised for null sibling pointers. This 52 * happens a lot, and we don't need to byte swap at runtime if the sibling 53 * pointer is NULL. 54 * 55 * These are explicitly marked at inline because the cost of calling them as 56 * functions instead of inlining them is about 36 bytes extra code per call site 57 * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these 58 * two sibling check functions reduces the compiled code size by over 300 59 * bytes. 60 */ 61 static inline xfs_failaddr_t 62 xfs_btree_check_fsblock_siblings( 63 struct xfs_mount *mp, 64 xfs_fsblock_t fsb, 65 __be64 dsibling) 66 { 67 xfs_fsblock_t sibling; 68 69 if (dsibling == cpu_to_be64(NULLFSBLOCK)) 70 return NULL; 71 72 sibling = be64_to_cpu(dsibling); 73 if (sibling == fsb) 74 return __this_address; 75 if (!xfs_verify_fsbno(mp, sibling)) 76 return __this_address; 77 return NULL; 78 } 79 80 static inline xfs_failaddr_t 81 xfs_btree_check_memblock_siblings( 82 struct xfs_buftarg *btp, 83 xfbno_t bno, 84 __be64 dsibling) 85 { 86 xfbno_t sibling; 87 88 if (dsibling == cpu_to_be64(NULLFSBLOCK)) 89 return NULL; 90 91 sibling = be64_to_cpu(dsibling); 92 if (sibling == bno) 93 return __this_address; 94 if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) 95 return __this_address; 96 return NULL; 97 } 98 99 static inline xfs_failaddr_t 100 xfs_btree_check_agblock_siblings( 101 struct xfs_perag *pag, 102 xfs_agblock_t agbno, 103 __be32 dsibling) 104 { 105 xfs_agblock_t sibling; 106 107 if (dsibling == cpu_to_be32(NULLAGBLOCK)) 108 return NULL; 109 110 sibling = be32_to_cpu(dsibling); 111 if (sibling == agbno) 112 return __this_address; 113 if (!xfs_verify_agbno(pag, sibling)) 114 return __this_address; 115 return NULL; 116 } 117 118 static xfs_failaddr_t 119 __xfs_btree_check_lblock_hdr( 120 struct xfs_btree_cur *cur, 121 struct xfs_btree_block *block, 122 int level, 123 struct xfs_buf *bp) 124 { 125 struct xfs_mount *mp = cur->bc_mp; 126 127 if (xfs_has_crc(mp)) { 128 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 129 return __this_address; 130 if (block->bb_u.l.bb_blkno != 131 cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) 132 return __this_address; 133 if (block->bb_u.l.bb_pad != cpu_to_be32(0)) 134 return __this_address; 135 } 136 137 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) 138 return __this_address; 139 if (be16_to_cpu(block->bb_level) != level) 140 return __this_address; 141 if (be16_to_cpu(block->bb_numrecs) > 142 cur->bc_ops->get_maxrecs(cur, level)) 143 return __this_address; 144 145 return NULL; 146 } 147 148 /* 149 * Check a long btree block header. Return the address of the failing check, 150 * or NULL if everything is ok. 151 */ 152 static xfs_failaddr_t 153 __xfs_btree_check_fsblock( 154 struct xfs_btree_cur *cur, 155 struct xfs_btree_block *block, 156 int level, 157 struct xfs_buf *bp) 158 { 159 struct xfs_mount *mp = cur->bc_mp; 160 xfs_failaddr_t fa; 161 xfs_fsblock_t fsb; 162 163 fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); 164 if (fa) 165 return fa; 166 167 /* 168 * For inode-rooted btrees, the root block sits in the inode fork. In 169 * that case bp is NULL, and the block must not have any siblings. 170 */ 171 if (!bp) { 172 if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) 173 return __this_address; 174 if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) 175 return __this_address; 176 return NULL; 177 } 178 179 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 180 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 181 block->bb_u.l.bb_leftsib); 182 if (!fa) 183 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 184 block->bb_u.l.bb_rightsib); 185 return fa; 186 } 187 188 /* 189 * Check an in-memory btree block header. Return the address of the failing 190 * check, or NULL if everything is ok. 191 */ 192 static xfs_failaddr_t 193 __xfs_btree_check_memblock( 194 struct xfs_btree_cur *cur, 195 struct xfs_btree_block *block, 196 int level, 197 struct xfs_buf *bp) 198 { 199 struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; 200 xfs_failaddr_t fa; 201 xfbno_t bno; 202 203 fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); 204 if (fa) 205 return fa; 206 207 bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); 208 fa = xfs_btree_check_memblock_siblings(btp, bno, 209 block->bb_u.l.bb_leftsib); 210 if (!fa) 211 fa = xfs_btree_check_memblock_siblings(btp, bno, 212 block->bb_u.l.bb_rightsib); 213 return fa; 214 } 215 216 /* 217 * Check a short btree block header. Return the address of the failing check, 218 * or NULL if everything is ok. 219 */ 220 static xfs_failaddr_t 221 __xfs_btree_check_agblock( 222 struct xfs_btree_cur *cur, 223 struct xfs_btree_block *block, 224 int level, 225 struct xfs_buf *bp) 226 { 227 struct xfs_mount *mp = cur->bc_mp; 228 struct xfs_perag *pag = cur->bc_ag.pag; 229 xfs_failaddr_t fa; 230 xfs_agblock_t agbno; 231 232 if (xfs_has_crc(mp)) { 233 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 234 return __this_address; 235 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 236 return __this_address; 237 } 238 239 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) 240 return __this_address; 241 if (be16_to_cpu(block->bb_level) != level) 242 return __this_address; 243 if (be16_to_cpu(block->bb_numrecs) > 244 cur->bc_ops->get_maxrecs(cur, level)) 245 return __this_address; 246 247 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 248 fa = xfs_btree_check_agblock_siblings(pag, agbno, 249 block->bb_u.s.bb_leftsib); 250 if (!fa) 251 fa = xfs_btree_check_agblock_siblings(pag, agbno, 252 block->bb_u.s.bb_rightsib); 253 return fa; 254 } 255 256 /* 257 * Internal btree block check. 258 * 259 * Return NULL if the block is ok or the address of the failed check otherwise. 260 */ 261 xfs_failaddr_t 262 __xfs_btree_check_block( 263 struct xfs_btree_cur *cur, 264 struct xfs_btree_block *block, 265 int level, 266 struct xfs_buf *bp) 267 { 268 switch (cur->bc_ops->type) { 269 case XFS_BTREE_TYPE_MEM: 270 return __xfs_btree_check_memblock(cur, block, level, bp); 271 case XFS_BTREE_TYPE_AG: 272 return __xfs_btree_check_agblock(cur, block, level, bp); 273 case XFS_BTREE_TYPE_INODE: 274 return __xfs_btree_check_fsblock(cur, block, level, bp); 275 default: 276 ASSERT(0); 277 return __this_address; 278 } 279 } 280 281 static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) 282 { 283 if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) 284 return XFS_ERRTAG_BTREE_CHECK_SBLOCK; 285 return XFS_ERRTAG_BTREE_CHECK_LBLOCK; 286 } 287 288 /* 289 * Debug routine: check that block header is ok. 290 */ 291 int 292 xfs_btree_check_block( 293 struct xfs_btree_cur *cur, /* btree cursor */ 294 struct xfs_btree_block *block, /* generic btree block pointer */ 295 int level, /* level of the btree block */ 296 struct xfs_buf *bp) /* buffer containing block, if any */ 297 { 298 struct xfs_mount *mp = cur->bc_mp; 299 xfs_failaddr_t fa; 300 301 fa = __xfs_btree_check_block(cur, block, level, bp); 302 if (XFS_IS_CORRUPT(mp, fa != NULL) || 303 XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { 304 if (bp) 305 trace_xfs_btree_corrupt(bp, _RET_IP_); 306 xfs_btree_mark_sick(cur); 307 return -EFSCORRUPTED; 308 } 309 return 0; 310 } 311 312 int 313 __xfs_btree_check_ptr( 314 struct xfs_btree_cur *cur, 315 const union xfs_btree_ptr *ptr, 316 int index, 317 int level) 318 { 319 if (level <= 0) 320 return -EFSCORRUPTED; 321 322 switch (cur->bc_ops->type) { 323 case XFS_BTREE_TYPE_MEM: 324 if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, 325 be64_to_cpu((&ptr->l)[index]))) 326 return -EFSCORRUPTED; 327 break; 328 case XFS_BTREE_TYPE_INODE: 329 if (!xfs_verify_fsbno(cur->bc_mp, 330 be64_to_cpu((&ptr->l)[index]))) 331 return -EFSCORRUPTED; 332 break; 333 case XFS_BTREE_TYPE_AG: 334 if (!xfs_verify_agbno(cur->bc_ag.pag, 335 be32_to_cpu((&ptr->s)[index]))) 336 return -EFSCORRUPTED; 337 break; 338 } 339 340 return 0; 341 } 342 343 /* 344 * Check that a given (indexed) btree pointer at a certain level of a 345 * btree is valid and doesn't point past where it should. 346 */ 347 static int 348 xfs_btree_check_ptr( 349 struct xfs_btree_cur *cur, 350 const union xfs_btree_ptr *ptr, 351 int index, 352 int level) 353 { 354 int error; 355 356 error = __xfs_btree_check_ptr(cur, ptr, index, level); 357 if (error) { 358 switch (cur->bc_ops->type) { 359 case XFS_BTREE_TYPE_MEM: 360 xfs_err(cur->bc_mp, 361 "In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", 362 cur->bc_ops->name, cur->bc_flags, level, index, 363 __this_address); 364 break; 365 case XFS_BTREE_TYPE_INODE: 366 xfs_err(cur->bc_mp, 367 "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", 368 cur->bc_ino.ip->i_ino, 369 cur->bc_ino.whichfork, cur->bc_ops->name, 370 level, index); 371 break; 372 case XFS_BTREE_TYPE_AG: 373 xfs_err(cur->bc_mp, 374 "AG %u: Corrupt %sbt pointer at level %d index %d.", 375 cur->bc_ag.pag->pag_agno, cur->bc_ops->name, 376 level, index); 377 break; 378 } 379 xfs_btree_mark_sick(cur); 380 } 381 382 return error; 383 } 384 385 #ifdef DEBUG 386 # define xfs_btree_debug_check_ptr xfs_btree_check_ptr 387 #else 388 # define xfs_btree_debug_check_ptr(...) (0) 389 #endif 390 391 /* 392 * Calculate CRC on the whole btree block and stuff it into the 393 * long-form btree header. 394 * 395 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 396 * it into the buffer so recovery knows what the last modification was that made 397 * it to disk. 398 */ 399 void 400 xfs_btree_fsblock_calc_crc( 401 struct xfs_buf *bp) 402 { 403 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 404 struct xfs_buf_log_item *bip = bp->b_log_item; 405 406 if (!xfs_has_crc(bp->b_mount)) 407 return; 408 if (bip) 409 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 410 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 411 } 412 413 bool 414 xfs_btree_fsblock_verify_crc( 415 struct xfs_buf *bp) 416 { 417 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 418 struct xfs_mount *mp = bp->b_mount; 419 420 if (xfs_has_crc(mp)) { 421 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) 422 return false; 423 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 424 } 425 426 return true; 427 } 428 429 /* 430 * Calculate CRC on the whole btree block and stuff it into the 431 * short-form btree header. 432 * 433 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 434 * it into the buffer so recovery knows what the last modification was that made 435 * it to disk. 436 */ 437 void 438 xfs_btree_agblock_calc_crc( 439 struct xfs_buf *bp) 440 { 441 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 442 struct xfs_buf_log_item *bip = bp->b_log_item; 443 444 if (!xfs_has_crc(bp->b_mount)) 445 return; 446 if (bip) 447 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 448 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 449 } 450 451 bool 452 xfs_btree_agblock_verify_crc( 453 struct xfs_buf *bp) 454 { 455 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 456 struct xfs_mount *mp = bp->b_mount; 457 458 if (xfs_has_crc(mp)) { 459 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) 460 return false; 461 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 462 } 463 464 return true; 465 } 466 467 static int 468 xfs_btree_free_block( 469 struct xfs_btree_cur *cur, 470 struct xfs_buf *bp) 471 { 472 int error; 473 474 trace_xfs_btree_free_block(cur, bp); 475 476 /* 477 * Don't allow block freeing for a staging cursor, because staging 478 * cursors do not support regular btree modifications. 479 */ 480 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 481 ASSERT(0); 482 return -EFSCORRUPTED; 483 } 484 485 error = cur->bc_ops->free_block(cur, bp); 486 if (!error) { 487 xfs_trans_binval(cur->bc_tp, bp); 488 XFS_BTREE_STATS_INC(cur, free); 489 } 490 return error; 491 } 492 493 /* 494 * Delete the btree cursor. 495 */ 496 void 497 xfs_btree_del_cursor( 498 struct xfs_btree_cur *cur, /* btree cursor */ 499 int error) /* del because of error */ 500 { 501 int i; /* btree level */ 502 503 /* 504 * Clear the buffer pointers and release the buffers. If we're doing 505 * this because of an error, inspect all of the entries in the bc_bufs 506 * array for buffers to be unlocked. This is because some of the btree 507 * code works from level n down to 0, and if we get an error along the 508 * way we won't have initialized all the entries down to 0. 509 */ 510 for (i = 0; i < cur->bc_nlevels; i++) { 511 if (cur->bc_levels[i].bp) 512 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); 513 else if (!error) 514 break; 515 } 516 517 /* 518 * If we are doing a BMBT update, the number of unaccounted blocks 519 * allocated during this cursor life time should be zero. If it's not 520 * zero, then we should be shut down or on our way to shutdown due to 521 * cancelling a dirty transaction on error. 522 */ 523 ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || 524 xfs_is_shutdown(cur->bc_mp) || error != 0); 525 526 switch (cur->bc_ops->type) { 527 case XFS_BTREE_TYPE_AG: 528 if (cur->bc_ag.pag) 529 xfs_perag_put(cur->bc_ag.pag); 530 break; 531 case XFS_BTREE_TYPE_INODE: 532 /* nothing to do */ 533 break; 534 case XFS_BTREE_TYPE_MEM: 535 if (cur->bc_mem.pag) 536 xfs_perag_put(cur->bc_mem.pag); 537 break; 538 } 539 540 kmem_cache_free(cur->bc_cache, cur); 541 } 542 543 /* Return the buffer target for this btree's buffer. */ 544 static inline struct xfs_buftarg * 545 xfs_btree_buftarg( 546 struct xfs_btree_cur *cur) 547 { 548 if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) 549 return cur->bc_mem.xfbtree->target; 550 return cur->bc_mp->m_ddev_targp; 551 } 552 553 /* Return the block size (in units of 512b sectors) for this btree. */ 554 static inline unsigned int 555 xfs_btree_bbsize( 556 struct xfs_btree_cur *cur) 557 { 558 if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) 559 return XFBNO_BBSIZE; 560 return cur->bc_mp->m_bsize; 561 } 562 563 /* 564 * Duplicate the btree cursor. 565 * Allocate a new one, copy the record, re-get the buffers. 566 */ 567 int /* error */ 568 xfs_btree_dup_cursor( 569 struct xfs_btree_cur *cur, /* input cursor */ 570 struct xfs_btree_cur **ncur) /* output cursor */ 571 { 572 struct xfs_mount *mp = cur->bc_mp; 573 struct xfs_trans *tp = cur->bc_tp; 574 struct xfs_buf *bp; 575 struct xfs_btree_cur *new; 576 int error; 577 int i; 578 579 /* 580 * Don't allow staging cursors to be duplicated because they're supposed 581 * to be kept private to a single thread. 582 */ 583 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 584 ASSERT(0); 585 return -EFSCORRUPTED; 586 } 587 588 /* 589 * Allocate a new cursor like the old one. 590 */ 591 new = cur->bc_ops->dup_cursor(cur); 592 593 /* 594 * Copy the record currently in the cursor. 595 */ 596 new->bc_rec = cur->bc_rec; 597 598 /* 599 * For each level current, re-get the buffer and copy the ptr value. 600 */ 601 for (i = 0; i < new->bc_nlevels; i++) { 602 new->bc_levels[i].ptr = cur->bc_levels[i].ptr; 603 new->bc_levels[i].ra = cur->bc_levels[i].ra; 604 bp = cur->bc_levels[i].bp; 605 if (bp) { 606 error = xfs_trans_read_buf(mp, tp, 607 xfs_btree_buftarg(cur), 608 xfs_buf_daddr(bp), 609 xfs_btree_bbsize(cur), 0, &bp, 610 cur->bc_ops->buf_ops); 611 if (xfs_metadata_is_sick(error)) 612 xfs_btree_mark_sick(new); 613 if (error) { 614 xfs_btree_del_cursor(new, error); 615 *ncur = NULL; 616 return error; 617 } 618 } 619 new->bc_levels[i].bp = bp; 620 } 621 *ncur = new; 622 return 0; 623 } 624 625 /* 626 * XFS btree block layout and addressing: 627 * 628 * There are two types of blocks in the btree: leaf and non-leaf blocks. 629 * 630 * The leaf record start with a header then followed by records containing 631 * the values. A non-leaf block also starts with the same header, and 632 * then first contains lookup keys followed by an equal number of pointers 633 * to the btree blocks at the previous level. 634 * 635 * +--------+-------+-------+-------+-------+-------+-------+ 636 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | 637 * +--------+-------+-------+-------+-------+-------+-------+ 638 * 639 * +--------+-------+-------+-------+-------+-------+-------+ 640 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | 641 * +--------+-------+-------+-------+-------+-------+-------+ 642 * 643 * The header is called struct xfs_btree_block for reasons better left unknown 644 * and comes in different versions for short (32bit) and long (64bit) block 645 * pointers. The record and key structures are defined by the btree instances 646 * and opaque to the btree core. The block pointers are simple disk endian 647 * integers, available in a short (32bit) and long (64bit) variant. 648 * 649 * The helpers below calculate the offset of a given record, key or pointer 650 * into a btree block (xfs_btree_*_offset) or return a pointer to the given 651 * record, key or pointer (xfs_btree_*_addr). Note that all addressing 652 * inside the btree block is done using indices starting at one, not zero! 653 * 654 * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing 655 * overlapping intervals. In such a tree, records are still sorted lowest to 656 * highest and indexed by the smallest key value that refers to the record. 657 * However, nodes are different: each pointer has two associated keys -- one 658 * indexing the lowest key available in the block(s) below (the same behavior 659 * as the key in a regular btree) and another indexing the highest key 660 * available in the block(s) below. Because records are /not/ sorted by the 661 * highest key, all leaf block updates require us to compute the highest key 662 * that matches any record in the leaf and to recursively update the high keys 663 * in the nodes going further up in the tree, if necessary. Nodes look like 664 * this: 665 * 666 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 667 * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | 668 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 669 * 670 * To perform an interval query on an overlapped tree, perform the usual 671 * depth-first search and use the low and high keys to decide if we can skip 672 * that particular node. If a leaf node is reached, return the records that 673 * intersect the interval. Note that an interval query may return numerous 674 * entries. For a non-overlapped tree, simply search for the record associated 675 * with the lowest key and iterate forward until a non-matching record is 676 * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by 677 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in 678 * more detail. 679 * 680 * Why do we care about overlapping intervals? Let's say you have a bunch of 681 * reverse mapping records on a reflink filesystem: 682 * 683 * 1: +- file A startblock B offset C length D -----------+ 684 * 2: +- file E startblock F offset G length H --------------+ 685 * 3: +- file I startblock F offset J length K --+ 686 * 4: +- file L... --+ 687 * 688 * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, 689 * we'd simply increment the length of record 1. But how do we find the record 690 * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return 691 * record 3 because the keys are ordered first by startblock. An interval 692 * query would return records 1 and 2 because they both overlap (B+D-1), and 693 * from that we can pick out record 1 as the appropriate left neighbor. 694 * 695 * In the non-overlapped case you can do a LE lookup and decrement the cursor 696 * because a record's interval must end before the next record. 697 */ 698 699 /* 700 * Return size of the btree block header for this btree instance. 701 */ 702 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) 703 { 704 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 705 if (xfs_has_crc(cur->bc_mp)) 706 return XFS_BTREE_LBLOCK_CRC_LEN; 707 return XFS_BTREE_LBLOCK_LEN; 708 } 709 if (xfs_has_crc(cur->bc_mp)) 710 return XFS_BTREE_SBLOCK_CRC_LEN; 711 return XFS_BTREE_SBLOCK_LEN; 712 } 713 714 /* 715 * Calculate offset of the n-th record in a btree block. 716 */ 717 STATIC size_t 718 xfs_btree_rec_offset( 719 struct xfs_btree_cur *cur, 720 int n) 721 { 722 return xfs_btree_block_len(cur) + 723 (n - 1) * cur->bc_ops->rec_len; 724 } 725 726 /* 727 * Calculate offset of the n-th key in a btree block. 728 */ 729 STATIC size_t 730 xfs_btree_key_offset( 731 struct xfs_btree_cur *cur, 732 int n) 733 { 734 return xfs_btree_block_len(cur) + 735 (n - 1) * cur->bc_ops->key_len; 736 } 737 738 /* 739 * Calculate offset of the n-th high key in a btree block. 740 */ 741 STATIC size_t 742 xfs_btree_high_key_offset( 743 struct xfs_btree_cur *cur, 744 int n) 745 { 746 return xfs_btree_block_len(cur) + 747 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); 748 } 749 750 /* 751 * Calculate offset of the n-th block pointer in a btree block. 752 */ 753 STATIC size_t 754 xfs_btree_ptr_offset( 755 struct xfs_btree_cur *cur, 756 int n, 757 int level) 758 { 759 return xfs_btree_block_len(cur) + 760 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + 761 (n - 1) * cur->bc_ops->ptr_len; 762 } 763 764 /* 765 * Return a pointer to the n-th record in the btree block. 766 */ 767 union xfs_btree_rec * 768 xfs_btree_rec_addr( 769 struct xfs_btree_cur *cur, 770 int n, 771 struct xfs_btree_block *block) 772 { 773 return (union xfs_btree_rec *) 774 ((char *)block + xfs_btree_rec_offset(cur, n)); 775 } 776 777 /* 778 * Return a pointer to the n-th key in the btree block. 779 */ 780 union xfs_btree_key * 781 xfs_btree_key_addr( 782 struct xfs_btree_cur *cur, 783 int n, 784 struct xfs_btree_block *block) 785 { 786 return (union xfs_btree_key *) 787 ((char *)block + xfs_btree_key_offset(cur, n)); 788 } 789 790 /* 791 * Return a pointer to the n-th high key in the btree block. 792 */ 793 union xfs_btree_key * 794 xfs_btree_high_key_addr( 795 struct xfs_btree_cur *cur, 796 int n, 797 struct xfs_btree_block *block) 798 { 799 return (union xfs_btree_key *) 800 ((char *)block + xfs_btree_high_key_offset(cur, n)); 801 } 802 803 /* 804 * Return a pointer to the n-th block pointer in the btree block. 805 */ 806 union xfs_btree_ptr * 807 xfs_btree_ptr_addr( 808 struct xfs_btree_cur *cur, 809 int n, 810 struct xfs_btree_block *block) 811 { 812 int level = xfs_btree_get_level(block); 813 814 ASSERT(block->bb_level != 0); 815 816 return (union xfs_btree_ptr *) 817 ((char *)block + xfs_btree_ptr_offset(cur, n, level)); 818 } 819 820 struct xfs_ifork * 821 xfs_btree_ifork_ptr( 822 struct xfs_btree_cur *cur) 823 { 824 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 825 826 if (cur->bc_flags & XFS_BTREE_STAGING) 827 return cur->bc_ino.ifake->if_fork; 828 return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); 829 } 830 831 /* 832 * Get the root block which is stored in the inode. 833 * 834 * For now this btree implementation assumes the btree root is always 835 * stored in the if_broot field of an inode fork. 836 */ 837 STATIC struct xfs_btree_block * 838 xfs_btree_get_iroot( 839 struct xfs_btree_cur *cur) 840 { 841 struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 842 843 return (struct xfs_btree_block *)ifp->if_broot; 844 } 845 846 /* 847 * Retrieve the block pointer from the cursor at the given level. 848 * This may be an inode btree root or from a buffer. 849 */ 850 struct xfs_btree_block * /* generic btree block pointer */ 851 xfs_btree_get_block( 852 struct xfs_btree_cur *cur, /* btree cursor */ 853 int level, /* level in btree */ 854 struct xfs_buf **bpp) /* buffer containing the block */ 855 { 856 if (xfs_btree_at_iroot(cur, level)) { 857 *bpp = NULL; 858 return xfs_btree_get_iroot(cur); 859 } 860 861 *bpp = cur->bc_levels[level].bp; 862 return XFS_BUF_TO_BLOCK(*bpp); 863 } 864 865 /* 866 * Change the cursor to point to the first record at the given level. 867 * Other levels are unaffected. 868 */ 869 STATIC int /* success=1, failure=0 */ 870 xfs_btree_firstrec( 871 struct xfs_btree_cur *cur, /* btree cursor */ 872 int level) /* level to change */ 873 { 874 struct xfs_btree_block *block; /* generic btree block pointer */ 875 struct xfs_buf *bp; /* buffer containing block */ 876 877 /* 878 * Get the block pointer for this level. 879 */ 880 block = xfs_btree_get_block(cur, level, &bp); 881 if (xfs_btree_check_block(cur, block, level, bp)) 882 return 0; 883 /* 884 * It's empty, there is no such record. 885 */ 886 if (!block->bb_numrecs) 887 return 0; 888 /* 889 * Set the ptr value to 1, that's the first record/key. 890 */ 891 cur->bc_levels[level].ptr = 1; 892 return 1; 893 } 894 895 /* 896 * Change the cursor to point to the last record in the current block 897 * at the given level. Other levels are unaffected. 898 */ 899 STATIC int /* success=1, failure=0 */ 900 xfs_btree_lastrec( 901 struct xfs_btree_cur *cur, /* btree cursor */ 902 int level) /* level to change */ 903 { 904 struct xfs_btree_block *block; /* generic btree block pointer */ 905 struct xfs_buf *bp; /* buffer containing block */ 906 907 /* 908 * Get the block pointer for this level. 909 */ 910 block = xfs_btree_get_block(cur, level, &bp); 911 if (xfs_btree_check_block(cur, block, level, bp)) 912 return 0; 913 /* 914 * It's empty, there is no such record. 915 */ 916 if (!block->bb_numrecs) 917 return 0; 918 /* 919 * Set the ptr value to numrecs, that's the last record/key. 920 */ 921 cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); 922 return 1; 923 } 924 925 /* 926 * Compute first and last byte offsets for the fields given. 927 * Interprets the offsets table, which contains struct field offsets. 928 */ 929 void 930 xfs_btree_offsets( 931 uint32_t fields, /* bitmask of fields */ 932 const short *offsets, /* table of field offsets */ 933 int nbits, /* number of bits to inspect */ 934 int *first, /* output: first byte offset */ 935 int *last) /* output: last byte offset */ 936 { 937 int i; /* current bit number */ 938 uint32_t imask; /* mask for current bit number */ 939 940 ASSERT(fields != 0); 941 /* 942 * Find the lowest bit, so the first byte offset. 943 */ 944 for (i = 0, imask = 1u; ; i++, imask <<= 1) { 945 if (imask & fields) { 946 *first = offsets[i]; 947 break; 948 } 949 } 950 /* 951 * Find the highest bit, so the last byte offset. 952 */ 953 for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { 954 if (imask & fields) { 955 *last = offsets[i + 1] - 1; 956 break; 957 } 958 } 959 } 960 961 STATIC int 962 xfs_btree_readahead_fsblock( 963 struct xfs_btree_cur *cur, 964 int lr, 965 struct xfs_btree_block *block) 966 { 967 struct xfs_mount *mp = cur->bc_mp; 968 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 969 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 970 int rval = 0; 971 972 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 973 xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), 974 mp->m_bsize, cur->bc_ops->buf_ops); 975 rval++; 976 } 977 978 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 979 xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), 980 mp->m_bsize, cur->bc_ops->buf_ops); 981 rval++; 982 } 983 984 return rval; 985 } 986 987 STATIC int 988 xfs_btree_readahead_memblock( 989 struct xfs_btree_cur *cur, 990 int lr, 991 struct xfs_btree_block *block) 992 { 993 struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; 994 xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 995 xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 996 int rval = 0; 997 998 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 999 xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, 1000 cur->bc_ops->buf_ops); 1001 rval++; 1002 } 1003 1004 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 1005 xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, 1006 cur->bc_ops->buf_ops); 1007 rval++; 1008 } 1009 1010 return rval; 1011 } 1012 1013 STATIC int 1014 xfs_btree_readahead_agblock( 1015 struct xfs_btree_cur *cur, 1016 int lr, 1017 struct xfs_btree_block *block) 1018 { 1019 struct xfs_mount *mp = cur->bc_mp; 1020 xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; 1021 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); 1022 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); 1023 int rval = 0; 1024 1025 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 1026 xfs_buf_readahead(mp->m_ddev_targp, 1027 XFS_AGB_TO_DADDR(mp, agno, left), 1028 mp->m_bsize, cur->bc_ops->buf_ops); 1029 rval++; 1030 } 1031 1032 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 1033 xfs_buf_readahead(mp->m_ddev_targp, 1034 XFS_AGB_TO_DADDR(mp, agno, right), 1035 mp->m_bsize, cur->bc_ops->buf_ops); 1036 rval++; 1037 } 1038 1039 return rval; 1040 } 1041 1042 /* 1043 * Read-ahead btree blocks, at the given level. 1044 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 1045 */ 1046 STATIC int 1047 xfs_btree_readahead( 1048 struct xfs_btree_cur *cur, /* btree cursor */ 1049 int lev, /* level in btree */ 1050 int lr) /* left/right bits */ 1051 { 1052 struct xfs_btree_block *block; 1053 1054 /* 1055 * No readahead needed if we are at the root level and the 1056 * btree root is stored in the inode. 1057 */ 1058 if (xfs_btree_at_iroot(cur, lev)) 1059 return 0; 1060 1061 if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) 1062 return 0; 1063 1064 cur->bc_levels[lev].ra |= lr; 1065 block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); 1066 1067 switch (cur->bc_ops->type) { 1068 case XFS_BTREE_TYPE_AG: 1069 return xfs_btree_readahead_agblock(cur, lr, block); 1070 case XFS_BTREE_TYPE_INODE: 1071 return xfs_btree_readahead_fsblock(cur, lr, block); 1072 case XFS_BTREE_TYPE_MEM: 1073 return xfs_btree_readahead_memblock(cur, lr, block); 1074 default: 1075 ASSERT(0); 1076 return 0; 1077 } 1078 } 1079 1080 STATIC int 1081 xfs_btree_ptr_to_daddr( 1082 struct xfs_btree_cur *cur, 1083 const union xfs_btree_ptr *ptr, 1084 xfs_daddr_t *daddr) 1085 { 1086 int error; 1087 1088 error = xfs_btree_check_ptr(cur, ptr, 0, 1); 1089 if (error) 1090 return error; 1091 1092 switch (cur->bc_ops->type) { 1093 case XFS_BTREE_TYPE_AG: 1094 *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, 1095 be32_to_cpu(ptr->s)); 1096 break; 1097 case XFS_BTREE_TYPE_INODE: 1098 *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 1099 break; 1100 case XFS_BTREE_TYPE_MEM: 1101 *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); 1102 break; 1103 } 1104 return 0; 1105 } 1106 1107 /* 1108 * Readahead @count btree blocks at the given @ptr location. 1109 * 1110 * We don't need to care about long or short form btrees here as we have a 1111 * method of converting the ptr directly to a daddr available to us. 1112 */ 1113 STATIC void 1114 xfs_btree_readahead_ptr( 1115 struct xfs_btree_cur *cur, 1116 union xfs_btree_ptr *ptr, 1117 xfs_extlen_t count) 1118 { 1119 xfs_daddr_t daddr; 1120 1121 if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) 1122 return; 1123 xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, 1124 xfs_btree_bbsize(cur) * count, 1125 cur->bc_ops->buf_ops); 1126 } 1127 1128 /* 1129 * Set the buffer for level "lev" in the cursor to bp, releasing 1130 * any previous buffer. 1131 */ 1132 STATIC void 1133 xfs_btree_setbuf( 1134 struct xfs_btree_cur *cur, /* btree cursor */ 1135 int lev, /* level in btree */ 1136 struct xfs_buf *bp) /* new buffer to set */ 1137 { 1138 struct xfs_btree_block *b; /* btree block */ 1139 1140 if (cur->bc_levels[lev].bp) 1141 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); 1142 cur->bc_levels[lev].bp = bp; 1143 cur->bc_levels[lev].ra = 0; 1144 1145 b = XFS_BUF_TO_BLOCK(bp); 1146 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1147 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) 1148 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1149 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) 1150 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1151 } else { 1152 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 1153 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1154 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) 1155 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1156 } 1157 } 1158 1159 bool 1160 xfs_btree_ptr_is_null( 1161 struct xfs_btree_cur *cur, 1162 const union xfs_btree_ptr *ptr) 1163 { 1164 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 1165 return ptr->l == cpu_to_be64(NULLFSBLOCK); 1166 else 1167 return ptr->s == cpu_to_be32(NULLAGBLOCK); 1168 } 1169 1170 void 1171 xfs_btree_set_ptr_null( 1172 struct xfs_btree_cur *cur, 1173 union xfs_btree_ptr *ptr) 1174 { 1175 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 1176 ptr->l = cpu_to_be64(NULLFSBLOCK); 1177 else 1178 ptr->s = cpu_to_be32(NULLAGBLOCK); 1179 } 1180 1181 static inline bool 1182 xfs_btree_ptrs_equal( 1183 struct xfs_btree_cur *cur, 1184 union xfs_btree_ptr *ptr1, 1185 union xfs_btree_ptr *ptr2) 1186 { 1187 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 1188 return ptr1->l == ptr2->l; 1189 return ptr1->s == ptr2->s; 1190 } 1191 1192 /* 1193 * Get/set/init sibling pointers 1194 */ 1195 void 1196 xfs_btree_get_sibling( 1197 struct xfs_btree_cur *cur, 1198 struct xfs_btree_block *block, 1199 union xfs_btree_ptr *ptr, 1200 int lr) 1201 { 1202 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1203 1204 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1205 if (lr == XFS_BB_RIGHTSIB) 1206 ptr->l = block->bb_u.l.bb_rightsib; 1207 else 1208 ptr->l = block->bb_u.l.bb_leftsib; 1209 } else { 1210 if (lr == XFS_BB_RIGHTSIB) 1211 ptr->s = block->bb_u.s.bb_rightsib; 1212 else 1213 ptr->s = block->bb_u.s.bb_leftsib; 1214 } 1215 } 1216 1217 void 1218 xfs_btree_set_sibling( 1219 struct xfs_btree_cur *cur, 1220 struct xfs_btree_block *block, 1221 const union xfs_btree_ptr *ptr, 1222 int lr) 1223 { 1224 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1225 1226 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1227 if (lr == XFS_BB_RIGHTSIB) 1228 block->bb_u.l.bb_rightsib = ptr->l; 1229 else 1230 block->bb_u.l.bb_leftsib = ptr->l; 1231 } else { 1232 if (lr == XFS_BB_RIGHTSIB) 1233 block->bb_u.s.bb_rightsib = ptr->s; 1234 else 1235 block->bb_u.s.bb_leftsib = ptr->s; 1236 } 1237 } 1238 1239 static void 1240 __xfs_btree_init_block( 1241 struct xfs_mount *mp, 1242 struct xfs_btree_block *buf, 1243 const struct xfs_btree_ops *ops, 1244 xfs_daddr_t blkno, 1245 __u16 level, 1246 __u16 numrecs, 1247 __u64 owner) 1248 { 1249 bool crc = xfs_has_crc(mp); 1250 __u32 magic = xfs_btree_magic(mp, ops); 1251 1252 buf->bb_magic = cpu_to_be32(magic); 1253 buf->bb_level = cpu_to_be16(level); 1254 buf->bb_numrecs = cpu_to_be16(numrecs); 1255 1256 if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1257 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 1258 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 1259 if (crc) { 1260 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1261 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1262 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); 1263 buf->bb_u.l.bb_pad = 0; 1264 buf->bb_u.l.bb_lsn = 0; 1265 } 1266 } else { 1267 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 1268 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 1269 if (crc) { 1270 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 1271 /* owner is a 32 bit value on short blocks */ 1272 buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); 1273 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); 1274 buf->bb_u.s.bb_lsn = 0; 1275 } 1276 } 1277 } 1278 1279 void 1280 xfs_btree_init_block( 1281 struct xfs_mount *mp, 1282 struct xfs_btree_block *block, 1283 const struct xfs_btree_ops *ops, 1284 __u16 level, 1285 __u16 numrecs, 1286 __u64 owner) 1287 { 1288 __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, 1289 numrecs, owner); 1290 } 1291 1292 void 1293 xfs_btree_init_buf( 1294 struct xfs_mount *mp, 1295 struct xfs_buf *bp, 1296 const struct xfs_btree_ops *ops, 1297 __u16 level, 1298 __u16 numrecs, 1299 __u64 owner) 1300 { 1301 __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, 1302 xfs_buf_daddr(bp), level, numrecs, owner); 1303 bp->b_ops = ops->buf_ops; 1304 } 1305 1306 static inline __u64 1307 xfs_btree_owner( 1308 struct xfs_btree_cur *cur) 1309 { 1310 switch (cur->bc_ops->type) { 1311 case XFS_BTREE_TYPE_MEM: 1312 return cur->bc_mem.xfbtree->owner; 1313 case XFS_BTREE_TYPE_INODE: 1314 return cur->bc_ino.ip->i_ino; 1315 case XFS_BTREE_TYPE_AG: 1316 return cur->bc_ag.pag->pag_agno; 1317 default: 1318 ASSERT(0); 1319 return 0; 1320 } 1321 } 1322 1323 void 1324 xfs_btree_init_block_cur( 1325 struct xfs_btree_cur *cur, 1326 struct xfs_buf *bp, 1327 int level, 1328 int numrecs) 1329 { 1330 xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, 1331 xfs_btree_owner(cur)); 1332 } 1333 1334 STATIC void 1335 xfs_btree_buf_to_ptr( 1336 struct xfs_btree_cur *cur, 1337 struct xfs_buf *bp, 1338 union xfs_btree_ptr *ptr) 1339 { 1340 switch (cur->bc_ops->type) { 1341 case XFS_BTREE_TYPE_AG: 1342 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, 1343 xfs_buf_daddr(bp))); 1344 break; 1345 case XFS_BTREE_TYPE_INODE: 1346 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 1347 xfs_buf_daddr(bp))); 1348 break; 1349 case XFS_BTREE_TYPE_MEM: 1350 ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); 1351 break; 1352 } 1353 } 1354 1355 static inline void 1356 xfs_btree_set_refs( 1357 struct xfs_btree_cur *cur, 1358 struct xfs_buf *bp) 1359 { 1360 xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); 1361 } 1362 1363 int 1364 xfs_btree_get_buf_block( 1365 struct xfs_btree_cur *cur, 1366 const union xfs_btree_ptr *ptr, 1367 struct xfs_btree_block **block, 1368 struct xfs_buf **bpp) 1369 { 1370 xfs_daddr_t d; 1371 int error; 1372 1373 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1374 if (error) 1375 return error; 1376 error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, 1377 xfs_btree_bbsize(cur), 0, bpp); 1378 if (error) 1379 return error; 1380 1381 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1382 *block = XFS_BUF_TO_BLOCK(*bpp); 1383 return 0; 1384 } 1385 1386 /* 1387 * Read in the buffer at the given ptr and return the buffer and 1388 * the block pointer within the buffer. 1389 */ 1390 int 1391 xfs_btree_read_buf_block( 1392 struct xfs_btree_cur *cur, 1393 const union xfs_btree_ptr *ptr, 1394 int flags, 1395 struct xfs_btree_block **block, 1396 struct xfs_buf **bpp) 1397 { 1398 struct xfs_mount *mp = cur->bc_mp; 1399 xfs_daddr_t d; 1400 int error; 1401 1402 /* need to sort out how callers deal with failures first */ 1403 ASSERT(!(flags & XBF_TRYLOCK)); 1404 1405 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1406 if (error) 1407 return error; 1408 error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, 1409 xfs_btree_bbsize(cur), flags, bpp, 1410 cur->bc_ops->buf_ops); 1411 if (xfs_metadata_is_sick(error)) 1412 xfs_btree_mark_sick(cur); 1413 if (error) 1414 return error; 1415 1416 xfs_btree_set_refs(cur, *bpp); 1417 *block = XFS_BUF_TO_BLOCK(*bpp); 1418 return 0; 1419 } 1420 1421 /* 1422 * Copy keys from one btree block to another. 1423 */ 1424 void 1425 xfs_btree_copy_keys( 1426 struct xfs_btree_cur *cur, 1427 union xfs_btree_key *dst_key, 1428 const union xfs_btree_key *src_key, 1429 int numkeys) 1430 { 1431 ASSERT(numkeys >= 0); 1432 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); 1433 } 1434 1435 /* 1436 * Copy records from one btree block to another. 1437 */ 1438 STATIC void 1439 xfs_btree_copy_recs( 1440 struct xfs_btree_cur *cur, 1441 union xfs_btree_rec *dst_rec, 1442 union xfs_btree_rec *src_rec, 1443 int numrecs) 1444 { 1445 ASSERT(numrecs >= 0); 1446 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); 1447 } 1448 1449 /* 1450 * Copy block pointers from one btree block to another. 1451 */ 1452 void 1453 xfs_btree_copy_ptrs( 1454 struct xfs_btree_cur *cur, 1455 union xfs_btree_ptr *dst_ptr, 1456 const union xfs_btree_ptr *src_ptr, 1457 int numptrs) 1458 { 1459 ASSERT(numptrs >= 0); 1460 memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); 1461 } 1462 1463 /* 1464 * Shift keys one index left/right inside a single btree block. 1465 */ 1466 STATIC void 1467 xfs_btree_shift_keys( 1468 struct xfs_btree_cur *cur, 1469 union xfs_btree_key *key, 1470 int dir, 1471 int numkeys) 1472 { 1473 char *dst_key; 1474 1475 ASSERT(numkeys >= 0); 1476 ASSERT(dir == 1 || dir == -1); 1477 1478 dst_key = (char *)key + (dir * cur->bc_ops->key_len); 1479 memmove(dst_key, key, numkeys * cur->bc_ops->key_len); 1480 } 1481 1482 /* 1483 * Shift records one index left/right inside a single btree block. 1484 */ 1485 STATIC void 1486 xfs_btree_shift_recs( 1487 struct xfs_btree_cur *cur, 1488 union xfs_btree_rec *rec, 1489 int dir, 1490 int numrecs) 1491 { 1492 char *dst_rec; 1493 1494 ASSERT(numrecs >= 0); 1495 ASSERT(dir == 1 || dir == -1); 1496 1497 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); 1498 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); 1499 } 1500 1501 /* 1502 * Shift block pointers one index left/right inside a single btree block. 1503 */ 1504 STATIC void 1505 xfs_btree_shift_ptrs( 1506 struct xfs_btree_cur *cur, 1507 union xfs_btree_ptr *ptr, 1508 int dir, 1509 int numptrs) 1510 { 1511 char *dst_ptr; 1512 1513 ASSERT(numptrs >= 0); 1514 ASSERT(dir == 1 || dir == -1); 1515 1516 dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); 1517 memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); 1518 } 1519 1520 /* 1521 * Log key values from the btree block. 1522 */ 1523 STATIC void 1524 xfs_btree_log_keys( 1525 struct xfs_btree_cur *cur, 1526 struct xfs_buf *bp, 1527 int first, 1528 int last) 1529 { 1530 1531 if (bp) { 1532 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1533 xfs_trans_log_buf(cur->bc_tp, bp, 1534 xfs_btree_key_offset(cur, first), 1535 xfs_btree_key_offset(cur, last + 1) - 1); 1536 } else { 1537 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1538 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1539 } 1540 } 1541 1542 /* 1543 * Log record values from the btree block. 1544 */ 1545 void 1546 xfs_btree_log_recs( 1547 struct xfs_btree_cur *cur, 1548 struct xfs_buf *bp, 1549 int first, 1550 int last) 1551 { 1552 1553 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1554 xfs_trans_log_buf(cur->bc_tp, bp, 1555 xfs_btree_rec_offset(cur, first), 1556 xfs_btree_rec_offset(cur, last + 1) - 1); 1557 1558 } 1559 1560 /* 1561 * Log block pointer fields from a btree block (nonleaf). 1562 */ 1563 STATIC void 1564 xfs_btree_log_ptrs( 1565 struct xfs_btree_cur *cur, /* btree cursor */ 1566 struct xfs_buf *bp, /* buffer containing btree block */ 1567 int first, /* index of first pointer to log */ 1568 int last) /* index of last pointer to log */ 1569 { 1570 1571 if (bp) { 1572 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 1573 int level = xfs_btree_get_level(block); 1574 1575 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1576 xfs_trans_log_buf(cur->bc_tp, bp, 1577 xfs_btree_ptr_offset(cur, first, level), 1578 xfs_btree_ptr_offset(cur, last + 1, level) - 1); 1579 } else { 1580 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1581 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1582 } 1583 1584 } 1585 1586 /* 1587 * Log fields from a btree block header. 1588 */ 1589 void 1590 xfs_btree_log_block( 1591 struct xfs_btree_cur *cur, /* btree cursor */ 1592 struct xfs_buf *bp, /* buffer containing btree block */ 1593 uint32_t fields) /* mask of fields: XFS_BB_... */ 1594 { 1595 int first; /* first byte offset logged */ 1596 int last; /* last byte offset logged */ 1597 static const short soffsets[] = { /* table of offsets (short) */ 1598 offsetof(struct xfs_btree_block, bb_magic), 1599 offsetof(struct xfs_btree_block, bb_level), 1600 offsetof(struct xfs_btree_block, bb_numrecs), 1601 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), 1602 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), 1603 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), 1604 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), 1605 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), 1606 offsetof(struct xfs_btree_block, bb_u.s.bb_owner), 1607 offsetof(struct xfs_btree_block, bb_u.s.bb_crc), 1608 XFS_BTREE_SBLOCK_CRC_LEN 1609 }; 1610 static const short loffsets[] = { /* table of offsets (long) */ 1611 offsetof(struct xfs_btree_block, bb_magic), 1612 offsetof(struct xfs_btree_block, bb_level), 1613 offsetof(struct xfs_btree_block, bb_numrecs), 1614 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), 1615 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), 1616 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), 1617 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), 1618 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), 1619 offsetof(struct xfs_btree_block, bb_u.l.bb_owner), 1620 offsetof(struct xfs_btree_block, bb_u.l.bb_crc), 1621 offsetof(struct xfs_btree_block, bb_u.l.bb_pad), 1622 XFS_BTREE_LBLOCK_CRC_LEN 1623 }; 1624 1625 if (bp) { 1626 int nbits; 1627 1628 if (xfs_has_crc(cur->bc_mp)) { 1629 /* 1630 * We don't log the CRC when updating a btree 1631 * block but instead recreate it during log 1632 * recovery. As the log buffers have checksums 1633 * of their own this is safe and avoids logging a crc 1634 * update in a lot of places. 1635 */ 1636 if (fields == XFS_BB_ALL_BITS) 1637 fields = XFS_BB_ALL_BITS_CRC; 1638 nbits = XFS_BB_NUM_BITS_CRC; 1639 } else { 1640 nbits = XFS_BB_NUM_BITS; 1641 } 1642 xfs_btree_offsets(fields, 1643 (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? 1644 loffsets : soffsets, 1645 nbits, &first, &last); 1646 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1647 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 1648 } else { 1649 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1650 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1651 } 1652 } 1653 1654 /* 1655 * Increment cursor by one record at the level. 1656 * For nonzero levels the leaf-ward information is untouched. 1657 */ 1658 int /* error */ 1659 xfs_btree_increment( 1660 struct xfs_btree_cur *cur, 1661 int level, 1662 int *stat) /* success/failure */ 1663 { 1664 struct xfs_btree_block *block; 1665 union xfs_btree_ptr ptr; 1666 struct xfs_buf *bp; 1667 int error; /* error return value */ 1668 int lev; 1669 1670 ASSERT(level < cur->bc_nlevels); 1671 1672 /* Read-ahead to the right at this level. */ 1673 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 1674 1675 /* Get a pointer to the btree block. */ 1676 block = xfs_btree_get_block(cur, level, &bp); 1677 1678 #ifdef DEBUG 1679 error = xfs_btree_check_block(cur, block, level, bp); 1680 if (error) 1681 goto error0; 1682 #endif 1683 1684 /* We're done if we remain in the block after the increment. */ 1685 if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) 1686 goto out1; 1687 1688 /* Fail if we just went off the right edge of the tree. */ 1689 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1690 if (xfs_btree_ptr_is_null(cur, &ptr)) 1691 goto out0; 1692 1693 XFS_BTREE_STATS_INC(cur, increment); 1694 1695 /* 1696 * March up the tree incrementing pointers. 1697 * Stop when we don't go off the right edge of a block. 1698 */ 1699 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1700 block = xfs_btree_get_block(cur, lev, &bp); 1701 1702 #ifdef DEBUG 1703 error = xfs_btree_check_block(cur, block, lev, bp); 1704 if (error) 1705 goto error0; 1706 #endif 1707 1708 if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) 1709 break; 1710 1711 /* Read-ahead the right block for the next loop. */ 1712 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); 1713 } 1714 1715 /* 1716 * If we went off the root then we are either seriously 1717 * confused or have the tree root in an inode. 1718 */ 1719 if (lev == cur->bc_nlevels) { 1720 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) 1721 goto out0; 1722 ASSERT(0); 1723 xfs_btree_mark_sick(cur); 1724 error = -EFSCORRUPTED; 1725 goto error0; 1726 } 1727 ASSERT(lev < cur->bc_nlevels); 1728 1729 /* 1730 * Now walk back down the tree, fixing up the cursor's buffer 1731 * pointers and key numbers. 1732 */ 1733 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1734 union xfs_btree_ptr *ptrp; 1735 1736 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1737 --lev; 1738 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1739 if (error) 1740 goto error0; 1741 1742 xfs_btree_setbuf(cur, lev, bp); 1743 cur->bc_levels[lev].ptr = 1; 1744 } 1745 out1: 1746 *stat = 1; 1747 return 0; 1748 1749 out0: 1750 *stat = 0; 1751 return 0; 1752 1753 error0: 1754 return error; 1755 } 1756 1757 /* 1758 * Decrement cursor by one record at the level. 1759 * For nonzero levels the leaf-ward information is untouched. 1760 */ 1761 int /* error */ 1762 xfs_btree_decrement( 1763 struct xfs_btree_cur *cur, 1764 int level, 1765 int *stat) /* success/failure */ 1766 { 1767 struct xfs_btree_block *block; 1768 struct xfs_buf *bp; 1769 int error; /* error return value */ 1770 int lev; 1771 union xfs_btree_ptr ptr; 1772 1773 ASSERT(level < cur->bc_nlevels); 1774 1775 /* Read-ahead to the left at this level. */ 1776 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); 1777 1778 /* We're done if we remain in the block after the decrement. */ 1779 if (--cur->bc_levels[level].ptr > 0) 1780 goto out1; 1781 1782 /* Get a pointer to the btree block. */ 1783 block = xfs_btree_get_block(cur, level, &bp); 1784 1785 #ifdef DEBUG 1786 error = xfs_btree_check_block(cur, block, level, bp); 1787 if (error) 1788 goto error0; 1789 #endif 1790 1791 /* Fail if we just went off the left edge of the tree. */ 1792 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 1793 if (xfs_btree_ptr_is_null(cur, &ptr)) 1794 goto out0; 1795 1796 XFS_BTREE_STATS_INC(cur, decrement); 1797 1798 /* 1799 * March up the tree decrementing pointers. 1800 * Stop when we don't go off the left edge of a block. 1801 */ 1802 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1803 if (--cur->bc_levels[lev].ptr > 0) 1804 break; 1805 /* Read-ahead the left block for the next loop. */ 1806 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); 1807 } 1808 1809 /* 1810 * If we went off the root then we are seriously confused. 1811 * or the root of the tree is in an inode. 1812 */ 1813 if (lev == cur->bc_nlevels) { 1814 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) 1815 goto out0; 1816 ASSERT(0); 1817 xfs_btree_mark_sick(cur); 1818 error = -EFSCORRUPTED; 1819 goto error0; 1820 } 1821 ASSERT(lev < cur->bc_nlevels); 1822 1823 /* 1824 * Now walk back down the tree, fixing up the cursor's buffer 1825 * pointers and key numbers. 1826 */ 1827 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1828 union xfs_btree_ptr *ptrp; 1829 1830 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1831 --lev; 1832 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1833 if (error) 1834 goto error0; 1835 xfs_btree_setbuf(cur, lev, bp); 1836 cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); 1837 } 1838 out1: 1839 *stat = 1; 1840 return 0; 1841 1842 out0: 1843 *stat = 0; 1844 return 0; 1845 1846 error0: 1847 return error; 1848 } 1849 1850 /* 1851 * Check the btree block owner now that we have the context to know who the 1852 * real owner is. 1853 */ 1854 static inline xfs_failaddr_t 1855 xfs_btree_check_block_owner( 1856 struct xfs_btree_cur *cur, 1857 struct xfs_btree_block *block) 1858 { 1859 __u64 owner; 1860 1861 if (!xfs_has_crc(cur->bc_mp) || 1862 (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) 1863 return NULL; 1864 1865 owner = xfs_btree_owner(cur); 1866 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 1867 if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) 1868 return __this_address; 1869 } else { 1870 if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) 1871 return __this_address; 1872 } 1873 1874 return NULL; 1875 } 1876 1877 int 1878 xfs_btree_lookup_get_block( 1879 struct xfs_btree_cur *cur, /* btree cursor */ 1880 int level, /* level in the btree */ 1881 const union xfs_btree_ptr *pp, /* ptr to btree block */ 1882 struct xfs_btree_block **blkp) /* return btree block */ 1883 { 1884 struct xfs_buf *bp; /* buffer pointer for btree block */ 1885 xfs_daddr_t daddr; 1886 int error = 0; 1887 1888 /* special case the root block if in an inode */ 1889 if (xfs_btree_at_iroot(cur, level)) { 1890 *blkp = xfs_btree_get_iroot(cur); 1891 return 0; 1892 } 1893 1894 /* 1895 * If the old buffer at this level for the disk address we are 1896 * looking for re-use it. 1897 * 1898 * Otherwise throw it away and get a new one. 1899 */ 1900 bp = cur->bc_levels[level].bp; 1901 error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); 1902 if (error) 1903 return error; 1904 if (bp && xfs_buf_daddr(bp) == daddr) { 1905 *blkp = XFS_BUF_TO_BLOCK(bp); 1906 return 0; 1907 } 1908 1909 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); 1910 if (error) 1911 return error; 1912 1913 /* Check the inode owner since the verifiers don't. */ 1914 if (xfs_btree_check_block_owner(cur, *blkp) != NULL) 1915 goto out_bad; 1916 1917 /* Did we get the level we were looking for? */ 1918 if (be16_to_cpu((*blkp)->bb_level) != level) 1919 goto out_bad; 1920 1921 /* Check that internal nodes have at least one record. */ 1922 if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) 1923 goto out_bad; 1924 1925 xfs_btree_setbuf(cur, level, bp); 1926 return 0; 1927 1928 out_bad: 1929 *blkp = NULL; 1930 xfs_buf_mark_corrupt(bp); 1931 xfs_trans_brelse(cur->bc_tp, bp); 1932 xfs_btree_mark_sick(cur); 1933 return -EFSCORRUPTED; 1934 } 1935 1936 /* 1937 * Get current search key. For level 0 we don't actually have a key 1938 * structure so we make one up from the record. For all other levels 1939 * we just return the right key. 1940 */ 1941 STATIC union xfs_btree_key * 1942 xfs_lookup_get_search_key( 1943 struct xfs_btree_cur *cur, 1944 int level, 1945 int keyno, 1946 struct xfs_btree_block *block, 1947 union xfs_btree_key *kp) 1948 { 1949 if (level == 0) { 1950 cur->bc_ops->init_key_from_rec(kp, 1951 xfs_btree_rec_addr(cur, keyno, block)); 1952 return kp; 1953 } 1954 1955 return xfs_btree_key_addr(cur, keyno, block); 1956 } 1957 1958 /* 1959 * Initialize a pointer to the root block. 1960 */ 1961 void 1962 xfs_btree_init_ptr_from_cur( 1963 struct xfs_btree_cur *cur, 1964 union xfs_btree_ptr *ptr) 1965 { 1966 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 1967 /* 1968 * Inode-rooted btrees call xfs_btree_get_iroot to find the root 1969 * in xfs_btree_lookup_get_block and don't need a pointer here. 1970 */ 1971 ptr->l = 0; 1972 } else if (cur->bc_flags & XFS_BTREE_STAGING) { 1973 ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); 1974 } else { 1975 cur->bc_ops->init_ptr_from_cur(cur, ptr); 1976 } 1977 } 1978 1979 /* 1980 * Lookup the record. The cursor is made to point to it, based on dir. 1981 * stat is set to 0 if can't find any such record, 1 for success. 1982 */ 1983 int /* error */ 1984 xfs_btree_lookup( 1985 struct xfs_btree_cur *cur, /* btree cursor */ 1986 xfs_lookup_t dir, /* <=, ==, or >= */ 1987 int *stat) /* success/failure */ 1988 { 1989 struct xfs_btree_block *block; /* current btree block */ 1990 int64_t diff; /* difference for the current key */ 1991 int error; /* error return value */ 1992 int keyno; /* current key number */ 1993 int level; /* level in the btree */ 1994 union xfs_btree_ptr *pp; /* ptr to btree block */ 1995 union xfs_btree_ptr ptr; /* ptr to btree block */ 1996 1997 XFS_BTREE_STATS_INC(cur, lookup); 1998 1999 /* No such thing as a zero-level tree. */ 2000 if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { 2001 xfs_btree_mark_sick(cur); 2002 return -EFSCORRUPTED; 2003 } 2004 2005 block = NULL; 2006 keyno = 0; 2007 2008 /* initialise start pointer from cursor */ 2009 xfs_btree_init_ptr_from_cur(cur, &ptr); 2010 pp = &ptr; 2011 2012 /* 2013 * Iterate over each level in the btree, starting at the root. 2014 * For each level above the leaves, find the key we need, based 2015 * on the lookup record, then follow the corresponding block 2016 * pointer down to the next level. 2017 */ 2018 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { 2019 /* Get the block we need to do the lookup on. */ 2020 error = xfs_btree_lookup_get_block(cur, level, pp, &block); 2021 if (error) 2022 goto error0; 2023 2024 if (diff == 0) { 2025 /* 2026 * If we already had a key match at a higher level, we 2027 * know we need to use the first entry in this block. 2028 */ 2029 keyno = 1; 2030 } else { 2031 /* Otherwise search this block. Do a binary search. */ 2032 2033 int high; /* high entry number */ 2034 int low; /* low entry number */ 2035 2036 /* Set low and high entry numbers, 1-based. */ 2037 low = 1; 2038 high = xfs_btree_get_numrecs(block); 2039 if (!high) { 2040 /* Block is empty, must be an empty leaf. */ 2041 if (level != 0 || cur->bc_nlevels != 1) { 2042 XFS_CORRUPTION_ERROR(__func__, 2043 XFS_ERRLEVEL_LOW, 2044 cur->bc_mp, block, 2045 sizeof(*block)); 2046 xfs_btree_mark_sick(cur); 2047 return -EFSCORRUPTED; 2048 } 2049 2050 cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; 2051 *stat = 0; 2052 return 0; 2053 } 2054 2055 /* Binary search the block. */ 2056 while (low <= high) { 2057 union xfs_btree_key key; 2058 union xfs_btree_key *kp; 2059 2060 XFS_BTREE_STATS_INC(cur, compare); 2061 2062 /* keyno is average of low and high. */ 2063 keyno = (low + high) >> 1; 2064 2065 /* Get current search key */ 2066 kp = xfs_lookup_get_search_key(cur, level, 2067 keyno, block, &key); 2068 2069 /* 2070 * Compute difference to get next direction: 2071 * - less than, move right 2072 * - greater than, move left 2073 * - equal, we're done 2074 */ 2075 diff = cur->bc_ops->key_diff(cur, kp); 2076 if (diff < 0) 2077 low = keyno + 1; 2078 else if (diff > 0) 2079 high = keyno - 1; 2080 else 2081 break; 2082 } 2083 } 2084 2085 /* 2086 * If there are more levels, set up for the next level 2087 * by getting the block number and filling in the cursor. 2088 */ 2089 if (level > 0) { 2090 /* 2091 * If we moved left, need the previous key number, 2092 * unless there isn't one. 2093 */ 2094 if (diff > 0 && --keyno < 1) 2095 keyno = 1; 2096 pp = xfs_btree_ptr_addr(cur, keyno, block); 2097 2098 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 2099 if (error) 2100 goto error0; 2101 2102 cur->bc_levels[level].ptr = keyno; 2103 } 2104 } 2105 2106 /* Done with the search. See if we need to adjust the results. */ 2107 if (dir != XFS_LOOKUP_LE && diff < 0) { 2108 keyno++; 2109 /* 2110 * If ge search and we went off the end of the block, but it's 2111 * not the last block, we're in the wrong block. 2112 */ 2113 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2114 if (dir == XFS_LOOKUP_GE && 2115 keyno > xfs_btree_get_numrecs(block) && 2116 !xfs_btree_ptr_is_null(cur, &ptr)) { 2117 int i; 2118 2119 cur->bc_levels[0].ptr = keyno; 2120 error = xfs_btree_increment(cur, 0, &i); 2121 if (error) 2122 goto error0; 2123 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 2124 xfs_btree_mark_sick(cur); 2125 return -EFSCORRUPTED; 2126 } 2127 *stat = 1; 2128 return 0; 2129 } 2130 } else if (dir == XFS_LOOKUP_LE && diff > 0) 2131 keyno--; 2132 cur->bc_levels[0].ptr = keyno; 2133 2134 /* Return if we succeeded or not. */ 2135 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2136 *stat = 0; 2137 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2138 *stat = 1; 2139 else 2140 *stat = 0; 2141 return 0; 2142 2143 error0: 2144 return error; 2145 } 2146 2147 /* Find the high key storage area from a regular key. */ 2148 union xfs_btree_key * 2149 xfs_btree_high_key_from_key( 2150 struct xfs_btree_cur *cur, 2151 union xfs_btree_key *key) 2152 { 2153 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2154 return (union xfs_btree_key *)((char *)key + 2155 (cur->bc_ops->key_len / 2)); 2156 } 2157 2158 /* Determine the low (and high if overlapped) keys of a leaf block */ 2159 STATIC void 2160 xfs_btree_get_leaf_keys( 2161 struct xfs_btree_cur *cur, 2162 struct xfs_btree_block *block, 2163 union xfs_btree_key *key) 2164 { 2165 union xfs_btree_key max_hkey; 2166 union xfs_btree_key hkey; 2167 union xfs_btree_rec *rec; 2168 union xfs_btree_key *high; 2169 int n; 2170 2171 rec = xfs_btree_rec_addr(cur, 1, block); 2172 cur->bc_ops->init_key_from_rec(key, rec); 2173 2174 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2175 2176 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2177 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2178 rec = xfs_btree_rec_addr(cur, n, block); 2179 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2180 if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) 2181 max_hkey = hkey; 2182 } 2183 2184 high = xfs_btree_high_key_from_key(cur, key); 2185 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2186 } 2187 } 2188 2189 /* Determine the low (and high if overlapped) keys of a node block */ 2190 STATIC void 2191 xfs_btree_get_node_keys( 2192 struct xfs_btree_cur *cur, 2193 struct xfs_btree_block *block, 2194 union xfs_btree_key *key) 2195 { 2196 union xfs_btree_key *hkey; 2197 union xfs_btree_key *max_hkey; 2198 union xfs_btree_key *high; 2199 int n; 2200 2201 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2202 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2203 cur->bc_ops->key_len / 2); 2204 2205 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2206 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2207 hkey = xfs_btree_high_key_addr(cur, n, block); 2208 if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) 2209 max_hkey = hkey; 2210 } 2211 2212 high = xfs_btree_high_key_from_key(cur, key); 2213 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2214 } else { 2215 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2216 cur->bc_ops->key_len); 2217 } 2218 } 2219 2220 /* Derive the keys for any btree block. */ 2221 void 2222 xfs_btree_get_keys( 2223 struct xfs_btree_cur *cur, 2224 struct xfs_btree_block *block, 2225 union xfs_btree_key *key) 2226 { 2227 if (be16_to_cpu(block->bb_level) == 0) 2228 xfs_btree_get_leaf_keys(cur, block, key); 2229 else 2230 xfs_btree_get_node_keys(cur, block, key); 2231 } 2232 2233 /* 2234 * Decide if we need to update the parent keys of a btree block. For 2235 * a standard btree this is only necessary if we're updating the first 2236 * record/key. For an overlapping btree, we must always update the 2237 * keys because the highest key can be in any of the records or keys 2238 * in the block. 2239 */ 2240 static inline bool 2241 xfs_btree_needs_key_update( 2242 struct xfs_btree_cur *cur, 2243 int ptr) 2244 { 2245 return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; 2246 } 2247 2248 /* 2249 * Update the low and high parent keys of the given level, progressing 2250 * towards the root. If force_all is false, stop if the keys for a given 2251 * level do not need updating. 2252 */ 2253 STATIC int 2254 __xfs_btree_updkeys( 2255 struct xfs_btree_cur *cur, 2256 int level, 2257 struct xfs_btree_block *block, 2258 struct xfs_buf *bp0, 2259 bool force_all) 2260 { 2261 union xfs_btree_key key; /* keys from current level */ 2262 union xfs_btree_key *lkey; /* keys from the next level up */ 2263 union xfs_btree_key *hkey; 2264 union xfs_btree_key *nlkey; /* keys from the next level up */ 2265 union xfs_btree_key *nhkey; 2266 struct xfs_buf *bp; 2267 int ptr; 2268 2269 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2270 2271 /* Exit if there aren't any parent levels to update. */ 2272 if (level + 1 >= cur->bc_nlevels) 2273 return 0; 2274 2275 trace_xfs_btree_updkeys(cur, level, bp0); 2276 2277 lkey = &key; 2278 hkey = xfs_btree_high_key_from_key(cur, lkey); 2279 xfs_btree_get_keys(cur, block, lkey); 2280 for (level++; level < cur->bc_nlevels; level++) { 2281 #ifdef DEBUG 2282 int error; 2283 #endif 2284 block = xfs_btree_get_block(cur, level, &bp); 2285 trace_xfs_btree_updkeys(cur, level, bp); 2286 #ifdef DEBUG 2287 error = xfs_btree_check_block(cur, block, level, bp); 2288 if (error) 2289 return error; 2290 #endif 2291 ptr = cur->bc_levels[level].ptr; 2292 nlkey = xfs_btree_key_addr(cur, ptr, block); 2293 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2294 if (!force_all && 2295 xfs_btree_keycmp_eq(cur, nlkey, lkey) && 2296 xfs_btree_keycmp_eq(cur, nhkey, hkey)) 2297 break; 2298 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2299 xfs_btree_log_keys(cur, bp, ptr, ptr); 2300 if (level + 1 >= cur->bc_nlevels) 2301 break; 2302 xfs_btree_get_node_keys(cur, block, lkey); 2303 } 2304 2305 return 0; 2306 } 2307 2308 /* Update all the keys from some level in cursor back to the root. */ 2309 STATIC int 2310 xfs_btree_updkeys_force( 2311 struct xfs_btree_cur *cur, 2312 int level) 2313 { 2314 struct xfs_buf *bp; 2315 struct xfs_btree_block *block; 2316 2317 block = xfs_btree_get_block(cur, level, &bp); 2318 return __xfs_btree_updkeys(cur, level, block, bp, true); 2319 } 2320 2321 /* 2322 * Update the parent keys of the given level, progressing towards the root. 2323 */ 2324 STATIC int 2325 xfs_btree_update_keys( 2326 struct xfs_btree_cur *cur, 2327 int level) 2328 { 2329 struct xfs_btree_block *block; 2330 struct xfs_buf *bp; 2331 union xfs_btree_key *kp; 2332 union xfs_btree_key key; 2333 int ptr; 2334 2335 ASSERT(level >= 0); 2336 2337 block = xfs_btree_get_block(cur, level, &bp); 2338 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) 2339 return __xfs_btree_updkeys(cur, level, block, bp, false); 2340 2341 /* 2342 * Go up the tree from this level toward the root. 2343 * At each level, update the key value to the value input. 2344 * Stop when we reach a level where the cursor isn't pointing 2345 * at the first entry in the block. 2346 */ 2347 xfs_btree_get_keys(cur, block, &key); 2348 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2349 #ifdef DEBUG 2350 int error; 2351 #endif 2352 block = xfs_btree_get_block(cur, level, &bp); 2353 #ifdef DEBUG 2354 error = xfs_btree_check_block(cur, block, level, bp); 2355 if (error) 2356 return error; 2357 #endif 2358 ptr = cur->bc_levels[level].ptr; 2359 kp = xfs_btree_key_addr(cur, ptr, block); 2360 xfs_btree_copy_keys(cur, kp, &key, 1); 2361 xfs_btree_log_keys(cur, bp, ptr, ptr); 2362 } 2363 2364 return 0; 2365 } 2366 2367 /* 2368 * Update the record referred to by cur to the value in the 2369 * given record. This either works (return 0) or gets an 2370 * EFSCORRUPTED error. 2371 */ 2372 int 2373 xfs_btree_update( 2374 struct xfs_btree_cur *cur, 2375 union xfs_btree_rec *rec) 2376 { 2377 struct xfs_btree_block *block; 2378 struct xfs_buf *bp; 2379 int error; 2380 int ptr; 2381 union xfs_btree_rec *rp; 2382 2383 /* Pick up the current block. */ 2384 block = xfs_btree_get_block(cur, 0, &bp); 2385 2386 #ifdef DEBUG 2387 error = xfs_btree_check_block(cur, block, 0, bp); 2388 if (error) 2389 goto error0; 2390 #endif 2391 /* Get the address of the rec to be updated. */ 2392 ptr = cur->bc_levels[0].ptr; 2393 rp = xfs_btree_rec_addr(cur, ptr, block); 2394 2395 /* Fill in the new contents and log them. */ 2396 xfs_btree_copy_recs(cur, rp, rec, 1); 2397 xfs_btree_log_recs(cur, bp, ptr, ptr); 2398 2399 /* Pass new key value up to our parent. */ 2400 if (xfs_btree_needs_key_update(cur, ptr)) { 2401 error = xfs_btree_update_keys(cur, 0); 2402 if (error) 2403 goto error0; 2404 } 2405 2406 return 0; 2407 2408 error0: 2409 return error; 2410 } 2411 2412 /* 2413 * Move 1 record left from cur/level if possible. 2414 * Update cur to reflect the new path. 2415 */ 2416 STATIC int /* error */ 2417 xfs_btree_lshift( 2418 struct xfs_btree_cur *cur, 2419 int level, 2420 int *stat) /* success/failure */ 2421 { 2422 struct xfs_buf *lbp; /* left buffer pointer */ 2423 struct xfs_btree_block *left; /* left btree block */ 2424 int lrecs; /* left record count */ 2425 struct xfs_buf *rbp; /* right buffer pointer */ 2426 struct xfs_btree_block *right; /* right btree block */ 2427 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2428 int rrecs; /* right record count */ 2429 union xfs_btree_ptr lptr; /* left btree pointer */ 2430 union xfs_btree_key *rkp = NULL; /* right btree key */ 2431 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2432 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2433 int error; /* error return value */ 2434 int i; 2435 2436 if (xfs_btree_at_iroot(cur, level)) 2437 goto out0; 2438 2439 /* Set up variables for this block as "right". */ 2440 right = xfs_btree_get_block(cur, level, &rbp); 2441 2442 #ifdef DEBUG 2443 error = xfs_btree_check_block(cur, right, level, rbp); 2444 if (error) 2445 goto error0; 2446 #endif 2447 2448 /* If we've got no left sibling then we can't shift an entry left. */ 2449 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2450 if (xfs_btree_ptr_is_null(cur, &lptr)) 2451 goto out0; 2452 2453 /* 2454 * If the cursor entry is the one that would be moved, don't 2455 * do it... it's too complicated. 2456 */ 2457 if (cur->bc_levels[level].ptr <= 1) 2458 goto out0; 2459 2460 /* Set up the left neighbor as "left". */ 2461 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2462 if (error) 2463 goto error0; 2464 2465 /* If it's full, it can't take another entry. */ 2466 lrecs = xfs_btree_get_numrecs(left); 2467 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2468 goto out0; 2469 2470 rrecs = xfs_btree_get_numrecs(right); 2471 2472 /* 2473 * We add one entry to the left side and remove one for the right side. 2474 * Account for it here, the changes will be updated on disk and logged 2475 * later. 2476 */ 2477 lrecs++; 2478 rrecs--; 2479 2480 XFS_BTREE_STATS_INC(cur, lshift); 2481 XFS_BTREE_STATS_ADD(cur, moves, 1); 2482 2483 /* 2484 * If non-leaf, copy a key and a ptr to the left block. 2485 * Log the changes to the left block. 2486 */ 2487 if (level > 0) { 2488 /* It's a non-leaf. Move keys and pointers. */ 2489 union xfs_btree_key *lkp; /* left btree key */ 2490 union xfs_btree_ptr *lpp; /* left address pointer */ 2491 2492 lkp = xfs_btree_key_addr(cur, lrecs, left); 2493 rkp = xfs_btree_key_addr(cur, 1, right); 2494 2495 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2496 rpp = xfs_btree_ptr_addr(cur, 1, right); 2497 2498 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2499 if (error) 2500 goto error0; 2501 2502 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2503 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2504 2505 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2506 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2507 2508 ASSERT(cur->bc_ops->keys_inorder(cur, 2509 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2510 } else { 2511 /* It's a leaf. Move records. */ 2512 union xfs_btree_rec *lrp; /* left record pointer */ 2513 2514 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2515 rrp = xfs_btree_rec_addr(cur, 1, right); 2516 2517 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2518 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2519 2520 ASSERT(cur->bc_ops->recs_inorder(cur, 2521 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2522 } 2523 2524 xfs_btree_set_numrecs(left, lrecs); 2525 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2526 2527 xfs_btree_set_numrecs(right, rrecs); 2528 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2529 2530 /* 2531 * Slide the contents of right down one entry. 2532 */ 2533 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2534 if (level > 0) { 2535 /* It's a nonleaf. operate on keys and ptrs */ 2536 for (i = 0; i < rrecs; i++) { 2537 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2538 if (error) 2539 goto error0; 2540 } 2541 2542 xfs_btree_shift_keys(cur, 2543 xfs_btree_key_addr(cur, 2, right), 2544 -1, rrecs); 2545 xfs_btree_shift_ptrs(cur, 2546 xfs_btree_ptr_addr(cur, 2, right), 2547 -1, rrecs); 2548 2549 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2550 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2551 } else { 2552 /* It's a leaf. operate on records */ 2553 xfs_btree_shift_recs(cur, 2554 xfs_btree_rec_addr(cur, 2, right), 2555 -1, rrecs); 2556 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2557 } 2558 2559 /* 2560 * Using a temporary cursor, update the parent key values of the 2561 * block on the left. 2562 */ 2563 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2564 error = xfs_btree_dup_cursor(cur, &tcur); 2565 if (error) 2566 goto error0; 2567 i = xfs_btree_firstrec(tcur, level); 2568 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2569 xfs_btree_mark_sick(cur); 2570 error = -EFSCORRUPTED; 2571 goto error0; 2572 } 2573 2574 error = xfs_btree_decrement(tcur, level, &i); 2575 if (error) 2576 goto error1; 2577 2578 /* Update the parent high keys of the left block, if needed. */ 2579 error = xfs_btree_update_keys(tcur, level); 2580 if (error) 2581 goto error1; 2582 2583 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2584 } 2585 2586 /* Update the parent keys of the right block. */ 2587 error = xfs_btree_update_keys(cur, level); 2588 if (error) 2589 goto error0; 2590 2591 /* Slide the cursor value left one. */ 2592 cur->bc_levels[level].ptr--; 2593 2594 *stat = 1; 2595 return 0; 2596 2597 out0: 2598 *stat = 0; 2599 return 0; 2600 2601 error0: 2602 return error; 2603 2604 error1: 2605 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2606 return error; 2607 } 2608 2609 /* 2610 * Move 1 record right from cur/level if possible. 2611 * Update cur to reflect the new path. 2612 */ 2613 STATIC int /* error */ 2614 xfs_btree_rshift( 2615 struct xfs_btree_cur *cur, 2616 int level, 2617 int *stat) /* success/failure */ 2618 { 2619 struct xfs_buf *lbp; /* left buffer pointer */ 2620 struct xfs_btree_block *left; /* left btree block */ 2621 struct xfs_buf *rbp; /* right buffer pointer */ 2622 struct xfs_btree_block *right; /* right btree block */ 2623 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2624 union xfs_btree_ptr rptr; /* right block pointer */ 2625 union xfs_btree_key *rkp; /* right btree key */ 2626 int rrecs; /* right record count */ 2627 int lrecs; /* left record count */ 2628 int error; /* error return value */ 2629 int i; /* loop counter */ 2630 2631 if (xfs_btree_at_iroot(cur, level)) 2632 goto out0; 2633 2634 /* Set up variables for this block as "left". */ 2635 left = xfs_btree_get_block(cur, level, &lbp); 2636 2637 #ifdef DEBUG 2638 error = xfs_btree_check_block(cur, left, level, lbp); 2639 if (error) 2640 goto error0; 2641 #endif 2642 2643 /* If we've got no right sibling then we can't shift an entry right. */ 2644 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2645 if (xfs_btree_ptr_is_null(cur, &rptr)) 2646 goto out0; 2647 2648 /* 2649 * If the cursor entry is the one that would be moved, don't 2650 * do it... it's too complicated. 2651 */ 2652 lrecs = xfs_btree_get_numrecs(left); 2653 if (cur->bc_levels[level].ptr >= lrecs) 2654 goto out0; 2655 2656 /* Set up the right neighbor as "right". */ 2657 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2658 if (error) 2659 goto error0; 2660 2661 /* If it's full, it can't take another entry. */ 2662 rrecs = xfs_btree_get_numrecs(right); 2663 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2664 goto out0; 2665 2666 XFS_BTREE_STATS_INC(cur, rshift); 2667 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2668 2669 /* 2670 * Make a hole at the start of the right neighbor block, then 2671 * copy the last left block entry to the hole. 2672 */ 2673 if (level > 0) { 2674 /* It's a nonleaf. make a hole in the keys and ptrs */ 2675 union xfs_btree_key *lkp; 2676 union xfs_btree_ptr *lpp; 2677 union xfs_btree_ptr *rpp; 2678 2679 lkp = xfs_btree_key_addr(cur, lrecs, left); 2680 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2681 rkp = xfs_btree_key_addr(cur, 1, right); 2682 rpp = xfs_btree_ptr_addr(cur, 1, right); 2683 2684 for (i = rrecs - 1; i >= 0; i--) { 2685 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2686 if (error) 2687 goto error0; 2688 } 2689 2690 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2691 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2692 2693 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2694 if (error) 2695 goto error0; 2696 2697 /* Now put the new data in, and log it. */ 2698 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2699 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2700 2701 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2702 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2703 2704 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2705 xfs_btree_key_addr(cur, 2, right))); 2706 } else { 2707 /* It's a leaf. make a hole in the records */ 2708 union xfs_btree_rec *lrp; 2709 union xfs_btree_rec *rrp; 2710 2711 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2712 rrp = xfs_btree_rec_addr(cur, 1, right); 2713 2714 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2715 2716 /* Now put the new data in, and log it. */ 2717 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2718 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2719 } 2720 2721 /* 2722 * Decrement and log left's numrecs, bump and log right's numrecs. 2723 */ 2724 xfs_btree_set_numrecs(left, --lrecs); 2725 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2726 2727 xfs_btree_set_numrecs(right, ++rrecs); 2728 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2729 2730 /* 2731 * Using a temporary cursor, update the parent key values of the 2732 * block on the right. 2733 */ 2734 error = xfs_btree_dup_cursor(cur, &tcur); 2735 if (error) 2736 goto error0; 2737 i = xfs_btree_lastrec(tcur, level); 2738 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2739 xfs_btree_mark_sick(cur); 2740 error = -EFSCORRUPTED; 2741 goto error0; 2742 } 2743 2744 error = xfs_btree_increment(tcur, level, &i); 2745 if (error) 2746 goto error1; 2747 2748 /* Update the parent high keys of the left block, if needed. */ 2749 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2750 error = xfs_btree_update_keys(cur, level); 2751 if (error) 2752 goto error1; 2753 } 2754 2755 /* Update the parent keys of the right block. */ 2756 error = xfs_btree_update_keys(tcur, level); 2757 if (error) 2758 goto error1; 2759 2760 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2761 2762 *stat = 1; 2763 return 0; 2764 2765 out0: 2766 *stat = 0; 2767 return 0; 2768 2769 error0: 2770 return error; 2771 2772 error1: 2773 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2774 return error; 2775 } 2776 2777 static inline int 2778 xfs_btree_alloc_block( 2779 struct xfs_btree_cur *cur, 2780 const union xfs_btree_ptr *hint_block, 2781 union xfs_btree_ptr *new_block, 2782 int *stat) 2783 { 2784 int error; 2785 2786 /* 2787 * Don't allow block allocation for a staging cursor, because staging 2788 * cursors do not support regular btree modifications. 2789 * 2790 * Bulk loading uses a separate callback to obtain new blocks from a 2791 * preallocated list, which prevents ENOSPC failures during loading. 2792 */ 2793 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 2794 ASSERT(0); 2795 return -EFSCORRUPTED; 2796 } 2797 2798 error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); 2799 trace_xfs_btree_alloc_block(cur, new_block, *stat, error); 2800 return error; 2801 } 2802 2803 /* 2804 * Split cur/level block in half. 2805 * Return new block number and the key to its first 2806 * record (to be inserted into parent). 2807 */ 2808 STATIC int /* error */ 2809 __xfs_btree_split( 2810 struct xfs_btree_cur *cur, 2811 int level, 2812 union xfs_btree_ptr *ptrp, 2813 union xfs_btree_key *key, 2814 struct xfs_btree_cur **curp, 2815 int *stat) /* success/failure */ 2816 { 2817 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2818 struct xfs_buf *lbp; /* left buffer pointer */ 2819 struct xfs_btree_block *left; /* left btree block */ 2820 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2821 struct xfs_buf *rbp; /* right buffer pointer */ 2822 struct xfs_btree_block *right; /* right btree block */ 2823 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2824 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2825 struct xfs_btree_block *rrblock; /* right-right btree block */ 2826 int lrecs; 2827 int rrecs; 2828 int src_index; 2829 int error; /* error return value */ 2830 int i; 2831 2832 XFS_BTREE_STATS_INC(cur, split); 2833 2834 /* Set up left block (current one). */ 2835 left = xfs_btree_get_block(cur, level, &lbp); 2836 2837 #ifdef DEBUG 2838 error = xfs_btree_check_block(cur, left, level, lbp); 2839 if (error) 2840 goto error0; 2841 #endif 2842 2843 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2844 2845 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2846 error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); 2847 if (error) 2848 goto error0; 2849 if (*stat == 0) 2850 goto out0; 2851 XFS_BTREE_STATS_INC(cur, alloc); 2852 2853 /* Set up the new block as "right". */ 2854 error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); 2855 if (error) 2856 goto error0; 2857 2858 /* Fill in the btree header for the new right block. */ 2859 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2860 2861 /* 2862 * Split the entries between the old and the new block evenly. 2863 * Make sure that if there's an odd number of entries now, that 2864 * each new block will have the same number of entries. 2865 */ 2866 lrecs = xfs_btree_get_numrecs(left); 2867 rrecs = lrecs / 2; 2868 if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) 2869 rrecs++; 2870 src_index = (lrecs - rrecs + 1); 2871 2872 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2873 2874 /* Adjust numrecs for the later get_*_keys() calls. */ 2875 lrecs -= rrecs; 2876 xfs_btree_set_numrecs(left, lrecs); 2877 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2878 2879 /* 2880 * Copy btree block entries from the left block over to the 2881 * new block, the right. Update the right block and log the 2882 * changes. 2883 */ 2884 if (level > 0) { 2885 /* It's a non-leaf. Move keys and pointers. */ 2886 union xfs_btree_key *lkp; /* left btree key */ 2887 union xfs_btree_ptr *lpp; /* left address pointer */ 2888 union xfs_btree_key *rkp; /* right btree key */ 2889 union xfs_btree_ptr *rpp; /* right address pointer */ 2890 2891 lkp = xfs_btree_key_addr(cur, src_index, left); 2892 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2893 rkp = xfs_btree_key_addr(cur, 1, right); 2894 rpp = xfs_btree_ptr_addr(cur, 1, right); 2895 2896 for (i = src_index; i < rrecs; i++) { 2897 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 2898 if (error) 2899 goto error0; 2900 } 2901 2902 /* Copy the keys & pointers to the new block. */ 2903 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2904 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2905 2906 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2907 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2908 2909 /* Stash the keys of the new block for later insertion. */ 2910 xfs_btree_get_node_keys(cur, right, key); 2911 } else { 2912 /* It's a leaf. Move records. */ 2913 union xfs_btree_rec *lrp; /* left record pointer */ 2914 union xfs_btree_rec *rrp; /* right record pointer */ 2915 2916 lrp = xfs_btree_rec_addr(cur, src_index, left); 2917 rrp = xfs_btree_rec_addr(cur, 1, right); 2918 2919 /* Copy records to the new block. */ 2920 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2921 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2922 2923 /* Stash the keys of the new block for later insertion. */ 2924 xfs_btree_get_leaf_keys(cur, right, key); 2925 } 2926 2927 /* 2928 * Find the left block number by looking in the buffer. 2929 * Adjust sibling pointers. 2930 */ 2931 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2932 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2933 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2934 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2935 2936 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2937 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2938 2939 /* 2940 * If there's a block to the new block's right, make that block 2941 * point back to right instead of to left. 2942 */ 2943 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2944 error = xfs_btree_read_buf_block(cur, &rrptr, 2945 0, &rrblock, &rrbp); 2946 if (error) 2947 goto error0; 2948 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2949 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2950 } 2951 2952 /* Update the parent high keys of the left block, if needed. */ 2953 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2954 error = xfs_btree_update_keys(cur, level); 2955 if (error) 2956 goto error0; 2957 } 2958 2959 /* 2960 * If the cursor is really in the right block, move it there. 2961 * If it's just pointing past the last entry in left, then we'll 2962 * insert there, so don't change anything in that case. 2963 */ 2964 if (cur->bc_levels[level].ptr > lrecs + 1) { 2965 xfs_btree_setbuf(cur, level, rbp); 2966 cur->bc_levels[level].ptr -= lrecs; 2967 } 2968 /* 2969 * If there are more levels, we'll need another cursor which refers 2970 * the right block, no matter where this cursor was. 2971 */ 2972 if (level + 1 < cur->bc_nlevels) { 2973 error = xfs_btree_dup_cursor(cur, curp); 2974 if (error) 2975 goto error0; 2976 (*curp)->bc_levels[level + 1].ptr++; 2977 } 2978 *ptrp = rptr; 2979 *stat = 1; 2980 return 0; 2981 out0: 2982 *stat = 0; 2983 return 0; 2984 2985 error0: 2986 return error; 2987 } 2988 2989 #ifdef __KERNEL__ 2990 struct xfs_btree_split_args { 2991 struct xfs_btree_cur *cur; 2992 int level; 2993 union xfs_btree_ptr *ptrp; 2994 union xfs_btree_key *key; 2995 struct xfs_btree_cur **curp; 2996 int *stat; /* success/failure */ 2997 int result; 2998 bool kswapd; /* allocation in kswapd context */ 2999 struct completion *done; 3000 struct work_struct work; 3001 }; 3002 3003 /* 3004 * Stack switching interfaces for allocation 3005 */ 3006 static void 3007 xfs_btree_split_worker( 3008 struct work_struct *work) 3009 { 3010 struct xfs_btree_split_args *args = container_of(work, 3011 struct xfs_btree_split_args, work); 3012 unsigned long pflags; 3013 unsigned long new_pflags = 0; 3014 3015 /* 3016 * we are in a transaction context here, but may also be doing work 3017 * in kswapd context, and hence we may need to inherit that state 3018 * temporarily to ensure that we don't block waiting for memory reclaim 3019 * in any way. 3020 */ 3021 if (args->kswapd) 3022 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 3023 3024 current_set_flags_nested(&pflags, new_pflags); 3025 xfs_trans_set_context(args->cur->bc_tp); 3026 3027 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 3028 args->key, args->curp, args->stat); 3029 3030 xfs_trans_clear_context(args->cur->bc_tp); 3031 current_restore_flags_nested(&pflags, new_pflags); 3032 3033 /* 3034 * Do not access args after complete() has run here. We don't own args 3035 * and the owner may run and free args before we return here. 3036 */ 3037 complete(args->done); 3038 3039 } 3040 3041 /* 3042 * BMBT split requests often come in with little stack to work on so we push 3043 * them off to a worker thread so there is lots of stack to use. For the other 3044 * btree types, just call directly to avoid the context switch overhead here. 3045 * 3046 * Care must be taken here - the work queue rescuer thread introduces potential 3047 * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new 3048 * AGFs to allocate blocks. A task being run by the rescuer could attempt to 3049 * lock an AGF that is already locked by a task queued to run by the rescuer, 3050 * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to 3051 * release it until the current thread it is running gains the lock. 3052 * 3053 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF 3054 * already locked to allocate from. The only place that doesn't hold an AGF 3055 * locked is unwritten extent conversion at IO completion, but that has already 3056 * been offloaded to a worker thread and hence has no stack consumption issues 3057 * we have to worry about. 3058 */ 3059 STATIC int /* error */ 3060 xfs_btree_split( 3061 struct xfs_btree_cur *cur, 3062 int level, 3063 union xfs_btree_ptr *ptrp, 3064 union xfs_btree_key *key, 3065 struct xfs_btree_cur **curp, 3066 int *stat) /* success/failure */ 3067 { 3068 struct xfs_btree_split_args args; 3069 DECLARE_COMPLETION_ONSTACK(done); 3070 3071 if (!xfs_btree_is_bmap(cur->bc_ops) || 3072 cur->bc_tp->t_highest_agno == NULLAGNUMBER) 3073 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 3074 3075 args.cur = cur; 3076 args.level = level; 3077 args.ptrp = ptrp; 3078 args.key = key; 3079 args.curp = curp; 3080 args.stat = stat; 3081 args.done = &done; 3082 args.kswapd = current_is_kswapd(); 3083 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 3084 queue_work(xfs_alloc_wq, &args.work); 3085 wait_for_completion(&done); 3086 destroy_work_on_stack(&args.work); 3087 return args.result; 3088 } 3089 #else 3090 #define xfs_btree_split __xfs_btree_split 3091 #endif /* __KERNEL__ */ 3092 3093 /* 3094 * Copy the old inode root contents into a real block and make the 3095 * broot point to it. 3096 */ 3097 int /* error */ 3098 xfs_btree_new_iroot( 3099 struct xfs_btree_cur *cur, /* btree cursor */ 3100 int *logflags, /* logging flags for inode */ 3101 int *stat) /* return status - 0 fail */ 3102 { 3103 struct xfs_buf *cbp; /* buffer for cblock */ 3104 struct xfs_btree_block *block; /* btree block */ 3105 struct xfs_btree_block *cblock; /* child btree block */ 3106 union xfs_btree_key *ckp; /* child key pointer */ 3107 union xfs_btree_ptr *cpp; /* child ptr pointer */ 3108 union xfs_btree_key *kp; /* pointer to btree key */ 3109 union xfs_btree_ptr *pp; /* pointer to block addr */ 3110 union xfs_btree_ptr nptr; /* new block addr */ 3111 int level; /* btree level */ 3112 int error; /* error return code */ 3113 int i; /* loop counter */ 3114 3115 XFS_BTREE_STATS_INC(cur, newroot); 3116 3117 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3118 3119 level = cur->bc_nlevels - 1; 3120 3121 block = xfs_btree_get_iroot(cur); 3122 pp = xfs_btree_ptr_addr(cur, 1, block); 3123 3124 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3125 error = xfs_btree_alloc_block(cur, pp, &nptr, stat); 3126 if (error) 3127 goto error0; 3128 if (*stat == 0) 3129 return 0; 3130 3131 XFS_BTREE_STATS_INC(cur, alloc); 3132 3133 /* Copy the root into a real block. */ 3134 error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); 3135 if (error) 3136 goto error0; 3137 3138 /* 3139 * we can't just memcpy() the root in for CRC enabled btree blocks. 3140 * In that case have to also ensure the blkno remains correct 3141 */ 3142 memcpy(cblock, block, xfs_btree_block_len(cur)); 3143 if (xfs_has_crc(cur->bc_mp)) { 3144 __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); 3145 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 3146 cblock->bb_u.l.bb_blkno = bno; 3147 else 3148 cblock->bb_u.s.bb_blkno = bno; 3149 } 3150 3151 be16_add_cpu(&block->bb_level, 1); 3152 xfs_btree_set_numrecs(block, 1); 3153 cur->bc_nlevels++; 3154 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3155 cur->bc_levels[level + 1].ptr = 1; 3156 3157 kp = xfs_btree_key_addr(cur, 1, block); 3158 ckp = xfs_btree_key_addr(cur, 1, cblock); 3159 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); 3160 3161 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3162 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 3163 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3164 if (error) 3165 goto error0; 3166 } 3167 3168 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); 3169 3170 error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); 3171 if (error) 3172 goto error0; 3173 3174 xfs_btree_copy_ptrs(cur, pp, &nptr, 1); 3175 3176 xfs_iroot_realloc(cur->bc_ino.ip, 3177 1 - xfs_btree_get_numrecs(cblock), 3178 cur->bc_ino.whichfork); 3179 3180 xfs_btree_setbuf(cur, level, cbp); 3181 3182 /* 3183 * Do all this logging at the end so that 3184 * the root is at the right level. 3185 */ 3186 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3187 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3188 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3189 3190 *logflags |= 3191 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3192 *stat = 1; 3193 return 0; 3194 error0: 3195 return error; 3196 } 3197 3198 static void 3199 xfs_btree_set_root( 3200 struct xfs_btree_cur *cur, 3201 const union xfs_btree_ptr *ptr, 3202 int inc) 3203 { 3204 if (cur->bc_flags & XFS_BTREE_STAGING) { 3205 /* Update the btree root information for a per-AG fake root. */ 3206 cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); 3207 cur->bc_ag.afake->af_levels += inc; 3208 } else { 3209 cur->bc_ops->set_root(cur, ptr, inc); 3210 } 3211 } 3212 3213 /* 3214 * Allocate a new root block, fill it in. 3215 */ 3216 STATIC int /* error */ 3217 xfs_btree_new_root( 3218 struct xfs_btree_cur *cur, /* btree cursor */ 3219 int *stat) /* success/failure */ 3220 { 3221 struct xfs_btree_block *block; /* one half of the old root block */ 3222 struct xfs_buf *bp; /* buffer containing block */ 3223 int error; /* error return value */ 3224 struct xfs_buf *lbp; /* left buffer pointer */ 3225 struct xfs_btree_block *left; /* left btree block */ 3226 struct xfs_buf *nbp; /* new (root) buffer */ 3227 struct xfs_btree_block *new; /* new (root) btree block */ 3228 int nptr; /* new value for key index, 1 or 2 */ 3229 struct xfs_buf *rbp; /* right buffer pointer */ 3230 struct xfs_btree_block *right; /* right btree block */ 3231 union xfs_btree_ptr rptr; 3232 union xfs_btree_ptr lptr; 3233 3234 XFS_BTREE_STATS_INC(cur, newroot); 3235 3236 /* initialise our start point from the cursor */ 3237 xfs_btree_init_ptr_from_cur(cur, &rptr); 3238 3239 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3240 error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); 3241 if (error) 3242 goto error0; 3243 if (*stat == 0) 3244 goto out0; 3245 XFS_BTREE_STATS_INC(cur, alloc); 3246 3247 /* Set up the new block. */ 3248 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3249 if (error) 3250 goto error0; 3251 3252 /* Set the root in the holding structure increasing the level by 1. */ 3253 xfs_btree_set_root(cur, &lptr, 1); 3254 3255 /* 3256 * At the previous root level there are now two blocks: the old root, 3257 * and the new block generated when it was split. We don't know which 3258 * one the cursor is pointing at, so we set up variables "left" and 3259 * "right" for each case. 3260 */ 3261 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3262 3263 #ifdef DEBUG 3264 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3265 if (error) 3266 goto error0; 3267 #endif 3268 3269 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3270 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3271 /* Our block is left, pick up the right block. */ 3272 lbp = bp; 3273 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3274 left = block; 3275 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3276 if (error) 3277 goto error0; 3278 bp = rbp; 3279 nptr = 1; 3280 } else { 3281 /* Our block is right, pick up the left block. */ 3282 rbp = bp; 3283 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3284 right = block; 3285 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3286 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3287 if (error) 3288 goto error0; 3289 bp = lbp; 3290 nptr = 2; 3291 } 3292 3293 /* Fill in the new block's btree header and log it. */ 3294 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3295 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3296 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3297 !xfs_btree_ptr_is_null(cur, &rptr)); 3298 3299 /* Fill in the key data in the new root. */ 3300 if (xfs_btree_get_level(left) > 0) { 3301 /* 3302 * Get the keys for the left block's keys and put them directly 3303 * in the parent block. Do the same for the right block. 3304 */ 3305 xfs_btree_get_node_keys(cur, left, 3306 xfs_btree_key_addr(cur, 1, new)); 3307 xfs_btree_get_node_keys(cur, right, 3308 xfs_btree_key_addr(cur, 2, new)); 3309 } else { 3310 /* 3311 * Get the keys for the left block's records and put them 3312 * directly in the parent block. Do the same for the right 3313 * block. 3314 */ 3315 xfs_btree_get_leaf_keys(cur, left, 3316 xfs_btree_key_addr(cur, 1, new)); 3317 xfs_btree_get_leaf_keys(cur, right, 3318 xfs_btree_key_addr(cur, 2, new)); 3319 } 3320 xfs_btree_log_keys(cur, nbp, 1, 2); 3321 3322 /* Fill in the pointer data in the new root. */ 3323 xfs_btree_copy_ptrs(cur, 3324 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3325 xfs_btree_copy_ptrs(cur, 3326 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3327 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3328 3329 /* Fix up the cursor. */ 3330 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3331 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3332 cur->bc_nlevels++; 3333 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3334 *stat = 1; 3335 return 0; 3336 error0: 3337 return error; 3338 out0: 3339 *stat = 0; 3340 return 0; 3341 } 3342 3343 STATIC int 3344 xfs_btree_make_block_unfull( 3345 struct xfs_btree_cur *cur, /* btree cursor */ 3346 int level, /* btree level */ 3347 int numrecs,/* # of recs in block */ 3348 int *oindex,/* old tree index */ 3349 int *index, /* new tree index */ 3350 union xfs_btree_ptr *nptr, /* new btree ptr */ 3351 struct xfs_btree_cur **ncur, /* new btree cursor */ 3352 union xfs_btree_key *key, /* key of new block */ 3353 int *stat) 3354 { 3355 int error = 0; 3356 3357 if (xfs_btree_at_iroot(cur, level)) { 3358 struct xfs_inode *ip = cur->bc_ino.ip; 3359 3360 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3361 /* A root block that can be made bigger. */ 3362 xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); 3363 *stat = 1; 3364 } else { 3365 /* A root block that needs replacing */ 3366 int logflags = 0; 3367 3368 error = xfs_btree_new_iroot(cur, &logflags, stat); 3369 if (error || *stat == 0) 3370 return error; 3371 3372 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3373 } 3374 3375 return 0; 3376 } 3377 3378 /* First, try shifting an entry to the right neighbor. */ 3379 error = xfs_btree_rshift(cur, level, stat); 3380 if (error || *stat) 3381 return error; 3382 3383 /* Next, try shifting an entry to the left neighbor. */ 3384 error = xfs_btree_lshift(cur, level, stat); 3385 if (error) 3386 return error; 3387 3388 if (*stat) { 3389 *oindex = *index = cur->bc_levels[level].ptr; 3390 return 0; 3391 } 3392 3393 /* 3394 * Next, try splitting the current block in half. 3395 * 3396 * If this works we have to re-set our variables because we 3397 * could be in a different block now. 3398 */ 3399 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3400 if (error || *stat == 0) 3401 return error; 3402 3403 3404 *index = cur->bc_levels[level].ptr; 3405 return 0; 3406 } 3407 3408 /* 3409 * Insert one record/level. Return information to the caller 3410 * allowing the next level up to proceed if necessary. 3411 */ 3412 STATIC int 3413 xfs_btree_insrec( 3414 struct xfs_btree_cur *cur, /* btree cursor */ 3415 int level, /* level to insert record at */ 3416 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3417 union xfs_btree_rec *rec, /* record to insert */ 3418 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3419 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3420 int *stat) /* success/failure */ 3421 { 3422 struct xfs_btree_block *block; /* btree block */ 3423 struct xfs_buf *bp; /* buffer for block */ 3424 union xfs_btree_ptr nptr; /* new block ptr */ 3425 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3426 union xfs_btree_key nkey; /* new block key */ 3427 union xfs_btree_key *lkey; 3428 int optr; /* old key/record index */ 3429 int ptr; /* key/record index */ 3430 int numrecs;/* number of records */ 3431 int error; /* error return value */ 3432 int i; 3433 xfs_daddr_t old_bn; 3434 3435 ncur = NULL; 3436 lkey = &nkey; 3437 3438 /* 3439 * If we have an external root pointer, and we've made it to the 3440 * root level, allocate a new root block and we're done. 3441 */ 3442 if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && 3443 level >= cur->bc_nlevels) { 3444 error = xfs_btree_new_root(cur, stat); 3445 xfs_btree_set_ptr_null(cur, ptrp); 3446 3447 return error; 3448 } 3449 3450 /* If we're off the left edge, return failure. */ 3451 ptr = cur->bc_levels[level].ptr; 3452 if (ptr == 0) { 3453 *stat = 0; 3454 return 0; 3455 } 3456 3457 optr = ptr; 3458 3459 XFS_BTREE_STATS_INC(cur, insrec); 3460 3461 /* Get pointers to the btree buffer and block. */ 3462 block = xfs_btree_get_block(cur, level, &bp); 3463 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3464 numrecs = xfs_btree_get_numrecs(block); 3465 3466 #ifdef DEBUG 3467 error = xfs_btree_check_block(cur, block, level, bp); 3468 if (error) 3469 goto error0; 3470 3471 /* Check that the new entry is being inserted in the right place. */ 3472 if (ptr <= numrecs) { 3473 if (level == 0) { 3474 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3475 xfs_btree_rec_addr(cur, ptr, block))); 3476 } else { 3477 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3478 xfs_btree_key_addr(cur, ptr, block))); 3479 } 3480 } 3481 #endif 3482 3483 /* 3484 * If the block is full, we can't insert the new entry until we 3485 * make the block un-full. 3486 */ 3487 xfs_btree_set_ptr_null(cur, &nptr); 3488 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3489 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3490 &optr, &ptr, &nptr, &ncur, lkey, stat); 3491 if (error || *stat == 0) 3492 goto error0; 3493 } 3494 3495 /* 3496 * The current block may have changed if the block was 3497 * previously full and we have just made space in it. 3498 */ 3499 block = xfs_btree_get_block(cur, level, &bp); 3500 numrecs = xfs_btree_get_numrecs(block); 3501 3502 #ifdef DEBUG 3503 error = xfs_btree_check_block(cur, block, level, bp); 3504 if (error) 3505 goto error0; 3506 #endif 3507 3508 /* 3509 * At this point we know there's room for our new entry in the block 3510 * we're pointing at. 3511 */ 3512 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3513 3514 if (level > 0) { 3515 /* It's a nonleaf. make a hole in the keys and ptrs */ 3516 union xfs_btree_key *kp; 3517 union xfs_btree_ptr *pp; 3518 3519 kp = xfs_btree_key_addr(cur, ptr, block); 3520 pp = xfs_btree_ptr_addr(cur, ptr, block); 3521 3522 for (i = numrecs - ptr; i >= 0; i--) { 3523 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3524 if (error) 3525 goto error0; 3526 } 3527 3528 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3529 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3530 3531 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3532 if (error) 3533 goto error0; 3534 3535 /* Now put the new data in, bump numrecs and log it. */ 3536 xfs_btree_copy_keys(cur, kp, key, 1); 3537 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3538 numrecs++; 3539 xfs_btree_set_numrecs(block, numrecs); 3540 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3541 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3542 #ifdef DEBUG 3543 if (ptr < numrecs) { 3544 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3545 xfs_btree_key_addr(cur, ptr + 1, block))); 3546 } 3547 #endif 3548 } else { 3549 /* It's a leaf. make a hole in the records */ 3550 union xfs_btree_rec *rp; 3551 3552 rp = xfs_btree_rec_addr(cur, ptr, block); 3553 3554 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3555 3556 /* Now put the new data in, bump numrecs and log it. */ 3557 xfs_btree_copy_recs(cur, rp, rec, 1); 3558 xfs_btree_set_numrecs(block, ++numrecs); 3559 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3560 #ifdef DEBUG 3561 if (ptr < numrecs) { 3562 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3563 xfs_btree_rec_addr(cur, ptr + 1, block))); 3564 } 3565 #endif 3566 } 3567 3568 /* Log the new number of records in the btree header. */ 3569 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3570 3571 /* 3572 * If we just inserted into a new tree block, we have to 3573 * recalculate nkey here because nkey is out of date. 3574 * 3575 * Otherwise we're just updating an existing block (having shoved 3576 * some records into the new tree block), so use the regular key 3577 * update mechanism. 3578 */ 3579 if (bp && xfs_buf_daddr(bp) != old_bn) { 3580 xfs_btree_get_keys(cur, block, lkey); 3581 } else if (xfs_btree_needs_key_update(cur, optr)) { 3582 error = xfs_btree_update_keys(cur, level); 3583 if (error) 3584 goto error0; 3585 } 3586 3587 /* 3588 * Return the new block number, if any. 3589 * If there is one, give back a record value and a cursor too. 3590 */ 3591 *ptrp = nptr; 3592 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3593 xfs_btree_copy_keys(cur, key, lkey, 1); 3594 *curp = ncur; 3595 } 3596 3597 *stat = 1; 3598 return 0; 3599 3600 error0: 3601 if (ncur) 3602 xfs_btree_del_cursor(ncur, error); 3603 return error; 3604 } 3605 3606 /* 3607 * Insert the record at the point referenced by cur. 3608 * 3609 * A multi-level split of the tree on insert will invalidate the original 3610 * cursor. All callers of this function should assume that the cursor is 3611 * no longer valid and revalidate it. 3612 */ 3613 int 3614 xfs_btree_insert( 3615 struct xfs_btree_cur *cur, 3616 int *stat) 3617 { 3618 int error; /* error return value */ 3619 int i; /* result value, 0 for failure */ 3620 int level; /* current level number in btree */ 3621 union xfs_btree_ptr nptr; /* new block number (split result) */ 3622 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3623 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3624 union xfs_btree_key bkey; /* key of block to insert */ 3625 union xfs_btree_key *key; 3626 union xfs_btree_rec rec; /* record to insert */ 3627 3628 level = 0; 3629 ncur = NULL; 3630 pcur = cur; 3631 key = &bkey; 3632 3633 xfs_btree_set_ptr_null(cur, &nptr); 3634 3635 /* Make a key out of the record data to be inserted, and save it. */ 3636 cur->bc_ops->init_rec_from_cur(cur, &rec); 3637 cur->bc_ops->init_key_from_rec(key, &rec); 3638 3639 /* 3640 * Loop going up the tree, starting at the leaf level. 3641 * Stop when we don't get a split block, that must mean that 3642 * the insert is finished with this level. 3643 */ 3644 do { 3645 /* 3646 * Insert nrec/nptr into this level of the tree. 3647 * Note if we fail, nptr will be null. 3648 */ 3649 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3650 &ncur, &i); 3651 if (error) { 3652 if (pcur != cur) 3653 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3654 goto error0; 3655 } 3656 3657 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3658 xfs_btree_mark_sick(cur); 3659 error = -EFSCORRUPTED; 3660 goto error0; 3661 } 3662 level++; 3663 3664 /* 3665 * See if the cursor we just used is trash. 3666 * Can't trash the caller's cursor, but otherwise we should 3667 * if ncur is a new cursor or we're about to be done. 3668 */ 3669 if (pcur != cur && 3670 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3671 /* Save the state from the cursor before we trash it */ 3672 if (cur->bc_ops->update_cursor && 3673 !(cur->bc_flags & XFS_BTREE_STAGING)) 3674 cur->bc_ops->update_cursor(pcur, cur); 3675 cur->bc_nlevels = pcur->bc_nlevels; 3676 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3677 } 3678 /* If we got a new cursor, switch to it. */ 3679 if (ncur) { 3680 pcur = ncur; 3681 ncur = NULL; 3682 } 3683 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3684 3685 *stat = i; 3686 return 0; 3687 error0: 3688 return error; 3689 } 3690 3691 /* 3692 * Try to merge a non-leaf block back into the inode root. 3693 * 3694 * Note: the killroot names comes from the fact that we're effectively 3695 * killing the old root block. But because we can't just delete the 3696 * inode we have to copy the single block it was pointing to into the 3697 * inode. 3698 */ 3699 STATIC int 3700 xfs_btree_kill_iroot( 3701 struct xfs_btree_cur *cur) 3702 { 3703 int whichfork = cur->bc_ino.whichfork; 3704 struct xfs_inode *ip = cur->bc_ino.ip; 3705 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 3706 struct xfs_btree_block *block; 3707 struct xfs_btree_block *cblock; 3708 union xfs_btree_key *kp; 3709 union xfs_btree_key *ckp; 3710 union xfs_btree_ptr *pp; 3711 union xfs_btree_ptr *cpp; 3712 struct xfs_buf *cbp; 3713 int level; 3714 int index; 3715 int numrecs; 3716 int error; 3717 #ifdef DEBUG 3718 union xfs_btree_ptr ptr; 3719 #endif 3720 int i; 3721 3722 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3723 ASSERT(cur->bc_nlevels > 1); 3724 3725 /* 3726 * Don't deal with the root block needs to be a leaf case. 3727 * We're just going to turn the thing back into extents anyway. 3728 */ 3729 level = cur->bc_nlevels - 1; 3730 if (level == 1) 3731 goto out0; 3732 3733 /* 3734 * Give up if the root has multiple children. 3735 */ 3736 block = xfs_btree_get_iroot(cur); 3737 if (xfs_btree_get_numrecs(block) != 1) 3738 goto out0; 3739 3740 cblock = xfs_btree_get_block(cur, level - 1, &cbp); 3741 numrecs = xfs_btree_get_numrecs(cblock); 3742 3743 /* 3744 * Only do this if the next level will fit. 3745 * Then the data must be copied up to the inode, 3746 * instead of freeing the root you free the next level. 3747 */ 3748 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) 3749 goto out0; 3750 3751 XFS_BTREE_STATS_INC(cur, killroot); 3752 3753 #ifdef DEBUG 3754 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 3755 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3756 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 3757 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3758 #endif 3759 3760 index = numrecs - cur->bc_ops->get_maxrecs(cur, level); 3761 if (index) { 3762 xfs_iroot_realloc(cur->bc_ino.ip, index, 3763 cur->bc_ino.whichfork); 3764 block = ifp->if_broot; 3765 } 3766 3767 be16_add_cpu(&block->bb_numrecs, index); 3768 ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3769 3770 kp = xfs_btree_key_addr(cur, 1, block); 3771 ckp = xfs_btree_key_addr(cur, 1, cblock); 3772 xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3773 3774 pp = xfs_btree_ptr_addr(cur, 1, block); 3775 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3776 3777 for (i = 0; i < numrecs; i++) { 3778 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3779 if (error) 3780 return error; 3781 } 3782 3783 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3784 3785 error = xfs_btree_free_block(cur, cbp); 3786 if (error) 3787 return error; 3788 3789 cur->bc_levels[level - 1].bp = NULL; 3790 be16_add_cpu(&block->bb_level, -1); 3791 xfs_trans_log_inode(cur->bc_tp, ip, 3792 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); 3793 cur->bc_nlevels--; 3794 out0: 3795 return 0; 3796 } 3797 3798 /* 3799 * Kill the current root node, and replace it with it's only child node. 3800 */ 3801 STATIC int 3802 xfs_btree_kill_root( 3803 struct xfs_btree_cur *cur, 3804 struct xfs_buf *bp, 3805 int level, 3806 union xfs_btree_ptr *newroot) 3807 { 3808 int error; 3809 3810 XFS_BTREE_STATS_INC(cur, killroot); 3811 3812 /* 3813 * Update the root pointer, decreasing the level by 1 and then 3814 * free the old root. 3815 */ 3816 xfs_btree_set_root(cur, newroot, -1); 3817 3818 error = xfs_btree_free_block(cur, bp); 3819 if (error) 3820 return error; 3821 3822 cur->bc_levels[level].bp = NULL; 3823 cur->bc_levels[level].ra = 0; 3824 cur->bc_nlevels--; 3825 3826 return 0; 3827 } 3828 3829 STATIC int 3830 xfs_btree_dec_cursor( 3831 struct xfs_btree_cur *cur, 3832 int level, 3833 int *stat) 3834 { 3835 int error; 3836 int i; 3837 3838 if (level > 0) { 3839 error = xfs_btree_decrement(cur, level, &i); 3840 if (error) 3841 return error; 3842 } 3843 3844 *stat = 1; 3845 return 0; 3846 } 3847 3848 /* 3849 * Single level of the btree record deletion routine. 3850 * Delete record pointed to by cur/level. 3851 * Remove the record from its block then rebalance the tree. 3852 * Return 0 for error, 1 for done, 2 to go on to the next level. 3853 */ 3854 STATIC int /* error */ 3855 xfs_btree_delrec( 3856 struct xfs_btree_cur *cur, /* btree cursor */ 3857 int level, /* level removing record from */ 3858 int *stat) /* fail/done/go-on */ 3859 { 3860 struct xfs_btree_block *block; /* btree block */ 3861 union xfs_btree_ptr cptr; /* current block ptr */ 3862 struct xfs_buf *bp; /* buffer for block */ 3863 int error; /* error return value */ 3864 int i; /* loop counter */ 3865 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3866 struct xfs_buf *lbp; /* left buffer pointer */ 3867 struct xfs_btree_block *left; /* left btree block */ 3868 int lrecs = 0; /* left record count */ 3869 int ptr; /* key/record index */ 3870 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3871 struct xfs_buf *rbp; /* right buffer pointer */ 3872 struct xfs_btree_block *right; /* right btree block */ 3873 struct xfs_btree_block *rrblock; /* right-right btree block */ 3874 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3875 int rrecs = 0; /* right record count */ 3876 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3877 int numrecs; /* temporary numrec count */ 3878 3879 tcur = NULL; 3880 3881 /* Get the index of the entry being deleted, check for nothing there. */ 3882 ptr = cur->bc_levels[level].ptr; 3883 if (ptr == 0) { 3884 *stat = 0; 3885 return 0; 3886 } 3887 3888 /* Get the buffer & block containing the record or key/ptr. */ 3889 block = xfs_btree_get_block(cur, level, &bp); 3890 numrecs = xfs_btree_get_numrecs(block); 3891 3892 #ifdef DEBUG 3893 error = xfs_btree_check_block(cur, block, level, bp); 3894 if (error) 3895 goto error0; 3896 #endif 3897 3898 /* Fail if we're off the end of the block. */ 3899 if (ptr > numrecs) { 3900 *stat = 0; 3901 return 0; 3902 } 3903 3904 XFS_BTREE_STATS_INC(cur, delrec); 3905 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3906 3907 /* Excise the entries being deleted. */ 3908 if (level > 0) { 3909 /* It's a nonleaf. operate on keys and ptrs */ 3910 union xfs_btree_key *lkp; 3911 union xfs_btree_ptr *lpp; 3912 3913 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3914 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3915 3916 for (i = 0; i < numrecs - ptr; i++) { 3917 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 3918 if (error) 3919 goto error0; 3920 } 3921 3922 if (ptr < numrecs) { 3923 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3924 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3925 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3926 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3927 } 3928 } else { 3929 /* It's a leaf. operate on records */ 3930 if (ptr < numrecs) { 3931 xfs_btree_shift_recs(cur, 3932 xfs_btree_rec_addr(cur, ptr + 1, block), 3933 -1, numrecs - ptr); 3934 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3935 } 3936 } 3937 3938 /* 3939 * Decrement and log the number of entries in the block. 3940 */ 3941 xfs_btree_set_numrecs(block, --numrecs); 3942 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3943 3944 /* 3945 * We're at the root level. First, shrink the root block in-memory. 3946 * Try to get rid of the next level down. If we can't then there's 3947 * nothing left to do. 3948 */ 3949 if (xfs_btree_at_iroot(cur, level)) { 3950 xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); 3951 3952 error = xfs_btree_kill_iroot(cur); 3953 if (error) 3954 goto error0; 3955 3956 error = xfs_btree_dec_cursor(cur, level, stat); 3957 if (error) 3958 goto error0; 3959 *stat = 1; 3960 return 0; 3961 } 3962 3963 /* 3964 * If this is the root level, and there's only one entry left, and it's 3965 * NOT the leaf level, then we can get rid of this level. 3966 */ 3967 if (level == cur->bc_nlevels - 1) { 3968 if (numrecs == 1 && level > 0) { 3969 union xfs_btree_ptr *pp; 3970 /* 3971 * pp is still set to the first pointer in the block. 3972 * Make it the new root of the btree. 3973 */ 3974 pp = xfs_btree_ptr_addr(cur, 1, block); 3975 error = xfs_btree_kill_root(cur, bp, level, pp); 3976 if (error) 3977 goto error0; 3978 } else if (level > 0) { 3979 error = xfs_btree_dec_cursor(cur, level, stat); 3980 if (error) 3981 goto error0; 3982 } 3983 *stat = 1; 3984 return 0; 3985 } 3986 3987 /* 3988 * If we deleted the leftmost entry in the block, update the 3989 * key values above us in the tree. 3990 */ 3991 if (xfs_btree_needs_key_update(cur, ptr)) { 3992 error = xfs_btree_update_keys(cur, level); 3993 if (error) 3994 goto error0; 3995 } 3996 3997 /* 3998 * If the number of records remaining in the block is at least 3999 * the minimum, we're done. 4000 */ 4001 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 4002 error = xfs_btree_dec_cursor(cur, level, stat); 4003 if (error) 4004 goto error0; 4005 return 0; 4006 } 4007 4008 /* 4009 * Otherwise, we have to move some records around to keep the 4010 * tree balanced. Look at the left and right sibling blocks to 4011 * see if we can re-balance by moving only one record. 4012 */ 4013 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4014 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 4015 4016 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 4017 /* 4018 * One child of root, need to get a chance to copy its contents 4019 * into the root and delete it. Can't go up to next level, 4020 * there's nothing to delete there. 4021 */ 4022 if (xfs_btree_ptr_is_null(cur, &rptr) && 4023 xfs_btree_ptr_is_null(cur, &lptr) && 4024 level == cur->bc_nlevels - 2) { 4025 error = xfs_btree_kill_iroot(cur); 4026 if (!error) 4027 error = xfs_btree_dec_cursor(cur, level, stat); 4028 if (error) 4029 goto error0; 4030 return 0; 4031 } 4032 } 4033 4034 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 4035 !xfs_btree_ptr_is_null(cur, &lptr)); 4036 4037 /* 4038 * Duplicate the cursor so our btree manipulations here won't 4039 * disrupt the next level up. 4040 */ 4041 error = xfs_btree_dup_cursor(cur, &tcur); 4042 if (error) 4043 goto error0; 4044 4045 /* 4046 * If there's a right sibling, see if it's ok to shift an entry 4047 * out of it. 4048 */ 4049 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 4050 /* 4051 * Move the temp cursor to the last entry in the next block. 4052 * Actually any entry but the first would suffice. 4053 */ 4054 i = xfs_btree_lastrec(tcur, level); 4055 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4056 xfs_btree_mark_sick(cur); 4057 error = -EFSCORRUPTED; 4058 goto error0; 4059 } 4060 4061 error = xfs_btree_increment(tcur, level, &i); 4062 if (error) 4063 goto error0; 4064 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4065 xfs_btree_mark_sick(cur); 4066 error = -EFSCORRUPTED; 4067 goto error0; 4068 } 4069 4070 i = xfs_btree_lastrec(tcur, level); 4071 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4072 xfs_btree_mark_sick(cur); 4073 error = -EFSCORRUPTED; 4074 goto error0; 4075 } 4076 4077 /* Grab a pointer to the block. */ 4078 right = xfs_btree_get_block(tcur, level, &rbp); 4079 #ifdef DEBUG 4080 error = xfs_btree_check_block(tcur, right, level, rbp); 4081 if (error) 4082 goto error0; 4083 #endif 4084 /* Grab the current block number, for future use. */ 4085 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 4086 4087 /* 4088 * If right block is full enough so that removing one entry 4089 * won't make it too empty, and left-shifting an entry out 4090 * of right to us works, we're done. 4091 */ 4092 if (xfs_btree_get_numrecs(right) - 1 >= 4093 cur->bc_ops->get_minrecs(tcur, level)) { 4094 error = xfs_btree_lshift(tcur, level, &i); 4095 if (error) 4096 goto error0; 4097 if (i) { 4098 ASSERT(xfs_btree_get_numrecs(block) >= 4099 cur->bc_ops->get_minrecs(tcur, level)); 4100 4101 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4102 tcur = NULL; 4103 4104 error = xfs_btree_dec_cursor(cur, level, stat); 4105 if (error) 4106 goto error0; 4107 return 0; 4108 } 4109 } 4110 4111 /* 4112 * Otherwise, grab the number of records in right for 4113 * future reference, and fix up the temp cursor to point 4114 * to our block again (last record). 4115 */ 4116 rrecs = xfs_btree_get_numrecs(right); 4117 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4118 i = xfs_btree_firstrec(tcur, level); 4119 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4120 xfs_btree_mark_sick(cur); 4121 error = -EFSCORRUPTED; 4122 goto error0; 4123 } 4124 4125 error = xfs_btree_decrement(tcur, level, &i); 4126 if (error) 4127 goto error0; 4128 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4129 xfs_btree_mark_sick(cur); 4130 error = -EFSCORRUPTED; 4131 goto error0; 4132 } 4133 } 4134 } 4135 4136 /* 4137 * If there's a left sibling, see if it's ok to shift an entry 4138 * out of it. 4139 */ 4140 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4141 /* 4142 * Move the temp cursor to the first entry in the 4143 * previous block. 4144 */ 4145 i = xfs_btree_firstrec(tcur, level); 4146 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4147 xfs_btree_mark_sick(cur); 4148 error = -EFSCORRUPTED; 4149 goto error0; 4150 } 4151 4152 error = xfs_btree_decrement(tcur, level, &i); 4153 if (error) 4154 goto error0; 4155 i = xfs_btree_firstrec(tcur, level); 4156 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4157 xfs_btree_mark_sick(cur); 4158 error = -EFSCORRUPTED; 4159 goto error0; 4160 } 4161 4162 /* Grab a pointer to the block. */ 4163 left = xfs_btree_get_block(tcur, level, &lbp); 4164 #ifdef DEBUG 4165 error = xfs_btree_check_block(cur, left, level, lbp); 4166 if (error) 4167 goto error0; 4168 #endif 4169 /* Grab the current block number, for future use. */ 4170 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4171 4172 /* 4173 * If left block is full enough so that removing one entry 4174 * won't make it too empty, and right-shifting an entry out 4175 * of left to us works, we're done. 4176 */ 4177 if (xfs_btree_get_numrecs(left) - 1 >= 4178 cur->bc_ops->get_minrecs(tcur, level)) { 4179 error = xfs_btree_rshift(tcur, level, &i); 4180 if (error) 4181 goto error0; 4182 if (i) { 4183 ASSERT(xfs_btree_get_numrecs(block) >= 4184 cur->bc_ops->get_minrecs(tcur, level)); 4185 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4186 tcur = NULL; 4187 if (level == 0) 4188 cur->bc_levels[0].ptr++; 4189 4190 *stat = 1; 4191 return 0; 4192 } 4193 } 4194 4195 /* 4196 * Otherwise, grab the number of records in right for 4197 * future reference. 4198 */ 4199 lrecs = xfs_btree_get_numrecs(left); 4200 } 4201 4202 /* Delete the temp cursor, we're done with it. */ 4203 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4204 tcur = NULL; 4205 4206 /* If here, we need to do a join to keep the tree balanced. */ 4207 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4208 4209 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4210 lrecs + xfs_btree_get_numrecs(block) <= 4211 cur->bc_ops->get_maxrecs(cur, level)) { 4212 /* 4213 * Set "right" to be the starting block, 4214 * "left" to be the left neighbor. 4215 */ 4216 rptr = cptr; 4217 right = block; 4218 rbp = bp; 4219 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4220 if (error) 4221 goto error0; 4222 4223 /* 4224 * If that won't work, see if we can join with the right neighbor block. 4225 */ 4226 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4227 rrecs + xfs_btree_get_numrecs(block) <= 4228 cur->bc_ops->get_maxrecs(cur, level)) { 4229 /* 4230 * Set "left" to be the starting block, 4231 * "right" to be the right neighbor. 4232 */ 4233 lptr = cptr; 4234 left = block; 4235 lbp = bp; 4236 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4237 if (error) 4238 goto error0; 4239 4240 /* 4241 * Otherwise, we can't fix the imbalance. 4242 * Just return. This is probably a logic error, but it's not fatal. 4243 */ 4244 } else { 4245 error = xfs_btree_dec_cursor(cur, level, stat); 4246 if (error) 4247 goto error0; 4248 return 0; 4249 } 4250 4251 rrecs = xfs_btree_get_numrecs(right); 4252 lrecs = xfs_btree_get_numrecs(left); 4253 4254 /* 4255 * We're now going to join "left" and "right" by moving all the stuff 4256 * in "right" to "left" and deleting "right". 4257 */ 4258 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4259 if (level > 0) { 4260 /* It's a non-leaf. Move keys and pointers. */ 4261 union xfs_btree_key *lkp; /* left btree key */ 4262 union xfs_btree_ptr *lpp; /* left address pointer */ 4263 union xfs_btree_key *rkp; /* right btree key */ 4264 union xfs_btree_ptr *rpp; /* right address pointer */ 4265 4266 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4267 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4268 rkp = xfs_btree_key_addr(cur, 1, right); 4269 rpp = xfs_btree_ptr_addr(cur, 1, right); 4270 4271 for (i = 1; i < rrecs; i++) { 4272 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 4273 if (error) 4274 goto error0; 4275 } 4276 4277 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4278 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4279 4280 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4281 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4282 } else { 4283 /* It's a leaf. Move records. */ 4284 union xfs_btree_rec *lrp; /* left record pointer */ 4285 union xfs_btree_rec *rrp; /* right record pointer */ 4286 4287 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4288 rrp = xfs_btree_rec_addr(cur, 1, right); 4289 4290 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4291 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4292 } 4293 4294 XFS_BTREE_STATS_INC(cur, join); 4295 4296 /* 4297 * Fix up the number of records and right block pointer in the 4298 * surviving block, and log it. 4299 */ 4300 xfs_btree_set_numrecs(left, lrecs + rrecs); 4301 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); 4302 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4303 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4304 4305 /* If there is a right sibling, point it to the remaining block. */ 4306 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4307 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4308 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4309 if (error) 4310 goto error0; 4311 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4312 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4313 } 4314 4315 /* Free the deleted block. */ 4316 error = xfs_btree_free_block(cur, rbp); 4317 if (error) 4318 goto error0; 4319 4320 /* 4321 * If we joined with the left neighbor, set the buffer in the 4322 * cursor to the left block, and fix up the index. 4323 */ 4324 if (bp != lbp) { 4325 cur->bc_levels[level].bp = lbp; 4326 cur->bc_levels[level].ptr += lrecs; 4327 cur->bc_levels[level].ra = 0; 4328 } 4329 /* 4330 * If we joined with the right neighbor and there's a level above 4331 * us, increment the cursor at that level. 4332 */ 4333 else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || 4334 level + 1 < cur->bc_nlevels) { 4335 error = xfs_btree_increment(cur, level + 1, &i); 4336 if (error) 4337 goto error0; 4338 } 4339 4340 /* 4341 * Readjust the ptr at this level if it's not a leaf, since it's 4342 * still pointing at the deletion point, which makes the cursor 4343 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4344 * We can't use decrement because it would change the next level up. 4345 */ 4346 if (level > 0) 4347 cur->bc_levels[level].ptr--; 4348 4349 /* 4350 * We combined blocks, so we have to update the parent keys if the 4351 * btree supports overlapped intervals. However, 4352 * bc_levels[level + 1].ptr points to the old block so that the caller 4353 * knows which record to delete. Therefore, the caller must be savvy 4354 * enough to call updkeys for us if we return stat == 2. The other 4355 * exit points from this function don't require deletions further up 4356 * the tree, so they can call updkeys directly. 4357 */ 4358 4359 /* Return value means the next level up has something to do. */ 4360 *stat = 2; 4361 return 0; 4362 4363 error0: 4364 if (tcur) 4365 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4366 return error; 4367 } 4368 4369 /* 4370 * Delete the record pointed to by cur. 4371 * The cursor refers to the place where the record was (could be inserted) 4372 * when the operation returns. 4373 */ 4374 int /* error */ 4375 xfs_btree_delete( 4376 struct xfs_btree_cur *cur, 4377 int *stat) /* success/failure */ 4378 { 4379 int error; /* error return value */ 4380 int level; 4381 int i; 4382 bool joined = false; 4383 4384 /* 4385 * Go up the tree, starting at leaf level. 4386 * 4387 * If 2 is returned then a join was done; go to the next level. 4388 * Otherwise we are done. 4389 */ 4390 for (level = 0, i = 2; i == 2; level++) { 4391 error = xfs_btree_delrec(cur, level, &i); 4392 if (error) 4393 goto error0; 4394 if (i == 2) 4395 joined = true; 4396 } 4397 4398 /* 4399 * If we combined blocks as part of deleting the record, delrec won't 4400 * have updated the parent high keys so we have to do that here. 4401 */ 4402 if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { 4403 error = xfs_btree_updkeys_force(cur, 0); 4404 if (error) 4405 goto error0; 4406 } 4407 4408 if (i == 0) { 4409 for (level = 1; level < cur->bc_nlevels; level++) { 4410 if (cur->bc_levels[level].ptr == 0) { 4411 error = xfs_btree_decrement(cur, level, &i); 4412 if (error) 4413 goto error0; 4414 break; 4415 } 4416 } 4417 } 4418 4419 *stat = i; 4420 return 0; 4421 error0: 4422 return error; 4423 } 4424 4425 /* 4426 * Get the data from the pointed-to record. 4427 */ 4428 int /* error */ 4429 xfs_btree_get_rec( 4430 struct xfs_btree_cur *cur, /* btree cursor */ 4431 union xfs_btree_rec **recp, /* output: btree record */ 4432 int *stat) /* output: success/failure */ 4433 { 4434 struct xfs_btree_block *block; /* btree block */ 4435 struct xfs_buf *bp; /* buffer pointer */ 4436 int ptr; /* record number */ 4437 #ifdef DEBUG 4438 int error; /* error return value */ 4439 #endif 4440 4441 ptr = cur->bc_levels[0].ptr; 4442 block = xfs_btree_get_block(cur, 0, &bp); 4443 4444 #ifdef DEBUG 4445 error = xfs_btree_check_block(cur, block, 0, bp); 4446 if (error) 4447 return error; 4448 #endif 4449 4450 /* 4451 * Off the right end or left end, return failure. 4452 */ 4453 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4454 *stat = 0; 4455 return 0; 4456 } 4457 4458 /* 4459 * Point to the record and extract its data. 4460 */ 4461 *recp = xfs_btree_rec_addr(cur, ptr, block); 4462 *stat = 1; 4463 return 0; 4464 } 4465 4466 /* Visit a block in a btree. */ 4467 STATIC int 4468 xfs_btree_visit_block( 4469 struct xfs_btree_cur *cur, 4470 int level, 4471 xfs_btree_visit_blocks_fn fn, 4472 void *data) 4473 { 4474 struct xfs_btree_block *block; 4475 struct xfs_buf *bp; 4476 union xfs_btree_ptr rptr, bufptr; 4477 int error; 4478 4479 /* do right sibling readahead */ 4480 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 4481 block = xfs_btree_get_block(cur, level, &bp); 4482 4483 /* process the block */ 4484 error = fn(cur, level, data); 4485 if (error) 4486 return error; 4487 4488 /* now read rh sibling block for next iteration */ 4489 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4490 if (xfs_btree_ptr_is_null(cur, &rptr)) 4491 return -ENOENT; 4492 4493 /* 4494 * We only visit blocks once in this walk, so we have to avoid the 4495 * internal xfs_btree_lookup_get_block() optimisation where it will 4496 * return the same block without checking if the right sibling points 4497 * back to us and creates a cyclic reference in the btree. 4498 */ 4499 xfs_btree_buf_to_ptr(cur, bp, &bufptr); 4500 if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { 4501 xfs_btree_mark_sick(cur); 4502 return -EFSCORRUPTED; 4503 } 4504 4505 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4506 } 4507 4508 4509 /* Visit every block in a btree. */ 4510 int 4511 xfs_btree_visit_blocks( 4512 struct xfs_btree_cur *cur, 4513 xfs_btree_visit_blocks_fn fn, 4514 unsigned int flags, 4515 void *data) 4516 { 4517 union xfs_btree_ptr lptr; 4518 int level; 4519 struct xfs_btree_block *block = NULL; 4520 int error = 0; 4521 4522 xfs_btree_init_ptr_from_cur(cur, &lptr); 4523 4524 /* for each level */ 4525 for (level = cur->bc_nlevels - 1; level >= 0; level--) { 4526 /* grab the left hand block */ 4527 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); 4528 if (error) 4529 return error; 4530 4531 /* readahead the left most block for the next level down */ 4532 if (level > 0) { 4533 union xfs_btree_ptr *ptr; 4534 4535 ptr = xfs_btree_ptr_addr(cur, 1, block); 4536 xfs_btree_readahead_ptr(cur, ptr, 1); 4537 4538 /* save for the next iteration of the loop */ 4539 xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); 4540 4541 if (!(flags & XFS_BTREE_VISIT_LEAVES)) 4542 continue; 4543 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { 4544 continue; 4545 } 4546 4547 /* for each buffer in the level */ 4548 do { 4549 error = xfs_btree_visit_block(cur, level, fn, data); 4550 } while (!error); 4551 4552 if (error != -ENOENT) 4553 return error; 4554 } 4555 4556 return 0; 4557 } 4558 4559 /* 4560 * Change the owner of a btree. 4561 * 4562 * The mechanism we use here is ordered buffer logging. Because we don't know 4563 * how many buffers were are going to need to modify, we don't really want to 4564 * have to make transaction reservations for the worst case of every buffer in a 4565 * full size btree as that may be more space that we can fit in the log.... 4566 * 4567 * We do the btree walk in the most optimal manner possible - we have sibling 4568 * pointers so we can just walk all the blocks on each level from left to right 4569 * in a single pass, and then move to the next level and do the same. We can 4570 * also do readahead on the sibling pointers to get IO moving more quickly, 4571 * though for slow disks this is unlikely to make much difference to performance 4572 * as the amount of CPU work we have to do before moving to the next block is 4573 * relatively small. 4574 * 4575 * For each btree block that we load, modify the owner appropriately, set the 4576 * buffer as an ordered buffer and log it appropriately. We need to ensure that 4577 * we mark the region we change dirty so that if the buffer is relogged in 4578 * a subsequent transaction the changes we make here as an ordered buffer are 4579 * correctly relogged in that transaction. If we are in recovery context, then 4580 * just queue the modified buffer as delayed write buffer so the transaction 4581 * recovery completion writes the changes to disk. 4582 */ 4583 struct xfs_btree_block_change_owner_info { 4584 uint64_t new_owner; 4585 struct list_head *buffer_list; 4586 }; 4587 4588 static int 4589 xfs_btree_block_change_owner( 4590 struct xfs_btree_cur *cur, 4591 int level, 4592 void *data) 4593 { 4594 struct xfs_btree_block_change_owner_info *bbcoi = data; 4595 struct xfs_btree_block *block; 4596 struct xfs_buf *bp; 4597 4598 /* modify the owner */ 4599 block = xfs_btree_get_block(cur, level, &bp); 4600 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 4601 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) 4602 return 0; 4603 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4604 } else { 4605 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) 4606 return 0; 4607 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4608 } 4609 4610 /* 4611 * If the block is a root block hosted in an inode, we might not have a 4612 * buffer pointer here and we shouldn't attempt to log the change as the 4613 * information is already held in the inode and discarded when the root 4614 * block is formatted into the on-disk inode fork. We still change it, 4615 * though, so everything is consistent in memory. 4616 */ 4617 if (!bp) { 4618 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 4619 ASSERT(level == cur->bc_nlevels - 1); 4620 return 0; 4621 } 4622 4623 if (cur->bc_tp) { 4624 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { 4625 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4626 return -EAGAIN; 4627 } 4628 } else { 4629 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4630 } 4631 4632 return 0; 4633 } 4634 4635 int 4636 xfs_btree_change_owner( 4637 struct xfs_btree_cur *cur, 4638 uint64_t new_owner, 4639 struct list_head *buffer_list) 4640 { 4641 struct xfs_btree_block_change_owner_info bbcoi; 4642 4643 bbcoi.new_owner = new_owner; 4644 bbcoi.buffer_list = buffer_list; 4645 4646 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4647 XFS_BTREE_VISIT_ALL, &bbcoi); 4648 } 4649 4650 /* Verify the v5 fields of a long-format btree block. */ 4651 xfs_failaddr_t 4652 xfs_btree_fsblock_v5hdr_verify( 4653 struct xfs_buf *bp, 4654 uint64_t owner) 4655 { 4656 struct xfs_mount *mp = bp->b_mount; 4657 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4658 4659 if (!xfs_has_crc(mp)) 4660 return __this_address; 4661 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4662 return __this_address; 4663 if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4664 return __this_address; 4665 if (owner != XFS_RMAP_OWN_UNKNOWN && 4666 be64_to_cpu(block->bb_u.l.bb_owner) != owner) 4667 return __this_address; 4668 return NULL; 4669 } 4670 4671 /* Verify a long-format btree block. */ 4672 xfs_failaddr_t 4673 xfs_btree_fsblock_verify( 4674 struct xfs_buf *bp, 4675 unsigned int max_recs) 4676 { 4677 struct xfs_mount *mp = bp->b_mount; 4678 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4679 xfs_fsblock_t fsb; 4680 xfs_failaddr_t fa; 4681 4682 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4683 4684 /* numrecs verification */ 4685 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4686 return __this_address; 4687 4688 /* sibling pointer verification */ 4689 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4690 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4691 block->bb_u.l.bb_leftsib); 4692 if (!fa) 4693 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4694 block->bb_u.l.bb_rightsib); 4695 return fa; 4696 } 4697 4698 /* Verify an in-memory btree block. */ 4699 xfs_failaddr_t 4700 xfs_btree_memblock_verify( 4701 struct xfs_buf *bp, 4702 unsigned int max_recs) 4703 { 4704 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4705 struct xfs_buftarg *btp = bp->b_target; 4706 xfs_failaddr_t fa; 4707 xfbno_t bno; 4708 4709 ASSERT(xfs_buftarg_is_mem(bp->b_target)); 4710 4711 /* numrecs verification */ 4712 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4713 return __this_address; 4714 4715 /* sibling pointer verification */ 4716 bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); 4717 fa = xfs_btree_check_memblock_siblings(btp, bno, 4718 block->bb_u.l.bb_leftsib); 4719 if (fa) 4720 return fa; 4721 fa = xfs_btree_check_memblock_siblings(btp, bno, 4722 block->bb_u.l.bb_rightsib); 4723 if (fa) 4724 return fa; 4725 4726 return NULL; 4727 } 4728 /** 4729 * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format 4730 * btree block 4731 * 4732 * @bp: buffer containing the btree block 4733 */ 4734 xfs_failaddr_t 4735 xfs_btree_agblock_v5hdr_verify( 4736 struct xfs_buf *bp) 4737 { 4738 struct xfs_mount *mp = bp->b_mount; 4739 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4740 struct xfs_perag *pag = bp->b_pag; 4741 4742 if (!xfs_has_crc(mp)) 4743 return __this_address; 4744 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4745 return __this_address; 4746 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4747 return __this_address; 4748 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) 4749 return __this_address; 4750 return NULL; 4751 } 4752 4753 /** 4754 * xfs_btree_agblock_verify() -- verify a short-format btree block 4755 * 4756 * @bp: buffer containing the btree block 4757 * @max_recs: maximum records allowed in this btree node 4758 */ 4759 xfs_failaddr_t 4760 xfs_btree_agblock_verify( 4761 struct xfs_buf *bp, 4762 unsigned int max_recs) 4763 { 4764 struct xfs_mount *mp = bp->b_mount; 4765 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4766 xfs_agblock_t agbno; 4767 xfs_failaddr_t fa; 4768 4769 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4770 4771 /* numrecs verification */ 4772 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4773 return __this_address; 4774 4775 /* sibling pointer verification */ 4776 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4777 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4778 block->bb_u.s.bb_leftsib); 4779 if (!fa) 4780 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4781 block->bb_u.s.bb_rightsib); 4782 return fa; 4783 } 4784 4785 /* 4786 * For the given limits on leaf and keyptr records per block, calculate the 4787 * height of the tree needed to index the number of leaf records. 4788 */ 4789 unsigned int 4790 xfs_btree_compute_maxlevels( 4791 const unsigned int *limits, 4792 unsigned long long records) 4793 { 4794 unsigned long long level_blocks = howmany_64(records, limits[0]); 4795 unsigned int height = 1; 4796 4797 while (level_blocks > 1) { 4798 level_blocks = howmany_64(level_blocks, limits[1]); 4799 height++; 4800 } 4801 4802 return height; 4803 } 4804 4805 /* 4806 * For the given limits on leaf and keyptr records per block, calculate the 4807 * number of blocks needed to index the given number of leaf records. 4808 */ 4809 unsigned long long 4810 xfs_btree_calc_size( 4811 const unsigned int *limits, 4812 unsigned long long records) 4813 { 4814 unsigned long long level_blocks = howmany_64(records, limits[0]); 4815 unsigned long long blocks = level_blocks; 4816 4817 while (level_blocks > 1) { 4818 level_blocks = howmany_64(level_blocks, limits[1]); 4819 blocks += level_blocks; 4820 } 4821 4822 return blocks; 4823 } 4824 4825 /* 4826 * Given a number of available blocks for the btree to consume with records and 4827 * pointers, calculate the height of the tree needed to index all the records 4828 * that space can hold based on the number of pointers each interior node 4829 * holds. 4830 * 4831 * We start by assuming a single level tree consumes a single block, then track 4832 * the number of blocks each node level consumes until we no longer have space 4833 * to store the next node level. At this point, we are indexing all the leaf 4834 * blocks in the space, and there's no more free space to split the tree any 4835 * further. That's our maximum btree height. 4836 */ 4837 unsigned int 4838 xfs_btree_space_to_height( 4839 const unsigned int *limits, 4840 unsigned long long leaf_blocks) 4841 { 4842 /* 4843 * The root btree block can have fewer than minrecs pointers in it 4844 * because the tree might not be big enough to require that amount of 4845 * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. 4846 */ 4847 unsigned long long node_blocks = 2; 4848 unsigned long long blocks_left = leaf_blocks - 1; 4849 unsigned int height = 1; 4850 4851 if (leaf_blocks < 1) 4852 return 0; 4853 4854 while (node_blocks < blocks_left) { 4855 blocks_left -= node_blocks; 4856 node_blocks *= limits[1]; 4857 height++; 4858 } 4859 4860 return height; 4861 } 4862 4863 /* 4864 * Query a regular btree for all records overlapping a given interval. 4865 * Start with a LE lookup of the key of low_rec and return all records 4866 * until we find a record with a key greater than the key of high_rec. 4867 */ 4868 STATIC int 4869 xfs_btree_simple_query_range( 4870 struct xfs_btree_cur *cur, 4871 const union xfs_btree_key *low_key, 4872 const union xfs_btree_key *high_key, 4873 xfs_btree_query_range_fn fn, 4874 void *priv) 4875 { 4876 union xfs_btree_rec *recp; 4877 union xfs_btree_key rec_key; 4878 int stat; 4879 bool firstrec = true; 4880 int error; 4881 4882 ASSERT(cur->bc_ops->init_high_key_from_rec); 4883 ASSERT(cur->bc_ops->diff_two_keys); 4884 4885 /* 4886 * Find the leftmost record. The btree cursor must be set 4887 * to the low record used to generate low_key. 4888 */ 4889 stat = 0; 4890 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 4891 if (error) 4892 goto out; 4893 4894 /* Nothing? See if there's anything to the right. */ 4895 if (!stat) { 4896 error = xfs_btree_increment(cur, 0, &stat); 4897 if (error) 4898 goto out; 4899 } 4900 4901 while (stat) { 4902 /* Find the record. */ 4903 error = xfs_btree_get_rec(cur, &recp, &stat); 4904 if (error || !stat) 4905 break; 4906 4907 /* Skip if low_key > high_key(rec). */ 4908 if (firstrec) { 4909 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 4910 firstrec = false; 4911 if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) 4912 goto advloop; 4913 } 4914 4915 /* Stop if low_key(rec) > high_key. */ 4916 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4917 if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) 4918 break; 4919 4920 /* Callback */ 4921 error = fn(cur, recp, priv); 4922 if (error) 4923 break; 4924 4925 advloop: 4926 /* Move on to the next record. */ 4927 error = xfs_btree_increment(cur, 0, &stat); 4928 if (error) 4929 break; 4930 } 4931 4932 out: 4933 return error; 4934 } 4935 4936 /* 4937 * Query an overlapped interval btree for all records overlapping a given 4938 * interval. This function roughly follows the algorithm given in 4939 * "Interval Trees" of _Introduction to Algorithms_, which is section 4940 * 14.3 in the 2nd and 3rd editions. 4941 * 4942 * First, generate keys for the low and high records passed in. 4943 * 4944 * For any leaf node, generate the high and low keys for the record. 4945 * If the record keys overlap with the query low/high keys, pass the 4946 * record to the function iterator. 4947 * 4948 * For any internal node, compare the low and high keys of each 4949 * pointer against the query low/high keys. If there's an overlap, 4950 * follow the pointer. 4951 * 4952 * As an optimization, we stop scanning a block when we find a low key 4953 * that is greater than the query's high key. 4954 */ 4955 STATIC int 4956 xfs_btree_overlapped_query_range( 4957 struct xfs_btree_cur *cur, 4958 const union xfs_btree_key *low_key, 4959 const union xfs_btree_key *high_key, 4960 xfs_btree_query_range_fn fn, 4961 void *priv) 4962 { 4963 union xfs_btree_ptr ptr; 4964 union xfs_btree_ptr *pp; 4965 union xfs_btree_key rec_key; 4966 union xfs_btree_key rec_hkey; 4967 union xfs_btree_key *lkp; 4968 union xfs_btree_key *hkp; 4969 union xfs_btree_rec *recp; 4970 struct xfs_btree_block *block; 4971 int level; 4972 struct xfs_buf *bp; 4973 int i; 4974 int error; 4975 4976 /* Load the root of the btree. */ 4977 level = cur->bc_nlevels - 1; 4978 xfs_btree_init_ptr_from_cur(cur, &ptr); 4979 error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); 4980 if (error) 4981 return error; 4982 xfs_btree_get_block(cur, level, &bp); 4983 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4984 #ifdef DEBUG 4985 error = xfs_btree_check_block(cur, block, level, bp); 4986 if (error) 4987 goto out; 4988 #endif 4989 cur->bc_levels[level].ptr = 1; 4990 4991 while (level < cur->bc_nlevels) { 4992 block = xfs_btree_get_block(cur, level, &bp); 4993 4994 /* End of node, pop back towards the root. */ 4995 if (cur->bc_levels[level].ptr > 4996 be16_to_cpu(block->bb_numrecs)) { 4997 pop_up: 4998 if (level < cur->bc_nlevels - 1) 4999 cur->bc_levels[level + 1].ptr++; 5000 level++; 5001 continue; 5002 } 5003 5004 if (level == 0) { 5005 /* Handle a leaf node. */ 5006 recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, 5007 block); 5008 5009 cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); 5010 cur->bc_ops->init_key_from_rec(&rec_key, recp); 5011 5012 /* 5013 * If (query's high key < record's low key), then there 5014 * are no more interesting records in this block. Pop 5015 * up to the leaf level to find more record blocks. 5016 * 5017 * If (record's high key >= query's low key) and 5018 * (query's high key >= record's low key), then 5019 * this record overlaps the query range; callback. 5020 */ 5021 if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) 5022 goto pop_up; 5023 if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { 5024 error = fn(cur, recp, priv); 5025 if (error) 5026 break; 5027 } 5028 cur->bc_levels[level].ptr++; 5029 continue; 5030 } 5031 5032 /* Handle an internal node. */ 5033 lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); 5034 hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, 5035 block); 5036 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); 5037 5038 /* 5039 * If (query's high key < pointer's low key), then there are no 5040 * more interesting keys in this block. Pop up one leaf level 5041 * to continue looking for records. 5042 * 5043 * If (pointer's high key >= query's low key) and 5044 * (query's high key >= pointer's low key), then 5045 * this record overlaps the query range; follow pointer. 5046 */ 5047 if (xfs_btree_keycmp_lt(cur, high_key, lkp)) 5048 goto pop_up; 5049 if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { 5050 level--; 5051 error = xfs_btree_lookup_get_block(cur, level, pp, 5052 &block); 5053 if (error) 5054 goto out; 5055 xfs_btree_get_block(cur, level, &bp); 5056 trace_xfs_btree_overlapped_query_range(cur, level, bp); 5057 #ifdef DEBUG 5058 error = xfs_btree_check_block(cur, block, level, bp); 5059 if (error) 5060 goto out; 5061 #endif 5062 cur->bc_levels[level].ptr = 1; 5063 continue; 5064 } 5065 cur->bc_levels[level].ptr++; 5066 } 5067 5068 out: 5069 /* 5070 * If we don't end this function with the cursor pointing at a record 5071 * block, a subsequent non-error cursor deletion will not release 5072 * node-level buffers, causing a buffer leak. This is quite possible 5073 * with a zero-results range query, so release the buffers if we 5074 * failed to return any results. 5075 */ 5076 if (cur->bc_levels[0].bp == NULL) { 5077 for (i = 0; i < cur->bc_nlevels; i++) { 5078 if (cur->bc_levels[i].bp) { 5079 xfs_trans_brelse(cur->bc_tp, 5080 cur->bc_levels[i].bp); 5081 cur->bc_levels[i].bp = NULL; 5082 cur->bc_levels[i].ptr = 0; 5083 cur->bc_levels[i].ra = 0; 5084 } 5085 } 5086 } 5087 5088 return error; 5089 } 5090 5091 static inline void 5092 xfs_btree_key_from_irec( 5093 struct xfs_btree_cur *cur, 5094 union xfs_btree_key *key, 5095 const union xfs_btree_irec *irec) 5096 { 5097 union xfs_btree_rec rec; 5098 5099 cur->bc_rec = *irec; 5100 cur->bc_ops->init_rec_from_cur(cur, &rec); 5101 cur->bc_ops->init_key_from_rec(key, &rec); 5102 } 5103 5104 /* 5105 * Query a btree for all records overlapping a given interval of keys. The 5106 * supplied function will be called with each record found; return one of the 5107 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 5108 * code. This function returns -ECANCELED, zero, or a negative error code. 5109 */ 5110 int 5111 xfs_btree_query_range( 5112 struct xfs_btree_cur *cur, 5113 const union xfs_btree_irec *low_rec, 5114 const union xfs_btree_irec *high_rec, 5115 xfs_btree_query_range_fn fn, 5116 void *priv) 5117 { 5118 union xfs_btree_key low_key; 5119 union xfs_btree_key high_key; 5120 5121 /* Find the keys of both ends of the interval. */ 5122 xfs_btree_key_from_irec(cur, &high_key, high_rec); 5123 xfs_btree_key_from_irec(cur, &low_key, low_rec); 5124 5125 /* Enforce low key <= high key. */ 5126 if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) 5127 return -EINVAL; 5128 5129 if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) 5130 return xfs_btree_simple_query_range(cur, &low_key, 5131 &high_key, fn, priv); 5132 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 5133 fn, priv); 5134 } 5135 5136 /* Query a btree for all records. */ 5137 int 5138 xfs_btree_query_all( 5139 struct xfs_btree_cur *cur, 5140 xfs_btree_query_range_fn fn, 5141 void *priv) 5142 { 5143 union xfs_btree_key low_key; 5144 union xfs_btree_key high_key; 5145 5146 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5147 memset(&low_key, 0, sizeof(low_key)); 5148 memset(&high_key, 0xFF, sizeof(high_key)); 5149 5150 return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); 5151 } 5152 5153 static int 5154 xfs_btree_count_blocks_helper( 5155 struct xfs_btree_cur *cur, 5156 int level, 5157 void *data) 5158 { 5159 xfs_extlen_t *blocks = data; 5160 (*blocks)++; 5161 5162 return 0; 5163 } 5164 5165 /* Count the blocks in a btree and return the result in *blocks. */ 5166 int 5167 xfs_btree_count_blocks( 5168 struct xfs_btree_cur *cur, 5169 xfs_extlen_t *blocks) 5170 { 5171 *blocks = 0; 5172 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, 5173 XFS_BTREE_VISIT_ALL, blocks); 5174 } 5175 5176 /* Compare two btree pointers. */ 5177 int64_t 5178 xfs_btree_diff_two_ptrs( 5179 struct xfs_btree_cur *cur, 5180 const union xfs_btree_ptr *a, 5181 const union xfs_btree_ptr *b) 5182 { 5183 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5184 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); 5185 return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); 5186 } 5187 5188 struct xfs_btree_has_records { 5189 /* Keys for the start and end of the range we want to know about. */ 5190 union xfs_btree_key start_key; 5191 union xfs_btree_key end_key; 5192 5193 /* Mask for key comparisons, if desired. */ 5194 const union xfs_btree_key *key_mask; 5195 5196 /* Highest record key we've seen so far. */ 5197 union xfs_btree_key high_key; 5198 5199 enum xbtree_recpacking outcome; 5200 }; 5201 5202 STATIC int 5203 xfs_btree_has_records_helper( 5204 struct xfs_btree_cur *cur, 5205 const union xfs_btree_rec *rec, 5206 void *priv) 5207 { 5208 union xfs_btree_key rec_key; 5209 union xfs_btree_key rec_high_key; 5210 struct xfs_btree_has_records *info = priv; 5211 enum xbtree_key_contig key_contig; 5212 5213 cur->bc_ops->init_key_from_rec(&rec_key, rec); 5214 5215 if (info->outcome == XBTREE_RECPACKING_EMPTY) { 5216 info->outcome = XBTREE_RECPACKING_SPARSE; 5217 5218 /* 5219 * If the first record we find does not overlap the start key, 5220 * then there is a hole at the start of the search range. 5221 * Classify this as sparse and stop immediately. 5222 */ 5223 if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, 5224 info->key_mask)) 5225 return -ECANCELED; 5226 } else { 5227 /* 5228 * If a subsequent record does not overlap with the any record 5229 * we've seen so far, there is a hole in the middle of the 5230 * search range. Classify this as sparse and stop. 5231 * If the keys overlap and this btree does not allow overlap, 5232 * signal corruption. 5233 */ 5234 key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, 5235 &rec_key, info->key_mask); 5236 if (key_contig == XBTREE_KEY_OVERLAP && 5237 !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) 5238 return -EFSCORRUPTED; 5239 if (key_contig == XBTREE_KEY_GAP) 5240 return -ECANCELED; 5241 } 5242 5243 /* 5244 * If high_key(rec) is larger than any other high key we've seen, 5245 * remember it for later. 5246 */ 5247 cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); 5248 if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, 5249 info->key_mask)) 5250 info->high_key = rec_high_key; /* struct copy */ 5251 5252 return 0; 5253 } 5254 5255 /* 5256 * Scan part of the keyspace of a btree and tell us if that keyspace does not 5257 * map to any records; is fully mapped to records; or is partially mapped to 5258 * records. This is the btree record equivalent to determining if a file is 5259 * sparse. 5260 * 5261 * For most btree types, the record scan should use all available btree key 5262 * fields to compare the keys encountered. These callers should pass NULL for 5263 * @mask. However, some callers (e.g. scanning physical space in the rmapbt) 5264 * want to ignore some part of the btree record keyspace when performing the 5265 * comparison. These callers should pass in a union xfs_btree_key object with 5266 * the fields that *should* be a part of the comparison set to any nonzero 5267 * value, and the rest zeroed. 5268 */ 5269 int 5270 xfs_btree_has_records( 5271 struct xfs_btree_cur *cur, 5272 const union xfs_btree_irec *low, 5273 const union xfs_btree_irec *high, 5274 const union xfs_btree_key *mask, 5275 enum xbtree_recpacking *outcome) 5276 { 5277 struct xfs_btree_has_records info = { 5278 .outcome = XBTREE_RECPACKING_EMPTY, 5279 .key_mask = mask, 5280 }; 5281 int error; 5282 5283 /* Not all btrees support this operation. */ 5284 if (!cur->bc_ops->keys_contiguous) { 5285 ASSERT(0); 5286 return -EOPNOTSUPP; 5287 } 5288 5289 xfs_btree_key_from_irec(cur, &info.start_key, low); 5290 xfs_btree_key_from_irec(cur, &info.end_key, high); 5291 5292 error = xfs_btree_query_range(cur, low, high, 5293 xfs_btree_has_records_helper, &info); 5294 if (error == -ECANCELED) 5295 goto out; 5296 if (error) 5297 return error; 5298 5299 if (info.outcome == XBTREE_RECPACKING_EMPTY) 5300 goto out; 5301 5302 /* 5303 * If the largest high_key(rec) we saw during the walk is greater than 5304 * the end of the search range, classify this as full. Otherwise, 5305 * there is a hole at the end of the search range. 5306 */ 5307 if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, 5308 mask)) 5309 info.outcome = XBTREE_RECPACKING_FULL; 5310 5311 out: 5312 *outcome = info.outcome; 5313 return 0; 5314 } 5315 5316 /* Are there more records in this btree? */ 5317 bool 5318 xfs_btree_has_more_records( 5319 struct xfs_btree_cur *cur) 5320 { 5321 struct xfs_btree_block *block; 5322 struct xfs_buf *bp; 5323 5324 block = xfs_btree_get_block(cur, 0, &bp); 5325 5326 /* There are still records in this block. */ 5327 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5328 return true; 5329 5330 /* There are more record blocks. */ 5331 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5332 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5333 else 5334 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5335 } 5336 5337 /* Set up all the btree cursor caches. */ 5338 int __init 5339 xfs_btree_init_cur_caches(void) 5340 { 5341 int error; 5342 5343 error = xfs_allocbt_init_cur_cache(); 5344 if (error) 5345 return error; 5346 error = xfs_inobt_init_cur_cache(); 5347 if (error) 5348 goto err; 5349 error = xfs_bmbt_init_cur_cache(); 5350 if (error) 5351 goto err; 5352 error = xfs_rmapbt_init_cur_cache(); 5353 if (error) 5354 goto err; 5355 error = xfs_refcountbt_init_cur_cache(); 5356 if (error) 5357 goto err; 5358 5359 return 0; 5360 err: 5361 xfs_btree_destroy_cur_caches(); 5362 return error; 5363 } 5364 5365 /* Destroy all the btree cursor caches, if they've been allocated. */ 5366 void 5367 xfs_btree_destroy_cur_caches(void) 5368 { 5369 xfs_allocbt_destroy_cur_cache(); 5370 xfs_inobt_destroy_cur_cache(); 5371 xfs_bmbt_destroy_cur_cache(); 5372 xfs_rmapbt_destroy_cur_cache(); 5373 xfs_refcountbt_destroy_cur_cache(); 5374 } 5375 5376 /* Move the btree cursor before the first record. */ 5377 int 5378 xfs_btree_goto_left_edge( 5379 struct xfs_btree_cur *cur) 5380 { 5381 int stat = 0; 5382 int error; 5383 5384 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5385 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 5386 if (error) 5387 return error; 5388 if (!stat) 5389 return 0; 5390 5391 error = xfs_btree_decrement(cur, 0, &stat); 5392 if (error) 5393 return error; 5394 if (stat != 0) { 5395 ASSERT(0); 5396 xfs_btree_mark_sick(cur); 5397 return -EFSCORRUPTED; 5398 } 5399 5400 return 0; 5401 } 5402