// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

/*
 * Btree magic numbers.
 */
uint32_t
xfs_btree_magic(
	struct xfs_mount		*mp,
	const struct xfs_btree_ops	*ops)
{
	int				idx = xfs_has_crc(mp) ? 1 : 0;
	__be32				magic = ops->buf_ops->magic[idx];

	/* Ensure we asked for crc for crc-only magics. */
	ASSERT(magic != 0);
	return be32_to_cpu(magic);
}

/*
 * These sibling pointer checks are optimised for null sibling pointers. This
 * happens a lot, and we don't need to byte swap at runtime if the sibling
 * pointer is NULL.
 *
 * These are explicitly marked as inline because the cost of calling them as
 * functions instead of inlining them is about 36 bytes extra code per call
 * site on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of
 * these two sibling check functions reduces the compiled code size by over
 * 300 bytes.
 */
static inline xfs_failaddr_t
xfs_btree_check_fsblock_siblings(
	struct xfs_mount	*mp,
	xfs_fsblock_t		fsb,
	__be64			dsibling)
{
	xfs_fsblock_t		sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == fsb)
		return __this_address;
	if (!xfs_verify_fsbno(mp, sibling))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_memblock_siblings(
	struct xfs_buftarg	*btp,
	xfbno_t			bno,
	__be64			dsibling)
{
	xfbno_t			sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == bno)
		return __this_address;
	if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_agblock_siblings(
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	__be32			dsibling)
{
	xfs_agblock_t		sibling;

	if (dsibling == cpu_to_be32(NULLAGBLOCK))
		return NULL;

	sibling = be32_to_cpu(dsibling);
	if (sibling == agbno)
		return __this_address;
	if (!xfs_verify_agbno(pag, sibling))
		return __this_address;
	return NULL;
}
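/*
 * Illustrative note (not from the original source): the null checks in the
 * helpers above work because cpu_to_be64(NULLFSBLOCK) and
 * cpu_to_be32(NULLAGBLOCK) fold to compile-time constants, so the common
 * null-sibling case compares the raw on-disk value directly; only non-null
 * siblings pay for the be64_to_cpu()/be32_to_cpu() conversion that feeds
 * the range checks.
 */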
static xfs_failaddr_t
__xfs_btree_check_lblock_hdr(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.l.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	return NULL;
}

/*
 * Check a long btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_fsblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;
	xfs_fsblock_t		fsb;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	/*
	 * For inode-rooted btrees, the root block sits in the inode fork.  In
	 * that case bp is NULL, and the block must not have any siblings.
	 */
	if (!bp) {
		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		return NULL;
	}

	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}

/*
 * Check an in-memory btree block header.  Return the address of the failing
 * check, or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_memblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_memblock_siblings(btp, bno,
				block->bb_u.l.bb_rightsib);
	return fa;
}
/*
 * Check a short btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_agblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	xfs_failaddr_t		fa;
	xfs_agblock_t		agbno;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}

/*
 * Internal btree block check.
 *
 * Return NULL if the block is ok or the address of the failed check otherwise.
 */
xfs_failaddr_t
__xfs_btree_check_block(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return __xfs_btree_check_memblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_AG:
		return __xfs_btree_check_agblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_INODE:
		return __xfs_btree_check_fsblock(cur, block, level, bp);
	default:
		ASSERT(0);
		return __this_address;
	}
}

static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
		return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
	return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}

/*
 * Debug routine: check that block header is ok.
 */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block, if any */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_block(cur, block, level, bp);
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	return 0;
}

int
__xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	if (level <= 0)
		return -EFSCORRUPTED;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_INODE:
		if (!xfs_verify_fsbno(cur->bc_mp,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_AG:
		if (!xfs_verify_agbno(cur->bc_ag.pag,
				be32_to_cpu((&ptr->s)[index])))
			return -EFSCORRUPTED;
		break;
	}

	return 0;
}
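/*
 * Illustrative note (not from the original source): __xfs_btree_check_ptr()
 * treats @ptr as the base of a contiguous pointer array and @index as the
 * element to validate, hence the (&ptr->l)[index] addressing; passing
 * index 0 checks exactly the pointer handed in.  @level must be at least 1
 * because only interior nodes carry block pointers.
 */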
/*
 * Check that a given (indexed) btree pointer at a certain level of a
 * btree is valid and doesn't point past where it should.
 */
static int
xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	int				error;

	error = __xfs_btree_check_ptr(cur, ptr, index, level);
	if (error) {
		switch (cur->bc_ops->type) {
		case XFS_BTREE_TYPE_MEM:
			xfs_err(cur->bc_mp,
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
				cur->bc_ops->name, cur->bc_flags, level, index,
				__this_address);
			break;
		case XFS_BTREE_TYPE_INODE:
			xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_ino.ip->i_ino,
				cur->bc_ino.whichfork, cur->bc_ops->name,
				level, index);
			break;
		case XFS_BTREE_TYPE_AG:
			xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
				pag_agno(cur->bc_ag.pag), cur->bc_ops->name,
				level, index);
			break;
		}
		xfs_btree_mark_sick(cur);
	}

	return error;
}

#ifdef DEBUG
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * long-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_fsblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}

bool
xfs_btree_fsblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
	}

	return true;
}
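/*
 * Illustrative note (not from the original source): the calc and verify
 * halves above mirror each other.  At writeback, the buffer log item's LSN
 * is stamped into bb_lsn and a CRC is computed over the whole block at
 * XFS_BTREE_LBLOCK_CRC_OFF; at read time, the LSN is first checked against
 * the current log before the checksum is recomputed and compared.
 */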
/*
 * Calculate CRC on the whole btree block and stuff it into the
 * short-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_agblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}

bool
xfs_btree_agblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
	}

	return true;
}

static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	trace_xfs_btree_free_block(cur, bp);

	/*
	 * Don't allow block freeing for a staging cursor, because staging
	 * cursors do not support regular btree modifications.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}

/*
 * Delete the btree cursor.
 */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			error)		/* del because of error */
{
	int			i;		/* btree level */

	/*
	 * Clear the buffer pointers and release the buffers.  If we're doing
	 * this because of an error, inspect all of the entries in the bc_bufs
	 * array for buffers to be unlocked.  This is because some of the btree
	 * code works from level n down to 0, and if we get an error along the
	 * way we won't have initialized all the entries down to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_levels[i].bp)
			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
		else if (!error)
			break;
	}

	/*
	 * If we are doing a BMBT update, the number of unaccounted blocks
	 * allocated during this cursor lifetime should be zero.  If it's not
	 * zero, then we should be shut down or on our way to shutdown due to
	 * cancelling a dirty transaction on error.
	 */
	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
	       xfs_is_shutdown(cur->bc_mp) || error != 0);

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		if (cur->bc_ag.pag)
			xfs_perag_put(cur->bc_ag.pag);
		break;
	case XFS_BTREE_TYPE_INODE:
		/* nothing to do */
		break;
	case XFS_BTREE_TYPE_MEM:
		if (cur->bc_mem.pag)
			xfs_perag_put(cur->bc_mem.pag);
		break;
	}

	kmem_cache_free(cur->bc_cache, cur);
}
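/*
 * Illustrative usage sketch (not from the original source): callers thread
 * their final status into the teardown so that held buffers are released
 * even on failure, e.g.:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	...
 *	xfs_btree_del_cursor(cur, error);
 */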
/* Return the buffer target for this btree's buffer. */
static inline struct xfs_buftarg *
xfs_btree_buftarg(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return cur->bc_mem.xfbtree->target;
	return cur->bc_mp->m_ddev_targp;
}

/* Return the block size (in units of 512b sectors) for this btree. */
static inline unsigned int
xfs_btree_bbsize(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return XFBNO_BBSIZE;
	return cur->bc_mp->m_bsize;
}

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 */
int					/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,	/* input cursor */
	struct xfs_btree_cur	**ncur)	/* output cursor */
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_trans	*tp = cur->bc_tp;
	struct xfs_buf		*bp;
	struct xfs_btree_cur	*new;
	int			error;
	int			i;

	/*
	 * Don't allow staging cursors to be duplicated because they're
	 * supposed to be kept private to a single thread.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level current, re-get the buffer and copy the ptr value.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
		new->bc_levels[i].ra = cur->bc_levels[i].ra;
		bp = cur->bc_levels[i].bp;
		if (bp) {
			error = xfs_trans_read_buf(mp, tp,
					xfs_btree_buftarg(cur),
					xfs_buf_daddr(bp),
					xfs_btree_bbsize(cur), 0, &bp,
					cur->bc_ops->buf_ops);
			if (xfs_metadata_is_sick(error))
				xfs_btree_mark_sick(new);
			if (error) {
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_levels[i].bp = bp;
	}
	*ncur = new;
	return 0;
}

/*
 * XFS btree block layout and addressing:
 *
 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 *
 * Leaf blocks start with a header, followed by records containing the
 * values.  A non-leaf block also starts with the same header, and then first
 * contains lookup keys followed by an equal number of pointers to the btree
 * blocks at the previous level.
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 * The header is called struct xfs_btree_block for reasons better left unknown
 * and comes in different versions for short (32bit) and long (64bit) block
 * pointers.  The record and key structures are defined by the btree instances
 * and opaque to the btree core.  The block pointers are simple disk endian
 * integers, available in a short (32bit) and long (64bit) variant.
 *
 * The helpers below calculate the offset of a given record, key or pointer
 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 * inside the btree block is done using indices starting at one, not zero!
 *
 * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
 * overlapping intervals.  In such a tree, records are still sorted lowest to
 * highest and indexed by the smallest key value that refers to the record.
 * However, nodes are different: each pointer has two associated keys -- one
 * indexing the lowest key available in the block(s) below (the same behavior
 * as the key in a regular btree) and another indexing the highest key
 * available in the block(s) below.
 * Because records are /not/ sorted by the highest key, all leaf block
 * updates require us to compute the highest key that matches any record in
 * the leaf and to recursively update the high keys in the nodes going
 * further up in the tree, if necessary.  Nodes look like this:
 *
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 *
 * To perform an interval query on an overlapped tree, perform the usual
 * depth-first search and use the low and high keys to decide if we can skip
 * that particular node.  If a leaf node is reached, return the records that
 * intersect the interval.  Note that an interval query may return numerous
 * entries.  For a non-overlapped tree, simply search for the record
 * associated with the lowest key and iterate forward until a non-matching
 * record is found.  Section 14.3 ("Interval Trees") of _Introduction to
 * Algorithms_ by Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only)
 * discusses this in more detail.
 *
 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 * reverse mapping records on a reflink filesystem:
 *
 * 1: +- file A startblock B offset C length D -----------+
 * 2:      +- file E startblock F offset G length H --------------+
 * 3:      +- file I startblock F offset J length K --+
 * 4:                                                        +- file L... --+
 *
 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 * we'd simply increment the length of record 1.  But how do we find the
 * record that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would
 * return record 3 because the keys are ordered first by startblock.  An
 * interval query would return records 1 and 2 because they both overlap
 * (B+D-1), and from that we can pick out record 1 as the appropriate left
 * neighbor.
 *
 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 * because a record's interval must end before the next record.
 */

/*
 * Return size of the btree block header for this btree instance.
 */
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (xfs_has_crc(cur->bc_mp))
			return XFS_BTREE_LBLOCK_CRC_LEN;
		return XFS_BTREE_LBLOCK_LEN;
	}
	if (xfs_has_crc(cur->bc_mp))
		return XFS_BTREE_SBLOCK_CRC_LEN;
	return XFS_BTREE_SBLOCK_LEN;
}

/*
 * Calculate offset of the n-th record in a btree block.
 */
STATIC size_t
xfs_btree_rec_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->rec_len;
}

/*
 * Calculate offset of the n-th key in a btree block.
 */
STATIC size_t
xfs_btree_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len;
}

/*
 * Calculate offset of the n-th high key in a btree block.
 */
STATIC size_t
xfs_btree_high_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
}
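/*
 * Worked example (hypothetical numbers, not from the original source): for
 * a btree with a 16 byte header and key_len = 16, key slot n (1-based)
 * starts at byte 16 + (n - 1) * 16, so slot 1 sits immediately after the
 * header.  In an overlapping btree each slot holds a low/high key pair,
 * which is why the high key lives key_len / 2 bytes into its slot.
 */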
/*
 * Calculate offset of the n-th block pointer in a btree block.
 */
STATIC size_t
xfs_btree_ptr_offset(
	struct xfs_btree_cur	*cur,
	int			n,
	int			level)
{
	return xfs_btree_block_len(cur) +
		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
		(n - 1) * cur->bc_ops->ptr_len;
}

/*
 * Return a pointer to the n-th record in the btree block.
 */
union xfs_btree_rec *
xfs_btree_rec_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_rec *)
		((char *)block + xfs_btree_rec_offset(cur, n));
}

/*
 * Return a pointer to the n-th key in the btree block.
 */
union xfs_btree_key *
xfs_btree_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th high key in the btree block.
 */
union xfs_btree_key *
xfs_btree_high_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_high_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th block pointer in the btree block.
 */
union xfs_btree_ptr *
xfs_btree_ptr_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	int			level = xfs_btree_get_level(block);

	ASSERT(block->bb_level != 0);

	return (union xfs_btree_ptr *)
		((char *)block + xfs_btree_ptr_offset(cur, n, level));
}
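/*
 * Illustrative note (not from the original source): all of the *_addr()
 * helpers above are 1-based, so xfs_btree_rec_addr(cur, 1, block) is the
 * first record.  xfs_btree_ptr_offset() skips get_maxrecs() key slots
 * whether or not they are in use, so the pointer array begins at a fixed
 * offset for a given level.
 */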
struct xfs_ifork *
xfs_btree_ifork_ptr(
	struct xfs_btree_cur	*cur)
{
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);

	if (cur->bc_flags & XFS_BTREE_STAGING)
		return cur->bc_ino.ifake->if_fork;
	return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
}

/*
 * Get the root block which is stored in the inode.
 *
 * For now this btree implementation assumes the btree root is always
 * stored in the if_broot field of an inode fork.
 */
STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
	struct xfs_btree_cur	*cur)
{
	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

	return (struct xfs_btree_block *)ifp->if_broot;
}

/*
 * Retrieve the block pointer from the cursor at the given level.
 * This may be an inode btree root or from a buffer.
 */
struct xfs_btree_block *		/* generic btree block pointer */
xfs_btree_get_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level in btree */
	struct xfs_buf		**bpp)	/* buffer containing the block */
{
	if (xfs_btree_at_iroot(cur, level)) {
		*bpp = NULL;
		return xfs_btree_get_iroot(cur);
	}

	*bpp = cur->bc_levels[level].bp;
	return XFS_BUF_TO_BLOCK(*bpp);
}

/*
 * Change the cursor to point to the first record at the given level.
 * Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_firstrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to 1, that's the first record/key.
	 */
	cur->bc_levels[level].ptr = 1;
	return 1;
}

/*
 * Change the cursor to point to the last record in the current block
 * at the given level.  Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_lastrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to numrecs, that's the last record/key.
	 */
	cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
	return 1;
}

/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 */
void
xfs_btree_offsets(
	uint32_t	fields,		/* bitmask of fields */
	const short	*offsets,	/* table of field offsets */
	int		nbits,		/* number of bits to inspect */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	int		i;		/* current bit number */
	uint32_t	imask;		/* mask for current bit number */

	ASSERT(fields != 0);
	/*
	 * Find the lowest bit, so the first byte offset.
	 */
	for (i = 0, imask = 1u; ; i++, imask <<= 1) {
		if (imask & fields) {
			*first = offsets[i];
			break;
		}
	}
	/*
	 * Find the highest bit, so the last byte offset.
	 */
	for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
		if (imask & fields) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}
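/*
 * Worked example (not from the original source): logging bb_magic and
 * bb_level passes fields = XFS_BB_MAGIC | XFS_BB_LEVEL.  The first loop
 * stops at bit 0, giving *first = offsetof(..., bb_magic) = 0; the second
 * stops at bit 1, giving *last = offsetof(..., bb_numrecs) - 1, the final
 * byte of bb_level.
 */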
STATIC int
xfs_btree_readahead_fsblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_memblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfbno_t			left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfbno_t			right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_agblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(cur->bc_ag.pag, left),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(cur->bc_ag.pag, right),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}
/*
 * Read-ahead btree blocks, at the given level.
 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 */
STATIC int
xfs_btree_readahead(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	int			lr)	/* left/right bits */
{
	struct xfs_btree_block	*block;

	/*
	 * No readahead needed if we are at the root level and the
	 * btree root is stored in the inode.
	 */
	if (xfs_btree_at_iroot(cur, lev))
		return 0;

	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
		return 0;

	cur->bc_levels[lev].ra |= lr;
	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		return xfs_btree_readahead_agblock(cur, lr, block);
	case XFS_BTREE_TYPE_INODE:
		return xfs_btree_readahead_fsblock(cur, lr, block);
	case XFS_BTREE_TYPE_MEM:
		return xfs_btree_readahead_memblock(cur, lr, block);
	default:
		ASSERT(0);
		return 0;
	}
}

STATIC int
xfs_btree_ptr_to_daddr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	xfs_daddr_t			*daddr)
{
	int				error;

	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
	if (error)
		return error;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		*daddr = xfs_agbno_to_daddr(cur->bc_ag.pag,
				be32_to_cpu(ptr->s));
		break;
	case XFS_BTREE_TYPE_INODE:
		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
		break;
	case XFS_BTREE_TYPE_MEM:
		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
		break;
	}
	return 0;
}

/*
 * Readahead @count btree blocks at the given @ptr location.
 *
 * We don't need to care about long or short form btrees here as we have a
 * method of converting the ptr directly to a daddr available to us.
 */
STATIC void
xfs_btree_readahead_ptr(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	xfs_extlen_t		count)
{
	xfs_daddr_t		daddr;

	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
		return;
	xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
			xfs_btree_bbsize(cur) * count,
			cur->bc_ops->buf_ops);
}
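/*
 * Illustrative note (not from the original source): the daddr translation
 * above is what unifies the three block address spaces behind one cursor
 * interface; an AG btree maps a 32-bit agbno within its perag, an
 * inode-rooted btree maps a global 64-bit fsblock, and an in-memory btree
 * treats the pointer as a linear block number inside its xfile-backed
 * buftarg.
 */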
/*
 * Set the buffer for level "lev" in the cursor to bp, releasing
 * any previous buffer.
 */
STATIC void
xfs_btree_setbuf(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	struct xfs_buf		*bp)	/* new buffer to set */
{
	struct xfs_btree_block	*b;	/* btree block */

	if (cur->bc_levels[lev].bp)
		xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
	cur->bc_levels[lev].bp = bp;
	cur->bc_levels[lev].ra = 0;

	b = XFS_BUF_TO_BLOCK(bp);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	} else {
		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	}
}

bool
xfs_btree_ptr_is_null(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr->l == cpu_to_be64(NULLFSBLOCK);
	else
		return ptr->s == cpu_to_be32(NULLAGBLOCK);
}

void
xfs_btree_set_ptr_null(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(NULLFSBLOCK);
	else
		ptr->s = cpu_to_be32(NULLAGBLOCK);
}

static inline bool
xfs_btree_ptrs_equal(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr1,
	union xfs_btree_ptr	*ptr2)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr1->l == ptr2->l;
	return ptr1->s == ptr2->s;
}

/*
 * Get/set/init sibling pointers
 */
void
xfs_btree_get_sibling(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_ptr	*ptr,
	int			lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->l = block->bb_u.l.bb_rightsib;
		else
			ptr->l = block->bb_u.l.bb_leftsib;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->s = block->bb_u.s.bb_rightsib;
		else
			ptr->s = block->bb_u.s.bb_leftsib;
	}
}

void
xfs_btree_set_sibling(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_block		*block,
	const union xfs_btree_ptr	*ptr,
	int				lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.l.bb_rightsib = ptr->l;
		else
			block->bb_u.l.bb_leftsib = ptr->l;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.s.bb_rightsib = ptr->s;
		else
			block->bb_u.s.bb_leftsib = ptr->s;
	}
}
static void
__xfs_btree_init_block(
	struct xfs_mount		*mp,
	struct xfs_btree_block		*buf,
	const struct xfs_btree_ops	*ops,
	xfs_daddr_t			blkno,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	bool				crc = xfs_has_crc(mp);
	__u32				magic = xfs_btree_magic(mp, ops);

	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (crc) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (crc) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			/* owner is a 32 bit value on short blocks */
			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}

void
xfs_btree_init_block(
	struct xfs_mount		*mp,
	struct xfs_btree_block		*block,
	const struct xfs_btree_ops	*ops,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
			numrecs, owner);
}

void
xfs_btree_init_buf(
	struct xfs_mount		*mp,
	struct xfs_buf			*bp,
	const struct xfs_btree_ops	*ops,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
			xfs_buf_daddr(bp), level, numrecs, owner);
	bp->b_ops = ops->buf_ops;
}

static inline __u64
xfs_btree_owner(
	struct xfs_btree_cur	*cur)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return cur->bc_mem.xfbtree->owner;
	case XFS_BTREE_TYPE_INODE:
		return cur->bc_ino.ip->i_ino;
	case XFS_BTREE_TYPE_AG:
		return pag_agno(cur->bc_ag.pag);
	default:
		ASSERT(0);
		return 0;
	}
}

void
xfs_btree_init_block_cur(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	int			numrecs)
{
	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
			xfs_btree_owner(cur));
}

STATIC void
xfs_btree_buf_to_ptr(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	union xfs_btree_ptr	*ptr)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
				xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_INODE:
		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
				xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_MEM:
		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
		break;
	}
}

static inline void
xfs_btree_set_refs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}

int
xfs_btree_get_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	xfs_daddr_t		d;
	int			error;

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), 0, bpp);
	if (error)
		return error;

	(*bpp)->b_ops = cur->bc_ops->buf_ops;
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}
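/*
 * Illustrative sketch (not from the original source) of how the helpers
 * above combine when a new block joins the tree, as in the split and
 * new-root paths: xfs_btree_get_buf_block() grabs the buffer for the new
 * pointer, xfs_btree_init_block_cur() stamps the header with the cursor's
 * magic, level and owner, records are copied in, and the result is logged.
 */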
/*
 * Read in the buffer at the given ptr and return the buffer and
 * the block pointer within the buffer.
 */
int
xfs_btree_read_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				flags,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;
	int			error;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), flags, bpp,
			cur->bc_ops->buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_btree_mark_sick(cur);
	if (error)
		return error;

	xfs_btree_set_refs(cur, *bpp);
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}

/*
 * Copy keys from one btree block to another.
 */
void
xfs_btree_copy_keys(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*dst_key,
	const union xfs_btree_key	*src_key,
	int				numkeys)
{
	ASSERT(numkeys >= 0);
	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
}

/*
 * Copy records from one btree block to another.
 */
STATIC void
xfs_btree_copy_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*dst_rec,
	union xfs_btree_rec	*src_rec,
	int			numrecs)
{
	ASSERT(numrecs >= 0);
	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Copy block pointers from one btree block to another.
 */
void
xfs_btree_copy_ptrs(
	struct xfs_btree_cur		*cur,
	union xfs_btree_ptr		*dst_ptr,
	const union xfs_btree_ptr	*src_ptr,
	int				numptrs)
{
	ASSERT(numptrs >= 0);
	memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}

/*
 * Shift keys one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_keys(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key,
	int			dir,
	int			numkeys)
{
	char			*dst_key;

	ASSERT(numkeys >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
}

/*
 * Shift records one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec,
	int			dir,
	int			numrecs)
{
	char			*dst_rec;

	ASSERT(numrecs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Shift block pointers one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_ptrs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			dir,
	int			numptrs)
{
	char			*dst_ptr;

	ASSERT(numptrs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
	memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}
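/*
 * Illustrative usage sketch (not from the original source): to open a hole
 * at record slot n before an insert, a caller shifts the tail right by one:
 *
 *	rp = xfs_btree_rec_addr(cur, n, block);
 *	xfs_btree_shift_recs(cur, rp, 1, numrecs - n + 1);
 *
 * dir = 1 moves everything one rec_len towards higher indices and dir = -1
 * closes the gap on delete; memmove() makes the overlapping copy safe.
 */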
/*
 * Log key values from the btree block.
 */
STATIC void
xfs_btree_log_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{

	if (bp) {
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_key_offset(cur, first),
				xfs_btree_key_offset(cur, last + 1) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}

/*
 * Log record values from the btree block.
 */
void
xfs_btree_log_recs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{

	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(cur->bc_tp, bp,
			xfs_btree_rec_offset(cur, first),
			xfs_btree_rec_offset(cur, last + 1) - 1);

}

/*
 * Log block pointer fields from a btree block (nonleaf).
 */
STATIC void
xfs_btree_log_ptrs(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			first,	/* index of first pointer to log */
	int			last)	/* index of last pointer to log */
{

	if (bp) {
		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
		int			level = xfs_btree_get_level(block);

		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_ptr_offset(cur, first, level),
				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}

}
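/*
 * Worked example (not from the original source): the helpers above convert
 * slot ranges to byte ranges, so logging records first = 2, last = 3
 * dirties bytes [xfs_btree_rec_offset(2), xfs_btree_rec_offset(4) - 1],
 * exactly the two affected slots and nothing else in the block.
 */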
/*
 * Log fields from a btree block header.
 */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	uint32_t		fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
		XFS_BTREE_SBLOCK_CRC_LEN
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
		XFS_BTREE_LBLOCK_CRC_LEN
	};

	if (bp) {
		int nbits;

		if (xfs_has_crc(cur->bc_mp)) {
			/*
			 * We don't log the CRC when updating a btree
			 * block but instead recreate it during log
			 * recovery.  As the log buffers have checksums
			 * of their own this is safe and avoids logging a crc
			 * update in a lot of places.
			 */
			if (fields == XFS_BB_ALL_BITS)
				fields = XFS_BB_ALL_BITS_CRC;
			nbits = XFS_BB_NUM_BITS_CRC;
		} else {
			nbits = XFS_BB_NUM_BITS;
		}
		xfs_btree_offsets(fields,
				(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
					loffsets : soffsets,
				nbits, &first, &last);
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}
/*
 * Increment cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_increment(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the right at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* We're done if we remain in the block after the increment. */
	if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
		goto out1;

	/* Fail if we just went off the right edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, increment);

	/*
	 * March up the tree incrementing pointers.
	 * Stop when we don't go off the right edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, lev, bp);
		if (error)
			goto error0;
#endif

		if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
			break;

		/* Read-ahead the right block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;

		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = 1;
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}
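/*
 * Illustrative walk-through (not from the original source): with a two
 * level tree and the cursor on the last record of a leaf, the increment
 * above steps up to the parent, advances one slot to the right sibling's
 * pointer, then walks back down and leaves bc_levels[0].ptr at 1, the
 * first record of the next leaf.
 */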
/*
 * Decrement cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_decrement(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;
	union xfs_btree_ptr	ptr;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the left at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

	/* We're done if we remain in the block after the decrement. */
	if (--cur->bc_levels[level].ptr > 0)
		goto out1;

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we just went off the left edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, decrement);

	/*
	 * March up the tree decrementing pointers.
	 * Stop when we don't go off the left edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_levels[lev].ptr > 0)
			break;
		/* Read-ahead the left block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}

	/*
	 * If we went off the root then we are either seriously confused
	 * or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;
		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}

/*
 * Check the btree block owner now that we have the context to know who the
 * real owner is.
 */
static inline xfs_failaddr_t
xfs_btree_check_block_owner(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block)
{
	__u64			owner;

	if (!xfs_has_crc(cur->bc_mp) ||
	    (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
		return NULL;

	owner = xfs_btree_owner(cur);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
			return __this_address;
	} else {
		if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
			return __this_address;
	}

	return NULL;
}

int
xfs_btree_lookup_get_block(
	struct xfs_btree_cur		*cur,	/* btree cursor */
	int				level,	/* level in the btree */
	const union xfs_btree_ptr	*pp,	/* ptr to btree block */
	struct xfs_btree_block		**blkp)	/* return btree block */
{
	struct xfs_buf		*bp;	/* buffer pointer for btree block */
	xfs_daddr_t		daddr;
	int			error = 0;

	/* special case the root block if in an inode */
	if (xfs_btree_at_iroot(cur, level)) {
		*blkp = xfs_btree_get_iroot(cur);
		return 0;
	}

	/*
	 * If the old buffer at this level is for the disk address we are
	 * looking for, reuse it.
	 *
	 * Otherwise throw it away and get a new one.
	 */
	bp = cur->bc_levels[level].bp;
	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
	if (error)
		return error;
	if (bp && xfs_buf_daddr(bp) == daddr) {
		*blkp = XFS_BUF_TO_BLOCK(bp);
		return 0;
	}

	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
	if (error)
		return error;

	/* Check the inode owner since the verifiers don't. */
	if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
		goto out_bad;

	/* Did we get the level we were looking for? */
	if (be16_to_cpu((*blkp)->bb_level) != level)
		goto out_bad;

	/* Check that internal nodes have at least one record. */
	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
		goto out_bad;

	xfs_btree_setbuf(cur, level, bp);
	return 0;

out_bad:
	*blkp = NULL;
	xfs_buf_mark_corrupt(bp);
	xfs_trans_brelse(cur->bc_tp, bp);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}
/*
 * Get current search key.  For level 0 we don't actually have a key
 * structure so we make one up from the record.  For all other levels
 * we just return the right key.
 */
STATIC union xfs_btree_key *
xfs_lookup_get_search_key(
	struct xfs_btree_cur	*cur,
	int			level,
	int			keyno,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*kp)
{
	if (level == 0) {
		cur->bc_ops->init_key_from_rec(kp,
				xfs_btree_rec_addr(cur, keyno, block));
		return kp;
	}

	return xfs_btree_key_addr(cur, keyno, block);
}

/*
 * Initialize a pointer to the root block.
 */
void
xfs_btree_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		/*
		 * Inode-rooted btrees call xfs_btree_get_iroot to find the
		 * root in xfs_btree_lookup_get_block and don't need a
		 * pointer here.
		 */
		ptr->l = 0;
	} else if (cur->bc_flags & XFS_BTREE_STAGING) {
		ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
	} else {
		cur->bc_ops->init_ptr_from_cur(cur, ptr);
	}
}

/*
 * Lookup the record.  The cursor is made to point to it, based on dir.
 * stat is set to 0 if we can't find any such record, 1 for success.
 */
int					/* error */
xfs_btree_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_lookup_t		dir,	/* <=, ==, or >= */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* current btree block */
	int64_t			diff;	/* difference for the current key */
	int			error;	/* error return value */
	int			keyno;	/* current key number */
	int			level;	/* level in the btree */
	union xfs_btree_ptr	*pp;	/* ptr to btree block */
	union xfs_btree_ptr	ptr;	/* ptr to btree block */

	XFS_BTREE_STATS_INC(cur, lookup);

	/* No such thing as a zero-level tree. */
	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	block = NULL;
	keyno = 0;

	/* initialise start pointer from cursor */
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	pp = &ptr;

	/*
	 * Iterate over each level in the btree, starting at the root.
	 * For each level above the leaves, find the key we need, based
	 * on the lookup record, then follow the corresponding block
	 * pointer down to the next level.
	 */
	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
		/* Get the block we need to do the lookup on. */
		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
		if (error)
			goto error0;

		if (diff == 0) {
			/*
			 * If we already had a key match at a higher level, we
			 * know we need to use the first entry in this block.
			 */
			keyno = 1;
		} else {
			/* Otherwise search this block.  Do a binary search. */

			int	high;	/* high entry number */
			int	low;	/* low entry number */

			/* Set low and high entry numbers, 1-based. */
			low = 1;
			high = xfs_btree_get_numrecs(block);
			if (!high) {
				/* Block is empty, must be an empty leaf. */
				if (level != 0 || cur->bc_nlevels != 1) {
					XFS_CORRUPTION_ERROR(__func__,
							XFS_ERRLEVEL_LOW,
							cur->bc_mp, block,
							sizeof(*block));
					xfs_btree_mark_sick(cur);
					return -EFSCORRUPTED;
				}

				cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
				*stat = 0;
				return 0;
			}
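			/*
			 * Illustrative note (not from the original source):
			 * key_diff() returns block key minus search key, so
			 * probing key 10 while searching for 25 yields a
			 * negative diff and the search resumes to the right
			 * with low = keyno + 1.
			 */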
			while (low <= high) {
				union xfs_btree_key	key;
				union xfs_btree_key	*kp;

				XFS_BTREE_STATS_INC(cur, compare);

				/* keyno is average of low and high. */
				keyno = (low + high) >> 1;

				/* Get current search key */
				kp = xfs_lookup_get_search_key(cur, level,
						keyno, block, &key);

				/*
				 * Compute difference to get next direction:
				 *  - less than, move right
				 *  - greater than, move left
				 *  - equal, we're done
				 */
				diff = cur->bc_ops->key_diff(cur, kp);
				if (diff < 0)
					low = keyno + 1;
				else if (diff > 0)
					high = keyno - 1;
				else
					break;
			}
		}

		/*
		 * If there are more levels, set up for the next level
		 * by getting the block number and filling in the cursor.
		 */
		if (level > 0) {
			/*
			 * If we moved left, need the previous key number,
			 * unless there isn't one.
			 */
			if (diff > 0 && --keyno < 1)
				keyno = 1;
			pp = xfs_btree_ptr_addr(cur, keyno, block);

			error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
			if (error)
				goto error0;

			cur->bc_levels[level].ptr = keyno;
		}
	}

	/* Done with the search.  See if we need to adjust the results. */
	if (dir != XFS_LOOKUP_LE && diff < 0) {
		keyno++;
		/*
		 * If ge search and we went off the end of the block, but it's
		 * not the last block, we're in the wrong block.
		 */
		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
		if (dir == XFS_LOOKUP_GE &&
		    keyno > xfs_btree_get_numrecs(block) &&
		    !xfs_btree_ptr_is_null(cur, &ptr)) {
			int	i;

			cur->bc_levels[0].ptr = keyno;
			error = xfs_btree_increment(cur, 0, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				return -EFSCORRUPTED;
			}
			*stat = 1;
			return 0;
		}
	} else if (dir == XFS_LOOKUP_LE && diff > 0)
		keyno--;
	cur->bc_levels[0].ptr = keyno;

	/* Return if we succeeded or not. */
	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
		*stat = 0;
	else if (dir != XFS_LOOKUP_EQ || diff == 0)
		*stat = 1;
	else
		*stat = 0;
	return 0;

error0:
	return error;
}
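/*
 * Illustrative sketch (not built): the same 1-based binary search convention
 * as xfs_btree_lookup() above, demonstrated on a plain sorted int array.
 * toy_key_diff mirrors cur->bc_ops->key_diff(): it returns entry minus the
 * search value, so a negative diff means the search value sorts after this
 * entry and the search moves right.
 */
#if 0
static int
toy_btree_search(
	int		*entries,	/* sorted values */
	int		nr,		/* number of entries */
	int		want)		/* value to look up */
{
	int		low = 1;	/* entries are 1-based, like keyno */
	int		high = nr;
	int		keyno = 0;

	while (low <= high) {
		int64_t	diff;

		keyno = (low + high) >> 1;
		diff = (int64_t)entries[keyno - 1] - want;
		if (diff < 0)
			low = keyno + 1;
		else if (diff > 0)
			high = keyno - 1;
		else
			break;
	}
	/* caller adjusts keyno for LE/GE semantics, as the code above does */
	return keyno;
}
#endif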
/* Find the high key storage area from a regular key. */
union xfs_btree_key *
xfs_btree_high_key_from_key(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key)
{
	ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
	return (union xfs_btree_key *)((char *)key +
			(cur->bc_ops->key_len / 2));
}

/* Determine the low (and high if overlapped) keys of a leaf block */
STATIC void
xfs_btree_get_leaf_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*key)
{
	union xfs_btree_key	max_hkey;
	union xfs_btree_key	hkey;
	union xfs_btree_rec	*rec;
	union xfs_btree_key	*high;
	int			n;

	rec = xfs_btree_rec_addr(cur, 1, block);
	cur->bc_ops->init_key_from_rec(key, rec);

	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {

		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
			rec = xfs_btree_rec_addr(cur, n, block);
			cur->bc_ops->init_high_key_from_rec(&hkey, rec);
			if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey))
				max_hkey = hkey;
		}

		high = xfs_btree_high_key_from_key(cur, key);
		memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
	}
}

/* Determine the low (and high if overlapped) keys of a node block */
STATIC void
xfs_btree_get_node_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*key)
{
	union xfs_btree_key	*hkey;
	union xfs_btree_key	*max_hkey;
	union xfs_btree_key	*high;
	int			n;

	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
		memcpy(key, xfs_btree_key_addr(cur, 1, block),
				cur->bc_ops->key_len / 2);

		max_hkey = xfs_btree_high_key_addr(cur, 1, block);
		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
			hkey = xfs_btree_high_key_addr(cur, n, block);
			if (xfs_btree_keycmp_gt(cur, hkey, max_hkey))
				max_hkey = hkey;
		}

		high = xfs_btree_high_key_from_key(cur, key);
		memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
	} else {
		memcpy(key, xfs_btree_key_addr(cur, 1, block),
				cur->bc_ops->key_len);
	}
}

/* Derive the keys for any btree block. */
void
xfs_btree_get_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*key)
{
	if (be16_to_cpu(block->bb_level) == 0)
		xfs_btree_get_leaf_keys(cur, block, key);
	else
		xfs_btree_get_node_keys(cur, block, key);
}

/*
 * Decide if we need to update the parent keys of a btree block.  For
 * a standard btree this is only necessary if we're updating the first
 * record/key.  For an overlapping btree, we must always update the
 * keys because the highest key can be in any of the records or keys
 * in the block.
 */
static inline bool
xfs_btree_needs_key_update(
	struct xfs_btree_cur	*cur,
	int			ptr)
{
	return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
}
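/*
 * Illustrative sketch (not built): the key area of an overlapping-btree
 * entry is two packed keys, which is why xfs_btree_high_key_from_key()
 * simply offsets by half the key length:
 *
 *   offset 0 ............ key_len/2 ............ key_len
 *   [ low key of subtree ][ high key of subtree ]
 */
#if 0
	union xfs_btree_key	*lo = xfs_btree_key_addr(cur, n, block);
	union xfs_btree_key	*hi = (union xfs_btree_key *)
					((char *)lo + cur->bc_ops->key_len / 2);
#endif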
2251 */ 2252 STATIC int 2253 __xfs_btree_updkeys( 2254 struct xfs_btree_cur *cur, 2255 int level, 2256 struct xfs_btree_block *block, 2257 struct xfs_buf *bp0, 2258 bool force_all) 2259 { 2260 union xfs_btree_key key; /* keys from current level */ 2261 union xfs_btree_key *lkey; /* keys from the next level up */ 2262 union xfs_btree_key *hkey; 2263 union xfs_btree_key *nlkey; /* keys from the next level up */ 2264 union xfs_btree_key *nhkey; 2265 struct xfs_buf *bp; 2266 int ptr; 2267 2268 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2269 2270 /* Exit if there aren't any parent levels to update. */ 2271 if (level + 1 >= cur->bc_nlevels) 2272 return 0; 2273 2274 trace_xfs_btree_updkeys(cur, level, bp0); 2275 2276 lkey = &key; 2277 hkey = xfs_btree_high_key_from_key(cur, lkey); 2278 xfs_btree_get_keys(cur, block, lkey); 2279 for (level++; level < cur->bc_nlevels; level++) { 2280 #ifdef DEBUG 2281 int error; 2282 #endif 2283 block = xfs_btree_get_block(cur, level, &bp); 2284 trace_xfs_btree_updkeys(cur, level, bp); 2285 #ifdef DEBUG 2286 error = xfs_btree_check_block(cur, block, level, bp); 2287 if (error) 2288 return error; 2289 #endif 2290 ptr = cur->bc_levels[level].ptr; 2291 nlkey = xfs_btree_key_addr(cur, ptr, block); 2292 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2293 if (!force_all && 2294 xfs_btree_keycmp_eq(cur, nlkey, lkey) && 2295 xfs_btree_keycmp_eq(cur, nhkey, hkey)) 2296 break; 2297 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2298 xfs_btree_log_keys(cur, bp, ptr, ptr); 2299 if (level + 1 >= cur->bc_nlevels) 2300 break; 2301 xfs_btree_get_node_keys(cur, block, lkey); 2302 } 2303 2304 return 0; 2305 } 2306 2307 /* Update all the keys from some level in cursor back to the root. */ 2308 STATIC int 2309 xfs_btree_updkeys_force( 2310 struct xfs_btree_cur *cur, 2311 int level) 2312 { 2313 struct xfs_buf *bp; 2314 struct xfs_btree_block *block; 2315 2316 block = xfs_btree_get_block(cur, level, &bp); 2317 return __xfs_btree_updkeys(cur, level, block, bp, true); 2318 } 2319 2320 /* 2321 * Update the parent keys of the given level, progressing towards the root. 2322 */ 2323 STATIC int 2324 xfs_btree_update_keys( 2325 struct xfs_btree_cur *cur, 2326 int level) 2327 { 2328 struct xfs_btree_block *block; 2329 struct xfs_buf *bp; 2330 union xfs_btree_key *kp; 2331 union xfs_btree_key key; 2332 int ptr; 2333 2334 ASSERT(level >= 0); 2335 2336 block = xfs_btree_get_block(cur, level, &bp); 2337 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) 2338 return __xfs_btree_updkeys(cur, level, block, bp, false); 2339 2340 /* 2341 * Go up the tree from this level toward the root. 2342 * At each level, update the key value to the value input. 2343 * Stop when we reach a level where the cursor isn't pointing 2344 * at the first entry in the block. 2345 */ 2346 xfs_btree_get_keys(cur, block, &key); 2347 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2348 #ifdef DEBUG 2349 int error; 2350 #endif 2351 block = xfs_btree_get_block(cur, level, &bp); 2352 #ifdef DEBUG 2353 error = xfs_btree_check_block(cur, block, level, bp); 2354 if (error) 2355 return error; 2356 #endif 2357 ptr = cur->bc_levels[level].ptr; 2358 kp = xfs_btree_key_addr(cur, ptr, block); 2359 xfs_btree_copy_keys(cur, kp, &key, 1); 2360 xfs_btree_log_keys(cur, bp, ptr, ptr); 2361 } 2362 2363 return 0; 2364 } 2365 2366 /* 2367 * Update the record referred to by cur to the value in the 2368 * given record. This either works (return 0) or gets an 2369 * EFSCORRUPTED error. 
2370 */ 2371 int 2372 xfs_btree_update( 2373 struct xfs_btree_cur *cur, 2374 union xfs_btree_rec *rec) 2375 { 2376 struct xfs_btree_block *block; 2377 struct xfs_buf *bp; 2378 int error; 2379 int ptr; 2380 union xfs_btree_rec *rp; 2381 2382 /* Pick up the current block. */ 2383 block = xfs_btree_get_block(cur, 0, &bp); 2384 2385 #ifdef DEBUG 2386 error = xfs_btree_check_block(cur, block, 0, bp); 2387 if (error) 2388 goto error0; 2389 #endif 2390 /* Get the address of the rec to be updated. */ 2391 ptr = cur->bc_levels[0].ptr; 2392 rp = xfs_btree_rec_addr(cur, ptr, block); 2393 2394 /* Fill in the new contents and log them. */ 2395 xfs_btree_copy_recs(cur, rp, rec, 1); 2396 xfs_btree_log_recs(cur, bp, ptr, ptr); 2397 2398 /* Pass new key value up to our parent. */ 2399 if (xfs_btree_needs_key_update(cur, ptr)) { 2400 error = xfs_btree_update_keys(cur, 0); 2401 if (error) 2402 goto error0; 2403 } 2404 2405 return 0; 2406 2407 error0: 2408 return error; 2409 } 2410 2411 /* 2412 * Move 1 record left from cur/level if possible. 2413 * Update cur to reflect the new path. 2414 */ 2415 STATIC int /* error */ 2416 xfs_btree_lshift( 2417 struct xfs_btree_cur *cur, 2418 int level, 2419 int *stat) /* success/failure */ 2420 { 2421 struct xfs_buf *lbp; /* left buffer pointer */ 2422 struct xfs_btree_block *left; /* left btree block */ 2423 int lrecs; /* left record count */ 2424 struct xfs_buf *rbp; /* right buffer pointer */ 2425 struct xfs_btree_block *right; /* right btree block */ 2426 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2427 int rrecs; /* right record count */ 2428 union xfs_btree_ptr lptr; /* left btree pointer */ 2429 union xfs_btree_key *rkp = NULL; /* right btree key */ 2430 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2431 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2432 int error; /* error return value */ 2433 int i; 2434 2435 if (xfs_btree_at_iroot(cur, level)) 2436 goto out0; 2437 2438 /* Set up variables for this block as "right". */ 2439 right = xfs_btree_get_block(cur, level, &rbp); 2440 2441 #ifdef DEBUG 2442 error = xfs_btree_check_block(cur, right, level, rbp); 2443 if (error) 2444 goto error0; 2445 #endif 2446 2447 /* If we've got no left sibling then we can't shift an entry left. */ 2448 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2449 if (xfs_btree_ptr_is_null(cur, &lptr)) 2450 goto out0; 2451 2452 /* 2453 * If the cursor entry is the one that would be moved, don't 2454 * do it... it's too complicated. 2455 */ 2456 if (cur->bc_levels[level].ptr <= 1) 2457 goto out0; 2458 2459 /* Set up the left neighbor as "left". */ 2460 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2461 if (error) 2462 goto error0; 2463 2464 /* If it's full, it can't take another entry. */ 2465 lrecs = xfs_btree_get_numrecs(left); 2466 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2467 goto out0; 2468 2469 rrecs = xfs_btree_get_numrecs(right); 2470 2471 /* 2472 * We add one entry to the left side and remove one for the right side. 2473 * Account for it here, the changes will be updated on disk and logged 2474 * later. 2475 */ 2476 lrecs++; 2477 rrecs--; 2478 2479 XFS_BTREE_STATS_INC(cur, lshift); 2480 XFS_BTREE_STATS_ADD(cur, moves, 1); 2481 2482 /* 2483 * If non-leaf, copy a key and a ptr to the left block. 2484 * Log the changes to the left block. 2485 */ 2486 if (level > 0) { 2487 /* It's a non-leaf. Move keys and pointers. 
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */

		lkp = xfs_btree_key_addr(cur, lrecs, left);
		rkp = xfs_btree_key_addr(cur, 1, right);

		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
		if (error)
			goto error0;

		xfs_btree_copy_keys(cur, lkp, rkp, 1);
		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);

		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);

		ASSERT(cur->bc_ops->keys_inorder(cur,
			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
	} else {
		/* It's a leaf.  Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */

		lrp = xfs_btree_rec_addr(cur, lrecs, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, lrp, rrp, 1);
		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);

		ASSERT(cur->bc_ops->recs_inorder(cur,
			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
	}

	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);

	xfs_btree_set_numrecs(right, rrecs);
	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);

	/*
	 * Slide the contents of right down one entry.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
	if (level > 0) {
		/* It's a non-leaf.  Operate on keys and ptrs. */
		for (i = 0; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
			if (error)
				goto error0;
		}

		xfs_btree_shift_keys(cur,
				xfs_btree_key_addr(cur, 2, right),
				-1, rrecs);
		xfs_btree_shift_ptrs(cur,
				xfs_btree_ptr_addr(cur, 2, right),
				-1, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
	} else {
		/* It's a leaf.  Operate on records. */
		xfs_btree_shift_recs(cur,
				xfs_btree_rec_addr(cur, 2, right),
				-1, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);
	}

	/*
	 * Using a temporary cursor, update the parent key values of the
	 * block on the left.
	 */
	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
		error = xfs_btree_dup_cursor(cur, &tcur);
		if (error)
			goto error0;
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_decrement(tcur, level, &i);
		if (error)
			goto error1;

		/* Update the parent high keys of the left block, if needed. */
		error = xfs_btree_update_keys(tcur, level);
		if (error)
			goto error1;

		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	}

	/* Update the parent keys of the right block. */
	error = xfs_btree_update_keys(cur, level);
	if (error)
		goto error0;

	/* Slide the cursor value left one. */
	cur->bc_levels[level].ptr--;

	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;

error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}
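/*
 * Illustrative sketch (not built): the "slide the contents of right down one
 * entry" step of xfs_btree_lshift() above, modelled on a flat array.  Once
 * the first right-hand record has been copied to the left block, the
 * remaining rrecs records each move down one slot, which for a plain array
 * is just an overlapping copy:
 */
#if 0
	memmove(&recs[0], &recs[1], rrecs * sizeof(recs[0]));
#endif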
2611 */ 2612 STATIC int /* error */ 2613 xfs_btree_rshift( 2614 struct xfs_btree_cur *cur, 2615 int level, 2616 int *stat) /* success/failure */ 2617 { 2618 struct xfs_buf *lbp; /* left buffer pointer */ 2619 struct xfs_btree_block *left; /* left btree block */ 2620 struct xfs_buf *rbp; /* right buffer pointer */ 2621 struct xfs_btree_block *right; /* right btree block */ 2622 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2623 union xfs_btree_ptr rptr; /* right block pointer */ 2624 union xfs_btree_key *rkp; /* right btree key */ 2625 int rrecs; /* right record count */ 2626 int lrecs; /* left record count */ 2627 int error; /* error return value */ 2628 int i; /* loop counter */ 2629 2630 if (xfs_btree_at_iroot(cur, level)) 2631 goto out0; 2632 2633 /* Set up variables for this block as "left". */ 2634 left = xfs_btree_get_block(cur, level, &lbp); 2635 2636 #ifdef DEBUG 2637 error = xfs_btree_check_block(cur, left, level, lbp); 2638 if (error) 2639 goto error0; 2640 #endif 2641 2642 /* If we've got no right sibling then we can't shift an entry right. */ 2643 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2644 if (xfs_btree_ptr_is_null(cur, &rptr)) 2645 goto out0; 2646 2647 /* 2648 * If the cursor entry is the one that would be moved, don't 2649 * do it... it's too complicated. 2650 */ 2651 lrecs = xfs_btree_get_numrecs(left); 2652 if (cur->bc_levels[level].ptr >= lrecs) 2653 goto out0; 2654 2655 /* Set up the right neighbor as "right". */ 2656 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2657 if (error) 2658 goto error0; 2659 2660 /* If it's full, it can't take another entry. */ 2661 rrecs = xfs_btree_get_numrecs(right); 2662 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2663 goto out0; 2664 2665 XFS_BTREE_STATS_INC(cur, rshift); 2666 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2667 2668 /* 2669 * Make a hole at the start of the right neighbor block, then 2670 * copy the last left block entry to the hole. 2671 */ 2672 if (level > 0) { 2673 /* It's a nonleaf. make a hole in the keys and ptrs */ 2674 union xfs_btree_key *lkp; 2675 union xfs_btree_ptr *lpp; 2676 union xfs_btree_ptr *rpp; 2677 2678 lkp = xfs_btree_key_addr(cur, lrecs, left); 2679 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2680 rkp = xfs_btree_key_addr(cur, 1, right); 2681 rpp = xfs_btree_ptr_addr(cur, 1, right); 2682 2683 for (i = rrecs - 1; i >= 0; i--) { 2684 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2685 if (error) 2686 goto error0; 2687 } 2688 2689 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2690 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2691 2692 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2693 if (error) 2694 goto error0; 2695 2696 /* Now put the new data in, and log it. */ 2697 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2698 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2699 2700 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2701 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2702 2703 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2704 xfs_btree_key_addr(cur, 2, right))); 2705 } else { 2706 /* It's a leaf. make a hole in the records */ 2707 union xfs_btree_rec *lrp; 2708 union xfs_btree_rec *rrp; 2709 2710 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2711 rrp = xfs_btree_rec_addr(cur, 1, right); 2712 2713 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2714 2715 /* Now put the new data in, and log it. 
		xfs_btree_copy_recs(cur, rrp, lrp, 1);
		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
	}

	/*
	 * Decrement and log left's numrecs, bump and log right's numrecs.
	 */
	xfs_btree_set_numrecs(left, --lrecs);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);

	xfs_btree_set_numrecs(right, ++rrecs);
	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);

	/*
	 * Using a temporary cursor, update the parent key values of the
	 * block on the right.
	 */
	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		goto error0;
	i = xfs_btree_lastrec(tcur, level);
	if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}

	error = xfs_btree_increment(tcur, level, &i);
	if (error)
		goto error1;

	/* Update the parent high keys of the left block, if needed. */
	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error1;
	}

	/* Update the parent keys of the right block. */
	error = xfs_btree_update_keys(tcur, level);
	if (error)
		goto error1;

	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;

error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}

static inline int
xfs_btree_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*hint_block,
	union xfs_btree_ptr		*new_block,
	int				*stat)
{
	int				error;

	/*
	 * Don't allow block allocation for a staging cursor, because staging
	 * cursors do not support regular btree modifications.
	 *
	 * Bulk loading uses a separate callback to obtain new blocks from a
	 * preallocated list, which prevents ENOSPC failures during loading.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
	trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
	return error;
}

/*
 * Split cur/level block in half.
 * Return new block number and the key to its first
 * record (to be inserted into parent).
 */
STATIC int					/* error */
__xfs_btree_split(
	struct xfs_btree_cur	*cur,
	int			level,
	union xfs_btree_ptr	*ptrp,
	union xfs_btree_key	*key,
	struct xfs_btree_cur	**curp,
	int			*stat)	/* success/failure */
{
	union xfs_btree_ptr	lptr;	/* left sibling block ptr */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	union xfs_btree_ptr	rptr;	/* right sibling block ptr */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	union xfs_btree_ptr	rrptr;	/* right-right sibling ptr */
	struct xfs_buf		*rrbp;	/* right-right buffer pointer */
	struct xfs_btree_block	*rrblock; /* right-right btree block */
	int			lrecs;
	int			rrecs;
	int			src_index;
	int			error;	/* error return value */
	int			i;

	XFS_BTREE_STATS_INC(cur, split);

	/* Set up left block (current one). */
	left = xfs_btree_get_block(cur, level, &lbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, left, level, lbp);
	if (error)
		goto error0;
#endif

	xfs_btree_buf_to_ptr(cur, lbp, &lptr);

	/* Allocate the new block.  If we can't do it, we're toast.  Give up. */
	error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block as "right". */
	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
	if (error)
		goto error0;

	/* Fill in the btree header for the new right block. */
	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);

	/*
	 * Split the entries between the old and the new block evenly.
	 * If there's an odd number of entries, give the extra one to the
	 * side away from the cursor, so that after the pending insert the
	 * two blocks end up with the same number of entries.
	 */
	lrecs = xfs_btree_get_numrecs(left);
	rrecs = lrecs / 2;
	if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
		rrecs++;
	src_index = (lrecs - rrecs + 1);

	XFS_BTREE_STATS_ADD(cur, moves, rrecs);

	/* Adjust numrecs for the later get_*_keys() calls. */
	lrecs -= rrecs;
	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);

	/*
	 * Copy btree block entries from the left block over to the
	 * new block, the right.  Update the right block and log the
	 * changes.
	 */
	if (level > 0) {
		/* It's a non-leaf.  Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, src_index, left);
		lpp = xfs_btree_ptr_addr(cur, src_index, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		for (i = src_index; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
			if (error)
				goto error0;
		}

		/* Copy the keys & pointers to the new block. */
		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);

		/* Stash the keys of the new block for later insertion. */
		xfs_btree_get_node_keys(cur, right, key);
	} else {
		/* It's a leaf.  Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, src_index, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		/* Copy records to the new block. */
		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);

		/* Stash the keys of the new block for later insertion. */
		xfs_btree_get_leaf_keys(cur, right, key);
	}

	/*
	 * Find the left block number by looking in the buffer.
	 * Adjust sibling pointers.
	 */
	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);

	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/*
	 * If there's a block to the new block's right, make that block
	 * point back to right instead of to left.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
		error = xfs_btree_read_buf_block(cur, &rrptr,
						 0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}

	/* Update the parent high keys of the left block, if needed. */
	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If the cursor is really in the right block, move it there.
	 * If it's just pointing past the last entry in left, then we'll
	 * insert there, so don't change anything in that case.
	 */
	if (cur->bc_levels[level].ptr > lrecs + 1) {
		xfs_btree_setbuf(cur, level, rbp);
		cur->bc_levels[level].ptr -= lrecs;
	}
	/*
	 * If there are more levels, we'll need another cursor which refers
	 * to the right block, no matter where this cursor was.
	 */
	if (level + 1 < cur->bc_nlevels) {
		error = xfs_btree_dup_cursor(cur, curp);
		if (error)
			goto error0;
		(*curp)->bc_levels[level + 1].ptr++;
	}
	*ptrp = rptr;
	*stat = 1;
	return 0;
out0:
	*stat = 0;
	return 0;

error0:
	return error;
}

#ifdef __KERNEL__
struct xfs_btree_split_args {
	struct xfs_btree_cur	*cur;
	int			level;
	union xfs_btree_ptr	*ptrp;
	union xfs_btree_key	*key;
	struct xfs_btree_cur	**curp;
	int			*stat;		/* success/failure */
	int			result;
	bool			kswapd;	/* allocation in kswapd context */
	struct completion	*done;
	struct work_struct	work;
};

/*
 * Stack switching interfaces for allocation
 */
static void
xfs_btree_split_worker(
	struct work_struct	*work)
{
	struct xfs_btree_split_args	*args = container_of(work,
						struct xfs_btree_split_args, work);
	unsigned long		pflags;
	unsigned long		new_pflags = 0;

	/*
	 * We are in a transaction context here, but may also be doing work
	 * in kswapd context, and hence we may need to inherit that state
	 * temporarily to ensure that we don't block waiting for memory
	 * reclaim in any way.
	 */
	if (args->kswapd)
		new_pflags |= PF_MEMALLOC | PF_KSWAPD;

	current_set_flags_nested(&pflags, new_pflags);
	xfs_trans_set_context(args->cur->bc_tp);

	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
					 args->key, args->curp, args->stat);

	xfs_trans_clear_context(args->cur->bc_tp);
	current_restore_flags_nested(&pflags, new_pflags);

	/*
	 * Do not access args after complete() has run here.  We don't own
	 * args and the owner may run and free args before we return here.
	 */
	complete(args->done);
}

/*
 * BMBT split requests often come in with little stack to work on so we push
 * them off to a worker thread so there is lots of stack to use.  For the
 * other btree types, just call directly to avoid the context switch overhead
 * here.
 *
 * Care must be taken here - the work queue rescuer thread introduces
 * potential AGF <> worker queue deadlocks if the BMBT block allocation has
 * to lock new AGFs to allocate blocks.  A task being run by the rescuer
 * could attempt to lock an AGF that is already locked by a task queued to
 * run by the rescuer, resulting in an ABBA deadlock, as the rescuer cannot
 * run the queued lock holder until the task it is currently running gains
 * the lock.
 *
 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF
 * already locked to allocate from.  The only place that doesn't hold an AGF
 * locked is unwritten extent conversion at IO completion, but that has
 * already been offloaded to a worker thread and hence has no stack
 * consumption issues we have to worry about.
 */
STATIC int					/* error */
xfs_btree_split(
	struct xfs_btree_cur	*cur,
	int			level,
	union xfs_btree_ptr	*ptrp,
	union xfs_btree_key	*key,
	struct xfs_btree_cur	**curp,
	int			*stat)	/* success/failure */
{
	struct xfs_btree_split_args	args;
	DECLARE_COMPLETION_ONSTACK(done);

	if (!xfs_btree_is_bmap(cur->bc_ops) ||
	    cur->bc_tp->t_highest_agno == NULLAGNUMBER)
		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);

	args.cur = cur;
	args.level = level;
	args.ptrp = ptrp;
	args.key = key;
	args.curp = curp;
	args.stat = stat;
	args.done = &done;
	args.kswapd = current_is_kswapd();
	INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
	queue_work(xfs_alloc_wq, &args.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&args.work);
	return args.result;
}
#else
#define xfs_btree_split	__xfs_btree_split
#endif /* __KERNEL__ */

/*
 * Copy the old inode root contents into a real block and make the
 * broot point to it.
 */
int						/* error */
xfs_btree_new_iroot(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			*logflags,	/* logging flags for inode */
	int			*stat)		/* return status - 0 fail */
{
	struct xfs_buf		*cbp;		/* buffer for cblock */
	struct xfs_btree_block	*block;		/* btree block */
	struct xfs_btree_block	*cblock;	/* child btree block */
	union xfs_btree_key	*ckp;		/* child key pointer */
	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
	union xfs_btree_key	*kp;		/* pointer to btree key */
	union xfs_btree_ptr	*pp;		/* pointer to block addr */
	union xfs_btree_ptr	nptr;		/* new block addr */
	int			level;		/* btree level */
	int			error;		/* error return code */
	int			i;		/* loop counter */

	XFS_BTREE_STATS_INC(cur, newroot);

	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);

	level = cur->bc_nlevels - 1;

	block = xfs_btree_get_iroot(cur);
	pp = xfs_btree_ptr_addr(cur, 1, block);

	/* Allocate the new block.  If we can't do it, we're toast.  Give up. */
	error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		return 0;

	XFS_BTREE_STATS_INC(cur, alloc);

	/* Copy the root into a real block. */
	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
	if (error)
		goto error0;

	/*
	 * We can't just memcpy() the root in for CRC-enabled btree blocks.
	 * In that case we also have to ensure the blkno remains correct.
	 */
	memcpy(cblock, block, xfs_btree_block_len(cur));
	if (xfs_has_crc(cur->bc_mp)) {
		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));

		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
			cblock->bb_u.l.bb_blkno = bno;
		else
			cblock->bb_u.s.bb_blkno = bno;
	}

	be16_add_cpu(&block->bb_level, 1);
	xfs_btree_set_numrecs(block, 1);
	cur->bc_nlevels++;
	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
	cur->bc_levels[level + 1].ptr = 1;

	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));

	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
		if (error)
			goto error0;
	}

	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));

	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
	if (error)
		goto error0;

	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);

	xfs_iroot_realloc(cur->bc_ino.ip,
			  1 - xfs_btree_get_numrecs(cblock),
			  cur->bc_ino.whichfork);

	xfs_btree_setbuf(cur, level, cbp);

	/*
	 * Do all this logging at the end so that
	 * the root is at the right level.
	 */
	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));

	*logflags |=
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
	*stat = 1;
	return 0;
error0:
	return error;
}

static void
xfs_btree_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	if (cur->bc_flags & XFS_BTREE_STAGING) {
		/* Update the btree root information for a per-AG fake root. */
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
		cur->bc_ag.afake->af_levels += inc;
	} else {
		cur->bc_ops->set_root(cur, ptr, inc);
	}
}

/*
 * Allocate a new root block, fill it in.
 */
STATIC int					/* error */
xfs_btree_new_root(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* one half of the old root block */
	struct xfs_buf		*bp;	/* buffer containing block */
	int			error;	/* error return value */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	struct xfs_buf		*nbp;	/* new (root) buffer */
	struct xfs_btree_block	*new;	/* new (root) btree block */
	int			nptr;	/* new value for key index, 1 or 2 */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	union xfs_btree_ptr	rptr;
	union xfs_btree_ptr	lptr;

	XFS_BTREE_STATS_INC(cur, newroot);

	/* initialise our start point from the cursor */
	xfs_btree_init_ptr_from_cur(cur, &rptr);

	/* Allocate the new block.  If we can't do it, we're toast.  Give up. */
	error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block. */
	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
	if (error)
		goto error0;

	/* Set the root in the holding structure, increasing the level by 1. */
	xfs_btree_set_root(cur, &lptr, 1);

	/*
	 * At the previous root level there are now two blocks: the old root,
	 * and the new block generated when it was split.  We don't know which
	 * one the cursor is pointing at, so we set up variables "left" and
	 * "right" for each case.
	 */
	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
	if (error)
		goto error0;
#endif

	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/* Our block is left, pick up the right block. */
		lbp = bp;
		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
		left = block;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;
		bp = rbp;
		nptr = 1;
	} else {
		/* Our block is right, pick up the left block. */
		rbp = bp;
		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
		right = block;
		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;
		bp = lbp;
		nptr = 2;
	}

	/* Fill in the new block's btree header and log it. */
	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
	       !xfs_btree_ptr_is_null(cur, &rptr));

	/* Fill in the key data in the new root. */
	if (xfs_btree_get_level(left) > 0) {
		/*
		 * Get the keys for the left block's keys and put them directly
		 * in the parent block.  Do the same for the right block.
		 */
		xfs_btree_get_node_keys(cur, left,
				xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_node_keys(cur, right,
				xfs_btree_key_addr(cur, 2, new));
	} else {
		/*
		 * Get the keys for the left block's records and put them
		 * directly in the parent block.  Do the same for the right
		 * block.
		 */
		xfs_btree_get_leaf_keys(cur, left,
				xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_leaf_keys(cur, right,
				xfs_btree_key_addr(cur, 2, new));
	}
	xfs_btree_log_keys(cur, nbp, 1, 2);

	/* Fill in the pointer data in the new root. */
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
	xfs_btree_log_ptrs(cur, nbp, 1, 2);

	/* Fix up the cursor. */
	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
	cur->bc_levels[cur->bc_nlevels].ptr = nptr;
	cur->bc_nlevels++;
	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
	*stat = 1;
	return 0;
error0:
	return error;
out0:
	*stat = 0;
	return 0;
}

STATIC int
xfs_btree_make_block_unfull(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* btree level */
	int			numrecs,/* # of recs in block */
	int			*oindex,/* old tree index */
	int			*index,	/* new tree index */
	union xfs_btree_ptr	*nptr,	/* new btree ptr */
	struct xfs_btree_cur	**ncur,	/* new btree cursor */
	union xfs_btree_key	*key,	/* key of new block */
	int			*stat)
{
	int			error = 0;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_inode *ip = cur->bc_ino.ip;

		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
			/* A root block that can be made bigger. */
			xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
			*stat = 1;
		} else {
			/* A root block that needs replacing */
			int	logflags = 0;

			error = xfs_btree_new_iroot(cur, &logflags, stat);
			if (error || *stat == 0)
				return error;

			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
		}

		return 0;
	}

	/* First, try shifting an entry to the right neighbor. */
	error = xfs_btree_rshift(cur, level, stat);
	if (error || *stat)
		return error;

	/* Next, try shifting an entry to the left neighbor. */
	error = xfs_btree_lshift(cur, level, stat);
	if (error)
		return error;

	if (*stat) {
		*oindex = *index = cur->bc_levels[level].ptr;
		return 0;
	}

	/*
	 * Next, try splitting the current block in half.
	 *
	 * If this works we have to re-set our variables because we
	 * could be in a different block now.
	 */
	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
	if (error || *stat == 0)
		return error;

	*index = cur->bc_levels[level].ptr;
	return 0;
}

/*
 * Insert one record/level.  Return information to the caller
 * allowing the next level up to proceed if necessary.
 */
STATIC int
xfs_btree_insrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level to insert record at */
	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
	union xfs_btree_rec	*rec,	/* record to insert */
	union xfs_btree_key	*key,	/* i/o: block key for ptrp */
	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer for block */
	union xfs_btree_ptr	nptr;	/* new block ptr */
	struct xfs_btree_cur	*ncur = NULL;	/* new btree cursor */
	union xfs_btree_key	nkey;	/* new block key */
	union xfs_btree_key	*lkey;
	int			optr;	/* old key/record index */
	int			ptr;	/* key/record index */
	int			numrecs;/* number of records */
	int			error;	/* error return value */
	int			i;
	xfs_daddr_t		old_bn;

	ncur = NULL;
	lkey = &nkey;

	/*
	 * If we have an external root pointer, and we've made it to the
	 * root level, allocate a new root block and we're done.
	 */
	if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
	    level >= cur->bc_nlevels) {
		error = xfs_btree_new_root(cur, stat);
		xfs_btree_set_ptr_null(cur, ptrp);

		return error;
	}

	/* If we're off the left edge, return failure. */
	ptr = cur->bc_levels[level].ptr;
	if (ptr == 0) {
		*stat = 0;
		return 0;
	}

	optr = ptr;

	XFS_BTREE_STATS_INC(cur, insrec);

	/* Get pointers to the btree buffer and block. */
	block = xfs_btree_get_block(cur, level, &bp);
	old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;

	/* Check that the new entry is being inserted in the right place. */
	if (ptr <= numrecs) {
		if (level == 0) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rec,
				xfs_btree_rec_addr(cur, ptr, block)));
		} else {
			ASSERT(cur->bc_ops->keys_inorder(cur, key,
				xfs_btree_key_addr(cur, ptr, block)));
		}
	}
#endif

	/*
	 * If the block is full, we can't insert the new entry until we
	 * make the block un-full.
	 */
	xfs_btree_set_ptr_null(cur, &nptr);
	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
		error = xfs_btree_make_block_unfull(cur, level, numrecs,
					&optr, &ptr, &nptr, &ncur, lkey, stat);
		if (error || *stat == 0)
			goto error0;
	}

	/*
	 * The current block may have changed if the block was
	 * previously full and we have just made space in it.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/*
	 * At this point we know there's room for our new entry in the block
	 * we're pointing at.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);

	if (level > 0) {
		/* It's a non-leaf.  Make a hole in the keys and ptrs. */
		union xfs_btree_key	*kp;
		union xfs_btree_ptr	*pp;

		kp = xfs_btree_key_addr(cur, ptr, block);
		pp = xfs_btree_ptr_addr(cur, ptr, block);

		for (i = numrecs - ptr; i >= 0; i--) {
			error = xfs_btree_debug_check_ptr(cur, pp, i, level);
			if (error)
				goto error0;
		}

		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);

		error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
		if (error)
			goto error0;

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_keys(cur, kp, key, 1);
		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
		numrecs++;
		xfs_btree_set_numrecs(block, numrecs);
		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
		xfs_btree_log_keys(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
				xfs_btree_key_addr(cur, ptr + 1, block)));
		}
#endif
	} else {
		/* It's a leaf.  Make a hole in the records. */
		union xfs_btree_rec	*rp;

		rp = xfs_btree_rec_addr(cur, ptr, block);

		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_recs(cur, rp, rec, 1);
		xfs_btree_set_numrecs(block, ++numrecs);
		xfs_btree_log_recs(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
				xfs_btree_rec_addr(cur, ptr + 1, block)));
		}
#endif
	}

	/* Log the new number of records in the btree header. */
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);

	/*
	 * If we just inserted into a new tree block, we have to
	 * recalculate nkey here because nkey is out of date.
	 *
	 * Otherwise we're just updating an existing block (having shoved
	 * some records into the new tree block), so use the regular key
	 * update mechanism.
	 */
	if (bp && xfs_buf_daddr(bp) != old_bn) {
		xfs_btree_get_keys(cur, block, lkey);
	} else if (xfs_btree_needs_key_update(cur, optr)) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * Return the new block number, if any.
	 * If there is one, give back a record value and a cursor too.
	 */
	*ptrp = nptr;
	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
		xfs_btree_copy_keys(cur, key, lkey, 1);
		*curp = ncur;
	}

	*stat = 1;
	return 0;

error0:
	if (ncur)
		xfs_btree_del_cursor(ncur, error);
	return error;
}

/*
 * Insert the record at the point referenced by cur.
 *
 * A multi-level split of the tree on insert will invalidate the original
 * cursor.  All callers of this function should assume that the cursor is
 * no longer valid and revalidate it.
 */
int
xfs_btree_insert(
	struct xfs_btree_cur	*cur,
	int			*stat)
{
	int			error;	/* error return value */
	int			i;	/* result value, 0 for failure */
	int			level;	/* current level number in btree */
	union xfs_btree_ptr	nptr;	/* new block number (split result) */
	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
	union xfs_btree_key	bkey;	/* key of block to insert */
	union xfs_btree_key	*key;
	union xfs_btree_rec	rec;	/* record to insert */

	level = 0;
	ncur = NULL;
	pcur = cur;
	key = &bkey;

	xfs_btree_set_ptr_null(cur, &nptr);

	/* Make a key out of the record data to be inserted, and save it. */
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);

	/*
	 * Loop going up the tree, starting at the leaf level.
	 * Stop when we don't get a split block, that must mean that
	 * the insert is finished with this level.
	 */
	do {
		/*
		 * Insert nrec/nptr into this level of the tree.
		 * Note if we fail, nptr will be null.
		 */
		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
				&ncur, &i);
		if (error) {
			if (pcur != cur)
				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
			goto error0;
		}

		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		level++;

		/*
		 * See if the cursor we just used should be freed.  We can't
		 * free the caller's cursor, but otherwise we should free it
		 * if ncur is a new cursor or we're about to be done.
		 */
3667 */ 3668 if (pcur != cur && 3669 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3670 /* Save the state from the cursor before we trash it */ 3671 if (cur->bc_ops->update_cursor && 3672 !(cur->bc_flags & XFS_BTREE_STAGING)) 3673 cur->bc_ops->update_cursor(pcur, cur); 3674 cur->bc_nlevels = pcur->bc_nlevels; 3675 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3676 } 3677 /* If we got a new cursor, switch to it. */ 3678 if (ncur) { 3679 pcur = ncur; 3680 ncur = NULL; 3681 } 3682 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3683 3684 *stat = i; 3685 return 0; 3686 error0: 3687 return error; 3688 } 3689 3690 /* 3691 * Try to merge a non-leaf block back into the inode root. 3692 * 3693 * Note: the killroot names comes from the fact that we're effectively 3694 * killing the old root block. But because we can't just delete the 3695 * inode we have to copy the single block it was pointing to into the 3696 * inode. 3697 */ 3698 STATIC int 3699 xfs_btree_kill_iroot( 3700 struct xfs_btree_cur *cur) 3701 { 3702 int whichfork = cur->bc_ino.whichfork; 3703 struct xfs_inode *ip = cur->bc_ino.ip; 3704 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 3705 struct xfs_btree_block *block; 3706 struct xfs_btree_block *cblock; 3707 union xfs_btree_key *kp; 3708 union xfs_btree_key *ckp; 3709 union xfs_btree_ptr *pp; 3710 union xfs_btree_ptr *cpp; 3711 struct xfs_buf *cbp; 3712 int level; 3713 int index; 3714 int numrecs; 3715 int error; 3716 #ifdef DEBUG 3717 union xfs_btree_ptr ptr; 3718 #endif 3719 int i; 3720 3721 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3722 ASSERT(cur->bc_nlevels > 1); 3723 3724 /* 3725 * Don't deal with the root block needs to be a leaf case. 3726 * We're just going to turn the thing back into extents anyway. 3727 */ 3728 level = cur->bc_nlevels - 1; 3729 if (level == 1) 3730 goto out0; 3731 3732 /* 3733 * Give up if the root has multiple children. 3734 */ 3735 block = xfs_btree_get_iroot(cur); 3736 if (xfs_btree_get_numrecs(block) != 1) 3737 goto out0; 3738 3739 cblock = xfs_btree_get_block(cur, level - 1, &cbp); 3740 numrecs = xfs_btree_get_numrecs(cblock); 3741 3742 /* 3743 * Only do this if the next level will fit. 3744 * Then the data must be copied up to the inode, 3745 * instead of freeing the root you free the next level. 
	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, killroot);

#ifdef DEBUG
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif

	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
	if (index) {
		xfs_iroot_realloc(cur->bc_ino.ip, index,
				  cur->bc_ino.whichfork);
		block = ifp->if_broot;
	}

	be16_add_cpu(&block->bb_numrecs, index);
	ASSERT(block->bb_numrecs == cblock->bb_numrecs);

	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, kp, ckp, numrecs);

	pp = xfs_btree_ptr_addr(cur, 1, block);
	cpp = xfs_btree_ptr_addr(cur, 1, cblock);

	for (i = 0; i < numrecs; i++) {
		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
		if (error)
			return error;
	}

	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);

	error = xfs_btree_free_block(cur, cbp);
	if (error)
		return error;

	cur->bc_levels[level - 1].bp = NULL;
	be16_add_cpu(&block->bb_level, -1);
	xfs_trans_log_inode(cur->bc_tp, ip,
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
	cur->bc_nlevels--;
out0:
	return 0;
}

/*
 * Kill the current root node, and replace it with its only child node.
 */
STATIC int
xfs_btree_kill_root(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	union xfs_btree_ptr	*newroot)
{
	int			error;

	XFS_BTREE_STATS_INC(cur, killroot);

	/*
	 * Update the root pointer, decreasing the level by 1 and then
	 * free the old root.
	 */
	xfs_btree_set_root(cur, newroot, -1);

	error = xfs_btree_free_block(cur, bp);
	if (error)
		return error;

	cur->bc_levels[level].bp = NULL;
	cur->bc_levels[level].ra = 0;
	cur->bc_nlevels--;

	return 0;
}

STATIC int
xfs_btree_dec_cursor(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)
{
	int			error;
	int			i;

	if (level > 0) {
		error = xfs_btree_decrement(cur, level, &i);
		if (error)
			return error;
	}

	*stat = 1;
	return 0;
}
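/*
 * Illustrative sketch (not part of the original file): how a caller might
 * consume the fail/done/go-on protocol of xfs_btree_delrec() below.  As
 * long as the routine answers "2", the deletion has emptied a block enough
 * that the entry pointing to it one level up must be removed as well, so
 * the loop repeats one level higher.
 */
#if 0
	for (level = 0, i = 2; i == 2; level++) {
		error = xfs_btree_delrec(cur, level, &i);
		if (error)
			return error;
	}
#endif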
3852 */ 3853 STATIC int /* error */ 3854 xfs_btree_delrec( 3855 struct xfs_btree_cur *cur, /* btree cursor */ 3856 int level, /* level removing record from */ 3857 int *stat) /* fail/done/go-on */ 3858 { 3859 struct xfs_btree_block *block; /* btree block */ 3860 union xfs_btree_ptr cptr; /* current block ptr */ 3861 struct xfs_buf *bp; /* buffer for block */ 3862 int error; /* error return value */ 3863 int i; /* loop counter */ 3864 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3865 struct xfs_buf *lbp; /* left buffer pointer */ 3866 struct xfs_btree_block *left; /* left btree block */ 3867 int lrecs = 0; /* left record count */ 3868 int ptr; /* key/record index */ 3869 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3870 struct xfs_buf *rbp; /* right buffer pointer */ 3871 struct xfs_btree_block *right; /* right btree block */ 3872 struct xfs_btree_block *rrblock; /* right-right btree block */ 3873 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3874 int rrecs = 0; /* right record count */ 3875 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3876 int numrecs; /* temporary numrec count */ 3877 3878 tcur = NULL; 3879 3880 /* Get the index of the entry being deleted, check for nothing there. */ 3881 ptr = cur->bc_levels[level].ptr; 3882 if (ptr == 0) { 3883 *stat = 0; 3884 return 0; 3885 } 3886 3887 /* Get the buffer & block containing the record or key/ptr. */ 3888 block = xfs_btree_get_block(cur, level, &bp); 3889 numrecs = xfs_btree_get_numrecs(block); 3890 3891 #ifdef DEBUG 3892 error = xfs_btree_check_block(cur, block, level, bp); 3893 if (error) 3894 goto error0; 3895 #endif 3896 3897 /* Fail if we're off the end of the block. */ 3898 if (ptr > numrecs) { 3899 *stat = 0; 3900 return 0; 3901 } 3902 3903 XFS_BTREE_STATS_INC(cur, delrec); 3904 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3905 3906 /* Excise the entries being deleted. */ 3907 if (level > 0) { 3908 /* It's a nonleaf. operate on keys and ptrs */ 3909 union xfs_btree_key *lkp; 3910 union xfs_btree_ptr *lpp; 3911 3912 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3913 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3914 3915 for (i = 0; i < numrecs - ptr; i++) { 3916 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 3917 if (error) 3918 goto error0; 3919 } 3920 3921 if (ptr < numrecs) { 3922 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3923 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3924 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3925 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3926 } 3927 } else { 3928 /* It's a leaf. operate on records */ 3929 if (ptr < numrecs) { 3930 xfs_btree_shift_recs(cur, 3931 xfs_btree_rec_addr(cur, ptr + 1, block), 3932 -1, numrecs - ptr); 3933 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3934 } 3935 } 3936 3937 /* 3938 * Decrement and log the number of entries in the block. 3939 */ 3940 xfs_btree_set_numrecs(block, --numrecs); 3941 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3942 3943 /* 3944 * We're at the root level. First, shrink the root block in-memory. 3945 * Try to get rid of the next level down. If we can't then there's 3946 * nothing left to do. 
	 */
	if (xfs_btree_at_iroot(cur, level)) {
		xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);

		error = xfs_btree_kill_iroot(cur);
		if (error)
			goto error0;

		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		*stat = 1;
		return 0;
	}

	/*
	 * If this is the root level, and there's only one entry left, and it's
	 * NOT the leaf level, then we can get rid of this level.
	 */
	if (level == cur->bc_nlevels - 1) {
		if (numrecs == 1 && level > 0) {
			union xfs_btree_ptr	*pp;
			/*
			 * pp is still set to the first pointer in the block.
			 * Make it the new root of the btree.
			 */
			pp = xfs_btree_ptr_addr(cur, 1, block);
			error = xfs_btree_kill_root(cur, bp, level, pp);
			if (error)
				goto error0;
		} else if (level > 0) {
			error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
		}
		*stat = 1;
		return 0;
	}

	/*
	 * If we deleted the leftmost entry in the block, update the
	 * key values above us in the tree.
	 */
	if (xfs_btree_needs_key_update(cur, ptr)) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If the number of records remaining in the block is at least
	 * the minimum, we're done.
	 */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		return 0;
	}

	/*
	 * Otherwise, we have to move some records around to keep the
	 * tree balanced. Look at the left and right sibling blocks to
	 * see if we can re-balance by moving only one record.
	 */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);

	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		/*
		 * One child of root, need to get a chance to copy its contents
		 * into the root and delete it. Can't go up to next level,
		 * there's nothing to delete there.
		 */
		if (xfs_btree_ptr_is_null(cur, &rptr) &&
		    xfs_btree_ptr_is_null(cur, &lptr) &&
		    level == cur->bc_nlevels - 2) {
			error = xfs_btree_kill_iroot(cur);
			if (!error)
				error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
			return 0;
		}
	}

	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
	       !xfs_btree_ptr_is_null(cur, &lptr));

	/*
	 * Duplicate the cursor so our btree manipulations here won't
	 * disrupt the next level up.
	 */
	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		goto error0;

	/*
	 * If there's a right sibling, see if it's ok to shift an entry
	 * out of it.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/*
		 * Move the temp cursor to the last entry in the next block.
		 * Actually any entry but the first would suffice.
		 */
		i = xfs_btree_lastrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_increment(tcur, level, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		i = xfs_btree_lastrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		/* Grab a pointer to the block. */
		right = xfs_btree_get_block(tcur, level, &rbp);
#ifdef DEBUG
		error = xfs_btree_check_block(tcur, right, level, rbp);
		if (error)
			goto error0;
#endif
		/* Grab the current block number, for future use. */
		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);

		/*
		 * If right block is full enough so that removing one entry
		 * won't make it too empty, and left-shifting an entry out
		 * of right to us works, we're done.
		 */
		if (xfs_btree_get_numrecs(right) - 1 >=
		    cur->bc_ops->get_minrecs(tcur, level)) {
			error = xfs_btree_lshift(tcur, level, &i);
			if (error)
				goto error0;
			if (i) {
				ASSERT(xfs_btree_get_numrecs(block) >=
				       cur->bc_ops->get_minrecs(tcur, level));

				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
				tcur = NULL;

				error = xfs_btree_dec_cursor(cur, level, stat);
				if (error)
					goto error0;
				return 0;
			}
		}

		/*
		 * Otherwise, grab the number of records in right for
		 * future reference, and fix up the temp cursor to point
		 * to our block again (last record).
		 */
		rrecs = xfs_btree_get_numrecs(right);
		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
			i = xfs_btree_firstrec(tcur, level);
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				error = -EFSCORRUPTED;
				goto error0;
			}

			error = xfs_btree_decrement(tcur, level, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}

	/*
	 * If there's a left sibling, see if it's ok to shift an entry
	 * out of it.
	 */
	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
		/*
		 * Move the temp cursor to the first entry in the
		 * previous block.
		 */
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_decrement(tcur, level, &i);
		if (error)
			goto error0;
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		/* Grab a pointer to the block. */
		left = xfs_btree_get_block(tcur, level, &lbp);
#ifdef DEBUG
		error = xfs_btree_check_block(cur, left, level, lbp);
		if (error)
			goto error0;
#endif
		/* Grab the current block number, for future use. */
		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);

		/*
		 * If left block is full enough so that removing one entry
		 * won't make it too empty, and right-shifting an entry out
		 * of left to us works, we're done.
		 */
		if (xfs_btree_get_numrecs(left) - 1 >=
		    cur->bc_ops->get_minrecs(tcur, level)) {
			error = xfs_btree_rshift(tcur, level, &i);
			if (error)
				goto error0;
			if (i) {
				ASSERT(xfs_btree_get_numrecs(block) >=
				       cur->bc_ops->get_minrecs(tcur, level));
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
				tcur = NULL;
				if (level == 0)
					cur->bc_levels[0].ptr++;

				*stat = 1;
				return 0;
			}
		}

		/*
		 * Otherwise, grab the number of records in left for
		 * future reference.
		 */
		lrecs = xfs_btree_get_numrecs(left);
	}

	/* Delete the temp cursor, we're done with it. */
	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	tcur = NULL;

	/* If here, we need to do a join to keep the tree balanced. */
	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));

	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
	    lrecs + xfs_btree_get_numrecs(block) <=
			cur->bc_ops->get_maxrecs(cur, level)) {
		/*
		 * Set "right" to be the starting block,
		 * "left" to be the left neighbor.
		 */
		rptr = cptr;
		right = block;
		rbp = bp;
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;

	/*
	 * If that won't work, see if we can join with the right neighbor block.
	 */
	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
		   rrecs + xfs_btree_get_numrecs(block) <=
			cur->bc_ops->get_maxrecs(cur, level)) {
		/*
		 * Set "left" to be the starting block,
		 * "right" to be the right neighbor.
		 */
		lptr = cptr;
		left = block;
		lbp = bp;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;

	/*
	 * Otherwise, we can't fix the imbalance.
	 * Just return. This is probably a logic error, but it's not fatal.
	 */
	} else {
		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		return 0;
	}

	rrecs = xfs_btree_get_numrecs(right);
	lrecs = xfs_btree_get_numrecs(left);

	/*
	 * We're now going to join "left" and "right" by moving all the stuff
	 * in "right" to "left" and deleting "right".
	 */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
	if (level > 0) {
		/* It's a non-leaf. Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		for (i = 1; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
			if (error)
				goto error0;
		}

		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);

		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
	} else {
		/* It's a leaf. Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
	}

	XFS_BTREE_STATS_INC(cur, join);

	/*
	 * Fix up the number of records and right block pointer in the
	 * surviving block, and log it.
	 */
	xfs_btree_set_numrecs(left, lrecs + rrecs);
	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/* If there is a right sibling, point it to the remaining block. */
	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}

	/* Free the deleted block. */
	error = xfs_btree_free_block(cur, rbp);
	if (error)
		goto error0;

	/*
	 * If we joined with the left neighbor, set the buffer in the
	 * cursor to the left block, and fix up the index.
	 */
	if (bp != lbp) {
		cur->bc_levels[level].bp = lbp;
		cur->bc_levels[level].ptr += lrecs;
		cur->bc_levels[level].ra = 0;
	}
	/*
	 * If we joined with the right neighbor and there's a level above
	 * us, increment the cursor at that level.
	 */
	else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
		 level + 1 < cur->bc_nlevels) {
		error = xfs_btree_increment(cur, level + 1, &i);
		if (error)
			goto error0;
	}

	/*
	 * Readjust the ptr at this level if it's not a leaf, since it's
	 * still pointing at the deletion point, which makes the cursor
	 * inconsistent. If this makes the ptr 0, the caller fixes it up.
	 * We can't use decrement because it would change the next level up.
	 */
	if (level > 0)
		cur->bc_levels[level].ptr--;

	/*
	 * We combined blocks, so we have to update the parent keys if the
	 * btree supports overlapped intervals. However,
	 * bc_levels[level + 1].ptr points to the old block so that the caller
	 * knows which record to delete. Therefore, the caller must be savvy
	 * enough to call updkeys for us if we return stat == 2. The other
	 * exit points from this function don't require deletions further up
	 * the tree, so they can call updkeys directly.
	 */

	/* Return value means the next level up has something to do. */
	*stat = 2;
	return 0;

error0:
	if (tcur)
		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}
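
/*
 * Editor's illustration (not part of the original source): a worked trace of
 * the three xfs_btree_delrec() outcomes, assuming minrecs = 2 for the level.
 * Deleting from a leaf holding 3 records leaves 2 >= minrecs, so the block
 * stays as-is and *stat = 1. Deleting from a leaf holding 2 records leaves
 * 1 < minrecs; if a sibling holds 3 or more records, one record is shifted
 * over via lshift/rshift and *stat = 1. If both siblings are also at
 * minrecs, the leaf is joined with a sibling, the sibling block is freed,
 * and *stat = 2 so that the caller repeats the process one level up to
 * remove the now-dead keyptr.
 */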

/*
 * Delete the record pointed to by cur.
 * The cursor refers to the place where the record was (could be inserted)
 * when the operation returns.
 */
int					/* error */
xfs_btree_delete(
	struct xfs_btree_cur	*cur,
	int			*stat)	/* success/failure */
{
	int			error;	/* error return value */
	int			level;
	int			i;
	bool			joined = false;

	/*
	 * Go up the tree, starting at leaf level.
	 *
	 * If 2 is returned then a join was done; go to the next level.
	 * Otherwise we are done.
	 */
	for (level = 0, i = 2; i == 2; level++) {
		error = xfs_btree_delrec(cur, level, &i);
		if (error)
			goto error0;
		if (i == 2)
			joined = true;
	}

	/*
	 * If we combined blocks as part of deleting the record, delrec won't
	 * have updated the parent high keys so we have to do that here.
	 */
	if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
		error = xfs_btree_updkeys_force(cur, 0);
		if (error)
			goto error0;
	}

	if (i == 0) {
		for (level = 1; level < cur->bc_nlevels; level++) {
			if (cur->bc_levels[level].ptr == 0) {
				error = xfs_btree_decrement(cur, level, &i);
				if (error)
					goto error0;
				break;
			}
		}
	}

	*stat = i;
	return 0;
error0:
	return error;
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_btree_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	union xfs_btree_rec	**recp,	/* output: btree record */
	int			*stat)	/* output: success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer pointer */
	int			ptr;	/* record number */
#ifdef DEBUG
	int			error;	/* error return value */
#endif

	ptr = cur->bc_levels[0].ptr;
	block = xfs_btree_get_block(cur, 0, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, 0, bp);
	if (error)
		return error;
#endif

	/*
	 * Off the right end or left end, return failure.
	 */
	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
		*stat = 0;
		return 0;
	}

	/*
	 * Point to the record and extract its data.
	 */
	*recp = xfs_btree_rec_addr(cur, ptr, block);
	*stat = 1;
	return 0;
}

/* Visit a block in a btree. */
STATIC int
xfs_btree_visit_block(
	struct xfs_btree_cur		*cur,
	int				level,
	xfs_btree_visit_blocks_fn	fn,
	void				*data)
{
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	union xfs_btree_ptr		rptr, bufptr;
	int				error;

	/* do right sibling readahead */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
	block = xfs_btree_get_block(cur, level, &bp);

	/* process the block */
	error = fn(cur, level, data);
	if (error)
		return error;

	/* now read rh sibling block for next iteration */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &rptr))
		return -ENOENT;

	/*
	 * We only visit blocks once in this walk, so we have to avoid the
	 * internal xfs_btree_lookup_get_block() optimisation where it will
	 * return the same block without checking if the right sibling points
	 * back to us and creates a cyclic reference in the btree.
	 */
	xfs_btree_buf_to_ptr(cur, bp, &bufptr);
	if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
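
/*
 * Editor's sketch (not part of the original source): a minimal
 * xfs_btree_visit_blocks_fn callback, following the same contract as
 * xfs_btree_count_blocks_helper() later in this file. The walk invokes it
 * once per block, from the root level down to the leaves, left to right
 * within each level; returning a nonzero error stops the walk. The helper
 * name and the per-level array are hypothetical; the caller would pass an
 * array with at least cur->bc_nlevels entries as @data.
 */
#if 0
static int
xfs_btree_count_level_helper(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	xfs_extlen_t		*blocks_per_level = data;

	/* tally this block against the level it sits on */
	blocks_per_level[level]++;

	return 0;
}

	/* usage:
	 * error = xfs_btree_visit_blocks(cur, xfs_btree_count_level_helper,
	 *		XFS_BTREE_VISIT_ALL, blocks_per_level);
	 */
#endif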

/* Visit every block in a btree. */
int
xfs_btree_visit_blocks(
	struct xfs_btree_cur		*cur,
	xfs_btree_visit_blocks_fn	fn,
	unsigned int			flags,
	void				*data)
{
	union xfs_btree_ptr		lptr;
	int				level;
	struct xfs_btree_block		*block = NULL;
	int				error = 0;

	xfs_btree_init_ptr_from_cur(cur, &lptr);

	/* for each level */
	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
		/* grab the left hand block */
		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
		if (error)
			return error;

		/* readahead the left most block for the next level down */
		if (level > 0) {
			union xfs_btree_ptr	*ptr;

			ptr = xfs_btree_ptr_addr(cur, 1, block);
			xfs_btree_readahead_ptr(cur, ptr, 1);

			/* save for the next iteration of the loop */
			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);

			if (!(flags & XFS_BTREE_VISIT_LEAVES))
				continue;
		} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
			continue;
		}

		/* for each buffer in the level */
		do {
			error = xfs_btree_visit_block(cur, level, fn, data);
		} while (!error);

		if (error != -ENOENT)
			return error;
	}

	return 0;
}

/*
 * Change the owner of a btree.
 *
 * The mechanism we use here is ordered buffer logging. Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer
 * in a full size btree as that may be more space than we can fit in the
 * log....
 *
 * We do the btree walk in the most optimal manner possible - we have sibling
 * pointers so we can just walk all the blocks on each level from left to
 * right in a single pass, and then move to the next level and do the same.
 * We can also do readahead on the sibling pointers to get IO moving more
 * quickly, though for slow disks this is unlikely to make much difference to
 * performance as the amount of CPU work we have to do before moving to the
 * next block is relatively small.
 *
 * For each btree block that we load, modify the owner appropriately, set the
 * buffer as an ordered buffer and log it appropriately. We need to ensure
 * that we mark the region we change dirty so that if the buffer is relogged
 * in a subsequent transaction the changes we make here as an ordered buffer
 * are correctly relogged in that transaction. If we are in recovery context,
 * then just queue the modified buffer as a delayed write buffer so the
 * transaction recovery completion writes the changes to disk.
 */
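
/*
 * Editor's sketch (not part of the original source): the two calling
 * contexts described above, assuming a hypothetical caller that already
 * holds the btree cursor and the new owner value. With a transaction
 * (cur->bc_tp != NULL), modified buffers are ordered-logged; without one
 * (log recovery), they are queued on a local delwri list that the caller
 * must submit itself.
 */
#if 0
	LIST_HEAD(buffer_list);

	/* transaction context: no delwri list needed */
	error = xfs_btree_change_owner(cur, new_owner, NULL);

	/* recovery context: writes reach disk via the delwri list */
	error = xfs_btree_change_owner(cur, new_owner, &buffer_list);
	if (!error)
		error = xfs_buf_delwri_submit(&buffer_list);
#endif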

struct xfs_btree_block_change_owner_info {
	uint64_t		new_owner;
	struct list_head	*buffer_list;
};

static int
xfs_btree_block_change_owner(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	struct xfs_btree_block_change_owner_info	*bbcoi = data;
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;

	/* modify the owner */
	block = xfs_btree_get_block(cur, level, &bp);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
			return 0;
		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
	} else {
		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
			return 0;
		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
	}

	/*
	 * If the block is a root block hosted in an inode, we might not have
	 * a buffer pointer here and we shouldn't attempt to log the change as
	 * the information is already held in the inode and discarded when the
	 * root block is formatted into the on-disk inode fork. We still
	 * change it, though, so everything is consistent in memory.
	 */
	if (!bp) {
		ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
		ASSERT(level == cur->bc_nlevels - 1);
		return 0;
	}

	if (cur->bc_tp) {
		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
			return -EAGAIN;
		}
	} else {
		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
	}

	return 0;
}

int
xfs_btree_change_owner(
	struct xfs_btree_cur	*cur,
	uint64_t		new_owner,
	struct list_head	*buffer_list)
{
	struct xfs_btree_block_change_owner_info	bbcoi;

	bbcoi.new_owner = new_owner;
	bbcoi.buffer_list = buffer_list;

	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
			XFS_BTREE_VISIT_ALL, &bbcoi);
}

/* Verify the v5 fields of a long-format btree block. */
xfs_failaddr_t
xfs_btree_fsblock_v5hdr_verify(
	struct xfs_buf		*bp,
	uint64_t		owner)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);

	if (!xfs_has_crc(mp))
		return __this_address;
	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
		return __this_address;
	if (owner != XFS_RMAP_OWN_UNKNOWN &&
	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
		return __this_address;
	return NULL;
}

/* Verify a long-format btree block. */
xfs_failaddr_t
xfs_btree_fsblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_fsblock_t		fsb;
	xfs_failaddr_t		fa;

	ASSERT(!xfs_buftarg_is_mem(bp->b_target));

	/* numrecs verification */
	if (be16_to_cpu(block->bb_numrecs) > max_recs)
		return __this_address;

	/* sibling pointer verification */
	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}

/* Verify an in-memory btree block. */
xfs_failaddr_t
xfs_btree_memblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buftarg	*btp = bp->b_target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	/* numrecs verification */
	if (be16_to_cpu(block->bb_numrecs) > max_recs)
		return __this_address;

	/* sibling pointer verification */
	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (fa)
		return fa;
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_rightsib);
	if (fa)
		return fa;

	return NULL;
}

/**
 * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
 *				       btree block
 *
 * @bp: buffer containing the btree block
 */
xfs_failaddr_t
xfs_btree_agblock_v5hdr_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;

	if (!xfs_has_crc(mp))
		return __this_address;
	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
		return __this_address;
	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag))
		return __this_address;
	return NULL;
}

/**
 * xfs_btree_agblock_verify() -- verify a short-format btree block
 *
 * @bp: buffer containing the btree block
 * @max_recs: maximum records allowed in this btree node
 */
xfs_failaddr_t
xfs_btree_agblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_agblock_t		agbno;
	xfs_failaddr_t		fa;

	ASSERT(!xfs_buftarg_is_mem(bp->b_target));

	/* numrecs verification */
	if (be16_to_cpu(block->bb_numrecs) > max_recs)
		return __this_address;

	/* sibling pointer verification */
	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}

/*
 * For the given limits on leaf and keyptr records per block, calculate the
 * height of the tree needed to index the number of leaf records.
 */
unsigned int
xfs_btree_compute_maxlevels(
	const unsigned int	*limits,
	unsigned long long	records)
{
	unsigned long long	level_blocks = howmany_64(records, limits[0]);
	unsigned int		height = 1;

	while (level_blocks > 1) {
		level_blocks = howmany_64(level_blocks, limits[1]);
		height++;
	}

	return height;
}
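
/*
 * Editor's worked example (not part of the original source), assuming
 * limits[0] = 500 records per leaf block and limits[1] = 250 keyptrs per
 * node block: indexing 1,000,000 records needs howmany_64(1000000, 500) =
 * 2000 leaf blocks; the loop then computes howmany_64(2000, 250) = 8 node
 * blocks and howmany_64(8, 250) = 1 root block, so the returned height is 3.
 */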

/*
 * For the given limits on leaf and keyptr records per block, calculate the
 * number of blocks needed to index the given number of leaf records.
 */
unsigned long long
xfs_btree_calc_size(
	const unsigned int	*limits,
	unsigned long long	records)
{
	unsigned long long	level_blocks = howmany_64(records, limits[0]);
	unsigned long long	blocks = level_blocks;

	while (level_blocks > 1) {
		level_blocks = howmany_64(level_blocks, limits[1]);
		blocks += level_blocks;
	}

	return blocks;
}

/*
 * Given a number of available blocks for the btree to consume with records
 * and pointers, calculate the height of the tree needed to index all the
 * records that space can hold based on the number of pointers each interior
 * node holds.
 *
 * We start by assuming a single level tree consumes a single block, then
 * track the number of blocks each node level consumes until we no longer
 * have space to store the next node level. At this point, we are indexing
 * all the leaf blocks in the space, and there's no more free space to split
 * the tree any further. That's our maximum btree height.
 */
unsigned int
xfs_btree_space_to_height(
	const unsigned int	*limits,
	unsigned long long	leaf_blocks)
{
	/*
	 * The root btree block can have fewer than minrecs pointers in it
	 * because the tree might not be big enough to require that amount of
	 * fanout. Hence it has a minimum size of 2 pointers, not limits[1].
	 */
	unsigned long long	node_blocks = 2;
	unsigned long long	blocks_left = leaf_blocks - 1;
	unsigned int		height = 1;

	if (leaf_blocks < 1)
		return 0;

	while (node_blocks < blocks_left) {
		blocks_left -= node_blocks;
		node_blocks *= limits[1];
		height++;
	}

	return height;
}
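
/*
 * Editor's worked example (not part of the original source), again assuming
 * limits[0] = 500 and limits[1] = 250: xfs_btree_calc_size() for 1,000,000
 * records sums 2000 + 8 + 1 = 2009 blocks. Running the inverse,
 * xfs_btree_space_to_height() with 2009 blocks starts at height 1 with
 * blocks_left = 2008; iteration one subtracts the 2-pointer root
 * (blocks_left = 2006, node_blocks = 500, height = 2) and iteration two
 * subtracts those nodes (blocks_left = 1506, node_blocks = 125000,
 * height = 3), at which point node_blocks exceeds blocks_left and the
 * remaining space is all leaf blocks.
 */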

/*
 * Query a regular btree for all records overlapping a given interval.
 * Start with a LE lookup of the key of low_rec and return all records
 * until we find a record with a key greater than the key of high_rec.
 */
STATIC int
xfs_btree_simple_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_rec		*recp;
	union xfs_btree_key		rec_key;
	int				stat;
	bool				firstrec = true;
	int				error;

	ASSERT(cur->bc_ops->init_high_key_from_rec);
	ASSERT(cur->bc_ops->diff_two_keys);

	/*
	 * Find the leftmost record. The btree cursor must be set
	 * to the low record used to generate low_key.
	 */
	stat = 0;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
	if (error)
		goto out;

	/* Nothing? See if there's anything to the right. */
	if (!stat) {
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			goto out;
	}

	while (stat) {
		/* Find the record. */
		error = xfs_btree_get_rec(cur, &recp, &stat);
		if (error || !stat)
			break;

		/* Skip if low_key > high_key(rec). */
		if (firstrec) {
			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
			firstrec = false;
			if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
				goto advloop;
		}

		/* Stop if low_key(rec) > high_key. */
		cur->bc_ops->init_key_from_rec(&rec_key, recp);
		if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
			break;

		/* Callback */
		error = fn(cur, recp, priv);
		if (error)
			break;

advloop:
		/* Move on to the next record. */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			break;
	}

out:
	return error;
}
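
/*
 * Editor's illustration (not part of the original source): why overlapping
 * btrees cannot use the simple scan above. Given records [10, 100], [24, 24]
 * and [30, 40] sorted by low key, a query for [25, 26] does an LE lookup
 * that lands on [24, 24] and then scans right, so it never revisits
 * [10, 100] even though that record overlaps the query. The walk below
 * fixes this by consulting the high keys stored in the internal nodes: any
 * subtree whose [low key, high key] range overlaps the query is descended,
 * however early its low key sorts in the tree.
 */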

/*
 * Query an overlapped interval btree for all records overlapping a given
 * interval. This function roughly follows the algorithm given in
 * "Interval Trees" of _Introduction to Algorithms_, which is section
 * 14.3 in the 2nd and 3rd editions.
 *
 * First, generate keys for the low and high records passed in.
 *
 * For any leaf node, generate the high and low keys for the record.
 * If the record keys overlap with the query low/high keys, pass the
 * record to the function iterator.
 *
 * For any internal node, compare the low and high keys of each
 * pointer against the query low/high keys. If there's an overlap,
 * follow the pointer.
 *
 * As an optimization, we stop scanning a block when we find a low key
 * that is greater than the query's high key.
 */
STATIC int
xfs_btree_overlapped_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_ptr		ptr;
	union xfs_btree_ptr		*pp;
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_hkey;
	union xfs_btree_key		*lkp;
	union xfs_btree_key		*hkp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	int				level;
	struct xfs_buf			*bp;
	int				i;
	int				error;

	/* Load the root of the btree. */
	level = cur->bc_nlevels - 1;
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
	if (error)
		return error;
	xfs_btree_get_block(cur, level, &bp);
	trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto out;
#endif
	cur->bc_levels[level].ptr = 1;

	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
pop_up:
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		if (level == 0) {
			/* Handle a leaf node. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);

			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
			cur->bc_ops->init_key_from_rec(&rec_key, recp);

			/*
			 * If (query's high key < record's low key), then there
			 * are no more interesting records in this block. Pop
			 * up to the leaf level to find more record blocks.
			 *
			 * If (record's high key >= query's low key) and
			 * (query's high key >= record's low key), then
			 * this record overlaps the query range; callback.
			 */
			if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
				goto pop_up;
			if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
				error = fn(cur, recp, priv);
				if (error)
					break;
			}
			cur->bc_levels[level].ptr++;
			continue;
		}

		/* Handle an internal node. */
		lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
		hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
				block);
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);

		/*
		 * If (query's high key < pointer's low key), then there are no
		 * more interesting keys in this block. Pop up one leaf level
		 * to continue looking for records.
		 *
		 * If (pointer's high key >= query's low key) and
		 * (query's high key >= pointer's low key), then
		 * this record overlaps the query range; follow pointer.
		 */
		if (xfs_btree_keycmp_lt(cur, high_key, lkp))
			goto pop_up;
		if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
			level--;
			error = xfs_btree_lookup_get_block(cur, level, pp,
					&block);
			if (error)
				goto out;
			xfs_btree_get_block(cur, level, &bp);
			trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
			error = xfs_btree_check_block(cur, block, level, bp);
			if (error)
				goto out;
#endif
			cur->bc_levels[level].ptr = 1;
			continue;
		}
		cur->bc_levels[level].ptr++;
	}

out:
	/*
	 * If we don't end this function with the cursor pointing at a record
	 * block, a subsequent non-error cursor deletion will not release
	 * node-level buffers, causing a buffer leak. This is quite possible
	 * with a zero-results range query, so release the buffers if we
	 * failed to return any results.
	 */
	if (cur->bc_levels[0].bp == NULL) {
		for (i = 0; i < cur->bc_nlevels; i++) {
			if (cur->bc_levels[i].bp) {
				xfs_trans_brelse(cur->bc_tp,
						cur->bc_levels[i].bp);
				cur->bc_levels[i].bp = NULL;
				cur->bc_levels[i].ptr = 0;
				cur->bc_levels[i].ra = 0;
			}
		}
	}

	return error;
}

static inline void
xfs_btree_key_from_irec(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*key,
	const union xfs_btree_irec	*irec)
{
	union xfs_btree_rec		rec;

	cur->bc_rec = *irec;
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);
}

/*
 * Query a btree for all records overlapping a given interval of keys. The
 * supplied function will be called with each record found; return one of the
 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
 * code. This function returns -ECANCELED, zero, or a negative error code.
 */
int
xfs_btree_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low_rec,
	const union xfs_btree_irec	*high_rec,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	/* Find the keys of both ends of the interval. */
	xfs_btree_key_from_irec(cur, &high_key, high_rec);
	xfs_btree_key_from_irec(cur, &low_key, low_rec);

	/* Enforce low key <= high key. */
	if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
		return -EINVAL;

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return xfs_btree_simple_query_range(cur, &low_key,
				&high_key, fn, priv);
	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
			fn, priv);
}
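
/*
 * Editor's sketch (not part of the original source): a minimal
 * xfs_btree_query_range_fn callback, following the same contract as
 * xfs_btree_has_records_helper() below. Returning 0 continues the scan;
 * returning -ECANCELED stops it early, and xfs_btree_query_range() passes
 * that value back to the caller. The struct and helper names here are
 * hypothetical.
 */
#if 0
struct xfs_btree_first_rec_info {
	union xfs_btree_rec	rec;	/* first overlapping record seen */
};

STATIC int
xfs_btree_get_first_rec_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_btree_first_rec_info	*info = priv;

	/* remember the first overlapping record, then stop the query */
	info->rec = *rec;
	return -ECANCELED;
}
#endif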

/* Query a btree for all records. */
int
xfs_btree_query_all(
	struct xfs_btree_cur		*cur,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	memset(&low_key, 0, sizeof(low_key));
	memset(&high_key, 0xFF, sizeof(high_key));

	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}

static int
xfs_btree_count_blocks_helper(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	xfs_extlen_t		*blocks = data;

	(*blocks)++;

	return 0;
}

/* Count the blocks in a btree and return the result in *blocks. */
int
xfs_btree_count_blocks(
	struct xfs_btree_cur	*cur,
	xfs_extlen_t		*blocks)
{
	*blocks = 0;
	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
			XFS_BTREE_VISIT_ALL, blocks);
}

/* Compare two btree pointers. */
int64_t
xfs_btree_diff_two_ptrs(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*a,
	const union xfs_btree_ptr	*b)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}

struct xfs_btree_has_records {
	/* Keys for the start and end of the range we want to know about. */
	union xfs_btree_key		start_key;
	union xfs_btree_key		end_key;

	/* Mask for key comparisons, if desired. */
	const union xfs_btree_key	*key_mask;

	/* Highest record key we've seen so far. */
	union xfs_btree_key		high_key;

	enum xbtree_recpacking		outcome;
};

STATIC int
xfs_btree_has_records_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_high_key;
	struct xfs_btree_has_records	*info = priv;
	enum xbtree_key_contig		key_contig;

	cur->bc_ops->init_key_from_rec(&rec_key, rec);

	if (info->outcome == XBTREE_RECPACKING_EMPTY) {
		info->outcome = XBTREE_RECPACKING_SPARSE;

		/*
		 * If the first record we find does not overlap the start key,
		 * then there is a hole at the start of the search range.
		 * Classify this as sparse and stop immediately.
		 */
		if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
				info->key_mask))
			return -ECANCELED;
	} else {
		/*
		 * If a subsequent record does not overlap with any record
		 * we've seen so far, there is a hole in the middle of the
		 * search range. Classify this as sparse and stop.
		 * If the keys overlap and this btree does not allow overlap,
		 * signal corruption.
		 */
		key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
					&rec_key, info->key_mask);
		if (key_contig == XBTREE_KEY_OVERLAP &&
				!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
			return -EFSCORRUPTED;
		if (key_contig == XBTREE_KEY_GAP)
			return -ECANCELED;
	}

	/*
	 * If high_key(rec) is larger than any other high key we've seen,
	 * remember it for later.
	 */
	cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
	if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
				info->key_mask))
		info->high_key = rec_high_key; /* struct copy */

	return 0;
}

/*
 * Scan part of the keyspace of a btree and tell us if that keyspace does not
 * map to any records; is fully mapped to records; or is partially mapped to
 * records. This is the btree record equivalent to determining if a file is
 * sparse.
 *
 * For most btree types, the record scan should use all available btree key
 * fields to compare the keys encountered. These callers should pass NULL for
 * @mask. However, some callers (e.g. scanning physical space in the rmapbt)
 * want to ignore some part of the btree record keyspace when performing the
 * comparison. These callers should pass in a union xfs_btree_key object with
 * the fields that *should* be a part of the comparison set to any nonzero
 * value, and the rest zeroed.
 */
int
xfs_btree_has_records(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low,
	const union xfs_btree_irec	*high,
	const union xfs_btree_key	*mask,
	enum xbtree_recpacking		*outcome)
{
	struct xfs_btree_has_records	info = {
		.outcome		= XBTREE_RECPACKING_EMPTY,
		.key_mask		= mask,
	};
	int				error;

	/* Not all btrees support this operation. */
	if (!cur->bc_ops->keys_contiguous) {
		ASSERT(0);
		return -EOPNOTSUPP;
	}

	xfs_btree_key_from_irec(cur, &info.start_key, low);
	xfs_btree_key_from_irec(cur, &info.end_key, high);

	error = xfs_btree_query_range(cur, low, high,
			xfs_btree_has_records_helper, &info);
	if (error == -ECANCELED)
		goto out;
	if (error)
		return error;

	if (info.outcome == XBTREE_RECPACKING_EMPTY)
		goto out;

	/*
	 * If the largest high_key(rec) we saw during the walk is greater than
	 * the end of the search range, classify this as full. Otherwise,
	 * there is a hole at the end of the search range.
	 */
	if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
			mask))
		info.outcome = XBTREE_RECPACKING_FULL;

out:
	*outcome = info.outcome;
	return 0;
}
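
/*
 * Editor's worked example (not part of the original source): querying the
 * keyspace [10, 20] of a non-overlapping btree. If no record touches the
 * range, the outcome is XBTREE_RECPACKING_EMPTY. Records covering [10, 15]
 * and [16, 20] are contiguous and reach the end of the range, so the outcome
 * is XBTREE_RECPACKING_FULL. Records covering [10, 12] and [14, 20] leave a
 * gap at key 13, so the helper above returns -ECANCELED and the outcome is
 * XBTREE_RECPACKING_SPARSE.
 */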

/* Are there more records in this btree? */
bool
xfs_btree_has_more_records(
	struct xfs_btree_cur	*cur)
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;

	block = xfs_btree_get_block(cur, 0, &bp);

	/* There are still records in this block. */
	if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
		return true;

	/* There are more record blocks. */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
	else
		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
}

/* Set up all the btree cursor caches. */
int __init
xfs_btree_init_cur_caches(void)
{
	int		error;

	error = xfs_allocbt_init_cur_cache();
	if (error)
		return error;
	error = xfs_inobt_init_cur_cache();
	if (error)
		goto err;
	error = xfs_bmbt_init_cur_cache();
	if (error)
		goto err;
	error = xfs_rmapbt_init_cur_cache();
	if (error)
		goto err;
	error = xfs_refcountbt_init_cur_cache();
	if (error)
		goto err;

	return 0;
err:
	xfs_btree_destroy_cur_caches();
	return error;
}

/* Destroy all the btree cursor caches, if they've been allocated. */
void
xfs_btree_destroy_cur_caches(void)
{
	xfs_allocbt_destroy_cur_cache();
	xfs_inobt_destroy_cur_cache();
	xfs_bmbt_destroy_cur_cache();
	xfs_rmapbt_destroy_cur_cache();
	xfs_refcountbt_destroy_cur_cache();
}

/* Move the btree cursor before the first record. */
int
xfs_btree_goto_left_edge(
	struct xfs_btree_cur	*cur)
{
	int			stat = 0;
	int			error;

	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
	if (error)
		return error;
	if (!stat)
		return 0;

	error = xfs_btree_decrement(cur, 0, &stat);
	if (error)
		return error;
	if (stat != 0) {
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return 0;
}