// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

/*
 * Btree magic numbers.
 */
uint32_t
xfs_btree_magic(
	struct xfs_mount		*mp,
	const struct xfs_btree_ops	*ops)
{
	int				idx = xfs_has_crc(mp) ? 1 : 0;
	__be32				magic = ops->buf_ops->magic[idx];

	/* Ensure we asked for crc for crc-only magics. */
	ASSERT(magic != 0);
	return be32_to_cpu(magic);
}
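/*
 * Illustrative sketch (not part of this file): a btree type's buffer ops
 * pair the non-CRC and CRC magic numbers in the order indexed above, so a
 * verifier definition along these lines makes xfs_btree_magic() return the
 * V4 magic on non-CRC filesystems and the V5 magic on CRC-enabled ones:
 *
 *	.magic = { cpu_to_be32(XFS_ABTB_MAGIC),
 *		   cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
 *
 * A CRC-only btree type leaves slot 0 zero, which the ASSERT above catches
 * if such a tree is ever used on a non-CRC filesystem.
 */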
/*
 * These sibling pointer checks are optimised for null sibling pointers. This
 * happens a lot, and we don't need to byte swap at runtime if the sibling
 * pointer is NULL.
 *
 * These are explicitly marked as inline because the cost of calling them as
 * functions instead of inlining them is about 36 bytes extra code per call site
 * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
 * two sibling check functions reduces the compiled code size by over 300
 * bytes.
 */
static inline xfs_failaddr_t
xfs_btree_check_fsblock_siblings(
	struct xfs_mount	*mp,
	xfs_fsblock_t		fsb,
	__be64			dsibling)
{
	xfs_fsblock_t		sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == fsb)
		return __this_address;
	if (!xfs_verify_fsbno(mp, sibling))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_memblock_siblings(
	struct xfs_buftarg	*btp,
	xfbno_t			bno,
	__be64			dsibling)
{
	xfbno_t			sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == bno)
		return __this_address;
	if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_agblock_siblings(
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	__be32			dsibling)
{
	xfs_agblock_t		sibling;

	if (dsibling == cpu_to_be32(NULLAGBLOCK))
		return NULL;

	sibling = be32_to_cpu(dsibling);
	if (sibling == agbno)
		return __this_address;
	if (!xfs_verify_agbno(pag, sibling))
		return __this_address;
	return NULL;
}

static xfs_failaddr_t
__xfs_btree_check_lblock_hdr(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.l.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	return NULL;
}

/*
 * Check a long btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_fsblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;
	xfs_fsblock_t		fsb;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	/*
	 * For inode-rooted btrees, the root block sits in the inode fork.  In
	 * that case bp is NULL, and the block must not have any siblings.
	 */
	if (!bp) {
		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		return NULL;
	}

	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}

/*
 * Check an in-memory btree block header.  Return the address of the failing
 * check, or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_memblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_memblock_siblings(btp, bno,
				block->bb_u.l.bb_rightsib);
	return fa;
}
/*
 * Check a short btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_agblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_failaddr_t		fa;
	xfs_agblock_t		agbno;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}

/*
 * Internal btree block check.
 *
 * Return NULL if the block is ok or the address of the failed check otherwise.
 */
xfs_failaddr_t
__xfs_btree_check_block(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return __xfs_btree_check_memblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_AG:
		return __xfs_btree_check_agblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_INODE:
		return __xfs_btree_check_fsblock(cur, block, level, bp);
	default:
		ASSERT(0);
		return __this_address;
	}
}

static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
		return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
	return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}

/*
 * Debug routine: check that block header is ok.
 */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block, if any */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_block(cur, block, level, bp);
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	return 0;
}

int
__xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	if (level <= 0)
		return -EFSCORRUPTED;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_INODE:
		if (!xfs_verify_fsbno(cur->bc_mp,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_AG:
		if (!xfs_verify_agbno(to_perag(cur->bc_group),
				be32_to_cpu((&ptr->s)[index])))
			return -EFSCORRUPTED;
		break;
	}

	return 0;
}

/*
 * Check that a given (indexed) btree pointer at a certain level of a
 * btree is valid and doesn't point past where it should.
 */
static int
xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	int				error;

	error = __xfs_btree_check_ptr(cur, ptr, index, level);
	if (error) {
		switch (cur->bc_ops->type) {
		case XFS_BTREE_TYPE_MEM:
			xfs_err(cur->bc_mp,
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
				cur->bc_ops->name, cur->bc_flags, level, index,
				__this_address);
			break;
		case XFS_BTREE_TYPE_INODE:
			xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_ino.ip->i_ino,
				cur->bc_ino.whichfork, cur->bc_ops->name,
				level, index);
			break;
		case XFS_BTREE_TYPE_AG:
			xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_group->xg_gno, cur->bc_ops->name,
				level, index);
			break;
		}
		xfs_btree_mark_sick(cur);
	}

	return error;
}

#ifdef DEBUG
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * long-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_fsblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}

bool
xfs_btree_fsblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
	}

	return true;
}

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * short-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_agblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}

bool
xfs_btree_agblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
	}

	return true;
}
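/*
 * Illustrative note: the btree buffer verifiers are the expected callers of
 * the CRC helpers above.  A read verifier checks the CRC before looking at
 * the block structure, while a write verifier re-stamps the CRC last, after
 * the LSN has been copied in from the buf log item; roughly:
 *
 *	read:  xfs_btree_agblock_verify_crc(bp), then structure checks
 *	write: structure checks, then xfs_btree_agblock_calc_crc(bp)
 *
 * (and the fsblock variants for long-format btrees).
 */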
static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	trace_xfs_btree_free_block(cur, bp);

	/*
	 * Don't allow block freeing for a staging cursor, because staging
	 * cursors do not support regular btree modifications.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}

/*
 * Delete the btree cursor.
 */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			error)		/* del because of error */
{
	int			i;		/* btree level */

	/*
	 * Clear the buffer pointers and release the buffers.  If we're doing
	 * this because of an error, inspect all of the entries in the
	 * bc_levels array for buffers to be unlocked.  This is because some
	 * of the btree code works from level n down to 0, and if we get an
	 * error along the way we won't have initialized all the entries down
	 * to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_levels[i].bp)
			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
		else if (!error)
			break;
	}

	/*
	 * If we are doing a BMBT update, the number of unaccounted blocks
	 * allocated during this cursor lifetime should be zero.  If it's not
	 * zero, then we should be shut down or on our way to shutdown due to
	 * cancelling a dirty transaction on error.
	 */
	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
	       xfs_is_shutdown(cur->bc_mp) || error != 0);

	if (cur->bc_group)
		xfs_group_put(cur->bc_group);
	kmem_cache_free(cur->bc_cache, cur);
}

/* Return the buffer target for this btree's buffer. */
static inline struct xfs_buftarg *
xfs_btree_buftarg(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return cur->bc_mem.xfbtree->target;
	return cur->bc_mp->m_ddev_targp;
}

/* Return the block size (in units of 512b sectors) for this btree. */
static inline unsigned int
xfs_btree_bbsize(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return XFBNO_BBSIZE;
	return cur->bc_mp->m_bsize;
}

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 */
int						/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,		/* input cursor */
	struct xfs_btree_cur	**ncur)		/* output cursor */
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_trans	*tp = cur->bc_tp;
	struct xfs_buf		*bp;
	struct xfs_btree_cur	*new;
	int			error;
	int			i;

	/*
	 * Don't allow staging cursors to be duplicated because they're
	 * supposed to be kept private to a single thread.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level current, re-get the buffer and copy the ptr value.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
		new->bc_levels[i].ra = cur->bc_levels[i].ra;
		bp = cur->bc_levels[i].bp;
		if (bp) {
			error = xfs_trans_read_buf(mp, tp,
					xfs_btree_buftarg(cur),
					xfs_buf_daddr(bp),
					xfs_btree_bbsize(cur), 0, &bp,
					cur->bc_ops->buf_ops);
			if (xfs_metadata_is_sick(error))
				xfs_btree_mark_sick(new);
			if (error) {
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_levels[i].bp = bp;
	}
	*ncur = new;
	return 0;
}
/*
 * XFS btree block layout and addressing:
 *
 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 *
 * A leaf block starts with a header, followed by records containing the
 * values.  A non-leaf block also starts with the same header, and then first
 * contains lookup keys followed by an equal number of pointers to the btree
 * blocks at the previous level.
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 * The header is called struct xfs_btree_block for reasons better left unknown
 * and comes in different versions for short (32bit) and long (64bit) block
 * pointers.  The record and key structures are defined by the btree instances
 * and opaque to the btree core.  The block pointers are simple disk endian
 * integers, available in a short (32bit) and long (64bit) variant.
 *
 * The helpers below calculate the offset of a given record, key or pointer
 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 * inside the btree block is done using indices starting at one, not zero!
 *
 * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
 * overlapping intervals.  In such a tree, records are still sorted lowest to
 * highest and indexed by the smallest key value that refers to the record.
 * However, nodes are different: each pointer has two associated keys -- one
 * indexing the lowest key available in the block(s) below (the same behavior
 * as the key in a regular btree) and another indexing the highest key
 * available in the block(s) below.  Because records are /not/ sorted by the
 * highest key, all leaf block updates require us to compute the highest key
 * that matches any record in the leaf and to recursively update the high keys
 * in the nodes going further up in the tree, if necessary.  Nodes look like
 * this:
 *
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 *
 * To perform an interval query on an overlapped tree, perform the usual
 * depth-first search and use the low and high keys to decide if we can skip
 * that particular node.  If a leaf node is reached, return the records that
 * intersect the interval.  Note that an interval query may return numerous
 * entries.  For a non-overlapped tree, simply search for the record
 * associated with the lowest key and iterate forward until a non-matching
 * record is found.  Section 14.3 ("Interval Trees") of _Introduction to
 * Algorithms_ by Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only)
 * discusses this in more detail.
 *
 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 * reverse mapping records on a reflink filesystem:
 *
 * 1: +- file A startblock B offset C length D -----------+
 * 2:      +- file E startblock F offset G length H --------------+
 * 3:      +- file I startblock F offset J length K --+
 * 4:                                                        +- file L... --+
 *
 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 * we'd simply increment the length of record 1.  But how do we find the
 * record that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would
 * return record 3 because the keys are ordered first by startblock.  An
 * interval query would return records 1 and 2 because they both overlap
 * (B+D-1), and from that we can pick out record 1 as the appropriate left
 * neighbor.
 *
 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 * because a record's interval must end before the next record.
 */

/*
 * Return size of the btree block header for this btree instance.
 */
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (xfs_has_crc(cur->bc_mp))
			return XFS_BTREE_LBLOCK_CRC_LEN;
		return XFS_BTREE_LBLOCK_LEN;
	}
	if (xfs_has_crc(cur->bc_mp))
		return XFS_BTREE_SBLOCK_CRC_LEN;
	return XFS_BTREE_SBLOCK_LEN;
}

/*
 * Calculate offset of the n-th record in a btree block.
 */
STATIC size_t
xfs_btree_rec_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->rec_len;
}

/*
 * Calculate offset of the n-th key in a btree block.
 */
STATIC size_t
xfs_btree_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len;
}

/*
 * Calculate offset of the n-th high key in a btree block.
 */
STATIC size_t
xfs_btree_high_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
}

/*
 * Calculate offset of the n-th block pointer in a btree block.
 */
STATIC size_t
xfs_btree_ptr_offset(
	struct xfs_btree_cur	*cur,
	int			n,
	int			level)
{
	return xfs_btree_block_len(cur) +
		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
		(n - 1) * cur->bc_ops->ptr_len;
}
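/*
 * Worked example (illustrative numbers): the pointer area in a node starts
 * after space for the *maximal* number of keys, not the bb_numrecs currently
 * in use, so ptr 1 sits at a fixed offset for a given level.  With a 64 byte
 * header, 8 byte keys and get_maxrecs() == 100, xfs_btree_ptr_offset() puts
 * ptr 1 at byte 64 + 100 * 8 = 864 regardless of how full the block is.
 */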
/*
 * Return a pointer to the n-th record in the btree block.
 */
union xfs_btree_rec *
xfs_btree_rec_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_rec *)
		((char *)block + xfs_btree_rec_offset(cur, n));
}

/*
 * Return a pointer to the n-th key in the btree block.
 */
union xfs_btree_key *
xfs_btree_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th high key in the btree block.
 */
union xfs_btree_key *
xfs_btree_high_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_high_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th block pointer in the btree block.
 */
union xfs_btree_ptr *
xfs_btree_ptr_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	int			level = xfs_btree_get_level(block);

	ASSERT(block->bb_level != 0);

	return (union xfs_btree_ptr *)
		((char *)block + xfs_btree_ptr_offset(cur, n, level));
}

struct xfs_ifork *
xfs_btree_ifork_ptr(
	struct xfs_btree_cur	*cur)
{
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);

	if (cur->bc_flags & XFS_BTREE_STAGING)
		return cur->bc_ino.ifake->if_fork;
	return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
}

/*
 * Get the root block which is stored in the inode.
 *
 * For now this btree implementation assumes the btree root is always
 * stored in the if_broot field of an inode fork.
 */
STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
	struct xfs_btree_cur	*cur)
{
	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

	return (struct xfs_btree_block *)ifp->if_broot;
}

/*
 * Retrieve the block pointer from the cursor at the given level.
 * This may be an inode btree root or from a buffer.
 */
struct xfs_btree_block *		/* generic btree block pointer */
xfs_btree_get_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level in btree */
	struct xfs_buf		**bpp)	/* buffer containing the block */
{
	if (xfs_btree_at_iroot(cur, level)) {
		*bpp = NULL;
		return xfs_btree_get_iroot(cur);
	}

	*bpp = cur->bc_levels[level].bp;
	return XFS_BUF_TO_BLOCK(*bpp);
}
/*
 * Change the cursor to point to the first record at the given level.
 * Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_firstrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to 1, that's the first record/key.
	 */
	cur->bc_levels[level].ptr = 1;
	return 1;
}

/*
 * Change the cursor to point to the last record in the current block
 * at the given level.  Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_lastrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to numrecs, that's the last record/key.
	 */
	cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
	return 1;
}

/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 */
void
xfs_btree_offsets(
	uint32_t	fields,		/* bitmask of fields */
	const short	*offsets,	/* table of field offsets */
	int		nbits,		/* number of bits to inspect */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	int		i;		/* current bit number */
	uint32_t	imask;		/* mask for current bit number */

	ASSERT(fields != 0);
	/*
	 * Find the lowest bit, so the first byte offset.
	 */
	for (i = 0, imask = 1u; ; i++, imask <<= 1) {
		if (imask & fields) {
			*first = offsets[i];
			break;
		}
	}
	/*
	 * Find the highest bit, so the last byte offset.
	 */
	for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
		if (imask & fields) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}
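/*
 * Worked example (illustrative): given the short-form offset table used by
 * xfs_btree_log_block() below, fields == XFS_BB_LEVEL | XFS_BB_NUMRECS sets
 * *first to offsetof(struct xfs_btree_block, bb_level) and *last to
 * offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib) - 1, i.e. the byte
 * range covering exactly those two fields.  This is why each offset table
 * carries one extra trailing entry: the end offset of the final field.
 */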
STATIC int
xfs_btree_readahead_fsblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_memblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfbno_t			left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfbno_t			right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_agblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(pag, left), mp->m_bsize,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(pag, right), mp->m_bsize,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}
/*
 * Read-ahead btree blocks, at the given level.
 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 */
STATIC int
xfs_btree_readahead(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	int			lr)	/* left/right bits */
{
	struct xfs_btree_block	*block;

	/*
	 * No readahead needed if we are at the root level and the
	 * btree root is stored in the inode.
	 */
	if (xfs_btree_at_iroot(cur, lev))
		return 0;

	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
		return 0;

	cur->bc_levels[lev].ra |= lr;
	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		return xfs_btree_readahead_agblock(cur, lr, block);
	case XFS_BTREE_TYPE_INODE:
		return xfs_btree_readahead_fsblock(cur, lr, block);
	case XFS_BTREE_TYPE_MEM:
		return xfs_btree_readahead_memblock(cur, lr, block);
	default:
		ASSERT(0);
		return 0;
	}
}

STATIC int
xfs_btree_ptr_to_daddr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	xfs_daddr_t			*daddr)
{
	int				error;

	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
	if (error)
		return error;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		*daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
				be32_to_cpu(ptr->s));
		break;
	case XFS_BTREE_TYPE_INODE:
		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
		break;
	case XFS_BTREE_TYPE_MEM:
		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
		break;
	}
	return 0;
}

/*
 * Readahead @count btree blocks at the given @ptr location.
 *
 * We don't need to care about long or short form btrees here as we have a
 * method of converting the ptr directly to a daddr available to us.
 */
STATIC void
xfs_btree_readahead_ptr(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	xfs_extlen_t		count)
{
	xfs_daddr_t		daddr;

	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
		return;
	xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
			xfs_btree_bbsize(cur) * count,
			cur->bc_ops->buf_ops);
}
/*
 * Set the buffer for level "lev" in the cursor to bp, releasing
 * any previous buffer.
 */
STATIC void
xfs_btree_setbuf(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	struct xfs_buf		*bp)	/* new buffer to set */
{
	struct xfs_btree_block	*b;	/* btree block */

	if (cur->bc_levels[lev].bp)
		xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
	cur->bc_levels[lev].bp = bp;
	cur->bc_levels[lev].ra = 0;

	b = XFS_BUF_TO_BLOCK(bp);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	} else {
		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	}
}

bool
xfs_btree_ptr_is_null(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr->l == cpu_to_be64(NULLFSBLOCK);
	else
		return ptr->s == cpu_to_be32(NULLAGBLOCK);
}

void
xfs_btree_set_ptr_null(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(NULLFSBLOCK);
	else
		ptr->s = cpu_to_be32(NULLAGBLOCK);
}

static inline bool
xfs_btree_ptrs_equal(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr1,
	union xfs_btree_ptr	*ptr2)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr1->l == ptr2->l;
	return ptr1->s == ptr2->s;
}

/*
 * Get/set/init sibling pointers
 */
void
xfs_btree_get_sibling(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_ptr	*ptr,
	int			lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->l = block->bb_u.l.bb_rightsib;
		else
			ptr->l = block->bb_u.l.bb_leftsib;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->s = block->bb_u.s.bb_rightsib;
		else
			ptr->s = block->bb_u.s.bb_leftsib;
	}
}

void
xfs_btree_set_sibling(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_block		*block,
	const union xfs_btree_ptr	*ptr,
	int				lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.l.bb_rightsib = ptr->l;
		else
			block->bb_u.l.bb_leftsib = ptr->l;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.s.bb_rightsib = ptr->s;
		else
			block->bb_u.s.bb_leftsib = ptr->s;
	}
}

static void
__xfs_btree_init_block(
	struct xfs_mount		*mp,
	struct xfs_btree_block		*buf,
	const struct xfs_btree_ops	*ops,
	xfs_daddr_t			blkno,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	bool				crc = xfs_has_crc(mp);
	__u32				magic = xfs_btree_magic(mp, ops);

	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (crc) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (crc) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			/* owner is a 32 bit value on short blocks */
			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}
void
xfs_btree_init_block(
	struct xfs_mount		*mp,
	struct xfs_btree_block		*block,
	const struct xfs_btree_ops	*ops,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
			numrecs, owner);
}

void
xfs_btree_init_buf(
	struct xfs_mount		*mp,
	struct xfs_buf			*bp,
	const struct xfs_btree_ops	*ops,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
			xfs_buf_daddr(bp), level, numrecs, owner);
	bp->b_ops = ops->buf_ops;
}

static inline __u64
xfs_btree_owner(
	struct xfs_btree_cur	*cur)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return cur->bc_mem.xfbtree->owner;
	case XFS_BTREE_TYPE_INODE:
		return cur->bc_ino.ip->i_ino;
	case XFS_BTREE_TYPE_AG:
		return cur->bc_group->xg_gno;
	default:
		ASSERT(0);
		return 0;
	}
}

void
xfs_btree_init_block_cur(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	int			numrecs)
{
	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
			xfs_btree_owner(cur));
}

STATIC void
xfs_btree_buf_to_ptr(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	union xfs_btree_ptr	*ptr)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
					xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_INODE:
		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
					xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_MEM:
		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
		break;
	}
}

static inline void
xfs_btree_set_refs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}

int
xfs_btree_get_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	xfs_daddr_t			d;
	int				error;

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), 0, bpp);
	if (error)
		return error;

	(*bpp)->b_ops = cur->bc_ops->buf_ops;
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}
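/*
 * Note: xfs_btree_get_buf_block() above grabs a buffer without reading it
 * from disk, which is only safe for blocks the caller is about to initialise
 * itself; xfs_btree_read_buf_block() below is the variant that reads and
 * verifies an existing on-disk block.
 */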
/*
 * Read in the buffer at the given ptr and return the buffer and
 * the block pointer within the buffer.
 */
int
xfs_btree_read_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				flags,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;
	int			error;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), flags, bpp,
			cur->bc_ops->buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_btree_mark_sick(cur);
	if (error)
		return error;

	xfs_btree_set_refs(cur, *bpp);
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}

/*
 * Copy keys from one btree block to another.
 */
void
xfs_btree_copy_keys(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*dst_key,
	const union xfs_btree_key	*src_key,
	int				numkeys)
{
	ASSERT(numkeys >= 0);
	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
}

/*
 * Copy records from one btree block to another.
 */
STATIC void
xfs_btree_copy_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*dst_rec,
	union xfs_btree_rec	*src_rec,
	int			numrecs)
{
	ASSERT(numrecs >= 0);
	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Copy block pointers from one btree block to another.
 */
void
xfs_btree_copy_ptrs(
	struct xfs_btree_cur		*cur,
	union xfs_btree_ptr		*dst_ptr,
	const union xfs_btree_ptr	*src_ptr,
	int				numptrs)
{
	ASSERT(numptrs >= 0);
	memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}

/*
 * Shift keys one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_keys(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key,
	int			dir,
	int			numkeys)
{
	char			*dst_key;

	ASSERT(numkeys >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
}

/*
 * Shift records one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec,
	int			dir,
	int			numrecs)
{
	char			*dst_rec;

	ASSERT(numrecs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Shift block pointers one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_ptrs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			dir,
	int			numptrs)
{
	char			*dst_ptr;

	ASSERT(numptrs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
	memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}
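/*
 * Illustrative use of the shift helpers (hypothetical values): to open a
 * hole for a new entry at index i in a block holding n in-use keys, shift
 * keys i..n one slot to the right before writing the new key:
 *
 *	xfs_btree_shift_keys(cur, xfs_btree_key_addr(cur, i, block),
 *			     1, n - i + 1);
 *
 * dir == -1 closes such a hole again after a removal.
 */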
/*
 * Log key values from the btree block.
 */
STATIC void
xfs_btree_log_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{

	if (bp) {
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_key_offset(cur, first),
				xfs_btree_key_offset(cur, last + 1) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}

/*
 * Log record values from the btree block.
 */
void
xfs_btree_log_recs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{

	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(cur->bc_tp, bp,
			xfs_btree_rec_offset(cur, first),
			xfs_btree_rec_offset(cur, last + 1) - 1);

}
/*
 * Log block pointer fields from a btree block (nonleaf).
 */
STATIC void
xfs_btree_log_ptrs(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			first,	/* index of first pointer to log */
	int			last)	/* index of last pointer to log */
{

	if (bp) {
		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
		int			level = xfs_btree_get_level(block);

		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_ptr_offset(cur, first, level),
				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}

}

/*
 * Log fields from a btree block header.
 */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	uint32_t		fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
		XFS_BTREE_SBLOCK_CRC_LEN
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
		XFS_BTREE_LBLOCK_CRC_LEN
	};

	if (bp) {
		int nbits;

		if (xfs_has_crc(cur->bc_mp)) {
			/*
			 * We don't log the CRC when updating a btree
			 * block but instead recreate it during log
			 * recovery.  As the log buffers have checksums
			 * of their own this is safe and avoids logging a CRC
			 * update in a lot of places.
			 */
			if (fields == XFS_BB_ALL_BITS)
				fields = XFS_BB_ALL_BITS_CRC;
			nbits = XFS_BB_NUM_BITS_CRC;
		} else {
			nbits = XFS_BB_NUM_BITS;
		}
		xfs_btree_offsets(fields,
				(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
					loffsets : soffsets,
				nbits, &first, &last);
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}
/*
 * Increment cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_increment(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the right at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* We're done if we remain in the block after the increment. */
	if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
		goto out1;

	/* Fail if we just went off the right edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, increment);

	/*
	 * March up the tree incrementing pointers.
	 * Stop when we don't go off the right edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, lev, bp);
		if (error)
			goto error0;
#endif

		if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
			break;

		/* Read-ahead the right block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;

		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = 1;
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}

/*
 * Decrement cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_decrement(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;
	union xfs_btree_ptr	ptr;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the left at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

	/* We're done if we remain in the block after the decrement. */
	if (--cur->bc_levels[level].ptr > 0)
		goto out1;

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we just went off the left edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, decrement);

	/*
	 * March up the tree decrementing pointers.
	 * Stop when we don't go off the left edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_levels[lev].ptr > 0)
			break;
		/* Read-ahead the left block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;
		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}
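/*
 * Illustrative iteration pattern (sketch only): callers typically walk
 * records in key order by positioning the cursor with a lookup and then
 * calling xfs_btree_increment() until it reports no further records:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &stat);
 *	while (!error && stat) {
 *		...examine the record at cur->bc_levels[0].ptr...
 *		error = xfs_btree_increment(cur, 0, &stat);
 *	}
 *
 * xfs_btree_decrement() supports the mirror-image walk.
 */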
/*
 * Check the btree block owner now that we have the context to know who the
 * real owner is.
 */
static inline xfs_failaddr_t
xfs_btree_check_block_owner(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block)
{
	__u64			owner;

	if (!xfs_has_crc(cur->bc_mp) ||
	    (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
		return NULL;

	owner = xfs_btree_owner(cur);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
			return __this_address;
	} else {
		if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
			return __this_address;
	}

	return NULL;
}

int
xfs_btree_lookup_get_block(
	struct xfs_btree_cur		*cur,	/* btree cursor */
	int				level,	/* level in the btree */
	const union xfs_btree_ptr	*pp,	/* ptr to btree block */
	struct xfs_btree_block		**blkp)	/* return btree block */
{
	struct xfs_buf		*bp;	/* buffer pointer for btree block */
	xfs_daddr_t		daddr;
	int			error = 0;

	/* special case the root block if in an inode */
	if (xfs_btree_at_iroot(cur, level)) {
		*blkp = xfs_btree_get_iroot(cur);
		return 0;
	}

	/*
	 * If the old buffer at this level is for the disk address we are
	 * looking for, re-use it.
	 *
	 * Otherwise throw it away and get a new one.
	 */
	bp = cur->bc_levels[level].bp;
	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
	if (error)
		return error;
	if (bp && xfs_buf_daddr(bp) == daddr) {
		*blkp = XFS_BUF_TO_BLOCK(bp);
		return 0;
	}

	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
	if (error)
		return error;

	/* Check the inode owner since the verifiers don't. */
	if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
		goto out_bad;

	/* Did we get the level we were looking for? */
	if (be16_to_cpu((*blkp)->bb_level) != level)
		goto out_bad;

	/* Check that internal nodes have at least one record. */
	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
		goto out_bad;

	xfs_btree_setbuf(cur, level, bp);
	return 0;

out_bad:
	*blkp = NULL;
	xfs_buf_mark_corrupt(bp);
	xfs_trans_brelse(cur->bc_tp, bp);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}
/*
 * Get current search key.  For level 0 we don't actually have a key
 * structure so we make one up from the record.  For all other levels
 * we just return the right key.
 */
STATIC union xfs_btree_key *
xfs_lookup_get_search_key(
	struct xfs_btree_cur	*cur,
	int			level,
	int			keyno,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*kp)
{
	if (level == 0) {
		cur->bc_ops->init_key_from_rec(kp,
				xfs_btree_rec_addr(cur, keyno, block));
		return kp;
	}

	return xfs_btree_key_addr(cur, keyno, block);
}

/*
 * Initialize a pointer to the root block.
 */
void
xfs_btree_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		/*
		 * Inode-rooted btrees call xfs_btree_get_iroot to find the
		 * root in xfs_btree_lookup_get_block and don't need a pointer
		 * here.
		 */
		ptr->l = 0;
	} else if (cur->bc_flags & XFS_BTREE_STAGING) {
		ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
	} else {
		cur->bc_ops->init_ptr_from_cur(cur, ptr);
	}
}
*/ 2044 while (low <= high) { 2045 union xfs_btree_key key; 2046 union xfs_btree_key *kp; 2047 2048 XFS_BTREE_STATS_INC(cur, compare); 2049 2050 /* keyno is average of low and high. */ 2051 keyno = (low + high) >> 1; 2052 2053 /* Get current search key */ 2054 kp = xfs_lookup_get_search_key(cur, level, 2055 keyno, block, &key); 2056 2057 /* 2058 * Compute difference to get next direction: 2059 * - less than, move right 2060 * - greater than, move left 2061 * - equal, we're done 2062 */ 2063 diff = cur->bc_ops->key_diff(cur, kp); 2064 if (diff < 0) 2065 low = keyno + 1; 2066 else if (diff > 0) 2067 high = keyno - 1; 2068 else 2069 break; 2070 } 2071 } 2072 2073 /* 2074 * If there are more levels, set up for the next level 2075 * by getting the block number and filling in the cursor. 2076 */ 2077 if (level > 0) { 2078 /* 2079 * If we moved left, need the previous key number, 2080 * unless there isn't one. 2081 */ 2082 if (diff > 0 && --keyno < 1) 2083 keyno = 1; 2084 pp = xfs_btree_ptr_addr(cur, keyno, block); 2085 2086 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 2087 if (error) 2088 goto error0; 2089 2090 cur->bc_levels[level].ptr = keyno; 2091 } 2092 } 2093 2094 /* Done with the search. See if we need to adjust the results. */ 2095 if (dir != XFS_LOOKUP_LE && diff < 0) { 2096 keyno++; 2097 /* 2098 * If ge search and we went off the end of the block, but it's 2099 * not the last block, we're in the wrong block. 2100 */ 2101 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2102 if (dir == XFS_LOOKUP_GE && 2103 keyno > xfs_btree_get_numrecs(block) && 2104 !xfs_btree_ptr_is_null(cur, &ptr)) { 2105 int i; 2106 2107 cur->bc_levels[0].ptr = keyno; 2108 error = xfs_btree_increment(cur, 0, &i); 2109 if (error) 2110 goto error0; 2111 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 2112 xfs_btree_mark_sick(cur); 2113 return -EFSCORRUPTED; 2114 } 2115 *stat = 1; 2116 return 0; 2117 } 2118 } else if (dir == XFS_LOOKUP_LE && diff > 0) 2119 keyno--; 2120 cur->bc_levels[0].ptr = keyno; 2121 2122 /* Return if we succeeded or not. */ 2123 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2124 *stat = 0; 2125 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2126 *stat = 1; 2127 else 2128 *stat = 0; 2129 return 0; 2130 2131 error0: 2132 return error; 2133 } 2134 2135 /* Find the high key storage area from a regular key. 
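 *
 * In an overlapping btree each key slot is really a (low key, high key)
 * pair packed back to back, each half bc_ops->key_len / 2 bytes wide,
 * which is why the high key can be reached with plain byte arithmetic
 * on the low key pointer.  Layout sketch of one slot:
 *
 *	byte 0:             low key  (key_len / 2 bytes)
 *	byte key_len / 2:   high key (key_len / 2 bytes)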
*/ 2136 union xfs_btree_key * 2137 xfs_btree_high_key_from_key( 2138 struct xfs_btree_cur *cur, 2139 union xfs_btree_key *key) 2140 { 2141 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2142 return (union xfs_btree_key *)((char *)key + 2143 (cur->bc_ops->key_len / 2)); 2144 } 2145 2146 /* Determine the low (and high if overlapped) keys of a leaf block */ 2147 STATIC void 2148 xfs_btree_get_leaf_keys( 2149 struct xfs_btree_cur *cur, 2150 struct xfs_btree_block *block, 2151 union xfs_btree_key *key) 2152 { 2153 union xfs_btree_key max_hkey; 2154 union xfs_btree_key hkey; 2155 union xfs_btree_rec *rec; 2156 union xfs_btree_key *high; 2157 int n; 2158 2159 rec = xfs_btree_rec_addr(cur, 1, block); 2160 cur->bc_ops->init_key_from_rec(key, rec); 2161 2162 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2163 2164 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2165 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2166 rec = xfs_btree_rec_addr(cur, n, block); 2167 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2168 if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) 2169 max_hkey = hkey; 2170 } 2171 2172 high = xfs_btree_high_key_from_key(cur, key); 2173 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2174 } 2175 } 2176 2177 /* Determine the low (and high if overlapped) keys of a node block */ 2178 STATIC void 2179 xfs_btree_get_node_keys( 2180 struct xfs_btree_cur *cur, 2181 struct xfs_btree_block *block, 2182 union xfs_btree_key *key) 2183 { 2184 union xfs_btree_key *hkey; 2185 union xfs_btree_key *max_hkey; 2186 union xfs_btree_key *high; 2187 int n; 2188 2189 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2190 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2191 cur->bc_ops->key_len / 2); 2192 2193 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2194 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2195 hkey = xfs_btree_high_key_addr(cur, n, block); 2196 if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) 2197 max_hkey = hkey; 2198 } 2199 2200 high = xfs_btree_high_key_from_key(cur, key); 2201 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2202 } else { 2203 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2204 cur->bc_ops->key_len); 2205 } 2206 } 2207 2208 /* Derive the keys for any btree block. */ 2209 void 2210 xfs_btree_get_keys( 2211 struct xfs_btree_cur *cur, 2212 struct xfs_btree_block *block, 2213 union xfs_btree_key *key) 2214 { 2215 if (be16_to_cpu(block->bb_level) == 0) 2216 xfs_btree_get_leaf_keys(cur, block, key); 2217 else 2218 xfs_btree_get_node_keys(cur, block, key); 2219 } 2220 2221 /* 2222 * Decide if we need to update the parent keys of a btree block. For 2223 * a standard btree this is only necessary if we're updating the first 2224 * record/key. For an overlapping btree, we must always update the 2225 * keys because the highest key can be in any of the records or keys 2226 * in the block. 2227 */ 2228 static inline bool 2229 xfs_btree_needs_key_update( 2230 struct xfs_btree_cur *cur, 2231 int ptr) 2232 { 2233 return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; 2234 } 2235 2236 /* 2237 * Update the low and high parent keys of the given level, progressing 2238 * towards the root. If force_all is false, stop if the keys for a given 2239 * level do not need updating. 
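 *
 * Rough shape of the walk performed below (a sketch, not the code
 * itself):
 *
 *	derive (lkey, hkey) from the child block;
 *	for each level above the child, lowest first:
 *		find the keyptr slot the cursor used at this level;
 *		if (!force_all && slot already equals (lkey, hkey))
 *			stop;		// nothing above can change either
 *		copy (lkey, hkey) into the slot and log it;
 *		re-derive (lkey, hkey) from this block for the next level;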
2240	 */
2241 STATIC int
2242 __xfs_btree_updkeys(
2243	struct xfs_btree_cur	*cur,
2244	int			level,
2245	struct xfs_btree_block	*block,
2246	struct xfs_buf		*bp0,
2247	bool			force_all)
2248 {
2249	union xfs_btree_key	key;	/* keys from current level */
2250	union xfs_btree_key	*lkey;	/* low key of the current level */
2251	union xfs_btree_key	*hkey;	/* high key of the current level */
2252	union xfs_btree_key	*nlkey;	/* keys from the next level up */
2253	union xfs_btree_key	*nhkey;
2254	struct xfs_buf		*bp;
2255	int			ptr;
2256
2257	ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
2258
2259	/* Exit if there aren't any parent levels to update. */
2260	if (level + 1 >= cur->bc_nlevels)
2261		return 0;
2262
2263	trace_xfs_btree_updkeys(cur, level, bp0);
2264
2265	lkey = &key;
2266	hkey = xfs_btree_high_key_from_key(cur, lkey);
2267	xfs_btree_get_keys(cur, block, lkey);
2268	for (level++; level < cur->bc_nlevels; level++) {
2269 #ifdef DEBUG
2270		int		error;
2271 #endif
2272		block = xfs_btree_get_block(cur, level, &bp);
2273		trace_xfs_btree_updkeys(cur, level, bp);
2274 #ifdef DEBUG
2275		error = xfs_btree_check_block(cur, block, level, bp);
2276		if (error)
2277			return error;
2278 #endif
2279		ptr = cur->bc_levels[level].ptr;
2280		nlkey = xfs_btree_key_addr(cur, ptr, block);
2281		nhkey = xfs_btree_high_key_addr(cur, ptr, block);
2282		if (!force_all &&
2283		    xfs_btree_keycmp_eq(cur, nlkey, lkey) &&
2284		    xfs_btree_keycmp_eq(cur, nhkey, hkey))
2285			break;
2286		xfs_btree_copy_keys(cur, nlkey, lkey, 1);
2287		xfs_btree_log_keys(cur, bp, ptr, ptr);
2288		if (level + 1 >= cur->bc_nlevels)
2289			break;
2290		xfs_btree_get_node_keys(cur, block, lkey);
2291	}
2292
2293	return 0;
2294 }
2295
2296 /* Update all the keys from some level in cursor back to the root. */
2297 STATIC int
2298 xfs_btree_updkeys_force(
2299	struct xfs_btree_cur	*cur,
2300	int			level)
2301 {
2302	struct xfs_buf		*bp;
2303	struct xfs_btree_block	*block;
2304
2305	block = xfs_btree_get_block(cur, level, &bp);
2306	return __xfs_btree_updkeys(cur, level, block, bp, true);
2307 }
2308
2309 /*
2310  * Update the parent keys of the given level, progressing towards the root.
2311  */
2312 STATIC int
2313 xfs_btree_update_keys(
2314	struct xfs_btree_cur	*cur,
2315	int			level)
2316 {
2317	struct xfs_btree_block	*block;
2318	struct xfs_buf		*bp;
2319	union xfs_btree_key	*kp;
2320	union xfs_btree_key	key;
2321	int			ptr;
2322
2323	ASSERT(level >= 0);
2324
2325	block = xfs_btree_get_block(cur, level, &bp);
2326	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
2327		return __xfs_btree_updkeys(cur, level, block, bp, false);
2328
2329	/*
2330	 * Go up the tree from this level toward the root.
2331	 * At each level, update the key value to the value input.
2332	 * Stop when we reach a level where the cursor isn't pointing
2333	 * at the first entry in the block.
2334	 */
2335	xfs_btree_get_keys(cur, block, &key);
2336	for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
2337 #ifdef DEBUG
2338		int		error;
2339 #endif
2340		block = xfs_btree_get_block(cur, level, &bp);
2341 #ifdef DEBUG
2342		error = xfs_btree_check_block(cur, block, level, bp);
2343		if (error)
2344			return error;
2345 #endif
2346		ptr = cur->bc_levels[level].ptr;
2347		kp = xfs_btree_key_addr(cur, ptr, block);
2348		xfs_btree_copy_keys(cur, kp, &key, 1);
2349		xfs_btree_log_keys(cur, bp, ptr, ptr);
2350	}
2351
2352	return 0;
2353 }
2354
2355 /*
2356  * Update the record referred to by cur to the value in the
2357  * given record. This either works (return 0) or gets an
2358  * EFSCORRUPTED error.
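 *
 * The usual calling pattern is a lookup followed by the overwrite,
 * e.g. (sketch, error handling elided):
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!error && stat == 1)
 *		error = xfs_btree_update(cur, &rec);
 *
 * Note that the record is rewritten in place; an update that would
 * re-order the record against its neighbours needs a delete and a
 * fresh insert instead.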
2359 */ 2360 int 2361 xfs_btree_update( 2362 struct xfs_btree_cur *cur, 2363 union xfs_btree_rec *rec) 2364 { 2365 struct xfs_btree_block *block; 2366 struct xfs_buf *bp; 2367 int error; 2368 int ptr; 2369 union xfs_btree_rec *rp; 2370 2371 /* Pick up the current block. */ 2372 block = xfs_btree_get_block(cur, 0, &bp); 2373 2374 #ifdef DEBUG 2375 error = xfs_btree_check_block(cur, block, 0, bp); 2376 if (error) 2377 goto error0; 2378 #endif 2379 /* Get the address of the rec to be updated. */ 2380 ptr = cur->bc_levels[0].ptr; 2381 rp = xfs_btree_rec_addr(cur, ptr, block); 2382 2383 /* Fill in the new contents and log them. */ 2384 xfs_btree_copy_recs(cur, rp, rec, 1); 2385 xfs_btree_log_recs(cur, bp, ptr, ptr); 2386 2387 /* Pass new key value up to our parent. */ 2388 if (xfs_btree_needs_key_update(cur, ptr)) { 2389 error = xfs_btree_update_keys(cur, 0); 2390 if (error) 2391 goto error0; 2392 } 2393 2394 return 0; 2395 2396 error0: 2397 return error; 2398 } 2399 2400 /* 2401 * Move 1 record left from cur/level if possible. 2402 * Update cur to reflect the new path. 2403 */ 2404 STATIC int /* error */ 2405 xfs_btree_lshift( 2406 struct xfs_btree_cur *cur, 2407 int level, 2408 int *stat) /* success/failure */ 2409 { 2410 struct xfs_buf *lbp; /* left buffer pointer */ 2411 struct xfs_btree_block *left; /* left btree block */ 2412 int lrecs; /* left record count */ 2413 struct xfs_buf *rbp; /* right buffer pointer */ 2414 struct xfs_btree_block *right; /* right btree block */ 2415 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2416 int rrecs; /* right record count */ 2417 union xfs_btree_ptr lptr; /* left btree pointer */ 2418 union xfs_btree_key *rkp = NULL; /* right btree key */ 2419 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2420 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2421 int error; /* error return value */ 2422 int i; 2423 2424 if (xfs_btree_at_iroot(cur, level)) 2425 goto out0; 2426 2427 /* Set up variables for this block as "right". */ 2428 right = xfs_btree_get_block(cur, level, &rbp); 2429 2430 #ifdef DEBUG 2431 error = xfs_btree_check_block(cur, right, level, rbp); 2432 if (error) 2433 goto error0; 2434 #endif 2435 2436 /* If we've got no left sibling then we can't shift an entry left. */ 2437 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2438 if (xfs_btree_ptr_is_null(cur, &lptr)) 2439 goto out0; 2440 2441 /* 2442 * If the cursor entry is the one that would be moved, don't 2443 * do it... it's too complicated. 2444 */ 2445 if (cur->bc_levels[level].ptr <= 1) 2446 goto out0; 2447 2448 /* Set up the left neighbor as "left". */ 2449 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2450 if (error) 2451 goto error0; 2452 2453 /* If it's full, it can't take another entry. */ 2454 lrecs = xfs_btree_get_numrecs(left); 2455 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2456 goto out0; 2457 2458 rrecs = xfs_btree_get_numrecs(right); 2459 2460 /* 2461 * We add one entry to the left side and remove one for the right side. 2462 * Account for it here, the changes will be updated on disk and logged 2463 * later. 2464 */ 2465 lrecs++; 2466 rrecs--; 2467 2468 XFS_BTREE_STATS_INC(cur, lshift); 2469 XFS_BTREE_STATS_ADD(cur, moves, 1); 2470 2471 /* 2472 * If non-leaf, copy a key and a ptr to the left block. 2473 * Log the changes to the left block. 2474 */ 2475 if (level > 0) { 2476 /* It's a non-leaf. Move keys and pointers. 
*/ 2477 union xfs_btree_key *lkp; /* left btree key */ 2478 union xfs_btree_ptr *lpp; /* left address pointer */ 2479 2480 lkp = xfs_btree_key_addr(cur, lrecs, left); 2481 rkp = xfs_btree_key_addr(cur, 1, right); 2482 2483 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2484 rpp = xfs_btree_ptr_addr(cur, 1, right); 2485 2486 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2487 if (error) 2488 goto error0; 2489 2490 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2491 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2492 2493 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2494 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2495 2496 ASSERT(cur->bc_ops->keys_inorder(cur, 2497 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2498 } else { 2499 /* It's a leaf. Move records. */ 2500 union xfs_btree_rec *lrp; /* left record pointer */ 2501 2502 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2503 rrp = xfs_btree_rec_addr(cur, 1, right); 2504 2505 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2506 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2507 2508 ASSERT(cur->bc_ops->recs_inorder(cur, 2509 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2510 } 2511 2512 xfs_btree_set_numrecs(left, lrecs); 2513 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2514 2515 xfs_btree_set_numrecs(right, rrecs); 2516 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2517 2518 /* 2519 * Slide the contents of right down one entry. 2520 */ 2521 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2522 if (level > 0) { 2523 /* It's a nonleaf. operate on keys and ptrs */ 2524 for (i = 0; i < rrecs; i++) { 2525 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2526 if (error) 2527 goto error0; 2528 } 2529 2530 xfs_btree_shift_keys(cur, 2531 xfs_btree_key_addr(cur, 2, right), 2532 -1, rrecs); 2533 xfs_btree_shift_ptrs(cur, 2534 xfs_btree_ptr_addr(cur, 2, right), 2535 -1, rrecs); 2536 2537 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2538 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2539 } else { 2540 /* It's a leaf. operate on records */ 2541 xfs_btree_shift_recs(cur, 2542 xfs_btree_rec_addr(cur, 2, right), 2543 -1, rrecs); 2544 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2545 } 2546 2547 /* 2548 * Using a temporary cursor, update the parent key values of the 2549 * block on the left. 2550 */ 2551 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2552 error = xfs_btree_dup_cursor(cur, &tcur); 2553 if (error) 2554 goto error0; 2555 i = xfs_btree_firstrec(tcur, level); 2556 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2557 xfs_btree_mark_sick(cur); 2558 error = -EFSCORRUPTED; 2559 goto error0; 2560 } 2561 2562 error = xfs_btree_decrement(tcur, level, &i); 2563 if (error) 2564 goto error1; 2565 2566 /* Update the parent high keys of the left block, if needed. */ 2567 error = xfs_btree_update_keys(tcur, level); 2568 if (error) 2569 goto error1; 2570 2571 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2572 } 2573 2574 /* Update the parent keys of the right block. */ 2575 error = xfs_btree_update_keys(cur, level); 2576 if (error) 2577 goto error0; 2578 2579 /* Slide the cursor value left one. */ 2580 cur->bc_levels[level].ptr--; 2581 2582 *stat = 1; 2583 return 0; 2584 2585 out0: 2586 *stat = 0; 2587 return 0; 2588 2589 error0: 2590 return error; 2591 2592 error1: 2593 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2594 return error; 2595 } 2596 2597 /* 2598 * Move 1 record right from cur/level if possible. 2599 * Update cur to reflect the new path. 
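 *
 * Schematically, for a leaf (keys and ptrs move the same way in node
 * blocks):
 *
 *	before:  left [r1 .. rN]      right [s1 .. sM]
 *	after:   left [r1 .. rN-1]    right [rN s1 .. sM]
 *
 * i.e. the right block's entries are shifted up by one to open a hole
 * at index 1, then the last entry of the left block is copied in.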
2600 */ 2601 STATIC int /* error */ 2602 xfs_btree_rshift( 2603 struct xfs_btree_cur *cur, 2604 int level, 2605 int *stat) /* success/failure */ 2606 { 2607 struct xfs_buf *lbp; /* left buffer pointer */ 2608 struct xfs_btree_block *left; /* left btree block */ 2609 struct xfs_buf *rbp; /* right buffer pointer */ 2610 struct xfs_btree_block *right; /* right btree block */ 2611 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2612 union xfs_btree_ptr rptr; /* right block pointer */ 2613 union xfs_btree_key *rkp; /* right btree key */ 2614 int rrecs; /* right record count */ 2615 int lrecs; /* left record count */ 2616 int error; /* error return value */ 2617 int i; /* loop counter */ 2618 2619 if (xfs_btree_at_iroot(cur, level)) 2620 goto out0; 2621 2622 /* Set up variables for this block as "left". */ 2623 left = xfs_btree_get_block(cur, level, &lbp); 2624 2625 #ifdef DEBUG 2626 error = xfs_btree_check_block(cur, left, level, lbp); 2627 if (error) 2628 goto error0; 2629 #endif 2630 2631 /* If we've got no right sibling then we can't shift an entry right. */ 2632 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2633 if (xfs_btree_ptr_is_null(cur, &rptr)) 2634 goto out0; 2635 2636 /* 2637 * If the cursor entry is the one that would be moved, don't 2638 * do it... it's too complicated. 2639 */ 2640 lrecs = xfs_btree_get_numrecs(left); 2641 if (cur->bc_levels[level].ptr >= lrecs) 2642 goto out0; 2643 2644 /* Set up the right neighbor as "right". */ 2645 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2646 if (error) 2647 goto error0; 2648 2649 /* If it's full, it can't take another entry. */ 2650 rrecs = xfs_btree_get_numrecs(right); 2651 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2652 goto out0; 2653 2654 XFS_BTREE_STATS_INC(cur, rshift); 2655 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2656 2657 /* 2658 * Make a hole at the start of the right neighbor block, then 2659 * copy the last left block entry to the hole. 2660 */ 2661 if (level > 0) { 2662 /* It's a nonleaf. make a hole in the keys and ptrs */ 2663 union xfs_btree_key *lkp; 2664 union xfs_btree_ptr *lpp; 2665 union xfs_btree_ptr *rpp; 2666 2667 lkp = xfs_btree_key_addr(cur, lrecs, left); 2668 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2669 rkp = xfs_btree_key_addr(cur, 1, right); 2670 rpp = xfs_btree_ptr_addr(cur, 1, right); 2671 2672 for (i = rrecs - 1; i >= 0; i--) { 2673 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2674 if (error) 2675 goto error0; 2676 } 2677 2678 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2679 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2680 2681 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2682 if (error) 2683 goto error0; 2684 2685 /* Now put the new data in, and log it. */ 2686 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2687 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2688 2689 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2690 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2691 2692 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2693 xfs_btree_key_addr(cur, 2, right))); 2694 } else { 2695 /* It's a leaf. make a hole in the records */ 2696 union xfs_btree_rec *lrp; 2697 union xfs_btree_rec *rrp; 2698 2699 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2700 rrp = xfs_btree_rec_addr(cur, 1, right); 2701 2702 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2703 2704 /* Now put the new data in, and log it. 
*/ 2705 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2706 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2707 } 2708 2709 /* 2710 * Decrement and log left's numrecs, bump and log right's numrecs. 2711 */ 2712 xfs_btree_set_numrecs(left, --lrecs); 2713 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2714 2715 xfs_btree_set_numrecs(right, ++rrecs); 2716 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2717 2718 /* 2719 * Using a temporary cursor, update the parent key values of the 2720 * block on the right. 2721 */ 2722 error = xfs_btree_dup_cursor(cur, &tcur); 2723 if (error) 2724 goto error0; 2725 i = xfs_btree_lastrec(tcur, level); 2726 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2727 xfs_btree_mark_sick(cur); 2728 error = -EFSCORRUPTED; 2729 goto error0; 2730 } 2731 2732 error = xfs_btree_increment(tcur, level, &i); 2733 if (error) 2734 goto error1; 2735 2736 /* Update the parent high keys of the left block, if needed. */ 2737 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2738 error = xfs_btree_update_keys(cur, level); 2739 if (error) 2740 goto error1; 2741 } 2742 2743 /* Update the parent keys of the right block. */ 2744 error = xfs_btree_update_keys(tcur, level); 2745 if (error) 2746 goto error1; 2747 2748 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2749 2750 *stat = 1; 2751 return 0; 2752 2753 out0: 2754 *stat = 0; 2755 return 0; 2756 2757 error0: 2758 return error; 2759 2760 error1: 2761 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2762 return error; 2763 } 2764 2765 static inline int 2766 xfs_btree_alloc_block( 2767 struct xfs_btree_cur *cur, 2768 const union xfs_btree_ptr *hint_block, 2769 union xfs_btree_ptr *new_block, 2770 int *stat) 2771 { 2772 int error; 2773 2774 /* 2775 * Don't allow block allocation for a staging cursor, because staging 2776 * cursors do not support regular btree modifications. 2777 * 2778 * Bulk loading uses a separate callback to obtain new blocks from a 2779 * preallocated list, which prevents ENOSPC failures during loading. 2780 */ 2781 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 2782 ASSERT(0); 2783 return -EFSCORRUPTED; 2784 } 2785 2786 error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); 2787 trace_xfs_btree_alloc_block(cur, new_block, *stat, error); 2788 return error; 2789 } 2790 2791 /* 2792 * Split cur/level block in half. 2793 * Return new block number and the key to its first 2794 * record (to be inserted into parent). 2795 */ 2796 STATIC int /* error */ 2797 __xfs_btree_split( 2798 struct xfs_btree_cur *cur, 2799 int level, 2800 union xfs_btree_ptr *ptrp, 2801 union xfs_btree_key *key, 2802 struct xfs_btree_cur **curp, 2803 int *stat) /* success/failure */ 2804 { 2805 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2806 struct xfs_buf *lbp; /* left buffer pointer */ 2807 struct xfs_btree_block *left; /* left btree block */ 2808 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2809 struct xfs_buf *rbp; /* right buffer pointer */ 2810 struct xfs_btree_block *right; /* right btree block */ 2811 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2812 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2813 struct xfs_btree_block *rrblock; /* right-right btree block */ 2814 int lrecs; 2815 int rrecs; 2816 int src_index; 2817 int error; /* error return value */ 2818 int i; 2819 2820 XFS_BTREE_STATS_INC(cur, split); 2821 2822 /* Set up left block (current one). 
*/ 2823 left = xfs_btree_get_block(cur, level, &lbp); 2824 2825 #ifdef DEBUG 2826 error = xfs_btree_check_block(cur, left, level, lbp); 2827 if (error) 2828 goto error0; 2829 #endif 2830 2831 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2832 2833 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2834 error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); 2835 if (error) 2836 goto error0; 2837 if (*stat == 0) 2838 goto out0; 2839 XFS_BTREE_STATS_INC(cur, alloc); 2840 2841 /* Set up the new block as "right". */ 2842 error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); 2843 if (error) 2844 goto error0; 2845 2846 /* Fill in the btree header for the new right block. */ 2847 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2848 2849 /* 2850 * Split the entries between the old and the new block evenly. 2851 * Make sure that if there's an odd number of entries now, that 2852 * each new block will have the same number of entries. 2853 */ 2854 lrecs = xfs_btree_get_numrecs(left); 2855 rrecs = lrecs / 2; 2856 if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) 2857 rrecs++; 2858 src_index = (lrecs - rrecs + 1); 2859 2860 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2861 2862 /* Adjust numrecs for the later get_*_keys() calls. */ 2863 lrecs -= rrecs; 2864 xfs_btree_set_numrecs(left, lrecs); 2865 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2866 2867 /* 2868 * Copy btree block entries from the left block over to the 2869 * new block, the right. Update the right block and log the 2870 * changes. 2871 */ 2872 if (level > 0) { 2873 /* It's a non-leaf. Move keys and pointers. */ 2874 union xfs_btree_key *lkp; /* left btree key */ 2875 union xfs_btree_ptr *lpp; /* left address pointer */ 2876 union xfs_btree_key *rkp; /* right btree key */ 2877 union xfs_btree_ptr *rpp; /* right address pointer */ 2878 2879 lkp = xfs_btree_key_addr(cur, src_index, left); 2880 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2881 rkp = xfs_btree_key_addr(cur, 1, right); 2882 rpp = xfs_btree_ptr_addr(cur, 1, right); 2883 2884 for (i = src_index; i < rrecs; i++) { 2885 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 2886 if (error) 2887 goto error0; 2888 } 2889 2890 /* Copy the keys & pointers to the new block. */ 2891 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2892 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2893 2894 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2895 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2896 2897 /* Stash the keys of the new block for later insertion. */ 2898 xfs_btree_get_node_keys(cur, right, key); 2899 } else { 2900 /* It's a leaf. Move records. */ 2901 union xfs_btree_rec *lrp; /* left record pointer */ 2902 union xfs_btree_rec *rrp; /* right record pointer */ 2903 2904 lrp = xfs_btree_rec_addr(cur, src_index, left); 2905 rrp = xfs_btree_rec_addr(cur, 1, right); 2906 2907 /* Copy records to the new block. */ 2908 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2909 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2910 2911 /* Stash the keys of the new block for later insertion. */ 2912 xfs_btree_get_leaf_keys(cur, right, key); 2913 } 2914 2915 /* 2916 * Find the left block number by looking in the buffer. 2917 * Adjust sibling pointers. 
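 *
 * With L the block being split, R the new block and RR the old right
 * sibling of L (if any), the sibling list goes from L <-> RR to
 * L <-> R <-> RR.  Sketch of the pointer updates made below:
 *
 *	R.rightsib = L.rightsib;	// R inherits RR
 *	R.leftsib  = L;
 *	L.rightsib = R;
 *	if (RR exists)
 *		RR.leftsib = R;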
2918 */ 2919 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2920 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2921 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2922 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2923 2924 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2925 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2926 2927 /* 2928 * If there's a block to the new block's right, make that block 2929 * point back to right instead of to left. 2930 */ 2931 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2932 error = xfs_btree_read_buf_block(cur, &rrptr, 2933 0, &rrblock, &rrbp); 2934 if (error) 2935 goto error0; 2936 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2937 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2938 } 2939 2940 /* Update the parent high keys of the left block, if needed. */ 2941 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2942 error = xfs_btree_update_keys(cur, level); 2943 if (error) 2944 goto error0; 2945 } 2946 2947 /* 2948 * If the cursor is really in the right block, move it there. 2949 * If it's just pointing past the last entry in left, then we'll 2950 * insert there, so don't change anything in that case. 2951 */ 2952 if (cur->bc_levels[level].ptr > lrecs + 1) { 2953 xfs_btree_setbuf(cur, level, rbp); 2954 cur->bc_levels[level].ptr -= lrecs; 2955 } 2956 /* 2957 * If there are more levels, we'll need another cursor which refers 2958 * the right block, no matter where this cursor was. 2959 */ 2960 if (level + 1 < cur->bc_nlevels) { 2961 error = xfs_btree_dup_cursor(cur, curp); 2962 if (error) 2963 goto error0; 2964 (*curp)->bc_levels[level + 1].ptr++; 2965 } 2966 *ptrp = rptr; 2967 *stat = 1; 2968 return 0; 2969 out0: 2970 *stat = 0; 2971 return 0; 2972 2973 error0: 2974 return error; 2975 } 2976 2977 #ifdef __KERNEL__ 2978 struct xfs_btree_split_args { 2979 struct xfs_btree_cur *cur; 2980 int level; 2981 union xfs_btree_ptr *ptrp; 2982 union xfs_btree_key *key; 2983 struct xfs_btree_cur **curp; 2984 int *stat; /* success/failure */ 2985 int result; 2986 bool kswapd; /* allocation in kswapd context */ 2987 struct completion *done; 2988 struct work_struct work; 2989 }; 2990 2991 /* 2992 * Stack switching interfaces for allocation 2993 */ 2994 static void 2995 xfs_btree_split_worker( 2996 struct work_struct *work) 2997 { 2998 struct xfs_btree_split_args *args = container_of(work, 2999 struct xfs_btree_split_args, work); 3000 unsigned long pflags; 3001 unsigned long new_pflags = 0; 3002 3003 /* 3004 * we are in a transaction context here, but may also be doing work 3005 * in kswapd context, and hence we may need to inherit that state 3006 * temporarily to ensure that we don't block waiting for memory reclaim 3007 * in any way. 3008 */ 3009 if (args->kswapd) 3010 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 3011 3012 current_set_flags_nested(&pflags, new_pflags); 3013 xfs_trans_set_context(args->cur->bc_tp); 3014 3015 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 3016 args->key, args->curp, args->stat); 3017 3018 xfs_trans_clear_context(args->cur->bc_tp); 3019 current_restore_flags_nested(&pflags, new_pflags); 3020 3021 /* 3022 * Do not access args after complete() has run here. We don't own args 3023 * and the owner may run and free args before we return here. 
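 *
 * This is the usual on-stack completion hand-off: the waiter owns the
 * storage, so complete() must be the very last access the worker makes.
 * Sketch of the two sides (fields as in struct xfs_btree_split_args):
 *
 *	waiter:
 *		DECLARE_COMPLETION_ONSTACK(done);
 *		args.done = &done;
 *		queue_work(wq, &args.work);
 *		wait_for_completion(&done);
 *		// args is dead storage to the worker from here on
 *	worker:
 *		args->result = __xfs_btree_split(...);
 *		complete(args->done);	// last touch of args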
3024 */ 3025 complete(args->done); 3026 3027 } 3028 3029 /* 3030 * BMBT split requests often come in with little stack to work on so we push 3031 * them off to a worker thread so there is lots of stack to use. For the other 3032 * btree types, just call directly to avoid the context switch overhead here. 3033 * 3034 * Care must be taken here - the work queue rescuer thread introduces potential 3035 * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new 3036 * AGFs to allocate blocks. A task being run by the rescuer could attempt to 3037 * lock an AGF that is already locked by a task queued to run by the rescuer, 3038 * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to 3039 * release it until the current thread it is running gains the lock. 3040 * 3041 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF 3042 * already locked to allocate from. The only place that doesn't hold an AGF 3043 * locked is unwritten extent conversion at IO completion, but that has already 3044 * been offloaded to a worker thread and hence has no stack consumption issues 3045 * we have to worry about. 3046 */ 3047 STATIC int /* error */ 3048 xfs_btree_split( 3049 struct xfs_btree_cur *cur, 3050 int level, 3051 union xfs_btree_ptr *ptrp, 3052 union xfs_btree_key *key, 3053 struct xfs_btree_cur **curp, 3054 int *stat) /* success/failure */ 3055 { 3056 struct xfs_btree_split_args args; 3057 DECLARE_COMPLETION_ONSTACK(done); 3058 3059 if (!xfs_btree_is_bmap(cur->bc_ops) || 3060 cur->bc_tp->t_highest_agno == NULLAGNUMBER) 3061 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 3062 3063 args.cur = cur; 3064 args.level = level; 3065 args.ptrp = ptrp; 3066 args.key = key; 3067 args.curp = curp; 3068 args.stat = stat; 3069 args.done = &done; 3070 args.kswapd = current_is_kswapd(); 3071 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 3072 queue_work(xfs_alloc_wq, &args.work); 3073 wait_for_completion(&done); 3074 destroy_work_on_stack(&args.work); 3075 return args.result; 3076 } 3077 #else 3078 #define xfs_btree_split __xfs_btree_split 3079 #endif /* __KERNEL__ */ 3080 3081 /* 3082 * Copy the old inode root contents into a real block and make the 3083 * broot point to it. 3084 */ 3085 int /* error */ 3086 xfs_btree_new_iroot( 3087 struct xfs_btree_cur *cur, /* btree cursor */ 3088 int *logflags, /* logging flags for inode */ 3089 int *stat) /* return status - 0 fail */ 3090 { 3091 struct xfs_buf *cbp; /* buffer for cblock */ 3092 struct xfs_btree_block *block; /* btree block */ 3093 struct xfs_btree_block *cblock; /* child btree block */ 3094 union xfs_btree_key *ckp; /* child key pointer */ 3095 union xfs_btree_ptr *cpp; /* child ptr pointer */ 3096 union xfs_btree_key *kp; /* pointer to btree key */ 3097 union xfs_btree_ptr *pp; /* pointer to block addr */ 3098 union xfs_btree_ptr nptr; /* new block addr */ 3099 int level; /* btree level */ 3100 int error; /* error return code */ 3101 int i; /* loop counter */ 3102 3103 XFS_BTREE_STATS_INC(cur, newroot); 3104 3105 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3106 3107 level = cur->bc_nlevels - 1; 3108 3109 block = xfs_btree_get_iroot(cur); 3110 pp = xfs_btree_ptr_addr(cur, 1, block); 3111 3112 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3113 error = xfs_btree_alloc_block(cur, pp, &nptr, stat); 3114 if (error) 3115 goto error0; 3116 if (*stat == 0) 3117 return 0; 3118 3119 XFS_BTREE_STATS_INC(cur, alloc); 3120 3121 /* Copy the root into a real block. 
	 */
3122	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
3123	if (error)
3124		goto error0;
3125
3126	/*
3127	 * We can't just memcpy() the root in for CRC enabled btree blocks.
3128	 * In that case we also have to ensure that the blkno remains correct.
3129	 */
3130	memcpy(cblock, block, xfs_btree_block_len(cur));
3131	if (xfs_has_crc(cur->bc_mp)) {
3132		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
3133		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
3134			cblock->bb_u.l.bb_blkno = bno;
3135		else
3136			cblock->bb_u.s.bb_blkno = bno;
3137	}
3138
3139	be16_add_cpu(&block->bb_level, 1);
3140	xfs_btree_set_numrecs(block, 1);
3141	cur->bc_nlevels++;
3142	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
3143	cur->bc_levels[level + 1].ptr = 1;
3144
3145	kp = xfs_btree_key_addr(cur, 1, block);
3146	ckp = xfs_btree_key_addr(cur, 1, cblock);
3147	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
3148
3149	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3150	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
3151		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3152		if (error)
3153			goto error0;
3154	}
3155
3156	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
3157
3158	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
3159	if (error)
3160		goto error0;
3161
3162	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
3163
3164	xfs_iroot_realloc(cur->bc_ino.ip,
3165			  1 - xfs_btree_get_numrecs(cblock),
3166			  cur->bc_ino.whichfork);
3167
3168	xfs_btree_setbuf(cur, level, cbp);
3169
3170	/*
3171	 * Do all this logging at the end so that
3172	 * the root is at the right level.
3173	 */
3174	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3175	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3176	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3177
3178	*logflags |=
3179		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
3180	*stat = 1;
3181	return 0;
3182 error0:
3183	return error;
3184 }
3185
3186 static void
3187 xfs_btree_set_root(
3188	struct xfs_btree_cur		*cur,
3189	const union xfs_btree_ptr	*ptr,
3190	int				inc)
3191 {
3192	if (cur->bc_flags & XFS_BTREE_STAGING) {
3193		/* Update the btree root information for a per-AG fake root. */
3194		cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
3195		cur->bc_ag.afake->af_levels += inc;
3196	} else {
3197		cur->bc_ops->set_root(cur, ptr, inc);
3198	}
3199 }
3200
3201 /*
3202  * Allocate a new root block, fill it in.
3203  */
3204 STATIC int				/* error */
3205 xfs_btree_new_root(
3206	struct xfs_btree_cur	*cur,	/* btree cursor */
3207	int			*stat)	/* success/failure */
3208 {
3209	struct xfs_btree_block	*block;	/* one half of the old root block */
3210	struct xfs_buf		*bp;	/* buffer containing block */
3211	int			error;	/* error return value */
3212	struct xfs_buf		*lbp;	/* left buffer pointer */
3213	struct xfs_btree_block	*left;	/* left btree block */
3214	struct xfs_buf		*nbp;	/* new (root) buffer */
3215	struct xfs_btree_block	*new;	/* new (root) btree block */
3216	int			nptr;	/* new value for key index, 1 or 2 */
3217	struct xfs_buf		*rbp;	/* right buffer pointer */
3218	struct xfs_btree_block	*right;	/* right btree block */
3219	union xfs_btree_ptr	rptr;
3220	union xfs_btree_ptr	lptr;
3221
3222	XFS_BTREE_STATS_INC(cur, newroot);
3223
3224	/* initialise our start point from the cursor */
3225	xfs_btree_init_ptr_from_cur(cur, &rptr);
3226
3227	/* Allocate the new block. If we can't do it, we're toast. Give up.
*/ 3228 error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); 3229 if (error) 3230 goto error0; 3231 if (*stat == 0) 3232 goto out0; 3233 XFS_BTREE_STATS_INC(cur, alloc); 3234 3235 /* Set up the new block. */ 3236 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3237 if (error) 3238 goto error0; 3239 3240 /* Set the root in the holding structure increasing the level by 1. */ 3241 xfs_btree_set_root(cur, &lptr, 1); 3242 3243 /* 3244 * At the previous root level there are now two blocks: the old root, 3245 * and the new block generated when it was split. We don't know which 3246 * one the cursor is pointing at, so we set up variables "left" and 3247 * "right" for each case. 3248 */ 3249 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3250 3251 #ifdef DEBUG 3252 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3253 if (error) 3254 goto error0; 3255 #endif 3256 3257 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3258 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3259 /* Our block is left, pick up the right block. */ 3260 lbp = bp; 3261 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3262 left = block; 3263 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3264 if (error) 3265 goto error0; 3266 bp = rbp; 3267 nptr = 1; 3268 } else { 3269 /* Our block is right, pick up the left block. */ 3270 rbp = bp; 3271 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3272 right = block; 3273 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3274 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3275 if (error) 3276 goto error0; 3277 bp = lbp; 3278 nptr = 2; 3279 } 3280 3281 /* Fill in the new block's btree header and log it. */ 3282 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3283 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3284 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3285 !xfs_btree_ptr_is_null(cur, &rptr)); 3286 3287 /* Fill in the key data in the new root. */ 3288 if (xfs_btree_get_level(left) > 0) { 3289 /* 3290 * Get the keys for the left block's keys and put them directly 3291 * in the parent block. Do the same for the right block. 3292 */ 3293 xfs_btree_get_node_keys(cur, left, 3294 xfs_btree_key_addr(cur, 1, new)); 3295 xfs_btree_get_node_keys(cur, right, 3296 xfs_btree_key_addr(cur, 2, new)); 3297 } else { 3298 /* 3299 * Get the keys for the left block's records and put them 3300 * directly in the parent block. Do the same for the right 3301 * block. 3302 */ 3303 xfs_btree_get_leaf_keys(cur, left, 3304 xfs_btree_key_addr(cur, 1, new)); 3305 xfs_btree_get_leaf_keys(cur, right, 3306 xfs_btree_key_addr(cur, 2, new)); 3307 } 3308 xfs_btree_log_keys(cur, nbp, 1, 2); 3309 3310 /* Fill in the pointer data in the new root. */ 3311 xfs_btree_copy_ptrs(cur, 3312 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3313 xfs_btree_copy_ptrs(cur, 3314 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3315 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3316 3317 /* Fix up the cursor. 
*/ 3318 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3319 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3320 cur->bc_nlevels++; 3321 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3322 *stat = 1; 3323 return 0; 3324 error0: 3325 return error; 3326 out0: 3327 *stat = 0; 3328 return 0; 3329 } 3330 3331 STATIC int 3332 xfs_btree_make_block_unfull( 3333 struct xfs_btree_cur *cur, /* btree cursor */ 3334 int level, /* btree level */ 3335 int numrecs,/* # of recs in block */ 3336 int *oindex,/* old tree index */ 3337 int *index, /* new tree index */ 3338 union xfs_btree_ptr *nptr, /* new btree ptr */ 3339 struct xfs_btree_cur **ncur, /* new btree cursor */ 3340 union xfs_btree_key *key, /* key of new block */ 3341 int *stat) 3342 { 3343 int error = 0; 3344 3345 if (xfs_btree_at_iroot(cur, level)) { 3346 struct xfs_inode *ip = cur->bc_ino.ip; 3347 3348 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3349 /* A root block that can be made bigger. */ 3350 xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); 3351 *stat = 1; 3352 } else { 3353 /* A root block that needs replacing */ 3354 int logflags = 0; 3355 3356 error = xfs_btree_new_iroot(cur, &logflags, stat); 3357 if (error || *stat == 0) 3358 return error; 3359 3360 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3361 } 3362 3363 return 0; 3364 } 3365 3366 /* First, try shifting an entry to the right neighbor. */ 3367 error = xfs_btree_rshift(cur, level, stat); 3368 if (error || *stat) 3369 return error; 3370 3371 /* Next, try shifting an entry to the left neighbor. */ 3372 error = xfs_btree_lshift(cur, level, stat); 3373 if (error) 3374 return error; 3375 3376 if (*stat) { 3377 *oindex = *index = cur->bc_levels[level].ptr; 3378 return 0; 3379 } 3380 3381 /* 3382 * Next, try splitting the current block in half. 3383 * 3384 * If this works we have to re-set our variables because we 3385 * could be in a different block now. 3386 */ 3387 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3388 if (error || *stat == 0) 3389 return error; 3390 3391 3392 *index = cur->bc_levels[level].ptr; 3393 return 0; 3394 } 3395 3396 /* 3397 * Insert one record/level. Return information to the caller 3398 * allowing the next level up to proceed if necessary. 3399 */ 3400 STATIC int 3401 xfs_btree_insrec( 3402 struct xfs_btree_cur *cur, /* btree cursor */ 3403 int level, /* level to insert record at */ 3404 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3405 union xfs_btree_rec *rec, /* record to insert */ 3406 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3407 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3408 int *stat) /* success/failure */ 3409 { 3410 struct xfs_btree_block *block; /* btree block */ 3411 struct xfs_buf *bp; /* buffer for block */ 3412 union xfs_btree_ptr nptr; /* new block ptr */ 3413 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3414 union xfs_btree_key nkey; /* new block key */ 3415 union xfs_btree_key *lkey; 3416 int optr; /* old key/record index */ 3417 int ptr; /* key/record index */ 3418 int numrecs;/* number of records */ 3419 int error; /* error return value */ 3420 int i; 3421 xfs_daddr_t old_bn; 3422 3423 ncur = NULL; 3424 lkey = &nkey; 3425 3426 /* 3427 * If we have an external root pointer, and we've made it to the 3428 * root level, allocate a new root block and we're done. 
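 *
 * In other words, the two tree-growth strategies split like this
 * (sketch):
 *
 *	AG-rooted btree (external root pointer):
 *		level == bc_nlevels	-> xfs_btree_new_root() (below)
 *	inode-rooted btree:
 *		full in-inode root	-> grow the root in place, or move
 *					   it into a real block, both via
 *					   xfs_btree_make_block_unfull()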
3429 */ 3430 if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && 3431 level >= cur->bc_nlevels) { 3432 error = xfs_btree_new_root(cur, stat); 3433 xfs_btree_set_ptr_null(cur, ptrp); 3434 3435 return error; 3436 } 3437 3438 /* If we're off the left edge, return failure. */ 3439 ptr = cur->bc_levels[level].ptr; 3440 if (ptr == 0) { 3441 *stat = 0; 3442 return 0; 3443 } 3444 3445 optr = ptr; 3446 3447 XFS_BTREE_STATS_INC(cur, insrec); 3448 3449 /* Get pointers to the btree buffer and block. */ 3450 block = xfs_btree_get_block(cur, level, &bp); 3451 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3452 numrecs = xfs_btree_get_numrecs(block); 3453 3454 #ifdef DEBUG 3455 error = xfs_btree_check_block(cur, block, level, bp); 3456 if (error) 3457 goto error0; 3458 3459 /* Check that the new entry is being inserted in the right place. */ 3460 if (ptr <= numrecs) { 3461 if (level == 0) { 3462 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3463 xfs_btree_rec_addr(cur, ptr, block))); 3464 } else { 3465 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3466 xfs_btree_key_addr(cur, ptr, block))); 3467 } 3468 } 3469 #endif 3470 3471 /* 3472 * If the block is full, we can't insert the new entry until we 3473 * make the block un-full. 3474 */ 3475 xfs_btree_set_ptr_null(cur, &nptr); 3476 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3477 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3478 &optr, &ptr, &nptr, &ncur, lkey, stat); 3479 if (error || *stat == 0) 3480 goto error0; 3481 } 3482 3483 /* 3484 * The current block may have changed if the block was 3485 * previously full and we have just made space in it. 3486 */ 3487 block = xfs_btree_get_block(cur, level, &bp); 3488 numrecs = xfs_btree_get_numrecs(block); 3489 3490 #ifdef DEBUG 3491 error = xfs_btree_check_block(cur, block, level, bp); 3492 if (error) 3493 goto error0; 3494 #endif 3495 3496 /* 3497 * At this point we know there's room for our new entry in the block 3498 * we're pointing at. 3499 */ 3500 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3501 3502 if (level > 0) { 3503 /* It's a nonleaf. make a hole in the keys and ptrs */ 3504 union xfs_btree_key *kp; 3505 union xfs_btree_ptr *pp; 3506 3507 kp = xfs_btree_key_addr(cur, ptr, block); 3508 pp = xfs_btree_ptr_addr(cur, ptr, block); 3509 3510 for (i = numrecs - ptr; i >= 0; i--) { 3511 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3512 if (error) 3513 goto error0; 3514 } 3515 3516 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3517 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3518 3519 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3520 if (error) 3521 goto error0; 3522 3523 /* Now put the new data in, bump numrecs and log it. */ 3524 xfs_btree_copy_keys(cur, kp, key, 1); 3525 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3526 numrecs++; 3527 xfs_btree_set_numrecs(block, numrecs); 3528 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3529 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3530 #ifdef DEBUG 3531 if (ptr < numrecs) { 3532 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3533 xfs_btree_key_addr(cur, ptr + 1, block))); 3534 } 3535 #endif 3536 } else { 3537 /* It's a leaf. make a hole in the records */ 3538 union xfs_btree_rec *rp; 3539 3540 rp = xfs_btree_rec_addr(cur, ptr, block); 3541 3542 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3543 3544 /* Now put the new data in, bump numrecs and log it. 
*/ 3545 xfs_btree_copy_recs(cur, rp, rec, 1); 3546 xfs_btree_set_numrecs(block, ++numrecs); 3547 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3548 #ifdef DEBUG 3549 if (ptr < numrecs) { 3550 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3551 xfs_btree_rec_addr(cur, ptr + 1, block))); 3552 } 3553 #endif 3554 } 3555 3556 /* Log the new number of records in the btree header. */ 3557 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3558 3559 /* 3560 * Update btree keys to reflect the newly added record or keyptr. 3561 * There are three cases here to be aware of. Normally, all we have to 3562 * do is walk towards the root, updating keys as necessary. 3563 * 3564 * If the caller had us target a full block for the insertion, we dealt 3565 * with that by calling the _make_block_unfull function. If the 3566 * "make unfull" function splits the block, it'll hand us back the key 3567 * and pointer of the new block. We haven't yet added the new block to 3568 * the next level up, so if we decide to add the new record to the new 3569 * block (bp->b_bn != old_bn), we have to update the caller's pointer 3570 * so that the caller adds the new block with the correct key. 3571 * 3572 * However, there is a third possibility-- if the selected block is the 3573 * root block of an inode-rooted btree and cannot be expanded further, 3574 * the "make unfull" function moves the root block contents to a new 3575 * block and updates the root block to point to the new block. In this 3576 * case, no block pointer is passed back because the block has already 3577 * been added to the btree. In this case, we need to use the regular 3578 * key update function, just like the first case. This is critical for 3579 * overlapping btrees, because the high key must be updated to reflect 3580 * the entire tree, not just the subtree accessible through the first 3581 * child of the root (which is now two levels down from the root). 3582 */ 3583 if (!xfs_btree_ptr_is_null(cur, &nptr) && 3584 bp && xfs_buf_daddr(bp) != old_bn) { 3585 xfs_btree_get_keys(cur, block, lkey); 3586 } else if (xfs_btree_needs_key_update(cur, optr)) { 3587 error = xfs_btree_update_keys(cur, level); 3588 if (error) 3589 goto error0; 3590 } 3591 3592 /* 3593 * Return the new block number, if any. 3594 * If there is one, give back a record value and a cursor too. 3595 */ 3596 *ptrp = nptr; 3597 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3598 xfs_btree_copy_keys(cur, key, lkey, 1); 3599 *curp = ncur; 3600 } 3601 3602 *stat = 1; 3603 return 0; 3604 3605 error0: 3606 if (ncur) 3607 xfs_btree_del_cursor(ncur, error); 3608 return error; 3609 } 3610 3611 /* 3612 * Insert the record at the point referenced by cur. 3613 * 3614 * A multi-level split of the tree on insert will invalidate the original 3615 * cursor. All callers of this function should assume that the cursor is 3616 * no longer valid and revalidate it. 
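 *
 * A safe caller therefore looks like this sketch (error handling
 * elided):
 *
 *	error = xfs_btree_insert(cur, &stat);
 *	...
 *	// the cursor's path may be stale now; look the record up again
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);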
3617	 */
3618 int
3619 xfs_btree_insert(
3620	struct xfs_btree_cur	*cur,
3621	int			*stat)
3622 {
3623	int			error;	/* error return value */
3624	int			i;	/* result value, 0 for failure */
3625	int			level;	/* current level number in btree */
3626	union xfs_btree_ptr	nptr;	/* new block number (split result) */
3627	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
3628	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
3629	union xfs_btree_key	bkey;	/* key of block to insert */
3630	union xfs_btree_key	*key;
3631	union xfs_btree_rec	rec;	/* record to insert */
3632
3633	level = 0;
3634	ncur = NULL;
3635	pcur = cur;
3636	key = &bkey;
3637
3638	xfs_btree_set_ptr_null(cur, &nptr);
3639
3640	/* Make a key out of the record data to be inserted, and save it. */
3641	cur->bc_ops->init_rec_from_cur(cur, &rec);
3642	cur->bc_ops->init_key_from_rec(key, &rec);
3643
3644	/*
3645	 * Loop going up the tree, starting at the leaf level.
3646	 * Stop when we don't get a split block; that must mean the
3647	 * insert is finished with this level.
3648	 */
3649	do {
3650		/*
3651		 * Insert nrec/nptr into this level of the tree.
3652		 * Note if we fail, nptr will be null.
3653		 */
3654		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
3655				&ncur, &i);
3656		if (error) {
3657			if (pcur != cur)
3658				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3659			goto error0;
3660		}
3661
3662		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
3663			xfs_btree_mark_sick(cur);
3664			error = -EFSCORRUPTED;
3665			goto error0;
3666		}
3667		level++;
3668
3669		/*
3670		 * See if the cursor we just used is trash.
3671		 * Can't trash the caller's cursor, but otherwise we should
3672		 * if ncur is a new cursor or we're about to be done.
3673		 */
3674		if (pcur != cur &&
3675		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3676			/* Save the state from the cursor before we trash it */
3677			if (cur->bc_ops->update_cursor &&
3678			    !(cur->bc_flags & XFS_BTREE_STAGING))
3679				cur->bc_ops->update_cursor(pcur, cur);
3680			cur->bc_nlevels = pcur->bc_nlevels;
3681			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3682		}
3683		/* If we got a new cursor, switch to it. */
3684		if (ncur) {
3685			pcur = ncur;
3686			ncur = NULL;
3687		}
3688	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3689
3690	*stat = i;
3691	return 0;
3692 error0:
3693	return error;
3694 }
3695
3696 /*
3697  * Try to merge a non-leaf block back into the inode root.
3698  *
3699  * Note: the killroot name comes from the fact that we're effectively
3700  * killing the old root block. But because we can't just delete the
3701  * inode we have to copy the single block it was pointing to into the
3702  * inode.
3703  */
3704 STATIC int
3705 xfs_btree_kill_iroot(
3706	struct xfs_btree_cur	*cur)
3707 {
3708	int			whichfork = cur->bc_ino.whichfork;
3709	struct xfs_inode	*ip = cur->bc_ino.ip;
3710	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
3711	struct xfs_btree_block	*block;
3712	struct xfs_btree_block	*cblock;
3713	union xfs_btree_key	*kp;
3714	union xfs_btree_key	*ckp;
3715	union xfs_btree_ptr	*pp;
3716	union xfs_btree_ptr	*cpp;
3717	struct xfs_buf		*cbp;
3718	int			level;
3719	int			index;
3720	int			numrecs;
3721	int			error;
3722 #ifdef DEBUG
3723	union xfs_btree_ptr	ptr;
3724 #endif
3725	int			i;
3726
3727	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
3728	ASSERT(cur->bc_nlevels > 1);
3729
3730	/*
3731	 * Don't deal with the case where the root block needs to be a leaf.
3732	 * We're just going to turn the thing back into extents anyway.
3733	 */
3734	level = cur->bc_nlevels - 1;
3735	if (level == 1)
3736		goto out0;
3737
3738	/*
3739	 * Give up if the root has multiple children.
3740	 */
3741	block = xfs_btree_get_iroot(cur);
3742	if (xfs_btree_get_numrecs(block) != 1)
3743		goto out0;
3744
3745	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3746	numrecs = xfs_btree_get_numrecs(cblock);
3747
3748	/*
3749	 * Only do this if the next level will fit in the inode root.
3750	 * Its data must be copied up to the inode; instead of freeing
3751	 * the root, we free the next level down.
3752	 */
3753	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3754		goto out0;
3755
3756	XFS_BTREE_STATS_INC(cur, killroot);
3757
3758 #ifdef DEBUG
3759	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3760	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3761	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3762	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3763 #endif
3764
3765	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3766	if (index) {
3767		xfs_iroot_realloc(cur->bc_ino.ip, index,
3768				  cur->bc_ino.whichfork);
3769		block = ifp->if_broot;
3770	}
3771
3772	be16_add_cpu(&block->bb_numrecs, index);
3773	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3774
3775	kp = xfs_btree_key_addr(cur, 1, block);
3776	ckp = xfs_btree_key_addr(cur, 1, cblock);
3777	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3778
3779	pp = xfs_btree_ptr_addr(cur, 1, block);
3780	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3781
3782	for (i = 0; i < numrecs; i++) {
3783		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3784		if (error)
3785			return error;
3786	}
3787
3788	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3789
3790	error = xfs_btree_free_block(cur, cbp);
3791	if (error)
3792		return error;
3793
3794	cur->bc_levels[level - 1].bp = NULL;
3795	be16_add_cpu(&block->bb_level, -1);
3796	xfs_trans_log_inode(cur->bc_tp, ip,
3797		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3798	cur->bc_nlevels--;
3799 out0:
3800	return 0;
3801 }
3802
3803 /*
3804  * Kill the current root node, and replace it with its only child node.
3805  */
3806 STATIC int
3807 xfs_btree_kill_root(
3808	struct xfs_btree_cur	*cur,
3809	struct xfs_buf		*bp,
3810	int			level,
3811	union xfs_btree_ptr	*newroot)
3812 {
3813	int			error;
3814
3815	XFS_BTREE_STATS_INC(cur, killroot);
3816
3817	/*
3818	 * Update the root pointer, decreasing the level by 1, and then
3819	 * free the old root.
3820	 */
3821	xfs_btree_set_root(cur, newroot, -1);
3822
3823	error = xfs_btree_free_block(cur, bp);
3824	if (error)
3825		return error;
3826
3827	cur->bc_levels[level].bp = NULL;
3828	cur->bc_levels[level].ra = 0;
3829	cur->bc_nlevels--;
3830
3831	return 0;
3832 }
3833
3834 STATIC int
3835 xfs_btree_dec_cursor(
3836	struct xfs_btree_cur	*cur,
3837	int			level,
3838	int			*stat)
3839 {
3840	int			error;
3841	int			i;
3842
3843	if (level > 0) {
3844		error = xfs_btree_decrement(cur, level, &i);
3845		if (error)
3846			return error;
3847	}
3848
3849	*stat = 1;
3850	return 0;
3851 }
3852
3853 /*
3854  * Single level of the btree record deletion routine.
3855  * Delete record pointed to by cur/level.
3856  * Remove the record from its block then rebalance the tree.
3857  * Return 0 for error, 1 for done, 2 to go on to the next level.
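 *
 * The per-level result is returned through *stat (0 fail, 1 done,
 * 2 go on to the next level), so the top level delete routine drives
 * this function with a loop of this shape (sketch):
 *
 *	for (level = 0, i = 2; i == 2; level++) {
 *		error = xfs_btree_delrec(cur, level, &i);
 *		if (error)
 *			break;
 *	}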
3858 */ 3859 STATIC int /* error */ 3860 xfs_btree_delrec( 3861 struct xfs_btree_cur *cur, /* btree cursor */ 3862 int level, /* level removing record from */ 3863 int *stat) /* fail/done/go-on */ 3864 { 3865 struct xfs_btree_block *block; /* btree block */ 3866 union xfs_btree_ptr cptr; /* current block ptr */ 3867 struct xfs_buf *bp; /* buffer for block */ 3868 int error; /* error return value */ 3869 int i; /* loop counter */ 3870 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3871 struct xfs_buf *lbp; /* left buffer pointer */ 3872 struct xfs_btree_block *left; /* left btree block */ 3873 int lrecs = 0; /* left record count */ 3874 int ptr; /* key/record index */ 3875 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3876 struct xfs_buf *rbp; /* right buffer pointer */ 3877 struct xfs_btree_block *right; /* right btree block */ 3878 struct xfs_btree_block *rrblock; /* right-right btree block */ 3879 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3880 int rrecs = 0; /* right record count */ 3881 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3882 int numrecs; /* temporary numrec count */ 3883 3884 tcur = NULL; 3885 3886 /* Get the index of the entry being deleted, check for nothing there. */ 3887 ptr = cur->bc_levels[level].ptr; 3888 if (ptr == 0) { 3889 *stat = 0; 3890 return 0; 3891 } 3892 3893 /* Get the buffer & block containing the record or key/ptr. */ 3894 block = xfs_btree_get_block(cur, level, &bp); 3895 numrecs = xfs_btree_get_numrecs(block); 3896 3897 #ifdef DEBUG 3898 error = xfs_btree_check_block(cur, block, level, bp); 3899 if (error) 3900 goto error0; 3901 #endif 3902 3903 /* Fail if we're off the end of the block. */ 3904 if (ptr > numrecs) { 3905 *stat = 0; 3906 return 0; 3907 } 3908 3909 XFS_BTREE_STATS_INC(cur, delrec); 3910 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3911 3912 /* Excise the entries being deleted. */ 3913 if (level > 0) { 3914 /* It's a nonleaf. operate on keys and ptrs */ 3915 union xfs_btree_key *lkp; 3916 union xfs_btree_ptr *lpp; 3917 3918 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3919 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3920 3921 for (i = 0; i < numrecs - ptr; i++) { 3922 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 3923 if (error) 3924 goto error0; 3925 } 3926 3927 if (ptr < numrecs) { 3928 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3929 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3930 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3931 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3932 } 3933 } else { 3934 /* It's a leaf. operate on records */ 3935 if (ptr < numrecs) { 3936 xfs_btree_shift_recs(cur, 3937 xfs_btree_rec_addr(cur, ptr + 1, block), 3938 -1, numrecs - ptr); 3939 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3940 } 3941 } 3942 3943 /* 3944 * Decrement and log the number of entries in the block. 3945 */ 3946 xfs_btree_set_numrecs(block, --numrecs); 3947 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3948 3949 /* 3950 * We're at the root level. First, shrink the root block in-memory. 3951 * Try to get rid of the next level down. If we can't then there's 3952 * nothing left to do. 

	/*
	 * We're at the root level. First, shrink the root block in-memory.
	 * Try to get rid of the next level down. If we can't then there's
	 * nothing left to do.
	 */
	if (xfs_btree_at_iroot(cur, level)) {
		xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);

		error = xfs_btree_kill_iroot(cur);
		if (error)
			goto error0;

		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		*stat = 1;
		return 0;
	}

	/*
	 * If this is the root level, and there's only one entry left, and it's
	 * NOT the leaf level, then we can get rid of this level.
	 */
	if (level == cur->bc_nlevels - 1) {
		if (numrecs == 1 && level > 0) {
			union xfs_btree_ptr	*pp;
			/*
			 * pp is still set to the first pointer in the block.
			 * Make it the new root of the btree.
			 */
			pp = xfs_btree_ptr_addr(cur, 1, block);
			error = xfs_btree_kill_root(cur, bp, level, pp);
			if (error)
				goto error0;
		} else if (level > 0) {
			error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
		}
		*stat = 1;
		return 0;
	}

	/*
	 * If we deleted the leftmost entry in the block, update the
	 * key values above us in the tree.
	 */
	if (xfs_btree_needs_key_update(cur, ptr)) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If the number of records remaining in the block is at least
	 * the minimum, we're done.
	 */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		return 0;
	}

	/*
	 * Otherwise, we have to move some records around to keep the
	 * tree balanced. Look at the left and right sibling blocks to
	 * see if we can re-balance by moving only one record.
	 */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);

	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		/*
		 * One child of root, need to get a chance to copy its contents
		 * into the root and delete it. Can't go up to next level,
		 * there's nothing to delete there.
		 */
		if (xfs_btree_ptr_is_null(cur, &rptr) &&
		    xfs_btree_ptr_is_null(cur, &lptr) &&
		    level == cur->bc_nlevels - 2) {
			error = xfs_btree_kill_iroot(cur);
			if (!error)
				error = xfs_btree_dec_cursor(cur, level, stat);
			if (error)
				goto error0;
			return 0;
		}
	}

	ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
	       !xfs_btree_ptr_is_null(cur, &lptr));

	/*
	 * Duplicate the cursor so our btree manipulations here won't
	 * disrupt the next level up.
	 */
	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		goto error0;
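
	/*
	 * A worked example of the rebalancing decision below, using an
	 * illustrative geometry of minrecs = 2 and maxrecs = 4 (the real
	 * values come from cur->bc_ops->get_minrecs/get_maxrecs): if this
	 * block just dropped to 1 record and the right sibling holds 3,
	 * then 3 - 1 = 2 >= minrecs, so one entry can be left-shifted into
	 * us and both blocks stay legal. If the sibling holds only 2,
	 * shifting would leave it at 1 < minrecs, so we fall through to
	 * the join path instead: 1 + 2 = 3 <= maxrecs, and the two blocks
	 * are merged.
	 */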

	/*
	 * If there's a right sibling, see if it's ok to shift an entry
	 * out of it.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/*
		 * Move the temp cursor to the last entry in the next block.
		 * Actually any entry but the first would suffice.
		 */
		i = xfs_btree_lastrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_increment(tcur, level, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		i = xfs_btree_lastrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		/* Grab a pointer to the block. */
		right = xfs_btree_get_block(tcur, level, &rbp);
#ifdef DEBUG
		error = xfs_btree_check_block(tcur, right, level, rbp);
		if (error)
			goto error0;
#endif
		/* Grab the current block number, for future use. */
		xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);

		/*
		 * If right block is full enough so that removing one entry
		 * won't make it too empty, and left-shifting an entry out
		 * of right to us works, we're done.
		 */
		if (xfs_btree_get_numrecs(right) - 1 >=
		    cur->bc_ops->get_minrecs(tcur, level)) {
			error = xfs_btree_lshift(tcur, level, &i);
			if (error)
				goto error0;
			if (i) {
				ASSERT(xfs_btree_get_numrecs(block) >=
				       cur->bc_ops->get_minrecs(tcur, level));

				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
				tcur = NULL;

				error = xfs_btree_dec_cursor(cur, level, stat);
				if (error)
					goto error0;
				return 0;
			}
		}

		/*
		 * Otherwise, grab the number of records in right for
		 * future reference, and fix up the temp cursor to point
		 * to our block again (last record).
		 */
		rrecs = xfs_btree_get_numrecs(right);
		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
			i = xfs_btree_firstrec(tcur, level);
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				error = -EFSCORRUPTED;
				goto error0;
			}

			error = xfs_btree_decrement(tcur, level, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
				xfs_btree_mark_sick(cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}

	/*
	 * If there's a left sibling, see if it's ok to shift an entry
	 * out of it.
	 */
	if (!xfs_btree_ptr_is_null(cur, &lptr)) {
		/*
		 * Move the temp cursor to the first entry in the
		 * previous block.
		 */
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_btree_decrement(tcur, level, &i);
		if (error)
			goto error0;
		i = xfs_btree_firstrec(tcur, level);
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		/* Grab a pointer to the block. */
		left = xfs_btree_get_block(tcur, level, &lbp);
#ifdef DEBUG
		error = xfs_btree_check_block(cur, left, level, lbp);
		if (error)
			goto error0;
#endif
		/* Grab the current block number, for future use. */
		xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);

		/*
		 * If left block is full enough so that removing one entry
		 * won't make it too empty, and right-shifting an entry out
		 * of left to us works, we're done.
		 */
		if (xfs_btree_get_numrecs(left) - 1 >=
		    cur->bc_ops->get_minrecs(tcur, level)) {
			error = xfs_btree_rshift(tcur, level, &i);
			if (error)
				goto error0;
			if (i) {
				ASSERT(xfs_btree_get_numrecs(block) >=
				       cur->bc_ops->get_minrecs(tcur, level));
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
				tcur = NULL;
				if (level == 0)
					cur->bc_levels[0].ptr++;

				*stat = 1;
				return 0;
			}
		}

		/*
		 * Otherwise, grab the number of records in left for
		 * future reference.
		 */
		lrecs = xfs_btree_get_numrecs(left);
	}

	/* Delete the temp cursor, we're done with it. */
	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	tcur = NULL;

	/* If here, we need to do a join to keep the tree balanced. */
	ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));

	if (!xfs_btree_ptr_is_null(cur, &lptr) &&
	    lrecs + xfs_btree_get_numrecs(block) <=
			cur->bc_ops->get_maxrecs(cur, level)) {
		/*
		 * Set "right" to be the starting block,
		 * "left" to be the left neighbor.
		 */
		rptr = cptr;
		right = block;
		rbp = bp;
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;

	/*
	 * If that won't work, see if we can join with the right neighbor block.
	 */
	} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
		   rrecs + xfs_btree_get_numrecs(block) <=
			cur->bc_ops->get_maxrecs(cur, level)) {
		/*
		 * Set "left" to be the starting block,
		 * "right" to be the right neighbor.
		 */
		lptr = cptr;
		left = block;
		lbp = bp;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;

	/*
	 * Otherwise, we can't fix the imbalance.
	 * Just return. This is probably a logic error, but it's not fatal.
	 */
	} else {
		error = xfs_btree_dec_cursor(cur, level, stat);
		if (error)
			goto error0;
		return 0;
	}

	rrecs = xfs_btree_get_numrecs(right);
	lrecs = xfs_btree_get_numrecs(left);

	/*
	 * We're now going to join "left" and "right" by moving all the stuff
	 * in "right" to "left" and deleting "right".
	 */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
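
	/*
	 * Continuing the illustrative geometry above (maxrecs = 4): with
	 * lrecs = 2 and rrecs = 2, the merged block holds 2 + 2 = 4
	 * records, exactly maxrecs, so the join is legal. The right
	 * block's entries land at indices lrecs + 1 .. lrecs + rrecs of
	 * the left block, which is why the copy and log calls below start
	 * at lrecs + 1.
	 */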
	if (level > 0) {
		/* It's a non-leaf. Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
		lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		for (i = 1; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
			if (error)
				goto error0;
		}

		xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
		xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);

		xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
		xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
	} else {
		/* It's a leaf. Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
		xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
	}

	XFS_BTREE_STATS_INC(cur, join);

	/*
	 * Fix up the number of records and right block pointer in the
	 * surviving block, and log it.
	 */
	xfs_btree_set_numrecs(left, lrecs + rrecs);
	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/* If there is a right sibling, point it to the remaining block. */
	xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &cptr)) {
		error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}

	/* Free the deleted block. */
	error = xfs_btree_free_block(cur, rbp);
	if (error)
		goto error0;

	/*
	 * If we joined with the left neighbor, set the buffer in the
	 * cursor to the left block, and fix up the index.
	 */
	if (bp != lbp) {
		cur->bc_levels[level].bp = lbp;
		cur->bc_levels[level].ptr += lrecs;
		cur->bc_levels[level].ra = 0;
	}
	/*
	 * If we joined with the right neighbor and there's a level above
	 * us, increment the cursor at that level.
	 */
	else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
		 level + 1 < cur->bc_nlevels) {
		error = xfs_btree_increment(cur, level + 1, &i);
		if (error)
			goto error0;
	}

	/*
	 * Readjust the ptr at this level if it's not a leaf, since it's
	 * still pointing at the deletion point, which makes the cursor
	 * inconsistent. If this makes the ptr 0, the caller fixes it up.
	 * We can't use decrement because it would change the next level up.
	 */
	if (level > 0)
		cur->bc_levels[level].ptr--;

	/*
	 * We combined blocks, so we have to update the parent keys if the
	 * btree supports overlapped intervals. However,
	 * bc_levels[level + 1].ptr points to the old block so that the caller
	 * knows which record to delete. Therefore, the caller must be savvy
	 * enough to call updkeys for us if we return stat == 2. The other
	 * exit points from this function don't require deletions further up
	 * the tree, so they can call updkeys directly.
	 */

	/* Return value means the next level up has something to do. */
	*stat = 2;
	return 0;

error0:
	if (tcur)
		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Delete the record pointed to by cur.
 * The cursor refers to the place where the record was (could be inserted)
 * when the operation returns.
 */
int					/* error */
xfs_btree_delete(
	struct xfs_btree_cur	*cur,
	int			*stat)	/* success/failure */
{
	int			error;	/* error return value */
	int			level;
	int			i;
	bool			joined = false;

	/*
	 * Go up the tree, starting at leaf level.
	 *
	 * If 2 is returned then a join was done; go to the next level.
	 * Otherwise we are done.
	 */
	for (level = 0, i = 2; i == 2; level++) {
		error = xfs_btree_delrec(cur, level, &i);
		if (error)
			goto error0;
		if (i == 2)
			joined = true;
	}

	/*
	 * If we combined blocks as part of deleting the record, delrec won't
	 * have updated the parent high keys so we have to do that here.
	 */
	if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
		error = xfs_btree_updkeys_force(cur, 0);
		if (error)
			goto error0;
	}

	if (i == 0) {
		for (level = 1; level < cur->bc_nlevels; level++) {
			if (cur->bc_levels[level].ptr == 0) {
				error = xfs_btree_decrement(cur, level, &i);
				if (error)
					goto error0;
				break;
			}
		}
	}

	*stat = i;
	return 0;
error0:
	return error;
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_btree_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	union xfs_btree_rec	**recp,	/* output: btree record */
	int			*stat)	/* output: success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer pointer */
	int			ptr;	/* record number */
#ifdef DEBUG
	int			error;	/* error return value */
#endif

	ptr = cur->bc_levels[0].ptr;
	block = xfs_btree_get_block(cur, 0, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, 0, bp);
	if (error)
		return error;
#endif

	/*
	 * Off the right end or left end, return failure.
	 */
	if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
		*stat = 0;
		return 0;
	}

	/*
	 * Point to the record and extract its data.
	 */
	*recp = xfs_btree_rec_addr(cur, ptr, block);
	*stat = 1;
	return 0;
}

/* Visit a block in a btree. */
STATIC int
xfs_btree_visit_block(
	struct xfs_btree_cur		*cur,
	int				level,
	xfs_btree_visit_blocks_fn	fn,
	void				*data)
{
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	union xfs_btree_ptr		rptr, bufptr;
	int				error;

	/* do right sibling readahead */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
	block = xfs_btree_get_block(cur, level, &bp);

	/* process the block */
	error = fn(cur, level, data);
	if (error)
		return error;

	/* now read the right sibling block for the next iteration */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &rptr))
		return -ENOENT;

	/*
	 * We only visit blocks once in this walk, so we have to avoid the
	 * internal xfs_btree_lookup_get_block() optimisation where it will
	 * return the same block without checking if the right sibling points
	 * back to us and creates a cyclic reference in the btree.
	 */
	xfs_btree_buf_to_ptr(cur, bp, &bufptr);
	if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}

/* Visit every block in a btree. */
int
xfs_btree_visit_blocks(
	struct xfs_btree_cur		*cur,
	xfs_btree_visit_blocks_fn	fn,
	unsigned int			flags,
	void				*data)
{
	union xfs_btree_ptr		lptr;
	int				level;
	struct xfs_btree_block		*block = NULL;
	int				error = 0;

	xfs_btree_init_ptr_from_cur(cur, &lptr);

	/* for each level */
	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
		/* grab the left hand block */
		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
		if (error)
			return error;

		/* readahead the left most block for the next level down */
		if (level > 0) {
			union xfs_btree_ptr	*ptr;

			ptr = xfs_btree_ptr_addr(cur, 1, block);
			xfs_btree_readahead_ptr(cur, ptr, 1);

			/* save for the next iteration of the loop */
			xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);

			if (!(flags & XFS_BTREE_VISIT_LEAVES))
				continue;
		} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
			continue;
		}

		/* for each buffer in the level */
		do {
			error = xfs_btree_visit_block(cur, level, fn, data);
		} while (!error);

		if (error != -ENOENT)
			return error;
	}

	return 0;
}
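
/*
 * A minimal usage sketch for xfs_btree_visit_blocks(); the function names
 * here are hypothetical and not part of this file. The callback is invoked
 * once per visited block with the level as an argument, so a caller can
 * filter on level itself; the -ENOENT produced by walking off the right
 * edge of a level is consumed by xfs_btree_visit_blocks() above. The
 * in-tree xfs_btree_count_blocks() later in this file is the canonical
 * example of this pattern.
 */
static int
xfs_example_count_leaf_blocks_helper(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	unsigned long long	*nr_leaves = data;

	/* Level 0 blocks are the leaf (record) blocks. */
	if (level == 0)
		(*nr_leaves)++;
	return 0;
}

static int
xfs_example_count_leaf_blocks(
	struct xfs_btree_cur	*cur,
	unsigned long long	*nr_leaves)
{
	*nr_leaves = 0;
	return xfs_btree_visit_blocks(cur, xfs_example_count_leaf_blocks_helper,
			XFS_BTREE_VISIT_ALL, nr_leaves);
}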

/*
 * Change the owner of a btree.
 *
 * The mechanism we use here is ordered buffer logging. Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer in
 * a full size btree as that may be more space than we can fit in the log....
 *
 * We do the btree walk in the most optimal manner possible - we have sibling
 * pointers so we can just walk all the blocks on each level from left to right
 * in a single pass, and then move to the next level and do the same. We can
 * also do readahead on the sibling pointers to get IO moving more quickly,
 * though for slow disks this is unlikely to make much difference to
 * performance as the amount of CPU work we have to do before moving to the
 * next block is relatively small.
 *
 * For each btree block that we load, modify the owner appropriately, set the
 * buffer as an ordered buffer and log it appropriately. We need to ensure that
 * we mark the region we change dirty so that if the buffer is relogged in
 * a subsequent transaction the changes we make here as an ordered buffer are
 * correctly relogged in that transaction. If we are in recovery context, then
 * just queue the modified buffer as a delayed write buffer so the transaction
 * recovery completion writes the changes to disk.
 */
struct xfs_btree_block_change_owner_info {
	uint64_t		new_owner;
	struct list_head	*buffer_list;
};

static int
xfs_btree_block_change_owner(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	struct xfs_btree_block_change_owner_info	*bbcoi = data;
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;

	/* modify the owner */
	block = xfs_btree_get_block(cur, level, &bp);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
			return 0;
		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
	} else {
		if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
			return 0;
		block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
	}

	/*
	 * If the block is a root block hosted in an inode, we might not have a
	 * buffer pointer here and we shouldn't attempt to log the change as
	 * the information is already held in the inode and discarded when the
	 * root block is formatted into the on-disk inode fork. We still
	 * change it, though, so everything is consistent in memory.
	 */
	if (!bp) {
		ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
		ASSERT(level == cur->bc_nlevels - 1);
		return 0;
	}

	if (cur->bc_tp) {
		if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
			return -EAGAIN;
		}
	} else {
		xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
	}

	return 0;
}

int
xfs_btree_change_owner(
	struct xfs_btree_cur	*cur,
	uint64_t		new_owner,
	struct list_head	*buffer_list)
{
	struct xfs_btree_block_change_owner_info	bbcoi;

	bbcoi.new_owner = new_owner;
	bbcoi.buffer_list = buffer_list;

	return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
			XFS_BTREE_VISIT_ALL, &bbcoi);
}

/* Verify the v5 fields of a long-format btree block. */
xfs_failaddr_t
xfs_btree_fsblock_v5hdr_verify(
	struct xfs_buf		*bp,
	uint64_t		owner)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);

	if (!xfs_has_crc(mp))
		return __this_address;
	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
		return __this_address;
	if (owner != XFS_RMAP_OWN_UNKNOWN &&
	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
		return __this_address;
	return NULL;
}

/* Verify a long-format btree block. */
xfs_failaddr_t
xfs_btree_fsblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_fsblock_t		fsb;
	xfs_failaddr_t		fa;

	ASSERT(!xfs_buftarg_is_mem(bp->b_target));

	/* numrecs verification */
	if (be16_to_cpu(block->bb_numrecs) > max_recs)
		return __this_address;

	/* sibling pointer verification */
	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}
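
/*
 * A sketch of how a buffer verifier might combine the two long-format
 * helpers above, in the style of the verify_struct callbacks found in the
 * per-btree source files. The function name and the record limit of 256
 * are hypothetical; real verifiers take the limit from the mount geometry,
 * check the magic number and CRC as well, and only apply the v5 header
 * check on CRC-enabled filesystems. Passing XFS_RMAP_OWN_UNKNOWN skips the
 * owner comparison, as implemented above.
 */
static xfs_failaddr_t
xfs_example_fsbtree_verify_struct(
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		fa;

	/* v5 header fields: uuid, blkno and (optionally) owner */
	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
	if (fa)
		return fa;

	/* numrecs and sibling pointers, with an assumed record limit */
	return xfs_btree_fsblock_verify(bp, 256);
}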

/* Verify an in-memory btree block. */
xfs_failaddr_t
xfs_btree_memblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buftarg	*btp = bp->b_target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	/* numrecs verification */
	if (be16_to_cpu(block->bb_numrecs) > max_recs)
		return __this_address;

	/* sibling pointer verification */
	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (fa)
		return fa;
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_rightsib);
	if (fa)
		return fa;

	return NULL;
}

/**
 * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
 *				       btree block
 *
 * @bp: buffer containing the btree block
 */
xfs_failaddr_t
xfs_btree_agblock_v5hdr_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;

	if (!xfs_has_crc(mp))
		return __this_address;
	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
		return __this_address;
	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag))
		return __this_address;
	return NULL;
}

/**
 * xfs_btree_agblock_verify() -- verify a short-format btree block
 *
 * @bp: buffer containing the btree block
 * @max_recs: maximum records allowed in this btree node
 */
xfs_failaddr_t
xfs_btree_agblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_agblock_t		agbno;
	xfs_failaddr_t		fa;

	ASSERT(!xfs_buftarg_is_mem(bp->b_target));

	/* numrecs verification */
	if (be16_to_cpu(block->bb_numrecs) > max_recs)
		return __this_address;

	/* sibling pointer verification */
	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}

/*
 * For the given limits on leaf and keyptr records per block, calculate the
 * height of the tree needed to index the number of leaf records.
 */
unsigned int
xfs_btree_compute_maxlevels(
	const unsigned int	*limits,
	unsigned long long	records)
{
	unsigned long long	level_blocks = howmany_64(records, limits[0]);
	unsigned int		height = 1;

	while (level_blocks > 1) {
		level_blocks = howmany_64(level_blocks, limits[1]);
		height++;
	}

	return height;
}
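
/*
 * A worked example for xfs_btree_compute_maxlevels(), with assumed limits
 * of 500 records per leaf block and 250 keyptrs per node block: indexing
 * 1,000,000 records needs howmany(1000000, 500) = 2000 leaf blocks
 * (height 1), which need howmany(2000, 250) = 8 node blocks (height 2),
 * which fit in howmany(8, 250) = 1 root block (height 3), so the function
 * returns 3.
 */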

/*
 * For the given limits on leaf and keyptr records per block, calculate the
 * number of blocks needed to index the given number of leaf records.
 */
unsigned long long
xfs_btree_calc_size(
	const unsigned int	*limits,
	unsigned long long	records)
{
	unsigned long long	level_blocks = howmany_64(records, limits[0]);
	unsigned long long	blocks = level_blocks;

	while (level_blocks > 1) {
		level_blocks = howmany_64(level_blocks, limits[1]);
		blocks += level_blocks;
	}

	return blocks;
}

/*
 * Given a number of available blocks for the btree to consume with records and
 * pointers, calculate the height of the tree needed to index all the records
 * that space can hold based on the number of pointers each interior node
 * holds.
 *
 * We start by assuming a single level tree consumes a single block, then track
 * the number of blocks each node level consumes until we no longer have space
 * to store the next node level. At this point, we are indexing all the leaf
 * blocks in the space, and there's no more free space to split the tree any
 * further. That's our maximum btree height.
 */
unsigned int
xfs_btree_space_to_height(
	const unsigned int	*limits,
	unsigned long long	leaf_blocks)
{
	/*
	 * The root btree block can have fewer than minrecs pointers in it
	 * because the tree might not be big enough to require that amount of
	 * fanout. Hence it has a minimum size of 2 pointers, not limits[1].
	 */
	unsigned long long	node_blocks = 2;
	unsigned long long	blocks_left = leaf_blocks - 1;
	unsigned int		height = 1;

	if (leaf_blocks < 1)
		return 0;

	while (node_blocks < blocks_left) {
		blocks_left -= node_blocks;
		node_blocks *= limits[1];
		height++;
	}

	return height;
}
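
/*
 * Continuing the worked example above (limits = {500, 250}):
 * xfs_btree_calc_size() for 1,000,000 records sums the blocks in each
 * level, 2000 + 8 + 1 = 2009. Feeding that space back into
 * xfs_btree_space_to_height() recovers the height: node_blocks = 2,
 * blocks_left = 2008, height = 1; then 2 < 2008 gives blocks_left = 2006,
 * node_blocks = 500, height = 2; then 500 < 2006 gives blocks_left = 1506,
 * node_blocks = 125000, height = 3; 125000 >= 1506 ends the loop and the
 * function returns 3.
 */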

/*
 * Query a regular btree for all records overlapping a given interval.
 * Start with a LE lookup of the key of low_rec and return all records
 * until we find a record with a key greater than the key of high_rec.
 */
STATIC int
xfs_btree_simple_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_rec		*recp;
	union xfs_btree_key		rec_key;
	int				stat;
	bool				firstrec = true;
	int				error;

	ASSERT(cur->bc_ops->init_high_key_from_rec);
	ASSERT(cur->bc_ops->diff_two_keys);

	/*
	 * Find the leftmost record. The btree cursor must be set
	 * to the low record used to generate low_key.
	 */
	stat = 0;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
	if (error)
		goto out;

	/* Nothing? See if there's anything to the right. */
	if (!stat) {
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			goto out;
	}

	while (stat) {
		/* Find the record. */
		error = xfs_btree_get_rec(cur, &recp, &stat);
		if (error || !stat)
			break;

		/* Skip if low_key > high_key(rec). */
		if (firstrec) {
			cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
			firstrec = false;
			if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
				goto advloop;
		}

		/* Stop if low_key(rec) > high_key. */
		cur->bc_ops->init_key_from_rec(&rec_key, recp);
		if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
			break;

		/* Callback */
		error = fn(cur, recp, priv);
		if (error)
			break;

advloop:
		/* Move on to the next record. */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			break;
	}

out:
	return error;
}

/*
 * Query an overlapped interval btree for all records overlapping a given
 * interval. This function roughly follows the algorithm given in
 * "Interval Trees" of _Introduction to Algorithms_, which is section
 * 14.3 in the 2nd and 3rd editions.
 *
 * First, generate keys for the low and high records passed in.
 *
 * For any leaf node, generate the high and low keys for the record.
 * If the record keys overlap with the query low/high keys, pass the
 * record to the function iterator.
 *
 * For any internal node, compare the low and high keys of each
 * pointer against the query low/high keys. If there's an overlap,
 * follow the pointer.
 *
 * As an optimization, we stop scanning a block when we find a low key
 * that is greater than the query's high key.
 */
STATIC int
xfs_btree_overlapped_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*low_key,
	const union xfs_btree_key	*high_key,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_ptr		ptr;
	union xfs_btree_ptr		*pp;
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_hkey;
	union xfs_btree_key		*lkp;
	union xfs_btree_key		*hkp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	int				level;
	struct xfs_buf			*bp;
	int				i;
	int				error;

	/* Load the root of the btree. */
	level = cur->bc_nlevels - 1;
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
	if (error)
		return error;
	xfs_btree_get_block(cur, level, &bp);
	trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto out;
#endif
	cur->bc_levels[level].ptr = 1;

	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
pop_up:
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		if (level == 0) {
			/* Handle a leaf node. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);

			cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
			cur->bc_ops->init_key_from_rec(&rec_key, recp);

			/*
			 * If (query's high key < record's low key), then there
			 * are no more interesting records in this block. Pop
			 * up to the node level to find more record blocks.
			 *
			 * If (record's high key >= query's low key) and
			 * (query's high key >= record's low key), then
			 * this record overlaps the query range; callback.
			 */
			if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
				goto pop_up;
			if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
				error = fn(cur, recp, priv);
				if (error)
					break;
			}
			cur->bc_levels[level].ptr++;
			continue;
		}

		/* Handle an internal node. */
		lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
		hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
				block);
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);

		/*
		 * If (query's high key < pointer's low key), then there are no
		 * more interesting keys in this block. Pop up one level to
		 * continue looking for records.
		 *
		 * If (pointer's high key >= query's low key) and
		 * (query's high key >= pointer's low key), then
		 * this record overlaps the query range; follow pointer.
		 */
		if (xfs_btree_keycmp_lt(cur, high_key, lkp))
			goto pop_up;
		if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
			level--;
			error = xfs_btree_lookup_get_block(cur, level, pp,
					&block);
			if (error)
				goto out;
			xfs_btree_get_block(cur, level, &bp);
			trace_xfs_btree_overlapped_query_range(cur, level, bp);
#ifdef DEBUG
			error = xfs_btree_check_block(cur, block, level, bp);
			if (error)
				goto out;
#endif
			cur->bc_levels[level].ptr = 1;
			continue;
		}
		cur->bc_levels[level].ptr++;
	}

out:
	/*
	 * If we don't end this function with the cursor pointing at a record
	 * block, a subsequent non-error cursor deletion will not release
	 * node-level buffers, causing a buffer leak. This is quite possible
	 * with a zero-results range query, so release the buffers if we
	 * failed to return any results.
	 */
	if (cur->bc_levels[0].bp == NULL) {
		for (i = 0; i < cur->bc_nlevels; i++) {
			if (cur->bc_levels[i].bp) {
				xfs_trans_brelse(cur->bc_tp,
						cur->bc_levels[i].bp);
				cur->bc_levels[i].bp = NULL;
				cur->bc_levels[i].ptr = 0;
				cur->bc_levels[i].ra = 0;
			}
		}
	}

	return error;
}

static inline void
xfs_btree_key_from_irec(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*key,
	const union xfs_btree_irec	*irec)
{
	union xfs_btree_rec		rec;

	cur->bc_rec = *irec;
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);
}

/*
 * Query a btree for all records overlapping a given interval of keys. The
 * supplied function will be called with each record found; return one of the
 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
 * code. This function returns -ECANCELED, zero, or a negative error code.
 */
int
xfs_btree_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low_rec,
	const union xfs_btree_irec	*high_rec,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	/* Find the keys of both ends of the interval. */
	xfs_btree_key_from_irec(cur, &high_key, high_rec);
	xfs_btree_key_from_irec(cur, &low_key, low_rec);

	/* Enforce low key <= high key. */
	if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
		return -EINVAL;

	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return xfs_btree_simple_query_range(cur, &low_key,
				&high_key, fn, priv);
	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
			fn, priv);
}
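
/*
 * A minimal xfs_btree_query_range() usage sketch; the function names are
 * hypothetical and the cursor is assumed to be a by-bno free space btree
 * cursor, whose incore records live in the 'a' member of the irec union.
 * The callback counts the records overlapping the queried block range; a
 * nonzero callback return (conventionally -ECANCELED) stops the walk and
 * is passed back to the caller.
 */
static int
xfs_example_count_free_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	unsigned int			*nr = priv;

	(*nr)++;
	return 0;
}

static int
xfs_example_count_free(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		*nr)
{
	union xfs_btree_irec	low = { .a.ar_startblock = bno };
	union xfs_btree_irec	high = { .a.ar_startblock = bno + len - 1 };

	*nr = 0;
	return xfs_btree_query_range(cur, &low, &high,
			xfs_example_count_free_helper, nr);
}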

/* Query a btree for all records. */
int
xfs_btree_query_all(
	struct xfs_btree_cur		*cur,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	memset(&low_key, 0, sizeof(low_key));
	memset(&high_key, 0xFF, sizeof(high_key));

	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}

static int
xfs_btree_count_blocks_helper(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data)
{
	xfs_filblks_t		*blocks = data;

	(*blocks)++;

	return 0;
}

/* Count the blocks in a btree and return the result in *blocks. */
int
xfs_btree_count_blocks(
	struct xfs_btree_cur	*cur,
	xfs_filblks_t		*blocks)
{
	*blocks = 0;
	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
			XFS_BTREE_VISIT_ALL, blocks);
}

/* Compare two btree pointers. */
int64_t
xfs_btree_diff_two_ptrs(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*a,
	const union xfs_btree_ptr	*b)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}

struct xfs_btree_has_records {
	/* Keys for the start and end of the range we want to know about. */
	union xfs_btree_key		start_key;
	union xfs_btree_key		end_key;

	/* Mask for key comparisons, if desired. */
	const union xfs_btree_key	*key_mask;

	/* Highest record key we've seen so far. */
	union xfs_btree_key		high_key;

	enum xbtree_recpacking		outcome;
};

STATIC int
xfs_btree_has_records_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	union xfs_btree_key		rec_key;
	union xfs_btree_key		rec_high_key;
	struct xfs_btree_has_records	*info = priv;
	enum xbtree_key_contig		key_contig;

	cur->bc_ops->init_key_from_rec(&rec_key, rec);

	if (info->outcome == XBTREE_RECPACKING_EMPTY) {
		info->outcome = XBTREE_RECPACKING_SPARSE;

		/*
		 * If the first record we find does not overlap the start key,
		 * then there is a hole at the start of the search range.
		 * Classify this as sparse and stop immediately.
		 */
		if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
				info->key_mask))
			return -ECANCELED;
	} else {
		/*
		 * If a subsequent record does not overlap with any record
		 * we've seen so far, there is a hole in the middle of the
		 * search range. Classify this as sparse and stop.
		 * If the keys overlap and this btree does not allow overlap,
		 * signal corruption.
		 */
		key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
					&rec_key, info->key_mask);
		if (key_contig == XBTREE_KEY_OVERLAP &&
		    !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
			return -EFSCORRUPTED;
		if (key_contig == XBTREE_KEY_GAP)
			return -ECANCELED;
	}

	/*
	 * If high_key(rec) is larger than any other high key we've seen,
	 * remember it for later.
	 */
	cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
	if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
				info->key_mask))
		info->high_key = rec_high_key; /* struct copy */

	return 0;
}

/*
 * Scan part of the keyspace of a btree and tell us if that keyspace does not
 * map to any records; is fully mapped to records; or is partially mapped to
 * records. This is the btree record equivalent to determining if a file is
 * sparse.
 *
 * For most btree types, the record scan should use all available btree key
 * fields to compare the keys encountered. These callers should pass NULL for
 * @mask. However, some callers (e.g. scanning physical space in the rmapbt)
 * want to ignore some part of the btree record keyspace when performing the
 * comparison. These callers should pass in a union xfs_btree_key object with
 * the fields that *should* be a part of the comparison set to any nonzero
 * value, and the rest zeroed.
 */
int
xfs_btree_has_records(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low,
	const union xfs_btree_irec	*high,
	const union xfs_btree_key	*mask,
	enum xbtree_recpacking		*outcome)
{
	struct xfs_btree_has_records	info = {
		.outcome		= XBTREE_RECPACKING_EMPTY,
		.key_mask		= mask,
	};
	int				error;

	/* Not all btrees support this operation. */
	if (!cur->bc_ops->keys_contiguous) {
		ASSERT(0);
		return -EOPNOTSUPP;
	}

	xfs_btree_key_from_irec(cur, &info.start_key, low);
	xfs_btree_key_from_irec(cur, &info.end_key, high);

	error = xfs_btree_query_range(cur, low, high,
			xfs_btree_has_records_helper, &info);
	if (error == -ECANCELED)
		goto out;
	if (error)
		return error;

	if (info.outcome == XBTREE_RECPACKING_EMPTY)
		goto out;

	/*
	 * If the largest high_key(rec) we saw during the walk is greater than
	 * the end of the search range, classify this as full. Otherwise,
	 * there is a hole at the end of the search range.
	 */
	if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
				mask))
		info.outcome = XBTREE_RECPACKING_FULL;

out:
	*outcome = info.outcome;
	return 0;
}

/* Are there more records in this btree? */
bool
xfs_btree_has_more_records(
	struct xfs_btree_cur	*cur)
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;

	block = xfs_btree_get_block(cur, 0, &bp);

	/* There are still records in this block. */
	if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
		return true;

	/* There are more record blocks. */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
	else
		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
}
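
/*
 * A sketch (hypothetical name) of interpreting xfs_btree_has_records():
 * ask whether the keyspace between two incore records is empty, sparse or
 * full, using the complete key comparison (mask == NULL).
 */
static int
xfs_example_keyspace_is_full(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low,
	const union xfs_btree_irec	*high,
	bool				*full)
{
	enum xbtree_recpacking		outcome;
	int				error;

	error = xfs_btree_has_records(cur, low, high, NULL, &outcome);
	if (error)
		return error;

	/*
	 * XBTREE_RECPACKING_EMPTY: no records overlap the range.
	 * XBTREE_RECPACKING_SPARSE: there is a hole somewhere in the range.
	 * XBTREE_RECPACKING_FULL: records cover the entire range.
	 */
	*full = (outcome == XBTREE_RECPACKING_FULL);
	return 0;
}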

/* Set up all the btree cursor caches. */
int __init
xfs_btree_init_cur_caches(void)
{
	int		error;

	error = xfs_allocbt_init_cur_cache();
	if (error)
		return error;
	error = xfs_inobt_init_cur_cache();
	if (error)
		goto err;
	error = xfs_bmbt_init_cur_cache();
	if (error)
		goto err;
	error = xfs_rmapbt_init_cur_cache();
	if (error)
		goto err;
	error = xfs_refcountbt_init_cur_cache();
	if (error)
		goto err;

	return 0;
err:
	xfs_btree_destroy_cur_caches();
	return error;
}

/* Destroy all the btree cursor caches, if they've been allocated. */
void
xfs_btree_destroy_cur_caches(void)
{
	xfs_allocbt_destroy_cur_cache();
	xfs_inobt_destroy_cur_cache();
	xfs_bmbt_destroy_cur_cache();
	xfs_rmapbt_destroy_cur_cache();
	xfs_refcountbt_destroy_cur_cache();
}

/* Move the btree cursor before the first record. */
int
xfs_btree_goto_left_edge(
	struct xfs_btree_cur	*cur)
{
	int			stat = 0;
	int			error;

	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
	if (error)
		return error;
	if (!stat)
		return 0;

	error = xfs_btree_decrement(cur, 0, &stat);
	if (error)
		return error;
	if (stat != 0) {
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return 0;
}
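
/*
 * A sketch (hypothetical name) of visiting every record in key order by
 * combining xfs_btree_goto_left_edge() with the increment and get_rec
 * helpers above. Because the cursor is positioned *before* the first
 * record, the first increment lands on record 1.
 */
static int
xfs_example_walk_all_records(
	struct xfs_btree_cur	*cur)
{
	union xfs_btree_rec	*rec;
	int			stat;
	int			error;

	error = xfs_btree_goto_left_edge(cur);
	if (error)
		return error;

	for (;;) {
		/* stat == 0 means we stepped off the right edge. */
		error = xfs_btree_increment(cur, 0, &stat);
		if (error || !stat)
			return error;

		error = xfs_btree_get_rec(cur, &rec, &stat);
		if (error || !stat)
			return error;

		/* ... process *rec here ... */
	}
}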