// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
#include "xfs_btree_staging.h"
#include "xfs_ag.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_health.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"

/*
 * Btree magic numbers.
 */
uint32_t
xfs_btree_magic(
	struct xfs_mount	*mp,
	const struct xfs_btree_ops *ops)
{
	int			idx = xfs_has_crc(mp) ? 1 : 0;
	__be32			magic = ops->buf_ops->magic[idx];

	/* Ensure we asked for crc for crc-only magics. */
	ASSERT(magic != 0);
	return be32_to_cpu(magic);
}
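
/*
 * Illustrative sketch only, not built: the buf_ops magic table consulted
 * above keeps the non-CRC magic in slot 0 and the CRC magic in slot 1, so
 * a btree that only exists on CRC filesystems leaves slot 0 zero, which
 * the ASSERT above would catch.  The name and magic values below are
 * invented for the example.
 */
#if 0
static const struct xfs_buf_ops xfs_example_btree_buf_ops = {
	.name	= "xfs_example_btree",
	.magic	= { cpu_to_be32(0x41424344),	/* non-CRC variant */
		    cpu_to_be32(0x41424345) },	/* CRC variant */
};
#endif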

/*
 * These sibling pointer checks are optimised for null sibling pointers. This
 * happens a lot, and we don't need to byte swap at runtime if the sibling
 * pointer is NULL.
 *
 * These are explicitly marked as inline because the cost of calling them as
 * functions instead of inlining them is about 36 bytes of extra code per call
 * site on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of
 * these two sibling check functions reduces the compiled code size by over
 * 300 bytes.
 */
static inline xfs_failaddr_t
xfs_btree_check_fsblock_siblings(
	struct xfs_mount	*mp,
	xfs_fsblock_t		fsb,
	__be64			dsibling)
{
	xfs_fsblock_t		sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == fsb)
		return __this_address;
	if (!xfs_verify_fsbno(mp, sibling))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_memblock_siblings(
	struct xfs_buftarg	*btp,
	xfbno_t			bno,
	__be64			dsibling)
{
	xfbno_t			sibling;

	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;

	sibling = be64_to_cpu(dsibling);
	if (sibling == bno)
		return __this_address;
	if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
		return __this_address;
	return NULL;
}

static inline xfs_failaddr_t
xfs_btree_check_agblock_siblings(
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	__be32			dsibling)
{
	xfs_agblock_t		sibling;

	if (dsibling == cpu_to_be32(NULLAGBLOCK))
		return NULL;

	sibling = be32_to_cpu(dsibling);
	if (sibling == agbno)
		return __this_address;
	if (!xfs_verify_agbno(pag, sibling))
		return __this_address;
	return NULL;
}

static xfs_failaddr_t
__xfs_btree_check_lblock_hdr(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.l.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	return NULL;
}

/*
 * Check a long btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_fsblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;
	xfs_fsblock_t		fsb;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	/*
	 * For inode-rooted btrees, the root block sits in the inode fork.  In
	 * that case bp is NULL, and the block must not have any siblings.
	 */
	if (!bp) {
		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		return NULL;
	}

	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}

/*
 * Check an in-memory btree block header.  Return the address of the failing
 * check, or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_memblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_memblock_siblings(btp, bno,
				block->bb_u.l.bb_rightsib);
	return fa;
}

/*
 * Check a short btree block header.  Return the address of the failing check,
 * or NULL if everything is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_agblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_failaddr_t		fa;
	xfs_agblock_t		agbno;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}

/*
 * Internal btree block check.
 *
 * Return NULL if the block is ok or the address of the failed check otherwise.
 */
xfs_failaddr_t
__xfs_btree_check_block(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return __xfs_btree_check_memblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_AG:
		return __xfs_btree_check_agblock(cur, block, level, bp);
	case XFS_BTREE_TYPE_INODE:
		return __xfs_btree_check_fsblock(cur, block, level, bp);
	default:
		ASSERT(0);
		return __this_address;
	}
}

static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
		return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
	return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
}

/*
 * Debug routine: check that block header is ok.
 */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block, if any */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_block(cur, block, level, bp);
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	return 0;
}

int
__xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	if (level <= 0)
		return -EFSCORRUPTED;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_INODE:
		if (!xfs_verify_fsbno(cur->bc_mp,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_AG:
		if (!xfs_verify_agbno(to_perag(cur->bc_group),
				be32_to_cpu((&ptr->s)[index])))
			return -EFSCORRUPTED;
		break;
	}

	return 0;
}

/*
 * Check that a given (indexed) btree pointer at a certain level of a
 * btree is valid and doesn't point past where it should.
 */
static int
xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	int				error;

	error = __xfs_btree_check_ptr(cur, ptr, index, level);
	if (error) {
		switch (cur->bc_ops->type) {
		case XFS_BTREE_TYPE_MEM:
			xfs_err(cur->bc_mp,
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
				cur->bc_ops->name, cur->bc_flags, level, index,
				__this_address);
			break;
		case XFS_BTREE_TYPE_INODE:
			xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_ino.ip->i_ino,
				cur->bc_ino.whichfork, cur->bc_ops->name,
				level, index);
			break;
		case XFS_BTREE_TYPE_AG:
			xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_group->xg_gno, cur->bc_ops->name,
				level, index);
			break;
		}
		xfs_btree_mark_sick(cur);
	}

	return error;
}

#ifdef DEBUG
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * long-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_fsblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}

bool
xfs_btree_fsblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
	}

	return true;
}

/*
 * Calculate CRC on the whole btree block and stuff it into the
 * short-form btree header.
 *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and
 * put it into the buffer so recovery knows what the last modification was
 * that made it to disk.
 */
void
xfs_btree_agblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	if (!xfs_has_crc(bp->b_mount))
		return;
	if (bip)
		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}

bool
xfs_btree_agblock_verify_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
			return false;
		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
	}

	return true;
}

static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	trace_xfs_btree_free_block(cur, bp);

	/*
	 * Don't allow block freeing for a staging cursor, because staging
	 * cursors do not support regular btree modifications.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}

/*
 * Delete the btree cursor.
 */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			error)		/* del because of error */
{
	int			i;		/* btree level */

	/*
	 * Clear the buffer pointers and release the buffers.  If we're doing
	 * this because of an error, inspect all of the entries in the bc_bufs
	 * array for buffers to be unlocked.  This is because some of the btree
	 * code works from level n down to 0, and if we get an error along the
	 * way we won't have initialized all the entries down to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_levels[i].bp)
			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
		else if (!error)
			break;
	}

	/*
	 * If we are doing a BMBT update, the number of unaccounted blocks
	 * allocated during this cursor life time should be zero. If it's not
	 * zero, then we should be shut down or on our way to shutdown due to
	 * cancelling a dirty transaction on error.
	 */
	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
	       xfs_is_shutdown(cur->bc_mp) || error != 0);

	if (cur->bc_group)
		xfs_group_put(cur->bc_group);
	kmem_cache_free(cur->bc_cache, cur);
}

/* Return the buffer target for this btree's buffer. */
static inline struct xfs_buftarg *
xfs_btree_buftarg(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return cur->bc_mem.xfbtree->target;
	return cur->bc_mp->m_ddev_targp;
}

/* Return the block size (in units of 512b sectors) for this btree. */
static inline unsigned int
xfs_btree_bbsize(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
		return XFBNO_BBSIZE;
	return cur->bc_mp->m_bsize;
}

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 */
int						/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,		/* input cursor */
	struct xfs_btree_cur	**ncur)		/* output cursor */
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_trans	*tp = cur->bc_tp;
	struct xfs_buf		*bp;
	struct xfs_btree_cur	*new;
	int			error;
	int			i;

	/*
	 * Don't allow staging cursors to be duplicated because they're
	 * supposed to be kept private to a single thread.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level that is current in the cursor, re-get the buffer and
	 * copy the ptr value.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
		new->bc_levels[i].ra = cur->bc_levels[i].ra;
		bp = cur->bc_levels[i].bp;
		if (bp) {
			error = xfs_trans_read_buf(mp, tp,
					xfs_btree_buftarg(cur),
					xfs_buf_daddr(bp),
					xfs_btree_bbsize(cur), 0, &bp,
					cur->bc_ops->buf_ops);
			if (xfs_metadata_is_sick(error))
				xfs_btree_mark_sick(new);
			if (error) {
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_levels[i].bp = bp;
	}
	*ncur = new;
	return 0;
}

/*
 * XFS btree block layout and addressing:
 *
 * There are two types of blocks in the btree: leaf and non-leaf blocks.
 *
 * A leaf block starts with a header, followed by records containing the
 * values.  A non-leaf block also starts with the same header, and then first
 * contains lookup keys followed by an equal number of pointers to the btree
 * blocks at the previous level.
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Leaf:	| header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 *		+--------+-------+-------+-------+-------+-------+-------+
 * Non-Leaf:	| header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
 *		+--------+-------+-------+-------+-------+-------+-------+
 *
 * The header is called struct xfs_btree_block for reasons better left unknown
 * and comes in different versions for short (32bit) and long (64bit) block
 * pointers.  The record and key structures are defined by the btree instances
 * and opaque to the btree core.  The block pointers are simple disk endian
 * integers, available in a short (32bit) and long (64bit) variant.
 *
 * The helpers below calculate the offset of a given record, key or pointer
 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
 * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
 * inside the btree block is done using indices starting at one, not zero!
 *
 * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
 * overlapping intervals.  In such a tree, records are still sorted lowest to
 * highest and indexed by the smallest key value that refers to the record.
 * However, nodes are different: each pointer has two associated keys -- one
 * indexing the lowest key available in the block(s) below (the same behavior
 * as the key in a regular btree) and another indexing the highest key
 * available in the block(s) below.  Because records are /not/ sorted by the
 * highest key, all leaf block updates require us to compute the highest key
 * that matches any record in the leaf and to recursively update the high keys
 * in the nodes going further up in the tree, if necessary.  Nodes look like
 * this:
 *
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 * Non-Leaf:	| header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
 *		+--------+-----+-----+-----+-----+-----+-------+-------+-----+
 *
 * To perform an interval query on an overlapped tree, perform the usual
 * depth-first search and use the low and high keys to decide if we can skip
 * that particular node.  If a leaf node is reached, return the records that
 * intersect the interval.  Note that an interval query may return numerous
 * entries.
 * For a non-overlapped tree, simply search for the record associated with the
 * lowest key and iterate forward until a non-matching record is found.
 * Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by Cormen,
 * Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discusses this in more
 * detail.
 *
 * Why do we care about overlapping intervals?  Let's say you have a bunch of
 * reverse mapping records on a reflink filesystem:
 *
 * 1: +- file A startblock B offset C length D -----------+
 * 2:      +- file E startblock F offset G length H --------------+
 * 3:      +- file I startblock F offset J length K --+
 * 4:                                                        +- file L... --+
 *
 * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
 * we'd simply increment the length of record 1.  But how do we find the record
 * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
 * record 3 because the keys are ordered first by startblock.  An interval
 * query would return records 1 and 2 because they both overlap (B+D-1), and
 * from that we can pick out record 1 as the appropriate left neighbor.
 *
 * In the non-overlapped case you can do a LE lookup and decrement the cursor
 * because a record's interval must end before the next record.
 */
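
/*
 * Worked example for the offset helpers below, using invented geometry: a
 * CRC-enabled long-pointer btree (72-byte XFS_BTREE_LBLOCK_CRC_LEN header)
 * with 16-byte keys, 16-byte records, 8-byte pointers and a hypothetical
 * maxrecs of 100.  Remembering that indices are 1-based:
 *
 *	xfs_btree_rec_offset(cur, 1)		= 72 + 0 * 16 = 72
 *	xfs_btree_rec_offset(cur, 3)		= 72 + 2 * 16 = 104
 *	xfs_btree_key_offset(cur, 2)		= 72 + 1 * 16 = 88
 *	xfs_btree_ptr_offset(cur, 1, level)	= 72 + 100 * 16 = 1672
 *
 * In the overlapping case the low and high halves share the key_len bytes
 * of each key slot, so the n-th high key sits key_len / 2 = 8 bytes past
 * the n-th low key; that is all xfs_btree_high_key_offset adds.
 */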

/*
 * Return size of the btree block header for this btree instance.
 */
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (xfs_has_crc(cur->bc_mp))
			return XFS_BTREE_LBLOCK_CRC_LEN;
		return XFS_BTREE_LBLOCK_LEN;
	}
	if (xfs_has_crc(cur->bc_mp))
		return XFS_BTREE_SBLOCK_CRC_LEN;
	return XFS_BTREE_SBLOCK_LEN;
}

/*
 * Calculate offset of the n-th record in a btree block.
 */
STATIC size_t
xfs_btree_rec_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->rec_len;
}

/*
 * Calculate offset of the n-th key in a btree block.
 */
STATIC size_t
xfs_btree_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len;
}

/*
 * Calculate offset of the n-th high key in a btree block.
 */
STATIC size_t
xfs_btree_high_key_offset(
	struct xfs_btree_cur	*cur,
	int			n)
{
	return xfs_btree_block_len(cur) +
		(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
}

/*
 * Calculate offset of the n-th block pointer in a btree block.
 */
STATIC size_t
xfs_btree_ptr_offset(
	struct xfs_btree_cur	*cur,
	int			n,
	int			level)
{
	return xfs_btree_block_len(cur) +
		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
		(n - 1) * cur->bc_ops->ptr_len;
}

/*
 * Return a pointer to the n-th record in the btree block.
 */
union xfs_btree_rec *
xfs_btree_rec_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_rec *)
		((char *)block + xfs_btree_rec_offset(cur, n));
}

/*
 * Return a pointer to the n-th key in the btree block.
 */
union xfs_btree_key *
xfs_btree_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th high key in the btree block.
 */
union xfs_btree_key *
xfs_btree_high_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	return (union xfs_btree_key *)
		((char *)block + xfs_btree_high_key_offset(cur, n));
}

/*
 * Return a pointer to the n-th block pointer in the btree block.
 */
union xfs_btree_ptr *
xfs_btree_ptr_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	int			level = xfs_btree_get_level(block);

	ASSERT(block->bb_level != 0);

	return (union xfs_btree_ptr *)
		((char *)block + xfs_btree_ptr_offset(cur, n, level));
}

struct xfs_ifork *
xfs_btree_ifork_ptr(
	struct xfs_btree_cur	*cur)
{
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);

	if (cur->bc_flags & XFS_BTREE_STAGING)
		return cur->bc_ino.ifake->if_fork;
	return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
}

/*
 * Get the root block which is stored in the inode.
 *
 * For now this btree implementation assumes the btree root is always
 * stored in the if_broot field of an inode fork.
 */
STATIC struct xfs_btree_block *
xfs_btree_get_iroot(
	struct xfs_btree_cur	*cur)
{
	struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);

	return (struct xfs_btree_block *)ifp->if_broot;
}

/*
 * Retrieve the block pointer from the cursor at the given level.
 * This may be an inode btree root or from a buffer.
 */
struct xfs_btree_block *		/* generic btree block pointer */
xfs_btree_get_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level in btree */
	struct xfs_buf		**bpp)	/* buffer containing the block */
{
	if (xfs_btree_at_iroot(cur, level)) {
		*bpp = NULL;
		return xfs_btree_get_iroot(cur);
	}

	*bpp = cur->bc_levels[level].bp;
	return XFS_BUF_TO_BLOCK(*bpp);
}

/*
 * Change the cursor to point to the first record at the given level.
 * Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_firstrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to 1, that's the first record/key.
	 */
	cur->bc_levels[level].ptr = 1;
	return 1;
}

/*
 * Change the cursor to point to the last record in the current block
 * at the given level.  Other levels are unaffected.
 */
STATIC int				/* success=1, failure=0 */
xfs_btree_lastrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level)	/* level to change */
{
	struct xfs_btree_block	*block;	/* generic btree block pointer */
	struct xfs_buf		*bp;	/* buffer containing block */

	/*
	 * Get the block pointer for this level.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	if (xfs_btree_check_block(cur, block, level, bp))
		return 0;
	/*
	 * It's empty, there is no such record.
	 */
	if (!block->bb_numrecs)
		return 0;
	/*
	 * Set the ptr value to numrecs, that's the last record/key.
	 */
	cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
	return 1;
}

/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 */
void
xfs_btree_offsets(
	uint32_t	fields,		/* bitmask of fields */
	const short	*offsets,	/* table of field offsets */
	int		nbits,		/* number of bits to inspect */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	int		i;		/* current bit number */
	uint32_t	imask;		/* mask for current bit number */

	ASSERT(fields != 0);
	/*
	 * Find the lowest bit, so the first byte offset.
	 */
	for (i = 0, imask = 1u; ; i++, imask <<= 1) {
		if (imask & fields) {
			*first = offsets[i];
			break;
		}
	}
	/*
	 * Find the highest bit, so the last byte offset.
	 */
	for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
		if (imask & fields) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}
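
/*
 * Worked example (illustrative only) of the bitmask walk above: to log the
 * bb_level and bb_numrecs fields of a short-form block, a caller such as
 * xfs_btree_log_block passes fields = XFS_BB_LEVEL | XFS_BB_NUMRECS along
 * with the soffsets table defined later in this file.  The lowest set bit
 * selects first = offsetof(struct xfs_btree_block, bb_level), and the
 * highest set bit selects last = offsetof(..., bb_u.s.bb_leftsib) - 1,
 * i.e. the byte just before the next field, so exactly those two fields
 * are logged.
 */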

STATIC int
xfs_btree_readahead_fsblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
				mp->m_bsize, cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_memblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfbno_t			left = be64_to_cpu(block->bb_u.l.bb_leftsib);
	xfbno_t			right = be64_to_cpu(block->bb_u.l.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
		xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

STATIC int
xfs_btree_readahead_agblock(
	struct xfs_btree_cur	*cur,
	int			lr,
	struct xfs_btree_block	*block)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = to_perag(cur->bc_group);
	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
	int			rval = 0;

	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(pag, left), mp->m_bsize,
				cur->bc_ops->buf_ops);
		rval++;
	}

	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
		xfs_buf_readahead(mp->m_ddev_targp,
				xfs_agbno_to_daddr(pag, right), mp->m_bsize,
				cur->bc_ops->buf_ops);
		rval++;
	}

	return rval;
}

/*
 * Read-ahead btree blocks, at the given level.
 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
 */
STATIC int
xfs_btree_readahead(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	int			lr)	/* left/right bits */
{
	struct xfs_btree_block	*block;

	/*
	 * No readahead needed if we are at the root level and the
	 * btree root is stored in the inode.
	 */
	if (xfs_btree_at_iroot(cur, lev))
		return 0;

	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
		return 0;

	cur->bc_levels[lev].ra |= lr;
	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		return xfs_btree_readahead_agblock(cur, lr, block);
	case XFS_BTREE_TYPE_INODE:
		return xfs_btree_readahead_fsblock(cur, lr, block);
	case XFS_BTREE_TYPE_MEM:
		return xfs_btree_readahead_memblock(cur, lr, block);
	default:
		ASSERT(0);
		return 0;
	}
}

STATIC int
xfs_btree_ptr_to_daddr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	xfs_daddr_t			*daddr)
{
	int				error;

	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
	if (error)
		return error;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		*daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
				be32_to_cpu(ptr->s));
		break;
	case XFS_BTREE_TYPE_INODE:
		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
		break;
	case XFS_BTREE_TYPE_MEM:
		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
		break;
	}
	return 0;
}

/*
 * Readahead @count btree blocks at the given @ptr location.
 *
 * We don't need to care about long or short form btrees here as we have a
 * method of converting the ptr directly to a daddr available to us.
 */
STATIC void
xfs_btree_readahead_ptr(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	xfs_extlen_t		count)
{
	xfs_daddr_t		daddr;

	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
		return;
	xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
			xfs_btree_bbsize(cur) * count,
			cur->bc_ops->buf_ops);
}

/*
 * Set the buffer for level "lev" in the cursor to bp, releasing
 * any previous buffer.
 */
STATIC void
xfs_btree_setbuf(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			lev,	/* level in btree */
	struct xfs_buf		*bp)	/* new buffer to set */
{
	struct xfs_btree_block	*b;	/* btree block */

	if (cur->bc_levels[lev].bp)
		xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
	cur->bc_levels[lev].bp = bp;
	cur->bc_levels[lev].ra = 0;

	b = XFS_BUF_TO_BLOCK(bp);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	} else {
		if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
		if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
			cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
	}
}

bool
xfs_btree_ptr_is_null(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr->l == cpu_to_be64(NULLFSBLOCK);
	else
		return ptr->s == cpu_to_be32(NULLAGBLOCK);
}

void
xfs_btree_set_ptr_null(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(NULLFSBLOCK);
	else
		ptr->s = cpu_to_be32(NULLAGBLOCK);
}

static inline bool
xfs_btree_ptrs_equal(
	struct xfs_btree_cur		*cur,
	union xfs_btree_ptr		*ptr1,
	union xfs_btree_ptr		*ptr2)
{
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return ptr1->l == ptr2->l;
	return ptr1->s == ptr2->s;
}

/*
 * Get/set/init sibling pointers
 */
void
xfs_btree_get_sibling(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	union xfs_btree_ptr	*ptr,
	int			lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->l = block->bb_u.l.bb_rightsib;
		else
			ptr->l = block->bb_u.l.bb_leftsib;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			ptr->s = block->bb_u.s.bb_rightsib;
		else
			ptr->s = block->bb_u.s.bb_leftsib;
	}
}

void
xfs_btree_set_sibling(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_block		*block,
	const union xfs_btree_ptr	*ptr,
	int				lr)
{
	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.l.bb_rightsib = ptr->l;
		else
			block->bb_u.l.bb_leftsib = ptr->l;
	} else {
		if (lr == XFS_BB_RIGHTSIB)
			block->bb_u.s.bb_rightsib = ptr->s;
		else
			block->bb_u.s.bb_leftsib = ptr->s;
	}
}

static void
__xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*buf,
	const struct xfs_btree_ops *ops,
	xfs_daddr_t		blkno,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	bool			crc = xfs_has_crc(mp);
	__u32			magic = xfs_btree_magic(mp, ops);

	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (crc) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (crc) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			/* owner is a 32 bit value on short blocks */
			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}

void
xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*block,
	const struct xfs_btree_ops *ops,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
			numrecs, owner);
}

void
xfs_btree_init_buf(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	const struct xfs_btree_ops *ops,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
			xfs_buf_daddr(bp), level, numrecs, owner);
	bp->b_ops = ops->buf_ops;
}

static inline __u64
xfs_btree_owner(
	struct xfs_btree_cur	*cur)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		return cur->bc_mem.xfbtree->owner;
	case XFS_BTREE_TYPE_INODE:
		return cur->bc_ino.ip->i_ino;
	case XFS_BTREE_TYPE_AG:
		return cur->bc_group->xg_gno;
	default:
		ASSERT(0);
		return 0;
	}
}

void
xfs_btree_init_block_cur(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	int			numrecs)
{
	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
			xfs_btree_owner(cur));
}

STATIC void
xfs_btree_buf_to_ptr(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	union xfs_btree_ptr	*ptr)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
				xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_INODE:
		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
				xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_MEM:
		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
		break;
	}
}

static inline void
xfs_btree_set_refs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}

int
xfs_btree_get_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	xfs_daddr_t			d;
	int				error;

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), 0, bpp);
	if (error)
		return error;

	(*bpp)->b_ops = cur->bc_ops->buf_ops;
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}

/*
 * Read in the buffer at the given ptr and return the buffer and
 * the block pointer within the buffer.
 */
int
xfs_btree_read_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				flags,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;
	int			error;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), flags, bpp,
			cur->bc_ops->buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_btree_mark_sick(cur);
	if (error)
		return error;

	xfs_btree_set_refs(cur, *bpp);
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}

/*
 * Copy keys from one btree block to another.
 */
void
xfs_btree_copy_keys(
	struct xfs_btree_cur		*cur,
	union xfs_btree_key		*dst_key,
	const union xfs_btree_key	*src_key,
	int				numkeys)
{
	ASSERT(numkeys >= 0);
	memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
}

/*
 * Copy records from one btree block to another.
 */
STATIC void
xfs_btree_copy_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*dst_rec,
	union xfs_btree_rec	*src_rec,
	int			numrecs)
{
	ASSERT(numrecs >= 0);
	memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Copy block pointers from one btree block to another.
 */
void
xfs_btree_copy_ptrs(
	struct xfs_btree_cur		*cur,
	union xfs_btree_ptr		*dst_ptr,
	const union xfs_btree_ptr	*src_ptr,
	int				numptrs)
{
	ASSERT(numptrs >= 0);
	memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
}

/*
 * Shift keys one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_keys(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key,
	int			dir,
	int			numkeys)
{
	char			*dst_key;

	ASSERT(numkeys >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_key = (char *)key + (dir * cur->bc_ops->key_len);
	memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
}

/*
 * Shift records one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_recs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec,
	int			dir,
	int			numrecs)
{
	char			*dst_rec;

	ASSERT(numrecs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
	memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
}

/*
 * Shift block pointers one index left/right inside a single btree block.
 */
STATIC void
xfs_btree_shift_ptrs(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			dir,
	int			numptrs)
{
	char			*dst_ptr;

	ASSERT(numptrs >= 0);
	ASSERT(dir == 1 || dir == -1);

	dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
	memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
}

/*
 * Log key values from the btree block.
 */
STATIC void
xfs_btree_log_keys(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{

	if (bp) {
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_key_offset(cur, first),
				xfs_btree_key_offset(cur, last + 1) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}

/*
 * Log record values from the btree block.
 */
void
xfs_btree_log_recs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{

	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(cur->bc_tp, bp,
			xfs_btree_rec_offset(cur, first),
			xfs_btree_rec_offset(cur, last + 1) - 1);

}

/*
 * Log block pointer fields from a btree block (nonleaf).
 */
STATIC void
xfs_btree_log_ptrs(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			first,	/* index of first pointer to log */
	int			last)	/* index of last pointer to log */
{

	if (bp) {
		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
		int			level = xfs_btree_get_level(block);

		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp,
				xfs_btree_ptr_offset(cur, first, level),
				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}

}

/*
 * Log fields from a btree block header.
 */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	uint32_t		fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
		XFS_BTREE_SBLOCK_CRC_LEN
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
		XFS_BTREE_LBLOCK_CRC_LEN
	};

	if (bp) {
		int nbits;

		if (xfs_has_crc(cur->bc_mp)) {
			/*
			 * We don't log the CRC when updating a btree
			 * block but instead recreate it during log
			 * recovery.  As the log buffers have checksums
			 * of their own this is safe and avoids logging a crc
			 * update in a lot of places.
			 */
			if (fields == XFS_BB_ALL_BITS)
				fields = XFS_BB_ALL_BITS_CRC;
			nbits = XFS_BB_NUM_BITS_CRC;
		} else {
			nbits = XFS_BB_NUM_BITS;
		}
		xfs_btree_offsets(fields,
				(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
					loffsets : soffsets,
				nbits, &first, &last);
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
				xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}

/*
 * Increment cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_increment(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the right at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* We're done if we remain in the block after the increment. */
	if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
		goto out1;

	/* Fail if we just went off the right edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, increment);

	/*
	 * March up the tree incrementing pointers.
	 * Stop when we don't go off the right edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, lev, bp);
		if (error)
			goto error0;
#endif

		if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
			break;

		/* Read-ahead the right block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;

		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = 1;
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}

/*
 * Decrement cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 */
int						/* error */
xfs_btree_decrement(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;
	union xfs_btree_ptr	ptr;

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the left at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

	/* We're done if we remain in the block after the decrement. */
	if (--cur->bc_levels[level].ptr > 0)
		goto out1;

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we just went off the left edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, decrement);

	/*
	 * March up the tree decrementing pointers.
	 * Stop when we don't go off the left edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_levels[lev].ptr > 0)
			break;
		/* Read-ahead the left block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			goto out0;
		ASSERT(0);
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
		--lev;
		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
		if (error)
			goto error0;
		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
	}
out1:
	*stat = 1;
	return 0;

out0:
	*stat = 0;
	return 0;

error0:
	return error;
}

/*
 * Check the btree block owner now that we have the context to know who the
 * real owner is.
 */
static inline xfs_failaddr_t
xfs_btree_check_block_owner(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block)
{
	__u64			owner;

	if (!xfs_has_crc(cur->bc_mp) ||
	    (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
		return NULL;

	owner = xfs_btree_owner(cur);
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
			return __this_address;
	} else {
		if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
			return __this_address;
	}

	return NULL;
}

int
xfs_btree_lookup_get_block(
	struct xfs_btree_cur		*cur,	/* btree cursor */
	int				level,	/* level in the btree */
	const union xfs_btree_ptr	*pp,	/* ptr to btree block */
	struct xfs_btree_block		**blkp)	/* return btree block */
{
	struct xfs_buf		*bp;	/* buffer pointer for btree block */
	xfs_daddr_t		daddr;
	int			error = 0;

	/* special case the root block if in an inode */
	if (xfs_btree_at_iroot(cur, level)) {
		*blkp = xfs_btree_get_iroot(cur);
		return 0;
	}

	/*
	 * If the old buffer at this level is for the disk address we are
	 * looking for, re-use it.
	 *
	 * Otherwise throw it away and get a new one.
	 */
	bp = cur->bc_levels[level].bp;
	error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
	if (error)
		return error;
	if (bp && xfs_buf_daddr(bp) == daddr) {
		*blkp = XFS_BUF_TO_BLOCK(bp);
		return 0;
	}

	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
	if (error)
		return error;

	/* Check the inode owner since the verifiers don't. */
	if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
		goto out_bad;

	/* Did we get the level we were looking for? */
	if (be16_to_cpu((*blkp)->bb_level) != level)
		goto out_bad;

	/* Check that internal nodes have at least one record. */
	if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
		goto out_bad;

	xfs_btree_setbuf(cur, level, bp);
	return 0;

out_bad:
	*blkp = NULL;
	xfs_buf_mark_corrupt(bp);
	xfs_trans_brelse(cur->bc_tp, bp);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}

/*
 * Get current search key.  For level 0 we don't actually have a key
 * structure so we make one up from the record.  For all other levels
 * we just return the right key.
 */
STATIC union xfs_btree_key *
xfs_lookup_get_search_key(
	struct xfs_btree_cur	*cur,
	int			level,
	int			keyno,
	struct xfs_btree_block	*block,
	union xfs_btree_key	*kp)
{
	if (level == 0) {
		cur->bc_ops->init_key_from_rec(kp,
				xfs_btree_rec_addr(cur, keyno, block));
		return kp;
	}

	return xfs_btree_key_addr(cur, keyno, block);
}

/*
 * Initialize a pointer to the root block.
 */
void
xfs_btree_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		/*
		 * Inode-rooted btrees call xfs_btree_get_iroot to find the
		 * root in xfs_btree_lookup_get_block and don't need a pointer
		 * here.
		 */
		ptr->l = 0;
	} else if (cur->bc_flags & XFS_BTREE_STAGING) {
		ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
	} else {
		cur->bc_ops->init_ptr_from_cur(cur, ptr);
	}
}

/*
 * Lookup the record.  The cursor is made to point to it, based on dir.
 * stat is set to 0 if we can't find any such record, 1 for success.
 */
int						/* error */
xfs_btree_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_lookup_t		dir,	/* <=, ==, or >= */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* current btree block */
	int64_t			diff;	/* difference for the current key */
	int			error;	/* error return value */
	int			keyno;	/* current key number */
	int			level;	/* level in the btree */
	union xfs_btree_ptr	*pp;	/* ptr to btree block */
	union xfs_btree_ptr	ptr;	/* ptr to btree block */

	XFS_BTREE_STATS_INC(cur, lookup);

	/* No such thing as a zero-level tree. */
	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	block = NULL;
	keyno = 0;

	/* initialise start pointer from cursor */
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	pp = &ptr;

	/*
	 * Iterate over each level in the btree, starting at the root.
	 * For each level above the leaves, find the key we need, based
	 * on the lookup record, then follow the corresponding block
	 * pointer down to the next level.
	 */
	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
		/* Get the block we need to do the lookup on. */
		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
		if (error)
			goto error0;

		if (diff == 0) {
			/*
			 * If we already had a key match at a higher level, we
			 * know we need to use the first entry in this block.
			 */
			keyno = 1;
		} else {
			/* Otherwise search this block.  Do a binary search. */

			int	high;	/* high entry number */
			int	low;	/* low entry number */

			/* Set low and high entry numbers, 1-based. */
			low = 1;
			high = xfs_btree_get_numrecs(block);
			if (!high) {
				/* Block is empty, must be an empty leaf. */
				if (level != 0 || cur->bc_nlevels != 1) {
					XFS_CORRUPTION_ERROR(__func__,
							XFS_ERRLEVEL_LOW,
							cur->bc_mp, block,
							sizeof(*block));
					xfs_btree_mark_sick(cur);
					return -EFSCORRUPTED;
				}

				cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
				*stat = 0;
				return 0;
			}

			/* Binary search the block. */
*/ 2044 while (low <= high) { 2045 union xfs_btree_key key; 2046 union xfs_btree_key *kp; 2047 2048 XFS_BTREE_STATS_INC(cur, compare); 2049 2050 /* keyno is average of low and high. */ 2051 keyno = (low + high) >> 1; 2052 2053 /* Get current search key */ 2054 kp = xfs_lookup_get_search_key(cur, level, 2055 keyno, block, &key); 2056 2057 /* 2058 * Compute difference to get next direction: 2059 * - less than, move right 2060 * - greater than, move left 2061 * - equal, we're done 2062 */ 2063 diff = cur->bc_ops->key_diff(cur, kp); 2064 if (diff < 0) 2065 low = keyno + 1; 2066 else if (diff > 0) 2067 high = keyno - 1; 2068 else 2069 break; 2070 } 2071 } 2072 2073 /* 2074 * If there are more levels, set up for the next level 2075 * by getting the block number and filling in the cursor. 2076 */ 2077 if (level > 0) { 2078 /* 2079 * If we moved left, need the previous key number, 2080 * unless there isn't one. 2081 */ 2082 if (diff > 0 && --keyno < 1) 2083 keyno = 1; 2084 pp = xfs_btree_ptr_addr(cur, keyno, block); 2085 2086 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 2087 if (error) 2088 goto error0; 2089 2090 cur->bc_levels[level].ptr = keyno; 2091 } 2092 } 2093 2094 /* Done with the search. See if we need to adjust the results. */ 2095 if (dir != XFS_LOOKUP_LE && diff < 0) { 2096 keyno++; 2097 /* 2098 * If ge search and we went off the end of the block, but it's 2099 * not the last block, we're in the wrong block. 2100 */ 2101 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2102 if (dir == XFS_LOOKUP_GE && 2103 keyno > xfs_btree_get_numrecs(block) && 2104 !xfs_btree_ptr_is_null(cur, &ptr)) { 2105 int i; 2106 2107 cur->bc_levels[0].ptr = keyno; 2108 error = xfs_btree_increment(cur, 0, &i); 2109 if (error) 2110 goto error0; 2111 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 2112 xfs_btree_mark_sick(cur); 2113 return -EFSCORRUPTED; 2114 } 2115 *stat = 1; 2116 return 0; 2117 } 2118 } else if (dir == XFS_LOOKUP_LE && diff > 0) 2119 keyno--; 2120 cur->bc_levels[0].ptr = keyno; 2121 2122 /* Return if we succeeded or not. */ 2123 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2124 *stat = 0; 2125 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2126 *stat = 1; 2127 else 2128 *stat = 0; 2129 return 0; 2130 2131 error0: 2132 return error; 2133 } 2134 2135 /* Find the high key storage area from a regular key. 
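 *
 * In an overlapping btree each key slot physically stores two keys of
 * ->key_len / 2 bytes each:
 *
 *	offset 0:		low key (smallest key in the subtree)
 *	offset key_len / 2:	high key (largest key in the subtree)
 *
 * so the high key half of any key is located half a key past it.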
*/ 2136 union xfs_btree_key * 2137 xfs_btree_high_key_from_key( 2138 struct xfs_btree_cur *cur, 2139 union xfs_btree_key *key) 2140 { 2141 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2142 return (union xfs_btree_key *)((char *)key + 2143 (cur->bc_ops->key_len / 2)); 2144 } 2145 2146 /* Determine the low (and high if overlapped) keys of a leaf block */ 2147 STATIC void 2148 xfs_btree_get_leaf_keys( 2149 struct xfs_btree_cur *cur, 2150 struct xfs_btree_block *block, 2151 union xfs_btree_key *key) 2152 { 2153 union xfs_btree_key max_hkey; 2154 union xfs_btree_key hkey; 2155 union xfs_btree_rec *rec; 2156 union xfs_btree_key *high; 2157 int n; 2158 2159 rec = xfs_btree_rec_addr(cur, 1, block); 2160 cur->bc_ops->init_key_from_rec(key, rec); 2161 2162 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2163 2164 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2165 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2166 rec = xfs_btree_rec_addr(cur, n, block); 2167 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2168 if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) 2169 max_hkey = hkey; 2170 } 2171 2172 high = xfs_btree_high_key_from_key(cur, key); 2173 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2174 } 2175 } 2176 2177 /* Determine the low (and high if overlapped) keys of a node block */ 2178 STATIC void 2179 xfs_btree_get_node_keys( 2180 struct xfs_btree_cur *cur, 2181 struct xfs_btree_block *block, 2182 union xfs_btree_key *key) 2183 { 2184 union xfs_btree_key *hkey; 2185 union xfs_btree_key *max_hkey; 2186 union xfs_btree_key *high; 2187 int n; 2188 2189 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2190 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2191 cur->bc_ops->key_len / 2); 2192 2193 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2194 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2195 hkey = xfs_btree_high_key_addr(cur, n, block); 2196 if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) 2197 max_hkey = hkey; 2198 } 2199 2200 high = xfs_btree_high_key_from_key(cur, key); 2201 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2202 } else { 2203 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2204 cur->bc_ops->key_len); 2205 } 2206 } 2207 2208 /* Derive the keys for any btree block. */ 2209 void 2210 xfs_btree_get_keys( 2211 struct xfs_btree_cur *cur, 2212 struct xfs_btree_block *block, 2213 union xfs_btree_key *key) 2214 { 2215 if (be16_to_cpu(block->bb_level) == 0) 2216 xfs_btree_get_leaf_keys(cur, block, key); 2217 else 2218 xfs_btree_get_node_keys(cur, block, key); 2219 } 2220 2221 /* 2222 * Decide if we need to update the parent keys of a btree block. For 2223 * a standard btree this is only necessary if we're updating the first 2224 * record/key. For an overlapping btree, we must always update the 2225 * keys because the highest key can be in any of the records or keys 2226 * in the block. 2227 */ 2228 static inline bool 2229 xfs_btree_needs_key_update( 2230 struct xfs_btree_cur *cur, 2231 int ptr) 2232 { 2233 return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; 2234 } 2235 2236 /* 2237 * Update the low and high parent keys of the given level, progressing 2238 * towards the root. If force_all is false, stop if the keys for a given 2239 * level do not need updating. 
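 *
 * Stopping early is safe because a parent key pair depends only on the
 * keys of its child block: if neither the low nor the high key changed
 * at this level, no level above it can have changed either.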
2240 */ 2241 STATIC int 2242 __xfs_btree_updkeys( 2243 struct xfs_btree_cur *cur, 2244 int level, 2245 struct xfs_btree_block *block, 2246 struct xfs_buf *bp0, 2247 bool force_all) 2248 { 2249 union xfs_btree_key key; /* keys from current level */ 2250 union xfs_btree_key *lkey; /* keys from the next level up */ 2251 union xfs_btree_key *hkey; 2252 union xfs_btree_key *nlkey; /* keys from the next level up */ 2253 union xfs_btree_key *nhkey; 2254 struct xfs_buf *bp; 2255 int ptr; 2256 2257 ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); 2258 2259 /* Exit if there aren't any parent levels to update. */ 2260 if (level + 1 >= cur->bc_nlevels) 2261 return 0; 2262 2263 trace_xfs_btree_updkeys(cur, level, bp0); 2264 2265 lkey = &key; 2266 hkey = xfs_btree_high_key_from_key(cur, lkey); 2267 xfs_btree_get_keys(cur, block, lkey); 2268 for (level++; level < cur->bc_nlevels; level++) { 2269 #ifdef DEBUG 2270 int error; 2271 #endif 2272 block = xfs_btree_get_block(cur, level, &bp); 2273 trace_xfs_btree_updkeys(cur, level, bp); 2274 #ifdef DEBUG 2275 error = xfs_btree_check_block(cur, block, level, bp); 2276 if (error) 2277 return error; 2278 #endif 2279 ptr = cur->bc_levels[level].ptr; 2280 nlkey = xfs_btree_key_addr(cur, ptr, block); 2281 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2282 if (!force_all && 2283 xfs_btree_keycmp_eq(cur, nlkey, lkey) && 2284 xfs_btree_keycmp_eq(cur, nhkey, hkey)) 2285 break; 2286 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2287 xfs_btree_log_keys(cur, bp, ptr, ptr); 2288 if (level + 1 >= cur->bc_nlevels) 2289 break; 2290 xfs_btree_get_node_keys(cur, block, lkey); 2291 } 2292 2293 return 0; 2294 } 2295 2296 /* Update all the keys from some level in cursor back to the root. */ 2297 STATIC int 2298 xfs_btree_updkeys_force( 2299 struct xfs_btree_cur *cur, 2300 int level) 2301 { 2302 struct xfs_buf *bp; 2303 struct xfs_btree_block *block; 2304 2305 block = xfs_btree_get_block(cur, level, &bp); 2306 return __xfs_btree_updkeys(cur, level, block, bp, true); 2307 } 2308 2309 /* 2310 * Update the parent keys of the given level, progressing towards the root. 2311 */ 2312 STATIC int 2313 xfs_btree_update_keys( 2314 struct xfs_btree_cur *cur, 2315 int level) 2316 { 2317 struct xfs_btree_block *block; 2318 struct xfs_buf *bp; 2319 union xfs_btree_key *kp; 2320 union xfs_btree_key key; 2321 int ptr; 2322 2323 ASSERT(level >= 0); 2324 2325 block = xfs_btree_get_block(cur, level, &bp); 2326 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) 2327 return __xfs_btree_updkeys(cur, level, block, bp, false); 2328 2329 /* 2330 * Go up the tree from this level toward the root. 2331 * At each level, update the key value to the value input. 2332 * Stop when we reach a level where the cursor isn't pointing 2333 * at the first entry in the block. 2334 */ 2335 xfs_btree_get_keys(cur, block, &key); 2336 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2337 #ifdef DEBUG 2338 int error; 2339 #endif 2340 block = xfs_btree_get_block(cur, level, &bp); 2341 #ifdef DEBUG 2342 error = xfs_btree_check_block(cur, block, level, bp); 2343 if (error) 2344 return error; 2345 #endif 2346 ptr = cur->bc_levels[level].ptr; 2347 kp = xfs_btree_key_addr(cur, ptr, block); 2348 xfs_btree_copy_keys(cur, kp, &key, 1); 2349 xfs_btree_log_keys(cur, bp, ptr, ptr); 2350 } 2351 2352 return 0; 2353 } 2354 2355 /* 2356 * Update the record referred to by cur to the value in the 2357 * given record. This either works (return 0) or gets an 2358 * EFSCORRUPTED error. 
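 *
 * Callers typically position the cursor with an exact-match lookup
 * first, along these lines (a sketch only, error handling elided):
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!error && stat == 1)
 *		error = xfs_btree_update(cur, &rec);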
2359 */ 2360 int 2361 xfs_btree_update( 2362 struct xfs_btree_cur *cur, 2363 union xfs_btree_rec *rec) 2364 { 2365 struct xfs_btree_block *block; 2366 struct xfs_buf *bp; 2367 int error; 2368 int ptr; 2369 union xfs_btree_rec *rp; 2370 2371 /* Pick up the current block. */ 2372 block = xfs_btree_get_block(cur, 0, &bp); 2373 2374 #ifdef DEBUG 2375 error = xfs_btree_check_block(cur, block, 0, bp); 2376 if (error) 2377 goto error0; 2378 #endif 2379 /* Get the address of the rec to be updated. */ 2380 ptr = cur->bc_levels[0].ptr; 2381 rp = xfs_btree_rec_addr(cur, ptr, block); 2382 2383 /* Fill in the new contents and log them. */ 2384 xfs_btree_copy_recs(cur, rp, rec, 1); 2385 xfs_btree_log_recs(cur, bp, ptr, ptr); 2386 2387 /* Pass new key value up to our parent. */ 2388 if (xfs_btree_needs_key_update(cur, ptr)) { 2389 error = xfs_btree_update_keys(cur, 0); 2390 if (error) 2391 goto error0; 2392 } 2393 2394 return 0; 2395 2396 error0: 2397 return error; 2398 } 2399 2400 /* 2401 * Move 1 record left from cur/level if possible. 2402 * Update cur to reflect the new path. 2403 */ 2404 STATIC int /* error */ 2405 xfs_btree_lshift( 2406 struct xfs_btree_cur *cur, 2407 int level, 2408 int *stat) /* success/failure */ 2409 { 2410 struct xfs_buf *lbp; /* left buffer pointer */ 2411 struct xfs_btree_block *left; /* left btree block */ 2412 int lrecs; /* left record count */ 2413 struct xfs_buf *rbp; /* right buffer pointer */ 2414 struct xfs_btree_block *right; /* right btree block */ 2415 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2416 int rrecs; /* right record count */ 2417 union xfs_btree_ptr lptr; /* left btree pointer */ 2418 union xfs_btree_key *rkp = NULL; /* right btree key */ 2419 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2420 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2421 int error; /* error return value */ 2422 int i; 2423 2424 if (xfs_btree_at_iroot(cur, level)) 2425 goto out0; 2426 2427 /* Set up variables for this block as "right". */ 2428 right = xfs_btree_get_block(cur, level, &rbp); 2429 2430 #ifdef DEBUG 2431 error = xfs_btree_check_block(cur, right, level, rbp); 2432 if (error) 2433 goto error0; 2434 #endif 2435 2436 /* If we've got no left sibling then we can't shift an entry left. */ 2437 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2438 if (xfs_btree_ptr_is_null(cur, &lptr)) 2439 goto out0; 2440 2441 /* 2442 * If the cursor entry is the one that would be moved, don't 2443 * do it... it's too complicated. 2444 */ 2445 if (cur->bc_levels[level].ptr <= 1) 2446 goto out0; 2447 2448 /* Set up the left neighbor as "left". */ 2449 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2450 if (error) 2451 goto error0; 2452 2453 /* If it's full, it can't take another entry. */ 2454 lrecs = xfs_btree_get_numrecs(left); 2455 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2456 goto out0; 2457 2458 rrecs = xfs_btree_get_numrecs(right); 2459 2460 /* 2461 * We add one entry to the left side and remove one for the right side. 2462 * Account for it here, the changes will be updated on disk and logged 2463 * later. 2464 */ 2465 lrecs++; 2466 rrecs--; 2467 2468 XFS_BTREE_STATS_INC(cur, lshift); 2469 XFS_BTREE_STATS_ADD(cur, moves, 1); 2470 2471 /* 2472 * If non-leaf, copy a key and a ptr to the left block. 2473 * Log the changes to the left block. 2474 */ 2475 if (level > 0) { 2476 /* It's a non-leaf. Move keys and pointers. 
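		 *
		 * As an illustration, with lrecs already bumped to 4 the
		 * first key/ptr pair of "right" lands in slot 4 of "left",
		 * and the remaining right-hand entries are slid down one
		 * slot by the shift calls further below.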
*/ 2477 union xfs_btree_key *lkp; /* left btree key */ 2478 union xfs_btree_ptr *lpp; /* left address pointer */ 2479 2480 lkp = xfs_btree_key_addr(cur, lrecs, left); 2481 rkp = xfs_btree_key_addr(cur, 1, right); 2482 2483 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2484 rpp = xfs_btree_ptr_addr(cur, 1, right); 2485 2486 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2487 if (error) 2488 goto error0; 2489 2490 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2491 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2492 2493 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2494 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2495 2496 ASSERT(cur->bc_ops->keys_inorder(cur, 2497 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2498 } else { 2499 /* It's a leaf. Move records. */ 2500 union xfs_btree_rec *lrp; /* left record pointer */ 2501 2502 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2503 rrp = xfs_btree_rec_addr(cur, 1, right); 2504 2505 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2506 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2507 2508 ASSERT(cur->bc_ops->recs_inorder(cur, 2509 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2510 } 2511 2512 xfs_btree_set_numrecs(left, lrecs); 2513 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2514 2515 xfs_btree_set_numrecs(right, rrecs); 2516 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2517 2518 /* 2519 * Slide the contents of right down one entry. 2520 */ 2521 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2522 if (level > 0) { 2523 /* It's a nonleaf. operate on keys and ptrs */ 2524 for (i = 0; i < rrecs; i++) { 2525 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2526 if (error) 2527 goto error0; 2528 } 2529 2530 xfs_btree_shift_keys(cur, 2531 xfs_btree_key_addr(cur, 2, right), 2532 -1, rrecs); 2533 xfs_btree_shift_ptrs(cur, 2534 xfs_btree_ptr_addr(cur, 2, right), 2535 -1, rrecs); 2536 2537 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2538 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2539 } else { 2540 /* It's a leaf. operate on records */ 2541 xfs_btree_shift_recs(cur, 2542 xfs_btree_rec_addr(cur, 2, right), 2543 -1, rrecs); 2544 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2545 } 2546 2547 /* 2548 * Using a temporary cursor, update the parent key values of the 2549 * block on the left. 2550 */ 2551 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2552 error = xfs_btree_dup_cursor(cur, &tcur); 2553 if (error) 2554 goto error0; 2555 i = xfs_btree_firstrec(tcur, level); 2556 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2557 xfs_btree_mark_sick(cur); 2558 error = -EFSCORRUPTED; 2559 goto error0; 2560 } 2561 2562 error = xfs_btree_decrement(tcur, level, &i); 2563 if (error) 2564 goto error1; 2565 2566 /* Update the parent high keys of the left block, if needed. */ 2567 error = xfs_btree_update_keys(tcur, level); 2568 if (error) 2569 goto error1; 2570 2571 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2572 } 2573 2574 /* Update the parent keys of the right block. */ 2575 error = xfs_btree_update_keys(cur, level); 2576 if (error) 2577 goto error0; 2578 2579 /* Slide the cursor value left one. */ 2580 cur->bc_levels[level].ptr--; 2581 2582 *stat = 1; 2583 return 0; 2584 2585 out0: 2586 *stat = 0; 2587 return 0; 2588 2589 error0: 2590 return error; 2591 2592 error1: 2593 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2594 return error; 2595 } 2596 2597 /* 2598 * Move 1 record right from cur/level if possible. 2599 * Update cur to reflect the new path. 
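 *
 * Mirror image of the left shift above: the last entry of this block
 * moves into slot 1 of the right sibling (which is first shifted up to
 * make room), so e.g. left (K7 K8 K9) / right (K12 K13) becomes
 * left (K7 K8) / right (K9 K12 K13).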
2600 */ 2601 STATIC int /* error */ 2602 xfs_btree_rshift( 2603 struct xfs_btree_cur *cur, 2604 int level, 2605 int *stat) /* success/failure */ 2606 { 2607 struct xfs_buf *lbp; /* left buffer pointer */ 2608 struct xfs_btree_block *left; /* left btree block */ 2609 struct xfs_buf *rbp; /* right buffer pointer */ 2610 struct xfs_btree_block *right; /* right btree block */ 2611 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2612 union xfs_btree_ptr rptr; /* right block pointer */ 2613 union xfs_btree_key *rkp; /* right btree key */ 2614 int rrecs; /* right record count */ 2615 int lrecs; /* left record count */ 2616 int error; /* error return value */ 2617 int i; /* loop counter */ 2618 2619 if (xfs_btree_at_iroot(cur, level)) 2620 goto out0; 2621 2622 /* Set up variables for this block as "left". */ 2623 left = xfs_btree_get_block(cur, level, &lbp); 2624 2625 #ifdef DEBUG 2626 error = xfs_btree_check_block(cur, left, level, lbp); 2627 if (error) 2628 goto error0; 2629 #endif 2630 2631 /* If we've got no right sibling then we can't shift an entry right. */ 2632 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2633 if (xfs_btree_ptr_is_null(cur, &rptr)) 2634 goto out0; 2635 2636 /* 2637 * If the cursor entry is the one that would be moved, don't 2638 * do it... it's too complicated. 2639 */ 2640 lrecs = xfs_btree_get_numrecs(left); 2641 if (cur->bc_levels[level].ptr >= lrecs) 2642 goto out0; 2643 2644 /* Set up the right neighbor as "right". */ 2645 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2646 if (error) 2647 goto error0; 2648 2649 /* If it's full, it can't take another entry. */ 2650 rrecs = xfs_btree_get_numrecs(right); 2651 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2652 goto out0; 2653 2654 XFS_BTREE_STATS_INC(cur, rshift); 2655 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2656 2657 /* 2658 * Make a hole at the start of the right neighbor block, then 2659 * copy the last left block entry to the hole. 2660 */ 2661 if (level > 0) { 2662 /* It's a nonleaf. make a hole in the keys and ptrs */ 2663 union xfs_btree_key *lkp; 2664 union xfs_btree_ptr *lpp; 2665 union xfs_btree_ptr *rpp; 2666 2667 lkp = xfs_btree_key_addr(cur, lrecs, left); 2668 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2669 rkp = xfs_btree_key_addr(cur, 1, right); 2670 rpp = xfs_btree_ptr_addr(cur, 1, right); 2671 2672 for (i = rrecs - 1; i >= 0; i--) { 2673 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2674 if (error) 2675 goto error0; 2676 } 2677 2678 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2679 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2680 2681 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2682 if (error) 2683 goto error0; 2684 2685 /* Now put the new data in, and log it. */ 2686 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2687 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2688 2689 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2690 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2691 2692 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2693 xfs_btree_key_addr(cur, 2, right))); 2694 } else { 2695 /* It's a leaf. make a hole in the records */ 2696 union xfs_btree_rec *lrp; 2697 union xfs_btree_rec *rrp; 2698 2699 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2700 rrp = xfs_btree_rec_addr(cur, 1, right); 2701 2702 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2703 2704 /* Now put the new data in, and log it. 
*/ 2705 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2706 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2707 } 2708 2709 /* 2710 * Decrement and log left's numrecs, bump and log right's numrecs. 2711 */ 2712 xfs_btree_set_numrecs(left, --lrecs); 2713 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2714 2715 xfs_btree_set_numrecs(right, ++rrecs); 2716 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2717 2718 /* 2719 * Using a temporary cursor, update the parent key values of the 2720 * block on the right. 2721 */ 2722 error = xfs_btree_dup_cursor(cur, &tcur); 2723 if (error) 2724 goto error0; 2725 i = xfs_btree_lastrec(tcur, level); 2726 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2727 xfs_btree_mark_sick(cur); 2728 error = -EFSCORRUPTED; 2729 goto error0; 2730 } 2731 2732 error = xfs_btree_increment(tcur, level, &i); 2733 if (error) 2734 goto error1; 2735 2736 /* Update the parent high keys of the left block, if needed. */ 2737 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2738 error = xfs_btree_update_keys(cur, level); 2739 if (error) 2740 goto error1; 2741 } 2742 2743 /* Update the parent keys of the right block. */ 2744 error = xfs_btree_update_keys(tcur, level); 2745 if (error) 2746 goto error1; 2747 2748 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2749 2750 *stat = 1; 2751 return 0; 2752 2753 out0: 2754 *stat = 0; 2755 return 0; 2756 2757 error0: 2758 return error; 2759 2760 error1: 2761 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2762 return error; 2763 } 2764 2765 static inline int 2766 xfs_btree_alloc_block( 2767 struct xfs_btree_cur *cur, 2768 const union xfs_btree_ptr *hint_block, 2769 union xfs_btree_ptr *new_block, 2770 int *stat) 2771 { 2772 int error; 2773 2774 /* 2775 * Don't allow block allocation for a staging cursor, because staging 2776 * cursors do not support regular btree modifications. 2777 * 2778 * Bulk loading uses a separate callback to obtain new blocks from a 2779 * preallocated list, which prevents ENOSPC failures during loading. 2780 */ 2781 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { 2782 ASSERT(0); 2783 return -EFSCORRUPTED; 2784 } 2785 2786 error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); 2787 trace_xfs_btree_alloc_block(cur, new_block, *stat, error); 2788 return error; 2789 } 2790 2791 /* 2792 * Split cur/level block in half. 2793 * Return new block number and the key to its first 2794 * record (to be inserted into parent). 2795 */ 2796 STATIC int /* error */ 2797 __xfs_btree_split( 2798 struct xfs_btree_cur *cur, 2799 int level, 2800 union xfs_btree_ptr *ptrp, 2801 union xfs_btree_key *key, 2802 struct xfs_btree_cur **curp, 2803 int *stat) /* success/failure */ 2804 { 2805 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2806 struct xfs_buf *lbp; /* left buffer pointer */ 2807 struct xfs_btree_block *left; /* left btree block */ 2808 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2809 struct xfs_buf *rbp; /* right buffer pointer */ 2810 struct xfs_btree_block *right; /* right btree block */ 2811 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2812 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2813 struct xfs_btree_block *rrblock; /* right-right btree block */ 2814 int lrecs; 2815 int rrecs; 2816 int src_index; 2817 int error; /* error return value */ 2818 int i; 2819 2820 XFS_BTREE_STATS_INC(cur, split); 2821 2822 /* Set up left block (current one). 
 */
2823 	left = xfs_btree_get_block(cur, level, &lbp);
2824 
2825 #ifdef DEBUG
2826 	error = xfs_btree_check_block(cur, left, level, lbp);
2827 	if (error)
2828 		goto error0;
2829 #endif
2830 
2831 	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2832 
2833 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
2834 	error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
2835 	if (error)
2836 		goto error0;
2837 	if (*stat == 0)
2838 		goto out0;
2839 	XFS_BTREE_STATS_INC(cur, alloc);
2840 
2841 	/* Set up the new block as "right". */
2842 	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
2843 	if (error)
2844 		goto error0;
2845 
2846 	/* Fill in the btree header for the new right block. */
2847 	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2848 
2849 	/*
2850 	 * Split the entries between the old and the new block evenly.
2851 	 * Make sure that if there's an odd number of entries now, each
2852 	 * new block will have the same number of entries.
2853 	 */
2854 	lrecs = xfs_btree_get_numrecs(left);
2855 	rrecs = lrecs / 2;
2856 	if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
2857 		rrecs++;
2858 	src_index = (lrecs - rrecs + 1);
2859 
2860 	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2861 
2862 	/* Adjust numrecs for the later get_*_keys() calls. */
2863 	lrecs -= rrecs;
2864 	xfs_btree_set_numrecs(left, lrecs);
2865 	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2866 
2867 	/*
2868 	 * Copy btree block entries from the left block over to the
2869 	 * new block, the right. Update the right block and log the
2870 	 * changes.
2871 	 */
2872 	if (level > 0) {
2873 		/* It's a non-leaf. Move keys and pointers. */
2874 		union xfs_btree_key	*lkp;	/* left btree key */
2875 		union xfs_btree_ptr	*lpp;	/* left address pointer */
2876 		union xfs_btree_key	*rkp;	/* right btree key */
2877 		union xfs_btree_ptr	*rpp;	/* right address pointer */
2878 
2879 		lkp = xfs_btree_key_addr(cur, src_index, left);
2880 		lpp = xfs_btree_ptr_addr(cur, src_index, left);
2881 		rkp = xfs_btree_key_addr(cur, 1, right);
2882 		rpp = xfs_btree_ptr_addr(cur, 1, right);
2883 
2884 		for (i = src_index; i < rrecs; i++) {
2885 			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
2886 			if (error)
2887 				goto error0;
2888 		}
2889 
2890 		/* Copy the keys & pointers to the new block. */
2891 		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2892 		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2893 
2894 		xfs_btree_log_keys(cur, rbp, 1, rrecs);
2895 		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2896 
2897 		/* Stash the keys of the new block for later insertion. */
2898 		xfs_btree_get_node_keys(cur, right, key);
2899 	} else {
2900 		/* It's a leaf. Move records. */
2901 		union xfs_btree_rec	*lrp;	/* left record pointer */
2902 		union xfs_btree_rec	*rrp;	/* right record pointer */
2903 
2904 		lrp = xfs_btree_rec_addr(cur, src_index, left);
2905 		rrp = xfs_btree_rec_addr(cur, 1, right);
2906 
2907 		/* Copy records to the new block. */
2908 		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2909 		xfs_btree_log_recs(cur, rbp, 1, rrecs);
2910 
2911 		/* Stash the keys of the new block for later insertion. */
2912 		xfs_btree_get_leaf_keys(cur, right, key);
2913 	}
2914 
2915 	/*
2916 	 * Find the left block number by looking in the buffer.
2917 	 * Adjust sibling pointers.
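	 *
	 * The sibling chain goes from
	 *
	 *	... <-> left <-> rrblock <-> ...
	 * to
	 *	... <-> left <-> right <-> rrblock <-> ...
	 *
	 * with rrblock's back pointer fixed up below, if that block exists.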
2918 */ 2919 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2920 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2921 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2922 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2923 2924 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2925 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2926 2927 /* 2928 * If there's a block to the new block's right, make that block 2929 * point back to right instead of to left. 2930 */ 2931 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2932 error = xfs_btree_read_buf_block(cur, &rrptr, 2933 0, &rrblock, &rrbp); 2934 if (error) 2935 goto error0; 2936 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2937 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2938 } 2939 2940 /* Update the parent high keys of the left block, if needed. */ 2941 if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { 2942 error = xfs_btree_update_keys(cur, level); 2943 if (error) 2944 goto error0; 2945 } 2946 2947 /* 2948 * If the cursor is really in the right block, move it there. 2949 * If it's just pointing past the last entry in left, then we'll 2950 * insert there, so don't change anything in that case. 2951 */ 2952 if (cur->bc_levels[level].ptr > lrecs + 1) { 2953 xfs_btree_setbuf(cur, level, rbp); 2954 cur->bc_levels[level].ptr -= lrecs; 2955 } 2956 /* 2957 * If there are more levels, we'll need another cursor which refers 2958 * the right block, no matter where this cursor was. 2959 */ 2960 if (level + 1 < cur->bc_nlevels) { 2961 error = xfs_btree_dup_cursor(cur, curp); 2962 if (error) 2963 goto error0; 2964 (*curp)->bc_levels[level + 1].ptr++; 2965 } 2966 *ptrp = rptr; 2967 *stat = 1; 2968 return 0; 2969 out0: 2970 *stat = 0; 2971 return 0; 2972 2973 error0: 2974 return error; 2975 } 2976 2977 #ifdef __KERNEL__ 2978 struct xfs_btree_split_args { 2979 struct xfs_btree_cur *cur; 2980 int level; 2981 union xfs_btree_ptr *ptrp; 2982 union xfs_btree_key *key; 2983 struct xfs_btree_cur **curp; 2984 int *stat; /* success/failure */ 2985 int result; 2986 bool kswapd; /* allocation in kswapd context */ 2987 struct completion *done; 2988 struct work_struct work; 2989 }; 2990 2991 /* 2992 * Stack switching interfaces for allocation 2993 */ 2994 static void 2995 xfs_btree_split_worker( 2996 struct work_struct *work) 2997 { 2998 struct xfs_btree_split_args *args = container_of(work, 2999 struct xfs_btree_split_args, work); 3000 unsigned long pflags; 3001 unsigned long new_pflags = 0; 3002 3003 /* 3004 * we are in a transaction context here, but may also be doing work 3005 * in kswapd context, and hence we may need to inherit that state 3006 * temporarily to ensure that we don't block waiting for memory reclaim 3007 * in any way. 3008 */ 3009 if (args->kswapd) 3010 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 3011 3012 current_set_flags_nested(&pflags, new_pflags); 3013 xfs_trans_set_context(args->cur->bc_tp); 3014 3015 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 3016 args->key, args->curp, args->stat); 3017 3018 xfs_trans_clear_context(args->cur->bc_tp); 3019 current_restore_flags_nested(&pflags, new_pflags); 3020 3021 /* 3022 * Do not access args after complete() has run here. We don't own args 3023 * and the owner may run and free args before we return here. 
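	 *
	 * (complete() is the handover point: the waiter sleeping in
	 * xfs_btree_split() may wake up, return and unwind the stack
	 * frame that args lives in the moment it is called.)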
3024 */ 3025 complete(args->done); 3026 3027 } 3028 3029 /* 3030 * BMBT split requests often come in with little stack to work on so we push 3031 * them off to a worker thread so there is lots of stack to use. For the other 3032 * btree types, just call directly to avoid the context switch overhead here. 3033 * 3034 * Care must be taken here - the work queue rescuer thread introduces potential 3035 * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new 3036 * AGFs to allocate blocks. A task being run by the rescuer could attempt to 3037 * lock an AGF that is already locked by a task queued to run by the rescuer, 3038 * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to 3039 * release it until the current thread it is running gains the lock. 3040 * 3041 * To avoid this issue, we only ever queue BMBT splits that don't have an AGF 3042 * already locked to allocate from. The only place that doesn't hold an AGF 3043 * locked is unwritten extent conversion at IO completion, but that has already 3044 * been offloaded to a worker thread and hence has no stack consumption issues 3045 * we have to worry about. 3046 */ 3047 STATIC int /* error */ 3048 xfs_btree_split( 3049 struct xfs_btree_cur *cur, 3050 int level, 3051 union xfs_btree_ptr *ptrp, 3052 union xfs_btree_key *key, 3053 struct xfs_btree_cur **curp, 3054 int *stat) /* success/failure */ 3055 { 3056 struct xfs_btree_split_args args; 3057 DECLARE_COMPLETION_ONSTACK(done); 3058 3059 if (!xfs_btree_is_bmap(cur->bc_ops) || 3060 cur->bc_tp->t_highest_agno == NULLAGNUMBER) 3061 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 3062 3063 args.cur = cur; 3064 args.level = level; 3065 args.ptrp = ptrp; 3066 args.key = key; 3067 args.curp = curp; 3068 args.stat = stat; 3069 args.done = &done; 3070 args.kswapd = current_is_kswapd(); 3071 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 3072 queue_work(xfs_alloc_wq, &args.work); 3073 wait_for_completion(&done); 3074 destroy_work_on_stack(&args.work); 3075 return args.result; 3076 } 3077 #else 3078 #define xfs_btree_split __xfs_btree_split 3079 #endif /* __KERNEL__ */ 3080 3081 /* 3082 * Copy the old inode root contents into a real block and make the 3083 * broot point to it. 3084 */ 3085 int /* error */ 3086 xfs_btree_new_iroot( 3087 struct xfs_btree_cur *cur, /* btree cursor */ 3088 int *logflags, /* logging flags for inode */ 3089 int *stat) /* return status - 0 fail */ 3090 { 3091 struct xfs_buf *cbp; /* buffer for cblock */ 3092 struct xfs_btree_block *block; /* btree block */ 3093 struct xfs_btree_block *cblock; /* child btree block */ 3094 union xfs_btree_key *ckp; /* child key pointer */ 3095 union xfs_btree_ptr *cpp; /* child ptr pointer */ 3096 union xfs_btree_key *kp; /* pointer to btree key */ 3097 union xfs_btree_ptr *pp; /* pointer to block addr */ 3098 union xfs_btree_ptr nptr; /* new block addr */ 3099 int level; /* btree level */ 3100 int error; /* error return code */ 3101 int i; /* loop counter */ 3102 3103 XFS_BTREE_STATS_INC(cur, newroot); 3104 3105 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 3106 3107 level = cur->bc_nlevels - 1; 3108 3109 block = xfs_btree_get_iroot(cur); 3110 pp = xfs_btree_ptr_addr(cur, 1, block); 3111 3112 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3113 error = xfs_btree_alloc_block(cur, pp, &nptr, stat); 3114 if (error) 3115 goto error0; 3116 if (*stat == 0) 3117 return 0; 3118 3119 XFS_BTREE_STATS_INC(cur, alloc); 3120 3121 /* Copy the root into a real block. 
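	 *
	 * Schematically: the records in the inode root move into the new
	 * child block 'cblock', and the root is left holding the single
	 * key/ptr pair that points at it, one level further up.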
 */
3122 	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
3123 	if (error)
3124 		goto error0;
3125 
3126 	/*
3127 	 * We can't just memcpy() the root in for CRC enabled btree blocks.
3128 	 * In that case we also have to ensure the blkno remains correct.
3129 	 */
3130 	memcpy(cblock, block, xfs_btree_block_len(cur));
3131 	if (xfs_has_crc(cur->bc_mp)) {
3132 		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
3133 		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
3134 			cblock->bb_u.l.bb_blkno = bno;
3135 		else
3136 			cblock->bb_u.s.bb_blkno = bno;
3137 	}
3138 
3139 	be16_add_cpu(&block->bb_level, 1);
3140 	xfs_btree_set_numrecs(block, 1);
3141 	cur->bc_nlevels++;
3142 	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
3143 	cur->bc_levels[level + 1].ptr = 1;
3144 
3145 	kp = xfs_btree_key_addr(cur, 1, block);
3146 	ckp = xfs_btree_key_addr(cur, 1, cblock);
3147 	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
3148 
3149 	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3150 	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
3151 		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3152 		if (error)
3153 			goto error0;
3154 	}
3155 
3156 	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
3157 
3158 	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
3159 	if (error)
3160 		goto error0;
3161 
3162 	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
3163 
3164 	xfs_iroot_realloc(cur->bc_ino.ip,
3165 			  1 - xfs_btree_get_numrecs(cblock),
3166 			  cur->bc_ino.whichfork);
3167 
3168 	xfs_btree_setbuf(cur, level, cbp);
3169 
3170 	/*
3171 	 * Do all this logging at the end so that
3172 	 * the root is at the right level.
3173 	 */
3174 	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
3175 	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3176 	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
3177 
3178 	*logflags |=
3179 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
3180 	*stat = 1;
3181 	return 0;
3182 error0:
3183 	return error;
3184 }
3185 
3186 static void
3187 xfs_btree_set_root(
3188 	struct xfs_btree_cur		*cur,
3189 	const union xfs_btree_ptr	*ptr,
3190 	int				inc)
3191 {
3192 	if (cur->bc_flags & XFS_BTREE_STAGING) {
3193 		/* Update the btree root information for a per-AG fake root. */
3194 		cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
3195 		cur->bc_ag.afake->af_levels += inc;
3196 	} else {
3197 		cur->bc_ops->set_root(cur, ptr, inc);
3198 	}
3199 }
3200 
3201 /*
3202  * Allocate a new root block and fill it in.
3203  */
3204 STATIC int				/* error */
3205 xfs_btree_new_root(
3206 	struct xfs_btree_cur	*cur,	/* btree cursor */
3207 	int			*stat)	/* success/failure */
3208 {
3209 	struct xfs_btree_block	*block;	/* one half of the old root block */
3210 	struct xfs_buf		*bp;	/* buffer containing block */
3211 	int			error;	/* error return value */
3212 	struct xfs_buf		*lbp;	/* left buffer pointer */
3213 	struct xfs_btree_block	*left;	/* left btree block */
3214 	struct xfs_buf		*nbp;	/* new (root) buffer */
3215 	struct xfs_btree_block	*new;	/* new (root) btree block */
3216 	int			nptr;	/* new value for key index, 1 or 2 */
3217 	struct xfs_buf		*rbp;	/* right buffer pointer */
3218 	struct xfs_btree_block	*right;	/* right btree block */
3219 	union xfs_btree_ptr	rptr;
3220 	union xfs_btree_ptr	lptr;
3221 
3222 	XFS_BTREE_STATS_INC(cur, newroot);
3223 
3224 	/* initialise our start point from the cursor */
3225 	xfs_btree_init_ptr_from_cur(cur, &rptr);
3226 
3227 	/* Allocate the new block. If we can't do it, we're toast. Give up.
*/ 3228 error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); 3229 if (error) 3230 goto error0; 3231 if (*stat == 0) 3232 goto out0; 3233 XFS_BTREE_STATS_INC(cur, alloc); 3234 3235 /* Set up the new block. */ 3236 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3237 if (error) 3238 goto error0; 3239 3240 /* Set the root in the holding structure increasing the level by 1. */ 3241 xfs_btree_set_root(cur, &lptr, 1); 3242 3243 /* 3244 * At the previous root level there are now two blocks: the old root, 3245 * and the new block generated when it was split. We don't know which 3246 * one the cursor is pointing at, so we set up variables "left" and 3247 * "right" for each case. 3248 */ 3249 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3250 3251 #ifdef DEBUG 3252 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3253 if (error) 3254 goto error0; 3255 #endif 3256 3257 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3258 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3259 /* Our block is left, pick up the right block. */ 3260 lbp = bp; 3261 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3262 left = block; 3263 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3264 if (error) 3265 goto error0; 3266 bp = rbp; 3267 nptr = 1; 3268 } else { 3269 /* Our block is right, pick up the left block. */ 3270 rbp = bp; 3271 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3272 right = block; 3273 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3274 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3275 if (error) 3276 goto error0; 3277 bp = lbp; 3278 nptr = 2; 3279 } 3280 3281 /* Fill in the new block's btree header and log it. */ 3282 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3283 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3284 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3285 !xfs_btree_ptr_is_null(cur, &rptr)); 3286 3287 /* Fill in the key data in the new root. */ 3288 if (xfs_btree_get_level(left) > 0) { 3289 /* 3290 * Get the keys for the left block's keys and put them directly 3291 * in the parent block. Do the same for the right block. 3292 */ 3293 xfs_btree_get_node_keys(cur, left, 3294 xfs_btree_key_addr(cur, 1, new)); 3295 xfs_btree_get_node_keys(cur, right, 3296 xfs_btree_key_addr(cur, 2, new)); 3297 } else { 3298 /* 3299 * Get the keys for the left block's records and put them 3300 * directly in the parent block. Do the same for the right 3301 * block. 3302 */ 3303 xfs_btree_get_leaf_keys(cur, left, 3304 xfs_btree_key_addr(cur, 1, new)); 3305 xfs_btree_get_leaf_keys(cur, right, 3306 xfs_btree_key_addr(cur, 2, new)); 3307 } 3308 xfs_btree_log_keys(cur, nbp, 1, 2); 3309 3310 /* Fill in the pointer data in the new root. */ 3311 xfs_btree_copy_ptrs(cur, 3312 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3313 xfs_btree_copy_ptrs(cur, 3314 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3315 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3316 3317 /* Fix up the cursor. 
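	 *
	 * The cursor grows by one level: the new root buffer becomes
	 * bc_levels[bc_nlevels], with ptr set to 1 or 2 depending on
	 * whether the block the cursor was on became the left or the
	 * right child of the new root.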
*/ 3318 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3319 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3320 cur->bc_nlevels++; 3321 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3322 *stat = 1; 3323 return 0; 3324 error0: 3325 return error; 3326 out0: 3327 *stat = 0; 3328 return 0; 3329 } 3330 3331 STATIC int 3332 xfs_btree_make_block_unfull( 3333 struct xfs_btree_cur *cur, /* btree cursor */ 3334 int level, /* btree level */ 3335 int numrecs,/* # of recs in block */ 3336 int *oindex,/* old tree index */ 3337 int *index, /* new tree index */ 3338 union xfs_btree_ptr *nptr, /* new btree ptr */ 3339 struct xfs_btree_cur **ncur, /* new btree cursor */ 3340 union xfs_btree_key *key, /* key of new block */ 3341 int *stat) 3342 { 3343 int error = 0; 3344 3345 if (xfs_btree_at_iroot(cur, level)) { 3346 struct xfs_inode *ip = cur->bc_ino.ip; 3347 3348 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3349 /* A root block that can be made bigger. */ 3350 xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); 3351 *stat = 1; 3352 } else { 3353 /* A root block that needs replacing */ 3354 int logflags = 0; 3355 3356 error = xfs_btree_new_iroot(cur, &logflags, stat); 3357 if (error || *stat == 0) 3358 return error; 3359 3360 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3361 } 3362 3363 return 0; 3364 } 3365 3366 /* First, try shifting an entry to the right neighbor. */ 3367 error = xfs_btree_rshift(cur, level, stat); 3368 if (error || *stat) 3369 return error; 3370 3371 /* Next, try shifting an entry to the left neighbor. */ 3372 error = xfs_btree_lshift(cur, level, stat); 3373 if (error) 3374 return error; 3375 3376 if (*stat) { 3377 *oindex = *index = cur->bc_levels[level].ptr; 3378 return 0; 3379 } 3380 3381 /* 3382 * Next, try splitting the current block in half. 3383 * 3384 * If this works we have to re-set our variables because we 3385 * could be in a different block now. 3386 */ 3387 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3388 if (error || *stat == 0) 3389 return error; 3390 3391 3392 *index = cur->bc_levels[level].ptr; 3393 return 0; 3394 } 3395 3396 /* 3397 * Insert one record/level. Return information to the caller 3398 * allowing the next level up to proceed if necessary. 3399 */ 3400 STATIC int 3401 xfs_btree_insrec( 3402 struct xfs_btree_cur *cur, /* btree cursor */ 3403 int level, /* level to insert record at */ 3404 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3405 union xfs_btree_rec *rec, /* record to insert */ 3406 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3407 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3408 int *stat) /* success/failure */ 3409 { 3410 struct xfs_btree_block *block; /* btree block */ 3411 struct xfs_buf *bp; /* buffer for block */ 3412 union xfs_btree_ptr nptr; /* new block ptr */ 3413 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3414 union xfs_btree_key nkey; /* new block key */ 3415 union xfs_btree_key *lkey; 3416 int optr; /* old key/record index */ 3417 int ptr; /* key/record index */ 3418 int numrecs;/* number of records */ 3419 int error; /* error return value */ 3420 int i; 3421 xfs_daddr_t old_bn; 3422 3423 ncur = NULL; 3424 lkey = &nkey; 3425 3426 /* 3427 * If we have an external root pointer, and we've made it to the 3428 * root level, allocate a new root block and we're done. 
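	 *
	 * (Inode-rooted btrees never get here; their root is grown in
	 * place by xfs_btree_new_iroot() via xfs_btree_make_block_unfull().)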
3429 */ 3430 if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && 3431 level >= cur->bc_nlevels) { 3432 error = xfs_btree_new_root(cur, stat); 3433 xfs_btree_set_ptr_null(cur, ptrp); 3434 3435 return error; 3436 } 3437 3438 /* If we're off the left edge, return failure. */ 3439 ptr = cur->bc_levels[level].ptr; 3440 if (ptr == 0) { 3441 *stat = 0; 3442 return 0; 3443 } 3444 3445 optr = ptr; 3446 3447 XFS_BTREE_STATS_INC(cur, insrec); 3448 3449 /* Get pointers to the btree buffer and block. */ 3450 block = xfs_btree_get_block(cur, level, &bp); 3451 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3452 numrecs = xfs_btree_get_numrecs(block); 3453 3454 #ifdef DEBUG 3455 error = xfs_btree_check_block(cur, block, level, bp); 3456 if (error) 3457 goto error0; 3458 3459 /* Check that the new entry is being inserted in the right place. */ 3460 if (ptr <= numrecs) { 3461 if (level == 0) { 3462 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3463 xfs_btree_rec_addr(cur, ptr, block))); 3464 } else { 3465 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3466 xfs_btree_key_addr(cur, ptr, block))); 3467 } 3468 } 3469 #endif 3470 3471 /* 3472 * If the block is full, we can't insert the new entry until we 3473 * make the block un-full. 3474 */ 3475 xfs_btree_set_ptr_null(cur, &nptr); 3476 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3477 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3478 &optr, &ptr, &nptr, &ncur, lkey, stat); 3479 if (error || *stat == 0) 3480 goto error0; 3481 } 3482 3483 /* 3484 * The current block may have changed if the block was 3485 * previously full and we have just made space in it. 3486 */ 3487 block = xfs_btree_get_block(cur, level, &bp); 3488 numrecs = xfs_btree_get_numrecs(block); 3489 3490 #ifdef DEBUG 3491 error = xfs_btree_check_block(cur, block, level, bp); 3492 if (error) 3493 goto error0; 3494 #endif 3495 3496 /* 3497 * At this point we know there's room for our new entry in the block 3498 * we're pointing at. 3499 */ 3500 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3501 3502 if (level > 0) { 3503 /* It's a nonleaf. make a hole in the keys and ptrs */ 3504 union xfs_btree_key *kp; 3505 union xfs_btree_ptr *pp; 3506 3507 kp = xfs_btree_key_addr(cur, ptr, block); 3508 pp = xfs_btree_ptr_addr(cur, ptr, block); 3509 3510 for (i = numrecs - ptr; i >= 0; i--) { 3511 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3512 if (error) 3513 goto error0; 3514 } 3515 3516 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3517 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3518 3519 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3520 if (error) 3521 goto error0; 3522 3523 /* Now put the new data in, bump numrecs and log it. */ 3524 xfs_btree_copy_keys(cur, kp, key, 1); 3525 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3526 numrecs++; 3527 xfs_btree_set_numrecs(block, numrecs); 3528 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3529 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3530 #ifdef DEBUG 3531 if (ptr < numrecs) { 3532 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3533 xfs_btree_key_addr(cur, ptr + 1, block))); 3534 } 3535 #endif 3536 } else { 3537 /* It's a leaf. make a hole in the records */ 3538 union xfs_btree_rec *rp; 3539 3540 rp = xfs_btree_rec_addr(cur, ptr, block); 3541 3542 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3543 3544 /* Now put the new data in, bump numrecs and log it. 
*/ 3545 xfs_btree_copy_recs(cur, rp, rec, 1); 3546 xfs_btree_set_numrecs(block, ++numrecs); 3547 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3548 #ifdef DEBUG 3549 if (ptr < numrecs) { 3550 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3551 xfs_btree_rec_addr(cur, ptr + 1, block))); 3552 } 3553 #endif 3554 } 3555 3556 /* Log the new number of records in the btree header. */ 3557 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3558 3559 /* 3560 * If we just inserted into a new tree block, we have to 3561 * recalculate nkey here because nkey is out of date. 3562 * 3563 * Otherwise we're just updating an existing block (having shoved 3564 * some records into the new tree block), so use the regular key 3565 * update mechanism. 3566 */ 3567 if (bp && xfs_buf_daddr(bp) != old_bn) { 3568 xfs_btree_get_keys(cur, block, lkey); 3569 } else if (xfs_btree_needs_key_update(cur, optr)) { 3570 error = xfs_btree_update_keys(cur, level); 3571 if (error) 3572 goto error0; 3573 } 3574 3575 /* 3576 * Return the new block number, if any. 3577 * If there is one, give back a record value and a cursor too. 3578 */ 3579 *ptrp = nptr; 3580 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3581 xfs_btree_copy_keys(cur, key, lkey, 1); 3582 *curp = ncur; 3583 } 3584 3585 *stat = 1; 3586 return 0; 3587 3588 error0: 3589 if (ncur) 3590 xfs_btree_del_cursor(ncur, error); 3591 return error; 3592 } 3593 3594 /* 3595 * Insert the record at the point referenced by cur. 3596 * 3597 * A multi-level split of the tree on insert will invalidate the original 3598 * cursor. All callers of this function should assume that the cursor is 3599 * no longer valid and revalidate it. 3600 */ 3601 int 3602 xfs_btree_insert( 3603 struct xfs_btree_cur *cur, 3604 int *stat) 3605 { 3606 int error; /* error return value */ 3607 int i; /* result value, 0 for failure */ 3608 int level; /* current level number in btree */ 3609 union xfs_btree_ptr nptr; /* new block number (split result) */ 3610 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3611 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3612 union xfs_btree_key bkey; /* key of block to insert */ 3613 union xfs_btree_key *key; 3614 union xfs_btree_rec rec; /* record to insert */ 3615 3616 level = 0; 3617 ncur = NULL; 3618 pcur = cur; 3619 key = &bkey; 3620 3621 xfs_btree_set_ptr_null(cur, &nptr); 3622 3623 /* Make a key out of the record data to be inserted, and save it. */ 3624 cur->bc_ops->init_rec_from_cur(cur, &rec); 3625 cur->bc_ops->init_key_from_rec(key, &rec); 3626 3627 /* 3628 * Loop going up the tree, starting at the leaf level. 3629 * Stop when we don't get a split block, that must mean that 3630 * the insert is finished with this level. 3631 */ 3632 do { 3633 /* 3634 * Insert nrec/nptr into this level of the tree. 3635 * Note if we fail, nptr will be null. 3636 */ 3637 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3638 &ncur, &i); 3639 if (error) { 3640 if (pcur != cur) 3641 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3642 goto error0; 3643 } 3644 3645 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3646 xfs_btree_mark_sick(cur); 3647 error = -EFSCORRUPTED; 3648 goto error0; 3649 } 3650 level++; 3651 3652 /* 3653 * See if the cursor we just used is trash. 3654 * Can't trash the caller's cursor, but otherwise we should 3655 * if ncur is a new cursor or we're about to be done. 
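		 *
		 * That is: pcur is a duplicate cursor left over from a
		 * split at a lower level; once this level's insert has
		 * consumed it, it is only kept if it is still needed to
		 * continue the walk up the tree.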
3656 		 */
3657 		if (pcur != cur &&
3658 		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3659 			/* Save the state from the cursor before we trash it */
3660 			if (cur->bc_ops->update_cursor &&
3661 			    !(cur->bc_flags & XFS_BTREE_STAGING))
3662 				cur->bc_ops->update_cursor(pcur, cur);
3663 			cur->bc_nlevels = pcur->bc_nlevels;
3664 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3665 		}
3666 		/* If we got a new cursor, switch to it. */
3667 		if (ncur) {
3668 			pcur = ncur;
3669 			ncur = NULL;
3670 		}
3671 	} while (!xfs_btree_ptr_is_null(cur, &nptr));
3672 
3673 	*stat = i;
3674 	return 0;
3675 error0:
3676 	return error;
3677 }
3678 
3679 /*
3680  * Try to merge a non-leaf block back into the inode root.
3681  *
3682  * Note: the killroot name comes from the fact that we're effectively
3683  * killing the old root block. But because we can't just delete the
3684  * inode we have to copy the single block it was pointing to into the
3685  * inode.
3686  */
3687 STATIC int
3688 xfs_btree_kill_iroot(
3689 	struct xfs_btree_cur	*cur)
3690 {
3691 	int			whichfork = cur->bc_ino.whichfork;
3692 	struct xfs_inode	*ip = cur->bc_ino.ip;
3693 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
3694 	struct xfs_btree_block	*block;
3695 	struct xfs_btree_block	*cblock;
3696 	union xfs_btree_key	*kp;
3697 	union xfs_btree_key	*ckp;
3698 	union xfs_btree_ptr	*pp;
3699 	union xfs_btree_ptr	*cpp;
3700 	struct xfs_buf		*cbp;
3701 	int			level;
3702 	int			index;
3703 	int			numrecs;
3704 	int			error;
3705 #ifdef DEBUG
3706 	union xfs_btree_ptr	ptr;
3707 #endif
3708 	int			i;
3709 
3710 	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
3711 	ASSERT(cur->bc_nlevels > 1);
3712 
3713 	/*
3714 	 * Don't deal with the case where the root block needs to be a leaf.
3715 	 * We're just going to turn the thing back into extents anyway.
3716 	 */
3717 	level = cur->bc_nlevels - 1;
3718 	if (level == 1)
3719 		goto out0;
3720 
3721 	/*
3722 	 * Give up if the root has multiple children.
3723 	 */
3724 	block = xfs_btree_get_iroot(cur);
3725 	if (xfs_btree_get_numrecs(block) != 1)
3726 		goto out0;
3727 
3728 	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3729 	numrecs = xfs_btree_get_numrecs(cblock);
3730 
3731 	/*
3732 	 * Only do this if the next level will fit. The data is then
3733 	 * copied up into the inode, so instead of freeing the root
3734 	 * you free the next level down.
3735 	 */
3736 	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3737 		goto out0;
3738 
3739 	XFS_BTREE_STATS_INC(cur, killroot);
3740 
3741 #ifdef DEBUG
3742 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3743 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3744 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3745 	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3746 #endif
3747 
3748 	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3749 	if (index) {
3750 		xfs_iroot_realloc(cur->bc_ino.ip, index,
3751 				  cur->bc_ino.whichfork);
3752 		block = ifp->if_broot;
3753 	}
3754 
3755 	be16_add_cpu(&block->bb_numrecs, index);
3756 	ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3757 
3758 	kp = xfs_btree_key_addr(cur, 1, block);
3759 	ckp = xfs_btree_key_addr(cur, 1, cblock);
3760 	xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3761 
3762 	pp = xfs_btree_ptr_addr(cur, 1, block);
3763 	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3764 
3765 	for (i = 0; i < numrecs; i++) {
3766 		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
3767 		if (error)
3768 			return error;
3769 	}
3770 
3771 	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3772 
3773 	error = xfs_btree_free_block(cur, cbp);
3774 	if (error)
3775 		return error;
3776 
3777 	cur->bc_levels[level - 1].bp = NULL;
3778 	be16_add_cpu(&block->bb_level, -1);
3779 	xfs_trans_log_inode(cur->bc_tp, ip,
3780 		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
3781 	cur->bc_nlevels--;
3782 out0:
3783 	return 0;
3784 }
3785 
3786 /*
3787  * Kill the current root node, and replace it with its only child node.
3788  */
3789 STATIC int
3790 xfs_btree_kill_root(
3791 	struct xfs_btree_cur	*cur,
3792 	struct xfs_buf		*bp,
3793 	int			level,
3794 	union xfs_btree_ptr	*newroot)
3795 {
3796 	int			error;
3797 
3798 	XFS_BTREE_STATS_INC(cur, killroot);
3799 
3800 	/*
3801 	 * Update the root pointer, decreasing the level by 1, and then
3802 	 * free the old root.
3803 	 */
3804 	xfs_btree_set_root(cur, newroot, -1);
3805 
3806 	error = xfs_btree_free_block(cur, bp);
3807 	if (error)
3808 		return error;
3809 
3810 	cur->bc_levels[level].bp = NULL;
3811 	cur->bc_levels[level].ra = 0;
3812 	cur->bc_nlevels--;
3813 
3814 	return 0;
3815 }
3816 
3817 STATIC int
3818 xfs_btree_dec_cursor(
3819 	struct xfs_btree_cur	*cur,
3820 	int			level,
3821 	int			*stat)
3822 {
3823 	int			error;
3824 	int			i;
3825 
3826 	if (level > 0) {
3827 		error = xfs_btree_decrement(cur, level, &i);
3828 		if (error)
3829 			return error;
3830 	}
3831 
3832 	*stat = 1;
3833 	return 0;
3834 }
3835 
3836 /*
3837  * Single level of the btree record deletion routine.
3838  * Delete record pointed to by cur/level.
3839  * Remove the record from its block then rebalance the tree.
3840  * *stat is set to 0 on failure, 1 when done, 2 to go on to the next level.
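 *
 * The expected driver keeps calling this while *stat == 2, moving up
 * one level each time, as xfs_btree_delete() does (sketch):
 *
 *	for (level = 0, i = 2; i == 2; level++) {
 *		error = xfs_btree_delrec(cur, level, &i);
 *		if (error)
 *			break;
 *	}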
3841 */ 3842 STATIC int /* error */ 3843 xfs_btree_delrec( 3844 struct xfs_btree_cur *cur, /* btree cursor */ 3845 int level, /* level removing record from */ 3846 int *stat) /* fail/done/go-on */ 3847 { 3848 struct xfs_btree_block *block; /* btree block */ 3849 union xfs_btree_ptr cptr; /* current block ptr */ 3850 struct xfs_buf *bp; /* buffer for block */ 3851 int error; /* error return value */ 3852 int i; /* loop counter */ 3853 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3854 struct xfs_buf *lbp; /* left buffer pointer */ 3855 struct xfs_btree_block *left; /* left btree block */ 3856 int lrecs = 0; /* left record count */ 3857 int ptr; /* key/record index */ 3858 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3859 struct xfs_buf *rbp; /* right buffer pointer */ 3860 struct xfs_btree_block *right; /* right btree block */ 3861 struct xfs_btree_block *rrblock; /* right-right btree block */ 3862 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3863 int rrecs = 0; /* right record count */ 3864 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3865 int numrecs; /* temporary numrec count */ 3866 3867 tcur = NULL; 3868 3869 /* Get the index of the entry being deleted, check for nothing there. */ 3870 ptr = cur->bc_levels[level].ptr; 3871 if (ptr == 0) { 3872 *stat = 0; 3873 return 0; 3874 } 3875 3876 /* Get the buffer & block containing the record or key/ptr. */ 3877 block = xfs_btree_get_block(cur, level, &bp); 3878 numrecs = xfs_btree_get_numrecs(block); 3879 3880 #ifdef DEBUG 3881 error = xfs_btree_check_block(cur, block, level, bp); 3882 if (error) 3883 goto error0; 3884 #endif 3885 3886 /* Fail if we're off the end of the block. */ 3887 if (ptr > numrecs) { 3888 *stat = 0; 3889 return 0; 3890 } 3891 3892 XFS_BTREE_STATS_INC(cur, delrec); 3893 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3894 3895 /* Excise the entries being deleted. */ 3896 if (level > 0) { 3897 /* It's a nonleaf. operate on keys and ptrs */ 3898 union xfs_btree_key *lkp; 3899 union xfs_btree_ptr *lpp; 3900 3901 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3902 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3903 3904 for (i = 0; i < numrecs - ptr; i++) { 3905 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 3906 if (error) 3907 goto error0; 3908 } 3909 3910 if (ptr < numrecs) { 3911 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3912 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3913 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3914 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3915 } 3916 } else { 3917 /* It's a leaf. operate on records */ 3918 if (ptr < numrecs) { 3919 xfs_btree_shift_recs(cur, 3920 xfs_btree_rec_addr(cur, ptr + 1, block), 3921 -1, numrecs - ptr); 3922 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3923 } 3924 } 3925 3926 /* 3927 * Decrement and log the number of entries in the block. 3928 */ 3929 xfs_btree_set_numrecs(block, --numrecs); 3930 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3931 3932 /* 3933 * We're at the root level. First, shrink the root block in-memory. 3934 * Try to get rid of the next level down. If we can't then there's 3935 * nothing left to do. 
3936 */ 3937 if (xfs_btree_at_iroot(cur, level)) { 3938 xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); 3939 3940 error = xfs_btree_kill_iroot(cur); 3941 if (error) 3942 goto error0; 3943 3944 error = xfs_btree_dec_cursor(cur, level, stat); 3945 if (error) 3946 goto error0; 3947 *stat = 1; 3948 return 0; 3949 } 3950 3951 /* 3952 * If this is the root level, and there's only one entry left, and it's 3953 * NOT the leaf level, then we can get rid of this level. 3954 */ 3955 if (level == cur->bc_nlevels - 1) { 3956 if (numrecs == 1 && level > 0) { 3957 union xfs_btree_ptr *pp; 3958 /* 3959 * pp is still set to the first pointer in the block. 3960 * Make it the new root of the btree. 3961 */ 3962 pp = xfs_btree_ptr_addr(cur, 1, block); 3963 error = xfs_btree_kill_root(cur, bp, level, pp); 3964 if (error) 3965 goto error0; 3966 } else if (level > 0) { 3967 error = xfs_btree_dec_cursor(cur, level, stat); 3968 if (error) 3969 goto error0; 3970 } 3971 *stat = 1; 3972 return 0; 3973 } 3974 3975 /* 3976 * If we deleted the leftmost entry in the block, update the 3977 * key values above us in the tree. 3978 */ 3979 if (xfs_btree_needs_key_update(cur, ptr)) { 3980 error = xfs_btree_update_keys(cur, level); 3981 if (error) 3982 goto error0; 3983 } 3984 3985 /* 3986 * If the number of records remaining in the block is at least 3987 * the minimum, we're done. 3988 */ 3989 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 3990 error = xfs_btree_dec_cursor(cur, level, stat); 3991 if (error) 3992 goto error0; 3993 return 0; 3994 } 3995 3996 /* 3997 * Otherwise, we have to move some records around to keep the 3998 * tree balanced. Look at the left and right sibling blocks to 3999 * see if we can re-balance by moving only one record. 4000 */ 4001 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4002 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 4003 4004 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { 4005 /* 4006 * One child of root, need to get a chance to copy its contents 4007 * into the root and delete it. Can't go up to next level, 4008 * there's nothing to delete there. 4009 */ 4010 if (xfs_btree_ptr_is_null(cur, &rptr) && 4011 xfs_btree_ptr_is_null(cur, &lptr) && 4012 level == cur->bc_nlevels - 2) { 4013 error = xfs_btree_kill_iroot(cur); 4014 if (!error) 4015 error = xfs_btree_dec_cursor(cur, level, stat); 4016 if (error) 4017 goto error0; 4018 return 0; 4019 } 4020 } 4021 4022 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 4023 !xfs_btree_ptr_is_null(cur, &lptr)); 4024 4025 /* 4026 * Duplicate the cursor so our btree manipulations here won't 4027 * disrupt the next level up. 4028 */ 4029 error = xfs_btree_dup_cursor(cur, &tcur); 4030 if (error) 4031 goto error0; 4032 4033 /* 4034 * If there's a right sibling, see if it's ok to shift an entry 4035 * out of it. 4036 */ 4037 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 4038 /* 4039 * Move the temp cursor to the last entry in the next block. 4040 * Actually any entry but the first would suffice. 
4041 */ 4042 i = xfs_btree_lastrec(tcur, level); 4043 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4044 xfs_btree_mark_sick(cur); 4045 error = -EFSCORRUPTED; 4046 goto error0; 4047 } 4048 4049 error = xfs_btree_increment(tcur, level, &i); 4050 if (error) 4051 goto error0; 4052 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4053 xfs_btree_mark_sick(cur); 4054 error = -EFSCORRUPTED; 4055 goto error0; 4056 } 4057 4058 i = xfs_btree_lastrec(tcur, level); 4059 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4060 xfs_btree_mark_sick(cur); 4061 error = -EFSCORRUPTED; 4062 goto error0; 4063 } 4064 4065 /* Grab a pointer to the block. */ 4066 right = xfs_btree_get_block(tcur, level, &rbp); 4067 #ifdef DEBUG 4068 error = xfs_btree_check_block(tcur, right, level, rbp); 4069 if (error) 4070 goto error0; 4071 #endif 4072 /* Grab the current block number, for future use. */ 4073 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 4074 4075 /* 4076 * If right block is full enough so that removing one entry 4077 * won't make it too empty, and left-shifting an entry out 4078 * of right to us works, we're done. 4079 */ 4080 if (xfs_btree_get_numrecs(right) - 1 >= 4081 cur->bc_ops->get_minrecs(tcur, level)) { 4082 error = xfs_btree_lshift(tcur, level, &i); 4083 if (error) 4084 goto error0; 4085 if (i) { 4086 ASSERT(xfs_btree_get_numrecs(block) >= 4087 cur->bc_ops->get_minrecs(tcur, level)); 4088 4089 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4090 tcur = NULL; 4091 4092 error = xfs_btree_dec_cursor(cur, level, stat); 4093 if (error) 4094 goto error0; 4095 return 0; 4096 } 4097 } 4098 4099 /* 4100 * Otherwise, grab the number of records in right for 4101 * future reference, and fix up the temp cursor to point 4102 * to our block again (last record). 4103 */ 4104 rrecs = xfs_btree_get_numrecs(right); 4105 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4106 i = xfs_btree_firstrec(tcur, level); 4107 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4108 xfs_btree_mark_sick(cur); 4109 error = -EFSCORRUPTED; 4110 goto error0; 4111 } 4112 4113 error = xfs_btree_decrement(tcur, level, &i); 4114 if (error) 4115 goto error0; 4116 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4117 xfs_btree_mark_sick(cur); 4118 error = -EFSCORRUPTED; 4119 goto error0; 4120 } 4121 } 4122 } 4123 4124 /* 4125 * If there's a left sibling, see if it's ok to shift an entry 4126 * out of it. 4127 */ 4128 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4129 /* 4130 * Move the temp cursor to the first entry in the 4131 * previous block. 4132 */ 4133 i = xfs_btree_firstrec(tcur, level); 4134 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4135 xfs_btree_mark_sick(cur); 4136 error = -EFSCORRUPTED; 4137 goto error0; 4138 } 4139 4140 error = xfs_btree_decrement(tcur, level, &i); 4141 if (error) 4142 goto error0; 4143 i = xfs_btree_firstrec(tcur, level); 4144 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4145 xfs_btree_mark_sick(cur); 4146 error = -EFSCORRUPTED; 4147 goto error0; 4148 } 4149 4150 /* Grab a pointer to the block. */ 4151 left = xfs_btree_get_block(tcur, level, &lbp); 4152 #ifdef DEBUG 4153 error = xfs_btree_check_block(cur, left, level, lbp); 4154 if (error) 4155 goto error0; 4156 #endif 4157 /* Grab the current block number, for future use. */ 4158 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4159 4160 /* 4161 * If left block is full enough so that removing one entry 4162 * won't make it too empty, and right-shifting an entry out 4163 * of left to us works, we're done. 
4164 */ 4165 if (xfs_btree_get_numrecs(left) - 1 >= 4166 cur->bc_ops->get_minrecs(tcur, level)) { 4167 error = xfs_btree_rshift(tcur, level, &i); 4168 if (error) 4169 goto error0; 4170 if (i) { 4171 ASSERT(xfs_btree_get_numrecs(block) >= 4172 cur->bc_ops->get_minrecs(tcur, level)); 4173 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4174 tcur = NULL; 4175 if (level == 0) 4176 cur->bc_levels[0].ptr++; 4177 4178 *stat = 1; 4179 return 0; 4180 } 4181 } 4182 4183 /* 4184 * Otherwise, grab the number of records in left for 4185 * future reference. 4186 */ 4187 lrecs = xfs_btree_get_numrecs(left); 4188 } 4189 4190 /* Delete the temp cursor, we're done with it. */ 4191 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4192 tcur = NULL; 4193 4194 /* If here, we need to do a join to keep the tree balanced. */ 4195 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4196 4197 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4198 lrecs + xfs_btree_get_numrecs(block) <= 4199 cur->bc_ops->get_maxrecs(cur, level)) { 4200 /* 4201 * Set "right" to be the starting block, 4202 * "left" to be the left neighbor. 4203 */ 4204 rptr = cptr; 4205 right = block; 4206 rbp = bp; 4207 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4208 if (error) 4209 goto error0; 4210 4211 /* 4212 * If that won't work, see if we can join with the right neighbor block. 4213 */ 4214 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4215 rrecs + xfs_btree_get_numrecs(block) <= 4216 cur->bc_ops->get_maxrecs(cur, level)) { 4217 /* 4218 * Set "left" to be the starting block, 4219 * "right" to be the right neighbor. 4220 */ 4221 lptr = cptr; 4222 left = block; 4223 lbp = bp; 4224 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4225 if (error) 4226 goto error0; 4227 4228 /* 4229 * Otherwise, we can't fix the imbalance. 4230 * Just return. This is probably a logic error, but it's not fatal. 4231 */ 4232 } else { 4233 error = xfs_btree_dec_cursor(cur, level, stat); 4234 if (error) 4235 goto error0; 4236 return 0; 4237 } 4238 4239 rrecs = xfs_btree_get_numrecs(right); 4240 lrecs = xfs_btree_get_numrecs(left); 4241 4242 /* 4243 * We're now going to join "left" and "right" by moving all the stuff 4244 * in "right" to "left" and deleting "right". 4245 */ 4246 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4247 if (level > 0) { 4248 /* It's a non-leaf. Move keys and pointers. */ 4249 union xfs_btree_key *lkp; /* left btree key */ 4250 union xfs_btree_ptr *lpp; /* left address pointer */ 4251 union xfs_btree_key *rkp; /* right btree key */ 4252 union xfs_btree_ptr *rpp; /* right address pointer */ 4253 4254 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4255 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4256 rkp = xfs_btree_key_addr(cur, 1, right); 4257 rpp = xfs_btree_ptr_addr(cur, 1, right); 4258 4259 for (i = 1; i < rrecs; i++) { 4260 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 4261 if (error) 4262 goto error0; 4263 } 4264 4265 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4266 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4267 4268 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4269 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4270 } else { 4271 /* It's a leaf. Move records.
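 * Records are copied from the start of the right block to just after
 * the left block's existing records, then the combined run is logged
 * on the left buffer.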
*/ 4272 union xfs_btree_rec *lrp; /* left record pointer */ 4273 union xfs_btree_rec *rrp; /* right record pointer */ 4274 4275 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4276 rrp = xfs_btree_rec_addr(cur, 1, right); 4277 4278 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4279 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4280 } 4281 4282 XFS_BTREE_STATS_INC(cur, join); 4283 4284 /* 4285 * Fix up the number of records and right block pointer in the 4286 * surviving block, and log it. 4287 */ 4288 xfs_btree_set_numrecs(left, lrecs + rrecs); 4289 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); 4290 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4291 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4292 4293 /* If there is a right sibling, point it to the remaining block. */ 4294 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4295 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4296 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4297 if (error) 4298 goto error0; 4299 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4300 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4301 } 4302 4303 /* Free the deleted block. */ 4304 error = xfs_btree_free_block(cur, rbp); 4305 if (error) 4306 goto error0; 4307 4308 /* 4309 * If we joined with the left neighbor, set the buffer in the 4310 * cursor to the left block, and fix up the index. 4311 */ 4312 if (bp != lbp) { 4313 cur->bc_levels[level].bp = lbp; 4314 cur->bc_levels[level].ptr += lrecs; 4315 cur->bc_levels[level].ra = 0; 4316 } 4317 /* 4318 * If we joined with the right neighbor and there's a level above 4319 * us, increment the cursor at that level. 4320 */ 4321 else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || 4322 level + 1 < cur->bc_nlevels) { 4323 error = xfs_btree_increment(cur, level + 1, &i); 4324 if (error) 4325 goto error0; 4326 } 4327 4328 /* 4329 * Readjust the ptr at this level if it's not a leaf, since it's 4330 * still pointing at the deletion point, which makes the cursor 4331 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4332 * We can't use decrement because it would change the next level up. 4333 */ 4334 if (level > 0) 4335 cur->bc_levels[level].ptr--; 4336 4337 /* 4338 * We combined blocks, so we have to update the parent keys if the 4339 * btree supports overlapped intervals. However, 4340 * bc_levels[level + 1].ptr points to the old block so that the caller 4341 * knows which record to delete. Therefore, the caller must be savvy 4342 * enough to call updkeys for us if we return stat == 2. The other 4343 * exit points from this function don't require deletions further up 4344 * the tree, so they can call updkeys directly. 4345 */ 4346 4347 /* Return value means the next level up has something to do. */ 4348 *stat = 2; 4349 return 0; 4350 4351 error0: 4352 if (tcur) 4353 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4354 return error; 4355 } 4356 4357 /* 4358 * Delete the record pointed to by cur. 4359 * The cursor refers to the place where the record was (could be inserted) 4360 * when the operation returns. 4361 */ 4362 int /* error */ 4363 xfs_btree_delete( 4364 struct xfs_btree_cur *cur, 4365 int *stat) /* success/failure */ 4366 { 4367 int error; /* error return value */ 4368 int level; 4369 int i; 4370 bool joined = false; 4371 4372 /* 4373 * Go up the tree, starting at leaf level. 4374 * 4375 * If 2 is returned then a join was done; go to the next level. 4376 * Otherwise we are done. 
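 *
 * For example, if deleting a record leaves a leaf block under the
 * minimum record count and the leaf gets merged with a sibling, delrec
 * sets *stat to 2 at level 0, so the loop runs again at level 1 to
 * remove the keyptr that pointed at the freed block.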
4377 */ 4378 for (level = 0, i = 2; i == 2; level++) { 4379 error = xfs_btree_delrec(cur, level, &i); 4380 if (error) 4381 goto error0; 4382 if (i == 2) 4383 joined = true; 4384 } 4385 4386 /* 4387 * If we combined blocks as part of deleting the record, delrec won't 4388 * have updated the parent high keys so we have to do that here. 4389 */ 4390 if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { 4391 error = xfs_btree_updkeys_force(cur, 0); 4392 if (error) 4393 goto error0; 4394 } 4395 4396 if (i == 0) { 4397 for (level = 1; level < cur->bc_nlevels; level++) { 4398 if (cur->bc_levels[level].ptr == 0) { 4399 error = xfs_btree_decrement(cur, level, &i); 4400 if (error) 4401 goto error0; 4402 break; 4403 } 4404 } 4405 } 4406 4407 *stat = i; 4408 return 0; 4409 error0: 4410 return error; 4411 } 4412 4413 /* 4414 * Get the data from the pointed-to record. 4415 */ 4416 int /* error */ 4417 xfs_btree_get_rec( 4418 struct xfs_btree_cur *cur, /* btree cursor */ 4419 union xfs_btree_rec **recp, /* output: btree record */ 4420 int *stat) /* output: success/failure */ 4421 { 4422 struct xfs_btree_block *block; /* btree block */ 4423 struct xfs_buf *bp; /* buffer pointer */ 4424 int ptr; /* record number */ 4425 #ifdef DEBUG 4426 int error; /* error return value */ 4427 #endif 4428 4429 ptr = cur->bc_levels[0].ptr; 4430 block = xfs_btree_get_block(cur, 0, &bp); 4431 4432 #ifdef DEBUG 4433 error = xfs_btree_check_block(cur, block, 0, bp); 4434 if (error) 4435 return error; 4436 #endif 4437 4438 /* 4439 * Off the right end or left end, return failure. 4440 */ 4441 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4442 *stat = 0; 4443 return 0; 4444 } 4445 4446 /* 4447 * Point to the record and extract its data. 4448 */ 4449 *recp = xfs_btree_rec_addr(cur, ptr, block); 4450 *stat = 1; 4451 return 0; 4452 } 4453 4454 /* Visit a block in a btree. */ 4455 STATIC int 4456 xfs_btree_visit_block( 4457 struct xfs_btree_cur *cur, 4458 int level, 4459 xfs_btree_visit_blocks_fn fn, 4460 void *data) 4461 { 4462 struct xfs_btree_block *block; 4463 struct xfs_buf *bp; 4464 union xfs_btree_ptr rptr, bufptr; 4465 int error; 4466 4467 /* do right sibling readahead */ 4468 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 4469 block = xfs_btree_get_block(cur, level, &bp); 4470 4471 /* process the block */ 4472 error = fn(cur, level, data); 4473 if (error) 4474 return error; 4475 4476 /* now read rh sibling block for next iteration */ 4477 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4478 if (xfs_btree_ptr_is_null(cur, &rptr)) 4479 return -ENOENT; 4480 4481 /* 4482 * We only visit blocks once in this walk, so we have to avoid the 4483 * internal xfs_btree_lookup_get_block() optimisation where it will 4484 * return the same block without checking if the right sibling points 4485 * back to us and creates a cyclic reference in the btree. 4486 */ 4487 xfs_btree_buf_to_ptr(cur, bp, &bufptr); 4488 if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { 4489 xfs_btree_mark_sick(cur); 4490 return -EFSCORRUPTED; 4491 } 4492 4493 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4494 } 4495 4496 4497 /* Visit every block in a btree. 
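 * Blocks are visited one level at a time, starting at the root: we load
 * the leftmost block of each level, then follow the right sibling
 * pointers across the level. The -ENOENT returned by
 * xfs_btree_visit_block() marks the end of a level, not an error.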
*/ 4498 int 4499 xfs_btree_visit_blocks( 4500 struct xfs_btree_cur *cur, 4501 xfs_btree_visit_blocks_fn fn, 4502 unsigned int flags, 4503 void *data) 4504 { 4505 union xfs_btree_ptr lptr; 4506 int level; 4507 struct xfs_btree_block *block = NULL; 4508 int error = 0; 4509 4510 xfs_btree_init_ptr_from_cur(cur, &lptr); 4511 4512 /* for each level */ 4513 for (level = cur->bc_nlevels - 1; level >= 0; level--) { 4514 /* grab the left hand block */ 4515 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); 4516 if (error) 4517 return error; 4518 4519 /* readahead the left most block for the next level down */ 4520 if (level > 0) { 4521 union xfs_btree_ptr *ptr; 4522 4523 ptr = xfs_btree_ptr_addr(cur, 1, block); 4524 xfs_btree_readahead_ptr(cur, ptr, 1); 4525 4526 /* save for the next iteration of the loop */ 4527 xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); 4528 4529 if (!(flags & XFS_BTREE_VISIT_LEAVES)) 4530 continue; 4531 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { 4532 continue; 4533 } 4534 4535 /* for each buffer in the level */ 4536 do { 4537 error = xfs_btree_visit_block(cur, level, fn, data); 4538 } while (!error); 4539 4540 if (error != -ENOENT) 4541 return error; 4542 } 4543 4544 return 0; 4545 } 4546 4547 /* 4548 * Change the owner of a btree. 4549 * 4550 * The mechanism we use here is ordered buffer logging. Because we don't know 4551 * how many buffers we are going to need to modify, we don't really want to 4552 * have to make transaction reservations for the worst case of every buffer in a 4553 * full-size btree as that may be more space than we can fit in the log.... 4554 * 4555 * We do the btree walk in the most optimal manner possible - we have sibling 4556 * pointers so we can just walk all the blocks on each level from left to right 4557 * in a single pass, and then move to the next level and do the same. We can 4558 * also do readahead on the sibling pointers to get IO moving more quickly, 4559 * though for slow disks this is unlikely to make much difference to performance 4560 * as the amount of CPU work we have to do before moving to the next block is 4561 * relatively small. 4562 * 4563 * For each btree block that we load, modify the owner appropriately, set the 4564 * buffer as an ordered buffer and log it appropriately. We need to ensure that 4565 * we mark the region we change dirty so that if the buffer is relogged in 4566 * a subsequent transaction the changes we make here as an ordered buffer are 4567 * correctly relogged in that transaction. If we are in recovery context, then 4568 * just queue the modified buffer as delayed write buffer so the transaction 4569 * recovery completion writes the changes to disk.
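 *
 * Note that if a buffer cannot be marked as an ordered buffer (typically
 * because it already has dirty regions logged in this transaction), the
 * owner change is logged immediately and -EAGAIN is returned so that the
 * caller knows to roll the transaction before continuing.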
4570 */ 4571 struct xfs_btree_block_change_owner_info { 4572 uint64_t new_owner; 4573 struct list_head *buffer_list; 4574 }; 4575 4576 static int 4577 xfs_btree_block_change_owner( 4578 struct xfs_btree_cur *cur, 4579 int level, 4580 void *data) 4581 { 4582 struct xfs_btree_block_change_owner_info *bbcoi = data; 4583 struct xfs_btree_block *block; 4584 struct xfs_buf *bp; 4585 4586 /* modify the owner */ 4587 block = xfs_btree_get_block(cur, level, &bp); 4588 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { 4589 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) 4590 return 0; 4591 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4592 } else { 4593 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) 4594 return 0; 4595 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4596 } 4597 4598 /* 4599 * If the block is a root block hosted in an inode, we might not have a 4600 * buffer pointer here and we shouldn't attempt to log the change as the 4601 * information is already held in the inode and discarded when the root 4602 * block is formatted into the on-disk inode fork. We still change it, 4603 * though, so everything is consistent in memory. 4604 */ 4605 if (!bp) { 4606 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); 4607 ASSERT(level == cur->bc_nlevels - 1); 4608 return 0; 4609 } 4610 4611 if (cur->bc_tp) { 4612 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { 4613 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4614 return -EAGAIN; 4615 } 4616 } else { 4617 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4618 } 4619 4620 return 0; 4621 } 4622 4623 int 4624 xfs_btree_change_owner( 4625 struct xfs_btree_cur *cur, 4626 uint64_t new_owner, 4627 struct list_head *buffer_list) 4628 { 4629 struct xfs_btree_block_change_owner_info bbcoi; 4630 4631 bbcoi.new_owner = new_owner; 4632 bbcoi.buffer_list = buffer_list; 4633 4634 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4635 XFS_BTREE_VISIT_ALL, &bbcoi); 4636 } 4637 4638 /* Verify the v5 fields of a long-format btree block. */ 4639 xfs_failaddr_t 4640 xfs_btree_fsblock_v5hdr_verify( 4641 struct xfs_buf *bp, 4642 uint64_t owner) 4643 { 4644 struct xfs_mount *mp = bp->b_mount; 4645 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4646 4647 if (!xfs_has_crc(mp)) 4648 return __this_address; 4649 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4650 return __this_address; 4651 if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4652 return __this_address; 4653 if (owner != XFS_RMAP_OWN_UNKNOWN && 4654 be64_to_cpu(block->bb_u.l.bb_owner) != owner) 4655 return __this_address; 4656 return NULL; 4657 } 4658 4659 /* Verify a long-format btree block. */ 4660 xfs_failaddr_t 4661 xfs_btree_fsblock_verify( 4662 struct xfs_buf *bp, 4663 unsigned int max_recs) 4664 { 4665 struct xfs_mount *mp = bp->b_mount; 4666 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4667 xfs_fsblock_t fsb; 4668 xfs_failaddr_t fa; 4669 4670 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4671 4672 /* numrecs verification */ 4673 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4674 return __this_address; 4675 4676 /* sibling pointer verification */ 4677 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4678 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4679 block->bb_u.l.bb_leftsib); 4680 if (!fa) 4681 fa = xfs_btree_check_fsblock_siblings(mp, fsb, 4682 block->bb_u.l.bb_rightsib); 4683 return fa; 4684 } 4685 4686 /* Verify an in-memory btree block. 
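 * In-memory btrees always use long-format block pointers, so the checks
 * below use the long-format header fields and validate the sibling
 * addresses against the in-memory buffer target rather than against the
 * on-disk filesystem geometry.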
*/ 4687 xfs_failaddr_t 4688 xfs_btree_memblock_verify( 4689 struct xfs_buf *bp, 4690 unsigned int max_recs) 4691 { 4692 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4693 struct xfs_buftarg *btp = bp->b_target; 4694 xfs_failaddr_t fa; 4695 xfbno_t bno; 4696 4697 ASSERT(xfs_buftarg_is_mem(bp->b_target)); 4698 4699 /* numrecs verification */ 4700 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4701 return __this_address; 4702 4703 /* sibling pointer verification */ 4704 bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); 4705 fa = xfs_btree_check_memblock_siblings(btp, bno, 4706 block->bb_u.l.bb_leftsib); 4707 if (fa) 4708 return fa; 4709 fa = xfs_btree_check_memblock_siblings(btp, bno, 4710 block->bb_u.l.bb_rightsib); 4711 if (fa) 4712 return fa; 4713 4714 return NULL; 4715 } 4716 /** 4717 * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format 4718 * btree block 4719 * 4720 * @bp: buffer containing the btree block 4721 */ 4722 xfs_failaddr_t 4723 xfs_btree_agblock_v5hdr_verify( 4724 struct xfs_buf *bp) 4725 { 4726 struct xfs_mount *mp = bp->b_mount; 4727 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4728 struct xfs_perag *pag = bp->b_pag; 4729 4730 if (!xfs_has_crc(mp)) 4731 return __this_address; 4732 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4733 return __this_address; 4734 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4735 return __this_address; 4736 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) 4737 return __this_address; 4738 return NULL; 4739 } 4740 4741 /** 4742 * xfs_btree_agblock_verify() -- verify a short-format btree block 4743 * 4744 * @bp: buffer containing the btree block 4745 * @max_recs: maximum records allowed in this btree node 4746 */ 4747 xfs_failaddr_t 4748 xfs_btree_agblock_verify( 4749 struct xfs_buf *bp, 4750 unsigned int max_recs) 4751 { 4752 struct xfs_mount *mp = bp->b_mount; 4753 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4754 xfs_agblock_t agbno; 4755 xfs_failaddr_t fa; 4756 4757 ASSERT(!xfs_buftarg_is_mem(bp->b_target)); 4758 4759 /* numrecs verification */ 4760 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4761 return __this_address; 4762 4763 /* sibling pointer verification */ 4764 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4765 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4766 block->bb_u.s.bb_leftsib); 4767 if (!fa) 4768 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, 4769 block->bb_u.s.bb_rightsib); 4770 return fa; 4771 } 4772 4773 /* 4774 * For the given limits on leaf and keyptr records per block, calculate the 4775 * height of the tree needed to index the number of leaf records. 4776 */ 4777 unsigned int 4778 xfs_btree_compute_maxlevels( 4779 const unsigned int *limits, 4780 unsigned long long records) 4781 { 4782 unsigned long long level_blocks = howmany_64(records, limits[0]); 4783 unsigned int height = 1; 4784 4785 while (level_blocks > 1) { 4786 level_blocks = howmany_64(level_blocks, limits[1]); 4787 height++; 4788 } 4789 4790 return height; 4791 } 4792 4793 /* 4794 * For the given limits on leaf and keyptr records per block, calculate the 4795 * number of blocks needed to index the given number of leaf records. 
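 *
 * As a worked example with made-up geometry: if a leaf block holds up to
 * limits[0] = 4 records and a node block holds up to limits[1] = 4
 * keyptrs, then 100 records need howmany(100, 4) = 25 leaf blocks,
 * indexed by howmany(25, 4) = 7 node blocks, then 2, then a single root:
 * 25 + 7 + 2 + 1 = 35 blocks in total. For the same inputs,
 * xfs_btree_compute_maxlevels() returns a tree height of 4.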
4796 */ 4797 unsigned long long 4798 xfs_btree_calc_size( 4799 const unsigned int *limits, 4800 unsigned long long records) 4801 { 4802 unsigned long long level_blocks = howmany_64(records, limits[0]); 4803 unsigned long long blocks = level_blocks; 4804 4805 while (level_blocks > 1) { 4806 level_blocks = howmany_64(level_blocks, limits[1]); 4807 blocks += level_blocks; 4808 } 4809 4810 return blocks; 4811 } 4812 4813 /* 4814 * Given a number of available blocks for the btree to consume with records and 4815 * pointers, calculate the height of the tree needed to index all the records 4816 * that space can hold based on the number of pointers each interior node 4817 * holds. 4818 * 4819 * We start by assuming a single level tree consumes a single block, then track 4820 * the number of blocks each node level consumes until we no longer have space 4821 * to store the next node level. At this point, we are indexing all the leaf 4822 * blocks in the space, and there's no more free space to split the tree any 4823 * further. That's our maximum btree height. 4824 */ 4825 unsigned int 4826 xfs_btree_space_to_height( 4827 const unsigned int *limits, 4828 unsigned long long leaf_blocks) 4829 { 4830 /* 4831 * The root btree block can have fewer than minrecs pointers in it 4832 * because the tree might not be big enough to require that amount of 4833 * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. 4834 */ 4835 unsigned long long node_blocks = 2; 4836 unsigned long long blocks_left = leaf_blocks - 1; 4837 unsigned int height = 1; 4838 4839 if (leaf_blocks < 1) 4840 return 0; 4841 4842 while (node_blocks < blocks_left) { 4843 blocks_left -= node_blocks; 4844 node_blocks *= limits[1]; 4845 height++; 4846 } 4847 4848 return height; 4849 } 4850 4851 /* 4852 * Query a regular btree for all records overlapping a given interval. 4853 * Start with a LE lookup of the key of low_rec and return all records 4854 * until we find a record with a key greater than the key of high_rec. 4855 */ 4856 STATIC int 4857 xfs_btree_simple_query_range( 4858 struct xfs_btree_cur *cur, 4859 const union xfs_btree_key *low_key, 4860 const union xfs_btree_key *high_key, 4861 xfs_btree_query_range_fn fn, 4862 void *priv) 4863 { 4864 union xfs_btree_rec *recp; 4865 union xfs_btree_key rec_key; 4866 int stat; 4867 bool firstrec = true; 4868 int error; 4869 4870 ASSERT(cur->bc_ops->init_high_key_from_rec); 4871 ASSERT(cur->bc_ops->diff_two_keys); 4872 4873 /* 4874 * Find the leftmost record. The btree cursor must be set 4875 * to the low record used to generate low_key. 4876 */ 4877 stat = 0; 4878 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 4879 if (error) 4880 goto out; 4881 4882 /* Nothing? See if there's anything to the right. */ 4883 if (!stat) { 4884 error = xfs_btree_increment(cur, 0, &stat); 4885 if (error) 4886 goto out; 4887 } 4888 4889 while (stat) { 4890 /* Find the record. */ 4891 error = xfs_btree_get_rec(cur, &recp, &stat); 4892 if (error || !stat) 4893 break; 4894 4895 /* Skip if low_key > high_key(rec). */ 4896 if (firstrec) { 4897 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 4898 firstrec = false; 4899 if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) 4900 goto advloop; 4901 } 4902 4903 /* Stop if low_key(rec) > high_key. */ 4904 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4905 if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) 4906 break; 4907 4908 /* Callback */ 4909 error = fn(cur, recp, priv); 4910 if (error) 4911 break; 4912 4913 advloop: 4914 /* Move on to the next record. 
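 * The increment walks right into the next leaf block once the records
 * in the current block are exhausted.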
*/ 4915 error = xfs_btree_increment(cur, 0, &stat); 4916 if (error) 4917 break; 4918 } 4919 4920 out: 4921 return error; 4922 } 4923 4924 /* 4925 * Query an overlapped interval btree for all records overlapping a given 4926 * interval. This function roughly follows the algorithm given in 4927 * "Interval Trees" of _Introduction to Algorithms_, which is section 4928 * 14.3 in the 2nd and 3rd editions. 4929 * 4930 * First, generate keys for the low and high records passed in. 4931 * 4932 * For any leaf node, generate the high and low keys for the record. 4933 * If the record keys overlap with the query low/high keys, pass the 4934 * record to the function iterator. 4935 * 4936 * For any internal node, compare the low and high keys of each 4937 * pointer against the query low/high keys. If there's an overlap, 4938 * follow the pointer. 4939 * 4940 * As an optimization, we stop scanning a block when we find a low key 4941 * that is greater than the query's high key. 4942 */ 4943 STATIC int 4944 xfs_btree_overlapped_query_range( 4945 struct xfs_btree_cur *cur, 4946 const union xfs_btree_key *low_key, 4947 const union xfs_btree_key *high_key, 4948 xfs_btree_query_range_fn fn, 4949 void *priv) 4950 { 4951 union xfs_btree_ptr ptr; 4952 union xfs_btree_ptr *pp; 4953 union xfs_btree_key rec_key; 4954 union xfs_btree_key rec_hkey; 4955 union xfs_btree_key *lkp; 4956 union xfs_btree_key *hkp; 4957 union xfs_btree_rec *recp; 4958 struct xfs_btree_block *block; 4959 int level; 4960 struct xfs_buf *bp; 4961 int i; 4962 int error; 4963 4964 /* Load the root of the btree. */ 4965 level = cur->bc_nlevels - 1; 4966 xfs_btree_init_ptr_from_cur(cur, &ptr); 4967 error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); 4968 if (error) 4969 return error; 4970 xfs_btree_get_block(cur, level, &bp); 4971 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4972 #ifdef DEBUG 4973 error = xfs_btree_check_block(cur, block, level, bp); 4974 if (error) 4975 goto out; 4976 #endif 4977 cur->bc_levels[level].ptr = 1; 4978 4979 while (level < cur->bc_nlevels) { 4980 block = xfs_btree_get_block(cur, level, &bp); 4981 4982 /* End of node, pop back towards the root. */ 4983 if (cur->bc_levels[level].ptr > 4984 be16_to_cpu(block->bb_numrecs)) { 4985 pop_up: 4986 if (level < cur->bc_nlevels - 1) 4987 cur->bc_levels[level + 1].ptr++; 4988 level++; 4989 continue; 4990 } 4991 4992 if (level == 0) { 4993 /* Handle a leaf node. */ 4994 recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, 4995 block); 4996 4997 cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); 4998 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4999 5000 /* 5001 * If (query's high key < record's low key), then there 5002 * are no more interesting records in this block. Pop 5003 * up a level to find more record blocks. 5004 * 5005 * If (record's high key >= query's low key) and 5006 * (query's high key >= record's low key), then 5007 * this record overlaps the query range; callback. 5008 */ 5009 if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) 5010 goto pop_up; 5011 if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { 5012 error = fn(cur, recp, priv); 5013 if (error) 5014 break; 5015 } 5016 cur->bc_levels[level].ptr++; 5017 continue; 5018 } 5019 5020 /* Handle an internal node.
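 * Compare the query range against the low and high keys of the keyptr
 * under the cursor: descend if the ranges overlap, move right if the
 * keyptr lies entirely below the query range, and pop back up once the
 * keys are past the query's high key.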
*/ 5021 lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); 5022 hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, 5023 block); 5024 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); 5025 5026 /* 5027 * If (query's high key < pointer's low key), then there are no 5028 * more interesting keys in this block. Pop up one level 5029 * to continue looking for records. 5030 * 5031 * If (pointer's high key >= query's low key) and 5032 * (query's high key >= pointer's low key), then 5033 * this record overlaps the query range; follow pointer. 5034 */ 5035 if (xfs_btree_keycmp_lt(cur, high_key, lkp)) 5036 goto pop_up; 5037 if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { 5038 level--; 5039 error = xfs_btree_lookup_get_block(cur, level, pp, 5040 &block); 5041 if (error) 5042 goto out; 5043 xfs_btree_get_block(cur, level, &bp); 5044 trace_xfs_btree_overlapped_query_range(cur, level, bp); 5045 #ifdef DEBUG 5046 error = xfs_btree_check_block(cur, block, level, bp); 5047 if (error) 5048 goto out; 5049 #endif 5050 cur->bc_levels[level].ptr = 1; 5051 continue; 5052 } 5053 cur->bc_levels[level].ptr++; 5054 } 5055 5056 out: 5057 /* 5058 * If we don't end this function with the cursor pointing at a record 5059 * block, a subsequent non-error cursor deletion will not release 5060 * node-level buffers, causing a buffer leak. This is quite possible 5061 * with a zero-results range query, so release the buffers if we 5062 * failed to return any results. 5063 */ 5064 if (cur->bc_levels[0].bp == NULL) { 5065 for (i = 0; i < cur->bc_nlevels; i++) { 5066 if (cur->bc_levels[i].bp) { 5067 xfs_trans_brelse(cur->bc_tp, 5068 cur->bc_levels[i].bp); 5069 cur->bc_levels[i].bp = NULL; 5070 cur->bc_levels[i].ptr = 0; 5071 cur->bc_levels[i].ra = 0; 5072 } 5073 } 5074 } 5075 5076 return error; 5077 } 5078 5079 static inline void 5080 xfs_btree_key_from_irec( 5081 struct xfs_btree_cur *cur, 5082 union xfs_btree_key *key, 5083 const union xfs_btree_irec *irec) 5084 { 5085 union xfs_btree_rec rec; 5086 5087 cur->bc_rec = *irec; 5088 cur->bc_ops->init_rec_from_cur(cur, &rec); 5089 cur->bc_ops->init_key_from_rec(key, &rec); 5090 } 5091 5092 /* 5093 * Query a btree for all records overlapping a given interval of keys. The 5094 * supplied function will be called with each record found; return one of the 5095 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 5096 * code. This function returns -ECANCELED, zero, or a negative error code. 5097 */ 5098 int 5099 xfs_btree_query_range( 5100 struct xfs_btree_cur *cur, 5101 const union xfs_btree_irec *low_rec, 5102 const union xfs_btree_irec *high_rec, 5103 xfs_btree_query_range_fn fn, 5104 void *priv) 5105 { 5106 union xfs_btree_key low_key; 5107 union xfs_btree_key high_key; 5108 5109 /* Find the keys of both ends of the interval. */ 5110 xfs_btree_key_from_irec(cur, &high_key, high_rec); 5111 xfs_btree_key_from_irec(cur, &low_key, low_rec); 5112 5113 /* Enforce low key <= high key. */ 5114 if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) 5115 return -EINVAL; 5116 5117 if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) 5118 return xfs_btree_simple_query_range(cur, &low_key, 5119 &high_key, fn, priv); 5120 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 5121 fn, priv); 5122 } 5123 5124 /* Query a btree for all records.
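 * This runs a simple range query from an all-zeroes low key to an
 * all-ones high key, which spans the entire keyspace of any btree type.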
*/ 5125 int 5126 xfs_btree_query_all( 5127 struct xfs_btree_cur *cur, 5128 xfs_btree_query_range_fn fn, 5129 void *priv) 5130 { 5131 union xfs_btree_key low_key; 5132 union xfs_btree_key high_key; 5133 5134 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5135 memset(&low_key, 0, sizeof(low_key)); 5136 memset(&high_key, 0xFF, sizeof(high_key)); 5137 5138 return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); 5139 } 5140 5141 static int 5142 xfs_btree_count_blocks_helper( 5143 struct xfs_btree_cur *cur, 5144 int level, 5145 void *data) 5146 { 5147 xfs_extlen_t *blocks = data; 5148 (*blocks)++; 5149 5150 return 0; 5151 } 5152 5153 /* Count the blocks in a btree and return the result in *blocks. */ 5154 int 5155 xfs_btree_count_blocks( 5156 struct xfs_btree_cur *cur, 5157 xfs_extlen_t *blocks) 5158 { 5159 *blocks = 0; 5160 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, 5161 XFS_BTREE_VISIT_ALL, blocks); 5162 } 5163 5164 /* Compare two btree pointers. */ 5165 int64_t 5166 xfs_btree_diff_two_ptrs( 5167 struct xfs_btree_cur *cur, 5168 const union xfs_btree_ptr *a, 5169 const union xfs_btree_ptr *b) 5170 { 5171 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5172 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); 5173 return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); 5174 } 5175 5176 struct xfs_btree_has_records { 5177 /* Keys for the start and end of the range we want to know about. */ 5178 union xfs_btree_key start_key; 5179 union xfs_btree_key end_key; 5180 5181 /* Mask for key comparisons, if desired. */ 5182 const union xfs_btree_key *key_mask; 5183 5184 /* Highest record key we've seen so far. */ 5185 union xfs_btree_key high_key; 5186 5187 enum xbtree_recpacking outcome; 5188 }; 5189 5190 STATIC int 5191 xfs_btree_has_records_helper( 5192 struct xfs_btree_cur *cur, 5193 const union xfs_btree_rec *rec, 5194 void *priv) 5195 { 5196 union xfs_btree_key rec_key; 5197 union xfs_btree_key rec_high_key; 5198 struct xfs_btree_has_records *info = priv; 5199 enum xbtree_key_contig key_contig; 5200 5201 cur->bc_ops->init_key_from_rec(&rec_key, rec); 5202 5203 if (info->outcome == XBTREE_RECPACKING_EMPTY) { 5204 info->outcome = XBTREE_RECPACKING_SPARSE; 5205 5206 /* 5207 * If the first record we find does not overlap the start key, 5208 * then there is a hole at the start of the search range. 5209 * Classify this as sparse and stop immediately. 5210 */ 5211 if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, 5212 info->key_mask)) 5213 return -ECANCELED; 5214 } else { 5215 /* 5216 * If a subsequent record does not overlap with any record 5217 * we've seen so far, there is a hole in the middle of the 5218 * search range. Classify this as sparse and stop. 5219 * If the keys overlap and this btree does not allow overlap, 5220 * signal corruption. 5221 */ 5222 key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, 5223 &rec_key, info->key_mask); 5224 if (key_contig == XBTREE_KEY_OVERLAP && 5225 !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) 5226 return -EFSCORRUPTED; 5227 if (key_contig == XBTREE_KEY_GAP) 5228 return -ECANCELED; 5229 } 5230 5231 /* 5232 * If high_key(rec) is larger than any other high key we've seen, 5233 * remember it for later.
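 * At the end of the scan it is compared against the end of the search
 * range to decide whether the keyspace is fully or only partially
 * mapped.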
5234 */ 5235 cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); 5236 if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, 5237 info->key_mask)) 5238 info->high_key = rec_high_key; /* struct copy */ 5239 5240 return 0; 5241 } 5242 5243 /* 5244 * Scan part of the keyspace of a btree and tell us if that keyspace does not 5245 * map to any records; is fully mapped to records; or is partially mapped to 5246 * records. This is the btree record equivalent to determining if a file is 5247 * sparse. 5248 * 5249 * For most btree types, the record scan should use all available btree key 5250 * fields to compare the keys encountered. These callers should pass NULL for 5251 * @mask. However, some callers (e.g. scanning physical space in the rmapbt) 5252 * want to ignore some part of the btree record keyspace when performing the 5253 * comparison. These callers should pass in a union xfs_btree_key object with 5254 * the fields that *should* be a part of the comparison set to any nonzero 5255 * value, and the rest zeroed. 5256 */ 5257 int 5258 xfs_btree_has_records( 5259 struct xfs_btree_cur *cur, 5260 const union xfs_btree_irec *low, 5261 const union xfs_btree_irec *high, 5262 const union xfs_btree_key *mask, 5263 enum xbtree_recpacking *outcome) 5264 { 5265 struct xfs_btree_has_records info = { 5266 .outcome = XBTREE_RECPACKING_EMPTY, 5267 .key_mask = mask, 5268 }; 5269 int error; 5270 5271 /* Not all btrees support this operation. */ 5272 if (!cur->bc_ops->keys_contiguous) { 5273 ASSERT(0); 5274 return -EOPNOTSUPP; 5275 } 5276 5277 xfs_btree_key_from_irec(cur, &info.start_key, low); 5278 xfs_btree_key_from_irec(cur, &info.end_key, high); 5279 5280 error = xfs_btree_query_range(cur, low, high, 5281 xfs_btree_has_records_helper, &info); 5282 if (error == -ECANCELED) 5283 goto out; 5284 if (error) 5285 return error; 5286 5287 if (info.outcome == XBTREE_RECPACKING_EMPTY) 5288 goto out; 5289 5290 /* 5291 * If the largest high_key(rec) we saw during the walk is greater than 5292 * the end of the search range, classify this as full. Otherwise, 5293 * there is a hole at the end of the search range. 5294 */ 5295 if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, 5296 mask)) 5297 info.outcome = XBTREE_RECPACKING_FULL; 5298 5299 out: 5300 *outcome = info.outcome; 5301 return 0; 5302 } 5303 5304 /* Are there more records in this btree? */ 5305 bool 5306 xfs_btree_has_more_records( 5307 struct xfs_btree_cur *cur) 5308 { 5309 struct xfs_btree_block *block; 5310 struct xfs_buf *bp; 5311 5312 block = xfs_btree_get_block(cur, 0, &bp); 5313 5314 /* There are still records in this block. */ 5315 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5316 return true; 5317 5318 /* There are more record blocks. */ 5319 if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) 5320 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5321 else 5322 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5323 } 5324 5325 /* Set up all the btree cursor caches. 
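 * If any cache allocation fails, the caches that were already created
 * are torn down before the error is returned.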
*/ 5326 int __init 5327 xfs_btree_init_cur_caches(void) 5328 { 5329 int error; 5330 5331 error = xfs_allocbt_init_cur_cache(); 5332 if (error) 5333 return error; 5334 error = xfs_inobt_init_cur_cache(); 5335 if (error) 5336 goto err; 5337 error = xfs_bmbt_init_cur_cache(); 5338 if (error) 5339 goto err; 5340 error = xfs_rmapbt_init_cur_cache(); 5341 if (error) 5342 goto err; 5343 error = xfs_refcountbt_init_cur_cache(); 5344 if (error) 5345 goto err; 5346 5347 return 0; 5348 err: 5349 xfs_btree_destroy_cur_caches(); 5350 return error; 5351 } 5352 5353 /* Destroy all the btree cursor caches, if they've been allocated. */ 5354 void 5355 xfs_btree_destroy_cur_caches(void) 5356 { 5357 xfs_allocbt_destroy_cur_cache(); 5358 xfs_inobt_destroy_cur_cache(); 5359 xfs_bmbt_destroy_cur_cache(); 5360 xfs_rmapbt_destroy_cur_cache(); 5361 xfs_refcountbt_destroy_cur_cache(); 5362 } 5363 5364 /* Move the btree cursor before the first record. */ 5365 int 5366 xfs_btree_goto_left_edge( 5367 struct xfs_btree_cur *cur) 5368 { 5369 int stat = 0; 5370 int error; 5371 5372 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 5373 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 5374 if (error) 5375 return error; 5376 if (!stat) 5377 return 0; 5378 5379 error = xfs_btree_decrement(cur, 0, &stat); 5380 if (error) 5381 return error; 5382 if (stat != 0) { 5383 ASSERT(0); 5384 xfs_btree_mark_sick(cur); 5385 return -EFSCORRUPTED; 5386 } 5387 5388 return 0; 5389 } 5390
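/*
 * Illustrative sketch only (these helpers are hypothetical and not part
 * of the kernel sources): counting the records in a btree by pairing
 * xfs_btree_query_all() with a trivial callback, in the same style as
 * xfs_btree_count_blocks() above.
 */
static int
xfs_btree_count_records_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	unsigned long long		*nr = priv;

	(*nr)++;
	return 0;
}

/* Count the records in a btree and return the result in *nr. */
static int
xfs_btree_count_records(
	struct xfs_btree_cur		*cur,
	unsigned long long		*nr)
{
	*nr = 0;
	return xfs_btree_query_all(cur, xfs_btree_count_records_helper, nr);
}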