1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_inode.h" 15 #include "xfs_trans.h" 16 #include "xfs_buf_item.h" 17 #include "xfs_btree.h" 18 #include "xfs_errortag.h" 19 #include "xfs_error.h" 20 #include "xfs_trace.h" 21 #include "xfs_alloc.h" 22 #include "xfs_log.h" 23 #include "xfs_btree_staging.h" 24 #include "xfs_ag.h" 25 #include "xfs_alloc_btree.h" 26 #include "xfs_ialloc_btree.h" 27 #include "xfs_bmap_btree.h" 28 #include "xfs_rmap_btree.h" 29 #include "xfs_refcount_btree.h" 30 31 /* 32 * Btree magic numbers. 33 */ 34 static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { 35 { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, 36 XFS_FIBT_MAGIC, 0 }, 37 { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, 38 XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, 39 XFS_REFC_CRC_MAGIC } 40 }; 41 42 uint32_t 43 xfs_btree_magic( 44 int crc, 45 xfs_btnum_t btnum) 46 { 47 uint32_t magic = xfs_magics[crc][btnum]; 48 49 /* Ensure we asked for crc for crc-only magics. */ 50 ASSERT(magic != 0); 51 return magic; 52 } 53 54 /* 55 * These sibling pointer checks are optimised for null sibling pointers. This 56 * happens a lot, and we don't need to byte swap at runtime if the sibling 57 * pointer is NULL. 58 * 59 * These are explicitly marked at inline because the cost of calling them as 60 * functions instead of inlining them is about 36 bytes extra code per call site 61 * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these 62 * two sibling check functions reduces the compiled code size by over 300 63 * bytes. 64 */ 65 static inline xfs_failaddr_t 66 xfs_btree_check_lblock_siblings( 67 struct xfs_mount *mp, 68 struct xfs_btree_cur *cur, 69 int level, 70 xfs_fsblock_t fsb, 71 __be64 dsibling) 72 { 73 xfs_fsblock_t sibling; 74 75 if (dsibling == cpu_to_be64(NULLFSBLOCK)) 76 return NULL; 77 78 sibling = be64_to_cpu(dsibling); 79 if (sibling == fsb) 80 return __this_address; 81 if (level >= 0) { 82 if (!xfs_btree_check_lptr(cur, sibling, level + 1)) 83 return __this_address; 84 } else { 85 if (!xfs_verify_fsbno(mp, sibling)) 86 return __this_address; 87 } 88 89 return NULL; 90 } 91 92 static inline xfs_failaddr_t 93 xfs_btree_check_sblock_siblings( 94 struct xfs_perag *pag, 95 struct xfs_btree_cur *cur, 96 int level, 97 xfs_agblock_t agbno, 98 __be32 dsibling) 99 { 100 xfs_agblock_t sibling; 101 102 if (dsibling == cpu_to_be32(NULLAGBLOCK)) 103 return NULL; 104 105 sibling = be32_to_cpu(dsibling); 106 if (sibling == agbno) 107 return __this_address; 108 if (level >= 0) { 109 if (!xfs_btree_check_sptr(cur, sibling, level + 1)) 110 return __this_address; 111 } else { 112 if (!xfs_verify_agbno(pag, sibling)) 113 return __this_address; 114 } 115 return NULL; 116 } 117 118 /* 119 * Check a long btree block header. Return the address of the failing check, 120 * or NULL if everything is ok. 121 */ 122 xfs_failaddr_t 123 __xfs_btree_check_lblock( 124 struct xfs_btree_cur *cur, 125 struct xfs_btree_block *block, 126 int level, 127 struct xfs_buf *bp) 128 { 129 struct xfs_mount *mp = cur->bc_mp; 130 xfs_btnum_t btnum = cur->bc_btnum; 131 int crc = xfs_has_crc(mp); 132 xfs_failaddr_t fa; 133 xfs_fsblock_t fsb = NULLFSBLOCK; 134 135 if (crc) { 136 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 137 return __this_address; 138 if (block->bb_u.l.bb_blkno != 139 cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) 140 return __this_address; 141 if (block->bb_u.l.bb_pad != cpu_to_be32(0)) 142 return __this_address; 143 } 144 145 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) 146 return __this_address; 147 if (be16_to_cpu(block->bb_level) != level) 148 return __this_address; 149 if (be16_to_cpu(block->bb_numrecs) > 150 cur->bc_ops->get_maxrecs(cur, level)) 151 return __this_address; 152 153 if (bp) 154 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 155 156 fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, 157 block->bb_u.l.bb_leftsib); 158 if (!fa) 159 fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, 160 block->bb_u.l.bb_rightsib); 161 return fa; 162 } 163 164 /* Check a long btree block header. */ 165 static int 166 xfs_btree_check_lblock( 167 struct xfs_btree_cur *cur, 168 struct xfs_btree_block *block, 169 int level, 170 struct xfs_buf *bp) 171 { 172 struct xfs_mount *mp = cur->bc_mp; 173 xfs_failaddr_t fa; 174 175 fa = __xfs_btree_check_lblock(cur, block, level, bp); 176 if (XFS_IS_CORRUPT(mp, fa != NULL) || 177 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { 178 if (bp) 179 trace_xfs_btree_corrupt(bp, _RET_IP_); 180 return -EFSCORRUPTED; 181 } 182 return 0; 183 } 184 185 /* 186 * Check a short btree block header. Return the address of the failing check, 187 * or NULL if everything is ok. 188 */ 189 xfs_failaddr_t 190 __xfs_btree_check_sblock( 191 struct xfs_btree_cur *cur, 192 struct xfs_btree_block *block, 193 int level, 194 struct xfs_buf *bp) 195 { 196 struct xfs_mount *mp = cur->bc_mp; 197 struct xfs_perag *pag = cur->bc_ag.pag; 198 xfs_btnum_t btnum = cur->bc_btnum; 199 int crc = xfs_has_crc(mp); 200 xfs_failaddr_t fa; 201 xfs_agblock_t agbno = NULLAGBLOCK; 202 203 if (crc) { 204 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 205 return __this_address; 206 if (block->bb_u.s.bb_blkno != 207 cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) 208 return __this_address; 209 } 210 211 if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) 212 return __this_address; 213 if (be16_to_cpu(block->bb_level) != level) 214 return __this_address; 215 if (be16_to_cpu(block->bb_numrecs) > 216 cur->bc_ops->get_maxrecs(cur, level)) 217 return __this_address; 218 219 if (bp) 220 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 221 222 fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, 223 block->bb_u.s.bb_leftsib); 224 if (!fa) 225 fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, 226 block->bb_u.s.bb_rightsib); 227 return fa; 228 } 229 230 /* Check a short btree block header. */ 231 STATIC int 232 xfs_btree_check_sblock( 233 struct xfs_btree_cur *cur, 234 struct xfs_btree_block *block, 235 int level, 236 struct xfs_buf *bp) 237 { 238 struct xfs_mount *mp = cur->bc_mp; 239 xfs_failaddr_t fa; 240 241 fa = __xfs_btree_check_sblock(cur, block, level, bp); 242 if (XFS_IS_CORRUPT(mp, fa != NULL) || 243 XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { 244 if (bp) 245 trace_xfs_btree_corrupt(bp, _RET_IP_); 246 return -EFSCORRUPTED; 247 } 248 return 0; 249 } 250 251 /* 252 * Debug routine: check that block header is ok. 253 */ 254 int 255 xfs_btree_check_block( 256 struct xfs_btree_cur *cur, /* btree cursor */ 257 struct xfs_btree_block *block, /* generic btree block pointer */ 258 int level, /* level of the btree block */ 259 struct xfs_buf *bp) /* buffer containing block, if any */ 260 { 261 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 262 return xfs_btree_check_lblock(cur, block, level, bp); 263 else 264 return xfs_btree_check_sblock(cur, block, level, bp); 265 } 266 267 /* Check that this long pointer is valid and points within the fs. */ 268 bool 269 xfs_btree_check_lptr( 270 struct xfs_btree_cur *cur, 271 xfs_fsblock_t fsbno, 272 int level) 273 { 274 if (level <= 0) 275 return false; 276 return xfs_verify_fsbno(cur->bc_mp, fsbno); 277 } 278 279 /* Check that this short pointer is valid and points within the AG. */ 280 bool 281 xfs_btree_check_sptr( 282 struct xfs_btree_cur *cur, 283 xfs_agblock_t agbno, 284 int level) 285 { 286 if (level <= 0) 287 return false; 288 return xfs_verify_agbno(cur->bc_ag.pag, agbno); 289 } 290 291 /* 292 * Check that a given (indexed) btree pointer at a certain level of a 293 * btree is valid and doesn't point past where it should. 294 */ 295 static int 296 xfs_btree_check_ptr( 297 struct xfs_btree_cur *cur, 298 const union xfs_btree_ptr *ptr, 299 int index, 300 int level) 301 { 302 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 303 if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), 304 level)) 305 return 0; 306 xfs_err(cur->bc_mp, 307 "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", 308 cur->bc_ino.ip->i_ino, 309 cur->bc_ino.whichfork, cur->bc_btnum, 310 level, index); 311 } else { 312 if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), 313 level)) 314 return 0; 315 xfs_err(cur->bc_mp, 316 "AG %u: Corrupt btree %d pointer at level %d index %d.", 317 cur->bc_ag.pag->pag_agno, cur->bc_btnum, 318 level, index); 319 } 320 321 return -EFSCORRUPTED; 322 } 323 324 #ifdef DEBUG 325 # define xfs_btree_debug_check_ptr xfs_btree_check_ptr 326 #else 327 # define xfs_btree_debug_check_ptr(...) (0) 328 #endif 329 330 /* 331 * Calculate CRC on the whole btree block and stuff it into the 332 * long-form btree header. 333 * 334 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 335 * it into the buffer so recovery knows what the last modification was that made 336 * it to disk. 337 */ 338 void 339 xfs_btree_lblock_calc_crc( 340 struct xfs_buf *bp) 341 { 342 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 343 struct xfs_buf_log_item *bip = bp->b_log_item; 344 345 if (!xfs_has_crc(bp->b_mount)) 346 return; 347 if (bip) 348 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 349 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 350 } 351 352 bool 353 xfs_btree_lblock_verify_crc( 354 struct xfs_buf *bp) 355 { 356 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 357 struct xfs_mount *mp = bp->b_mount; 358 359 if (xfs_has_crc(mp)) { 360 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) 361 return false; 362 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); 363 } 364 365 return true; 366 } 367 368 /* 369 * Calculate CRC on the whole btree block and stuff it into the 370 * short-form btree header. 371 * 372 * Prior to calculting the CRC, pull the LSN out of the buffer log item and put 373 * it into the buffer so recovery knows what the last modification was that made 374 * it to disk. 375 */ 376 void 377 xfs_btree_sblock_calc_crc( 378 struct xfs_buf *bp) 379 { 380 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 381 struct xfs_buf_log_item *bip = bp->b_log_item; 382 383 if (!xfs_has_crc(bp->b_mount)) 384 return; 385 if (bip) 386 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 387 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 388 } 389 390 bool 391 xfs_btree_sblock_verify_crc( 392 struct xfs_buf *bp) 393 { 394 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 395 struct xfs_mount *mp = bp->b_mount; 396 397 if (xfs_has_crc(mp)) { 398 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) 399 return false; 400 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); 401 } 402 403 return true; 404 } 405 406 static int 407 xfs_btree_free_block( 408 struct xfs_btree_cur *cur, 409 struct xfs_buf *bp) 410 { 411 int error; 412 413 error = cur->bc_ops->free_block(cur, bp); 414 if (!error) { 415 xfs_trans_binval(cur->bc_tp, bp); 416 XFS_BTREE_STATS_INC(cur, free); 417 } 418 return error; 419 } 420 421 /* 422 * Delete the btree cursor. 423 */ 424 void 425 xfs_btree_del_cursor( 426 struct xfs_btree_cur *cur, /* btree cursor */ 427 int error) /* del because of error */ 428 { 429 int i; /* btree level */ 430 431 /* 432 * Clear the buffer pointers and release the buffers. If we're doing 433 * this because of an error, inspect all of the entries in the bc_bufs 434 * array for buffers to be unlocked. This is because some of the btree 435 * code works from level n down to 0, and if we get an error along the 436 * way we won't have initialized all the entries down to 0. 437 */ 438 for (i = 0; i < cur->bc_nlevels; i++) { 439 if (cur->bc_levels[i].bp) 440 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); 441 else if (!error) 442 break; 443 } 444 445 /* 446 * If we are doing a BMBT update, the number of unaccounted blocks 447 * allocated during this cursor life time should be zero. If it's not 448 * zero, then we should be shut down or on our way to shutdown due to 449 * cancelling a dirty transaction on error. 450 */ 451 ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || 452 xfs_is_shutdown(cur->bc_mp) || error != 0); 453 if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) 454 kmem_free(cur->bc_ops); 455 if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) 456 xfs_perag_put(cur->bc_ag.pag); 457 kmem_cache_free(cur->bc_cache, cur); 458 } 459 460 /* 461 * Duplicate the btree cursor. 462 * Allocate a new one, copy the record, re-get the buffers. 463 */ 464 int /* error */ 465 xfs_btree_dup_cursor( 466 struct xfs_btree_cur *cur, /* input cursor */ 467 struct xfs_btree_cur **ncur) /* output cursor */ 468 { 469 struct xfs_buf *bp; /* btree block's buffer pointer */ 470 int error; /* error return value */ 471 int i; /* level number of btree block */ 472 xfs_mount_t *mp; /* mount structure for filesystem */ 473 struct xfs_btree_cur *new; /* new cursor value */ 474 xfs_trans_t *tp; /* transaction pointer, can be NULL */ 475 476 tp = cur->bc_tp; 477 mp = cur->bc_mp; 478 479 /* 480 * Allocate a new cursor like the old one. 481 */ 482 new = cur->bc_ops->dup_cursor(cur); 483 484 /* 485 * Copy the record currently in the cursor. 486 */ 487 new->bc_rec = cur->bc_rec; 488 489 /* 490 * For each level current, re-get the buffer and copy the ptr value. 491 */ 492 for (i = 0; i < new->bc_nlevels; i++) { 493 new->bc_levels[i].ptr = cur->bc_levels[i].ptr; 494 new->bc_levels[i].ra = cur->bc_levels[i].ra; 495 bp = cur->bc_levels[i].bp; 496 if (bp) { 497 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 498 xfs_buf_daddr(bp), mp->m_bsize, 499 0, &bp, 500 cur->bc_ops->buf_ops); 501 if (error) { 502 xfs_btree_del_cursor(new, error); 503 *ncur = NULL; 504 return error; 505 } 506 } 507 new->bc_levels[i].bp = bp; 508 } 509 *ncur = new; 510 return 0; 511 } 512 513 /* 514 * XFS btree block layout and addressing: 515 * 516 * There are two types of blocks in the btree: leaf and non-leaf blocks. 517 * 518 * The leaf record start with a header then followed by records containing 519 * the values. A non-leaf block also starts with the same header, and 520 * then first contains lookup keys followed by an equal number of pointers 521 * to the btree blocks at the previous level. 522 * 523 * +--------+-------+-------+-------+-------+-------+-------+ 524 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | 525 * +--------+-------+-------+-------+-------+-------+-------+ 526 * 527 * +--------+-------+-------+-------+-------+-------+-------+ 528 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | 529 * +--------+-------+-------+-------+-------+-------+-------+ 530 * 531 * The header is called struct xfs_btree_block for reasons better left unknown 532 * and comes in different versions for short (32bit) and long (64bit) block 533 * pointers. The record and key structures are defined by the btree instances 534 * and opaque to the btree core. The block pointers are simple disk endian 535 * integers, available in a short (32bit) and long (64bit) variant. 536 * 537 * The helpers below calculate the offset of a given record, key or pointer 538 * into a btree block (xfs_btree_*_offset) or return a pointer to the given 539 * record, key or pointer (xfs_btree_*_addr). Note that all addressing 540 * inside the btree block is done using indices starting at one, not zero! 541 * 542 * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing 543 * overlapping intervals. In such a tree, records are still sorted lowest to 544 * highest and indexed by the smallest key value that refers to the record. 545 * However, nodes are different: each pointer has two associated keys -- one 546 * indexing the lowest key available in the block(s) below (the same behavior 547 * as the key in a regular btree) and another indexing the highest key 548 * available in the block(s) below. Because records are /not/ sorted by the 549 * highest key, all leaf block updates require us to compute the highest key 550 * that matches any record in the leaf and to recursively update the high keys 551 * in the nodes going further up in the tree, if necessary. Nodes look like 552 * this: 553 * 554 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 555 * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | 556 * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ 557 * 558 * To perform an interval query on an overlapped tree, perform the usual 559 * depth-first search and use the low and high keys to decide if we can skip 560 * that particular node. If a leaf node is reached, return the records that 561 * intersect the interval. Note that an interval query may return numerous 562 * entries. For a non-overlapped tree, simply search for the record associated 563 * with the lowest key and iterate forward until a non-matching record is 564 * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by 565 * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in 566 * more detail. 567 * 568 * Why do we care about overlapping intervals? Let's say you have a bunch of 569 * reverse mapping records on a reflink filesystem: 570 * 571 * 1: +- file A startblock B offset C length D -----------+ 572 * 2: +- file E startblock F offset G length H --------------+ 573 * 3: +- file I startblock F offset J length K --+ 574 * 4: +- file L... --+ 575 * 576 * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, 577 * we'd simply increment the length of record 1. But how do we find the record 578 * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return 579 * record 3 because the keys are ordered first by startblock. An interval 580 * query would return records 1 and 2 because they both overlap (B+D-1), and 581 * from that we can pick out record 1 as the appropriate left neighbor. 582 * 583 * In the non-overlapped case you can do a LE lookup and decrement the cursor 584 * because a record's interval must end before the next record. 585 */ 586 587 /* 588 * Return size of the btree block header for this btree instance. 589 */ 590 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) 591 { 592 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 593 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) 594 return XFS_BTREE_LBLOCK_CRC_LEN; 595 return XFS_BTREE_LBLOCK_LEN; 596 } 597 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) 598 return XFS_BTREE_SBLOCK_CRC_LEN; 599 return XFS_BTREE_SBLOCK_LEN; 600 } 601 602 /* 603 * Return size of btree block pointers for this btree instance. 604 */ 605 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) 606 { 607 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 608 sizeof(__be64) : sizeof(__be32); 609 } 610 611 /* 612 * Calculate offset of the n-th record in a btree block. 613 */ 614 STATIC size_t 615 xfs_btree_rec_offset( 616 struct xfs_btree_cur *cur, 617 int n) 618 { 619 return xfs_btree_block_len(cur) + 620 (n - 1) * cur->bc_ops->rec_len; 621 } 622 623 /* 624 * Calculate offset of the n-th key in a btree block. 625 */ 626 STATIC size_t 627 xfs_btree_key_offset( 628 struct xfs_btree_cur *cur, 629 int n) 630 { 631 return xfs_btree_block_len(cur) + 632 (n - 1) * cur->bc_ops->key_len; 633 } 634 635 /* 636 * Calculate offset of the n-th high key in a btree block. 637 */ 638 STATIC size_t 639 xfs_btree_high_key_offset( 640 struct xfs_btree_cur *cur, 641 int n) 642 { 643 return xfs_btree_block_len(cur) + 644 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); 645 } 646 647 /* 648 * Calculate offset of the n-th block pointer in a btree block. 649 */ 650 STATIC size_t 651 xfs_btree_ptr_offset( 652 struct xfs_btree_cur *cur, 653 int n, 654 int level) 655 { 656 return xfs_btree_block_len(cur) + 657 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + 658 (n - 1) * xfs_btree_ptr_len(cur); 659 } 660 661 /* 662 * Return a pointer to the n-th record in the btree block. 663 */ 664 union xfs_btree_rec * 665 xfs_btree_rec_addr( 666 struct xfs_btree_cur *cur, 667 int n, 668 struct xfs_btree_block *block) 669 { 670 return (union xfs_btree_rec *) 671 ((char *)block + xfs_btree_rec_offset(cur, n)); 672 } 673 674 /* 675 * Return a pointer to the n-th key in the btree block. 676 */ 677 union xfs_btree_key * 678 xfs_btree_key_addr( 679 struct xfs_btree_cur *cur, 680 int n, 681 struct xfs_btree_block *block) 682 { 683 return (union xfs_btree_key *) 684 ((char *)block + xfs_btree_key_offset(cur, n)); 685 } 686 687 /* 688 * Return a pointer to the n-th high key in the btree block. 689 */ 690 union xfs_btree_key * 691 xfs_btree_high_key_addr( 692 struct xfs_btree_cur *cur, 693 int n, 694 struct xfs_btree_block *block) 695 { 696 return (union xfs_btree_key *) 697 ((char *)block + xfs_btree_high_key_offset(cur, n)); 698 } 699 700 /* 701 * Return a pointer to the n-th block pointer in the btree block. 702 */ 703 union xfs_btree_ptr * 704 xfs_btree_ptr_addr( 705 struct xfs_btree_cur *cur, 706 int n, 707 struct xfs_btree_block *block) 708 { 709 int level = xfs_btree_get_level(block); 710 711 ASSERT(block->bb_level != 0); 712 713 return (union xfs_btree_ptr *) 714 ((char *)block + xfs_btree_ptr_offset(cur, n, level)); 715 } 716 717 struct xfs_ifork * 718 xfs_btree_ifork_ptr( 719 struct xfs_btree_cur *cur) 720 { 721 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 722 723 if (cur->bc_flags & XFS_BTREE_STAGING) 724 return cur->bc_ino.ifake->if_fork; 725 return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); 726 } 727 728 /* 729 * Get the root block which is stored in the inode. 730 * 731 * For now this btree implementation assumes the btree root is always 732 * stored in the if_broot field of an inode fork. 733 */ 734 STATIC struct xfs_btree_block * 735 xfs_btree_get_iroot( 736 struct xfs_btree_cur *cur) 737 { 738 struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); 739 740 return (struct xfs_btree_block *)ifp->if_broot; 741 } 742 743 /* 744 * Retrieve the block pointer from the cursor at the given level. 745 * This may be an inode btree root or from a buffer. 746 */ 747 struct xfs_btree_block * /* generic btree block pointer */ 748 xfs_btree_get_block( 749 struct xfs_btree_cur *cur, /* btree cursor */ 750 int level, /* level in btree */ 751 struct xfs_buf **bpp) /* buffer containing the block */ 752 { 753 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 754 (level == cur->bc_nlevels - 1)) { 755 *bpp = NULL; 756 return xfs_btree_get_iroot(cur); 757 } 758 759 *bpp = cur->bc_levels[level].bp; 760 return XFS_BUF_TO_BLOCK(*bpp); 761 } 762 763 /* 764 * Change the cursor to point to the first record at the given level. 765 * Other levels are unaffected. 766 */ 767 STATIC int /* success=1, failure=0 */ 768 xfs_btree_firstrec( 769 struct xfs_btree_cur *cur, /* btree cursor */ 770 int level) /* level to change */ 771 { 772 struct xfs_btree_block *block; /* generic btree block pointer */ 773 struct xfs_buf *bp; /* buffer containing block */ 774 775 /* 776 * Get the block pointer for this level. 777 */ 778 block = xfs_btree_get_block(cur, level, &bp); 779 if (xfs_btree_check_block(cur, block, level, bp)) 780 return 0; 781 /* 782 * It's empty, there is no such record. 783 */ 784 if (!block->bb_numrecs) 785 return 0; 786 /* 787 * Set the ptr value to 1, that's the first record/key. 788 */ 789 cur->bc_levels[level].ptr = 1; 790 return 1; 791 } 792 793 /* 794 * Change the cursor to point to the last record in the current block 795 * at the given level. Other levels are unaffected. 796 */ 797 STATIC int /* success=1, failure=0 */ 798 xfs_btree_lastrec( 799 struct xfs_btree_cur *cur, /* btree cursor */ 800 int level) /* level to change */ 801 { 802 struct xfs_btree_block *block; /* generic btree block pointer */ 803 struct xfs_buf *bp; /* buffer containing block */ 804 805 /* 806 * Get the block pointer for this level. 807 */ 808 block = xfs_btree_get_block(cur, level, &bp); 809 if (xfs_btree_check_block(cur, block, level, bp)) 810 return 0; 811 /* 812 * It's empty, there is no such record. 813 */ 814 if (!block->bb_numrecs) 815 return 0; 816 /* 817 * Set the ptr value to numrecs, that's the last record/key. 818 */ 819 cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); 820 return 1; 821 } 822 823 /* 824 * Compute first and last byte offsets for the fields given. 825 * Interprets the offsets table, which contains struct field offsets. 826 */ 827 void 828 xfs_btree_offsets( 829 uint32_t fields, /* bitmask of fields */ 830 const short *offsets, /* table of field offsets */ 831 int nbits, /* number of bits to inspect */ 832 int *first, /* output: first byte offset */ 833 int *last) /* output: last byte offset */ 834 { 835 int i; /* current bit number */ 836 uint32_t imask; /* mask for current bit number */ 837 838 ASSERT(fields != 0); 839 /* 840 * Find the lowest bit, so the first byte offset. 841 */ 842 for (i = 0, imask = 1u; ; i++, imask <<= 1) { 843 if (imask & fields) { 844 *first = offsets[i]; 845 break; 846 } 847 } 848 /* 849 * Find the highest bit, so the last byte offset. 850 */ 851 for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { 852 if (imask & fields) { 853 *last = offsets[i + 1] - 1; 854 break; 855 } 856 } 857 } 858 859 /* 860 * Get a buffer for the block, return it read in. 861 * Long-form addressing. 862 */ 863 int 864 xfs_btree_read_bufl( 865 struct xfs_mount *mp, /* file system mount point */ 866 struct xfs_trans *tp, /* transaction pointer */ 867 xfs_fsblock_t fsbno, /* file system block number */ 868 struct xfs_buf **bpp, /* buffer for fsbno */ 869 int refval, /* ref count value for buffer */ 870 const struct xfs_buf_ops *ops) 871 { 872 struct xfs_buf *bp; /* return value */ 873 xfs_daddr_t d; /* real disk block address */ 874 int error; 875 876 if (!xfs_verify_fsbno(mp, fsbno)) 877 return -EFSCORRUPTED; 878 d = XFS_FSB_TO_DADDR(mp, fsbno); 879 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 880 mp->m_bsize, 0, &bp, ops); 881 if (error) 882 return error; 883 if (bp) 884 xfs_buf_set_ref(bp, refval); 885 *bpp = bp; 886 return 0; 887 } 888 889 /* 890 * Read-ahead the block, don't wait for it, don't return a buffer. 891 * Long-form addressing. 892 */ 893 /* ARGSUSED */ 894 void 895 xfs_btree_reada_bufl( 896 struct xfs_mount *mp, /* file system mount point */ 897 xfs_fsblock_t fsbno, /* file system block number */ 898 xfs_extlen_t count, /* count of filesystem blocks */ 899 const struct xfs_buf_ops *ops) 900 { 901 xfs_daddr_t d; 902 903 ASSERT(fsbno != NULLFSBLOCK); 904 d = XFS_FSB_TO_DADDR(mp, fsbno); 905 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); 906 } 907 908 /* 909 * Read-ahead the block, don't wait for it, don't return a buffer. 910 * Short-form addressing. 911 */ 912 /* ARGSUSED */ 913 void 914 xfs_btree_reada_bufs( 915 struct xfs_mount *mp, /* file system mount point */ 916 xfs_agnumber_t agno, /* allocation group number */ 917 xfs_agblock_t agbno, /* allocation group block number */ 918 xfs_extlen_t count, /* count of filesystem blocks */ 919 const struct xfs_buf_ops *ops) 920 { 921 xfs_daddr_t d; 922 923 ASSERT(agno != NULLAGNUMBER); 924 ASSERT(agbno != NULLAGBLOCK); 925 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 926 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); 927 } 928 929 STATIC int 930 xfs_btree_readahead_lblock( 931 struct xfs_btree_cur *cur, 932 int lr, 933 struct xfs_btree_block *block) 934 { 935 int rval = 0; 936 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 937 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 938 939 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { 940 xfs_btree_reada_bufl(cur->bc_mp, left, 1, 941 cur->bc_ops->buf_ops); 942 rval++; 943 } 944 945 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { 946 xfs_btree_reada_bufl(cur->bc_mp, right, 1, 947 cur->bc_ops->buf_ops); 948 rval++; 949 } 950 951 return rval; 952 } 953 954 STATIC int 955 xfs_btree_readahead_sblock( 956 struct xfs_btree_cur *cur, 957 int lr, 958 struct xfs_btree_block *block) 959 { 960 int rval = 0; 961 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); 962 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); 963 964 965 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 966 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, 967 left, 1, cur->bc_ops->buf_ops); 968 rval++; 969 } 970 971 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 972 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, 973 right, 1, cur->bc_ops->buf_ops); 974 rval++; 975 } 976 977 return rval; 978 } 979 980 /* 981 * Read-ahead btree blocks, at the given level. 982 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 983 */ 984 STATIC int 985 xfs_btree_readahead( 986 struct xfs_btree_cur *cur, /* btree cursor */ 987 int lev, /* level in btree */ 988 int lr) /* left/right bits */ 989 { 990 struct xfs_btree_block *block; 991 992 /* 993 * No readahead needed if we are at the root level and the 994 * btree root is stored in the inode. 995 */ 996 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 997 (lev == cur->bc_nlevels - 1)) 998 return 0; 999 1000 if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) 1001 return 0; 1002 1003 cur->bc_levels[lev].ra |= lr; 1004 block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); 1005 1006 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1007 return xfs_btree_readahead_lblock(cur, lr, block); 1008 return xfs_btree_readahead_sblock(cur, lr, block); 1009 } 1010 1011 STATIC int 1012 xfs_btree_ptr_to_daddr( 1013 struct xfs_btree_cur *cur, 1014 const union xfs_btree_ptr *ptr, 1015 xfs_daddr_t *daddr) 1016 { 1017 xfs_fsblock_t fsbno; 1018 xfs_agblock_t agbno; 1019 int error; 1020 1021 error = xfs_btree_check_ptr(cur, ptr, 0, 1); 1022 if (error) 1023 return error; 1024 1025 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1026 fsbno = be64_to_cpu(ptr->l); 1027 *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); 1028 } else { 1029 agbno = be32_to_cpu(ptr->s); 1030 *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, 1031 agbno); 1032 } 1033 1034 return 0; 1035 } 1036 1037 /* 1038 * Readahead @count btree blocks at the given @ptr location. 1039 * 1040 * We don't need to care about long or short form btrees here as we have a 1041 * method of converting the ptr directly to a daddr available to us. 1042 */ 1043 STATIC void 1044 xfs_btree_readahead_ptr( 1045 struct xfs_btree_cur *cur, 1046 union xfs_btree_ptr *ptr, 1047 xfs_extlen_t count) 1048 { 1049 xfs_daddr_t daddr; 1050 1051 if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) 1052 return; 1053 xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, 1054 cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); 1055 } 1056 1057 /* 1058 * Set the buffer for level "lev" in the cursor to bp, releasing 1059 * any previous buffer. 1060 */ 1061 STATIC void 1062 xfs_btree_setbuf( 1063 struct xfs_btree_cur *cur, /* btree cursor */ 1064 int lev, /* level in btree */ 1065 struct xfs_buf *bp) /* new buffer to set */ 1066 { 1067 struct xfs_btree_block *b; /* btree block */ 1068 1069 if (cur->bc_levels[lev].bp) 1070 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); 1071 cur->bc_levels[lev].bp = bp; 1072 cur->bc_levels[lev].ra = 0; 1073 1074 b = XFS_BUF_TO_BLOCK(bp); 1075 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1076 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) 1077 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1078 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) 1079 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1080 } else { 1081 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 1082 cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; 1083 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) 1084 cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; 1085 } 1086 } 1087 1088 bool 1089 xfs_btree_ptr_is_null( 1090 struct xfs_btree_cur *cur, 1091 const union xfs_btree_ptr *ptr) 1092 { 1093 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1094 return ptr->l == cpu_to_be64(NULLFSBLOCK); 1095 else 1096 return ptr->s == cpu_to_be32(NULLAGBLOCK); 1097 } 1098 1099 void 1100 xfs_btree_set_ptr_null( 1101 struct xfs_btree_cur *cur, 1102 union xfs_btree_ptr *ptr) 1103 { 1104 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1105 ptr->l = cpu_to_be64(NULLFSBLOCK); 1106 else 1107 ptr->s = cpu_to_be32(NULLAGBLOCK); 1108 } 1109 1110 /* 1111 * Get/set/init sibling pointers 1112 */ 1113 void 1114 xfs_btree_get_sibling( 1115 struct xfs_btree_cur *cur, 1116 struct xfs_btree_block *block, 1117 union xfs_btree_ptr *ptr, 1118 int lr) 1119 { 1120 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1121 1122 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1123 if (lr == XFS_BB_RIGHTSIB) 1124 ptr->l = block->bb_u.l.bb_rightsib; 1125 else 1126 ptr->l = block->bb_u.l.bb_leftsib; 1127 } else { 1128 if (lr == XFS_BB_RIGHTSIB) 1129 ptr->s = block->bb_u.s.bb_rightsib; 1130 else 1131 ptr->s = block->bb_u.s.bb_leftsib; 1132 } 1133 } 1134 1135 void 1136 xfs_btree_set_sibling( 1137 struct xfs_btree_cur *cur, 1138 struct xfs_btree_block *block, 1139 const union xfs_btree_ptr *ptr, 1140 int lr) 1141 { 1142 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); 1143 1144 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 1145 if (lr == XFS_BB_RIGHTSIB) 1146 block->bb_u.l.bb_rightsib = ptr->l; 1147 else 1148 block->bb_u.l.bb_leftsib = ptr->l; 1149 } else { 1150 if (lr == XFS_BB_RIGHTSIB) 1151 block->bb_u.s.bb_rightsib = ptr->s; 1152 else 1153 block->bb_u.s.bb_leftsib = ptr->s; 1154 } 1155 } 1156 1157 void 1158 xfs_btree_init_block_int( 1159 struct xfs_mount *mp, 1160 struct xfs_btree_block *buf, 1161 xfs_daddr_t blkno, 1162 xfs_btnum_t btnum, 1163 __u16 level, 1164 __u16 numrecs, 1165 __u64 owner, 1166 unsigned int flags) 1167 { 1168 int crc = xfs_has_crc(mp); 1169 __u32 magic = xfs_btree_magic(crc, btnum); 1170 1171 buf->bb_magic = cpu_to_be32(magic); 1172 buf->bb_level = cpu_to_be16(level); 1173 buf->bb_numrecs = cpu_to_be16(numrecs); 1174 1175 if (flags & XFS_BTREE_LONG_PTRS) { 1176 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); 1177 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 1178 if (crc) { 1179 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1180 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1181 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); 1182 buf->bb_u.l.bb_pad = 0; 1183 buf->bb_u.l.bb_lsn = 0; 1184 } 1185 } else { 1186 /* owner is a 32 bit value on short blocks */ 1187 __u32 __owner = (__u32)owner; 1188 1189 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); 1190 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); 1191 if (crc) { 1192 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 1193 buf->bb_u.s.bb_owner = cpu_to_be32(__owner); 1194 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); 1195 buf->bb_u.s.bb_lsn = 0; 1196 } 1197 } 1198 } 1199 1200 void 1201 xfs_btree_init_block( 1202 struct xfs_mount *mp, 1203 struct xfs_buf *bp, 1204 xfs_btnum_t btnum, 1205 __u16 level, 1206 __u16 numrecs, 1207 __u64 owner) 1208 { 1209 xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), 1210 btnum, level, numrecs, owner, 0); 1211 } 1212 1213 void 1214 xfs_btree_init_block_cur( 1215 struct xfs_btree_cur *cur, 1216 struct xfs_buf *bp, 1217 int level, 1218 int numrecs) 1219 { 1220 __u64 owner; 1221 1222 /* 1223 * we can pull the owner from the cursor right now as the different 1224 * owners align directly with the pointer size of the btree. This may 1225 * change in future, but is safe for current users of the generic btree 1226 * code. 1227 */ 1228 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1229 owner = cur->bc_ino.ip->i_ino; 1230 else 1231 owner = cur->bc_ag.pag->pag_agno; 1232 1233 xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), 1234 xfs_buf_daddr(bp), cur->bc_btnum, level, 1235 numrecs, owner, cur->bc_flags); 1236 } 1237 1238 /* 1239 * Return true if ptr is the last record in the btree and 1240 * we need to track updates to this record. The decision 1241 * will be further refined in the update_lastrec method. 1242 */ 1243 STATIC int 1244 xfs_btree_is_lastrec( 1245 struct xfs_btree_cur *cur, 1246 struct xfs_btree_block *block, 1247 int level) 1248 { 1249 union xfs_btree_ptr ptr; 1250 1251 if (level > 0) 1252 return 0; 1253 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) 1254 return 0; 1255 1256 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1257 if (!xfs_btree_ptr_is_null(cur, &ptr)) 1258 return 0; 1259 return 1; 1260 } 1261 1262 STATIC void 1263 xfs_btree_buf_to_ptr( 1264 struct xfs_btree_cur *cur, 1265 struct xfs_buf *bp, 1266 union xfs_btree_ptr *ptr) 1267 { 1268 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 1269 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, 1270 xfs_buf_daddr(bp))); 1271 else { 1272 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, 1273 xfs_buf_daddr(bp))); 1274 } 1275 } 1276 1277 STATIC void 1278 xfs_btree_set_refs( 1279 struct xfs_btree_cur *cur, 1280 struct xfs_buf *bp) 1281 { 1282 switch (cur->bc_btnum) { 1283 case XFS_BTNUM_BNO: 1284 case XFS_BTNUM_CNT: 1285 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); 1286 break; 1287 case XFS_BTNUM_INO: 1288 case XFS_BTNUM_FINO: 1289 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); 1290 break; 1291 case XFS_BTNUM_BMAP: 1292 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); 1293 break; 1294 case XFS_BTNUM_RMAP: 1295 xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); 1296 break; 1297 case XFS_BTNUM_REFC: 1298 xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); 1299 break; 1300 default: 1301 ASSERT(0); 1302 } 1303 } 1304 1305 int 1306 xfs_btree_get_buf_block( 1307 struct xfs_btree_cur *cur, 1308 const union xfs_btree_ptr *ptr, 1309 struct xfs_btree_block **block, 1310 struct xfs_buf **bpp) 1311 { 1312 struct xfs_mount *mp = cur->bc_mp; 1313 xfs_daddr_t d; 1314 int error; 1315 1316 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1317 if (error) 1318 return error; 1319 error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, 1320 0, bpp); 1321 if (error) 1322 return error; 1323 1324 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1325 *block = XFS_BUF_TO_BLOCK(*bpp); 1326 return 0; 1327 } 1328 1329 /* 1330 * Read in the buffer at the given ptr and return the buffer and 1331 * the block pointer within the buffer. 1332 */ 1333 STATIC int 1334 xfs_btree_read_buf_block( 1335 struct xfs_btree_cur *cur, 1336 const union xfs_btree_ptr *ptr, 1337 int flags, 1338 struct xfs_btree_block **block, 1339 struct xfs_buf **bpp) 1340 { 1341 struct xfs_mount *mp = cur->bc_mp; 1342 xfs_daddr_t d; 1343 int error; 1344 1345 /* need to sort out how callers deal with failures first */ 1346 ASSERT(!(flags & XBF_TRYLOCK)); 1347 1348 error = xfs_btree_ptr_to_daddr(cur, ptr, &d); 1349 if (error) 1350 return error; 1351 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1352 mp->m_bsize, flags, bpp, 1353 cur->bc_ops->buf_ops); 1354 if (error) 1355 return error; 1356 1357 xfs_btree_set_refs(cur, *bpp); 1358 *block = XFS_BUF_TO_BLOCK(*bpp); 1359 return 0; 1360 } 1361 1362 /* 1363 * Copy keys from one btree block to another. 1364 */ 1365 void 1366 xfs_btree_copy_keys( 1367 struct xfs_btree_cur *cur, 1368 union xfs_btree_key *dst_key, 1369 const union xfs_btree_key *src_key, 1370 int numkeys) 1371 { 1372 ASSERT(numkeys >= 0); 1373 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); 1374 } 1375 1376 /* 1377 * Copy records from one btree block to another. 1378 */ 1379 STATIC void 1380 xfs_btree_copy_recs( 1381 struct xfs_btree_cur *cur, 1382 union xfs_btree_rec *dst_rec, 1383 union xfs_btree_rec *src_rec, 1384 int numrecs) 1385 { 1386 ASSERT(numrecs >= 0); 1387 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); 1388 } 1389 1390 /* 1391 * Copy block pointers from one btree block to another. 1392 */ 1393 void 1394 xfs_btree_copy_ptrs( 1395 struct xfs_btree_cur *cur, 1396 union xfs_btree_ptr *dst_ptr, 1397 const union xfs_btree_ptr *src_ptr, 1398 int numptrs) 1399 { 1400 ASSERT(numptrs >= 0); 1401 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); 1402 } 1403 1404 /* 1405 * Shift keys one index left/right inside a single btree block. 1406 */ 1407 STATIC void 1408 xfs_btree_shift_keys( 1409 struct xfs_btree_cur *cur, 1410 union xfs_btree_key *key, 1411 int dir, 1412 int numkeys) 1413 { 1414 char *dst_key; 1415 1416 ASSERT(numkeys >= 0); 1417 ASSERT(dir == 1 || dir == -1); 1418 1419 dst_key = (char *)key + (dir * cur->bc_ops->key_len); 1420 memmove(dst_key, key, numkeys * cur->bc_ops->key_len); 1421 } 1422 1423 /* 1424 * Shift records one index left/right inside a single btree block. 1425 */ 1426 STATIC void 1427 xfs_btree_shift_recs( 1428 struct xfs_btree_cur *cur, 1429 union xfs_btree_rec *rec, 1430 int dir, 1431 int numrecs) 1432 { 1433 char *dst_rec; 1434 1435 ASSERT(numrecs >= 0); 1436 ASSERT(dir == 1 || dir == -1); 1437 1438 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); 1439 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); 1440 } 1441 1442 /* 1443 * Shift block pointers one index left/right inside a single btree block. 1444 */ 1445 STATIC void 1446 xfs_btree_shift_ptrs( 1447 struct xfs_btree_cur *cur, 1448 union xfs_btree_ptr *ptr, 1449 int dir, 1450 int numptrs) 1451 { 1452 char *dst_ptr; 1453 1454 ASSERT(numptrs >= 0); 1455 ASSERT(dir == 1 || dir == -1); 1456 1457 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); 1458 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); 1459 } 1460 1461 /* 1462 * Log key values from the btree block. 1463 */ 1464 STATIC void 1465 xfs_btree_log_keys( 1466 struct xfs_btree_cur *cur, 1467 struct xfs_buf *bp, 1468 int first, 1469 int last) 1470 { 1471 1472 if (bp) { 1473 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1474 xfs_trans_log_buf(cur->bc_tp, bp, 1475 xfs_btree_key_offset(cur, first), 1476 xfs_btree_key_offset(cur, last + 1) - 1); 1477 } else { 1478 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1479 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1480 } 1481 } 1482 1483 /* 1484 * Log record values from the btree block. 1485 */ 1486 void 1487 xfs_btree_log_recs( 1488 struct xfs_btree_cur *cur, 1489 struct xfs_buf *bp, 1490 int first, 1491 int last) 1492 { 1493 1494 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1495 xfs_trans_log_buf(cur->bc_tp, bp, 1496 xfs_btree_rec_offset(cur, first), 1497 xfs_btree_rec_offset(cur, last + 1) - 1); 1498 1499 } 1500 1501 /* 1502 * Log block pointer fields from a btree block (nonleaf). 1503 */ 1504 STATIC void 1505 xfs_btree_log_ptrs( 1506 struct xfs_btree_cur *cur, /* btree cursor */ 1507 struct xfs_buf *bp, /* buffer containing btree block */ 1508 int first, /* index of first pointer to log */ 1509 int last) /* index of last pointer to log */ 1510 { 1511 1512 if (bp) { 1513 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 1514 int level = xfs_btree_get_level(block); 1515 1516 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1517 xfs_trans_log_buf(cur->bc_tp, bp, 1518 xfs_btree_ptr_offset(cur, first, level), 1519 xfs_btree_ptr_offset(cur, last + 1, level) - 1); 1520 } else { 1521 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1522 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1523 } 1524 1525 } 1526 1527 /* 1528 * Log fields from a btree block header. 1529 */ 1530 void 1531 xfs_btree_log_block( 1532 struct xfs_btree_cur *cur, /* btree cursor */ 1533 struct xfs_buf *bp, /* buffer containing btree block */ 1534 uint32_t fields) /* mask of fields: XFS_BB_... */ 1535 { 1536 int first; /* first byte offset logged */ 1537 int last; /* last byte offset logged */ 1538 static const short soffsets[] = { /* table of offsets (short) */ 1539 offsetof(struct xfs_btree_block, bb_magic), 1540 offsetof(struct xfs_btree_block, bb_level), 1541 offsetof(struct xfs_btree_block, bb_numrecs), 1542 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), 1543 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), 1544 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), 1545 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), 1546 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), 1547 offsetof(struct xfs_btree_block, bb_u.s.bb_owner), 1548 offsetof(struct xfs_btree_block, bb_u.s.bb_crc), 1549 XFS_BTREE_SBLOCK_CRC_LEN 1550 }; 1551 static const short loffsets[] = { /* table of offsets (long) */ 1552 offsetof(struct xfs_btree_block, bb_magic), 1553 offsetof(struct xfs_btree_block, bb_level), 1554 offsetof(struct xfs_btree_block, bb_numrecs), 1555 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), 1556 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), 1557 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), 1558 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), 1559 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), 1560 offsetof(struct xfs_btree_block, bb_u.l.bb_owner), 1561 offsetof(struct xfs_btree_block, bb_u.l.bb_crc), 1562 offsetof(struct xfs_btree_block, bb_u.l.bb_pad), 1563 XFS_BTREE_LBLOCK_CRC_LEN 1564 }; 1565 1566 if (bp) { 1567 int nbits; 1568 1569 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 1570 /* 1571 * We don't log the CRC when updating a btree 1572 * block but instead recreate it during log 1573 * recovery. As the log buffers have checksums 1574 * of their own this is safe and avoids logging a crc 1575 * update in a lot of places. 1576 */ 1577 if (fields == XFS_BB_ALL_BITS) 1578 fields = XFS_BB_ALL_BITS_CRC; 1579 nbits = XFS_BB_NUM_BITS_CRC; 1580 } else { 1581 nbits = XFS_BB_NUM_BITS; 1582 } 1583 xfs_btree_offsets(fields, 1584 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 1585 loffsets : soffsets, 1586 nbits, &first, &last); 1587 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); 1588 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 1589 } else { 1590 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, 1591 xfs_ilog_fbroot(cur->bc_ino.whichfork)); 1592 } 1593 } 1594 1595 /* 1596 * Increment cursor by one record at the level. 1597 * For nonzero levels the leaf-ward information is untouched. 1598 */ 1599 int /* error */ 1600 xfs_btree_increment( 1601 struct xfs_btree_cur *cur, 1602 int level, 1603 int *stat) /* success/failure */ 1604 { 1605 struct xfs_btree_block *block; 1606 union xfs_btree_ptr ptr; 1607 struct xfs_buf *bp; 1608 int error; /* error return value */ 1609 int lev; 1610 1611 ASSERT(level < cur->bc_nlevels); 1612 1613 /* Read-ahead to the right at this level. */ 1614 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 1615 1616 /* Get a pointer to the btree block. */ 1617 block = xfs_btree_get_block(cur, level, &bp); 1618 1619 #ifdef DEBUG 1620 error = xfs_btree_check_block(cur, block, level, bp); 1621 if (error) 1622 goto error0; 1623 #endif 1624 1625 /* We're done if we remain in the block after the increment. */ 1626 if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) 1627 goto out1; 1628 1629 /* Fail if we just went off the right edge of the tree. */ 1630 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 1631 if (xfs_btree_ptr_is_null(cur, &ptr)) 1632 goto out0; 1633 1634 XFS_BTREE_STATS_INC(cur, increment); 1635 1636 /* 1637 * March up the tree incrementing pointers. 1638 * Stop when we don't go off the right edge of a block. 1639 */ 1640 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1641 block = xfs_btree_get_block(cur, lev, &bp); 1642 1643 #ifdef DEBUG 1644 error = xfs_btree_check_block(cur, block, lev, bp); 1645 if (error) 1646 goto error0; 1647 #endif 1648 1649 if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) 1650 break; 1651 1652 /* Read-ahead the right block for the next loop. */ 1653 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); 1654 } 1655 1656 /* 1657 * If we went off the root then we are either seriously 1658 * confused or have the tree root in an inode. 1659 */ 1660 if (lev == cur->bc_nlevels) { 1661 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1662 goto out0; 1663 ASSERT(0); 1664 error = -EFSCORRUPTED; 1665 goto error0; 1666 } 1667 ASSERT(lev < cur->bc_nlevels); 1668 1669 /* 1670 * Now walk back down the tree, fixing up the cursor's buffer 1671 * pointers and key numbers. 1672 */ 1673 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1674 union xfs_btree_ptr *ptrp; 1675 1676 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1677 --lev; 1678 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1679 if (error) 1680 goto error0; 1681 1682 xfs_btree_setbuf(cur, lev, bp); 1683 cur->bc_levels[lev].ptr = 1; 1684 } 1685 out1: 1686 *stat = 1; 1687 return 0; 1688 1689 out0: 1690 *stat = 0; 1691 return 0; 1692 1693 error0: 1694 return error; 1695 } 1696 1697 /* 1698 * Decrement cursor by one record at the level. 1699 * For nonzero levels the leaf-ward information is untouched. 1700 */ 1701 int /* error */ 1702 xfs_btree_decrement( 1703 struct xfs_btree_cur *cur, 1704 int level, 1705 int *stat) /* success/failure */ 1706 { 1707 struct xfs_btree_block *block; 1708 struct xfs_buf *bp; 1709 int error; /* error return value */ 1710 int lev; 1711 union xfs_btree_ptr ptr; 1712 1713 ASSERT(level < cur->bc_nlevels); 1714 1715 /* Read-ahead to the left at this level. */ 1716 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); 1717 1718 /* We're done if we remain in the block after the decrement. */ 1719 if (--cur->bc_levels[level].ptr > 0) 1720 goto out1; 1721 1722 /* Get a pointer to the btree block. */ 1723 block = xfs_btree_get_block(cur, level, &bp); 1724 1725 #ifdef DEBUG 1726 error = xfs_btree_check_block(cur, block, level, bp); 1727 if (error) 1728 goto error0; 1729 #endif 1730 1731 /* Fail if we just went off the left edge of the tree. */ 1732 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 1733 if (xfs_btree_ptr_is_null(cur, &ptr)) 1734 goto out0; 1735 1736 XFS_BTREE_STATS_INC(cur, decrement); 1737 1738 /* 1739 * March up the tree decrementing pointers. 1740 * Stop when we don't go off the left edge of a block. 1741 */ 1742 for (lev = level + 1; lev < cur->bc_nlevels; lev++) { 1743 if (--cur->bc_levels[lev].ptr > 0) 1744 break; 1745 /* Read-ahead the left block for the next loop. */ 1746 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); 1747 } 1748 1749 /* 1750 * If we went off the root then we are seriously confused. 1751 * or the root of the tree is in an inode. 1752 */ 1753 if (lev == cur->bc_nlevels) { 1754 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1755 goto out0; 1756 ASSERT(0); 1757 error = -EFSCORRUPTED; 1758 goto error0; 1759 } 1760 ASSERT(lev < cur->bc_nlevels); 1761 1762 /* 1763 * Now walk back down the tree, fixing up the cursor's buffer 1764 * pointers and key numbers. 1765 */ 1766 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { 1767 union xfs_btree_ptr *ptrp; 1768 1769 ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); 1770 --lev; 1771 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); 1772 if (error) 1773 goto error0; 1774 xfs_btree_setbuf(cur, lev, bp); 1775 cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); 1776 } 1777 out1: 1778 *stat = 1; 1779 return 0; 1780 1781 out0: 1782 *stat = 0; 1783 return 0; 1784 1785 error0: 1786 return error; 1787 } 1788 1789 int 1790 xfs_btree_lookup_get_block( 1791 struct xfs_btree_cur *cur, /* btree cursor */ 1792 int level, /* level in the btree */ 1793 const union xfs_btree_ptr *pp, /* ptr to btree block */ 1794 struct xfs_btree_block **blkp) /* return btree block */ 1795 { 1796 struct xfs_buf *bp; /* buffer pointer for btree block */ 1797 xfs_daddr_t daddr; 1798 int error = 0; 1799 1800 /* special case the root block if in an inode */ 1801 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 1802 (level == cur->bc_nlevels - 1)) { 1803 *blkp = xfs_btree_get_iroot(cur); 1804 return 0; 1805 } 1806 1807 /* 1808 * If the old buffer at this level for the disk address we are 1809 * looking for re-use it. 1810 * 1811 * Otherwise throw it away and get a new one. 1812 */ 1813 bp = cur->bc_levels[level].bp; 1814 error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); 1815 if (error) 1816 return error; 1817 if (bp && xfs_buf_daddr(bp) == daddr) { 1818 *blkp = XFS_BUF_TO_BLOCK(bp); 1819 return 0; 1820 } 1821 1822 error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); 1823 if (error) 1824 return error; 1825 1826 /* Check the inode owner since the verifiers don't. */ 1827 if (xfs_has_crc(cur->bc_mp) && 1828 !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && 1829 (cur->bc_flags & XFS_BTREE_LONG_PTRS) && 1830 be64_to_cpu((*blkp)->bb_u.l.bb_owner) != 1831 cur->bc_ino.ip->i_ino) 1832 goto out_bad; 1833 1834 /* Did we get the level we were looking for? */ 1835 if (be16_to_cpu((*blkp)->bb_level) != level) 1836 goto out_bad; 1837 1838 /* Check that internal nodes have at least one record. */ 1839 if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) 1840 goto out_bad; 1841 1842 xfs_btree_setbuf(cur, level, bp); 1843 return 0; 1844 1845 out_bad: 1846 *blkp = NULL; 1847 xfs_buf_mark_corrupt(bp); 1848 xfs_trans_brelse(cur->bc_tp, bp); 1849 return -EFSCORRUPTED; 1850 } 1851 1852 /* 1853 * Get current search key. For level 0 we don't actually have a key 1854 * structure so we make one up from the record. For all other levels 1855 * we just return the right key. 1856 */ 1857 STATIC union xfs_btree_key * 1858 xfs_lookup_get_search_key( 1859 struct xfs_btree_cur *cur, 1860 int level, 1861 int keyno, 1862 struct xfs_btree_block *block, 1863 union xfs_btree_key *kp) 1864 { 1865 if (level == 0) { 1866 cur->bc_ops->init_key_from_rec(kp, 1867 xfs_btree_rec_addr(cur, keyno, block)); 1868 return kp; 1869 } 1870 1871 return xfs_btree_key_addr(cur, keyno, block); 1872 } 1873 1874 /* 1875 * Lookup the record. The cursor is made to point to it, based on dir. 1876 * stat is set to 0 if can't find any such record, 1 for success. 1877 */ 1878 int /* error */ 1879 xfs_btree_lookup( 1880 struct xfs_btree_cur *cur, /* btree cursor */ 1881 xfs_lookup_t dir, /* <=, ==, or >= */ 1882 int *stat) /* success/failure */ 1883 { 1884 struct xfs_btree_block *block; /* current btree block */ 1885 int64_t diff; /* difference for the current key */ 1886 int error; /* error return value */ 1887 int keyno; /* current key number */ 1888 int level; /* level in the btree */ 1889 union xfs_btree_ptr *pp; /* ptr to btree block */ 1890 union xfs_btree_ptr ptr; /* ptr to btree block */ 1891 1892 XFS_BTREE_STATS_INC(cur, lookup); 1893 1894 /* No such thing as a zero-level tree. */ 1895 if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) 1896 return -EFSCORRUPTED; 1897 1898 block = NULL; 1899 keyno = 0; 1900 1901 /* initialise start pointer from cursor */ 1902 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 1903 pp = &ptr; 1904 1905 /* 1906 * Iterate over each level in the btree, starting at the root. 1907 * For each level above the leaves, find the key we need, based 1908 * on the lookup record, then follow the corresponding block 1909 * pointer down to the next level. 1910 */ 1911 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { 1912 /* Get the block we need to do the lookup on. */ 1913 error = xfs_btree_lookup_get_block(cur, level, pp, &block); 1914 if (error) 1915 goto error0; 1916 1917 if (diff == 0) { 1918 /* 1919 * If we already had a key match at a higher level, we 1920 * know we need to use the first entry in this block. 1921 */ 1922 keyno = 1; 1923 } else { 1924 /* Otherwise search this block. Do a binary search. */ 1925 1926 int high; /* high entry number */ 1927 int low; /* low entry number */ 1928 1929 /* Set low and high entry numbers, 1-based. */ 1930 low = 1; 1931 high = xfs_btree_get_numrecs(block); 1932 if (!high) { 1933 /* Block is empty, must be an empty leaf. */ 1934 if (level != 0 || cur->bc_nlevels != 1) { 1935 XFS_CORRUPTION_ERROR(__func__, 1936 XFS_ERRLEVEL_LOW, 1937 cur->bc_mp, block, 1938 sizeof(*block)); 1939 return -EFSCORRUPTED; 1940 } 1941 1942 cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; 1943 *stat = 0; 1944 return 0; 1945 } 1946 1947 /* Binary search the block. */ 1948 while (low <= high) { 1949 union xfs_btree_key key; 1950 union xfs_btree_key *kp; 1951 1952 XFS_BTREE_STATS_INC(cur, compare); 1953 1954 /* keyno is average of low and high. */ 1955 keyno = (low + high) >> 1; 1956 1957 /* Get current search key */ 1958 kp = xfs_lookup_get_search_key(cur, level, 1959 keyno, block, &key); 1960 1961 /* 1962 * Compute difference to get next direction: 1963 * - less than, move right 1964 * - greater than, move left 1965 * - equal, we're done 1966 */ 1967 diff = cur->bc_ops->key_diff(cur, kp); 1968 if (diff < 0) 1969 low = keyno + 1; 1970 else if (diff > 0) 1971 high = keyno - 1; 1972 else 1973 break; 1974 } 1975 } 1976 1977 /* 1978 * If there are more levels, set up for the next level 1979 * by getting the block number and filling in the cursor. 1980 */ 1981 if (level > 0) { 1982 /* 1983 * If we moved left, need the previous key number, 1984 * unless there isn't one. 1985 */ 1986 if (diff > 0 && --keyno < 1) 1987 keyno = 1; 1988 pp = xfs_btree_ptr_addr(cur, keyno, block); 1989 1990 error = xfs_btree_debug_check_ptr(cur, pp, 0, level); 1991 if (error) 1992 goto error0; 1993 1994 cur->bc_levels[level].ptr = keyno; 1995 } 1996 } 1997 1998 /* Done with the search. See if we need to adjust the results. */ 1999 if (dir != XFS_LOOKUP_LE && diff < 0) { 2000 keyno++; 2001 /* 2002 * If ge search and we went off the end of the block, but it's 2003 * not the last block, we're in the wrong block. 2004 */ 2005 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 2006 if (dir == XFS_LOOKUP_GE && 2007 keyno > xfs_btree_get_numrecs(block) && 2008 !xfs_btree_ptr_is_null(cur, &ptr)) { 2009 int i; 2010 2011 cur->bc_levels[0].ptr = keyno; 2012 error = xfs_btree_increment(cur, 0, &i); 2013 if (error) 2014 goto error0; 2015 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) 2016 return -EFSCORRUPTED; 2017 *stat = 1; 2018 return 0; 2019 } 2020 } else if (dir == XFS_LOOKUP_LE && diff > 0) 2021 keyno--; 2022 cur->bc_levels[0].ptr = keyno; 2023 2024 /* Return if we succeeded or not. */ 2025 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) 2026 *stat = 0; 2027 else if (dir != XFS_LOOKUP_EQ || diff == 0) 2028 *stat = 1; 2029 else 2030 *stat = 0; 2031 return 0; 2032 2033 error0: 2034 return error; 2035 } 2036 2037 /* Find the high key storage area from a regular key. */ 2038 union xfs_btree_key * 2039 xfs_btree_high_key_from_key( 2040 struct xfs_btree_cur *cur, 2041 union xfs_btree_key *key) 2042 { 2043 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2044 return (union xfs_btree_key *)((char *)key + 2045 (cur->bc_ops->key_len / 2)); 2046 } 2047 2048 /* Determine the low (and high if overlapped) keys of a leaf block */ 2049 STATIC void 2050 xfs_btree_get_leaf_keys( 2051 struct xfs_btree_cur *cur, 2052 struct xfs_btree_block *block, 2053 union xfs_btree_key *key) 2054 { 2055 union xfs_btree_key max_hkey; 2056 union xfs_btree_key hkey; 2057 union xfs_btree_rec *rec; 2058 union xfs_btree_key *high; 2059 int n; 2060 2061 rec = xfs_btree_rec_addr(cur, 1, block); 2062 cur->bc_ops->init_key_from_rec(key, rec); 2063 2064 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2065 2066 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); 2067 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2068 rec = xfs_btree_rec_addr(cur, n, block); 2069 cur->bc_ops->init_high_key_from_rec(&hkey, rec); 2070 if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) 2071 > 0) 2072 max_hkey = hkey; 2073 } 2074 2075 high = xfs_btree_high_key_from_key(cur, key); 2076 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); 2077 } 2078 } 2079 2080 /* Determine the low (and high if overlapped) keys of a node block */ 2081 STATIC void 2082 xfs_btree_get_node_keys( 2083 struct xfs_btree_cur *cur, 2084 struct xfs_btree_block *block, 2085 union xfs_btree_key *key) 2086 { 2087 union xfs_btree_key *hkey; 2088 union xfs_btree_key *max_hkey; 2089 union xfs_btree_key *high; 2090 int n; 2091 2092 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2093 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2094 cur->bc_ops->key_len / 2); 2095 2096 max_hkey = xfs_btree_high_key_addr(cur, 1, block); 2097 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { 2098 hkey = xfs_btree_high_key_addr(cur, n, block); 2099 if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) 2100 max_hkey = hkey; 2101 } 2102 2103 high = xfs_btree_high_key_from_key(cur, key); 2104 memcpy(high, max_hkey, cur->bc_ops->key_len / 2); 2105 } else { 2106 memcpy(key, xfs_btree_key_addr(cur, 1, block), 2107 cur->bc_ops->key_len); 2108 } 2109 } 2110 2111 /* Derive the keys for any btree block. */ 2112 void 2113 xfs_btree_get_keys( 2114 struct xfs_btree_cur *cur, 2115 struct xfs_btree_block *block, 2116 union xfs_btree_key *key) 2117 { 2118 if (be16_to_cpu(block->bb_level) == 0) 2119 xfs_btree_get_leaf_keys(cur, block, key); 2120 else 2121 xfs_btree_get_node_keys(cur, block, key); 2122 } 2123 2124 /* 2125 * Decide if we need to update the parent keys of a btree block. For 2126 * a standard btree this is only necessary if we're updating the first 2127 * record/key. For an overlapping btree, we must always update the 2128 * keys because the highest key can be in any of the records or keys 2129 * in the block. 2130 */ 2131 static inline bool 2132 xfs_btree_needs_key_update( 2133 struct xfs_btree_cur *cur, 2134 int ptr) 2135 { 2136 return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; 2137 } 2138 2139 /* 2140 * Update the low and high parent keys of the given level, progressing 2141 * towards the root. If force_all is false, stop if the keys for a given 2142 * level do not need updating. 2143 */ 2144 STATIC int 2145 __xfs_btree_updkeys( 2146 struct xfs_btree_cur *cur, 2147 int level, 2148 struct xfs_btree_block *block, 2149 struct xfs_buf *bp0, 2150 bool force_all) 2151 { 2152 union xfs_btree_key key; /* keys from current level */ 2153 union xfs_btree_key *lkey; /* keys from the next level up */ 2154 union xfs_btree_key *hkey; 2155 union xfs_btree_key *nlkey; /* keys from the next level up */ 2156 union xfs_btree_key *nhkey; 2157 struct xfs_buf *bp; 2158 int ptr; 2159 2160 ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); 2161 2162 /* Exit if there aren't any parent levels to update. */ 2163 if (level + 1 >= cur->bc_nlevels) 2164 return 0; 2165 2166 trace_xfs_btree_updkeys(cur, level, bp0); 2167 2168 lkey = &key; 2169 hkey = xfs_btree_high_key_from_key(cur, lkey); 2170 xfs_btree_get_keys(cur, block, lkey); 2171 for (level++; level < cur->bc_nlevels; level++) { 2172 #ifdef DEBUG 2173 int error; 2174 #endif 2175 block = xfs_btree_get_block(cur, level, &bp); 2176 trace_xfs_btree_updkeys(cur, level, bp); 2177 #ifdef DEBUG 2178 error = xfs_btree_check_block(cur, block, level, bp); 2179 if (error) 2180 return error; 2181 #endif 2182 ptr = cur->bc_levels[level].ptr; 2183 nlkey = xfs_btree_key_addr(cur, ptr, block); 2184 nhkey = xfs_btree_high_key_addr(cur, ptr, block); 2185 if (!force_all && 2186 !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || 2187 cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) 2188 break; 2189 xfs_btree_copy_keys(cur, nlkey, lkey, 1); 2190 xfs_btree_log_keys(cur, bp, ptr, ptr); 2191 if (level + 1 >= cur->bc_nlevels) 2192 break; 2193 xfs_btree_get_node_keys(cur, block, lkey); 2194 } 2195 2196 return 0; 2197 } 2198 2199 /* Update all the keys from some level in cursor back to the root. */ 2200 STATIC int 2201 xfs_btree_updkeys_force( 2202 struct xfs_btree_cur *cur, 2203 int level) 2204 { 2205 struct xfs_buf *bp; 2206 struct xfs_btree_block *block; 2207 2208 block = xfs_btree_get_block(cur, level, &bp); 2209 return __xfs_btree_updkeys(cur, level, block, bp, true); 2210 } 2211 2212 /* 2213 * Update the parent keys of the given level, progressing towards the root. 2214 */ 2215 STATIC int 2216 xfs_btree_update_keys( 2217 struct xfs_btree_cur *cur, 2218 int level) 2219 { 2220 struct xfs_btree_block *block; 2221 struct xfs_buf *bp; 2222 union xfs_btree_key *kp; 2223 union xfs_btree_key key; 2224 int ptr; 2225 2226 ASSERT(level >= 0); 2227 2228 block = xfs_btree_get_block(cur, level, &bp); 2229 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) 2230 return __xfs_btree_updkeys(cur, level, block, bp, false); 2231 2232 /* 2233 * Go up the tree from this level toward the root. 2234 * At each level, update the key value to the value input. 2235 * Stop when we reach a level where the cursor isn't pointing 2236 * at the first entry in the block. 2237 */ 2238 xfs_btree_get_keys(cur, block, &key); 2239 for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { 2240 #ifdef DEBUG 2241 int error; 2242 #endif 2243 block = xfs_btree_get_block(cur, level, &bp); 2244 #ifdef DEBUG 2245 error = xfs_btree_check_block(cur, block, level, bp); 2246 if (error) 2247 return error; 2248 #endif 2249 ptr = cur->bc_levels[level].ptr; 2250 kp = xfs_btree_key_addr(cur, ptr, block); 2251 xfs_btree_copy_keys(cur, kp, &key, 1); 2252 xfs_btree_log_keys(cur, bp, ptr, ptr); 2253 } 2254 2255 return 0; 2256 } 2257 2258 /* 2259 * Update the record referred to by cur to the value in the 2260 * given record. This either works (return 0) or gets an 2261 * EFSCORRUPTED error. 2262 */ 2263 int 2264 xfs_btree_update( 2265 struct xfs_btree_cur *cur, 2266 union xfs_btree_rec *rec) 2267 { 2268 struct xfs_btree_block *block; 2269 struct xfs_buf *bp; 2270 int error; 2271 int ptr; 2272 union xfs_btree_rec *rp; 2273 2274 /* Pick up the current block. */ 2275 block = xfs_btree_get_block(cur, 0, &bp); 2276 2277 #ifdef DEBUG 2278 error = xfs_btree_check_block(cur, block, 0, bp); 2279 if (error) 2280 goto error0; 2281 #endif 2282 /* Get the address of the rec to be updated. */ 2283 ptr = cur->bc_levels[0].ptr; 2284 rp = xfs_btree_rec_addr(cur, ptr, block); 2285 2286 /* Fill in the new contents and log them. */ 2287 xfs_btree_copy_recs(cur, rp, rec, 1); 2288 xfs_btree_log_recs(cur, bp, ptr, ptr); 2289 2290 /* 2291 * If we are tracking the last record in the tree and 2292 * we are at the far right edge of the tree, update it. 2293 */ 2294 if (xfs_btree_is_lastrec(cur, block, 0)) { 2295 cur->bc_ops->update_lastrec(cur, block, rec, 2296 ptr, LASTREC_UPDATE); 2297 } 2298 2299 /* Pass new key value up to our parent. */ 2300 if (xfs_btree_needs_key_update(cur, ptr)) { 2301 error = xfs_btree_update_keys(cur, 0); 2302 if (error) 2303 goto error0; 2304 } 2305 2306 return 0; 2307 2308 error0: 2309 return error; 2310 } 2311 2312 /* 2313 * Move 1 record left from cur/level if possible. 2314 * Update cur to reflect the new path. 2315 */ 2316 STATIC int /* error */ 2317 xfs_btree_lshift( 2318 struct xfs_btree_cur *cur, 2319 int level, 2320 int *stat) /* success/failure */ 2321 { 2322 struct xfs_buf *lbp; /* left buffer pointer */ 2323 struct xfs_btree_block *left; /* left btree block */ 2324 int lrecs; /* left record count */ 2325 struct xfs_buf *rbp; /* right buffer pointer */ 2326 struct xfs_btree_block *right; /* right btree block */ 2327 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2328 int rrecs; /* right record count */ 2329 union xfs_btree_ptr lptr; /* left btree pointer */ 2330 union xfs_btree_key *rkp = NULL; /* right btree key */ 2331 union xfs_btree_ptr *rpp = NULL; /* right address pointer */ 2332 union xfs_btree_rec *rrp = NULL; /* right record pointer */ 2333 int error; /* error return value */ 2334 int i; 2335 2336 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 2337 level == cur->bc_nlevels - 1) 2338 goto out0; 2339 2340 /* Set up variables for this block as "right". */ 2341 right = xfs_btree_get_block(cur, level, &rbp); 2342 2343 #ifdef DEBUG 2344 error = xfs_btree_check_block(cur, right, level, rbp); 2345 if (error) 2346 goto error0; 2347 #endif 2348 2349 /* If we've got no left sibling then we can't shift an entry left. */ 2350 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2351 if (xfs_btree_ptr_is_null(cur, &lptr)) 2352 goto out0; 2353 2354 /* 2355 * If the cursor entry is the one that would be moved, don't 2356 * do it... it's too complicated. 2357 */ 2358 if (cur->bc_levels[level].ptr <= 1) 2359 goto out0; 2360 2361 /* Set up the left neighbor as "left". */ 2362 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 2363 if (error) 2364 goto error0; 2365 2366 /* If it's full, it can't take another entry. */ 2367 lrecs = xfs_btree_get_numrecs(left); 2368 if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) 2369 goto out0; 2370 2371 rrecs = xfs_btree_get_numrecs(right); 2372 2373 /* 2374 * We add one entry to the left side and remove one for the right side. 2375 * Account for it here, the changes will be updated on disk and logged 2376 * later. 2377 */ 2378 lrecs++; 2379 rrecs--; 2380 2381 XFS_BTREE_STATS_INC(cur, lshift); 2382 XFS_BTREE_STATS_ADD(cur, moves, 1); 2383 2384 /* 2385 * If non-leaf, copy a key and a ptr to the left block. 2386 * Log the changes to the left block. 2387 */ 2388 if (level > 0) { 2389 /* It's a non-leaf. Move keys and pointers. */ 2390 union xfs_btree_key *lkp; /* left btree key */ 2391 union xfs_btree_ptr *lpp; /* left address pointer */ 2392 2393 lkp = xfs_btree_key_addr(cur, lrecs, left); 2394 rkp = xfs_btree_key_addr(cur, 1, right); 2395 2396 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2397 rpp = xfs_btree_ptr_addr(cur, 1, right); 2398 2399 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); 2400 if (error) 2401 goto error0; 2402 2403 xfs_btree_copy_keys(cur, lkp, rkp, 1); 2404 xfs_btree_copy_ptrs(cur, lpp, rpp, 1); 2405 2406 xfs_btree_log_keys(cur, lbp, lrecs, lrecs); 2407 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); 2408 2409 ASSERT(cur->bc_ops->keys_inorder(cur, 2410 xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); 2411 } else { 2412 /* It's a leaf. Move records. */ 2413 union xfs_btree_rec *lrp; /* left record pointer */ 2414 2415 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2416 rrp = xfs_btree_rec_addr(cur, 1, right); 2417 2418 xfs_btree_copy_recs(cur, lrp, rrp, 1); 2419 xfs_btree_log_recs(cur, lbp, lrecs, lrecs); 2420 2421 ASSERT(cur->bc_ops->recs_inorder(cur, 2422 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); 2423 } 2424 2425 xfs_btree_set_numrecs(left, lrecs); 2426 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2427 2428 xfs_btree_set_numrecs(right, rrecs); 2429 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2430 2431 /* 2432 * Slide the contents of right down one entry. 2433 */ 2434 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); 2435 if (level > 0) { 2436 /* It's a nonleaf. operate on keys and ptrs */ 2437 for (i = 0; i < rrecs; i++) { 2438 error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); 2439 if (error) 2440 goto error0; 2441 } 2442 2443 xfs_btree_shift_keys(cur, 2444 xfs_btree_key_addr(cur, 2, right), 2445 -1, rrecs); 2446 xfs_btree_shift_ptrs(cur, 2447 xfs_btree_ptr_addr(cur, 2, right), 2448 -1, rrecs); 2449 2450 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2451 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2452 } else { 2453 /* It's a leaf. operate on records */ 2454 xfs_btree_shift_recs(cur, 2455 xfs_btree_rec_addr(cur, 2, right), 2456 -1, rrecs); 2457 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2458 } 2459 2460 /* 2461 * Using a temporary cursor, update the parent key values of the 2462 * block on the left. 2463 */ 2464 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2465 error = xfs_btree_dup_cursor(cur, &tcur); 2466 if (error) 2467 goto error0; 2468 i = xfs_btree_firstrec(tcur, level); 2469 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2470 error = -EFSCORRUPTED; 2471 goto error0; 2472 } 2473 2474 error = xfs_btree_decrement(tcur, level, &i); 2475 if (error) 2476 goto error1; 2477 2478 /* Update the parent high keys of the left block, if needed. */ 2479 error = xfs_btree_update_keys(tcur, level); 2480 if (error) 2481 goto error1; 2482 2483 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2484 } 2485 2486 /* Update the parent keys of the right block. */ 2487 error = xfs_btree_update_keys(cur, level); 2488 if (error) 2489 goto error0; 2490 2491 /* Slide the cursor value left one. */ 2492 cur->bc_levels[level].ptr--; 2493 2494 *stat = 1; 2495 return 0; 2496 2497 out0: 2498 *stat = 0; 2499 return 0; 2500 2501 error0: 2502 return error; 2503 2504 error1: 2505 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2506 return error; 2507 } 2508 2509 /* 2510 * Move 1 record right from cur/level if possible. 2511 * Update cur to reflect the new path. 2512 */ 2513 STATIC int /* error */ 2514 xfs_btree_rshift( 2515 struct xfs_btree_cur *cur, 2516 int level, 2517 int *stat) /* success/failure */ 2518 { 2519 struct xfs_buf *lbp; /* left buffer pointer */ 2520 struct xfs_btree_block *left; /* left btree block */ 2521 struct xfs_buf *rbp; /* right buffer pointer */ 2522 struct xfs_btree_block *right; /* right btree block */ 2523 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 2524 union xfs_btree_ptr rptr; /* right block pointer */ 2525 union xfs_btree_key *rkp; /* right btree key */ 2526 int rrecs; /* right record count */ 2527 int lrecs; /* left record count */ 2528 int error; /* error return value */ 2529 int i; /* loop counter */ 2530 2531 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 2532 (level == cur->bc_nlevels - 1)) 2533 goto out0; 2534 2535 /* Set up variables for this block as "left". */ 2536 left = xfs_btree_get_block(cur, level, &lbp); 2537 2538 #ifdef DEBUG 2539 error = xfs_btree_check_block(cur, left, level, lbp); 2540 if (error) 2541 goto error0; 2542 #endif 2543 2544 /* If we've got no right sibling then we can't shift an entry right. */ 2545 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2546 if (xfs_btree_ptr_is_null(cur, &rptr)) 2547 goto out0; 2548 2549 /* 2550 * If the cursor entry is the one that would be moved, don't 2551 * do it... it's too complicated. 2552 */ 2553 lrecs = xfs_btree_get_numrecs(left); 2554 if (cur->bc_levels[level].ptr >= lrecs) 2555 goto out0; 2556 2557 /* Set up the right neighbor as "right". */ 2558 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 2559 if (error) 2560 goto error0; 2561 2562 /* If it's full, it can't take another entry. */ 2563 rrecs = xfs_btree_get_numrecs(right); 2564 if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) 2565 goto out0; 2566 2567 XFS_BTREE_STATS_INC(cur, rshift); 2568 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2569 2570 /* 2571 * Make a hole at the start of the right neighbor block, then 2572 * copy the last left block entry to the hole. 2573 */ 2574 if (level > 0) { 2575 /* It's a nonleaf. make a hole in the keys and ptrs */ 2576 union xfs_btree_key *lkp; 2577 union xfs_btree_ptr *lpp; 2578 union xfs_btree_ptr *rpp; 2579 2580 lkp = xfs_btree_key_addr(cur, lrecs, left); 2581 lpp = xfs_btree_ptr_addr(cur, lrecs, left); 2582 rkp = xfs_btree_key_addr(cur, 1, right); 2583 rpp = xfs_btree_ptr_addr(cur, 1, right); 2584 2585 for (i = rrecs - 1; i >= 0; i--) { 2586 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 2587 if (error) 2588 goto error0; 2589 } 2590 2591 xfs_btree_shift_keys(cur, rkp, 1, rrecs); 2592 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); 2593 2594 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); 2595 if (error) 2596 goto error0; 2597 2598 /* Now put the new data in, and log it. */ 2599 xfs_btree_copy_keys(cur, rkp, lkp, 1); 2600 xfs_btree_copy_ptrs(cur, rpp, lpp, 1); 2601 2602 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); 2603 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); 2604 2605 ASSERT(cur->bc_ops->keys_inorder(cur, rkp, 2606 xfs_btree_key_addr(cur, 2, right))); 2607 } else { 2608 /* It's a leaf. make a hole in the records */ 2609 union xfs_btree_rec *lrp; 2610 union xfs_btree_rec *rrp; 2611 2612 lrp = xfs_btree_rec_addr(cur, lrecs, left); 2613 rrp = xfs_btree_rec_addr(cur, 1, right); 2614 2615 xfs_btree_shift_recs(cur, rrp, 1, rrecs); 2616 2617 /* Now put the new data in, and log it. */ 2618 xfs_btree_copy_recs(cur, rrp, lrp, 1); 2619 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); 2620 } 2621 2622 /* 2623 * Decrement and log left's numrecs, bump and log right's numrecs. 2624 */ 2625 xfs_btree_set_numrecs(left, --lrecs); 2626 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); 2627 2628 xfs_btree_set_numrecs(right, ++rrecs); 2629 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); 2630 2631 /* 2632 * Using a temporary cursor, update the parent key values of the 2633 * block on the right. 2634 */ 2635 error = xfs_btree_dup_cursor(cur, &tcur); 2636 if (error) 2637 goto error0; 2638 i = xfs_btree_lastrec(tcur, level); 2639 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { 2640 error = -EFSCORRUPTED; 2641 goto error0; 2642 } 2643 2644 error = xfs_btree_increment(tcur, level, &i); 2645 if (error) 2646 goto error1; 2647 2648 /* Update the parent high keys of the left block, if needed. */ 2649 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2650 error = xfs_btree_update_keys(cur, level); 2651 if (error) 2652 goto error1; 2653 } 2654 2655 /* Update the parent keys of the right block. */ 2656 error = xfs_btree_update_keys(tcur, level); 2657 if (error) 2658 goto error1; 2659 2660 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 2661 2662 *stat = 1; 2663 return 0; 2664 2665 out0: 2666 *stat = 0; 2667 return 0; 2668 2669 error0: 2670 return error; 2671 2672 error1: 2673 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 2674 return error; 2675 } 2676 2677 /* 2678 * Split cur/level block in half. 2679 * Return new block number and the key to its first 2680 * record (to be inserted into parent). 2681 */ 2682 STATIC int /* error */ 2683 __xfs_btree_split( 2684 struct xfs_btree_cur *cur, 2685 int level, 2686 union xfs_btree_ptr *ptrp, 2687 union xfs_btree_key *key, 2688 struct xfs_btree_cur **curp, 2689 int *stat) /* success/failure */ 2690 { 2691 union xfs_btree_ptr lptr; /* left sibling block ptr */ 2692 struct xfs_buf *lbp; /* left buffer pointer */ 2693 struct xfs_btree_block *left; /* left btree block */ 2694 union xfs_btree_ptr rptr; /* right sibling block ptr */ 2695 struct xfs_buf *rbp; /* right buffer pointer */ 2696 struct xfs_btree_block *right; /* right btree block */ 2697 union xfs_btree_ptr rrptr; /* right-right sibling ptr */ 2698 struct xfs_buf *rrbp; /* right-right buffer pointer */ 2699 struct xfs_btree_block *rrblock; /* right-right btree block */ 2700 int lrecs; 2701 int rrecs; 2702 int src_index; 2703 int error; /* error return value */ 2704 int i; 2705 2706 XFS_BTREE_STATS_INC(cur, split); 2707 2708 /* Set up left block (current one). */ 2709 left = xfs_btree_get_block(cur, level, &lbp); 2710 2711 #ifdef DEBUG 2712 error = xfs_btree_check_block(cur, left, level, lbp); 2713 if (error) 2714 goto error0; 2715 #endif 2716 2717 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 2718 2719 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2720 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); 2721 if (error) 2722 goto error0; 2723 if (*stat == 0) 2724 goto out0; 2725 XFS_BTREE_STATS_INC(cur, alloc); 2726 2727 /* Set up the new block as "right". */ 2728 error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); 2729 if (error) 2730 goto error0; 2731 2732 /* Fill in the btree header for the new right block. */ 2733 xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); 2734 2735 /* 2736 * Split the entries between the old and the new block evenly. 2737 * Make sure that if there's an odd number of entries now, that 2738 * each new block will have the same number of entries. 2739 */ 2740 lrecs = xfs_btree_get_numrecs(left); 2741 rrecs = lrecs / 2; 2742 if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) 2743 rrecs++; 2744 src_index = (lrecs - rrecs + 1); 2745 2746 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 2747 2748 /* Adjust numrecs for the later get_*_keys() calls. */ 2749 lrecs -= rrecs; 2750 xfs_btree_set_numrecs(left, lrecs); 2751 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); 2752 2753 /* 2754 * Copy btree block entries from the left block over to the 2755 * new block, the right. Update the right block and log the 2756 * changes. 2757 */ 2758 if (level > 0) { 2759 /* It's a non-leaf. Move keys and pointers. */ 2760 union xfs_btree_key *lkp; /* left btree key */ 2761 union xfs_btree_ptr *lpp; /* left address pointer */ 2762 union xfs_btree_key *rkp; /* right btree key */ 2763 union xfs_btree_ptr *rpp; /* right address pointer */ 2764 2765 lkp = xfs_btree_key_addr(cur, src_index, left); 2766 lpp = xfs_btree_ptr_addr(cur, src_index, left); 2767 rkp = xfs_btree_key_addr(cur, 1, right); 2768 rpp = xfs_btree_ptr_addr(cur, 1, right); 2769 2770 for (i = src_index; i < rrecs; i++) { 2771 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 2772 if (error) 2773 goto error0; 2774 } 2775 2776 /* Copy the keys & pointers to the new block. */ 2777 xfs_btree_copy_keys(cur, rkp, lkp, rrecs); 2778 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); 2779 2780 xfs_btree_log_keys(cur, rbp, 1, rrecs); 2781 xfs_btree_log_ptrs(cur, rbp, 1, rrecs); 2782 2783 /* Stash the keys of the new block for later insertion. */ 2784 xfs_btree_get_node_keys(cur, right, key); 2785 } else { 2786 /* It's a leaf. Move records. */ 2787 union xfs_btree_rec *lrp; /* left record pointer */ 2788 union xfs_btree_rec *rrp; /* right record pointer */ 2789 2790 lrp = xfs_btree_rec_addr(cur, src_index, left); 2791 rrp = xfs_btree_rec_addr(cur, 1, right); 2792 2793 /* Copy records to the new block. */ 2794 xfs_btree_copy_recs(cur, rrp, lrp, rrecs); 2795 xfs_btree_log_recs(cur, rbp, 1, rrecs); 2796 2797 /* Stash the keys of the new block for later insertion. */ 2798 xfs_btree_get_leaf_keys(cur, right, key); 2799 } 2800 2801 /* 2802 * Find the left block number by looking in the buffer. 2803 * Adjust sibling pointers. 2804 */ 2805 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); 2806 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); 2807 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 2808 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); 2809 2810 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); 2811 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 2812 2813 /* 2814 * If there's a block to the new block's right, make that block 2815 * point back to right instead of to left. 2816 */ 2817 if (!xfs_btree_ptr_is_null(cur, &rrptr)) { 2818 error = xfs_btree_read_buf_block(cur, &rrptr, 2819 0, &rrblock, &rrbp); 2820 if (error) 2821 goto error0; 2822 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); 2823 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 2824 } 2825 2826 /* Update the parent high keys of the left block, if needed. */ 2827 if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { 2828 error = xfs_btree_update_keys(cur, level); 2829 if (error) 2830 goto error0; 2831 } 2832 2833 /* 2834 * If the cursor is really in the right block, move it there. 2835 * If it's just pointing past the last entry in left, then we'll 2836 * insert there, so don't change anything in that case. 2837 */ 2838 if (cur->bc_levels[level].ptr > lrecs + 1) { 2839 xfs_btree_setbuf(cur, level, rbp); 2840 cur->bc_levels[level].ptr -= lrecs; 2841 } 2842 /* 2843 * If there are more levels, we'll need another cursor which refers 2844 * the right block, no matter where this cursor was. 2845 */ 2846 if (level + 1 < cur->bc_nlevels) { 2847 error = xfs_btree_dup_cursor(cur, curp); 2848 if (error) 2849 goto error0; 2850 (*curp)->bc_levels[level + 1].ptr++; 2851 } 2852 *ptrp = rptr; 2853 *stat = 1; 2854 return 0; 2855 out0: 2856 *stat = 0; 2857 return 0; 2858 2859 error0: 2860 return error; 2861 } 2862 2863 #ifdef __KERNEL__ 2864 struct xfs_btree_split_args { 2865 struct xfs_btree_cur *cur; 2866 int level; 2867 union xfs_btree_ptr *ptrp; 2868 union xfs_btree_key *key; 2869 struct xfs_btree_cur **curp; 2870 int *stat; /* success/failure */ 2871 int result; 2872 bool kswapd; /* allocation in kswapd context */ 2873 struct completion *done; 2874 struct work_struct work; 2875 }; 2876 2877 /* 2878 * Stack switching interfaces for allocation 2879 */ 2880 static void 2881 xfs_btree_split_worker( 2882 struct work_struct *work) 2883 { 2884 struct xfs_btree_split_args *args = container_of(work, 2885 struct xfs_btree_split_args, work); 2886 unsigned long pflags; 2887 unsigned long new_pflags = 0; 2888 2889 /* 2890 * we are in a transaction context here, but may also be doing work 2891 * in kswapd context, and hence we may need to inherit that state 2892 * temporarily to ensure that we don't block waiting for memory reclaim 2893 * in any way. 2894 */ 2895 if (args->kswapd) 2896 new_pflags |= PF_MEMALLOC | PF_KSWAPD; 2897 2898 current_set_flags_nested(&pflags, new_pflags); 2899 xfs_trans_set_context(args->cur->bc_tp); 2900 2901 args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, 2902 args->key, args->curp, args->stat); 2903 2904 xfs_trans_clear_context(args->cur->bc_tp); 2905 current_restore_flags_nested(&pflags, new_pflags); 2906 2907 /* 2908 * Do not access args after complete() has run here. We don't own args 2909 * and the owner may run and free args before we return here. 2910 */ 2911 complete(args->done); 2912 2913 } 2914 2915 /* 2916 * BMBT split requests often come in with little stack to work on. Push 2917 * them off to a worker thread so there is lots of stack to use. For the other 2918 * btree types, just call directly to avoid the context switch overhead here. 2919 */ 2920 STATIC int /* error */ 2921 xfs_btree_split( 2922 struct xfs_btree_cur *cur, 2923 int level, 2924 union xfs_btree_ptr *ptrp, 2925 union xfs_btree_key *key, 2926 struct xfs_btree_cur **curp, 2927 int *stat) /* success/failure */ 2928 { 2929 struct xfs_btree_split_args args; 2930 DECLARE_COMPLETION_ONSTACK(done); 2931 2932 if (cur->bc_btnum != XFS_BTNUM_BMAP) 2933 return __xfs_btree_split(cur, level, ptrp, key, curp, stat); 2934 2935 args.cur = cur; 2936 args.level = level; 2937 args.ptrp = ptrp; 2938 args.key = key; 2939 args.curp = curp; 2940 args.stat = stat; 2941 args.done = &done; 2942 args.kswapd = current_is_kswapd(); 2943 INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); 2944 queue_work(xfs_alloc_wq, &args.work); 2945 wait_for_completion(&done); 2946 destroy_work_on_stack(&args.work); 2947 return args.result; 2948 } 2949 #else 2950 #define xfs_btree_split __xfs_btree_split 2951 #endif /* __KERNEL__ */ 2952 2953 2954 /* 2955 * Copy the old inode root contents into a real block and make the 2956 * broot point to it. 2957 */ 2958 int /* error */ 2959 xfs_btree_new_iroot( 2960 struct xfs_btree_cur *cur, /* btree cursor */ 2961 int *logflags, /* logging flags for inode */ 2962 int *stat) /* return status - 0 fail */ 2963 { 2964 struct xfs_buf *cbp; /* buffer for cblock */ 2965 struct xfs_btree_block *block; /* btree block */ 2966 struct xfs_btree_block *cblock; /* child btree block */ 2967 union xfs_btree_key *ckp; /* child key pointer */ 2968 union xfs_btree_ptr *cpp; /* child ptr pointer */ 2969 union xfs_btree_key *kp; /* pointer to btree key */ 2970 union xfs_btree_ptr *pp; /* pointer to block addr */ 2971 union xfs_btree_ptr nptr; /* new block addr */ 2972 int level; /* btree level */ 2973 int error; /* error return code */ 2974 int i; /* loop counter */ 2975 2976 XFS_BTREE_STATS_INC(cur, newroot); 2977 2978 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 2979 2980 level = cur->bc_nlevels - 1; 2981 2982 block = xfs_btree_get_iroot(cur); 2983 pp = xfs_btree_ptr_addr(cur, 1, block); 2984 2985 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 2986 error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); 2987 if (error) 2988 goto error0; 2989 if (*stat == 0) 2990 return 0; 2991 2992 XFS_BTREE_STATS_INC(cur, alloc); 2993 2994 /* Copy the root into a real block. */ 2995 error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); 2996 if (error) 2997 goto error0; 2998 2999 /* 3000 * we can't just memcpy() the root in for CRC enabled btree blocks. 3001 * In that case have to also ensure the blkno remains correct 3002 */ 3003 memcpy(cblock, block, xfs_btree_block_len(cur)); 3004 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { 3005 __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); 3006 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 3007 cblock->bb_u.l.bb_blkno = bno; 3008 else 3009 cblock->bb_u.s.bb_blkno = bno; 3010 } 3011 3012 be16_add_cpu(&block->bb_level, 1); 3013 xfs_btree_set_numrecs(block, 1); 3014 cur->bc_nlevels++; 3015 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3016 cur->bc_levels[level + 1].ptr = 1; 3017 3018 kp = xfs_btree_key_addr(cur, 1, block); 3019 ckp = xfs_btree_key_addr(cur, 1, cblock); 3020 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); 3021 3022 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3023 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 3024 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3025 if (error) 3026 goto error0; 3027 } 3028 3029 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); 3030 3031 error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); 3032 if (error) 3033 goto error0; 3034 3035 xfs_btree_copy_ptrs(cur, pp, &nptr, 1); 3036 3037 xfs_iroot_realloc(cur->bc_ino.ip, 3038 1 - xfs_btree_get_numrecs(cblock), 3039 cur->bc_ino.whichfork); 3040 3041 xfs_btree_setbuf(cur, level, cbp); 3042 3043 /* 3044 * Do all this logging at the end so that 3045 * the root is at the right level. 3046 */ 3047 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); 3048 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3049 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); 3050 3051 *logflags |= 3052 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); 3053 *stat = 1; 3054 return 0; 3055 error0: 3056 return error; 3057 } 3058 3059 /* 3060 * Allocate a new root block, fill it in. 3061 */ 3062 STATIC int /* error */ 3063 xfs_btree_new_root( 3064 struct xfs_btree_cur *cur, /* btree cursor */ 3065 int *stat) /* success/failure */ 3066 { 3067 struct xfs_btree_block *block; /* one half of the old root block */ 3068 struct xfs_buf *bp; /* buffer containing block */ 3069 int error; /* error return value */ 3070 struct xfs_buf *lbp; /* left buffer pointer */ 3071 struct xfs_btree_block *left; /* left btree block */ 3072 struct xfs_buf *nbp; /* new (root) buffer */ 3073 struct xfs_btree_block *new; /* new (root) btree block */ 3074 int nptr; /* new value for key index, 1 or 2 */ 3075 struct xfs_buf *rbp; /* right buffer pointer */ 3076 struct xfs_btree_block *right; /* right btree block */ 3077 union xfs_btree_ptr rptr; 3078 union xfs_btree_ptr lptr; 3079 3080 XFS_BTREE_STATS_INC(cur, newroot); 3081 3082 /* initialise our start point from the cursor */ 3083 cur->bc_ops->init_ptr_from_cur(cur, &rptr); 3084 3085 /* Allocate the new block. If we can't do it, we're toast. Give up. */ 3086 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); 3087 if (error) 3088 goto error0; 3089 if (*stat == 0) 3090 goto out0; 3091 XFS_BTREE_STATS_INC(cur, alloc); 3092 3093 /* Set up the new block. */ 3094 error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); 3095 if (error) 3096 goto error0; 3097 3098 /* Set the root in the holding structure increasing the level by 1. */ 3099 cur->bc_ops->set_root(cur, &lptr, 1); 3100 3101 /* 3102 * At the previous root level there are now two blocks: the old root, 3103 * and the new block generated when it was split. We don't know which 3104 * one the cursor is pointing at, so we set up variables "left" and 3105 * "right" for each case. 3106 */ 3107 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); 3108 3109 #ifdef DEBUG 3110 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); 3111 if (error) 3112 goto error0; 3113 #endif 3114 3115 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3116 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3117 /* Our block is left, pick up the right block. */ 3118 lbp = bp; 3119 xfs_btree_buf_to_ptr(cur, lbp, &lptr); 3120 left = block; 3121 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 3122 if (error) 3123 goto error0; 3124 bp = rbp; 3125 nptr = 1; 3126 } else { 3127 /* Our block is right, pick up the left block. */ 3128 rbp = bp; 3129 xfs_btree_buf_to_ptr(cur, rbp, &rptr); 3130 right = block; 3131 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); 3132 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 3133 if (error) 3134 goto error0; 3135 bp = lbp; 3136 nptr = 2; 3137 } 3138 3139 /* Fill in the new block's btree header and log it. */ 3140 xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); 3141 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 3142 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 3143 !xfs_btree_ptr_is_null(cur, &rptr)); 3144 3145 /* Fill in the key data in the new root. */ 3146 if (xfs_btree_get_level(left) > 0) { 3147 /* 3148 * Get the keys for the left block's keys and put them directly 3149 * in the parent block. Do the same for the right block. 3150 */ 3151 xfs_btree_get_node_keys(cur, left, 3152 xfs_btree_key_addr(cur, 1, new)); 3153 xfs_btree_get_node_keys(cur, right, 3154 xfs_btree_key_addr(cur, 2, new)); 3155 } else { 3156 /* 3157 * Get the keys for the left block's records and put them 3158 * directly in the parent block. Do the same for the right 3159 * block. 3160 */ 3161 xfs_btree_get_leaf_keys(cur, left, 3162 xfs_btree_key_addr(cur, 1, new)); 3163 xfs_btree_get_leaf_keys(cur, right, 3164 xfs_btree_key_addr(cur, 2, new)); 3165 } 3166 xfs_btree_log_keys(cur, nbp, 1, 2); 3167 3168 /* Fill in the pointer data in the new root. */ 3169 xfs_btree_copy_ptrs(cur, 3170 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); 3171 xfs_btree_copy_ptrs(cur, 3172 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); 3173 xfs_btree_log_ptrs(cur, nbp, 1, 2); 3174 3175 /* Fix up the cursor. */ 3176 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); 3177 cur->bc_levels[cur->bc_nlevels].ptr = nptr; 3178 cur->bc_nlevels++; 3179 ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); 3180 *stat = 1; 3181 return 0; 3182 error0: 3183 return error; 3184 out0: 3185 *stat = 0; 3186 return 0; 3187 } 3188 3189 STATIC int 3190 xfs_btree_make_block_unfull( 3191 struct xfs_btree_cur *cur, /* btree cursor */ 3192 int level, /* btree level */ 3193 int numrecs,/* # of recs in block */ 3194 int *oindex,/* old tree index */ 3195 int *index, /* new tree index */ 3196 union xfs_btree_ptr *nptr, /* new btree ptr */ 3197 struct xfs_btree_cur **ncur, /* new btree cursor */ 3198 union xfs_btree_key *key, /* key of new block */ 3199 int *stat) 3200 { 3201 int error = 0; 3202 3203 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 3204 level == cur->bc_nlevels - 1) { 3205 struct xfs_inode *ip = cur->bc_ino.ip; 3206 3207 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 3208 /* A root block that can be made bigger. */ 3209 xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); 3210 *stat = 1; 3211 } else { 3212 /* A root block that needs replacing */ 3213 int logflags = 0; 3214 3215 error = xfs_btree_new_iroot(cur, &logflags, stat); 3216 if (error || *stat == 0) 3217 return error; 3218 3219 xfs_trans_log_inode(cur->bc_tp, ip, logflags); 3220 } 3221 3222 return 0; 3223 } 3224 3225 /* First, try shifting an entry to the right neighbor. */ 3226 error = xfs_btree_rshift(cur, level, stat); 3227 if (error || *stat) 3228 return error; 3229 3230 /* Next, try shifting an entry to the left neighbor. */ 3231 error = xfs_btree_lshift(cur, level, stat); 3232 if (error) 3233 return error; 3234 3235 if (*stat) { 3236 *oindex = *index = cur->bc_levels[level].ptr; 3237 return 0; 3238 } 3239 3240 /* 3241 * Next, try splitting the current block in half. 3242 * 3243 * If this works we have to re-set our variables because we 3244 * could be in a different block now. 3245 */ 3246 error = xfs_btree_split(cur, level, nptr, key, ncur, stat); 3247 if (error || *stat == 0) 3248 return error; 3249 3250 3251 *index = cur->bc_levels[level].ptr; 3252 return 0; 3253 } 3254 3255 /* 3256 * Insert one record/level. Return information to the caller 3257 * allowing the next level up to proceed if necessary. 3258 */ 3259 STATIC int 3260 xfs_btree_insrec( 3261 struct xfs_btree_cur *cur, /* btree cursor */ 3262 int level, /* level to insert record at */ 3263 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ 3264 union xfs_btree_rec *rec, /* record to insert */ 3265 union xfs_btree_key *key, /* i/o: block key for ptrp */ 3266 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ 3267 int *stat) /* success/failure */ 3268 { 3269 struct xfs_btree_block *block; /* btree block */ 3270 struct xfs_buf *bp; /* buffer for block */ 3271 union xfs_btree_ptr nptr; /* new block ptr */ 3272 struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ 3273 union xfs_btree_key nkey; /* new block key */ 3274 union xfs_btree_key *lkey; 3275 int optr; /* old key/record index */ 3276 int ptr; /* key/record index */ 3277 int numrecs;/* number of records */ 3278 int error; /* error return value */ 3279 int i; 3280 xfs_daddr_t old_bn; 3281 3282 ncur = NULL; 3283 lkey = &nkey; 3284 3285 /* 3286 * If we have an external root pointer, and we've made it to the 3287 * root level, allocate a new root block and we're done. 3288 */ 3289 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && 3290 (level >= cur->bc_nlevels)) { 3291 error = xfs_btree_new_root(cur, stat); 3292 xfs_btree_set_ptr_null(cur, ptrp); 3293 3294 return error; 3295 } 3296 3297 /* If we're off the left edge, return failure. */ 3298 ptr = cur->bc_levels[level].ptr; 3299 if (ptr == 0) { 3300 *stat = 0; 3301 return 0; 3302 } 3303 3304 optr = ptr; 3305 3306 XFS_BTREE_STATS_INC(cur, insrec); 3307 3308 /* Get pointers to the btree buffer and block. */ 3309 block = xfs_btree_get_block(cur, level, &bp); 3310 old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; 3311 numrecs = xfs_btree_get_numrecs(block); 3312 3313 #ifdef DEBUG 3314 error = xfs_btree_check_block(cur, block, level, bp); 3315 if (error) 3316 goto error0; 3317 3318 /* Check that the new entry is being inserted in the right place. */ 3319 if (ptr <= numrecs) { 3320 if (level == 0) { 3321 ASSERT(cur->bc_ops->recs_inorder(cur, rec, 3322 xfs_btree_rec_addr(cur, ptr, block))); 3323 } else { 3324 ASSERT(cur->bc_ops->keys_inorder(cur, key, 3325 xfs_btree_key_addr(cur, ptr, block))); 3326 } 3327 } 3328 #endif 3329 3330 /* 3331 * If the block is full, we can't insert the new entry until we 3332 * make the block un-full. 3333 */ 3334 xfs_btree_set_ptr_null(cur, &nptr); 3335 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { 3336 error = xfs_btree_make_block_unfull(cur, level, numrecs, 3337 &optr, &ptr, &nptr, &ncur, lkey, stat); 3338 if (error || *stat == 0) 3339 goto error0; 3340 } 3341 3342 /* 3343 * The current block may have changed if the block was 3344 * previously full and we have just made space in it. 3345 */ 3346 block = xfs_btree_get_block(cur, level, &bp); 3347 numrecs = xfs_btree_get_numrecs(block); 3348 3349 #ifdef DEBUG 3350 error = xfs_btree_check_block(cur, block, level, bp); 3351 if (error) 3352 goto error0; 3353 #endif 3354 3355 /* 3356 * At this point we know there's room for our new entry in the block 3357 * we're pointing at. 3358 */ 3359 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); 3360 3361 if (level > 0) { 3362 /* It's a nonleaf. make a hole in the keys and ptrs */ 3363 union xfs_btree_key *kp; 3364 union xfs_btree_ptr *pp; 3365 3366 kp = xfs_btree_key_addr(cur, ptr, block); 3367 pp = xfs_btree_ptr_addr(cur, ptr, block); 3368 3369 for (i = numrecs - ptr; i >= 0; i--) { 3370 error = xfs_btree_debug_check_ptr(cur, pp, i, level); 3371 if (error) 3372 goto error0; 3373 } 3374 3375 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); 3376 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); 3377 3378 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); 3379 if (error) 3380 goto error0; 3381 3382 /* Now put the new data in, bump numrecs and log it. */ 3383 xfs_btree_copy_keys(cur, kp, key, 1); 3384 xfs_btree_copy_ptrs(cur, pp, ptrp, 1); 3385 numrecs++; 3386 xfs_btree_set_numrecs(block, numrecs); 3387 xfs_btree_log_ptrs(cur, bp, ptr, numrecs); 3388 xfs_btree_log_keys(cur, bp, ptr, numrecs); 3389 #ifdef DEBUG 3390 if (ptr < numrecs) { 3391 ASSERT(cur->bc_ops->keys_inorder(cur, kp, 3392 xfs_btree_key_addr(cur, ptr + 1, block))); 3393 } 3394 #endif 3395 } else { 3396 /* It's a leaf. make a hole in the records */ 3397 union xfs_btree_rec *rp; 3398 3399 rp = xfs_btree_rec_addr(cur, ptr, block); 3400 3401 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); 3402 3403 /* Now put the new data in, bump numrecs and log it. */ 3404 xfs_btree_copy_recs(cur, rp, rec, 1); 3405 xfs_btree_set_numrecs(block, ++numrecs); 3406 xfs_btree_log_recs(cur, bp, ptr, numrecs); 3407 #ifdef DEBUG 3408 if (ptr < numrecs) { 3409 ASSERT(cur->bc_ops->recs_inorder(cur, rp, 3410 xfs_btree_rec_addr(cur, ptr + 1, block))); 3411 } 3412 #endif 3413 } 3414 3415 /* Log the new number of records in the btree header. */ 3416 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3417 3418 /* 3419 * If we just inserted into a new tree block, we have to 3420 * recalculate nkey here because nkey is out of date. 3421 * 3422 * Otherwise we're just updating an existing block (having shoved 3423 * some records into the new tree block), so use the regular key 3424 * update mechanism. 3425 */ 3426 if (bp && xfs_buf_daddr(bp) != old_bn) { 3427 xfs_btree_get_keys(cur, block, lkey); 3428 } else if (xfs_btree_needs_key_update(cur, optr)) { 3429 error = xfs_btree_update_keys(cur, level); 3430 if (error) 3431 goto error0; 3432 } 3433 3434 /* 3435 * If we are tracking the last record in the tree and 3436 * we are at the far right edge of the tree, update it. 3437 */ 3438 if (xfs_btree_is_lastrec(cur, block, level)) { 3439 cur->bc_ops->update_lastrec(cur, block, rec, 3440 ptr, LASTREC_INSREC); 3441 } 3442 3443 /* 3444 * Return the new block number, if any. 3445 * If there is one, give back a record value and a cursor too. 3446 */ 3447 *ptrp = nptr; 3448 if (!xfs_btree_ptr_is_null(cur, &nptr)) { 3449 xfs_btree_copy_keys(cur, key, lkey, 1); 3450 *curp = ncur; 3451 } 3452 3453 *stat = 1; 3454 return 0; 3455 3456 error0: 3457 if (ncur) 3458 xfs_btree_del_cursor(ncur, error); 3459 return error; 3460 } 3461 3462 /* 3463 * Insert the record at the point referenced by cur. 3464 * 3465 * A multi-level split of the tree on insert will invalidate the original 3466 * cursor. All callers of this function should assume that the cursor is 3467 * no longer valid and revalidate it. 3468 */ 3469 int 3470 xfs_btree_insert( 3471 struct xfs_btree_cur *cur, 3472 int *stat) 3473 { 3474 int error; /* error return value */ 3475 int i; /* result value, 0 for failure */ 3476 int level; /* current level number in btree */ 3477 union xfs_btree_ptr nptr; /* new block number (split result) */ 3478 struct xfs_btree_cur *ncur; /* new cursor (split result) */ 3479 struct xfs_btree_cur *pcur; /* previous level's cursor */ 3480 union xfs_btree_key bkey; /* key of block to insert */ 3481 union xfs_btree_key *key; 3482 union xfs_btree_rec rec; /* record to insert */ 3483 3484 level = 0; 3485 ncur = NULL; 3486 pcur = cur; 3487 key = &bkey; 3488 3489 xfs_btree_set_ptr_null(cur, &nptr); 3490 3491 /* Make a key out of the record data to be inserted, and save it. */ 3492 cur->bc_ops->init_rec_from_cur(cur, &rec); 3493 cur->bc_ops->init_key_from_rec(key, &rec); 3494 3495 /* 3496 * Loop going up the tree, starting at the leaf level. 3497 * Stop when we don't get a split block, that must mean that 3498 * the insert is finished with this level. 3499 */ 3500 do { 3501 /* 3502 * Insert nrec/nptr into this level of the tree. 3503 * Note if we fail, nptr will be null. 3504 */ 3505 error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, 3506 &ncur, &i); 3507 if (error) { 3508 if (pcur != cur) 3509 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 3510 goto error0; 3511 } 3512 3513 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3514 error = -EFSCORRUPTED; 3515 goto error0; 3516 } 3517 level++; 3518 3519 /* 3520 * See if the cursor we just used is trash. 3521 * Can't trash the caller's cursor, but otherwise we should 3522 * if ncur is a new cursor or we're about to be done. 3523 */ 3524 if (pcur != cur && 3525 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { 3526 /* Save the state from the cursor before we trash it */ 3527 if (cur->bc_ops->update_cursor) 3528 cur->bc_ops->update_cursor(pcur, cur); 3529 cur->bc_nlevels = pcur->bc_nlevels; 3530 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 3531 } 3532 /* If we got a new cursor, switch to it. */ 3533 if (ncur) { 3534 pcur = ncur; 3535 ncur = NULL; 3536 } 3537 } while (!xfs_btree_ptr_is_null(cur, &nptr)); 3538 3539 *stat = i; 3540 return 0; 3541 error0: 3542 return error; 3543 } 3544 3545 /* 3546 * Try to merge a non-leaf block back into the inode root. 3547 * 3548 * Note: the killroot names comes from the fact that we're effectively 3549 * killing the old root block. But because we can't just delete the 3550 * inode we have to copy the single block it was pointing to into the 3551 * inode. 3552 */ 3553 STATIC int 3554 xfs_btree_kill_iroot( 3555 struct xfs_btree_cur *cur) 3556 { 3557 int whichfork = cur->bc_ino.whichfork; 3558 struct xfs_inode *ip = cur->bc_ino.ip; 3559 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 3560 struct xfs_btree_block *block; 3561 struct xfs_btree_block *cblock; 3562 union xfs_btree_key *kp; 3563 union xfs_btree_key *ckp; 3564 union xfs_btree_ptr *pp; 3565 union xfs_btree_ptr *cpp; 3566 struct xfs_buf *cbp; 3567 int level; 3568 int index; 3569 int numrecs; 3570 int error; 3571 #ifdef DEBUG 3572 union xfs_btree_ptr ptr; 3573 #endif 3574 int i; 3575 3576 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 3577 ASSERT(cur->bc_nlevels > 1); 3578 3579 /* 3580 * Don't deal with the root block needs to be a leaf case. 3581 * We're just going to turn the thing back into extents anyway. 3582 */ 3583 level = cur->bc_nlevels - 1; 3584 if (level == 1) 3585 goto out0; 3586 3587 /* 3588 * Give up if the root has multiple children. 3589 */ 3590 block = xfs_btree_get_iroot(cur); 3591 if (xfs_btree_get_numrecs(block) != 1) 3592 goto out0; 3593 3594 cblock = xfs_btree_get_block(cur, level - 1, &cbp); 3595 numrecs = xfs_btree_get_numrecs(cblock); 3596 3597 /* 3598 * Only do this if the next level will fit. 3599 * Then the data must be copied up to the inode, 3600 * instead of freeing the root you free the next level. 3601 */ 3602 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) 3603 goto out0; 3604 3605 XFS_BTREE_STATS_INC(cur, killroot); 3606 3607 #ifdef DEBUG 3608 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); 3609 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3610 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); 3611 ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); 3612 #endif 3613 3614 index = numrecs - cur->bc_ops->get_maxrecs(cur, level); 3615 if (index) { 3616 xfs_iroot_realloc(cur->bc_ino.ip, index, 3617 cur->bc_ino.whichfork); 3618 block = ifp->if_broot; 3619 } 3620 3621 be16_add_cpu(&block->bb_numrecs, index); 3622 ASSERT(block->bb_numrecs == cblock->bb_numrecs); 3623 3624 kp = xfs_btree_key_addr(cur, 1, block); 3625 ckp = xfs_btree_key_addr(cur, 1, cblock); 3626 xfs_btree_copy_keys(cur, kp, ckp, numrecs); 3627 3628 pp = xfs_btree_ptr_addr(cur, 1, block); 3629 cpp = xfs_btree_ptr_addr(cur, 1, cblock); 3630 3631 for (i = 0; i < numrecs; i++) { 3632 error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); 3633 if (error) 3634 return error; 3635 } 3636 3637 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); 3638 3639 error = xfs_btree_free_block(cur, cbp); 3640 if (error) 3641 return error; 3642 3643 cur->bc_levels[level - 1].bp = NULL; 3644 be16_add_cpu(&block->bb_level, -1); 3645 xfs_trans_log_inode(cur->bc_tp, ip, 3646 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); 3647 cur->bc_nlevels--; 3648 out0: 3649 return 0; 3650 } 3651 3652 /* 3653 * Kill the current root node, and replace it with it's only child node. 3654 */ 3655 STATIC int 3656 xfs_btree_kill_root( 3657 struct xfs_btree_cur *cur, 3658 struct xfs_buf *bp, 3659 int level, 3660 union xfs_btree_ptr *newroot) 3661 { 3662 int error; 3663 3664 XFS_BTREE_STATS_INC(cur, killroot); 3665 3666 /* 3667 * Update the root pointer, decreasing the level by 1 and then 3668 * free the old root. 3669 */ 3670 cur->bc_ops->set_root(cur, newroot, -1); 3671 3672 error = xfs_btree_free_block(cur, bp); 3673 if (error) 3674 return error; 3675 3676 cur->bc_levels[level].bp = NULL; 3677 cur->bc_levels[level].ra = 0; 3678 cur->bc_nlevels--; 3679 3680 return 0; 3681 } 3682 3683 STATIC int 3684 xfs_btree_dec_cursor( 3685 struct xfs_btree_cur *cur, 3686 int level, 3687 int *stat) 3688 { 3689 int error; 3690 int i; 3691 3692 if (level > 0) { 3693 error = xfs_btree_decrement(cur, level, &i); 3694 if (error) 3695 return error; 3696 } 3697 3698 *stat = 1; 3699 return 0; 3700 } 3701 3702 /* 3703 * Single level of the btree record deletion routine. 3704 * Delete record pointed to by cur/level. 3705 * Remove the record from its block then rebalance the tree. 3706 * Return 0 for error, 1 for done, 2 to go on to the next level. 3707 */ 3708 STATIC int /* error */ 3709 xfs_btree_delrec( 3710 struct xfs_btree_cur *cur, /* btree cursor */ 3711 int level, /* level removing record from */ 3712 int *stat) /* fail/done/go-on */ 3713 { 3714 struct xfs_btree_block *block; /* btree block */ 3715 union xfs_btree_ptr cptr; /* current block ptr */ 3716 struct xfs_buf *bp; /* buffer for block */ 3717 int error; /* error return value */ 3718 int i; /* loop counter */ 3719 union xfs_btree_ptr lptr; /* left sibling block ptr */ 3720 struct xfs_buf *lbp; /* left buffer pointer */ 3721 struct xfs_btree_block *left; /* left btree block */ 3722 int lrecs = 0; /* left record count */ 3723 int ptr; /* key/record index */ 3724 union xfs_btree_ptr rptr; /* right sibling block ptr */ 3725 struct xfs_buf *rbp; /* right buffer pointer */ 3726 struct xfs_btree_block *right; /* right btree block */ 3727 struct xfs_btree_block *rrblock; /* right-right btree block */ 3728 struct xfs_buf *rrbp; /* right-right buffer pointer */ 3729 int rrecs = 0; /* right record count */ 3730 struct xfs_btree_cur *tcur; /* temporary btree cursor */ 3731 int numrecs; /* temporary numrec count */ 3732 3733 tcur = NULL; 3734 3735 /* Get the index of the entry being deleted, check for nothing there. */ 3736 ptr = cur->bc_levels[level].ptr; 3737 if (ptr == 0) { 3738 *stat = 0; 3739 return 0; 3740 } 3741 3742 /* Get the buffer & block containing the record or key/ptr. */ 3743 block = xfs_btree_get_block(cur, level, &bp); 3744 numrecs = xfs_btree_get_numrecs(block); 3745 3746 #ifdef DEBUG 3747 error = xfs_btree_check_block(cur, block, level, bp); 3748 if (error) 3749 goto error0; 3750 #endif 3751 3752 /* Fail if we're off the end of the block. */ 3753 if (ptr > numrecs) { 3754 *stat = 0; 3755 return 0; 3756 } 3757 3758 XFS_BTREE_STATS_INC(cur, delrec); 3759 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); 3760 3761 /* Excise the entries being deleted. */ 3762 if (level > 0) { 3763 /* It's a nonleaf. operate on keys and ptrs */ 3764 union xfs_btree_key *lkp; 3765 union xfs_btree_ptr *lpp; 3766 3767 lkp = xfs_btree_key_addr(cur, ptr + 1, block); 3768 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); 3769 3770 for (i = 0; i < numrecs - ptr; i++) { 3771 error = xfs_btree_debug_check_ptr(cur, lpp, i, level); 3772 if (error) 3773 goto error0; 3774 } 3775 3776 if (ptr < numrecs) { 3777 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); 3778 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); 3779 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); 3780 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); 3781 } 3782 } else { 3783 /* It's a leaf. operate on records */ 3784 if (ptr < numrecs) { 3785 xfs_btree_shift_recs(cur, 3786 xfs_btree_rec_addr(cur, ptr + 1, block), 3787 -1, numrecs - ptr); 3788 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); 3789 } 3790 } 3791 3792 /* 3793 * Decrement and log the number of entries in the block. 3794 */ 3795 xfs_btree_set_numrecs(block, --numrecs); 3796 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); 3797 3798 /* 3799 * If we are tracking the last record in the tree and 3800 * we are at the far right edge of the tree, update it. 3801 */ 3802 if (xfs_btree_is_lastrec(cur, block, level)) { 3803 cur->bc_ops->update_lastrec(cur, block, NULL, 3804 ptr, LASTREC_DELREC); 3805 } 3806 3807 /* 3808 * We're at the root level. First, shrink the root block in-memory. 3809 * Try to get rid of the next level down. If we can't then there's 3810 * nothing left to do. 3811 */ 3812 if (level == cur->bc_nlevels - 1) { 3813 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { 3814 xfs_iroot_realloc(cur->bc_ino.ip, -1, 3815 cur->bc_ino.whichfork); 3816 3817 error = xfs_btree_kill_iroot(cur); 3818 if (error) 3819 goto error0; 3820 3821 error = xfs_btree_dec_cursor(cur, level, stat); 3822 if (error) 3823 goto error0; 3824 *stat = 1; 3825 return 0; 3826 } 3827 3828 /* 3829 * If this is the root level, and there's only one entry left, 3830 * and it's NOT the leaf level, then we can get rid of this 3831 * level. 3832 */ 3833 if (numrecs == 1 && level > 0) { 3834 union xfs_btree_ptr *pp; 3835 /* 3836 * pp is still set to the first pointer in the block. 3837 * Make it the new root of the btree. 3838 */ 3839 pp = xfs_btree_ptr_addr(cur, 1, block); 3840 error = xfs_btree_kill_root(cur, bp, level, pp); 3841 if (error) 3842 goto error0; 3843 } else if (level > 0) { 3844 error = xfs_btree_dec_cursor(cur, level, stat); 3845 if (error) 3846 goto error0; 3847 } 3848 *stat = 1; 3849 return 0; 3850 } 3851 3852 /* 3853 * If we deleted the leftmost entry in the block, update the 3854 * key values above us in the tree. 3855 */ 3856 if (xfs_btree_needs_key_update(cur, ptr)) { 3857 error = xfs_btree_update_keys(cur, level); 3858 if (error) 3859 goto error0; 3860 } 3861 3862 /* 3863 * If the number of records remaining in the block is at least 3864 * the minimum, we're done. 3865 */ 3866 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { 3867 error = xfs_btree_dec_cursor(cur, level, stat); 3868 if (error) 3869 goto error0; 3870 return 0; 3871 } 3872 3873 /* 3874 * Otherwise, we have to move some records around to keep the 3875 * tree balanced. Look at the left and right sibling blocks to 3876 * see if we can re-balance by moving only one record. 3877 */ 3878 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 3879 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); 3880 3881 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { 3882 /* 3883 * One child of root, need to get a chance to copy its contents 3884 * into the root and delete it. Can't go up to next level, 3885 * there's nothing to delete there. 3886 */ 3887 if (xfs_btree_ptr_is_null(cur, &rptr) && 3888 xfs_btree_ptr_is_null(cur, &lptr) && 3889 level == cur->bc_nlevels - 2) { 3890 error = xfs_btree_kill_iroot(cur); 3891 if (!error) 3892 error = xfs_btree_dec_cursor(cur, level, stat); 3893 if (error) 3894 goto error0; 3895 return 0; 3896 } 3897 } 3898 3899 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || 3900 !xfs_btree_ptr_is_null(cur, &lptr)); 3901 3902 /* 3903 * Duplicate the cursor so our btree manipulations here won't 3904 * disrupt the next level up. 3905 */ 3906 error = xfs_btree_dup_cursor(cur, &tcur); 3907 if (error) 3908 goto error0; 3909 3910 /* 3911 * If there's a right sibling, see if it's ok to shift an entry 3912 * out of it. 3913 */ 3914 if (!xfs_btree_ptr_is_null(cur, &rptr)) { 3915 /* 3916 * Move the temp cursor to the last entry in the next block. 3917 * Actually any entry but the first would suffice. 3918 */ 3919 i = xfs_btree_lastrec(tcur, level); 3920 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3921 error = -EFSCORRUPTED; 3922 goto error0; 3923 } 3924 3925 error = xfs_btree_increment(tcur, level, &i); 3926 if (error) 3927 goto error0; 3928 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3929 error = -EFSCORRUPTED; 3930 goto error0; 3931 } 3932 3933 i = xfs_btree_lastrec(tcur, level); 3934 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3935 error = -EFSCORRUPTED; 3936 goto error0; 3937 } 3938 3939 /* Grab a pointer to the block. */ 3940 right = xfs_btree_get_block(tcur, level, &rbp); 3941 #ifdef DEBUG 3942 error = xfs_btree_check_block(tcur, right, level, rbp); 3943 if (error) 3944 goto error0; 3945 #endif 3946 /* Grab the current block number, for future use. */ 3947 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); 3948 3949 /* 3950 * If right block is full enough so that removing one entry 3951 * won't make it too empty, and left-shifting an entry out 3952 * of right to us works, we're done. 3953 */ 3954 if (xfs_btree_get_numrecs(right) - 1 >= 3955 cur->bc_ops->get_minrecs(tcur, level)) { 3956 error = xfs_btree_lshift(tcur, level, &i); 3957 if (error) 3958 goto error0; 3959 if (i) { 3960 ASSERT(xfs_btree_get_numrecs(block) >= 3961 cur->bc_ops->get_minrecs(tcur, level)); 3962 3963 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 3964 tcur = NULL; 3965 3966 error = xfs_btree_dec_cursor(cur, level, stat); 3967 if (error) 3968 goto error0; 3969 return 0; 3970 } 3971 } 3972 3973 /* 3974 * Otherwise, grab the number of records in right for 3975 * future reference, and fix up the temp cursor to point 3976 * to our block again (last record). 3977 */ 3978 rrecs = xfs_btree_get_numrecs(right); 3979 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 3980 i = xfs_btree_firstrec(tcur, level); 3981 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3982 error = -EFSCORRUPTED; 3983 goto error0; 3984 } 3985 3986 error = xfs_btree_decrement(tcur, level, &i); 3987 if (error) 3988 goto error0; 3989 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 3990 error = -EFSCORRUPTED; 3991 goto error0; 3992 } 3993 } 3994 } 3995 3996 /* 3997 * If there's a left sibling, see if it's ok to shift an entry 3998 * out of it. 3999 */ 4000 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 4001 /* 4002 * Move the temp cursor to the first entry in the 4003 * previous block. 4004 */ 4005 i = xfs_btree_firstrec(tcur, level); 4006 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4007 error = -EFSCORRUPTED; 4008 goto error0; 4009 } 4010 4011 error = xfs_btree_decrement(tcur, level, &i); 4012 if (error) 4013 goto error0; 4014 i = xfs_btree_firstrec(tcur, level); 4015 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { 4016 error = -EFSCORRUPTED; 4017 goto error0; 4018 } 4019 4020 /* Grab a pointer to the block. */ 4021 left = xfs_btree_get_block(tcur, level, &lbp); 4022 #ifdef DEBUG 4023 error = xfs_btree_check_block(cur, left, level, lbp); 4024 if (error) 4025 goto error0; 4026 #endif 4027 /* Grab the current block number, for future use. */ 4028 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); 4029 4030 /* 4031 * If left block is full enough so that removing one entry 4032 * won't make it too empty, and right-shifting an entry out 4033 * of left to us works, we're done. 4034 */ 4035 if (xfs_btree_get_numrecs(left) - 1 >= 4036 cur->bc_ops->get_minrecs(tcur, level)) { 4037 error = xfs_btree_rshift(tcur, level, &i); 4038 if (error) 4039 goto error0; 4040 if (i) { 4041 ASSERT(xfs_btree_get_numrecs(block) >= 4042 cur->bc_ops->get_minrecs(tcur, level)); 4043 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4044 tcur = NULL; 4045 if (level == 0) 4046 cur->bc_levels[0].ptr++; 4047 4048 *stat = 1; 4049 return 0; 4050 } 4051 } 4052 4053 /* 4054 * Otherwise, grab the number of records in right for 4055 * future reference. 4056 */ 4057 lrecs = xfs_btree_get_numrecs(left); 4058 } 4059 4060 /* Delete the temp cursor, we're done with it. */ 4061 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); 4062 tcur = NULL; 4063 4064 /* If here, we need to do a join to keep the tree balanced. */ 4065 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); 4066 4067 if (!xfs_btree_ptr_is_null(cur, &lptr) && 4068 lrecs + xfs_btree_get_numrecs(block) <= 4069 cur->bc_ops->get_maxrecs(cur, level)) { 4070 /* 4071 * Set "right" to be the starting block, 4072 * "left" to be the left neighbor. 4073 */ 4074 rptr = cptr; 4075 right = block; 4076 rbp = bp; 4077 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); 4078 if (error) 4079 goto error0; 4080 4081 /* 4082 * If that won't work, see if we can join with the right neighbor block. 4083 */ 4084 } else if (!xfs_btree_ptr_is_null(cur, &rptr) && 4085 rrecs + xfs_btree_get_numrecs(block) <= 4086 cur->bc_ops->get_maxrecs(cur, level)) { 4087 /* 4088 * Set "left" to be the starting block, 4089 * "right" to be the right neighbor. 4090 */ 4091 lptr = cptr; 4092 left = block; 4093 lbp = bp; 4094 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); 4095 if (error) 4096 goto error0; 4097 4098 /* 4099 * Otherwise, we can't fix the imbalance. 4100 * Just return. This is probably a logic error, but it's not fatal. 4101 */ 4102 } else { 4103 error = xfs_btree_dec_cursor(cur, level, stat); 4104 if (error) 4105 goto error0; 4106 return 0; 4107 } 4108 4109 rrecs = xfs_btree_get_numrecs(right); 4110 lrecs = xfs_btree_get_numrecs(left); 4111 4112 /* 4113 * We're now going to join "left" and "right" by moving all the stuff 4114 * in "right" to "left" and deleting "right". 4115 */ 4116 XFS_BTREE_STATS_ADD(cur, moves, rrecs); 4117 if (level > 0) { 4118 /* It's a non-leaf. Move keys and pointers. */ 4119 union xfs_btree_key *lkp; /* left btree key */ 4120 union xfs_btree_ptr *lpp; /* left address pointer */ 4121 union xfs_btree_key *rkp; /* right btree key */ 4122 union xfs_btree_ptr *rpp; /* right address pointer */ 4123 4124 lkp = xfs_btree_key_addr(cur, lrecs + 1, left); 4125 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); 4126 rkp = xfs_btree_key_addr(cur, 1, right); 4127 rpp = xfs_btree_ptr_addr(cur, 1, right); 4128 4129 for (i = 1; i < rrecs; i++) { 4130 error = xfs_btree_debug_check_ptr(cur, rpp, i, level); 4131 if (error) 4132 goto error0; 4133 } 4134 4135 xfs_btree_copy_keys(cur, lkp, rkp, rrecs); 4136 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); 4137 4138 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); 4139 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); 4140 } else { 4141 /* It's a leaf. Move records. */ 4142 union xfs_btree_rec *lrp; /* left record pointer */ 4143 union xfs_btree_rec *rrp; /* right record pointer */ 4144 4145 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); 4146 rrp = xfs_btree_rec_addr(cur, 1, right); 4147 4148 xfs_btree_copy_recs(cur, lrp, rrp, rrecs); 4149 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); 4150 } 4151 4152 XFS_BTREE_STATS_INC(cur, join); 4153 4154 /* 4155 * Fix up the number of records and right block pointer in the 4156 * surviving block, and log it. 4157 */ 4158 xfs_btree_set_numrecs(left, lrecs + rrecs); 4159 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); 4160 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4161 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); 4162 4163 /* If there is a right sibling, point it to the remaining block. */ 4164 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); 4165 if (!xfs_btree_ptr_is_null(cur, &cptr)) { 4166 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); 4167 if (error) 4168 goto error0; 4169 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); 4170 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); 4171 } 4172 4173 /* Free the deleted block. */ 4174 error = xfs_btree_free_block(cur, rbp); 4175 if (error) 4176 goto error0; 4177 4178 /* 4179 * If we joined with the left neighbor, set the buffer in the 4180 * cursor to the left block, and fix up the index. 4181 */ 4182 if (bp != lbp) { 4183 cur->bc_levels[level].bp = lbp; 4184 cur->bc_levels[level].ptr += lrecs; 4185 cur->bc_levels[level].ra = 0; 4186 } 4187 /* 4188 * If we joined with the right neighbor and there's a level above 4189 * us, increment the cursor at that level. 4190 */ 4191 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || 4192 (level + 1 < cur->bc_nlevels)) { 4193 error = xfs_btree_increment(cur, level + 1, &i); 4194 if (error) 4195 goto error0; 4196 } 4197 4198 /* 4199 * Readjust the ptr at this level if it's not a leaf, since it's 4200 * still pointing at the deletion point, which makes the cursor 4201 * inconsistent. If this makes the ptr 0, the caller fixes it up. 4202 * We can't use decrement because it would change the next level up. 4203 */ 4204 if (level > 0) 4205 cur->bc_levels[level].ptr--; 4206 4207 /* 4208 * We combined blocks, so we have to update the parent keys if the 4209 * btree supports overlapped intervals. However, 4210 * bc_levels[level + 1].ptr points to the old block so that the caller 4211 * knows which record to delete. Therefore, the caller must be savvy 4212 * enough to call updkeys for us if we return stat == 2. The other 4213 * exit points from this function don't require deletions further up 4214 * the tree, so they can call updkeys directly. 4215 */ 4216 4217 /* Return value means the next level up has something to do. */ 4218 *stat = 2; 4219 return 0; 4220 4221 error0: 4222 if (tcur) 4223 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 4224 return error; 4225 } 4226 4227 /* 4228 * Delete the record pointed to by cur. 4229 * The cursor refers to the place where the record was (could be inserted) 4230 * when the operation returns. 4231 */ 4232 int /* error */ 4233 xfs_btree_delete( 4234 struct xfs_btree_cur *cur, 4235 int *stat) /* success/failure */ 4236 { 4237 int error; /* error return value */ 4238 int level; 4239 int i; 4240 bool joined = false; 4241 4242 /* 4243 * Go up the tree, starting at leaf level. 4244 * 4245 * If 2 is returned then a join was done; go to the next level. 4246 * Otherwise we are done. 4247 */ 4248 for (level = 0, i = 2; i == 2; level++) { 4249 error = xfs_btree_delrec(cur, level, &i); 4250 if (error) 4251 goto error0; 4252 if (i == 2) 4253 joined = true; 4254 } 4255 4256 /* 4257 * If we combined blocks as part of deleting the record, delrec won't 4258 * have updated the parent high keys so we have to do that here. 4259 */ 4260 if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { 4261 error = xfs_btree_updkeys_force(cur, 0); 4262 if (error) 4263 goto error0; 4264 } 4265 4266 if (i == 0) { 4267 for (level = 1; level < cur->bc_nlevels; level++) { 4268 if (cur->bc_levels[level].ptr == 0) { 4269 error = xfs_btree_decrement(cur, level, &i); 4270 if (error) 4271 goto error0; 4272 break; 4273 } 4274 } 4275 } 4276 4277 *stat = i; 4278 return 0; 4279 error0: 4280 return error; 4281 } 4282 4283 /* 4284 * Get the data from the pointed-to record. 4285 */ 4286 int /* error */ 4287 xfs_btree_get_rec( 4288 struct xfs_btree_cur *cur, /* btree cursor */ 4289 union xfs_btree_rec **recp, /* output: btree record */ 4290 int *stat) /* output: success/failure */ 4291 { 4292 struct xfs_btree_block *block; /* btree block */ 4293 struct xfs_buf *bp; /* buffer pointer */ 4294 int ptr; /* record number */ 4295 #ifdef DEBUG 4296 int error; /* error return value */ 4297 #endif 4298 4299 ptr = cur->bc_levels[0].ptr; 4300 block = xfs_btree_get_block(cur, 0, &bp); 4301 4302 #ifdef DEBUG 4303 error = xfs_btree_check_block(cur, block, 0, bp); 4304 if (error) 4305 return error; 4306 #endif 4307 4308 /* 4309 * Off the right end or left end, return failure. 4310 */ 4311 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { 4312 *stat = 0; 4313 return 0; 4314 } 4315 4316 /* 4317 * Point to the record and extract its data. 4318 */ 4319 *recp = xfs_btree_rec_addr(cur, ptr, block); 4320 *stat = 1; 4321 return 0; 4322 } 4323 4324 /* Visit a block in a btree. */ 4325 STATIC int 4326 xfs_btree_visit_block( 4327 struct xfs_btree_cur *cur, 4328 int level, 4329 xfs_btree_visit_blocks_fn fn, 4330 void *data) 4331 { 4332 struct xfs_btree_block *block; 4333 struct xfs_buf *bp; 4334 union xfs_btree_ptr rptr; 4335 int error; 4336 4337 /* do right sibling readahead */ 4338 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); 4339 block = xfs_btree_get_block(cur, level, &bp); 4340 4341 /* process the block */ 4342 error = fn(cur, level, data); 4343 if (error) 4344 return error; 4345 4346 /* now read rh sibling block for next iteration */ 4347 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4348 if (xfs_btree_ptr_is_null(cur, &rptr)) 4349 return -ENOENT; 4350 4351 /* 4352 * We only visit blocks once in this walk, so we have to avoid the 4353 * internal xfs_btree_lookup_get_block() optimisation where it will 4354 * return the same block without checking if the right sibling points 4355 * back to us and creates a cyclic reference in the btree. 4356 */ 4357 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 4358 if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, 4359 xfs_buf_daddr(bp))) 4360 return -EFSCORRUPTED; 4361 } else { 4362 if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, 4363 xfs_buf_daddr(bp))) 4364 return -EFSCORRUPTED; 4365 } 4366 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4367 } 4368 4369 4370 /* Visit every block in a btree. */ 4371 int 4372 xfs_btree_visit_blocks( 4373 struct xfs_btree_cur *cur, 4374 xfs_btree_visit_blocks_fn fn, 4375 unsigned int flags, 4376 void *data) 4377 { 4378 union xfs_btree_ptr lptr; 4379 int level; 4380 struct xfs_btree_block *block = NULL; 4381 int error = 0; 4382 4383 cur->bc_ops->init_ptr_from_cur(cur, &lptr); 4384 4385 /* for each level */ 4386 for (level = cur->bc_nlevels - 1; level >= 0; level--) { 4387 /* grab the left hand block */ 4388 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); 4389 if (error) 4390 return error; 4391 4392 /* readahead the left most block for the next level down */ 4393 if (level > 0) { 4394 union xfs_btree_ptr *ptr; 4395 4396 ptr = xfs_btree_ptr_addr(cur, 1, block); 4397 xfs_btree_readahead_ptr(cur, ptr, 1); 4398 4399 /* save for the next iteration of the loop */ 4400 xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); 4401 4402 if (!(flags & XFS_BTREE_VISIT_LEAVES)) 4403 continue; 4404 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { 4405 continue; 4406 } 4407 4408 /* for each buffer in the level */ 4409 do { 4410 error = xfs_btree_visit_block(cur, level, fn, data); 4411 } while (!error); 4412 4413 if (error != -ENOENT) 4414 return error; 4415 } 4416 4417 return 0; 4418 } 4419 4420 /* 4421 * Change the owner of a btree. 4422 * 4423 * The mechanism we use here is ordered buffer logging. Because we don't know 4424 * how many buffers were are going to need to modify, we don't really want to 4425 * have to make transaction reservations for the worst case of every buffer in a 4426 * full size btree as that may be more space that we can fit in the log.... 4427 * 4428 * We do the btree walk in the most optimal manner possible - we have sibling 4429 * pointers so we can just walk all the blocks on each level from left to right 4430 * in a single pass, and then move to the next level and do the same. We can 4431 * also do readahead on the sibling pointers to get IO moving more quickly, 4432 * though for slow disks this is unlikely to make much difference to performance 4433 * as the amount of CPU work we have to do before moving to the next block is 4434 * relatively small. 4435 * 4436 * For each btree block that we load, modify the owner appropriately, set the 4437 * buffer as an ordered buffer and log it appropriately. We need to ensure that 4438 * we mark the region we change dirty so that if the buffer is relogged in 4439 * a subsequent transaction the changes we make here as an ordered buffer are 4440 * correctly relogged in that transaction. If we are in recovery context, then 4441 * just queue the modified buffer as delayed write buffer so the transaction 4442 * recovery completion writes the changes to disk. 4443 */ 4444 struct xfs_btree_block_change_owner_info { 4445 uint64_t new_owner; 4446 struct list_head *buffer_list; 4447 }; 4448 4449 static int 4450 xfs_btree_block_change_owner( 4451 struct xfs_btree_cur *cur, 4452 int level, 4453 void *data) 4454 { 4455 struct xfs_btree_block_change_owner_info *bbcoi = data; 4456 struct xfs_btree_block *block; 4457 struct xfs_buf *bp; 4458 4459 /* modify the owner */ 4460 block = xfs_btree_get_block(cur, level, &bp); 4461 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 4462 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) 4463 return 0; 4464 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); 4465 } else { 4466 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) 4467 return 0; 4468 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); 4469 } 4470 4471 /* 4472 * If the block is a root block hosted in an inode, we might not have a 4473 * buffer pointer here and we shouldn't attempt to log the change as the 4474 * information is already held in the inode and discarded when the root 4475 * block is formatted into the on-disk inode fork. We still change it, 4476 * though, so everything is consistent in memory. 4477 */ 4478 if (!bp) { 4479 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); 4480 ASSERT(level == cur->bc_nlevels - 1); 4481 return 0; 4482 } 4483 4484 if (cur->bc_tp) { 4485 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { 4486 xfs_btree_log_block(cur, bp, XFS_BB_OWNER); 4487 return -EAGAIN; 4488 } 4489 } else { 4490 xfs_buf_delwri_queue(bp, bbcoi->buffer_list); 4491 } 4492 4493 return 0; 4494 } 4495 4496 int 4497 xfs_btree_change_owner( 4498 struct xfs_btree_cur *cur, 4499 uint64_t new_owner, 4500 struct list_head *buffer_list) 4501 { 4502 struct xfs_btree_block_change_owner_info bbcoi; 4503 4504 bbcoi.new_owner = new_owner; 4505 bbcoi.buffer_list = buffer_list; 4506 4507 return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, 4508 XFS_BTREE_VISIT_ALL, &bbcoi); 4509 } 4510 4511 /* Verify the v5 fields of a long-format btree block. */ 4512 xfs_failaddr_t 4513 xfs_btree_lblock_v5hdr_verify( 4514 struct xfs_buf *bp, 4515 uint64_t owner) 4516 { 4517 struct xfs_mount *mp = bp->b_mount; 4518 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4519 4520 if (!xfs_has_crc(mp)) 4521 return __this_address; 4522 if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4523 return __this_address; 4524 if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4525 return __this_address; 4526 if (owner != XFS_RMAP_OWN_UNKNOWN && 4527 be64_to_cpu(block->bb_u.l.bb_owner) != owner) 4528 return __this_address; 4529 return NULL; 4530 } 4531 4532 /* Verify a long-format btree block. */ 4533 xfs_failaddr_t 4534 xfs_btree_lblock_verify( 4535 struct xfs_buf *bp, 4536 unsigned int max_recs) 4537 { 4538 struct xfs_mount *mp = bp->b_mount; 4539 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4540 xfs_fsblock_t fsb; 4541 xfs_failaddr_t fa; 4542 4543 /* numrecs verification */ 4544 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4545 return __this_address; 4546 4547 /* sibling pointer verification */ 4548 fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); 4549 fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, 4550 block->bb_u.l.bb_leftsib); 4551 if (!fa) 4552 fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, 4553 block->bb_u.l.bb_rightsib); 4554 return fa; 4555 } 4556 4557 /** 4558 * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format 4559 * btree block 4560 * 4561 * @bp: buffer containing the btree block 4562 */ 4563 xfs_failaddr_t 4564 xfs_btree_sblock_v5hdr_verify( 4565 struct xfs_buf *bp) 4566 { 4567 struct xfs_mount *mp = bp->b_mount; 4568 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4569 struct xfs_perag *pag = bp->b_pag; 4570 4571 if (!xfs_has_crc(mp)) 4572 return __this_address; 4573 if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) 4574 return __this_address; 4575 if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) 4576 return __this_address; 4577 if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) 4578 return __this_address; 4579 return NULL; 4580 } 4581 4582 /** 4583 * xfs_btree_sblock_verify() -- verify a short-format btree block 4584 * 4585 * @bp: buffer containing the btree block 4586 * @max_recs: maximum records allowed in this btree node 4587 */ 4588 xfs_failaddr_t 4589 xfs_btree_sblock_verify( 4590 struct xfs_buf *bp, 4591 unsigned int max_recs) 4592 { 4593 struct xfs_mount *mp = bp->b_mount; 4594 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 4595 xfs_agblock_t agbno; 4596 xfs_failaddr_t fa; 4597 4598 /* numrecs verification */ 4599 if (be16_to_cpu(block->bb_numrecs) > max_recs) 4600 return __this_address; 4601 4602 /* sibling pointer verification */ 4603 agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); 4604 fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, 4605 block->bb_u.s.bb_leftsib); 4606 if (!fa) 4607 fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, 4608 block->bb_u.s.bb_rightsib); 4609 return fa; 4610 } 4611 4612 /* 4613 * For the given limits on leaf and keyptr records per block, calculate the 4614 * height of the tree needed to index the number of leaf records. 4615 */ 4616 unsigned int 4617 xfs_btree_compute_maxlevels( 4618 const unsigned int *limits, 4619 unsigned long long records) 4620 { 4621 unsigned long long level_blocks = howmany_64(records, limits[0]); 4622 unsigned int height = 1; 4623 4624 while (level_blocks > 1) { 4625 level_blocks = howmany_64(level_blocks, limits[1]); 4626 height++; 4627 } 4628 4629 return height; 4630 } 4631 4632 /* 4633 * For the given limits on leaf and keyptr records per block, calculate the 4634 * number of blocks needed to index the given number of leaf records. 4635 */ 4636 unsigned long long 4637 xfs_btree_calc_size( 4638 const unsigned int *limits, 4639 unsigned long long records) 4640 { 4641 unsigned long long level_blocks = howmany_64(records, limits[0]); 4642 unsigned long long blocks = level_blocks; 4643 4644 while (level_blocks > 1) { 4645 level_blocks = howmany_64(level_blocks, limits[1]); 4646 blocks += level_blocks; 4647 } 4648 4649 return blocks; 4650 } 4651 4652 /* 4653 * Given a number of available blocks for the btree to consume with records and 4654 * pointers, calculate the height of the tree needed to index all the records 4655 * that space can hold based on the number of pointers each interior node 4656 * holds. 4657 * 4658 * We start by assuming a single level tree consumes a single block, then track 4659 * the number of blocks each node level consumes until we no longer have space 4660 * to store the next node level. At this point, we are indexing all the leaf 4661 * blocks in the space, and there's no more free space to split the tree any 4662 * further. That's our maximum btree height. 4663 */ 4664 unsigned int 4665 xfs_btree_space_to_height( 4666 const unsigned int *limits, 4667 unsigned long long leaf_blocks) 4668 { 4669 unsigned long long node_blocks = limits[1]; 4670 unsigned long long blocks_left = leaf_blocks - 1; 4671 unsigned int height = 1; 4672 4673 if (leaf_blocks < 1) 4674 return 0; 4675 4676 while (node_blocks < blocks_left) { 4677 blocks_left -= node_blocks; 4678 node_blocks *= limits[1]; 4679 height++; 4680 } 4681 4682 return height; 4683 } 4684 4685 /* 4686 * Query a regular btree for all records overlapping a given interval. 4687 * Start with a LE lookup of the key of low_rec and return all records 4688 * until we find a record with a key greater than the key of high_rec. 4689 */ 4690 STATIC int 4691 xfs_btree_simple_query_range( 4692 struct xfs_btree_cur *cur, 4693 const union xfs_btree_key *low_key, 4694 const union xfs_btree_key *high_key, 4695 xfs_btree_query_range_fn fn, 4696 void *priv) 4697 { 4698 union xfs_btree_rec *recp; 4699 union xfs_btree_key rec_key; 4700 int64_t diff; 4701 int stat; 4702 bool firstrec = true; 4703 int error; 4704 4705 ASSERT(cur->bc_ops->init_high_key_from_rec); 4706 ASSERT(cur->bc_ops->diff_two_keys); 4707 4708 /* 4709 * Find the leftmost record. The btree cursor must be set 4710 * to the low record used to generate low_key. 4711 */ 4712 stat = 0; 4713 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); 4714 if (error) 4715 goto out; 4716 4717 /* Nothing? See if there's anything to the right. */ 4718 if (!stat) { 4719 error = xfs_btree_increment(cur, 0, &stat); 4720 if (error) 4721 goto out; 4722 } 4723 4724 while (stat) { 4725 /* Find the record. */ 4726 error = xfs_btree_get_rec(cur, &recp, &stat); 4727 if (error || !stat) 4728 break; 4729 4730 /* Skip if high_key(rec) < low_key. */ 4731 if (firstrec) { 4732 cur->bc_ops->init_high_key_from_rec(&rec_key, recp); 4733 firstrec = false; 4734 diff = cur->bc_ops->diff_two_keys(cur, low_key, 4735 &rec_key); 4736 if (diff > 0) 4737 goto advloop; 4738 } 4739 4740 /* Stop if high_key < low_key(rec). */ 4741 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4742 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); 4743 if (diff > 0) 4744 break; 4745 4746 /* Callback */ 4747 error = fn(cur, recp, priv); 4748 if (error) 4749 break; 4750 4751 advloop: 4752 /* Move on to the next record. */ 4753 error = xfs_btree_increment(cur, 0, &stat); 4754 if (error) 4755 break; 4756 } 4757 4758 out: 4759 return error; 4760 } 4761 4762 /* 4763 * Query an overlapped interval btree for all records overlapping a given 4764 * interval. This function roughly follows the algorithm given in 4765 * "Interval Trees" of _Introduction to Algorithms_, which is section 4766 * 14.3 in the 2nd and 3rd editions. 4767 * 4768 * First, generate keys for the low and high records passed in. 4769 * 4770 * For any leaf node, generate the high and low keys for the record. 4771 * If the record keys overlap with the query low/high keys, pass the 4772 * record to the function iterator. 4773 * 4774 * For any internal node, compare the low and high keys of each 4775 * pointer against the query low/high keys. If there's an overlap, 4776 * follow the pointer. 4777 * 4778 * As an optimization, we stop scanning a block when we find a low key 4779 * that is greater than the query's high key. 4780 */ 4781 STATIC int 4782 xfs_btree_overlapped_query_range( 4783 struct xfs_btree_cur *cur, 4784 const union xfs_btree_key *low_key, 4785 const union xfs_btree_key *high_key, 4786 xfs_btree_query_range_fn fn, 4787 void *priv) 4788 { 4789 union xfs_btree_ptr ptr; 4790 union xfs_btree_ptr *pp; 4791 union xfs_btree_key rec_key; 4792 union xfs_btree_key rec_hkey; 4793 union xfs_btree_key *lkp; 4794 union xfs_btree_key *hkp; 4795 union xfs_btree_rec *recp; 4796 struct xfs_btree_block *block; 4797 int64_t ldiff; 4798 int64_t hdiff; 4799 int level; 4800 struct xfs_buf *bp; 4801 int i; 4802 int error; 4803 4804 /* Load the root of the btree. */ 4805 level = cur->bc_nlevels - 1; 4806 cur->bc_ops->init_ptr_from_cur(cur, &ptr); 4807 error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); 4808 if (error) 4809 return error; 4810 xfs_btree_get_block(cur, level, &bp); 4811 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4812 #ifdef DEBUG 4813 error = xfs_btree_check_block(cur, block, level, bp); 4814 if (error) 4815 goto out; 4816 #endif 4817 cur->bc_levels[level].ptr = 1; 4818 4819 while (level < cur->bc_nlevels) { 4820 block = xfs_btree_get_block(cur, level, &bp); 4821 4822 /* End of node, pop back towards the root. */ 4823 if (cur->bc_levels[level].ptr > 4824 be16_to_cpu(block->bb_numrecs)) { 4825 pop_up: 4826 if (level < cur->bc_nlevels - 1) 4827 cur->bc_levels[level + 1].ptr++; 4828 level++; 4829 continue; 4830 } 4831 4832 if (level == 0) { 4833 /* Handle a leaf node. */ 4834 recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, 4835 block); 4836 4837 cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); 4838 ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, 4839 low_key); 4840 4841 cur->bc_ops->init_key_from_rec(&rec_key, recp); 4842 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, 4843 &rec_key); 4844 4845 /* 4846 * If (record's high key >= query's low key) and 4847 * (query's high key >= record's low key), then 4848 * this record overlaps the query range; callback. 4849 */ 4850 if (ldiff >= 0 && hdiff >= 0) { 4851 error = fn(cur, recp, priv); 4852 if (error) 4853 break; 4854 } else if (hdiff < 0) { 4855 /* Record is larger than high key; pop. */ 4856 goto pop_up; 4857 } 4858 cur->bc_levels[level].ptr++; 4859 continue; 4860 } 4861 4862 /* Handle an internal node. */ 4863 lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); 4864 hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, 4865 block); 4866 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); 4867 4868 ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); 4869 hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); 4870 4871 /* 4872 * If (pointer's high key >= query's low key) and 4873 * (query's high key >= pointer's low key), then 4874 * this record overlaps the query range; follow pointer. 4875 */ 4876 if (ldiff >= 0 && hdiff >= 0) { 4877 level--; 4878 error = xfs_btree_lookup_get_block(cur, level, pp, 4879 &block); 4880 if (error) 4881 goto out; 4882 xfs_btree_get_block(cur, level, &bp); 4883 trace_xfs_btree_overlapped_query_range(cur, level, bp); 4884 #ifdef DEBUG 4885 error = xfs_btree_check_block(cur, block, level, bp); 4886 if (error) 4887 goto out; 4888 #endif 4889 cur->bc_levels[level].ptr = 1; 4890 continue; 4891 } else if (hdiff < 0) { 4892 /* The low key is larger than the upper range; pop. */ 4893 goto pop_up; 4894 } 4895 cur->bc_levels[level].ptr++; 4896 } 4897 4898 out: 4899 /* 4900 * If we don't end this function with the cursor pointing at a record 4901 * block, a subsequent non-error cursor deletion will not release 4902 * node-level buffers, causing a buffer leak. This is quite possible 4903 * with a zero-results range query, so release the buffers if we 4904 * failed to return any results. 4905 */ 4906 if (cur->bc_levels[0].bp == NULL) { 4907 for (i = 0; i < cur->bc_nlevels; i++) { 4908 if (cur->bc_levels[i].bp) { 4909 xfs_trans_brelse(cur->bc_tp, 4910 cur->bc_levels[i].bp); 4911 cur->bc_levels[i].bp = NULL; 4912 cur->bc_levels[i].ptr = 0; 4913 cur->bc_levels[i].ra = 0; 4914 } 4915 } 4916 } 4917 4918 return error; 4919 } 4920 4921 /* 4922 * Query a btree for all records overlapping a given interval of keys. The 4923 * supplied function will be called with each record found; return one of the 4924 * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error 4925 * code. This function returns -ECANCELED, zero, or a negative error code. 4926 */ 4927 int 4928 xfs_btree_query_range( 4929 struct xfs_btree_cur *cur, 4930 const union xfs_btree_irec *low_rec, 4931 const union xfs_btree_irec *high_rec, 4932 xfs_btree_query_range_fn fn, 4933 void *priv) 4934 { 4935 union xfs_btree_rec rec; 4936 union xfs_btree_key low_key; 4937 union xfs_btree_key high_key; 4938 4939 /* Find the keys of both ends of the interval. */ 4940 cur->bc_rec = *high_rec; 4941 cur->bc_ops->init_rec_from_cur(cur, &rec); 4942 cur->bc_ops->init_key_from_rec(&high_key, &rec); 4943 4944 cur->bc_rec = *low_rec; 4945 cur->bc_ops->init_rec_from_cur(cur, &rec); 4946 cur->bc_ops->init_key_from_rec(&low_key, &rec); 4947 4948 /* Enforce low key < high key. */ 4949 if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) 4950 return -EINVAL; 4951 4952 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) 4953 return xfs_btree_simple_query_range(cur, &low_key, 4954 &high_key, fn, priv); 4955 return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, 4956 fn, priv); 4957 } 4958 4959 /* Query a btree for all records. */ 4960 int 4961 xfs_btree_query_all( 4962 struct xfs_btree_cur *cur, 4963 xfs_btree_query_range_fn fn, 4964 void *priv) 4965 { 4966 union xfs_btree_key low_key; 4967 union xfs_btree_key high_key; 4968 4969 memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); 4970 memset(&low_key, 0, sizeof(low_key)); 4971 memset(&high_key, 0xFF, sizeof(high_key)); 4972 4973 return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); 4974 } 4975 4976 static int 4977 xfs_btree_count_blocks_helper( 4978 struct xfs_btree_cur *cur, 4979 int level, 4980 void *data) 4981 { 4982 xfs_extlen_t *blocks = data; 4983 (*blocks)++; 4984 4985 return 0; 4986 } 4987 4988 /* Count the blocks in a btree and return the result in *blocks. */ 4989 int 4990 xfs_btree_count_blocks( 4991 struct xfs_btree_cur *cur, 4992 xfs_extlen_t *blocks) 4993 { 4994 *blocks = 0; 4995 return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, 4996 XFS_BTREE_VISIT_ALL, blocks); 4997 } 4998 4999 /* Compare two btree pointers. */ 5000 int64_t 5001 xfs_btree_diff_two_ptrs( 5002 struct xfs_btree_cur *cur, 5003 const union xfs_btree_ptr *a, 5004 const union xfs_btree_ptr *b) 5005 { 5006 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 5007 return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); 5008 return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); 5009 } 5010 5011 /* If there's an extent, we're done. */ 5012 STATIC int 5013 xfs_btree_has_record_helper( 5014 struct xfs_btree_cur *cur, 5015 const union xfs_btree_rec *rec, 5016 void *priv) 5017 { 5018 return -ECANCELED; 5019 } 5020 5021 /* Is there a record covering a given range of keys? */ 5022 int 5023 xfs_btree_has_record( 5024 struct xfs_btree_cur *cur, 5025 const union xfs_btree_irec *low, 5026 const union xfs_btree_irec *high, 5027 bool *exists) 5028 { 5029 int error; 5030 5031 error = xfs_btree_query_range(cur, low, high, 5032 &xfs_btree_has_record_helper, NULL); 5033 if (error == -ECANCELED) { 5034 *exists = true; 5035 return 0; 5036 } 5037 *exists = false; 5038 return error; 5039 } 5040 5041 /* Are there more records in this btree? */ 5042 bool 5043 xfs_btree_has_more_records( 5044 struct xfs_btree_cur *cur) 5045 { 5046 struct xfs_btree_block *block; 5047 struct xfs_buf *bp; 5048 5049 block = xfs_btree_get_block(cur, 0, &bp); 5050 5051 /* There are still records in this block. */ 5052 if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) 5053 return true; 5054 5055 /* There are more record blocks. */ 5056 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 5057 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); 5058 else 5059 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); 5060 } 5061 5062 /* Set up all the btree cursor caches. */ 5063 int __init 5064 xfs_btree_init_cur_caches(void) 5065 { 5066 int error; 5067 5068 error = xfs_allocbt_init_cur_cache(); 5069 if (error) 5070 return error; 5071 error = xfs_inobt_init_cur_cache(); 5072 if (error) 5073 goto err; 5074 error = xfs_bmbt_init_cur_cache(); 5075 if (error) 5076 goto err; 5077 error = xfs_rmapbt_init_cur_cache(); 5078 if (error) 5079 goto err; 5080 error = xfs_refcountbt_init_cur_cache(); 5081 if (error) 5082 goto err; 5083 5084 return 0; 5085 err: 5086 xfs_btree_destroy_cur_caches(); 5087 return error; 5088 } 5089 5090 /* Destroy all the btree cursor caches, if they've been allocated. */ 5091 void 5092 xfs_btree_destroy_cur_caches(void) 5093 { 5094 xfs_allocbt_destroy_cur_cache(); 5095 xfs_inobt_destroy_cur_cache(); 5096 xfs_bmbt_destroy_cur_cache(); 5097 xfs_rmapbt_destroy_cur_cache(); 5098 xfs_refcountbt_destroy_cur_cache(); 5099 } 5100