1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1990, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Mike Olson. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #if defined(LIBC_SCCS) && !defined(lint) 36 static char sccsid[] = "@(#)bt_split.c 8.10 (Berkeley) 1/9/95"; 37 #endif /* LIBC_SCCS and not lint */ 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include <sys/param.h> 42 43 #include <limits.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 48 #include <db.h> 49 #include "btree.h" 50 51 static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *); 52 static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 53 static int bt_preserve(BTREE *, pgno_t); 54 static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t); 55 static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 56 static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *); 57 static recno_t rec_total(PAGE *); 58 59 #ifdef STATISTICS 60 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; 61 #endif 62 63 /* 64 * __BT_SPLIT -- Split the tree. 65 * 66 * Parameters: 67 * t: tree 68 * sp: page to split 69 * key: key to insert 70 * data: data to insert 71 * flags: BIGKEY/BIGDATA flags 72 * ilen: insert length 73 * skip: index to leave open 74 * 75 * Returns: 76 * RET_ERROR, RET_SUCCESS 77 */ 78 int 79 __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, 80 size_t ilen, u_int32_t argskip) 81 { 82 BINTERNAL *bi; 83 BLEAF *bl, *tbl; 84 DBT a, b; 85 EPGNO *parent; 86 PAGE *h, *l, *r, *lchild, *rchild; 87 indx_t nxtindex; 88 u_int16_t skip; 89 u_int32_t n, nbytes, nksize; 90 int parentsplit; 91 char *dest; 92 93 /* 94 * Split the page into two pages, l and r. The split routines return 95 * a pointer to the page into which the key should be inserted and with 96 * skip set to the offset which should be used. Additionally, l and r 97 * are pinned. 98 */ 99 skip = argskip; 100 h = sp->pgno == P_ROOT ? 101 bt_root(t, sp, &l, &r, &skip, ilen) : 102 bt_page(t, sp, &l, &r, &skip, ilen); 103 if (h == NULL) 104 return (RET_ERROR); 105 106 /* 107 * Insert the new key/data pair into the leaf page. (Key inserts 108 * always cause a leaf page to split first.) 109 */ 110 h->linp[skip] = h->upper -= ilen; 111 dest = (char *)h + h->upper; 112 if (F_ISSET(t, R_RECNO)) 113 WR_RLEAF(dest, data, flags) 114 else 115 WR_BLEAF(dest, key, data, flags) 116 117 /* If the root page was split, make it look right. */ 118 if (sp->pgno == P_ROOT && 119 (F_ISSET(t, R_RECNO) ? 120 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 121 goto err2; 122 123 /* 124 * Now we walk the parent page stack -- a LIFO stack of the pages that 125 * were traversed when we searched for the page that split. Each stack 126 * entry is a page number and a page index offset. The offset is for 127 * the page traversed on the search. We've just split a page, so we 128 * have to insert a new key into the parent page. 129 * 130 * If the insert into the parent page causes it to split, may have to 131 * continue splitting all the way up the tree. We stop if the root 132 * splits or the page inserted into didn't have to split to hold the 133 * new key. Some algorithms replace the key for the old page as well 134 * as the new page. We don't, as there's no reason to believe that the 135 * first key on the old page is any better than the key we have, and, 136 * in the case of a key being placed at index 0 causing the split, the 137 * key is unavailable. 138 * 139 * There are a maximum of 5 pages pinned at any time. We keep the left 140 * and right pages pinned while working on the parent. The 5 are the 141 * two children, left parent and right parent (when the parent splits) 142 * and the root page or the overflow key page when calling bt_preserve. 143 * This code must make sure that all pins are released other than the 144 * root page or overflow page which is unlocked elsewhere. 145 */ 146 while ((parent = BT_POP(t)) != NULL) { 147 lchild = l; 148 rchild = r; 149 150 /* Get the parent page. */ 151 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 152 goto err2; 153 154 /* 155 * The new key goes ONE AFTER the index, because the split 156 * was to the right. 157 */ 158 skip = parent->index + 1; 159 160 /* 161 * Calculate the space needed on the parent page. 162 * 163 * Prefix trees: space hack when inserting into BINTERNAL 164 * pages. Retain only what's needed to distinguish between 165 * the new entry and the LAST entry on the page to its left. 166 * If the keys compare equal, retain the entire key. Note, 167 * we don't touch overflow keys, and the entire key must be 168 * retained for the next-to-left most key on the leftmost 169 * page of each level, or the search will fail. Applicable 170 * ONLY to internal pages that have leaf pages as children. 171 * Further reduction of the key between pairs of internal 172 * pages loses too much information. 173 */ 174 switch (rchild->flags & P_TYPE) { 175 case P_BINTERNAL: 176 bi = GETBINTERNAL(rchild, 0); 177 nbytes = NBINTERNAL(bi->ksize); 178 break; 179 case P_BLEAF: 180 bl = GETBLEAF(rchild, 0); 181 nbytes = NBINTERNAL(bl->ksize); 182 if (t->bt_pfx && !(bl->flags & P_BIGKEY) && 183 (h->prevpg != P_INVALID || skip > 1)) { 184 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); 185 a.size = tbl->ksize; 186 a.data = tbl->bytes; 187 b.size = bl->ksize; 188 b.data = bl->bytes; 189 nksize = t->bt_pfx(&a, &b); 190 n = NBINTERNAL(nksize); 191 if (n < nbytes) { 192 #ifdef STATISTICS 193 bt_pfxsaved += nbytes - n; 194 #endif 195 nbytes = n; 196 } else 197 nksize = 0; 198 } else 199 nksize = 0; 200 break; 201 case P_RINTERNAL: 202 case P_RLEAF: 203 nbytes = NRINTERNAL; 204 break; 205 default: 206 abort(); 207 } 208 209 /* Split the parent page if necessary or shift the indices. */ 210 if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) { 211 sp = h; 212 h = h->pgno == P_ROOT ? 213 bt_root(t, h, &l, &r, &skip, nbytes) : 214 bt_page(t, h, &l, &r, &skip, nbytes); 215 if (h == NULL) 216 goto err1; 217 parentsplit = 1; 218 } else { 219 if (skip < (nxtindex = NEXTINDEX(h))) 220 memmove(h->linp + skip + 1, h->linp + skip, 221 (nxtindex - skip) * sizeof(indx_t)); 222 h->lower += sizeof(indx_t); 223 parentsplit = 0; 224 } 225 226 /* Insert the key into the parent page. */ 227 switch (rchild->flags & P_TYPE) { 228 case P_BINTERNAL: 229 h->linp[skip] = h->upper -= nbytes; 230 dest = (char *)h + h->linp[skip]; 231 memmove(dest, bi, nbytes); 232 ((BINTERNAL *)dest)->pgno = rchild->pgno; 233 break; 234 case P_BLEAF: 235 h->linp[skip] = h->upper -= nbytes; 236 dest = (char *)h + h->linp[skip]; 237 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, 238 rchild->pgno, bl->flags & P_BIGKEY); 239 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); 240 if (bl->flags & P_BIGKEY) { 241 pgno_t pgno; 242 memcpy(&pgno, bl->bytes, sizeof(pgno)); 243 if (bt_preserve(t, pgno) == RET_ERROR) 244 goto err1; 245 } 246 break; 247 case P_RINTERNAL: 248 /* 249 * Update the left page count. If split 250 * added at index 0, fix the correct page. 251 */ 252 if (skip > 0) 253 dest = (char *)h + h->linp[skip - 1]; 254 else 255 dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 256 ((RINTERNAL *)dest)->nrecs = rec_total(lchild); 257 ((RINTERNAL *)dest)->pgno = lchild->pgno; 258 259 /* Update the right page count. */ 260 h->linp[skip] = h->upper -= nbytes; 261 dest = (char *)h + h->linp[skip]; 262 ((RINTERNAL *)dest)->nrecs = rec_total(rchild); 263 ((RINTERNAL *)dest)->pgno = rchild->pgno; 264 break; 265 case P_RLEAF: 266 /* 267 * Update the left page count. If split 268 * added at index 0, fix the correct page. 269 */ 270 if (skip > 0) 271 dest = (char *)h + h->linp[skip - 1]; 272 else 273 dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 274 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); 275 ((RINTERNAL *)dest)->pgno = lchild->pgno; 276 277 /* Update the right page count. */ 278 h->linp[skip] = h->upper -= nbytes; 279 dest = (char *)h + h->linp[skip]; 280 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); 281 ((RINTERNAL *)dest)->pgno = rchild->pgno; 282 break; 283 default: 284 abort(); 285 } 286 287 /* Unpin the held pages. */ 288 if (!parentsplit) { 289 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 290 break; 291 } 292 293 /* If the root page was split, make it look right. */ 294 if (sp->pgno == P_ROOT && 295 (F_ISSET(t, R_RECNO) ? 296 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 297 goto err1; 298 299 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 300 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 301 } 302 303 /* Unpin the held pages. */ 304 mpool_put(t->bt_mp, l, MPOOL_DIRTY); 305 mpool_put(t->bt_mp, r, MPOOL_DIRTY); 306 307 /* Clear any pages left on the stack. */ 308 return (RET_SUCCESS); 309 310 /* 311 * If something fails in the above loop we were already walking back 312 * up the tree and the tree is now inconsistent. Nothing much we can 313 * do about it but release any memory we're holding. 314 */ 315 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 316 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 317 318 err2: mpool_put(t->bt_mp, l, 0); 319 mpool_put(t->bt_mp, r, 0); 320 __dbpanic(t->bt_dbp); 321 return (RET_ERROR); 322 } 323 324 /* 325 * BT_PAGE -- Split a non-root page of a btree. 326 * 327 * Parameters: 328 * t: tree 329 * h: root page 330 * lp: pointer to left page pointer 331 * rp: pointer to right page pointer 332 * skip: pointer to index to leave open 333 * ilen: insert length 334 * 335 * Returns: 336 * Pointer to page in which to insert or NULL on error. 337 */ 338 static PAGE * 339 bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 340 { 341 PAGE *l, *r, *tp; 342 pgno_t npg; 343 344 #ifdef STATISTICS 345 ++bt_split; 346 #endif 347 /* Put the new right page for the split into place. */ 348 if ((r = __bt_new(t, &npg)) == NULL) 349 return (NULL); 350 r->pgno = npg; 351 r->lower = BTDATAOFF; 352 r->upper = t->bt_psize; 353 r->nextpg = h->nextpg; 354 r->prevpg = h->pgno; 355 r->flags = h->flags & P_TYPE; 356 357 /* 358 * If we're splitting the last page on a level because we're appending 359 * a key to it (skip is NEXTINDEX()), it's likely that the data is 360 * sorted. Adding an empty page on the side of the level is less work 361 * and can push the fill factor much higher than normal. If we're 362 * wrong it's no big deal, we'll just do the split the right way next 363 * time. It may look like it's equally easy to do a similar hack for 364 * reverse sorted data, that is, split the tree left, but it's not. 365 * Don't even try. 366 */ 367 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { 368 #ifdef STATISTICS 369 ++bt_sortsplit; 370 #endif 371 h->nextpg = r->pgno; 372 r->lower = BTDATAOFF + sizeof(indx_t); 373 *skip = 0; 374 *lp = h; 375 *rp = r; 376 return (r); 377 } 378 379 /* Put the new left page for the split into place. */ 380 if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) { 381 mpool_put(t->bt_mp, r, 0); 382 return (NULL); 383 } 384 l->pgno = h->pgno; 385 l->nextpg = r->pgno; 386 l->prevpg = h->prevpg; 387 l->lower = BTDATAOFF; 388 l->upper = t->bt_psize; 389 l->flags = h->flags & P_TYPE; 390 391 /* Fix up the previous pointer of the page after the split page. */ 392 if (h->nextpg != P_INVALID) { 393 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { 394 free(l); 395 /* XXX mpool_free(t->bt_mp, r->pgno); */ 396 return (NULL); 397 } 398 tp->prevpg = r->pgno; 399 mpool_put(t->bt_mp, tp, MPOOL_DIRTY); 400 } 401 402 /* 403 * Split right. The key/data pairs aren't sorted in the btree page so 404 * it's simpler to copy the data from the split page onto two new pages 405 * instead of copying half the data to the right page and compacting 406 * the left page in place. Since the left page can't change, we have 407 * to swap the original and the allocated left page after the split. 408 */ 409 tp = bt_psplit(t, h, l, r, skip, ilen); 410 411 /* Move the new left page onto the old left page. */ 412 memmove(h, l, t->bt_psize); 413 if (tp == l) 414 tp = h; 415 free(l); 416 417 *lp = h; 418 *rp = r; 419 return (tp); 420 } 421 422 /* 423 * BT_ROOT -- Split the root page of a btree. 424 * 425 * Parameters: 426 * t: tree 427 * h: root page 428 * lp: pointer to left page pointer 429 * rp: pointer to right page pointer 430 * skip: pointer to index to leave open 431 * ilen: insert length 432 * 433 * Returns: 434 * Pointer to page in which to insert or NULL on error. 435 */ 436 static PAGE * 437 bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 438 { 439 PAGE *l, *r, *tp; 440 pgno_t lnpg, rnpg; 441 442 #ifdef STATISTICS 443 ++bt_split; 444 ++bt_rootsplit; 445 #endif 446 /* Put the new left and right pages for the split into place. */ 447 if ((l = __bt_new(t, &lnpg)) == NULL || 448 (r = __bt_new(t, &rnpg)) == NULL) 449 return (NULL); 450 l->pgno = lnpg; 451 r->pgno = rnpg; 452 l->nextpg = r->pgno; 453 r->prevpg = l->pgno; 454 l->prevpg = r->nextpg = P_INVALID; 455 l->lower = r->lower = BTDATAOFF; 456 l->upper = r->upper = t->bt_psize; 457 l->flags = r->flags = h->flags & P_TYPE; 458 459 /* Split the root page. */ 460 tp = bt_psplit(t, h, l, r, skip, ilen); 461 462 *lp = l; 463 *rp = r; 464 return (tp); 465 } 466 467 /* 468 * BT_RROOT -- Fix up the recno root page after it has been split. 469 * 470 * Parameters: 471 * t: tree 472 * h: root page 473 * l: left page 474 * r: right page 475 * 476 * Returns: 477 * RET_ERROR, RET_SUCCESS 478 */ 479 static int 480 bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 481 { 482 char *dest; 483 484 /* Insert the left and right keys, set the header information. */ 485 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; 486 dest = (char *)h + h->upper; 487 WR_RINTERNAL(dest, 488 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); 489 490 __PAST_END(h->linp, 1) = h->upper -= NRINTERNAL; 491 dest = (char *)h + h->upper; 492 WR_RINTERNAL(dest, 493 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); 494 495 h->lower = BTDATAOFF + 2 * sizeof(indx_t); 496 497 /* Unpin the root page, set to recno internal page. */ 498 h->flags &= ~P_TYPE; 499 h->flags |= P_RINTERNAL; 500 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 501 502 return (RET_SUCCESS); 503 } 504 505 /* 506 * BT_BROOT -- Fix up the btree root page after it has been split. 507 * 508 * Parameters: 509 * t: tree 510 * h: root page 511 * l: left page 512 * r: right page 513 * 514 * Returns: 515 * RET_ERROR, RET_SUCCESS 516 */ 517 static int 518 bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 519 { 520 BINTERNAL *bi; 521 BLEAF *bl; 522 u_int32_t nbytes; 523 char *dest; 524 525 /* 526 * If the root page was a leaf page, change it into an internal page. 527 * We copy the key we split on (but not the key's data, in the case of 528 * a leaf page) to the new root page. 529 * 530 * The btree comparison code guarantees that the left-most key on any 531 * level of the tree is never used, so it doesn't need to be filled in. 532 */ 533 nbytes = NBINTERNAL(0); 534 h->linp[0] = h->upper = t->bt_psize - nbytes; 535 dest = (char *)h + h->upper; 536 WR_BINTERNAL(dest, 0, l->pgno, 0); 537 538 switch (h->flags & P_TYPE) { 539 case P_BLEAF: 540 bl = GETBLEAF(r, 0); 541 nbytes = NBINTERNAL(bl->ksize); 542 __PAST_END(h->linp, 1) = h->upper -= nbytes; 543 dest = (char *)h + h->upper; 544 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); 545 memmove(dest, bl->bytes, bl->ksize); 546 547 /* 548 * If the key is on an overflow page, mark the overflow chain 549 * so it isn't deleted when the leaf copy of the key is deleted. 550 */ 551 if (bl->flags & P_BIGKEY) { 552 pgno_t pgno; 553 memcpy(&pgno, bl->bytes, sizeof(pgno)); 554 if (bt_preserve(t, pgno) == RET_ERROR) 555 return (RET_ERROR); 556 } 557 break; 558 case P_BINTERNAL: 559 bi = GETBINTERNAL(r, 0); 560 nbytes = NBINTERNAL(bi->ksize); 561 __PAST_END(h->linp, 1) = h->upper -= nbytes; 562 dest = (char *)h + h->upper; 563 memmove(dest, bi, nbytes); 564 ((BINTERNAL *)dest)->pgno = r->pgno; 565 break; 566 default: 567 abort(); 568 } 569 570 /* There are two keys on the page. */ 571 h->lower = BTDATAOFF + 2 * sizeof(indx_t); 572 573 /* Unpin the root page, set to btree internal page. */ 574 h->flags &= ~P_TYPE; 575 h->flags |= P_BINTERNAL; 576 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 577 578 return (RET_SUCCESS); 579 } 580 581 /* 582 * BT_PSPLIT -- Do the real work of splitting the page. 583 * 584 * Parameters: 585 * t: tree 586 * h: page to be split 587 * l: page to put lower half of data 588 * r: page to put upper half of data 589 * pskip: pointer to index to leave open 590 * ilen: insert length 591 * 592 * Returns: 593 * Pointer to page in which to insert. 594 */ 595 static PAGE * 596 bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) 597 { 598 BINTERNAL *bi; 599 BLEAF *bl; 600 CURSOR *c; 601 RLEAF *rl; 602 PAGE *rval; 603 void *src; 604 indx_t full, half, nxt, off, skip, top, used; 605 u_int32_t nbytes; 606 int bigkeycnt, isbigkey; 607 608 /* 609 * Split the data to the left and right pages. Leave the skip index 610 * open. Additionally, make some effort not to split on an overflow 611 * key. This makes internal page processing faster and can save 612 * space as overflow keys used by internal pages are never deleted. 613 */ 614 bigkeycnt = 0; 615 skip = *pskip; 616 full = t->bt_psize - BTDATAOFF; 617 half = full / 2; 618 used = 0; 619 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { 620 if (skip == off) { 621 nbytes = ilen; 622 isbigkey = 0; /* XXX: not really known. */ 623 } else 624 switch (h->flags & P_TYPE) { 625 case P_BINTERNAL: 626 src = bi = GETBINTERNAL(h, nxt); 627 nbytes = NBINTERNAL(bi->ksize); 628 isbigkey = bi->flags & P_BIGKEY; 629 break; 630 case P_BLEAF: 631 src = bl = GETBLEAF(h, nxt); 632 nbytes = NBLEAF(bl); 633 isbigkey = bl->flags & P_BIGKEY; 634 break; 635 case P_RINTERNAL: 636 src = GETRINTERNAL(h, nxt); 637 nbytes = NRINTERNAL; 638 isbigkey = 0; 639 break; 640 case P_RLEAF: 641 src = rl = GETRLEAF(h, nxt); 642 nbytes = NRLEAF(rl); 643 isbigkey = 0; 644 break; 645 default: 646 abort(); 647 } 648 649 /* 650 * If the key/data pairs are substantial fractions of the max 651 * possible size for the page, it's possible to get situations 652 * where we decide to try and copy too much onto the left page. 653 * Make sure that doesn't happen. 654 */ 655 if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || 656 nxt == top - 1) { 657 --off; 658 break; 659 } 660 661 /* Copy the key/data pair, if not the skipped index. */ 662 if (skip != off) { 663 ++nxt; 664 665 l->linp[off] = l->upper -= nbytes; 666 memmove((char *)l + l->upper, src, nbytes); 667 } 668 669 used += nbytes + sizeof(indx_t); 670 if (used >= half) { 671 if (!isbigkey || bigkeycnt == 3) 672 break; 673 else 674 ++bigkeycnt; 675 } 676 } 677 678 /* 679 * Off is the last offset that's valid for the left page. 680 * Nxt is the first offset to be placed on the right page. 681 */ 682 l->lower += (off + 1) * sizeof(indx_t); 683 684 /* 685 * If splitting the page that the cursor was on, the cursor has to be 686 * adjusted to point to the same record as before the split. If the 687 * cursor is at or past the skipped slot, the cursor is incremented by 688 * one. If the cursor is on the right page, it is decremented by the 689 * number of records split to the left page. 690 */ 691 c = &t->bt_cursor; 692 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { 693 if (c->pg.index >= skip) 694 ++c->pg.index; 695 if (c->pg.index < nxt) /* Left page. */ 696 c->pg.pgno = l->pgno; 697 else { /* Right page. */ 698 c->pg.pgno = r->pgno; 699 c->pg.index -= nxt; 700 } 701 } 702 703 /* 704 * If the skipped index was on the left page, just return that page. 705 * Otherwise, adjust the skip index to reflect the new position on 706 * the right page. 707 */ 708 if (skip <= off) { 709 skip = MAX_PAGE_OFFSET; 710 rval = l; 711 } else { 712 rval = r; 713 *pskip -= nxt; 714 } 715 716 for (off = 0; nxt < top; ++off) { 717 if (skip == nxt) { 718 ++off; 719 skip = MAX_PAGE_OFFSET; 720 } 721 switch (h->flags & P_TYPE) { 722 case P_BINTERNAL: 723 src = bi = GETBINTERNAL(h, nxt); 724 nbytes = NBINTERNAL(bi->ksize); 725 break; 726 case P_BLEAF: 727 src = bl = GETBLEAF(h, nxt); 728 nbytes = NBLEAF(bl); 729 break; 730 case P_RINTERNAL: 731 src = GETRINTERNAL(h, nxt); 732 nbytes = NRINTERNAL; 733 break; 734 case P_RLEAF: 735 src = rl = GETRLEAF(h, nxt); 736 nbytes = NRLEAF(rl); 737 break; 738 default: 739 abort(); 740 } 741 ++nxt; 742 r->linp[off] = r->upper -= nbytes; 743 memmove((char *)r + r->upper, src, nbytes); 744 } 745 r->lower += off * sizeof(indx_t); 746 747 /* If the key is being appended to the page, adjust the index. */ 748 if (skip == top) 749 r->lower += sizeof(indx_t); 750 751 return (rval); 752 } 753 754 /* 755 * BT_PRESERVE -- Mark a chain of pages as used by an internal node. 756 * 757 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the 758 * record that references them gets deleted. Chains pointed to by internal 759 * pages never get deleted. This routine marks a chain as pointed to by an 760 * internal page. 761 * 762 * Parameters: 763 * t: tree 764 * pg: page number of first page in the chain. 765 * 766 * Returns: 767 * RET_SUCCESS, RET_ERROR. 768 */ 769 static int 770 bt_preserve(BTREE *t, pgno_t pg) 771 { 772 PAGE *h; 773 774 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 775 return (RET_ERROR); 776 h->flags |= P_PRESERVE; 777 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 778 return (RET_SUCCESS); 779 } 780 781 /* 782 * REC_TOTAL -- Return the number of recno entries below a page. 783 * 784 * Parameters: 785 * h: page 786 * 787 * Returns: 788 * The number of recno entries below a page. 789 * 790 * XXX 791 * These values could be set by the bt_psplit routine. The problem is that the 792 * entry has to be popped off of the stack etc. or the values have to be passed 793 * all the way back to bt_split/bt_rroot and it's not very clean. 794 */ 795 static recno_t 796 rec_total(PAGE *h) 797 { 798 recno_t recs; 799 indx_t nxt, top; 800 801 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) 802 recs += GETRINTERNAL(h, nxt)->nrecs; 803 return (recs); 804 } 805