1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1990, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/param.h>
36
37 #include <limits.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41
42 #include <db.h>
43 #include "btree.h"
44
45 static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
46 static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
47 static int bt_preserve(BTREE *, pgno_t);
48 static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
49 static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
50 static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
51 static recno_t rec_total(PAGE *);
52
53 #ifdef STATISTICS
54 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
55 #endif
56
57 /*
58 * __BT_SPLIT -- Split the tree.
59 *
60 * Parameters:
61 * t: tree
62 * sp: page to split
63 * key: key to insert
64 * data: data to insert
65 * flags: BIGKEY/BIGDATA flags
66 * ilen: insert length
67 * skip: index to leave open
68 *
69 * Returns:
70 * RET_ERROR, RET_SUCCESS
71 */
72 int
__bt_split(BTREE * t,PAGE * sp,const DBT * key,const DBT * data,int flags,size_t ilen,u_int32_t argskip)73 __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
74 size_t ilen, u_int32_t argskip)
75 {
76 BINTERNAL *bi;
77 BLEAF *bl, *tbl;
78 DBT a, b;
79 EPGNO *parent;
80 PAGE *h, *l, *r, *lchild, *rchild;
81 indx_t nxtindex;
82 u_int16_t skip;
83 u_int32_t n, nbytes, nksize;
84 int parentsplit;
85 char *dest;
86
87 /*
88 * Split the page into two pages, l and r. The split routines return
89 * a pointer to the page into which the key should be inserted and with
90 * skip set to the offset which should be used. Additionally, l and r
91 * are pinned.
92 */
93 skip = argskip;
94 h = sp->pgno == P_ROOT ?
95 bt_root(t, sp, &l, &r, &skip, ilen) :
96 bt_page(t, sp, &l, &r, &skip, ilen);
97 if (h == NULL)
98 return (RET_ERROR);
99
100 /*
101 * Insert the new key/data pair into the leaf page. (Key inserts
102 * always cause a leaf page to split first.)
103 */
104 h->linp[skip] = h->upper -= ilen;
105 dest = (char *)h + h->upper;
106 if (F_ISSET(t, R_RECNO))
107 WR_RLEAF(dest, data, flags)
108 else
109 WR_BLEAF(dest, key, data, flags)
110
111 /* If the root page was split, make it look right. */
112 if (sp->pgno == P_ROOT &&
113 (F_ISSET(t, R_RECNO) ?
114 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
115 goto err2;
116
117 /*
118 * Now we walk the parent page stack -- a LIFO stack of the pages that
119 * were traversed when we searched for the page that split. Each stack
120 * entry is a page number and a page index offset. The offset is for
121 * the page traversed on the search. We've just split a page, so we
122 * have to insert a new key into the parent page.
123 *
124 * If the insert into the parent page causes it to split, may have to
125 * continue splitting all the way up the tree. We stop if the root
126 * splits or the page inserted into didn't have to split to hold the
127 * new key. Some algorithms replace the key for the old page as well
128 * as the new page. We don't, as there's no reason to believe that the
129 * first key on the old page is any better than the key we have, and,
130 * in the case of a key being placed at index 0 causing the split, the
131 * key is unavailable.
132 *
133 * There are a maximum of 5 pages pinned at any time. We keep the left
134 * and right pages pinned while working on the parent. The 5 are the
135 * two children, left parent and right parent (when the parent splits)
136 * and the root page or the overflow key page when calling bt_preserve.
137 * This code must make sure that all pins are released other than the
138 * root page or overflow page which is unlocked elsewhere.
139 */
140 while ((parent = BT_POP(t)) != NULL) {
141 lchild = l;
142 rchild = r;
143
144 /* Get the parent page. */
145 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
146 goto err2;
147
148 /*
149 * The new key goes ONE AFTER the index, because the split
150 * was to the right.
151 */
152 skip = parent->index + 1;
153
154 /*
155 * Calculate the space needed on the parent page.
156 *
157 * Prefix trees: space hack when inserting into BINTERNAL
158 * pages. Retain only what's needed to distinguish between
159 * the new entry and the LAST entry on the page to its left.
160 * If the keys compare equal, retain the entire key. Note,
161 * we don't touch overflow keys, and the entire key must be
162 * retained for the next-to-left most key on the leftmost
163 * page of each level, or the search will fail. Applicable
164 * ONLY to internal pages that have leaf pages as children.
165 * Further reduction of the key between pairs of internal
166 * pages loses too much information.
167 */
168 switch (rchild->flags & P_TYPE) {
169 case P_BINTERNAL:
170 bi = GETBINTERNAL(rchild, 0);
171 nbytes = NBINTERNAL(bi->ksize);
172 break;
173 case P_BLEAF:
174 bl = GETBLEAF(rchild, 0);
175 nbytes = NBINTERNAL(bl->ksize);
176 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
177 (h->prevpg != P_INVALID || skip > 1)) {
178 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
179 a.size = tbl->ksize;
180 a.data = tbl->bytes;
181 b.size = bl->ksize;
182 b.data = bl->bytes;
183 nksize = t->bt_pfx(&a, &b);
184 n = NBINTERNAL(nksize);
185 if (n < nbytes) {
186 #ifdef STATISTICS
187 bt_pfxsaved += nbytes - n;
188 #endif
189 nbytes = n;
190 } else
191 nksize = 0;
192 } else
193 nksize = 0;
194 break;
195 case P_RINTERNAL:
196 case P_RLEAF:
197 nbytes = NRINTERNAL;
198 break;
199 default:
200 abort();
201 }
202
203 /* Split the parent page if necessary or shift the indices. */
204 if ((u_int32_t)(h->upper - h->lower) < nbytes + sizeof(indx_t)) {
205 sp = h;
206 h = h->pgno == P_ROOT ?
207 bt_root(t, h, &l, &r, &skip, nbytes) :
208 bt_page(t, h, &l, &r, &skip, nbytes);
209 if (h == NULL)
210 goto err1;
211 parentsplit = 1;
212 } else {
213 if (skip < (nxtindex = NEXTINDEX(h)))
214 memmove(h->linp + skip + 1, h->linp + skip,
215 (nxtindex - skip) * sizeof(indx_t));
216 h->lower += sizeof(indx_t);
217 parentsplit = 0;
218 }
219
220 /* Insert the key into the parent page. */
221 switch (rchild->flags & P_TYPE) {
222 case P_BINTERNAL:
223 h->linp[skip] = h->upper -= nbytes;
224 dest = (char *)h + h->linp[skip];
225 memmove(dest, bi, nbytes);
226 ((BINTERNAL *)dest)->pgno = rchild->pgno;
227 break;
228 case P_BLEAF:
229 h->linp[skip] = h->upper -= nbytes;
230 dest = (char *)h + h->linp[skip];
231 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
232 rchild->pgno, bl->flags & P_BIGKEY);
233 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
234 if (bl->flags & P_BIGKEY) {
235 pgno_t pgno;
236 memcpy(&pgno, bl->bytes, sizeof(pgno));
237 if (bt_preserve(t, pgno) == RET_ERROR)
238 goto err1;
239 }
240 break;
241 case P_RINTERNAL:
242 /*
243 * Update the left page count. If split
244 * added at index 0, fix the correct page.
245 */
246 if (skip > 0)
247 dest = (char *)h + h->linp[skip - 1];
248 else
249 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
250 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
251 ((RINTERNAL *)dest)->pgno = lchild->pgno;
252
253 /* Update the right page count. */
254 h->linp[skip] = h->upper -= nbytes;
255 dest = (char *)h + h->linp[skip];
256 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
257 ((RINTERNAL *)dest)->pgno = rchild->pgno;
258 break;
259 case P_RLEAF:
260 /*
261 * Update the left page count. If split
262 * added at index 0, fix the correct page.
263 */
264 if (skip > 0)
265 dest = (char *)h + h->linp[skip - 1];
266 else
267 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
268 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
269 ((RINTERNAL *)dest)->pgno = lchild->pgno;
270
271 /* Update the right page count. */
272 h->linp[skip] = h->upper -= nbytes;
273 dest = (char *)h + h->linp[skip];
274 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
275 ((RINTERNAL *)dest)->pgno = rchild->pgno;
276 break;
277 default:
278 abort();
279 }
280
281 /* Unpin the held pages. */
282 if (!parentsplit) {
283 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
284 break;
285 }
286
287 /* If the root page was split, make it look right. */
288 if (sp->pgno == P_ROOT &&
289 (F_ISSET(t, R_RECNO) ?
290 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
291 goto err1;
292
293 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
294 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
295 }
296
297 /* Unpin the held pages. */
298 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
299 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
300
301 /* Clear any pages left on the stack. */
302 return (RET_SUCCESS);
303
304 /*
305 * If something fails in the above loop we were already walking back
306 * up the tree and the tree is now inconsistent. Nothing much we can
307 * do about it but release any memory we're holding.
308 */
309 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
310 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
311
312 err2: mpool_put(t->bt_mp, l, 0);
313 mpool_put(t->bt_mp, r, 0);
314 __dbpanic(t->bt_dbp);
315 return (RET_ERROR);
316 }
317
318 /*
319 * BT_PAGE -- Split a non-root page of a btree.
320 *
321 * Parameters:
322 * t: tree
323 * h: root page
324 * lp: pointer to left page pointer
325 * rp: pointer to right page pointer
326 * skip: pointer to index to leave open
327 * ilen: insert length
328 *
329 * Returns:
330 * Pointer to page in which to insert or NULL on error.
331 */
332 static PAGE *
bt_page(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)333 bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
334 {
335 PAGE *l, *r, *tp;
336 pgno_t npg;
337
338 #ifdef STATISTICS
339 ++bt_split;
340 #endif
341 /* Put the new right page for the split into place. */
342 if ((r = __bt_new(t, &npg)) == NULL)
343 return (NULL);
344 r->pgno = npg;
345 r->lower = BTDATAOFF;
346 r->upper = t->bt_psize;
347 r->nextpg = h->nextpg;
348 r->prevpg = h->pgno;
349 r->flags = h->flags & P_TYPE;
350
351 /*
352 * If we're splitting the last page on a level because we're appending
353 * a key to it (skip is NEXTINDEX()), it's likely that the data is
354 * sorted. Adding an empty page on the side of the level is less work
355 * and can push the fill factor much higher than normal. If we're
356 * wrong it's no big deal, we'll just do the split the right way next
357 * time. It may look like it's equally easy to do a similar hack for
358 * reverse sorted data, that is, split the tree left, but it's not.
359 * Don't even try.
360 */
361 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
362 #ifdef STATISTICS
363 ++bt_sortsplit;
364 #endif
365 h->nextpg = r->pgno;
366 r->lower = BTDATAOFF + sizeof(indx_t);
367 *skip = 0;
368 *lp = h;
369 *rp = r;
370 return (r);
371 }
372
373 /* Put the new left page for the split into place. */
374 if ((l = (PAGE *)calloc(1, t->bt_psize)) == NULL) {
375 mpool_put(t->bt_mp, r, 0);
376 return (NULL);
377 }
378 l->pgno = h->pgno;
379 l->nextpg = r->pgno;
380 l->prevpg = h->prevpg;
381 l->lower = BTDATAOFF;
382 l->upper = t->bt_psize;
383 l->flags = h->flags & P_TYPE;
384
385 /* Fix up the previous pointer of the page after the split page. */
386 if (h->nextpg != P_INVALID) {
387 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
388 free(l);
389 /* XXX mpool_free(t->bt_mp, r->pgno); */
390 return (NULL);
391 }
392 tp->prevpg = r->pgno;
393 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
394 }
395
396 /*
397 * Split right. The key/data pairs aren't sorted in the btree page so
398 * it's simpler to copy the data from the split page onto two new pages
399 * instead of copying half the data to the right page and compacting
400 * the left page in place. Since the left page can't change, we have
401 * to swap the original and the allocated left page after the split.
402 */
403 tp = bt_psplit(t, h, l, r, skip, ilen);
404
405 /* Move the new left page onto the old left page. */
406 memmove(h, l, t->bt_psize);
407 if (tp == l)
408 tp = h;
409 free(l);
410
411 *lp = h;
412 *rp = r;
413 return (tp);
414 }
415
416 /*
417 * BT_ROOT -- Split the root page of a btree.
418 *
419 * Parameters:
420 * t: tree
421 * h: root page
422 * lp: pointer to left page pointer
423 * rp: pointer to right page pointer
424 * skip: pointer to index to leave open
425 * ilen: insert length
426 *
427 * Returns:
428 * Pointer to page in which to insert or NULL on error.
429 */
430 static PAGE *
bt_root(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)431 bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
432 {
433 PAGE *l, *r, *tp;
434 pgno_t lnpg, rnpg;
435
436 #ifdef STATISTICS
437 ++bt_split;
438 ++bt_rootsplit;
439 #endif
440 /* Put the new left and right pages for the split into place. */
441 if ((l = __bt_new(t, &lnpg)) == NULL ||
442 (r = __bt_new(t, &rnpg)) == NULL)
443 return (NULL);
444 l->pgno = lnpg;
445 r->pgno = rnpg;
446 l->nextpg = r->pgno;
447 r->prevpg = l->pgno;
448 l->prevpg = r->nextpg = P_INVALID;
449 l->lower = r->lower = BTDATAOFF;
450 l->upper = r->upper = t->bt_psize;
451 l->flags = r->flags = h->flags & P_TYPE;
452
453 /* Split the root page. */
454 tp = bt_psplit(t, h, l, r, skip, ilen);
455
456 *lp = l;
457 *rp = r;
458 return (tp);
459 }
460
461 /*
462 * BT_RROOT -- Fix up the recno root page after it has been split.
463 *
464 * Parameters:
465 * t: tree
466 * h: root page
467 * l: left page
468 * r: right page
469 *
470 * Returns:
471 * RET_ERROR, RET_SUCCESS
472 */
473 static int
bt_rroot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)474 bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
475 {
476 char *dest;
477
478 /* Insert the left and right keys, set the header information. */
479 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
480 dest = (char *)h + h->upper;
481 WR_RINTERNAL(dest,
482 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
483
484 __PAST_END(h->linp, 1) = h->upper -= NRINTERNAL;
485 dest = (char *)h + h->upper;
486 WR_RINTERNAL(dest,
487 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
488
489 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
490
491 /* Unpin the root page, set to recno internal page. */
492 h->flags &= ~P_TYPE;
493 h->flags |= P_RINTERNAL;
494 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
495
496 return (RET_SUCCESS);
497 }
498
499 /*
500 * BT_BROOT -- Fix up the btree root page after it has been split.
501 *
502 * Parameters:
503 * t: tree
504 * h: root page
505 * l: left page
506 * r: right page
507 *
508 * Returns:
509 * RET_ERROR, RET_SUCCESS
510 */
511 static int
bt_broot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)512 bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
513 {
514 BINTERNAL *bi;
515 BLEAF *bl;
516 u_int32_t nbytes;
517 char *dest;
518
519 /*
520 * If the root page was a leaf page, change it into an internal page.
521 * We copy the key we split on (but not the key's data, in the case of
522 * a leaf page) to the new root page.
523 *
524 * The btree comparison code guarantees that the left-most key on any
525 * level of the tree is never used, so it doesn't need to be filled in.
526 */
527 nbytes = NBINTERNAL(0);
528 h->linp[0] = h->upper = t->bt_psize - nbytes;
529 dest = (char *)h + h->upper;
530 WR_BINTERNAL(dest, 0, l->pgno, 0);
531
532 switch (h->flags & P_TYPE) {
533 case P_BLEAF:
534 bl = GETBLEAF(r, 0);
535 nbytes = NBINTERNAL(bl->ksize);
536 __PAST_END(h->linp, 1) = h->upper -= nbytes;
537 dest = (char *)h + h->upper;
538 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
539 memmove(dest, bl->bytes, bl->ksize);
540
541 /*
542 * If the key is on an overflow page, mark the overflow chain
543 * so it isn't deleted when the leaf copy of the key is deleted.
544 */
545 if (bl->flags & P_BIGKEY) {
546 pgno_t pgno;
547 memcpy(&pgno, bl->bytes, sizeof(pgno));
548 if (bt_preserve(t, pgno) == RET_ERROR)
549 return (RET_ERROR);
550 }
551 break;
552 case P_BINTERNAL:
553 bi = GETBINTERNAL(r, 0);
554 nbytes = NBINTERNAL(bi->ksize);
555 __PAST_END(h->linp, 1) = h->upper -= nbytes;
556 dest = (char *)h + h->upper;
557 memmove(dest, bi, nbytes);
558 ((BINTERNAL *)dest)->pgno = r->pgno;
559 break;
560 default:
561 abort();
562 }
563
564 /* There are two keys on the page. */
565 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
566
567 /* Unpin the root page, set to btree internal page. */
568 h->flags &= ~P_TYPE;
569 h->flags |= P_BINTERNAL;
570 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
571
572 return (RET_SUCCESS);
573 }
574
575 /*
576 * BT_PSPLIT -- Do the real work of splitting the page.
577 *
578 * Parameters:
579 * t: tree
580 * h: page to be split
581 * l: page to put lower half of data
582 * r: page to put upper half of data
583 * pskip: pointer to index to leave open
584 * ilen: insert length
585 *
586 * Returns:
587 * Pointer to page in which to insert.
588 */
589 static PAGE *
bt_psplit(BTREE * t,PAGE * h,PAGE * l,PAGE * r,indx_t * pskip,size_t ilen)590 bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
591 {
592 BINTERNAL *bi;
593 BLEAF *bl;
594 CURSOR *c;
595 RLEAF *rl;
596 PAGE *rval;
597 void *src;
598 indx_t full, half, nxt, off, skip, top, used;
599 u_int32_t nbytes;
600 int bigkeycnt, isbigkey;
601
602 /*
603 * Split the data to the left and right pages. Leave the skip index
604 * open. Additionally, make some effort not to split on an overflow
605 * key. This makes internal page processing faster and can save
606 * space as overflow keys used by internal pages are never deleted.
607 */
608 bigkeycnt = 0;
609 skip = *pskip;
610 full = t->bt_psize - BTDATAOFF;
611 half = full / 2;
612 used = 0;
613 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
614 if (skip == off) {
615 nbytes = ilen;
616 isbigkey = 0; /* XXX: not really known. */
617 } else
618 switch (h->flags & P_TYPE) {
619 case P_BINTERNAL:
620 src = bi = GETBINTERNAL(h, nxt);
621 nbytes = NBINTERNAL(bi->ksize);
622 isbigkey = bi->flags & P_BIGKEY;
623 break;
624 case P_BLEAF:
625 src = bl = GETBLEAF(h, nxt);
626 nbytes = NBLEAF(bl);
627 isbigkey = bl->flags & P_BIGKEY;
628 break;
629 case P_RINTERNAL:
630 src = GETRINTERNAL(h, nxt);
631 nbytes = NRINTERNAL;
632 isbigkey = 0;
633 break;
634 case P_RLEAF:
635 src = rl = GETRLEAF(h, nxt);
636 nbytes = NRLEAF(rl);
637 isbigkey = 0;
638 break;
639 default:
640 abort();
641 }
642
643 /*
644 * If the key/data pairs are substantial fractions of the max
645 * possible size for the page, it's possible to get situations
646 * where we decide to try and copy too much onto the left page.
647 * Make sure that doesn't happen.
648 */
649 if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
650 nxt == top - 1) {
651 --off;
652 break;
653 }
654
655 /* Copy the key/data pair, if not the skipped index. */
656 if (skip != off) {
657 ++nxt;
658
659 l->linp[off] = l->upper -= nbytes;
660 memmove((char *)l + l->upper, src, nbytes);
661 }
662
663 used += nbytes + sizeof(indx_t);
664 if (used >= half) {
665 if (!isbigkey || bigkeycnt == 3)
666 break;
667 else
668 ++bigkeycnt;
669 }
670 }
671
672 /*
673 * Off is the last offset that's valid for the left page.
674 * Nxt is the first offset to be placed on the right page.
675 */
676 l->lower += (off + 1) * sizeof(indx_t);
677
678 /*
679 * If splitting the page that the cursor was on, the cursor has to be
680 * adjusted to point to the same record as before the split. If the
681 * cursor is at or past the skipped slot, the cursor is incremented by
682 * one. If the cursor is on the right page, it is decremented by the
683 * number of records split to the left page.
684 */
685 c = &t->bt_cursor;
686 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
687 if (c->pg.index >= skip)
688 ++c->pg.index;
689 if (c->pg.index < nxt) /* Left page. */
690 c->pg.pgno = l->pgno;
691 else { /* Right page. */
692 c->pg.pgno = r->pgno;
693 c->pg.index -= nxt;
694 }
695 }
696
697 /*
698 * If the skipped index was on the left page, just return that page.
699 * Otherwise, adjust the skip index to reflect the new position on
700 * the right page.
701 */
702 if (skip <= off) {
703 skip = MAX_PAGE_OFFSET;
704 rval = l;
705 } else {
706 rval = r;
707 *pskip -= nxt;
708 }
709
710 for (off = 0; nxt < top; ++off) {
711 if (skip == nxt) {
712 ++off;
713 skip = MAX_PAGE_OFFSET;
714 }
715 switch (h->flags & P_TYPE) {
716 case P_BINTERNAL:
717 src = bi = GETBINTERNAL(h, nxt);
718 nbytes = NBINTERNAL(bi->ksize);
719 break;
720 case P_BLEAF:
721 src = bl = GETBLEAF(h, nxt);
722 nbytes = NBLEAF(bl);
723 break;
724 case P_RINTERNAL:
725 src = GETRINTERNAL(h, nxt);
726 nbytes = NRINTERNAL;
727 break;
728 case P_RLEAF:
729 src = rl = GETRLEAF(h, nxt);
730 nbytes = NRLEAF(rl);
731 break;
732 default:
733 abort();
734 }
735 ++nxt;
736 r->linp[off] = r->upper -= nbytes;
737 memmove((char *)r + r->upper, src, nbytes);
738 }
739 r->lower += off * sizeof(indx_t);
740
741 /* If the key is being appended to the page, adjust the index. */
742 if (skip == top)
743 r->lower += sizeof(indx_t);
744
745 return (rval);
746 }
747
748 /*
749 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
750 *
751 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
752 * record that references them gets deleted. Chains pointed to by internal
753 * pages never get deleted. This routine marks a chain as pointed to by an
754 * internal page.
755 *
756 * Parameters:
757 * t: tree
758 * pg: page number of first page in the chain.
759 *
760 * Returns:
761 * RET_SUCCESS, RET_ERROR.
762 */
763 static int
bt_preserve(BTREE * t,pgno_t pg)764 bt_preserve(BTREE *t, pgno_t pg)
765 {
766 PAGE *h;
767
768 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
769 return (RET_ERROR);
770 h->flags |= P_PRESERVE;
771 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
772 return (RET_SUCCESS);
773 }
774
775 /*
776 * REC_TOTAL -- Return the number of recno entries below a page.
777 *
778 * Parameters:
779 * h: page
780 *
781 * Returns:
782 * The number of recno entries below a page.
783 *
784 * XXX
785 * These values could be set by the bt_psplit routine. The problem is that the
786 * entry has to be popped off of the stack etc. or the values have to be passed
787 * all the way back to bt_split/bt_rroot and it's not very clean.
788 */
789 static recno_t
rec_total(PAGE * h)790 rec_total(PAGE *h)
791 {
792 recno_t recs;
793 indx_t nxt, top;
794
795 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
796 recs += GETRINTERNAL(h, nxt)->nrecs;
797 return (recs);
798 }
799