1*7c478bd9Sstevel@tonic-gate /*- 2*7c478bd9Sstevel@tonic-gate * See the file LICENSE for redistribution information. 3*7c478bd9Sstevel@tonic-gate * 4*7c478bd9Sstevel@tonic-gate * Copyright (c) 1996, 1997, 1998 5*7c478bd9Sstevel@tonic-gate * Sleepycat Software. All rights reserved. 6*7c478bd9Sstevel@tonic-gate */ 7*7c478bd9Sstevel@tonic-gate /* 8*7c478bd9Sstevel@tonic-gate * Copyright (c) 1990, 1993, 1994, 1995, 1996 9*7c478bd9Sstevel@tonic-gate * Keith Bostic. All rights reserved. 10*7c478bd9Sstevel@tonic-gate */ 11*7c478bd9Sstevel@tonic-gate /* 12*7c478bd9Sstevel@tonic-gate * Copyright (c) 1990, 1993, 1994, 1995 13*7c478bd9Sstevel@tonic-gate * The Regents of the University of California. All rights reserved. 14*7c478bd9Sstevel@tonic-gate * 15*7c478bd9Sstevel@tonic-gate * Redistribution and use in source and binary forms, with or without 16*7c478bd9Sstevel@tonic-gate * modification, are permitted provided that the following conditions 17*7c478bd9Sstevel@tonic-gate * are met: 18*7c478bd9Sstevel@tonic-gate * 1. Redistributions of source code must retain the above copyright 19*7c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer. 20*7c478bd9Sstevel@tonic-gate * 2. Redistributions in binary form must reproduce the above copyright 21*7c478bd9Sstevel@tonic-gate * notice, this list of conditions and the following disclaimer in the 22*7c478bd9Sstevel@tonic-gate * documentation and/or other materials provided with the distribution. 23*7c478bd9Sstevel@tonic-gate * 3. All advertising materials mentioning features or use of this software 24*7c478bd9Sstevel@tonic-gate * must display the following acknowledgement: 25*7c478bd9Sstevel@tonic-gate * This product includes software developed by the University of 26*7c478bd9Sstevel@tonic-gate * California, Berkeley and its contributors. 27*7c478bd9Sstevel@tonic-gate * 4. Neither the name of the University nor the names of its contributors 28*7c478bd9Sstevel@tonic-gate * may be used to endorse or promote products derived from this software 29*7c478bd9Sstevel@tonic-gate * without specific prior written permission. 30*7c478bd9Sstevel@tonic-gate * 31*7c478bd9Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32*7c478bd9Sstevel@tonic-gate * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33*7c478bd9Sstevel@tonic-gate * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34*7c478bd9Sstevel@tonic-gate * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35*7c478bd9Sstevel@tonic-gate * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36*7c478bd9Sstevel@tonic-gate * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37*7c478bd9Sstevel@tonic-gate * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38*7c478bd9Sstevel@tonic-gate * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39*7c478bd9Sstevel@tonic-gate * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40*7c478bd9Sstevel@tonic-gate * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41*7c478bd9Sstevel@tonic-gate * SUCH DAMAGE. 42*7c478bd9Sstevel@tonic-gate */ 43*7c478bd9Sstevel@tonic-gate 44*7c478bd9Sstevel@tonic-gate #include "config.h" 45*7c478bd9Sstevel@tonic-gate 46*7c478bd9Sstevel@tonic-gate #ifndef lint 47*7c478bd9Sstevel@tonic-gate static const char sccsid[] = "@(#)bt_split.c 10.33 (Sleepycat) 10/13/98"; 48*7c478bd9Sstevel@tonic-gate #endif /* not lint */ 49*7c478bd9Sstevel@tonic-gate 50*7c478bd9Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES 51*7c478bd9Sstevel@tonic-gate #include <sys/types.h> 52*7c478bd9Sstevel@tonic-gate 53*7c478bd9Sstevel@tonic-gate #include <errno.h> 54*7c478bd9Sstevel@tonic-gate #include <limits.h> 55*7c478bd9Sstevel@tonic-gate #include <string.h> 56*7c478bd9Sstevel@tonic-gate #endif 57*7c478bd9Sstevel@tonic-gate 58*7c478bd9Sstevel@tonic-gate #include "db_int.h" 59*7c478bd9Sstevel@tonic-gate #include "db_page.h" 60*7c478bd9Sstevel@tonic-gate #include "btree.h" 61*7c478bd9Sstevel@tonic-gate 62*7c478bd9Sstevel@tonic-gate static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *)); 63*7c478bd9Sstevel@tonic-gate static int __bam_page __P((DBC *, EPG *, EPG *)); 64*7c478bd9Sstevel@tonic-gate static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *)); 65*7c478bd9Sstevel@tonic-gate static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *)); 66*7c478bd9Sstevel@tonic-gate static int __bam_root __P((DBC *, EPG *)); 67*7c478bd9Sstevel@tonic-gate static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *)); 68*7c478bd9Sstevel@tonic-gate 69*7c478bd9Sstevel@tonic-gate /* 70*7c478bd9Sstevel@tonic-gate * __bam_split -- 71*7c478bd9Sstevel@tonic-gate * Split a page. 72*7c478bd9Sstevel@tonic-gate * 73*7c478bd9Sstevel@tonic-gate * PUBLIC: int __bam_split __P((DBC *, void *)); 74*7c478bd9Sstevel@tonic-gate */ 75*7c478bd9Sstevel@tonic-gate int 76*7c478bd9Sstevel@tonic-gate __bam_split(dbc, arg) 77*7c478bd9Sstevel@tonic-gate DBC *dbc; 78*7c478bd9Sstevel@tonic-gate void *arg; 79*7c478bd9Sstevel@tonic-gate { 80*7c478bd9Sstevel@tonic-gate BTREE *t; 81*7c478bd9Sstevel@tonic-gate CURSOR *cp; 82*7c478bd9Sstevel@tonic-gate DB *dbp; 83*7c478bd9Sstevel@tonic-gate enum { UP, DOWN } dir; 84*7c478bd9Sstevel@tonic-gate int exact, level, ret; 85*7c478bd9Sstevel@tonic-gate 86*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 87*7c478bd9Sstevel@tonic-gate cp = dbc->internal; 88*7c478bd9Sstevel@tonic-gate 89*7c478bd9Sstevel@tonic-gate /* 90*7c478bd9Sstevel@tonic-gate * The locking protocol we use to avoid deadlock to acquire locks by 91*7c478bd9Sstevel@tonic-gate * walking down the tree, but we do it as lazily as possible, locking 92*7c478bd9Sstevel@tonic-gate * the root only as a last resort. We expect all stack pages to have 93*7c478bd9Sstevel@tonic-gate * been discarded before we're called; we discard all short-term locks. 94*7c478bd9Sstevel@tonic-gate * 95*7c478bd9Sstevel@tonic-gate * When __bam_split is first called, we know that a leaf page was too 96*7c478bd9Sstevel@tonic-gate * full for an insert. We don't know what leaf page it was, but we 97*7c478bd9Sstevel@tonic-gate * have the key/recno that caused the problem. We call XX_search to 98*7c478bd9Sstevel@tonic-gate * reacquire the leaf page, but this time get both the leaf page and 99*7c478bd9Sstevel@tonic-gate * its parent, locked. We then split the leaf page and see if the new 100*7c478bd9Sstevel@tonic-gate * internal key will fit into the parent page. If it will, we're done. 101*7c478bd9Sstevel@tonic-gate * 102*7c478bd9Sstevel@tonic-gate * If it won't, we discard our current locks and repeat the process, 103*7c478bd9Sstevel@tonic-gate * only this time acquiring the parent page and its parent, locked. 104*7c478bd9Sstevel@tonic-gate * This process repeats until we succeed in the split, splitting the 105*7c478bd9Sstevel@tonic-gate * root page as the final resort. The entire process then repeats, 106*7c478bd9Sstevel@tonic-gate * as necessary, until we split a leaf page. 107*7c478bd9Sstevel@tonic-gate * 108*7c478bd9Sstevel@tonic-gate * XXX 109*7c478bd9Sstevel@tonic-gate * A traditional method of speeding this up is to maintain a stack of 110*7c478bd9Sstevel@tonic-gate * the pages traversed in the original search. You can detect if the 111*7c478bd9Sstevel@tonic-gate * stack is correct by storing the page's LSN when it was searched and 112*7c478bd9Sstevel@tonic-gate * comparing that LSN with the current one when it's locked during the 113*7c478bd9Sstevel@tonic-gate * split. This would be an easy change for this code, but I have no 114*7c478bd9Sstevel@tonic-gate * numbers that indicate it's worthwhile. 115*7c478bd9Sstevel@tonic-gate */ 116*7c478bd9Sstevel@tonic-gate t = dbp->internal; 117*7c478bd9Sstevel@tonic-gate for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { 118*7c478bd9Sstevel@tonic-gate /* 119*7c478bd9Sstevel@tonic-gate * Acquire a page and its parent, locked. 120*7c478bd9Sstevel@tonic-gate */ 121*7c478bd9Sstevel@tonic-gate if ((ret = (dbp->type == DB_BTREE ? 122*7c478bd9Sstevel@tonic-gate __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) : 123*7c478bd9Sstevel@tonic-gate __bam_rsearch(dbc, 124*7c478bd9Sstevel@tonic-gate (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) 125*7c478bd9Sstevel@tonic-gate return (ret); 126*7c478bd9Sstevel@tonic-gate 127*7c478bd9Sstevel@tonic-gate /* 128*7c478bd9Sstevel@tonic-gate * Split the page if it still needs it (it's possible another 129*7c478bd9Sstevel@tonic-gate * thread of control has already split the page). If we are 130*7c478bd9Sstevel@tonic-gate * guaranteed that two items will fit on the page, the split 131*7c478bd9Sstevel@tonic-gate * is no longer necessary. 132*7c478bd9Sstevel@tonic-gate */ 133*7c478bd9Sstevel@tonic-gate if (t->bt_ovflsize * 2 <= 134*7c478bd9Sstevel@tonic-gate (db_indx_t)P_FREESPACE(cp->csp[0].page)) { 135*7c478bd9Sstevel@tonic-gate __bam_stkrel(dbc, 1); 136*7c478bd9Sstevel@tonic-gate return (0); 137*7c478bd9Sstevel@tonic-gate } 138*7c478bd9Sstevel@tonic-gate ret = cp->csp[0].page->pgno == PGNO_ROOT ? 139*7c478bd9Sstevel@tonic-gate __bam_root(dbc, &cp->csp[0]) : 140*7c478bd9Sstevel@tonic-gate __bam_page(dbc, &cp->csp[-1], &cp->csp[0]); 141*7c478bd9Sstevel@tonic-gate BT_STK_CLR(cp); 142*7c478bd9Sstevel@tonic-gate 143*7c478bd9Sstevel@tonic-gate switch (ret) { 144*7c478bd9Sstevel@tonic-gate case 0: 145*7c478bd9Sstevel@tonic-gate /* Once we've split the leaf page, we're done. */ 146*7c478bd9Sstevel@tonic-gate if (level == LEAFLEVEL) 147*7c478bd9Sstevel@tonic-gate return (0); 148*7c478bd9Sstevel@tonic-gate 149*7c478bd9Sstevel@tonic-gate /* Switch directions. */ 150*7c478bd9Sstevel@tonic-gate if (dir == UP) 151*7c478bd9Sstevel@tonic-gate dir = DOWN; 152*7c478bd9Sstevel@tonic-gate break; 153*7c478bd9Sstevel@tonic-gate case DB_NEEDSPLIT: 154*7c478bd9Sstevel@tonic-gate /* 155*7c478bd9Sstevel@tonic-gate * It's possible to fail to split repeatedly, as other 156*7c478bd9Sstevel@tonic-gate * threads may be modifying the tree, or the page usage 157*7c478bd9Sstevel@tonic-gate * is sufficiently bad that we don't get enough space 158*7c478bd9Sstevel@tonic-gate * the first time. 159*7c478bd9Sstevel@tonic-gate */ 160*7c478bd9Sstevel@tonic-gate if (dir == DOWN) 161*7c478bd9Sstevel@tonic-gate dir = UP; 162*7c478bd9Sstevel@tonic-gate break; 163*7c478bd9Sstevel@tonic-gate default: 164*7c478bd9Sstevel@tonic-gate return (ret); 165*7c478bd9Sstevel@tonic-gate } 166*7c478bd9Sstevel@tonic-gate } 167*7c478bd9Sstevel@tonic-gate /* NOTREACHED */ 168*7c478bd9Sstevel@tonic-gate } 169*7c478bd9Sstevel@tonic-gate 170*7c478bd9Sstevel@tonic-gate /* 171*7c478bd9Sstevel@tonic-gate * __bam_root -- 172*7c478bd9Sstevel@tonic-gate * Split the root page of a btree. 173*7c478bd9Sstevel@tonic-gate */ 174*7c478bd9Sstevel@tonic-gate static int 175*7c478bd9Sstevel@tonic-gate __bam_root(dbc, cp) 176*7c478bd9Sstevel@tonic-gate DBC *dbc; 177*7c478bd9Sstevel@tonic-gate EPG *cp; 178*7c478bd9Sstevel@tonic-gate { 179*7c478bd9Sstevel@tonic-gate DB *dbp; 180*7c478bd9Sstevel@tonic-gate PAGE *lp, *rp; 181*7c478bd9Sstevel@tonic-gate db_indx_t split; 182*7c478bd9Sstevel@tonic-gate int ret; 183*7c478bd9Sstevel@tonic-gate 184*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 185*7c478bd9Sstevel@tonic-gate 186*7c478bd9Sstevel@tonic-gate /* Yeah, right. */ 187*7c478bd9Sstevel@tonic-gate if (cp->page->level >= MAXBTREELEVEL) { 188*7c478bd9Sstevel@tonic-gate ret = ENOSPC; 189*7c478bd9Sstevel@tonic-gate goto err; 190*7c478bd9Sstevel@tonic-gate } 191*7c478bd9Sstevel@tonic-gate 192*7c478bd9Sstevel@tonic-gate /* Create new left and right pages for the split. */ 193*7c478bd9Sstevel@tonic-gate lp = rp = NULL; 194*7c478bd9Sstevel@tonic-gate if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 || 195*7c478bd9Sstevel@tonic-gate (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0) 196*7c478bd9Sstevel@tonic-gate goto err; 197*7c478bd9Sstevel@tonic-gate P_INIT(lp, dbp->pgsize, lp->pgno, 198*7c478bd9Sstevel@tonic-gate PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, 199*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page)); 200*7c478bd9Sstevel@tonic-gate P_INIT(rp, dbp->pgsize, rp->pgno, 201*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID, 202*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page)); 203*7c478bd9Sstevel@tonic-gate 204*7c478bd9Sstevel@tonic-gate /* Split the page. */ 205*7c478bd9Sstevel@tonic-gate if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) 206*7c478bd9Sstevel@tonic-gate goto err; 207*7c478bd9Sstevel@tonic-gate 208*7c478bd9Sstevel@tonic-gate /* Log the change. */ 209*7c478bd9Sstevel@tonic-gate if (DB_LOGGING(dbc)) { 210*7c478bd9Sstevel@tonic-gate DBT __a; 211*7c478bd9Sstevel@tonic-gate DB_LSN __lsn; 212*7c478bd9Sstevel@tonic-gate memset(&__a, 0, sizeof(__a)); 213*7c478bd9Sstevel@tonic-gate __a.data = cp->page; 214*7c478bd9Sstevel@tonic-gate __a.size = dbp->pgsize; 215*7c478bd9Sstevel@tonic-gate ZERO_LSN(__lsn); 216*7c478bd9Sstevel@tonic-gate if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn, 217*7c478bd9Sstevel@tonic-gate &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), 218*7c478bd9Sstevel@tonic-gate PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn, 219*7c478bd9Sstevel@tonic-gate &__a)) != 0) 220*7c478bd9Sstevel@tonic-gate goto err; 221*7c478bd9Sstevel@tonic-gate LSN(lp) = LSN(rp) = LSN(cp->page); 222*7c478bd9Sstevel@tonic-gate } 223*7c478bd9Sstevel@tonic-gate 224*7c478bd9Sstevel@tonic-gate /* Clean up the new root page. */ 225*7c478bd9Sstevel@tonic-gate if ((ret = (dbp->type == DB_RECNO ? 226*7c478bd9Sstevel@tonic-gate __ram_root(dbc, cp->page, lp, rp) : 227*7c478bd9Sstevel@tonic-gate __bam_broot(dbc, cp->page, lp, rp))) != 0) 228*7c478bd9Sstevel@tonic-gate goto err; 229*7c478bd9Sstevel@tonic-gate 230*7c478bd9Sstevel@tonic-gate /* Adjust any cursors. Do it last so we don't have to undo it. */ 231*7c478bd9Sstevel@tonic-gate __bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1); 232*7c478bd9Sstevel@tonic-gate 233*7c478bd9Sstevel@tonic-gate /* Success -- write the real pages back to the store. */ 234*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); 235*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock); 236*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); 237*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); 238*7c478bd9Sstevel@tonic-gate 239*7c478bd9Sstevel@tonic-gate return (0); 240*7c478bd9Sstevel@tonic-gate 241*7c478bd9Sstevel@tonic-gate err: if (lp != NULL) 242*7c478bd9Sstevel@tonic-gate (void)__bam_free(dbc, lp); 243*7c478bd9Sstevel@tonic-gate if (rp != NULL) 244*7c478bd9Sstevel@tonic-gate (void)__bam_free(dbc, rp); 245*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, 0); 246*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock); 247*7c478bd9Sstevel@tonic-gate return (ret); 248*7c478bd9Sstevel@tonic-gate } 249*7c478bd9Sstevel@tonic-gate 250*7c478bd9Sstevel@tonic-gate /* 251*7c478bd9Sstevel@tonic-gate * __bam_page -- 252*7c478bd9Sstevel@tonic-gate * Split the non-root page of a btree. 253*7c478bd9Sstevel@tonic-gate */ 254*7c478bd9Sstevel@tonic-gate static int 255*7c478bd9Sstevel@tonic-gate __bam_page(dbc, pp, cp) 256*7c478bd9Sstevel@tonic-gate DBC *dbc; 257*7c478bd9Sstevel@tonic-gate EPG *pp, *cp; 258*7c478bd9Sstevel@tonic-gate { 259*7c478bd9Sstevel@tonic-gate DB *dbp; 260*7c478bd9Sstevel@tonic-gate DB_LOCK tplock; 261*7c478bd9Sstevel@tonic-gate PAGE *lp, *rp, *tp; 262*7c478bd9Sstevel@tonic-gate db_indx_t split; 263*7c478bd9Sstevel@tonic-gate int ret; 264*7c478bd9Sstevel@tonic-gate 265*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 266*7c478bd9Sstevel@tonic-gate lp = rp = tp = NULL; 267*7c478bd9Sstevel@tonic-gate ret = -1; 268*7c478bd9Sstevel@tonic-gate 269*7c478bd9Sstevel@tonic-gate /* Create new right page for the split. */ 270*7c478bd9Sstevel@tonic-gate if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0) 271*7c478bd9Sstevel@tonic-gate goto err; 272*7c478bd9Sstevel@tonic-gate P_INIT(rp, dbp->pgsize, rp->pgno, 273*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno, 274*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno, 275*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page)); 276*7c478bd9Sstevel@tonic-gate 277*7c478bd9Sstevel@tonic-gate /* Create new left page for the split. */ 278*7c478bd9Sstevel@tonic-gate if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0) 279*7c478bd9Sstevel@tonic-gate goto err; 280*7c478bd9Sstevel@tonic-gate P_INIT(lp, dbp->pgsize, cp->page->pgno, 281*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno, 282*7c478bd9Sstevel@tonic-gate ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, 283*7c478bd9Sstevel@tonic-gate cp->page->level, TYPE(cp->page)); 284*7c478bd9Sstevel@tonic-gate ZERO_LSN(lp->lsn); 285*7c478bd9Sstevel@tonic-gate 286*7c478bd9Sstevel@tonic-gate /* 287*7c478bd9Sstevel@tonic-gate * Split right. 288*7c478bd9Sstevel@tonic-gate * 289*7c478bd9Sstevel@tonic-gate * Only the indices are sorted on the page, i.e., the key/data pairs 290*7c478bd9Sstevel@tonic-gate * aren't, so it's simpler to copy the data from the split page onto 291*7c478bd9Sstevel@tonic-gate * two new pages instead of copying half the data to the right page 292*7c478bd9Sstevel@tonic-gate * and compacting the left page in place. Since the left page can't 293*7c478bd9Sstevel@tonic-gate * change, we swap the original and the allocated left page after the 294*7c478bd9Sstevel@tonic-gate * split. 295*7c478bd9Sstevel@tonic-gate */ 296*7c478bd9Sstevel@tonic-gate if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) 297*7c478bd9Sstevel@tonic-gate goto err; 298*7c478bd9Sstevel@tonic-gate 299*7c478bd9Sstevel@tonic-gate /* 300*7c478bd9Sstevel@tonic-gate * Fix up the previous pointer of any leaf page following the split 301*7c478bd9Sstevel@tonic-gate * page. 302*7c478bd9Sstevel@tonic-gate * 303*7c478bd9Sstevel@tonic-gate * !!! 304*7c478bd9Sstevel@tonic-gate * There are interesting deadlock situations here as we write-lock a 305*7c478bd9Sstevel@tonic-gate * page that's not in our direct ancestry. Consider a cursor walking 306*7c478bd9Sstevel@tonic-gate * through the leaf pages, that has the previous page read-locked and 307*7c478bd9Sstevel@tonic-gate * is waiting on a lock for the page we just split. It will deadlock 308*7c478bd9Sstevel@tonic-gate * here. If this is a problem, we can fail in the split; it's not a 309*7c478bd9Sstevel@tonic-gate * problem as the split will succeed after the cursor passes through 310*7c478bd9Sstevel@tonic-gate * the page we're splitting. 311*7c478bd9Sstevel@tonic-gate */ 312*7c478bd9Sstevel@tonic-gate if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) { 313*7c478bd9Sstevel@tonic-gate if ((ret = __bam_lget(dbc, 314*7c478bd9Sstevel@tonic-gate 0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0) 315*7c478bd9Sstevel@tonic-gate goto err; 316*7c478bd9Sstevel@tonic-gate if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0) 317*7c478bd9Sstevel@tonic-gate goto err; 318*7c478bd9Sstevel@tonic-gate } 319*7c478bd9Sstevel@tonic-gate 320*7c478bd9Sstevel@tonic-gate /* Insert the new pages into the parent page. */ 321*7c478bd9Sstevel@tonic-gate if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0) 322*7c478bd9Sstevel@tonic-gate goto err; 323*7c478bd9Sstevel@tonic-gate 324*7c478bd9Sstevel@tonic-gate /* Log the change. */ 325*7c478bd9Sstevel@tonic-gate if (DB_LOGGING(dbc)) { 326*7c478bd9Sstevel@tonic-gate DBT __a; 327*7c478bd9Sstevel@tonic-gate DB_LSN __lsn; 328*7c478bd9Sstevel@tonic-gate memset(&__a, 0, sizeof(__a)); 329*7c478bd9Sstevel@tonic-gate __a.data = cp->page; 330*7c478bd9Sstevel@tonic-gate __a.size = dbp->pgsize; 331*7c478bd9Sstevel@tonic-gate if (tp == NULL) 332*7c478bd9Sstevel@tonic-gate ZERO_LSN(__lsn); 333*7c478bd9Sstevel@tonic-gate if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn, 334*7c478bd9Sstevel@tonic-gate &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page), 335*7c478bd9Sstevel@tonic-gate &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 336*7c478bd9Sstevel@tonic-gate tp == NULL ? 0 : PGNO(tp), 337*7c478bd9Sstevel@tonic-gate tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0) 338*7c478bd9Sstevel@tonic-gate goto err; 339*7c478bd9Sstevel@tonic-gate 340*7c478bd9Sstevel@tonic-gate LSN(lp) = LSN(rp) = LSN(cp->page); 341*7c478bd9Sstevel@tonic-gate if (tp != NULL) 342*7c478bd9Sstevel@tonic-gate LSN(tp) = LSN(cp->page); 343*7c478bd9Sstevel@tonic-gate } 344*7c478bd9Sstevel@tonic-gate 345*7c478bd9Sstevel@tonic-gate /* Copy the allocated page into place. */ 346*7c478bd9Sstevel@tonic-gate memcpy(cp->page, lp, LOFFSET(lp)); 347*7c478bd9Sstevel@tonic-gate memcpy((u_int8_t *)cp->page + HOFFSET(lp), 348*7c478bd9Sstevel@tonic-gate (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); 349*7c478bd9Sstevel@tonic-gate __os_free(lp, dbp->pgsize); 350*7c478bd9Sstevel@tonic-gate lp = NULL; 351*7c478bd9Sstevel@tonic-gate 352*7c478bd9Sstevel@tonic-gate /* Finish the next-page link. */ 353*7c478bd9Sstevel@tonic-gate if (tp != NULL) 354*7c478bd9Sstevel@tonic-gate tp->prev_pgno = rp->pgno; 355*7c478bd9Sstevel@tonic-gate 356*7c478bd9Sstevel@tonic-gate /* Adjust any cursors. Do so last so we don't have to undo it. */ 357*7c478bd9Sstevel@tonic-gate __bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0); 358*7c478bd9Sstevel@tonic-gate 359*7c478bd9Sstevel@tonic-gate /* Success -- write the real pages back to the store. */ 360*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY); 361*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, pp->lock); 362*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); 363*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock); 364*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); 365*7c478bd9Sstevel@tonic-gate if (tp != NULL) { 366*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY); 367*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, tplock); 368*7c478bd9Sstevel@tonic-gate } 369*7c478bd9Sstevel@tonic-gate return (0); 370*7c478bd9Sstevel@tonic-gate 371*7c478bd9Sstevel@tonic-gate err: if (lp != NULL) 372*7c478bd9Sstevel@tonic-gate __os_free(lp, dbp->pgsize); 373*7c478bd9Sstevel@tonic-gate if (rp != NULL) 374*7c478bd9Sstevel@tonic-gate (void)__bam_free(dbc, rp); 375*7c478bd9Sstevel@tonic-gate if (tp != NULL) { 376*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, tp, 0); 377*7c478bd9Sstevel@tonic-gate if (ret == DB_NEEDSPLIT) 378*7c478bd9Sstevel@tonic-gate (void)__BT_LPUT(dbc, tplock); 379*7c478bd9Sstevel@tonic-gate else 380*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, tplock); 381*7c478bd9Sstevel@tonic-gate } 382*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, pp->page, 0); 383*7c478bd9Sstevel@tonic-gate if (ret == DB_NEEDSPLIT) 384*7c478bd9Sstevel@tonic-gate (void)__BT_LPUT(dbc, pp->lock); 385*7c478bd9Sstevel@tonic-gate else 386*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, pp->lock); 387*7c478bd9Sstevel@tonic-gate (void)memp_fput(dbp->mpf, cp->page, 0); 388*7c478bd9Sstevel@tonic-gate if (ret == DB_NEEDSPLIT) 389*7c478bd9Sstevel@tonic-gate (void)__BT_LPUT(dbc, cp->lock); 390*7c478bd9Sstevel@tonic-gate else 391*7c478bd9Sstevel@tonic-gate (void)__BT_TLPUT(dbc, cp->lock); 392*7c478bd9Sstevel@tonic-gate return (ret); 393*7c478bd9Sstevel@tonic-gate } 394*7c478bd9Sstevel@tonic-gate 395*7c478bd9Sstevel@tonic-gate /* 396*7c478bd9Sstevel@tonic-gate * __bam_broot -- 397*7c478bd9Sstevel@tonic-gate * Fix up the btree root page after it has been split. 398*7c478bd9Sstevel@tonic-gate */ 399*7c478bd9Sstevel@tonic-gate static int 400*7c478bd9Sstevel@tonic-gate __bam_broot(dbc, rootp, lp, rp) 401*7c478bd9Sstevel@tonic-gate DBC *dbc; 402*7c478bd9Sstevel@tonic-gate PAGE *rootp, *lp, *rp; 403*7c478bd9Sstevel@tonic-gate { 404*7c478bd9Sstevel@tonic-gate BINTERNAL bi, *child_bi; 405*7c478bd9Sstevel@tonic-gate BKEYDATA *child_bk; 406*7c478bd9Sstevel@tonic-gate DB *dbp; 407*7c478bd9Sstevel@tonic-gate DBT hdr, data; 408*7c478bd9Sstevel@tonic-gate int ret; 409*7c478bd9Sstevel@tonic-gate 410*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 411*7c478bd9Sstevel@tonic-gate 412*7c478bd9Sstevel@tonic-gate /* 413*7c478bd9Sstevel@tonic-gate * If the root page was a leaf page, change it into an internal page. 414*7c478bd9Sstevel@tonic-gate * We copy the key we split on (but not the key's data, in the case of 415*7c478bd9Sstevel@tonic-gate * a leaf page) to the new root page. 416*7c478bd9Sstevel@tonic-gate */ 417*7c478bd9Sstevel@tonic-gate P_INIT(rootp, dbp->pgsize, 418*7c478bd9Sstevel@tonic-gate PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE); 419*7c478bd9Sstevel@tonic-gate 420*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data)); 421*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr)); 422*7c478bd9Sstevel@tonic-gate 423*7c478bd9Sstevel@tonic-gate /* 424*7c478bd9Sstevel@tonic-gate * The btree comparison code guarantees that the left-most key on any 425*7c478bd9Sstevel@tonic-gate * level of the tree is never used, so it doesn't need to be filled in. 426*7c478bd9Sstevel@tonic-gate */ 427*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi)); 428*7c478bd9Sstevel@tonic-gate bi.len = 0; 429*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, B_KEYDATA, 0); 430*7c478bd9Sstevel@tonic-gate bi.pgno = lp->pgno; 431*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) { 432*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(lp); 433*7c478bd9Sstevel@tonic-gate RE_NREC_SET(rootp, bi.nrecs); 434*7c478bd9Sstevel@tonic-gate } 435*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 436*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 437*7c478bd9Sstevel@tonic-gate if ((ret = 438*7c478bd9Sstevel@tonic-gate __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0) 439*7c478bd9Sstevel@tonic-gate return (ret); 440*7c478bd9Sstevel@tonic-gate 441*7c478bd9Sstevel@tonic-gate switch (TYPE(rp)) { 442*7c478bd9Sstevel@tonic-gate case P_IBTREE: 443*7c478bd9Sstevel@tonic-gate /* Copy the first key of the child page onto the root page. */ 444*7c478bd9Sstevel@tonic-gate child_bi = GET_BINTERNAL(rp, 0); 445*7c478bd9Sstevel@tonic-gate 446*7c478bd9Sstevel@tonic-gate bi.len = child_bi->len; 447*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bi->type, 0); 448*7c478bd9Sstevel@tonic-gate bi.pgno = rp->pgno; 449*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) { 450*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(rp); 451*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, bi.nrecs); 452*7c478bd9Sstevel@tonic-gate } 453*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 454*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 455*7c478bd9Sstevel@tonic-gate data.data = child_bi->data; 456*7c478bd9Sstevel@tonic-gate data.size = child_bi->len; 457*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1, 458*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) 459*7c478bd9Sstevel@tonic-gate return (ret); 460*7c478bd9Sstevel@tonic-gate 461*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */ 462*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bi->type) == B_OVERFLOW) 463*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc, 464*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) 465*7c478bd9Sstevel@tonic-gate return (ret); 466*7c478bd9Sstevel@tonic-gate break; 467*7c478bd9Sstevel@tonic-gate case P_LBTREE: 468*7c478bd9Sstevel@tonic-gate /* Copy the first key of the child page onto the root page. */ 469*7c478bd9Sstevel@tonic-gate child_bk = GET_BKEYDATA(rp, 0); 470*7c478bd9Sstevel@tonic-gate switch (B_TYPE(child_bk->type)) { 471*7c478bd9Sstevel@tonic-gate case B_KEYDATA: 472*7c478bd9Sstevel@tonic-gate bi.len = child_bk->len; 473*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0); 474*7c478bd9Sstevel@tonic-gate bi.pgno = rp->pgno; 475*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) { 476*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(rp); 477*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, bi.nrecs); 478*7c478bd9Sstevel@tonic-gate } 479*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 480*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 481*7c478bd9Sstevel@tonic-gate data.data = child_bk->data; 482*7c478bd9Sstevel@tonic-gate data.size = child_bk->len; 483*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1, 484*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0) 485*7c478bd9Sstevel@tonic-gate return (ret); 486*7c478bd9Sstevel@tonic-gate break; 487*7c478bd9Sstevel@tonic-gate case B_DUPLICATE: 488*7c478bd9Sstevel@tonic-gate case B_OVERFLOW: 489*7c478bd9Sstevel@tonic-gate bi.len = BOVERFLOW_SIZE; 490*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0); 491*7c478bd9Sstevel@tonic-gate bi.pgno = rp->pgno; 492*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_BT_RECNUM)) { 493*7c478bd9Sstevel@tonic-gate bi.nrecs = __bam_total(rp); 494*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, bi.nrecs); 495*7c478bd9Sstevel@tonic-gate } 496*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 497*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 498*7c478bd9Sstevel@tonic-gate data.data = child_bk; 499*7c478bd9Sstevel@tonic-gate data.size = BOVERFLOW_SIZE; 500*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1, 501*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) 502*7c478bd9Sstevel@tonic-gate return (ret); 503*7c478bd9Sstevel@tonic-gate 504*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */ 505*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bk->type) == B_OVERFLOW) 506*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc, 507*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) 508*7c478bd9Sstevel@tonic-gate return (ret); 509*7c478bd9Sstevel@tonic-gate break; 510*7c478bd9Sstevel@tonic-gate default: 511*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rp->pgno)); 512*7c478bd9Sstevel@tonic-gate } 513*7c478bd9Sstevel@tonic-gate break; 514*7c478bd9Sstevel@tonic-gate default: 515*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rp->pgno)); 516*7c478bd9Sstevel@tonic-gate } 517*7c478bd9Sstevel@tonic-gate return (0); 518*7c478bd9Sstevel@tonic-gate } 519*7c478bd9Sstevel@tonic-gate 520*7c478bd9Sstevel@tonic-gate /* 521*7c478bd9Sstevel@tonic-gate * __ram_root -- 522*7c478bd9Sstevel@tonic-gate * Fix up the recno root page after it has been split. 523*7c478bd9Sstevel@tonic-gate */ 524*7c478bd9Sstevel@tonic-gate static int 525*7c478bd9Sstevel@tonic-gate __ram_root(dbc, rootp, lp, rp) 526*7c478bd9Sstevel@tonic-gate DBC *dbc; 527*7c478bd9Sstevel@tonic-gate PAGE *rootp, *lp, *rp; 528*7c478bd9Sstevel@tonic-gate { 529*7c478bd9Sstevel@tonic-gate DB *dbp; 530*7c478bd9Sstevel@tonic-gate DBT hdr; 531*7c478bd9Sstevel@tonic-gate RINTERNAL ri; 532*7c478bd9Sstevel@tonic-gate int ret; 533*7c478bd9Sstevel@tonic-gate 534*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 535*7c478bd9Sstevel@tonic-gate 536*7c478bd9Sstevel@tonic-gate /* Initialize the page. */ 537*7c478bd9Sstevel@tonic-gate P_INIT(rootp, dbp->pgsize, 538*7c478bd9Sstevel@tonic-gate PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); 539*7c478bd9Sstevel@tonic-gate 540*7c478bd9Sstevel@tonic-gate /* Initialize the header. */ 541*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr)); 542*7c478bd9Sstevel@tonic-gate hdr.data = &ri; 543*7c478bd9Sstevel@tonic-gate hdr.size = RINTERNAL_SIZE; 544*7c478bd9Sstevel@tonic-gate 545*7c478bd9Sstevel@tonic-gate /* Insert the left and right keys, set the header information. */ 546*7c478bd9Sstevel@tonic-gate ri.pgno = lp->pgno; 547*7c478bd9Sstevel@tonic-gate ri.nrecs = __bam_total(lp); 548*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) 549*7c478bd9Sstevel@tonic-gate return (ret); 550*7c478bd9Sstevel@tonic-gate RE_NREC_SET(rootp, ri.nrecs); 551*7c478bd9Sstevel@tonic-gate ri.pgno = rp->pgno; 552*7c478bd9Sstevel@tonic-gate ri.nrecs = __bam_total(rp); 553*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) 554*7c478bd9Sstevel@tonic-gate return (ret); 555*7c478bd9Sstevel@tonic-gate RE_NREC_ADJ(rootp, ri.nrecs); 556*7c478bd9Sstevel@tonic-gate return (0); 557*7c478bd9Sstevel@tonic-gate } 558*7c478bd9Sstevel@tonic-gate 559*7c478bd9Sstevel@tonic-gate /* 560*7c478bd9Sstevel@tonic-gate * __bam_pinsert -- 561*7c478bd9Sstevel@tonic-gate * Insert a new key into a parent page, completing the split. 562*7c478bd9Sstevel@tonic-gate */ 563*7c478bd9Sstevel@tonic-gate static int 564*7c478bd9Sstevel@tonic-gate __bam_pinsert(dbc, parent, lchild, rchild) 565*7c478bd9Sstevel@tonic-gate DBC *dbc; 566*7c478bd9Sstevel@tonic-gate EPG *parent; 567*7c478bd9Sstevel@tonic-gate PAGE *lchild, *rchild; 568*7c478bd9Sstevel@tonic-gate { 569*7c478bd9Sstevel@tonic-gate BINTERNAL bi, *child_bi; 570*7c478bd9Sstevel@tonic-gate BKEYDATA *child_bk, *tmp_bk; 571*7c478bd9Sstevel@tonic-gate BTREE *t; 572*7c478bd9Sstevel@tonic-gate DB *dbp; 573*7c478bd9Sstevel@tonic-gate DBT a, b, hdr, data; 574*7c478bd9Sstevel@tonic-gate PAGE *ppage; 575*7c478bd9Sstevel@tonic-gate RINTERNAL ri; 576*7c478bd9Sstevel@tonic-gate db_indx_t off; 577*7c478bd9Sstevel@tonic-gate db_recno_t nrecs; 578*7c478bd9Sstevel@tonic-gate u_int32_t n, nbytes, nksize; 579*7c478bd9Sstevel@tonic-gate int ret; 580*7c478bd9Sstevel@tonic-gate 581*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 582*7c478bd9Sstevel@tonic-gate t = dbp->internal; 583*7c478bd9Sstevel@tonic-gate ppage = parent->page; 584*7c478bd9Sstevel@tonic-gate 585*7c478bd9Sstevel@tonic-gate /* If handling record numbers, count records split to the right page. */ 586*7c478bd9Sstevel@tonic-gate nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ? 587*7c478bd9Sstevel@tonic-gate __bam_total(rchild) : 0; 588*7c478bd9Sstevel@tonic-gate 589*7c478bd9Sstevel@tonic-gate /* 590*7c478bd9Sstevel@tonic-gate * Now we insert the new page's first key into the parent page, which 591*7c478bd9Sstevel@tonic-gate * completes the split. The parent points to a PAGE and a page index 592*7c478bd9Sstevel@tonic-gate * offset, where the new key goes ONE AFTER the index, because we split 593*7c478bd9Sstevel@tonic-gate * to the right. 594*7c478bd9Sstevel@tonic-gate * 595*7c478bd9Sstevel@tonic-gate * XXX 596*7c478bd9Sstevel@tonic-gate * Some btree algorithms replace the key for the old page as well as 597*7c478bd9Sstevel@tonic-gate * the new page. We don't, as there's no reason to believe that the 598*7c478bd9Sstevel@tonic-gate * first key on the old page is any better than the key we have, and, 599*7c478bd9Sstevel@tonic-gate * in the case of a key being placed at index 0 causing the split, the 600*7c478bd9Sstevel@tonic-gate * key is unavailable. 601*7c478bd9Sstevel@tonic-gate */ 602*7c478bd9Sstevel@tonic-gate off = parent->indx + O_INDX; 603*7c478bd9Sstevel@tonic-gate 604*7c478bd9Sstevel@tonic-gate /* 605*7c478bd9Sstevel@tonic-gate * Calculate the space needed on the parent page. 606*7c478bd9Sstevel@tonic-gate * 607*7c478bd9Sstevel@tonic-gate * Prefix trees: space hack used when inserting into BINTERNAL pages. 608*7c478bd9Sstevel@tonic-gate * Retain only what's needed to distinguish between the new entry and 609*7c478bd9Sstevel@tonic-gate * the LAST entry on the page to its left. If the keys compare equal, 610*7c478bd9Sstevel@tonic-gate * retain the entire key. We ignore overflow keys, and the entire key 611*7c478bd9Sstevel@tonic-gate * must be retained for the next-to-leftmost key on the leftmost page 612*7c478bd9Sstevel@tonic-gate * of each level, or the search will fail. Applicable ONLY to internal 613*7c478bd9Sstevel@tonic-gate * pages that have leaf pages as children. Further reduction of the 614*7c478bd9Sstevel@tonic-gate * key between pairs of internal pages loses too much information. 615*7c478bd9Sstevel@tonic-gate */ 616*7c478bd9Sstevel@tonic-gate switch (TYPE(rchild)) { 617*7c478bd9Sstevel@tonic-gate case P_IBTREE: 618*7c478bd9Sstevel@tonic-gate child_bi = GET_BINTERNAL(rchild, 0); 619*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_PSIZE(child_bi->len); 620*7c478bd9Sstevel@tonic-gate 621*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes) 622*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT); 623*7c478bd9Sstevel@tonic-gate 624*7c478bd9Sstevel@tonic-gate /* Add a new record for the right page. */ 625*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi)); 626*7c478bd9Sstevel@tonic-gate bi.len = child_bi->len; 627*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bi->type, 0); 628*7c478bd9Sstevel@tonic-gate bi.pgno = rchild->pgno; 629*7c478bd9Sstevel@tonic-gate bi.nrecs = nrecs; 630*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr)); 631*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 632*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 633*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data)); 634*7c478bd9Sstevel@tonic-gate data.data = child_bi->data; 635*7c478bd9Sstevel@tonic-gate data.size = child_bi->len; 636*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, ppage, off, 637*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) 638*7c478bd9Sstevel@tonic-gate return (ret); 639*7c478bd9Sstevel@tonic-gate 640*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */ 641*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bi->type) == B_OVERFLOW) 642*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc, 643*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) 644*7c478bd9Sstevel@tonic-gate return (ret); 645*7c478bd9Sstevel@tonic-gate break; 646*7c478bd9Sstevel@tonic-gate case P_LBTREE: 647*7c478bd9Sstevel@tonic-gate child_bk = GET_BKEYDATA(rchild, 0); 648*7c478bd9Sstevel@tonic-gate switch (B_TYPE(child_bk->type)) { 649*7c478bd9Sstevel@tonic-gate case B_KEYDATA: 650*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_PSIZE(child_bk->len); 651*7c478bd9Sstevel@tonic-gate nksize = child_bk->len; 652*7c478bd9Sstevel@tonic-gate if (t->bt_prefix == NULL) 653*7c478bd9Sstevel@tonic-gate goto noprefix; 654*7c478bd9Sstevel@tonic-gate if (ppage->prev_pgno == PGNO_INVALID && off <= 1) 655*7c478bd9Sstevel@tonic-gate goto noprefix; 656*7c478bd9Sstevel@tonic-gate tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX); 657*7c478bd9Sstevel@tonic-gate if (B_TYPE(tmp_bk->type) != B_KEYDATA) 658*7c478bd9Sstevel@tonic-gate goto noprefix; 659*7c478bd9Sstevel@tonic-gate memset(&a, 0, sizeof(a)); 660*7c478bd9Sstevel@tonic-gate a.size = tmp_bk->len; 661*7c478bd9Sstevel@tonic-gate a.data = tmp_bk->data; 662*7c478bd9Sstevel@tonic-gate memset(&b, 0, sizeof(b)); 663*7c478bd9Sstevel@tonic-gate b.size = child_bk->len; 664*7c478bd9Sstevel@tonic-gate b.data = child_bk->data; 665*7c478bd9Sstevel@tonic-gate nksize = t->bt_prefix(&a, &b); 666*7c478bd9Sstevel@tonic-gate if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) 667*7c478bd9Sstevel@tonic-gate nbytes = n; 668*7c478bd9Sstevel@tonic-gate else 669*7c478bd9Sstevel@tonic-gate noprefix: nksize = child_bk->len; 670*7c478bd9Sstevel@tonic-gate 671*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes) 672*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT); 673*7c478bd9Sstevel@tonic-gate 674*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi)); 675*7c478bd9Sstevel@tonic-gate bi.len = nksize; 676*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0); 677*7c478bd9Sstevel@tonic-gate bi.pgno = rchild->pgno; 678*7c478bd9Sstevel@tonic-gate bi.nrecs = nrecs; 679*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr)); 680*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 681*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 682*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data)); 683*7c478bd9Sstevel@tonic-gate data.data = child_bk->data; 684*7c478bd9Sstevel@tonic-gate data.size = nksize; 685*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, ppage, off, 686*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) 687*7c478bd9Sstevel@tonic-gate return (ret); 688*7c478bd9Sstevel@tonic-gate break; 689*7c478bd9Sstevel@tonic-gate case B_DUPLICATE: 690*7c478bd9Sstevel@tonic-gate case B_OVERFLOW: 691*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); 692*7c478bd9Sstevel@tonic-gate 693*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes) 694*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT); 695*7c478bd9Sstevel@tonic-gate 696*7c478bd9Sstevel@tonic-gate memset(&bi, 0, sizeof(bi)); 697*7c478bd9Sstevel@tonic-gate bi.len = BOVERFLOW_SIZE; 698*7c478bd9Sstevel@tonic-gate B_TSET(bi.type, child_bk->type, 0); 699*7c478bd9Sstevel@tonic-gate bi.pgno = rchild->pgno; 700*7c478bd9Sstevel@tonic-gate bi.nrecs = nrecs; 701*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr)); 702*7c478bd9Sstevel@tonic-gate hdr.data = &bi; 703*7c478bd9Sstevel@tonic-gate hdr.size = SSZA(BINTERNAL, data); 704*7c478bd9Sstevel@tonic-gate memset(&data, 0, sizeof(data)); 705*7c478bd9Sstevel@tonic-gate data.data = child_bk; 706*7c478bd9Sstevel@tonic-gate data.size = BOVERFLOW_SIZE; 707*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, ppage, off, 708*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) 709*7c478bd9Sstevel@tonic-gate return (ret); 710*7c478bd9Sstevel@tonic-gate 711*7c478bd9Sstevel@tonic-gate /* Increment the overflow ref count. */ 712*7c478bd9Sstevel@tonic-gate if (B_TYPE(child_bk->type) == B_OVERFLOW) 713*7c478bd9Sstevel@tonic-gate if ((ret = __db_ovref(dbc, 714*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) 715*7c478bd9Sstevel@tonic-gate return (ret); 716*7c478bd9Sstevel@tonic-gate break; 717*7c478bd9Sstevel@tonic-gate default: 718*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rchild->pgno)); 719*7c478bd9Sstevel@tonic-gate } 720*7c478bd9Sstevel@tonic-gate break; 721*7c478bd9Sstevel@tonic-gate case P_IRECNO: 722*7c478bd9Sstevel@tonic-gate case P_LRECNO: 723*7c478bd9Sstevel@tonic-gate nbytes = RINTERNAL_PSIZE; 724*7c478bd9Sstevel@tonic-gate 725*7c478bd9Sstevel@tonic-gate if (P_FREESPACE(ppage) < nbytes) 726*7c478bd9Sstevel@tonic-gate return (DB_NEEDSPLIT); 727*7c478bd9Sstevel@tonic-gate 728*7c478bd9Sstevel@tonic-gate /* Add a new record for the right page. */ 729*7c478bd9Sstevel@tonic-gate memset(&hdr, 0, sizeof(hdr)); 730*7c478bd9Sstevel@tonic-gate hdr.data = &ri; 731*7c478bd9Sstevel@tonic-gate hdr.size = RINTERNAL_SIZE; 732*7c478bd9Sstevel@tonic-gate ri.pgno = rchild->pgno; 733*7c478bd9Sstevel@tonic-gate ri.nrecs = nrecs; 734*7c478bd9Sstevel@tonic-gate if ((ret = __db_pitem(dbc, 735*7c478bd9Sstevel@tonic-gate ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) 736*7c478bd9Sstevel@tonic-gate return (ret); 737*7c478bd9Sstevel@tonic-gate break; 738*7c478bd9Sstevel@tonic-gate default: 739*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, rchild->pgno)); 740*7c478bd9Sstevel@tonic-gate } 741*7c478bd9Sstevel@tonic-gate 742*7c478bd9Sstevel@tonic-gate /* Adjust the parent page's left page record count. */ 743*7c478bd9Sstevel@tonic-gate if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { 744*7c478bd9Sstevel@tonic-gate /* Log the change. */ 745*7c478bd9Sstevel@tonic-gate if (DB_LOGGING(dbc) && 746*7c478bd9Sstevel@tonic-gate (ret = __bam_cadjust_log(dbp->dbenv->lg_info, 747*7c478bd9Sstevel@tonic-gate dbc->txn, &LSN(ppage), 0, dbp->log_fileid, 748*7c478bd9Sstevel@tonic-gate PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx, 749*7c478bd9Sstevel@tonic-gate -(int32_t)nrecs, (int32_t)0)) != 0) 750*7c478bd9Sstevel@tonic-gate return (ret); 751*7c478bd9Sstevel@tonic-gate 752*7c478bd9Sstevel@tonic-gate /* Update the left page count. */ 753*7c478bd9Sstevel@tonic-gate if (dbp->type == DB_RECNO) 754*7c478bd9Sstevel@tonic-gate GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs; 755*7c478bd9Sstevel@tonic-gate else 756*7c478bd9Sstevel@tonic-gate GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs; 757*7c478bd9Sstevel@tonic-gate } 758*7c478bd9Sstevel@tonic-gate 759*7c478bd9Sstevel@tonic-gate return (0); 760*7c478bd9Sstevel@tonic-gate } 761*7c478bd9Sstevel@tonic-gate 762*7c478bd9Sstevel@tonic-gate /* 763*7c478bd9Sstevel@tonic-gate * __bam_psplit -- 764*7c478bd9Sstevel@tonic-gate * Do the real work of splitting the page. 765*7c478bd9Sstevel@tonic-gate */ 766*7c478bd9Sstevel@tonic-gate static int 767*7c478bd9Sstevel@tonic-gate __bam_psplit(dbc, cp, lp, rp, splitret) 768*7c478bd9Sstevel@tonic-gate DBC *dbc; 769*7c478bd9Sstevel@tonic-gate EPG *cp; 770*7c478bd9Sstevel@tonic-gate PAGE *lp, *rp; 771*7c478bd9Sstevel@tonic-gate db_indx_t *splitret; 772*7c478bd9Sstevel@tonic-gate { 773*7c478bd9Sstevel@tonic-gate DB *dbp; 774*7c478bd9Sstevel@tonic-gate PAGE *pp; 775*7c478bd9Sstevel@tonic-gate db_indx_t half, nbytes, off, splitp, top; 776*7c478bd9Sstevel@tonic-gate int adjust, cnt, isbigkey, ret; 777*7c478bd9Sstevel@tonic-gate 778*7c478bd9Sstevel@tonic-gate dbp = dbc->dbp; 779*7c478bd9Sstevel@tonic-gate pp = cp->page; 780*7c478bd9Sstevel@tonic-gate adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; 781*7c478bd9Sstevel@tonic-gate 782*7c478bd9Sstevel@tonic-gate /* 783*7c478bd9Sstevel@tonic-gate * If we're splitting the first (last) page on a level because we're 784*7c478bd9Sstevel@tonic-gate * inserting (appending) a key to it, it's likely that the data is 785*7c478bd9Sstevel@tonic-gate * sorted. Moving a single item to the new page is less work and can 786*7c478bd9Sstevel@tonic-gate * push the fill factor higher than normal. If we're wrong it's not 787*7c478bd9Sstevel@tonic-gate * a big deal, we'll just do the split the right way next time. 788*7c478bd9Sstevel@tonic-gate */ 789*7c478bd9Sstevel@tonic-gate off = 0; 790*7c478bd9Sstevel@tonic-gate if (NEXT_PGNO(pp) == PGNO_INVALID && 791*7c478bd9Sstevel@tonic-gate ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) || 792*7c478bd9Sstevel@tonic-gate (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page)))) 793*7c478bd9Sstevel@tonic-gate off = NUM_ENT(cp->page) - adjust; 794*7c478bd9Sstevel@tonic-gate else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) 795*7c478bd9Sstevel@tonic-gate off = adjust; 796*7c478bd9Sstevel@tonic-gate 797*7c478bd9Sstevel@tonic-gate if (off != 0) 798*7c478bd9Sstevel@tonic-gate goto sort; 799*7c478bd9Sstevel@tonic-gate 800*7c478bd9Sstevel@tonic-gate /* 801*7c478bd9Sstevel@tonic-gate * Split the data to the left and right pages. Try not to split on 802*7c478bd9Sstevel@tonic-gate * an overflow key. (Overflow keys on internal pages will slow down 803*7c478bd9Sstevel@tonic-gate * searches.) Refuse to split in the middle of a set of duplicates. 804*7c478bd9Sstevel@tonic-gate * 805*7c478bd9Sstevel@tonic-gate * First, find the optimum place to split. 806*7c478bd9Sstevel@tonic-gate * 807*7c478bd9Sstevel@tonic-gate * It's possible to try and split past the last record on the page if 808*7c478bd9Sstevel@tonic-gate * there's a very large record at the end of the page. Make sure this 809*7c478bd9Sstevel@tonic-gate * doesn't happen by bounding the check at the next-to-last entry on 810*7c478bd9Sstevel@tonic-gate * the page. 811*7c478bd9Sstevel@tonic-gate * 812*7c478bd9Sstevel@tonic-gate * Note, we try and split half the data present on the page. This is 813*7c478bd9Sstevel@tonic-gate * because another process may have already split the page and left 814*7c478bd9Sstevel@tonic-gate * it half empty. We don't try and skip the split -- we don't know 815*7c478bd9Sstevel@tonic-gate * how much space we're going to need on the page, and we may need up 816*7c478bd9Sstevel@tonic-gate * to half the page for a big item, so there's no easy test to decide 817*7c478bd9Sstevel@tonic-gate * if we need to split or not. Besides, if two threads are inserting 818*7c478bd9Sstevel@tonic-gate * data into the same place in the database, we're probably going to 819*7c478bd9Sstevel@tonic-gate * need more space soon anyway. 820*7c478bd9Sstevel@tonic-gate */ 821*7c478bd9Sstevel@tonic-gate top = NUM_ENT(pp) - adjust; 822*7c478bd9Sstevel@tonic-gate half = (dbp->pgsize - HOFFSET(pp)) / 2; 823*7c478bd9Sstevel@tonic-gate for (nbytes = 0, off = 0; off < top && nbytes < half; ++off) 824*7c478bd9Sstevel@tonic-gate switch (TYPE(pp)) { 825*7c478bd9Sstevel@tonic-gate case P_IBTREE: 826*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA) 827*7c478bd9Sstevel@tonic-gate nbytes += 828*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len); 829*7c478bd9Sstevel@tonic-gate else 830*7c478bd9Sstevel@tonic-gate nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); 831*7c478bd9Sstevel@tonic-gate break; 832*7c478bd9Sstevel@tonic-gate case P_LBTREE: 833*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) 834*7c478bd9Sstevel@tonic-gate nbytes += 835*7c478bd9Sstevel@tonic-gate BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); 836*7c478bd9Sstevel@tonic-gate else 837*7c478bd9Sstevel@tonic-gate nbytes += BOVERFLOW_SIZE; 838*7c478bd9Sstevel@tonic-gate 839*7c478bd9Sstevel@tonic-gate ++off; 840*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) 841*7c478bd9Sstevel@tonic-gate nbytes += 842*7c478bd9Sstevel@tonic-gate BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); 843*7c478bd9Sstevel@tonic-gate else 844*7c478bd9Sstevel@tonic-gate nbytes += BOVERFLOW_SIZE; 845*7c478bd9Sstevel@tonic-gate break; 846*7c478bd9Sstevel@tonic-gate case P_IRECNO: 847*7c478bd9Sstevel@tonic-gate nbytes += RINTERNAL_SIZE; 848*7c478bd9Sstevel@tonic-gate break; 849*7c478bd9Sstevel@tonic-gate case P_LRECNO: 850*7c478bd9Sstevel@tonic-gate nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); 851*7c478bd9Sstevel@tonic-gate break; 852*7c478bd9Sstevel@tonic-gate default: 853*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, pp->pgno)); 854*7c478bd9Sstevel@tonic-gate } 855*7c478bd9Sstevel@tonic-gate sort: splitp = off; 856*7c478bd9Sstevel@tonic-gate 857*7c478bd9Sstevel@tonic-gate /* 858*7c478bd9Sstevel@tonic-gate * Splitp is either at or just past the optimum split point. If 859*7c478bd9Sstevel@tonic-gate * it's a big key, try and find something close by that's not. 860*7c478bd9Sstevel@tonic-gate */ 861*7c478bd9Sstevel@tonic-gate if (TYPE(pp) == P_IBTREE) 862*7c478bd9Sstevel@tonic-gate isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA; 863*7c478bd9Sstevel@tonic-gate else if (TYPE(pp) == P_LBTREE) 864*7c478bd9Sstevel@tonic-gate isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA; 865*7c478bd9Sstevel@tonic-gate else 866*7c478bd9Sstevel@tonic-gate isbigkey = 0; 867*7c478bd9Sstevel@tonic-gate if (isbigkey) 868*7c478bd9Sstevel@tonic-gate for (cnt = 1; cnt <= 3; ++cnt) { 869*7c478bd9Sstevel@tonic-gate off = splitp + cnt * adjust; 870*7c478bd9Sstevel@tonic-gate if (off < (db_indx_t)NUM_ENT(pp) && 871*7c478bd9Sstevel@tonic-gate ((TYPE(pp) == P_IBTREE && 872*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) || 873*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) { 874*7c478bd9Sstevel@tonic-gate splitp = off; 875*7c478bd9Sstevel@tonic-gate break; 876*7c478bd9Sstevel@tonic-gate } 877*7c478bd9Sstevel@tonic-gate if (splitp <= (db_indx_t)(cnt * adjust)) 878*7c478bd9Sstevel@tonic-gate continue; 879*7c478bd9Sstevel@tonic-gate off = splitp - cnt * adjust; 880*7c478bd9Sstevel@tonic-gate if (TYPE(pp) == P_IBTREE ? 881*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA : 882*7c478bd9Sstevel@tonic-gate B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) { 883*7c478bd9Sstevel@tonic-gate splitp = off; 884*7c478bd9Sstevel@tonic-gate break; 885*7c478bd9Sstevel@tonic-gate } 886*7c478bd9Sstevel@tonic-gate } 887*7c478bd9Sstevel@tonic-gate 888*7c478bd9Sstevel@tonic-gate /* 889*7c478bd9Sstevel@tonic-gate * We can't split in the middle a set of duplicates. We know that 890*7c478bd9Sstevel@tonic-gate * no duplicate set can take up more than about 25% of the page, 891*7c478bd9Sstevel@tonic-gate * because that's the point where we push it off onto a duplicate 892*7c478bd9Sstevel@tonic-gate * page set. So, this loop can't be unbounded. 893*7c478bd9Sstevel@tonic-gate */ 894*7c478bd9Sstevel@tonic-gate if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE && 895*7c478bd9Sstevel@tonic-gate pp->inp[splitp] == pp->inp[splitp - adjust]) 896*7c478bd9Sstevel@tonic-gate for (cnt = 1;; ++cnt) { 897*7c478bd9Sstevel@tonic-gate off = splitp + cnt * adjust; 898*7c478bd9Sstevel@tonic-gate if (off < NUM_ENT(pp) && 899*7c478bd9Sstevel@tonic-gate pp->inp[splitp] != pp->inp[off]) { 900*7c478bd9Sstevel@tonic-gate splitp = off; 901*7c478bd9Sstevel@tonic-gate break; 902*7c478bd9Sstevel@tonic-gate } 903*7c478bd9Sstevel@tonic-gate if (splitp <= (db_indx_t)(cnt * adjust)) 904*7c478bd9Sstevel@tonic-gate continue; 905*7c478bd9Sstevel@tonic-gate off = splitp - cnt * adjust; 906*7c478bd9Sstevel@tonic-gate if (pp->inp[splitp] != pp->inp[off]) { 907*7c478bd9Sstevel@tonic-gate splitp = off + adjust; 908*7c478bd9Sstevel@tonic-gate break; 909*7c478bd9Sstevel@tonic-gate } 910*7c478bd9Sstevel@tonic-gate } 911*7c478bd9Sstevel@tonic-gate 912*7c478bd9Sstevel@tonic-gate 913*7c478bd9Sstevel@tonic-gate /* We're going to split at splitp. */ 914*7c478bd9Sstevel@tonic-gate if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0) 915*7c478bd9Sstevel@tonic-gate return (ret); 916*7c478bd9Sstevel@tonic-gate if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) 917*7c478bd9Sstevel@tonic-gate return (ret); 918*7c478bd9Sstevel@tonic-gate 919*7c478bd9Sstevel@tonic-gate *splitret = splitp; 920*7c478bd9Sstevel@tonic-gate return (0); 921*7c478bd9Sstevel@tonic-gate } 922*7c478bd9Sstevel@tonic-gate 923*7c478bd9Sstevel@tonic-gate /* 924*7c478bd9Sstevel@tonic-gate * __bam_copy -- 925*7c478bd9Sstevel@tonic-gate * Copy a set of records from one page to another. 926*7c478bd9Sstevel@tonic-gate * 927*7c478bd9Sstevel@tonic-gate * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); 928*7c478bd9Sstevel@tonic-gate */ 929*7c478bd9Sstevel@tonic-gate int 930*7c478bd9Sstevel@tonic-gate __bam_copy(dbp, pp, cp, nxt, stop) 931*7c478bd9Sstevel@tonic-gate DB *dbp; 932*7c478bd9Sstevel@tonic-gate PAGE *pp, *cp; 933*7c478bd9Sstevel@tonic-gate u_int32_t nxt, stop; 934*7c478bd9Sstevel@tonic-gate { 935*7c478bd9Sstevel@tonic-gate db_indx_t nbytes, off; 936*7c478bd9Sstevel@tonic-gate 937*7c478bd9Sstevel@tonic-gate /* 938*7c478bd9Sstevel@tonic-gate * Copy the rest of the data to the right page. Nxt is the next 939*7c478bd9Sstevel@tonic-gate * offset placed on the target page. 940*7c478bd9Sstevel@tonic-gate */ 941*7c478bd9Sstevel@tonic-gate for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { 942*7c478bd9Sstevel@tonic-gate switch (TYPE(pp)) { 943*7c478bd9Sstevel@tonic-gate case P_IBTREE: 944*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA) 945*7c478bd9Sstevel@tonic-gate nbytes = 946*7c478bd9Sstevel@tonic-gate BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len); 947*7c478bd9Sstevel@tonic-gate else 948*7c478bd9Sstevel@tonic-gate nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); 949*7c478bd9Sstevel@tonic-gate break; 950*7c478bd9Sstevel@tonic-gate case P_LBTREE: 951*7c478bd9Sstevel@tonic-gate /* 952*7c478bd9Sstevel@tonic-gate * If we're on a key and it's a duplicate, just copy 953*7c478bd9Sstevel@tonic-gate * the offset. 954*7c478bd9Sstevel@tonic-gate */ 955*7c478bd9Sstevel@tonic-gate if (off != 0 && (nxt % P_INDX) == 0 && 956*7c478bd9Sstevel@tonic-gate pp->inp[nxt] == pp->inp[nxt - P_INDX]) { 957*7c478bd9Sstevel@tonic-gate cp->inp[off] = cp->inp[off - P_INDX]; 958*7c478bd9Sstevel@tonic-gate continue; 959*7c478bd9Sstevel@tonic-gate } 960*7c478bd9Sstevel@tonic-gate /* FALLTHROUGH */ 961*7c478bd9Sstevel@tonic-gate case P_LRECNO: 962*7c478bd9Sstevel@tonic-gate if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA) 963*7c478bd9Sstevel@tonic-gate nbytes = 964*7c478bd9Sstevel@tonic-gate BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len); 965*7c478bd9Sstevel@tonic-gate else 966*7c478bd9Sstevel@tonic-gate nbytes = BOVERFLOW_SIZE; 967*7c478bd9Sstevel@tonic-gate break; 968*7c478bd9Sstevel@tonic-gate case P_IRECNO: 969*7c478bd9Sstevel@tonic-gate nbytes = RINTERNAL_SIZE; 970*7c478bd9Sstevel@tonic-gate break; 971*7c478bd9Sstevel@tonic-gate default: 972*7c478bd9Sstevel@tonic-gate return (__db_pgfmt(dbp, pp->pgno)); 973*7c478bd9Sstevel@tonic-gate } 974*7c478bd9Sstevel@tonic-gate cp->inp[off] = HOFFSET(cp) -= nbytes; 975*7c478bd9Sstevel@tonic-gate memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes); 976*7c478bd9Sstevel@tonic-gate } 977*7c478bd9Sstevel@tonic-gate return (0); 978*7c478bd9Sstevel@tonic-gate } 979