/* * Copyright (c) 1990, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Margo Seltzer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * PACKAGE: hashing * * DESCRIPTION: * Page manipulation for hashing package. * * ROUTINES: * * External * __get_page * __add_ovflpage * Internal * overflow_page * open_temp */ #include #ifdef DEBUG #include #endif #include #include #include #include #include #include "db-int.h" #include "hash.h" #include "page.h" #include "extern.h" static int32_t add_bigptr __P((HTAB *, ITEM_INFO *, indx_t)); static u_int32_t *fetch_bitmap __P((HTAB *, int32_t)); static u_int32_t first_free __P((u_int32_t)); #ifdef DEBUG static indx_t next_realkey __P((PAGE16 *, indx_t)); #endif static u_int16_t overflow_page __P((HTAB *)); static void page_init __P((HTAB *, PAGE16 *, db_pgno_t, u_int8_t)); static indx_t prev_realkey __P((PAGE16 *, indx_t)); static void putpair __P((PAGE8 *, const DBT *, const DBT *)); static void swap_page_header_in __P((PAGE16 *)); static void swap_page_header_out __P((PAGE16 *)); #ifdef DEBUG_SLOW static void account_page(HTAB *, db_pgno_t, int); #endif u_int32_t __get_item(hashp, cursorp, key, val, item_info) HTAB *hashp; CURSOR *cursorp; DBT *key, *val; ITEM_INFO *item_info; { db_pgno_t next_pgno; int32_t i; /* Check if we need to get a page. */ if (!cursorp->pagep) { if (cursorp->pgno == INVALID_PGNO) { cursorp->pagep = __get_page(hashp, cursorp->bucket, A_BUCKET); cursorp->pgno = ADDR(cursorp->pagep); cursorp->ndx = 0; cursorp->pgndx = 0; } else cursorp->pagep = __get_page(hashp, cursorp->pgno, A_RAW); if (!cursorp->pagep) { item_info->status = ITEM_ERROR; return (-1); } } if (item_info->seek_size && FREESPACE(cursorp->pagep) > item_info->seek_size) item_info->seek_found_page = cursorp->pgno; if (cursorp->pgndx == NUM_ENT(cursorp->pagep)) { /* Fetch next page. */ if (NEXT_PGNO(cursorp->pagep) == INVALID_PGNO) { item_info->status = ITEM_NO_MORE; return (-1); } next_pgno = NEXT_PGNO(cursorp->pagep); cursorp->pgndx = 0; __put_page(hashp, cursorp->pagep, A_RAW, 0); cursorp->pagep = __get_page(hashp, next_pgno, A_RAW); if (!cursorp->pagep) { item_info->status = ITEM_ERROR; return (-1); } cursorp->pgno = next_pgno; } if (KEY_OFF(cursorp->pagep, cursorp->pgndx) != BIGPAIR) { if ((i = prev_realkey(cursorp->pagep, cursorp->pgndx)) == cursorp->pgndx) key->size = hashp->hdr.bsize - KEY_OFF(cursorp->pagep, cursorp->pgndx); else key->size = DATA_OFF(cursorp->pagep, i) - KEY_OFF(cursorp->pagep, cursorp->pgndx); } /* * All of this information will be set incorrectly for big keys, but * it will be ignored anyway. */ val->size = KEY_OFF(cursorp->pagep, cursorp->pgndx) - DATA_OFF(cursorp->pagep, cursorp->pgndx); key->data = KEY(cursorp->pagep, cursorp->pgndx); val->data = DATA(cursorp->pagep, cursorp->pgndx); item_info->pgno = cursorp->pgno; item_info->bucket = cursorp->bucket; item_info->ndx = cursorp->ndx; item_info->pgndx = cursorp->pgndx; item_info->key_off = KEY_OFF(cursorp->pagep, cursorp->pgndx); item_info->data_off = DATA_OFF(cursorp->pagep, cursorp->pgndx); item_info->status = ITEM_OK; return (0); } u_int32_t __get_item_reset(hashp, cursorp) HTAB *hashp; CURSOR *cursorp; { if (cursorp->pagep) __put_page(hashp, cursorp->pagep, A_RAW, 0); cursorp->pagep = NULL; cursorp->bucket = -1; cursorp->ndx = 0; cursorp->pgndx = 0; cursorp->pgno = INVALID_PGNO; return (0); } u_int32_t __get_item_done(hashp, cursorp) HTAB *hashp; CURSOR *cursorp; { if (cursorp->pagep) __put_page(hashp, cursorp->pagep, A_RAW, 0); cursorp->pagep = NULL; /* * We don't throw out the page number since we might want to * continue getting on this page. */ return (0); } u_int32_t __get_item_first(hashp, cursorp, key, val, item_info) HTAB *hashp; CURSOR *cursorp; DBT *key, *val; ITEM_INFO *item_info; { __get_item_reset(hashp, cursorp); cursorp->bucket = 0; return (__get_item_next(hashp, cursorp, key, val, item_info)); } /* * Returns a pointer to key/data pair on a page. In the case of bigkeys, * just returns the page number and index of the bigkey pointer pair. */ u_int32_t __get_item_next(hashp, cursorp, key, val, item_info) HTAB *hashp; CURSOR *cursorp; DBT *key, *val; ITEM_INFO *item_info; { int status; status = __get_item(hashp, cursorp, key, val, item_info); cursorp->ndx++; cursorp->pgndx++; return (status); } /* * Put a non-big pair on a page. */ static void putpair(p, key, val) PAGE8 *p; const DBT *key, *val; { u_int16_t *pagep, n, off; pagep = (PAGE16 *)p; /* Items on the page are 0-indexed. */ n = NUM_ENT(pagep); off = OFFSET(pagep) - key->size + 1; memmove(p + off, key->data, key->size); KEY_OFF(pagep, n) = off; off -= val->size; memmove(p + off, val->data, val->size); DATA_OFF(pagep, n) = off; /* Adjust page info. */ NUM_ENT(pagep) = n + 1; OFFSET(pagep) = off - 1; } /* * Returns the index of the next non-bigkey pair after n on the page. * Returns -1 if there are no more non-big things on the page. */ #ifdef DEBUG static indx_t next_realkey(PAGE16 * pagep, indx_t n) { indx_t i; for (i = n + 1; i < NUM_ENT(pagep); i++) if (KEY_OFF(pagep, i) != BIGPAIR) return (i); return (-1); } #endif /* * Returns the index of the previous non-bigkey pair after n on the page. * Returns n if there are no previous non-big things on the page. */ static indx_t #ifdef __STDC__ prev_realkey(PAGE16 * pagep, indx_t n) #else prev_realkey(pagep, n) PAGE16 *pagep; u_int32_t n; #endif { int32_t i; /* Need a signed value to do the compare properly. */ for (i = n - 1; i > -1; i--) if (KEY_OFF(pagep, i) != BIGPAIR) return (i); return (n); } /* * Returns: * 0 OK * -1 error */ extern int32_t __delpair(hashp, cursorp, item_info) HTAB *hashp; CURSOR *cursorp; ITEM_INFO *item_info; { PAGE16 *pagep; indx_t ndx; short check_ndx; int16_t delta, len; int32_t n; u_int8_t *src, *dest; ndx = cursorp->pgndx; if (!cursorp->pagep) { pagep = __get_page(hashp, cursorp->pgno, A_RAW); if (!pagep) return (-1); /* * KLUGE: pgndx has gone one too far, because cursor points * to the _next_ item. Use pgndx - 1. */ --ndx; } else pagep = cursorp->pagep; #ifdef DEBUG assert(ADDR(pagep) == cursorp->pgno); #endif if (KEY_OFF(pagep, ndx) == BIGPAIR) { delta = 0; __big_delete(hashp, pagep, ndx); } else { /* * Compute "delta", the amount we have to shift all of the * offsets. To find the delta, we need to make sure that * we aren't looking at the DATA_OFF of a big/keydata pair. */ for (check_ndx = (short)(ndx - 1); check_ndx >= 0 && KEY_OFF(pagep, check_ndx) == BIGPAIR; check_ndx--); if (check_ndx < 0) delta = hashp->hdr.bsize - DATA_OFF(pagep, ndx); else delta = DATA_OFF(pagep, check_ndx) - DATA_OFF(pagep, ndx); /* * The hard case: we want to remove something other than * the last item on the page. We need to shift data and * offsets down. */ if (ndx != NUM_ENT(pagep) - 1) { /* * Move the data: src is the address of the last data * item on the page. */ src = (u_int8_t *)pagep + OFFSET(pagep) + 1; /* * Length is the distance between where to start * deleting and end of the data on the page. */ len = DATA_OFF(pagep, ndx) - (OFFSET(pagep) + 1); /* * Dest is the location of the to-be-deleted item * occupied - length. */ if (check_ndx < 0) dest = (u_int8_t *)pagep + hashp->hdr.bsize - len; else dest = (u_int8_t *)pagep + DATA_OFF(pagep, (check_ndx)) - len; memmove(dest, src, len); } } /* Adjust the offsets. */ for (n = ndx; n < NUM_ENT(pagep) - 1; n++) if (KEY_OFF(pagep, (n + 1)) != BIGPAIR) { #ifdef DEBUG assert(next_realkey(pagep, n) != -1); #endif KEY_OFF(pagep, n) = KEY_OFF(pagep, (n + 1)) + delta; DATA_OFF(pagep, n) = DATA_OFF(pagep, (n + 1)) + delta; } else { KEY_OFF(pagep, n) = KEY_OFF(pagep, (n + 1)); DATA_OFF(pagep, n) = DATA_OFF(pagep, (n + 1)); } /* Adjust page metadata. */ OFFSET(pagep) = OFFSET(pagep) + delta; NUM_ENT(pagep) = NUM_ENT(pagep) - 1; --hashp->hdr.nkeys; /* Is this page now an empty overflow page? If so, free it. */ if (TYPE(pagep) == HASH_OVFLPAGE && NUM_ENT(pagep) == 0) { PAGE16 *empty_page; db_pgno_t to_find, next_pgno, link_page; /* * We need to go back to the first page in the chain and * look for this page so that we can update the previous * page's NEXT_PGNO field. */ to_find = ADDR(pagep); empty_page = pagep; link_page = NEXT_PGNO(empty_page); pagep = __get_page(hashp, item_info->bucket, A_BUCKET); if (!pagep) return (-1); while (NEXT_PGNO(pagep) != to_find) { next_pgno = NEXT_PGNO(pagep); #ifdef DEBUG assert(next_pgno != INVALID_PGNO); #endif __put_page(hashp, pagep, A_RAW, 0); pagep = __get_page(hashp, next_pgno, A_RAW); if (!pagep) return (-1); } /* * At this point, pagep should point to the page before the * page to be deleted. */ NEXT_PGNO(pagep) = link_page; if (item_info->pgno == to_find) { item_info->pgno = ADDR(pagep); item_info->pgndx = NUM_ENT(pagep); item_info->seek_found_page = ADDR(pagep); } __delete_page(hashp, empty_page, A_OVFL); } __put_page(hashp, pagep, A_RAW, 1); return (0); } extern int32_t __split_page(hashp, obucket, nbucket) HTAB *hashp; u_int32_t obucket, nbucket; { DBT key, val; ITEM_INFO old_ii, new_ii; PAGE16 *old_pagep, *temp_pagep; db_pgno_t next_pgno; int32_t off; u_int16_t n; int8_t base_page; off = hashp->hdr.bsize; old_pagep = __get_page(hashp, obucket, A_BUCKET); base_page = 1; temp_pagep = hashp->split_buf; memcpy(temp_pagep, old_pagep, hashp->hdr.bsize); page_init(hashp, old_pagep, ADDR(old_pagep), HASH_PAGE); __put_page(hashp, old_pagep, A_RAW, 1); old_ii.pgno = BUCKET_TO_PAGE(obucket); new_ii.pgno = BUCKET_TO_PAGE(nbucket); old_ii.bucket = obucket; new_ii.bucket = nbucket; old_ii.seek_found_page = new_ii.seek_found_page = 0; while (temp_pagep != 0) { off = hashp->hdr.bsize; for (n = 0; n < NUM_ENT(temp_pagep); n++) { if (KEY_OFF(temp_pagep, n) == BIGPAIR) { __get_bigkey(hashp, temp_pagep, n, &key); if (__call_hash(hashp, key.data, key.size) == obucket) add_bigptr(hashp, &old_ii, DATA_OFF(temp_pagep, n)); else add_bigptr(hashp, &new_ii, DATA_OFF(temp_pagep, n)); } else { key.size = off - KEY_OFF(temp_pagep, n); key.data = KEY(temp_pagep, n); off = KEY_OFF(temp_pagep, n); val.size = off - DATA_OFF(temp_pagep, n); val.data = DATA(temp_pagep, n); if (__call_hash(hashp, key.data, key.size) == obucket) __addel(hashp, &old_ii, &key, &val, NO_EXPAND, 1); else __addel(hashp, &new_ii, &key, &val, NO_EXPAND, 1); off = DATA_OFF(temp_pagep, n); } } next_pgno = NEXT_PGNO(temp_pagep); /* Clear temp_page; if it's an overflow page, free it. */ if (!base_page) __delete_page(hashp, temp_pagep, A_OVFL); else base_page = 0; if (next_pgno != INVALID_PGNO) temp_pagep = __get_page(hashp, next_pgno, A_RAW); else break; } return (0); } /* * Add the given pair to the page. * * * Returns: * 0 ==> OK * -1 ==> failure */ extern int32_t #ifdef __STDC__ __addel(HTAB *hashp, ITEM_INFO *item_info, const DBT *key, const DBT *val, u_int32_t num_items, const u_int8_t expanding) #else __addel(hashp, item_info, key, val, num_items, expanding) HTAB *hashp; ITEM_INFO *item_info; const DBT *key, *val; u_int32_t num_items; const u_int32_t expanding; #endif { PAGE16 *pagep; int32_t do_expand; db_pgno_t next_pgno; do_expand = 0; pagep = __get_page(hashp, item_info->seek_found_page != 0 ? item_info->seek_found_page : item_info->pgno, A_RAW); if (!pagep) return (-1); /* Advance to first page in chain with room for item. */ while (NUM_ENT(pagep) && NEXT_PGNO(pagep) != INVALID_PGNO) { /* * This may not be the end of the chain, but the pair may fit * anyway. */ if (ISBIG(PAIRSIZE(key, val), hashp) && BIGPAIRFITS(pagep)) break; if (PAIRFITS(pagep, key, val)) break; next_pgno = NEXT_PGNO(pagep); __put_page(hashp, pagep, A_RAW, 0); pagep = (PAGE16 *)__get_page(hashp, next_pgno, A_RAW); if (!pagep) return (-1); } if ((ISBIG(PAIRSIZE(key, val), hashp) && !BIGPAIRFITS(pagep)) || (!ISBIG(PAIRSIZE(key, val), hashp) && !PAIRFITS(pagep, key, val))) { do_expand = 1; pagep = __add_ovflpage(hashp, pagep); if (!pagep) return (-1); if ((ISBIG(PAIRSIZE(key, val), hashp) && !BIGPAIRFITS(pagep)) || (!ISBIG(PAIRSIZE(key, val), hashp) && !PAIRFITS(pagep, key, val))) { __put_page(hashp, pagep, A_RAW, 0); return (-1); } } /* At this point, we know the page fits, so we just add it */ if (ISBIG(PAIRSIZE(key, val), hashp)) { if (__big_insert(hashp, pagep, key, val)) return (-1); } else { putpair((PAGE8 *)pagep, key, val); } /* * For splits, we are going to update item_info's page number * field, so that we can easily return to the same page the * next time we come in here. For other operations, this shouldn't * matter, since adds are the last thing that happens before we * return to the user program. */ item_info->pgno = ADDR(pagep); if (!expanding) hashp->hdr.nkeys++; /* Kludge: if this is a big page, then it's already been put. */ if (!ISBIG(PAIRSIZE(key, val), hashp)) __put_page(hashp, pagep, A_RAW, 1); if (expanding) item_info->caused_expand = 0; else switch (num_items) { case NO_EXPAND: item_info->caused_expand = 0; break; case UNKNOWN: item_info->caused_expand |= (hashp->hdr.nkeys / hashp->hdr.max_bucket) > hashp->hdr.ffactor || item_info->pgndx > hashp->hdr.ffactor; break; default: item_info->caused_expand = num_items > hashp->hdr.ffactor ? 1 : do_expand; break; } return (0); } /* * Special __addel used in big splitting; this one just puts the pointer * to an already-allocated big page in the appropriate bucket. */ static int32_t #ifdef __STDC__ add_bigptr(HTAB * hashp, ITEM_INFO * item_info, indx_t big_pgno) #else add_bigptr(hashp, item_info, big_pgno) HTAB *hashp; ITEM_INFO *item_info; u_int32_t big_pgno; #endif { PAGE16 *pagep; db_pgno_t next_pgno; pagep = __get_page(hashp, item_info->bucket, A_BUCKET); if (!pagep) return (-1); /* * Note: in __addel(), we used item_info->pgno for the beginning of * our search for space. Now, we use item_info->bucket, since we * know that the space required by a big pair on the base page is * quite small, and we may very well find that space early in the * chain. */ /* Find first page in chain that has space for a big pair. */ while (NUM_ENT(pagep) && (NEXT_PGNO(pagep) != INVALID_PGNO)) { if (BIGPAIRFITS(pagep)) break; next_pgno = NEXT_PGNO(pagep); __put_page(hashp, pagep, A_RAW, 0); pagep = __get_page(hashp, next_pgno, A_RAW); if (!pagep) return (-1); } if (!BIGPAIRFITS(pagep)) { pagep = __add_ovflpage(hashp, pagep); if (!pagep) return (-1); #ifdef DEBUG assert(BIGPAIRFITS(pagep)); #endif } KEY_OFF(pagep, NUM_ENT(pagep)) = BIGPAIR; DATA_OFF(pagep, NUM_ENT(pagep)) = big_pgno; NUM_ENT(pagep) = NUM_ENT(pagep) + 1; __put_page(hashp, pagep, A_RAW, 1); return (0); } /* * * Returns: * pointer on success * NULL on error */ extern PAGE16 * __add_ovflpage(hashp, pagep) HTAB *hashp; PAGE16 *pagep; { PAGE16 *new_pagep; u_int16_t ovfl_num; /* Check if we are dynamically determining the fill factor. */ if (hashp->hdr.ffactor == DEF_FFACTOR) { hashp->hdr.ffactor = NUM_ENT(pagep) >> 1; if (hashp->hdr.ffactor < MIN_FFACTOR) hashp->hdr.ffactor = MIN_FFACTOR; } ovfl_num = overflow_page(hashp); if (!ovfl_num) return (NULL); if (__new_page(hashp, (u_int32_t)ovfl_num, A_OVFL) != 0) return (NULL); if (!ovfl_num || !(new_pagep = __get_page(hashp, ovfl_num, A_OVFL))) return (NULL); NEXT_PGNO(pagep) = (db_pgno_t)OADDR_TO_PAGE(ovfl_num); TYPE(new_pagep) = HASH_OVFLPAGE; __put_page(hashp, pagep, A_RAW, 1); #ifdef HASH_STATISTICS hash_overflows++; #endif return (new_pagep); } /* * * Returns: * pointer on success * NULL on error */ extern PAGE16 * #ifdef __STDC__ __add_bigpage(HTAB * hashp, PAGE16 * pagep, indx_t ndx, const u_int8_t is_basepage) #else __add_bigpage(hashp, pagep, ndx, is_basepage) HTAB *hashp; PAGE16 *pagep; u_int32_t ndx; const u_int32_t is_basepage; #endif { PAGE16 *new_pagep; u_int16_t ovfl_num; ovfl_num = overflow_page(hashp); if (!ovfl_num) return (NULL); if (__new_page(hashp, (u_int32_t)ovfl_num, A_OVFL) != 0) return (NULL); if (!ovfl_num || !(new_pagep = __get_page(hashp, ovfl_num, A_OVFL))) return (NULL); if (is_basepage) { KEY_OFF(pagep, ndx) = BIGPAIR; DATA_OFF(pagep, ndx) = (indx_t)ovfl_num; } else NEXT_PGNO(pagep) = ADDR(new_pagep); __put_page(hashp, pagep, A_RAW, 1); TYPE(new_pagep) = HASH_BIGPAGE; #ifdef HASH_STATISTICS hash_bigpages++; #endif return (new_pagep); } static void #ifdef __STDC__ page_init(HTAB * hashp, PAGE16 * pagep, db_pgno_t pgno, u_int8_t type) #else page_init(hashp, pagep, pgno, type) HTAB *hashp; PAGE16 *pagep; db_pgno_t pgno; u_int32_t type; #endif { NUM_ENT(pagep) = 0; PREV_PGNO(pagep) = NEXT_PGNO(pagep) = INVALID_PGNO; TYPE(pagep) = type; OFFSET(pagep) = hashp->hdr.bsize - 1; /* * Note: since in the current version ADDR(pagep) == PREV_PGNO(pagep), * make sure that ADDR(pagep) is set after resetting PREV_PGNO(pagep). * We reset PREV_PGNO(pagep) just in case the macros are changed. */ ADDR(pagep) = pgno; return; } int32_t __new_page(hashp, addr, addr_type) HTAB *hashp; u_int32_t addr; int32_t addr_type; { db_pgno_t paddr; PAGE16 *pagep; switch (addr_type) { /* Convert page number. */ case A_BUCKET: paddr = BUCKET_TO_PAGE(addr); break; case A_OVFL: case A_BITMAP: paddr = OADDR_TO_PAGE(addr); break; default: paddr = addr; break; } pagep = mpool_new(hashp->mp, &paddr, MPOOL_PAGE_REQUEST); if (!pagep) return (-1); #if DEBUG_SLOW account_page(hashp, paddr, 1); #endif if (addr_type != A_BITMAP) page_init(hashp, pagep, paddr, HASH_PAGE); __put_page(hashp, pagep, addr_type, 1); return (0); } int32_t __delete_page(hashp, pagep, page_type) HTAB *hashp; PAGE16 *pagep; int32_t page_type; { if (page_type == A_OVFL) __free_ovflpage(hashp, pagep); return (mpool_delete(hashp->mp, pagep)); } static u_int8_t is_bitmap_pgno(hashp, pgno) HTAB *hashp; db_pgno_t pgno; { int32_t i; for (i = 0; i < hashp->nmaps; i++) if (OADDR_TO_PAGE(hashp->hdr.bitmaps[i]) == pgno) return (1); return (0); } void __pgin_routine(pg_cookie, pgno, page) void *pg_cookie; db_pgno_t pgno; void *page; { HTAB *hashp; PAGE16 *pagep; int32_t max, i; pagep = (PAGE16 *)page; hashp = (HTAB *)pg_cookie; /* * There are the following cases for swapping: * 0) New page that may be unitialized. * 1) Bucket page or overflow page. Either swap * the header or initialize the page. * 2) Bitmap page. Swap the whole page! * 3) Header pages. Not handled here; these are written directly * to the file. */ if (NUM_ENT(pagep) == 0 && NEXT_PGNO(pagep) == 0 && !is_bitmap_pgno(hashp, pgno)) { /* XXX check for !0 LSN */ page_init(hashp, pagep, pgno, HASH_PAGE); return; } if (hashp->hdr.lorder == DB_BYTE_ORDER) return; if (is_bitmap_pgno(hashp, pgno)) { max = hashp->hdr.bsize >> 2; /* divide by 4 bytes */ for (i = 0; i < max; i++) M_32_SWAP(((int32_t *)pagep)[i]); } else swap_page_header_in(pagep); } void __pgout_routine(pg_cookie, pgno, page) void *pg_cookie; db_pgno_t pgno; void *page; { HTAB *hashp; PAGE16 *pagep; int32_t i, max; pagep = (PAGE16 *)page; hashp = (HTAB *)pg_cookie; /* * There are the following cases for swapping: * 1) Bucket page or overflow page. Just swap the header. * 2) Bitmap page. Swap the whole page! * 3) Header pages. Not handled here; these are written directly * to the file. */ if (hashp->hdr.lorder == DB_BYTE_ORDER) return; if (is_bitmap_pgno(hashp, pgno)) { max = hashp->hdr.bsize >> 2; /* divide by 4 bytes */ for (i = 0; i < max; i++) M_32_SWAP(((int32_t *)pagep)[i]); } else swap_page_header_out(pagep); } /* * * Returns: * 0 ==> OK * -1 ==>failure */ extern int32_t __put_page(hashp, pagep, addr_type, is_dirty) HTAB *hashp; PAGE16 *pagep; int32_t addr_type, is_dirty; { #if DEBUG_SLOW account_page(hashp, ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1); #endif return (mpool_put(hashp->mp, pagep, (is_dirty ? MPOOL_DIRTY : 0))); } /* * Returns: * 0 indicates SUCCESS * -1 indicates FAILURE */ extern PAGE16 * __get_page(hashp, addr, addr_type) HTAB *hashp; u_int32_t addr; int32_t addr_type; { PAGE16 *pagep; db_pgno_t paddr; switch (addr_type) { /* Convert page number. */ case A_BUCKET: paddr = BUCKET_TO_PAGE(addr); break; case A_OVFL: case A_BITMAP: paddr = OADDR_TO_PAGE(addr); break; default: paddr = addr; break; } pagep = (PAGE16 *)mpool_get(hashp->mp, paddr, 0); #if DEBUG_SLOW account_page(hashp, paddr, 1); #endif #ifdef DEBUG assert(ADDR(pagep) == paddr || ADDR(pagep) == 0 || addr_type == A_BITMAP || addr_type == A_HEADER); #endif return (pagep); } static void swap_page_header_in(pagep) PAGE16 *pagep; { u_int32_t i; /* can leave type and filler alone, since they're 1-byte quantities */ M_32_SWAP(PREV_PGNO(pagep)); M_32_SWAP(NEXT_PGNO(pagep)); M_16_SWAP(NUM_ENT(pagep)); M_16_SWAP(OFFSET(pagep)); for (i = 0; i < NUM_ENT(pagep); i++) { M_16_SWAP(KEY_OFF(pagep, i)); M_16_SWAP(DATA_OFF(pagep, i)); } } static void swap_page_header_out(pagep) PAGE16 *pagep; { u_int32_t i; for (i = 0; i < NUM_ENT(pagep); i++) { M_16_SWAP(KEY_OFF(pagep, i)); M_16_SWAP(DATA_OFF(pagep, i)) } /* can leave type and filler alone, since they're 1-byte quantities */ M_32_SWAP(PREV_PGNO(pagep)); M_32_SWAP(NEXT_PGNO(pagep)); M_16_SWAP(NUM_ENT(pagep)); M_16_SWAP(OFFSET(pagep)); } #define BYTE_MASK ((1 << INT32_T_BYTE_SHIFT) -1) /* * Initialize a new bitmap page. Bitmap pages are left in memory * once they are read in. */ extern int32_t __ibitmap(hashp, pnum, nbits, ndx) HTAB *hashp; int32_t pnum, nbits, ndx; { u_int32_t *ip; int32_t clearbytes, clearints; /* make a new bitmap page */ if (__new_page(hashp, pnum, A_BITMAP) != 0) return (1); if (!(ip = (u_int32_t *)__get_page(hashp, pnum, A_BITMAP))) return (1); hashp->nmaps++; clearints = ((nbits - 1) >> INT32_T_BYTE_SHIFT) + 1; clearbytes = clearints << INT32_T_TO_BYTE; (void)memset((int8_t *)ip, 0, clearbytes); (void)memset((int8_t *)ip + clearbytes, 0xFF, hashp->hdr.bsize - clearbytes); ip[clearints - 1] = ALL_SET << (nbits & BYTE_MASK); SETBIT(ip, 0); hashp->hdr.bitmaps[ndx] = (u_int16_t)pnum; hashp->mapp[ndx] = ip; return (0); } static u_int32_t first_free(map) u_int32_t map; { u_int32_t i, mask; for (mask = 0x1, i = 0; i < BITS_PER_MAP; i++) { if (!(mask & map)) return (i); mask = mask << 1; } return (i); } /* * returns 0 on error */ static u_int16_t overflow_page(hashp) HTAB *hashp; { u_int32_t *freep; int32_t bit, first_page, free_bit, free_page, i, in_use_bits, j; int32_t max_free, offset, splitnum; u_int16_t addr; #ifdef DEBUG2 int32_t tmp1, tmp2; #endif splitnum = hashp->hdr.ovfl_point; max_free = hashp->hdr.spares[splitnum]; free_page = (max_free - 1) >> (hashp->hdr.bshift + BYTE_SHIFT); free_bit = (max_free - 1) & ((hashp->hdr.bsize << BYTE_SHIFT) - 1); /* * Look through all the free maps to find the first free block. * The compiler under -Wall will complain that freep may be used * before being set, however, this loop will ALWAYS get executed * at least once, so freep is guaranteed to be set. */ first_page = hashp->hdr.last_freed >> (hashp->hdr.bshift + BYTE_SHIFT); for (i = first_page; i <= free_page; i++) { if (!(freep = fetch_bitmap(hashp, i))) return (0); if (i == free_page) in_use_bits = free_bit; else in_use_bits = (hashp->hdr.bsize << BYTE_SHIFT) - 1; if (i == first_page) { bit = hashp->hdr.last_freed & ((hashp->hdr.bsize << BYTE_SHIFT) - 1); j = bit / BITS_PER_MAP; bit = bit & ~(BITS_PER_MAP - 1); } else { bit = 0; j = 0; } for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP) if (freep[j] != ALL_SET) goto found; } /* No Free Page Found */ hashp->hdr.last_freed = hashp->hdr.spares[splitnum]; hashp->hdr.spares[splitnum]++; offset = hashp->hdr.spares[splitnum] - (splitnum ? hashp->hdr.spares[splitnum - 1] : 0); #define OVMSG "HASH: Out of overflow pages. Increase page size\n" if (offset > SPLITMASK) { if (++splitnum >= NCACHED) { (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); return (0); } hashp->hdr.ovfl_point = splitnum; hashp->hdr.spares[splitnum] = hashp->hdr.spares[splitnum - 1]; hashp->hdr.spares[splitnum - 1]--; offset = 1; } /* Check if we need to allocate a new bitmap page. */ if (free_bit == (hashp->hdr.bsize << BYTE_SHIFT) - 1) { free_page++; if (free_page >= NCACHED) { (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); return (0); } /* * This is tricky. The 1 indicates that you want the new page * allocated with 1 clear bit. Actually, you are going to * allocate 2 pages from this map. The first is going to be * the map page, the second is the overflow page we were * looking for. The __ibitmap routine automatically, sets * the first bit of itself to indicate that the bitmap itself * is in use. We would explicitly set the second bit, but * don't have to if we tell __ibitmap not to leave it clear * in the first place. */ if (__ibitmap(hashp, (int32_t)OADDR_OF(splitnum, offset), 1, free_page)) return (0); hashp->hdr.spares[splitnum]++; #ifdef DEBUG2 free_bit = 2; #endif offset++; if (offset > SPLITMASK) { if (++splitnum >= NCACHED) { (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); return (0); } hashp->hdr.ovfl_point = splitnum; hashp->hdr.spares[splitnum] = hashp->hdr.spares[splitnum - 1]; hashp->hdr.spares[splitnum - 1]--; offset = 0; } } else { /* * Free_bit addresses the last used bit. Bump it to address * the first available bit. */ free_bit++; SETBIT(freep, free_bit); } /* Calculate address of the new overflow page */ addr = OADDR_OF(splitnum, offset); #ifdef DEBUG2 (void)fprintf(stderr, dgettext(TEXT_DOMAIN, "OVERFLOW_PAGE: ADDR: %d BIT: %d PAGE %d\n"), addr, free_bit, free_page); #endif if (OADDR_TO_PAGE(addr) > MAX_PAGES(hashp)) { (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); return (0); } return (addr); found: bit = bit + first_free(freep[j]); SETBIT(freep, bit); #ifdef DEBUG2 tmp1 = bit; tmp2 = i; #endif /* * Bits are addressed starting with 0, but overflow pages are addressed * beginning at 1. Bit is a bit address number, so we need to increment * it to convert it to a page number. */ bit = 1 + bit + (i * (hashp->hdr.bsize << BYTE_SHIFT)); if (bit >= hashp->hdr.last_freed) hashp->hdr.last_freed = bit - 1; /* Calculate the split number for this page */ for (i = 0; i < splitnum && (bit > hashp->hdr.spares[i]); i++); offset = (i ? bit - hashp->hdr.spares[i - 1] : bit); if (offset >= SPLITMASK) return (0); /* Out of overflow pages */ addr = OADDR_OF(i, offset); #ifdef DEBUG2 (void)fprintf(stderr, dgettext(TEXT_DOMAIN, "OVERFLOW_PAGE: ADDR: %d BIT: %d PAGE %d\n"), addr, tmp1, tmp2); #endif if (OADDR_TO_PAGE(addr) > MAX_PAGES(hashp)) { (void)write(STDERR_FILENO, OVMSG, sizeof(OVMSG) - 1); return (0); } /* Allocate and return the overflow page */ return (addr); } #ifdef DEBUG int bucket_to_page(hashp, n) HTAB *hashp; int n; { int ret_val; ret_val = n + hashp->hdr.hdrpages; if (n != 0) ret_val += hashp->hdr.spares[__log2(n + 1) - 1]; return (ret_val); } int32_t oaddr_to_page(hashp, n) HTAB *hashp; int n; { int ret_val, temp; temp = (1 << SPLITNUM(n)) - 1; ret_val = bucket_to_page(hashp, temp); ret_val += (n & SPLITMASK); return (ret_val); } #endif /* DEBUG */ static indx_t page_to_oaddr(hashp, pgno) HTAB *hashp; db_pgno_t pgno; { int32_t sp, ret_val; /* * To convert page number to overflow address: * * 1. Find a starting split point -- use 0 since there are only * 32 split points. * 2. Find the split point s.t. 2^sp + hdr.spares[sp] < pgno and * 2^(sp+1) = hdr.spares[sp+1] > pgno. The overflow address will * be located at sp. * 3. return... */ pgno -= hashp->hdr.hdrpages; for (sp = 0; sp < NCACHED; sp++) if (POW2(sp) + hashp->hdr.spares[sp] < pgno && (POW2(sp + 1) + hashp->hdr.spares[sp + 1]) > pgno) break; ret_val = OADDR_OF(sp + 1, pgno - ((POW2(sp + 1) - 1) + hashp->hdr.spares[sp])); #ifdef DEBUG assert(OADDR_TO_PAGE(ret_val) == (pgno + hashp->hdr.hdrpages)); #endif return (ret_val); } /* * Mark this overflow page as free. */ extern void __free_ovflpage(hashp, pagep) HTAB *hashp; PAGE16 *pagep; { u_int32_t *freep; int32_t bit_address, free_page, free_bit; u_int16_t addr, ndx; addr = page_to_oaddr(hashp, ADDR(pagep)); #ifdef DEBUG2 (void)fprintf(stderr, dgettext(TEXT_DOMAIN, "Freeing %d\n"), addr); #endif ndx = ((u_int16_t)addr) >> SPLITSHIFT; bit_address = (ndx ? hashp->hdr.spares[ndx - 1] : 0) + (addr & SPLITMASK) - 1; if (bit_address < hashp->hdr.last_freed) hashp->hdr.last_freed = bit_address; free_page = (bit_address >> (hashp->hdr.bshift + BYTE_SHIFT)); free_bit = bit_address & ((hashp->hdr.bsize << BYTE_SHIFT) - 1); freep = fetch_bitmap(hashp, free_page); #ifdef DEBUG /* * This had better never happen. It means we tried to read a bitmap * that has already had overflow pages allocated off it, and we * failed to read it from the file. */ if (!freep) assert(0); #endif CLRBIT(freep, free_bit); #ifdef DEBUG2 (void)fprintf(stderr, dgettext(TEXT_DOMAIN, "FREE_OVFLPAGE: ADDR: %d BIT: %d PAGE %d\n"), obufp->addr, free_bit, free_page); #endif } static u_int32_t * fetch_bitmap(hashp, ndx) HTAB *hashp; int32_t ndx; { if (ndx >= hashp->nmaps) return (NULL); if (!hashp->mapp[ndx]) hashp->mapp[ndx] = (u_int32_t *)__get_page(hashp, hashp->hdr.bitmaps[ndx], A_BITMAP); return (hashp->mapp[ndx]); } #ifdef DEBUG_SLOW static void account_page(hashp, pgno, inout) HTAB *hashp; db_pgno_t pgno; int inout; { static struct { db_pgno_t pgno; int times; } list[100]; static int last; int i, j; if (inout == -1) /* XXX: Kluge */ inout = 0; /* Find page in list. */ for (i = 0; i < last; i++) if (list[i].pgno == pgno) break; /* Not found. */ if (i == last) { list[last].times = inout; list[last].pgno = pgno; last++; } list[i].times = inout; if (list[i].times == 0) { for (j = i; j < last; j++) list[j] = list[j + 1]; last--; } for (i = 0; i < last; i++, list[i].times++) if (list[i].times > 20 && !is_bitmap_pgno(hashp, list[i].pgno)) (void)fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: pg %d has been out for %d times\n"), list[i].pgno, list[i].times); } #endif /* DEBUG_SLOW */