1*7c478bd9Sstevel@tonic-gate /*- 2*7c478bd9Sstevel@tonic-gate * See the file LICENSE for redistribution information. 3*7c478bd9Sstevel@tonic-gate * 4*7c478bd9Sstevel@tonic-gate * Copyright (c) 1996, 1997, 1998 5*7c478bd9Sstevel@tonic-gate * Sleepycat Software. All rights reserved. 6*7c478bd9Sstevel@tonic-gate * 7*7c478bd9Sstevel@tonic-gate * @(#)db_page.h 10.18 (Sleepycat) 12/2/98 8*7c478bd9Sstevel@tonic-gate */ 9*7c478bd9Sstevel@tonic-gate 10*7c478bd9Sstevel@tonic-gate #ifndef _DB_PAGE_H_ 11*7c478bd9Sstevel@tonic-gate #define _DB_PAGE_H_ 12*7c478bd9Sstevel@tonic-gate 13*7c478bd9Sstevel@tonic-gate /* 14*7c478bd9Sstevel@tonic-gate * DB page formats. 15*7c478bd9Sstevel@tonic-gate * 16*7c478bd9Sstevel@tonic-gate * This implementation requires that values within the following structures 17*7c478bd9Sstevel@tonic-gate * NOT be padded -- note, ANSI C permits random padding within structures. 18*7c478bd9Sstevel@tonic-gate * If your compiler pads randomly you can just forget ever making DB run on 19*7c478bd9Sstevel@tonic-gate * your system. In addition, no data type can require larger alignment than 20*7c478bd9Sstevel@tonic-gate * its own size, e.g., a 4-byte data element may not require 8-byte alignment. 21*7c478bd9Sstevel@tonic-gate * 22*7c478bd9Sstevel@tonic-gate * Note that key/data lengths are often stored in db_indx_t's -- this is 23*7c478bd9Sstevel@tonic-gate * not accidental, nor does it limit the key/data size. If the key/data 24*7c478bd9Sstevel@tonic-gate * item fits on a page, it's guaranteed to be small enough to fit into a 25*7c478bd9Sstevel@tonic-gate * db_indx_t, and storing it in one saves space. 26*7c478bd9Sstevel@tonic-gate */ 27*7c478bd9Sstevel@tonic-gate 28*7c478bd9Sstevel@tonic-gate #define PGNO_METADATA 0 /* Metadata page number. */ 29*7c478bd9Sstevel@tonic-gate #define PGNO_INVALID 0 /* Metadata page number, therefore illegal. */ 30*7c478bd9Sstevel@tonic-gate #define PGNO_ROOT 1 /* Root is page #1. */ 31*7c478bd9Sstevel@tonic-gate 32*7c478bd9Sstevel@tonic-gate /* 33*7c478bd9Sstevel@tonic-gate * When we create pages in mpool, we ask mpool to clear some number of bytes 34*7c478bd9Sstevel@tonic-gate * in the header. This number must be at least as big as the regular page 35*7c478bd9Sstevel@tonic-gate * headers and cover enough of the btree and hash meta-data pages to obliterate 36*7c478bd9Sstevel@tonic-gate * the magic and version numbers. 37*7c478bd9Sstevel@tonic-gate */ 38*7c478bd9Sstevel@tonic-gate #define DB_PAGE_CLEAR_LEN 32 39*7c478bd9Sstevel@tonic-gate 40*7c478bd9Sstevel@tonic-gate /************************************************************************ 41*7c478bd9Sstevel@tonic-gate BTREE METADATA PAGE LAYOUT 42*7c478bd9Sstevel@tonic-gate ************************************************************************/ 43*7c478bd9Sstevel@tonic-gate 44*7c478bd9Sstevel@tonic-gate /* 45*7c478bd9Sstevel@tonic-gate * Btree metadata page layout: 46*7c478bd9Sstevel@tonic-gate */ 47*7c478bd9Sstevel@tonic-gate typedef struct _btmeta { 48*7c478bd9Sstevel@tonic-gate DB_LSN lsn; /* 00-07: LSN. */ 49*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 08-11: Current page number. */ 50*7c478bd9Sstevel@tonic-gate u_int32_t magic; /* 12-15: Magic number. */ 51*7c478bd9Sstevel@tonic-gate u_int32_t version; /* 16-19: Version. */ 52*7c478bd9Sstevel@tonic-gate u_int32_t pagesize; /* 20-23: Pagesize. */ 53*7c478bd9Sstevel@tonic-gate u_int32_t maxkey; /* 24-27: Btree: Maxkey. */ 54*7c478bd9Sstevel@tonic-gate u_int32_t minkey; /* 28-31: Btree: Minkey. */ 55*7c478bd9Sstevel@tonic-gate u_int32_t free; /* 32-35: Free list page number. */ 56*7c478bd9Sstevel@tonic-gate #define BTM_DUP 0x001 /* Duplicates. */ 57*7c478bd9Sstevel@tonic-gate #define BTM_RECNO 0x002 /* Recno tree. */ 58*7c478bd9Sstevel@tonic-gate #define BTM_RECNUM 0x004 /* Btree: maintain record count. */ 59*7c478bd9Sstevel@tonic-gate #define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */ 60*7c478bd9Sstevel@tonic-gate #define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */ 61*7c478bd9Sstevel@tonic-gate #define BTM_MASK 0x01f 62*7c478bd9Sstevel@tonic-gate u_int32_t flags; /* 36-39: Flags. */ 63*7c478bd9Sstevel@tonic-gate u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */ 64*7c478bd9Sstevel@tonic-gate u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */ 65*7c478bd9Sstevel@tonic-gate /* 48-67: Unique file ID. */ 66*7c478bd9Sstevel@tonic-gate u_int8_t uid[DB_FILE_ID_LEN]; 67*7c478bd9Sstevel@tonic-gate } BTMETA; 68*7c478bd9Sstevel@tonic-gate 69*7c478bd9Sstevel@tonic-gate /************************************************************************ 70*7c478bd9Sstevel@tonic-gate HASH METADATA PAGE LAYOUT 71*7c478bd9Sstevel@tonic-gate ************************************************************************/ 72*7c478bd9Sstevel@tonic-gate 73*7c478bd9Sstevel@tonic-gate /* 74*7c478bd9Sstevel@tonic-gate * Hash metadata page layout: 75*7c478bd9Sstevel@tonic-gate */ 76*7c478bd9Sstevel@tonic-gate /* Hash Table Information */ 77*7c478bd9Sstevel@tonic-gate typedef struct hashhdr { /* Disk resident portion */ 78*7c478bd9Sstevel@tonic-gate DB_LSN lsn; /* 00-07: LSN of the header page */ 79*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */ 80*7c478bd9Sstevel@tonic-gate u_int32_t magic; /* 12-15: Magic NO for hash tables */ 81*7c478bd9Sstevel@tonic-gate u_int32_t version; /* 16-19: Version ID */ 82*7c478bd9Sstevel@tonic-gate u_int32_t pagesize; /* 20-23: Bucket/Page Size */ 83*7c478bd9Sstevel@tonic-gate u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */ 84*7c478bd9Sstevel@tonic-gate u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */ 85*7c478bd9Sstevel@tonic-gate u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */ 86*7c478bd9Sstevel@tonic-gate u_int32_t high_mask; /* 36-39: Modulo mask into table */ 87*7c478bd9Sstevel@tonic-gate u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */ 88*7c478bd9Sstevel@tonic-gate u_int32_t ffactor; /* 44-47: Fill factor */ 89*7c478bd9Sstevel@tonic-gate u_int32_t nelem; /* 48-51: Number of keys in hash table */ 90*7c478bd9Sstevel@tonic-gate u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */ 91*7c478bd9Sstevel@tonic-gate #define DB_HASH_DUP 0x01 92*7c478bd9Sstevel@tonic-gate u_int32_t flags; /* 56-59: Allow duplicates. */ 93*7c478bd9Sstevel@tonic-gate #define NCACHED 32 /* number of spare points */ 94*7c478bd9Sstevel@tonic-gate /* 60-187: Spare pages for overflow */ 95*7c478bd9Sstevel@tonic-gate u_int32_t spares[NCACHED]; 96*7c478bd9Sstevel@tonic-gate /* 188-207: Unique file ID. */ 97*7c478bd9Sstevel@tonic-gate u_int8_t uid[DB_FILE_ID_LEN]; 98*7c478bd9Sstevel@tonic-gate 99*7c478bd9Sstevel@tonic-gate /* 100*7c478bd9Sstevel@tonic-gate * Minimum page size is 256. 101*7c478bd9Sstevel@tonic-gate */ 102*7c478bd9Sstevel@tonic-gate } HASHHDR; 103*7c478bd9Sstevel@tonic-gate 104*7c478bd9Sstevel@tonic-gate /************************************************************************ 105*7c478bd9Sstevel@tonic-gate MAIN PAGE LAYOUT 106*7c478bd9Sstevel@tonic-gate ************************************************************************/ 107*7c478bd9Sstevel@tonic-gate 108*7c478bd9Sstevel@tonic-gate /* 109*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 110*7c478bd9Sstevel@tonic-gate * | lsn | pgno | prev pgno | 111*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 112*7c478bd9Sstevel@tonic-gate * | next pgno | entries | hf offset | 113*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 114*7c478bd9Sstevel@tonic-gate * | level | type | index | 115*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 116*7c478bd9Sstevel@tonic-gate * | index | free --> | 117*7c478bd9Sstevel@tonic-gate * +-----------+-----------------------+ 118*7c478bd9Sstevel@tonic-gate * | F R E E A R E A | 119*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 120*7c478bd9Sstevel@tonic-gate * | <-- free | item | 121*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 122*7c478bd9Sstevel@tonic-gate * | item | item | item | 123*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 124*7c478bd9Sstevel@tonic-gate * 125*7c478bd9Sstevel@tonic-gate * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be 126*7c478bd9Sstevel@tonic-gate * two-byte aligned. 127*7c478bd9Sstevel@tonic-gate * 128*7c478bd9Sstevel@tonic-gate * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the 129*7c478bd9Sstevel@tonic-gate * key for inp[1]'s data. All other types of pages only contain single items. 130*7c478bd9Sstevel@tonic-gate */ 131*7c478bd9Sstevel@tonic-gate typedef struct _db_page { 132*7c478bd9Sstevel@tonic-gate DB_LSN lsn; /* 00-07: Log sequence number. */ 133*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 08-11: Current page number. */ 134*7c478bd9Sstevel@tonic-gate db_pgno_t prev_pgno; /* 12-15: Previous page number. */ 135*7c478bd9Sstevel@tonic-gate db_pgno_t next_pgno; /* 16-19: Next page number. */ 136*7c478bd9Sstevel@tonic-gate db_indx_t entries; /* 20-21: Number of item pairs on the page. */ 137*7c478bd9Sstevel@tonic-gate db_indx_t hf_offset; /* 22-23: High free byte page offset. */ 138*7c478bd9Sstevel@tonic-gate 139*7c478bd9Sstevel@tonic-gate /* 140*7c478bd9Sstevel@tonic-gate * The btree levels are numbered from the leaf to the root, starting 141*7c478bd9Sstevel@tonic-gate * with 1, so the leaf is level 1, its parent is level 2, and so on. 142*7c478bd9Sstevel@tonic-gate * We maintain this level on all btree pages, but the only place that 143*7c478bd9Sstevel@tonic-gate * we actually need it is on the root page. It would not be difficult 144*7c478bd9Sstevel@tonic-gate * to hide the byte on the root page once it becomes an internal page, 145*7c478bd9Sstevel@tonic-gate * so we could get this byte back if we needed it for something else. 146*7c478bd9Sstevel@tonic-gate */ 147*7c478bd9Sstevel@tonic-gate #define LEAFLEVEL 1 148*7c478bd9Sstevel@tonic-gate #define MAXBTREELEVEL 255 149*7c478bd9Sstevel@tonic-gate u_int8_t level; /* 24: Btree tree level. */ 150*7c478bd9Sstevel@tonic-gate 151*7c478bd9Sstevel@tonic-gate #define P_INVALID 0 /* Invalid page type. */ 152*7c478bd9Sstevel@tonic-gate #define P_DUPLICATE 1 /* Duplicate. */ 153*7c478bd9Sstevel@tonic-gate #define P_HASH 2 /* Hash. */ 154*7c478bd9Sstevel@tonic-gate #define P_IBTREE 3 /* Btree internal. */ 155*7c478bd9Sstevel@tonic-gate #define P_IRECNO 4 /* Recno internal. */ 156*7c478bd9Sstevel@tonic-gate #define P_LBTREE 5 /* Btree leaf. */ 157*7c478bd9Sstevel@tonic-gate #define P_LRECNO 6 /* Recno leaf. */ 158*7c478bd9Sstevel@tonic-gate #define P_OVERFLOW 7 /* Overflow. */ 159*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 25: Page type. */ 160*7c478bd9Sstevel@tonic-gate db_indx_t inp[1]; /* Variable length index of items. */ 161*7c478bd9Sstevel@tonic-gate } PAGE; 162*7c478bd9Sstevel@tonic-gate 163*7c478bd9Sstevel@tonic-gate /* Element macros. */ 164*7c478bd9Sstevel@tonic-gate #define LSN(p) (((PAGE *)p)->lsn) 165*7c478bd9Sstevel@tonic-gate #define PGNO(p) (((PAGE *)p)->pgno) 166*7c478bd9Sstevel@tonic-gate #define PREV_PGNO(p) (((PAGE *)p)->prev_pgno) 167*7c478bd9Sstevel@tonic-gate #define NEXT_PGNO(p) (((PAGE *)p)->next_pgno) 168*7c478bd9Sstevel@tonic-gate #define NUM_ENT(p) (((PAGE *)p)->entries) 169*7c478bd9Sstevel@tonic-gate #define HOFFSET(p) (((PAGE *)p)->hf_offset) 170*7c478bd9Sstevel@tonic-gate #define LEVEL(p) (((PAGE *)p)->level) 171*7c478bd9Sstevel@tonic-gate #define TYPE(p) (((PAGE *)p)->type) 172*7c478bd9Sstevel@tonic-gate 173*7c478bd9Sstevel@tonic-gate /* 174*7c478bd9Sstevel@tonic-gate * !!! 175*7c478bd9Sstevel@tonic-gate * The next_pgno and prev_pgno fields are not maintained for btree and recno 176*7c478bd9Sstevel@tonic-gate * internal pages. It's a minor performance improvement, and more, it's 177*7c478bd9Sstevel@tonic-gate * hard to do when deleting internal pages, and it decreases the chance of 178*7c478bd9Sstevel@tonic-gate * deadlock during deletes and splits. 179*7c478bd9Sstevel@tonic-gate * 180*7c478bd9Sstevel@tonic-gate * !!! 181*7c478bd9Sstevel@tonic-gate * The btree/recno access method needs db_recno_t bytes of space on the root 182*7c478bd9Sstevel@tonic-gate * page to specify how many records are stored in the tree. (The alternative 183*7c478bd9Sstevel@tonic-gate * is to store the number of records in the meta-data page, which will create 184*7c478bd9Sstevel@tonic-gate * a second hot spot in trees being actively modified, or recalculate it from 185*7c478bd9Sstevel@tonic-gate * the BINTERNAL fields on each access.) Overload the prev_pgno field. 186*7c478bd9Sstevel@tonic-gate */ 187*7c478bd9Sstevel@tonic-gate #define RE_NREC(p) \ 188*7c478bd9Sstevel@tonic-gate (TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : \ 189*7c478bd9Sstevel@tonic-gate TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p)) 190*7c478bd9Sstevel@tonic-gate #define RE_NREC_ADJ(p, adj) \ 191*7c478bd9Sstevel@tonic-gate PREV_PGNO(p) += adj; 192*7c478bd9Sstevel@tonic-gate #define RE_NREC_SET(p, num) \ 193*7c478bd9Sstevel@tonic-gate PREV_PGNO(p) = num; 194*7c478bd9Sstevel@tonic-gate 195*7c478bd9Sstevel@tonic-gate /* 196*7c478bd9Sstevel@tonic-gate * Initialize a page. 197*7c478bd9Sstevel@tonic-gate * 198*7c478bd9Sstevel@tonic-gate * !!! 199*7c478bd9Sstevel@tonic-gate * Don't modify the page's LSN, code depends on it being unchanged after a 200*7c478bd9Sstevel@tonic-gate * P_INIT call. 201*7c478bd9Sstevel@tonic-gate */ 202*7c478bd9Sstevel@tonic-gate #define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \ 203*7c478bd9Sstevel@tonic-gate PGNO(pg) = n; \ 204*7c478bd9Sstevel@tonic-gate PREV_PGNO(pg) = pg_prev; \ 205*7c478bd9Sstevel@tonic-gate NEXT_PGNO(pg) = pg_next; \ 206*7c478bd9Sstevel@tonic-gate NUM_ENT(pg) = 0; \ 207*7c478bd9Sstevel@tonic-gate HOFFSET(pg) = pg_size; \ 208*7c478bd9Sstevel@tonic-gate LEVEL(pg) = btl; \ 209*7c478bd9Sstevel@tonic-gate TYPE(pg) = pg_type; \ 210*7c478bd9Sstevel@tonic-gate } while (0) 211*7c478bd9Sstevel@tonic-gate 212*7c478bd9Sstevel@tonic-gate /* Page header length (offset to first index). */ 213*7c478bd9Sstevel@tonic-gate #define P_OVERHEAD (SSZA(PAGE, inp)) 214*7c478bd9Sstevel@tonic-gate 215*7c478bd9Sstevel@tonic-gate /* First free byte. */ 216*7c478bd9Sstevel@tonic-gate #define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t)) 217*7c478bd9Sstevel@tonic-gate 218*7c478bd9Sstevel@tonic-gate /* Free space on the page. */ 219*7c478bd9Sstevel@tonic-gate #define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg)) 220*7c478bd9Sstevel@tonic-gate 221*7c478bd9Sstevel@tonic-gate /* Get a pointer to the bytes at a specific index. */ 222*7c478bd9Sstevel@tonic-gate #define P_ENTRY(pg, indx) ((u_int8_t *)pg + ((PAGE *)pg)->inp[indx]) 223*7c478bd9Sstevel@tonic-gate 224*7c478bd9Sstevel@tonic-gate /************************************************************************ 225*7c478bd9Sstevel@tonic-gate OVERFLOW PAGE LAYOUT 226*7c478bd9Sstevel@tonic-gate ************************************************************************/ 227*7c478bd9Sstevel@tonic-gate 228*7c478bd9Sstevel@tonic-gate /* 229*7c478bd9Sstevel@tonic-gate * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which 230*7c478bd9Sstevel@tonic-gate * store a page number (the first page of the overflow item) and a length 231*7c478bd9Sstevel@tonic-gate * (the total length of the overflow item). The overflow item consists of 232*7c478bd9Sstevel@tonic-gate * some number of overflow pages, linked by the next_pgno field of the page. 233*7c478bd9Sstevel@tonic-gate * A next_pgno field of PGNO_INVALID flags the end of the overflow item. 234*7c478bd9Sstevel@tonic-gate * 235*7c478bd9Sstevel@tonic-gate * Overflow page overloads: 236*7c478bd9Sstevel@tonic-gate * The amount of overflow data stored on each page is stored in the 237*7c478bd9Sstevel@tonic-gate * hf_offset field. 238*7c478bd9Sstevel@tonic-gate * 239*7c478bd9Sstevel@tonic-gate * The implementation reference counts overflow items as it's possible 240*7c478bd9Sstevel@tonic-gate * for them to be promoted onto btree internal pages. The reference 241*7c478bd9Sstevel@tonic-gate * count is stored in the entries field. 242*7c478bd9Sstevel@tonic-gate */ 243*7c478bd9Sstevel@tonic-gate #define OV_LEN(p) (((PAGE *)p)->hf_offset) 244*7c478bd9Sstevel@tonic-gate #define OV_REF(p) (((PAGE *)p)->entries) 245*7c478bd9Sstevel@tonic-gate 246*7c478bd9Sstevel@tonic-gate /* Maximum number of bytes that you can put on an overflow page. */ 247*7c478bd9Sstevel@tonic-gate #define P_MAXSPACE(psize) ((psize) - P_OVERHEAD) 248*7c478bd9Sstevel@tonic-gate 249*7c478bd9Sstevel@tonic-gate /************************************************************************ 250*7c478bd9Sstevel@tonic-gate HASH PAGE LAYOUT 251*7c478bd9Sstevel@tonic-gate ************************************************************************/ 252*7c478bd9Sstevel@tonic-gate 253*7c478bd9Sstevel@tonic-gate /* Each index references a group of bytes on the page. */ 254*7c478bd9Sstevel@tonic-gate #define H_KEYDATA 1 /* Key/data item. */ 255*7c478bd9Sstevel@tonic-gate #define H_DUPLICATE 2 /* Duplicate key/data item. */ 256*7c478bd9Sstevel@tonic-gate #define H_OFFPAGE 3 /* Overflow key/data item. */ 257*7c478bd9Sstevel@tonic-gate #define H_OFFDUP 4 /* Overflow page of duplicates. */ 258*7c478bd9Sstevel@tonic-gate 259*7c478bd9Sstevel@tonic-gate /* 260*7c478bd9Sstevel@tonic-gate * !!! 261*7c478bd9Sstevel@tonic-gate * Items on hash pages are (potentially) unaligned, so we can never cast the 262*7c478bd9Sstevel@tonic-gate * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as 263*7c478bd9Sstevel@tonic-gate * we do with B+tree on-page structures. Because we frequently want the type 264*7c478bd9Sstevel@tonic-gate * field, it requires no alignment, and it's in the same location in all three 265*7c478bd9Sstevel@tonic-gate * structures, there's a pair of macros. 266*7c478bd9Sstevel@tonic-gate */ 267*7c478bd9Sstevel@tonic-gate #define HPAGE_PTYPE(p) (*(u_int8_t *)p) 268*7c478bd9Sstevel@tonic-gate #define HPAGE_TYPE(pg, indx) (*P_ENTRY(pg, indx)) 269*7c478bd9Sstevel@tonic-gate 270*7c478bd9Sstevel@tonic-gate /* 271*7c478bd9Sstevel@tonic-gate * The first and second types are H_KEYDATA and H_DUPLICATE, represented 272*7c478bd9Sstevel@tonic-gate * by the HKEYDATA structure: 273*7c478bd9Sstevel@tonic-gate * 274*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 275*7c478bd9Sstevel@tonic-gate * | type | key/data ... | 276*7c478bd9Sstevel@tonic-gate * +-----------------------------------+ 277*7c478bd9Sstevel@tonic-gate * 278*7c478bd9Sstevel@tonic-gate * For duplicates, the data field encodes duplicate elements in the data 279*7c478bd9Sstevel@tonic-gate * field: 280*7c478bd9Sstevel@tonic-gate * 281*7c478bd9Sstevel@tonic-gate * +---------------------------------------------------------------+ 282*7c478bd9Sstevel@tonic-gate * | type | len1 | element1 | len1 | len2 | element2 | len2 | 283*7c478bd9Sstevel@tonic-gate * +---------------------------------------------------------------+ 284*7c478bd9Sstevel@tonic-gate * 285*7c478bd9Sstevel@tonic-gate * Thus, by keeping track of the offset in the element, we can do both 286*7c478bd9Sstevel@tonic-gate * backward and forward traversal. 287*7c478bd9Sstevel@tonic-gate */ 288*7c478bd9Sstevel@tonic-gate typedef struct _hkeydata { 289*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 00: Page type. */ 290*7c478bd9Sstevel@tonic-gate u_int8_t data[1]; /* Variable length key/data item. */ 291*7c478bd9Sstevel@tonic-gate } HKEYDATA; 292*7c478bd9Sstevel@tonic-gate #define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data)) 293*7c478bd9Sstevel@tonic-gate 294*7c478bd9Sstevel@tonic-gate /* 295*7c478bd9Sstevel@tonic-gate * The length of any HKEYDATA item. Note that indx is an element index, 296*7c478bd9Sstevel@tonic-gate * not a PAIR index. 297*7c478bd9Sstevel@tonic-gate */ 298*7c478bd9Sstevel@tonic-gate #define LEN_HITEM(pg, pgsize, indx) \ 299*7c478bd9Sstevel@tonic-gate (((indx) == 0 ? pgsize : pg->inp[indx - 1]) - pg->inp[indx]) 300*7c478bd9Sstevel@tonic-gate 301*7c478bd9Sstevel@tonic-gate #define LEN_HKEYDATA(pg, psize, indx) \ 302*7c478bd9Sstevel@tonic-gate (((indx) == 0 ? psize : pg->inp[indx - 1]) - \ 303*7c478bd9Sstevel@tonic-gate pg->inp[indx] - HKEYDATA_SIZE(0)) 304*7c478bd9Sstevel@tonic-gate 305*7c478bd9Sstevel@tonic-gate /* 306*7c478bd9Sstevel@tonic-gate * Page space required to add a new HKEYDATA item to the page, with and 307*7c478bd9Sstevel@tonic-gate * without the index value. 308*7c478bd9Sstevel@tonic-gate */ 309*7c478bd9Sstevel@tonic-gate #define HKEYDATA_SIZE(len) \ 310*7c478bd9Sstevel@tonic-gate ((len) + SSZA(HKEYDATA, data)) 311*7c478bd9Sstevel@tonic-gate #define HKEYDATA_PSIZE(len) \ 312*7c478bd9Sstevel@tonic-gate (HKEYDATA_SIZE(len) + sizeof(db_indx_t)) 313*7c478bd9Sstevel@tonic-gate 314*7c478bd9Sstevel@tonic-gate /* Put a HKEYDATA item at the location referenced by a page entry. */ 315*7c478bd9Sstevel@tonic-gate #define PUT_HKEYDATA(pe, kd, len, type) { \ 316*7c478bd9Sstevel@tonic-gate ((HKEYDATA *)pe)->type = type; \ 317*7c478bd9Sstevel@tonic-gate memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len); \ 318*7c478bd9Sstevel@tonic-gate } 319*7c478bd9Sstevel@tonic-gate 320*7c478bd9Sstevel@tonic-gate /* 321*7c478bd9Sstevel@tonic-gate * Macros the describe the page layout in terms of key-data pairs. 322*7c478bd9Sstevel@tonic-gate * The use of "pindex" indicates that the argument is the index 323*7c478bd9Sstevel@tonic-gate * expressed in pairs instead of individual elements. 324*7c478bd9Sstevel@tonic-gate */ 325*7c478bd9Sstevel@tonic-gate #define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2) 326*7c478bd9Sstevel@tonic-gate #define H_KEYINDEX(pindx) (2 * (pindx)) 327*7c478bd9Sstevel@tonic-gate #define H_DATAINDEX(pindx) ((2 * (pindx)) + 1) 328*7c478bd9Sstevel@tonic-gate #define H_PAIRKEY(pg, pindx) P_ENTRY(pg, H_KEYINDEX(pindx)) 329*7c478bd9Sstevel@tonic-gate #define H_PAIRDATA(pg, pindx) P_ENTRY(pg, H_DATAINDEX(pindx)) 330*7c478bd9Sstevel@tonic-gate #define H_PAIRSIZE(pg, psize, pindx) \ 331*7c478bd9Sstevel@tonic-gate (LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) + \ 332*7c478bd9Sstevel@tonic-gate LEN_HITEM(pg, psize, H_DATAINDEX(pindx))) 333*7c478bd9Sstevel@tonic-gate #define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx)) 334*7c478bd9Sstevel@tonic-gate #define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx)) 335*7c478bd9Sstevel@tonic-gate 336*7c478bd9Sstevel@tonic-gate /* 337*7c478bd9Sstevel@tonic-gate * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure: 338*7c478bd9Sstevel@tonic-gate */ 339*7c478bd9Sstevel@tonic-gate typedef struct _hoffpage { 340*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 00: Page type and delete flag. */ 341*7c478bd9Sstevel@tonic-gate u_int8_t unused[3]; /* 01-03: Padding, unused. */ 342*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 04-07: Offpage page number. */ 343*7c478bd9Sstevel@tonic-gate u_int32_t tlen; /* 08-11: Total length of item. */ 344*7c478bd9Sstevel@tonic-gate } HOFFPAGE; 345*7c478bd9Sstevel@tonic-gate 346*7c478bd9Sstevel@tonic-gate #define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno)) 347*7c478bd9Sstevel@tonic-gate #define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen)) 348*7c478bd9Sstevel@tonic-gate 349*7c478bd9Sstevel@tonic-gate /* 350*7c478bd9Sstevel@tonic-gate * Page space required to add a new HOFFPAGE item to the page, with and 351*7c478bd9Sstevel@tonic-gate * without the index value. 352*7c478bd9Sstevel@tonic-gate */ 353*7c478bd9Sstevel@tonic-gate #define HOFFPAGE_SIZE (sizeof(HOFFPAGE)) 354*7c478bd9Sstevel@tonic-gate #define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t)) 355*7c478bd9Sstevel@tonic-gate 356*7c478bd9Sstevel@tonic-gate /* 357*7c478bd9Sstevel@tonic-gate * The fourth type is H_OFFDUP represented by the HOFFDUP structure: 358*7c478bd9Sstevel@tonic-gate */ 359*7c478bd9Sstevel@tonic-gate typedef struct _hoffdup { 360*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 00: Page type and delete flag. */ 361*7c478bd9Sstevel@tonic-gate u_int8_t unused[3]; /* 01-03: Padding, unused. */ 362*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 04-07: Offpage page number. */ 363*7c478bd9Sstevel@tonic-gate } HOFFDUP; 364*7c478bd9Sstevel@tonic-gate #define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno)) 365*7c478bd9Sstevel@tonic-gate 366*7c478bd9Sstevel@tonic-gate /* 367*7c478bd9Sstevel@tonic-gate * Page space required to add a new HOFFDUP item to the page, with and 368*7c478bd9Sstevel@tonic-gate * without the index value. 369*7c478bd9Sstevel@tonic-gate */ 370*7c478bd9Sstevel@tonic-gate #define HOFFDUP_SIZE (sizeof(HOFFDUP)) 371*7c478bd9Sstevel@tonic-gate #define HOFFDUP_PSIZE (HOFFDUP_SIZE + sizeof(db_indx_t)) 372*7c478bd9Sstevel@tonic-gate 373*7c478bd9Sstevel@tonic-gate /************************************************************************ 374*7c478bd9Sstevel@tonic-gate BTREE PAGE LAYOUT 375*7c478bd9Sstevel@tonic-gate ************************************************************************/ 376*7c478bd9Sstevel@tonic-gate 377*7c478bd9Sstevel@tonic-gate /* Each index references a group of bytes on the page. */ 378*7c478bd9Sstevel@tonic-gate #define B_KEYDATA 1 /* Key/data item. */ 379*7c478bd9Sstevel@tonic-gate #define B_DUPLICATE 2 /* Duplicate key/data item. */ 380*7c478bd9Sstevel@tonic-gate #define B_OVERFLOW 3 /* Overflow key/data item. */ 381*7c478bd9Sstevel@tonic-gate 382*7c478bd9Sstevel@tonic-gate /* 383*7c478bd9Sstevel@tonic-gate * We have to store a deleted entry flag in the page. The reason is complex, 384*7c478bd9Sstevel@tonic-gate * but the simple version is that we can't delete on-page items referenced by 385*7c478bd9Sstevel@tonic-gate * a cursor -- the return order of subsequent insertions might be wrong. The 386*7c478bd9Sstevel@tonic-gate * delete flag is an overload of the top bit of the type byte. 387*7c478bd9Sstevel@tonic-gate */ 388*7c478bd9Sstevel@tonic-gate #define B_DELETE (0x80) 389*7c478bd9Sstevel@tonic-gate #define B_DCLR(t) (t) &= ~B_DELETE 390*7c478bd9Sstevel@tonic-gate #define B_DSET(t) (t) |= B_DELETE 391*7c478bd9Sstevel@tonic-gate #define B_DISSET(t) ((t) & B_DELETE) 392*7c478bd9Sstevel@tonic-gate 393*7c478bd9Sstevel@tonic-gate #define B_TYPE(t) ((t) & ~B_DELETE) 394*7c478bd9Sstevel@tonic-gate #define B_TSET(t, type, deleted) { \ 395*7c478bd9Sstevel@tonic-gate (t) = (type); \ 396*7c478bd9Sstevel@tonic-gate if (deleted) \ 397*7c478bd9Sstevel@tonic-gate B_DSET(t); \ 398*7c478bd9Sstevel@tonic-gate } 399*7c478bd9Sstevel@tonic-gate 400*7c478bd9Sstevel@tonic-gate /* 401*7c478bd9Sstevel@tonic-gate * The first type is B_KEYDATA, represented by the BKEYDATA structure: 402*7c478bd9Sstevel@tonic-gate */ 403*7c478bd9Sstevel@tonic-gate typedef struct _bkeydata { 404*7c478bd9Sstevel@tonic-gate db_indx_t len; /* 00-01: Key/data item length. */ 405*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 406*7c478bd9Sstevel@tonic-gate u_int8_t data[1]; /* Variable length key/data item. */ 407*7c478bd9Sstevel@tonic-gate } BKEYDATA; 408*7c478bd9Sstevel@tonic-gate 409*7c478bd9Sstevel@tonic-gate /* Get a BKEYDATA item for a specific index. */ 410*7c478bd9Sstevel@tonic-gate #define GET_BKEYDATA(pg, indx) \ 411*7c478bd9Sstevel@tonic-gate ((BKEYDATA *)P_ENTRY(pg, indx)) 412*7c478bd9Sstevel@tonic-gate 413*7c478bd9Sstevel@tonic-gate /* 414*7c478bd9Sstevel@tonic-gate * Page space required to add a new BKEYDATA item to the page, with and 415*7c478bd9Sstevel@tonic-gate * without the index value. 416*7c478bd9Sstevel@tonic-gate */ 417*7c478bd9Sstevel@tonic-gate #define BKEYDATA_SIZE(len) \ 418*7c478bd9Sstevel@tonic-gate ALIGN((len) + SSZA(BKEYDATA, data), 4) 419*7c478bd9Sstevel@tonic-gate #define BKEYDATA_PSIZE(len) \ 420*7c478bd9Sstevel@tonic-gate (BKEYDATA_SIZE(len) + sizeof(db_indx_t)) 421*7c478bd9Sstevel@tonic-gate 422*7c478bd9Sstevel@tonic-gate /* 423*7c478bd9Sstevel@tonic-gate * The second and third types are B_DUPLICATE and B_OVERFLOW, represented 424*7c478bd9Sstevel@tonic-gate * by the BOVERFLOW structure. 425*7c478bd9Sstevel@tonic-gate */ 426*7c478bd9Sstevel@tonic-gate typedef struct _boverflow { 427*7c478bd9Sstevel@tonic-gate db_indx_t unused1; /* 00-01: Padding, unused. */ 428*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 429*7c478bd9Sstevel@tonic-gate u_int8_t unused2; /* 03: Padding, unused. */ 430*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 04-07: Next page number. */ 431*7c478bd9Sstevel@tonic-gate u_int32_t tlen; /* 08-11: Total length of item. */ 432*7c478bd9Sstevel@tonic-gate } BOVERFLOW; 433*7c478bd9Sstevel@tonic-gate 434*7c478bd9Sstevel@tonic-gate /* Get a BOVERFLOW item for a specific index. */ 435*7c478bd9Sstevel@tonic-gate #define GET_BOVERFLOW(pg, indx) \ 436*7c478bd9Sstevel@tonic-gate ((BOVERFLOW *)P_ENTRY(pg, indx)) 437*7c478bd9Sstevel@tonic-gate 438*7c478bd9Sstevel@tonic-gate /* 439*7c478bd9Sstevel@tonic-gate * Page space required to add a new BOVERFLOW item to the page, with and 440*7c478bd9Sstevel@tonic-gate * without the index value. 441*7c478bd9Sstevel@tonic-gate */ 442*7c478bd9Sstevel@tonic-gate #define BOVERFLOW_SIZE \ 443*7c478bd9Sstevel@tonic-gate ALIGN(sizeof(BOVERFLOW), 4) 444*7c478bd9Sstevel@tonic-gate #define BOVERFLOW_PSIZE \ 445*7c478bd9Sstevel@tonic-gate (BOVERFLOW_SIZE + sizeof(db_indx_t)) 446*7c478bd9Sstevel@tonic-gate 447*7c478bd9Sstevel@tonic-gate /* 448*7c478bd9Sstevel@tonic-gate * Btree leaf and hash page layouts group indices in sets of two, one 449*7c478bd9Sstevel@tonic-gate * for the key and one for the data. Everything else does it in sets 450*7c478bd9Sstevel@tonic-gate * of one to save space. I use the following macros so that it's real 451*7c478bd9Sstevel@tonic-gate * obvious what's going on... 452*7c478bd9Sstevel@tonic-gate */ 453*7c478bd9Sstevel@tonic-gate #define O_INDX 1 454*7c478bd9Sstevel@tonic-gate #define P_INDX 2 455*7c478bd9Sstevel@tonic-gate 456*7c478bd9Sstevel@tonic-gate /************************************************************************ 457*7c478bd9Sstevel@tonic-gate BTREE INTERNAL PAGE LAYOUT 458*7c478bd9Sstevel@tonic-gate ************************************************************************/ 459*7c478bd9Sstevel@tonic-gate 460*7c478bd9Sstevel@tonic-gate /* 461*7c478bd9Sstevel@tonic-gate * Btree internal entry. 462*7c478bd9Sstevel@tonic-gate */ 463*7c478bd9Sstevel@tonic-gate typedef struct _binternal { 464*7c478bd9Sstevel@tonic-gate db_indx_t len; /* 00-01: Key/data item length. */ 465*7c478bd9Sstevel@tonic-gate u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 466*7c478bd9Sstevel@tonic-gate u_int8_t unused; /* 03: Padding, unused. */ 467*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 04-07: Page number of referenced page. */ 468*7c478bd9Sstevel@tonic-gate db_recno_t nrecs; /* 08-11: Subtree record count. */ 469*7c478bd9Sstevel@tonic-gate u_int8_t data[1]; /* Variable length key item. */ 470*7c478bd9Sstevel@tonic-gate } BINTERNAL; 471*7c478bd9Sstevel@tonic-gate 472*7c478bd9Sstevel@tonic-gate /* Get a BINTERNAL item for a specific index. */ 473*7c478bd9Sstevel@tonic-gate #define GET_BINTERNAL(pg, indx) \ 474*7c478bd9Sstevel@tonic-gate ((BINTERNAL *)P_ENTRY(pg, indx)) 475*7c478bd9Sstevel@tonic-gate 476*7c478bd9Sstevel@tonic-gate /* 477*7c478bd9Sstevel@tonic-gate * Page space required to add a new BINTERNAL item to the page, with and 478*7c478bd9Sstevel@tonic-gate * without the index value. 479*7c478bd9Sstevel@tonic-gate */ 480*7c478bd9Sstevel@tonic-gate #define BINTERNAL_SIZE(len) \ 481*7c478bd9Sstevel@tonic-gate ALIGN((len) + SSZA(BINTERNAL, data), 4) 482*7c478bd9Sstevel@tonic-gate #define BINTERNAL_PSIZE(len) \ 483*7c478bd9Sstevel@tonic-gate (BINTERNAL_SIZE(len) + sizeof(db_indx_t)) 484*7c478bd9Sstevel@tonic-gate 485*7c478bd9Sstevel@tonic-gate /************************************************************************ 486*7c478bd9Sstevel@tonic-gate RECNO INTERNAL PAGE LAYOUT 487*7c478bd9Sstevel@tonic-gate ************************************************************************/ 488*7c478bd9Sstevel@tonic-gate 489*7c478bd9Sstevel@tonic-gate /* 490*7c478bd9Sstevel@tonic-gate * The recno internal entry. 491*7c478bd9Sstevel@tonic-gate * 492*7c478bd9Sstevel@tonic-gate * XXX 493*7c478bd9Sstevel@tonic-gate * Why not fold this into the db_indx_t structure, it's fixed length? 494*7c478bd9Sstevel@tonic-gate */ 495*7c478bd9Sstevel@tonic-gate typedef struct _rinternal { 496*7c478bd9Sstevel@tonic-gate db_pgno_t pgno; /* 00-03: Page number of referenced page. */ 497*7c478bd9Sstevel@tonic-gate db_recno_t nrecs; /* 04-07: Subtree record count. */ 498*7c478bd9Sstevel@tonic-gate } RINTERNAL; 499*7c478bd9Sstevel@tonic-gate 500*7c478bd9Sstevel@tonic-gate /* Get a RINTERNAL item for a specific index. */ 501*7c478bd9Sstevel@tonic-gate #define GET_RINTERNAL(pg, indx) \ 502*7c478bd9Sstevel@tonic-gate ((RINTERNAL *)P_ENTRY(pg, indx)) 503*7c478bd9Sstevel@tonic-gate 504*7c478bd9Sstevel@tonic-gate /* 505*7c478bd9Sstevel@tonic-gate * Page space required to add a new RINTERNAL item to the page, with and 506*7c478bd9Sstevel@tonic-gate * without the index value. 507*7c478bd9Sstevel@tonic-gate */ 508*7c478bd9Sstevel@tonic-gate #define RINTERNAL_SIZE \ 509*7c478bd9Sstevel@tonic-gate ALIGN(sizeof(RINTERNAL), 4) 510*7c478bd9Sstevel@tonic-gate #define RINTERNAL_PSIZE \ 511*7c478bd9Sstevel@tonic-gate (RINTERNAL_SIZE + sizeof(db_indx_t)) 512*7c478bd9Sstevel@tonic-gate #endif /* _DB_PAGE_H_ */ 513