1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 * 7 * @(#)db_page.h 10.18 (Sleepycat) 12/2/98 8 */ 9 10 #ifndef _DB_PAGE_H_ 11 #define _DB_PAGE_H_ 12 13 /* 14 * DB page formats. 15 * 16 * This implementation requires that values within the following structures 17 * NOT be padded -- note, ANSI C permits random padding within structures. 18 * If your compiler pads randomly you can just forget ever making DB run on 19 * your system. In addition, no data type can require larger alignment than 20 * its own size, e.g., a 4-byte data element may not require 8-byte alignment. 21 * 22 * Note that key/data lengths are often stored in db_indx_t's -- this is 23 * not accidental, nor does it limit the key/data size. If the key/data 24 * item fits on a page, it's guaranteed to be small enough to fit into a 25 * db_indx_t, and storing it in one saves space. 26 */ 27 28 #define PGNO_METADATA 0 /* Metadata page number. */ 29 #define PGNO_INVALID 0 /* Metadata page number, therefore illegal. */ 30 #define PGNO_ROOT 1 /* Root is page #1. */ 31 32 /* 33 * When we create pages in mpool, we ask mpool to clear some number of bytes 34 * in the header. This number must be at least as big as the regular page 35 * headers and cover enough of the btree and hash meta-data pages to obliterate 36 * the magic and version numbers. 37 */ 38 #define DB_PAGE_CLEAR_LEN 32 39 40 /************************************************************************ 41 BTREE METADATA PAGE LAYOUT 42 ************************************************************************/ 43 44 /* 45 * Btree metadata page layout: 46 */ 47 typedef struct _btmeta { 48 DB_LSN lsn; /* 00-07: LSN. */ 49 db_pgno_t pgno; /* 08-11: Current page number. */ 50 u_int32_t magic; /* 12-15: Magic number. */ 51 u_int32_t version; /* 16-19: Version. */ 52 u_int32_t pagesize; /* 20-23: Pagesize. */ 53 u_int32_t maxkey; /* 24-27: Btree: Maxkey. */ 54 u_int32_t minkey; /* 28-31: Btree: Minkey. */ 55 u_int32_t free; /* 32-35: Free list page number. */ 56 #define BTM_DUP 0x001 /* Duplicates. */ 57 #define BTM_RECNO 0x002 /* Recno tree. */ 58 #define BTM_RECNUM 0x004 /* Btree: maintain record count. */ 59 #define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */ 60 #define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */ 61 #define BTM_MASK 0x01f 62 u_int32_t flags; /* 36-39: Flags. */ 63 u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */ 64 u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */ 65 /* 48-67: Unique file ID. */ 66 u_int8_t uid[DB_FILE_ID_LEN]; 67 } BTMETA; 68 69 /************************************************************************ 70 HASH METADATA PAGE LAYOUT 71 ************************************************************************/ 72 73 /* 74 * Hash metadata page layout: 75 */ 76 /* Hash Table Information */ 77 typedef struct hashhdr { /* Disk resident portion */ 78 DB_LSN lsn; /* 00-07: LSN of the header page */ 79 db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */ 80 u_int32_t magic; /* 12-15: Magic NO for hash tables */ 81 u_int32_t version; /* 16-19: Version ID */ 82 u_int32_t pagesize; /* 20-23: Bucket/Page Size */ 83 u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */ 84 u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */ 85 u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */ 86 u_int32_t high_mask; /* 36-39: Modulo mask into table */ 87 u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */ 88 u_int32_t ffactor; /* 44-47: Fill factor */ 89 u_int32_t nelem; /* 48-51: Number of keys in hash table */ 90 u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */ 91 #define DB_HASH_DUP 0x01 92 u_int32_t flags; /* 56-59: Allow duplicates. */ 93 #define NCACHED 32 /* number of spare points */ 94 /* 60-187: Spare pages for overflow */ 95 u_int32_t spares[NCACHED]; 96 /* 188-207: Unique file ID. */ 97 u_int8_t uid[DB_FILE_ID_LEN]; 98 99 /* 100 * Minimum page size is 256. 101 */ 102 } HASHHDR; 103 104 /************************************************************************ 105 MAIN PAGE LAYOUT 106 ************************************************************************/ 107 108 /* 109 * +-----------------------------------+ 110 * | lsn | pgno | prev pgno | 111 * +-----------------------------------+ 112 * | next pgno | entries | hf offset | 113 * +-----------------------------------+ 114 * | level | type | index | 115 * +-----------------------------------+ 116 * | index | free --> | 117 * +-----------+-----------------------+ 118 * | F R E E A R E A | 119 * +-----------------------------------+ 120 * | <-- free | item | 121 * +-----------------------------------+ 122 * | item | item | item | 123 * +-----------------------------------+ 124 * 125 * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be 126 * two-byte aligned. 127 * 128 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the 129 * key for inp[1]'s data. All other types of pages only contain single items. 130 */ 131 typedef struct _db_page { 132 DB_LSN lsn; /* 00-07: Log sequence number. */ 133 db_pgno_t pgno; /* 08-11: Current page number. */ 134 db_pgno_t prev_pgno; /* 12-15: Previous page number. */ 135 db_pgno_t next_pgno; /* 16-19: Next page number. */ 136 db_indx_t entries; /* 20-21: Number of item pairs on the page. */ 137 db_indx_t hf_offset; /* 22-23: High free byte page offset. */ 138 139 /* 140 * The btree levels are numbered from the leaf to the root, starting 141 * with 1, so the leaf is level 1, its parent is level 2, and so on. 142 * We maintain this level on all btree pages, but the only place that 143 * we actually need it is on the root page. It would not be difficult 144 * to hide the byte on the root page once it becomes an internal page, 145 * so we could get this byte back if we needed it for something else. 146 */ 147 #define LEAFLEVEL 1 148 #define MAXBTREELEVEL 255 149 u_int8_t level; /* 24: Btree tree level. */ 150 151 #define P_INVALID 0 /* Invalid page type. */ 152 #define P_DUPLICATE 1 /* Duplicate. */ 153 #define P_HASH 2 /* Hash. */ 154 #define P_IBTREE 3 /* Btree internal. */ 155 #define P_IRECNO 4 /* Recno internal. */ 156 #define P_LBTREE 5 /* Btree leaf. */ 157 #define P_LRECNO 6 /* Recno leaf. */ 158 #define P_OVERFLOW 7 /* Overflow. */ 159 u_int8_t type; /* 25: Page type. */ 160 db_indx_t inp[1]; /* Variable length index of items. */ 161 } PAGE; 162 163 /* Element macros. */ 164 #define LSN(p) (((PAGE *)p)->lsn) 165 #define PGNO(p) (((PAGE *)p)->pgno) 166 #define PREV_PGNO(p) (((PAGE *)p)->prev_pgno) 167 #define NEXT_PGNO(p) (((PAGE *)p)->next_pgno) 168 #define NUM_ENT(p) (((PAGE *)p)->entries) 169 #define HOFFSET(p) (((PAGE *)p)->hf_offset) 170 #define LEVEL(p) (((PAGE *)p)->level) 171 #define TYPE(p) (((PAGE *)p)->type) 172 173 /* 174 * !!! 175 * The next_pgno and prev_pgno fields are not maintained for btree and recno 176 * internal pages. It's a minor performance improvement, and more, it's 177 * hard to do when deleting internal pages, and it decreases the chance of 178 * deadlock during deletes and splits. 179 * 180 * !!! 181 * The btree/recno access method needs db_recno_t bytes of space on the root 182 * page to specify how many records are stored in the tree. (The alternative 183 * is to store the number of records in the meta-data page, which will create 184 * a second hot spot in trees being actively modified, or recalculate it from 185 * the BINTERNAL fields on each access.) Overload the prev_pgno field. 186 */ 187 #define RE_NREC(p) \ 188 (TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : \ 189 TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p)) 190 #define RE_NREC_ADJ(p, adj) \ 191 PREV_PGNO(p) += adj; 192 #define RE_NREC_SET(p, num) \ 193 PREV_PGNO(p) = num; 194 195 /* 196 * Initialize a page. 197 * 198 * !!! 199 * Don't modify the page's LSN, code depends on it being unchanged after a 200 * P_INIT call. 201 */ 202 #define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \ 203 PGNO(pg) = n; \ 204 PREV_PGNO(pg) = pg_prev; \ 205 NEXT_PGNO(pg) = pg_next; \ 206 NUM_ENT(pg) = 0; \ 207 HOFFSET(pg) = pg_size; \ 208 LEVEL(pg) = btl; \ 209 TYPE(pg) = pg_type; \ 210 } while (0) 211 212 /* Page header length (offset to first index). */ 213 #define P_OVERHEAD (SSZA(PAGE, inp)) 214 215 /* First free byte. */ 216 #define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t)) 217 218 /* Free space on the page. */ 219 #define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg)) 220 221 /* Get a pointer to the bytes at a specific index. */ 222 #define P_ENTRY(pg, indx) ((u_int8_t *)pg + ((PAGE *)pg)->inp[indx]) 223 224 /************************************************************************ 225 OVERFLOW PAGE LAYOUT 226 ************************************************************************/ 227 228 /* 229 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which 230 * store a page number (the first page of the overflow item) and a length 231 * (the total length of the overflow item). The overflow item consists of 232 * some number of overflow pages, linked by the next_pgno field of the page. 233 * A next_pgno field of PGNO_INVALID flags the end of the overflow item. 234 * 235 * Overflow page overloads: 236 * The amount of overflow data stored on each page is stored in the 237 * hf_offset field. 238 * 239 * The implementation reference counts overflow items as it's possible 240 * for them to be promoted onto btree internal pages. The reference 241 * count is stored in the entries field. 242 */ 243 #define OV_LEN(p) (((PAGE *)p)->hf_offset) 244 #define OV_REF(p) (((PAGE *)p)->entries) 245 246 /* Maximum number of bytes that you can put on an overflow page. */ 247 #define P_MAXSPACE(psize) ((psize) - P_OVERHEAD) 248 249 /************************************************************************ 250 HASH PAGE LAYOUT 251 ************************************************************************/ 252 253 /* Each index references a group of bytes on the page. */ 254 #define H_KEYDATA 1 /* Key/data item. */ 255 #define H_DUPLICATE 2 /* Duplicate key/data item. */ 256 #define H_OFFPAGE 3 /* Overflow key/data item. */ 257 #define H_OFFDUP 4 /* Overflow page of duplicates. */ 258 259 /* 260 * !!! 261 * Items on hash pages are (potentially) unaligned, so we can never cast the 262 * (page + offset) pointer to an HKEYDATA, HOFFPAGE or HOFFDUP structure, as 263 * we do with B+tree on-page structures. Because we frequently want the type 264 * field, it requires no alignment, and it's in the same location in all three 265 * structures, there's a pair of macros. 266 */ 267 #define HPAGE_PTYPE(p) (*(u_int8_t *)p) 268 #define HPAGE_TYPE(pg, indx) (*P_ENTRY(pg, indx)) 269 270 /* 271 * The first and second types are H_KEYDATA and H_DUPLICATE, represented 272 * by the HKEYDATA structure: 273 * 274 * +-----------------------------------+ 275 * | type | key/data ... | 276 * +-----------------------------------+ 277 * 278 * For duplicates, the data field encodes duplicate elements in the data 279 * field: 280 * 281 * +---------------------------------------------------------------+ 282 * | type | len1 | element1 | len1 | len2 | element2 | len2 | 283 * +---------------------------------------------------------------+ 284 * 285 * Thus, by keeping track of the offset in the element, we can do both 286 * backward and forward traversal. 287 */ 288 typedef struct _hkeydata { 289 u_int8_t type; /* 00: Page type. */ 290 u_int8_t data[1]; /* Variable length key/data item. */ 291 } HKEYDATA; 292 #define HKEYDATA_DATA(p) (((u_int8_t *)p) + SSZA(HKEYDATA, data)) 293 294 /* 295 * The length of any HKEYDATA item. Note that indx is an element index, 296 * not a PAIR index. 297 */ 298 #define LEN_HITEM(pg, pgsize, indx) \ 299 (((indx) == 0 ? pgsize : pg->inp[indx - 1]) - pg->inp[indx]) 300 301 #define LEN_HKEYDATA(pg, psize, indx) \ 302 (((indx) == 0 ? psize : pg->inp[indx - 1]) - \ 303 pg->inp[indx] - HKEYDATA_SIZE(0)) 304 305 /* 306 * Page space required to add a new HKEYDATA item to the page, with and 307 * without the index value. 308 */ 309 #define HKEYDATA_SIZE(len) \ 310 ((len) + SSZA(HKEYDATA, data)) 311 #define HKEYDATA_PSIZE(len) \ 312 (HKEYDATA_SIZE(len) + sizeof(db_indx_t)) 313 314 /* Put a HKEYDATA item at the location referenced by a page entry. */ 315 #define PUT_HKEYDATA(pe, kd, len, type) { \ 316 ((HKEYDATA *)pe)->type = type; \ 317 memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len); \ 318 } 319 320 /* 321 * Macros the describe the page layout in terms of key-data pairs. 322 * The use of "pindex" indicates that the argument is the index 323 * expressed in pairs instead of individual elements. 324 */ 325 #define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2) 326 #define H_KEYINDEX(pindx) (2 * (pindx)) 327 #define H_DATAINDEX(pindx) ((2 * (pindx)) + 1) 328 #define H_PAIRKEY(pg, pindx) P_ENTRY(pg, H_KEYINDEX(pindx)) 329 #define H_PAIRDATA(pg, pindx) P_ENTRY(pg, H_DATAINDEX(pindx)) 330 #define H_PAIRSIZE(pg, psize, pindx) \ 331 (LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) + \ 332 LEN_HITEM(pg, psize, H_DATAINDEX(pindx))) 333 #define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx)) 334 #define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx)) 335 336 /* 337 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure: 338 */ 339 typedef struct _hoffpage { 340 u_int8_t type; /* 00: Page type and delete flag. */ 341 u_int8_t unused[3]; /* 01-03: Padding, unused. */ 342 db_pgno_t pgno; /* 04-07: Offpage page number. */ 343 u_int32_t tlen; /* 08-11: Total length of item. */ 344 } HOFFPAGE; 345 346 #define HOFFPAGE_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, pgno)) 347 #define HOFFPAGE_TLEN(p) (((u_int8_t *)p) + SSZ(HOFFPAGE, tlen)) 348 349 /* 350 * Page space required to add a new HOFFPAGE item to the page, with and 351 * without the index value. 352 */ 353 #define HOFFPAGE_SIZE (sizeof(HOFFPAGE)) 354 #define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t)) 355 356 /* 357 * The fourth type is H_OFFDUP represented by the HOFFDUP structure: 358 */ 359 typedef struct _hoffdup { 360 u_int8_t type; /* 00: Page type and delete flag. */ 361 u_int8_t unused[3]; /* 01-03: Padding, unused. */ 362 db_pgno_t pgno; /* 04-07: Offpage page number. */ 363 } HOFFDUP; 364 #define HOFFDUP_PGNO(p) (((u_int8_t *)p) + SSZ(HOFFDUP, pgno)) 365 366 /* 367 * Page space required to add a new HOFFDUP item to the page, with and 368 * without the index value. 369 */ 370 #define HOFFDUP_SIZE (sizeof(HOFFDUP)) 371 #define HOFFDUP_PSIZE (HOFFDUP_SIZE + sizeof(db_indx_t)) 372 373 /************************************************************************ 374 BTREE PAGE LAYOUT 375 ************************************************************************/ 376 377 /* Each index references a group of bytes on the page. */ 378 #define B_KEYDATA 1 /* Key/data item. */ 379 #define B_DUPLICATE 2 /* Duplicate key/data item. */ 380 #define B_OVERFLOW 3 /* Overflow key/data item. */ 381 382 /* 383 * We have to store a deleted entry flag in the page. The reason is complex, 384 * but the simple version is that we can't delete on-page items referenced by 385 * a cursor -- the return order of subsequent insertions might be wrong. The 386 * delete flag is an overload of the top bit of the type byte. 387 */ 388 #define B_DELETE (0x80) 389 #define B_DCLR(t) (t) &= ~B_DELETE 390 #define B_DSET(t) (t) |= B_DELETE 391 #define B_DISSET(t) ((t) & B_DELETE) 392 393 #define B_TYPE(t) ((t) & ~B_DELETE) 394 #define B_TSET(t, type, deleted) { \ 395 (t) = (type); \ 396 if (deleted) \ 397 B_DSET(t); \ 398 } 399 400 /* 401 * The first type is B_KEYDATA, represented by the BKEYDATA structure: 402 */ 403 typedef struct _bkeydata { 404 db_indx_t len; /* 00-01: Key/data item length. */ 405 u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 406 u_int8_t data[1]; /* Variable length key/data item. */ 407 } BKEYDATA; 408 409 /* Get a BKEYDATA item for a specific index. */ 410 #define GET_BKEYDATA(pg, indx) \ 411 ((BKEYDATA *)P_ENTRY(pg, indx)) 412 413 /* 414 * Page space required to add a new BKEYDATA item to the page, with and 415 * without the index value. 416 */ 417 #define BKEYDATA_SIZE(len) \ 418 ALIGN((len) + SSZA(BKEYDATA, data), 4) 419 #define BKEYDATA_PSIZE(len) \ 420 (BKEYDATA_SIZE(len) + sizeof(db_indx_t)) 421 422 /* 423 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented 424 * by the BOVERFLOW structure. 425 */ 426 typedef struct _boverflow { 427 db_indx_t unused1; /* 00-01: Padding, unused. */ 428 u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 429 u_int8_t unused2; /* 03: Padding, unused. */ 430 db_pgno_t pgno; /* 04-07: Next page number. */ 431 u_int32_t tlen; /* 08-11: Total length of item. */ 432 } BOVERFLOW; 433 434 /* Get a BOVERFLOW item for a specific index. */ 435 #define GET_BOVERFLOW(pg, indx) \ 436 ((BOVERFLOW *)P_ENTRY(pg, indx)) 437 438 /* 439 * Page space required to add a new BOVERFLOW item to the page, with and 440 * without the index value. 441 */ 442 #define BOVERFLOW_SIZE \ 443 ALIGN(sizeof(BOVERFLOW), 4) 444 #define BOVERFLOW_PSIZE \ 445 (BOVERFLOW_SIZE + sizeof(db_indx_t)) 446 447 /* 448 * Btree leaf and hash page layouts group indices in sets of two, one 449 * for the key and one for the data. Everything else does it in sets 450 * of one to save space. I use the following macros so that it's real 451 * obvious what's going on... 452 */ 453 #define O_INDX 1 454 #define P_INDX 2 455 456 /************************************************************************ 457 BTREE INTERNAL PAGE LAYOUT 458 ************************************************************************/ 459 460 /* 461 * Btree internal entry. 462 */ 463 typedef struct _binternal { 464 db_indx_t len; /* 00-01: Key/data item length. */ 465 u_int8_t type; /* 02: Page type AND DELETE FLAG. */ 466 u_int8_t unused; /* 03: Padding, unused. */ 467 db_pgno_t pgno; /* 04-07: Page number of referenced page. */ 468 db_recno_t nrecs; /* 08-11: Subtree record count. */ 469 u_int8_t data[1]; /* Variable length key item. */ 470 } BINTERNAL; 471 472 /* Get a BINTERNAL item for a specific index. */ 473 #define GET_BINTERNAL(pg, indx) \ 474 ((BINTERNAL *)P_ENTRY(pg, indx)) 475 476 /* 477 * Page space required to add a new BINTERNAL item to the page, with and 478 * without the index value. 479 */ 480 #define BINTERNAL_SIZE(len) \ 481 ALIGN((len) + SSZA(BINTERNAL, data), 4) 482 #define BINTERNAL_PSIZE(len) \ 483 (BINTERNAL_SIZE(len) + sizeof(db_indx_t)) 484 485 /************************************************************************ 486 RECNO INTERNAL PAGE LAYOUT 487 ************************************************************************/ 488 489 /* 490 * The recno internal entry. 491 * 492 * XXX 493 * Why not fold this into the db_indx_t structure, it's fixed length? 494 */ 495 typedef struct _rinternal { 496 db_pgno_t pgno; /* 00-03: Page number of referenced page. */ 497 db_recno_t nrecs; /* 04-07: Subtree record count. */ 498 } RINTERNAL; 499 500 /* Get a RINTERNAL item for a specific index. */ 501 #define GET_RINTERNAL(pg, indx) \ 502 ((RINTERNAL *)P_ENTRY(pg, indx)) 503 504 /* 505 * Page space required to add a new RINTERNAL item to the page, with and 506 * without the index value. 507 */ 508 #define RINTERNAL_SIZE \ 509 ALIGN(sizeof(RINTERNAL), 4) 510 #define RINTERNAL_PSIZE \ 511 (RINTERNAL_SIZE + sizeof(db_indx_t)) 512 #endif /* _DB_PAGE_H_ */ 513