1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 * 7 * @(#)mp.h 10.37 (Sleepycat) 1/1/99 8 */ 9 10 struct __bh; typedef struct __bh BH; 11 struct __db_mpreg; typedef struct __db_mpreg DB_MPREG; 12 struct __mpool; typedef struct __mpool MPOOL; 13 struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE; 14 15 /* Default mpool name. */ 16 #define DB_DEFAULT_MPOOL_FILE "__db_mpool.share" 17 18 /* 19 * We default to 256K (32 8K pages) if the user doesn't specify, and 20 * require a minimum of 20K. 21 */ 22 #ifndef DB_CACHESIZE_DEF 23 #define DB_CACHESIZE_DEF (256 * 1024) 24 #endif 25 #define DB_CACHESIZE_MIN ( 20 * 1024) 26 27 #define INVALID 0 /* Invalid shared memory offset. */ 28 29 /* 30 * There are three ways we do locking in the mpool code: 31 * 32 * Locking a handle mutex to provide concurrency for DB_THREAD operations. 33 * Locking the region mutex to provide mutual exclusion while reading and 34 * writing structures in the shared region. 35 * Locking buffer header mutexes during I/O. 36 * 37 * The first will not be further described here. We use the shared mpool 38 * region lock to provide mutual exclusion while reading/modifying all of 39 * the data structures, including the buffer headers. We use a per-buffer 40 * header lock to wait on buffer I/O. The order of locking is as follows: 41 * 42 * Searching for a buffer: 43 * Acquire the region lock. 44 * Find the buffer header. 45 * Increment the reference count (guarantee the buffer stays). 46 * While the BH_LOCKED flag is set (I/O is going on) { 47 * Release the region lock. 48 * Explicitly yield the processor if it's not the first pass 49 * through this loop, otherwise, we can simply spin because 50 * we'll be simply switching between the two locks. 51 * Request the buffer lock. 52 * The I/O will complete... 53 * Acquire the buffer lock. 54 * Release the buffer lock. 55 * Acquire the region lock. 56 * } 57 * Return the buffer. 58 * 59 * Reading/writing a buffer: 60 * Acquire the region lock. 61 * Find/create the buffer header. 62 * If reading, increment the reference count (guarantee the buffer stays). 63 * Set the BH_LOCKED flag. 64 * Acquire the buffer lock (guaranteed not to block). 65 * Release the region lock. 66 * Do the I/O and/or initialize the buffer contents. 67 * Release the buffer lock. 68 * At this point, the buffer lock is available, but the logical 69 * operation (flagged by BH_LOCKED) is not yet completed. For 70 * this reason, among others, threads checking the BH_LOCKED flag 71 * must loop around their test. 72 * Acquire the region lock. 73 * Clear the BH_LOCKED flag. 74 * Release the region lock. 75 * Return/discard the buffer. 76 * 77 * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not 78 * reacquired when a region lock is reacquired because they couldn't have been 79 * closed/discarded and because they never move in memory. 80 */ 81 #define LOCKINIT(dbmp, mutexp) \ 82 if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION)) \ 83 (void)__db_mutex_init(mutexp, \ 84 MUTEX_LOCK_OFFSET((dbmp)->reginfo.addr, mutexp)) 85 86 #define LOCKHANDLE(dbmp, mutexp) \ 87 if (F_ISSET(dbmp, MP_LOCKHANDLE)) \ 88 (void)__db_mutex_lock(mutexp, (dbmp)->reginfo.fd) 89 #define UNLOCKHANDLE(dbmp, mutexp) \ 90 if (F_ISSET(dbmp, MP_LOCKHANDLE)) \ 91 (void)__db_mutex_unlock(mutexp, (dbmp)->reginfo.fd) 92 93 #define LOCKREGION(dbmp) \ 94 if (F_ISSET(dbmp, MP_LOCKREGION)) \ 95 (void)__db_mutex_lock(&((RLAYOUT *)(dbmp)->mp)->lock, \ 96 (dbmp)->reginfo.fd) 97 #define UNLOCKREGION(dbmp) \ 98 if (F_ISSET(dbmp, MP_LOCKREGION)) \ 99 (void)__db_mutex_unlock(&((RLAYOUT *)(dbmp)->mp)->lock, \ 100 (dbmp)->reginfo.fd) 101 102 #define LOCKBUFFER(dbmp, bhp) \ 103 if (F_ISSET(dbmp, MP_LOCKREGION)) \ 104 (void)__db_mutex_lock(&(bhp)->mutex, (dbmp)->reginfo.fd) 105 #define UNLOCKBUFFER(dbmp, bhp) \ 106 if (F_ISSET(dbmp, MP_LOCKREGION)) \ 107 (void)__db_mutex_unlock(&(bhp)->mutex, (dbmp)->reginfo.fd) 108 109 /* Check for region catastrophic shutdown. */ 110 #define MP_PANIC_CHECK(dbmp) { \ 111 if ((dbmp)->mp->rlayout.panic) \ 112 return (DB_RUNRECOVERY); \ 113 } 114 115 /* 116 * DB_MPOOL -- 117 * Per-process memory pool structure. 118 */ 119 struct __db_mpool { 120 /* These fields need to be protected for multi-threaded support. */ 121 db_mutex_t *mutexp; /* Structure lock. */ 122 123 /* List of pgin/pgout routines. */ 124 LIST_HEAD(__db_mpregh, __db_mpreg) dbregq; 125 126 /* List of DB_MPOOLFILE's. */ 127 TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq; 128 129 /* These fields are not protected. */ 130 DB_ENV *dbenv; /* Reference to error information. */ 131 REGINFO reginfo; /* Region information. */ 132 133 MPOOL *mp; /* Address of the shared MPOOL. */ 134 135 void *addr; /* Address of shalloc() region. */ 136 137 DB_HASHTAB *htab; /* Hash table of bucket headers. */ 138 139 #define MP_LOCKHANDLE 0x01 /* Threaded, lock handles and region. */ 140 #define MP_LOCKREGION 0x02 /* Concurrent access, lock region. */ 141 u_int32_t flags; 142 }; 143 144 /* 145 * DB_MPREG -- 146 * DB_MPOOL registry of pgin/pgout functions. 147 */ 148 struct __db_mpreg { 149 LIST_ENTRY(__db_mpreg) q; /* Linked list. */ 150 151 int ftype; /* File type. */ 152 /* Pgin, pgout routines. */ 153 int (DB_CALLBACK *pgin) __P((db_pgno_t, void *, DBT *)); 154 int (DB_CALLBACK *pgout) __P((db_pgno_t, void *, DBT *)); 155 }; 156 157 /* 158 * DB_MPOOLFILE -- 159 * Per-process DB_MPOOLFILE information. 160 */ 161 struct __db_mpoolfile { 162 /* These fields need to be protected for multi-threaded support. */ 163 db_mutex_t *mutexp; /* Structure lock. */ 164 165 int fd; /* Underlying file descriptor. */ 166 167 u_int32_t ref; /* Reference count. */ 168 169 /* 170 * !!! 171 * This field is a special case -- it's protected by the region lock 172 * NOT the thread lock. The reason for this is that we always have 173 * the region lock immediately before or after we modify the field, 174 * and we don't want to use the structure lock to protect it because 175 * then I/O (which is done with the structure lock held because of 176 * the race between the seek and write of the file descriptor) will 177 * block any other put/get calls using this DB_MPOOLFILE structure. 178 */ 179 u_int32_t pinref; /* Pinned block reference count. */ 180 181 /* These fields are not protected. */ 182 TAILQ_ENTRY(__db_mpoolfile) q; /* Linked list of DB_MPOOLFILE's. */ 183 184 DB_MPOOL *dbmp; /* Overlying DB_MPOOL. */ 185 MPOOLFILE *mfp; /* Underlying MPOOLFILE. */ 186 187 void *addr; /* Address of mmap'd region. */ 188 size_t len; /* Length of mmap'd region. */ 189 190 /* These fields need to be protected for multi-threaded support. */ 191 #define MP_READONLY 0x01 /* File is readonly. */ 192 #define MP_UPGRADE 0x02 /* File descriptor is readwrite. */ 193 #define MP_UPGRADE_FAIL 0x04 /* Upgrade wasn't possible. */ 194 u_int32_t flags; 195 }; 196 197 /* 198 * MPOOL -- 199 * Shared memory pool region. One of these is allocated in shared 200 * memory, and describes the pool. 201 */ 202 struct __mpool { 203 RLAYOUT rlayout; /* General region information. */ 204 205 SH_TAILQ_HEAD(__bhq) bhq; /* LRU list of buckets. */ 206 SH_TAILQ_HEAD(__bhfq) bhfq; /* Free buckets. */ 207 SH_TAILQ_HEAD(__mpfq) mpfq; /* List of MPOOLFILEs. */ 208 209 /* 210 * We make the assumption that the early pages of the file are far 211 * more likely to be retrieved than the later pages, which means 212 * that the top bits are more interesting for hashing since they're 213 * less likely to collide. On the other hand, since 512 4K pages 214 * represents a 2MB file, only the bottom 9 bits of the page number 215 * are likely to be set. We XOR in the offset in the MPOOL of the 216 * MPOOLFILE that backs this particular page, since that should also 217 * be unique for the page. 218 */ 219 #define BUCKET(mp, mf_offset, pgno) \ 220 (((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets) 221 222 size_t htab; /* Hash table offset. */ 223 size_t htab_buckets; /* Number of hash table entries. */ 224 225 DB_LSN lsn; /* Maximum checkpoint LSN. */ 226 u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */ 227 228 DB_MPOOL_STAT stat; /* Global mpool statistics. */ 229 230 #define MP_LSN_RETRY 0x01 /* Retry all BH_WRITE buffers. */ 231 u_int32_t flags; 232 }; 233 234 /* 235 * MPOOLFILE -- 236 * Shared DB_MPOOLFILE information. 237 */ 238 struct __mpoolfile { 239 SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */ 240 241 u_int32_t ref; /* Reference count. */ 242 243 int ftype; /* File type. */ 244 245 int32_t lsn_off; /* Page's LSN offset. */ 246 u_int32_t clear_len; /* Bytes to clear on page create. */ 247 248 size_t path_off; /* File name location. */ 249 size_t fileid_off; /* File identification location. */ 250 251 size_t pgcookie_len; /* Pgin/pgout cookie length. */ 252 size_t pgcookie_off; /* Pgin/pgout cookie location. */ 253 254 u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */ 255 256 db_pgno_t last_pgno; /* Last page in the file. */ 257 db_pgno_t orig_last_pgno; /* Original last page in the file. */ 258 259 #define MP_CAN_MMAP 0x01 /* If the file can be mmap'd. */ 260 #define MP_TEMP 0x02 /* Backing file is a temporary. */ 261 u_int32_t flags; 262 263 DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */ 264 }; 265 266 /* 267 * BH -- 268 * Buffer header. 269 */ 270 struct __bh { 271 db_mutex_t mutex; /* Structure lock. */ 272 273 u_int16_t ref; /* Reference count. */ 274 275 #define BH_CALLPGIN 0x001 /* Page needs to be reworked... */ 276 #define BH_DIRTY 0x002 /* Page was modified. */ 277 #define BH_DISCARD 0x004 /* Page is useless. */ 278 #define BH_LOCKED 0x008 /* Page is locked (I/O in progress). */ 279 #define BH_TRASH 0x010 /* Page is garbage. */ 280 #define BH_WRITE 0x020 /* Page scheduled for writing. */ 281 u_int16_t flags; 282 283 SH_TAILQ_ENTRY q; /* LRU queue. */ 284 SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ 285 286 db_pgno_t pgno; /* Underlying MPOOLFILE page number. */ 287 size_t mf_offset; /* Associated MPOOLFILE offset. */ 288 289 /* 290 * !!! 291 * This array must be size_t aligned -- the DB access methods put PAGE 292 * and other structures into it, and expect to be able to access them 293 * directly. (We guarantee size_t alignment in the db_mpool(3) manual 294 * page as well.) 295 */ 296 u_int8_t buf[1]; /* Variable length data. */ 297 }; 298 299 #include "mp_ext.h" 300