/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */
#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)mp_fget.c	10.53 (Sleepycat) 11/16/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif

#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "common_ext.h"

/*
 * memp_fget --
 *	Get a page from the file.
 */
int
memp_fget(dbmfp, pgnoaddr, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	u_int32_t flags;
	void *addrp;
{
	BH *bhp;
	DB_MPOOL *dbmp;
	MPOOL *mp;
	MPOOLFILE *mfp;
	size_t bucket, mf_offset;
	u_int32_t st_hsearch;
	int b_incr, first, ret;

	dbmp = dbmfp->dbmp;
	mp = dbmp->mp;
	mfp = dbmfp->mfp;

	MP_PANIC_CHECK(dbmp);

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret =
		    __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_CREATE:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
		case 0:
			break;
		default:
			return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
		}
	}

#ifdef DIAGNOSTIC
	/*
	 * XXX
	 * We want to switch threads as often as possible.  Yield every time
	 * we get a new page to ensure contention.
	 */
	if (DB_GLOBAL(db_pageyield))
		__os_yield(1);
#endif

	/* Initialize remaining local variables. */
	mf_offset = R_OFFSET(dbmp, mfp);
	bhp = NULL;
	st_hsearch = 0;
	b_incr = ret = 0;

	/* Determine the hash bucket where this page will live. */
	bucket = BUCKET(mp, mf_offset, *pgnoaddr);

	LOCKREGION(dbmp);

	/*
	 * Check for the last or last + 1 page requests.
	 *
	 * Examine and update the file's last_pgno value.  We don't care if
	 * the last_pgno value immediately changes due to another thread --
	 * at this instant in time, the value is correct.  We do increment the
	 * current last_pgno value if the thread is asking for a new page,
	 * however, to ensure that two threads creating pages don't get the
	 * same one.
	 */
	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
		if (LF_ISSET(DB_MPOOL_NEW))
			++mfp->last_pgno;
		*pgnoaddr = mfp->last_pgno;
		bucket = BUCKET(mp, mf_offset, mfp->last_pgno);

		if (LF_ISSET(DB_MPOOL_NEW))
			goto alloc;
	}

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * XXX
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP))
		if (*pgnoaddr > mfp->orig_last_pgno) {
			/*
			 * !!!
			 * See the comment above about non-existent pages and
			 * the hash access method.
			 */
			if (!LF_ISSET(DB_MPOOL_CREATE)) {
				__db_err(dbmp->dbenv,
				    "%s: page %lu doesn't exist",
				    __memp_fn(dbmfp), (u_long)*pgnoaddr);
				ret = EINVAL;
				goto err;
			}
		} else {
			*(void **)addrp =
			    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
			++mp->stat.st_map;
			++mfp->stat.st_map;
			goto done;
		}

	/* Search the hash chain for the page. */
	for (bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/* Increment the reference count. */
		if (bhp->ref == UINT16_T_MAX) {
			__db_err(dbmp->dbenv,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = EINVAL;
			goto err;
		}

		/*
		 * Increment the reference count.  We may discard the region
		 * lock as we evaluate and/or read the buffer, so we need to
		 * ensure that it doesn't move and that its contents remain
		 * unchanged.
		 */
		++bhp->ref;
		b_incr = 1;

		/*
		 * Any buffer we find might be trouble.
		 *
		 * BH_LOCKED --
		 * I/O is in progress.  Because we've incremented the buffer
		 * reference count, we know the buffer can't move.  Unlock
		 * the region lock, wait for the I/O to complete, and reacquire
		 * the region.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
			UNLOCKREGION(dbmp);

			/*
			 * Explicitly yield the processor if it's not the first
			 * pass through this loop -- if we don't, we might end
			 * up running to the end of our CPU quantum as we will
			 * simply be swapping between the two locks.
			 */
			if (!first)
				__os_yield(1);

			LOCKBUFFER(dbmp, bhp);
			/* Wait for I/O to finish... */
			UNLOCKBUFFER(dbmp, bhp);
			LOCKREGION(dbmp);
		}

		/*
		 * BH_TRASH --
		 * The contents of the buffer are garbage.  Shouldn't happen,
		 * and this read is likely to fail, but might as well try.
		 */
		if (F_ISSET(bhp, BH_TRASH))
			goto reread;

		/*
		 * BH_CALLPGIN --
		 * The buffer was converted so it could be written, and the
		 * contents need to be converted again.
		 */
		if (F_ISSET(bhp, BH_CALLPGIN)) {
			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
				goto err;
			F_CLR(bhp, BH_CALLPGIN);
		}

		++mp->stat.st_cache_hit;
		++mfp->stat.st_cache_hit;
		*(void **)addrp = bhp->buf;
		goto done;
	}

alloc:	/* Allocate new buffer header and data space. */
	if ((ret = __memp_alloc(dbmp, sizeof(BH) -
	    sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0)
		goto err;

#ifdef DIAGNOSTIC
	if ((ALIGNTYPE)bhp->buf & (sizeof(size_t) - 1)) {
		__db_err(dbmp->dbenv,
		    "Internal error: BH data NOT size_t aligned.");
		ret = EINVAL;
		goto err;
	}
#endif
	/* Initialize the BH fields. */
	memset(bhp, 0, sizeof(BH));
	LOCKINIT(dbmp, &bhp->mutex);
	bhp->ref = 1;
	bhp->pgno = *pgnoaddr;
	bhp->mf_offset = mf_offset;

	/*
	 * Prepend the bucket header to the head of the appropriate MPOOL
	 * bucket hash list.  Append the bucket header to the tail of the
	 * MPOOL LRU chain.
	 */
	SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, hq, __bh);
	SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);

	/*
	 * If we created the page, zero it out and continue.
	 *
	 * !!!
	 * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
	 * If DB_MPOOL_CREATE is used, then the application's pgin function
	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
	 * it can detect all of its page creates, and not bother.
	 *
	 * Otherwise, read the page into memory, optionally creating it if
	 * DB_MPOOL_CREATE is set.
	 */
	if (LF_ISSET(DB_MPOOL_NEW)) {
		if (mfp->clear_len == 0)
			memset(bhp->buf, 0, mfp->stat.st_pagesize);
		else {
			memset(bhp->buf, 0, mfp->clear_len);
#ifdef DIAGNOSTIC
			memset(bhp->buf + mfp->clear_len, 0xdb,
			    mfp->stat.st_pagesize - mfp->clear_len);
#endif
		}

		++mp->stat.st_page_create;
		++mfp->stat.st_page_create;
	} else {
		/*
		 * It's possible for the read function to fail, which means
		 * that we fail as well.  Note, the __memp_pgread() function
		 * discards the region lock, so the buffer must be pinned
		 * down so that it cannot move and its contents are unchanged.
		 */
reread:		if ((ret = __memp_pgread(dbmfp,
		    bhp, LF_ISSET(DB_MPOOL_CREATE))) != 0) {
			/*
			 * !!!
			 * Discard the buffer unless another thread is waiting
			 * on our I/O to complete.  Regardless, the header has
			 * the BH_TRASH flag set.
			 */
			if (bhp->ref == 1)
				__memp_bhfree(dbmp, mfp, bhp, 1);
			goto err;
		}

		++mp->stat.st_cache_miss;
		++mfp->stat.st_cache_miss;
	}

	/*
	 * If we're returning a page after our current notion of the last-page,
	 * update our information.  Note, there's no way to un-instantiate this
	 * page, it's going to exist whether it's returned to us dirty or not.
	 */
	if (bhp->pgno > mfp->last_pgno)
		mfp->last_pgno = bhp->pgno;

	++mp->stat.st_page_clean;
	*(void **)addrp = bhp->buf;

done:	/* Update the chain search statistics. */
	if (st_hsearch) {
		++mp->stat.st_hash_searches;
		if (st_hsearch > mp->stat.st_hash_longest)
			mp->stat.st_hash_longest = st_hsearch;
		mp->stat.st_hash_examined += st_hsearch;
	}

	++dbmfp->pinref;

	UNLOCKREGION(dbmp);

	return (0);

err:	/* Discard our reference. */
	if (b_incr)
		--bhp->ref;
	UNLOCKREGION(dbmp);

	*(void **)addrp = NULL;
	return (ret);
}
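
/*
 * !!!
 * Editor's sketch, not part of the original distribution: a minimal
 * illustration of the pin/modify/unpin cycle memp_fget() supports,
 * assuming a DB_MPOOL already opened via memp_open() and the DB 2.x
 * memp_fopen()/memp_fput()/memp_fclose() interfaces.  The function name
 * ex_touch_page and the constant EX_PAGESIZE are hypothetical.  The
 * guard macro is never defined, so this is never compiled in.
 */
#ifdef MEMP_FGET_EXAMPLE
#define	EX_PAGESIZE	1024

static int
ex_touch_page(dbmp, path)
	DB_MPOOL *dbmp;
	const char *path;
{
	DB_MPOOLFILE *dbmfp;
	db_pgno_t pgno;
	void *addr;
	int ret;

	/* Open the underlying file through the memory pool. */
	if ((ret = memp_fopen(dbmp,
	    path, 0, 0, EX_PAGESIZE, NULL, &dbmfp)) != 0)
		return (ret);

	/*
	 * Pin page 0, creating it if it doesn't yet exist.  The page
	 * remains pinned (bhp->ref held) until memp_fput() is called.
	 */
	pgno = 0;
	if ((ret = memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
		goto err;

	/* Modify the page, then return it to the pool marked dirty. */
	memset(addr, 0, EX_PAGESIZE);
	if ((ret = memp_fput(dbmfp, addr, DB_MPOOL_DIRTY)) != 0)
		goto err;

err:	(void)memp_fclose(dbmfp);
	return (ret);
}
#endif /* MEMP_FGET_EXAMPLE */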