1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 #include "config.h" 8 9 #ifndef lint 10 static const char sccsid[] = "@(#)mp_fopen.c 10.60 (Sleepycat) 1/1/99"; 11 #endif /* not lint */ 12 13 #ifndef NO_SYSTEM_INCLUDES 14 #include <sys/types.h> 15 16 #include <errno.h> 17 #include <string.h> 18 #endif 19 20 #include "db_int.h" 21 #include "shqueue.h" 22 #include "db_shash.h" 23 #include "mp.h" 24 #include "common_ext.h" 25 26 static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *)); 27 static int __memp_mf_open __P((DB_MPOOL *, 28 const char *, size_t, db_pgno_t, DB_MPOOL_FINFO *, MPOOLFILE **)); 29 30 /* 31 * memp_fopen -- 32 * Open a backing file for the memory pool. 33 */ 34 int 35 memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp) 36 DB_MPOOL *dbmp; 37 const char *path; 38 u_int32_t flags; 39 int mode; 40 size_t pagesize; 41 DB_MPOOL_FINFO *finfop; 42 DB_MPOOLFILE **retp; 43 { 44 int ret; 45 46 MP_PANIC_CHECK(dbmp); 47 48 /* Validate arguments. */ 49 if ((ret = __db_fchk(dbmp->dbenv, 50 "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0) 51 return (ret); 52 53 /* Require a non-zero pagesize. */ 54 if (pagesize == 0) { 55 __db_err(dbmp->dbenv, "memp_fopen: pagesize not specified"); 56 return (EINVAL); 57 } 58 if (finfop != NULL && finfop->clear_len > pagesize) 59 return (EINVAL); 60 61 return (__memp_fopen(dbmp, 62 NULL, path, flags, mode, pagesize, 1, finfop, retp)); 63 } 64 65 /* 66 * __memp_fopen -- 67 * Open a backing file for the memory pool; internal version. 68 * 69 * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *, 70 * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); 71 */ 72 int 73 __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) 74 DB_MPOOL *dbmp; 75 MPOOLFILE *mfp; 76 const char *path; 77 u_int32_t flags; 78 int mode, needlock; 79 size_t pagesize; 80 DB_MPOOL_FINFO *finfop; 81 DB_MPOOLFILE **retp; 82 { 83 DB_ENV *dbenv; 84 DB_MPOOLFILE *dbmfp; 85 DB_MPOOL_FINFO finfo; 86 db_pgno_t last_pgno; 87 size_t maxmap; 88 u_int32_t mbytes, bytes; 89 int ret; 90 u_int8_t idbuf[DB_FILE_ID_LEN]; 91 char *rpath; 92 93 dbenv = dbmp->dbenv; 94 ret = 0; 95 rpath = NULL; 96 97 /* 98 * If mfp is provided, we take the DB_MPOOL_FINFO information from 99 * the mfp. We don't bother initializing everything, because some 100 * of them are expensive to acquire. If no mfp is provided and the 101 * finfop argument is NULL, we default the values. 102 */ 103 if (finfop == NULL) { 104 memset(&finfo, 0, sizeof(finfo)); 105 if (mfp != NULL) { 106 finfo.ftype = mfp->ftype; 107 finfo.pgcookie = NULL; 108 finfo.fileid = NULL; 109 finfo.lsn_offset = mfp->lsn_off; 110 finfo.clear_len = mfp->clear_len; 111 } else { 112 finfo.ftype = 0; 113 finfo.pgcookie = NULL; 114 finfo.fileid = NULL; 115 finfo.lsn_offset = -1; 116 finfo.clear_len = 0; 117 } 118 finfop = &finfo; 119 } 120 121 /* Allocate and initialize the per-process structure. */ 122 if ((ret = __os_calloc(1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) 123 return (ret); 124 dbmfp->dbmp = dbmp; 125 dbmfp->fd = -1; 126 dbmfp->ref = 1; 127 if (LF_ISSET(DB_RDONLY)) 128 F_SET(dbmfp, MP_READONLY); 129 130 if (path == NULL) { 131 if (LF_ISSET(DB_RDONLY)) { 132 __db_err(dbenv, 133 "memp_fopen: temporary files can't be readonly"); 134 ret = EINVAL; 135 goto err; 136 } 137 last_pgno = 0; 138 } else { 139 /* Get the real name for this file and open it. */ 140 if ((ret = __db_appname(dbenv, 141 DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0) 142 goto err; 143 if ((ret = __db_open(rpath, 144 LF_ISSET(DB_CREATE | DB_RDONLY), 145 DB_CREATE | DB_RDONLY, mode, &dbmfp->fd)) != 0) { 146 __db_err(dbenv, "%s: %s", rpath, strerror(ret)); 147 goto err; 148 } 149 150 /* 151 * Don't permit files that aren't a multiple of the pagesize, 152 * and find the number of the last page in the file, all the 153 * time being careful not to overflow 32 bits. 154 * 155 * !!! 156 * We can't use off_t's here, or in any code in the mainline 157 * library for that matter. (We have to use them in the os 158 * stubs, of course, as there are system calls that take them 159 * as arguments.) The reason is that some customers build in 160 * environments where an off_t is 32-bits, but still run where 161 * offsets are 64-bits, and they pay us a lot of money. 162 */ 163 if ((ret = __os_ioinfo(rpath, 164 dbmfp->fd, &mbytes, &bytes, NULL)) != 0) { 165 __db_err(dbenv, "%s: %s", rpath, strerror(ret)); 166 goto err; 167 } 168 169 /* Page sizes have to be a power-of-two, ignore mbytes. */ 170 if (bytes % pagesize != 0) { 171 __db_err(dbenv, 172 "%s: file size not a multiple of the pagesize", 173 rpath); 174 ret = EINVAL; 175 goto err; 176 } 177 178 last_pgno = mbytes * (MEGABYTE / pagesize); 179 last_pgno += bytes / pagesize; 180 181 /* Correction: page numbers are zero-based, not 1-based. */ 182 if (last_pgno != 0) 183 --last_pgno; 184 185 /* 186 * Get the file id if we weren't given one. Generated file id's 187 * don't use timestamps, otherwise there'd be no chance of any 188 * other process joining the party. 189 */ 190 if (finfop->fileid == NULL) { 191 if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) 192 goto err; 193 finfop->fileid = idbuf; 194 } 195 } 196 197 /* 198 * If we weren't provided an underlying shared object to join with, 199 * find/allocate the shared file objects. Also allocate space for 200 * for the per-process thread lock. 201 */ 202 if (needlock) 203 LOCKREGION(dbmp); 204 205 if (mfp == NULL) 206 ret = __memp_mf_open(dbmp, 207 path, pagesize, last_pgno, finfop, &mfp); 208 else { 209 ++mfp->ref; 210 ret = 0; 211 } 212 if (ret == 0 && 213 F_ISSET(dbmp, MP_LOCKHANDLE) && (ret = 214 __memp_alloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0) 215 LOCKINIT(dbmp, dbmfp->mutexp); 216 217 if (needlock) 218 UNLOCKREGION(dbmp); 219 if (ret != 0) 220 goto err; 221 222 dbmfp->mfp = mfp; 223 224 /* 225 * If a file: 226 * + is read-only 227 * + isn't temporary 228 * + doesn't require any pgin/pgout support 229 * + the DB_NOMMAP flag wasn't set 230 * + and is less than mp_mmapsize bytes in size 231 * 232 * we can mmap it instead of reading/writing buffers. Don't do error 233 * checking based on the mmap call failure. We want to do normal I/O 234 * on the file if the reason we failed was because the file was on an 235 * NFS mounted partition, and we can fail in buffer I/O just as easily 236 * as here. 237 * 238 * XXX 239 * We'd like to test to see if the file is too big to mmap. Since we 240 * don't know what size or type off_t's or size_t's are, or the largest 241 * unsigned integral type is, or what random insanity the local C 242 * compiler will perpetrate, doing the comparison in a portable way is 243 * flatly impossible. Hope that mmap fails if the file is too large. 244 */ 245 #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ 246 if (F_ISSET(mfp, MP_CAN_MMAP)) { 247 if (!F_ISSET(dbmfp, MP_READONLY)) 248 F_CLR(mfp, MP_CAN_MMAP); 249 if (path == NULL) 250 F_CLR(mfp, MP_CAN_MMAP); 251 if (finfop->ftype != 0) 252 F_CLR(mfp, MP_CAN_MMAP); 253 if (LF_ISSET(DB_NOMMAP)) 254 F_CLR(mfp, MP_CAN_MMAP); 255 maxmap = dbenv == NULL || dbenv->mp_mmapsize == 0 ? 256 DB_MAXMMAPSIZE : dbenv->mp_mmapsize; 257 if (mbytes > maxmap / MEGABYTE || 258 (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) 259 F_CLR(mfp, MP_CAN_MMAP); 260 } 261 dbmfp->addr = NULL; 262 if (F_ISSET(mfp, MP_CAN_MMAP)) { 263 dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; 264 if (__db_mapfile(rpath, 265 dbmfp->fd, dbmfp->len, 1, &dbmfp->addr) != 0) { 266 dbmfp->addr = NULL; 267 F_CLR(mfp, MP_CAN_MMAP); 268 } 269 } 270 if (rpath != NULL) 271 __os_freestr(rpath); 272 273 LOCKHANDLE(dbmp, dbmp->mutexp); 274 TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); 275 UNLOCKHANDLE(dbmp, dbmp->mutexp); 276 277 *retp = dbmfp; 278 return (0); 279 280 err: /* 281 * Note that we do not have to free the thread mutex, because we 282 * never get to here after we have successfully allocated it. 283 */ 284 if (rpath != NULL) 285 __os_freestr(rpath); 286 if (dbmfp->fd != -1) 287 (void)__os_close(dbmfp->fd); 288 if (dbmfp != NULL) 289 __os_free(dbmfp, sizeof(DB_MPOOLFILE)); 290 return (ret); 291 } 292 293 /* 294 * __memp_mf_open -- 295 * Open an MPOOLFILE. 296 */ 297 static int 298 __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp) 299 DB_MPOOL *dbmp; 300 const char *path; 301 size_t pagesize; 302 db_pgno_t last_pgno; 303 DB_MPOOL_FINFO *finfop; 304 MPOOLFILE **retp; 305 { 306 MPOOLFILE *mfp; 307 int ret; 308 void *p; 309 310 #define ISTEMPORARY (path == NULL) 311 312 /* 313 * Walk the list of MPOOLFILE's, looking for a matching file. 314 * Temporary files can't match previous files. 315 */ 316 if (!ISTEMPORARY) 317 for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); 318 mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { 319 if (F_ISSET(mfp, MP_TEMP)) 320 continue; 321 if (!memcmp(finfop->fileid, 322 R_ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) { 323 if (finfop->clear_len != mfp->clear_len || 324 finfop->ftype != mfp->ftype || 325 pagesize != mfp->stat.st_pagesize) { 326 __db_err(dbmp->dbenv, 327 "%s: ftype, clear length or pagesize changed", 328 path); 329 return (EINVAL); 330 } 331 332 /* Found it: increment the reference count. */ 333 ++mfp->ref; 334 *retp = mfp; 335 return (0); 336 } 337 } 338 339 /* Allocate a new MPOOLFILE. */ 340 if ((ret = __memp_alloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0) 341 return (ret); 342 *retp = mfp; 343 344 /* Initialize the structure. */ 345 memset(mfp, 0, sizeof(MPOOLFILE)); 346 mfp->ref = 1; 347 mfp->ftype = finfop->ftype; 348 mfp->lsn_off = finfop->lsn_offset; 349 mfp->clear_len = finfop->clear_len; 350 351 /* 352 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget, 353 * we have to know the last page in the file. Figure it out and save 354 * it away. 355 */ 356 mfp->stat.st_pagesize = pagesize; 357 mfp->orig_last_pgno = mfp->last_pgno = last_pgno; 358 359 if (ISTEMPORARY) 360 F_SET(mfp, MP_TEMP); 361 else { 362 /* Copy the file path into shared memory. */ 363 if ((ret = __memp_alloc(dbmp, 364 strlen(path) + 1, &mfp->path_off, &p)) != 0) 365 goto err; 366 memcpy(p, path, strlen(path) + 1); 367 368 /* Copy the file identification string into shared memory. */ 369 if ((ret = __memp_alloc(dbmp, 370 DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) 371 goto err; 372 memcpy(p, finfop->fileid, DB_FILE_ID_LEN); 373 374 F_SET(mfp, MP_CAN_MMAP); 375 } 376 377 /* Copy the page cookie into shared memory. */ 378 if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) { 379 mfp->pgcookie_len = 0; 380 mfp->pgcookie_off = 0; 381 } else { 382 if ((ret = __memp_alloc(dbmp, 383 finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) 384 goto err; 385 memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size); 386 mfp->pgcookie_len = finfop->pgcookie->size; 387 } 388 389 /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ 390 SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile); 391 392 if (0) { 393 err: if (mfp->path_off != 0) 394 __db_shalloc_free(dbmp->addr, 395 R_ADDR(dbmp, mfp->path_off)); 396 if (mfp->fileid_off != 0) 397 __db_shalloc_free(dbmp->addr, 398 R_ADDR(dbmp, mfp->fileid_off)); 399 if (mfp != NULL) 400 __db_shalloc_free(dbmp->addr, mfp); 401 mfp = NULL; 402 } 403 return (0); 404 } 405 406 /* 407 * memp_fclose -- 408 * Close a backing file for the memory pool. 409 */ 410 int 411 memp_fclose(dbmfp) 412 DB_MPOOLFILE *dbmfp; 413 { 414 DB_MPOOL *dbmp; 415 int ret, t_ret; 416 417 dbmp = dbmfp->dbmp; 418 ret = 0; 419 420 MP_PANIC_CHECK(dbmp); 421 422 for (;;) { 423 LOCKHANDLE(dbmp, dbmp->mutexp); 424 425 /* 426 * We have to reference count DB_MPOOLFILE structures as other 427 * threads may be using them. The problem only happens if the 428 * application makes a bad design choice. Here's the path: 429 * 430 * Thread A opens a database. 431 * Thread B uses thread A's DB_MPOOLFILE to write a buffer 432 * in order to free up memory in the mpool cache. 433 * Thread A closes the database while thread B is using the 434 * DB_MPOOLFILE structure. 435 * 436 * By opening all databases before creating the threads, and 437 * closing them after the threads have exited, applications 438 * get better performance and avoid the problem path entirely. 439 * 440 * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer 441 * is a short-term lock, even in worst case, since we better be 442 * the only thread of control using the DB_MPOOLFILE structure 443 * to read pages *into* the cache. Wait until we're the only 444 * reference holder and remove the DB_MPOOLFILE structure from 445 * the list, so nobody else can even find it. 446 */ 447 if (dbmfp->ref == 1) { 448 TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); 449 break; 450 } 451 UNLOCKHANDLE(dbmp, dbmp->mutexp); 452 453 (void)__os_sleep(1, 0); 454 } 455 UNLOCKHANDLE(dbmp, dbmp->mutexp); 456 457 /* Complain if pinned blocks never returned. */ 458 if (dbmfp->pinref != 0) 459 __db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned", 460 __memp_fn(dbmfp), (u_long)dbmfp->pinref); 461 462 /* Close the underlying MPOOLFILE. */ 463 (void)__memp_mf_close(dbmp, dbmfp); 464 465 /* Discard any mmap information. */ 466 if (dbmfp->addr != NULL && 467 (ret = __db_unmapfile(dbmfp->addr, dbmfp->len)) != 0) 468 __db_err(dbmp->dbenv, 469 "%s: %s", __memp_fn(dbmfp), strerror(ret)); 470 471 /* Close the file; temporary files may not yet have been created. */ 472 if (dbmfp->fd != -1 && (t_ret = __os_close(dbmfp->fd)) != 0) { 473 __db_err(dbmp->dbenv, 474 "%s: %s", __memp_fn(dbmfp), strerror(t_ret)); 475 if (ret != 0) 476 t_ret = ret; 477 } 478 479 /* Free memory. */ 480 if (dbmfp->mutexp != NULL) { 481 LOCKREGION(dbmp); 482 __db_shalloc_free(dbmp->addr, dbmfp->mutexp); 483 UNLOCKREGION(dbmp); 484 } 485 486 /* Discard the DB_MPOOLFILE structure. */ 487 __os_free(dbmfp, sizeof(DB_MPOOLFILE)); 488 489 return (ret); 490 } 491 492 /* 493 * __memp_mf_close -- 494 * Close down an MPOOLFILE. 495 */ 496 static int 497 __memp_mf_close(dbmp, dbmfp) 498 DB_MPOOL *dbmp; 499 DB_MPOOLFILE *dbmfp; 500 { 501 BH *bhp, *nbhp; 502 MPOOL *mp; 503 MPOOLFILE *mfp; 504 size_t mf_offset; 505 506 mp = dbmp->mp; 507 mfp = dbmfp->mfp; 508 509 LOCKREGION(dbmp); 510 511 /* If more than a single reference, simply decrement. */ 512 if (mfp->ref > 1) { 513 --mfp->ref; 514 goto ret1; 515 } 516 517 /* 518 * Move any BH's held by the file to the free list. We don't free the 519 * memory itself because we may be discarding the memory pool, and it's 520 * fairly expensive to reintegrate the buffers back into the region for 521 * no purpose. 522 */ 523 mf_offset = R_OFFSET(dbmp, mfp); 524 for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) { 525 nbhp = SH_TAILQ_NEXT(bhp, q, __bh); 526 527 #ifdef DEBUG_NO_DIRTY 528 /* Complain if we find any blocks that were left dirty. */ 529 if (F_ISSET(bhp, BH_DIRTY)) 530 __db_err(dbmp->dbenv, 531 "%s: close: pgno %lu left dirty; ref %lu", 532 __memp_fn(dbmfp), 533 (u_long)bhp->pgno, (u_long)bhp->ref); 534 #endif 535 536 if (bhp->mf_offset == mf_offset) { 537 if (F_ISSET(bhp, BH_DIRTY)) { 538 ++mp->stat.st_page_clean; 539 --mp->stat.st_page_dirty; 540 } 541 __memp_bhfree(dbmp, mfp, bhp, 0); 542 SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh); 543 } 544 } 545 546 /* Delete from the list of MPOOLFILEs. */ 547 SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); 548 549 /* Free the space. */ 550 if (mfp->path_off != 0) 551 __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->path_off)); 552 if (mfp->fileid_off != 0) 553 __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->fileid_off)); 554 if (mfp->pgcookie_off != 0) 555 __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->pgcookie_off)); 556 __db_shalloc_free(dbmp->addr, mfp); 557 558 ret1: UNLOCKREGION(dbmp); 559 return (0); 560 } 561