1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 #include "config.h" 8 9 #ifndef lint 10 static const char sccsid[] = "@(#)mp_bh.c 10.45 (Sleepycat) 11/25/98"; 11 #endif /* not lint */ 12 13 #ifndef NO_SYSTEM_INCLUDES 14 #include <sys/types.h> 15 16 #include <errno.h> 17 #include <string.h> 18 #include <unistd.h> 19 #endif 20 21 #include "db_int.h" 22 #include "shqueue.h" 23 #include "db_shash.h" 24 #include "mp.h" 25 #include "common_ext.h" 26 27 static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); 28 29 /* 30 * __memp_bhwrite -- 31 * Write the page associated with a given bucket header. 32 * 33 * PUBLIC: int __memp_bhwrite 34 * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); 35 */ 36 int 37 __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) 38 DB_MPOOL *dbmp; 39 MPOOLFILE *mfp; 40 BH *bhp; 41 int *restartp, *wrotep; 42 { 43 DB_MPOOLFILE *dbmfp; 44 DB_MPREG *mpreg; 45 int incremented, ret; 46 47 if (restartp != NULL) 48 *restartp = 0; 49 if (wrotep != NULL) 50 *wrotep = 0; 51 incremented = 0; 52 53 /* 54 * Walk the process' DB_MPOOLFILE list and find a file descriptor for 55 * the file. We also check that the descriptor is open for writing. 56 * If we find a descriptor on the file that's not open for writing, we 57 * try and upgrade it to make it writeable. If that fails, we're done. 58 */ 59 LOCKHANDLE(dbmp, dbmp->mutexp); 60 for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); 61 dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) 62 if (dbmfp->mfp == mfp) { 63 if (F_ISSET(dbmfp, MP_READONLY) && 64 __memp_upgrade(dbmp, dbmfp, mfp)) { 65 UNLOCKHANDLE(dbmp, dbmp->mutexp); 66 return (0); 67 } 68 69 /* 70 * Increment the reference count -- see the comment in 71 * memp_fclose(). 72 */ 73 ++dbmfp->ref; 74 incremented = 1; 75 break; 76 } 77 UNLOCKHANDLE(dbmp, dbmp->mutexp); 78 if (dbmfp != NULL) 79 goto found; 80 81 /* 82 * It's not a page from a file we've opened. If the file requires 83 * input/output processing, see if this process has ever registered 84 * information as to how to write this type of file. If not, there's 85 * nothing we can do. 86 */ 87 if (mfp->ftype != 0) { 88 LOCKHANDLE(dbmp, dbmp->mutexp); 89 for (mpreg = LIST_FIRST(&dbmp->dbregq); 90 mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) 91 if (mpreg->ftype == mfp->ftype) 92 break; 93 UNLOCKHANDLE(dbmp, dbmp->mutexp); 94 if (mpreg == NULL) 95 return (0); 96 } 97 98 /* 99 * Try and open the file, attaching to the underlying shared area. 100 * 101 * XXX 102 * Don't try to attach to temporary files. There are two problems in 103 * trying to do that. First, if we have different privileges than the 104 * process that "owns" the temporary file, we might create the backing 105 * disk file such that the owning process couldn't read/write its own 106 * buffers, e.g., memp_trickle() running as root creating a file owned 107 * as root, mode 600. Second, if the temporary file has already been 108 * created, we don't have any way of finding out what its real name is, 109 * and, even if we did, it was already unlinked (so that it won't be 110 * left if the process dies horribly). This decision causes a problem, 111 * however: if the temporary file consumes the entire buffer cache, 112 * and the owner doesn't flush the buffers to disk, we could end up 113 * with resource starvation, and the memp_trickle() thread couldn't do 114 * anything about it. That's a pretty unlikely scenario, though. 115 * 116 * XXX 117 * There's no negative cache, so we may repeatedly try and open files 118 * that we have previously tried (and failed) to open. 119 * 120 * Ignore any error, assume it's a permissions problem. 121 */ 122 if (F_ISSET(mfp, MP_TEMP)) 123 return (0); 124 125 if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp, mfp->path_off), 126 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) 127 return (0); 128 129 found: ret = __memp_pgwrite(dbmfp, bhp, restartp, wrotep); 130 131 if (incremented) { 132 LOCKHANDLE(dbmp, dbmp->mutexp); 133 --dbmfp->ref; 134 UNLOCKHANDLE(dbmp, dbmp->mutexp); 135 } 136 137 return (ret); 138 } 139 140 /* 141 * __memp_pgread -- 142 * Read a page from a file. 143 * 144 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); 145 */ 146 int 147 __memp_pgread(dbmfp, bhp, can_create) 148 DB_MPOOLFILE *dbmfp; 149 BH *bhp; 150 int can_create; 151 { 152 DB_IO db_io; 153 DB_MPOOL *dbmp; 154 MPOOLFILE *mfp; 155 size_t len, pagesize; 156 ssize_t nr; 157 int created, ret; 158 159 dbmp = dbmfp->dbmp; 160 mfp = dbmfp->mfp; 161 pagesize = mfp->stat.st_pagesize; 162 163 F_SET(bhp, BH_LOCKED | BH_TRASH); 164 LOCKBUFFER(dbmp, bhp); 165 UNLOCKREGION(dbmp); 166 167 /* 168 * Temporary files may not yet have been created. We don't create 169 * them now, we create them when the pages have to be flushed. 170 */ 171 nr = 0; 172 if (dbmfp->fd == -1) 173 ret = 0; 174 else { 175 /* 176 * Ignore read errors if we have permission to create the page. 177 * Assume that the page doesn't exist, and that we'll create it 178 * when we write it out. 179 */ 180 db_io.fd_io = dbmfp->fd; 181 db_io.fd_lock = dbmp->reginfo.fd; 182 db_io.mutexp = 183 F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL; 184 db_io.pagesize = db_io.bytes = pagesize; 185 db_io.pgno = bhp->pgno; 186 db_io.buf = bhp->buf; 187 188 ret = __os_io(&db_io, DB_IO_READ, &nr); 189 } 190 191 created = 0; 192 if (nr < (ssize_t)pagesize) 193 if (can_create) 194 created = 1; 195 else { 196 /* If we had a short read, ret may be 0. */ 197 if (ret == 0) 198 ret = EIO; 199 __db_err(dbmp->dbenv, 200 "%s: page %lu doesn't exist, create flag not set", 201 __memp_fn(dbmfp), (u_long)bhp->pgno); 202 goto err; 203 } 204 205 /* 206 * Clear any bytes we didn't read that need to be cleared. If we're 207 * running in diagnostic mode, smash any bytes on the page that are 208 * unknown quantities for the caller. 209 */ 210 if (nr != (ssize_t)pagesize) { 211 len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; 212 if (nr < (ssize_t)len) 213 memset(bhp->buf + nr, 0, len - nr); 214 #ifdef DIAGNOSTIC 215 if (nr > (ssize_t)len) 216 len = nr; 217 if (len < pagesize) 218 memset(bhp->buf + len, 0xdb, pagesize - len); 219 #endif 220 } 221 222 /* Call any pgin function. */ 223 ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); 224 225 /* Unlock the buffer and reacquire the region lock. */ 226 err: UNLOCKBUFFER(dbmp, bhp); 227 LOCKREGION(dbmp); 228 229 /* 230 * If no errors occurred, the data is now valid, clear the BH_TRASH 231 * flag; regardless, clear the lock bit and let other threads proceed. 232 */ 233 F_CLR(bhp, BH_LOCKED); 234 if (ret == 0) { 235 F_CLR(bhp, BH_TRASH); 236 237 /* Update the statistics. */ 238 if (created) { 239 ++dbmp->mp->stat.st_page_create; 240 ++mfp->stat.st_page_create; 241 } else { 242 ++dbmp->mp->stat.st_page_in; 243 ++mfp->stat.st_page_in; 244 } 245 } 246 247 return (ret); 248 } 249 250 /* 251 * __memp_pgwrite -- 252 * Write a page to a file. 253 * 254 * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *)); 255 */ 256 int 257 __memp_pgwrite(dbmfp, bhp, restartp, wrotep) 258 DB_MPOOLFILE *dbmfp; 259 BH *bhp; 260 int *restartp, *wrotep; 261 { 262 DB_ENV *dbenv; 263 DB_IO db_io; 264 DB_LOG *lg_info; 265 DB_LSN lsn; 266 DB_MPOOL *dbmp; 267 MPOOL *mp; 268 MPOOLFILE *mfp; 269 ssize_t nw; 270 int callpgin, dosync, ret, syncfail; 271 const char *fail; 272 273 dbmp = dbmfp->dbmp; 274 dbenv = dbmp->dbenv; 275 mp = dbmp->mp; 276 mfp = dbmfp->mfp; 277 278 if (restartp != NULL) 279 *restartp = 0; 280 if (wrotep != NULL) 281 *wrotep = 0; 282 callpgin = 0; 283 284 /* 285 * Check the dirty bit -- this buffer may have been written since we 286 * decided to write it. 287 */ 288 if (!F_ISSET(bhp, BH_DIRTY)) { 289 if (wrotep != NULL) 290 *wrotep = 1; 291 return (0); 292 } 293 294 LOCKBUFFER(dbmp, bhp); 295 296 /* 297 * If there were two writers, we may have just been waiting while the 298 * other writer completed I/O on this buffer. Check the dirty bit one 299 * more time. 300 */ 301 if (!F_ISSET(bhp, BH_DIRTY)) { 302 UNLOCKBUFFER(dbmp, bhp); 303 304 if (wrotep != NULL) 305 *wrotep = 1; 306 return (0); 307 } 308 309 F_SET(bhp, BH_LOCKED); 310 UNLOCKREGION(dbmp); 311 312 if (restartp != NULL) 313 *restartp = 1; 314 315 /* Copy the LSN off the page if we're going to need it. */ 316 lg_info = dbenv->lg_info; 317 if (lg_info != NULL || F_ISSET(bhp, BH_WRITE)) 318 memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); 319 320 /* Ensure the appropriate log records are on disk. */ 321 if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0) 322 goto err; 323 324 /* 325 * Call any pgout function. We set the callpgin flag so that we flag 326 * that the contents of the buffer will need to be passed through pgin 327 * before they are reused. 328 */ 329 if (mfp->ftype == 0) 330 ret = 0; 331 else { 332 callpgin = 1; 333 if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) 334 goto err; 335 } 336 337 /* Temporary files may not yet have been created. */ 338 if (dbmfp->fd == -1) { 339 LOCKHANDLE(dbmp, dbmfp->mutexp); 340 if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, 341 DB_APP_TMP, NULL, NULL, DB_CREATE | DB_EXCL | DB_TEMPORARY, 342 &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) { 343 UNLOCKHANDLE(dbmp, dbmfp->mutexp); 344 __db_err(dbenv, 345 "unable to create temporary backing file"); 346 goto err; 347 } 348 UNLOCKHANDLE(dbmp, dbmfp->mutexp); 349 } 350 351 /* Write the page. */ 352 db_io.fd_io = dbmfp->fd; 353 db_io.fd_lock = dbmp->reginfo.fd; 354 db_io.mutexp = F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL; 355 db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; 356 db_io.pgno = bhp->pgno; 357 db_io.buf = bhp->buf; 358 if ((ret = __os_io(&db_io, DB_IO_WRITE, &nw)) != 0) { 359 __db_panic(dbenv, ret); 360 fail = "write"; 361 goto syserr; 362 } 363 if (nw != (ssize_t)mfp->stat.st_pagesize) { 364 ret = EIO; 365 fail = "write"; 366 goto syserr; 367 } 368 369 if (wrotep != NULL) 370 *wrotep = 1; 371 372 /* Unlock the buffer and reacquire the region lock. */ 373 UNLOCKBUFFER(dbmp, bhp); 374 LOCKREGION(dbmp); 375 376 /* 377 * Clean up the flags based on a successful write. 378 * 379 * If we rewrote the page, it will need processing by the pgin 380 * routine before reuse. 381 */ 382 if (callpgin) 383 F_SET(bhp, BH_CALLPGIN); 384 F_CLR(bhp, BH_DIRTY | BH_LOCKED); 385 386 /* 387 * If we write a buffer for which a checkpoint is waiting, update 388 * the count of pending buffers (both in the mpool as a whole and 389 * for this file). If the count for this file goes to zero, set a 390 * flag so we flush the writes. 391 */ 392 if (F_ISSET(bhp, BH_WRITE)) { 393 F_CLR(bhp, BH_WRITE); 394 395 --mp->lsn_cnt; 396 dosync = --mfp->lsn_cnt == 0 ? 1 : 0; 397 } else 398 dosync = 0; 399 400 /* Update the page clean/dirty statistics. */ 401 ++mp->stat.st_page_clean; 402 --mp->stat.st_page_dirty; 403 404 /* Update I/O statistics. */ 405 ++mp->stat.st_page_out; 406 ++mfp->stat.st_page_out; 407 408 /* 409 * Do the sync after everything else has been updated, so any incoming 410 * checkpoint doesn't see inconsistent information. 411 * 412 * XXX: 413 * Don't lock the region around the sync, fsync(2) has no atomicity 414 * issues. 415 * 416 * XXX: 417 * We ignore errors from the sync -- it makes no sense to return an 418 * error to the calling process, so set a flag causing the checkpoint 419 * to be retried later. There is a possibility, of course, that a 420 * subsequent checkpoint was started and that we're going to force it 421 * to fail. That should be unlikely, and fixing it would be difficult. 422 */ 423 if (dosync) { 424 UNLOCKREGION(dbmp); 425 syncfail = __os_fsync(dbmfp->fd) != 0; 426 LOCKREGION(dbmp); 427 if (syncfail) 428 F_SET(mp, MP_LSN_RETRY); 429 } 430 431 return (0); 432 433 syserr: __db_err(dbenv, "%s: %s failed for page %lu", 434 __memp_fn(dbmfp), fail, (u_long)bhp->pgno); 435 436 err: /* Unlock the buffer and reacquire the region lock. */ 437 UNLOCKBUFFER(dbmp, bhp); 438 LOCKREGION(dbmp); 439 440 /* 441 * Clean up the flags based on a failure. 442 * 443 * The page remains dirty but we remove our lock. If we rewrote the 444 * page, it will need processing by the pgin routine before reuse. 445 */ 446 if (callpgin) 447 F_SET(bhp, BH_CALLPGIN); 448 F_CLR(bhp, BH_LOCKED); 449 450 return (ret); 451 } 452 453 /* 454 * __memp_pg -- 455 * Call the pgin/pgout routine. 456 * 457 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); 458 */ 459 int 460 __memp_pg(dbmfp, bhp, is_pgin) 461 DB_MPOOLFILE *dbmfp; 462 BH *bhp; 463 int is_pgin; 464 { 465 DBT dbt, *dbtp; 466 DB_MPOOL *dbmp; 467 DB_MPREG *mpreg; 468 MPOOLFILE *mfp; 469 int ftype, ret; 470 471 dbmp = dbmfp->dbmp; 472 mfp = dbmfp->mfp; 473 474 LOCKHANDLE(dbmp, dbmp->mutexp); 475 476 ftype = mfp->ftype; 477 for (mpreg = LIST_FIRST(&dbmp->dbregq); 478 mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { 479 if (ftype != mpreg->ftype) 480 continue; 481 if (mfp->pgcookie_len == 0) 482 dbtp = NULL; 483 else { 484 dbt.size = mfp->pgcookie_len; 485 dbt.data = R_ADDR(dbmp, mfp->pgcookie_off); 486 dbtp = &dbt; 487 } 488 UNLOCKHANDLE(dbmp, dbmp->mutexp); 489 490 if (is_pgin) { 491 if (mpreg->pgin != NULL && (ret = 492 mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0) 493 goto err; 494 } else 495 if (mpreg->pgout != NULL && (ret = 496 mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0) 497 goto err; 498 break; 499 } 500 501 if (mpreg == NULL) 502 UNLOCKHANDLE(dbmp, dbmp->mutexp); 503 504 return (0); 505 506 err: UNLOCKHANDLE(dbmp, dbmp->mutexp); 507 __db_err(dbmp->dbenv, "%s: %s failed for page %lu", 508 __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); 509 return (ret); 510 } 511 512 /* 513 * __memp_bhfree -- 514 * Free a bucket header and its referenced data. 515 * 516 * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int)); 517 */ 518 void 519 __memp_bhfree(dbmp, mfp, bhp, free_mem) 520 DB_MPOOL *dbmp; 521 MPOOLFILE *mfp; 522 BH *bhp; 523 int free_mem; 524 { 525 size_t off; 526 527 /* Delete the buffer header from the hash bucket queue. */ 528 off = BUCKET(dbmp->mp, R_OFFSET(dbmp, mfp), bhp->pgno); 529 SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, hq, __bh); 530 531 /* Delete the buffer header from the LRU queue. */ 532 SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh); 533 534 /* 535 * If we're not reusing it immediately, free the buffer header 536 * and data for real. 537 */ 538 if (free_mem) { 539 __db_shalloc_free(dbmp->addr, bhp); 540 --dbmp->mp->stat.st_page_clean; 541 } 542 } 543 544 /* 545 * __memp_upgrade -- 546 * Upgrade a file descriptor from readonly to readwrite. 547 */ 548 static int 549 __memp_upgrade(dbmp, dbmfp, mfp) 550 DB_MPOOL *dbmp; 551 DB_MPOOLFILE *dbmfp; 552 MPOOLFILE *mfp; 553 { 554 int fd, ret; 555 char *rpath; 556 557 /* 558 * !!! 559 * We expect the handle to already be locked. 560 */ 561 562 /* Check to see if we've already upgraded. */ 563 if (F_ISSET(dbmfp, MP_UPGRADE)) 564 return (0); 565 566 /* Check to see if we've already failed. */ 567 if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) 568 return (1); 569 570 /* 571 * Calculate the real name for this file and try to open it read/write. 572 * We know we have a valid pathname for the file because it's the only 573 * way we could have gotten a file descriptor of any kind. 574 */ 575 if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA, 576 NULL, R_ADDR(dbmp, mfp->path_off), 0, NULL, &rpath)) != 0) 577 return (ret); 578 if (__db_open(rpath, 0, 0, 0, &fd) != 0) { 579 F_SET(dbmfp, MP_UPGRADE_FAIL); 580 ret = 1; 581 } else { 582 /* Swap the descriptors and set the upgrade flag. */ 583 (void)__os_close(dbmfp->fd); 584 dbmfp->fd = fd; 585 F_SET(dbmfp, MP_UPGRADE); 586 ret = 0; 587 } 588 __os_freestr(rpath); 589 return (ret); 590 } 591