1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 #include "config.h" 8 9 #ifndef lint 10 static const char sccsid[] = "@(#)log_put.c 10.44 (Sleepycat) 11/3/98"; 11 #endif /* not lint */ 12 13 #ifndef NO_SYSTEM_INCLUDES 14 #include <sys/types.h> 15 16 #include <errno.h> 17 #include <stdio.h> 18 #include <string.h> 19 #include <time.h> 20 #include <unistd.h> 21 #endif 22 23 #include "db_int.h" 24 #include "shqueue.h" 25 #include "db_page.h" 26 #include "log.h" 27 #include "hash.h" 28 #include "clib_ext.h" 29 #include "common_ext.h" 30 31 static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t)); 32 static int __log_flush __P((DB_LOG *, const DB_LSN *)); 33 static int __log_newfd __P((DB_LOG *)); 34 static int __log_putr __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t)); 35 static int __log_write __P((DB_LOG *, void *, u_int32_t)); 36 37 /* 38 * log_put -- 39 * Write a log record. 40 */ 41 int 42 log_put(dblp, lsn, dbt, flags) 43 DB_LOG *dblp; 44 DB_LSN *lsn; 45 const DBT *dbt; 46 u_int32_t flags; 47 { 48 int ret; 49 50 LOG_PANIC_CHECK(dblp); 51 52 /* Validate arguments. */ 53 if (flags != 0 && flags != DB_CHECKPOINT && 54 flags != DB_CURLSN && flags != DB_FLUSH) 55 return (__db_ferr(dblp->dbenv, "log_put", 0)); 56 57 LOCK_LOGREGION(dblp); 58 ret = __log_put(dblp, lsn, dbt, flags); 59 UNLOCK_LOGREGION(dblp); 60 return (ret); 61 } 62 63 /* 64 * __log_put -- 65 * Write a log record; internal version. 66 * 67 * PUBLIC: int __log_put __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t)); 68 */ 69 int 70 __log_put(dblp, lsn, dbt, flags) 71 DB_LOG *dblp; 72 DB_LSN *lsn; 73 const DBT *dbt; 74 u_int32_t flags; 75 { 76 DBT fid_dbt, t; 77 DB_LSN r_unused; 78 FNAME *fnp; 79 LOG *lp; 80 u_int32_t lastoff; 81 int ret; 82 83 lp = dblp->lp; 84 85 /* 86 * If the application just wants to know where we are, fill in 87 * the information. Currently used by the transaction manager 88 * to avoid writing TXN_begin records. 89 */ 90 if (flags == DB_CURLSN) { 91 lsn->file = lp->lsn.file; 92 lsn->offset = lp->lsn.offset; 93 return (0); 94 } 95 96 /* If this information won't fit in the file, swap files. */ 97 if (lp->lsn.offset + sizeof(HDR) + dbt->size > lp->persist.lg_max) { 98 if (sizeof(HDR) + 99 sizeof(LOGP) + dbt->size > lp->persist.lg_max) { 100 __db_err(dblp->dbenv, 101 "log_put: record larger than maximum file size"); 102 return (EINVAL); 103 } 104 105 /* Flush the log. */ 106 if ((ret = __log_flush(dblp, NULL)) != 0) 107 return (ret); 108 109 /* 110 * Save the last known offset from the previous file, we'll 111 * need it to initialize the persistent header information. 112 */ 113 lastoff = lp->lsn.offset; 114 115 /* Point the current LSN to the new file. */ 116 ++lp->lsn.file; 117 lp->lsn.offset = 0; 118 119 /* Reset the file write offset. */ 120 lp->w_off = 0; 121 } else 122 lastoff = 0; 123 124 /* Initialize the LSN information returned to the user. */ 125 lsn->file = lp->lsn.file; 126 lsn->offset = lp->lsn.offset; 127 128 /* 129 * Insert persistent information as the first record in every file. 130 * Note that the previous length is wrong for the very first record 131 * of the log, but that's okay, we check for it during retrieval. 132 */ 133 if (lp->lsn.offset == 0) { 134 t.data = &lp->persist; 135 t.size = sizeof(LOGP); 136 if ((ret = __log_putr(dblp, lsn, 137 &t, lastoff == 0 ? 0 : lastoff - lp->len)) != 0) 138 return (ret); 139 140 /* Update the LSN information returned to the user. */ 141 lsn->file = lp->lsn.file; 142 lsn->offset = lp->lsn.offset; 143 } 144 145 /* Write the application's log record. */ 146 if ((ret = __log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len)) != 0) 147 return (ret); 148 149 /* 150 * On a checkpoint, we: 151 * Put out the checkpoint record (above). 152 * Save the LSN of the checkpoint in the shared region. 153 * Append the set of file name information into the log. 154 */ 155 if (flags == DB_CHECKPOINT) { 156 lp->chkpt_lsn = *lsn; 157 158 for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); 159 fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { 160 if (fnp->ref == 0) /* Entry not in use. */ 161 continue; 162 memset(&t, 0, sizeof(t)); 163 t.data = R_ADDR(dblp, fnp->name_off); 164 t.size = strlen(t.data) + 1; 165 memset(&fid_dbt, 0, sizeof(fid_dbt)); 166 fid_dbt.data = fnp->ufid; 167 fid_dbt.size = DB_FILE_ID_LEN; 168 if ((ret = __log_register_log(dblp, NULL, &r_unused, 0, 169 LOG_CHECKPOINT, &t, &fid_dbt, fnp->id, fnp->s_type)) 170 != 0) 171 return (ret); 172 } 173 } 174 175 /* 176 * On a checkpoint or when flush is requested, we: 177 * Flush the current buffer contents to disk. 178 * Sync the log to disk. 179 */ 180 if (flags == DB_FLUSH || flags == DB_CHECKPOINT) 181 if ((ret = __log_flush(dblp, NULL)) != 0) 182 return (ret); 183 184 /* 185 * On a checkpoint, we: 186 * Save the time the checkpoint was written. 187 * Reset the bytes written since the last checkpoint. 188 */ 189 if (flags == DB_CHECKPOINT) { 190 (void)time(&lp->chkpt); 191 lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0; 192 } 193 return (0); 194 } 195 196 /* 197 * __log_putr -- 198 * Actually put a record into the log. 199 */ 200 static int 201 __log_putr(dblp, lsn, dbt, prev) 202 DB_LOG *dblp; 203 DB_LSN *lsn; 204 const DBT *dbt; 205 u_int32_t prev; 206 { 207 HDR hdr; 208 LOG *lp; 209 int ret; 210 211 lp = dblp->lp; 212 213 /* 214 * Initialize the header. If we just switched files, lsn.offset will 215 * be 0, and what we really want is the offset of the previous record 216 * in the previous file. Fortunately, prev holds the value we want. 217 */ 218 hdr.prev = prev; 219 hdr.len = sizeof(HDR) + dbt->size; 220 hdr.cksum = __ham_func4(dbt->data, dbt->size); 221 222 if ((ret = __log_fill(dblp, lsn, &hdr, sizeof(HDR))) != 0) 223 return (ret); 224 lp->len = sizeof(HDR); 225 lp->lsn.offset += sizeof(HDR); 226 227 if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0) 228 return (ret); 229 lp->len += dbt->size; 230 lp->lsn.offset += dbt->size; 231 return (0); 232 } 233 234 /* 235 * log_flush -- 236 * Write all records less than or equal to the specified LSN. 237 */ 238 int 239 log_flush(dblp, lsn) 240 DB_LOG *dblp; 241 const DB_LSN *lsn; 242 { 243 int ret; 244 245 LOG_PANIC_CHECK(dblp); 246 247 LOCK_LOGREGION(dblp); 248 ret = __log_flush(dblp, lsn); 249 UNLOCK_LOGREGION(dblp); 250 return (ret); 251 } 252 253 /* 254 * __log_flush -- 255 * Write all records less than or equal to the specified LSN; internal 256 * version. 257 */ 258 static int 259 __log_flush(dblp, lsn) 260 DB_LOG *dblp; 261 const DB_LSN *lsn; 262 { 263 DB_LSN t_lsn; 264 LOG *lp; 265 int current, ret; 266 267 ret = 0; 268 lp = dblp->lp; 269 270 /* 271 * If no LSN specified, flush the entire log by setting the flush LSN 272 * to the last LSN written in the log. Otherwise, check that the LSN 273 * isn't a non-existent record for the log. 274 */ 275 if (lsn == NULL) { 276 t_lsn.file = lp->lsn.file; 277 t_lsn.offset = lp->lsn.offset - lp->len; 278 lsn = &t_lsn; 279 } else 280 if (lsn->file > lp->lsn.file || 281 (lsn->file == lp->lsn.file && 282 lsn->offset > lp->lsn.offset - lp->len)) { 283 __db_err(dblp->dbenv, 284 "log_flush: LSN past current end-of-log"); 285 return (EINVAL); 286 } 287 288 /* 289 * If the LSN is less than the last-sync'd LSN, we're done. Note, 290 * the last-sync LSN saved in s_lsn is the LSN of the first byte 291 * we absolutely know has been written to disk, so the test is <=. 292 */ 293 if (lsn->file < lp->s_lsn.file || 294 (lsn->file == lp->s_lsn.file && lsn->offset <= lp->s_lsn.offset)) 295 return (0); 296 297 /* 298 * We may need to write the current buffer. We have to write the 299 * current buffer if the flush LSN is greater than or equal to the 300 * buffer's starting LSN. 301 */ 302 current = 0; 303 if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) { 304 if ((ret = __log_write(dblp, lp->buf, lp->b_off)) != 0) 305 return (ret); 306 307 lp->b_off = 0; 308 current = 1; 309 } 310 311 /* 312 * It's possible that this thread may never have written to this log 313 * file. Acquire a file descriptor if we don't already have one. 314 */ 315 if (dblp->lfname != dblp->lp->lsn.file) 316 if ((ret = __log_newfd(dblp)) != 0) 317 return (ret); 318 319 /* Sync all writes to disk. */ 320 if ((ret = __os_fsync(dblp->lfd)) != 0) { 321 __db_panic(dblp->dbenv, ret); 322 return (ret); 323 } 324 ++lp->stat.st_scount; 325 326 /* 327 * Set the last-synced LSN, using the LSN of the current buffer. If 328 * the current buffer was flushed, we know the LSN of the first byte 329 * of the buffer is on disk, otherwise, we only know that the LSN of 330 * the record before the one beginning the current buffer is on disk. 331 * 332 * XXX 333 * Check to make sure that the saved lsn isn't 0 before we go making 334 * this change. If DB_CHECKPOINT was called before we actually wrote 335 * something, you can end up here without ever having written anything 336 * to a log file, and decrementing either s_lsn.file or s_lsn.offset 337 * will cause much sadness later on. 338 */ 339 lp->s_lsn = lp->f_lsn; 340 if (!current && lp->s_lsn.file != 0) 341 if (lp->s_lsn.offset == 0) { 342 --lp->s_lsn.file; 343 lp->s_lsn.offset = lp->persist.lg_max; 344 } else 345 --lp->s_lsn.offset; 346 347 return (0); 348 } 349 350 /* 351 * __log_fill -- 352 * Write information into the log. 353 */ 354 static int 355 __log_fill(dblp, lsn, addr, len) 356 DB_LOG *dblp; 357 DB_LSN *lsn; 358 void *addr; 359 u_int32_t len; 360 { 361 LOG *lp; 362 u_int32_t nrec; 363 size_t nw, remain; 364 int ret; 365 366 /* Copy out the data. */ 367 for (lp = dblp->lp; len > 0;) { 368 /* 369 * If we're beginning a new buffer, note the user LSN to which 370 * the first byte of the buffer belongs. We have to know this 371 * when flushing the buffer so that we know if the in-memory 372 * buffer needs to be flushed. 373 */ 374 if (lp->b_off == 0) 375 lp->f_lsn = *lsn; 376 377 /* 378 * If we're on a buffer boundary and the data is big enough, 379 * copy as many records as we can directly from the data. 380 */ 381 if (lp->b_off == 0 && len >= sizeof(lp->buf)) { 382 nrec = len / sizeof(lp->buf); 383 if ((ret = __log_write(dblp, 384 addr, nrec * sizeof(lp->buf))) != 0) 385 return (ret); 386 addr = (u_int8_t *)addr + nrec * sizeof(lp->buf); 387 len -= nrec * sizeof(lp->buf); 388 continue; 389 } 390 391 /* Figure out how many bytes we can copy this time. */ 392 remain = sizeof(lp->buf) - lp->b_off; 393 nw = remain > len ? len : remain; 394 memcpy(lp->buf + lp->b_off, addr, nw); 395 addr = (u_int8_t *)addr + nw; 396 len -= nw; 397 lp->b_off += nw; 398 399 /* If we fill the buffer, flush it. */ 400 if (lp->b_off == sizeof(lp->buf)) { 401 if ((ret = 402 __log_write(dblp, lp->buf, sizeof(lp->buf))) != 0) 403 return (ret); 404 lp->b_off = 0; 405 } 406 } 407 return (0); 408 } 409 410 /* 411 * __log_write -- 412 * Write the log buffer to disk. 413 */ 414 static int 415 __log_write(dblp, addr, len) 416 DB_LOG *dblp; 417 void *addr; 418 u_int32_t len; 419 { 420 LOG *lp; 421 ssize_t nw; 422 int ret; 423 424 /* 425 * If we haven't opened the log file yet or the current one 426 * has changed, acquire a new log file. 427 */ 428 lp = dblp->lp; 429 if (dblp->lfd == -1 || dblp->lfname != lp->lsn.file) 430 if ((ret = __log_newfd(dblp)) != 0) 431 return (ret); 432 433 /* 434 * Seek to the offset in the file (someone may have written it 435 * since we last did). 436 */ 437 if ((ret = __os_seek(dblp->lfd, 0, 0, lp->w_off, 0, SEEK_SET)) != 0 || 438 (ret = __os_write(dblp->lfd, addr, len, &nw)) != 0) { 439 __db_panic(dblp->dbenv, ret); 440 return (ret); 441 } 442 if (nw != (int32_t)len) 443 return (EIO); 444 445 /* Reset the buffer offset and update the seek offset. */ 446 lp->w_off += len; 447 448 /* Update written statistics. */ 449 if ((lp->stat.st_w_bytes += len) >= MEGABYTE) { 450 lp->stat.st_w_bytes -= MEGABYTE; 451 ++lp->stat.st_w_mbytes; 452 } 453 if ((lp->stat.st_wc_bytes += len) >= MEGABYTE) { 454 lp->stat.st_wc_bytes -= MEGABYTE; 455 ++lp->stat.st_wc_mbytes; 456 } 457 ++lp->stat.st_wcount; 458 459 return (0); 460 } 461 462 /* 463 * log_file -- 464 * Map a DB_LSN to a file name. 465 */ 466 int 467 log_file(dblp, lsn, namep, len) 468 DB_LOG *dblp; 469 const DB_LSN *lsn; 470 char *namep; 471 size_t len; 472 { 473 int ret; 474 char *name; 475 476 LOG_PANIC_CHECK(dblp); 477 478 LOCK_LOGREGION(dblp); 479 ret = __log_name(dblp, lsn->file, &name, NULL, 0); 480 UNLOCK_LOGREGION(dblp); 481 if (ret != 0) 482 return (ret); 483 484 /* Check to make sure there's enough room and copy the name. */ 485 if (len < strlen(name) + 1) { 486 *namep = '\0'; 487 return (ENOMEM); 488 } 489 (void)strcpy(namep, name); 490 __os_freestr(name); 491 492 return (0); 493 } 494 495 /* 496 * __log_newfd -- 497 * Acquire a file descriptor for the current log file. 498 */ 499 static int 500 __log_newfd(dblp) 501 DB_LOG *dblp; 502 { 503 int ret; 504 char *name; 505 506 /* Close any previous file descriptor. */ 507 if (dblp->lfd != -1) { 508 (void)__os_close(dblp->lfd); 509 dblp->lfd = -1; 510 } 511 512 /* Get the path of the new file and open it. */ 513 dblp->lfname = dblp->lp->lsn.file; 514 if ((ret = __log_name(dblp, 515 dblp->lfname, &name, &dblp->lfd, DB_CREATE | DB_SEQUENTIAL)) != 0) 516 __db_err(dblp->dbenv, "log_put: %s: %s", name, strerror(ret)); 517 518 __os_freestr(name); 519 return (ret); 520 } 521 522 /* 523 * __log_name -- 524 * Return the log name for a particular file, and optionally open it. 525 * 526 * PUBLIC: int __log_name __P((DB_LOG *, u_int32_t, char **, int *, u_int32_t)); 527 */ 528 int 529 __log_name(dblp, filenumber, namep, fdp, flags) 530 DB_LOG *dblp; 531 u_int32_t filenumber, flags; 532 char **namep; 533 int *fdp; 534 { 535 int ret; 536 char *oname; 537 char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20]; 538 539 /* 540 * !!! 541 * The semantics of this routine are bizarre. 542 * 543 * The reason for all of this is that we need a place where we can 544 * intercept requests for log files, and, if appropriate, check for 545 * both the old-style and new-style log file names. The trick is 546 * that all callers of this routine that are opening the log file 547 * read-only want to use an old-style file name if they can't find 548 * a match using a new-style name. The only down-side is that some 549 * callers may check for the old-style when they really don't need 550 * to, but that shouldn't mess up anything, and we only check for 551 * the old-style name when we've already failed to find a new-style 552 * one. 553 * 554 * Create a new-style file name, and if we're not going to open the 555 * file, return regardless. 556 */ 557 (void)snprintf(new, sizeof(new), LFNAME, filenumber); 558 if ((ret = __db_appname(dblp->dbenv, 559 DB_APP_LOG, dblp->dir, new, 0, NULL, namep)) != 0 || fdp == NULL) 560 return (ret); 561 562 /* Open the new-style file -- if we succeed, we're done. */ 563 if ((ret = __db_open(*namep, 564 flags, flags, dblp->lp->persist.mode, fdp)) == 0) 565 return (0); 566 567 /* 568 * The open failed... if the DB_RDONLY flag isn't set, we're done, 569 * the caller isn't interested in old-style files. 570 */ 571 if (!LF_ISSET(DB_RDONLY)) 572 return (ret); 573 574 /* Create an old-style file name. */ 575 (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber); 576 if ((ret = __db_appname(dblp->dbenv, 577 DB_APP_LOG, dblp->dir, old, 0, NULL, &oname)) != 0) 578 goto err; 579 580 /* 581 * Open the old-style file -- if we succeed, we're done. Free the 582 * space allocated for the new-style name and return the old-style 583 * name to the caller. 584 */ 585 if ((ret = __db_open(oname, 586 flags, flags, dblp->lp->persist.mode, fdp)) == 0) { 587 __os_freestr(*namep); 588 *namep = oname; 589 return (0); 590 } 591 592 /* 593 * Couldn't find either style of name -- return the new-style name 594 * for the caller's error message. If it's an old-style name that's 595 * actually missing we're going to confuse the user with the error 596 * message, but that implies that not only were we looking for an 597 * old-style name, but we expected it to exist and we weren't just 598 * looking for any log file. That's not a likely error. 599 */ 600 err: __os_freestr(oname); 601 return (ret); 602 } 603