1 /* 2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 ** 2001 September 15 8 ** 9 ** The author disclaims copyright to this source code. In place of 10 ** a legal notice, here is a blessing: 11 ** 12 ** May you do good and not evil. 13 ** May you find forgiveness for yourself and forgive others. 14 ** May you share freely, never taking more than you give. 15 ** 16 ************************************************************************* 17 ** This is the implementation of the page cache subsystem or "pager". 18 ** 19 ** The pager is used to access a database disk file. It implements 20 ** atomic commit and rollback through the use of a journal file that 21 ** is separate from the database file. The pager also implements file 22 ** locking to prevent two processes from writing the same database 23 ** file simultaneously, or one process from reading the database while 24 ** another is writing. 25 ** 26 ** @(#) $Id: pager.c,v 1.101 2004/02/25 02:20:41 drh Exp $ 27 */ 28 #include "os.h" /* Must be first to enable large file support */ 29 #include "sqliteInt.h" 30 #include "pager.h" 31 #include <assert.h> 32 #include <string.h> 33 34 /* 35 ** Macros for troubleshooting. Normally turned off 36 */ 37 #if 0 38 static Pager *mainPager = 0; 39 #define SET_PAGER(X) if( mainPager==0 ) mainPager = (X) 40 #define CLR_PAGER(X) if( mainPager==(X) ) mainPager = 0 41 #define TRACE1(X) if( pPager==mainPager ) fprintf(stderr,X) 42 #define TRACE2(X,Y) if( pPager==mainPager ) fprintf(stderr,X,Y) 43 #define TRACE3(X,Y,Z) if( pPager==mainPager ) fprintf(stderr,X,Y,Z) 44 #else 45 #define SET_PAGER(X) 46 #define CLR_PAGER(X) 47 #define TRACE1(X) 48 #define TRACE2(X,Y) 49 #define TRACE3(X,Y,Z) 50 #endif 51 52 53 /* 54 ** The page cache as a whole is always in one of the following 55 ** states: 56 ** 57 ** SQLITE_UNLOCK The page cache is not currently reading or 58 ** writing the database file. 
There is no 59 ** data held in memory. This is the initial 60 ** state. 61 ** 62 ** SQLITE_READLOCK The page cache is reading the database. 63 ** Writing is not permitted. There can be 64 ** multiple readers accessing the same database 65 ** file at the same time. 66 ** 67 ** SQLITE_WRITELOCK The page cache is writing the database. 68 ** Access is exclusive. No other processes or 69 ** threads can be reading or writing while one 70 ** process is writing. 71 ** 72 ** The page cache comes up in SQLITE_UNLOCK. The first time a 73 ** sqlite_page_get() occurs, the state transitions to SQLITE_READLOCK. 74 ** After all pages have been released using sqlite_page_unref(), 75 ** the state transitions back to SQLITE_UNLOCK. The first time 76 ** that sqlite_page_write() is called, the state transitions to 77 ** SQLITE_WRITELOCK. (Note that sqlite_page_write() can only be 78 ** called on an outstanding page which means that the pager must 79 ** be in SQLITE_READLOCK before it transitions to SQLITE_WRITELOCK.) 80 ** The sqlite_page_rollback() and sqlite_page_commit() functions 81 ** transition the state from SQLITE_WRITELOCK back to SQLITE_READLOCK. 82 */ 83 #define SQLITE_UNLOCK 0 84 #define SQLITE_READLOCK 1 85 #define SQLITE_WRITELOCK 2 86 87 88 /* 89 ** Each in-memory image of a page begins with the following header. 90 ** This header is only visible to this pager module. The client 91 ** code that calls pager sees only the data that follows the header. 92 ** 93 ** Client code should call sqlitepager_write() on a page prior to making 94 ** any modifications to that page. The first time sqlitepager_write() 95 ** is called, the original page contents are written into the rollback 96 ** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once 97 ** the journal page has made it onto the disk surface, PgHdr.needSync 98 ** is cleared. 
The modified page cannot be written back into the original 99 ** database file until the journal pages has been synced to disk and the 100 ** PgHdr.needSync has been cleared. 101 ** 102 ** The PgHdr.dirty flag is set when sqlitepager_write() is called and 103 ** is cleared again when the page content is written back to the original 104 ** database file. 105 */ 106 typedef struct PgHdr PgHdr; 107 struct PgHdr { 108 Pager *pPager; /* The pager to which this page belongs */ 109 Pgno pgno; /* The page number for this page */ 110 PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */ 111 int nRef; /* Number of users of this page */ 112 PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */ 113 PgHdr *pNextAll, *pPrevAll; /* A list of all pages */ 114 PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */ 115 u8 inJournal; /* TRUE if has been written to journal */ 116 u8 inCkpt; /* TRUE if written to the checkpoint journal */ 117 u8 dirty; /* TRUE if we need to write back changes */ 118 u8 needSync; /* Sync journal before writing this page */ 119 u8 alwaysRollback; /* Disable dont_rollback() for this page */ 120 PgHdr *pDirty; /* Dirty pages sorted by PgHdr.pgno */ 121 /* SQLITE_PAGE_SIZE bytes of page data follow this header */ 122 /* Pager.nExtra bytes of local data follow the page data */ 123 }; 124 125 126 /* 127 ** A macro used for invoking the codec if there is one 128 */ 129 #ifdef SQLITE_HAS_CODEC 130 # define CODEC(P,D,N,X) if( P->xCodec ){ P->xCodec(P->pCodecArg,D,N,X); } 131 #else 132 # define CODEC(P,D,N,X) 133 #endif 134 135 /* 136 ** Convert a pointer to a PgHdr into a pointer to its data 137 ** and back again. 138 */ 139 #define PGHDR_TO_DATA(P) ((void*)(&(P)[1])) 140 #define DATA_TO_PGHDR(D) (&((PgHdr*)(D))[-1]) 141 #define PGHDR_TO_EXTRA(P) ((void*)&((char*)(&(P)[1]))[SQLITE_PAGE_SIZE]) 142 143 /* 144 ** How big to make the hash table used for locating in-memory pages 145 ** by page number. 
146 */ 147 #define N_PG_HASH 2048 148 149 /* 150 ** Hash a page number 151 */ 152 #define pager_hash(PN) ((PN)&(N_PG_HASH-1)) 153 154 /* 155 ** A open page cache is an instance of the following structure. 156 */ 157 struct Pager { 158 char *zFilename; /* Name of the database file */ 159 char *zJournal; /* Name of the journal file */ 160 char *zDirectory; /* Directory hold database and journal files */ 161 OsFile fd, jfd; /* File descriptors for database and journal */ 162 OsFile cpfd; /* File descriptor for the checkpoint journal */ 163 int dbSize; /* Number of pages in the file */ 164 int origDbSize; /* dbSize before the current change */ 165 int ckptSize; /* Size of database (in pages) at ckpt_begin() */ 166 off_t ckptJSize; /* Size of journal at ckpt_begin() */ 167 int nRec; /* Number of pages written to the journal */ 168 u32 cksumInit; /* Quasi-random value added to every checksum */ 169 int ckptNRec; /* Number of records in the checkpoint journal */ 170 int nExtra; /* Add this many bytes to each in-memory page */ 171 void (*xDestructor)(void*); /* Call this routine when freeing pages */ 172 int nPage; /* Total number of in-memory pages */ 173 int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */ 174 int mxPage; /* Maximum number of pages to hold in cache */ 175 int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */ 176 void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */ 177 void *pCodecArg; /* First argument to xCodec() */ 178 u8 journalOpen; /* True if journal file descriptors is valid */ 179 u8 journalStarted; /* True if header of journal is synced */ 180 u8 useJournal; /* Use a rollback journal on this file */ 181 u8 ckptOpen; /* True if the checkpoint journal is open */ 182 u8 ckptInUse; /* True we are in a checkpoint */ 183 u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/ 184 u8 noSync; /* Do not sync the journal if true */ 185 u8 fullSync; /* Do extra syncs of the journal for robustness */ 186 
u8 state; /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */ 187 u8 errMask; /* One of several kinds of errors */ 188 u8 tempFile; /* zFilename is a temporary file */ 189 u8 readOnly; /* True for a read-only database */ 190 u8 needSync; /* True if an fsync() is needed on the journal */ 191 u8 dirtyFile; /* True if database file has changed in any way */ 192 u8 alwaysRollback; /* Disable dont_rollback() for all pages */ 193 u8 *aInJournal; /* One bit for each page in the database file */ 194 u8 *aInCkpt; /* One bit for each page in the database */ 195 PgHdr *pFirst, *pLast; /* List of free pages */ 196 PgHdr *pFirstSynced; /* First free page with PgHdr.needSync==0 */ 197 PgHdr *pAll; /* List of all pages */ 198 PgHdr *pCkpt; /* List of pages in the checkpoint journal */ 199 PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */ 200 }; 201 202 /* 203 ** These are bits that can be set in Pager.errMask. 204 */ 205 #define PAGER_ERR_FULL 0x01 /* a write() failed */ 206 #define PAGER_ERR_MEM 0x02 /* malloc() failed */ 207 #define PAGER_ERR_LOCK 0x04 /* error in the locking protocol */ 208 #define PAGER_ERR_CORRUPT 0x08 /* database or journal corruption */ 209 #define PAGER_ERR_DISK 0x10 /* general disk I/O error - bad hard drive? */ 210 211 /* 212 ** The journal file contains page records in the following 213 ** format. 214 ** 215 ** Actually, this structure is the complete page record for pager 216 ** formats less than 3. Beginning with format 3, this record is surrounded 217 ** by two checksums. 218 */ 219 typedef struct PageRecord PageRecord; 220 struct PageRecord { 221 Pgno pgno; /* The page number */ 222 char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */ 223 }; 224 225 /* 226 ** Journal files begin with the following magic string. The data 227 ** was obtained from /dev/random. It is used only as a sanity check. 228 ** 229 ** There are three journal formats (so far). 
** The 1st journal format writes
** 32-bit integers in the byte-order of the host machine.  Newer
** formats write integers as big-endian.  All new journals use the
** new format, but we have to be able to read an older journal in order
** to roll back journals created by older versions of the library.
**
** The 3rd journal format (added for 2.8.0) adds additional sanity
** checking information to the journal.  If the power fails while the
** journal is being written, semi-random garbage data might appear in
** the journal file after power is restored.  If an attempt is then made
** to roll the journal back, the database could be corrupted.  The additional
** sanity checking data is an attempt to discover the garbage in the
** journal and ignore it.
**
** The sanity checking information for the 3rd journal format consists
** of a 32-bit checksum on each page of data.  The checksum covers both
** the page number and the SQLITE_PAGE_SIZE bytes of data for the page.
** This cksum is initialized to a 32-bit random value that appears in the
** journal file right after the header.  The random initializer is important,
** because garbage data that appears at the end of a journal is likely
** data that was once in other files that have now been deleted.  If the
** garbage data came from an obsolete journal file, the checksums might
** be correct.  But by initializing the checksum to a random value which
** is different for every journal, we minimize that risk.
253 */ 254 static const unsigned char aJournalMagic1[] = { 255 0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd4, 256 }; 257 static const unsigned char aJournalMagic2[] = { 258 0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd5, 259 }; 260 static const unsigned char aJournalMagic3[] = { 261 0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd6, 262 }; 263 #define JOURNAL_FORMAT_1 1 264 #define JOURNAL_FORMAT_2 2 265 #define JOURNAL_FORMAT_3 3 266 267 /* 268 ** The following integer determines what format to use when creating 269 ** new primary journal files. By default we always use format 3. 270 ** When testing, we can set this value to older journal formats in order to 271 ** make sure that newer versions of the library are able to rollback older 272 ** journal files. 273 ** 274 ** Note that checkpoint journals always use format 2 and omit the header. 275 */ 276 #ifdef SQLITE_TEST 277 int journal_format = 3; 278 #else 279 # define journal_format 3 280 #endif 281 282 /* 283 ** The size of the header and of each page in the journal varies according 284 ** to which journal format is being used. The following macros figure out 285 ** the sizes based on format numbers. 286 */ 287 #define JOURNAL_HDR_SZ(X) \ 288 (sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3)*2*sizeof(u32)) 289 #define JOURNAL_PG_SZ(X) \ 290 (SQLITE_PAGE_SIZE + sizeof(Pgno) + ((X)>=3)*sizeof(u32)) 291 292 /* 293 ** Enable reference count tracking here: 294 */ 295 #ifdef SQLITE_TEST 296 int pager_refinfo_enable = 0; 297 static void pager_refinfo(PgHdr *p){ 298 static int cnt = 0; 299 if( !pager_refinfo_enable ) return; 300 printf( 301 "REFCNT: %4d addr=0x%08x nRef=%d\n", 302 p->pgno, (int)PGHDR_TO_DATA(p), p->nRef 303 ); 304 cnt++; /* Something to set a breakpoint on */ 305 } 306 # define REFINFO(X) pager_refinfo(X) 307 #else 308 # define REFINFO(X) 309 #endif 310 311 /* 312 ** Read a 32-bit integer from the given file descriptor. Store the integer 313 ** that is read in *pRes. 
Return SQLITE_OK if everything worked, or an 314 ** error code is something goes wrong. 315 ** 316 ** If the journal format is 2 or 3, read a big-endian integer. If the 317 ** journal format is 1, read an integer in the native byte-order of the 318 ** host machine. 319 */ 320 static int read32bits(int format, OsFile *fd, u32 *pRes){ 321 u32 res; 322 int rc; 323 rc = sqliteOsRead(fd, &res, sizeof(res)); 324 if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){ 325 unsigned char ac[4]; 326 memcpy(ac, &res, 4); 327 res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3]; 328 } 329 *pRes = res; 330 return rc; 331 } 332 333 /* 334 ** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK 335 ** on success or an error code is something goes wrong. 336 ** 337 ** If the journal format is 2 or 3, write the integer as 4 big-endian 338 ** bytes. If the journal format is 1, write the integer in the native 339 ** byte order. In normal operation, only formats 2 and 3 are used. 340 ** Journal format 1 is only used for testing. 341 */ 342 static int write32bits(OsFile *fd, u32 val){ 343 unsigned char ac[4]; 344 if( journal_format<=1 ){ 345 return sqliteOsWrite(fd, &val, 4); 346 } 347 ac[0] = (val>>24) & 0xff; 348 ac[1] = (val>>16) & 0xff; 349 ac[2] = (val>>8) & 0xff; 350 ac[3] = val & 0xff; 351 return sqliteOsWrite(fd, ac, 4); 352 } 353 354 /* 355 ** Write a 32-bit integer into a page header right before the 356 ** page data. This will overwrite the PgHdr.pDirty pointer. 357 ** 358 ** The integer is big-endian for formats 2 and 3 and native byte order 359 ** for journal format 1. 
360 */ 361 static void store32bits(u32 val, PgHdr *p, int offset){ 362 unsigned char *ac; 363 ac = &((unsigned char*)PGHDR_TO_DATA(p))[offset]; 364 if( journal_format<=1 ){ 365 memcpy(ac, &val, 4); 366 }else{ 367 ac[0] = (val>>24) & 0xff; 368 ac[1] = (val>>16) & 0xff; 369 ac[2] = (val>>8) & 0xff; 370 ac[3] = val & 0xff; 371 } 372 } 373 374 375 /* 376 ** Convert the bits in the pPager->errMask into an approprate 377 ** return code. 378 */ 379 static int pager_errcode(Pager *pPager){ 380 int rc = SQLITE_OK; 381 if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL; 382 if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR; 383 if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL; 384 if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM; 385 if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT; 386 return rc; 387 } 388 389 /* 390 ** Add or remove a page from the list of all pages that are in the 391 ** checkpoint journal. 392 ** 393 ** The Pager keeps a separate list of pages that are currently in 394 ** the checkpoint journal. This helps the sqlitepager_ckpt_commit() 395 ** routine run MUCH faster for the common case where there are many 396 ** pages in memory but only a few are in the checkpoint journal. 
397 */ 398 static void page_add_to_ckpt_list(PgHdr *pPg){ 399 Pager *pPager = pPg->pPager; 400 if( pPg->inCkpt ) return; 401 assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 ); 402 pPg->pPrevCkpt = 0; 403 if( pPager->pCkpt ){ 404 pPager->pCkpt->pPrevCkpt = pPg; 405 } 406 pPg->pNextCkpt = pPager->pCkpt; 407 pPager->pCkpt = pPg; 408 pPg->inCkpt = 1; 409 } 410 static void page_remove_from_ckpt_list(PgHdr *pPg){ 411 if( !pPg->inCkpt ) return; 412 if( pPg->pPrevCkpt ){ 413 assert( pPg->pPrevCkpt->pNextCkpt==pPg ); 414 pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt; 415 }else{ 416 assert( pPg->pPager->pCkpt==pPg ); 417 pPg->pPager->pCkpt = pPg->pNextCkpt; 418 } 419 if( pPg->pNextCkpt ){ 420 assert( pPg->pNextCkpt->pPrevCkpt==pPg ); 421 pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt; 422 } 423 pPg->pNextCkpt = 0; 424 pPg->pPrevCkpt = 0; 425 pPg->inCkpt = 0; 426 } 427 428 /* 429 ** Find a page in the hash table given its page number. Return 430 ** a pointer to the page or NULL if not found. 431 */ 432 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){ 433 PgHdr *p = pPager->aHash[pager_hash(pgno)]; 434 while( p && p->pgno!=pgno ){ 435 p = p->pNextHash; 436 } 437 return p; 438 } 439 440 /* 441 ** Unlock the database and clear the in-memory cache. This routine 442 ** sets the state of the pager back to what it was when it was first 443 ** opened. Any outstanding pages are invalidated and subsequent attempts 444 ** to access those pages will likely result in a coredump. 
445 */ 446 static void pager_reset(Pager *pPager){ 447 PgHdr *pPg, *pNext; 448 for(pPg=pPager->pAll; pPg; pPg=pNext){ 449 pNext = pPg->pNextAll; 450 sqliteFree(pPg); 451 } 452 pPager->pFirst = 0; 453 pPager->pFirstSynced = 0; 454 pPager->pLast = 0; 455 pPager->pAll = 0; 456 memset(pPager->aHash, 0, sizeof(pPager->aHash)); 457 pPager->nPage = 0; 458 if( pPager->state>=SQLITE_WRITELOCK ){ 459 sqlitepager_rollback(pPager); 460 } 461 sqliteOsUnlock(&pPager->fd); 462 pPager->state = SQLITE_UNLOCK; 463 pPager->dbSize = -1; 464 pPager->nRef = 0; 465 assert( pPager->journalOpen==0 ); 466 } 467 468 /* 469 ** When this routine is called, the pager has the journal file open and 470 ** a write lock on the database. This routine releases the database 471 ** write lock and acquires a read lock in its place. The journal file 472 ** is deleted and closed. 473 ** 474 ** TODO: Consider keeping the journal file open for temporary databases. 475 ** This might give a performance improvement on windows where opening 476 ** a file is an expensive operation. 477 */ 478 static int pager_unwritelock(Pager *pPager){ 479 int rc; 480 PgHdr *pPg; 481 if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK; 482 sqlitepager_ckpt_commit(pPager); 483 if( pPager->ckptOpen ){ 484 sqliteOsClose(&pPager->cpfd); 485 pPager->ckptOpen = 0; 486 } 487 if( pPager->journalOpen ){ 488 sqliteOsClose(&pPager->jfd); 489 pPager->journalOpen = 0; 490 sqliteOsDelete(pPager->zJournal); 491 sqliteFree( pPager->aInJournal ); 492 pPager->aInJournal = 0; 493 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 494 pPg->inJournal = 0; 495 pPg->dirty = 0; 496 pPg->needSync = 0; 497 } 498 }else{ 499 assert( pPager->dirtyFile==0 || pPager->useJournal==0 ); 500 } 501 rc = sqliteOsReadLock(&pPager->fd); 502 if( rc==SQLITE_OK ){ 503 pPager->state = SQLITE_READLOCK; 504 }else{ 505 /* This can only happen if a process does a BEGIN, then forks and the 506 ** child process does the COMMIT. 
Because of the semantics of unix 507 ** file locking, the unlock will fail. 508 */ 509 pPager->state = SQLITE_UNLOCK; 510 } 511 return rc; 512 } 513 514 /* 515 ** Compute and return a checksum for the page of data. 516 ** 517 ** This is not a real checksum. It is really just the sum of the 518 ** random initial value and the page number. We considered do a checksum 519 ** of the database, but that was found to be too slow. 520 */ 521 static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){ 522 u32 cksum = pPager->cksumInit + pgno; 523 return cksum; 524 } 525 526 /* 527 ** Read a single page from the journal file opened on file descriptor 528 ** jfd. Playback this one page. 529 ** 530 ** There are three different journal formats. The format parameter determines 531 ** which format is used by the journal that is played back. 532 */ 533 static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){ 534 int rc; 535 PgHdr *pPg; /* An existing page in the cache */ 536 PageRecord pgRec; 537 u32 cksum; 538 539 rc = read32bits(format, jfd, &pgRec.pgno); 540 if( rc!=SQLITE_OK ) return rc; 541 rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData)); 542 if( rc!=SQLITE_OK ) return rc; 543 544 /* Sanity checking on the page. This is more important that I originally 545 ** thought. If a power failure occurs while the journal is being written, 546 ** it could cause invalid data to be written into the journal. We need to 547 ** detect this invalid data (with high probability) and ignore it. 548 */ 549 if( pgRec.pgno==0 ){ 550 return SQLITE_DONE; 551 } 552 if( pgRec.pgno>(unsigned)pPager->dbSize ){ 553 return SQLITE_OK; 554 } 555 if( format>=JOURNAL_FORMAT_3 ){ 556 rc = read32bits(format, jfd, &cksum); 557 if( rc ) return rc; 558 if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){ 559 return SQLITE_DONE; 560 } 561 } 562 563 /* Playback the page. Update the in-memory copy of the page 564 ** at the same time, if there is one. 
565 */ 566 pPg = pager_lookup(pPager, pgRec.pgno); 567 TRACE2("PLAYBACK %d\n", pgRec.pgno); 568 sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE); 569 rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE); 570 if( pPg ){ 571 /* No page should ever be rolled back that is in use, except for page 572 ** 1 which is held in use in order to keep the lock on the database 573 ** active. However, such a page may be rolled back as a result of an 574 ** internal error resulting in an automatic call to 575 ** sqlitepager_rollback(), so we can't assert() it. 576 */ 577 /* assert( pPg->nRef==0 || pPg->pgno==1 ) */ 578 memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE); 579 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra); 580 pPg->dirty = 0; 581 pPg->needSync = 0; 582 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3); 583 } 584 return rc; 585 } 586 587 /* 588 ** Playback the journal and thus restore the database file to 589 ** the state it was in before we started making changes. 590 ** 591 ** The journal file format is as follows: 592 ** 593 ** * 8 byte prefix. One of the aJournalMagic123 vectors defined 594 ** above. The format of the journal file is determined by which 595 ** of the three prefix vectors is seen. 596 ** * 4 byte big-endian integer which is the number of valid page records 597 ** in the journal. If this value is 0xffffffff, then compute the 598 ** number of page records from the journal size. This field appears 599 ** in format 3 only. 600 ** * 4 byte big-endian integer which is the initial value for the 601 ** sanity checksum. This field appears in format 3 only. 602 ** * 4 byte integer which is the number of pages to truncate the 603 ** database to during a rollback. 604 ** * Zero or more pages instances, each as follows: 605 ** + 4 byte page number. 606 ** + SQLITE_PAGE_SIZE bytes of data. 607 ** + 4 byte checksum (format 3 only) 608 ** 609 ** When we speak of the journal header, we mean the first 4 bullets above. 
610 ** Each entry in the journal is an instance of the 5th bullet. Note that 611 ** bullets 2 and 3 only appear in format-3 journals. 612 ** 613 ** Call the value from the second bullet "nRec". nRec is the number of 614 ** valid page entries in the journal. In most cases, you can compute the 615 ** value of nRec from the size of the journal file. But if a power 616 ** failure occurred while the journal was being written, it could be the 617 ** case that the size of the journal file had already been increased but 618 ** the extra entries had not yet made it safely to disk. In such a case, 619 ** the value of nRec computed from the file size would be too large. For 620 ** that reason, we always use the nRec value in the header. 621 ** 622 ** If the nRec value is 0xffffffff it means that nRec should be computed 623 ** from the file size. This value is used when the user selects the 624 ** no-sync option for the journal. A power failure could lead to corruption 625 ** in this case. But for things like temporary table (which will be 626 ** deleted when the power is restored) we don't care. 627 ** 628 ** Journal formats 1 and 2 do not have an nRec value in the header so we 629 ** have to compute nRec from the file size. This has risks (as described 630 ** above) which is why all persistent tables have been changed to use 631 ** format 3. 632 ** 633 ** If the file opened as the journal file is not a well-formed 634 ** journal file then the database will likely already be 635 ** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask 636 ** and SQLITE_CORRUPT is returned. If it all works, then this routine 637 ** returns SQLITE_OK. 638 */ 639 static int pager_playback(Pager *pPager, int useJournalSize){ 640 off_t szJ; /* Size of the journal file in bytes */ 641 int nRec; /* Number of Records in the journal */ 642 int i; /* Loop counter */ 643 Pgno mxPg = 0; /* Size of the original file in pages */ 644 int format; /* Format of the journal file. 
*/ 645 unsigned char aMagic[sizeof(aJournalMagic1)]; 646 int rc; 647 648 /* Figure out how many records are in the journal. Abort early if 649 ** the journal is empty. 650 */ 651 assert( pPager->journalOpen ); 652 sqliteOsSeek(&pPager->jfd, 0); 653 rc = sqliteOsFileSize(&pPager->jfd, &szJ); 654 if( rc!=SQLITE_OK ){ 655 goto end_playback; 656 } 657 658 /* If the journal file is too small to contain a complete header, 659 ** it must mean that the process that created the journal was just 660 ** beginning to write the journal file when it died. In that case, 661 ** the database file should have still been completely unchanged. 662 ** Nothing needs to be rolled back. We can safely ignore this journal. 663 */ 664 if( szJ < sizeof(aMagic)+sizeof(Pgno) ){ 665 goto end_playback; 666 } 667 668 /* Read the beginning of the journal and truncate the 669 ** database file back to its original size. 670 */ 671 rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic)); 672 if( rc!=SQLITE_OK ){ 673 rc = SQLITE_PROTOCOL; 674 goto end_playback; 675 } 676 if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){ 677 format = JOURNAL_FORMAT_3; 678 }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){ 679 format = JOURNAL_FORMAT_2; 680 }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){ 681 format = JOURNAL_FORMAT_1; 682 }else{ 683 rc = SQLITE_PROTOCOL; 684 goto end_playback; 685 } 686 if( format>=JOURNAL_FORMAT_3 ){ 687 if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){ 688 /* Ignore the journal if it is too small to contain a complete 689 ** header. We already did this test once above, but at the prior 690 ** test, we did not know the journal format and so we had to assume 691 ** the smallest possible header. Now we know the header is bigger 692 ** than the minimum so we test again. 
693 */ 694 goto end_playback; 695 } 696 rc = read32bits(format, &pPager->jfd, (u32*)&nRec); 697 if( rc ) goto end_playback; 698 rc = read32bits(format, &pPager->jfd, &pPager->cksumInit); 699 if( rc ) goto end_playback; 700 if( nRec==0xffffffff || useJournalSize ){ 701 nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3); 702 } 703 }else{ 704 nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2); 705 assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ ); 706 } 707 rc = read32bits(format, &pPager->jfd, &mxPg); 708 if( rc!=SQLITE_OK ){ 709 goto end_playback; 710 } 711 assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg ); 712 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg); 713 if( rc!=SQLITE_OK ){ 714 goto end_playback; 715 } 716 pPager->dbSize = mxPg; 717 718 /* Copy original pages out of the journal and back into the database file. 719 */ 720 for(i=0; i<nRec; i++){ 721 rc = pager_playback_one_page(pPager, &pPager->jfd, format); 722 if( rc!=SQLITE_OK ){ 723 if( rc==SQLITE_DONE ){ 724 rc = SQLITE_OK; 725 } 726 break; 727 } 728 } 729 730 /* Pages that have been written to the journal but never synced 731 ** where not restored by the loop above. We have to restore those 732 ** pages by reading them back from the original database. 
733 */ 734 if( rc==SQLITE_OK ){ 735 PgHdr *pPg; 736 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 737 char zBuf[SQLITE_PAGE_SIZE]; 738 if( !pPg->dirty ) continue; 739 if( (int)pPg->pgno <= pPager->origDbSize ){ 740 sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1)); 741 rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE); 742 TRACE2("REFETCH %d\n", pPg->pgno); 743 CODEC(pPager, zBuf, pPg->pgno, 2); 744 if( rc ) break; 745 }else{ 746 memset(zBuf, 0, SQLITE_PAGE_SIZE); 747 } 748 if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){ 749 memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE); 750 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra); 751 } 752 pPg->needSync = 0; 753 pPg->dirty = 0; 754 } 755 } 756 757 end_playback: 758 if( rc!=SQLITE_OK ){ 759 pager_unwritelock(pPager); 760 pPager->errMask |= PAGER_ERR_CORRUPT; 761 rc = SQLITE_CORRUPT; 762 }else{ 763 rc = pager_unwritelock(pPager); 764 } 765 return rc; 766 } 767 768 /* 769 ** Playback the checkpoint journal. 770 ** 771 ** This is similar to playing back the transaction journal but with 772 ** a few extra twists. 773 ** 774 ** (1) The number of pages in the database file at the start of 775 ** the checkpoint is stored in pPager->ckptSize, not in the 776 ** journal file itself. 777 ** 778 ** (2) In addition to playing back the checkpoint journal, also 779 ** playback all pages of the transaction journal beginning 780 ** at offset pPager->ckptJSize. 781 */ 782 static int pager_ckpt_playback(Pager *pPager){ 783 off_t szJ; /* Size of the full journal */ 784 int nRec; /* Number of Records */ 785 int i; /* Loop counter */ 786 int rc; 787 788 /* Truncate the database back to its original size. 789 */ 790 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize); 791 pPager->dbSize = pPager->ckptSize; 792 793 /* Figure out how many records are in the checkpoint journal. 
794 */ 795 assert( pPager->ckptInUse && pPager->journalOpen ); 796 sqliteOsSeek(&pPager->cpfd, 0); 797 nRec = pPager->ckptNRec; 798 799 /* Copy original pages out of the checkpoint journal and back into the 800 ** database file. Note that the checkpoint journal always uses format 801 ** 2 instead of format 3 since it does not need to be concerned with 802 ** power failures corrupting the journal and can thus omit the checksums. 803 */ 804 for(i=nRec-1; i>=0; i--){ 805 rc = pager_playback_one_page(pPager, &pPager->cpfd, 2); 806 assert( rc!=SQLITE_DONE ); 807 if( rc!=SQLITE_OK ) goto end_ckpt_playback; 808 } 809 810 /* Figure out how many pages need to be copied out of the transaction 811 ** journal. 812 */ 813 rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize); 814 if( rc!=SQLITE_OK ){ 815 goto end_ckpt_playback; 816 } 817 rc = sqliteOsFileSize(&pPager->jfd, &szJ); 818 if( rc!=SQLITE_OK ){ 819 goto end_ckpt_playback; 820 } 821 nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format); 822 for(i=nRec-1; i>=0; i--){ 823 rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format); 824 if( rc!=SQLITE_OK ){ 825 assert( rc!=SQLITE_DONE ); 826 goto end_ckpt_playback; 827 } 828 } 829 830 end_ckpt_playback: 831 if( rc!=SQLITE_OK ){ 832 pPager->errMask |= PAGER_ERR_CORRUPT; 833 rc = SQLITE_CORRUPT; 834 } 835 return rc; 836 } 837 838 /* 839 ** Change the maximum number of in-memory pages that are allowed. 840 ** 841 ** The maximum number is the absolute value of the mxPage parameter. 842 ** If mxPage is negative, the noSync flag is also set. noSync bypasses 843 ** calls to sqliteOsSync(). The pager runs much faster with noSync on, 844 ** but if the operating system crashes or there is an abrupt power 845 ** failure, the database file might be left in an inconsistent and 846 ** unrepairable state. 
847 */ 848 void sqlitepager_set_cachesize(Pager *pPager, int mxPage){ 849 if( mxPage>=0 ){ 850 pPager->noSync = pPager->tempFile; 851 if( pPager->noSync==0 ) pPager->needSync = 0; 852 }else{ 853 pPager->noSync = 1; 854 mxPage = -mxPage; 855 } 856 if( mxPage>10 ){ 857 pPager->mxPage = mxPage; 858 } 859 } 860 861 /* 862 ** Adjust the robustness of the database to damage due to OS crashes 863 ** or power failures by changing the number of syncs()s when writing 864 ** the rollback journal. There are three levels: 865 ** 866 ** OFF sqliteOsSync() is never called. This is the default 867 ** for temporary and transient files. 868 ** 869 ** NORMAL The journal is synced once before writes begin on the 870 ** database. This is normally adequate protection, but 871 ** it is theoretically possible, though very unlikely, 872 ** that an inopertune power failure could leave the journal 873 ** in a state which would cause damage to the database 874 ** when it is rolled back. 875 ** 876 ** FULL The journal is synced twice before writes begin on the 877 ** database (with some additional information - the nRec field 878 ** of the journal header - being written in between the two 879 ** syncs). If we assume that writing a 880 ** single disk sector is atomic, then this mode provides 881 ** assurance that the journal will not be corrupted to the 882 ** point of causing damage to the database during rollback. 883 ** 884 ** Numeric values associated with these states are OFF==1, NORMAL=2, 885 ** and FULL=3. 886 */ 887 void sqlitepager_set_safety_level(Pager *pPager, int level){ 888 pPager->noSync = level==1 || pPager->tempFile; 889 pPager->fullSync = level==3 && !pPager->tempFile; 890 if( pPager->noSync==0 ) pPager->needSync = 0; 891 } 892 893 /* 894 ** Open a temporary file. Write the name of the file into zName 895 ** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write 896 ** the file descriptor into *fd. 
Return SQLITE_OK on success or some 897 ** other error code if we fail. 898 ** 899 ** The OS will automatically delete the temporary file when it is 900 ** closed. 901 */ 902 static int sqlitepager_opentemp(char *zFile, OsFile *fd){ 903 int cnt = 8; 904 int rc; 905 do{ 906 cnt--; 907 sqliteOsTempFileName(zFile); 908 rc = sqliteOsOpenExclusive(zFile, fd, 1); 909 }while( cnt>0 && rc!=SQLITE_OK ); 910 return rc; 911 } 912 913 /* 914 ** Create a new page cache and put a pointer to the page cache in *ppPager. 915 ** The file to be cached need not exist. The file is not locked until 916 ** the first call to sqlitepager_get() and is only held open until the 917 ** last page is released using sqlitepager_unref(). 918 ** 919 ** If zFilename is NULL then a randomly-named temporary file is created 920 ** and used as the file to be cached. The file will be deleted 921 ** automatically when it is closed. 922 */ 923 int sqlitepager_open( 924 Pager **ppPager, /* Return the Pager structure here */ 925 const char *zFilename, /* Name of the database file to open */ 926 int mxPage, /* Max number of in-memory cache pages */ 927 int nExtra, /* Extra bytes append to each in-memory page */ 928 int useJournal /* TRUE to use a rollback journal on this file */ 929 ){ 930 Pager *pPager; 931 char *zFullPathname; 932 int nameLen; 933 OsFile fd; 934 int rc, i; 935 int tempFile; 936 int readOnly = 0; 937 char zTemp[SQLITE_TEMPNAME_SIZE]; 938 939 *ppPager = 0; 940 if( sqlite_malloc_failed ){ 941 return SQLITE_NOMEM; 942 } 943 if( zFilename && zFilename[0] ){ 944 zFullPathname = sqliteOsFullPathname(zFilename); 945 rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly); 946 tempFile = 0; 947 }else{ 948 rc = sqlitepager_opentemp(zTemp, &fd); 949 zFilename = zTemp; 950 zFullPathname = sqliteOsFullPathname(zFilename); 951 tempFile = 1; 952 } 953 if( sqlite_malloc_failed ){ 954 return SQLITE_NOMEM; 955 } 956 if( rc!=SQLITE_OK ){ 957 sqliteFree(zFullPathname); 958 return SQLITE_CANTOPEN; 959 } 960 
nameLen = strlen(zFullPathname); 961 pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 ); 962 if( pPager==0 ){ 963 sqliteOsClose(&fd); 964 sqliteFree(zFullPathname); 965 return SQLITE_NOMEM; 966 } 967 SET_PAGER(pPager); 968 pPager->zFilename = (char*)&pPager[1]; 969 pPager->zDirectory = &pPager->zFilename[nameLen+1]; 970 pPager->zJournal = &pPager->zDirectory[nameLen+1]; 971 strcpy(pPager->zFilename, zFullPathname); 972 strcpy(pPager->zDirectory, zFullPathname); 973 for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){} 974 if( i>0 ) pPager->zDirectory[i-1] = 0; 975 strcpy(pPager->zJournal, zFullPathname); 976 sqliteFree(zFullPathname); 977 strcpy(&pPager->zJournal[nameLen], "-journal"); 978 pPager->fd = fd; 979 pPager->journalOpen = 0; 980 pPager->useJournal = useJournal; 981 pPager->ckptOpen = 0; 982 pPager->ckptInUse = 0; 983 pPager->nRef = 0; 984 pPager->dbSize = -1; 985 pPager->ckptSize = 0; 986 pPager->ckptJSize = 0; 987 pPager->nPage = 0; 988 pPager->mxPage = mxPage>5 ? mxPage : 10; 989 pPager->state = SQLITE_UNLOCK; 990 pPager->errMask = 0; 991 pPager->tempFile = tempFile; 992 pPager->readOnly = readOnly; 993 pPager->needSync = 0; 994 pPager->noSync = pPager->tempFile || !useJournal; 995 pPager->pFirst = 0; 996 pPager->pFirstSynced = 0; 997 pPager->pLast = 0; 998 pPager->nExtra = nExtra; 999 memset(pPager->aHash, 0, sizeof(pPager->aHash)); 1000 *ppPager = pPager; 1001 return SQLITE_OK; 1002 } 1003 1004 /* 1005 ** Set the destructor for this pager. If not NULL, the destructor is called 1006 ** when the reference count on each page reaches zero. The destructor can 1007 ** be used to clean up information in the extra segment appended to each page. 1008 ** 1009 ** The destructor is not called as a result sqlitepager_close(). 1010 ** Destructors are only called by sqlitepager_unref(). 
1011 */ 1012 void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){ 1013 pPager->xDestructor = xDesc; 1014 } 1015 1016 /* 1017 ** Return the total number of pages in the disk file associated with 1018 ** pPager. 1019 */ 1020 int sqlitepager_pagecount(Pager *pPager){ 1021 off_t n; 1022 assert( pPager!=0 ); 1023 if( pPager->dbSize>=0 ){ 1024 return pPager->dbSize; 1025 } 1026 if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){ 1027 pPager->errMask |= PAGER_ERR_DISK; 1028 return 0; 1029 } 1030 n /= SQLITE_PAGE_SIZE; 1031 if( pPager->state!=SQLITE_UNLOCK ){ 1032 pPager->dbSize = n; 1033 } 1034 return n; 1035 } 1036 1037 /* 1038 ** Forward declaration 1039 */ 1040 static int syncJournal(Pager*); 1041 1042 /* 1043 ** Truncate the file to the number of pages specified. 1044 */ 1045 int sqlitepager_truncate(Pager *pPager, Pgno nPage){ 1046 int rc; 1047 if( pPager->dbSize<0 ){ 1048 sqlitepager_pagecount(pPager); 1049 } 1050 if( pPager->errMask!=0 ){ 1051 rc = pager_errcode(pPager); 1052 return rc; 1053 } 1054 if( nPage>=(unsigned)pPager->dbSize ){ 1055 return SQLITE_OK; 1056 } 1057 syncJournal(pPager); 1058 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage); 1059 if( rc==SQLITE_OK ){ 1060 pPager->dbSize = nPage; 1061 } 1062 return rc; 1063 } 1064 1065 /* 1066 ** Shutdown the page cache. Free all memory and close all files. 1067 ** 1068 ** If a transaction was in progress when this routine is called, that 1069 ** transaction is rolled back. All outstanding pages are invalidated 1070 ** and their memory is freed. Any attempt to use a page associated 1071 ** with this page cache after this function returns will likely 1072 ** result in a coredump. 
1073 */ 1074 int sqlitepager_close(Pager *pPager){ 1075 PgHdr *pPg, *pNext; 1076 switch( pPager->state ){ 1077 case SQLITE_WRITELOCK: { 1078 sqlitepager_rollback(pPager); 1079 sqliteOsUnlock(&pPager->fd); 1080 assert( pPager->journalOpen==0 ); 1081 break; 1082 } 1083 case SQLITE_READLOCK: { 1084 sqliteOsUnlock(&pPager->fd); 1085 break; 1086 } 1087 default: { 1088 /* Do nothing */ 1089 break; 1090 } 1091 } 1092 for(pPg=pPager->pAll; pPg; pPg=pNext){ 1093 pNext = pPg->pNextAll; 1094 sqliteFree(pPg); 1095 } 1096 sqliteOsClose(&pPager->fd); 1097 assert( pPager->journalOpen==0 ); 1098 /* Temp files are automatically deleted by the OS 1099 ** if( pPager->tempFile ){ 1100 ** sqliteOsDelete(pPager->zFilename); 1101 ** } 1102 */ 1103 CLR_PAGER(pPager); 1104 if( pPager->zFilename!=(char*)&pPager[1] ){ 1105 assert( 0 ); /* Cannot happen */ 1106 sqliteFree(pPager->zFilename); 1107 sqliteFree(pPager->zJournal); 1108 sqliteFree(pPager->zDirectory); 1109 } 1110 sqliteFree(pPager); 1111 return SQLITE_OK; 1112 } 1113 1114 /* 1115 ** Return the page number for the given page data. 1116 */ 1117 Pgno sqlitepager_pagenumber(void *pData){ 1118 PgHdr *p = DATA_TO_PGHDR(pData); 1119 return p->pgno; 1120 } 1121 1122 /* 1123 ** Increment the reference count for a page. If the page is 1124 ** currently on the freelist (the reference count is zero) then 1125 ** remove it from the freelist. 1126 */ 1127 #define page_ref(P) ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++) 1128 static void _page_ref(PgHdr *pPg){ 1129 if( pPg->nRef==0 ){ 1130 /* The page is currently on the freelist. Remove it. 
*/ 1131 if( pPg==pPg->pPager->pFirstSynced ){ 1132 PgHdr *p = pPg->pNextFree; 1133 while( p && p->needSync ){ p = p->pNextFree; } 1134 pPg->pPager->pFirstSynced = p; 1135 } 1136 if( pPg->pPrevFree ){ 1137 pPg->pPrevFree->pNextFree = pPg->pNextFree; 1138 }else{ 1139 pPg->pPager->pFirst = pPg->pNextFree; 1140 } 1141 if( pPg->pNextFree ){ 1142 pPg->pNextFree->pPrevFree = pPg->pPrevFree; 1143 }else{ 1144 pPg->pPager->pLast = pPg->pPrevFree; 1145 } 1146 pPg->pPager->nRef++; 1147 } 1148 pPg->nRef++; 1149 REFINFO(pPg); 1150 } 1151 1152 /* 1153 ** Increment the reference count for a page. The input pointer is 1154 ** a reference to the page data. 1155 */ 1156 int sqlitepager_ref(void *pData){ 1157 PgHdr *pPg = DATA_TO_PGHDR(pData); 1158 page_ref(pPg); 1159 return SQLITE_OK; 1160 } 1161 1162 /* 1163 ** Sync the journal. In other words, make sure all the pages that have 1164 ** been written to the journal have actually reached the surface of the 1165 ** disk. It is not safe to modify the original database file until after 1166 ** the journal has been synced. If the original database is modified before 1167 ** the journal is synced and a power failure occurs, the unsynced journal 1168 ** data would be lost and we would be unable to completely rollback the 1169 ** database changes. Database corruption would occur. 1170 ** 1171 ** This routine also updates the nRec field in the header of the journal. 1172 ** (See comments on the pager_playback() routine for additional information.) 1173 ** If the sync mode is FULL, two syncs will occur. First the whole journal 1174 ** is synced, then the nRec field is updated, then a second sync occurs. 1175 ** 1176 ** For temporary databases, we do not care if we are able to rollback 1177 ** after a power failure, so sync occurs. 1178 ** 1179 ** This routine clears the needSync field of every page current held in 1180 ** memory. 
1181 */ 1182 static int syncJournal(Pager *pPager){ 1183 PgHdr *pPg; 1184 int rc = SQLITE_OK; 1185 1186 /* Sync the journal before modifying the main database 1187 ** (assuming there is a journal and it needs to be synced.) 1188 */ 1189 if( pPager->needSync ){ 1190 if( !pPager->tempFile ){ 1191 assert( pPager->journalOpen ); 1192 /* assert( !pPager->noSync ); // noSync might be set if synchronous 1193 ** was turned off after the transaction was started. Ticket #615 */ 1194 #ifndef NDEBUG 1195 { 1196 /* Make sure the pPager->nRec counter we are keeping agrees 1197 ** with the nRec computed from the size of the journal file. 1198 */ 1199 off_t hdrSz, pgSz, jSz; 1200 hdrSz = JOURNAL_HDR_SZ(journal_format); 1201 pgSz = JOURNAL_PG_SZ(journal_format); 1202 rc = sqliteOsFileSize(&pPager->jfd, &jSz); 1203 if( rc!=0 ) return rc; 1204 assert( pPager->nRec*pgSz+hdrSz==jSz ); 1205 } 1206 #endif 1207 if( journal_format>=3 ){ 1208 /* Write the nRec value into the journal file header */ 1209 off_t szJ; 1210 if( pPager->fullSync ){ 1211 TRACE1("SYNC\n"); 1212 rc = sqliteOsSync(&pPager->jfd); 1213 if( rc!=0 ) return rc; 1214 } 1215 sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1)); 1216 rc = write32bits(&pPager->jfd, pPager->nRec); 1217 if( rc ) return rc; 1218 szJ = JOURNAL_HDR_SZ(journal_format) + 1219 pPager->nRec*JOURNAL_PG_SZ(journal_format); 1220 sqliteOsSeek(&pPager->jfd, szJ); 1221 } 1222 TRACE1("SYNC\n"); 1223 rc = sqliteOsSync(&pPager->jfd); 1224 if( rc!=0 ) return rc; 1225 pPager->journalStarted = 1; 1226 } 1227 pPager->needSync = 0; 1228 1229 /* Erase the needSync flag from every page. 1230 */ 1231 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 1232 pPg->needSync = 0; 1233 } 1234 pPager->pFirstSynced = pPager->pFirst; 1235 } 1236 1237 #ifndef NDEBUG 1238 /* If the Pager.needSync flag is clear then the PgHdr.needSync 1239 ** flag must also be clear for all pages. Verify that this 1240 ** invariant is true. 
1241 */ 1242 else{ 1243 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 1244 assert( pPg->needSync==0 ); 1245 } 1246 assert( pPager->pFirstSynced==pPager->pFirst ); 1247 } 1248 #endif 1249 1250 return rc; 1251 } 1252 1253 /* 1254 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write 1255 ** every one of those pages out to the database file and mark them all 1256 ** as clean. 1257 */ 1258 static int pager_write_pagelist(PgHdr *pList){ 1259 Pager *pPager; 1260 int rc; 1261 1262 if( pList==0 ) return SQLITE_OK; 1263 pPager = pList->pPager; 1264 while( pList ){ 1265 assert( pList->dirty ); 1266 sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE); 1267 CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6); 1268 TRACE2("STORE %d\n", pList->pgno); 1269 rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE); 1270 CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0); 1271 if( rc ) return rc; 1272 pList->dirty = 0; 1273 pList = pList->pDirty; 1274 } 1275 return SQLITE_OK; 1276 } 1277 1278 /* 1279 ** Collect every dirty page into a dirty list and 1280 ** return a pointer to the head of that list. All pages are 1281 ** collected even if they are still in use. 1282 */ 1283 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){ 1284 PgHdr *p, *pList; 1285 pList = 0; 1286 for(p=pPager->pAll; p; p=p->pNextAll){ 1287 if( p->dirty ){ 1288 p->pDirty = pList; 1289 pList = p; 1290 } 1291 } 1292 return pList; 1293 } 1294 1295 /* 1296 ** Acquire a page. 1297 ** 1298 ** A read lock on the disk file is obtained when the first page is acquired. 1299 ** This read lock is dropped when the last page is released. 1300 ** 1301 ** A _get works for any page number greater than 0. If the database 1302 ** file is smaller than the requested page, then no actual disk 1303 ** read occurs and the memory image of the page is initialized to 1304 ** all zeros. 
The extra data appended to a page is always initialized 1305 ** to zeros the first time a page is loaded into memory. 1306 ** 1307 ** The acquisition might fail for several reasons. In all cases, 1308 ** an appropriate error code is returned and *ppPage is set to NULL. 1309 ** 1310 ** See also sqlitepager_lookup(). Both this routine and _lookup() attempt 1311 ** to find a page in the in-memory cache first. If the page is not already 1312 ** in memory, this routine goes to disk to read it in whereas _lookup() 1313 ** just returns 0. This routine acquires a read-lock the first time it 1314 ** has to go to disk, and could also playback an old journal if necessary. 1315 ** Since _lookup() never goes to disk, it never has to deal with locks 1316 ** or journal files. 1317 */ 1318 int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){ 1319 PgHdr *pPg; 1320 int rc; 1321 1322 /* Make sure we have not hit any critical errors. 1323 */ 1324 assert( pPager!=0 ); 1325 assert( pgno!=0 ); 1326 *ppPage = 0; 1327 if( pPager->errMask & ~(PAGER_ERR_FULL) ){ 1328 return pager_errcode(pPager); 1329 } 1330 1331 /* If this is the first page accessed, then get a read lock 1332 ** on the database file. 1333 */ 1334 if( pPager->nRef==0 ){ 1335 rc = sqliteOsReadLock(&pPager->fd); 1336 if( rc!=SQLITE_OK ){ 1337 return rc; 1338 } 1339 pPager->state = SQLITE_READLOCK; 1340 1341 /* If a journal file exists, try to play it back. 1342 */ 1343 if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){ 1344 int rc; 1345 1346 /* Get a write lock on the database 1347 */ 1348 rc = sqliteOsWriteLock(&pPager->fd); 1349 if( rc!=SQLITE_OK ){ 1350 if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){ 1351 /* This should never happen! */ 1352 rc = SQLITE_INTERNAL; 1353 } 1354 return rc; 1355 } 1356 pPager->state = SQLITE_WRITELOCK; 1357 1358 /* Open the journal for reading only. Return SQLITE_BUSY if 1359 ** we are unable to open the journal file. 
1360 ** 1361 ** The journal file does not need to be locked itself. The 1362 ** journal file is never open unless the main database file holds 1363 ** a write lock, so there is never any chance of two or more 1364 ** processes opening the journal at the same time. 1365 */ 1366 rc = sqliteOsOpenReadOnly(pPager->zJournal, &pPager->jfd); 1367 if( rc!=SQLITE_OK ){ 1368 rc = sqliteOsUnlock(&pPager->fd); 1369 assert( rc==SQLITE_OK ); 1370 return SQLITE_BUSY; 1371 } 1372 pPager->journalOpen = 1; 1373 pPager->journalStarted = 0; 1374 1375 /* Playback and delete the journal. Drop the database write 1376 ** lock and reacquire the read lock. 1377 */ 1378 rc = pager_playback(pPager, 0); 1379 if( rc!=SQLITE_OK ){ 1380 return rc; 1381 } 1382 } 1383 pPg = 0; 1384 }else{ 1385 /* Search for page in cache */ 1386 pPg = pager_lookup(pPager, pgno); 1387 } 1388 if( pPg==0 ){ 1389 /* The requested page is not in the page cache. */ 1390 int h; 1391 pPager->nMiss++; 1392 if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){ 1393 /* Create a new page */ 1394 pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE 1395 + sizeof(u32) + pPager->nExtra ); 1396 if( pPg==0 ){ 1397 pager_unwritelock(pPager); 1398 pPager->errMask |= PAGER_ERR_MEM; 1399 return SQLITE_NOMEM; 1400 } 1401 memset(pPg, 0, sizeof(*pPg)); 1402 pPg->pPager = pPager; 1403 pPg->pNextAll = pPager->pAll; 1404 if( pPager->pAll ){ 1405 pPager->pAll->pPrevAll = pPg; 1406 } 1407 pPg->pPrevAll = 0; 1408 pPager->pAll = pPg; 1409 pPager->nPage++; 1410 }else{ 1411 /* Find a page to recycle. Try to locate a page that does not 1412 ** require us to do an fsync() on the journal. 1413 */ 1414 pPg = pPager->pFirstSynced; 1415 1416 /* If we could not find a page that does not require an fsync() 1417 ** on the journal file then fsync the journal file. This is a 1418 ** very slow operation, so we work hard to avoid it. But sometimes 1419 ** it can't be helped. 
1420 */ 1421 if( pPg==0 ){ 1422 int rc = syncJournal(pPager); 1423 if( rc!=0 ){ 1424 sqlitepager_rollback(pPager); 1425 return SQLITE_IOERR; 1426 } 1427 pPg = pPager->pFirst; 1428 } 1429 assert( pPg->nRef==0 ); 1430 1431 /* Write the page to the database file if it is dirty. 1432 */ 1433 if( pPg->dirty ){ 1434 assert( pPg->needSync==0 ); 1435 pPg->pDirty = 0; 1436 rc = pager_write_pagelist( pPg ); 1437 if( rc!=SQLITE_OK ){ 1438 sqlitepager_rollback(pPager); 1439 return SQLITE_IOERR; 1440 } 1441 } 1442 assert( pPg->dirty==0 ); 1443 1444 /* If the page we are recycling is marked as alwaysRollback, then 1445 ** set the global alwaysRollback flag, thus disabling the 1446 ** sqlite_dont_rollback() optimization for the rest of this transaction. 1447 ** It is necessary to do this because the page marked alwaysRollback 1448 ** might be reloaded at a later time but at that point we won't remember 1449 ** that is was marked alwaysRollback. This means that all pages must 1450 ** be marked as alwaysRollback from here on out. 
1451 */ 1452 if( pPg->alwaysRollback ){ 1453 pPager->alwaysRollback = 1; 1454 } 1455 1456 /* Unlink the old page from the free list and the hash table 1457 */ 1458 if( pPg==pPager->pFirstSynced ){ 1459 PgHdr *p = pPg->pNextFree; 1460 while( p && p->needSync ){ p = p->pNextFree; } 1461 pPager->pFirstSynced = p; 1462 } 1463 if( pPg->pPrevFree ){ 1464 pPg->pPrevFree->pNextFree = pPg->pNextFree; 1465 }else{ 1466 assert( pPager->pFirst==pPg ); 1467 pPager->pFirst = pPg->pNextFree; 1468 } 1469 if( pPg->pNextFree ){ 1470 pPg->pNextFree->pPrevFree = pPg->pPrevFree; 1471 }else{ 1472 assert( pPager->pLast==pPg ); 1473 pPager->pLast = pPg->pPrevFree; 1474 } 1475 pPg->pNextFree = pPg->pPrevFree = 0; 1476 if( pPg->pNextHash ){ 1477 pPg->pNextHash->pPrevHash = pPg->pPrevHash; 1478 } 1479 if( pPg->pPrevHash ){ 1480 pPg->pPrevHash->pNextHash = pPg->pNextHash; 1481 }else{ 1482 h = pager_hash(pPg->pgno); 1483 assert( pPager->aHash[h]==pPg ); 1484 pPager->aHash[h] = pPg->pNextHash; 1485 } 1486 pPg->pNextHash = pPg->pPrevHash = 0; 1487 pPager->nOvfl++; 1488 } 1489 pPg->pgno = pgno; 1490 if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){ 1491 sqliteCheckMemory(pPager->aInJournal, pgno/8); 1492 assert( pPager->journalOpen ); 1493 pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0; 1494 pPg->needSync = 0; 1495 }else{ 1496 pPg->inJournal = 0; 1497 pPg->needSync = 0; 1498 } 1499 if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize 1500 && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){ 1501 page_add_to_ckpt_list(pPg); 1502 }else{ 1503 page_remove_from_ckpt_list(pPg); 1504 } 1505 pPg->dirty = 0; 1506 pPg->nRef = 1; 1507 REFINFO(pPg); 1508 pPager->nRef++; 1509 h = pager_hash(pgno); 1510 pPg->pNextHash = pPager->aHash[h]; 1511 pPager->aHash[h] = pPg; 1512 if( pPg->pNextHash ){ 1513 assert( pPg->pNextHash->pPrevHash==0 ); 1514 pPg->pNextHash->pPrevHash = pPg; 1515 } 1516 if( pPager->nExtra>0 ){ 1517 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra); 1518 } 1519 if( 
pPager->dbSize<0 ) sqlitepager_pagecount(pPager); 1520 if( pPager->errMask!=0 ){ 1521 sqlitepager_unref(PGHDR_TO_DATA(pPg)); 1522 rc = pager_errcode(pPager); 1523 return rc; 1524 } 1525 if( pPager->dbSize<(int)pgno ){ 1526 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE); 1527 }else{ 1528 int rc; 1529 sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE); 1530 rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE); 1531 TRACE2("FETCH %d\n", pPg->pgno); 1532 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3); 1533 if( rc!=SQLITE_OK ){ 1534 off_t fileSize; 1535 if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK 1536 || fileSize>=pgno*SQLITE_PAGE_SIZE ){ 1537 sqlitepager_unref(PGHDR_TO_DATA(pPg)); 1538 return rc; 1539 }else{ 1540 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE); 1541 } 1542 } 1543 } 1544 }else{ 1545 /* The requested page is in the page cache. */ 1546 pPager->nHit++; 1547 page_ref(pPg); 1548 } 1549 *ppPage = PGHDR_TO_DATA(pPg); 1550 return SQLITE_OK; 1551 } 1552 1553 /* 1554 ** Acquire a page if it is already in the in-memory cache. Do 1555 ** not read the page from disk. Return a pointer to the page, 1556 ** or 0 if the page is not in cache. 1557 ** 1558 ** See also sqlitepager_get(). The difference between this routine 1559 ** and sqlitepager_get() is that _get() will go to the disk and read 1560 ** in the page if the page is not already in cache. This routine 1561 ** returns NULL if the page is not in cache or if a disk I/O error 1562 ** has ever happened. 1563 */ 1564 void *sqlitepager_lookup(Pager *pPager, Pgno pgno){ 1565 PgHdr *pPg; 1566 1567 assert( pPager!=0 ); 1568 assert( pgno!=0 ); 1569 if( pPager->errMask & ~(PAGER_ERR_FULL) ){ 1570 return 0; 1571 } 1572 /* if( pPager->nRef==0 ){ 1573 ** return 0; 1574 ** } 1575 */ 1576 pPg = pager_lookup(pPager, pgno); 1577 if( pPg==0 ) return 0; 1578 page_ref(pPg); 1579 return PGHDR_TO_DATA(pPg); 1580 } 1581 1582 /* 1583 ** Release a page. 
1584 ** 1585 ** If the number of references to the page drop to zero, then the 1586 ** page is added to the LRU list. When all references to all pages 1587 ** are released, a rollback occurs and the lock on the database is 1588 ** removed. 1589 */ 1590 int sqlitepager_unref(void *pData){ 1591 PgHdr *pPg; 1592 1593 /* Decrement the reference count for this page 1594 */ 1595 pPg = DATA_TO_PGHDR(pData); 1596 assert( pPg->nRef>0 ); 1597 pPg->nRef--; 1598 REFINFO(pPg); 1599 1600 /* When the number of references to a page reach 0, call the 1601 ** destructor and add the page to the freelist. 1602 */ 1603 if( pPg->nRef==0 ){ 1604 Pager *pPager; 1605 pPager = pPg->pPager; 1606 pPg->pNextFree = 0; 1607 pPg->pPrevFree = pPager->pLast; 1608 pPager->pLast = pPg; 1609 if( pPg->pPrevFree ){ 1610 pPg->pPrevFree->pNextFree = pPg; 1611 }else{ 1612 pPager->pFirst = pPg; 1613 } 1614 if( pPg->needSync==0 && pPager->pFirstSynced==0 ){ 1615 pPager->pFirstSynced = pPg; 1616 } 1617 if( pPager->xDestructor ){ 1618 pPager->xDestructor(pData); 1619 } 1620 1621 /* When all pages reach the freelist, drop the read lock from 1622 ** the database file. 1623 */ 1624 pPager->nRef--; 1625 assert( pPager->nRef>=0 ); 1626 if( pPager->nRef==0 ){ 1627 pager_reset(pPager); 1628 } 1629 } 1630 return SQLITE_OK; 1631 } 1632 1633 /* 1634 ** Create a journal file for pPager. There should already be a write 1635 ** lock on the database file when this routine is called. 1636 ** 1637 ** Return SQLITE_OK if everything. Return an error code and release the 1638 ** write lock if anything goes wrong. 
1639 */ 1640 static int pager_open_journal(Pager *pPager){ 1641 int rc; 1642 assert( pPager->state==SQLITE_WRITELOCK ); 1643 assert( pPager->journalOpen==0 ); 1644 assert( pPager->useJournal ); 1645 sqlitepager_pagecount(pPager); 1646 pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 ); 1647 if( pPager->aInJournal==0 ){ 1648 sqliteOsReadLock(&pPager->fd); 1649 pPager->state = SQLITE_READLOCK; 1650 return SQLITE_NOMEM; 1651 } 1652 rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile); 1653 if( rc!=SQLITE_OK ){ 1654 sqliteFree(pPager->aInJournal); 1655 pPager->aInJournal = 0; 1656 sqliteOsReadLock(&pPager->fd); 1657 pPager->state = SQLITE_READLOCK; 1658 return SQLITE_CANTOPEN; 1659 } 1660 sqliteOsOpenDirectory(pPager->zDirectory, &pPager->jfd); 1661 pPager->journalOpen = 1; 1662 pPager->journalStarted = 0; 1663 pPager->needSync = 0; 1664 pPager->alwaysRollback = 0; 1665 pPager->nRec = 0; 1666 if( pPager->errMask!=0 ){ 1667 rc = pager_errcode(pPager); 1668 return rc; 1669 } 1670 pPager->origDbSize = pPager->dbSize; 1671 if( journal_format==JOURNAL_FORMAT_3 ){ 1672 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3)); 1673 if( rc==SQLITE_OK ){ 1674 rc = write32bits(&pPager->jfd, pPager->noSync ? 
0xffffffff : 0); 1675 } 1676 if( rc==SQLITE_OK ){ 1677 sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit); 1678 rc = write32bits(&pPager->jfd, pPager->cksumInit); 1679 } 1680 }else if( journal_format==JOURNAL_FORMAT_2 ){ 1681 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2)); 1682 }else{ 1683 assert( journal_format==JOURNAL_FORMAT_1 ); 1684 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1)); 1685 } 1686 if( rc==SQLITE_OK ){ 1687 rc = write32bits(&pPager->jfd, pPager->dbSize); 1688 } 1689 if( pPager->ckptAutoopen && rc==SQLITE_OK ){ 1690 rc = sqlitepager_ckpt_begin(pPager); 1691 } 1692 if( rc!=SQLITE_OK ){ 1693 rc = pager_unwritelock(pPager); 1694 if( rc==SQLITE_OK ){ 1695 rc = SQLITE_FULL; 1696 } 1697 } 1698 return rc; 1699 } 1700 1701 /* 1702 ** Acquire a write-lock on the database. The lock is removed when 1703 ** the any of the following happen: 1704 ** 1705 ** * sqlitepager_commit() is called. 1706 ** * sqlitepager_rollback() is called. 1707 ** * sqlitepager_close() is called. 1708 ** * sqlitepager_unref() is called to on every outstanding page. 1709 ** 1710 ** The parameter to this routine is a pointer to any open page of the 1711 ** database file. Nothing changes about the page - it is used merely 1712 ** to acquire a pointer to the Pager structure and as proof that there 1713 ** is already a read-lock on the database. 1714 ** 1715 ** A journal file is opened if this is not a temporary file. For 1716 ** temporary files, the opening of the journal file is deferred until 1717 ** there is an actual need to write to the journal. 1718 ** 1719 ** If the database is already write-locked, this routine is a no-op. 
1720 */ 1721 int sqlitepager_begin(void *pData){ 1722 PgHdr *pPg = DATA_TO_PGHDR(pData); 1723 Pager *pPager = pPg->pPager; 1724 int rc = SQLITE_OK; 1725 assert( pPg->nRef>0 ); 1726 assert( pPager->state!=SQLITE_UNLOCK ); 1727 if( pPager->state==SQLITE_READLOCK ){ 1728 assert( pPager->aInJournal==0 ); 1729 rc = sqliteOsWriteLock(&pPager->fd); 1730 if( rc!=SQLITE_OK ){ 1731 return rc; 1732 } 1733 pPager->state = SQLITE_WRITELOCK; 1734 pPager->dirtyFile = 0; 1735 TRACE1("TRANSACTION\n"); 1736 if( pPager->useJournal && !pPager->tempFile ){ 1737 rc = pager_open_journal(pPager); 1738 } 1739 } 1740 return rc; 1741 } 1742 1743 /* 1744 ** Mark a data page as writeable. The page is written into the journal 1745 ** if it is not there already. This routine must be called before making 1746 ** changes to a page. 1747 ** 1748 ** The first time this routine is called, the pager creates a new 1749 ** journal and acquires a write lock on the database. If the write 1750 ** lock could not be acquired, this routine returns SQLITE_BUSY. The 1751 ** calling routine must check for that return value and be careful not to 1752 ** change any page data until this routine returns SQLITE_OK. 1753 ** 1754 ** If the journal file could not be written because the disk is full, 1755 ** then this routine returns SQLITE_FULL and does an immediate rollback. 1756 ** All subsequent write attempts also return SQLITE_FULL until there 1757 ** is a call to sqlitepager_commit() or sqlitepager_rollback() to 1758 ** reset. 1759 */ 1760 int sqlitepager_write(void *pData){ 1761 PgHdr *pPg = DATA_TO_PGHDR(pData); 1762 Pager *pPager = pPg->pPager; 1763 int rc = SQLITE_OK; 1764 1765 /* Check for errors 1766 */ 1767 if( pPager->errMask ){ 1768 return pager_errcode(pPager); 1769 } 1770 if( pPager->readOnly ){ 1771 return SQLITE_PERM; 1772 } 1773 1774 /* Mark the page as dirty. If the page has already been written 1775 ** to the journal then we can return right away. 
1776 */ 1777 pPg->dirty = 1; 1778 if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){ 1779 pPager->dirtyFile = 1; 1780 return SQLITE_OK; 1781 } 1782 1783 /* If we get this far, it means that the page needs to be 1784 ** written to the transaction journal or the ckeckpoint journal 1785 ** or both. 1786 ** 1787 ** First check to see that the transaction journal exists and 1788 ** create it if it does not. 1789 */ 1790 assert( pPager->state!=SQLITE_UNLOCK ); 1791 rc = sqlitepager_begin(pData); 1792 if( rc!=SQLITE_OK ){ 1793 return rc; 1794 } 1795 assert( pPager->state==SQLITE_WRITELOCK ); 1796 if( !pPager->journalOpen && pPager->useJournal ){ 1797 rc = pager_open_journal(pPager); 1798 if( rc!=SQLITE_OK ) return rc; 1799 } 1800 assert( pPager->journalOpen || !pPager->useJournal ); 1801 pPager->dirtyFile = 1; 1802 1803 /* The transaction journal now exists and we have a write lock on the 1804 ** main database file. Write the current page to the transaction 1805 ** journal if it is not there already. 
1806 */ 1807 if( !pPg->inJournal && pPager->useJournal ){ 1808 if( (int)pPg->pgno <= pPager->origDbSize ){ 1809 int szPg; 1810 u32 saved; 1811 if( journal_format>=JOURNAL_FORMAT_3 ){ 1812 u32 cksum = pager_cksum(pPager, pPg->pgno, pData); 1813 saved = *(u32*)PGHDR_TO_EXTRA(pPg); 1814 store32bits(cksum, pPg, SQLITE_PAGE_SIZE); 1815 szPg = SQLITE_PAGE_SIZE+8; 1816 }else{ 1817 szPg = SQLITE_PAGE_SIZE+4; 1818 } 1819 store32bits(pPg->pgno, pPg, -4); 1820 CODEC(pPager, pData, pPg->pgno, 7); 1821 rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg); 1822 TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync); 1823 CODEC(pPager, pData, pPg->pgno, 0); 1824 if( journal_format>=JOURNAL_FORMAT_3 ){ 1825 *(u32*)PGHDR_TO_EXTRA(pPg) = saved; 1826 } 1827 if( rc!=SQLITE_OK ){ 1828 sqlitepager_rollback(pPager); 1829 pPager->errMask |= PAGER_ERR_FULL; 1830 return rc; 1831 } 1832 pPager->nRec++; 1833 assert( pPager->aInJournal!=0 ); 1834 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1835 pPg->needSync = !pPager->noSync; 1836 pPg->inJournal = 1; 1837 if( pPager->ckptInUse ){ 1838 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1839 page_add_to_ckpt_list(pPg); 1840 } 1841 }else{ 1842 pPg->needSync = !pPager->journalStarted && !pPager->noSync; 1843 TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync); 1844 } 1845 if( pPg->needSync ){ 1846 pPager->needSync = 1; 1847 } 1848 } 1849 1850 /* If the checkpoint journal is open and the page is not in it, 1851 ** then write the current page to the checkpoint journal. Note that 1852 ** the checkpoint journal always uses the simplier format 2 that lacks 1853 ** checksums. The header is also omitted from the checkpoint journal. 
1854 */ 1855 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){ 1856 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize ); 1857 store32bits(pPg->pgno, pPg, -4); 1858 CODEC(pPager, pData, pPg->pgno, 7); 1859 rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4); 1860 TRACE2("CKPT-JOURNAL %d\n", pPg->pgno); 1861 CODEC(pPager, pData, pPg->pgno, 0); 1862 if( rc!=SQLITE_OK ){ 1863 sqlitepager_rollback(pPager); 1864 pPager->errMask |= PAGER_ERR_FULL; 1865 return rc; 1866 } 1867 pPager->ckptNRec++; 1868 assert( pPager->aInCkpt!=0 ); 1869 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1870 page_add_to_ckpt_list(pPg); 1871 } 1872 1873 /* Update the database size and return. 1874 */ 1875 if( pPager->dbSize<(int)pPg->pgno ){ 1876 pPager->dbSize = pPg->pgno; 1877 } 1878 return rc; 1879 } 1880 1881 /* 1882 ** Return TRUE if the page given in the argument was previously passed 1883 ** to sqlitepager_write(). In other words, return TRUE if it is ok 1884 ** to change the content of the page. 1885 */ 1886 int sqlitepager_iswriteable(void *pData){ 1887 PgHdr *pPg = DATA_TO_PGHDR(pData); 1888 return pPg->dirty; 1889 } 1890 1891 /* 1892 ** Replace the content of a single page with the information in the third 1893 ** argument. 1894 */ 1895 int sqlitepager_overwrite(Pager *pPager, Pgno pgno, void *pData){ 1896 void *pPage; 1897 int rc; 1898 1899 rc = sqlitepager_get(pPager, pgno, &pPage); 1900 if( rc==SQLITE_OK ){ 1901 rc = sqlitepager_write(pPage); 1902 if( rc==SQLITE_OK ){ 1903 memcpy(pPage, pData, SQLITE_PAGE_SIZE); 1904 } 1905 sqlitepager_unref(pPage); 1906 } 1907 return rc; 1908 } 1909 1910 /* 1911 ** A call to this routine tells the pager that it is not necessary to 1912 ** write the information on page "pgno" back to the disk, even though 1913 ** that page might be marked as dirty. 1914 ** 1915 ** The overlying software layer calls this routine when all of the data 1916 ** on the given page is unused. 
The pager marks the page as clean so 1917 ** that it does not get written to disk. 1918 ** 1919 ** Tests show that this optimization, together with the 1920 ** sqlitepager_dont_rollback() below, more than double the speed 1921 ** of large INSERT operations and quadruple the speed of large DELETEs. 1922 ** 1923 ** When this routine is called, set the alwaysRollback flag to true. 1924 ** Subsequent calls to sqlitepager_dont_rollback() for the same page 1925 ** will thereafter be ignored. This is necessary to avoid a problem 1926 ** where a page with data is added to the freelist during one part of 1927 ** a transaction then removed from the freelist during a later part 1928 ** of the same transaction and reused for some other purpose. When it 1929 ** is first added to the freelist, this routine is called. When reused, 1930 ** the dont_rollback() routine is called. But because the page contains 1931 ** critical data, we still need to be sure it gets rolled back in spite 1932 ** of the dont_rollback() call. 1933 */ 1934 void sqlitepager_dont_write(Pager *pPager, Pgno pgno){ 1935 PgHdr *pPg; 1936 1937 pPg = pager_lookup(pPager, pgno); 1938 pPg->alwaysRollback = 1; 1939 if( pPg && pPg->dirty ){ 1940 if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){ 1941 /* If this pages is the last page in the file and the file has grown 1942 ** during the current transaction, then do NOT mark the page as clean. 1943 ** When the database file grows, we must make sure that the last page 1944 ** gets written at least once so that the disk file will be the correct 1945 ** size. If you do not write this page and the size of the file 1946 ** on the disk ends up being too small, that can lead to database 1947 ** corruption during the next transaction. 
1948 */ 1949 }else{ 1950 TRACE2("DONT_WRITE %d\n", pgno); 1951 pPg->dirty = 0; 1952 } 1953 } 1954 } 1955 1956 /* 1957 ** A call to this routine tells the pager that if a rollback occurs, 1958 ** it is not necessary to restore the data on the given page. This 1959 ** means that the pager does not have to record the given page in the 1960 ** rollback journal. 1961 */ 1962 void sqlitepager_dont_rollback(void *pData){ 1963 PgHdr *pPg = DATA_TO_PGHDR(pData); 1964 Pager *pPager = pPg->pPager; 1965 1966 if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return; 1967 if( pPg->alwaysRollback || pPager->alwaysRollback ) return; 1968 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){ 1969 assert( pPager->aInJournal!=0 ); 1970 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1971 pPg->inJournal = 1; 1972 if( pPager->ckptInUse ){ 1973 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1974 page_add_to_ckpt_list(pPg); 1975 } 1976 TRACE2("DONT_ROLLBACK %d\n", pPg->pgno); 1977 } 1978 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){ 1979 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize ); 1980 assert( pPager->aInCkpt!=0 ); 1981 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1982 page_add_to_ckpt_list(pPg); 1983 } 1984 } 1985 1986 /* 1987 ** Commit all changes to the database and release the write lock. 1988 ** 1989 ** If the commit fails for any reason, a rollback attempt is made 1990 ** and an error code is returned. If the commit worked, SQLITE_OK 1991 ** is returned. 
1992 */ 1993 int sqlitepager_commit(Pager *pPager){ 1994 int rc; 1995 PgHdr *pPg; 1996 1997 if( pPager->errMask==PAGER_ERR_FULL ){ 1998 rc = sqlitepager_rollback(pPager); 1999 if( rc==SQLITE_OK ){ 2000 rc = SQLITE_FULL; 2001 } 2002 return rc; 2003 } 2004 if( pPager->errMask!=0 ){ 2005 rc = pager_errcode(pPager); 2006 return rc; 2007 } 2008 if( pPager->state!=SQLITE_WRITELOCK ){ 2009 return SQLITE_ERROR; 2010 } 2011 TRACE1("COMMIT\n"); 2012 if( pPager->dirtyFile==0 ){ 2013 /* Exit early (without doing the time-consuming sqliteOsSync() calls) 2014 ** if there have been no changes to the database file. */ 2015 assert( pPager->needSync==0 ); 2016 rc = pager_unwritelock(pPager); 2017 pPager->dbSize = -1; 2018 return rc; 2019 } 2020 assert( pPager->journalOpen ); 2021 rc = syncJournal(pPager); 2022 if( rc!=SQLITE_OK ){ 2023 goto commit_abort; 2024 } 2025 pPg = pager_get_all_dirty_pages(pPager); 2026 if( pPg ){ 2027 rc = pager_write_pagelist(pPg); 2028 if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){ 2029 goto commit_abort; 2030 } 2031 } 2032 rc = pager_unwritelock(pPager); 2033 pPager->dbSize = -1; 2034 return rc; 2035 2036 /* Jump here if anything goes wrong during the commit process. 2037 */ 2038 commit_abort: 2039 rc = sqlitepager_rollback(pPager); 2040 if( rc==SQLITE_OK ){ 2041 rc = SQLITE_FULL; 2042 } 2043 return rc; 2044 } 2045 2046 /* 2047 ** Rollback all changes. The database falls back to read-only mode. 2048 ** All in-memory cache pages revert to their original data contents. 2049 ** The journal is deleted. 2050 ** 2051 ** This routine cannot fail unless some other process is not following 2052 ** the correct locking protocol (SQLITE_PROTOCOL) or unless some other 2053 ** process is writing trash into the journal file (SQLITE_CORRUPT) or 2054 ** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error 2055 ** codes are returned for all these occasions. Otherwise, 2056 ** SQLITE_OK is returned. 
2057 */ 2058 int sqlitepager_rollback(Pager *pPager){ 2059 int rc; 2060 TRACE1("ROLLBACK\n"); 2061 if( !pPager->dirtyFile || !pPager->journalOpen ){ 2062 rc = pager_unwritelock(pPager); 2063 pPager->dbSize = -1; 2064 return rc; 2065 } 2066 2067 if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){ 2068 if( pPager->state>=SQLITE_WRITELOCK ){ 2069 pager_playback(pPager, 1); 2070 } 2071 return pager_errcode(pPager); 2072 } 2073 if( pPager->state!=SQLITE_WRITELOCK ){ 2074 return SQLITE_OK; 2075 } 2076 rc = pager_playback(pPager, 1); 2077 if( rc!=SQLITE_OK ){ 2078 rc = SQLITE_CORRUPT; 2079 pPager->errMask |= PAGER_ERR_CORRUPT; 2080 } 2081 pPager->dbSize = -1; 2082 return rc; 2083 } 2084 2085 /* 2086 ** Return TRUE if the database file is opened read-only. Return FALSE 2087 ** if the database is (in theory) writable. 2088 */ 2089 int sqlitepager_isreadonly(Pager *pPager){ 2090 return pPager->readOnly; 2091 } 2092 2093 /* 2094 ** This routine is used for testing and analysis only. 2095 */ 2096 int *sqlitepager_stats(Pager *pPager){ 2097 static int a[9]; 2098 a[0] = pPager->nRef; 2099 a[1] = pPager->nPage; 2100 a[2] = pPager->mxPage; 2101 a[3] = pPager->dbSize; 2102 a[4] = pPager->state; 2103 a[5] = pPager->errMask; 2104 a[6] = pPager->nHit; 2105 a[7] = pPager->nMiss; 2106 a[8] = pPager->nOvfl; 2107 return a; 2108 } 2109 2110 /* 2111 ** Set the checkpoint. 2112 ** 2113 ** This routine should be called with the transaction journal already 2114 ** open. A new checkpoint journal is created that can be used to rollback 2115 ** changes of a single SQL command within a larger transaction. 
2116 */ 2117 int sqlitepager_ckpt_begin(Pager *pPager){ 2118 int rc; 2119 char zTemp[SQLITE_TEMPNAME_SIZE]; 2120 if( !pPager->journalOpen ){ 2121 pPager->ckptAutoopen = 1; 2122 return SQLITE_OK; 2123 } 2124 assert( pPager->journalOpen ); 2125 assert( !pPager->ckptInUse ); 2126 pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 ); 2127 if( pPager->aInCkpt==0 ){ 2128 sqliteOsReadLock(&pPager->fd); 2129 return SQLITE_NOMEM; 2130 } 2131 #ifndef NDEBUG 2132 rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize); 2133 if( rc ) goto ckpt_begin_failed; 2134 assert( pPager->ckptJSize == 2135 pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) ); 2136 #endif 2137 pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format) 2138 + JOURNAL_HDR_SZ(journal_format); 2139 pPager->ckptSize = pPager->dbSize; 2140 if( !pPager->ckptOpen ){ 2141 rc = sqlitepager_opentemp(zTemp, &pPager->cpfd); 2142 if( rc ) goto ckpt_begin_failed; 2143 pPager->ckptOpen = 1; 2144 pPager->ckptNRec = 0; 2145 } 2146 pPager->ckptInUse = 1; 2147 return SQLITE_OK; 2148 2149 ckpt_begin_failed: 2150 if( pPager->aInCkpt ){ 2151 sqliteFree(pPager->aInCkpt); 2152 pPager->aInCkpt = 0; 2153 } 2154 return rc; 2155 } 2156 2157 /* 2158 ** Commit a checkpoint. 2159 */ 2160 int sqlitepager_ckpt_commit(Pager *pPager){ 2161 if( pPager->ckptInUse ){ 2162 PgHdr *pPg, *pNext; 2163 sqliteOsSeek(&pPager->cpfd, 0); 2164 /* sqliteOsTruncate(&pPager->cpfd, 0); */ 2165 pPager->ckptNRec = 0; 2166 pPager->ckptInUse = 0; 2167 sqliteFree( pPager->aInCkpt ); 2168 pPager->aInCkpt = 0; 2169 for(pPg=pPager->pCkpt; pPg; pPg=pNext){ 2170 pNext = pPg->pNextCkpt; 2171 assert( pPg->inCkpt ); 2172 pPg->inCkpt = 0; 2173 pPg->pPrevCkpt = pPg->pNextCkpt = 0; 2174 } 2175 pPager->pCkpt = 0; 2176 } 2177 pPager->ckptAutoopen = 0; 2178 return SQLITE_OK; 2179 } 2180 2181 /* 2182 ** Rollback a checkpoint. 
2183 */ 2184 int sqlitepager_ckpt_rollback(Pager *pPager){ 2185 int rc; 2186 if( pPager->ckptInUse ){ 2187 rc = pager_ckpt_playback(pPager); 2188 sqlitepager_ckpt_commit(pPager); 2189 }else{ 2190 rc = SQLITE_OK; 2191 } 2192 pPager->ckptAutoopen = 0; 2193 return rc; 2194 } 2195 2196 /* 2197 ** Return the full pathname of the database file. 2198 */ 2199 const char *sqlitepager_filename(Pager *pPager){ 2200 return pPager->zFilename; 2201 } 2202 2203 /* 2204 ** Set the codec for this pager 2205 */ 2206 void sqlitepager_set_codec( 2207 Pager *pPager, 2208 void (*xCodec)(void*,void*,Pgno,int), 2209 void *pCodecArg 2210 ){ 2211 pPager->xCodec = xCodec; 2212 pPager->pCodecArg = pCodecArg; 2213 } 2214 2215 #ifdef SQLITE_TEST 2216 /* 2217 ** Print a listing of all referenced pages and their ref count. 2218 */ 2219 void sqlitepager_refdump(Pager *pPager){ 2220 PgHdr *pPg; 2221 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 2222 if( pPg->nRef<=0 ) continue; 2223 printf("PAGE %3d addr=0x%08x nRef=%d\n", 2224 pPg->pgno, (int)PGHDR_TO_DATA(pPg), pPg->nRef); 2225 } 2226 } 2227 #endif 2228