1 /* 2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 #pragma ident "%Z%%M% %I% %E% SMI" 7 8 /* 9 ** 2001 September 15 10 ** 11 ** The author disclaims copyright to this source code. In place of 12 ** a legal notice, here is a blessing: 13 ** 14 ** May you do good and not evil. 15 ** May you find forgiveness for yourself and forgive others. 16 ** May you share freely, never taking more than you give. 17 ** 18 ************************************************************************* 19 ** This is the implementation of the page cache subsystem or "pager". 20 ** 21 ** The pager is used to access a database disk file. It implements 22 ** atomic commit and rollback through the use of a journal file that 23 ** is separate from the database file. The pager also implements file 24 ** locking to prevent two processes from writing the same database 25 ** file simultaneously, or one process from reading the database while 26 ** another is writing. 27 ** 28 ** @(#) $Id: pager.c,v 1.101 2004/02/25 02:20:41 drh Exp $ 29 */ 30 #include "os.h" /* Must be first to enable large file support */ 31 #include "sqliteInt.h" 32 #include "pager.h" 33 #include <assert.h> 34 #include <string.h> 35 36 /* 37 ** Macros for troubleshooting. 
Normally turned off
**
** When the "#if 0" below is changed to "#if 1", SET_PAGER() records the
** first pager handed to it in mainPager, CLR_PAGER() forgets that pager
** again, and the TRACEn() macros print to stderr, but only when the
** variable named pPager at the point of use is the recorded pager.  With
** tracing disabled (the normal case) every macro expands to nothing.
**
** NOTE(review): the enabled forms expand to a bare "if" statement, so an
** "else" written right after one of these macros would bind to the
** macro's "if".
*/
#if 0
static Pager *mainPager = 0;
#define SET_PAGER(X)  if( mainPager==0 ) mainPager = (X)
#define CLR_PAGER(X)  if( mainPager==(X) ) mainPager = 0
#define TRACE1(X)     if( pPager==mainPager ) fprintf(stderr,X)
#define TRACE2(X,Y)   if( pPager==mainPager ) fprintf(stderr,X,Y)
#define TRACE3(X,Y,Z) if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
#else
#define SET_PAGER(X)
#define CLR_PAGER(X)
#define TRACE1(X)
#define TRACE2(X,Y)
#define TRACE3(X,Y,Z)
#endif


/*
** The page cache as a whole is always in one of the following
** states:
**
**   SQLITE_UNLOCK       The page cache is not currently reading or
**                       writing the database file.  There is no
**                       data held in memory.  This is the initial
**                       state.
**
**   SQLITE_READLOCK     The page cache is reading the database.
**                       Writing is not permitted.  There can be
**                       multiple readers accessing the same database
**                       file at the same time.
**
**   SQLITE_WRITELOCK    The page cache is writing the database.
**                       Access is exclusive.  No other processes or
**                       threads can be reading or writing while one
**                       process is writing.
**
** NOTE(review): the sqlite_page_*() routine names used below do not match
** the sqlitepager_*() names used by the rest of this file; they presumably
** refer to the corresponding sqlitepager_*() routines - confirm.
**
** The page cache comes up in SQLITE_UNLOCK.  The first time a
** sqlite_page_get() occurs, the state transitions to SQLITE_READLOCK.
** After all pages have been released using sqlite_page_unref(),
** the state transitions back to SQLITE_UNLOCK.  The first time
** that sqlite_page_write() is called, the state transitions to
** SQLITE_WRITELOCK.  (Note that sqlite_page_write() can only be
** called on an outstanding page which means that the pager must
** be in SQLITE_READLOCK before it transitions to SQLITE_WRITELOCK.)
** The sqlite_page_rollback() and sqlite_page_commit() functions
** transition the state from SQLITE_WRITELOCK back to SQLITE_READLOCK.
**
** The numeric order of the three values matters: code in this file
** compares Pager.state against them with "<" and ">=".
*/
#define SQLITE_UNLOCK      0   /* Not reading or writing; the initial state */
#define SQLITE_READLOCK    1   /* Reading the database; writes not permitted */
#define SQLITE_WRITELOCK   2   /* Writing the database; access is exclusive */


/*
** Each in-memory image of a page begins with the following header.
** This header is only visible to this pager module.  The client
** code that calls pager sees only the data that follows the header.
**
** Client code should call sqlitepager_write() on a page prior to making
** any modifications to that page.  The first time sqlitepager_write()
** is called, the original page contents are written into the rollback
** journal and PgHdr.inJournal and PgHdr.needSync are set.  Later, once
** the journal page has made it onto the disk surface, PgHdr.needSync
** is cleared.  The modified page cannot be written back into the original
** database file until the journal page has been synced to disk and
** PgHdr.needSync has been cleared.
**
** The PgHdr.dirty flag is set when sqlitepager_write() is called and
** is cleared again when the page content is written back to the original
** database file.
*/
typedef struct PgHdr PgHdr;
struct PgHdr {
  Pager *pPager;                 /* The pager to which this page belongs */
  Pgno pgno;                     /* The page number for this page */
  PgHdr *pNextHash, *pPrevHash;  /* Hash collision chain for PgHdr.pgno */
  int nRef;                      /* Number of users of this page */
  PgHdr *pNextFree, *pPrevFree;  /* Freelist of pages where nRef==0 */
  PgHdr *pNextAll, *pPrevAll;    /* A list of all pages */
  PgHdr *pNextCkpt, *pPrevCkpt;  /* List of pages in the checkpoint journal */
  u8 inJournal;                  /* TRUE if has been written to journal */
  u8 inCkpt;                     /* TRUE if written to the checkpoint journal */
  u8 dirty;                      /* TRUE if we need to write back changes */
  u8 needSync;                   /* Sync journal before writing this page */
  u8 alwaysRollback;             /* Disable dont_rollback() for this page */
  PgHdr *pDirty;                 /* Dirty pages sorted by PgHdr.pgno */
  /* SQLITE_PAGE_SIZE bytes of page data follow this header */
  /* Pager.nExtra bytes of local data follow the page data */
};


/*
** A macro used for invoking the codec if there is one.
**
** P is the Pager, D is the page data buffer, N is the page number and X
** is an integer that is handed through to Pager.xCodec unchanged.  When
** SQLITE_HAS_CODEC is not defined the macro expands to nothing.
**
** NOTE(review): P is not parenthesized in the expansion and the expansion
** is a bare "if" statement; callers in this file always pass a plain
** identifier and follow the macro with ";".
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC(P,D,N,X) if( P->xCodec ){ P->xCodec(P->pCodecArg,D,N,X); }
#else
# define CODEC(P,D,N,X)
#endif

/*
** Convert a pointer to a PgHdr into a pointer to its data
** and back again.
**
** The page data starts immediately after the PgHdr structure, and the
** extra per-page area starts SQLITE_PAGE_SIZE bytes after that.
*/
#define PGHDR_TO_DATA(P)  ((void*)(&(P)[1]))
#define DATA_TO_PGHDR(D)  (&((PgHdr*)(D))[-1])
#define PGHDR_TO_EXTRA(P) ((void*)&((char*)(&(P)[1]))[SQLITE_PAGE_SIZE])

/*
** How big to make the hash table used for locating in-memory pages
** by page number.  pager_hash() masks with N_PG_HASH-1, so this value
** must be a power of two.
*/
#define N_PG_HASH 2048

/*
** Hash a page number
*/
#define pager_hash(PN)  ((PN)&(N_PG_HASH-1))

/*
** An open page cache is an instance of the following structure.
*/
struct Pager {
  char *zFilename;            /* Name of the database file */
  char *zJournal;             /* Name of the journal file */
  char *zDirectory;           /* Directory holding database and journal files */
  OsFile fd, jfd;             /* File descriptors for database and journal */
  OsFile cpfd;                /* File descriptor for the checkpoint journal */
  int dbSize;                 /* Number of pages in the file (-1 after reset) */
  int origDbSize;             /* dbSize before the current change */
  int ckptSize;               /* Size of database (in pages) at ckpt_begin() */
  off_t ckptJSize;            /* Size of journal at ckpt_begin() */
  int nRec;                   /* Number of pages written to the journal */
  u32 cksumInit;              /* Quasi-random value added to every checksum */
  int ckptNRec;               /* Number of records in the checkpoint journal */
  int nExtra;                 /* Add this many bytes to each in-memory page */
  void (*xDestructor)(void*); /* Call this routine when freeing pages */
  int nPage;                  /* Total number of in-memory pages */
  int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
  int mxPage;                 /* Maximum number of pages to hold in cache */
  int nHit, nMiss, nOvfl;     /* Cache hits, missing, and LRU overflows */
  void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
  void *pCodecArg;            /* First argument to xCodec() */
  u8 journalOpen;             /* True if the journal file descriptor is valid */
  u8 journalStarted;          /* True if header of journal is synced */
  u8 useJournal;              /* Use a rollback journal on this file */
  u8 ckptOpen;                /* True if the checkpoint journal is open */
  u8 ckptInUse;               /* True if we are in a checkpoint */
  u8 ckptAutoopen;            /* Open ckpt journal when main journal is opened*/
  u8 noSync;                  /* Do not sync the journal if true */
  u8 fullSync;                /* Do extra syncs of the journal for robustness */
  u8 state;                   /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
  u8 errMask;                 /* Bitwise OR of the PAGER_ERR_* flags below */
  u8 tempFile;                /* zFilename is a temporary file */
  u8 readOnly;                /* True for a read-only database */
  u8 needSync;                /* True if an fsync() is needed on the journal */
  u8 dirtyFile;               /* True if database file has changed in any way */
  u8 alwaysRollback;          /* Disable dont_rollback() for all pages */
  u8 *aInJournal;             /* One bit for each page in the database file */
  u8 *aInCkpt;                /* One bit for each page in the database */
  PgHdr *pFirst, *pLast;      /* List of free pages */
  PgHdr *pFirstSynced;        /* First free page with PgHdr.needSync==0 */
  PgHdr *pAll;                /* List of all pages */
  PgHdr *pCkpt;               /* List of pages in the checkpoint journal */
  PgHdr *aHash[N_PG_HASH];    /* Hash table mapping page number to PgHdr */
};

/*
** These are bits that can be set in Pager.errMask.  pager_errcode()
** translates them into SQLite result codes.
*/
#define PAGER_ERR_FULL     0x01  /* a write() failed */
#define PAGER_ERR_MEM      0x02  /* malloc() failed */
#define PAGER_ERR_LOCK     0x04  /* error in the locking protocol */
#define PAGER_ERR_CORRUPT  0x08  /* database or journal corruption */
#define PAGER_ERR_DISK     0x10  /* general disk I/O error - bad hard drive? */

/*
** The journal file contains page records in the following
** format.
**
** Actually, this structure is the complete page record for pager
** formats less than 3.  Beginning with format 3, this record is surrounded
** by two checksums.
*/
typedef struct PageRecord PageRecord;
struct PageRecord {
  Pgno pgno;                      /* The page number */
  char aData[SQLITE_PAGE_SIZE];   /* Original data for page pgno */
};

/*
** Journal files begin with the following magic string.  The data
** was obtained from /dev/random.  It is used only as a sanity check.
**
** There are three journal formats (so far). The 1st journal format writes
** 32-bit integers in the byte-order of the host machine.  New
** formats write integers as big-endian.
All new journals use the
** new format, but we have to be able to read an older journal in order
** to rollback journals created by older versions of the library.
**
** The 3rd journal format (added for 2.8.0) adds additional sanity
** checking information to the journal.  If the power fails while the
** journal is being written, semi-random garbage data might appear in
** the journal file after power is restored.  If an attempt is then made
** to roll the journal back, the database could be corrupted.  The additional
** sanity checking data is an attempt to discover the garbage in the
** journal and ignore it.
**
** The sanity checking information for the 3rd journal format consists
** of a 32-bit checksum on each page of data.  The checksum covers both
** the page number and the SQLITE_PAGE_SIZE bytes of data for the page.
** This cksum is initialized to a 32-bit random value that appears in the
** journal file right after the header.  The random initializer is important,
** because garbage data that appears at the end of a journal is likely
** data that was once in other files that have now been deleted.  If the
** garbage data came from an obsolete journal file, the checksums might
** be correct.  But by initializing the checksum to a random value which
** is different for every journal, we minimize that risk.
**
** The three magic strings below differ only in their final byte
** (0xd4, 0xd5 and 0xd6 for formats 1, 2 and 3 respectively).
*/
static const unsigned char aJournalMagic1[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd4,
};
static const unsigned char aJournalMagic2[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd5,
};
static const unsigned char aJournalMagic3[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd6,
};
#define JOURNAL_FORMAT_1 1
#define JOURNAL_FORMAT_2 2
#define JOURNAL_FORMAT_3 3

/*
** The following integer determines what format to use when creating
** new primary journal files.  By default we always use format 3.
272 ** When testing, we can set this value to older journal formats in order to 273 ** make sure that newer versions of the library are able to rollback older 274 ** journal files. 275 ** 276 ** Note that checkpoint journals always use format 2 and omit the header. 277 */ 278 #ifdef SQLITE_TEST 279 int journal_format = 3; 280 #else 281 # define journal_format 3 282 #endif 283 284 /* 285 ** The size of the header and of each page in the journal varies according 286 ** to which journal format is being used. The following macros figure out 287 ** the sizes based on format numbers. 288 */ 289 #define JOURNAL_HDR_SZ(X) \ 290 (sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3)*2*sizeof(u32)) 291 #define JOURNAL_PG_SZ(X) \ 292 (SQLITE_PAGE_SIZE + sizeof(Pgno) + ((X)>=3)*sizeof(u32)) 293 294 /* 295 ** Enable reference count tracking here: 296 */ 297 #ifdef SQLITE_TEST 298 int pager_refinfo_enable = 0; 299 static void pager_refinfo(PgHdr *p){ 300 static int cnt = 0; 301 if( !pager_refinfo_enable ) return; 302 printf( 303 "REFCNT: %4d addr=0x%08x nRef=%d\n", 304 p->pgno, (int)PGHDR_TO_DATA(p), p->nRef 305 ); 306 cnt++; /* Something to set a breakpoint on */ 307 } 308 # define REFINFO(X) pager_refinfo(X) 309 #else 310 # define REFINFO(X) 311 #endif 312 313 /* 314 ** Read a 32-bit integer from the given file descriptor. Store the integer 315 ** that is read in *pRes. Return SQLITE_OK if everything worked, or an 316 ** error code is something goes wrong. 317 ** 318 ** If the journal format is 2 or 3, read a big-endian integer. If the 319 ** journal format is 1, read an integer in the native byte-order of the 320 ** host machine. 
321 */ 322 static int read32bits(int format, OsFile *fd, u32 *pRes){ 323 u32 res; 324 int rc; 325 rc = sqliteOsRead(fd, &res, sizeof(res)); 326 if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){ 327 unsigned char ac[4]; 328 memcpy(ac, &res, 4); 329 res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3]; 330 } 331 *pRes = res; 332 return rc; 333 } 334 335 /* 336 ** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK 337 ** on success or an error code is something goes wrong. 338 ** 339 ** If the journal format is 2 or 3, write the integer as 4 big-endian 340 ** bytes. If the journal format is 1, write the integer in the native 341 ** byte order. In normal operation, only formats 2 and 3 are used. 342 ** Journal format 1 is only used for testing. 343 */ 344 static int write32bits(OsFile *fd, u32 val){ 345 unsigned char ac[4]; 346 if( journal_format<=1 ){ 347 return sqliteOsWrite(fd, &val, 4); 348 } 349 ac[0] = (val>>24) & 0xff; 350 ac[1] = (val>>16) & 0xff; 351 ac[2] = (val>>8) & 0xff; 352 ac[3] = val & 0xff; 353 return sqliteOsWrite(fd, ac, 4); 354 } 355 356 /* 357 ** Write a 32-bit integer into a page header right before the 358 ** page data. This will overwrite the PgHdr.pDirty pointer. 359 ** 360 ** The integer is big-endian for formats 2 and 3 and native byte order 361 ** for journal format 1. 362 */ 363 static void store32bits(u32 val, PgHdr *p, int offset){ 364 unsigned char *ac; 365 ac = &((unsigned char*)PGHDR_TO_DATA(p))[offset]; 366 if( journal_format<=1 ){ 367 memcpy(ac, &val, 4); 368 }else{ 369 ac[0] = (val>>24) & 0xff; 370 ac[1] = (val>>16) & 0xff; 371 ac[2] = (val>>8) & 0xff; 372 ac[3] = val & 0xff; 373 } 374 } 375 376 377 /* 378 ** Convert the bits in the pPager->errMask into an approprate 379 ** return code. 
380 */ 381 static int pager_errcode(Pager *pPager){ 382 int rc = SQLITE_OK; 383 if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL; 384 if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR; 385 if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL; 386 if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM; 387 if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT; 388 return rc; 389 } 390 391 /* 392 ** Add or remove a page from the list of all pages that are in the 393 ** checkpoint journal. 394 ** 395 ** The Pager keeps a separate list of pages that are currently in 396 ** the checkpoint journal. This helps the sqlitepager_ckpt_commit() 397 ** routine run MUCH faster for the common case where there are many 398 ** pages in memory but only a few are in the checkpoint journal. 399 */ 400 static void page_add_to_ckpt_list(PgHdr *pPg){ 401 Pager *pPager = pPg->pPager; 402 if( pPg->inCkpt ) return; 403 assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 ); 404 pPg->pPrevCkpt = 0; 405 if( pPager->pCkpt ){ 406 pPager->pCkpt->pPrevCkpt = pPg; 407 } 408 pPg->pNextCkpt = pPager->pCkpt; 409 pPager->pCkpt = pPg; 410 pPg->inCkpt = 1; 411 } 412 static void page_remove_from_ckpt_list(PgHdr *pPg){ 413 if( !pPg->inCkpt ) return; 414 if( pPg->pPrevCkpt ){ 415 assert( pPg->pPrevCkpt->pNextCkpt==pPg ); 416 pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt; 417 }else{ 418 assert( pPg->pPager->pCkpt==pPg ); 419 pPg->pPager->pCkpt = pPg->pNextCkpt; 420 } 421 if( pPg->pNextCkpt ){ 422 assert( pPg->pNextCkpt->pPrevCkpt==pPg ); 423 pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt; 424 } 425 pPg->pNextCkpt = 0; 426 pPg->pPrevCkpt = 0; 427 pPg->inCkpt = 0; 428 } 429 430 /* 431 ** Find a page in the hash table given its page number. Return 432 ** a pointer to the page or NULL if not found. 
433 */ 434 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){ 435 PgHdr *p = pPager->aHash[pager_hash(pgno)]; 436 while( p && p->pgno!=pgno ){ 437 p = p->pNextHash; 438 } 439 return p; 440 } 441 442 /* 443 ** Unlock the database and clear the in-memory cache. This routine 444 ** sets the state of the pager back to what it was when it was first 445 ** opened. Any outstanding pages are invalidated and subsequent attempts 446 ** to access those pages will likely result in a coredump. 447 */ 448 static void pager_reset(Pager *pPager){ 449 PgHdr *pPg, *pNext; 450 for(pPg=pPager->pAll; pPg; pPg=pNext){ 451 pNext = pPg->pNextAll; 452 sqliteFree(pPg); 453 } 454 pPager->pFirst = 0; 455 pPager->pFirstSynced = 0; 456 pPager->pLast = 0; 457 pPager->pAll = 0; 458 memset(pPager->aHash, 0, sizeof(pPager->aHash)); 459 pPager->nPage = 0; 460 if( pPager->state>=SQLITE_WRITELOCK ){ 461 sqlitepager_rollback(pPager); 462 } 463 sqliteOsUnlock(&pPager->fd); 464 pPager->state = SQLITE_UNLOCK; 465 pPager->dbSize = -1; 466 pPager->nRef = 0; 467 assert( pPager->journalOpen==0 ); 468 } 469 470 /* 471 ** When this routine is called, the pager has the journal file open and 472 ** a write lock on the database. This routine releases the database 473 ** write lock and acquires a read lock in its place. The journal file 474 ** is deleted and closed. 475 ** 476 ** TODO: Consider keeping the journal file open for temporary databases. 477 ** This might give a performance improvement on windows where opening 478 ** a file is an expensive operation. 
479 */ 480 static int pager_unwritelock(Pager *pPager){ 481 int rc; 482 PgHdr *pPg; 483 if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK; 484 sqlitepager_ckpt_commit(pPager); 485 if( pPager->ckptOpen ){ 486 sqliteOsClose(&pPager->cpfd); 487 pPager->ckptOpen = 0; 488 } 489 if( pPager->journalOpen ){ 490 sqliteOsClose(&pPager->jfd); 491 pPager->journalOpen = 0; 492 sqliteOsDelete(pPager->zJournal); 493 sqliteFree( pPager->aInJournal ); 494 pPager->aInJournal = 0; 495 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 496 pPg->inJournal = 0; 497 pPg->dirty = 0; 498 pPg->needSync = 0; 499 } 500 }else{ 501 assert( pPager->dirtyFile==0 || pPager->useJournal==0 ); 502 } 503 rc = sqliteOsReadLock(&pPager->fd); 504 if( rc==SQLITE_OK ){ 505 pPager->state = SQLITE_READLOCK; 506 }else{ 507 /* This can only happen if a process does a BEGIN, then forks and the 508 ** child process does the COMMIT. Because of the semantics of unix 509 ** file locking, the unlock will fail. 510 */ 511 pPager->state = SQLITE_UNLOCK; 512 } 513 return rc; 514 } 515 516 /* 517 ** Compute and return a checksum for the page of data. 518 ** 519 ** This is not a real checksum. It is really just the sum of the 520 ** random initial value and the page number. We considered do a checksum 521 ** of the database, but that was found to be too slow. 522 */ 523 static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){ 524 u32 cksum = pPager->cksumInit + pgno; 525 return cksum; 526 } 527 528 /* 529 ** Read a single page from the journal file opened on file descriptor 530 ** jfd. Playback this one page. 531 ** 532 ** There are three different journal formats. The format parameter determines 533 ** which format is used by the journal that is played back. 
534 */ 535 static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){ 536 int rc; 537 PgHdr *pPg; /* An existing page in the cache */ 538 PageRecord pgRec; 539 u32 cksum; 540 541 rc = read32bits(format, jfd, &pgRec.pgno); 542 if( rc!=SQLITE_OK ) return rc; 543 rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData)); 544 if( rc!=SQLITE_OK ) return rc; 545 546 /* Sanity checking on the page. This is more important that I originally 547 ** thought. If a power failure occurs while the journal is being written, 548 ** it could cause invalid data to be written into the journal. We need to 549 ** detect this invalid data (with high probability) and ignore it. 550 */ 551 if( pgRec.pgno==0 ){ 552 return SQLITE_DONE; 553 } 554 if( pgRec.pgno>(unsigned)pPager->dbSize ){ 555 return SQLITE_OK; 556 } 557 if( format>=JOURNAL_FORMAT_3 ){ 558 rc = read32bits(format, jfd, &cksum); 559 if( rc ) return rc; 560 if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){ 561 return SQLITE_DONE; 562 } 563 } 564 565 /* Playback the page. Update the in-memory copy of the page 566 ** at the same time, if there is one. 567 */ 568 pPg = pager_lookup(pPager, pgRec.pgno); 569 TRACE2("PLAYBACK %d\n", pgRec.pgno); 570 sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE); 571 rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE); 572 if( pPg ){ 573 /* No page should ever be rolled back that is in use, except for page 574 ** 1 which is held in use in order to keep the lock on the database 575 ** active. However, such a page may be rolled back as a result of an 576 ** internal error resulting in an automatic call to 577 ** sqlitepager_rollback(), so we can't assert() it. 
578 */ 579 /* assert( pPg->nRef==0 || pPg->pgno==1 ) */ 580 memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE); 581 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra); 582 pPg->dirty = 0; 583 pPg->needSync = 0; 584 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3); 585 } 586 return rc; 587 } 588 589 /* 590 ** Playback the journal and thus restore the database file to 591 ** the state it was in before we started making changes. 592 ** 593 ** The journal file format is as follows: 594 ** 595 ** * 8 byte prefix. One of the aJournalMagic123 vectors defined 596 ** above. The format of the journal file is determined by which 597 ** of the three prefix vectors is seen. 598 ** * 4 byte big-endian integer which is the number of valid page records 599 ** in the journal. If this value is 0xffffffff, then compute the 600 ** number of page records from the journal size. This field appears 601 ** in format 3 only. 602 ** * 4 byte big-endian integer which is the initial value for the 603 ** sanity checksum. This field appears in format 3 only. 604 ** * 4 byte integer which is the number of pages to truncate the 605 ** database to during a rollback. 606 ** * Zero or more pages instances, each as follows: 607 ** + 4 byte page number. 608 ** + SQLITE_PAGE_SIZE bytes of data. 609 ** + 4 byte checksum (format 3 only) 610 ** 611 ** When we speak of the journal header, we mean the first 4 bullets above. 612 ** Each entry in the journal is an instance of the 5th bullet. Note that 613 ** bullets 2 and 3 only appear in format-3 journals. 614 ** 615 ** Call the value from the second bullet "nRec". nRec is the number of 616 ** valid page entries in the journal. In most cases, you can compute the 617 ** value of nRec from the size of the journal file. But if a power 618 ** failure occurred while the journal was being written, it could be the 619 ** case that the size of the journal file had already been increased but 620 ** the extra entries had not yet made it safely to disk. 
In such a case,
** the value of nRec computed from the file size would be too large.  For
** that reason, we always use the nRec value in the header.
**
** If the nRec value is 0xffffffff it means that nRec should be computed
** from the file size.  This value is used when the user selects the
** no-sync option for the journal.  A power failure could lead to corruption
** in this case.  But for things like temporary tables (which will be
** deleted when the power is restored) we don't care.
**
** Journal formats 1 and 2 do not have an nRec value in the header so we
** have to compute nRec from the file size.  This has risks (as described
** above) which is why all persistent tables have been changed to use
** format 3.
**
** If useJournalSize is true, nRec is computed from the file size even
** for a format 3 journal.
**
** If the file opened as the journal file is not a well-formed
** journal file then the database will likely already be
** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
** and SQLITE_CORRUPT is returned.  If it all works, then this routine
** returns SQLITE_OK.
**
** On every path this routine finishes by calling pager_unwritelock().
*/
static int pager_playback(Pager *pPager, int useJournalSize){
  off_t szJ;               /* Size of the journal file in bytes */
  int nRec;                /* Number of Records in the journal */
  int i;                   /* Loop counter */
  Pgno mxPg = 0;           /* Size of the original file in pages */
  int format;              /* Format of the journal file. */
  unsigned char aMagic[sizeof(aJournalMagic1)];
  int rc;

  /* Figure out how many records are in the journal.  Abort early if
  ** the journal is empty.
  */
  assert( pPager->journalOpen );
  sqliteOsSeek(&pPager->jfd, 0);
  rc = sqliteOsFileSize(&pPager->jfd, &szJ);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }

  /* If the journal file is too small to contain a complete header,
  ** it must mean that the process that created the journal was just
  ** beginning to write the journal file when it died.  In that case,
  ** the database file should have still been completely unchanged.
  ** Nothing needs to be rolled back.  We can safely ignore this journal.
  */
  if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
    goto end_playback;
  }

  /* Read the beginning of the journal and truncate the
  ** database file back to its original size.
  */
  rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
  if( rc!=SQLITE_OK ){
    rc = SQLITE_PROTOCOL;
    goto end_playback;
  }
  /* The magic string identifies the journal format */
  if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_3;
  }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_2;
  }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_1;
  }else{
    rc = SQLITE_PROTOCOL;
    goto end_playback;
  }
  if( format>=JOURNAL_FORMAT_3 ){
    if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
      /* Ignore the journal if it is too small to contain a complete
      ** header.  We already did this test once above, but at the prior
      ** test, we did not know the journal format and so we had to assume
      ** the smallest possible header.  Now we know the header is bigger
      ** than the minimum so we test again.
      */
      goto end_playback;
    }
    rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
    if( rc ) goto end_playback;
    rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
    if( rc ) goto end_playback;
    if( nRec==0xffffffff || useJournalSize ){
      nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3);
    }
  }else{
    nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2);
    assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
  }
  rc = read32bits(format, &pPager->jfd, &mxPg);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }
  assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
  rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }
  pPager->dbSize = mxPg;

  /* Copy original pages out of the journal and back into the database file.
  ** SQLITE_DONE from the helper ends the loop early but is not an error.
  */
  for(i=0; i<nRec; i++){
    rc = pager_playback_one_page(pPager, &pPager->jfd, format);
    if( rc!=SQLITE_OK ){
      if( rc==SQLITE_DONE ){
        rc = SQLITE_OK;
      }
      break;
    }
  }

  /* Pages that have been written to the journal but never synced
  ** were not restored by the loop above.  We have to restore those
  ** pages by reading them back from the original database.
  */
  if( rc==SQLITE_OK ){
    PgHdr *pPg;
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      char zBuf[SQLITE_PAGE_SIZE];
      if( !pPg->dirty ) continue;
      if( (int)pPg->pgno <= pPager->origDbSize ){
        sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
        rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
        TRACE2("REFETCH %d\n", pPg->pgno);
        /* NOTE(review): the codec runs on zBuf before rc is examined */
        CODEC(pPager, zBuf, pPg->pgno, 2);
        if( rc ) break;
      }else{
        /* The page did not exist in the original file: treat as zeros */
        memset(zBuf, 0, SQLITE_PAGE_SIZE);
      }
      if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){
        memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE);
        memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
      }
      pPg->needSync = 0;
      pPg->dirty = 0;
    }
  }

end_playback:
  if( rc!=SQLITE_OK ){
    /* Any failure, whatever its cause, is reported as corruption */
    pager_unwritelock(pPager);
    pPager->errMask |= PAGER_ERR_CORRUPT;
    rc = SQLITE_CORRUPT;
  }else{
    rc = pager_unwritelock(pPager);
  }
  return rc;
}

/*
** Playback the checkpoint journal.
**
** This is similar to playing back the transaction journal but with
** a few extra twists.
**
**    (1)  The number of pages in the database file at the start of
**         the checkpoint is stored in pPager->ckptSize, not in the
**         journal file itself.
**
**    (2)  In addition to playing back the checkpoint journal, also
**         playback all pages of the transaction journal beginning
**         at offset pPager->ckptJSize.
**
** Returns SQLITE_OK on success.  On any failure the PAGER_ERR_CORRUPT bit
** is set in pPager->errMask and SQLITE_CORRUPT is returned.
*/
static int pager_ckpt_playback(Pager *pPager){
  off_t szJ;               /* Size of the full journal */
  int nRec;                /* Number of Records */
  int i;                   /* Loop counter */
  int rc;

  /* Truncate the database back to its original size.
  **
  ** NOTE(review): the result of sqliteOsTruncate() stored in rc here is
  ** always overwritten below before it is examined.
  */
  rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
  pPager->dbSize = pPager->ckptSize;

  /* Figure out how many records are in the checkpoint journal.
  */
  assert( pPager->ckptInUse && pPager->journalOpen );
  sqliteOsSeek(&pPager->cpfd, 0);
  nRec = pPager->ckptNRec;

  /* Copy original pages out of the checkpoint journal and back into the
  ** database file.  Note that the checkpoint journal always uses format
  ** 2 instead of format 3 since it does not need to be concerned with
  ** power failures corrupting the journal and can thus omit the checksums.
  */
  for(i=nRec-1; i>=0; i--){
    rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
    assert( rc!=SQLITE_DONE );
    if( rc!=SQLITE_OK ) goto end_ckpt_playback;
  }

  /* Figure out how many pages need to be copied out of the transaction
  ** journal.
  */
  rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
  if( rc!=SQLITE_OK ){
    goto end_ckpt_playback;
  }
  rc = sqliteOsFileSize(&pPager->jfd, &szJ);
  if( rc!=SQLITE_OK ){
    goto end_ckpt_playback;
  }
  nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format);
  for(i=nRec-1; i>=0; i--){
    rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format);
    if( rc!=SQLITE_OK ){
      assert( rc!=SQLITE_DONE );
      goto end_ckpt_playback;
    }
  }

end_ckpt_playback:
  if( rc!=SQLITE_OK ){
    pPager->errMask |= PAGER_ERR_CORRUPT;
    rc = SQLITE_CORRUPT;
  }
  return rc;
}

/*
** Change the maximum number of in-memory pages that are allowed.
**
** The maximum number is the absolute value of the mxPage parameter.
** If mxPage is negative, the noSync flag is also set.  noSync bypasses
** calls to sqliteOsSync().  The pager runs much faster with noSync on,
** but if the operating system crashes or there is an abrupt power
** failure, the database file might be left in an inconsistent and
** unrepairable state.
849 */ 850 void sqlitepager_set_cachesize(Pager *pPager, int mxPage){ 851 if( mxPage>=0 ){ 852 pPager->noSync = pPager->tempFile; 853 if( pPager->noSync==0 ) pPager->needSync = 0; 854 }else{ 855 pPager->noSync = 1; 856 mxPage = -mxPage; 857 } 858 if( mxPage>10 ){ 859 pPager->mxPage = mxPage; 860 } 861 } 862 863 /* 864 ** Adjust the robustness of the database to damage due to OS crashes 865 ** or power failures by changing the number of syncs()s when writing 866 ** the rollback journal. There are three levels: 867 ** 868 ** OFF sqliteOsSync() is never called. This is the default 869 ** for temporary and transient files. 870 ** 871 ** NORMAL The journal is synced once before writes begin on the 872 ** database. This is normally adequate protection, but 873 ** it is theoretically possible, though very unlikely, 874 ** that an inopertune power failure could leave the journal 875 ** in a state which would cause damage to the database 876 ** when it is rolled back. 877 ** 878 ** FULL The journal is synced twice before writes begin on the 879 ** database (with some additional information - the nRec field 880 ** of the journal header - being written in between the two 881 ** syncs). If we assume that writing a 882 ** single disk sector is atomic, then this mode provides 883 ** assurance that the journal will not be corrupted to the 884 ** point of causing damage to the database during rollback. 885 ** 886 ** Numeric values associated with these states are OFF==1, NORMAL=2, 887 ** and FULL=3. 888 */ 889 void sqlitepager_set_safety_level(Pager *pPager, int level){ 890 pPager->noSync = level==1 || pPager->tempFile; 891 pPager->fullSync = level==3 && !pPager->tempFile; 892 if( pPager->noSync==0 ) pPager->needSync = 0; 893 } 894 895 /* 896 ** Open a temporary file. Write the name of the file into zName 897 ** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write 898 ** the file descriptor into *fd. 
Return SQLITE_OK on success or some 899 ** other error code if we fail. 900 ** 901 ** The OS will automatically delete the temporary file when it is 902 ** closed. 903 */ 904 static int sqlitepager_opentemp(char *zFile, OsFile *fd){ 905 int cnt = 8; 906 int rc; 907 do{ 908 cnt--; 909 sqliteOsTempFileName(zFile); 910 rc = sqliteOsOpenExclusive(zFile, fd, 1); 911 }while( cnt>0 && rc!=SQLITE_OK ); 912 return rc; 913 } 914 915 /* 916 ** Create a new page cache and put a pointer to the page cache in *ppPager. 917 ** The file to be cached need not exist. The file is not locked until 918 ** the first call to sqlitepager_get() and is only held open until the 919 ** last page is released using sqlitepager_unref(). 920 ** 921 ** If zFilename is NULL then a randomly-named temporary file is created 922 ** and used as the file to be cached. The file will be deleted 923 ** automatically when it is closed. 924 */ 925 int sqlitepager_open( 926 Pager **ppPager, /* Return the Pager structure here */ 927 const char *zFilename, /* Name of the database file to open */ 928 int mxPage, /* Max number of in-memory cache pages */ 929 int nExtra, /* Extra bytes append to each in-memory page */ 930 int useJournal /* TRUE to use a rollback journal on this file */ 931 ){ 932 Pager *pPager; 933 char *zFullPathname; 934 int nameLen; 935 OsFile fd; 936 int rc, i; 937 int tempFile; 938 int readOnly = 0; 939 char zTemp[SQLITE_TEMPNAME_SIZE]; 940 941 *ppPager = 0; 942 if( sqlite_malloc_failed ){ 943 return SQLITE_NOMEM; 944 } 945 if( zFilename && zFilename[0] ){ 946 zFullPathname = sqliteOsFullPathname(zFilename); 947 rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly); 948 tempFile = 0; 949 }else{ 950 rc = sqlitepager_opentemp(zTemp, &fd); 951 zFilename = zTemp; 952 zFullPathname = sqliteOsFullPathname(zFilename); 953 tempFile = 1; 954 } 955 if( sqlite_malloc_failed ){ 956 return SQLITE_NOMEM; 957 } 958 if( rc!=SQLITE_OK ){ 959 sqliteFree(zFullPathname); 960 return SQLITE_CANTOPEN; 961 } 962 
nameLen = strlen(zFullPathname); 963 pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 ); 964 if( pPager==0 ){ 965 sqliteOsClose(&fd); 966 sqliteFree(zFullPathname); 967 return SQLITE_NOMEM; 968 } 969 SET_PAGER(pPager); 970 pPager->zFilename = (char*)&pPager[1]; 971 pPager->zDirectory = &pPager->zFilename[nameLen+1]; 972 pPager->zJournal = &pPager->zDirectory[nameLen+1]; 973 strcpy(pPager->zFilename, zFullPathname); 974 strcpy(pPager->zDirectory, zFullPathname); 975 for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){} 976 if( i>0 ) pPager->zDirectory[i-1] = 0; 977 strcpy(pPager->zJournal, zFullPathname); 978 sqliteFree(zFullPathname); 979 strcpy(&pPager->zJournal[nameLen], "-journal"); 980 pPager->fd = fd; 981 pPager->journalOpen = 0; 982 pPager->useJournal = useJournal; 983 pPager->ckptOpen = 0; 984 pPager->ckptInUse = 0; 985 pPager->nRef = 0; 986 pPager->dbSize = -1; 987 pPager->ckptSize = 0; 988 pPager->ckptJSize = 0; 989 pPager->nPage = 0; 990 pPager->mxPage = mxPage>5 ? mxPage : 10; 991 pPager->state = SQLITE_UNLOCK; 992 pPager->errMask = 0; 993 pPager->tempFile = tempFile; 994 pPager->readOnly = readOnly; 995 pPager->needSync = 0; 996 pPager->noSync = pPager->tempFile || !useJournal; 997 pPager->pFirst = 0; 998 pPager->pFirstSynced = 0; 999 pPager->pLast = 0; 1000 pPager->nExtra = nExtra; 1001 memset(pPager->aHash, 0, sizeof(pPager->aHash)); 1002 *ppPager = pPager; 1003 return SQLITE_OK; 1004 } 1005 1006 /* 1007 ** Set the destructor for this pager. If not NULL, the destructor is called 1008 ** when the reference count on each page reaches zero. The destructor can 1009 ** be used to clean up information in the extra segment appended to each page. 1010 ** 1011 ** The destructor is not called as a result sqlitepager_close(). 1012 ** Destructors are only called by sqlitepager_unref(). 
1013 */ 1014 void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){ 1015 pPager->xDestructor = xDesc; 1016 } 1017 1018 /* 1019 ** Return the total number of pages in the disk file associated with 1020 ** pPager. 1021 */ 1022 int sqlitepager_pagecount(Pager *pPager){ 1023 off_t n; 1024 assert( pPager!=0 ); 1025 if( pPager->dbSize>=0 ){ 1026 return pPager->dbSize; 1027 } 1028 if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){ 1029 pPager->errMask |= PAGER_ERR_DISK; 1030 return 0; 1031 } 1032 n /= SQLITE_PAGE_SIZE; 1033 if( pPager->state!=SQLITE_UNLOCK ){ 1034 pPager->dbSize = n; 1035 } 1036 return n; 1037 } 1038 1039 /* 1040 ** Forward declaration 1041 */ 1042 static int syncJournal(Pager*); 1043 1044 /* 1045 ** Truncate the file to the number of pages specified. 1046 */ 1047 int sqlitepager_truncate(Pager *pPager, Pgno nPage){ 1048 int rc; 1049 if( pPager->dbSize<0 ){ 1050 sqlitepager_pagecount(pPager); 1051 } 1052 if( pPager->errMask!=0 ){ 1053 rc = pager_errcode(pPager); 1054 return rc; 1055 } 1056 if( nPage>=(unsigned)pPager->dbSize ){ 1057 return SQLITE_OK; 1058 } 1059 syncJournal(pPager); 1060 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage); 1061 if( rc==SQLITE_OK ){ 1062 pPager->dbSize = nPage; 1063 } 1064 return rc; 1065 } 1066 1067 /* 1068 ** Shutdown the page cache. Free all memory and close all files. 1069 ** 1070 ** If a transaction was in progress when this routine is called, that 1071 ** transaction is rolled back. All outstanding pages are invalidated 1072 ** and their memory is freed. Any attempt to use a page associated 1073 ** with this page cache after this function returns will likely 1074 ** result in a coredump. 
1075 */ 1076 int sqlitepager_close(Pager *pPager){ 1077 PgHdr *pPg, *pNext; 1078 switch( pPager->state ){ 1079 case SQLITE_WRITELOCK: { 1080 sqlitepager_rollback(pPager); 1081 sqliteOsUnlock(&pPager->fd); 1082 assert( pPager->journalOpen==0 ); 1083 break; 1084 } 1085 case SQLITE_READLOCK: { 1086 sqliteOsUnlock(&pPager->fd); 1087 break; 1088 } 1089 default: { 1090 /* Do nothing */ 1091 break; 1092 } 1093 } 1094 for(pPg=pPager->pAll; pPg; pPg=pNext){ 1095 pNext = pPg->pNextAll; 1096 sqliteFree(pPg); 1097 } 1098 sqliteOsClose(&pPager->fd); 1099 assert( pPager->journalOpen==0 ); 1100 /* Temp files are automatically deleted by the OS 1101 ** if( pPager->tempFile ){ 1102 ** sqliteOsDelete(pPager->zFilename); 1103 ** } 1104 */ 1105 CLR_PAGER(pPager); 1106 if( pPager->zFilename!=(char*)&pPager[1] ){ 1107 assert( 0 ); /* Cannot happen */ 1108 sqliteFree(pPager->zFilename); 1109 sqliteFree(pPager->zJournal); 1110 sqliteFree(pPager->zDirectory); 1111 } 1112 sqliteFree(pPager); 1113 return SQLITE_OK; 1114 } 1115 1116 /* 1117 ** Return the page number for the given page data. 1118 */ 1119 Pgno sqlitepager_pagenumber(void *pData){ 1120 PgHdr *p = DATA_TO_PGHDR(pData); 1121 return p->pgno; 1122 } 1123 1124 /* 1125 ** Increment the reference count for a page. If the page is 1126 ** currently on the freelist (the reference count is zero) then 1127 ** remove it from the freelist. 1128 */ 1129 #define page_ref(P) ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++) 1130 static void _page_ref(PgHdr *pPg){ 1131 if( pPg->nRef==0 ){ 1132 /* The page is currently on the freelist. Remove it. 
*/ 1133 if( pPg==pPg->pPager->pFirstSynced ){ 1134 PgHdr *p = pPg->pNextFree; 1135 while( p && p->needSync ){ p = p->pNextFree; } 1136 pPg->pPager->pFirstSynced = p; 1137 } 1138 if( pPg->pPrevFree ){ 1139 pPg->pPrevFree->pNextFree = pPg->pNextFree; 1140 }else{ 1141 pPg->pPager->pFirst = pPg->pNextFree; 1142 } 1143 if( pPg->pNextFree ){ 1144 pPg->pNextFree->pPrevFree = pPg->pPrevFree; 1145 }else{ 1146 pPg->pPager->pLast = pPg->pPrevFree; 1147 } 1148 pPg->pPager->nRef++; 1149 } 1150 pPg->nRef++; 1151 REFINFO(pPg); 1152 } 1153 1154 /* 1155 ** Increment the reference count for a page. The input pointer is 1156 ** a reference to the page data. 1157 */ 1158 int sqlitepager_ref(void *pData){ 1159 PgHdr *pPg = DATA_TO_PGHDR(pData); 1160 page_ref(pPg); 1161 return SQLITE_OK; 1162 } 1163 1164 /* 1165 ** Sync the journal. In other words, make sure all the pages that have 1166 ** been written to the journal have actually reached the surface of the 1167 ** disk. It is not safe to modify the original database file until after 1168 ** the journal has been synced. If the original database is modified before 1169 ** the journal is synced and a power failure occurs, the unsynced journal 1170 ** data would be lost and we would be unable to completely rollback the 1171 ** database changes. Database corruption would occur. 1172 ** 1173 ** This routine also updates the nRec field in the header of the journal. 1174 ** (See comments on the pager_playback() routine for additional information.) 1175 ** If the sync mode is FULL, two syncs will occur. First the whole journal 1176 ** is synced, then the nRec field is updated, then a second sync occurs. 1177 ** 1178 ** For temporary databases, we do not care if we are able to rollback 1179 ** after a power failure, so sync occurs. 1180 ** 1181 ** This routine clears the needSync field of every page current held in 1182 ** memory. 
*/
static int syncJournal(Pager *pPager){
  PgHdr *pPg;
  int rc = SQLITE_OK;

  /* Sync the journal before modifying the main database
  ** (assuming there is a journal and it needs to be synced.)
  */
  if( pPager->needSync ){
    if( !pPager->tempFile ){
      assert( pPager->journalOpen );
      /* assert( !pPager->noSync ); // noSync might be set if synchronous
      ** was turned off after the transaction was started.  Ticket #615 */
#ifndef NDEBUG
      {
        /* Make sure the pPager->nRec counter we are keeping agrees
        ** with the nRec computed from the size of the journal file.
        */
        off_t hdrSz, pgSz, jSz;
        hdrSz = JOURNAL_HDR_SZ(journal_format);
        pgSz = JOURNAL_PG_SZ(journal_format);
        rc = sqliteOsFileSize(&pPager->jfd, &jSz);
        if( rc!=0 ) return rc;
        assert( pPager->nRec*pgSz+hdrSz==jSz );
      }
#endif
      if( journal_format>=3 ){
        /* Write the nRec value into the journal file header */
        off_t szJ;
        if( pPager->fullSync ){
          /* FULL mode: sync the page records before the header is updated */
          TRACE1("SYNC\n");
          rc = sqliteOsSync(&pPager->jfd);
          if( rc!=0 ) return rc;
        }
        sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1));
        rc = write32bits(&pPager->jfd, pPager->nRec);
        if( rc ) return rc;
        /* Move the file position back to the end of the last record */
        szJ = JOURNAL_HDR_SZ(journal_format) +
                 pPager->nRec*JOURNAL_PG_SZ(journal_format);
        sqliteOsSeek(&pPager->jfd, szJ);
      }
      TRACE1("SYNC\n");
      rc = sqliteOsSync(&pPager->jfd);
      if( rc!=0 ) return rc;
      pPager->journalStarted = 1;
    }
    pPager->needSync = 0;

    /* Erase the needSync flag from every page.
    */
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      pPg->needSync = 0;
    }
    /* With no page needing a sync, the first free page is also the
    ** first synced free page. */
    pPager->pFirstSynced = pPager->pFirst;
  }

#ifndef NDEBUG
  /* If the Pager.needSync flag is clear then the PgHdr.needSync
  ** flag must also be clear for all pages.  Verify that this
  ** invariant is true.
  **
  ** Note that this "else" pairs with the "if( pPager->needSync )" above
  ** and is only compiled when NDEBUG is not defined.
  */
  else{
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      assert( pPg->needSync==0 );
    }
    assert( pPager->pFirstSynced==pPager->pFirst );
  }
#endif

  return rc;
}

/*
** Given a list of pages (connected by the PgHdr.pDirty pointer) write
** every one of those pages out to the database file and mark them all
** as clean.
**
** Returns SQLITE_OK if every page was written (or the list is empty).
** On the first failed write the error code is returned immediately; the
** failing page and all pages after it in the list remain marked dirty.
*/
static int pager_write_pagelist(PgHdr *pList){
  Pager *pPager;
  int rc;

  if( pList==0 ) return SQLITE_OK;
  pPager = pList->pPager;
  while( pList ){
    assert( pList->dirty );
    sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
    /* CODEC is invoked on the page before the write and again afterwards
    ** with a different final argument.
    ** NOTE(review): presumably encode-then-restore -- confirm against
    ** the CODEC macro definition. */
    CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
    TRACE2("STORE %d\n", pList->pgno);
    rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
    CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
    if( rc ) return rc;
    pList->dirty = 0;
    pList = pList->pDirty;
  }
  return SQLITE_OK;
}

/*
** Collect every dirty page into a dirty list and
** return a pointer to the head of that list.  All pages are
** collected even if they are still in use.
**
** The list is threaded through PgHdr.pDirty.  Each dirty page is pushed
** onto the front, so the result is in the reverse of pAll order.
** Returns 0 if no page is dirty.
*/
static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
  PgHdr *p, *pList;
  pList = 0;
  for(p=pPager->pAll; p; p=p->pNextAll){
    if( p->dirty ){
      p->pDirty = pList;
      pList = p;
    }
  }
  return pList;
}

/*
** Acquire a page.
**
** A read lock on the disk file is obtained when the first page is acquired. 
** This read lock is dropped when the last page is released.
**
** A _get works for any page number greater than 0.  If the database
** file is smaller than the requested page, then no actual disk
** read occurs and the memory image of the page is initialized to
** all zeros.
The extra data appended to a page is always initialized 1307 ** to zeros the first time a page is loaded into memory. 1308 ** 1309 ** The acquisition might fail for several reasons. In all cases, 1310 ** an appropriate error code is returned and *ppPage is set to NULL. 1311 ** 1312 ** See also sqlitepager_lookup(). Both this routine and _lookup() attempt 1313 ** to find a page in the in-memory cache first. If the page is not already 1314 ** in memory, this routine goes to disk to read it in whereas _lookup() 1315 ** just returns 0. This routine acquires a read-lock the first time it 1316 ** has to go to disk, and could also playback an old journal if necessary. 1317 ** Since _lookup() never goes to disk, it never has to deal with locks 1318 ** or journal files. 1319 */ 1320 int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){ 1321 PgHdr *pPg; 1322 int rc; 1323 1324 /* Make sure we have not hit any critical errors. 1325 */ 1326 assert( pPager!=0 ); 1327 assert( pgno!=0 ); 1328 *ppPage = 0; 1329 if( pPager->errMask & ~(PAGER_ERR_FULL) ){ 1330 return pager_errcode(pPager); 1331 } 1332 1333 /* If this is the first page accessed, then get a read lock 1334 ** on the database file. 1335 */ 1336 if( pPager->nRef==0 ){ 1337 rc = sqliteOsReadLock(&pPager->fd); 1338 if( rc!=SQLITE_OK ){ 1339 return rc; 1340 } 1341 pPager->state = SQLITE_READLOCK; 1342 1343 /* If a journal file exists, try to play it back. 1344 */ 1345 if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){ 1346 int rc; 1347 1348 /* Get a write lock on the database 1349 */ 1350 rc = sqliteOsWriteLock(&pPager->fd); 1351 if( rc!=SQLITE_OK ){ 1352 if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){ 1353 /* This should never happen! */ 1354 rc = SQLITE_INTERNAL; 1355 } 1356 return rc; 1357 } 1358 pPager->state = SQLITE_WRITELOCK; 1359 1360 /* Open the journal for reading only. Return SQLITE_BUSY if 1361 ** we are unable to open the journal file. 
1362 ** 1363 ** The journal file does not need to be locked itself. The 1364 ** journal file is never open unless the main database file holds 1365 ** a write lock, so there is never any chance of two or more 1366 ** processes opening the journal at the same time. 1367 */ 1368 rc = sqliteOsOpenReadOnly(pPager->zJournal, &pPager->jfd); 1369 if( rc!=SQLITE_OK ){ 1370 rc = sqliteOsUnlock(&pPager->fd); 1371 assert( rc==SQLITE_OK ); 1372 return SQLITE_BUSY; 1373 } 1374 pPager->journalOpen = 1; 1375 pPager->journalStarted = 0; 1376 1377 /* Playback and delete the journal. Drop the database write 1378 ** lock and reacquire the read lock. 1379 */ 1380 rc = pager_playback(pPager, 0); 1381 if( rc!=SQLITE_OK ){ 1382 return rc; 1383 } 1384 } 1385 pPg = 0; 1386 }else{ 1387 /* Search for page in cache */ 1388 pPg = pager_lookup(pPager, pgno); 1389 } 1390 if( pPg==0 ){ 1391 /* The requested page is not in the page cache. */ 1392 int h; 1393 pPager->nMiss++; 1394 if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){ 1395 /* Create a new page */ 1396 pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE 1397 + sizeof(u32) + pPager->nExtra ); 1398 if( pPg==0 ){ 1399 pager_unwritelock(pPager); 1400 pPager->errMask |= PAGER_ERR_MEM; 1401 return SQLITE_NOMEM; 1402 } 1403 memset(pPg, 0, sizeof(*pPg)); 1404 pPg->pPager = pPager; 1405 pPg->pNextAll = pPager->pAll; 1406 if( pPager->pAll ){ 1407 pPager->pAll->pPrevAll = pPg; 1408 } 1409 pPg->pPrevAll = 0; 1410 pPager->pAll = pPg; 1411 pPager->nPage++; 1412 }else{ 1413 /* Find a page to recycle. Try to locate a page that does not 1414 ** require us to do an fsync() on the journal. 1415 */ 1416 pPg = pPager->pFirstSynced; 1417 1418 /* If we could not find a page that does not require an fsync() 1419 ** on the journal file then fsync the journal file. This is a 1420 ** very slow operation, so we work hard to avoid it. But sometimes 1421 ** it can't be helped. 
1422 */ 1423 if( pPg==0 ){ 1424 int rc = syncJournal(pPager); 1425 if( rc!=0 ){ 1426 sqlitepager_rollback(pPager); 1427 return SQLITE_IOERR; 1428 } 1429 pPg = pPager->pFirst; 1430 } 1431 assert( pPg->nRef==0 ); 1432 1433 /* Write the page to the database file if it is dirty. 1434 */ 1435 if( pPg->dirty ){ 1436 assert( pPg->needSync==0 ); 1437 pPg->pDirty = 0; 1438 rc = pager_write_pagelist( pPg ); 1439 if( rc!=SQLITE_OK ){ 1440 sqlitepager_rollback(pPager); 1441 return SQLITE_IOERR; 1442 } 1443 } 1444 assert( pPg->dirty==0 ); 1445 1446 /* If the page we are recycling is marked as alwaysRollback, then 1447 ** set the global alwaysRollback flag, thus disabling the 1448 ** sqlite_dont_rollback() optimization for the rest of this transaction. 1449 ** It is necessary to do this because the page marked alwaysRollback 1450 ** might be reloaded at a later time but at that point we won't remember 1451 ** that is was marked alwaysRollback. This means that all pages must 1452 ** be marked as alwaysRollback from here on out. 
1453 */ 1454 if( pPg->alwaysRollback ){ 1455 pPager->alwaysRollback = 1; 1456 } 1457 1458 /* Unlink the old page from the free list and the hash table 1459 */ 1460 if( pPg==pPager->pFirstSynced ){ 1461 PgHdr *p = pPg->pNextFree; 1462 while( p && p->needSync ){ p = p->pNextFree; } 1463 pPager->pFirstSynced = p; 1464 } 1465 if( pPg->pPrevFree ){ 1466 pPg->pPrevFree->pNextFree = pPg->pNextFree; 1467 }else{ 1468 assert( pPager->pFirst==pPg ); 1469 pPager->pFirst = pPg->pNextFree; 1470 } 1471 if( pPg->pNextFree ){ 1472 pPg->pNextFree->pPrevFree = pPg->pPrevFree; 1473 }else{ 1474 assert( pPager->pLast==pPg ); 1475 pPager->pLast = pPg->pPrevFree; 1476 } 1477 pPg->pNextFree = pPg->pPrevFree = 0; 1478 if( pPg->pNextHash ){ 1479 pPg->pNextHash->pPrevHash = pPg->pPrevHash; 1480 } 1481 if( pPg->pPrevHash ){ 1482 pPg->pPrevHash->pNextHash = pPg->pNextHash; 1483 }else{ 1484 h = pager_hash(pPg->pgno); 1485 assert( pPager->aHash[h]==pPg ); 1486 pPager->aHash[h] = pPg->pNextHash; 1487 } 1488 pPg->pNextHash = pPg->pPrevHash = 0; 1489 pPager->nOvfl++; 1490 } 1491 pPg->pgno = pgno; 1492 if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){ 1493 sqliteCheckMemory(pPager->aInJournal, pgno/8); 1494 assert( pPager->journalOpen ); 1495 pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0; 1496 pPg->needSync = 0; 1497 }else{ 1498 pPg->inJournal = 0; 1499 pPg->needSync = 0; 1500 } 1501 if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize 1502 && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){ 1503 page_add_to_ckpt_list(pPg); 1504 }else{ 1505 page_remove_from_ckpt_list(pPg); 1506 } 1507 pPg->dirty = 0; 1508 pPg->nRef = 1; 1509 REFINFO(pPg); 1510 pPager->nRef++; 1511 h = pager_hash(pgno); 1512 pPg->pNextHash = pPager->aHash[h]; 1513 pPager->aHash[h] = pPg; 1514 if( pPg->pNextHash ){ 1515 assert( pPg->pNextHash->pPrevHash==0 ); 1516 pPg->pNextHash->pPrevHash = pPg; 1517 } 1518 if( pPager->nExtra>0 ){ 1519 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra); 1520 } 1521 if( 
pPager->dbSize<0 ) sqlitepager_pagecount(pPager); 1522 if( pPager->errMask!=0 ){ 1523 sqlitepager_unref(PGHDR_TO_DATA(pPg)); 1524 rc = pager_errcode(pPager); 1525 return rc; 1526 } 1527 if( pPager->dbSize<(int)pgno ){ 1528 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE); 1529 }else{ 1530 int rc; 1531 sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE); 1532 rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE); 1533 TRACE2("FETCH %d\n", pPg->pgno); 1534 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3); 1535 if( rc!=SQLITE_OK ){ 1536 off_t fileSize; 1537 if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK 1538 || fileSize>=pgno*SQLITE_PAGE_SIZE ){ 1539 sqlitepager_unref(PGHDR_TO_DATA(pPg)); 1540 return rc; 1541 }else{ 1542 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE); 1543 } 1544 } 1545 } 1546 }else{ 1547 /* The requested page is in the page cache. */ 1548 pPager->nHit++; 1549 page_ref(pPg); 1550 } 1551 *ppPage = PGHDR_TO_DATA(pPg); 1552 return SQLITE_OK; 1553 } 1554 1555 /* 1556 ** Acquire a page if it is already in the in-memory cache. Do 1557 ** not read the page from disk. Return a pointer to the page, 1558 ** or 0 if the page is not in cache. 1559 ** 1560 ** See also sqlitepager_get(). The difference between this routine 1561 ** and sqlitepager_get() is that _get() will go to the disk and read 1562 ** in the page if the page is not already in cache. This routine 1563 ** returns NULL if the page is not in cache or if a disk I/O error 1564 ** has ever happened. 1565 */ 1566 void *sqlitepager_lookup(Pager *pPager, Pgno pgno){ 1567 PgHdr *pPg; 1568 1569 assert( pPager!=0 ); 1570 assert( pgno!=0 ); 1571 if( pPager->errMask & ~(PAGER_ERR_FULL) ){ 1572 return 0; 1573 } 1574 /* if( pPager->nRef==0 ){ 1575 ** return 0; 1576 ** } 1577 */ 1578 pPg = pager_lookup(pPager, pgno); 1579 if( pPg==0 ) return 0; 1580 page_ref(pPg); 1581 return PGHDR_TO_DATA(pPg); 1582 } 1583 1584 /* 1585 ** Release a page. 
1586 ** 1587 ** If the number of references to the page drop to zero, then the 1588 ** page is added to the LRU list. When all references to all pages 1589 ** are released, a rollback occurs and the lock on the database is 1590 ** removed. 1591 */ 1592 int sqlitepager_unref(void *pData){ 1593 PgHdr *pPg; 1594 1595 /* Decrement the reference count for this page 1596 */ 1597 pPg = DATA_TO_PGHDR(pData); 1598 assert( pPg->nRef>0 ); 1599 pPg->nRef--; 1600 REFINFO(pPg); 1601 1602 /* When the number of references to a page reach 0, call the 1603 ** destructor and add the page to the freelist. 1604 */ 1605 if( pPg->nRef==0 ){ 1606 Pager *pPager; 1607 pPager = pPg->pPager; 1608 pPg->pNextFree = 0; 1609 pPg->pPrevFree = pPager->pLast; 1610 pPager->pLast = pPg; 1611 if( pPg->pPrevFree ){ 1612 pPg->pPrevFree->pNextFree = pPg; 1613 }else{ 1614 pPager->pFirst = pPg; 1615 } 1616 if( pPg->needSync==0 && pPager->pFirstSynced==0 ){ 1617 pPager->pFirstSynced = pPg; 1618 } 1619 if( pPager->xDestructor ){ 1620 pPager->xDestructor(pData); 1621 } 1622 1623 /* When all pages reach the freelist, drop the read lock from 1624 ** the database file. 1625 */ 1626 pPager->nRef--; 1627 assert( pPager->nRef>=0 ); 1628 if( pPager->nRef==0 ){ 1629 pager_reset(pPager); 1630 } 1631 } 1632 return SQLITE_OK; 1633 } 1634 1635 /* 1636 ** Create a journal file for pPager. There should already be a write 1637 ** lock on the database file when this routine is called. 1638 ** 1639 ** Return SQLITE_OK if everything. Return an error code and release the 1640 ** write lock if anything goes wrong. 
*/
static int pager_open_journal(Pager *pPager){
  int rc;
  assert( pPager->state==SQLITE_WRITELOCK );
  assert( pPager->journalOpen==0 );
  assert( pPager->useJournal );

  /* Make sure pPager->dbSize is loaded, then allocate the bitmap that
  ** records which pages are in the journal: one bit per page.
  */
  sqlitepager_pagecount(pPager);
  pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
  if( pPager->aInJournal==0 ){
    /* Fall back to a read lock on allocation failure */
    sqliteOsReadLock(&pPager->fd);
    pPager->state = SQLITE_READLOCK;
    return SQLITE_NOMEM;
  }
  rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
  if( rc!=SQLITE_OK ){
    /* Could not create the journal: undo the allocation and fall back
    ** to a read lock */
    sqliteFree(pPager->aInJournal);
    pPager->aInJournal = 0;
    sqliteOsReadLock(&pPager->fd);
    pPager->state = SQLITE_READLOCK;
    return SQLITE_CANTOPEN;
  }
  sqliteOsOpenDirectory(pPager->zDirectory, &pPager->jfd);
  pPager->journalOpen = 1;
  pPager->journalStarted = 0;
  pPager->needSync = 0;
  pPager->alwaysRollback = 0;
  pPager->nRec = 0;
  /* NOTE(review): on this path the journal stays open and the write lock
  ** is kept -- confirm callers clean up after a pending pager error. */
  if( pPager->errMask!=0 ){
    rc = pager_errcode(pPager);
    return rc;
  }
  pPager->origDbSize = pPager->dbSize;

  /* Write the journal header.  Format 3 writes the magic string, a
  ** 32-bit field (0xffffffff when noSync is set, otherwise 0) and a
  ** random checksum seed.  Formats 2 and 1 write only their magic string.
  */
  if( journal_format==JOURNAL_FORMAT_3 ){
    rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3));
    if( rc==SQLITE_OK ){
      rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
    }
    if( rc==SQLITE_OK ){
      sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
      rc = write32bits(&pPager->jfd, pPager->cksumInit);
    }
  }else if( journal_format==JOURNAL_FORMAT_2 ){
    rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2));
  }else{
    assert( journal_format==JOURNAL_FORMAT_1 );
    rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1));
  }
  /* Every format ends its header with the database size in pages */
  if( rc==SQLITE_OK ){
    rc = write32bits(&pPager->jfd, pPager->dbSize);
  }
  if( pPager->ckptAutoopen && rc==SQLITE_OK ){
    rc = sqlitepager_ckpt_begin(pPager);
  }
  /* Any header or checkpoint failure gives up the write lock.  If the
  ** unlock itself succeeds the failure is reported as SQLITE_FULL.
  */
  if( rc!=SQLITE_OK ){
    rc = pager_unwritelock(pPager);
    if( rc==SQLITE_OK ){
      rc = SQLITE_FULL;
    }
  }
  return rc;  
}

/*
** Acquire a write-lock on the database.  The lock is removed when
** the any of the following happen:
**
**   *  sqlitepager_commit() is called.
**   *  sqlitepager_rollback() is called.
**   *  sqlitepager_close() is called.
**   *  sqlitepager_unref() is called to on every outstanding page.
**
** The parameter to this routine is a pointer to any open page of the
** database file.  Nothing changes about the page - it is used merely
** to acquire a pointer to the Pager structure and as proof that there
** is already a read-lock on the database.
**
** A journal file is opened if this is not a temporary file.  For
** temporary files, the opening of the journal file is deferred until
** there is an actual need to write to the journal.
**
** If the database is already write-locked, this routine is a no-op.
*/
int sqlitepager_begin(void *pData){
  PgHdr *pPg = DATA_TO_PGHDR(pData);
  Pager *pPager = pPg->pPager;
  int rc = SQLITE_OK;
  assert( pPg->nRef>0 );
  assert( pPager->state!=SQLITE_UNLOCK );
  if( pPager->state==SQLITE_READLOCK ){
    assert( pPager->aInJournal==0 );
    rc = sqliteOsWriteLock(&pPager->fd);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    pPager->state = SQLITE_WRITELOCK;
    pPager->dirtyFile = 0;
    TRACE1("TRANSACTION\n");
    /* Temporary files postpone journal creation until the first write */
    if( pPager->useJournal && !pPager->tempFile ){
      rc = pager_open_journal(pPager);
    }
  }
  return rc;
}

/*
** Mark a data page as writeable.  The page is written into the journal 
** if it is not there already.  This routine must be called before making
** changes to a page.
**
** The first time this routine is called, the pager creates a new
** journal and acquires a write lock on the database.  If the write
** lock could not be acquired, this routine returns SQLITE_BUSY.  The
** calling routine must check for that return value and be careful not to
** change any page data until this routine returns SQLITE_OK.
**
** If the journal file could not be written because the disk is full,
** then this routine returns SQLITE_FULL and does an immediate rollback.
** All subsequent write attempts also return SQLITE_FULL until there
** is a call to sqlitepager_commit() or sqlitepager_rollback() to
** reset.
**
** If the pager is read-only, SQLITE_PERM is returned and the page is
** not marked dirty.
*/
int sqlitepager_write(void *pData){
  PgHdr *pPg = DATA_TO_PGHDR(pData);
  Pager *pPager = pPg->pPager;
  int rc = SQLITE_OK;

  /* Check for errors
  */
  if( pPager->errMask ){ 
    return pager_errcode(pPager);
  }
  if( pPager->readOnly ){
    return SQLITE_PERM;
  }

  /* Mark the page as dirty.  If the page has already been written
  ** to the journal then we can return right away.
  */
  pPg->dirty = 1;
  if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
    pPager->dirtyFile = 1;
    return SQLITE_OK;
  }

  /* If we get this far, it means that the page needs to be
  ** written to the transaction journal or the checkpoint journal
  ** or both.
  **
  ** First check to see that the transaction journal exists and
  ** create it if it does not.
  */
  assert( pPager->state!=SQLITE_UNLOCK );
  rc = sqlitepager_begin(pData);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  assert( pPager->state==SQLITE_WRITELOCK );
  if( !pPager->journalOpen && pPager->useJournal ){
    rc = pager_open_journal(pPager);
    if( rc!=SQLITE_OK ) return rc;
  }
  assert( pPager->journalOpen || !pPager->useJournal );
  pPager->dirtyFile = 1;

  /* The transaction journal now exists and we have a write lock on the
  ** main database file.  Write the current page to the transaction 
  ** journal if it is not there already.
  */
  if( !pPg->inJournal && pPager->useJournal ){
    if( (int)pPg->pgno <= pPager->origDbSize ){
      /* The page existed when the transaction began, so its original
      ** content must be journalled.  The record written starts 4 bytes
      ** before the page data and is szPg bytes long.
      */
      int szPg;
      u32 saved;
      if( journal_format>=JOURNAL_FORMAT_3 ){
        /* Format 3 appends a checksum after the page data.  The first
        ** u32 of the extra area is saved here and restored after the write.
        ** NOTE(review): presumably the checksum store overlaps it --
        ** confirm against store32bits() and the PgHdr layout. */
        u32 cksum = pager_cksum(pPager, pPg->pgno, pData);
        saved = *(u32*)PGHDR_TO_EXTRA(pPg);
        store32bits(cksum, pPg, SQLITE_PAGE_SIZE);
        szPg = SQLITE_PAGE_SIZE+8;
      }else{
        szPg = SQLITE_PAGE_SIZE+4;
      }
      store32bits(pPg->pgno, pPg, -4);
      CODEC(pPager, pData, pPg->pgno, 7);
      rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
      TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
      CODEC(pPager, pData, pPg->pgno, 0);
      if( journal_format>=JOURNAL_FORMAT_3 ){
        *(u32*)PGHDR_TO_EXTRA(pPg) = saved;
      }
      if( rc!=SQLITE_OK ){
        sqlitepager_rollback(pPager);
        pPager->errMask |= PAGER_ERR_FULL;
        return rc;
      }
      pPager->nRec++;
      assert( pPager->aInJournal!=0 );
      pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
      pPg->needSync = !pPager->noSync;
      pPg->inJournal = 1;
      if( pPager->ckptInUse ){
        pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
        page_add_to_ckpt_list(pPg);
      }
    }else{
      /* The page lies beyond the original end of the file, so nothing
      ** is written to the journal for it. */
      pPg->needSync = !pPager->journalStarted && !pPager->noSync;
      TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
    }
    if( pPg->needSync ){
      pPager->needSync = 1;
    }
  }

  /* If the checkpoint journal is open and the page is not in it,
  ** then write the current page to the checkpoint journal.  Note that
  ** the checkpoint journal always uses the simpler format 2 that lacks
  ** checksums.  The header is also omitted from the checkpoint journal.
  */
  if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
    assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
    store32bits(pPg->pgno, pPg, -4);
    CODEC(pPager, pData, pPg->pgno, 7);
    rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
    TRACE2("CKPT-JOURNAL %d\n", pPg->pgno);
    CODEC(pPager, pData, pPg->pgno, 0);
    if( rc!=SQLITE_OK ){
      sqlitepager_rollback(pPager);
      pPager->errMask |= PAGER_ERR_FULL;
      return rc;
    }
    pPager->ckptNRec++;
    assert( pPager->aInCkpt!=0 );
    pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
    page_add_to_ckpt_list(pPg);
  }

  /* Update the database size and return.
  */
  if( pPager->dbSize<(int)pPg->pgno ){
    pPager->dbSize = pPg->pgno;
  }
  return rc;
}

/*
** Return TRUE if the page given in the argument was previously passed
** to sqlitepager_write().  In other words, return TRUE if it is ok
** to change the content of the page.
**
** The value returned is the page's dirty flag.
*/
int sqlitepager_iswriteable(void *pData){
  PgHdr *pPg = DATA_TO_PGHDR(pData);
  return pPg->dirty;
}

/*
** Replace the content of a single page with the information in the third
** argument.
**
** The page is fetched, marked writeable, overwritten with
** SQLITE_PAGE_SIZE bytes from pData and then released.  The first error
** from sqlitepager_get() or sqlitepager_write() is returned; the page
** content is only replaced when both succeed.
*/
int sqlitepager_overwrite(Pager *pPager, Pgno pgno, void *pData){
  void *pPage;
  int rc;

  rc = sqlitepager_get(pPager, pgno, &pPage);
  if( rc==SQLITE_OK ){
    rc = sqlitepager_write(pPage);
    if( rc==SQLITE_OK ){
      memcpy(pPage, pData, SQLITE_PAGE_SIZE);
    }
    sqlitepager_unref(pPage);
  }
  return rc;
}

/*
** A call to this routine tells the pager that it is not necessary to
** write the information on page "pgno" back to the disk, even though
** that page might be marked as dirty.
**
** The overlying software layer calls this routine when all of the data
** on the given page is unused.
The pager marks the page as clean so 1919 ** that it does not get written to disk. 1920 ** 1921 ** Tests show that this optimization, together with the 1922 ** sqlitepager_dont_rollback() below, more than double the speed 1923 ** of large INSERT operations and quadruple the speed of large DELETEs. 1924 ** 1925 ** When this routine is called, set the alwaysRollback flag to true. 1926 ** Subsequent calls to sqlitepager_dont_rollback() for the same page 1927 ** will thereafter be ignored. This is necessary to avoid a problem 1928 ** where a page with data is added to the freelist during one part of 1929 ** a transaction then removed from the freelist during a later part 1930 ** of the same transaction and reused for some other purpose. When it 1931 ** is first added to the freelist, this routine is called. When reused, 1932 ** the dont_rollback() routine is called. But because the page contains 1933 ** critical data, we still need to be sure it gets rolled back in spite 1934 ** of the dont_rollback() call. 1935 */ 1936 void sqlitepager_dont_write(Pager *pPager, Pgno pgno){ 1937 PgHdr *pPg; 1938 1939 pPg = pager_lookup(pPager, pgno); 1940 pPg->alwaysRollback = 1; 1941 if( pPg && pPg->dirty ){ 1942 if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){ 1943 /* If this pages is the last page in the file and the file has grown 1944 ** during the current transaction, then do NOT mark the page as clean. 1945 ** When the database file grows, we must make sure that the last page 1946 ** gets written at least once so that the disk file will be the correct 1947 ** size. If you do not write this page and the size of the file 1948 ** on the disk ends up being too small, that can lead to database 1949 ** corruption during the next transaction. 
1950 */ 1951 }else{ 1952 TRACE2("DONT_WRITE %d\n", pgno); 1953 pPg->dirty = 0; 1954 } 1955 } 1956 } 1957 1958 /* 1959 ** A call to this routine tells the pager that if a rollback occurs, 1960 ** it is not necessary to restore the data on the given page. This 1961 ** means that the pager does not have to record the given page in the 1962 ** rollback journal. 1963 */ 1964 void sqlitepager_dont_rollback(void *pData){ 1965 PgHdr *pPg = DATA_TO_PGHDR(pData); 1966 Pager *pPager = pPg->pPager; 1967 1968 if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return; 1969 if( pPg->alwaysRollback || pPager->alwaysRollback ) return; 1970 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){ 1971 assert( pPager->aInJournal!=0 ); 1972 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1973 pPg->inJournal = 1; 1974 if( pPager->ckptInUse ){ 1975 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1976 page_add_to_ckpt_list(pPg); 1977 } 1978 TRACE2("DONT_ROLLBACK %d\n", pPg->pgno); 1979 } 1980 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){ 1981 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize ); 1982 assert( pPager->aInCkpt!=0 ); 1983 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); 1984 page_add_to_ckpt_list(pPg); 1985 } 1986 } 1987 1988 /* 1989 ** Commit all changes to the database and release the write lock. 1990 ** 1991 ** If the commit fails for any reason, a rollback attempt is made 1992 ** and an error code is returned. If the commit worked, SQLITE_OK 1993 ** is returned. 
1994 */ 1995 int sqlitepager_commit(Pager *pPager){ 1996 int rc; 1997 PgHdr *pPg; 1998 1999 if( pPager->errMask==PAGER_ERR_FULL ){ 2000 rc = sqlitepager_rollback(pPager); 2001 if( rc==SQLITE_OK ){ 2002 rc = SQLITE_FULL; 2003 } 2004 return rc; 2005 } 2006 if( pPager->errMask!=0 ){ 2007 rc = pager_errcode(pPager); 2008 return rc; 2009 } 2010 if( pPager->state!=SQLITE_WRITELOCK ){ 2011 return SQLITE_ERROR; 2012 } 2013 TRACE1("COMMIT\n"); 2014 if( pPager->dirtyFile==0 ){ 2015 /* Exit early (without doing the time-consuming sqliteOsSync() calls) 2016 ** if there have been no changes to the database file. */ 2017 assert( pPager->needSync==0 ); 2018 rc = pager_unwritelock(pPager); 2019 pPager->dbSize = -1; 2020 return rc; 2021 } 2022 assert( pPager->journalOpen ); 2023 rc = syncJournal(pPager); 2024 if( rc!=SQLITE_OK ){ 2025 goto commit_abort; 2026 } 2027 pPg = pager_get_all_dirty_pages(pPager); 2028 if( pPg ){ 2029 rc = pager_write_pagelist(pPg); 2030 if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){ 2031 goto commit_abort; 2032 } 2033 } 2034 rc = pager_unwritelock(pPager); 2035 pPager->dbSize = -1; 2036 return rc; 2037 2038 /* Jump here if anything goes wrong during the commit process. 2039 */ 2040 commit_abort: 2041 rc = sqlitepager_rollback(pPager); 2042 if( rc==SQLITE_OK ){ 2043 rc = SQLITE_FULL; 2044 } 2045 return rc; 2046 } 2047 2048 /* 2049 ** Rollback all changes. The database falls back to read-only mode. 2050 ** All in-memory cache pages revert to their original data contents. 2051 ** The journal is deleted. 2052 ** 2053 ** This routine cannot fail unless some other process is not following 2054 ** the correct locking protocol (SQLITE_PROTOCOL) or unless some other 2055 ** process is writing trash into the journal file (SQLITE_CORRUPT) or 2056 ** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error 2057 ** codes are returned for all these occasions. Otherwise, 2058 ** SQLITE_OK is returned. 
2059 */ 2060 int sqlitepager_rollback(Pager *pPager){ 2061 int rc; 2062 TRACE1("ROLLBACK\n"); 2063 if( !pPager->dirtyFile || !pPager->journalOpen ){ 2064 rc = pager_unwritelock(pPager); 2065 pPager->dbSize = -1; 2066 return rc; 2067 } 2068 2069 if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){ 2070 if( pPager->state>=SQLITE_WRITELOCK ){ 2071 pager_playback(pPager, 1); 2072 } 2073 return pager_errcode(pPager); 2074 } 2075 if( pPager->state!=SQLITE_WRITELOCK ){ 2076 return SQLITE_OK; 2077 } 2078 rc = pager_playback(pPager, 1); 2079 if( rc!=SQLITE_OK ){ 2080 rc = SQLITE_CORRUPT; 2081 pPager->errMask |= PAGER_ERR_CORRUPT; 2082 } 2083 pPager->dbSize = -1; 2084 return rc; 2085 } 2086 2087 /* 2088 ** Return TRUE if the database file is opened read-only. Return FALSE 2089 ** if the database is (in theory) writable. 2090 */ 2091 int sqlitepager_isreadonly(Pager *pPager){ 2092 return pPager->readOnly; 2093 } 2094 2095 /* 2096 ** This routine is used for testing and analysis only. 2097 */ 2098 int *sqlitepager_stats(Pager *pPager){ 2099 static int a[9]; 2100 a[0] = pPager->nRef; 2101 a[1] = pPager->nPage; 2102 a[2] = pPager->mxPage; 2103 a[3] = pPager->dbSize; 2104 a[4] = pPager->state; 2105 a[5] = pPager->errMask; 2106 a[6] = pPager->nHit; 2107 a[7] = pPager->nMiss; 2108 a[8] = pPager->nOvfl; 2109 return a; 2110 } 2111 2112 /* 2113 ** Set the checkpoint. 2114 ** 2115 ** This routine should be called with the transaction journal already 2116 ** open. A new checkpoint journal is created that can be used to rollback 2117 ** changes of a single SQL command within a larger transaction. 
2118 */ 2119 int sqlitepager_ckpt_begin(Pager *pPager){ 2120 int rc; 2121 char zTemp[SQLITE_TEMPNAME_SIZE]; 2122 if( !pPager->journalOpen ){ 2123 pPager->ckptAutoopen = 1; 2124 return SQLITE_OK; 2125 } 2126 assert( pPager->journalOpen ); 2127 assert( !pPager->ckptInUse ); 2128 pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 ); 2129 if( pPager->aInCkpt==0 ){ 2130 sqliteOsReadLock(&pPager->fd); 2131 return SQLITE_NOMEM; 2132 } 2133 #ifndef NDEBUG 2134 rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize); 2135 if( rc ) goto ckpt_begin_failed; 2136 assert( pPager->ckptJSize == 2137 pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) ); 2138 #endif 2139 pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format) 2140 + JOURNAL_HDR_SZ(journal_format); 2141 pPager->ckptSize = pPager->dbSize; 2142 if( !pPager->ckptOpen ){ 2143 rc = sqlitepager_opentemp(zTemp, &pPager->cpfd); 2144 if( rc ) goto ckpt_begin_failed; 2145 pPager->ckptOpen = 1; 2146 pPager->ckptNRec = 0; 2147 } 2148 pPager->ckptInUse = 1; 2149 return SQLITE_OK; 2150 2151 ckpt_begin_failed: 2152 if( pPager->aInCkpt ){ 2153 sqliteFree(pPager->aInCkpt); 2154 pPager->aInCkpt = 0; 2155 } 2156 return rc; 2157 } 2158 2159 /* 2160 ** Commit a checkpoint. 2161 */ 2162 int sqlitepager_ckpt_commit(Pager *pPager){ 2163 if( pPager->ckptInUse ){ 2164 PgHdr *pPg, *pNext; 2165 sqliteOsSeek(&pPager->cpfd, 0); 2166 /* sqliteOsTruncate(&pPager->cpfd, 0); */ 2167 pPager->ckptNRec = 0; 2168 pPager->ckptInUse = 0; 2169 sqliteFree( pPager->aInCkpt ); 2170 pPager->aInCkpt = 0; 2171 for(pPg=pPager->pCkpt; pPg; pPg=pNext){ 2172 pNext = pPg->pNextCkpt; 2173 assert( pPg->inCkpt ); 2174 pPg->inCkpt = 0; 2175 pPg->pPrevCkpt = pPg->pNextCkpt = 0; 2176 } 2177 pPager->pCkpt = 0; 2178 } 2179 pPager->ckptAutoopen = 0; 2180 return SQLITE_OK; 2181 } 2182 2183 /* 2184 ** Rollback a checkpoint. 
2185 */ 2186 int sqlitepager_ckpt_rollback(Pager *pPager){ 2187 int rc; 2188 if( pPager->ckptInUse ){ 2189 rc = pager_ckpt_playback(pPager); 2190 sqlitepager_ckpt_commit(pPager); 2191 }else{ 2192 rc = SQLITE_OK; 2193 } 2194 pPager->ckptAutoopen = 0; 2195 return rc; 2196 } 2197 2198 /* 2199 ** Return the full pathname of the database file. 2200 */ 2201 const char *sqlitepager_filename(Pager *pPager){ 2202 return pPager->zFilename; 2203 } 2204 2205 /* 2206 ** Set the codec for this pager 2207 */ 2208 void sqlitepager_set_codec( 2209 Pager *pPager, 2210 void (*xCodec)(void*,void*,Pgno,int), 2211 void *pCodecArg 2212 ){ 2213 pPager->xCodec = xCodec; 2214 pPager->pCodecArg = pCodecArg; 2215 } 2216 2217 #ifdef SQLITE_TEST 2218 /* 2219 ** Print a listing of all referenced pages and their ref count. 2220 */ 2221 void sqlitepager_refdump(Pager *pPager){ 2222 PgHdr *pPg; 2223 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){ 2224 if( pPg->nRef<=0 ) continue; 2225 printf("PAGE %3d addr=0x%08x nRef=%d\n", 2226 pPg->pgno, (int)PGHDR_TO_DATA(pPg), pPg->nRef); 2227 } 2228 } 2229 #endif 2230