xref: /titanic_50/usr/src/lib/libsqlite/src/pager.c (revision 80c94ecd7a524eb933a4bb221a9618b9dc490e76)
1 /*
2  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 #pragma ident	"%Z%%M%	%I%	%E% SMI"
7 
8 /*
9 ** 2001 September 15
10 **
11 ** The author disclaims copyright to this source code.  In place of
12 ** a legal notice, here is a blessing:
13 **
14 **    May you do good and not evil.
15 **    May you find forgiveness for yourself and forgive others.
16 **    May you share freely, never taking more than you give.
17 **
18 *************************************************************************
19 ** This is the implementation of the page cache subsystem or "pager".
20 **
21 ** The pager is used to access a database disk file.  It implements
22 ** atomic commit and rollback through the use of a journal file that
23 ** is separate from the database file.  The pager also implements file
24 ** locking to prevent two processes from writing the same database
25 ** file simultaneously, or one process from reading the database while
26 ** another is writing.
27 **
28 ** @(#) $Id: pager.c,v 1.101 2004/02/25 02:20:41 drh Exp $
29 */
30 #include "os.h"         /* Must be first to enable large file support */
31 #include "sqliteInt.h"
32 #include "pager.h"
33 #include <assert.h>
34 #include <string.h>
35 
/*
** Troubleshooting macros.  Normally compiled out: with the "#if 0" below
** every macro expands to nothing.
**
** When enabled, SET_PAGER() remembers the first pager handed to it in
** mainPager, CLR_PAGER() forgets it again, and the TRACEn() macros
** fprintf() to stderr whenever the local variable "pPager" of the
** calling function is that remembered pager.
**
** The enabled variants are wrapped in do{...}while(0) so that each one
** is a single statement and cannot swallow an "else" that follows it.
*/
#if 0
static Pager *mainPager = 0;
#define SET_PAGER(X)  do{ if( mainPager==0 ) mainPager = (X); }while(0)
#define CLR_PAGER(X)  do{ if( mainPager==(X) ) mainPager = 0; }while(0)
#define TRACE1(X)     do{ if( pPager==mainPager ) fprintf(stderr,X); }while(0)
#define TRACE2(X,Y)   do{ if( pPager==mainPager ) fprintf(stderr,X,Y); }while(0)
#define TRACE3(X,Y,Z) \
   do{ if( pPager==mainPager ) fprintf(stderr,X,Y,Z); }while(0)
#else
#define SET_PAGER(X)
#define CLR_PAGER(X)
#define TRACE1(X)
#define TRACE2(X,Y)
#define TRACE3(X,Y,Z)
#endif
53 
54 
/*
** Locking state of the page cache.  A pager is always in exactly one of
** these states:
**
**   SQLITE_UNLOCK       Nothing is being read or written and no page
**                       data is held in memory.  Every pager starts
**                       out here.
**
**   SQLITE_READLOCK     The database is being read.  Writing is not
**                       permitted, but several readers may share the
**                       same database file.
**
**   SQLITE_WRITELOCK    The database is being written.  Access is
**                       exclusive: no other process or thread may read
**                       or write the file.
**
** Transitions: fetching the first page moves UNLOCK -> READLOCK and
** releasing the last outstanding page moves back to UNLOCK.  The first
** write request moves READLOCK -> WRITELOCK; because a page must be
** outstanding before it can be written, a pager always passes through
** READLOCK on its way to WRITELOCK.  Commit and rollback move
** WRITELOCK -> READLOCK.
**
** The numeric ordering is significant: code in this file compares the
** state against SQLITE_WRITELOCK using "<" and ">=".
*/
#define SQLITE_UNLOCK      0   /* idle, nothing cached */
#define SQLITE_READLOCK    1   /* shared read access */
#define SQLITE_WRITELOCK   2   /* exclusive write access */
88 
89 
90 /*
91 ** Each in-memory image of a page begins with the following header.
92 ** This header is only visible to this pager module.  The client
93 ** code that calls pager sees only the data that follows the header.
94 **
95 ** Client code should call sqlitepager_write() on a page prior to making
96 ** any modifications to that page.  The first time sqlitepager_write()
97 ** is called, the original page contents are written into the rollback
98 ** journal and PgHdr.inJournal and PgHdr.needSync are set.  Later, once
99 ** the journal page has made it onto the disk surface, PgHdr.needSync
100 ** is cleared.  The modified page cannot be written back into the original
101 ** database file until the journal pages has been synced to disk and the
102 ** PgHdr.needSync has been cleared.
103 **
104 ** The PgHdr.dirty flag is set when sqlitepager_write() is called and
105 ** is cleared again when the page content is written back to the original
106 ** database file.
107 */
108 typedef struct PgHdr PgHdr;
109 struct PgHdr {
110   Pager *pPager;                 /* The pager to which this page belongs */
111   Pgno pgno;                     /* The page number for this page */
112   PgHdr *pNextHash, *pPrevHash;  /* Hash collision chain for PgHdr.pgno */
113   int nRef;                      /* Number of users of this page */
114   PgHdr *pNextFree, *pPrevFree;  /* Freelist of pages where nRef==0 */
115   PgHdr *pNextAll, *pPrevAll;    /* A list of all pages */
116   PgHdr *pNextCkpt, *pPrevCkpt;  /* List of pages in the checkpoint journal */
117   u8 inJournal;                  /* TRUE if has been written to journal */
118   u8 inCkpt;                     /* TRUE if written to the checkpoint journal */
119   u8 dirty;                      /* TRUE if we need to write back changes */
120   u8 needSync;                   /* Sync journal before writing this page */
121   u8 alwaysRollback;             /* Disable dont_rollback() for this page */
122   PgHdr *pDirty;                 /* Dirty pages sorted by PgHdr.pgno */
123   /* SQLITE_PAGE_SIZE bytes of page data follow this header */
124   /* Pager.nExtra bytes of local data follow the page data */
125 };
126 
127 
/*
** CODEC(P,D,N,X): invoke the pager's codec callback, if one is
** installed.
**
**   P   pointer to the Pager
**   D   page data handed to the callback
**   N   page number handed to the callback
**   X   integer mode argument handed through to the callback unchanged
**
** When SQLITE_HAS_CODEC is not defined the macro expands to nothing and
** none of its arguments are evaluated.  P is parenthesized in the
** expansion so that any pointer-valued expression may be passed.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC(P,D,N,X) \
    if( (P)->xCodec ){ (P)->xCodec((P)->pCodecArg,D,N,X); }
#else
# define CODEC(P,D,N,X)
#endif
136 
/*
** Pointer conversions between a page header and the memory that
** follows it:
**
**   PGHDR_TO_DATA(P)   header -> first byte after the header (page data)
**   DATA_TO_PGHDR(D)   page data -> the header immediately before it
**   PGHDR_TO_EXTRA(P)  header -> first byte after SQLITE_PAGE_SIZE bytes
**                      of page data (the per-page extra area)
*/
#define PGHDR_TO_DATA(P)  ((void*)((P)+1))
#define DATA_TO_PGHDR(D)  (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(P) ((void*)(((char*)((P)+1))+SQLITE_PAGE_SIZE))
144 
/*
** Number of buckets in the hash table that maps page numbers to
** in-memory pages.  pager_hash() masks with N_PG_HASH-1, so this value
** must remain a power of two.
*/
#define N_PG_HASH 2048

/*
** Map a page number onto a bucket index in the range 0..N_PG_HASH-1 by
** keeping its low-order bits.
*/
#define pager_hash(PN)  ((PN) & (N_PG_HASH - 1))
155 
156 /*
157 ** A open page cache is an instance of the following structure.
158 */
159 struct Pager {
160   char *zFilename;            /* Name of the database file */
161   char *zJournal;             /* Name of the journal file */
162   char *zDirectory;           /* Directory hold database and journal files */
163   OsFile fd, jfd;             /* File descriptors for database and journal */
164   OsFile cpfd;                /* File descriptor for the checkpoint journal */
165   int dbSize;                 /* Number of pages in the file */
166   int origDbSize;             /* dbSize before the current change */
167   int ckptSize;               /* Size of database (in pages) at ckpt_begin() */
168   off_t ckptJSize;            /* Size of journal at ckpt_begin() */
169   int nRec;                   /* Number of pages written to the journal */
170   u32 cksumInit;              /* Quasi-random value added to every checksum */
171   int ckptNRec;               /* Number of records in the checkpoint journal */
172   int nExtra;                 /* Add this many bytes to each in-memory page */
173   void (*xDestructor)(void*); /* Call this routine when freeing pages */
174   int nPage;                  /* Total number of in-memory pages */
175   int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
176   int mxPage;                 /* Maximum number of pages to hold in cache */
177   int nHit, nMiss, nOvfl;     /* Cache hits, missing, and LRU overflows */
178   void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
179   void *pCodecArg;            /* First argument to xCodec() */
180   u8 journalOpen;             /* True if journal file descriptors is valid */
181   u8 journalStarted;          /* True if header of journal is synced */
182   u8 useJournal;              /* Use a rollback journal on this file */
183   u8 ckptOpen;                /* True if the checkpoint journal is open */
184   u8 ckptInUse;               /* True we are in a checkpoint */
185   u8 ckptAutoopen;            /* Open ckpt journal when main journal is opened*/
186   u8 noSync;                  /* Do not sync the journal if true */
187   u8 fullSync;                /* Do extra syncs of the journal for robustness */
188   u8 state;                   /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
189   u8 errMask;                 /* One of several kinds of errors */
190   u8 tempFile;                /* zFilename is a temporary file */
191   u8 readOnly;                /* True for a read-only database */
192   u8 needSync;                /* True if an fsync() is needed on the journal */
193   u8 dirtyFile;               /* True if database file has changed in any way */
194   u8 alwaysRollback;          /* Disable dont_rollback() for all pages */
195   u8 *aInJournal;             /* One bit for each page in the database file */
196   u8 *aInCkpt;                /* One bit for each page in the database */
197   PgHdr *pFirst, *pLast;      /* List of free pages */
198   PgHdr *pFirstSynced;        /* First free page with PgHdr.needSync==0 */
199   PgHdr *pAll;                /* List of all pages */
200   PgHdr *pCkpt;               /* List of pages in the checkpoint journal */
201   PgHdr *aHash[N_PG_HASH];    /* Hash table to map page number of PgHdr */
202 };
203 
/*
** Error bits that may be OR-ed together in Pager.errMask.  Each kind
** of failure owns one bit; pager_errcode() turns the mask into a result
** code.
*/
#define PAGER_ERR_FULL     (1<<0)  /* a write() failed */
#define PAGER_ERR_MEM      (1<<1)  /* malloc() failed */
#define PAGER_ERR_LOCK     (1<<2)  /* error in the locking protocol */
#define PAGER_ERR_CORRUPT  (1<<3)  /* database or journal corruption */
#define PAGER_ERR_DISK     (1<<4)  /* general disk I/O error */
212 
213 /*
214 ** The journal file contains page records in the following
215 ** format.
216 **
217 ** Actually, this structure is the complete page record for pager
218 ** formats less than 3.  Beginning with format 3, this record is surrounded
219 ** by two checksums.
220 */
221 typedef struct PageRecord PageRecord;
222 struct PageRecord {
223   Pgno pgno;                      /* The page number */
224   char aData[SQLITE_PAGE_SIZE];   /* Original data for page pgno */
225 };
226 
/*
** Every journal file starts with one of the magic strings below.  The
** bytes came from /dev/random and serve only as a sanity check; the
** final byte tells the three journal formats apart.
**
** Format 1 stores 32-bit integers in the byte order of the host
** machine.  The newer formats store them big-endian.  New journals are
** always written in the newest format, but the older ones must remain
** readable so that journals left behind by older versions of the
** library can still be rolled back.
**
** Format 3 (added for 2.8.0) carries extra sanity-checking data.  If
** power fails while a journal is being written, semi-random garbage may
** appear in the journal once power is restored, and rolling such a
** journal back could corrupt the database.  The extra data is an
** attempt to detect that garbage and ignore it.
**
** The sanity check is a 32-bit checksum stored with every page record,
** seeded from a 32-bit random value that is stored in the journal
** right after the header.  The random seed matters: garbage at the end
** of a journal is likely to be data from other, now deleted, files.
** If that garbage came from an obsolete journal its checksums might
** otherwise look valid; using a different seed for every journal
** minimizes that risk.
*/
static const unsigned char aJournalMagic1[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd4
};
static const unsigned char aJournalMagic2[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd5
};
static const unsigned char aJournalMagic3[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd6
};
#define JOURNAL_FORMAT_1 1   /* native byte order, no checksums */
#define JOURNAL_FORMAT_2 2   /* big-endian, no checksums */
#define JOURNAL_FORMAT_3 3   /* big-endian, with checksums */
268 
/*
** Format used when creating new primary journal files.  The default is
** always format 3.  In test builds (SQLITE_TEST) this is a real
** variable so that tests can select an older format and verify that
** newer versions of the library can still roll back older journals.
**
** Note that checkpoint journals always use format 2 and omit the header.
*/
#ifndef SQLITE_TEST
# define journal_format 3
#else
int journal_format = 3;
#endif
283 
/*
** Sizes, in bytes, of the journal header and of a single page record
** for journal format X.  Format 3 adds two 32-bit fields to the header
** and one 32-bit checksum to every page record.
*/
#define JOURNAL_HDR_SZ(X) \
   (sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3 ? 2*sizeof(u32) : 0))
#define JOURNAL_PG_SZ(X) \
   (SQLITE_PAGE_SIZE + sizeof(Pgno) + ((X)>=3 ? sizeof(u32) : 0))
293 
/*
** Reference count tracking, available in test builds only.
**
** When SQLITE_TEST is defined and pager_refinfo_enable is non-zero,
** REFINFO(p) prints the page number, the address of the page data and
** the current reference count of page p to stdout.  In all other
** builds REFINFO() expands to nothing.
**
** The address is printed with %p and the page number as unsigned, so
** the output is not truncated where pointers are wider than int.
*/
#ifdef SQLITE_TEST
  int pager_refinfo_enable = 0;
  static void pager_refinfo(PgHdr *p){
    static int cnt = 0;
    if( !pager_refinfo_enable ) return;
    printf(
       "REFCNT: %4u addr=%p nRef=%d\n",
       (unsigned)p->pgno, PGHDR_TO_DATA(p), p->nRef
    );
    cnt++;   /* Something to set a breakpoint on */
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
312 
313 /*
314 ** Read a 32-bit integer from the given file descriptor.  Store the integer
315 ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
316 ** error code is something goes wrong.
317 **
318 ** If the journal format is 2 or 3, read a big-endian integer.  If the
319 ** journal format is 1, read an integer in the native byte-order of the
320 ** host machine.
321 */
322 static int read32bits(int format, OsFile *fd, u32 *pRes){
323   u32 res;
324   int rc;
325   rc = sqliteOsRead(fd, &res, sizeof(res));
326   if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){
327     unsigned char ac[4];
328     memcpy(ac, &res, 4);
329     res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3];
330   }
331   *pRes = res;
332   return rc;
333 }
334 
335 /*
336 ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
337 ** on success or an error code is something goes wrong.
338 **
339 ** If the journal format is 2 or 3, write the integer as 4 big-endian
340 ** bytes.  If the journal format is 1, write the integer in the native
341 ** byte order.  In normal operation, only formats 2 and 3 are used.
342 ** Journal format 1 is only used for testing.
343 */
344 static int write32bits(OsFile *fd, u32 val){
345   unsigned char ac[4];
346   if( journal_format<=1 ){
347     return sqliteOsWrite(fd, &val, 4);
348   }
349   ac[0] = (val>>24) & 0xff;
350   ac[1] = (val>>16) & 0xff;
351   ac[2] = (val>>8) & 0xff;
352   ac[3] = val & 0xff;
353   return sqliteOsWrite(fd, ac, 4);
354 }
355 
356 /*
357 ** Write a 32-bit integer into a page header right before the
358 ** page data.  This will overwrite the PgHdr.pDirty pointer.
359 **
360 ** The integer is big-endian for formats 2 and 3 and native byte order
361 ** for journal format 1.
362 */
363 static void store32bits(u32 val, PgHdr *p, int offset){
364   unsigned char *ac;
365   ac = &((unsigned char*)PGHDR_TO_DATA(p))[offset];
366   if( journal_format<=1 ){
367     memcpy(ac, &val, 4);
368   }else{
369     ac[0] = (val>>24) & 0xff;
370     ac[1] = (val>>16) & 0xff;
371     ac[2] = (val>>8) & 0xff;
372     ac[3] = val & 0xff;
373   }
374 }
375 
376 
377 /*
378 ** Convert the bits in the pPager->errMask into an approprate
379 ** return code.
380 */
381 static int pager_errcode(Pager *pPager){
382   int rc = SQLITE_OK;
383   if( pPager->errMask & PAGER_ERR_LOCK )    rc = SQLITE_PROTOCOL;
384   if( pPager->errMask & PAGER_ERR_DISK )    rc = SQLITE_IOERR;
385   if( pPager->errMask & PAGER_ERR_FULL )    rc = SQLITE_FULL;
386   if( pPager->errMask & PAGER_ERR_MEM )     rc = SQLITE_NOMEM;
387   if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT;
388   return rc;
389 }
390 
391 /*
392 ** Add or remove a page from the list of all pages that are in the
393 ** checkpoint journal.
394 **
395 ** The Pager keeps a separate list of pages that are currently in
396 ** the checkpoint journal.  This helps the sqlitepager_ckpt_commit()
397 ** routine run MUCH faster for the common case where there are many
398 ** pages in memory but only a few are in the checkpoint journal.
399 */
400 static void page_add_to_ckpt_list(PgHdr *pPg){
401   Pager *pPager = pPg->pPager;
402   if( pPg->inCkpt ) return;
403   assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );
404   pPg->pPrevCkpt = 0;
405   if( pPager->pCkpt ){
406     pPager->pCkpt->pPrevCkpt = pPg;
407   }
408   pPg->pNextCkpt = pPager->pCkpt;
409   pPager->pCkpt = pPg;
410   pPg->inCkpt = 1;
411 }
412 static void page_remove_from_ckpt_list(PgHdr *pPg){
413   if( !pPg->inCkpt ) return;
414   if( pPg->pPrevCkpt ){
415     assert( pPg->pPrevCkpt->pNextCkpt==pPg );
416     pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt;
417   }else{
418     assert( pPg->pPager->pCkpt==pPg );
419     pPg->pPager->pCkpt = pPg->pNextCkpt;
420   }
421   if( pPg->pNextCkpt ){
422     assert( pPg->pNextCkpt->pPrevCkpt==pPg );
423     pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt;
424   }
425   pPg->pNextCkpt = 0;
426   pPg->pPrevCkpt = 0;
427   pPg->inCkpt = 0;
428 }
429 
430 /*
431 ** Find a page in the hash table given its page number.  Return
432 ** a pointer to the page or NULL if not found.
433 */
434 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
435   PgHdr *p = pPager->aHash[pager_hash(pgno)];
436   while( p && p->pgno!=pgno ){
437     p = p->pNextHash;
438   }
439   return p;
440 }
441 
442 /*
443 ** Unlock the database and clear the in-memory cache.  This routine
444 ** sets the state of the pager back to what it was when it was first
445 ** opened.  Any outstanding pages are invalidated and subsequent attempts
446 ** to access those pages will likely result in a coredump.
447 */
448 static void pager_reset(Pager *pPager){
449   PgHdr *pPg, *pNext;
450   for(pPg=pPager->pAll; pPg; pPg=pNext){
451     pNext = pPg->pNextAll;
452     sqliteFree(pPg);
453   }
454   pPager->pFirst = 0;
455   pPager->pFirstSynced = 0;
456   pPager->pLast = 0;
457   pPager->pAll = 0;
458   memset(pPager->aHash, 0, sizeof(pPager->aHash));
459   pPager->nPage = 0;
460   if( pPager->state>=SQLITE_WRITELOCK ){
461     sqlitepager_rollback(pPager);
462   }
463   sqliteOsUnlock(&pPager->fd);
464   pPager->state = SQLITE_UNLOCK;
465   pPager->dbSize = -1;
466   pPager->nRef = 0;
467   assert( pPager->journalOpen==0 );
468 }
469 
470 /*
471 ** When this routine is called, the pager has the journal file open and
472 ** a write lock on the database.  This routine releases the database
473 ** write lock and acquires a read lock in its place.  The journal file
474 ** is deleted and closed.
475 **
476 ** TODO: Consider keeping the journal file open for temporary databases.
477 ** This might give a performance improvement on windows where opening
478 ** a file is an expensive operation.
479 */
480 static int pager_unwritelock(Pager *pPager){
481   int rc;
482   PgHdr *pPg;
483   if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK;
484   sqlitepager_ckpt_commit(pPager);
485   if( pPager->ckptOpen ){
486     sqliteOsClose(&pPager->cpfd);
487     pPager->ckptOpen = 0;
488   }
489   if( pPager->journalOpen ){
490     sqliteOsClose(&pPager->jfd);
491     pPager->journalOpen = 0;
492     sqliteOsDelete(pPager->zJournal);
493     sqliteFree( pPager->aInJournal );
494     pPager->aInJournal = 0;
495     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
496       pPg->inJournal = 0;
497       pPg->dirty = 0;
498       pPg->needSync = 0;
499     }
500   }else{
501     assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
502   }
503   rc = sqliteOsReadLock(&pPager->fd);
504   if( rc==SQLITE_OK ){
505     pPager->state = SQLITE_READLOCK;
506   }else{
507     /* This can only happen if a process does a BEGIN, then forks and the
508     ** child process does the COMMIT.  Because of the semantics of unix
509     ** file locking, the unlock will fail.
510     */
511     pPager->state = SQLITE_UNLOCK;
512   }
513   return rc;
514 }
515 
516 /*
517 ** Compute and return a checksum for the page of data.
518 **
519 ** This is not a real checksum.  It is really just the sum of the
520 ** random initial value and the page number.  We considered do a checksum
521 ** of the database, but that was found to be too slow.
522 */
523 static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
524   u32 cksum = pPager->cksumInit + pgno;
525   return cksum;
526 }
527 
528 /*
529 ** Read a single page from the journal file opened on file descriptor
530 ** jfd.  Playback this one page.
531 **
532 ** There are three different journal formats.  The format parameter determines
533 ** which format is used by the journal that is played back.
534 */
535 static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
536   int rc;
537   PgHdr *pPg;              /* An existing page in the cache */
538   PageRecord pgRec;
539   u32 cksum;
540 
541   rc = read32bits(format, jfd, &pgRec.pgno);
542   if( rc!=SQLITE_OK ) return rc;
543   rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
544   if( rc!=SQLITE_OK ) return rc;
545 
546   /* Sanity checking on the page.  This is more important that I originally
547   ** thought.  If a power failure occurs while the journal is being written,
548   ** it could cause invalid data to be written into the journal.  We need to
549   ** detect this invalid data (with high probability) and ignore it.
550   */
551   if( pgRec.pgno==0 ){
552     return SQLITE_DONE;
553   }
554   if( pgRec.pgno>(unsigned)pPager->dbSize ){
555     return SQLITE_OK;
556   }
557   if( format>=JOURNAL_FORMAT_3 ){
558     rc = read32bits(format, jfd, &cksum);
559     if( rc ) return rc;
560     if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){
561       return SQLITE_DONE;
562     }
563   }
564 
565   /* Playback the page.  Update the in-memory copy of the page
566   ** at the same time, if there is one.
567   */
568   pPg = pager_lookup(pPager, pgRec.pgno);
569   TRACE2("PLAYBACK %d\n", pgRec.pgno);
570   sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
571   rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
572   if( pPg ){
573     /* No page should ever be rolled back that is in use, except for page
574     ** 1 which is held in use in order to keep the lock on the database
575     ** active.  However, such a page may be rolled back as a result of an
576     ** internal error resulting in an automatic call to
577     ** sqlitepager_rollback(), so we can't assert() it.
578     */
579     /* assert( pPg->nRef==0 || pPg->pgno==1 ) */
580     memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
581     memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
582     pPg->dirty = 0;
583     pPg->needSync = 0;
584     CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
585   }
586   return rc;
587 }
588 
589 /*
590 ** Playback the journal and thus restore the database file to
591 ** the state it was in before we started making changes.
592 **
593 ** The journal file format is as follows:
594 **
595 **    *  8 byte prefix.  One of the aJournalMagic123 vectors defined
596 **       above.  The format of the journal file is determined by which
597 **       of the three prefix vectors is seen.
598 **    *  4 byte big-endian integer which is the number of valid page records
599 **       in the journal.  If this value is 0xffffffff, then compute the
600 **       number of page records from the journal size.  This field appears
601 **       in format 3 only.
602 **    *  4 byte big-endian integer which is the initial value for the
603 **       sanity checksum.  This field appears in format 3 only.
604 **    *  4 byte integer which is the number of pages to truncate the
605 **       database to during a rollback.
606 **    *  Zero or more pages instances, each as follows:
607 **        +  4 byte page number.
608 **        +  SQLITE_PAGE_SIZE bytes of data.
609 **        +  4 byte checksum (format 3 only)
610 **
611 ** When we speak of the journal header, we mean the first 4 bullets above.
612 ** Each entry in the journal is an instance of the 5th bullet.  Note that
613 ** bullets 2 and 3 only appear in format-3 journals.
614 **
615 ** Call the value from the second bullet "nRec".  nRec is the number of
616 ** valid page entries in the journal.  In most cases, you can compute the
617 ** value of nRec from the size of the journal file.  But if a power
618 ** failure occurred while the journal was being written, it could be the
619 ** case that the size of the journal file had already been increased but
620 ** the extra entries had not yet made it safely to disk.  In such a case,
621 ** the value of nRec computed from the file size would be too large.  For
622 ** that reason, we always use the nRec value in the header.
623 **
624 ** If the nRec value is 0xffffffff it means that nRec should be computed
625 ** from the file size.  This value is used when the user selects the
626 ** no-sync option for the journal.  A power failure could lead to corruption
627 ** in this case.  But for things like temporary table (which will be
628 ** deleted when the power is restored) we don't care.
629 **
630 ** Journal formats 1 and 2 do not have an nRec value in the header so we
631 ** have to compute nRec from the file size.  This has risks (as described
632 ** above) which is why all persistent tables have been changed to use
633 ** format 3.
634 **
635 ** If the file opened as the journal file is not a well-formed
636 ** journal file then the database will likely already be
637 ** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
638 ** and SQLITE_CORRUPT is returned.  If it all works, then this routine
639 ** returns SQLITE_OK.
640 */
641 static int pager_playback(Pager *pPager, int useJournalSize){
642   off_t szJ;               /* Size of the journal file in bytes */
643   int nRec;                /* Number of Records in the journal */
644   int i;                   /* Loop counter */
645   Pgno mxPg = 0;           /* Size of the original file in pages */
646   int format;              /* Format of the journal file. */
647   unsigned char aMagic[sizeof(aJournalMagic1)];
648   int rc;
649 
650   /* Figure out how many records are in the journal.  Abort early if
651   ** the journal is empty.
652   */
653   assert( pPager->journalOpen );
654   sqliteOsSeek(&pPager->jfd, 0);
655   rc = sqliteOsFileSize(&pPager->jfd, &szJ);
656   if( rc!=SQLITE_OK ){
657     goto end_playback;
658   }
659 
660   /* If the journal file is too small to contain a complete header,
661   ** it must mean that the process that created the journal was just
662   ** beginning to write the journal file when it died.  In that case,
663   ** the database file should have still been completely unchanged.
664   ** Nothing needs to be rolled back.  We can safely ignore this journal.
665   */
666   if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
667     goto end_playback;
668   }
669 
670   /* Read the beginning of the journal and truncate the
671   ** database file back to its original size.
672   */
673   rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
674   if( rc!=SQLITE_OK ){
675     rc = SQLITE_PROTOCOL;
676     goto end_playback;
677   }
678   if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
679     format = JOURNAL_FORMAT_3;
680   }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
681     format = JOURNAL_FORMAT_2;
682   }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
683     format = JOURNAL_FORMAT_1;
684   }else{
685     rc = SQLITE_PROTOCOL;
686     goto end_playback;
687   }
688   if( format>=JOURNAL_FORMAT_3 ){
689     if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
690       /* Ignore the journal if it is too small to contain a complete
691       ** header.  We already did this test once above, but at the prior
692       ** test, we did not know the journal format and so we had to assume
693       ** the smallest possible header.  Now we know the header is bigger
694       ** than the minimum so we test again.
695       */
696       goto end_playback;
697     }
698     rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
699     if( rc ) goto end_playback;
700     rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
701     if( rc ) goto end_playback;
702     if( nRec==0xffffffff || useJournalSize ){
703       nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3);
704     }
705   }else{
706     nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2);
707     assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
708   }
709   rc = read32bits(format, &pPager->jfd, &mxPg);
710   if( rc!=SQLITE_OK ){
711     goto end_playback;
712   }
713   assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
714   rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
715   if( rc!=SQLITE_OK ){
716     goto end_playback;
717   }
718   pPager->dbSize = mxPg;
719 
720   /* Copy original pages out of the journal and back into the database file.
721   */
722   for(i=0; i<nRec; i++){
723     rc = pager_playback_one_page(pPager, &pPager->jfd, format);
724     if( rc!=SQLITE_OK ){
725       if( rc==SQLITE_DONE ){
726         rc = SQLITE_OK;
727       }
728       break;
729     }
730   }
731 
732   /* Pages that have been written to the journal but never synced
733   ** where not restored by the loop above.  We have to restore those
734   ** pages by reading them back from the original database.
735   */
736   if( rc==SQLITE_OK ){
737     PgHdr *pPg;
738     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
739       char zBuf[SQLITE_PAGE_SIZE];
740       if( !pPg->dirty ) continue;
741       if( (int)pPg->pgno <= pPager->origDbSize ){
742         sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
743         rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
744         TRACE2("REFETCH %d\n", pPg->pgno);
745         CODEC(pPager, zBuf, pPg->pgno, 2);
746         if( rc ) break;
747       }else{
748         memset(zBuf, 0, SQLITE_PAGE_SIZE);
749       }
750       if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){
751         memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE);
752         memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
753       }
754       pPg->needSync = 0;
755       pPg->dirty = 0;
756     }
757   }
758 
759 end_playback:
760   if( rc!=SQLITE_OK ){
761     pager_unwritelock(pPager);
762     pPager->errMask |= PAGER_ERR_CORRUPT;
763     rc = SQLITE_CORRUPT;
764   }else{
765     rc = pager_unwritelock(pPager);
766   }
767   return rc;
768 }
769 
770 /*
771 ** Playback the checkpoint journal.
772 **
773 ** This is similar to playing back the transaction journal but with
774 ** a few extra twists.
775 **
776 **    (1)  The number of pages in the database file at the start of
777 **         the checkpoint is stored in pPager->ckptSize, not in the
778 **         journal file itself.
779 **
780 **    (2)  In addition to playing back the checkpoint journal, also
781 **         playback all pages of the transaction journal beginning
782 **         at offset pPager->ckptJSize.
783 */
784 static int pager_ckpt_playback(Pager *pPager){
785   off_t szJ;               /* Size of the full journal */
786   int nRec;                /* Number of Records */
787   int i;                   /* Loop counter */
788   int rc;
789 
790   /* Truncate the database back to its original size.
791   */
792   rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
793   pPager->dbSize = pPager->ckptSize;
794 
795   /* Figure out how many records are in the checkpoint journal.
796   */
797   assert( pPager->ckptInUse && pPager->journalOpen );
798   sqliteOsSeek(&pPager->cpfd, 0);
799   nRec = pPager->ckptNRec;
800 
801   /* Copy original pages out of the checkpoint journal and back into the
802   ** database file.  Note that the checkpoint journal always uses format
803   ** 2 instead of format 3 since it does not need to be concerned with
804   ** power failures corrupting the journal and can thus omit the checksums.
805   */
806   for(i=nRec-1; i>=0; i--){
807     rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
808     assert( rc!=SQLITE_DONE );
809     if( rc!=SQLITE_OK ) goto end_ckpt_playback;
810   }
811 
812   /* Figure out how many pages need to be copied out of the transaction
813   ** journal.
814   */
815   rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
816   if( rc!=SQLITE_OK ){
817     goto end_ckpt_playback;
818   }
819   rc = sqliteOsFileSize(&pPager->jfd, &szJ);
820   if( rc!=SQLITE_OK ){
821     goto end_ckpt_playback;
822   }
823   nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format);
824   for(i=nRec-1; i>=0; i--){
825     rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format);
826     if( rc!=SQLITE_OK ){
827       assert( rc!=SQLITE_DONE );
828       goto end_ckpt_playback;
829     }
830   }
831 
832 end_ckpt_playback:
833   if( rc!=SQLITE_OK ){
834     pPager->errMask |= PAGER_ERR_CORRUPT;
835     rc = SQLITE_CORRUPT;
836   }
837   return rc;
838 }
839 
840 /*
841 ** Change the maximum number of in-memory pages that are allowed.
842 **
843 ** The maximum number is the absolute value of the mxPage parameter.
844 ** If mxPage is negative, the noSync flag is also set.  noSync bypasses
845 ** calls to sqliteOsSync().  The pager runs much faster with noSync on,
846 ** but if the operating system crashes or there is an abrupt power
847 ** failure, the database file might be left in an inconsistent and
848 ** unrepairable state.
849 */
850 void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
851   if( mxPage>=0 ){
852     pPager->noSync = pPager->tempFile;
853     if( pPager->noSync==0 ) pPager->needSync = 0;
854   }else{
855     pPager->noSync = 1;
856     mxPage = -mxPage;
857   }
858   if( mxPage>10 ){
859     pPager->mxPage = mxPage;
860   }
861 }
862 
863 /*
864 ** Adjust the robustness of the database to damage due to OS crashes
865 ** or power failures by changing the number of syncs()s when writing
866 ** the rollback journal.  There are three levels:
867 **
868 **    OFF       sqliteOsSync() is never called.  This is the default
869 **              for temporary and transient files.
870 **
871 **    NORMAL    The journal is synced once before writes begin on the
872 **              database.  This is normally adequate protection, but
873 **              it is theoretically possible, though very unlikely,
874 **              that an inopertune power failure could leave the journal
875 **              in a state which would cause damage to the database
876 **              when it is rolled back.
877 **
878 **    FULL      The journal is synced twice before writes begin on the
879 **              database (with some additional information - the nRec field
880 **              of the journal header - being written in between the two
881 **              syncs).  If we assume that writing a
882 **              single disk sector is atomic, then this mode provides
883 **              assurance that the journal will not be corrupted to the
884 **              point of causing damage to the database during rollback.
885 **
886 ** Numeric values associated with these states are OFF==1, NORMAL=2,
887 ** and FULL=3.
888 */
889 void sqlitepager_set_safety_level(Pager *pPager, int level){
890   pPager->noSync =  level==1 || pPager->tempFile;
891   pPager->fullSync = level==3 && !pPager->tempFile;
892   if( pPager->noSync==0 ) pPager->needSync = 0;
893 }
894 
895 /*
896 ** Open a temporary file.  Write the name of the file into zName
897 ** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.)  Write
898 ** the file descriptor into *fd.  Return SQLITE_OK on success or some
899 ** other error code if we fail.
900 **
901 ** The OS will automatically delete the temporary file when it is
902 ** closed.
903 */
904 static int sqlitepager_opentemp(char *zFile, OsFile *fd){
905   int cnt = 8;
906   int rc;
907   do{
908     cnt--;
909     sqliteOsTempFileName(zFile);
910     rc = sqliteOsOpenExclusive(zFile, fd, 1);
911   }while( cnt>0 && rc!=SQLITE_OK );
912   return rc;
913 }
914 
915 /*
916 ** Create a new page cache and put a pointer to the page cache in *ppPager.
917 ** The file to be cached need not exist.  The file is not locked until
918 ** the first call to sqlitepager_get() and is only held open until the
919 ** last page is released using sqlitepager_unref().
920 **
921 ** If zFilename is NULL then a randomly-named temporary file is created
922 ** and used as the file to be cached.  The file will be deleted
923 ** automatically when it is closed.
924 */
925 int sqlitepager_open(
926   Pager **ppPager,         /* Return the Pager structure here */
927   const char *zFilename,   /* Name of the database file to open */
928   int mxPage,              /* Max number of in-memory cache pages */
929   int nExtra,              /* Extra bytes append to each in-memory page */
930   int useJournal           /* TRUE to use a rollback journal on this file */
931 ){
932   Pager *pPager;
933   char *zFullPathname;
934   int nameLen;
935   OsFile fd;
936   int rc, i;
937   int tempFile;
938   int readOnly = 0;
939   char zTemp[SQLITE_TEMPNAME_SIZE];
940 
941   *ppPager = 0;
942   if( sqlite_malloc_failed ){
943     return SQLITE_NOMEM;
944   }
945   if( zFilename && zFilename[0] ){
946     zFullPathname = sqliteOsFullPathname(zFilename);
947     rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly);
948     tempFile = 0;
949   }else{
950     rc = sqlitepager_opentemp(zTemp, &fd);
951     zFilename = zTemp;
952     zFullPathname = sqliteOsFullPathname(zFilename);
953     tempFile = 1;
954   }
955   if( sqlite_malloc_failed ){
956     return SQLITE_NOMEM;
957   }
958   if( rc!=SQLITE_OK ){
959     sqliteFree(zFullPathname);
960     return SQLITE_CANTOPEN;
961   }
962   nameLen = strlen(zFullPathname);
963   pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 );
964   if( pPager==0 ){
965     sqliteOsClose(&fd);
966     sqliteFree(zFullPathname);
967     return SQLITE_NOMEM;
968   }
969   SET_PAGER(pPager);
970   pPager->zFilename = (char*)&pPager[1];
971   pPager->zDirectory = &pPager->zFilename[nameLen+1];
972   pPager->zJournal = &pPager->zDirectory[nameLen+1];
973   strcpy(pPager->zFilename, zFullPathname);
974   strcpy(pPager->zDirectory, zFullPathname);
975   for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){}
976   if( i>0 ) pPager->zDirectory[i-1] = 0;
977   strcpy(pPager->zJournal, zFullPathname);
978   sqliteFree(zFullPathname);
979   strcpy(&pPager->zJournal[nameLen], "-journal");
980   pPager->fd = fd;
981   pPager->journalOpen = 0;
982   pPager->useJournal = useJournal;
983   pPager->ckptOpen = 0;
984   pPager->ckptInUse = 0;
985   pPager->nRef = 0;
986   pPager->dbSize = -1;
987   pPager->ckptSize = 0;
988   pPager->ckptJSize = 0;
989   pPager->nPage = 0;
990   pPager->mxPage = mxPage>5 ? mxPage : 10;
991   pPager->state = SQLITE_UNLOCK;
992   pPager->errMask = 0;
993   pPager->tempFile = tempFile;
994   pPager->readOnly = readOnly;
995   pPager->needSync = 0;
996   pPager->noSync = pPager->tempFile || !useJournal;
997   pPager->pFirst = 0;
998   pPager->pFirstSynced = 0;
999   pPager->pLast = 0;
1000   pPager->nExtra = nExtra;
1001   memset(pPager->aHash, 0, sizeof(pPager->aHash));
1002   *ppPager = pPager;
1003   return SQLITE_OK;
1004 }
1005 
1006 /*
1007 ** Set the destructor for this pager.  If not NULL, the destructor is called
1008 ** when the reference count on each page reaches zero.  The destructor can
1009 ** be used to clean up information in the extra segment appended to each page.
1010 **
1011 ** The destructor is not called as a result sqlitepager_close().
1012 ** Destructors are only called by sqlitepager_unref().
1013 */
1014 void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){
1015   pPager->xDestructor = xDesc;
1016 }
1017 
1018 /*
1019 ** Return the total number of pages in the disk file associated with
1020 ** pPager.
1021 */
1022 int sqlitepager_pagecount(Pager *pPager){
1023   off_t n;
1024   assert( pPager!=0 );
1025   if( pPager->dbSize>=0 ){
1026     return pPager->dbSize;
1027   }
1028   if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
1029     pPager->errMask |= PAGER_ERR_DISK;
1030     return 0;
1031   }
1032   n /= SQLITE_PAGE_SIZE;
1033   if( pPager->state!=SQLITE_UNLOCK ){
1034     pPager->dbSize = n;
1035   }
1036   return n;
1037 }
1038 
1039 /*
1040 ** Forward declaration
1041 */
1042 static int syncJournal(Pager*);
1043 
1044 /*
1045 ** Truncate the file to the number of pages specified.
1046 */
1047 int sqlitepager_truncate(Pager *pPager, Pgno nPage){
1048   int rc;
1049   if( pPager->dbSize<0 ){
1050     sqlitepager_pagecount(pPager);
1051   }
1052   if( pPager->errMask!=0 ){
1053     rc = pager_errcode(pPager);
1054     return rc;
1055   }
1056   if( nPage>=(unsigned)pPager->dbSize ){
1057     return SQLITE_OK;
1058   }
1059   syncJournal(pPager);
1060   rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage);
1061   if( rc==SQLITE_OK ){
1062     pPager->dbSize = nPage;
1063   }
1064   return rc;
1065 }
1066 
1067 /*
1068 ** Shutdown the page cache.  Free all memory and close all files.
1069 **
1070 ** If a transaction was in progress when this routine is called, that
1071 ** transaction is rolled back.  All outstanding pages are invalidated
1072 ** and their memory is freed.  Any attempt to use a page associated
1073 ** with this page cache after this function returns will likely
1074 ** result in a coredump.
1075 */
1076 int sqlitepager_close(Pager *pPager){
1077   PgHdr *pPg, *pNext;
1078   switch( pPager->state ){
1079     case SQLITE_WRITELOCK: {
1080       sqlitepager_rollback(pPager);
1081       sqliteOsUnlock(&pPager->fd);
1082       assert( pPager->journalOpen==0 );
1083       break;
1084     }
1085     case SQLITE_READLOCK: {
1086       sqliteOsUnlock(&pPager->fd);
1087       break;
1088     }
1089     default: {
1090       /* Do nothing */
1091       break;
1092     }
1093   }
1094   for(pPg=pPager->pAll; pPg; pPg=pNext){
1095     pNext = pPg->pNextAll;
1096     sqliteFree(pPg);
1097   }
1098   sqliteOsClose(&pPager->fd);
1099   assert( pPager->journalOpen==0 );
1100   /* Temp files are automatically deleted by the OS
1101   ** if( pPager->tempFile ){
1102   **   sqliteOsDelete(pPager->zFilename);
1103   ** }
1104   */
1105   CLR_PAGER(pPager);
1106   if( pPager->zFilename!=(char*)&pPager[1] ){
1107     assert( 0 );  /* Cannot happen */
1108     sqliteFree(pPager->zFilename);
1109     sqliteFree(pPager->zJournal);
1110     sqliteFree(pPager->zDirectory);
1111   }
1112   sqliteFree(pPager);
1113   return SQLITE_OK;
1114 }
1115 
1116 /*
1117 ** Return the page number for the given page data.
1118 */
1119 Pgno sqlitepager_pagenumber(void *pData){
1120   PgHdr *p = DATA_TO_PGHDR(pData);
1121   return p->pgno;
1122 }
1123 
1124 /*
1125 ** Increment the reference count for a page.  If the page is
1126 ** currently on the freelist (the reference count is zero) then
1127 ** remove it from the freelist.
1128 */
1129 #define page_ref(P)   ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
1130 static void _page_ref(PgHdr *pPg){
1131   if( pPg->nRef==0 ){
1132     /* The page is currently on the freelist.  Remove it. */
1133     if( pPg==pPg->pPager->pFirstSynced ){
1134       PgHdr *p = pPg->pNextFree;
1135       while( p && p->needSync ){ p = p->pNextFree; }
1136       pPg->pPager->pFirstSynced = p;
1137     }
1138     if( pPg->pPrevFree ){
1139       pPg->pPrevFree->pNextFree = pPg->pNextFree;
1140     }else{
1141       pPg->pPager->pFirst = pPg->pNextFree;
1142     }
1143     if( pPg->pNextFree ){
1144       pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1145     }else{
1146       pPg->pPager->pLast = pPg->pPrevFree;
1147     }
1148     pPg->pPager->nRef++;
1149   }
1150   pPg->nRef++;
1151   REFINFO(pPg);
1152 }
1153 
1154 /*
1155 ** Increment the reference count for a page.  The input pointer is
1156 ** a reference to the page data.
1157 */
1158 int sqlitepager_ref(void *pData){
1159   PgHdr *pPg = DATA_TO_PGHDR(pData);
1160   page_ref(pPg);
1161   return SQLITE_OK;
1162 }
1163 
1164 /*
1165 ** Sync the journal.  In other words, make sure all the pages that have
1166 ** been written to the journal have actually reached the surface of the
1167 ** disk.  It is not safe to modify the original database file until after
1168 ** the journal has been synced.  If the original database is modified before
1169 ** the journal is synced and a power failure occurs, the unsynced journal
1170 ** data would be lost and we would be unable to completely rollback the
1171 ** database changes.  Database corruption would occur.
1172 **
1173 ** This routine also updates the nRec field in the header of the journal.
1174 ** (See comments on the pager_playback() routine for additional information.)
1175 ** If the sync mode is FULL, two syncs will occur.  First the whole journal
1176 ** is synced, then the nRec field is updated, then a second sync occurs.
1177 **
1178 ** For temporary databases, we do not care if we are able to rollback
1179 ** after a power failure, so sync occurs.
1180 **
1181 ** This routine clears the needSync field of every page current held in
1182 ** memory.
1183 */
1184 static int syncJournal(Pager *pPager){
1185   PgHdr *pPg;
1186   int rc = SQLITE_OK;
1187 
1188   /* Sync the journal before modifying the main database
1189   ** (assuming there is a journal and it needs to be synced.)
1190   */
1191   if( pPager->needSync ){
1192     if( !pPager->tempFile ){
1193       assert( pPager->journalOpen );
1194       /* assert( !pPager->noSync ); // noSync might be set if synchronous
1195       ** was turned off after the transaction was started.  Ticket #615 */
1196 #ifndef NDEBUG
1197       {
1198         /* Make sure the pPager->nRec counter we are keeping agrees
1199         ** with the nRec computed from the size of the journal file.
1200         */
1201         off_t hdrSz, pgSz, jSz;
1202         hdrSz = JOURNAL_HDR_SZ(journal_format);
1203         pgSz = JOURNAL_PG_SZ(journal_format);
1204         rc = sqliteOsFileSize(&pPager->jfd, &jSz);
1205         if( rc!=0 ) return rc;
1206         assert( pPager->nRec*pgSz+hdrSz==jSz );
1207       }
1208 #endif
1209       if( journal_format>=3 ){
1210         /* Write the nRec value into the journal file header */
1211         off_t szJ;
1212         if( pPager->fullSync ){
1213           TRACE1("SYNC\n");
1214           rc = sqliteOsSync(&pPager->jfd);
1215           if( rc!=0 ) return rc;
1216         }
1217         sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1));
1218         rc = write32bits(&pPager->jfd, pPager->nRec);
1219         if( rc ) return rc;
1220         szJ = JOURNAL_HDR_SZ(journal_format) +
1221                  pPager->nRec*JOURNAL_PG_SZ(journal_format);
1222         sqliteOsSeek(&pPager->jfd, szJ);
1223       }
1224       TRACE1("SYNC\n");
1225       rc = sqliteOsSync(&pPager->jfd);
1226       if( rc!=0 ) return rc;
1227       pPager->journalStarted = 1;
1228     }
1229     pPager->needSync = 0;
1230 
1231     /* Erase the needSync flag from every page.
1232     */
1233     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1234       pPg->needSync = 0;
1235     }
1236     pPager->pFirstSynced = pPager->pFirst;
1237   }
1238 
1239 #ifndef NDEBUG
1240   /* If the Pager.needSync flag is clear then the PgHdr.needSync
1241   ** flag must also be clear for all pages.  Verify that this
1242   ** invariant is true.
1243   */
1244   else{
1245     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1246       assert( pPg->needSync==0 );
1247     }
1248     assert( pPager->pFirstSynced==pPager->pFirst );
1249   }
1250 #endif
1251 
1252   return rc;
1253 }
1254 
1255 /*
1256 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
1257 ** every one of those pages out to the database file and mark them all
1258 ** as clean.
1259 */
1260 static int pager_write_pagelist(PgHdr *pList){
1261   Pager *pPager;
1262   int rc;
1263 
1264   if( pList==0 ) return SQLITE_OK;
1265   pPager = pList->pPager;
1266   while( pList ){
1267     assert( pList->dirty );
1268     sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1269     CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
1270     TRACE2("STORE %d\n", pList->pgno);
1271     rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
1272     CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
1273     if( rc ) return rc;
1274     pList->dirty = 0;
1275     pList = pList->pDirty;
1276   }
1277   return SQLITE_OK;
1278 }
1279 
1280 /*
1281 ** Collect every dirty page into a dirty list and
1282 ** return a pointer to the head of that list.  All pages are
1283 ** collected even if they are still in use.
1284 */
1285 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
1286   PgHdr *p, *pList;
1287   pList = 0;
1288   for(p=pPager->pAll; p; p=p->pNextAll){
1289     if( p->dirty ){
1290       p->pDirty = pList;
1291       pList = p;
1292     }
1293   }
1294   return pList;
1295 }
1296 
1297 /*
1298 ** Acquire a page.
1299 **
1300 ** A read lock on the disk file is obtained when the first page is acquired.
1301 ** This read lock is dropped when the last page is released.
1302 **
1303 ** A _get works for any page number greater than 0.  If the database
1304 ** file is smaller than the requested page, then no actual disk
1305 ** read occurs and the memory image of the page is initialized to
1306 ** all zeros.  The extra data appended to a page is always initialized
1307 ** to zeros the first time a page is loaded into memory.
1308 **
1309 ** The acquisition might fail for several reasons.  In all cases,
1310 ** an appropriate error code is returned and *ppPage is set to NULL.
1311 **
1312 ** See also sqlitepager_lookup().  Both this routine and _lookup() attempt
1313 ** to find a page in the in-memory cache first.  If the page is not already
1314 ** in memory, this routine goes to disk to read it in whereas _lookup()
1315 ** just returns 0.  This routine acquires a read-lock the first time it
1316 ** has to go to disk, and could also playback an old journal if necessary.
1317 ** Since _lookup() never goes to disk, it never has to deal with locks
1318 ** or journal files.
1319 */
1320 int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
1321   PgHdr *pPg;
1322   int rc;
1323 
1324   /* Make sure we have not hit any critical errors.
1325   */
1326   assert( pPager!=0 );
1327   assert( pgno!=0 );
1328   *ppPage = 0;
1329   if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1330     return pager_errcode(pPager);
1331   }
1332 
1333   /* If this is the first page accessed, then get a read lock
1334   ** on the database file.
1335   */
1336   if( pPager->nRef==0 ){
1337     rc = sqliteOsReadLock(&pPager->fd);
1338     if( rc!=SQLITE_OK ){
1339       return rc;
1340     }
1341     pPager->state = SQLITE_READLOCK;
1342 
1343     /* If a journal file exists, try to play it back.
1344     */
1345     if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){
1346        int rc;
1347 
1348        /* Get a write lock on the database
1349        */
1350        rc = sqliteOsWriteLock(&pPager->fd);
1351        if( rc!=SQLITE_OK ){
1352          if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){
1353            /* This should never happen! */
1354            rc = SQLITE_INTERNAL;
1355          }
1356          return rc;
1357        }
1358        pPager->state = SQLITE_WRITELOCK;
1359 
1360        /* Open the journal for reading only.  Return SQLITE_BUSY if
1361        ** we are unable to open the journal file.
1362        **
1363        ** The journal file does not need to be locked itself.  The
1364        ** journal file is never open unless the main database file holds
1365        ** a write lock, so there is never any chance of two or more
1366        ** processes opening the journal at the same time.
1367        */
1368        rc = sqliteOsOpenReadOnly(pPager->zJournal, &pPager->jfd);
1369        if( rc!=SQLITE_OK ){
1370          rc = sqliteOsUnlock(&pPager->fd);
1371          assert( rc==SQLITE_OK );
1372          return SQLITE_BUSY;
1373        }
1374        pPager->journalOpen = 1;
1375        pPager->journalStarted = 0;
1376 
1377        /* Playback and delete the journal.  Drop the database write
1378        ** lock and reacquire the read lock.
1379        */
1380        rc = pager_playback(pPager, 0);
1381        if( rc!=SQLITE_OK ){
1382          return rc;
1383        }
1384     }
1385     pPg = 0;
1386   }else{
1387     /* Search for page in cache */
1388     pPg = pager_lookup(pPager, pgno);
1389   }
1390   if( pPg==0 ){
1391     /* The requested page is not in the page cache. */
1392     int h;
1393     pPager->nMiss++;
1394     if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){
1395       /* Create a new page */
1396       pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE
1397                               + sizeof(u32) + pPager->nExtra );
1398       if( pPg==0 ){
1399         pager_unwritelock(pPager);
1400         pPager->errMask |= PAGER_ERR_MEM;
1401         return SQLITE_NOMEM;
1402       }
1403       memset(pPg, 0, sizeof(*pPg));
1404       pPg->pPager = pPager;
1405       pPg->pNextAll = pPager->pAll;
1406       if( pPager->pAll ){
1407         pPager->pAll->pPrevAll = pPg;
1408       }
1409       pPg->pPrevAll = 0;
1410       pPager->pAll = pPg;
1411       pPager->nPage++;
1412     }else{
1413       /* Find a page to recycle.  Try to locate a page that does not
1414       ** require us to do an fsync() on the journal.
1415       */
1416       pPg = pPager->pFirstSynced;
1417 
1418       /* If we could not find a page that does not require an fsync()
1419       ** on the journal file then fsync the journal file.  This is a
1420       ** very slow operation, so we work hard to avoid it.  But sometimes
1421       ** it can't be helped.
1422       */
1423       if( pPg==0 ){
1424         int rc = syncJournal(pPager);
1425         if( rc!=0 ){
1426           sqlitepager_rollback(pPager);
1427           return SQLITE_IOERR;
1428         }
1429         pPg = pPager->pFirst;
1430       }
1431       assert( pPg->nRef==0 );
1432 
1433       /* Write the page to the database file if it is dirty.
1434       */
1435       if( pPg->dirty ){
1436         assert( pPg->needSync==0 );
1437         pPg->pDirty = 0;
1438         rc = pager_write_pagelist( pPg );
1439         if( rc!=SQLITE_OK ){
1440           sqlitepager_rollback(pPager);
1441           return SQLITE_IOERR;
1442         }
1443       }
1444       assert( pPg->dirty==0 );
1445 
1446       /* If the page we are recycling is marked as alwaysRollback, then
1447       ** set the global alwaysRollback flag, thus disabling the
1448       ** sqlite_dont_rollback() optimization for the rest of this transaction.
1449       ** It is necessary to do this because the page marked alwaysRollback
1450       ** might be reloaded at a later time but at that point we won't remember
1451       ** that is was marked alwaysRollback.  This means that all pages must
1452       ** be marked as alwaysRollback from here on out.
1453       */
1454       if( pPg->alwaysRollback ){
1455         pPager->alwaysRollback = 1;
1456       }
1457 
1458       /* Unlink the old page from the free list and the hash table
1459       */
1460       if( pPg==pPager->pFirstSynced ){
1461         PgHdr *p = pPg->pNextFree;
1462         while( p && p->needSync ){ p = p->pNextFree; }
1463         pPager->pFirstSynced = p;
1464       }
1465       if( pPg->pPrevFree ){
1466         pPg->pPrevFree->pNextFree = pPg->pNextFree;
1467       }else{
1468         assert( pPager->pFirst==pPg );
1469         pPager->pFirst = pPg->pNextFree;
1470       }
1471       if( pPg->pNextFree ){
1472         pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1473       }else{
1474         assert( pPager->pLast==pPg );
1475         pPager->pLast = pPg->pPrevFree;
1476       }
1477       pPg->pNextFree = pPg->pPrevFree = 0;
1478       if( pPg->pNextHash ){
1479         pPg->pNextHash->pPrevHash = pPg->pPrevHash;
1480       }
1481       if( pPg->pPrevHash ){
1482         pPg->pPrevHash->pNextHash = pPg->pNextHash;
1483       }else{
1484         h = pager_hash(pPg->pgno);
1485         assert( pPager->aHash[h]==pPg );
1486         pPager->aHash[h] = pPg->pNextHash;
1487       }
1488       pPg->pNextHash = pPg->pPrevHash = 0;
1489       pPager->nOvfl++;
1490     }
1491     pPg->pgno = pgno;
1492     if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
1493       sqliteCheckMemory(pPager->aInJournal, pgno/8);
1494       assert( pPager->journalOpen );
1495       pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
1496       pPg->needSync = 0;
1497     }else{
1498       pPg->inJournal = 0;
1499       pPg->needSync = 0;
1500     }
1501     if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
1502              && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
1503       page_add_to_ckpt_list(pPg);
1504     }else{
1505       page_remove_from_ckpt_list(pPg);
1506     }
1507     pPg->dirty = 0;
1508     pPg->nRef = 1;
1509     REFINFO(pPg);
1510     pPager->nRef++;
1511     h = pager_hash(pgno);
1512     pPg->pNextHash = pPager->aHash[h];
1513     pPager->aHash[h] = pPg;
1514     if( pPg->pNextHash ){
1515       assert( pPg->pNextHash->pPrevHash==0 );
1516       pPg->pNextHash->pPrevHash = pPg;
1517     }
1518     if( pPager->nExtra>0 ){
1519       memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
1520     }
1521     if( pPager->dbSize<0 ) sqlitepager_pagecount(pPager);
1522     if( pPager->errMask!=0 ){
1523       sqlitepager_unref(PGHDR_TO_DATA(pPg));
1524       rc = pager_errcode(pPager);
1525       return rc;
1526     }
1527     if( pPager->dbSize<(int)pgno ){
1528       memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1529     }else{
1530       int rc;
1531       sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1532       rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
1533       TRACE2("FETCH %d\n", pPg->pgno);
1534       CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
1535       if( rc!=SQLITE_OK ){
1536         off_t fileSize;
1537         if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
1538                || fileSize>=pgno*SQLITE_PAGE_SIZE ){
1539           sqlitepager_unref(PGHDR_TO_DATA(pPg));
1540           return rc;
1541         }else{
1542           memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1543         }
1544       }
1545     }
1546   }else{
1547     /* The requested page is in the page cache. */
1548     pPager->nHit++;
1549     page_ref(pPg);
1550   }
1551   *ppPage = PGHDR_TO_DATA(pPg);
1552   return SQLITE_OK;
1553 }
1554 
1555 /*
1556 ** Acquire a page if it is already in the in-memory cache.  Do
1557 ** not read the page from disk.  Return a pointer to the page,
1558 ** or 0 if the page is not in cache.
1559 **
1560 ** See also sqlitepager_get().  The difference between this routine
1561 ** and sqlitepager_get() is that _get() will go to the disk and read
1562 ** in the page if the page is not already in cache.  This routine
1563 ** returns NULL if the page is not in cache or if a disk I/O error
1564 ** has ever happened.
1565 */
1566 void *sqlitepager_lookup(Pager *pPager, Pgno pgno){
1567   PgHdr *pPg;
1568 
1569   assert( pPager!=0 );
1570   assert( pgno!=0 );
1571   if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1572     return 0;
1573   }
1574   /* if( pPager->nRef==0 ){
1575   **  return 0;
1576   ** }
1577   */
1578   pPg = pager_lookup(pPager, pgno);
1579   if( pPg==0 ) return 0;
1580   page_ref(pPg);
1581   return PGHDR_TO_DATA(pPg);
1582 }
1583 
1584 /*
1585 ** Release a page.
1586 **
1587 ** If the number of references to the page drop to zero, then the
1588 ** page is added to the LRU list.  When all references to all pages
1589 ** are released, a rollback occurs and the lock on the database is
1590 ** removed.
1591 */
1592 int sqlitepager_unref(void *pData){
1593   PgHdr *pPg;
1594 
1595   /* Decrement the reference count for this page
1596   */
1597   pPg = DATA_TO_PGHDR(pData);
1598   assert( pPg->nRef>0 );
1599   pPg->nRef--;
1600   REFINFO(pPg);
1601 
1602   /* When the number of references to a page reach 0, call the
1603   ** destructor and add the page to the freelist.
1604   */
1605   if( pPg->nRef==0 ){
1606     Pager *pPager;
1607     pPager = pPg->pPager;
1608     pPg->pNextFree = 0;
1609     pPg->pPrevFree = pPager->pLast;
1610     pPager->pLast = pPg;
1611     if( pPg->pPrevFree ){
1612       pPg->pPrevFree->pNextFree = pPg;
1613     }else{
1614       pPager->pFirst = pPg;
1615     }
1616     if( pPg->needSync==0 && pPager->pFirstSynced==0 ){
1617       pPager->pFirstSynced = pPg;
1618     }
1619     if( pPager->xDestructor ){
1620       pPager->xDestructor(pData);
1621     }
1622 
1623     /* When all pages reach the freelist, drop the read lock from
1624     ** the database file.
1625     */
1626     pPager->nRef--;
1627     assert( pPager->nRef>=0 );
1628     if( pPager->nRef==0 ){
1629       pager_reset(pPager);
1630     }
1631   }
1632   return SQLITE_OK;
1633 }
1634 
1635 /*
1636 ** Create a journal file for pPager.  There should already be a write
1637 ** lock on the database file when this routine is called.
1638 **
1639 ** Return SQLITE_OK if everything.  Return an error code and release the
1640 ** write lock if anything goes wrong.
1641 */
1642 static int pager_open_journal(Pager *pPager){
1643   int rc;
1644   assert( pPager->state==SQLITE_WRITELOCK );
1645   assert( pPager->journalOpen==0 );
1646   assert( pPager->useJournal );
1647   sqlitepager_pagecount(pPager);
1648   pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
1649   if( pPager->aInJournal==0 ){
1650     sqliteOsReadLock(&pPager->fd);
1651     pPager->state = SQLITE_READLOCK;
1652     return SQLITE_NOMEM;
1653   }
1654   rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
1655   if( rc!=SQLITE_OK ){
1656     sqliteFree(pPager->aInJournal);
1657     pPager->aInJournal = 0;
1658     sqliteOsReadLock(&pPager->fd);
1659     pPager->state = SQLITE_READLOCK;
1660     return SQLITE_CANTOPEN;
1661   }
1662   sqliteOsOpenDirectory(pPager->zDirectory, &pPager->jfd);
1663   pPager->journalOpen = 1;
1664   pPager->journalStarted = 0;
1665   pPager->needSync = 0;
1666   pPager->alwaysRollback = 0;
1667   pPager->nRec = 0;
1668   if( pPager->errMask!=0 ){
1669     rc = pager_errcode(pPager);
1670     return rc;
1671   }
1672   pPager->origDbSize = pPager->dbSize;
1673   if( journal_format==JOURNAL_FORMAT_3 ){
1674     rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3));
1675     if( rc==SQLITE_OK ){
1676       rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
1677     }
1678     if( rc==SQLITE_OK ){
1679       sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
1680       rc = write32bits(&pPager->jfd, pPager->cksumInit);
1681     }
1682   }else if( journal_format==JOURNAL_FORMAT_2 ){
1683     rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2));
1684   }else{
1685     assert( journal_format==JOURNAL_FORMAT_1 );
1686     rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1));
1687   }
1688   if( rc==SQLITE_OK ){
1689     rc = write32bits(&pPager->jfd, pPager->dbSize);
1690   }
1691   if( pPager->ckptAutoopen && rc==SQLITE_OK ){
1692     rc = sqlitepager_ckpt_begin(pPager);
1693   }
1694   if( rc!=SQLITE_OK ){
1695     rc = pager_unwritelock(pPager);
1696     if( rc==SQLITE_OK ){
1697       rc = SQLITE_FULL;
1698     }
1699   }
1700   return rc;
1701 }
1702 
1703 /*
1704 ** Acquire a write-lock on the database.  The lock is removed when
1705 ** the any of the following happen:
1706 **
1707 **   *  sqlitepager_commit() is called.
1708 **   *  sqlitepager_rollback() is called.
1709 **   *  sqlitepager_close() is called.
1710 **   *  sqlitepager_unref() is called to on every outstanding page.
1711 **
1712 ** The parameter to this routine is a pointer to any open page of the
1713 ** database file.  Nothing changes about the page - it is used merely
1714 ** to acquire a pointer to the Pager structure and as proof that there
1715 ** is already a read-lock on the database.
1716 **
1717 ** A journal file is opened if this is not a temporary file.  For
1718 ** temporary files, the opening of the journal file is deferred until
1719 ** there is an actual need to write to the journal.
1720 **
1721 ** If the database is already write-locked, this routine is a no-op.
1722 */
1723 int sqlitepager_begin(void *pData){
1724   PgHdr *pPg = DATA_TO_PGHDR(pData);
1725   Pager *pPager = pPg->pPager;
1726   int rc = SQLITE_OK;
1727   assert( pPg->nRef>0 );
1728   assert( pPager->state!=SQLITE_UNLOCK );
1729   if( pPager->state==SQLITE_READLOCK ){
1730     assert( pPager->aInJournal==0 );
1731     rc = sqliteOsWriteLock(&pPager->fd);
1732     if( rc!=SQLITE_OK ){
1733       return rc;
1734     }
1735     pPager->state = SQLITE_WRITELOCK;
1736     pPager->dirtyFile = 0;
1737     TRACE1("TRANSACTION\n");
1738     if( pPager->useJournal && !pPager->tempFile ){
1739       rc = pager_open_journal(pPager);
1740     }
1741   }
1742   return rc;
1743 }
1744 
1745 /*
1746 ** Mark a data page as writeable.  The page is written into the journal
1747 ** if it is not there already.  This routine must be called before making
1748 ** changes to a page.
1749 **
1750 ** The first time this routine is called, the pager creates a new
1751 ** journal and acquires a write lock on the database.  If the write
1752 ** lock could not be acquired, this routine returns SQLITE_BUSY.  The
1753 ** calling routine must check for that return value and be careful not to
1754 ** change any page data until this routine returns SQLITE_OK.
1755 **
1756 ** If the journal file could not be written because the disk is full,
1757 ** then this routine returns SQLITE_FULL and does an immediate rollback.
1758 ** All subsequent write attempts also return SQLITE_FULL until there
1759 ** is a call to sqlitepager_commit() or sqlitepager_rollback() to
1760 ** reset.
1761 */
1762 int sqlitepager_write(void *pData){
1763   PgHdr *pPg = DATA_TO_PGHDR(pData);
1764   Pager *pPager = pPg->pPager;
1765   int rc = SQLITE_OK;
1766 
1767   /* Check for errors
1768   */
1769   if( pPager->errMask ){
1770     return pager_errcode(pPager);
1771   }
1772   if( pPager->readOnly ){
1773     return SQLITE_PERM;
1774   }
1775 
1776   /* Mark the page as dirty.  If the page has already been written
1777   ** to the journal then we can return right away.
1778   */
1779   pPg->dirty = 1;
1780   if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
1781     pPager->dirtyFile = 1;
1782     return SQLITE_OK;
1783   }
1784 
1785   /* If we get this far, it means that the page needs to be
1786   ** written to the transaction journal or the ckeckpoint journal
1787   ** or both.
1788   **
1789   ** First check to see that the transaction journal exists and
1790   ** create it if it does not.
1791   */
1792   assert( pPager->state!=SQLITE_UNLOCK );
1793   rc = sqlitepager_begin(pData);
1794   if( rc!=SQLITE_OK ){
1795     return rc;
1796   }
1797   assert( pPager->state==SQLITE_WRITELOCK );
1798   if( !pPager->journalOpen && pPager->useJournal ){
1799     rc = pager_open_journal(pPager);
1800     if( rc!=SQLITE_OK ) return rc;
1801   }
1802   assert( pPager->journalOpen || !pPager->useJournal );
1803   pPager->dirtyFile = 1;
1804 
1805   /* The transaction journal now exists and we have a write lock on the
1806   ** main database file.  Write the current page to the transaction
1807   ** journal if it is not there already.
1808   */
1809   if( !pPg->inJournal && pPager->useJournal ){
1810     if( (int)pPg->pgno <= pPager->origDbSize ){
1811       int szPg;
1812       u32 saved;
1813       if( journal_format>=JOURNAL_FORMAT_3 ){
1814         u32 cksum = pager_cksum(pPager, pPg->pgno, pData);
1815         saved = *(u32*)PGHDR_TO_EXTRA(pPg);
1816         store32bits(cksum, pPg, SQLITE_PAGE_SIZE);
1817         szPg = SQLITE_PAGE_SIZE+8;
1818       }else{
1819         szPg = SQLITE_PAGE_SIZE+4;
1820       }
1821       store32bits(pPg->pgno, pPg, -4);
1822       CODEC(pPager, pData, pPg->pgno, 7);
1823       rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
1824       TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
1825       CODEC(pPager, pData, pPg->pgno, 0);
1826       if( journal_format>=JOURNAL_FORMAT_3 ){
1827         *(u32*)PGHDR_TO_EXTRA(pPg) = saved;
1828       }
1829       if( rc!=SQLITE_OK ){
1830         sqlitepager_rollback(pPager);
1831         pPager->errMask |= PAGER_ERR_FULL;
1832         return rc;
1833       }
1834       pPager->nRec++;
1835       assert( pPager->aInJournal!=0 );
1836       pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1837       pPg->needSync = !pPager->noSync;
1838       pPg->inJournal = 1;
1839       if( pPager->ckptInUse ){
1840         pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1841         page_add_to_ckpt_list(pPg);
1842       }
1843     }else{
1844       pPg->needSync = !pPager->journalStarted && !pPager->noSync;
1845       TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
1846     }
1847     if( pPg->needSync ){
1848       pPager->needSync = 1;
1849     }
1850   }
1851 
1852   /* If the checkpoint journal is open and the page is not in it,
1853   ** then write the current page to the checkpoint journal.  Note that
1854   ** the checkpoint journal always uses the simplier format 2 that lacks
1855   ** checksums.  The header is also omitted from the checkpoint journal.
1856   */
1857   if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
1858     assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1859     store32bits(pPg->pgno, pPg, -4);
1860     CODEC(pPager, pData, pPg->pgno, 7);
1861     rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
1862     TRACE2("CKPT-JOURNAL %d\n", pPg->pgno);
1863     CODEC(pPager, pData, pPg->pgno, 0);
1864     if( rc!=SQLITE_OK ){
1865       sqlitepager_rollback(pPager);
1866       pPager->errMask |= PAGER_ERR_FULL;
1867       return rc;
1868     }
1869     pPager->ckptNRec++;
1870     assert( pPager->aInCkpt!=0 );
1871     pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1872     page_add_to_ckpt_list(pPg);
1873   }
1874 
1875   /* Update the database size and return.
1876   */
1877   if( pPager->dbSize<(int)pPg->pgno ){
1878     pPager->dbSize = pPg->pgno;
1879   }
1880   return rc;
1881 }
1882 
1883 /*
1884 ** Return TRUE if the page given in the argument was previously passed
1885 ** to sqlitepager_write().  In other words, return TRUE if it is ok
1886 ** to change the content of the page.
1887 */
1888 int sqlitepager_iswriteable(void *pData){
1889   PgHdr *pPg = DATA_TO_PGHDR(pData);
1890   return pPg->dirty;
1891 }
1892 
1893 /*
1894 ** Replace the content of a single page with the information in the third
1895 ** argument.
1896 */
1897 int sqlitepager_overwrite(Pager *pPager, Pgno pgno, void *pData){
1898   void *pPage;
1899   int rc;
1900 
1901   rc = sqlitepager_get(pPager, pgno, &pPage);
1902   if( rc==SQLITE_OK ){
1903     rc = sqlitepager_write(pPage);
1904     if( rc==SQLITE_OK ){
1905       memcpy(pPage, pData, SQLITE_PAGE_SIZE);
1906     }
1907     sqlitepager_unref(pPage);
1908   }
1909   return rc;
1910 }
1911 
1912 /*
1913 ** A call to this routine tells the pager that it is not necessary to
1914 ** write the information on page "pgno" back to the disk, even though
1915 ** that page might be marked as dirty.
1916 **
1917 ** The overlying software layer calls this routine when all of the data
1918 ** on the given page is unused.  The pager marks the page as clean so
1919 ** that it does not get written to disk.
1920 **
1921 ** Tests show that this optimization, together with the
1922 ** sqlitepager_dont_rollback() below, more than double the speed
1923 ** of large INSERT operations and quadruple the speed of large DELETEs.
1924 **
1925 ** When this routine is called, set the alwaysRollback flag to true.
1926 ** Subsequent calls to sqlitepager_dont_rollback() for the same page
1927 ** will thereafter be ignored.  This is necessary to avoid a problem
1928 ** where a page with data is added to the freelist during one part of
1929 ** a transaction then removed from the freelist during a later part
1930 ** of the same transaction and reused for some other purpose.  When it
1931 ** is first added to the freelist, this routine is called.  When reused,
1932 ** the dont_rollback() routine is called.  But because the page contains
1933 ** critical data, we still need to be sure it gets rolled back in spite
1934 ** of the dont_rollback() call.
1935 */
1936 void sqlitepager_dont_write(Pager *pPager, Pgno pgno){
1937   PgHdr *pPg;
1938 
1939   pPg = pager_lookup(pPager, pgno);
1940   pPg->alwaysRollback = 1;
1941   if( pPg && pPg->dirty ){
1942     if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
1943       /* If this pages is the last page in the file and the file has grown
1944       ** during the current transaction, then do NOT mark the page as clean.
1945       ** When the database file grows, we must make sure that the last page
1946       ** gets written at least once so that the disk file will be the correct
1947       ** size. If you do not write this page and the size of the file
1948       ** on the disk ends up being too small, that can lead to database
1949       ** corruption during the next transaction.
1950       */
1951     }else{
1952       TRACE2("DONT_WRITE %d\n", pgno);
1953       pPg->dirty = 0;
1954     }
1955   }
1956 }
1957 
1958 /*
1959 ** A call to this routine tells the pager that if a rollback occurs,
1960 ** it is not necessary to restore the data on the given page.  This
1961 ** means that the pager does not have to record the given page in the
1962 ** rollback journal.
1963 */
1964 void sqlitepager_dont_rollback(void *pData){
1965   PgHdr *pPg = DATA_TO_PGHDR(pData);
1966   Pager *pPager = pPg->pPager;
1967 
1968   if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return;
1969   if( pPg->alwaysRollback || pPager->alwaysRollback ) return;
1970   if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
1971     assert( pPager->aInJournal!=0 );
1972     pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1973     pPg->inJournal = 1;
1974     if( pPager->ckptInUse ){
1975       pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1976       page_add_to_ckpt_list(pPg);
1977     }
1978     TRACE2("DONT_ROLLBACK %d\n", pPg->pgno);
1979   }
1980   if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
1981     assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1982     assert( pPager->aInCkpt!=0 );
1983     pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1984     page_add_to_ckpt_list(pPg);
1985   }
1986 }
1987 
1988 /*
1989 ** Commit all changes to the database and release the write lock.
1990 **
1991 ** If the commit fails for any reason, a rollback attempt is made
1992 ** and an error code is returned.  If the commit worked, SQLITE_OK
1993 ** is returned.
1994 */
1995 int sqlitepager_commit(Pager *pPager){
1996   int rc;
1997   PgHdr *pPg;
1998 
1999   if( pPager->errMask==PAGER_ERR_FULL ){
2000     rc = sqlitepager_rollback(pPager);
2001     if( rc==SQLITE_OK ){
2002       rc = SQLITE_FULL;
2003     }
2004     return rc;
2005   }
2006   if( pPager->errMask!=0 ){
2007     rc = pager_errcode(pPager);
2008     return rc;
2009   }
2010   if( pPager->state!=SQLITE_WRITELOCK ){
2011     return SQLITE_ERROR;
2012   }
2013   TRACE1("COMMIT\n");
2014   if( pPager->dirtyFile==0 ){
2015     /* Exit early (without doing the time-consuming sqliteOsSync() calls)
2016     ** if there have been no changes to the database file. */
2017     assert( pPager->needSync==0 );
2018     rc = pager_unwritelock(pPager);
2019     pPager->dbSize = -1;
2020     return rc;
2021   }
2022   assert( pPager->journalOpen );
2023   rc = syncJournal(pPager);
2024   if( rc!=SQLITE_OK ){
2025     goto commit_abort;
2026   }
2027   pPg = pager_get_all_dirty_pages(pPager);
2028   if( pPg ){
2029     rc = pager_write_pagelist(pPg);
2030     if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){
2031       goto commit_abort;
2032     }
2033   }
2034   rc = pager_unwritelock(pPager);
2035   pPager->dbSize = -1;
2036   return rc;
2037 
2038   /* Jump here if anything goes wrong during the commit process.
2039   */
2040 commit_abort:
2041   rc = sqlitepager_rollback(pPager);
2042   if( rc==SQLITE_OK ){
2043     rc = SQLITE_FULL;
2044   }
2045   return rc;
2046 }
2047 
2048 /*
2049 ** Rollback all changes.  The database falls back to read-only mode.
2050 ** All in-memory cache pages revert to their original data contents.
2051 ** The journal is deleted.
2052 **
2053 ** This routine cannot fail unless some other process is not following
2054 ** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
2055 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
2056 ** unless a prior malloc() failed (SQLITE_NOMEM).  Appropriate error
2057 ** codes are returned for all these occasions.  Otherwise,
2058 ** SQLITE_OK is returned.
2059 */
2060 int sqlitepager_rollback(Pager *pPager){
2061   int rc;
2062   TRACE1("ROLLBACK\n");
2063   if( !pPager->dirtyFile || !pPager->journalOpen ){
2064     rc = pager_unwritelock(pPager);
2065     pPager->dbSize = -1;
2066     return rc;
2067   }
2068 
2069   if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
2070     if( pPager->state>=SQLITE_WRITELOCK ){
2071       pager_playback(pPager, 1);
2072     }
2073     return pager_errcode(pPager);
2074   }
2075   if( pPager->state!=SQLITE_WRITELOCK ){
2076     return SQLITE_OK;
2077   }
2078   rc = pager_playback(pPager, 1);
2079   if( rc!=SQLITE_OK ){
2080     rc = SQLITE_CORRUPT;
2081     pPager->errMask |= PAGER_ERR_CORRUPT;
2082   }
2083   pPager->dbSize = -1;
2084   return rc;
2085 }
2086 
2087 /*
2088 ** Return TRUE if the database file is opened read-only.  Return FALSE
2089 ** if the database is (in theory) writable.
2090 */
2091 int sqlitepager_isreadonly(Pager *pPager){
2092   return pPager->readOnly;
2093 }
2094 
2095 /*
2096 ** This routine is used for testing and analysis only.
2097 */
2098 int *sqlitepager_stats(Pager *pPager){
2099   static int a[9];
2100   a[0] = pPager->nRef;
2101   a[1] = pPager->nPage;
2102   a[2] = pPager->mxPage;
2103   a[3] = pPager->dbSize;
2104   a[4] = pPager->state;
2105   a[5] = pPager->errMask;
2106   a[6] = pPager->nHit;
2107   a[7] = pPager->nMiss;
2108   a[8] = pPager->nOvfl;
2109   return a;
2110 }
2111 
2112 /*
2113 ** Set the checkpoint.
2114 **
2115 ** This routine should be called with the transaction journal already
2116 ** open.  A new checkpoint journal is created that can be used to rollback
2117 ** changes of a single SQL command within a larger transaction.
2118 */
2119 int sqlitepager_ckpt_begin(Pager *pPager){
2120   int rc;
2121   char zTemp[SQLITE_TEMPNAME_SIZE];
2122   if( !pPager->journalOpen ){
2123     pPager->ckptAutoopen = 1;
2124     return SQLITE_OK;
2125   }
2126   assert( pPager->journalOpen );
2127   assert( !pPager->ckptInUse );
2128   pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 );
2129   if( pPager->aInCkpt==0 ){
2130     sqliteOsReadLock(&pPager->fd);
2131     return SQLITE_NOMEM;
2132   }
2133 #ifndef NDEBUG
2134   rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize);
2135   if( rc ) goto ckpt_begin_failed;
2136   assert( pPager->ckptJSize ==
2137     pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) );
2138 #endif
2139   pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format)
2140                          + JOURNAL_HDR_SZ(journal_format);
2141   pPager->ckptSize = pPager->dbSize;
2142   if( !pPager->ckptOpen ){
2143     rc = sqlitepager_opentemp(zTemp, &pPager->cpfd);
2144     if( rc ) goto ckpt_begin_failed;
2145     pPager->ckptOpen = 1;
2146     pPager->ckptNRec = 0;
2147   }
2148   pPager->ckptInUse = 1;
2149   return SQLITE_OK;
2150 
2151 ckpt_begin_failed:
2152   if( pPager->aInCkpt ){
2153     sqliteFree(pPager->aInCkpt);
2154     pPager->aInCkpt = 0;
2155   }
2156   return rc;
2157 }
2158 
2159 /*
2160 ** Commit a checkpoint.
2161 */
2162 int sqlitepager_ckpt_commit(Pager *pPager){
2163   if( pPager->ckptInUse ){
2164     PgHdr *pPg, *pNext;
2165     sqliteOsSeek(&pPager->cpfd, 0);
2166     /* sqliteOsTruncate(&pPager->cpfd, 0); */
2167     pPager->ckptNRec = 0;
2168     pPager->ckptInUse = 0;
2169     sqliteFree( pPager->aInCkpt );
2170     pPager->aInCkpt = 0;
2171     for(pPg=pPager->pCkpt; pPg; pPg=pNext){
2172       pNext = pPg->pNextCkpt;
2173       assert( pPg->inCkpt );
2174       pPg->inCkpt = 0;
2175       pPg->pPrevCkpt = pPg->pNextCkpt = 0;
2176     }
2177     pPager->pCkpt = 0;
2178   }
2179   pPager->ckptAutoopen = 0;
2180   return SQLITE_OK;
2181 }
2182 
2183 /*
2184 ** Rollback a checkpoint.
2185 */
2186 int sqlitepager_ckpt_rollback(Pager *pPager){
2187   int rc;
2188   if( pPager->ckptInUse ){
2189     rc = pager_ckpt_playback(pPager);
2190     sqlitepager_ckpt_commit(pPager);
2191   }else{
2192     rc = SQLITE_OK;
2193   }
2194   pPager->ckptAutoopen = 0;
2195   return rc;
2196 }
2197 
2198 /*
2199 ** Return the full pathname of the database file.
2200 */
2201 const char *sqlitepager_filename(Pager *pPager){
2202   return pPager->zFilename;
2203 }
2204 
2205 /*
2206 ** Set the codec for this pager
2207 */
2208 void sqlitepager_set_codec(
2209   Pager *pPager,
2210   void (*xCodec)(void*,void*,Pgno,int),
2211   void *pCodecArg
2212 ){
2213   pPager->xCodec = xCodec;
2214   pPager->pCodecArg = pCodecArg;
2215 }
2216 
2217 #ifdef SQLITE_TEST
2218 /*
2219 ** Print a listing of all referenced pages and their ref count.
2220 */
2221 void sqlitepager_refdump(Pager *pPager){
2222   PgHdr *pPg;
2223   for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2224     if( pPg->nRef<=0 ) continue;
2225     printf("PAGE %3d addr=0x%08x nRef=%d\n",
2226        pPg->pgno, (int)PGHDR_TO_DATA(pPg), pPg->nRef);
2227   }
2228 }
2229 #endif
2230