1 /*
2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
5
6 #pragma ident "%Z%%M% %I% %E% SMI"
7
8 /*
9 ** 2001 September 15
10 **
11 ** The author disclaims copyright to this source code. In place of
12 ** a legal notice, here is a blessing:
13 **
14 ** May you do good and not evil.
15 ** May you find forgiveness for yourself and forgive others.
16 ** May you share freely, never taking more than you give.
17 **
18 *************************************************************************
19 ** This is the implementation of the page cache subsystem or "pager".
20 **
21 ** The pager is used to access a database disk file. It implements
22 ** atomic commit and rollback through the use of a journal file that
23 ** is separate from the database file. The pager also implements file
24 ** locking to prevent two processes from writing the same database
25 ** file simultaneously, or one process from reading the database while
26 ** another is writing.
27 **
28 ** @(#) $Id: pager.c,v 1.101 2004/02/25 02:20:41 drh Exp $
29 */
30 #include "os.h" /* Must be first to enable large file support */
31 #include "sqliteInt.h"
32 #include "pager.h"
33 #include <assert.h>
34 #include <string.h>
35
36 /*
37 ** Macros for troubleshooting. Normally turned off
38 */
#if 0
/* Troubleshooting build.  Tracing is restricted to a single pager: the
** first pager handed to SET_PAGER() becomes "mainPager" and remains so
** until CLR_PAGER() is invoked on that same pager.
*/
static Pager *mainPager = 0;
#define SET_PAGER(X) if( mainPager==0 ) mainPager = (X)
#define CLR_PAGER(X) if( mainPager==(X) ) mainPager = 0
/* The TRACEn macros print to stderr using format string X, but only when
** the variable named "pPager" at the point of use is the traced pager.
** They expand to a bare "if" statement, so use them only as complete
** statements.
*/
#define TRACE1(X) if( pPager==mainPager ) fprintf(stderr,X)
#define TRACE2(X,Y) if( pPager==mainPager ) fprintf(stderr,X,Y)
#define TRACE3(X,Y,Z) if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
#else
/* Normal build.  Every troubleshooting macro expands to nothing. */
#define SET_PAGER(X)
#define CLR_PAGER(X)
#define TRACE1(X)
#define TRACE2(X,Y)
#define TRACE3(X,Y,Z)
#endif
53
54
55 /*
56 ** The page cache as a whole is always in one of the following
57 ** states:
58 **
59 ** SQLITE_UNLOCK The page cache is not currently reading or
60 ** writing the database file. There is no
61 ** data held in memory. This is the initial
62 ** state.
63 **
64 ** SQLITE_READLOCK The page cache is reading the database.
65 ** Writing is not permitted. There can be
66 ** multiple readers accessing the same database
67 ** file at the same time.
68 **
69 ** SQLITE_WRITELOCK The page cache is writing the database.
70 ** Access is exclusive. No other processes or
71 ** threads can be reading or writing while one
72 ** process is writing.
73 **
74 ** The page cache comes up in SQLITE_UNLOCK. The first time a
75 ** sqlite_page_get() occurs, the state transitions to SQLITE_READLOCK.
76 ** After all pages have been released using sqlite_page_unref(),
77 ** the state transitions back to SQLITE_UNLOCK. The first time
78 ** that sqlite_page_write() is called, the state transitions to
79 ** SQLITE_WRITELOCK. (Note that sqlite_page_write() can only be
80 ** called on an outstanding page which means that the pager must
81 ** be in SQLITE_READLOCK before it transitions to SQLITE_WRITELOCK.)
82 ** The sqlite_page_rollback() and sqlite_page_commit() functions
83 ** transition the state from SQLITE_WRITELOCK back to SQLITE_READLOCK.
84 */
/* The numeric order is significant: the pager compares Pager.state
** against SQLITE_WRITELOCK using "<" and ">=".
*/
#define SQLITE_UNLOCK 0      /* Not reading or writing.  The initial state */
#define SQLITE_READLOCK 1    /* Reading.  Writing is not permitted */
#define SQLITE_WRITELOCK 2   /* Writing.  Access is exclusive */
88
89
90 /*
91 ** Each in-memory image of a page begins with the following header.
92 ** This header is only visible to this pager module. The client
93 ** code that calls pager sees only the data that follows the header.
94 **
95 ** Client code should call sqlitepager_write() on a page prior to making
96 ** any modifications to that page. The first time sqlitepager_write()
97 ** is called, the original page contents are written into the rollback
98 ** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once
99 ** the journal page has made it onto the disk surface, PgHdr.needSync
100 ** is cleared. The modified page cannot be written back into the original
** database file until the journal pages have been synced to disk and the
102 ** PgHdr.needSync has been cleared.
103 **
104 ** The PgHdr.dirty flag is set when sqlitepager_write() is called and
105 ** is cleared again when the page content is written back to the original
106 ** database file.
107 */
108 typedef struct PgHdr PgHdr;
109 struct PgHdr {
110 Pager *pPager; /* The pager to which this page belongs */
111 Pgno pgno; /* The page number for this page */
112 PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */
113 int nRef; /* Number of users of this page */
114 PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */
115 PgHdr *pNextAll, *pPrevAll; /* A list of all pages */
116 PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */
117 u8 inJournal; /* TRUE if has been written to journal */
118 u8 inCkpt; /* TRUE if written to the checkpoint journal */
119 u8 dirty; /* TRUE if we need to write back changes */
120 u8 needSync; /* Sync journal before writing this page */
121 u8 alwaysRollback; /* Disable dont_rollback() for this page */
122 PgHdr *pDirty; /* Dirty pages sorted by PgHdr.pgno */
123 /* SQLITE_PAGE_SIZE bytes of page data follow this header */
124 /* Pager.nExtra bytes of local data follow the page data */
125 };
126
127
128 /*
129 ** A macro used for invoking the codec if there is one
130 */
#ifdef SQLITE_HAS_CODEC
/* CODEC(P,D,N,X):  If pager P has a codec routine installed, call it as
** P->xCodec(P->pCodecArg, D, N, X).  Otherwise do nothing.
**
** The expansion is wrapped in do{...}while(0) so that "CODEC(...);" is
** always exactly one statement, even as the body of an unbraced if/else.
** Every macro argument is parenthesized so that any expression may be
** passed.  P is evaluated more than once.
*/
# define CODEC(P,D,N,X) \
    do{ if( (P)->xCodec ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); } }while(0)
#else
/* No codec support compiled in: the macro expands to nothing. */
# define CODEC(P,D,N,X)
#endif
136
137 /*
138 ** Convert a pointer to a PgHdr into a pointer to its data
139 ** and back again.
140 */
/* The page data begins immediately after the PgHdr, so stepping one whole
** PgHdr forward (or backward) converts between the two pointers.  The
** extra area begins SQLITE_PAGE_SIZE bytes past the start of the data.
*/
#define PGHDR_TO_DATA(P)   ((void*)((P)+1))
#define DATA_TO_PGHDR(D)   (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(P)  ((void*)(((char*)((P)+1))+SQLITE_PAGE_SIZE))
144
145 /*
146 ** How big to make the hash table used for locating in-memory pages
147 ** by page number.
148 */
/* Keep this a power of two: pager_hash() reduces a page number to a
** bucket index by masking it with N_PG_HASH-1.
*/
#define N_PG_HASH 2048

/*
** Hash a page number.  The result is the index into Pager.aHash[] of the
** collision chain on which page PN would be found.
*/
#define pager_hash(PN) ((PN)&(N_PG_HASH-1))
155
156 /*
** An open page cache is an instance of the following structure.
158 */
159 struct Pager {
160 char *zFilename; /* Name of the database file */
161 char *zJournal; /* Name of the journal file */
162 char *zDirectory; /* Directory hold database and journal files */
163 OsFile fd, jfd; /* File descriptors for database and journal */
164 OsFile cpfd; /* File descriptor for the checkpoint journal */
165 int dbSize; /* Number of pages in the file */
166 int origDbSize; /* dbSize before the current change */
167 int ckptSize; /* Size of database (in pages) at ckpt_begin() */
168 off_t ckptJSize; /* Size of journal at ckpt_begin() */
169 int nRec; /* Number of pages written to the journal */
170 u32 cksumInit; /* Quasi-random value added to every checksum */
171 int ckptNRec; /* Number of records in the checkpoint journal */
172 int nExtra; /* Add this many bytes to each in-memory page */
173 void (*xDestructor)(void*); /* Call this routine when freeing pages */
174 int nPage; /* Total number of in-memory pages */
175 int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
176 int mxPage; /* Maximum number of pages to hold in cache */
177 int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
178 void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
179 void *pCodecArg; /* First argument to xCodec() */
180 u8 journalOpen; /* True if journal file descriptors is valid */
181 u8 journalStarted; /* True if header of journal is synced */
182 u8 useJournal; /* Use a rollback journal on this file */
183 u8 ckptOpen; /* True if the checkpoint journal is open */
184 u8 ckptInUse; /* True we are in a checkpoint */
185 u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/
186 u8 noSync; /* Do not sync the journal if true */
187 u8 fullSync; /* Do extra syncs of the journal for robustness */
188 u8 state; /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
189 u8 errMask; /* One of several kinds of errors */
190 u8 tempFile; /* zFilename is a temporary file */
191 u8 readOnly; /* True for a read-only database */
192 u8 needSync; /* True if an fsync() is needed on the journal */
193 u8 dirtyFile; /* True if database file has changed in any way */
194 u8 alwaysRollback; /* Disable dont_rollback() for all pages */
195 u8 *aInJournal; /* One bit for each page in the database file */
196 u8 *aInCkpt; /* One bit for each page in the database */
197 PgHdr *pFirst, *pLast; /* List of free pages */
198 PgHdr *pFirstSynced; /* First free page with PgHdr.needSync==0 */
199 PgHdr *pAll; /* List of all pages */
200 PgHdr *pCkpt; /* List of pages in the checkpoint journal */
201 PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */
202 };
203
204 /*
205 ** These are bits that can be set in Pager.errMask.
206 */
/* One bit per error class; several bits may be set at the same time. */
#define PAGER_ERR_FULL     (1<<0)  /* a write() failed */
#define PAGER_ERR_MEM      (1<<1)  /* malloc() failed */
#define PAGER_ERR_LOCK     (1<<2)  /* error in the locking protocol */
#define PAGER_ERR_CORRUPT  (1<<3)  /* database or journal corruption */
#define PAGER_ERR_DISK     (1<<4)  /* general disk I/O error - bad hard drive? */
212
213 /*
214 ** The journal file contains page records in the following
215 ** format.
216 **
217 ** Actually, this structure is the complete page record for pager
218 ** formats less than 3. Beginning with format 3, this record is surrounded
219 ** by two checksums.
220 */
/* pager_playback_one_page() fills in the two fields with two separate
** reads (the page number is byte-order converted, the data is read raw),
** so this structure serves as an in-memory holder for one journal record.
*/
typedef struct PageRecord PageRecord;
struct PageRecord {
  Pgno pgno;                      /* The page number */
  char aData[SQLITE_PAGE_SIZE];   /* Original data for page pgno */
};
226
227 /*
228 ** Journal files begin with the following magic string. The data
229 ** was obtained from /dev/random. It is used only as a sanity check.
230 **
231 ** There are three journal formats (so far). The 1st journal format writes
232 ** 32-bit integers in the byte-order of the host machine. New
** formats write integers as big-endian. All new journals use the
234 ** new format, but we have to be able to read an older journal in order
235 ** to rollback journals created by older versions of the library.
236 **
237 ** The 3rd journal format (added for 2.8.0) adds additional sanity
238 ** checking information to the journal. If the power fails while the
239 ** journal is being written, semi-random garbage data might appear in
240 ** the journal file after power is restored. If an attempt is then made
241 ** to roll the journal back, the database could be corrupted. The additional
242 ** sanity checking data is an attempt to discover the garbage in the
243 ** journal and ignore it.
244 **
245 ** The sanity checking information for the 3rd journal format consists
246 ** of a 32-bit checksum on each page of data. The checksum covers both
247 ** the page number and the SQLITE_PAGE_SIZE bytes of data for the page.
248 ** This cksum is initialized to a 32-bit random value that appears in the
249 ** journal file right after the header. The random initializer is important,
250 ** because garbage data that appears at the end of a journal is likely
251 ** data that was once in other files that have now been deleted. If the
252 ** garbage data came from an obsolete journal file, the checksums might
253 ** be correct. But by initializing the checksum to random value which
254 ** is different for every journal, we minimize that risk.
255 */
/* The three magic strings are identical except for the final byte,
** which identifies the journal format.
*/
/* Format 1: 32-bit integers in host byte order. */
static const unsigned char aJournalMagic1[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd4,
};
/* Format 2: 32-bit integers are big-endian. */
static const unsigned char aJournalMagic2[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd5,
};
/* Format 3: big-endian integers plus the extra sanity-check fields. */
static const unsigned char aJournalMagic3[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd6,
};
/* Format numbers.  They are ordered, and code compares them with
** relational operators (for example "format>=JOURNAL_FORMAT_3").
*/
#define JOURNAL_FORMAT_1 1
#define JOURNAL_FORMAT_2 2
#define JOURNAL_FORMAT_3 3
268
269 /*
270 ** The following integer determines what format to use when creating
271 ** new primary journal files. By default we always use format 3.
272 ** When testing, we can set this value to older journal formats in order to
273 ** make sure that newer versions of the library are able to rollback older
274 ** journal files.
275 **
276 ** Note that checkpoint journals always use format 2 and omit the header.
277 */
#ifdef SQLITE_TEST
  /* Test builds: a real variable, so that it can be set to an older
  ** journal format at run time. */
  int journal_format = 3;
#else
  /* Normal builds: a compile-time constant, always format 3. */
# define journal_format 3
#endif
283
284 /*
285 ** The size of the header and of each page in the journal varies according
286 ** to which journal format is being used. The following macros figure out
287 ** the sizes based on format numbers.
288 */
/* X is a journal format number.  Formats 3 and later add two u32 fields
** to the header and one u32 checksum to every page record.
*/
#define JOURNAL_HDR_SZ(X) \
   (sizeof(aJournalMagic1) + sizeof(Pgno) + (((X)>=3) ? 2*sizeof(u32) : 0))
#define JOURNAL_PG_SZ(X) \
   (SQLITE_PAGE_SIZE + sizeof(Pgno) + (((X)>=3) ? sizeof(u32) : 0))
293
294 /*
295 ** Enable reference count tracking here:
296 */
#ifdef SQLITE_TEST
  int pager_refinfo_enable = 0;   /* Set non-zero to enable REFCNT output */

  /* Print the page number, data address and reference count of page p
  ** on standard output.  Does nothing unless pager_refinfo_enable is set.
  **
  ** The data address is printed with %p.  Casting the pointer to int and
  ** printing it with %x would discard the upper half of the address on
  ** platforms where pointers are wider than int.  The page number is an
  ** unsigned quantity and is printed as such.
  */
  static void pager_refinfo(PgHdr *p){
    static int cnt = 0;
    if( !pager_refinfo_enable ) return;
    printf(
       "REFCNT: %4u addr=%p nRef=%d\n",
       (unsigned)p->pgno, PGHDR_TO_DATA(p), p->nRef
    );
    cnt++;   /* Something to set a breakpoint on */
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
312
313 /*
314 ** Read a 32-bit integer from the given file descriptor. Store the integer
315 ** that is read in *pRes. Return SQLITE_OK if everything worked, or an
** error code if something goes wrong.
317 **
318 ** If the journal format is 2 or 3, read a big-endian integer. If the
319 ** journal format is 1, read an integer in the native byte-order of the
320 ** host machine.
321 */
static int read32bits(int format, OsFile *fd, u32 *pRes){
  /* Read 4 bytes from fd and store them in *pRes as a 32-bit integer.
  ** For journal formats 2 and 3 the bytes are interpreted as big-endian;
  ** for format 1 they are taken in host byte order.  Returns the result
  ** code of sqliteOsRead().  *pRes is always written.
  */
  u32 res = 0;   /* Zero so that *pRes is well defined if the read fails */
  int rc;
  rc = sqliteOsRead(fd, &res, sizeof(res));
  if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){
    unsigned char ac[4];
    memcpy(ac, &res, 4);
    /* Widen each byte to u32 before shifting.  An unsigned char is
    ** promoted to (signed) int, and shifting a value >= 0x80 left by 24
    ** bits would overflow a 32-bit int.
    */
    res = ((u32)ac[0]<<24) | ((u32)ac[1]<<16) | ((u32)ac[2]<<8) | (u32)ac[3];
  }
  *pRes = res;
  return rc;
}
334
335 /*
336 ** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK
** on success or an error code if something goes wrong.
338 **
339 ** If the journal format is 2 or 3, write the integer as 4 big-endian
340 ** bytes. If the journal format is 1, write the integer in the native
341 ** byte order. In normal operation, only formats 2 and 3 are used.
342 ** Journal format 1 is only used for testing.
343 */
static int write32bits(OsFile *fd, u32 val){
  /* Formats 2 and 3 store the value most-significant byte first.
  ** Format 1 writes the value exactly as it sits in memory.
  */
  if( journal_format>1 ){
    unsigned char aBytes[4];
    int k;
    for(k=0; k<4; k++){
      aBytes[k] = (val >> (8*(3-k))) & 0xff;
    }
    return sqliteOsWrite(fd, aBytes, 4);
  }
  return sqliteOsWrite(fd, &val, 4);
}
355
356 /*
357 ** Write a 32-bit integer into a page header right before the
358 ** page data. This will overwrite the PgHdr.pDirty pointer.
359 **
360 ** The integer is big-endian for formats 2 and 3 and native byte order
361 ** for journal format 1.
362 */
static void store32bits(u32 val, PgHdr *p, int offset){
  /* zDest is "offset" bytes from the start of the page data.  The offset
  ** is a signed byte count relative to the first byte of data.
  */
  unsigned char *zDest = (unsigned char*)PGHDR_TO_DATA(p) + offset;
  if( journal_format>1 ){
    /* Big-endian: most significant byte first */
    zDest[0] = (val>>24) & 0xff;
    zDest[1] = (val>>16) & 0xff;
    zDest[2] = (val>>8) & 0xff;
    zDest[3] = val & 0xff;
  }else{
    /* Format 1: host byte order */
    memcpy(zDest, &val, 4);
  }
}
375
376
377 /*
** Convert the bits in the pPager->errMask into an appropriate
379 ** return code.
380 */
static int pager_errcode(Pager *pPager){
  /* When several error bits are set, the most severe one wins.  The
  ** tests run from highest priority to lowest.
  */
  int mask = pPager->errMask;
  if( mask & PAGER_ERR_CORRUPT ) return SQLITE_CORRUPT;
  if( mask & PAGER_ERR_MEM )     return SQLITE_NOMEM;
  if( mask & PAGER_ERR_FULL )    return SQLITE_FULL;
  if( mask & PAGER_ERR_DISK )    return SQLITE_IOERR;
  if( mask & PAGER_ERR_LOCK )    return SQLITE_PROTOCOL;
  return SQLITE_OK;
}
390
391 /*
392 ** Add or remove a page from the list of all pages that are in the
393 ** checkpoint journal.
394 **
395 ** The Pager keeps a separate list of pages that are currently in
396 ** the checkpoint journal. This helps the sqlitepager_ckpt_commit()
397 ** routine run MUCH faster for the common case where there are many
398 ** pages in memory but only a few are in the checkpoint journal.
399 */
static void page_add_to_ckpt_list(PgHdr *pPg){
  /* Push pPg onto the front of its pager's checkpoint list.  A page that
  ** is already marked as being in the checkpoint journal is left alone.
  */
  Pager *pOwner;
  PgHdr *pHead;
  if( pPg->inCkpt ){
    return;
  }
  assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );
  pOwner = pPg->pPager;
  pHead = pOwner->pCkpt;
  pPg->pPrevCkpt = 0;
  pPg->pNextCkpt = pHead;
  if( pHead ){
    pHead->pPrevCkpt = pPg;
  }
  pOwner->pCkpt = pPg;
  pPg->inCkpt = 1;
}
static void page_remove_from_ckpt_list(PgHdr *pPg){
  /* Unlink pPg from its pager's checkpoint list and clear its links.
  ** A page that is not on the list is left alone.
  */
  PgHdr *pBefore;
  PgHdr *pAfter;
  if( !pPg->inCkpt ){
    return;
  }
  pBefore = pPg->pPrevCkpt;
  pAfter = pPg->pNextCkpt;
  if( pBefore==0 ){
    /* pPg is the head of the list */
    assert( pPg->pPager->pCkpt==pPg );
    pPg->pPager->pCkpt = pAfter;
  }else{
    assert( pBefore->pNextCkpt==pPg );
    pBefore->pNextCkpt = pAfter;
  }
  if( pAfter ){
    assert( pAfter->pPrevCkpt==pPg );
    pAfter->pPrevCkpt = pBefore;
  }
  pPg->pNextCkpt = 0;
  pPg->pPrevCkpt = 0;
  pPg->inCkpt = 0;
}
429
430 /*
431 ** Find a page in the hash table given its page number. Return
432 ** a pointer to the page or NULL if not found.
433 */
pager_lookup(Pager * pPager,Pgno pgno)434 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
435 PgHdr *p = pPager->aHash[pager_hash(pgno)];
436 while( p && p->pgno!=pgno ){
437 p = p->pNextHash;
438 }
439 return p;
440 }
441
442 /*
443 ** Unlock the database and clear the in-memory cache. This routine
444 ** sets the state of the pager back to what it was when it was first
445 ** opened. Any outstanding pages are invalidated and subsequent attempts
446 ** to access those pages will likely result in a coredump.
447 */
static void pager_reset(Pager *pPager){
  PgHdr *pCur = pPager->pAll;

  /* Free every in-memory page.  The successor is fetched before the
  ** current page is released.
  */
  while( pCur ){
    PgHdr *pFollowing = pCur->pNextAll;
    sqliteFree(pCur);
    pCur = pFollowing;
  }

  /* Empty all of the page lists and the hash table. */
  pPager->pAll = 0;
  pPager->pFirst = 0;
  pPager->pLast = 0;
  pPager->pFirstSynced = 0;
  memset(pPager->aHash, 0, sizeof(pPager->aHash));
  pPager->nPage = 0;

  /* Roll back a transaction that is still in progress, then drop the
  ** lock on the database file.
  */
  if( pPager->state>=SQLITE_WRITELOCK ){
    sqlitepager_rollback(pPager);
  }
  sqliteOsUnlock(&pPager->fd);
  pPager->state = SQLITE_UNLOCK;
  pPager->dbSize = -1;
  pPager->nRef = 0;
  assert( pPager->journalOpen==0 );
}
469
470 /*
471 ** When this routine is called, the pager has the journal file open and
472 ** a write lock on the database. This routine releases the database
473 ** write lock and acquires a read lock in its place. The journal file
474 ** is deleted and closed.
475 **
476 ** TODO: Consider keeping the journal file open for temporary databases.
477 ** This might give a performance improvement on windows where opening
478 ** a file is an expensive operation.
479 */
static int pager_unwritelock(Pager *pPager){
  int rc;

  /* Nothing to do unless a write lock is held. */
  if( pPager->state<SQLITE_WRITELOCK ){
    return SQLITE_OK;
  }

  /* Finish any checkpoint and close the checkpoint journal. */
  sqlitepager_ckpt_commit(pPager);
  if( pPager->ckptOpen ){
    sqliteOsClose(&pPager->cpfd);
    pPager->ckptOpen = 0;
  }

  if( !pPager->journalOpen ){
    assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
  }else{
    /* Close and delete the journal, then clear the journal-related
    ** flags on every page in the cache.
    */
    PgHdr *p;
    sqliteOsClose(&pPager->jfd);
    pPager->journalOpen = 0;
    sqliteOsDelete(pPager->zJournal);
    sqliteFree( pPager->aInJournal );
    pPager->aInJournal = 0;
    for(p=pPager->pAll; p; p=p->pNextAll){
      p->inJournal = 0;
      p->dirty = 0;
      p->needSync = 0;
    }
  }

  /* Downgrade to a read lock.  The original comment notes that this can
  ** only fail if a process does a BEGIN, then forks, and the child does
  ** the COMMIT.  In that case the pager is left unlocked.
  */
  rc = sqliteOsReadLock(&pPager->fd);
  pPager->state = (rc==SQLITE_OK) ? SQLITE_READLOCK : SQLITE_UNLOCK;
  return rc;
}
515
516 /*
517 ** Compute and return a checksum for the page of data.
518 **
519 ** This is not a real checksum. It is really just the sum of the
** random initial value and the page number. We considered doing a checksum
521 ** of the database, but that was found to be too slow.
522 */
pager_cksum(Pager * pPager,Pgno pgno,const char * aData)523 static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
524 u32 cksum = pPager->cksumInit + pgno;
525 return cksum;
526 }
527
528 /*
529 ** Read a single page from the journal file opened on file descriptor
530 ** jfd. Playback this one page.
531 **
532 ** There are three different journal formats. The format parameter determines
533 ** which format is used by the journal that is played back.
534 */
static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
  /* Read one page record from jfd at the current file position and write
  ** the page back into the database file, also refreshing the in-memory
  ** copy if the page is in the cache.
  **
  ** Returns SQLITE_OK if the record was played back or skipped,
  ** SQLITE_DONE if the record looks like garbage (page number 0 or a
  ** checksum mismatch) so that playback should stop, or an error code
  ** from the underlying read or write.
  **
  ** On every SQLITE_OK return exactly one complete record has been
  ** consumed from jfd, so that the next call starts on a record boundary.
  */
  int rc;
  PgHdr *pPg;                   /* An existing page in the cache */
  PageRecord pgRec;
  u32 cksum = 0;

  rc = read32bits(format, jfd, &pgRec.pgno);
  if( rc!=SQLITE_OK ) return rc;
  rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
  if( rc!=SQLITE_OK ) return rc;

  /* Sanity checking on the page.  This is more important than I originally
  ** thought.  If a power failure occurs while the journal is being written,
  ** it could cause invalid data to be written into the journal.  We need to
  ** detect this invalid data (with high probability) and ignore it.
  */
  if( pgRec.pgno==0 ){
    return SQLITE_DONE;
  }
  if( format>=JOURNAL_FORMAT_3 ){
    /* Consume the trailing checksum BEFORE deciding whether to skip the
    ** record.  If an out-of-range page were skipped with the checksum
    ** still unread, the next call would begin 4 bytes early and every
    ** later record in the journal would be misparsed.
    */
    rc = read32bits(format, jfd, &cksum);
    if( rc ) return rc;
  }
  if( pgRec.pgno>(unsigned)pPager->dbSize ){
    /* Page lies beyond the end of the database.  Skip it. */
    return SQLITE_OK;
  }
  if( format>=JOURNAL_FORMAT_3
   && pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){
    return SQLITE_DONE;
  }

  /* Playback the page.  Update the in-memory copy of the page
  ** at the same time, if there is one.
  */
  pPg = pager_lookup(pPager, pgRec.pgno);
  TRACE2("PLAYBACK %d\n", pgRec.pgno);
  sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
  rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
  if( pPg ){
    /* No page should ever be rolled back that is in use, except for page
    ** 1 which is held in use in order to keep the lock on the database
    ** active.  However, such a page may be rolled back as a result of an
    ** internal error resulting in an automatic call to
    ** sqlitepager_rollback(), so we can't assert() it.
    */
    /* assert( pPg->nRef==0 || pPg->pgno==1 ) */
    memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
    memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
    pPg->dirty = 0;
    pPg->needSync = 0;
    CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
  }
  return rc;
}
588
589 /*
590 ** Playback the journal and thus restore the database file to
591 ** the state it was in before we started making changes.
592 **
593 ** The journal file format is as follows:
594 **
595 ** * 8 byte prefix. One of the aJournalMagic123 vectors defined
596 ** above. The format of the journal file is determined by which
597 ** of the three prefix vectors is seen.
598 ** * 4 byte big-endian integer which is the number of valid page records
599 ** in the journal. If this value is 0xffffffff, then compute the
600 ** number of page records from the journal size. This field appears
601 ** in format 3 only.
602 ** * 4 byte big-endian integer which is the initial value for the
603 ** sanity checksum. This field appears in format 3 only.
604 ** * 4 byte integer which is the number of pages to truncate the
605 ** database to during a rollback.
606 ** * Zero or more pages instances, each as follows:
607 ** + 4 byte page number.
608 ** + SQLITE_PAGE_SIZE bytes of data.
609 ** + 4 byte checksum (format 3 only)
610 **
611 ** When we speak of the journal header, we mean the first 4 bullets above.
612 ** Each entry in the journal is an instance of the 5th bullet. Note that
613 ** bullets 2 and 3 only appear in format-3 journals.
614 **
615 ** Call the value from the second bullet "nRec". nRec is the number of
616 ** valid page entries in the journal. In most cases, you can compute the
617 ** value of nRec from the size of the journal file. But if a power
618 ** failure occurred while the journal was being written, it could be the
619 ** case that the size of the journal file had already been increased but
620 ** the extra entries had not yet made it safely to disk. In such a case,
621 ** the value of nRec computed from the file size would be too large. For
622 ** that reason, we always use the nRec value in the header.
623 **
624 ** If the nRec value is 0xffffffff it means that nRec should be computed
625 ** from the file size. This value is used when the user selects the
626 ** no-sync option for the journal. A power failure could lead to corruption
627 ** in this case. But for things like temporary table (which will be
628 ** deleted when the power is restored) we don't care.
629 **
630 ** Journal formats 1 and 2 do not have an nRec value in the header so we
631 ** have to compute nRec from the file size. This has risks (as described
632 ** above) which is why all persistent tables have been changed to use
633 ** format 3.
634 **
635 ** If the file opened as the journal file is not a well-formed
636 ** journal file then the database will likely already be
637 ** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
638 ** and SQLITE_CORRUPT is returned. If it all works, then this routine
639 ** returns SQLITE_OK.
640 */
static int pager_playback(Pager *pPager, int useJournalSize){
  /* Roll the database file back using the main journal, then release the
  ** write lock through pager_unwritelock().
  **
  ** pPager          The pager.  Its journal must be open.
  ** useJournalSize  If true, ignore the record count in a format-3 header
  **                 and derive the count from the journal file size.
  **
  ** Returns the result of pager_unwritelock() if playback succeeded.  Any
  ** failure along the way is reported as SQLITE_CORRUPT and also sets
  ** PAGER_ERR_CORRUPT in pPager->errMask.
  */
  off_t szJ;               /* Size of the journal file in bytes */
  int nRec;                /* Number of Records in the journal */
  int i;                   /* Loop counter */
  Pgno mxPg = 0;           /* Size of the original file in pages */
  int format;              /* Format of the journal file. */
  unsigned char aMagic[sizeof(aJournalMagic1)];
  int rc;

  /* Figure out how many records are in the journal.  Abort early if
  ** the journal is empty.
  */
  assert( pPager->journalOpen );
  sqliteOsSeek(&pPager->jfd, 0);
  rc = sqliteOsFileSize(&pPager->jfd, &szJ);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }

  /* If the journal file is too small to contain a complete header,
  ** it must mean that the process that created the journal was just
  ** beginning to write the journal file when it died.  In that case,
  ** the database file should have still been completely unchanged.
  ** Nothing needs to be rolled back.  We can safely ignore this journal.
  ** (rc is SQLITE_OK here, so this is treated as a successful playback.)
  */
  if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
    goto end_playback;
  }

  /* Read the beginning of the journal and truncate the
  ** database file back to its original size.
  **
  ** The specific error codes assigned below are not seen by the caller:
  ** the code at end_playback turns every failure into SQLITE_CORRUPT.
  */
  rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
  if( rc!=SQLITE_OK ){
    rc = SQLITE_PROTOCOL;
    goto end_playback;
  }
  if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_3;
  }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_2;
  }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_1;
  }else{
    rc = SQLITE_PROTOCOL;
    goto end_playback;
  }
  if( format>=JOURNAL_FORMAT_3 ){
    if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
      /* Ignore the journal if it is too small to contain a complete
      ** header.  We already did this test once above, but at the prior
      ** test, we did not know the journal format and so we had to assume
      ** the smallest possible header.  Now we know the header is bigger
      ** than the minimum so we test again.
      */
      goto end_playback;
    }
    rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
    if( rc ) goto end_playback;
    rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
    if( rc ) goto end_playback;
    /* nRec is a signed int, so a header value of 0xffffffff is held as -1.
    ** The comparison is carried out in unsigned arithmetic and matches.
    */
    if( nRec==0xffffffff || useJournalSize ){
      nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3);
    }
  }else{
    /* Formats 1 and 2 carry no record count.  Derive it from the size. */
    nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2);
    assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
  }
  rc = read32bits(format, &pPager->jfd, &mxPg);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }
  assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
  rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }
  pPager->dbSize = mxPg;

  /* Copy original pages out of the journal and back into the database file.
  ** SQLITE_DONE from pager_playback_one_page() means "stop here" and is
  ** not an error.
  */
  for(i=0; i<nRec; i++){
    rc = pager_playback_one_page(pPager, &pPager->jfd, format);
    if( rc!=SQLITE_OK ){
      if( rc==SQLITE_DONE ){
        rc = SQLITE_OK;
      }
      break;
    }
  }

  /* Pages that have been written to the journal but never synced
  ** were not restored by the loop above.  We have to restore those
  ** pages by reading them back from the original database.
  */
  if( rc==SQLITE_OK ){
    PgHdr *pPg;
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      char zBuf[SQLITE_PAGE_SIZE];
      if( !pPg->dirty ) continue;
      if( (int)pPg->pgno <= pPager->origDbSize ){
        sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
        rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
        TRACE2("REFETCH %d\n", pPg->pgno);
        CODEC(pPager, zBuf, pPg->pgno, 2);
        if( rc ) break;
      }else{
        /* The page did not exist in the original file. */
        memset(zBuf, 0, SQLITE_PAGE_SIZE);
      }
      /* Overwrite the cached copy if nobody holds a reference to the
      ** page or if its content differs from what was just fetched.
      */
      if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){
        memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE);
        memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
      }
      pPg->needSync = 0;
      pPg->dirty = 0;
    }
  }

end_playback:
  /* The write lock is released on both paths.  On failure the result of
  ** pager_unwritelock() is discarded in favor of SQLITE_CORRUPT.
  */
  if( rc!=SQLITE_OK ){
    pager_unwritelock(pPager);
    pPager->errMask |= PAGER_ERR_CORRUPT;
    rc = SQLITE_CORRUPT;
  }else{
    rc = pager_unwritelock(pPager);
  }
  return rc;
}
769
770 /*
771 ** Playback the checkpoint journal.
772 **
773 ** This is similar to playing back the transaction journal but with
774 ** a few extra twists.
775 **
776 ** (1) The number of pages in the database file at the start of
777 ** the checkpoint is stored in pPager->ckptSize, not in the
778 ** journal file itself.
779 **
780 ** (2) In addition to playing back the checkpoint journal, also
781 ** playback all pages of the transaction journal beginning
782 ** at offset pPager->ckptJSize.
783 */
static int pager_ckpt_playback(Pager *pPager){
  /* Undo everything done since the checkpoint began:  truncate the
  ** database to its size at ckpt_begin(), replay the checkpoint journal,
  ** then replay the part of the main journal written after ckpt_begin().
  **
  ** Returns SQLITE_OK on success.  Any failure is reported as
  ** SQLITE_CORRUPT and also sets PAGER_ERR_CORRUPT in pPager->errMask.
  */
  off_t szJ;               /* Size of the full journal */
  int nRec;                /* Number of Records */
  int i;                   /* Loop counter */
  int rc;

  assert( pPager->ckptInUse && pPager->journalOpen );

  /* Truncate the database back to its original size.  A failure here is
  ** a real error and must not be silently overwritten by the result of a
  ** later call, so bail out at once, just as pager_playback() does.
  */
  rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
  if( rc!=SQLITE_OK ){
    goto end_ckpt_playback;
  }
  pPager->dbSize = pPager->ckptSize;

  /* Figure out how many records are in the checkpoint journal.
  */
  sqliteOsSeek(&pPager->cpfd, 0);
  nRec = pPager->ckptNRec;

  /* Copy original pages out of the checkpoint journal and back into the
  ** database file.  Note that the checkpoint journal always uses format
  ** 2 instead of format 3 since it does not need to be concerned with
  ** power failures corrupting the journal and can thus omit the checksums.
  */
  for(i=nRec-1; i>=0; i--){
    rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
    assert( rc!=SQLITE_DONE );
    if( rc!=SQLITE_OK ) goto end_ckpt_playback;
  }

  /* Figure out how many pages need to be copied out of the transaction
  ** journal.
  */
  rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
  if( rc!=SQLITE_OK ){
    goto end_ckpt_playback;
  }
  rc = sqliteOsFileSize(&pPager->jfd, &szJ);
  if( rc!=SQLITE_OK ){
    goto end_ckpt_playback;
  }
  nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format);
  for(i=nRec-1; i>=0; i--){
    rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format);
    if( rc!=SQLITE_OK ){
      assert( rc!=SQLITE_DONE );
      goto end_ckpt_playback;
    }
  }

end_ckpt_playback:
  if( rc!=SQLITE_OK ){
    pPager->errMask |= PAGER_ERR_CORRUPT;
    rc = SQLITE_CORRUPT;
  }
  return rc;
}
839
840 /*
841 ** Change the maximum number of in-memory pages that are allowed.
842 **
843 ** The maximum number is the absolute value of the mxPage parameter.
844 ** If mxPage is negative, the noSync flag is also set. noSync bypasses
845 ** calls to sqliteOsSync(). The pager runs much faster with noSync on,
846 ** but if the operating system crashes or there is an abrupt power
847 ** failure, the database file might be left in an inconsistent and
848 ** unrepairable state.
849 */
sqlitepager_set_cachesize(Pager * pPager,int mxPage)850 void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
851 if( mxPage>=0 ){
852 pPager->noSync = pPager->tempFile;
853 if( pPager->noSync==0 ) pPager->needSync = 0;
854 }else{
855 pPager->noSync = 1;
856 mxPage = -mxPage;
857 }
858 if( mxPage>10 ){
859 pPager->mxPage = mxPage;
860 }
861 }
862
863 /*
864 ** Adjust the robustness of the database to damage due to OS crashes
865 ** or power failures by changing the number of syncs()s when writing
866 ** the rollback journal. There are three levels:
867 **
868 ** OFF sqliteOsSync() is never called. This is the default
869 ** for temporary and transient files.
870 **
871 ** NORMAL The journal is synced once before writes begin on the
872 ** database. This is normally adequate protection, but
873 ** it is theoretically possible, though very unlikely,
874 ** that an inopertune power failure could leave the journal
875 ** in a state which would cause damage to the database
876 ** when it is rolled back.
877 **
878 ** FULL The journal is synced twice before writes begin on the
879 ** database (with some additional information - the nRec field
880 ** of the journal header - being written in between the two
881 ** syncs). If we assume that writing a
882 ** single disk sector is atomic, then this mode provides
883 ** assurance that the journal will not be corrupted to the
884 ** point of causing damage to the database during rollback.
885 **
886 ** Numeric values associated with these states are OFF==1, NORMAL=2,
887 ** and FULL=3.
888 */
sqlitepager_set_safety_level(Pager * pPager,int level)889 void sqlitepager_set_safety_level(Pager *pPager, int level){
890 pPager->noSync = level==1 || pPager->tempFile;
891 pPager->fullSync = level==3 && !pPager->tempFile;
892 if( pPager->noSync==0 ) pPager->needSync = 0;
893 }
894
895 /*
896 ** Open a temporary file. Write the name of the file into zName
897 ** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write
898 ** the file descriptor into *fd. Return SQLITE_OK on success or some
899 ** other error code if we fail.
900 **
901 ** The OS will automatically delete the temporary file when it is
902 ** closed.
903 */
sqlitepager_opentemp(char * zFile,OsFile * fd)904 static int sqlitepager_opentemp(char *zFile, OsFile *fd){
905 int cnt = 8;
906 int rc;
907 do{
908 cnt--;
909 sqliteOsTempFileName(zFile);
910 rc = sqliteOsOpenExclusive(zFile, fd, 1);
911 }while( cnt>0 && rc!=SQLITE_OK );
912 return rc;
913 }
914
915 /*
916 ** Create a new page cache and put a pointer to the page cache in *ppPager.
917 ** The file to be cached need not exist. The file is not locked until
918 ** the first call to sqlitepager_get() and is only held open until the
919 ** last page is released using sqlitepager_unref().
920 **
921 ** If zFilename is NULL then a randomly-named temporary file is created
922 ** and used as the file to be cached. The file will be deleted
923 ** automatically when it is closed.
924 */
sqlitepager_open(Pager ** ppPager,const char * zFilename,int mxPage,int nExtra,int useJournal)925 int sqlitepager_open(
926 Pager **ppPager, /* Return the Pager structure here */
927 const char *zFilename, /* Name of the database file to open */
928 int mxPage, /* Max number of in-memory cache pages */
929 int nExtra, /* Extra bytes append to each in-memory page */
930 int useJournal /* TRUE to use a rollback journal on this file */
931 ){
932 Pager *pPager;
933 char *zFullPathname;
934 int nameLen;
935 OsFile fd;
936 int rc, i;
937 int tempFile;
938 int readOnly = 0;
939 char zTemp[SQLITE_TEMPNAME_SIZE];
940
941 *ppPager = 0;
942 if( sqlite_malloc_failed ){
943 return SQLITE_NOMEM;
944 }
945 if( zFilename && zFilename[0] ){
946 zFullPathname = sqliteOsFullPathname(zFilename);
947 rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly);
948 tempFile = 0;
949 }else{
950 rc = sqlitepager_opentemp(zTemp, &fd);
951 zFilename = zTemp;
952 zFullPathname = sqliteOsFullPathname(zFilename);
953 tempFile = 1;
954 }
955 if( sqlite_malloc_failed ){
956 return SQLITE_NOMEM;
957 }
958 if( rc!=SQLITE_OK ){
959 sqliteFree(zFullPathname);
960 return SQLITE_CANTOPEN;
961 }
962 nameLen = strlen(zFullPathname);
963 pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 );
964 if( pPager==0 ){
965 sqliteOsClose(&fd);
966 sqliteFree(zFullPathname);
967 return SQLITE_NOMEM;
968 }
969 SET_PAGER(pPager);
970 pPager->zFilename = (char*)&pPager[1];
971 pPager->zDirectory = &pPager->zFilename[nameLen+1];
972 pPager->zJournal = &pPager->zDirectory[nameLen+1];
973 strcpy(pPager->zFilename, zFullPathname);
974 strcpy(pPager->zDirectory, zFullPathname);
975 for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){}
976 if( i>0 ) pPager->zDirectory[i-1] = 0;
977 strcpy(pPager->zJournal, zFullPathname);
978 sqliteFree(zFullPathname);
979 strcpy(&pPager->zJournal[nameLen], "-journal");
980 pPager->fd = fd;
981 pPager->journalOpen = 0;
982 pPager->useJournal = useJournal;
983 pPager->ckptOpen = 0;
984 pPager->ckptInUse = 0;
985 pPager->nRef = 0;
986 pPager->dbSize = -1;
987 pPager->ckptSize = 0;
988 pPager->ckptJSize = 0;
989 pPager->nPage = 0;
990 pPager->mxPage = mxPage>5 ? mxPage : 10;
991 pPager->state = SQLITE_UNLOCK;
992 pPager->errMask = 0;
993 pPager->tempFile = tempFile;
994 pPager->readOnly = readOnly;
995 pPager->needSync = 0;
996 pPager->noSync = pPager->tempFile || !useJournal;
997 pPager->pFirst = 0;
998 pPager->pFirstSynced = 0;
999 pPager->pLast = 0;
1000 pPager->nExtra = nExtra;
1001 memset(pPager->aHash, 0, sizeof(pPager->aHash));
1002 *ppPager = pPager;
1003 return SQLITE_OK;
1004 }
1005
1006 /*
1007 ** Set the destructor for this pager. If not NULL, the destructor is called
1008 ** when the reference count on each page reaches zero. The destructor can
1009 ** be used to clean up information in the extra segment appended to each page.
1010 **
1011 ** The destructor is not called as a result sqlitepager_close().
1012 ** Destructors are only called by sqlitepager_unref().
1013 */
sqlitepager_set_destructor(Pager * pPager,void (* xDesc)(void *))1014 void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){
1015 pPager->xDestructor = xDesc;
1016 }
1017
1018 /*
1019 ** Return the total number of pages in the disk file associated with
1020 ** pPager.
1021 */
sqlitepager_pagecount(Pager * pPager)1022 int sqlitepager_pagecount(Pager *pPager){
1023 off_t n;
1024 assert( pPager!=0 );
1025 if( pPager->dbSize>=0 ){
1026 return pPager->dbSize;
1027 }
1028 if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
1029 pPager->errMask |= PAGER_ERR_DISK;
1030 return 0;
1031 }
1032 n /= SQLITE_PAGE_SIZE;
1033 if( pPager->state!=SQLITE_UNLOCK ){
1034 pPager->dbSize = n;
1035 }
1036 return n;
1037 }
1038
/*
** Forward declaration.  syncJournal() is defined further down in this
** file but is already needed by sqlitepager_truncate().
*/
1042 static int syncJournal(Pager*);
1043
1044 /*
1045 ** Truncate the file to the number of pages specified.
1046 */
sqlitepager_truncate(Pager * pPager,Pgno nPage)1047 int sqlitepager_truncate(Pager *pPager, Pgno nPage){
1048 int rc;
1049 if( pPager->dbSize<0 ){
1050 sqlitepager_pagecount(pPager);
1051 }
1052 if( pPager->errMask!=0 ){
1053 rc = pager_errcode(pPager);
1054 return rc;
1055 }
1056 if( nPage>=(unsigned)pPager->dbSize ){
1057 return SQLITE_OK;
1058 }
1059 syncJournal(pPager);
1060 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage);
1061 if( rc==SQLITE_OK ){
1062 pPager->dbSize = nPage;
1063 }
1064 return rc;
1065 }
1066
1067 /*
1068 ** Shutdown the page cache. Free all memory and close all files.
1069 **
1070 ** If a transaction was in progress when this routine is called, that
1071 ** transaction is rolled back. All outstanding pages are invalidated
1072 ** and their memory is freed. Any attempt to use a page associated
1073 ** with this page cache after this function returns will likely
1074 ** result in a coredump.
1075 */
sqlitepager_close(Pager * pPager)1076 int sqlitepager_close(Pager *pPager){
1077 PgHdr *pPg, *pNext;
1078 switch( pPager->state ){
1079 case SQLITE_WRITELOCK: {
1080 sqlitepager_rollback(pPager);
1081 sqliteOsUnlock(&pPager->fd);
1082 assert( pPager->journalOpen==0 );
1083 break;
1084 }
1085 case SQLITE_READLOCK: {
1086 sqliteOsUnlock(&pPager->fd);
1087 break;
1088 }
1089 default: {
1090 /* Do nothing */
1091 break;
1092 }
1093 }
1094 for(pPg=pPager->pAll; pPg; pPg=pNext){
1095 pNext = pPg->pNextAll;
1096 sqliteFree(pPg);
1097 }
1098 sqliteOsClose(&pPager->fd);
1099 assert( pPager->journalOpen==0 );
1100 /* Temp files are automatically deleted by the OS
1101 ** if( pPager->tempFile ){
1102 ** sqliteOsDelete(pPager->zFilename);
1103 ** }
1104 */
1105 CLR_PAGER(pPager);
1106 if( pPager->zFilename!=(char*)&pPager[1] ){
1107 assert( 0 ); /* Cannot happen */
1108 sqliteFree(pPager->zFilename);
1109 sqliteFree(pPager->zJournal);
1110 sqliteFree(pPager->zDirectory);
1111 }
1112 sqliteFree(pPager);
1113 return SQLITE_OK;
1114 }
1115
1116 /*
1117 ** Return the page number for the given page data.
1118 */
sqlitepager_pagenumber(void * pData)1119 Pgno sqlitepager_pagenumber(void *pData){
1120 PgHdr *p = DATA_TO_PGHDR(pData);
1121 return p->pgno;
1122 }
1123
1124 /*
1125 ** Increment the reference count for a page. If the page is
1126 ** currently on the freelist (the reference count is zero) then
1127 ** remove it from the freelist.
1128 */
1129 #define page_ref(P) ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
_page_ref(PgHdr * pPg)1130 static void _page_ref(PgHdr *pPg){
1131 if( pPg->nRef==0 ){
1132 /* The page is currently on the freelist. Remove it. */
1133 if( pPg==pPg->pPager->pFirstSynced ){
1134 PgHdr *p = pPg->pNextFree;
1135 while( p && p->needSync ){ p = p->pNextFree; }
1136 pPg->pPager->pFirstSynced = p;
1137 }
1138 if( pPg->pPrevFree ){
1139 pPg->pPrevFree->pNextFree = pPg->pNextFree;
1140 }else{
1141 pPg->pPager->pFirst = pPg->pNextFree;
1142 }
1143 if( pPg->pNextFree ){
1144 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1145 }else{
1146 pPg->pPager->pLast = pPg->pPrevFree;
1147 }
1148 pPg->pPager->nRef++;
1149 }
1150 pPg->nRef++;
1151 REFINFO(pPg);
1152 }
1153
1154 /*
1155 ** Increment the reference count for a page. The input pointer is
1156 ** a reference to the page data.
1157 */
sqlitepager_ref(void * pData)1158 int sqlitepager_ref(void *pData){
1159 PgHdr *pPg = DATA_TO_PGHDR(pData);
1160 page_ref(pPg);
1161 return SQLITE_OK;
1162 }
1163
1164 /*
1165 ** Sync the journal. In other words, make sure all the pages that have
1166 ** been written to the journal have actually reached the surface of the
1167 ** disk. It is not safe to modify the original database file until after
1168 ** the journal has been synced. If the original database is modified before
1169 ** the journal is synced and a power failure occurs, the unsynced journal
1170 ** data would be lost and we would be unable to completely rollback the
1171 ** database changes. Database corruption would occur.
1172 **
1173 ** This routine also updates the nRec field in the header of the journal.
1174 ** (See comments on the pager_playback() routine for additional information.)
1175 ** If the sync mode is FULL, two syncs will occur. First the whole journal
1176 ** is synced, then the nRec field is updated, then a second sync occurs.
1177 **
1178 ** For temporary databases, we do not care if we are able to rollback
1179 ** after a power failure, so sync occurs.
1180 **
1181 ** This routine clears the needSync field of every page current held in
1182 ** memory.
1183 */
syncJournal(Pager * pPager)1184 static int syncJournal(Pager *pPager){
1185 PgHdr *pPg;
1186 int rc = SQLITE_OK;
1187
1188 /* Sync the journal before modifying the main database
1189 ** (assuming there is a journal and it needs to be synced.)
1190 */
1191 if( pPager->needSync ){
1192 if( !pPager->tempFile ){
1193 assert( pPager->journalOpen );
1194 /* assert( !pPager->noSync ); // noSync might be set if synchronous
1195 ** was turned off after the transaction was started. Ticket #615 */
1196 #ifndef NDEBUG
1197 {
1198 /* Make sure the pPager->nRec counter we are keeping agrees
1199 ** with the nRec computed from the size of the journal file.
1200 */
1201 off_t hdrSz, pgSz, jSz;
1202 hdrSz = JOURNAL_HDR_SZ(journal_format);
1203 pgSz = JOURNAL_PG_SZ(journal_format);
1204 rc = sqliteOsFileSize(&pPager->jfd, &jSz);
1205 if( rc!=0 ) return rc;
1206 assert( pPager->nRec*pgSz+hdrSz==jSz );
1207 }
1208 #endif
1209 if( journal_format>=3 ){
1210 /* Write the nRec value into the journal file header */
1211 off_t szJ;
1212 if( pPager->fullSync ){
1213 TRACE1("SYNC\n");
1214 rc = sqliteOsSync(&pPager->jfd);
1215 if( rc!=0 ) return rc;
1216 }
1217 sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1));
1218 rc = write32bits(&pPager->jfd, pPager->nRec);
1219 if( rc ) return rc;
1220 szJ = JOURNAL_HDR_SZ(journal_format) +
1221 pPager->nRec*JOURNAL_PG_SZ(journal_format);
1222 sqliteOsSeek(&pPager->jfd, szJ);
1223 }
1224 TRACE1("SYNC\n");
1225 rc = sqliteOsSync(&pPager->jfd);
1226 if( rc!=0 ) return rc;
1227 pPager->journalStarted = 1;
1228 }
1229 pPager->needSync = 0;
1230
1231 /* Erase the needSync flag from every page.
1232 */
1233 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1234 pPg->needSync = 0;
1235 }
1236 pPager->pFirstSynced = pPager->pFirst;
1237 }
1238
1239 #ifndef NDEBUG
1240 /* If the Pager.needSync flag is clear then the PgHdr.needSync
1241 ** flag must also be clear for all pages. Verify that this
1242 ** invariant is true.
1243 */
1244 else{
1245 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
1246 assert( pPg->needSync==0 );
1247 }
1248 assert( pPager->pFirstSynced==pPager->pFirst );
1249 }
1250 #endif
1251
1252 return rc;
1253 }
1254
1255 /*
1256 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
1257 ** every one of those pages out to the database file and mark them all
1258 ** as clean.
1259 */
pager_write_pagelist(PgHdr * pList)1260 static int pager_write_pagelist(PgHdr *pList){
1261 Pager *pPager;
1262 int rc;
1263
1264 if( pList==0 ) return SQLITE_OK;
1265 pPager = pList->pPager;
1266 while( pList ){
1267 assert( pList->dirty );
1268 sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1269 CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
1270 TRACE2("STORE %d\n", pList->pgno);
1271 rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
1272 CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
1273 if( rc ) return rc;
1274 pList->dirty = 0;
1275 pList = pList->pDirty;
1276 }
1277 return SQLITE_OK;
1278 }
1279
1280 /*
1281 ** Collect every dirty page into a dirty list and
1282 ** return a pointer to the head of that list. All pages are
1283 ** collected even if they are still in use.
1284 */
pager_get_all_dirty_pages(Pager * pPager)1285 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
1286 PgHdr *p, *pList;
1287 pList = 0;
1288 for(p=pPager->pAll; p; p=p->pNextAll){
1289 if( p->dirty ){
1290 p->pDirty = pList;
1291 pList = p;
1292 }
1293 }
1294 return pList;
1295 }
1296
1297 /*
1298 ** Acquire a page.
1299 **
1300 ** A read lock on the disk file is obtained when the first page is acquired.
1301 ** This read lock is dropped when the last page is released.
1302 **
1303 ** A _get works for any page number greater than 0. If the database
1304 ** file is smaller than the requested page, then no actual disk
1305 ** read occurs and the memory image of the page is initialized to
1306 ** all zeros. The extra data appended to a page is always initialized
1307 ** to zeros the first time a page is loaded into memory.
1308 **
1309 ** The acquisition might fail for several reasons. In all cases,
1310 ** an appropriate error code is returned and *ppPage is set to NULL.
1311 **
1312 ** See also sqlitepager_lookup(). Both this routine and _lookup() attempt
1313 ** to find a page in the in-memory cache first. If the page is not already
1314 ** in memory, this routine goes to disk to read it in whereas _lookup()
1315 ** just returns 0. This routine acquires a read-lock the first time it
1316 ** has to go to disk, and could also playback an old journal if necessary.
1317 ** Since _lookup() never goes to disk, it never has to deal with locks
1318 ** or journal files.
1319 */
sqlitepager_get(Pager * pPager,Pgno pgno,void ** ppPage)1320 int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
1321 PgHdr *pPg;
1322 int rc;
1323
1324 /* Make sure we have not hit any critical errors.
1325 */
1326 assert( pPager!=0 );
1327 assert( pgno!=0 );
1328 *ppPage = 0;
1329 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1330 return pager_errcode(pPager);
1331 }
1332
1333 /* If this is the first page accessed, then get a read lock
1334 ** on the database file.
1335 */
1336 if( pPager->nRef==0 ){
1337 rc = sqliteOsReadLock(&pPager->fd);
1338 if( rc!=SQLITE_OK ){
1339 return rc;
1340 }
1341 pPager->state = SQLITE_READLOCK;
1342
1343 /* If a journal file exists, try to play it back.
1344 */
1345 if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){
1346 int rc;
1347
1348 /* Get a write lock on the database
1349 */
1350 rc = sqliteOsWriteLock(&pPager->fd);
1351 if( rc!=SQLITE_OK ){
1352 if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){
1353 /* This should never happen! */
1354 rc = SQLITE_INTERNAL;
1355 }
1356 return rc;
1357 }
1358 pPager->state = SQLITE_WRITELOCK;
1359
1360 /* Open the journal for reading only. Return SQLITE_BUSY if
1361 ** we are unable to open the journal file.
1362 **
1363 ** The journal file does not need to be locked itself. The
1364 ** journal file is never open unless the main database file holds
1365 ** a write lock, so there is never any chance of two or more
1366 ** processes opening the journal at the same time.
1367 */
1368 rc = sqliteOsOpenReadOnly(pPager->zJournal, &pPager->jfd);
1369 if( rc!=SQLITE_OK ){
1370 rc = sqliteOsUnlock(&pPager->fd);
1371 assert( rc==SQLITE_OK );
1372 return SQLITE_BUSY;
1373 }
1374 pPager->journalOpen = 1;
1375 pPager->journalStarted = 0;
1376
1377 /* Playback and delete the journal. Drop the database write
1378 ** lock and reacquire the read lock.
1379 */
1380 rc = pager_playback(pPager, 0);
1381 if( rc!=SQLITE_OK ){
1382 return rc;
1383 }
1384 }
1385 pPg = 0;
1386 }else{
1387 /* Search for page in cache */
1388 pPg = pager_lookup(pPager, pgno);
1389 }
1390 if( pPg==0 ){
1391 /* The requested page is not in the page cache. */
1392 int h;
1393 pPager->nMiss++;
1394 if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){
1395 /* Create a new page */
1396 pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE
1397 + sizeof(u32) + pPager->nExtra );
1398 if( pPg==0 ){
1399 pager_unwritelock(pPager);
1400 pPager->errMask |= PAGER_ERR_MEM;
1401 return SQLITE_NOMEM;
1402 }
1403 memset(pPg, 0, sizeof(*pPg));
1404 pPg->pPager = pPager;
1405 pPg->pNextAll = pPager->pAll;
1406 if( pPager->pAll ){
1407 pPager->pAll->pPrevAll = pPg;
1408 }
1409 pPg->pPrevAll = 0;
1410 pPager->pAll = pPg;
1411 pPager->nPage++;
1412 }else{
1413 /* Find a page to recycle. Try to locate a page that does not
1414 ** require us to do an fsync() on the journal.
1415 */
1416 pPg = pPager->pFirstSynced;
1417
1418 /* If we could not find a page that does not require an fsync()
1419 ** on the journal file then fsync the journal file. This is a
1420 ** very slow operation, so we work hard to avoid it. But sometimes
1421 ** it can't be helped.
1422 */
1423 if( pPg==0 ){
1424 int rc = syncJournal(pPager);
1425 if( rc!=0 ){
1426 sqlitepager_rollback(pPager);
1427 return SQLITE_IOERR;
1428 }
1429 pPg = pPager->pFirst;
1430 }
1431 assert( pPg->nRef==0 );
1432
1433 /* Write the page to the database file if it is dirty.
1434 */
1435 if( pPg->dirty ){
1436 assert( pPg->needSync==0 );
1437 pPg->pDirty = 0;
1438 rc = pager_write_pagelist( pPg );
1439 if( rc!=SQLITE_OK ){
1440 sqlitepager_rollback(pPager);
1441 return SQLITE_IOERR;
1442 }
1443 }
1444 assert( pPg->dirty==0 );
1445
1446 /* If the page we are recycling is marked as alwaysRollback, then
1447 ** set the global alwaysRollback flag, thus disabling the
1448 ** sqlite_dont_rollback() optimization for the rest of this transaction.
1449 ** It is necessary to do this because the page marked alwaysRollback
1450 ** might be reloaded at a later time but at that point we won't remember
1451 ** that is was marked alwaysRollback. This means that all pages must
1452 ** be marked as alwaysRollback from here on out.
1453 */
1454 if( pPg->alwaysRollback ){
1455 pPager->alwaysRollback = 1;
1456 }
1457
1458 /* Unlink the old page from the free list and the hash table
1459 */
1460 if( pPg==pPager->pFirstSynced ){
1461 PgHdr *p = pPg->pNextFree;
1462 while( p && p->needSync ){ p = p->pNextFree; }
1463 pPager->pFirstSynced = p;
1464 }
1465 if( pPg->pPrevFree ){
1466 pPg->pPrevFree->pNextFree = pPg->pNextFree;
1467 }else{
1468 assert( pPager->pFirst==pPg );
1469 pPager->pFirst = pPg->pNextFree;
1470 }
1471 if( pPg->pNextFree ){
1472 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1473 }else{
1474 assert( pPager->pLast==pPg );
1475 pPager->pLast = pPg->pPrevFree;
1476 }
1477 pPg->pNextFree = pPg->pPrevFree = 0;
1478 if( pPg->pNextHash ){
1479 pPg->pNextHash->pPrevHash = pPg->pPrevHash;
1480 }
1481 if( pPg->pPrevHash ){
1482 pPg->pPrevHash->pNextHash = pPg->pNextHash;
1483 }else{
1484 h = pager_hash(pPg->pgno);
1485 assert( pPager->aHash[h]==pPg );
1486 pPager->aHash[h] = pPg->pNextHash;
1487 }
1488 pPg->pNextHash = pPg->pPrevHash = 0;
1489 pPager->nOvfl++;
1490 }
1491 pPg->pgno = pgno;
1492 if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
1493 sqliteCheckMemory(pPager->aInJournal, pgno/8);
1494 assert( pPager->journalOpen );
1495 pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
1496 pPg->needSync = 0;
1497 }else{
1498 pPg->inJournal = 0;
1499 pPg->needSync = 0;
1500 }
1501 if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
1502 && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
1503 page_add_to_ckpt_list(pPg);
1504 }else{
1505 page_remove_from_ckpt_list(pPg);
1506 }
1507 pPg->dirty = 0;
1508 pPg->nRef = 1;
1509 REFINFO(pPg);
1510 pPager->nRef++;
1511 h = pager_hash(pgno);
1512 pPg->pNextHash = pPager->aHash[h];
1513 pPager->aHash[h] = pPg;
1514 if( pPg->pNextHash ){
1515 assert( pPg->pNextHash->pPrevHash==0 );
1516 pPg->pNextHash->pPrevHash = pPg;
1517 }
1518 if( pPager->nExtra>0 ){
1519 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
1520 }
1521 if( pPager->dbSize<0 ) sqlitepager_pagecount(pPager);
1522 if( pPager->errMask!=0 ){
1523 sqlitepager_unref(PGHDR_TO_DATA(pPg));
1524 rc = pager_errcode(pPager);
1525 return rc;
1526 }
1527 if( pPager->dbSize<(int)pgno ){
1528 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1529 }else{
1530 int rc;
1531 sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1532 rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
1533 TRACE2("FETCH %d\n", pPg->pgno);
1534 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
1535 if( rc!=SQLITE_OK ){
1536 off_t fileSize;
1537 if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
1538 || fileSize>=pgno*SQLITE_PAGE_SIZE ){
1539 sqlitepager_unref(PGHDR_TO_DATA(pPg));
1540 return rc;
1541 }else{
1542 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1543 }
1544 }
1545 }
1546 }else{
1547 /* The requested page is in the page cache. */
1548 pPager->nHit++;
1549 page_ref(pPg);
1550 }
1551 *ppPage = PGHDR_TO_DATA(pPg);
1552 return SQLITE_OK;
1553 }
1554
1555 /*
1556 ** Acquire a page if it is already in the in-memory cache. Do
1557 ** not read the page from disk. Return a pointer to the page,
1558 ** or 0 if the page is not in cache.
1559 **
1560 ** See also sqlitepager_get(). The difference between this routine
1561 ** and sqlitepager_get() is that _get() will go to the disk and read
1562 ** in the page if the page is not already in cache. This routine
1563 ** returns NULL if the page is not in cache or if a disk I/O error
1564 ** has ever happened.
1565 */
sqlitepager_lookup(Pager * pPager,Pgno pgno)1566 void *sqlitepager_lookup(Pager *pPager, Pgno pgno){
1567 PgHdr *pPg;
1568
1569 assert( pPager!=0 );
1570 assert( pgno!=0 );
1571 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1572 return 0;
1573 }
1574 /* if( pPager->nRef==0 ){
1575 ** return 0;
1576 ** }
1577 */
1578 pPg = pager_lookup(pPager, pgno);
1579 if( pPg==0 ) return 0;
1580 page_ref(pPg);
1581 return PGHDR_TO_DATA(pPg);
1582 }
1583
1584 /*
1585 ** Release a page.
1586 **
1587 ** If the number of references to the page drop to zero, then the
1588 ** page is added to the LRU list. When all references to all pages
1589 ** are released, a rollback occurs and the lock on the database is
1590 ** removed.
1591 */
sqlitepager_unref(void * pData)1592 int sqlitepager_unref(void *pData){
1593 PgHdr *pPg;
1594
1595 /* Decrement the reference count for this page
1596 */
1597 pPg = DATA_TO_PGHDR(pData);
1598 assert( pPg->nRef>0 );
1599 pPg->nRef--;
1600 REFINFO(pPg);
1601
1602 /* When the number of references to a page reach 0, call the
1603 ** destructor and add the page to the freelist.
1604 */
1605 if( pPg->nRef==0 ){
1606 Pager *pPager;
1607 pPager = pPg->pPager;
1608 pPg->pNextFree = 0;
1609 pPg->pPrevFree = pPager->pLast;
1610 pPager->pLast = pPg;
1611 if( pPg->pPrevFree ){
1612 pPg->pPrevFree->pNextFree = pPg;
1613 }else{
1614 pPager->pFirst = pPg;
1615 }
1616 if( pPg->needSync==0 && pPager->pFirstSynced==0 ){
1617 pPager->pFirstSynced = pPg;
1618 }
1619 if( pPager->xDestructor ){
1620 pPager->xDestructor(pData);
1621 }
1622
1623 /* When all pages reach the freelist, drop the read lock from
1624 ** the database file.
1625 */
1626 pPager->nRef--;
1627 assert( pPager->nRef>=0 );
1628 if( pPager->nRef==0 ){
1629 pager_reset(pPager);
1630 }
1631 }
1632 return SQLITE_OK;
1633 }
1634
1635 /*
1636 ** Create a journal file for pPager. There should already be a write
1637 ** lock on the database file when this routine is called.
1638 **
1639 ** Return SQLITE_OK if everything. Return an error code and release the
1640 ** write lock if anything goes wrong.
1641 */
pager_open_journal(Pager * pPager)1642 static int pager_open_journal(Pager *pPager){
1643 int rc;
1644 assert( pPager->state==SQLITE_WRITELOCK );
1645 assert( pPager->journalOpen==0 );
1646 assert( pPager->useJournal );
1647 sqlitepager_pagecount(pPager);
1648 pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
1649 if( pPager->aInJournal==0 ){
1650 sqliteOsReadLock(&pPager->fd);
1651 pPager->state = SQLITE_READLOCK;
1652 return SQLITE_NOMEM;
1653 }
1654 rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
1655 if( rc!=SQLITE_OK ){
1656 sqliteFree(pPager->aInJournal);
1657 pPager->aInJournal = 0;
1658 sqliteOsReadLock(&pPager->fd);
1659 pPager->state = SQLITE_READLOCK;
1660 return SQLITE_CANTOPEN;
1661 }
1662 sqliteOsOpenDirectory(pPager->zDirectory, &pPager->jfd);
1663 pPager->journalOpen = 1;
1664 pPager->journalStarted = 0;
1665 pPager->needSync = 0;
1666 pPager->alwaysRollback = 0;
1667 pPager->nRec = 0;
1668 if( pPager->errMask!=0 ){
1669 rc = pager_errcode(pPager);
1670 return rc;
1671 }
1672 pPager->origDbSize = pPager->dbSize;
1673 if( journal_format==JOURNAL_FORMAT_3 ){
1674 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3));
1675 if( rc==SQLITE_OK ){
1676 rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
1677 }
1678 if( rc==SQLITE_OK ){
1679 sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
1680 rc = write32bits(&pPager->jfd, pPager->cksumInit);
1681 }
1682 }else if( journal_format==JOURNAL_FORMAT_2 ){
1683 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2));
1684 }else{
1685 assert( journal_format==JOURNAL_FORMAT_1 );
1686 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1));
1687 }
1688 if( rc==SQLITE_OK ){
1689 rc = write32bits(&pPager->jfd, pPager->dbSize);
1690 }
1691 if( pPager->ckptAutoopen && rc==SQLITE_OK ){
1692 rc = sqlitepager_ckpt_begin(pPager);
1693 }
1694 if( rc!=SQLITE_OK ){
1695 rc = pager_unwritelock(pPager);
1696 if( rc==SQLITE_OK ){
1697 rc = SQLITE_FULL;
1698 }
1699 }
1700 return rc;
1701 }
1702
1703 /*
1704 ** Acquire a write-lock on the database. The lock is removed when
1705 ** the any of the following happen:
1706 **
1707 ** * sqlitepager_commit() is called.
1708 ** * sqlitepager_rollback() is called.
1709 ** * sqlitepager_close() is called.
1710 ** * sqlitepager_unref() is called to on every outstanding page.
1711 **
1712 ** The parameter to this routine is a pointer to any open page of the
1713 ** database file. Nothing changes about the page - it is used merely
1714 ** to acquire a pointer to the Pager structure and as proof that there
1715 ** is already a read-lock on the database.
1716 **
1717 ** A journal file is opened if this is not a temporary file. For
1718 ** temporary files, the opening of the journal file is deferred until
1719 ** there is an actual need to write to the journal.
1720 **
1721 ** If the database is already write-locked, this routine is a no-op.
1722 */
sqlitepager_begin(void * pData)1723 int sqlitepager_begin(void *pData){
1724 PgHdr *pPg = DATA_TO_PGHDR(pData);
1725 Pager *pPager = pPg->pPager;
1726 int rc = SQLITE_OK;
1727 assert( pPg->nRef>0 );
1728 assert( pPager->state!=SQLITE_UNLOCK );
1729 if( pPager->state==SQLITE_READLOCK ){
1730 assert( pPager->aInJournal==0 );
1731 rc = sqliteOsWriteLock(&pPager->fd);
1732 if( rc!=SQLITE_OK ){
1733 return rc;
1734 }
1735 pPager->state = SQLITE_WRITELOCK;
1736 pPager->dirtyFile = 0;
1737 TRACE1("TRANSACTION\n");
1738 if( pPager->useJournal && !pPager->tempFile ){
1739 rc = pager_open_journal(pPager);
1740 }
1741 }
1742 return rc;
1743 }
1744
1745 /*
1746 ** Mark a data page as writeable. The page is written into the journal
1747 ** if it is not there already. This routine must be called before making
1748 ** changes to a page.
1749 **
1750 ** The first time this routine is called, the pager creates a new
1751 ** journal and acquires a write lock on the database. If the write
1752 ** lock could not be acquired, this routine returns SQLITE_BUSY. The
1753 ** calling routine must check for that return value and be careful not to
1754 ** change any page data until this routine returns SQLITE_OK.
1755 **
1756 ** If the journal file could not be written because the disk is full,
1757 ** then this routine returns SQLITE_FULL and does an immediate rollback.
1758 ** All subsequent write attempts also return SQLITE_FULL until there
1759 ** is a call to sqlitepager_commit() or sqlitepager_rollback() to
1760 ** reset.
1761 */
sqlitepager_write(void * pData)1762 int sqlitepager_write(void *pData){
1763 PgHdr *pPg = DATA_TO_PGHDR(pData);
1764 Pager *pPager = pPg->pPager;
1765 int rc = SQLITE_OK;
1766
1767 /* Check for errors
1768 */
1769 if( pPager->errMask ){
1770 return pager_errcode(pPager);
1771 }
1772 if( pPager->readOnly ){
1773 return SQLITE_PERM;
1774 }
1775
1776 /* Mark the page as dirty. If the page has already been written
1777 ** to the journal then we can return right away.
1778 */
1779 pPg->dirty = 1;
1780 if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
1781 pPager->dirtyFile = 1;
1782 return SQLITE_OK;
1783 }
1784
1785 /* If we get this far, it means that the page needs to be
1786 ** written to the transaction journal or the ckeckpoint journal
1787 ** or both.
1788 **
1789 ** First check to see that the transaction journal exists and
1790 ** create it if it does not.
1791 */
1792 assert( pPager->state!=SQLITE_UNLOCK );
1793 rc = sqlitepager_begin(pData);
1794 if( rc!=SQLITE_OK ){
1795 return rc;
1796 }
1797 assert( pPager->state==SQLITE_WRITELOCK );
1798 if( !pPager->journalOpen && pPager->useJournal ){
1799 rc = pager_open_journal(pPager);
1800 if( rc!=SQLITE_OK ) return rc;
1801 }
1802 assert( pPager->journalOpen || !pPager->useJournal );
1803 pPager->dirtyFile = 1;
1804
1805 /* The transaction journal now exists and we have a write lock on the
1806 ** main database file. Write the current page to the transaction
1807 ** journal if it is not there already.
1808 */
1809 if( !pPg->inJournal && pPager->useJournal ){
1810 if( (int)pPg->pgno <= pPager->origDbSize ){
1811 int szPg;
1812 u32 saved;
1813 if( journal_format>=JOURNAL_FORMAT_3 ){
1814 u32 cksum = pager_cksum(pPager, pPg->pgno, pData);
1815 saved = *(u32*)PGHDR_TO_EXTRA(pPg);
1816 store32bits(cksum, pPg, SQLITE_PAGE_SIZE);
1817 szPg = SQLITE_PAGE_SIZE+8;
1818 }else{
1819 szPg = SQLITE_PAGE_SIZE+4;
1820 }
1821 store32bits(pPg->pgno, pPg, -4);
1822 CODEC(pPager, pData, pPg->pgno, 7);
1823 rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
1824 TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
1825 CODEC(pPager, pData, pPg->pgno, 0);
1826 if( journal_format>=JOURNAL_FORMAT_3 ){
1827 *(u32*)PGHDR_TO_EXTRA(pPg) = saved;
1828 }
1829 if( rc!=SQLITE_OK ){
1830 sqlitepager_rollback(pPager);
1831 pPager->errMask |= PAGER_ERR_FULL;
1832 return rc;
1833 }
1834 pPager->nRec++;
1835 assert( pPager->aInJournal!=0 );
1836 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1837 pPg->needSync = !pPager->noSync;
1838 pPg->inJournal = 1;
1839 if( pPager->ckptInUse ){
1840 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1841 page_add_to_ckpt_list(pPg);
1842 }
1843 }else{
1844 pPg->needSync = !pPager->journalStarted && !pPager->noSync;
1845 TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
1846 }
1847 if( pPg->needSync ){
1848 pPager->needSync = 1;
1849 }
1850 }
1851
1852 /* If the checkpoint journal is open and the page is not in it,
1853 ** then write the current page to the checkpoint journal. Note that
1854 ** the checkpoint journal always uses the simplier format 2 that lacks
1855 ** checksums. The header is also omitted from the checkpoint journal.
1856 */
1857 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
1858 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1859 store32bits(pPg->pgno, pPg, -4);
1860 CODEC(pPager, pData, pPg->pgno, 7);
1861 rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
1862 TRACE2("CKPT-JOURNAL %d\n", pPg->pgno);
1863 CODEC(pPager, pData, pPg->pgno, 0);
1864 if( rc!=SQLITE_OK ){
1865 sqlitepager_rollback(pPager);
1866 pPager->errMask |= PAGER_ERR_FULL;
1867 return rc;
1868 }
1869 pPager->ckptNRec++;
1870 assert( pPager->aInCkpt!=0 );
1871 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1872 page_add_to_ckpt_list(pPg);
1873 }
1874
1875 /* Update the database size and return.
1876 */
1877 if( pPager->dbSize<(int)pPg->pgno ){
1878 pPager->dbSize = pPg->pgno;
1879 }
1880 return rc;
1881 }
1882
1883 /*
1884 ** Return TRUE if the page given in the argument was previously passed
1885 ** to sqlitepager_write(). In other words, return TRUE if it is ok
1886 ** to change the content of the page.
1887 */
sqlitepager_iswriteable(void * pData)1888 int sqlitepager_iswriteable(void *pData){
1889 PgHdr *pPg = DATA_TO_PGHDR(pData);
1890 return pPg->dirty;
1891 }
1892
1893 /*
1894 ** Replace the content of a single page with the information in the third
1895 ** argument.
1896 */
sqlitepager_overwrite(Pager * pPager,Pgno pgno,void * pData)1897 int sqlitepager_overwrite(Pager *pPager, Pgno pgno, void *pData){
1898 void *pPage;
1899 int rc;
1900
1901 rc = sqlitepager_get(pPager, pgno, &pPage);
1902 if( rc==SQLITE_OK ){
1903 rc = sqlitepager_write(pPage);
1904 if( rc==SQLITE_OK ){
1905 memcpy(pPage, pData, SQLITE_PAGE_SIZE);
1906 }
1907 sqlitepager_unref(pPage);
1908 }
1909 return rc;
1910 }
1911
1912 /*
1913 ** A call to this routine tells the pager that it is not necessary to
1914 ** write the information on page "pgno" back to the disk, even though
1915 ** that page might be marked as dirty.
1916 **
1917 ** The overlying software layer calls this routine when all of the data
1918 ** on the given page is unused. The pager marks the page as clean so
1919 ** that it does not get written to disk.
1920 **
1921 ** Tests show that this optimization, together with the
1922 ** sqlitepager_dont_rollback() below, more than double the speed
1923 ** of large INSERT operations and quadruple the speed of large DELETEs.
1924 **
1925 ** When this routine is called, set the alwaysRollback flag to true.
1926 ** Subsequent calls to sqlitepager_dont_rollback() for the same page
1927 ** will thereafter be ignored. This is necessary to avoid a problem
1928 ** where a page with data is added to the freelist during one part of
1929 ** a transaction then removed from the freelist during a later part
1930 ** of the same transaction and reused for some other purpose. When it
1931 ** is first added to the freelist, this routine is called. When reused,
1932 ** the dont_rollback() routine is called. But because the page contains
1933 ** critical data, we still need to be sure it gets rolled back in spite
1934 ** of the dont_rollback() call.
1935 */
sqlitepager_dont_write(Pager * pPager,Pgno pgno)1936 void sqlitepager_dont_write(Pager *pPager, Pgno pgno){
1937 PgHdr *pPg;
1938
1939 pPg = pager_lookup(pPager, pgno);
1940 pPg->alwaysRollback = 1;
1941 if( pPg && pPg->dirty ){
1942 if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
1943 /* If this pages is the last page in the file and the file has grown
1944 ** during the current transaction, then do NOT mark the page as clean.
1945 ** When the database file grows, we must make sure that the last page
1946 ** gets written at least once so that the disk file will be the correct
1947 ** size. If you do not write this page and the size of the file
1948 ** on the disk ends up being too small, that can lead to database
1949 ** corruption during the next transaction.
1950 */
1951 }else{
1952 TRACE2("DONT_WRITE %d\n", pgno);
1953 pPg->dirty = 0;
1954 }
1955 }
1956 }
1957
1958 /*
1959 ** A call to this routine tells the pager that if a rollback occurs,
1960 ** it is not necessary to restore the data on the given page. This
1961 ** means that the pager does not have to record the given page in the
1962 ** rollback journal.
1963 */
sqlitepager_dont_rollback(void * pData)1964 void sqlitepager_dont_rollback(void *pData){
1965 PgHdr *pPg = DATA_TO_PGHDR(pData);
1966 Pager *pPager = pPg->pPager;
1967
1968 if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return;
1969 if( pPg->alwaysRollback || pPager->alwaysRollback ) return;
1970 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
1971 assert( pPager->aInJournal!=0 );
1972 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1973 pPg->inJournal = 1;
1974 if( pPager->ckptInUse ){
1975 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1976 page_add_to_ckpt_list(pPg);
1977 }
1978 TRACE2("DONT_ROLLBACK %d\n", pPg->pgno);
1979 }
1980 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
1981 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1982 assert( pPager->aInCkpt!=0 );
1983 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1984 page_add_to_ckpt_list(pPg);
1985 }
1986 }
1987
1988 /*
1989 ** Commit all changes to the database and release the write lock.
1990 **
1991 ** If the commit fails for any reason, a rollback attempt is made
1992 ** and an error code is returned. If the commit worked, SQLITE_OK
1993 ** is returned.
1994 */
sqlitepager_commit(Pager * pPager)1995 int sqlitepager_commit(Pager *pPager){
1996 int rc;
1997 PgHdr *pPg;
1998
1999 if( pPager->errMask==PAGER_ERR_FULL ){
2000 rc = sqlitepager_rollback(pPager);
2001 if( rc==SQLITE_OK ){
2002 rc = SQLITE_FULL;
2003 }
2004 return rc;
2005 }
2006 if( pPager->errMask!=0 ){
2007 rc = pager_errcode(pPager);
2008 return rc;
2009 }
2010 if( pPager->state!=SQLITE_WRITELOCK ){
2011 return SQLITE_ERROR;
2012 }
2013 TRACE1("COMMIT\n");
2014 if( pPager->dirtyFile==0 ){
2015 /* Exit early (without doing the time-consuming sqliteOsSync() calls)
2016 ** if there have been no changes to the database file. */
2017 assert( pPager->needSync==0 );
2018 rc = pager_unwritelock(pPager);
2019 pPager->dbSize = -1;
2020 return rc;
2021 }
2022 assert( pPager->journalOpen );
2023 rc = syncJournal(pPager);
2024 if( rc!=SQLITE_OK ){
2025 goto commit_abort;
2026 }
2027 pPg = pager_get_all_dirty_pages(pPager);
2028 if( pPg ){
2029 rc = pager_write_pagelist(pPg);
2030 if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){
2031 goto commit_abort;
2032 }
2033 }
2034 rc = pager_unwritelock(pPager);
2035 pPager->dbSize = -1;
2036 return rc;
2037
2038 /* Jump here if anything goes wrong during the commit process.
2039 */
2040 commit_abort:
2041 rc = sqlitepager_rollback(pPager);
2042 if( rc==SQLITE_OK ){
2043 rc = SQLITE_FULL;
2044 }
2045 return rc;
2046 }
2047
2048 /*
2049 ** Rollback all changes. The database falls back to read-only mode.
2050 ** All in-memory cache pages revert to their original data contents.
2051 ** The journal is deleted.
2052 **
2053 ** This routine cannot fail unless some other process is not following
2054 ** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
2055 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
2056 ** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error
2057 ** codes are returned for all these occasions. Otherwise,
2058 ** SQLITE_OK is returned.
2059 */
sqlitepager_rollback(Pager * pPager)2060 int sqlitepager_rollback(Pager *pPager){
2061 int rc;
2062 TRACE1("ROLLBACK\n");
2063 if( !pPager->dirtyFile || !pPager->journalOpen ){
2064 rc = pager_unwritelock(pPager);
2065 pPager->dbSize = -1;
2066 return rc;
2067 }
2068
2069 if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
2070 if( pPager->state>=SQLITE_WRITELOCK ){
2071 pager_playback(pPager, 1);
2072 }
2073 return pager_errcode(pPager);
2074 }
2075 if( pPager->state!=SQLITE_WRITELOCK ){
2076 return SQLITE_OK;
2077 }
2078 rc = pager_playback(pPager, 1);
2079 if( rc!=SQLITE_OK ){
2080 rc = SQLITE_CORRUPT;
2081 pPager->errMask |= PAGER_ERR_CORRUPT;
2082 }
2083 pPager->dbSize = -1;
2084 return rc;
2085 }
2086
2087 /*
2088 ** Return TRUE if the database file is opened read-only. Return FALSE
2089 ** if the database is (in theory) writable.
2090 */
sqlitepager_isreadonly(Pager * pPager)2091 int sqlitepager_isreadonly(Pager *pPager){
2092 return pPager->readOnly;
2093 }
2094
2095 /*
2096 ** This routine is used for testing and analysis only.
2097 */
sqlitepager_stats(Pager * pPager)2098 int *sqlitepager_stats(Pager *pPager){
2099 static int a[9];
2100 a[0] = pPager->nRef;
2101 a[1] = pPager->nPage;
2102 a[2] = pPager->mxPage;
2103 a[3] = pPager->dbSize;
2104 a[4] = pPager->state;
2105 a[5] = pPager->errMask;
2106 a[6] = pPager->nHit;
2107 a[7] = pPager->nMiss;
2108 a[8] = pPager->nOvfl;
2109 return a;
2110 }
2111
2112 /*
2113 ** Set the checkpoint.
2114 **
2115 ** This routine should be called with the transaction journal already
2116 ** open. A new checkpoint journal is created that can be used to rollback
2117 ** changes of a single SQL command within a larger transaction.
2118 */
sqlitepager_ckpt_begin(Pager * pPager)2119 int sqlitepager_ckpt_begin(Pager *pPager){
2120 int rc;
2121 char zTemp[SQLITE_TEMPNAME_SIZE];
2122 if( !pPager->journalOpen ){
2123 pPager->ckptAutoopen = 1;
2124 return SQLITE_OK;
2125 }
2126 assert( pPager->journalOpen );
2127 assert( !pPager->ckptInUse );
2128 pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 );
2129 if( pPager->aInCkpt==0 ){
2130 sqliteOsReadLock(&pPager->fd);
2131 return SQLITE_NOMEM;
2132 }
2133 #ifndef NDEBUG
2134 rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize);
2135 if( rc ) goto ckpt_begin_failed;
2136 assert( pPager->ckptJSize ==
2137 pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) );
2138 #endif
2139 pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format)
2140 + JOURNAL_HDR_SZ(journal_format);
2141 pPager->ckptSize = pPager->dbSize;
2142 if( !pPager->ckptOpen ){
2143 rc = sqlitepager_opentemp(zTemp, &pPager->cpfd);
2144 if( rc ) goto ckpt_begin_failed;
2145 pPager->ckptOpen = 1;
2146 pPager->ckptNRec = 0;
2147 }
2148 pPager->ckptInUse = 1;
2149 return SQLITE_OK;
2150
2151 ckpt_begin_failed:
2152 if( pPager->aInCkpt ){
2153 sqliteFree(pPager->aInCkpt);
2154 pPager->aInCkpt = 0;
2155 }
2156 return rc;
2157 }
2158
2159 /*
2160 ** Commit a checkpoint.
2161 */
sqlitepager_ckpt_commit(Pager * pPager)2162 int sqlitepager_ckpt_commit(Pager *pPager){
2163 if( pPager->ckptInUse ){
2164 PgHdr *pPg, *pNext;
2165 sqliteOsSeek(&pPager->cpfd, 0);
2166 /* sqliteOsTruncate(&pPager->cpfd, 0); */
2167 pPager->ckptNRec = 0;
2168 pPager->ckptInUse = 0;
2169 sqliteFree( pPager->aInCkpt );
2170 pPager->aInCkpt = 0;
2171 for(pPg=pPager->pCkpt; pPg; pPg=pNext){
2172 pNext = pPg->pNextCkpt;
2173 assert( pPg->inCkpt );
2174 pPg->inCkpt = 0;
2175 pPg->pPrevCkpt = pPg->pNextCkpt = 0;
2176 }
2177 pPager->pCkpt = 0;
2178 }
2179 pPager->ckptAutoopen = 0;
2180 return SQLITE_OK;
2181 }
2182
2183 /*
2184 ** Rollback a checkpoint.
2185 */
sqlitepager_ckpt_rollback(Pager * pPager)2186 int sqlitepager_ckpt_rollback(Pager *pPager){
2187 int rc;
2188 if( pPager->ckptInUse ){
2189 rc = pager_ckpt_playback(pPager);
2190 sqlitepager_ckpt_commit(pPager);
2191 }else{
2192 rc = SQLITE_OK;
2193 }
2194 pPager->ckptAutoopen = 0;
2195 return rc;
2196 }
2197
2198 /*
2199 ** Return the full pathname of the database file.
2200 */
sqlitepager_filename(Pager * pPager)2201 const char *sqlitepager_filename(Pager *pPager){
2202 return pPager->zFilename;
2203 }
2204
2205 /*
2206 ** Set the codec for this pager
2207 */
sqlitepager_set_codec(Pager * pPager,void (* xCodec)(void *,void *,Pgno,int),void * pCodecArg)2208 void sqlitepager_set_codec(
2209 Pager *pPager,
2210 void (*xCodec)(void*,void*,Pgno,int),
2211 void *pCodecArg
2212 ){
2213 pPager->xCodec = xCodec;
2214 pPager->pCodecArg = pCodecArg;
2215 }
2216
2217 #ifdef SQLITE_TEST
2218 /*
2219 ** Print a listing of all referenced pages and their ref count.
2220 */
sqlitepager_refdump(Pager * pPager)2221 void sqlitepager_refdump(Pager *pPager){
2222 PgHdr *pPg;
2223 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2224 if( pPg->nRef<=0 ) continue;
2225 printf("PAGE %3d addr=0x%08x nRef=%d\n",
2226 pPg->pgno, (int)PGHDR_TO_DATA(pPg), pPg->nRef);
2227 }
2228 }
2229 #endif
2230