1 /*
2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
5
6 /*
7 ** 2001 September 15
8 **
9 ** The author disclaims copyright to this source code. In place of
10 ** a legal notice, here is a blessing:
11 **
12 ** May you do good and not evil.
13 ** May you find forgiveness for yourself and forgive others.
14 ** May you share freely, never taking more than you give.
15 **
16 *************************************************************************
17 ** This is the implementation of the page cache subsystem or "pager".
18 **
19 ** The pager is used to access a database disk file. It implements
20 ** atomic commit and rollback through the use of a journal file that
21 ** is separate from the database file. The pager also implements file
22 ** locking to prevent two processes from writing the same database
23 ** file simultaneously, or one process from reading the database while
24 ** another is writing.
25 **
26 ** @(#) $Id: pager.c,v 1.101 2004/02/25 02:20:41 drh Exp $
27 */
28 #include "os.h" /* Must be first to enable large file support */
29 #include "sqliteInt.h"
30 #include "pager.h"
31 #include <assert.h>
32 #include <string.h>
33
/*
** Macros for troubleshooting.  Normally turned off.
**
** When the "#if 0" below is changed to "#if 1", mainPager remembers the
** first pager handed to SET_PAGER() while mainPager is still 0, and
** CLR_PAGER() forgets it again.  The TRACEn() macros then print to stderr,
** but only when the variable named pPager in the calling scope is that
** remembered pager.  In the normal build all five macros expand to nothing.
**
** NOTE(review): the enabled forms expand to a bare "if" statement, so
** using one of them as the body of an if/else would capture the "else".
*/
#if 0
static Pager *mainPager = 0;
#define SET_PAGER(X)  if( mainPager==0 ) mainPager = (X)
#define CLR_PAGER(X)  if( mainPager==(X) ) mainPager = 0
#define TRACE1(X)     if( pPager==mainPager ) fprintf(stderr,X)
#define TRACE2(X,Y)   if( pPager==mainPager ) fprintf(stderr,X,Y)
#define TRACE3(X,Y,Z) if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
#else
#define SET_PAGER(X)
#define CLR_PAGER(X)
#define TRACE1(X)
#define TRACE2(X,Y)
#define TRACE3(X,Y,Z)
#endif
51
52
53 /*
54 ** The page cache as a whole is always in one of the following
55 ** states:
56 **
57 ** SQLITE_UNLOCK The page cache is not currently reading or
58 ** writing the database file. There is no
59 ** data held in memory. This is the initial
60 ** state.
61 **
62 ** SQLITE_READLOCK The page cache is reading the database.
63 ** Writing is not permitted. There can be
64 ** multiple readers accessing the same database
65 ** file at the same time.
66 **
67 ** SQLITE_WRITELOCK The page cache is writing the database.
68 ** Access is exclusive. No other processes or
69 ** threads can be reading or writing while one
70 ** process is writing.
71 **
** The page cache comes up in SQLITE_UNLOCK. The first time a
** sqlitepager_get() occurs, the state transitions to SQLITE_READLOCK.
** After all pages have been released using sqlitepager_unref(),
** the state transitions back to SQLITE_UNLOCK. The first time
** that sqlitepager_write() is called, the state transitions to
** SQLITE_WRITELOCK. (Note that sqlitepager_write() can only be
** called on an outstanding page which means that the pager must
** be in SQLITE_READLOCK before it transitions to SQLITE_WRITELOCK.)
** The sqlitepager_rollback() and sqlitepager_commit() functions
** transition the state from SQLITE_WRITELOCK back to SQLITE_READLOCK.
82 */
#define SQLITE_UNLOCK      0   /* Not reading or writing; the initial state */
#define SQLITE_READLOCK    1   /* Reading the database; writes not permitted */
#define SQLITE_WRITELOCK   2   /* Writing the database; access is exclusive */
86
87
88 /*
89 ** Each in-memory image of a page begins with the following header.
90 ** This header is only visible to this pager module. The client
91 ** code that calls pager sees only the data that follows the header.
92 **
93 ** Client code should call sqlitepager_write() on a page prior to making
94 ** any modifications to that page. The first time sqlitepager_write()
95 ** is called, the original page contents are written into the rollback
96 ** journal and PgHdr.inJournal and PgHdr.needSync are set. Later, once
97 ** the journal page has made it onto the disk surface, PgHdr.needSync
98 ** is cleared. The modified page cannot be written back into the original
99 ** database file until the journal pages has been synced to disk and the
100 ** PgHdr.needSync has been cleared.
101 **
102 ** The PgHdr.dirty flag is set when sqlitepager_write() is called and
103 ** is cleared again when the page content is written back to the original
104 ** database file.
105 */
106 typedef struct PgHdr PgHdr;
107 struct PgHdr {
108 Pager *pPager; /* The pager to which this page belongs */
109 Pgno pgno; /* The page number for this page */
110 PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */
111 int nRef; /* Number of users of this page */
112 PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */
113 PgHdr *pNextAll, *pPrevAll; /* A list of all pages */
114 PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */
115 u8 inJournal; /* TRUE if has been written to journal */
116 u8 inCkpt; /* TRUE if written to the checkpoint journal */
117 u8 dirty; /* TRUE if we need to write back changes */
118 u8 needSync; /* Sync journal before writing this page */
119 u8 alwaysRollback; /* Disable dont_rollback() for this page */
120 PgHdr *pDirty; /* Dirty pages sorted by PgHdr.pgno */
121 /* SQLITE_PAGE_SIZE bytes of page data follow this header */
122 /* Pager.nExtra bytes of local data follow the page data */
123 };
124
125
/*
** A macro used for invoking the codec if there is one.
**
** P is the Pager, D points at the page data, N is the page number and X
** is an integer that is handed to the codec unchanged.  When the library
** is built without SQLITE_HAS_CODEC the macro expands to nothing.
**
** The expansion is wrapped in do{...}while(0) and every argument is
** parenthesized so that "CODEC(...);" is exactly one statement, even
** when it is used as the body of an if/else, and so that argument
** expressions cannot regroup with the surrounding operators.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC(P,D,N,X) \
    do{ \
      if( (P)->xCodec ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); } \
    }while(0)
#else
# define CODEC(P,D,N,X)
#endif
134
/*
** Translate between a page header and the memory that trails it:
**
**   PGHDR_TO_DATA(P)    The page data, which begins right after header P.
**   DATA_TO_PGHDR(D)    The header that sits right before page data D.
**   PGHDR_TO_EXTRA(P)   The area that begins SQLITE_PAGE_SIZE bytes past
**                       the start of the page data of header P.
*/
#define PGHDR_TO_DATA(P)   ((void*)((P)+1))
#define DATA_TO_PGHDR(D)   (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(P)  ((void*)((char*)((P)+1) + SQLITE_PAGE_SIZE))
142
/*
** How big to make the hash table used for locating in-memory pages
** by page number.
**
** pager_hash() below reduces a page number with the bit-mask
** N_PG_HASH-1, so this value needs to be a power of two for every
** slot of the table to be reachable.
*/
#define N_PG_HASH 2048

/*
** Hash a page number.  For an unsigned page number the result lies in
** the range 0..N_PG_HASH-1; pager_lookup() uses it as an index into
** Pager.aHash[].
*/
#define pager_hash(PN) ((PN)&(N_PG_HASH-1))
153
154 /*
155 ** A open page cache is an instance of the following structure.
156 */
157 struct Pager {
158 char *zFilename; /* Name of the database file */
159 char *zJournal; /* Name of the journal file */
160 char *zDirectory; /* Directory hold database and journal files */
161 OsFile fd, jfd; /* File descriptors for database and journal */
162 OsFile cpfd; /* File descriptor for the checkpoint journal */
163 int dbSize; /* Number of pages in the file */
164 int origDbSize; /* dbSize before the current change */
165 int ckptSize; /* Size of database (in pages) at ckpt_begin() */
166 off_t ckptJSize; /* Size of journal at ckpt_begin() */
167 int nRec; /* Number of pages written to the journal */
168 u32 cksumInit; /* Quasi-random value added to every checksum */
169 int ckptNRec; /* Number of records in the checkpoint journal */
170 int nExtra; /* Add this many bytes to each in-memory page */
171 void (*xDestructor)(void*); /* Call this routine when freeing pages */
172 int nPage; /* Total number of in-memory pages */
173 int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */
174 int mxPage; /* Maximum number of pages to hold in cache */
175 int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */
176 void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
177 void *pCodecArg; /* First argument to xCodec() */
178 u8 journalOpen; /* True if journal file descriptors is valid */
179 u8 journalStarted; /* True if header of journal is synced */
180 u8 useJournal; /* Use a rollback journal on this file */
181 u8 ckptOpen; /* True if the checkpoint journal is open */
182 u8 ckptInUse; /* True we are in a checkpoint */
183 u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/
184 u8 noSync; /* Do not sync the journal if true */
185 u8 fullSync; /* Do extra syncs of the journal for robustness */
186 u8 state; /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
187 u8 errMask; /* One of several kinds of errors */
188 u8 tempFile; /* zFilename is a temporary file */
189 u8 readOnly; /* True for a read-only database */
190 u8 needSync; /* True if an fsync() is needed on the journal */
191 u8 dirtyFile; /* True if database file has changed in any way */
192 u8 alwaysRollback; /* Disable dont_rollback() for all pages */
193 u8 *aInJournal; /* One bit for each page in the database file */
194 u8 *aInCkpt; /* One bit for each page in the database */
195 PgHdr *pFirst, *pLast; /* List of free pages */
196 PgHdr *pFirstSynced; /* First free page with PgHdr.needSync==0 */
197 PgHdr *pAll; /* List of all pages */
198 PgHdr *pCkpt; /* List of pages in the checkpoint journal */
199 PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */
200 };
201
/*
** These are bits that can be set in Pager.errMask.  Several bits may be
** set at once; pager_errcode() turns the mask into a single SQLite
** result code.
*/
#define PAGER_ERR_FULL     0x01  /* a write() failed */
#define PAGER_ERR_MEM      0x02  /* malloc() failed */
#define PAGER_ERR_LOCK     0x04  /* error in the locking protocol */
#define PAGER_ERR_CORRUPT  0x08  /* database or journal corruption */
#define PAGER_ERR_DISK     0x10  /* general disk I/O error - bad hard drive? */
210
211 /*
212 ** The journal file contains page records in the following
213 ** format.
214 **
215 ** Actually, this structure is the complete page record for pager
216 ** formats less than 3. Beginning with format 3, this record is surrounded
217 ** by two checksums.
218 */
219 typedef struct PageRecord PageRecord;
220 struct PageRecord {
221 Pgno pgno; /* The page number */
222 char aData[SQLITE_PAGE_SIZE]; /* Original data for page pgno */
223 };
224
225 /*
226 ** Journal files begin with the following magic string. The data
227 ** was obtained from /dev/random. It is used only as a sanity check.
228 **
229 ** There are three journal formats (so far). The 1st journal format writes
** 32-bit integers in the byte-order of the host machine. Newer
** formats write integers as big-endian. All new journals use the
232 ** new format, but we have to be able to read an older journal in order
233 ** to rollback journals created by older versions of the library.
234 **
235 ** The 3rd journal format (added for 2.8.0) adds additional sanity
236 ** checking information to the journal. If the power fails while the
237 ** journal is being written, semi-random garbage data might appear in
238 ** the journal file after power is restored. If an attempt is then made
239 ** to roll the journal back, the database could be corrupted. The additional
240 ** sanity checking data is an attempt to discover the garbage in the
241 ** journal and ignore it.
242 **
243 ** The sanity checking information for the 3rd journal format consists
244 ** of a 32-bit checksum on each page of data. The checksum covers both
245 ** the page number and the SQLITE_PAGE_SIZE bytes of data for the page.
246 ** This cksum is initialized to a 32-bit random value that appears in the
247 ** journal file right after the header. The random initializer is important,
248 ** because garbage data that appears at the end of a journal is likely
249 ** data that was once in other files that have now been deleted. If the
250 ** garbage data came from an obsolete journal file, the checksums might
251 ** be correct. But by initializing the checksum to random value which
252 ** is different for every journal, we minimize that risk.
253 */
/* Magic prefix of a format-1 journal (integers in native byte order). */
static const unsigned char aJournalMagic1[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd4,
};
/* Magic prefix of a format-2 journal.  Differs only in the last byte. */
static const unsigned char aJournalMagic2[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd5,
};
/* Magic prefix of a format-3 journal.  Differs only in the last byte. */
static const unsigned char aJournalMagic3[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd6,
};
/* Format numbers.  pager_playback() chooses one of these according to
** which of the magic prefixes above begins the journal. */
#define JOURNAL_FORMAT_1 1
#define JOURNAL_FORMAT_2 2
#define JOURNAL_FORMAT_3 3
266
/*
** The following integer determines what format to use when creating
** new primary journal files.  By default we always use format 3.
** When testing, we can set this value to older journal formats in order to
** make sure that newer versions of the library are able to rollback older
** journal files.
**
** Note that checkpoint journals always use format 2 and omit the header.
**
** In SQLITE_TEST builds this is a real, assignable global variable.  In
** all other builds it is a macro for the constant 3, so comparisons
** against it can be resolved at compile time.
*/
#ifdef SQLITE_TEST
  int journal_format = 3;
#else
# define journal_format 3
#endif
281
/*
** The size of the header and of each page in the journal varies according
** to which journal format is being used.  The following macros figure out
** the sizes based on format numbers.
**
** JOURNAL_HDR_SZ(X): the magic prefix plus one Pgno, plus two more u32
** values when X is format 3 or later.
**
** JOURNAL_PG_SZ(X): SQLITE_PAGE_SIZE bytes of data plus one Pgno, plus one
** more u32 when X is format 3 or later.
**
** The comparison (X)>=3 evaluates to 0 or 1 and is used as a multiplier
** to switch the format-3 additions on or off.
*/
#define JOURNAL_HDR_SZ(X) \
   (sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3)*2*sizeof(u32))
#define JOURNAL_PG_SZ(X) \
   (SQLITE_PAGE_SIZE + sizeof(Pgno) + ((X)>=3)*sizeof(u32))
291
/*
** Enable reference count tracking here:
**
** In SQLITE_TEST builds, setting pager_refinfo_enable to a non-zero value
** makes REFINFO(p) print the page number, the address of the page data
** and the reference count of page p on standard output.  In all other
** builds REFINFO() expands to nothing.
*/
#ifdef SQLITE_TEST
  int pager_refinfo_enable = 0;
  static void pager_refinfo(PgHdr *p){
    static int cnt = 0;
    if( !pager_refinfo_enable ) return;
    /* The address is printed with %p instead of being cast to int, which
    ** would discard the upper bits wherever a pointer is wider than an
    ** int.  The page number is an unsigned quantity, hence %u.
    */
    printf(
       "REFCNT: %4u addr=%p nRef=%d\n",
       (unsigned)p->pgno, PGHDR_TO_DATA(p), p->nRef
    );
    cnt++;   /* Something to set a breakpoint on */
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
310
311 /*
312 ** Read a 32-bit integer from the given file descriptor. Store the integer
313 ** that is read in *pRes. Return SQLITE_OK if everything worked, or an
314 ** error code is something goes wrong.
315 **
316 ** If the journal format is 2 or 3, read a big-endian integer. If the
317 ** journal format is 1, read an integer in the native byte-order of the
318 ** host machine.
319 */
read32bits(int format,OsFile * fd,u32 * pRes)320 static int read32bits(int format, OsFile *fd, u32 *pRes){
321 u32 res;
322 int rc;
323 rc = sqliteOsRead(fd, &res, sizeof(res));
324 if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){
325 unsigned char ac[4];
326 memcpy(ac, &res, 4);
327 res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3];
328 }
329 *pRes = res;
330 return rc;
331 }
332
333 /*
334 ** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK
335 ** on success or an error code is something goes wrong.
336 **
337 ** If the journal format is 2 or 3, write the integer as 4 big-endian
338 ** bytes. If the journal format is 1, write the integer in the native
339 ** byte order. In normal operation, only formats 2 and 3 are used.
340 ** Journal format 1 is only used for testing.
341 */
write32bits(OsFile * fd,u32 val)342 static int write32bits(OsFile *fd, u32 val){
343 unsigned char ac[4];
344 if( journal_format<=1 ){
345 return sqliteOsWrite(fd, &val, 4);
346 }
347 ac[0] = (val>>24) & 0xff;
348 ac[1] = (val>>16) & 0xff;
349 ac[2] = (val>>8) & 0xff;
350 ac[3] = val & 0xff;
351 return sqliteOsWrite(fd, ac, 4);
352 }
353
354 /*
355 ** Write a 32-bit integer into a page header right before the
356 ** page data. This will overwrite the PgHdr.pDirty pointer.
357 **
358 ** The integer is big-endian for formats 2 and 3 and native byte order
359 ** for journal format 1.
360 */
store32bits(u32 val,PgHdr * p,int offset)361 static void store32bits(u32 val, PgHdr *p, int offset){
362 unsigned char *ac;
363 ac = &((unsigned char*)PGHDR_TO_DATA(p))[offset];
364 if( journal_format<=1 ){
365 memcpy(ac, &val, 4);
366 }else{
367 ac[0] = (val>>24) & 0xff;
368 ac[1] = (val>>16) & 0xff;
369 ac[2] = (val>>8) & 0xff;
370 ac[3] = val & 0xff;
371 }
372 }
373
374
375 /*
376 ** Convert the bits in the pPager->errMask into an approprate
377 ** return code.
378 */
pager_errcode(Pager * pPager)379 static int pager_errcode(Pager *pPager){
380 int rc = SQLITE_OK;
381 if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL;
382 if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR;
383 if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL;
384 if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM;
385 if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT;
386 return rc;
387 }
388
389 /*
390 ** Add or remove a page from the list of all pages that are in the
391 ** checkpoint journal.
392 **
393 ** The Pager keeps a separate list of pages that are currently in
394 ** the checkpoint journal. This helps the sqlitepager_ckpt_commit()
395 ** routine run MUCH faster for the common case where there are many
396 ** pages in memory but only a few are in the checkpoint journal.
397 */
page_add_to_ckpt_list(PgHdr * pPg)398 static void page_add_to_ckpt_list(PgHdr *pPg){
399 Pager *pPager = pPg->pPager;
400 if( pPg->inCkpt ) return;
401 assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );
402 pPg->pPrevCkpt = 0;
403 if( pPager->pCkpt ){
404 pPager->pCkpt->pPrevCkpt = pPg;
405 }
406 pPg->pNextCkpt = pPager->pCkpt;
407 pPager->pCkpt = pPg;
408 pPg->inCkpt = 1;
409 }
/*
** Take a page off the list of pages that are in the checkpoint journal.
** Nothing happens if the page is not marked inCkpt.
**
** The neighbours of the page (or the list head in Pager.pCkpt, when the
** page is first) are linked around it, then the page's own links and its
** inCkpt flag are cleared.
*/
static void page_remove_from_ckpt_list(PgHdr *pPg){
  PgHdr *pBefore;
  PgHdr *pAfter;
  if( !pPg->inCkpt ) return;
  pBefore = pPg->pPrevCkpt;
  pAfter = pPg->pNextCkpt;
  if( pBefore==0 ){
    /* No predecessor, so the page must be the head of the list */
    assert( pPg->pPager->pCkpt==pPg );
    pPg->pPager->pCkpt = pAfter;
  }else{
    assert( pBefore->pNextCkpt==pPg );
    pBefore->pNextCkpt = pAfter;
  }
  if( pAfter ){
    assert( pAfter->pPrevCkpt==pPg );
    pAfter->pPrevCkpt = pBefore;
  }
  pPg->pNextCkpt = 0;
  pPg->pPrevCkpt = 0;
  pPg->inCkpt = 0;
}
427
428 /*
429 ** Find a page in the hash table given its page number. Return
430 ** a pointer to the page or NULL if not found.
431 */
pager_lookup(Pager * pPager,Pgno pgno)432 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
433 PgHdr *p = pPager->aHash[pager_hash(pgno)];
434 while( p && p->pgno!=pgno ){
435 p = p->pNextHash;
436 }
437 return p;
438 }
439
440 /*
441 ** Unlock the database and clear the in-memory cache. This routine
442 ** sets the state of the pager back to what it was when it was first
443 ** opened. Any outstanding pages are invalidated and subsequent attempts
444 ** to access those pages will likely result in a coredump.
445 */
pager_reset(Pager * pPager)446 static void pager_reset(Pager *pPager){
447 PgHdr *pPg, *pNext;
448 for(pPg=pPager->pAll; pPg; pPg=pNext){
449 pNext = pPg->pNextAll;
450 sqliteFree(pPg);
451 }
452 pPager->pFirst = 0;
453 pPager->pFirstSynced = 0;
454 pPager->pLast = 0;
455 pPager->pAll = 0;
456 memset(pPager->aHash, 0, sizeof(pPager->aHash));
457 pPager->nPage = 0;
458 if( pPager->state>=SQLITE_WRITELOCK ){
459 sqlitepager_rollback(pPager);
460 }
461 sqliteOsUnlock(&pPager->fd);
462 pPager->state = SQLITE_UNLOCK;
463 pPager->dbSize = -1;
464 pPager->nRef = 0;
465 assert( pPager->journalOpen==0 );
466 }
467
468 /*
469 ** When this routine is called, the pager has the journal file open and
470 ** a write lock on the database. This routine releases the database
471 ** write lock and acquires a read lock in its place. The journal file
472 ** is deleted and closed.
473 **
474 ** TODO: Consider keeping the journal file open for temporary databases.
475 ** This might give a performance improvement on windows where opening
476 ** a file is an expensive operation.
477 */
pager_unwritelock(Pager * pPager)478 static int pager_unwritelock(Pager *pPager){
479 int rc;
480 PgHdr *pPg;
481 if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK;
482 sqlitepager_ckpt_commit(pPager);
483 if( pPager->ckptOpen ){
484 sqliteOsClose(&pPager->cpfd);
485 pPager->ckptOpen = 0;
486 }
487 if( pPager->journalOpen ){
488 sqliteOsClose(&pPager->jfd);
489 pPager->journalOpen = 0;
490 sqliteOsDelete(pPager->zJournal);
491 sqliteFree( pPager->aInJournal );
492 pPager->aInJournal = 0;
493 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
494 pPg->inJournal = 0;
495 pPg->dirty = 0;
496 pPg->needSync = 0;
497 }
498 }else{
499 assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
500 }
501 rc = sqliteOsReadLock(&pPager->fd);
502 if( rc==SQLITE_OK ){
503 pPager->state = SQLITE_READLOCK;
504 }else{
505 /* This can only happen if a process does a BEGIN, then forks and the
506 ** child process does the COMMIT. Because of the semantics of unix
507 ** file locking, the unlock will fail.
508 */
509 pPager->state = SQLITE_UNLOCK;
510 }
511 return rc;
512 }
513
514 /*
515 ** Compute and return a checksum for the page of data.
516 **
517 ** This is not a real checksum. It is really just the sum of the
518 ** random initial value and the page number. We considered do a checksum
519 ** of the database, but that was found to be too slow.
520 */
pager_cksum(Pager * pPager,Pgno pgno,const char * aData)521 static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
522 u32 cksum = pPager->cksumInit + pgno;
523 return cksum;
524 }
525
526 /*
527 ** Read a single page from the journal file opened on file descriptor
528 ** jfd. Playback this one page.
529 **
530 ** There are three different journal formats. The format parameter determines
531 ** which format is used by the journal that is played back.
532 */
pager_playback_one_page(Pager * pPager,OsFile * jfd,int format)533 static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
534 int rc;
535 PgHdr *pPg; /* An existing page in the cache */
536 PageRecord pgRec;
537 u32 cksum;
538
539 rc = read32bits(format, jfd, &pgRec.pgno);
540 if( rc!=SQLITE_OK ) return rc;
541 rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
542 if( rc!=SQLITE_OK ) return rc;
543
544 /* Sanity checking on the page. This is more important that I originally
545 ** thought. If a power failure occurs while the journal is being written,
546 ** it could cause invalid data to be written into the journal. We need to
547 ** detect this invalid data (with high probability) and ignore it.
548 */
549 if( pgRec.pgno==0 ){
550 return SQLITE_DONE;
551 }
552 if( pgRec.pgno>(unsigned)pPager->dbSize ){
553 return SQLITE_OK;
554 }
555 if( format>=JOURNAL_FORMAT_3 ){
556 rc = read32bits(format, jfd, &cksum);
557 if( rc ) return rc;
558 if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){
559 return SQLITE_DONE;
560 }
561 }
562
563 /* Playback the page. Update the in-memory copy of the page
564 ** at the same time, if there is one.
565 */
566 pPg = pager_lookup(pPager, pgRec.pgno);
567 TRACE2("PLAYBACK %d\n", pgRec.pgno);
568 sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
569 rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
570 if( pPg ){
571 /* No page should ever be rolled back that is in use, except for page
572 ** 1 which is held in use in order to keep the lock on the database
573 ** active. However, such a page may be rolled back as a result of an
574 ** internal error resulting in an automatic call to
575 ** sqlitepager_rollback(), so we can't assert() it.
576 */
577 /* assert( pPg->nRef==0 || pPg->pgno==1 ) */
578 memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
579 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
580 pPg->dirty = 0;
581 pPg->needSync = 0;
582 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
583 }
584 return rc;
585 }
586
587 /*
588 ** Playback the journal and thus restore the database file to
589 ** the state it was in before we started making changes.
590 **
591 ** The journal file format is as follows:
592 **
593 ** * 8 byte prefix. One of the aJournalMagic123 vectors defined
594 ** above. The format of the journal file is determined by which
595 ** of the three prefix vectors is seen.
596 ** * 4 byte big-endian integer which is the number of valid page records
597 ** in the journal. If this value is 0xffffffff, then compute the
598 ** number of page records from the journal size. This field appears
599 ** in format 3 only.
600 ** * 4 byte big-endian integer which is the initial value for the
601 ** sanity checksum. This field appears in format 3 only.
602 ** * 4 byte integer which is the number of pages to truncate the
603 ** database to during a rollback.
** * Zero or more page instances, each as follows:
605 ** + 4 byte page number.
606 ** + SQLITE_PAGE_SIZE bytes of data.
607 ** + 4 byte checksum (format 3 only)
608 **
609 ** When we speak of the journal header, we mean the first 4 bullets above.
610 ** Each entry in the journal is an instance of the 5th bullet. Note that
611 ** bullets 2 and 3 only appear in format-3 journals.
612 **
613 ** Call the value from the second bullet "nRec". nRec is the number of
614 ** valid page entries in the journal. In most cases, you can compute the
615 ** value of nRec from the size of the journal file. But if a power
616 ** failure occurred while the journal was being written, it could be the
617 ** case that the size of the journal file had already been increased but
618 ** the extra entries had not yet made it safely to disk. In such a case,
619 ** the value of nRec computed from the file size would be too large. For
620 ** that reason, we always use the nRec value in the header.
621 **
622 ** If the nRec value is 0xffffffff it means that nRec should be computed
623 ** from the file size. This value is used when the user selects the
624 ** no-sync option for the journal. A power failure could lead to corruption
625 ** in this case. But for things like temporary table (which will be
626 ** deleted when the power is restored) we don't care.
627 **
628 ** Journal formats 1 and 2 do not have an nRec value in the header so we
629 ** have to compute nRec from the file size. This has risks (as described
630 ** above) which is why all persistent tables have been changed to use
631 ** format 3.
632 **
633 ** If the file opened as the journal file is not a well-formed
634 ** journal file then the database will likely already be
635 ** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
636 ** and SQLITE_CORRUPT is returned. If it all works, then this routine
637 ** returns SQLITE_OK.
638 */
static int pager_playback(Pager *pPager, int useJournalSize){
  /* Parameters:
  **   pPager          The pager whose journal (pPager->jfd) is replayed.
  **   useJournalSize  If non-zero, the record count of a format-3 journal
  **                   is computed from the journal file size instead of
  **                   being taken from the nRec field of its header.
  **
  ** Every exit goes through end_playback, which always calls
  ** pager_unwritelock().  On any failure PAGER_ERR_CORRUPT is set in
  ** pPager->errMask and SQLITE_CORRUPT is returned, whatever the
  ** intermediate error code was.  Otherwise the result of
  ** pager_unwritelock() is returned.
  */
  off_t szJ;               /* Size of the journal file in bytes */
  int nRec;                /* Number of Records in the journal */
  int i;                   /* Loop counter */
  Pgno mxPg = 0;           /* Size of the original file in pages */
  int format;              /* Format of the journal file. */
  unsigned char aMagic[sizeof(aJournalMagic1)];
  int rc;

  /* Figure out how many records are in the journal.  Abort early if
  ** the journal is empty.
  */
  assert( pPager->journalOpen );
  sqliteOsSeek(&pPager->jfd, 0);
  rc = sqliteOsFileSize(&pPager->jfd, &szJ);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }

  /* If the journal file is too small to contain a complete header,
  ** it must mean that the process that created the journal was just
  ** beginning to write the journal file when it died.  In that case,
  ** the database file should have still been completely unchanged.
  ** Nothing needs to be rolled back.  We can safely ignore this journal.
  **
  ** rc is still SQLITE_OK here, so this jump is not treated as an error.
  */
  if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
    goto end_playback;
  }

  /* Read the beginning of the journal and truncate the
  ** database file back to its original size.
  */
  rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
  if( rc!=SQLITE_OK ){
    rc = SQLITE_PROTOCOL;
    goto end_playback;
  }
  /* Identify the journal format from its magic prefix.  An unrecognized
  ** prefix is an error.
  */
  if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_3;
  }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_2;
  }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
    format = JOURNAL_FORMAT_1;
  }else{
    rc = SQLITE_PROTOCOL;
    goto end_playback;
  }
  if( format>=JOURNAL_FORMAT_3 ){
    if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
      /* Ignore the journal if it is too small to contain a complete
      ** header.  We already did this test once above, but at the prior
      ** test, we did not know the journal format and so we had to assume
      ** the smallest possible header.  Now we know the header is bigger
      ** than the minimum so we test again.
      */
      goto end_playback;
    }
    /* Format 3 stores the record count and the checksum seed in the
    ** header.
    */
    rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
    if( rc ) goto end_playback;
    rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
    if( rc ) goto end_playback;
    /* nRec is a signed int, so a stored 0xffffffff reads back as -1; the
    ** comparison below still matches after the usual conversion to
    ** unsigned.
    */
    if( nRec==0xffffffff || useJournalSize ){
      nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3);
    }
  }else{
    /* Older formats have no nRec field: derive the count from the size */
    nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2);
    assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
  }
  /* The last header field is the original database size in pages */
  rc = read32bits(format, &pPager->jfd, &mxPg);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }
  assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
  rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
  if( rc!=SQLITE_OK ){
    goto end_playback;
  }
  pPager->dbSize = mxPg;

  /* Copy original pages out of the journal and back into the database file.
  ** SQLITE_DONE from pager_playback_one_page() ends the loop early without
  ** being an error.
  */
  for(i=0; i<nRec; i++){
    rc = pager_playback_one_page(pPager, &pPager->jfd, format);
    if( rc!=SQLITE_OK ){
      if( rc==SQLITE_DONE ){
        rc = SQLITE_OK;
      }
      break;
    }
  }

  /* Pages that have been written to the journal but never synced
  ** were not restored by the loop above.  We have to restore those
  ** pages by reading them back from the original database.
  */
  if( rc==SQLITE_OK ){
    PgHdr *pPg;
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      char zBuf[SQLITE_PAGE_SIZE];
      if( !pPg->dirty ) continue;
      if( (int)pPg->pgno <= pPager->origDbSize ){
        sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
        rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
        TRACE2("REFETCH %d\n", pPg->pgno);
        CODEC(pPager, zBuf, pPg->pgno, 2);
        if( rc ) break;
      }else{
        /* The page lies beyond the original end of the file, so its
        ** restored content is all zeros.
        */
        memset(zBuf, 0, SQLITE_PAGE_SIZE);
      }
      /* Overwrite the cached copy if nobody holds the page or if its
      ** content differs from what was just fetched.
      */
      if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){
        memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE);
        memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
      }
      pPg->needSync = 0;
      pPg->dirty = 0;
    }
  }

end_playback:
  if( rc!=SQLITE_OK ){
    /* Any failure, including the SQLITE_PROTOCOL codes set above, is
    ** reported to the caller as corruption.
    */
    pager_unwritelock(pPager);
    pPager->errMask |= PAGER_ERR_CORRUPT;
    rc = SQLITE_CORRUPT;
  }else{
    rc = pager_unwritelock(pPager);
  }
  return rc;
}
767
768 /*
769 ** Playback the checkpoint journal.
770 **
771 ** This is similar to playing back the transaction journal but with
772 ** a few extra twists.
773 **
774 ** (1) The number of pages in the database file at the start of
775 ** the checkpoint is stored in pPager->ckptSize, not in the
776 ** journal file itself.
777 **
778 ** (2) In addition to playing back the checkpoint journal, also
779 ** playback all pages of the transaction journal beginning
780 ** at offset pPager->ckptJSize.
781 */
pager_ckpt_playback(Pager * pPager)782 static int pager_ckpt_playback(Pager *pPager){
783 off_t szJ; /* Size of the full journal */
784 int nRec; /* Number of Records */
785 int i; /* Loop counter */
786 int rc;
787
788 /* Truncate the database back to its original size.
789 */
790 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
791 pPager->dbSize = pPager->ckptSize;
792
793 /* Figure out how many records are in the checkpoint journal.
794 */
795 assert( pPager->ckptInUse && pPager->journalOpen );
796 sqliteOsSeek(&pPager->cpfd, 0);
797 nRec = pPager->ckptNRec;
798
799 /* Copy original pages out of the checkpoint journal and back into the
800 ** database file. Note that the checkpoint journal always uses format
801 ** 2 instead of format 3 since it does not need to be concerned with
802 ** power failures corrupting the journal and can thus omit the checksums.
803 */
804 for(i=nRec-1; i>=0; i--){
805 rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
806 assert( rc!=SQLITE_DONE );
807 if( rc!=SQLITE_OK ) goto end_ckpt_playback;
808 }
809
810 /* Figure out how many pages need to be copied out of the transaction
811 ** journal.
812 */
813 rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
814 if( rc!=SQLITE_OK ){
815 goto end_ckpt_playback;
816 }
817 rc = sqliteOsFileSize(&pPager->jfd, &szJ);
818 if( rc!=SQLITE_OK ){
819 goto end_ckpt_playback;
820 }
821 nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format);
822 for(i=nRec-1; i>=0; i--){
823 rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format);
824 if( rc!=SQLITE_OK ){
825 assert( rc!=SQLITE_DONE );
826 goto end_ckpt_playback;
827 }
828 }
829
830 end_ckpt_playback:
831 if( rc!=SQLITE_OK ){
832 pPager->errMask |= PAGER_ERR_CORRUPT;
833 rc = SQLITE_CORRUPT;
834 }
835 return rc;
836 }
837
838 /*
839 ** Change the maximum number of in-memory pages that are allowed.
840 **
841 ** The maximum number is the absolute value of the mxPage parameter.
842 ** If mxPage is negative, the noSync flag is also set. noSync bypasses
843 ** calls to sqliteOsSync(). The pager runs much faster with noSync on,
844 ** but if the operating system crashes or there is an abrupt power
845 ** failure, the database file might be left in an inconsistent and
846 ** unrepairable state.
847 */
sqlitepager_set_cachesize(Pager * pPager,int mxPage)848 void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
849 if( mxPage>=0 ){
850 pPager->noSync = pPager->tempFile;
851 if( pPager->noSync==0 ) pPager->needSync = 0;
852 }else{
853 pPager->noSync = 1;
854 mxPage = -mxPage;
855 }
856 if( mxPage>10 ){
857 pPager->mxPage = mxPage;
858 }
859 }
860
861 /*
862 ** Adjust the robustness of the database to damage due to OS crashes
863 ** or power failures by changing the number of syncs()s when writing
864 ** the rollback journal. There are three levels:
865 **
866 ** OFF sqliteOsSync() is never called. This is the default
867 ** for temporary and transient files.
868 **
869 ** NORMAL The journal is synced once before writes begin on the
870 ** database. This is normally adequate protection, but
871 ** it is theoretically possible, though very unlikely,
872 ** that an inopertune power failure could leave the journal
873 ** in a state which would cause damage to the database
874 ** when it is rolled back.
875 **
876 ** FULL The journal is synced twice before writes begin on the
877 ** database (with some additional information - the nRec field
878 ** of the journal header - being written in between the two
879 ** syncs). If we assume that writing a
880 ** single disk sector is atomic, then this mode provides
881 ** assurance that the journal will not be corrupted to the
882 ** point of causing damage to the database during rollback.
883 **
884 ** Numeric values associated with these states are OFF==1, NORMAL=2,
885 ** and FULL=3.
886 */
sqlitepager_set_safety_level(Pager * pPager,int level)887 void sqlitepager_set_safety_level(Pager *pPager, int level){
888 pPager->noSync = level==1 || pPager->tempFile;
889 pPager->fullSync = level==3 && !pPager->tempFile;
890 if( pPager->noSync==0 ) pPager->needSync = 0;
891 }
892
893 /*
894 ** Open a temporary file. Write the name of the file into zName
895 ** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.) Write
896 ** the file descriptor into *fd. Return SQLITE_OK on success or some
897 ** other error code if we fail.
898 **
899 ** The OS will automatically delete the temporary file when it is
900 ** closed.
901 */
sqlitepager_opentemp(char * zFile,OsFile * fd)902 static int sqlitepager_opentemp(char *zFile, OsFile *fd){
903 int cnt = 8;
904 int rc;
905 do{
906 cnt--;
907 sqliteOsTempFileName(zFile);
908 rc = sqliteOsOpenExclusive(zFile, fd, 1);
909 }while( cnt>0 && rc!=SQLITE_OK );
910 return rc;
911 }
912
913 /*
914 ** Create a new page cache and put a pointer to the page cache in *ppPager.
915 ** The file to be cached need not exist. The file is not locked until
916 ** the first call to sqlitepager_get() and is only held open until the
917 ** last page is released using sqlitepager_unref().
918 **
919 ** If zFilename is NULL then a randomly-named temporary file is created
920 ** and used as the file to be cached. The file will be deleted
921 ** automatically when it is closed.
922 */
sqlitepager_open(Pager ** ppPager,const char * zFilename,int mxPage,int nExtra,int useJournal)923 int sqlitepager_open(
924 Pager **ppPager, /* Return the Pager structure here */
925 const char *zFilename, /* Name of the database file to open */
926 int mxPage, /* Max number of in-memory cache pages */
927 int nExtra, /* Extra bytes append to each in-memory page */
928 int useJournal /* TRUE to use a rollback journal on this file */
929 ){
930 Pager *pPager;
931 char *zFullPathname;
932 int nameLen;
933 OsFile fd;
934 int rc, i;
935 int tempFile;
936 int readOnly = 0;
937 char zTemp[SQLITE_TEMPNAME_SIZE];
938
939 *ppPager = 0;
940 if( sqlite_malloc_failed ){
941 return SQLITE_NOMEM;
942 }
943 if( zFilename && zFilename[0] ){
944 zFullPathname = sqliteOsFullPathname(zFilename);
945 rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly);
946 tempFile = 0;
947 }else{
948 rc = sqlitepager_opentemp(zTemp, &fd);
949 zFilename = zTemp;
950 zFullPathname = sqliteOsFullPathname(zFilename);
951 tempFile = 1;
952 }
953 if( sqlite_malloc_failed ){
954 return SQLITE_NOMEM;
955 }
956 if( rc!=SQLITE_OK ){
957 sqliteFree(zFullPathname);
958 return SQLITE_CANTOPEN;
959 }
960 nameLen = strlen(zFullPathname);
961 pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 );
962 if( pPager==0 ){
963 sqliteOsClose(&fd);
964 sqliteFree(zFullPathname);
965 return SQLITE_NOMEM;
966 }
967 SET_PAGER(pPager);
968 pPager->zFilename = (char*)&pPager[1];
969 pPager->zDirectory = &pPager->zFilename[nameLen+1];
970 pPager->zJournal = &pPager->zDirectory[nameLen+1];
971 strcpy(pPager->zFilename, zFullPathname);
972 strcpy(pPager->zDirectory, zFullPathname);
973 for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){}
974 if( i>0 ) pPager->zDirectory[i-1] = 0;
975 strcpy(pPager->zJournal, zFullPathname);
976 sqliteFree(zFullPathname);
977 strcpy(&pPager->zJournal[nameLen], "-journal");
978 pPager->fd = fd;
979 pPager->journalOpen = 0;
980 pPager->useJournal = useJournal;
981 pPager->ckptOpen = 0;
982 pPager->ckptInUse = 0;
983 pPager->nRef = 0;
984 pPager->dbSize = -1;
985 pPager->ckptSize = 0;
986 pPager->ckptJSize = 0;
987 pPager->nPage = 0;
988 pPager->mxPage = mxPage>5 ? mxPage : 10;
989 pPager->state = SQLITE_UNLOCK;
990 pPager->errMask = 0;
991 pPager->tempFile = tempFile;
992 pPager->readOnly = readOnly;
993 pPager->needSync = 0;
994 pPager->noSync = pPager->tempFile || !useJournal;
995 pPager->pFirst = 0;
996 pPager->pFirstSynced = 0;
997 pPager->pLast = 0;
998 pPager->nExtra = nExtra;
999 memset(pPager->aHash, 0, sizeof(pPager->aHash));
1000 *ppPager = pPager;
1001 return SQLITE_OK;
1002 }
1003
1004 /*
1005 ** Set the destructor for this pager. If not NULL, the destructor is called
1006 ** when the reference count on each page reaches zero. The destructor can
1007 ** be used to clean up information in the extra segment appended to each page.
1008 **
1009 ** The destructor is not called as a result sqlitepager_close().
1010 ** Destructors are only called by sqlitepager_unref().
1011 */
sqlitepager_set_destructor(Pager * pPager,void (* xDesc)(void *))1012 void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){
1013 pPager->xDestructor = xDesc;
1014 }
1015
1016 /*
1017 ** Return the total number of pages in the disk file associated with
1018 ** pPager.
1019 */
sqlitepager_pagecount(Pager * pPager)1020 int sqlitepager_pagecount(Pager *pPager){
1021 off_t n;
1022 assert( pPager!=0 );
1023 if( pPager->dbSize>=0 ){
1024 return pPager->dbSize;
1025 }
1026 if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
1027 pPager->errMask |= PAGER_ERR_DISK;
1028 return 0;
1029 }
1030 n /= SQLITE_PAGE_SIZE;
1031 if( pPager->state!=SQLITE_UNLOCK ){
1032 pPager->dbSize = n;
1033 }
1034 return n;
1035 }
1036
1037 /*
1038 ** Forward declaration
1039 */
1040 static int syncJournal(Pager*);
1041
1042 /*
1043 ** Truncate the file to the number of pages specified.
1044 */
sqlitepager_truncate(Pager * pPager,Pgno nPage)1045 int sqlitepager_truncate(Pager *pPager, Pgno nPage){
1046 int rc;
1047 if( pPager->dbSize<0 ){
1048 sqlitepager_pagecount(pPager);
1049 }
1050 if( pPager->errMask!=0 ){
1051 rc = pager_errcode(pPager);
1052 return rc;
1053 }
1054 if( nPage>=(unsigned)pPager->dbSize ){
1055 return SQLITE_OK;
1056 }
1057 syncJournal(pPager);
1058 rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage);
1059 if( rc==SQLITE_OK ){
1060 pPager->dbSize = nPage;
1061 }
1062 return rc;
1063 }
1064
1065 /*
1066 ** Shutdown the page cache. Free all memory and close all files.
1067 **
1068 ** If a transaction was in progress when this routine is called, that
1069 ** transaction is rolled back. All outstanding pages are invalidated
1070 ** and their memory is freed. Any attempt to use a page associated
1071 ** with this page cache after this function returns will likely
1072 ** result in a coredump.
1073 */
sqlitepager_close(Pager * pPager)1074 int sqlitepager_close(Pager *pPager){
1075 PgHdr *pPg, *pNext;
1076 switch( pPager->state ){
1077 case SQLITE_WRITELOCK: {
1078 sqlitepager_rollback(pPager);
1079 sqliteOsUnlock(&pPager->fd);
1080 assert( pPager->journalOpen==0 );
1081 break;
1082 }
1083 case SQLITE_READLOCK: {
1084 sqliteOsUnlock(&pPager->fd);
1085 break;
1086 }
1087 default: {
1088 /* Do nothing */
1089 break;
1090 }
1091 }
1092 for(pPg=pPager->pAll; pPg; pPg=pNext){
1093 pNext = pPg->pNextAll;
1094 sqliteFree(pPg);
1095 }
1096 sqliteOsClose(&pPager->fd);
1097 assert( pPager->journalOpen==0 );
1098 /* Temp files are automatically deleted by the OS
1099 ** if( pPager->tempFile ){
1100 ** sqliteOsDelete(pPager->zFilename);
1101 ** }
1102 */
1103 CLR_PAGER(pPager);
1104 if( pPager->zFilename!=(char*)&pPager[1] ){
1105 assert( 0 ); /* Cannot happen */
1106 sqliteFree(pPager->zFilename);
1107 sqliteFree(pPager->zJournal);
1108 sqliteFree(pPager->zDirectory);
1109 }
1110 sqliteFree(pPager);
1111 return SQLITE_OK;
1112 }
1113
1114 /*
1115 ** Return the page number for the given page data.
1116 */
sqlitepager_pagenumber(void * pData)1117 Pgno sqlitepager_pagenumber(void *pData){
1118 PgHdr *p = DATA_TO_PGHDR(pData);
1119 return p->pgno;
1120 }
1121
1122 /*
1123 ** Increment the reference count for a page. If the page is
1124 ** currently on the freelist (the reference count is zero) then
1125 ** remove it from the freelist.
1126 */
1127 #define page_ref(P) ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
_page_ref(PgHdr * pPg)1128 static void _page_ref(PgHdr *pPg){
1129 if( pPg->nRef==0 ){
1130 /* The page is currently on the freelist. Remove it. */
1131 if( pPg==pPg->pPager->pFirstSynced ){
1132 PgHdr *p = pPg->pNextFree;
1133 while( p && p->needSync ){ p = p->pNextFree; }
1134 pPg->pPager->pFirstSynced = p;
1135 }
1136 if( pPg->pPrevFree ){
1137 pPg->pPrevFree->pNextFree = pPg->pNextFree;
1138 }else{
1139 pPg->pPager->pFirst = pPg->pNextFree;
1140 }
1141 if( pPg->pNextFree ){
1142 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1143 }else{
1144 pPg->pPager->pLast = pPg->pPrevFree;
1145 }
1146 pPg->pPager->nRef++;
1147 }
1148 pPg->nRef++;
1149 REFINFO(pPg);
1150 }
1151
1152 /*
1153 ** Increment the reference count for a page. The input pointer is
1154 ** a reference to the page data.
1155 */
sqlitepager_ref(void * pData)1156 int sqlitepager_ref(void *pData){
1157 PgHdr *pPg = DATA_TO_PGHDR(pData);
1158 page_ref(pPg);
1159 return SQLITE_OK;
1160 }
1161
/*
** Sync the journal.  In other words, make sure all the pages that have
** been written to the journal have actually reached the surface of the
** disk.  It is not safe to modify the original database file until after
** the journal has been synced.  If the original database is modified before
** the journal is synced and a power failure occurs, the unsynced journal
** data would be lost and we would be unable to completely rollback the
** database changes.  Database corruption would occur.
**
** This routine also updates the nRec field in the header of the journal.
** (See comments on the pager_playback() routine for additional information.)
** If the sync mode is FULL, two syncs will occur.  First the whole journal
** is synced, then the nRec field is updated, then a second sync occurs.
**
** For temporary databases, we do not care if we are able to rollback
** after a power failure, so no sync occurs.
**
** This routine clears the needSync field of every page currently held in
** memory.
**
** Returns SQLITE_OK on success, or the error code of the first journal
** operation that fails.  On failure the needSync flags are left unchanged.
*/
static int syncJournal(Pager *pPager){
  PgHdr *pPg;
  int rc = SQLITE_OK;

  /* Sync the journal before modifying the main database
  ** (assuming there is a journal and it needs to be synced.)
  */
  if( pPager->needSync ){
    if( !pPager->tempFile ){
      assert( pPager->journalOpen );
      /* assert( !pPager->noSync ); // noSync might be set if synchronous
      ** was turned off after the transaction was started.  Ticket #615 */
#ifndef NDEBUG
      {
        /* Make sure the pPager->nRec counter we are keeping agrees
        ** with the nRec computed from the size of the journal file.
        */
        off_t hdrSz, pgSz, jSz;
        hdrSz = JOURNAL_HDR_SZ(journal_format);
        pgSz = JOURNAL_PG_SZ(journal_format);
        rc = sqliteOsFileSize(&pPager->jfd, &jSz);
        if( rc!=0 ) return rc;
        assert( pPager->nRec*pgSz+hdrSz==jSz );
      }
#endif
      if( journal_format>=3 ){
        /* Write the nRec value into the journal file header */
        off_t szJ;
        if( pPager->fullSync ){
          /* FULL mode: get the page records onto disk before the
          ** header is updated to cover them. */
          TRACE1("SYNC\n");
          rc = sqliteOsSync(&pPager->jfd);
          if( rc!=0 ) return rc;
        }
        /* nRec is written at offset sizeof(aJournalMagic1) in the file */
        sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1));
        rc = write32bits(&pPager->jfd, pPager->nRec);
        if( rc ) return rc;
        /* Move the file position back to the end of the last record:
        ** header size plus nRec page records. */
        szJ = JOURNAL_HDR_SZ(journal_format) +
                 pPager->nRec*JOURNAL_PG_SZ(journal_format);
        sqliteOsSeek(&pPager->jfd, szJ);
      }
      TRACE1("SYNC\n");
      rc = sqliteOsSync(&pPager->jfd);
      if( rc!=0 ) return rc;
      pPager->journalStarted = 1;
    }
    pPager->needSync = 0;

    /* Erase the needSync flag from every page.
    */
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      pPg->needSync = 0;
    }
    /* With no page needing a sync, the first synced entry of the
    ** freelist is simply the first entry. */
    pPager->pFirstSynced = pPager->pFirst;
  }

#ifndef NDEBUG
  /* If the Pager.needSync flag is clear then the PgHdr.needSync
  ** flag must also be clear for all pages.  Verify that this
  ** invariant is true.  (This else pairs with the needSync test
  ** above and only exists in debugging builds.)
  */
  else{
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      assert( pPg->needSync==0 );
    }
    assert( pPager->pFirstSynced==pPager->pFirst );
  }
#endif

  return rc;
}
1252
1253 /*
1254 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
1255 ** every one of those pages out to the database file and mark them all
1256 ** as clean.
1257 */
pager_write_pagelist(PgHdr * pList)1258 static int pager_write_pagelist(PgHdr *pList){
1259 Pager *pPager;
1260 int rc;
1261
1262 if( pList==0 ) return SQLITE_OK;
1263 pPager = pList->pPager;
1264 while( pList ){
1265 assert( pList->dirty );
1266 sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1267 CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
1268 TRACE2("STORE %d\n", pList->pgno);
1269 rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
1270 CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
1271 if( rc ) return rc;
1272 pList->dirty = 0;
1273 pList = pList->pDirty;
1274 }
1275 return SQLITE_OK;
1276 }
1277
1278 /*
1279 ** Collect every dirty page into a dirty list and
1280 ** return a pointer to the head of that list. All pages are
1281 ** collected even if they are still in use.
1282 */
pager_get_all_dirty_pages(Pager * pPager)1283 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
1284 PgHdr *p, *pList;
1285 pList = 0;
1286 for(p=pPager->pAll; p; p=p->pNextAll){
1287 if( p->dirty ){
1288 p->pDirty = pList;
1289 pList = p;
1290 }
1291 }
1292 return pList;
1293 }
1294
1295 /*
1296 ** Acquire a page.
1297 **
1298 ** A read lock on the disk file is obtained when the first page is acquired.
1299 ** This read lock is dropped when the last page is released.
1300 **
1301 ** A _get works for any page number greater than 0. If the database
1302 ** file is smaller than the requested page, then no actual disk
1303 ** read occurs and the memory image of the page is initialized to
1304 ** all zeros. The extra data appended to a page is always initialized
1305 ** to zeros the first time a page is loaded into memory.
1306 **
1307 ** The acquisition might fail for several reasons. In all cases,
1308 ** an appropriate error code is returned and *ppPage is set to NULL.
1309 **
1310 ** See also sqlitepager_lookup(). Both this routine and _lookup() attempt
1311 ** to find a page in the in-memory cache first. If the page is not already
1312 ** in memory, this routine goes to disk to read it in whereas _lookup()
1313 ** just returns 0. This routine acquires a read-lock the first time it
1314 ** has to go to disk, and could also playback an old journal if necessary.
1315 ** Since _lookup() never goes to disk, it never has to deal with locks
1316 ** or journal files.
1317 */
sqlitepager_get(Pager * pPager,Pgno pgno,void ** ppPage)1318 int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
1319 PgHdr *pPg;
1320 int rc;
1321
1322 /* Make sure we have not hit any critical errors.
1323 */
1324 assert( pPager!=0 );
1325 assert( pgno!=0 );
1326 *ppPage = 0;
1327 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1328 return pager_errcode(pPager);
1329 }
1330
1331 /* If this is the first page accessed, then get a read lock
1332 ** on the database file.
1333 */
1334 if( pPager->nRef==0 ){
1335 rc = sqliteOsReadLock(&pPager->fd);
1336 if( rc!=SQLITE_OK ){
1337 return rc;
1338 }
1339 pPager->state = SQLITE_READLOCK;
1340
1341 /* If a journal file exists, try to play it back.
1342 */
1343 if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){
1344 int rc;
1345
1346 /* Get a write lock on the database
1347 */
1348 rc = sqliteOsWriteLock(&pPager->fd);
1349 if( rc!=SQLITE_OK ){
1350 if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){
1351 /* This should never happen! */
1352 rc = SQLITE_INTERNAL;
1353 }
1354 return rc;
1355 }
1356 pPager->state = SQLITE_WRITELOCK;
1357
1358 /* Open the journal for reading only. Return SQLITE_BUSY if
1359 ** we are unable to open the journal file.
1360 **
1361 ** The journal file does not need to be locked itself. The
1362 ** journal file is never open unless the main database file holds
1363 ** a write lock, so there is never any chance of two or more
1364 ** processes opening the journal at the same time.
1365 */
1366 rc = sqliteOsOpenReadOnly(pPager->zJournal, &pPager->jfd);
1367 if( rc!=SQLITE_OK ){
1368 rc = sqliteOsUnlock(&pPager->fd);
1369 assert( rc==SQLITE_OK );
1370 return SQLITE_BUSY;
1371 }
1372 pPager->journalOpen = 1;
1373 pPager->journalStarted = 0;
1374
1375 /* Playback and delete the journal. Drop the database write
1376 ** lock and reacquire the read lock.
1377 */
1378 rc = pager_playback(pPager, 0);
1379 if( rc!=SQLITE_OK ){
1380 return rc;
1381 }
1382 }
1383 pPg = 0;
1384 }else{
1385 /* Search for page in cache */
1386 pPg = pager_lookup(pPager, pgno);
1387 }
1388 if( pPg==0 ){
1389 /* The requested page is not in the page cache. */
1390 int h;
1391 pPager->nMiss++;
1392 if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){
1393 /* Create a new page */
1394 pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE
1395 + sizeof(u32) + pPager->nExtra );
1396 if( pPg==0 ){
1397 pager_unwritelock(pPager);
1398 pPager->errMask |= PAGER_ERR_MEM;
1399 return SQLITE_NOMEM;
1400 }
1401 memset(pPg, 0, sizeof(*pPg));
1402 pPg->pPager = pPager;
1403 pPg->pNextAll = pPager->pAll;
1404 if( pPager->pAll ){
1405 pPager->pAll->pPrevAll = pPg;
1406 }
1407 pPg->pPrevAll = 0;
1408 pPager->pAll = pPg;
1409 pPager->nPage++;
1410 }else{
1411 /* Find a page to recycle. Try to locate a page that does not
1412 ** require us to do an fsync() on the journal.
1413 */
1414 pPg = pPager->pFirstSynced;
1415
1416 /* If we could not find a page that does not require an fsync()
1417 ** on the journal file then fsync the journal file. This is a
1418 ** very slow operation, so we work hard to avoid it. But sometimes
1419 ** it can't be helped.
1420 */
1421 if( pPg==0 ){
1422 int rc = syncJournal(pPager);
1423 if( rc!=0 ){
1424 sqlitepager_rollback(pPager);
1425 return SQLITE_IOERR;
1426 }
1427 pPg = pPager->pFirst;
1428 }
1429 assert( pPg->nRef==0 );
1430
1431 /* Write the page to the database file if it is dirty.
1432 */
1433 if( pPg->dirty ){
1434 assert( pPg->needSync==0 );
1435 pPg->pDirty = 0;
1436 rc = pager_write_pagelist( pPg );
1437 if( rc!=SQLITE_OK ){
1438 sqlitepager_rollback(pPager);
1439 return SQLITE_IOERR;
1440 }
1441 }
1442 assert( pPg->dirty==0 );
1443
1444 /* If the page we are recycling is marked as alwaysRollback, then
1445 ** set the global alwaysRollback flag, thus disabling the
1446 ** sqlite_dont_rollback() optimization for the rest of this transaction.
1447 ** It is necessary to do this because the page marked alwaysRollback
1448 ** might be reloaded at a later time but at that point we won't remember
1449 ** that is was marked alwaysRollback. This means that all pages must
1450 ** be marked as alwaysRollback from here on out.
1451 */
1452 if( pPg->alwaysRollback ){
1453 pPager->alwaysRollback = 1;
1454 }
1455
1456 /* Unlink the old page from the free list and the hash table
1457 */
1458 if( pPg==pPager->pFirstSynced ){
1459 PgHdr *p = pPg->pNextFree;
1460 while( p && p->needSync ){ p = p->pNextFree; }
1461 pPager->pFirstSynced = p;
1462 }
1463 if( pPg->pPrevFree ){
1464 pPg->pPrevFree->pNextFree = pPg->pNextFree;
1465 }else{
1466 assert( pPager->pFirst==pPg );
1467 pPager->pFirst = pPg->pNextFree;
1468 }
1469 if( pPg->pNextFree ){
1470 pPg->pNextFree->pPrevFree = pPg->pPrevFree;
1471 }else{
1472 assert( pPager->pLast==pPg );
1473 pPager->pLast = pPg->pPrevFree;
1474 }
1475 pPg->pNextFree = pPg->pPrevFree = 0;
1476 if( pPg->pNextHash ){
1477 pPg->pNextHash->pPrevHash = pPg->pPrevHash;
1478 }
1479 if( pPg->pPrevHash ){
1480 pPg->pPrevHash->pNextHash = pPg->pNextHash;
1481 }else{
1482 h = pager_hash(pPg->pgno);
1483 assert( pPager->aHash[h]==pPg );
1484 pPager->aHash[h] = pPg->pNextHash;
1485 }
1486 pPg->pNextHash = pPg->pPrevHash = 0;
1487 pPager->nOvfl++;
1488 }
1489 pPg->pgno = pgno;
1490 if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
1491 sqliteCheckMemory(pPager->aInJournal, pgno/8);
1492 assert( pPager->journalOpen );
1493 pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
1494 pPg->needSync = 0;
1495 }else{
1496 pPg->inJournal = 0;
1497 pPg->needSync = 0;
1498 }
1499 if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
1500 && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
1501 page_add_to_ckpt_list(pPg);
1502 }else{
1503 page_remove_from_ckpt_list(pPg);
1504 }
1505 pPg->dirty = 0;
1506 pPg->nRef = 1;
1507 REFINFO(pPg);
1508 pPager->nRef++;
1509 h = pager_hash(pgno);
1510 pPg->pNextHash = pPager->aHash[h];
1511 pPager->aHash[h] = pPg;
1512 if( pPg->pNextHash ){
1513 assert( pPg->pNextHash->pPrevHash==0 );
1514 pPg->pNextHash->pPrevHash = pPg;
1515 }
1516 if( pPager->nExtra>0 ){
1517 memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
1518 }
1519 if( pPager->dbSize<0 ) sqlitepager_pagecount(pPager);
1520 if( pPager->errMask!=0 ){
1521 sqlitepager_unref(PGHDR_TO_DATA(pPg));
1522 rc = pager_errcode(pPager);
1523 return rc;
1524 }
1525 if( pPager->dbSize<(int)pgno ){
1526 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1527 }else{
1528 int rc;
1529 sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE);
1530 rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
1531 TRACE2("FETCH %d\n", pPg->pgno);
1532 CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
1533 if( rc!=SQLITE_OK ){
1534 off_t fileSize;
1535 if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
1536 || fileSize>=pgno*SQLITE_PAGE_SIZE ){
1537 sqlitepager_unref(PGHDR_TO_DATA(pPg));
1538 return rc;
1539 }else{
1540 memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
1541 }
1542 }
1543 }
1544 }else{
1545 /* The requested page is in the page cache. */
1546 pPager->nHit++;
1547 page_ref(pPg);
1548 }
1549 *ppPage = PGHDR_TO_DATA(pPg);
1550 return SQLITE_OK;
1551 }
1552
1553 /*
1554 ** Acquire a page if it is already in the in-memory cache. Do
1555 ** not read the page from disk. Return a pointer to the page,
1556 ** or 0 if the page is not in cache.
1557 **
1558 ** See also sqlitepager_get(). The difference between this routine
1559 ** and sqlitepager_get() is that _get() will go to the disk and read
1560 ** in the page if the page is not already in cache. This routine
1561 ** returns NULL if the page is not in cache or if a disk I/O error
1562 ** has ever happened.
1563 */
sqlitepager_lookup(Pager * pPager,Pgno pgno)1564 void *sqlitepager_lookup(Pager *pPager, Pgno pgno){
1565 PgHdr *pPg;
1566
1567 assert( pPager!=0 );
1568 assert( pgno!=0 );
1569 if( pPager->errMask & ~(PAGER_ERR_FULL) ){
1570 return 0;
1571 }
1572 /* if( pPager->nRef==0 ){
1573 ** return 0;
1574 ** }
1575 */
1576 pPg = pager_lookup(pPager, pgno);
1577 if( pPg==0 ) return 0;
1578 page_ref(pPg);
1579 return PGHDR_TO_DATA(pPg);
1580 }
1581
1582 /*
1583 ** Release a page.
1584 **
1585 ** If the number of references to the page drop to zero, then the
1586 ** page is added to the LRU list. When all references to all pages
1587 ** are released, a rollback occurs and the lock on the database is
1588 ** removed.
1589 */
sqlitepager_unref(void * pData)1590 int sqlitepager_unref(void *pData){
1591 PgHdr *pPg;
1592
1593 /* Decrement the reference count for this page
1594 */
1595 pPg = DATA_TO_PGHDR(pData);
1596 assert( pPg->nRef>0 );
1597 pPg->nRef--;
1598 REFINFO(pPg);
1599
1600 /* When the number of references to a page reach 0, call the
1601 ** destructor and add the page to the freelist.
1602 */
1603 if( pPg->nRef==0 ){
1604 Pager *pPager;
1605 pPager = pPg->pPager;
1606 pPg->pNextFree = 0;
1607 pPg->pPrevFree = pPager->pLast;
1608 pPager->pLast = pPg;
1609 if( pPg->pPrevFree ){
1610 pPg->pPrevFree->pNextFree = pPg;
1611 }else{
1612 pPager->pFirst = pPg;
1613 }
1614 if( pPg->needSync==0 && pPager->pFirstSynced==0 ){
1615 pPager->pFirstSynced = pPg;
1616 }
1617 if( pPager->xDestructor ){
1618 pPager->xDestructor(pData);
1619 }
1620
1621 /* When all pages reach the freelist, drop the read lock from
1622 ** the database file.
1623 */
1624 pPager->nRef--;
1625 assert( pPager->nRef>=0 );
1626 if( pPager->nRef==0 ){
1627 pager_reset(pPager);
1628 }
1629 }
1630 return SQLITE_OK;
1631 }
1632
1633 /*
1634 ** Create a journal file for pPager. There should already be a write
1635 ** lock on the database file when this routine is called.
1636 **
1637 ** Return SQLITE_OK if everything. Return an error code and release the
1638 ** write lock if anything goes wrong.
1639 */
pager_open_journal(Pager * pPager)1640 static int pager_open_journal(Pager *pPager){
1641 int rc;
1642 assert( pPager->state==SQLITE_WRITELOCK );
1643 assert( pPager->journalOpen==0 );
1644 assert( pPager->useJournal );
1645 sqlitepager_pagecount(pPager);
1646 pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
1647 if( pPager->aInJournal==0 ){
1648 sqliteOsReadLock(&pPager->fd);
1649 pPager->state = SQLITE_READLOCK;
1650 return SQLITE_NOMEM;
1651 }
1652 rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
1653 if( rc!=SQLITE_OK ){
1654 sqliteFree(pPager->aInJournal);
1655 pPager->aInJournal = 0;
1656 sqliteOsReadLock(&pPager->fd);
1657 pPager->state = SQLITE_READLOCK;
1658 return SQLITE_CANTOPEN;
1659 }
1660 sqliteOsOpenDirectory(pPager->zDirectory, &pPager->jfd);
1661 pPager->journalOpen = 1;
1662 pPager->journalStarted = 0;
1663 pPager->needSync = 0;
1664 pPager->alwaysRollback = 0;
1665 pPager->nRec = 0;
1666 if( pPager->errMask!=0 ){
1667 rc = pager_errcode(pPager);
1668 return rc;
1669 }
1670 pPager->origDbSize = pPager->dbSize;
1671 if( journal_format==JOURNAL_FORMAT_3 ){
1672 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3));
1673 if( rc==SQLITE_OK ){
1674 rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
1675 }
1676 if( rc==SQLITE_OK ){
1677 sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
1678 rc = write32bits(&pPager->jfd, pPager->cksumInit);
1679 }
1680 }else if( journal_format==JOURNAL_FORMAT_2 ){
1681 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2));
1682 }else{
1683 assert( journal_format==JOURNAL_FORMAT_1 );
1684 rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1));
1685 }
1686 if( rc==SQLITE_OK ){
1687 rc = write32bits(&pPager->jfd, pPager->dbSize);
1688 }
1689 if( pPager->ckptAutoopen && rc==SQLITE_OK ){
1690 rc = sqlitepager_ckpt_begin(pPager);
1691 }
1692 if( rc!=SQLITE_OK ){
1693 rc = pager_unwritelock(pPager);
1694 if( rc==SQLITE_OK ){
1695 rc = SQLITE_FULL;
1696 }
1697 }
1698 return rc;
1699 }
1700
1701 /*
1702 ** Acquire a write-lock on the database. The lock is removed when
1703 ** the any of the following happen:
1704 **
1705 ** * sqlitepager_commit() is called.
1706 ** * sqlitepager_rollback() is called.
1707 ** * sqlitepager_close() is called.
1708 ** * sqlitepager_unref() is called to on every outstanding page.
1709 **
1710 ** The parameter to this routine is a pointer to any open page of the
1711 ** database file. Nothing changes about the page - it is used merely
1712 ** to acquire a pointer to the Pager structure and as proof that there
1713 ** is already a read-lock on the database.
1714 **
1715 ** A journal file is opened if this is not a temporary file. For
1716 ** temporary files, the opening of the journal file is deferred until
1717 ** there is an actual need to write to the journal.
1718 **
1719 ** If the database is already write-locked, this routine is a no-op.
1720 */
sqlitepager_begin(void * pData)1721 int sqlitepager_begin(void *pData){
1722 PgHdr *pPg = DATA_TO_PGHDR(pData);
1723 Pager *pPager = pPg->pPager;
1724 int rc = SQLITE_OK;
1725 assert( pPg->nRef>0 );
1726 assert( pPager->state!=SQLITE_UNLOCK );
1727 if( pPager->state==SQLITE_READLOCK ){
1728 assert( pPager->aInJournal==0 );
1729 rc = sqliteOsWriteLock(&pPager->fd);
1730 if( rc!=SQLITE_OK ){
1731 return rc;
1732 }
1733 pPager->state = SQLITE_WRITELOCK;
1734 pPager->dirtyFile = 0;
1735 TRACE1("TRANSACTION\n");
1736 if( pPager->useJournal && !pPager->tempFile ){
1737 rc = pager_open_journal(pPager);
1738 }
1739 }
1740 return rc;
1741 }
1742
/*
** Mark a data page as writeable.  The page is written into the journal
** if it is not there already.  This routine must be called before making
** changes to a page.
**
** pData is the pointer to the page's data area; the page header and the
** owning Pager are recovered from it with DATA_TO_PGHDR().
**
** The first time this routine is called, the pager creates a new
** journal and acquires a write lock on the database (via
** sqlitepager_begin() and pager_open_journal()).  If the write
** lock could not be acquired, the error from sqlitepager_begin() is
** returned (SQLITE_BUSY according to the original author's note).  The
** calling routine must check the return value and be careful not to
** change any page data until this routine returns SQLITE_OK.
**
** Return values visible in this routine:
**
**   *  pager_errcode(pPager) if pPager->errMask is already set.
**   *  SQLITE_PERM if the pager is read-only.
**   *  Whatever sqlitepager_begin() or pager_open_journal() returned,
**      if either of them failed.
**   *  The error code from sqliteOsWrite() if writing to the transaction
**      journal or the checkpoint journal failed.  In that case an
**      immediate rollback is done and PAGER_ERR_FULL is set in errMask,
**      so all subsequent write attempts also fail (through the errMask
**      check at the top) until sqlitepager_commit() or
**      sqlitepager_rollback() resets the pager.
**   *  SQLITE_OK otherwise.
*/
int sqlitepager_write(void *pData){
  PgHdr *pPg = DATA_TO_PGHDR(pData);
  Pager *pPager = pPg->pPager;
  int rc = SQLITE_OK;

  /* Check for errors
  */
  if( pPager->errMask ){
    return pager_errcode(pPager);
  }
  if( pPager->readOnly ){
    return SQLITE_PERM;
  }

  /* Mark the page as dirty.  If the page has already been written
  ** to the journal (and, when a checkpoint is in use, to the checkpoint
  ** journal as well) then we can return right away.
  */
  pPg->dirty = 1;
  if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
    pPager->dirtyFile = 1;
    return SQLITE_OK;
  }

  /* If we get this far, it means that the page needs to be
  ** written to the transaction journal or the checkpoint journal
  ** or both.
  **
  ** First check to see that the transaction journal exists and
  ** create it if it does not.
  */
  assert( pPager->state!=SQLITE_UNLOCK );
  rc = sqlitepager_begin(pData);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  assert( pPager->state==SQLITE_WRITELOCK );
  if( !pPager->journalOpen && pPager->useJournal ){
    rc = pager_open_journal(pPager);
    if( rc!=SQLITE_OK ) return rc;
  }
  assert( pPager->journalOpen || !pPager->useJournal );
  pPager->dirtyFile = 1;

  /* The transaction journal now exists and we have a write lock on the
  ** main database file.  Write the current page to the transaction
  ** journal if it is not there already.
  */
  if( !pPg->inJournal && pPager->useJournal ){
    if( (int)pPg->pgno <= pPager->origDbSize ){
      /* The page existed when the transaction started, so its original
      ** content must be journalled.  szPg is the number of bytes written
      ** to the journal for this page, starting 4 bytes before pData.
      */
      int szPg;
      u32 saved;
      if( journal_format>=JOURNAL_FORMAT_3 ){
        /* Format 3 and later: a checksum is stored at offset
        ** SQLITE_PAGE_SIZE and written along with the page.  The first
        ** 4 bytes of the page's extra area are saved here and restored
        ** after the write (presumably because the checksum overwrites
        ** them -- TODO confirm against store32bits()).
        */
        u32 cksum = pager_cksum(pPager, pPg->pgno, pData);
        saved = *(u32*)PGHDR_TO_EXTRA(pPg);
        store32bits(cksum, pPg, SQLITE_PAGE_SIZE);
        szPg = SQLITE_PAGE_SIZE+8;
      }else{
        szPg = SQLITE_PAGE_SIZE+4;
      }
      /* Store the page number at offset -4 so that it precedes the page
      ** data in the record written below.
      */
      store32bits(pPg->pgno, pPg, -4);
      /* NOTE(review): CODEC mode 7 is applied before the journal write and
      ** mode 0 afterwards; presumably encode-for-journal and decode --
      ** confirm against the CODEC macro definition.
      */
      CODEC(pPager, pData, pPg->pgno, 7);
      rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
      TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
      CODEC(pPager, pData, pPg->pgno, 0);
      if( journal_format>=JOURNAL_FORMAT_3 ){
        *(u32*)PGHDR_TO_EXTRA(pPg) = saved;
      }
      if( rc!=SQLITE_OK ){
        /* Journal write failed: undo everything and latch the error. */
        sqlitepager_rollback(pPager);
        pPager->errMask |= PAGER_ERR_FULL;
        return rc;
      }
      pPager->nRec++;
      assert( pPager->aInJournal!=0 );
      /* aInJournal is a bitmap with one bit per page number. */
      pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
      pPg->needSync = !pPager->noSync;
      pPg->inJournal = 1;
      if( pPager->ckptInUse ){
        pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
        page_add_to_ckpt_list(pPg);
      }
    }else{
      /* The page lies beyond the original end of the database file, so
      ** nothing is written to the journal for it.
      */
      pPg->needSync = !pPager->journalStarted && !pPager->noSync;
      TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
    }
    if( pPg->needSync ){
      pPager->needSync = 1;
    }
  }

  /* If the checkpoint journal is open and the page is not in it,
  ** then write the current page to the checkpoint journal.  Note that
  ** the checkpoint journal always uses the simpler format 2 that lacks
  ** checksums.  The header is also omitted from the checkpoint journal.
  */
  if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
    assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
    store32bits(pPg->pgno, pPg, -4);
    CODEC(pPager, pData, pPg->pgno, 7);
    rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
    TRACE2("CKPT-JOURNAL %d\n", pPg->pgno);
    CODEC(pPager, pData, pPg->pgno, 0);
    if( rc!=SQLITE_OK ){
      /* Checkpoint journal write failed: same recovery as above. */
      sqlitepager_rollback(pPager);
      pPager->errMask |= PAGER_ERR_FULL;
      return rc;
    }
    pPager->ckptNRec++;
    assert( pPager->aInCkpt!=0 );
    pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
    page_add_to_ckpt_list(pPg);
  }

  /* Update the database size and return.
  */
  if( pPager->dbSize<(int)pPg->pgno ){
    pPager->dbSize = pPg->pgno;
  }
  return rc;
}
1880
1881 /*
1882 ** Return TRUE if the page given in the argument was previously passed
1883 ** to sqlitepager_write(). In other words, return TRUE if it is ok
1884 ** to change the content of the page.
1885 */
sqlitepager_iswriteable(void * pData)1886 int sqlitepager_iswriteable(void *pData){
1887 PgHdr *pPg = DATA_TO_PGHDR(pData);
1888 return pPg->dirty;
1889 }
1890
1891 /*
1892 ** Replace the content of a single page with the information in the third
1893 ** argument.
1894 */
sqlitepager_overwrite(Pager * pPager,Pgno pgno,void * pData)1895 int sqlitepager_overwrite(Pager *pPager, Pgno pgno, void *pData){
1896 void *pPage;
1897 int rc;
1898
1899 rc = sqlitepager_get(pPager, pgno, &pPage);
1900 if( rc==SQLITE_OK ){
1901 rc = sqlitepager_write(pPage);
1902 if( rc==SQLITE_OK ){
1903 memcpy(pPage, pData, SQLITE_PAGE_SIZE);
1904 }
1905 sqlitepager_unref(pPage);
1906 }
1907 return rc;
1908 }
1909
1910 /*
1911 ** A call to this routine tells the pager that it is not necessary to
1912 ** write the information on page "pgno" back to the disk, even though
1913 ** that page might be marked as dirty.
1914 **
1915 ** The overlying software layer calls this routine when all of the data
1916 ** on the given page is unused. The pager marks the page as clean so
1917 ** that it does not get written to disk.
1918 **
1919 ** Tests show that this optimization, together with the
1920 ** sqlitepager_dont_rollback() below, more than double the speed
1921 ** of large INSERT operations and quadruple the speed of large DELETEs.
1922 **
1923 ** When this routine is called, set the alwaysRollback flag to true.
1924 ** Subsequent calls to sqlitepager_dont_rollback() for the same page
1925 ** will thereafter be ignored. This is necessary to avoid a problem
1926 ** where a page with data is added to the freelist during one part of
1927 ** a transaction then removed from the freelist during a later part
1928 ** of the same transaction and reused for some other purpose. When it
1929 ** is first added to the freelist, this routine is called. When reused,
1930 ** the dont_rollback() routine is called. But because the page contains
1931 ** critical data, we still need to be sure it gets rolled back in spite
1932 ** of the dont_rollback() call.
1933 */
sqlitepager_dont_write(Pager * pPager,Pgno pgno)1934 void sqlitepager_dont_write(Pager *pPager, Pgno pgno){
1935 PgHdr *pPg;
1936
1937 pPg = pager_lookup(pPager, pgno);
1938 pPg->alwaysRollback = 1;
1939 if( pPg && pPg->dirty ){
1940 if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
1941 /* If this pages is the last page in the file and the file has grown
1942 ** during the current transaction, then do NOT mark the page as clean.
1943 ** When the database file grows, we must make sure that the last page
1944 ** gets written at least once so that the disk file will be the correct
1945 ** size. If you do not write this page and the size of the file
1946 ** on the disk ends up being too small, that can lead to database
1947 ** corruption during the next transaction.
1948 */
1949 }else{
1950 TRACE2("DONT_WRITE %d\n", pgno);
1951 pPg->dirty = 0;
1952 }
1953 }
1954 }
1955
1956 /*
1957 ** A call to this routine tells the pager that if a rollback occurs,
1958 ** it is not necessary to restore the data on the given page. This
1959 ** means that the pager does not have to record the given page in the
1960 ** rollback journal.
1961 */
sqlitepager_dont_rollback(void * pData)1962 void sqlitepager_dont_rollback(void *pData){
1963 PgHdr *pPg = DATA_TO_PGHDR(pData);
1964 Pager *pPager = pPg->pPager;
1965
1966 if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return;
1967 if( pPg->alwaysRollback || pPager->alwaysRollback ) return;
1968 if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
1969 assert( pPager->aInJournal!=0 );
1970 pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1971 pPg->inJournal = 1;
1972 if( pPager->ckptInUse ){
1973 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1974 page_add_to_ckpt_list(pPg);
1975 }
1976 TRACE2("DONT_ROLLBACK %d\n", pPg->pgno);
1977 }
1978 if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
1979 assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
1980 assert( pPager->aInCkpt!=0 );
1981 pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
1982 page_add_to_ckpt_list(pPg);
1983 }
1984 }
1985
1986 /*
1987 ** Commit all changes to the database and release the write lock.
1988 **
1989 ** If the commit fails for any reason, a rollback attempt is made
1990 ** and an error code is returned. If the commit worked, SQLITE_OK
1991 ** is returned.
1992 */
sqlitepager_commit(Pager * pPager)1993 int sqlitepager_commit(Pager *pPager){
1994 int rc;
1995 PgHdr *pPg;
1996
1997 if( pPager->errMask==PAGER_ERR_FULL ){
1998 rc = sqlitepager_rollback(pPager);
1999 if( rc==SQLITE_OK ){
2000 rc = SQLITE_FULL;
2001 }
2002 return rc;
2003 }
2004 if( pPager->errMask!=0 ){
2005 rc = pager_errcode(pPager);
2006 return rc;
2007 }
2008 if( pPager->state!=SQLITE_WRITELOCK ){
2009 return SQLITE_ERROR;
2010 }
2011 TRACE1("COMMIT\n");
2012 if( pPager->dirtyFile==0 ){
2013 /* Exit early (without doing the time-consuming sqliteOsSync() calls)
2014 ** if there have been no changes to the database file. */
2015 assert( pPager->needSync==0 );
2016 rc = pager_unwritelock(pPager);
2017 pPager->dbSize = -1;
2018 return rc;
2019 }
2020 assert( pPager->journalOpen );
2021 rc = syncJournal(pPager);
2022 if( rc!=SQLITE_OK ){
2023 goto commit_abort;
2024 }
2025 pPg = pager_get_all_dirty_pages(pPager);
2026 if( pPg ){
2027 rc = pager_write_pagelist(pPg);
2028 if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){
2029 goto commit_abort;
2030 }
2031 }
2032 rc = pager_unwritelock(pPager);
2033 pPager->dbSize = -1;
2034 return rc;
2035
2036 /* Jump here if anything goes wrong during the commit process.
2037 */
2038 commit_abort:
2039 rc = sqlitepager_rollback(pPager);
2040 if( rc==SQLITE_OK ){
2041 rc = SQLITE_FULL;
2042 }
2043 return rc;
2044 }
2045
2046 /*
2047 ** Rollback all changes. The database falls back to read-only mode.
2048 ** All in-memory cache pages revert to their original data contents.
2049 ** The journal is deleted.
2050 **
2051 ** This routine cannot fail unless some other process is not following
2052 ** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
2053 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
2054 ** unless a prior malloc() failed (SQLITE_NOMEM). Appropriate error
2055 ** codes are returned for all these occasions. Otherwise,
2056 ** SQLITE_OK is returned.
2057 */
sqlitepager_rollback(Pager * pPager)2058 int sqlitepager_rollback(Pager *pPager){
2059 int rc;
2060 TRACE1("ROLLBACK\n");
2061 if( !pPager->dirtyFile || !pPager->journalOpen ){
2062 rc = pager_unwritelock(pPager);
2063 pPager->dbSize = -1;
2064 return rc;
2065 }
2066
2067 if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
2068 if( pPager->state>=SQLITE_WRITELOCK ){
2069 pager_playback(pPager, 1);
2070 }
2071 return pager_errcode(pPager);
2072 }
2073 if( pPager->state!=SQLITE_WRITELOCK ){
2074 return SQLITE_OK;
2075 }
2076 rc = pager_playback(pPager, 1);
2077 if( rc!=SQLITE_OK ){
2078 rc = SQLITE_CORRUPT;
2079 pPager->errMask |= PAGER_ERR_CORRUPT;
2080 }
2081 pPager->dbSize = -1;
2082 return rc;
2083 }
2084
2085 /*
2086 ** Return TRUE if the database file is opened read-only. Return FALSE
2087 ** if the database is (in theory) writable.
2088 */
sqlitepager_isreadonly(Pager * pPager)2089 int sqlitepager_isreadonly(Pager *pPager){
2090 return pPager->readOnly;
2091 }
2092
2093 /*
2094 ** This routine is used for testing and analysis only.
2095 */
sqlitepager_stats(Pager * pPager)2096 int *sqlitepager_stats(Pager *pPager){
2097 static int a[9];
2098 a[0] = pPager->nRef;
2099 a[1] = pPager->nPage;
2100 a[2] = pPager->mxPage;
2101 a[3] = pPager->dbSize;
2102 a[4] = pPager->state;
2103 a[5] = pPager->errMask;
2104 a[6] = pPager->nHit;
2105 a[7] = pPager->nMiss;
2106 a[8] = pPager->nOvfl;
2107 return a;
2108 }
2109
/*
** Set the checkpoint.
**
** This routine should be called with the transaction journal already
** open.  A new checkpoint journal is created that can be used to rollback
** changes of a single SQL command within a larger transaction.
**
** If the transaction journal is not open yet, only the ckptAutoopen flag
** is set and SQLITE_OK is returned (presumably so the checkpoint can be
** started once the journal is opened -- confirm against
** pager_open_journal()).
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the checkpoint bitmap
** could not be allocated, or the error code from sqliteOsFileSize()
** (debug builds only) or sqlitepager_opentemp().  On the latter two
** failures the bitmap is freed again before returning.
*/
int sqlitepager_ckpt_begin(Pager *pPager){
  int rc;
  char zTemp[SQLITE_TEMPNAME_SIZE];  /* Buffer passed to sqlitepager_opentemp() */
  if( !pPager->journalOpen ){
    pPager->ckptAutoopen = 1;
    return SQLITE_OK;
  }
  assert( pPager->journalOpen );
  assert( !pPager->ckptInUse );
  /* Allocate the checkpoint bitmap: one bit per page of the current
  ** database size.
  */
  pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 );
  if( pPager->aInCkpt==0 ){
    /* NOTE(review): a read lock is requested on the database file when
    ** the allocation fails; the reason is not evident from this routine
    ** -- confirm the intent before changing.
    */
    sqliteOsReadLock(&pPager->fd);
    return SQLITE_NOMEM;
  }
#ifndef NDEBUG
  /* Debug builds only: verify that the actual journal file size matches
  ** the size computed from the record count.
  */
  rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize);
  if( rc ) goto ckpt_begin_failed;
  assert( pPager->ckptJSize ==
    pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) );
#endif
  /* Remember the journal size and database size at the checkpoint. */
  pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format)
                         + JOURNAL_HDR_SZ(journal_format);
  pPager->ckptSize = pPager->dbSize;
  /* Open the checkpoint journal file the first time it is needed; an
  ** already-open file is reused.
  */
  if( !pPager->ckptOpen ){
    rc = sqlitepager_opentemp(zTemp, &pPager->cpfd);
    if( rc ) goto ckpt_begin_failed;
    pPager->ckptOpen = 1;
    pPager->ckptNRec = 0;
  }
  pPager->ckptInUse = 1;
  return SQLITE_OK;

  /* Failure after the bitmap was allocated: release it and report rc. */
ckpt_begin_failed:
  if( pPager->aInCkpt ){
    sqliteFree(pPager->aInCkpt);
    pPager->aInCkpt = 0;
  }
  return rc;
}
2156
2157 /*
2158 ** Commit a checkpoint.
2159 */
sqlitepager_ckpt_commit(Pager * pPager)2160 int sqlitepager_ckpt_commit(Pager *pPager){
2161 if( pPager->ckptInUse ){
2162 PgHdr *pPg, *pNext;
2163 sqliteOsSeek(&pPager->cpfd, 0);
2164 /* sqliteOsTruncate(&pPager->cpfd, 0); */
2165 pPager->ckptNRec = 0;
2166 pPager->ckptInUse = 0;
2167 sqliteFree( pPager->aInCkpt );
2168 pPager->aInCkpt = 0;
2169 for(pPg=pPager->pCkpt; pPg; pPg=pNext){
2170 pNext = pPg->pNextCkpt;
2171 assert( pPg->inCkpt );
2172 pPg->inCkpt = 0;
2173 pPg->pPrevCkpt = pPg->pNextCkpt = 0;
2174 }
2175 pPager->pCkpt = 0;
2176 }
2177 pPager->ckptAutoopen = 0;
2178 return SQLITE_OK;
2179 }
2180
2181 /*
2182 ** Rollback a checkpoint.
2183 */
sqlitepager_ckpt_rollback(Pager * pPager)2184 int sqlitepager_ckpt_rollback(Pager *pPager){
2185 int rc;
2186 if( pPager->ckptInUse ){
2187 rc = pager_ckpt_playback(pPager);
2188 sqlitepager_ckpt_commit(pPager);
2189 }else{
2190 rc = SQLITE_OK;
2191 }
2192 pPager->ckptAutoopen = 0;
2193 return rc;
2194 }
2195
2196 /*
2197 ** Return the full pathname of the database file.
2198 */
sqlitepager_filename(Pager * pPager)2199 const char *sqlitepager_filename(Pager *pPager){
2200 return pPager->zFilename;
2201 }
2202
2203 /*
2204 ** Set the codec for this pager
2205 */
sqlitepager_set_codec(Pager * pPager,void (* xCodec)(void *,void *,Pgno,int),void * pCodecArg)2206 void sqlitepager_set_codec(
2207 Pager *pPager,
2208 void (*xCodec)(void*,void*,Pgno,int),
2209 void *pCodecArg
2210 ){
2211 pPager->xCodec = xCodec;
2212 pPager->pCodecArg = pCodecArg;
2213 }
2214
2215 #ifdef SQLITE_TEST
2216 /*
2217 ** Print a listing of all referenced pages and their ref count.
2218 */
sqlitepager_refdump(Pager * pPager)2219 void sqlitepager_refdump(Pager *pPager){
2220 PgHdr *pPg;
2221 for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
2222 if( pPg->nRef<=0 ) continue;
2223 printf("PAGE %3d addr=0x%08x nRef=%d\n",
2224 pPg->pgno, (int)PGHDR_TO_DATA(pPg), pPg->nRef);
2225 }
2226 }
2227 #endif
2228