1 /* 2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 ** 2001 September 15 8 ** 9 ** The author disclaims copyright to this source code. In place of 10 ** a legal notice, here is a blessing: 11 ** 12 ** May you do good and not evil. 13 ** May you find forgiveness for yourself and forgive others. 14 ** May you share freely, never taking more than you give. 15 ** 16 ************************************************************************* 17 ** $Id: btree.c,v 1.103 2004/03/10 13:42:38 drh Exp $ 18 ** 19 ** This file implements a external (disk-based) database using BTrees. 20 ** For a detailed discussion of BTrees, refer to 21 ** 22 ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: 23 ** "Sorting And Searching", pages 473-480. Addison-Wesley 24 ** Publishing Company, Reading, Massachusetts. 25 ** 26 ** The basic idea is that each page of the file contains N database 27 ** entries and N+1 pointers to subpages. 28 ** 29 ** ---------------------------------------------------------------- 30 ** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) | 31 ** ---------------------------------------------------------------- 32 ** 33 ** All of the keys on the page that Ptr(0) points to have values less 34 ** than Key(0). All of the keys on page Ptr(1) and its subpages have 35 ** values greater than Key(0) and less than Key(1). All of the keys 36 ** on Ptr(N+1) and its subpages have values greater than Key(N). And 37 ** so forth. 38 ** 39 ** Finding a particular key requires reading O(log(M)) pages from the 40 ** disk where M is the number of entries in the tree. 41 ** 42 ** In this implementation, a single file can hold one or more separate 43 ** BTrees. Each BTree is identified by the index of its root page. The 44 ** key and data for any entry are combined to form the "payload". Up to 45 ** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the 46 ** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes 47 ** then surplus bytes are stored on overflow pages. The payload for an 48 ** entry and the preceding pointer are combined to form a "Cell". Each 49 ** page has a small header which contains the Ptr(N+1) pointer. 50 ** 51 ** The first page of the file contains a magic string used to verify that 52 ** the file really is a valid BTree database, a pointer to a list of unused 53 ** pages in the file, and some meta information. The root of the first 54 ** BTree begins on page 2 of the file. (Pages are numbered beginning with 55 ** 1, not 0.) Thus a minimum database contains 2 pages. 56 */ 57 #include "sqliteInt.h" 58 #include "pager.h" 59 #include "btree.h" 60 #include <assert.h> 61 62 /* Forward declarations */ 63 static BtOps sqliteBtreeOps; 64 static BtCursorOps sqliteBtreeCursorOps; 65 66 /* 67 ** Macros used for byteswapping. B is a pointer to the Btree 68 ** structure. This is needed to access the Btree.needSwab boolean 69 ** in order to tell if byte swapping is needed or not. 70 ** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer. 71 ** SWAB32 byteswaps a 32-bit integer. 72 */ 73 #define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X)) 74 #define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X)) 75 #define SWAB_ADD(B,X,A) \ 76 if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); } 77 78 /* 79 ** The following global variable - available only if SQLITE_TEST is 80 ** defined - is used to determine whether new databases are created in 81 ** native byte order or in non-native byte order. Non-native byte order 82 ** databases are created for testing purposes only. Under normal operation, 83 ** only native byte-order databases should be created, but we should be 84 ** able to read or write existing databases regardless of the byteorder. 85 */ 86 #ifdef SQLITE_TEST 87 int btree_native_byte_order = 1; 88 #else 89 # define btree_native_byte_order 1 90 #endif 91 92 /* 93 ** Forward declarations of structures used only in this file. 94 */ 95 typedef struct PageOne PageOne; 96 typedef struct MemPage MemPage; 97 typedef struct PageHdr PageHdr; 98 typedef struct Cell Cell; 99 typedef struct CellHdr CellHdr; 100 typedef struct FreeBlk FreeBlk; 101 typedef struct OverflowPage OverflowPage; 102 typedef struct FreelistInfo FreelistInfo; 103 104 /* 105 ** All structures on a database page are aligned to 4-byte boundries. 106 ** This routine rounds up a number of bytes to the next multiple of 4. 107 ** 108 ** This might need to change for computer architectures that require 109 ** and 8-byte alignment boundry for structures. 110 */ 111 #define ROUNDUP(X) ((X+3) & ~3) 112 113 /* 114 ** This is a magic string that appears at the beginning of every 115 ** SQLite database in order to identify the file as a real database. 116 */ 117 static const char zMagicHeader[] = 118 "** This file contains an SQLite 2.1 database **"; 119 #define MAGIC_SIZE (sizeof(zMagicHeader)) 120 121 /* 122 ** This is a magic integer also used to test the integrity of the database 123 ** file. This integer is used in addition to the string above so that 124 ** if the file is written on a little-endian architecture and read 125 ** on a big-endian architectures (or vice versa) we can detect the 126 ** problem. 127 ** 128 ** The number used was obtained at random and has no special 129 ** significance other than the fact that it represents a different 130 ** integer on little-endian and big-endian machines. 131 */ 132 #define MAGIC 0xdae37528 133 134 /* 135 ** The first page of the database file contains a magic header string 136 ** to identify the file as an SQLite database file. It also contains 137 ** a pointer to the first free page of the file. Page 2 contains the 138 ** root of the principle BTree. The file might contain other BTrees 139 ** rooted on pages above 2. 140 ** 141 ** The first page also contains SQLITE_N_BTREE_META integers that 142 ** can be used by higher-level routines. 143 ** 144 ** Remember that pages are numbered beginning with 1. (See pager.c 145 ** for additional information.) Page 0 does not exist and a page 146 ** number of 0 is used to mean "no such page". 147 */ 148 struct PageOne { 149 char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */ 150 int iMagic; /* Integer to verify correct byte order */ 151 Pgno freeList; /* First free page in a list of all free pages */ 152 int nFree; /* Number of pages on the free list */ 153 int aMeta[SQLITE_N_BTREE_META-1]; /* User defined integers */ 154 }; 155 156 /* 157 ** Each database page has a header that is an instance of this 158 ** structure. 159 ** 160 ** PageHdr.firstFree is 0 if there is no free space on this page. 161 ** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a 162 ** FreeBlk structure that describes the first block of free space. 163 ** All free space is defined by a linked list of FreeBlk structures. 164 ** 165 ** Data is stored in a linked list of Cell structures. PageHdr.firstCell 166 ** is the index into MemPage.u.aDisk[] of the first cell on the page. The 167 ** Cells are kept in sorted order. 168 ** 169 ** A Cell contains all information about a database entry and a pointer 170 ** to a child page that contains other entries less than itself. In 171 ** other words, the i-th Cell contains both Ptr(i) and Key(i). The 172 ** right-most pointer of the page is contained in PageHdr.rightChild. 173 */ 174 struct PageHdr { 175 Pgno rightChild; /* Child page that comes after all cells on this page */ 176 u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */ 177 u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */ 178 }; 179 180 /* 181 ** Entries on a page of the database are called "Cells". Each Cell 182 ** has a header and data. This structure defines the header. The 183 ** key and data (collectively the "payload") follow this header on 184 ** the database page. 185 ** 186 ** A definition of the complete Cell structure is given below. The 187 ** header for the cell must be defined first in order to do some 188 ** of the sizing #defines that follow. 189 */ 190 struct CellHdr { 191 Pgno leftChild; /* Child page that comes before this cell */ 192 u16 nKey; /* Number of bytes in the key */ 193 u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */ 194 u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */ 195 u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */ 196 u16 nData; /* Number of bytes of data */ 197 }; 198 199 /* 200 ** The key and data size are split into a lower 16-bit segment and an 201 ** upper 8-bit segment in order to pack them together into a smaller 202 ** space. The following macros reassembly a key or data size back 203 ** into an integer. 204 */ 205 #define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536) 206 #define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536) 207 208 /* 209 ** The minimum size of a complete Cell. The Cell must contain a header 210 ** and at least 4 bytes of payload. 211 */ 212 #define MIN_CELL_SIZE (sizeof(CellHdr)+4) 213 214 /* 215 ** The maximum number of database entries that can be held in a single 216 ** page of the database. 217 */ 218 #define MX_CELL ((SQLITE_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE) 219 220 /* 221 ** The amount of usable space on a single page of the BTree. This is the 222 ** page size minus the overhead of the page header. 223 */ 224 #define USABLE_SPACE (SQLITE_USABLE_SIZE - sizeof(PageHdr)) 225 226 /* 227 ** The maximum amount of payload (in bytes) that can be stored locally for 228 ** a database entry. If the entry contains more data than this, the 229 ** extra goes onto overflow pages. 230 ** 231 ** This number is chosen so that at least 4 cells will fit on every page. 232 */ 233 #define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3) 234 235 /* 236 ** Data on a database page is stored as a linked list of Cell structures. 237 ** Both the key and the data are stored in aPayload[]. The key always comes 238 ** first. The aPayload[] field grows as necessary to hold the key and data, 239 ** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and 240 ** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the 241 ** page number of the first overflow page. 242 ** 243 ** Though this structure is fixed in size, the Cell on the database 244 ** page varies in size. Every cell has a CellHdr and at least 4 bytes 245 ** of payload space. Additional payload bytes (up to the maximum of 246 ** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as 247 ** needed. 248 */ 249 struct Cell { 250 CellHdr h; /* The cell header */ 251 char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */ 252 Pgno ovfl; /* The first overflow page */ 253 }; 254 255 /* 256 ** Free space on a page is remembered using a linked list of the FreeBlk 257 ** structures. Space on a database page is allocated in increments of 258 ** at least 4 bytes and is always aligned to a 4-byte boundry. The 259 ** linked list of FreeBlks is always kept in order by address. 260 */ 261 struct FreeBlk { 262 u16 iSize; /* Number of bytes in this block of free space */ 263 u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */ 264 }; 265 266 /* 267 ** The number of bytes of payload that will fit on a single overflow page. 268 */ 269 #define OVERFLOW_SIZE (SQLITE_USABLE_SIZE-sizeof(Pgno)) 270 271 /* 272 ** When the key and data for a single entry in the BTree will not fit in 273 ** the MX_LOCAL_PAYLOAD bytes of space available on the database page, 274 ** then all extra bytes are written to a linked list of overflow pages. 275 ** Each overflow page is an instance of the following structure. 276 ** 277 ** Unused pages in the database are also represented by instances of 278 ** the OverflowPage structure. The PageOne.freeList field is the 279 ** page number of the first page in a linked list of unused database 280 ** pages. 281 */ 282 struct OverflowPage { 283 Pgno iNext; 284 char aPayload[OVERFLOW_SIZE]; 285 }; 286 287 /* 288 ** The PageOne.freeList field points to a linked list of overflow pages 289 ** hold information about free pages. The aPayload section of each 290 ** overflow page contains an instance of the following structure. The 291 ** aFree[] array holds the page number of nFree unused pages in the disk 292 ** file. 293 */ 294 struct FreelistInfo { 295 int nFree; 296 Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)]; 297 }; 298 299 /* 300 ** For every page in the database file, an instance of the following structure 301 ** is stored in memory. The u.aDisk[] array contains the raw bits read from 302 ** the disk. The rest is auxiliary information held in memory only. The 303 ** auxiliary info is only valid for regular database pages - it is not 304 ** used for overflow pages and pages on the freelist. 305 ** 306 ** Of particular interest in the auxiliary info is the apCell[] entry. Each 307 ** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are 308 ** put in this array so that they can be accessed in constant time, rather 309 ** than in linear time which would be needed if we had to walk the linked 310 ** list on every access. 311 ** 312 ** Note that apCell[] contains enough space to hold up to two more Cells 313 ** than can possibly fit on one page. In the steady state, every apCell[] 314 ** points to memory inside u.aDisk[]. But in the middle of an insert 315 ** operation, some apCell[] entries may temporarily point to data space 316 ** outside of u.aDisk[]. This is a transient situation that is quickly 317 ** resolved. But while it is happening, it is possible for a database 318 ** page to hold as many as two more cells than it might otherwise hold. 319 ** The extra two entries in apCell[] are an allowance for this situation. 320 ** 321 ** The pParent field points back to the parent page. This allows us to 322 ** walk up the BTree from any leaf to the root. Care must be taken to 323 ** unref() the parent page pointer when this page is no longer referenced. 324 ** The pageDestructor() routine handles that chore. 325 */ 326 struct MemPage { 327 union u_page_data { 328 char aDisk[SQLITE_PAGE_SIZE]; /* Page data stored on disk */ 329 PageHdr hdr; /* Overlay page header */ 330 } u; 331 u8 isInit; /* True if auxiliary data is initialized */ 332 u8 idxShift; /* True if apCell[] indices have changed */ 333 u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */ 334 MemPage *pParent; /* The parent of this page. NULL for root */ 335 int idxParent; /* Index in pParent->apCell[] of this node */ 336 int nFree; /* Number of free bytes in u.aDisk[] */ 337 int nCell; /* Number of entries on this page */ 338 Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */ 339 }; 340 341 /* 342 ** The in-memory image of a disk page has the auxiliary information appended 343 ** to the end. EXTRA_SIZE is the number of bytes of space needed to hold 344 ** that extra information. 345 */ 346 #define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data)) 347 348 /* 349 ** Everything we need to know about an open database 350 */ 351 struct Btree { 352 BtOps *pOps; /* Function table */ 353 Pager *pPager; /* The page cache */ 354 BtCursor *pCursor; /* A list of all open cursors */ 355 PageOne *page1; /* First page of the database */ 356 u8 inTrans; /* True if a transaction is in progress */ 357 u8 inCkpt; /* True if there is a checkpoint on the transaction */ 358 u8 readOnly; /* True if the underlying file is readonly */ 359 u8 needSwab; /* Need to byte-swapping */ 360 }; 361 typedef Btree Bt; 362 363 /* 364 ** A cursor is a pointer to a particular entry in the BTree. 365 ** The entry is identified by its MemPage and the index in 366 ** MemPage.apCell[] of the entry. 367 */ 368 struct BtCursor { 369 BtCursorOps *pOps; /* Function table */ 370 Btree *pBt; /* The Btree to which this cursor belongs */ 371 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */ 372 BtCursor *pShared; /* Loop of cursors with the same root page */ 373 Pgno pgnoRoot; /* The root page of this tree */ 374 MemPage *pPage; /* Page that contains the entry */ 375 int idx; /* Index of the entry in pPage->apCell[] */ 376 u8 wrFlag; /* True if writable */ 377 u8 eSkip; /* Determines if next step operation is a no-op */ 378 u8 iMatch; /* compare result from last sqliteBtreeMoveto() */ 379 }; 380 381 /* 382 ** Legal values for BtCursor.eSkip. 383 */ 384 #define SKIP_NONE 0 /* Always step the cursor */ 385 #define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */ 386 #define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */ 387 #define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */ 388 389 /* Forward declarations */ 390 static int fileBtreeCloseCursor(BtCursor *pCur); 391 392 /* 393 ** Routines for byte swapping. 394 */ 395 u16 swab16(u16 x){ 396 return ((x & 0xff)<<8) | ((x>>8)&0xff); 397 } 398 u32 swab32(u32 x){ 399 return ((x & 0xff)<<24) | ((x & 0xff00)<<8) | 400 ((x>>8) & 0xff00) | ((x>>24)&0xff); 401 } 402 403 /* 404 ** Compute the total number of bytes that a Cell needs on the main 405 ** database page. The number returned includes the Cell header, 406 ** local payload storage, and the pointer to overflow pages (if 407 ** applicable). Additional space allocated on overflow pages 408 ** is NOT included in the value returned from this routine. 409 */ 410 static int cellSize(Btree *pBt, Cell *pCell){ 411 int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); 412 if( n>MX_LOCAL_PAYLOAD ){ 413 n = MX_LOCAL_PAYLOAD + sizeof(Pgno); 414 }else{ 415 n = ROUNDUP(n); 416 } 417 n += sizeof(CellHdr); 418 return n; 419 } 420 421 /* 422 ** Defragment the page given. All Cells are moved to the 423 ** beginning of the page and all free space is collected 424 ** into one big FreeBlk at the end of the page. 425 */ 426 static void defragmentPage(Btree *pBt, MemPage *pPage){ 427 int pc, i, n; 428 FreeBlk *pFBlk; 429 char newPage[SQLITE_USABLE_SIZE]; 430 431 assert( sqlitepager_iswriteable(pPage) ); 432 assert( pPage->isInit ); 433 pc = sizeof(PageHdr); 434 pPage->u.hdr.firstCell = SWAB16(pBt, pc); 435 memcpy(newPage, pPage->u.aDisk, pc); 436 for(i=0; i<pPage->nCell; i++){ 437 Cell *pCell = pPage->apCell[i]; 438 439 /* This routine should never be called on an overfull page. The 440 ** following asserts verify that constraint. */ 441 assert( Addr(pCell) > Addr(pPage) ); 442 assert( Addr(pCell) < Addr(pPage) + SQLITE_USABLE_SIZE ); 443 444 n = cellSize(pBt, pCell); 445 pCell->h.iNext = SWAB16(pBt, pc + n); 446 memcpy(&newPage[pc], pCell, n); 447 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc]; 448 pc += n; 449 } 450 assert( pPage->nFree==SQLITE_USABLE_SIZE-pc ); 451 memcpy(pPage->u.aDisk, newPage, pc); 452 if( pPage->nCell>0 ){ 453 pPage->apCell[pPage->nCell-1]->h.iNext = 0; 454 } 455 pFBlk = (FreeBlk*)&pPage->u.aDisk[pc]; 456 pFBlk->iSize = SWAB16(pBt, SQLITE_USABLE_SIZE - pc); 457 pFBlk->iNext = 0; 458 pPage->u.hdr.firstFree = SWAB16(pBt, pc); 459 memset(&pFBlk[1], 0, SQLITE_USABLE_SIZE - pc - sizeof(FreeBlk)); 460 } 461 462 /* 463 ** Allocate nByte bytes of space on a page. nByte must be a 464 ** multiple of 4. 465 ** 466 ** Return the index into pPage->u.aDisk[] of the first byte of 467 ** the new allocation. Or return 0 if there is not enough free 468 ** space on the page to satisfy the allocation request. 469 ** 470 ** If the page contains nBytes of free space but does not contain 471 ** nBytes of contiguous free space, then this routine automatically 472 ** calls defragementPage() to consolidate all free space before 473 ** allocating the new chunk. 474 */ 475 static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){ 476 FreeBlk *p; 477 u16 *pIdx; 478 int start; 479 int iSize; 480 #ifndef NDEBUG 481 int cnt = 0; 482 #endif 483 484 assert( sqlitepager_iswriteable(pPage) ); 485 assert( nByte==ROUNDUP(nByte) ); 486 assert( pPage->isInit ); 487 if( pPage->nFree<nByte || pPage->isOverfull ) return 0; 488 pIdx = &pPage->u.hdr.firstFree; 489 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)]; 490 while( (iSize = SWAB16(pBt, p->iSize))<nByte ){ 491 assert( cnt++ < SQLITE_USABLE_SIZE/4 ); 492 if( p->iNext==0 ){ 493 defragmentPage(pBt, pPage); 494 pIdx = &pPage->u.hdr.firstFree; 495 }else{ 496 pIdx = &p->iNext; 497 } 498 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)]; 499 } 500 if( iSize==nByte ){ 501 start = SWAB16(pBt, *pIdx); 502 *pIdx = p->iNext; 503 }else{ 504 FreeBlk *pNew; 505 start = SWAB16(pBt, *pIdx); 506 pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte]; 507 pNew->iNext = p->iNext; 508 pNew->iSize = SWAB16(pBt, iSize - nByte); 509 *pIdx = SWAB16(pBt, start + nByte); 510 } 511 pPage->nFree -= nByte; 512 return start; 513 } 514 515 /* 516 ** Return a section of the MemPage.u.aDisk[] to the freelist. 517 ** The first byte of the new free block is pPage->u.aDisk[start] 518 ** and the size of the block is "size" bytes. Size must be 519 ** a multiple of 4. 520 ** 521 ** Most of the effort here is involved in coalesing adjacent 522 ** free blocks into a single big free block. 523 */ 524 static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){ 525 int end = start + size; 526 u16 *pIdx, idx; 527 FreeBlk *pFBlk; 528 FreeBlk *pNew; 529 FreeBlk *pNext; 530 int iSize; 531 532 assert( sqlitepager_iswriteable(pPage) ); 533 assert( size == ROUNDUP(size) ); 534 assert( start == ROUNDUP(start) ); 535 assert( pPage->isInit ); 536 pIdx = &pPage->u.hdr.firstFree; 537 idx = SWAB16(pBt, *pIdx); 538 while( idx!=0 && idx<start ){ 539 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx]; 540 iSize = SWAB16(pBt, pFBlk->iSize); 541 if( idx + iSize == start ){ 542 pFBlk->iSize = SWAB16(pBt, iSize + size); 543 if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){ 544 pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size]; 545 if( pBt->needSwab ){ 546 pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size); 547 }else{ 548 pFBlk->iSize += pNext->iSize; 549 } 550 pFBlk->iNext = pNext->iNext; 551 } 552 pPage->nFree += size; 553 return; 554 } 555 pIdx = &pFBlk->iNext; 556 idx = SWAB16(pBt, *pIdx); 557 } 558 pNew = (FreeBlk*)&pPage->u.aDisk[start]; 559 if( idx != end ){ 560 pNew->iSize = SWAB16(pBt, size); 561 pNew->iNext = SWAB16(pBt, idx); 562 }else{ 563 pNext = (FreeBlk*)&pPage->u.aDisk[idx]; 564 pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize)); 565 pNew->iNext = pNext->iNext; 566 } 567 *pIdx = SWAB16(pBt, start); 568 pPage->nFree += size; 569 } 570 571 /* 572 ** Initialize the auxiliary information for a disk block. 573 ** 574 ** The pParent parameter must be a pointer to the MemPage which 575 ** is the parent of the page being initialized. The root of the 576 ** BTree (usually page 2) has no parent and so for that page, 577 ** pParent==NULL. 578 ** 579 ** Return SQLITE_OK on success. If we see that the page does 580 ** not contain a well-formed database page, then return 581 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 582 ** guarantee that the page is well-formed. It only shows that 583 ** we failed to detect any corruption. 584 */ 585 static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){ 586 int idx; /* An index into pPage->u.aDisk[] */ 587 Cell *pCell; /* A pointer to a Cell in pPage->u.aDisk[] */ 588 FreeBlk *pFBlk; /* A pointer to a free block in pPage->u.aDisk[] */ 589 int sz; /* The size of a Cell in bytes */ 590 int freeSpace; /* Amount of free space on the page */ 591 592 if( pPage->pParent ){ 593 assert( pPage->pParent==pParent ); 594 return SQLITE_OK; 595 } 596 if( pParent ){ 597 pPage->pParent = pParent; 598 sqlitepager_ref(pParent); 599 } 600 if( pPage->isInit ) return SQLITE_OK; 601 pPage->isInit = 1; 602 pPage->nCell = 0; 603 freeSpace = USABLE_SPACE; 604 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 605 while( idx!=0 ){ 606 if( idx>SQLITE_USABLE_SIZE-MIN_CELL_SIZE ) goto page_format_error; 607 if( idx<sizeof(PageHdr) ) goto page_format_error; 608 if( idx!=ROUNDUP(idx) ) goto page_format_error; 609 pCell = (Cell*)&pPage->u.aDisk[idx]; 610 sz = cellSize(pBt, pCell); 611 if( idx+sz > SQLITE_USABLE_SIZE ) goto page_format_error; 612 freeSpace -= sz; 613 pPage->apCell[pPage->nCell++] = pCell; 614 idx = SWAB16(pBt, pCell->h.iNext); 615 } 616 pPage->nFree = 0; 617 idx = SWAB16(pBt, pPage->u.hdr.firstFree); 618 while( idx!=0 ){ 619 int iNext; 620 if( idx>SQLITE_USABLE_SIZE-sizeof(FreeBlk) ) goto page_format_error; 621 if( idx<sizeof(PageHdr) ) goto page_format_error; 622 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx]; 623 pPage->nFree += SWAB16(pBt, pFBlk->iSize); 624 iNext = SWAB16(pBt, pFBlk->iNext); 625 if( iNext>0 && iNext <= idx ) goto page_format_error; 626 idx = iNext; 627 } 628 if( pPage->nCell==0 && pPage->nFree==0 ){ 629 /* As a special case, an uninitialized root page appears to be 630 ** an empty database */ 631 return SQLITE_OK; 632 } 633 if( pPage->nFree!=freeSpace ) goto page_format_error; 634 return SQLITE_OK; 635 636 page_format_error: 637 return SQLITE_CORRUPT; 638 } 639 640 /* 641 ** Set up a raw page so that it looks like a database page holding 642 ** no entries. 643 */ 644 static void zeroPage(Btree *pBt, MemPage *pPage){ 645 PageHdr *pHdr; 646 FreeBlk *pFBlk; 647 assert( sqlitepager_iswriteable(pPage) ); 648 memset(pPage, 0, SQLITE_USABLE_SIZE); 649 pHdr = &pPage->u.hdr; 650 pHdr->firstCell = 0; 651 pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr)); 652 pFBlk = (FreeBlk*)&pHdr[1]; 653 pFBlk->iNext = 0; 654 pPage->nFree = SQLITE_USABLE_SIZE - sizeof(*pHdr); 655 pFBlk->iSize = SWAB16(pBt, pPage->nFree); 656 pPage->nCell = 0; 657 pPage->isOverfull = 0; 658 } 659 660 /* 661 ** This routine is called when the reference count for a page 662 ** reaches zero. We need to unref the pParent pointer when that 663 ** happens. 664 */ 665 static void pageDestructor(void *pData){ 666 MemPage *pPage = (MemPage*)pData; 667 if( pPage->pParent ){ 668 MemPage *pParent = pPage->pParent; 669 pPage->pParent = 0; 670 sqlitepager_unref(pParent); 671 } 672 } 673 674 /* 675 ** Open a new database. 676 ** 677 ** Actually, this routine just sets up the internal data structures 678 ** for accessing the database. We do not open the database file 679 ** until the first page is loaded. 680 ** 681 ** zFilename is the name of the database file. If zFilename is NULL 682 ** a new database with a random name is created. This randomly named 683 ** database file will be deleted when sqliteBtreeClose() is called. 684 */ 685 int sqliteBtreeOpen( 686 const char *zFilename, /* Name of the file containing the BTree database */ 687 int omitJournal, /* if TRUE then do not journal this file */ 688 int nCache, /* How many pages in the page cache */ 689 Btree **ppBtree /* Pointer to new Btree object written here */ 690 ){ 691 Btree *pBt; 692 int rc; 693 694 /* 695 ** The following asserts make sure that structures used by the btree are 696 ** the right size. This is to guard against size changes that result 697 ** when compiling on a different architecture. 698 */ 699 assert( sizeof(u32)==4 ); 700 assert( sizeof(u16)==2 ); 701 assert( sizeof(Pgno)==4 ); 702 assert( sizeof(PageHdr)==8 ); 703 assert( sizeof(CellHdr)==12 ); 704 assert( sizeof(FreeBlk)==4 ); 705 assert( sizeof(OverflowPage)==SQLITE_USABLE_SIZE ); 706 assert( sizeof(FreelistInfo)==OVERFLOW_SIZE ); 707 assert( sizeof(ptr)==sizeof(char*) ); 708 assert( sizeof(uptr)==sizeof(ptr) ); 709 710 pBt = sqliteMalloc( sizeof(*pBt) ); 711 if( pBt==0 ){ 712 *ppBtree = 0; 713 return SQLITE_NOMEM; 714 } 715 if( nCache<10 ) nCache = 10; 716 rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE, 717 !omitJournal); 718 if( rc!=SQLITE_OK ){ 719 if( pBt->pPager ) sqlitepager_close(pBt->pPager); 720 sqliteFree(pBt); 721 *ppBtree = 0; 722 return rc; 723 } 724 sqlitepager_set_destructor(pBt->pPager, pageDestructor); 725 pBt->pCursor = 0; 726 pBt->page1 = 0; 727 pBt->readOnly = sqlitepager_isreadonly(pBt->pPager); 728 pBt->pOps = &sqliteBtreeOps; 729 *ppBtree = pBt; 730 return SQLITE_OK; 731 } 732 733 /* 734 ** Close an open database and invalidate all cursors. 735 */ 736 static int fileBtreeClose(Btree *pBt){ 737 while( pBt->pCursor ){ 738 fileBtreeCloseCursor(pBt->pCursor); 739 } 740 sqlitepager_close(pBt->pPager); 741 sqliteFree(pBt); 742 return SQLITE_OK; 743 } 744 745 /* 746 ** Change the limit on the number of pages allowed in the cache. 747 ** 748 ** The maximum number of cache pages is set to the absolute 749 ** value of mxPage. If mxPage is negative, the pager will 750 ** operate asynchronously - it will not stop to do fsync()s 751 ** to insure data is written to the disk surface before 752 ** continuing. Transactions still work if synchronous is off, 753 ** and the database cannot be corrupted if this program 754 ** crashes. But if the operating system crashes or there is 755 ** an abrupt power failure when synchronous is off, the database 756 ** could be left in an inconsistent and unrecoverable state. 757 ** Synchronous is on by default so database corruption is not 758 ** normally a worry. 759 */ 760 static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){ 761 sqlitepager_set_cachesize(pBt->pPager, mxPage); 762 return SQLITE_OK; 763 } 764 765 /* 766 ** Change the way data is synced to disk in order to increase or decrease 767 ** how well the database resists damage due to OS crashes and power 768 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 769 ** there is a high probability of damage) Level 2 is the default. There 770 ** is a very low but non-zero probability of damage. Level 3 reduces the 771 ** probability of damage to near zero but with a write performance reduction. 772 */ 773 static int fileBtreeSetSafetyLevel(Btree *pBt, int level){ 774 sqlitepager_set_safety_level(pBt->pPager, level); 775 return SQLITE_OK; 776 } 777 778 /* 779 ** Get a reference to page1 of the database file. This will 780 ** also acquire a readlock on that file. 781 ** 782 ** SQLITE_OK is returned on success. If the file is not a 783 ** well-formed database file, then SQLITE_CORRUPT is returned. 784 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 785 ** is returned if we run out of memory. SQLITE_PROTOCOL is returned 786 ** if there is a locking protocol violation. 787 */ 788 static int lockBtree(Btree *pBt){ 789 int rc; 790 if( pBt->page1 ) return SQLITE_OK; 791 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1); 792 if( rc!=SQLITE_OK ) return rc; 793 794 /* Do some checking to help insure the file we opened really is 795 ** a valid database file. 796 */ 797 if( sqlitepager_pagecount(pBt->pPager)>0 ){ 798 PageOne *pP1 = pBt->page1; 799 if( strcmp(pP1->zMagic,zMagicHeader)!=0 || 800 (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){ 801 rc = SQLITE_NOTADB; 802 goto page1_init_failed; 803 } 804 pBt->needSwab = pP1->iMagic!=MAGIC; 805 } 806 return rc; 807 808 page1_init_failed: 809 sqlitepager_unref(pBt->page1); 810 pBt->page1 = 0; 811 return rc; 812 } 813 814 /* 815 ** If there are no outstanding cursors and we are not in the middle 816 ** of a transaction but there is a read lock on the database, then 817 ** this routine unrefs the first page of the database file which 818 ** has the effect of releasing the read lock. 819 ** 820 ** If there are any outstanding cursors, this routine is a no-op. 821 ** 822 ** If there is a transaction in progress, this routine is a no-op. 823 */ 824 static void unlockBtreeIfUnused(Btree *pBt){ 825 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){ 826 sqlitepager_unref(pBt->page1); 827 pBt->page1 = 0; 828 pBt->inTrans = 0; 829 pBt->inCkpt = 0; 830 } 831 } 832 833 /* 834 ** Create a new database by initializing the first two pages of the 835 ** file. 836 */ 837 static int newDatabase(Btree *pBt){ 838 MemPage *pRoot; 839 PageOne *pP1; 840 int rc; 841 if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK; 842 pP1 = pBt->page1; 843 rc = sqlitepager_write(pBt->page1); 844 if( rc ) return rc; 845 rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot); 846 if( rc ) return rc; 847 rc = sqlitepager_write(pRoot); 848 if( rc ){ 849 sqlitepager_unref(pRoot); 850 return rc; 851 } 852 strcpy(pP1->zMagic, zMagicHeader); 853 if( btree_native_byte_order ){ 854 pP1->iMagic = MAGIC; 855 pBt->needSwab = 0; 856 }else{ 857 pP1->iMagic = swab32(MAGIC); 858 pBt->needSwab = 1; 859 } 860 zeroPage(pBt, pRoot); 861 sqlitepager_unref(pRoot); 862 return SQLITE_OK; 863 } 864 865 /* 866 ** Attempt to start a new transaction. 867 ** 868 ** A transaction must be started before attempting any changes 869 ** to the database. None of the following routines will work 870 ** unless a transaction is started first: 871 ** 872 ** sqliteBtreeCreateTable() 873 ** sqliteBtreeCreateIndex() 874 ** sqliteBtreeClearTable() 875 ** sqliteBtreeDropTable() 876 ** sqliteBtreeInsert() 877 ** sqliteBtreeDelete() 878 ** sqliteBtreeUpdateMeta() 879 */ 880 static int fileBtreeBeginTrans(Btree *pBt){ 881 int rc; 882 if( pBt->inTrans ) return SQLITE_ERROR; 883 if( pBt->readOnly ) return SQLITE_READONLY; 884 if( pBt->page1==0 ){ 885 rc = lockBtree(pBt); 886 if( rc!=SQLITE_OK ){ 887 return rc; 888 } 889 } 890 rc = sqlitepager_begin(pBt->page1); 891 if( rc==SQLITE_OK ){ 892 rc = newDatabase(pBt); 893 } 894 if( rc==SQLITE_OK ){ 895 pBt->inTrans = 1; 896 pBt->inCkpt = 0; 897 }else{ 898 unlockBtreeIfUnused(pBt); 899 } 900 return rc; 901 } 902 903 /* 904 ** Commit the transaction currently in progress. 905 ** 906 ** This will release the write lock on the database file. If there 907 ** are no active cursors, it also releases the read lock. 908 */ 909 static int fileBtreeCommit(Btree *pBt){ 910 int rc; 911 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager); 912 pBt->inTrans = 0; 913 pBt->inCkpt = 0; 914 unlockBtreeIfUnused(pBt); 915 return rc; 916 } 917 918 /* 919 ** Rollback the transaction in progress. All cursors will be 920 ** invalided by this operation. Any attempt to use a cursor 921 ** that was open at the beginning of this operation will result 922 ** in an error. 923 ** 924 ** This will release the write lock on the database file. If there 925 ** are no active cursors, it also releases the read lock. 926 */ 927 static int fileBtreeRollback(Btree *pBt){ 928 int rc; 929 BtCursor *pCur; 930 if( pBt->inTrans==0 ) return SQLITE_OK; 931 pBt->inTrans = 0; 932 pBt->inCkpt = 0; 933 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager); 934 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 935 if( pCur->pPage && pCur->pPage->isInit==0 ){ 936 sqlitepager_unref(pCur->pPage); 937 pCur->pPage = 0; 938 } 939 } 940 unlockBtreeIfUnused(pBt); 941 return rc; 942 } 943 944 /* 945 ** Set the checkpoint for the current transaction. The checkpoint serves 946 ** as a sub-transaction that can be rolled back independently of the 947 ** main transaction. You must start a transaction before starting a 948 ** checkpoint. The checkpoint is ended automatically if the transaction 949 ** commits or rolls back. 950 ** 951 ** Only one checkpoint may be active at a time. It is an error to try 952 ** to start a new checkpoint if another checkpoint is already active. 953 */ 954 static int fileBtreeBeginCkpt(Btree *pBt){ 955 int rc; 956 if( !pBt->inTrans || pBt->inCkpt ){ 957 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 958 } 959 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager); 960 pBt->inCkpt = 1; 961 return rc; 962 } 963 964 965 /* 966 ** Commit a checkpoint to transaction currently in progress. If no 967 ** checkpoint is active, this is a no-op. 968 */ 969 static int fileBtreeCommitCkpt(Btree *pBt){ 970 int rc; 971 if( pBt->inCkpt && !pBt->readOnly ){ 972 rc = sqlitepager_ckpt_commit(pBt->pPager); 973 }else{ 974 rc = SQLITE_OK; 975 } 976 pBt->inCkpt = 0; 977 return rc; 978 } 979 980 /* 981 ** Rollback the checkpoint to the current transaction. If there 982 ** is no active checkpoint or transaction, this routine is a no-op. 983 ** 984 ** All cursors will be invalided by this operation. Any attempt 985 ** to use a cursor that was open at the beginning of this operation 986 ** will result in an error. 987 */ 988 static int fileBtreeRollbackCkpt(Btree *pBt){ 989 int rc; 990 BtCursor *pCur; 991 if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK; 992 rc = sqlitepager_ckpt_rollback(pBt->pPager); 993 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 994 if( pCur->pPage && pCur->pPage->isInit==0 ){ 995 sqlitepager_unref(pCur->pPage); 996 pCur->pPage = 0; 997 } 998 } 999 pBt->inCkpt = 0; 1000 return rc; 1001 } 1002 1003 /* 1004 ** Create a new cursor for the BTree whose root is on the page 1005 ** iTable. The act of acquiring a cursor gets a read lock on 1006 ** the database file. 1007 ** 1008 ** If wrFlag==0, then the cursor can only be used for reading. 1009 ** If wrFlag==1, then the cursor can be used for reading or for 1010 ** writing if other conditions for writing are also met. These 1011 ** are the conditions that must be met in order for writing to 1012 ** be allowed: 1013 ** 1014 ** 1: The cursor must have been opened with wrFlag==1 1015 ** 1016 ** 2: No other cursors may be open with wrFlag==0 on the same table 1017 ** 1018 ** 3: The database must be writable (not on read-only media) 1019 ** 1020 ** 4: There must be an active transaction. 1021 ** 1022 ** Condition 2 warrants further discussion. If any cursor is opened 1023 ** on a table with wrFlag==0, that prevents all other cursors from 1024 ** writing to that table. This is a kind of "read-lock". When a cursor 1025 ** is opened with wrFlag==0 it is guaranteed that the table will not 1026 ** change as long as the cursor is open. This allows the cursor to 1027 ** do a sequential scan of the table without having to worry about 1028 ** entries being inserted or deleted during the scan. Cursors should 1029 ** be opened with wrFlag==0 only if this read-lock property is needed. 1030 ** That is to say, cursors should be opened with wrFlag==0 only if they 1031 ** intend to use the sqliteBtreeNext() system call. All other cursors 1032 ** should be opened with wrFlag==1 even if they never really intend 1033 ** to write. 1034 ** 1035 ** No checking is done to make sure that page iTable really is the 1036 ** root page of a b-tree. If it is not, then the cursor acquired 1037 ** will not work correctly. 1038 */ 1039 static 1040 int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){ 1041 int rc; 1042 BtCursor *pCur, *pRing; 1043 1044 if( pBt->readOnly && wrFlag ){ 1045 *ppCur = 0; 1046 return SQLITE_READONLY; 1047 } 1048 if( pBt->page1==0 ){ 1049 rc = lockBtree(pBt); 1050 if( rc!=SQLITE_OK ){ 1051 *ppCur = 0; 1052 return rc; 1053 } 1054 } 1055 pCur = sqliteMalloc( sizeof(*pCur) ); 1056 if( pCur==0 ){ 1057 rc = SQLITE_NOMEM; 1058 goto create_cursor_exception; 1059 } 1060 pCur->pgnoRoot = (Pgno)iTable; 1061 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage); 1062 if( rc!=SQLITE_OK ){ 1063 goto create_cursor_exception; 1064 } 1065 rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0); 1066 if( rc!=SQLITE_OK ){ 1067 goto create_cursor_exception; 1068 } 1069 pCur->pOps = &sqliteBtreeCursorOps; 1070 pCur->pBt = pBt; 1071 pCur->wrFlag = wrFlag; 1072 pCur->idx = 0; 1073 pCur->eSkip = SKIP_INVALID; 1074 pCur->pNext = pBt->pCursor; 1075 if( pCur->pNext ){ 1076 pCur->pNext->pPrev = pCur; 1077 } 1078 pCur->pPrev = 0; 1079 pRing = pBt->pCursor; 1080 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; } 1081 if( pRing ){ 1082 pCur->pShared = pRing->pShared; 1083 pRing->pShared = pCur; 1084 }else{ 1085 pCur->pShared = pCur; 1086 } 1087 pBt->pCursor = pCur; 1088 *ppCur = pCur; 1089 return SQLITE_OK; 1090 1091 create_cursor_exception: 1092 *ppCur = 0; 1093 if( pCur ){ 1094 if( pCur->pPage ) sqlitepager_unref(pCur->pPage); 1095 sqliteFree(pCur); 1096 } 1097 unlockBtreeIfUnused(pBt); 1098 return rc; 1099 } 1100 1101 /* 1102 ** Close a cursor. The read lock on the database file is released 1103 ** when the last cursor is closed. 1104 */ 1105 static int fileBtreeCloseCursor(BtCursor *pCur){ 1106 Btree *pBt = pCur->pBt; 1107 if( pCur->pPrev ){ 1108 pCur->pPrev->pNext = pCur->pNext; 1109 }else{ 1110 pBt->pCursor = pCur->pNext; 1111 } 1112 if( pCur->pNext ){ 1113 pCur->pNext->pPrev = pCur->pPrev; 1114 } 1115 if( pCur->pPage ){ 1116 sqlitepager_unref(pCur->pPage); 1117 } 1118 if( pCur->pShared!=pCur ){ 1119 BtCursor *pRing = pCur->pShared; 1120 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; } 1121 pRing->pShared = pCur->pShared; 1122 } 1123 unlockBtreeIfUnused(pBt); 1124 sqliteFree(pCur); 1125 return SQLITE_OK; 1126 } 1127 1128 /* 1129 ** Make a temporary cursor by filling in the fields of pTempCur. 1130 ** The temporary cursor is not on the cursor list for the Btree. 1131 */ 1132 static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){ 1133 memcpy(pTempCur, pCur, sizeof(*pCur)); 1134 pTempCur->pNext = 0; 1135 pTempCur->pPrev = 0; 1136 if( pTempCur->pPage ){ 1137 sqlitepager_ref(pTempCur->pPage); 1138 } 1139 } 1140 1141 /* 1142 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() 1143 ** function above. 1144 */ 1145 static void releaseTempCursor(BtCursor *pCur){ 1146 if( pCur->pPage ){ 1147 sqlitepager_unref(pCur->pPage); 1148 } 1149 } 1150 1151 /* 1152 ** Set *pSize to the number of bytes of key in the entry the 1153 ** cursor currently points to. Always return SQLITE_OK. 1154 ** Failure is not possible. If the cursor is not currently 1155 ** pointing to an entry (which can happen, for example, if 1156 ** the database is empty) then *pSize is set to 0. 1157 */ 1158 static int fileBtreeKeySize(BtCursor *pCur, int *pSize){ 1159 Cell *pCell; 1160 MemPage *pPage; 1161 1162 pPage = pCur->pPage; 1163 assert( pPage!=0 ); 1164 if( pCur->idx >= pPage->nCell ){ 1165 *pSize = 0; 1166 }else{ 1167 pCell = pPage->apCell[pCur->idx]; 1168 *pSize = NKEY(pCur->pBt, pCell->h); 1169 } 1170 return SQLITE_OK; 1171 } 1172 1173 /* 1174 ** Read payload information from the entry that the pCur cursor is 1175 ** pointing to. Begin reading the payload at "offset" and read 1176 ** a total of "amt" bytes. Put the result in zBuf. 1177 ** 1178 ** This routine does not make a distinction between key and data. 1179 ** It just reads bytes from the payload area. 1180 */ 1181 static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){ 1182 char *aPayload; 1183 Pgno nextPage; 1184 int rc; 1185 Btree *pBt = pCur->pBt; 1186 assert( pCur!=0 && pCur->pPage!=0 ); 1187 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 1188 aPayload = pCur->pPage->apCell[pCur->idx]->aPayload; 1189 if( offset<MX_LOCAL_PAYLOAD ){ 1190 int a = amt; 1191 if( a+offset>MX_LOCAL_PAYLOAD ){ 1192 a = MX_LOCAL_PAYLOAD - offset; 1193 } 1194 memcpy(zBuf, &aPayload[offset], a); 1195 if( a==amt ){ 1196 return SQLITE_OK; 1197 } 1198 offset = 0; 1199 zBuf += a; 1200 amt -= a; 1201 }else{ 1202 offset -= MX_LOCAL_PAYLOAD; 1203 } 1204 if( amt>0 ){ 1205 nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl); 1206 } 1207 while( amt>0 && nextPage ){ 1208 OverflowPage *pOvfl; 1209 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl); 1210 if( rc!=0 ){ 1211 return rc; 1212 } 1213 nextPage = SWAB32(pBt, pOvfl->iNext); 1214 if( offset<OVERFLOW_SIZE ){ 1215 int a = amt; 1216 if( a + offset > OVERFLOW_SIZE ){ 1217 a = OVERFLOW_SIZE - offset; 1218 } 1219 memcpy(zBuf, &pOvfl->aPayload[offset], a); 1220 offset = 0; 1221 amt -= a; 1222 zBuf += a; 1223 }else{ 1224 offset -= OVERFLOW_SIZE; 1225 } 1226 sqlitepager_unref(pOvfl); 1227 } 1228 if( amt>0 ){ 1229 return SQLITE_CORRUPT; 1230 } 1231 return SQLITE_OK; 1232 } 1233 1234 /* 1235 ** Read part of the key associated with cursor pCur. A maximum 1236 ** of "amt" bytes will be transfered into zBuf[]. The transfer 1237 ** begins at "offset". The number of bytes actually read is 1238 ** returned. 1239 ** 1240 ** Change: It used to be that the amount returned will be smaller 1241 ** than the amount requested if there are not enough bytes in the key 1242 ** to satisfy the request. But now, it must be the case that there 1243 ** is enough data available to satisfy the request. If not, an exception 1244 ** is raised. The change was made in an effort to boost performance 1245 ** by eliminating unneeded tests. 1246 */ 1247 static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){ 1248 MemPage *pPage; 1249 1250 assert( amt>=0 ); 1251 assert( offset>=0 ); 1252 assert( pCur->pPage!=0 ); 1253 pPage = pCur->pPage; 1254 if( pCur->idx >= pPage->nCell ){ 1255 return 0; 1256 } 1257 assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) ); 1258 getPayload(pCur, offset, amt, zBuf); 1259 return amt; 1260 } 1261 1262 /* 1263 ** Set *pSize to the number of bytes of data in the entry the 1264 ** cursor currently points to. Always return SQLITE_OK. 1265 ** Failure is not possible. If the cursor is not currently 1266 ** pointing to an entry (which can happen, for example, if 1267 ** the database is empty) then *pSize is set to 0. 1268 */ 1269 static int fileBtreeDataSize(BtCursor *pCur, int *pSize){ 1270 Cell *pCell; 1271 MemPage *pPage; 1272 1273 pPage = pCur->pPage; 1274 assert( pPage!=0 ); 1275 if( pCur->idx >= pPage->nCell ){ 1276 *pSize = 0; 1277 }else{ 1278 pCell = pPage->apCell[pCur->idx]; 1279 *pSize = NDATA(pCur->pBt, pCell->h); 1280 } 1281 return SQLITE_OK; 1282 } 1283 1284 /* 1285 ** Read part of the data associated with cursor pCur. A maximum 1286 ** of "amt" bytes will be transfered into zBuf[]. The transfer 1287 ** begins at "offset". The number of bytes actually read is 1288 ** returned. The amount returned will be smaller than the 1289 ** amount requested if there are not enough bytes in the data 1290 ** to satisfy the request. 1291 */ 1292 static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){ 1293 Cell *pCell; 1294 MemPage *pPage; 1295 1296 assert( amt>=0 ); 1297 assert( offset>=0 ); 1298 assert( pCur->pPage!=0 ); 1299 pPage = pCur->pPage; 1300 if( pCur->idx >= pPage->nCell ){ 1301 return 0; 1302 } 1303 pCell = pPage->apCell[pCur->idx]; 1304 assert( amt+offset <= NDATA(pCur->pBt, pCell->h) ); 1305 getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf); 1306 return amt; 1307 } 1308 1309 /* 1310 ** Compare an external key against the key on the entry that pCur points to. 1311 ** 1312 ** The external key is pKey and is nKey bytes long. The last nIgnore bytes 1313 ** of the key associated with pCur are ignored, as if they do not exist. 1314 ** (The normal case is for nIgnore to be zero in which case the entire 1315 ** internal key is used in the comparison.) 1316 ** 1317 ** The comparison result is written to *pRes as follows: 1318 ** 1319 ** *pRes<0 This means pCur<pKey 1320 ** 1321 ** *pRes==0 This means pCur==pKey for all nKey bytes 1322 ** 1323 ** *pRes>0 This means pCur>pKey 1324 ** 1325 ** When one key is an exact prefix of the other, the shorter key is 1326 ** considered less than the longer one. In order to be equal the 1327 ** keys must be exactly the same length. (The length of the pCur key 1328 ** is the actual key length minus nIgnore bytes.) 1329 */ 1330 static int fileBtreeKeyCompare( 1331 BtCursor *pCur, /* Pointer to entry to compare against */ 1332 const void *pKey, /* Key to compare against entry that pCur points to */ 1333 int nKey, /* Number of bytes in pKey */ 1334 int nIgnore, /* Ignore this many bytes at the end of pCur */ 1335 int *pResult /* Write the result here */ 1336 ){ 1337 Pgno nextPage; 1338 int n, c, rc, nLocal; 1339 Cell *pCell; 1340 Btree *pBt = pCur->pBt; 1341 const char *zKey = (const char*)pKey; 1342 1343 assert( pCur->pPage ); 1344 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 1345 pCell = pCur->pPage->apCell[pCur->idx]; 1346 nLocal = NKEY(pBt, pCell->h) - nIgnore; 1347 if( nLocal<0 ) nLocal = 0; 1348 n = nKey<nLocal ? nKey : nLocal; 1349 if( n>MX_LOCAL_PAYLOAD ){ 1350 n = MX_LOCAL_PAYLOAD; 1351 } 1352 c = memcmp(pCell->aPayload, zKey, n); 1353 if( c!=0 ){ 1354 *pResult = c; 1355 return SQLITE_OK; 1356 } 1357 zKey += n; 1358 nKey -= n; 1359 nLocal -= n; 1360 nextPage = SWAB32(pBt, pCell->ovfl); 1361 while( nKey>0 && nLocal>0 ){ 1362 OverflowPage *pOvfl; 1363 if( nextPage==0 ){ 1364 return SQLITE_CORRUPT; 1365 } 1366 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl); 1367 if( rc ){ 1368 return rc; 1369 } 1370 nextPage = SWAB32(pBt, pOvfl->iNext); 1371 n = nKey<nLocal ? nKey : nLocal; 1372 if( n>OVERFLOW_SIZE ){ 1373 n = OVERFLOW_SIZE; 1374 } 1375 c = memcmp(pOvfl->aPayload, zKey, n); 1376 sqlitepager_unref(pOvfl); 1377 if( c!=0 ){ 1378 *pResult = c; 1379 return SQLITE_OK; 1380 } 1381 nKey -= n; 1382 nLocal -= n; 1383 zKey += n; 1384 } 1385 if( c==0 ){ 1386 c = nLocal - nKey; 1387 } 1388 *pResult = c; 1389 return SQLITE_OK; 1390 } 1391 1392 /* 1393 ** Move the cursor down to a new child page. The newPgno argument is the 1394 ** page number of the child page in the byte order of the disk image. 1395 */ 1396 static int moveToChild(BtCursor *pCur, int newPgno){ 1397 int rc; 1398 MemPage *pNewPage; 1399 Btree *pBt = pCur->pBt; 1400 1401 newPgno = SWAB32(pBt, newPgno); 1402 rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage); 1403 if( rc ) return rc; 1404 rc = initPage(pBt, pNewPage, newPgno, pCur->pPage); 1405 if( rc ) return rc; 1406 assert( pCur->idx>=pCur->pPage->nCell 1407 || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) ); 1408 assert( pCur->idx<pCur->pPage->nCell 1409 || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) ); 1410 pNewPage->idxParent = pCur->idx; 1411 pCur->pPage->idxShift = 0; 1412 sqlitepager_unref(pCur->pPage); 1413 pCur->pPage = pNewPage; 1414 pCur->idx = 0; 1415 if( pNewPage->nCell<1 ){ 1416 return SQLITE_CORRUPT; 1417 } 1418 return SQLITE_OK; 1419 } 1420 1421 /* 1422 ** Move the cursor up to the parent page. 1423 ** 1424 ** pCur->idx is set to the cell index that contains the pointer 1425 ** to the page we are coming from. If we are coming from the 1426 ** right-most child page then pCur->idx is set to one more than 1427 ** the largest cell index. 1428 */ 1429 static void moveToParent(BtCursor *pCur){ 1430 Pgno oldPgno; 1431 MemPage *pParent; 1432 MemPage *pPage; 1433 int idxParent; 1434 pPage = pCur->pPage; 1435 assert( pPage!=0 ); 1436 pParent = pPage->pParent; 1437 assert( pParent!=0 ); 1438 idxParent = pPage->idxParent; 1439 sqlitepager_ref(pParent); 1440 sqlitepager_unref(pPage); 1441 pCur->pPage = pParent; 1442 assert( pParent->idxShift==0 ); 1443 if( pParent->idxShift==0 ){ 1444 pCur->idx = idxParent; 1445 #ifndef NDEBUG 1446 /* Verify that pCur->idx is the correct index to point back to the child 1447 ** page we just came from 1448 */ 1449 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage)); 1450 if( pCur->idx<pParent->nCell ){ 1451 assert( pParent->apCell[idxParent]->h.leftChild==oldPgno ); 1452 }else{ 1453 assert( pParent->u.hdr.rightChild==oldPgno ); 1454 } 1455 #endif 1456 }else{ 1457 /* The MemPage.idxShift flag indicates that cell indices might have 1458 ** changed since idxParent was set and hence idxParent might be out 1459 ** of date. So recompute the parent cell index by scanning all cells 1460 ** and locating the one that points to the child we just came from. 1461 */ 1462 int i; 1463 pCur->idx = pParent->nCell; 1464 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage)); 1465 for(i=0; i<pParent->nCell; i++){ 1466 if( pParent->apCell[i]->h.leftChild==oldPgno ){ 1467 pCur->idx = i; 1468 break; 1469 } 1470 } 1471 } 1472 } 1473 1474 /* 1475 ** Move the cursor to the root page 1476 */ 1477 static int moveToRoot(BtCursor *pCur){ 1478 MemPage *pNew; 1479 int rc; 1480 Btree *pBt = pCur->pBt; 1481 1482 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew); 1483 if( rc ) return rc; 1484 rc = initPage(pBt, pNew, pCur->pgnoRoot, 0); 1485 if( rc ) return rc; 1486 sqlitepager_unref(pCur->pPage); 1487 pCur->pPage = pNew; 1488 pCur->idx = 0; 1489 return SQLITE_OK; 1490 } 1491 1492 /* 1493 ** Move the cursor down to the left-most leaf entry beneath the 1494 ** entry to which it is currently pointing. 1495 */ 1496 static int moveToLeftmost(BtCursor *pCur){ 1497 Pgno pgno; 1498 int rc; 1499 1500 while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){ 1501 rc = moveToChild(pCur, pgno); 1502 if( rc ) return rc; 1503 } 1504 return SQLITE_OK; 1505 } 1506 1507 /* 1508 ** Move the cursor down to the right-most leaf entry beneath the 1509 ** page to which it is currently pointing. Notice the difference 1510 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 1511 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 1512 ** finds the right-most entry beneath the *page*. 1513 */ 1514 static int moveToRightmost(BtCursor *pCur){ 1515 Pgno pgno; 1516 int rc; 1517 1518 while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){ 1519 pCur->idx = pCur->pPage->nCell; 1520 rc = moveToChild(pCur, pgno); 1521 if( rc ) return rc; 1522 } 1523 pCur->idx = pCur->pPage->nCell - 1; 1524 return SQLITE_OK; 1525 } 1526 1527 /* Move the cursor to the first entry in the table. Return SQLITE_OK 1528 ** on success. Set *pRes to 0 if the cursor actually points to something 1529 ** or set *pRes to 1 if the table is empty. 1530 */ 1531 static int fileBtreeFirst(BtCursor *pCur, int *pRes){ 1532 int rc; 1533 if( pCur->pPage==0 ) return SQLITE_ABORT; 1534 rc = moveToRoot(pCur); 1535 if( rc ) return rc; 1536 if( pCur->pPage->nCell==0 ){ 1537 *pRes = 1; 1538 return SQLITE_OK; 1539 } 1540 *pRes = 0; 1541 rc = moveToLeftmost(pCur); 1542 pCur->eSkip = SKIP_NONE; 1543 return rc; 1544 } 1545 1546 /* Move the cursor to the last entry in the table. Return SQLITE_OK 1547 ** on success. Set *pRes to 0 if the cursor actually points to something 1548 ** or set *pRes to 1 if the table is empty. 1549 */ 1550 static int fileBtreeLast(BtCursor *pCur, int *pRes){ 1551 int rc; 1552 if( pCur->pPage==0 ) return SQLITE_ABORT; 1553 rc = moveToRoot(pCur); 1554 if( rc ) return rc; 1555 assert( pCur->pPage->isInit ); 1556 if( pCur->pPage->nCell==0 ){ 1557 *pRes = 1; 1558 return SQLITE_OK; 1559 } 1560 *pRes = 0; 1561 rc = moveToRightmost(pCur); 1562 pCur->eSkip = SKIP_NONE; 1563 return rc; 1564 } 1565 1566 /* Move the cursor so that it points to an entry near pKey. 1567 ** Return a success code. 1568 ** 1569 ** If an exact match is not found, then the cursor is always 1570 ** left pointing at a leaf page which would hold the entry if it 1571 ** were present. The cursor might point to an entry that comes 1572 ** before or after the key. 1573 ** 1574 ** The result of comparing the key with the entry to which the 1575 ** cursor is left pointing is stored in pCur->iMatch. The same 1576 ** value is also written to *pRes if pRes!=NULL. The meaning of 1577 ** this value is as follows: 1578 ** 1579 ** *pRes<0 The cursor is left pointing at an entry that 1580 ** is smaller than pKey or if the table is empty 1581 ** and the cursor is therefore left point to nothing. 1582 ** 1583 ** *pRes==0 The cursor is left pointing at an entry that 1584 ** exactly matches pKey. 1585 ** 1586 ** *pRes>0 The cursor is left pointing at an entry that 1587 ** is larger than pKey. 1588 */ 1589 static 1590 int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){ 1591 int rc; 1592 if( pCur->pPage==0 ) return SQLITE_ABORT; 1593 pCur->eSkip = SKIP_NONE; 1594 rc = moveToRoot(pCur); 1595 if( rc ) return rc; 1596 for(;;){ 1597 int lwr, upr; 1598 Pgno chldPg; 1599 MemPage *pPage = pCur->pPage; 1600 int c = -1; /* pRes return if table is empty must be -1 */ 1601 lwr = 0; 1602 upr = pPage->nCell-1; 1603 while( lwr<=upr ){ 1604 pCur->idx = (lwr+upr)/2; 1605 rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c); 1606 if( rc ) return rc; 1607 if( c==0 ){ 1608 pCur->iMatch = c; 1609 if( pRes ) *pRes = 0; 1610 return SQLITE_OK; 1611 } 1612 if( c<0 ){ 1613 lwr = pCur->idx+1; 1614 }else{ 1615 upr = pCur->idx-1; 1616 } 1617 } 1618 assert( lwr==upr+1 ); 1619 assert( pPage->isInit ); 1620 if( lwr>=pPage->nCell ){ 1621 chldPg = pPage->u.hdr.rightChild; 1622 }else{ 1623 chldPg = pPage->apCell[lwr]->h.leftChild; 1624 } 1625 if( chldPg==0 ){ 1626 pCur->iMatch = c; 1627 if( pRes ) *pRes = c; 1628 return SQLITE_OK; 1629 } 1630 pCur->idx = lwr; 1631 rc = moveToChild(pCur, chldPg); 1632 if( rc ) return rc; 1633 } 1634 /* NOT REACHED */ 1635 } 1636 1637 /* 1638 ** Advance the cursor to the next entry in the database. If 1639 ** successful then set *pRes=0. If the cursor 1640 ** was already pointing to the last entry in the database before 1641 ** this routine was called, then set *pRes=1. 1642 */ 1643 static int fileBtreeNext(BtCursor *pCur, int *pRes){ 1644 int rc; 1645 MemPage *pPage = pCur->pPage; 1646 assert( pRes!=0 ); 1647 if( pPage==0 ){ 1648 *pRes = 1; 1649 return SQLITE_ABORT; 1650 } 1651 assert( pPage->isInit ); 1652 assert( pCur->eSkip!=SKIP_INVALID ); 1653 if( pPage->nCell==0 ){ 1654 *pRes = 1; 1655 return SQLITE_OK; 1656 } 1657 assert( pCur->idx<pPage->nCell ); 1658 if( pCur->eSkip==SKIP_NEXT ){ 1659 pCur->eSkip = SKIP_NONE; 1660 *pRes = 0; 1661 return SQLITE_OK; 1662 } 1663 pCur->eSkip = SKIP_NONE; 1664 pCur->idx++; 1665 if( pCur->idx>=pPage->nCell ){ 1666 if( pPage->u.hdr.rightChild ){ 1667 rc = moveToChild(pCur, pPage->u.hdr.rightChild); 1668 if( rc ) return rc; 1669 rc = moveToLeftmost(pCur); 1670 *pRes = 0; 1671 return rc; 1672 } 1673 do{ 1674 if( pPage->pParent==0 ){ 1675 *pRes = 1; 1676 return SQLITE_OK; 1677 } 1678 moveToParent(pCur); 1679 pPage = pCur->pPage; 1680 }while( pCur->idx>=pPage->nCell ); 1681 *pRes = 0; 1682 return SQLITE_OK; 1683 } 1684 *pRes = 0; 1685 if( pPage->u.hdr.rightChild==0 ){ 1686 return SQLITE_OK; 1687 } 1688 rc = moveToLeftmost(pCur); 1689 return rc; 1690 } 1691 1692 /* 1693 ** Step the cursor to the back to the previous entry in the database. If 1694 ** successful then set *pRes=0. If the cursor 1695 ** was already pointing to the first entry in the database before 1696 ** this routine was called, then set *pRes=1. 1697 */ 1698 static int fileBtreePrevious(BtCursor *pCur, int *pRes){ 1699 int rc; 1700 Pgno pgno; 1701 MemPage *pPage; 1702 pPage = pCur->pPage; 1703 if( pPage==0 ){ 1704 *pRes = 1; 1705 return SQLITE_ABORT; 1706 } 1707 assert( pPage->isInit ); 1708 assert( pCur->eSkip!=SKIP_INVALID ); 1709 if( pPage->nCell==0 ){ 1710 *pRes = 1; 1711 return SQLITE_OK; 1712 } 1713 if( pCur->eSkip==SKIP_PREV ){ 1714 pCur->eSkip = SKIP_NONE; 1715 *pRes = 0; 1716 return SQLITE_OK; 1717 } 1718 pCur->eSkip = SKIP_NONE; 1719 assert( pCur->idx>=0 ); 1720 if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){ 1721 rc = moveToChild(pCur, pgno); 1722 if( rc ) return rc; 1723 rc = moveToRightmost(pCur); 1724 }else{ 1725 while( pCur->idx==0 ){ 1726 if( pPage->pParent==0 ){ 1727 if( pRes ) *pRes = 1; 1728 return SQLITE_OK; 1729 } 1730 moveToParent(pCur); 1731 pPage = pCur->pPage; 1732 } 1733 pCur->idx--; 1734 rc = SQLITE_OK; 1735 } 1736 *pRes = 0; 1737 return rc; 1738 } 1739 1740 /* 1741 ** Allocate a new page from the database file. 1742 ** 1743 ** The new page is marked as dirty. (In other words, sqlitepager_write() 1744 ** has already been called on the new page.) The new page has also 1745 ** been referenced and the calling routine is responsible for calling 1746 ** sqlitepager_unref() on the new page when it is done. 1747 ** 1748 ** SQLITE_OK is returned on success. Any other return value indicates 1749 ** an error. *ppPage and *pPgno are undefined in the event of an error. 1750 ** Do not invoke sqlitepager_unref() on *ppPage if an error is returned. 1751 ** 1752 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 1753 ** locate a page close to the page number "nearby". This can be used in an 1754 ** attempt to keep related pages close to each other in the database file, 1755 ** which in turn can make database access faster. 1756 */ 1757 static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){ 1758 PageOne *pPage1 = pBt->page1; 1759 int rc; 1760 if( pPage1->freeList ){ 1761 OverflowPage *pOvfl; 1762 FreelistInfo *pInfo; 1763 1764 rc = sqlitepager_write(pPage1); 1765 if( rc ) return rc; 1766 SWAB_ADD(pBt, pPage1->nFree, -1); 1767 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList), 1768 (void**)&pOvfl); 1769 if( rc ) return rc; 1770 rc = sqlitepager_write(pOvfl); 1771 if( rc ){ 1772 sqlitepager_unref(pOvfl); 1773 return rc; 1774 } 1775 pInfo = (FreelistInfo*)pOvfl->aPayload; 1776 if( pInfo->nFree==0 ){ 1777 *pPgno = SWAB32(pBt, pPage1->freeList); 1778 pPage1->freeList = pOvfl->iNext; 1779 *ppPage = (MemPage*)pOvfl; 1780 }else{ 1781 int closest, n; 1782 n = SWAB32(pBt, pInfo->nFree); 1783 if( n>1 && nearby>0 ){ 1784 int i, dist; 1785 closest = 0; 1786 dist = SWAB32(pBt, pInfo->aFree[0]) - nearby; 1787 if( dist<0 ) dist = -dist; 1788 for(i=1; i<n; i++){ 1789 int d2 = SWAB32(pBt, pInfo->aFree[i]) - nearby; 1790 if( d2<0 ) d2 = -d2; 1791 if( d2<dist ) closest = i; 1792 } 1793 }else{ 1794 closest = 0; 1795 } 1796 SWAB_ADD(pBt, pInfo->nFree, -1); 1797 *pPgno = SWAB32(pBt, pInfo->aFree[closest]); 1798 pInfo->aFree[closest] = pInfo->aFree[n-1]; 1799 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage); 1800 sqlitepager_unref(pOvfl); 1801 if( rc==SQLITE_OK ){ 1802 sqlitepager_dont_rollback(*ppPage); 1803 rc = sqlitepager_write(*ppPage); 1804 } 1805 } 1806 }else{ 1807 *pPgno = sqlitepager_pagecount(pBt->pPager) + 1; 1808 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage); 1809 if( rc ) return rc; 1810 rc = sqlitepager_write(*ppPage); 1811 } 1812 return rc; 1813 } 1814 1815 /* 1816 ** Add a page of the database file to the freelist. Either pgno or 1817 ** pPage but not both may be 0. 1818 ** 1819 ** sqlitepager_unref() is NOT called for pPage. 1820 */ 1821 static int freePage(Btree *pBt, void *pPage, Pgno pgno){ 1822 PageOne *pPage1 = pBt->page1; 1823 OverflowPage *pOvfl = (OverflowPage*)pPage; 1824 int rc; 1825 int needUnref = 0; 1826 MemPage *pMemPage; 1827 1828 if( pgno==0 ){ 1829 assert( pOvfl!=0 ); 1830 pgno = sqlitepager_pagenumber(pOvfl); 1831 } 1832 assert( pgno>2 ); 1833 assert( sqlitepager_pagenumber(pOvfl)==pgno ); 1834 pMemPage = (MemPage*)pPage; 1835 pMemPage->isInit = 0; 1836 if( pMemPage->pParent ){ 1837 sqlitepager_unref(pMemPage->pParent); 1838 pMemPage->pParent = 0; 1839 } 1840 rc = sqlitepager_write(pPage1); 1841 if( rc ){ 1842 return rc; 1843 } 1844 SWAB_ADD(pBt, pPage1->nFree, 1); 1845 if( pPage1->nFree!=0 && pPage1->freeList!=0 ){ 1846 OverflowPage *pFreeIdx; 1847 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList), 1848 (void**)&pFreeIdx); 1849 if( rc==SQLITE_OK ){ 1850 FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload; 1851 int n = SWAB32(pBt, pInfo->nFree); 1852 if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){ 1853 rc = sqlitepager_write(pFreeIdx); 1854 if( rc==SQLITE_OK ){ 1855 pInfo->aFree[n] = SWAB32(pBt, pgno); 1856 SWAB_ADD(pBt, pInfo->nFree, 1); 1857 sqlitepager_unref(pFreeIdx); 1858 sqlitepager_dont_write(pBt->pPager, pgno); 1859 return rc; 1860 } 1861 } 1862 sqlitepager_unref(pFreeIdx); 1863 } 1864 } 1865 if( pOvfl==0 ){ 1866 assert( pgno>0 ); 1867 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl); 1868 if( rc ) return rc; 1869 needUnref = 1; 1870 } 1871 rc = sqlitepager_write(pOvfl); 1872 if( rc ){ 1873 if( needUnref ) sqlitepager_unref(pOvfl); 1874 return rc; 1875 } 1876 pOvfl->iNext = pPage1->freeList; 1877 pPage1->freeList = SWAB32(pBt, pgno); 1878 memset(pOvfl->aPayload, 0, OVERFLOW_SIZE); 1879 if( needUnref ) rc = sqlitepager_unref(pOvfl); 1880 return rc; 1881 } 1882 1883 /* 1884 ** Erase all the data out of a cell. This involves returning overflow 1885 ** pages back the freelist. 1886 */ 1887 static int clearCell(Btree *pBt, Cell *pCell){ 1888 Pager *pPager = pBt->pPager; 1889 OverflowPage *pOvfl; 1890 Pgno ovfl, nextOvfl; 1891 int rc; 1892 1893 if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){ 1894 return SQLITE_OK; 1895 } 1896 ovfl = SWAB32(pBt, pCell->ovfl); 1897 pCell->ovfl = 0; 1898 while( ovfl ){ 1899 rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl); 1900 if( rc ) return rc; 1901 nextOvfl = SWAB32(pBt, pOvfl->iNext); 1902 rc = freePage(pBt, pOvfl, ovfl); 1903 if( rc ) return rc; 1904 sqlitepager_unref(pOvfl); 1905 ovfl = nextOvfl; 1906 } 1907 return SQLITE_OK; 1908 } 1909 1910 /* 1911 ** Create a new cell from key and data. Overflow pages are allocated as 1912 ** necessary and linked to this cell. 1913 */ 1914 static int fillInCell( 1915 Btree *pBt, /* The whole Btree. Needed to allocate pages */ 1916 Cell *pCell, /* Populate this Cell structure */ 1917 const void *pKey, int nKey, /* The key */ 1918 const void *pData,int nData /* The data */ 1919 ){ 1920 OverflowPage *pOvfl, *pPrior; 1921 Pgno *pNext; 1922 int spaceLeft; 1923 int n, rc; 1924 int nPayload; 1925 const char *pPayload; 1926 char *pSpace; 1927 Pgno nearby = 0; 1928 1929 pCell->h.leftChild = 0; 1930 pCell->h.nKey = SWAB16(pBt, nKey & 0xffff); 1931 pCell->h.nKeyHi = nKey >> 16; 1932 pCell->h.nData = SWAB16(pBt, nData & 0xffff); 1933 pCell->h.nDataHi = nData >> 16; 1934 pCell->h.iNext = 0; 1935 1936 pNext = &pCell->ovfl; 1937 pSpace = pCell->aPayload; 1938 spaceLeft = MX_LOCAL_PAYLOAD; 1939 pPayload = pKey; 1940 pKey = 0; 1941 nPayload = nKey; 1942 pPrior = 0; 1943 while( nPayload>0 ){ 1944 if( spaceLeft==0 ){ 1945 rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby); 1946 if( rc ){ 1947 *pNext = 0; 1948 }else{ 1949 nearby = *pNext; 1950 } 1951 if( pPrior ) sqlitepager_unref(pPrior); 1952 if( rc ){ 1953 clearCell(pBt, pCell); 1954 return rc; 1955 } 1956 if( pBt->needSwab ) *pNext = swab32(*pNext); 1957 pPrior = pOvfl; 1958 spaceLeft = OVERFLOW_SIZE; 1959 pSpace = pOvfl->aPayload; 1960 pNext = &pOvfl->iNext; 1961 } 1962 n = nPayload; 1963 if( n>spaceLeft ) n = spaceLeft; 1964 memcpy(pSpace, pPayload, n); 1965 nPayload -= n; 1966 if( nPayload==0 && pData ){ 1967 pPayload = pData; 1968 nPayload = nData; 1969 pData = 0; 1970 }else{ 1971 pPayload += n; 1972 } 1973 spaceLeft -= n; 1974 pSpace += n; 1975 } 1976 *pNext = 0; 1977 if( pPrior ){ 1978 sqlitepager_unref(pPrior); 1979 } 1980 return SQLITE_OK; 1981 } 1982 1983 /* 1984 ** Change the MemPage.pParent pointer on the page whose number is 1985 ** given in the second argument so that MemPage.pParent holds the 1986 ** pointer in the third argument. 1987 */ 1988 static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){ 1989 MemPage *pThis; 1990 1991 if( pgno==0 ) return; 1992 assert( pPager!=0 ); 1993 pThis = sqlitepager_lookup(pPager, pgno); 1994 if( pThis && pThis->isInit ){ 1995 if( pThis->pParent!=pNewParent ){ 1996 if( pThis->pParent ) sqlitepager_unref(pThis->pParent); 1997 pThis->pParent = pNewParent; 1998 if( pNewParent ) sqlitepager_ref(pNewParent); 1999 } 2000 pThis->idxParent = idx; 2001 sqlitepager_unref(pThis); 2002 } 2003 } 2004 2005 /* 2006 ** Reparent all children of the given page to be the given page. 2007 ** In other words, for every child of pPage, invoke reparentPage() 2008 ** to make sure that each child knows that pPage is its parent. 2009 ** 2010 ** This routine gets called after you memcpy() one page into 2011 ** another. 2012 */ 2013 static void reparentChildPages(Btree *pBt, MemPage *pPage){ 2014 int i; 2015 Pager *pPager = pBt->pPager; 2016 for(i=0; i<pPage->nCell; i++){ 2017 reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i); 2018 } 2019 reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i); 2020 pPage->idxShift = 0; 2021 } 2022 2023 /* 2024 ** Remove the i-th cell from pPage. This routine effects pPage only. 2025 ** The cell content is not freed or deallocated. It is assumed that 2026 ** the cell content has been copied someplace else. This routine just 2027 ** removes the reference to the cell from pPage. 2028 ** 2029 ** "sz" must be the number of bytes in the cell. 2030 ** 2031 ** Do not bother maintaining the integrity of the linked list of Cells. 2032 ** Only the pPage->apCell[] array is important. The relinkCellList() 2033 ** routine will be called soon after this routine in order to rebuild 2034 ** the linked list. 2035 */ 2036 static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){ 2037 int j; 2038 assert( idx>=0 && idx<pPage->nCell ); 2039 assert( sz==cellSize(pBt, pPage->apCell[idx]) ); 2040 assert( sqlitepager_iswriteable(pPage) ); 2041 freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz); 2042 for(j=idx; j<pPage->nCell-1; j++){ 2043 pPage->apCell[j] = pPage->apCell[j+1]; 2044 } 2045 pPage->nCell--; 2046 pPage->idxShift = 1; 2047 } 2048 2049 /* 2050 ** Insert a new cell on pPage at cell index "i". pCell points to the 2051 ** content of the cell. 2052 ** 2053 ** If the cell content will fit on the page, then put it there. If it 2054 ** will not fit, then just make pPage->apCell[i] point to the content 2055 ** and set pPage->isOverfull. 2056 ** 2057 ** Do not bother maintaining the integrity of the linked list of Cells. 2058 ** Only the pPage->apCell[] array is important. The relinkCellList() 2059 ** routine will be called soon after this routine in order to rebuild 2060 ** the linked list. 2061 */ 2062 static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){ 2063 int idx, j; 2064 assert( i>=0 && i<=pPage->nCell ); 2065 assert( sz==cellSize(pBt, pCell) ); 2066 assert( sqlitepager_iswriteable(pPage) ); 2067 idx = allocateSpace(pBt, pPage, sz); 2068 for(j=pPage->nCell; j>i; j--){ 2069 pPage->apCell[j] = pPage->apCell[j-1]; 2070 } 2071 pPage->nCell++; 2072 if( idx<=0 ){ 2073 pPage->isOverfull = 1; 2074 pPage->apCell[i] = pCell; 2075 }else{ 2076 memcpy(&pPage->u.aDisk[idx], pCell, sz); 2077 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx]; 2078 } 2079 pPage->idxShift = 1; 2080 } 2081 2082 /* 2083 ** Rebuild the linked list of cells on a page so that the cells 2084 ** occur in the order specified by the pPage->apCell[] array. 2085 ** Invoke this routine once to repair damage after one or more 2086 ** invocations of either insertCell() or dropCell(). 2087 */ 2088 static void relinkCellList(Btree *pBt, MemPage *pPage){ 2089 int i; 2090 u16 *pIdx; 2091 assert( sqlitepager_iswriteable(pPage) ); 2092 pIdx = &pPage->u.hdr.firstCell; 2093 for(i=0; i<pPage->nCell; i++){ 2094 int idx = Addr(pPage->apCell[i]) - Addr(pPage); 2095 assert( idx>0 && idx<SQLITE_USABLE_SIZE ); 2096 *pIdx = SWAB16(pBt, idx); 2097 pIdx = &pPage->apCell[i]->h.iNext; 2098 } 2099 *pIdx = 0; 2100 } 2101 2102 /* 2103 ** Make a copy of the contents of pFrom into pTo. The pFrom->apCell[] 2104 ** pointers that point into pFrom->u.aDisk[] must be adjusted to point 2105 ** into pTo->u.aDisk[] instead. But some pFrom->apCell[] entries might 2106 ** not point to pFrom->u.aDisk[]. Those are unchanged. 2107 */ 2108 static void copyPage(MemPage *pTo, MemPage *pFrom){ 2109 uptr from, to; 2110 int i; 2111 memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_USABLE_SIZE); 2112 pTo->pParent = 0; 2113 pTo->isInit = 1; 2114 pTo->nCell = pFrom->nCell; 2115 pTo->nFree = pFrom->nFree; 2116 pTo->isOverfull = pFrom->isOverfull; 2117 to = Addr(pTo); 2118 from = Addr(pFrom); 2119 for(i=0; i<pTo->nCell; i++){ 2120 uptr x = Addr(pFrom->apCell[i]); 2121 if( x>from && x<from+SQLITE_USABLE_SIZE ){ 2122 *((uptr*)&pTo->apCell[i]) = x + to - from; 2123 }else{ 2124 pTo->apCell[i] = pFrom->apCell[i]; 2125 } 2126 } 2127 } 2128 2129 /* 2130 ** The following parameters determine how many adjacent pages get involved 2131 ** in a balancing operation. NN is the number of neighbors on either side 2132 ** of the page that participate in the balancing operation. NB is the 2133 ** total number of pages that participate, including the target page and 2134 ** NN neighbors on either side. 2135 ** 2136 ** The minimum value of NN is 1 (of course). Increasing NN above 1 2137 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 2138 ** in exchange for a larger degradation in INSERT and UPDATE performance. 2139 ** The value of NN appears to give the best results overall. 2140 */ 2141 #define NN 1 /* Number of neighbors on either side of pPage */ 2142 #define NB (NN*2+1) /* Total pages involved in the balance */ 2143 2144 /* 2145 ** This routine redistributes Cells on pPage and up to two siblings 2146 ** of pPage so that all pages have about the same amount of free space. 2147 ** Usually one sibling on either side of pPage is used in the balancing, 2148 ** though both siblings might come from one side if pPage is the first 2149 ** or last child of its parent. If pPage has fewer than two siblings 2150 ** (something which can only happen if pPage is the root page or a 2151 ** child of root) then all available siblings participate in the balancing. 2152 ** 2153 ** The number of siblings of pPage might be increased or decreased by 2154 ** one in an effort to keep pages between 66% and 100% full. The root page 2155 ** is special and is allowed to be less than 66% full. If pPage is 2156 ** the root page, then the depth of the tree might be increased 2157 ** or decreased by one, as necessary, to keep the root page from being 2158 ** overfull or empty. 2159 ** 2160 ** This routine calls relinkCellList() on its input page regardless of 2161 ** whether or not it does any real balancing. Client routines will typically 2162 ** invoke insertCell() or dropCell() before calling this routine, so we 2163 ** need to call relinkCellList() to clean up the mess that those other 2164 ** routines left behind. 2165 ** 2166 ** pCur is left pointing to the same cell as when this routine was called 2167 ** even if that cell gets moved to a different page. pCur may be NULL. 2168 ** Set the pCur parameter to NULL if you do not care about keeping track 2169 ** of a cell as that will save this routine the work of keeping track of it. 2170 ** 2171 ** Note that when this routine is called, some of the Cells on pPage 2172 ** might not actually be stored in pPage->u.aDisk[]. This can happen 2173 ** if the page is overfull. Part of the job of this routine is to 2174 ** make sure all Cells for pPage once again fit in pPage->u.aDisk[]. 2175 ** 2176 ** In the course of balancing the siblings of pPage, the parent of pPage 2177 ** might become overfull or underfull. If that happens, then this routine 2178 ** is called recursively on the parent. 2179 ** 2180 ** If this routine fails for any reason, it might leave the database 2181 ** in a corrupted state. So if this routine fails, the database should 2182 ** be rolled back. 2183 */ 2184 static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){ 2185 MemPage *pParent; /* The parent of pPage */ 2186 int nCell; /* Number of cells in apCell[] */ 2187 int nOld; /* Number of pages in apOld[] */ 2188 int nNew; /* Number of pages in apNew[] */ 2189 int nDiv; /* Number of cells in apDiv[] */ 2190 int i, j, k; /* Loop counters */ 2191 int idx; /* Index of pPage in pParent->apCell[] */ 2192 int nxDiv; /* Next divider slot in pParent->apCell[] */ 2193 int rc; /* The return code */ 2194 int iCur; /* apCell[iCur] is the cell of the cursor */ 2195 MemPage *pOldCurPage; /* The cursor originally points to this page */ 2196 int subtotal; /* Subtotal of bytes in cells on one page */ 2197 MemPage *extraUnref = 0; /* A page that needs to be unref-ed */ 2198 MemPage *apOld[NB]; /* pPage and up to two siblings */ 2199 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ 2200 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */ 2201 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */ 2202 int idxDiv[NB]; /* Indices of divider cells in pParent */ 2203 Cell *apDiv[NB]; /* Divider cells in pParent */ 2204 Cell aTemp[NB]; /* Temporary holding area for apDiv[] */ 2205 int cntNew[NB+1]; /* Index in apCell[] of cell after i-th page */ 2206 int szNew[NB+1]; /* Combined size of cells place on i-th page */ 2207 MemPage aOld[NB]; /* Temporary copies of pPage and its siblings */ 2208 Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */ 2209 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */ 2210 2211 /* 2212 ** Return without doing any work if pPage is neither overfull nor 2213 ** underfull. 2214 */ 2215 assert( sqlitepager_iswriteable(pPage) ); 2216 if( !pPage->isOverfull && pPage->nFree<SQLITE_USABLE_SIZE/2 2217 && pPage->nCell>=2){ 2218 relinkCellList(pBt, pPage); 2219 return SQLITE_OK; 2220 } 2221 2222 /* 2223 ** Find the parent of the page to be balanceed. 2224 ** If there is no parent, it means this page is the root page and 2225 ** special rules apply. 2226 */ 2227 pParent = pPage->pParent; 2228 if( pParent==0 ){ 2229 Pgno pgnoChild; 2230 MemPage *pChild; 2231 assert( pPage->isInit ); 2232 if( pPage->nCell==0 ){ 2233 if( pPage->u.hdr.rightChild ){ 2234 /* 2235 ** The root page is empty. Copy the one child page 2236 ** into the root page and return. This reduces the depth 2237 ** of the BTree by one. 2238 */ 2239 pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild); 2240 rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild); 2241 if( rc ) return rc; 2242 memcpy(pPage, pChild, SQLITE_USABLE_SIZE); 2243 pPage->isInit = 0; 2244 rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0); 2245 assert( rc==SQLITE_OK ); 2246 reparentChildPages(pBt, pPage); 2247 if( pCur && pCur->pPage==pChild ){ 2248 sqlitepager_unref(pChild); 2249 pCur->pPage = pPage; 2250 sqlitepager_ref(pPage); 2251 } 2252 freePage(pBt, pChild, pgnoChild); 2253 sqlitepager_unref(pChild); 2254 }else{ 2255 relinkCellList(pBt, pPage); 2256 } 2257 return SQLITE_OK; 2258 } 2259 if( !pPage->isOverfull ){ 2260 /* It is OK for the root page to be less than half full. 2261 */ 2262 relinkCellList(pBt, pPage); 2263 return SQLITE_OK; 2264 } 2265 /* 2266 ** If we get to here, it means the root page is overfull. 2267 ** When this happens, Create a new child page and copy the 2268 ** contents of the root into the child. Then make the root 2269 ** page an empty page with rightChild pointing to the new 2270 ** child. Then fall thru to the code below which will cause 2271 ** the overfull child page to be split. 2272 */ 2273 rc = sqlitepager_write(pPage); 2274 if( rc ) return rc; 2275 rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage)); 2276 if( rc ) return rc; 2277 assert( sqlitepager_iswriteable(pChild) ); 2278 copyPage(pChild, pPage); 2279 pChild->pParent = pPage; 2280 pChild->idxParent = 0; 2281 sqlitepager_ref(pPage); 2282 pChild->isOverfull = 1; 2283 if( pCur && pCur->pPage==pPage ){ 2284 sqlitepager_unref(pPage); 2285 pCur->pPage = pChild; 2286 }else{ 2287 extraUnref = pChild; 2288 } 2289 zeroPage(pBt, pPage); 2290 pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild); 2291 pParent = pPage; 2292 pPage = pChild; 2293 } 2294 rc = sqlitepager_write(pParent); 2295 if( rc ) return rc; 2296 assert( pParent->isInit ); 2297 2298 /* 2299 ** Find the Cell in the parent page whose h.leftChild points back 2300 ** to pPage. The "idx" variable is the index of that cell. If pPage 2301 ** is the rightmost child of pParent then set idx to pParent->nCell 2302 */ 2303 if( pParent->idxShift ){ 2304 Pgno pgno, swabPgno; 2305 pgno = sqlitepager_pagenumber(pPage); 2306 swabPgno = SWAB32(pBt, pgno); 2307 for(idx=0; idx<pParent->nCell; idx++){ 2308 if( pParent->apCell[idx]->h.leftChild==swabPgno ){ 2309 break; 2310 } 2311 } 2312 assert( idx<pParent->nCell || pParent->u.hdr.rightChild==swabPgno ); 2313 }else{ 2314 idx = pPage->idxParent; 2315 } 2316 2317 /* 2318 ** Initialize variables so that it will be safe to jump 2319 ** directly to balance_cleanup at any moment. 2320 */ 2321 nOld = nNew = 0; 2322 sqlitepager_ref(pParent); 2323 2324 /* 2325 ** Find sibling pages to pPage and the Cells in pParent that divide 2326 ** the siblings. An attempt is made to find NN siblings on either 2327 ** side of pPage. More siblings are taken from one side, however, if 2328 ** pPage there are fewer than NN siblings on the other side. If pParent 2329 ** has NB or fewer children then all children of pParent are taken. 2330 */ 2331 nxDiv = idx - NN; 2332 if( nxDiv + NB > pParent->nCell ){ 2333 nxDiv = pParent->nCell - NB + 1; 2334 } 2335 if( nxDiv<0 ){ 2336 nxDiv = 0; 2337 } 2338 nDiv = 0; 2339 for(i=0, k=nxDiv; i<NB; i++, k++){ 2340 if( k<pParent->nCell ){ 2341 idxDiv[i] = k; 2342 apDiv[i] = pParent->apCell[k]; 2343 nDiv++; 2344 pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild); 2345 }else if( k==pParent->nCell ){ 2346 pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild); 2347 }else{ 2348 break; 2349 } 2350 rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]); 2351 if( rc ) goto balance_cleanup; 2352 rc = initPage(pBt, apOld[i], pgnoOld[i], pParent); 2353 if( rc ) goto balance_cleanup; 2354 apOld[i]->idxParent = k; 2355 nOld++; 2356 } 2357 2358 /* 2359 ** Set iCur to be the index in apCell[] of the cell that the cursor 2360 ** is pointing to. We will need this later on in order to keep the 2361 ** cursor pointing at the same cell. If pCur points to a page that 2362 ** has no involvement with this rebalancing, then set iCur to a large 2363 ** number so that the iCur==j tests always fail in the main cell 2364 ** distribution loop below. 2365 */ 2366 if( pCur ){ 2367 iCur = 0; 2368 for(i=0; i<nOld; i++){ 2369 if( pCur->pPage==apOld[i] ){ 2370 iCur += pCur->idx; 2371 break; 2372 } 2373 iCur += apOld[i]->nCell; 2374 if( i<nOld-1 && pCur->pPage==pParent && pCur->idx==idxDiv[i] ){ 2375 break; 2376 } 2377 iCur++; 2378 } 2379 pOldCurPage = pCur->pPage; 2380 } 2381 2382 /* 2383 ** Make copies of the content of pPage and its siblings into aOld[]. 2384 ** The rest of this function will use data from the copies rather 2385 ** that the original pages since the original pages will be in the 2386 ** process of being overwritten. 2387 */ 2388 for(i=0; i<nOld; i++){ 2389 copyPage(&aOld[i], apOld[i]); 2390 } 2391 2392 /* 2393 ** Load pointers to all cells on sibling pages and the divider cells 2394 ** into the local apCell[] array. Make copies of the divider cells 2395 ** into aTemp[] and remove the the divider Cells from pParent. 2396 */ 2397 nCell = 0; 2398 for(i=0; i<nOld; i++){ 2399 MemPage *pOld = &aOld[i]; 2400 for(j=0; j<pOld->nCell; j++){ 2401 apCell[nCell] = pOld->apCell[j]; 2402 szCell[nCell] = cellSize(pBt, apCell[nCell]); 2403 nCell++; 2404 } 2405 if( i<nOld-1 ){ 2406 szCell[nCell] = cellSize(pBt, apDiv[i]); 2407 memcpy(&aTemp[i], apDiv[i], szCell[nCell]); 2408 apCell[nCell] = &aTemp[i]; 2409 dropCell(pBt, pParent, nxDiv, szCell[nCell]); 2410 assert( SWAB32(pBt, apCell[nCell]->h.leftChild)==pgnoOld[i] ); 2411 apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild; 2412 nCell++; 2413 } 2414 } 2415 2416 /* 2417 ** Figure out the number of pages needed to hold all nCell cells. 2418 ** Store this number in "k". Also compute szNew[] which is the total 2419 ** size of all cells on the i-th page and cntNew[] which is the index 2420 ** in apCell[] of the cell that divides path i from path i+1. 2421 ** cntNew[k] should equal nCell. 2422 ** 2423 ** This little patch of code is critical for keeping the tree 2424 ** balanced. 2425 */ 2426 for(subtotal=k=i=0; i<nCell; i++){ 2427 subtotal += szCell[i]; 2428 if( subtotal > USABLE_SPACE ){ 2429 szNew[k] = subtotal - szCell[i]; 2430 cntNew[k] = i; 2431 subtotal = 0; 2432 k++; 2433 } 2434 } 2435 szNew[k] = subtotal; 2436 cntNew[k] = nCell; 2437 k++; 2438 for(i=k-1; i>0; i--){ 2439 while( szNew[i]<USABLE_SPACE/2 ){ 2440 cntNew[i-1]--; 2441 assert( cntNew[i-1]>0 ); 2442 szNew[i] += szCell[cntNew[i-1]]; 2443 szNew[i-1] -= szCell[cntNew[i-1]-1]; 2444 } 2445 } 2446 assert( cntNew[0]>0 ); 2447 2448 /* 2449 ** Allocate k new pages. Reuse old pages where possible. 2450 */ 2451 for(i=0; i<k; i++){ 2452 if( i<nOld ){ 2453 apNew[i] = apOld[i]; 2454 pgnoNew[i] = pgnoOld[i]; 2455 apOld[i] = 0; 2456 rc = sqlitepager_write(apNew[i]); 2457 if( rc ) goto balance_cleanup; 2458 }else{ 2459 rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]); 2460 if( rc ) goto balance_cleanup; 2461 } 2462 nNew++; 2463 zeroPage(pBt, apNew[i]); 2464 apNew[i]->isInit = 1; 2465 } 2466 2467 /* Free any old pages that were not reused as new pages. 2468 */ 2469 while( i<nOld ){ 2470 rc = freePage(pBt, apOld[i], pgnoOld[i]); 2471 if( rc ) goto balance_cleanup; 2472 sqlitepager_unref(apOld[i]); 2473 apOld[i] = 0; 2474 i++; 2475 } 2476 2477 /* 2478 ** Put the new pages in accending order. This helps to 2479 ** keep entries in the disk file in order so that a scan 2480 ** of the table is a linear scan through the file. That 2481 ** in turn helps the operating system to deliver pages 2482 ** from the disk more rapidly. 2483 ** 2484 ** An O(n^2) insertion sort algorithm is used, but since 2485 ** n is never more than NB (a small constant), that should 2486 ** not be a problem. 2487 ** 2488 ** When NB==3, this one optimization makes the database 2489 ** about 25% faster for large insertions and deletions. 2490 */ 2491 for(i=0; i<k-1; i++){ 2492 int minV = pgnoNew[i]; 2493 int minI = i; 2494 for(j=i+1; j<k; j++){ 2495 if( pgnoNew[j]<(unsigned)minV ){ 2496 minI = j; 2497 minV = pgnoNew[j]; 2498 } 2499 } 2500 if( minI>i ){ 2501 int t; 2502 MemPage *pT; 2503 t = pgnoNew[i]; 2504 pT = apNew[i]; 2505 pgnoNew[i] = pgnoNew[minI]; 2506 apNew[i] = apNew[minI]; 2507 pgnoNew[minI] = t; 2508 apNew[minI] = pT; 2509 } 2510 } 2511 2512 /* 2513 ** Evenly distribute the data in apCell[] across the new pages. 2514 ** Insert divider cells into pParent as necessary. 2515 */ 2516 j = 0; 2517 for(i=0; i<nNew; i++){ 2518 MemPage *pNew = apNew[i]; 2519 while( j<cntNew[i] ){ 2520 assert( pNew->nFree>=szCell[j] ); 2521 if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; } 2522 insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]); 2523 j++; 2524 } 2525 assert( pNew->nCell>0 ); 2526 assert( !pNew->isOverfull ); 2527 relinkCellList(pBt, pNew); 2528 if( i<nNew-1 && j<nCell ){ 2529 pNew->u.hdr.rightChild = apCell[j]->h.leftChild; 2530 apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]); 2531 if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; } 2532 insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]); 2533 j++; 2534 nxDiv++; 2535 } 2536 } 2537 assert( j==nCell ); 2538 apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild; 2539 if( nxDiv==pParent->nCell ){ 2540 pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]); 2541 }else{ 2542 pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]); 2543 } 2544 if( pCur ){ 2545 if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){ 2546 assert( pCur->pPage==pOldCurPage ); 2547 pCur->idx += nNew - nOld; 2548 }else{ 2549 assert( pOldCurPage!=0 ); 2550 sqlitepager_ref(pCur->pPage); 2551 sqlitepager_unref(pOldCurPage); 2552 } 2553 } 2554 2555 /* 2556 ** Reparent children of all cells. 2557 */ 2558 for(i=0; i<nNew; i++){ 2559 reparentChildPages(pBt, apNew[i]); 2560 } 2561 reparentChildPages(pBt, pParent); 2562 2563 /* 2564 ** balance the parent page. 2565 */ 2566 rc = balance(pBt, pParent, pCur); 2567 2568 /* 2569 ** Cleanup before returning. 2570 */ 2571 balance_cleanup: 2572 if( extraUnref ){ 2573 sqlitepager_unref(extraUnref); 2574 } 2575 for(i=0; i<nOld; i++){ 2576 if( apOld[i]!=0 && apOld[i]!=&aOld[i] ) sqlitepager_unref(apOld[i]); 2577 } 2578 for(i=0; i<nNew; i++){ 2579 sqlitepager_unref(apNew[i]); 2580 } 2581 if( pCur && pCur->pPage==0 ){ 2582 pCur->pPage = pParent; 2583 pCur->idx = 0; 2584 }else{ 2585 sqlitepager_unref(pParent); 2586 } 2587 return rc; 2588 } 2589 2590 /* 2591 ** This routine checks all cursors that point to the same table 2592 ** as pCur points to. If any of those cursors were opened with 2593 ** wrFlag==0 then this routine returns SQLITE_LOCKED. If all 2594 ** cursors point to the same table were opened with wrFlag==1 2595 ** then this routine returns SQLITE_OK. 2596 ** 2597 ** In addition to checking for read-locks (where a read-lock 2598 ** means a cursor opened with wrFlag==0) this routine also moves 2599 ** all cursors other than pCur so that they are pointing to the 2600 ** first Cell on root page. This is necessary because an insert 2601 ** or delete might change the number of cells on a page or delete 2602 ** a page entirely and we do not want to leave any cursors 2603 ** pointing to non-existant pages or cells. 2604 */ 2605 static int checkReadLocks(BtCursor *pCur){ 2606 BtCursor *p; 2607 assert( pCur->wrFlag ); 2608 for(p=pCur->pShared; p!=pCur; p=p->pShared){ 2609 assert( p ); 2610 assert( p->pgnoRoot==pCur->pgnoRoot ); 2611 if( p->wrFlag==0 ) return SQLITE_LOCKED; 2612 if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){ 2613 moveToRoot(p); 2614 } 2615 } 2616 return SQLITE_OK; 2617 } 2618 2619 /* 2620 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 2621 ** and the data is given by (pData,nData). The cursor is used only to 2622 ** define what database the record should be inserted into. The cursor 2623 ** is left pointing at the new record. 2624 */ 2625 static int fileBtreeInsert( 2626 BtCursor *pCur, /* Insert data into the table of this cursor */ 2627 const void *pKey, int nKey, /* The key of the new record */ 2628 const void *pData, int nData /* The data of the new record */ 2629 ){ 2630 Cell newCell; 2631 int rc; 2632 int loc; 2633 int szNew; 2634 MemPage *pPage; 2635 Btree *pBt = pCur->pBt; 2636 2637 if( pCur->pPage==0 ){ 2638 return SQLITE_ABORT; /* A rollback destroyed this cursor */ 2639 } 2640 if( !pBt->inTrans || nKey+nData==0 ){ 2641 /* Must start a transaction before doing an insert */ 2642 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2643 } 2644 assert( !pBt->readOnly ); 2645 if( !pCur->wrFlag ){ 2646 return SQLITE_PERM; /* Cursor not open for writing */ 2647 } 2648 if( checkReadLocks(pCur) ){ 2649 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 2650 } 2651 rc = fileBtreeMoveto(pCur, pKey, nKey, &loc); 2652 if( rc ) return rc; 2653 pPage = pCur->pPage; 2654 assert( pPage->isInit ); 2655 rc = sqlitepager_write(pPage); 2656 if( rc ) return rc; 2657 rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData); 2658 if( rc ) return rc; 2659 szNew = cellSize(pBt, &newCell); 2660 if( loc==0 ){ 2661 newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild; 2662 rc = clearCell(pBt, pPage->apCell[pCur->idx]); 2663 if( rc ) return rc; 2664 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx])); 2665 }else if( loc<0 && pPage->nCell>0 ){ 2666 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */ 2667 pCur->idx++; 2668 }else{ 2669 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */ 2670 } 2671 insertCell(pBt, pPage, pCur->idx, &newCell, szNew); 2672 rc = balance(pCur->pBt, pPage, pCur); 2673 /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */ 2674 /* fflush(stdout); */ 2675 pCur->eSkip = SKIP_INVALID; 2676 return rc; 2677 } 2678 2679 /* 2680 ** Delete the entry that the cursor is pointing to. 2681 ** 2682 ** The cursor is left pointing at either the next or the previous 2683 ** entry. If the cursor is left pointing to the next entry, then 2684 ** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to 2685 ** sqliteBtreeNext() to be a no-op. That way, you can always call 2686 ** sqliteBtreeNext() after a delete and the cursor will be left 2687 ** pointing to the first entry after the deleted entry. Similarly, 2688 ** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to 2689 ** the entry prior to the deleted entry so that a subsequent call to 2690 ** sqliteBtreePrevious() will always leave the cursor pointing at the 2691 ** entry immediately before the one that was deleted. 2692 */ 2693 static int fileBtreeDelete(BtCursor *pCur){ 2694 MemPage *pPage = pCur->pPage; 2695 Cell *pCell; 2696 int rc; 2697 Pgno pgnoChild; 2698 Btree *pBt = pCur->pBt; 2699 2700 assert( pPage->isInit ); 2701 if( pCur->pPage==0 ){ 2702 return SQLITE_ABORT; /* A rollback destroyed this cursor */ 2703 } 2704 if( !pBt->inTrans ){ 2705 /* Must start a transaction before doing a delete */ 2706 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2707 } 2708 assert( !pBt->readOnly ); 2709 if( pCur->idx >= pPage->nCell ){ 2710 return SQLITE_ERROR; /* The cursor is not pointing to anything */ 2711 } 2712 if( !pCur->wrFlag ){ 2713 return SQLITE_PERM; /* Did not open this cursor for writing */ 2714 } 2715 if( checkReadLocks(pCur) ){ 2716 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 2717 } 2718 rc = sqlitepager_write(pPage); 2719 if( rc ) return rc; 2720 pCell = pPage->apCell[pCur->idx]; 2721 pgnoChild = SWAB32(pBt, pCell->h.leftChild); 2722 rc = clearCell(pBt, pCell); 2723 if( rc ) return rc; 2724 if( pgnoChild ){ 2725 /* 2726 ** The entry we are about to delete is not a leaf so if we do not 2727 ** do something we will leave a hole on an internal page. 2728 ** We have to fill the hole by moving in a cell from a leaf. The 2729 ** next Cell after the one to be deleted is guaranteed to exist and 2730 ** to be a leaf so we can use it. 2731 */ 2732 BtCursor leafCur; 2733 Cell *pNext; 2734 int szNext; 2735 int notUsed; 2736 getTempCursor(pCur, &leafCur); 2737 rc = fileBtreeNext(&leafCur, ¬Used); 2738 if( rc!=SQLITE_OK ){ 2739 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT; 2740 return rc; 2741 } 2742 rc = sqlitepager_write(leafCur.pPage); 2743 if( rc ) return rc; 2744 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell)); 2745 pNext = leafCur.pPage->apCell[leafCur.idx]; 2746 szNext = cellSize(pBt, pNext); 2747 pNext->h.leftChild = SWAB32(pBt, pgnoChild); 2748 insertCell(pBt, pPage, pCur->idx, pNext, szNext); 2749 rc = balance(pBt, pPage, pCur); 2750 if( rc ) return rc; 2751 pCur->eSkip = SKIP_NEXT; 2752 dropCell(pBt, leafCur.pPage, leafCur.idx, szNext); 2753 rc = balance(pBt, leafCur.pPage, pCur); 2754 releaseTempCursor(&leafCur); 2755 }else{ 2756 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell)); 2757 if( pCur->idx>=pPage->nCell ){ 2758 pCur->idx = pPage->nCell-1; 2759 if( pCur->idx<0 ){ 2760 pCur->idx = 0; 2761 pCur->eSkip = SKIP_NEXT; 2762 }else{ 2763 pCur->eSkip = SKIP_PREV; 2764 } 2765 }else{ 2766 pCur->eSkip = SKIP_NEXT; 2767 } 2768 rc = balance(pBt, pPage, pCur); 2769 } 2770 return rc; 2771 } 2772 2773 /* 2774 ** Create a new BTree table. Write into *piTable the page 2775 ** number for the root page of the new table. 2776 ** 2777 ** In the current implementation, BTree tables and BTree indices are the 2778 ** the same. In the future, we may change this so that BTree tables 2779 ** are restricted to having a 4-byte integer key and arbitrary data and 2780 ** BTree indices are restricted to having an arbitrary key and no data. 2781 ** But for now, this routine also serves to create indices. 2782 */ 2783 static int fileBtreeCreateTable(Btree *pBt, int *piTable){ 2784 MemPage *pRoot; 2785 Pgno pgnoRoot; 2786 int rc; 2787 if( !pBt->inTrans ){ 2788 /* Must start a transaction first */ 2789 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2790 } 2791 if( pBt->readOnly ){ 2792 return SQLITE_READONLY; 2793 } 2794 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0); 2795 if( rc ) return rc; 2796 assert( sqlitepager_iswriteable(pRoot) ); 2797 zeroPage(pBt, pRoot); 2798 sqlitepager_unref(pRoot); 2799 *piTable = (int)pgnoRoot; 2800 return SQLITE_OK; 2801 } 2802 2803 /* 2804 ** Erase the given database page and all its children. Return 2805 ** the page to the freelist. 2806 */ 2807 static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){ 2808 MemPage *pPage; 2809 int rc; 2810 Cell *pCell; 2811 int idx; 2812 2813 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage); 2814 if( rc ) return rc; 2815 rc = sqlitepager_write(pPage); 2816 if( rc ) return rc; 2817 rc = initPage(pBt, pPage, pgno, 0); 2818 if( rc ) return rc; 2819 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 2820 while( idx>0 ){ 2821 pCell = (Cell*)&pPage->u.aDisk[idx]; 2822 idx = SWAB16(pBt, pCell->h.iNext); 2823 if( pCell->h.leftChild ){ 2824 rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1); 2825 if( rc ) return rc; 2826 } 2827 rc = clearCell(pBt, pCell); 2828 if( rc ) return rc; 2829 } 2830 if( pPage->u.hdr.rightChild ){ 2831 rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1); 2832 if( rc ) return rc; 2833 } 2834 if( freePageFlag ){ 2835 rc = freePage(pBt, pPage, pgno); 2836 }else{ 2837 zeroPage(pBt, pPage); 2838 } 2839 sqlitepager_unref(pPage); 2840 return rc; 2841 } 2842 2843 /* 2844 ** Delete all information from a single table in the database. 2845 */ 2846 static int fileBtreeClearTable(Btree *pBt, int iTable){ 2847 int rc; 2848 BtCursor *pCur; 2849 if( !pBt->inTrans ){ 2850 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2851 } 2852 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2853 if( pCur->pgnoRoot==(Pgno)iTable ){ 2854 if( pCur->wrFlag==0 ) return SQLITE_LOCKED; 2855 moveToRoot(pCur); 2856 } 2857 } 2858 rc = clearDatabasePage(pBt, (Pgno)iTable, 0); 2859 if( rc ){ 2860 fileBtreeRollback(pBt); 2861 } 2862 return rc; 2863 } 2864 2865 /* 2866 ** Erase all information in a table and add the root of the table to 2867 ** the freelist. Except, the root of the principle table (the one on 2868 ** page 2) is never added to the freelist. 2869 */ 2870 static int fileBtreeDropTable(Btree *pBt, int iTable){ 2871 int rc; 2872 MemPage *pPage; 2873 BtCursor *pCur; 2874 if( !pBt->inTrans ){ 2875 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2876 } 2877 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2878 if( pCur->pgnoRoot==(Pgno)iTable ){ 2879 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */ 2880 } 2881 } 2882 rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage); 2883 if( rc ) return rc; 2884 rc = fileBtreeClearTable(pBt, iTable); 2885 if( rc ) return rc; 2886 if( iTable>2 ){ 2887 rc = freePage(pBt, pPage, iTable); 2888 }else{ 2889 zeroPage(pBt, pPage); 2890 } 2891 sqlitepager_unref(pPage); 2892 return rc; 2893 } 2894 2895 #if 0 /* UNTESTED */ 2896 /* 2897 ** Copy all cell data from one database file into another. 2898 ** pages back the freelist. 2899 */ 2900 static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){ 2901 Pager *pFromPager = pBtFrom->pPager; 2902 OverflowPage *pOvfl; 2903 Pgno ovfl, nextOvfl; 2904 Pgno *pPrev; 2905 int rc = SQLITE_OK; 2906 MemPage *pNew, *pPrevPg; 2907 Pgno new; 2908 2909 if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){ 2910 return SQLITE_OK; 2911 } 2912 pPrev = &pCell->ovfl; 2913 pPrevPg = 0; 2914 ovfl = SWAB32(pBtTo, pCell->ovfl); 2915 while( ovfl && rc==SQLITE_OK ){ 2916 rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl); 2917 if( rc ) return rc; 2918 nextOvfl = SWAB32(pBtFrom, pOvfl->iNext); 2919 rc = allocatePage(pBtTo, &pNew, &new, 0); 2920 if( rc==SQLITE_OK ){ 2921 rc = sqlitepager_write(pNew); 2922 if( rc==SQLITE_OK ){ 2923 memcpy(pNew, pOvfl, SQLITE_USABLE_SIZE); 2924 *pPrev = SWAB32(pBtTo, new); 2925 if( pPrevPg ){ 2926 sqlitepager_unref(pPrevPg); 2927 } 2928 pPrev = &pOvfl->iNext; 2929 pPrevPg = pNew; 2930 } 2931 } 2932 sqlitepager_unref(pOvfl); 2933 ovfl = nextOvfl; 2934 } 2935 if( pPrevPg ){ 2936 sqlitepager_unref(pPrevPg); 2937 } 2938 return rc; 2939 } 2940 #endif 2941 2942 2943 #if 0 /* UNTESTED */ 2944 /* 2945 ** Copy a page of data from one database over to another. 2946 */ 2947 static int copyDatabasePage( 2948 Btree *pBtFrom, 2949 Pgno pgnoFrom, 2950 Btree *pBtTo, 2951 Pgno *pTo 2952 ){ 2953 MemPage *pPageFrom, *pPage; 2954 Pgno to; 2955 int rc; 2956 Cell *pCell; 2957 int idx; 2958 2959 rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom); 2960 if( rc ) return rc; 2961 rc = allocatePage(pBt, &pPage, pTo, 0); 2962 if( rc==SQLITE_OK ){ 2963 rc = sqlitepager_write(pPage); 2964 } 2965 if( rc==SQLITE_OK ){ 2966 memcpy(pPage, pPageFrom, SQLITE_USABLE_SIZE); 2967 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 2968 while( idx>0 ){ 2969 pCell = (Cell*)&pPage->u.aDisk[idx]; 2970 idx = SWAB16(pBt, pCell->h.iNext); 2971 if( pCell->h.leftChild ){ 2972 Pgno newChld; 2973 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild), 2974 pBtTo, &newChld); 2975 if( rc ) return rc; 2976 pCell->h.leftChild = SWAB32(pBtFrom, newChld); 2977 } 2978 rc = copyCell(pBtFrom, pBtTo, pCell); 2979 if( rc ) return rc; 2980 } 2981 if( pPage->u.hdr.rightChild ){ 2982 Pgno newChld; 2983 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild), 2984 pBtTo, &newChld); 2985 if( rc ) return rc; 2986 pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild); 2987 } 2988 } 2989 sqlitepager_unref(pPage); 2990 return rc; 2991 } 2992 #endif 2993 2994 /* 2995 ** Read the meta-information out of a database file. 2996 */ 2997 static int fileBtreeGetMeta(Btree *pBt, int *aMeta){ 2998 PageOne *pP1; 2999 int rc; 3000 int i; 3001 3002 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1); 3003 if( rc ) return rc; 3004 aMeta[0] = SWAB32(pBt, pP1->nFree); 3005 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){ 3006 aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]); 3007 } 3008 sqlitepager_unref(pP1); 3009 return SQLITE_OK; 3010 } 3011 3012 /* 3013 ** Write meta-information back into the database. 3014 */ 3015 static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){ 3016 PageOne *pP1; 3017 int rc, i; 3018 if( !pBt->inTrans ){ 3019 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 3020 } 3021 pP1 = pBt->page1; 3022 rc = sqlitepager_write(pP1); 3023 if( rc ) return rc; 3024 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){ 3025 pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]); 3026 } 3027 return SQLITE_OK; 3028 } 3029 3030 /****************************************************************************** 3031 ** The complete implementation of the BTree subsystem is above this line. 3032 ** All the code the follows is for testing and troubleshooting the BTree 3033 ** subsystem. None of the code that follows is used during normal operation. 3034 ******************************************************************************/ 3035 3036 /* 3037 ** Print a disassembly of the given page on standard output. This routine 3038 ** is used for debugging and testing only. 3039 */ 3040 #ifdef SQLITE_TEST 3041 static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){ 3042 int rc; 3043 MemPage *pPage; 3044 int i, j; 3045 int nFree; 3046 u16 idx; 3047 char range[20]; 3048 unsigned char payload[20]; 3049 rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage); 3050 if( rc ){ 3051 return rc; 3052 } 3053 if( recursive ) printf("PAGE %d:\n", pgno); 3054 i = 0; 3055 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 3056 while( idx>0 && idx<=SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){ 3057 Cell *pCell = (Cell*)&pPage->u.aDisk[idx]; 3058 int sz = cellSize(pBt, pCell); 3059 sprintf(range,"%d..%d", idx, idx+sz-1); 3060 sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); 3061 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1; 3062 memcpy(payload, pCell->aPayload, sz); 3063 for(j=0; j<sz; j++){ 3064 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.'; 3065 } 3066 payload[sz] = 0; 3067 printf( 3068 "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n", 3069 i, range, (int)pCell->h.leftChild, 3070 NKEY(pBt, pCell->h), NDATA(pBt, pCell->h), 3071 payload 3072 ); 3073 if( pPage->isInit && pPage->apCell[i]!=pCell ){ 3074 printf("**** apCell[%d] does not match on prior entry ****\n", i); 3075 } 3076 i++; 3077 idx = SWAB16(pBt, pCell->h.iNext); 3078 } 3079 if( idx!=0 ){ 3080 printf("ERROR: next cell index out of range: %d\n", idx); 3081 } 3082 printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild)); 3083 nFree = 0; 3084 i = 0; 3085 idx = SWAB16(pBt, pPage->u.hdr.firstFree); 3086 while( idx>0 && idx<SQLITE_USABLE_SIZE ){ 3087 FreeBlk *p = (FreeBlk*)&pPage->u.aDisk[idx]; 3088 sprintf(range,"%d..%d", idx, idx+p->iSize-1); 3089 nFree += SWAB16(pBt, p->iSize); 3090 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n", 3091 i, range, SWAB16(pBt, p->iSize), nFree); 3092 idx = SWAB16(pBt, p->iNext); 3093 i++; 3094 } 3095 if( idx!=0 ){ 3096 printf("ERROR: next freeblock index out of range: %d\n", idx); 3097 } 3098 if( recursive && pPage->u.hdr.rightChild!=0 ){ 3099 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 3100 while( idx>0 && idx<SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){ 3101 Cell *pCell = (Cell*)&pPage->u.aDisk[idx]; 3102 fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1); 3103 idx = SWAB16(pBt, pCell->h.iNext); 3104 } 3105 fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1); 3106 } 3107 sqlitepager_unref(pPage); 3108 return SQLITE_OK; 3109 } 3110 #endif 3111 3112 #ifdef SQLITE_TEST 3113 /* 3114 ** Fill aResult[] with information about the entry and page that the 3115 ** cursor is pointing to. 3116 ** 3117 ** aResult[0] = The page number 3118 ** aResult[1] = The entry number 3119 ** aResult[2] = Total number of entries on this page 3120 ** aResult[3] = Size of this entry 3121 ** aResult[4] = Number of free bytes on this page 3122 ** aResult[5] = Number of free blocks on the page 3123 ** aResult[6] = Page number of the left child of this entry 3124 ** aResult[7] = Page number of the right child for the whole page 3125 ** 3126 ** This routine is used for testing and debugging only. 3127 */ 3128 static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){ 3129 int cnt, idx; 3130 MemPage *pPage = pCur->pPage; 3131 Btree *pBt = pCur->pBt; 3132 aResult[0] = sqlitepager_pagenumber(pPage); 3133 aResult[1] = pCur->idx; 3134 aResult[2] = pPage->nCell; 3135 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){ 3136 aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]); 3137 aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild); 3138 }else{ 3139 aResult[3] = 0; 3140 aResult[6] = 0; 3141 } 3142 aResult[4] = pPage->nFree; 3143 cnt = 0; 3144 idx = SWAB16(pBt, pPage->u.hdr.firstFree); 3145 while( idx>0 && idx<SQLITE_USABLE_SIZE ){ 3146 cnt++; 3147 idx = SWAB16(pBt, ((FreeBlk*)&pPage->u.aDisk[idx])->iNext); 3148 } 3149 aResult[5] = cnt; 3150 aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild); 3151 return SQLITE_OK; 3152 } 3153 #endif 3154 3155 /* 3156 ** Return the pager associated with a BTree. This routine is used for 3157 ** testing and debugging only. 3158 */ 3159 static Pager *fileBtreePager(Btree *pBt){ 3160 return pBt->pPager; 3161 } 3162 3163 /* 3164 ** This structure is passed around through all the sanity checking routines 3165 ** in order to keep track of some global state information. 3166 */ 3167 typedef struct IntegrityCk IntegrityCk; 3168 struct IntegrityCk { 3169 Btree *pBt; /* The tree being checked out */ 3170 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */ 3171 int nPage; /* Number of pages in the database */ 3172 int *anRef; /* Number of times each page is referenced */ 3173 char *zErrMsg; /* An error message. NULL of no errors seen. */ 3174 }; 3175 3176 /* 3177 ** Append a message to the error message string. 3178 */ 3179 static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){ 3180 if( pCheck->zErrMsg ){ 3181 char *zOld = pCheck->zErrMsg; 3182 pCheck->zErrMsg = 0; 3183 sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0); 3184 sqliteFree(zOld); 3185 }else{ 3186 sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0); 3187 } 3188 } 3189 3190 /* 3191 ** Add 1 to the reference count for page iPage. If this is the second 3192 ** reference to the page, add an error message to pCheck->zErrMsg. 3193 ** Return 1 if there are 2 ore more references to the page and 0 if 3194 ** if this is the first reference to the page. 3195 ** 3196 ** Also check that the page number is in bounds. 3197 */ 3198 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ 3199 if( iPage==0 ) return 1; 3200 if( iPage>pCheck->nPage || iPage<0 ){ 3201 char zBuf[100]; 3202 sprintf(zBuf, "invalid page number %d", iPage); 3203 checkAppendMsg(pCheck, zContext, zBuf); 3204 return 1; 3205 } 3206 if( pCheck->anRef[iPage]==1 ){ 3207 char zBuf[100]; 3208 sprintf(zBuf, "2nd reference to page %d", iPage); 3209 checkAppendMsg(pCheck, zContext, zBuf); 3210 return 1; 3211 } 3212 return (pCheck->anRef[iPage]++)>1; 3213 } 3214 3215 /* 3216 ** Check the integrity of the freelist or of an overflow page list. 3217 ** Verify that the number of pages on the list is N. 3218 */ 3219 static void checkList( 3220 IntegrityCk *pCheck, /* Integrity checking context */ 3221 int isFreeList, /* True for a freelist. False for overflow page list */ 3222 int iPage, /* Page number for first page in the list */ 3223 int N, /* Expected number of pages in the list */ 3224 char *zContext /* Context for error messages */ 3225 ){ 3226 int i; 3227 char zMsg[100]; 3228 while( N-- > 0 ){ 3229 OverflowPage *pOvfl; 3230 if( iPage<1 ){ 3231 sprintf(zMsg, "%d pages missing from overflow list", N+1); 3232 checkAppendMsg(pCheck, zContext, zMsg); 3233 break; 3234 } 3235 if( checkRef(pCheck, iPage, zContext) ) break; 3236 if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){ 3237 sprintf(zMsg, "failed to get page %d", iPage); 3238 checkAppendMsg(pCheck, zContext, zMsg); 3239 break; 3240 } 3241 if( isFreeList ){ 3242 FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload; 3243 int n = SWAB32(pCheck->pBt, pInfo->nFree); 3244 for(i=0; i<n; i++){ 3245 checkRef(pCheck, SWAB32(pCheck->pBt, pInfo->aFree[i]), zContext); 3246 } 3247 N -= n; 3248 } 3249 iPage = SWAB32(pCheck->pBt, pOvfl->iNext); 3250 sqlitepager_unref(pOvfl); 3251 } 3252 } 3253 3254 /* 3255 ** Return negative if zKey1<zKey2. 3256 ** Return zero if zKey1==zKey2. 3257 ** Return positive if zKey1>zKey2. 3258 */ 3259 static int keyCompare( 3260 const char *zKey1, int nKey1, 3261 const char *zKey2, int nKey2 3262 ){ 3263 int min = nKey1>nKey2 ? nKey2 : nKey1; 3264 int c = memcmp(zKey1, zKey2, min); 3265 if( c==0 ){ 3266 c = nKey1 - nKey2; 3267 } 3268 return c; 3269 } 3270 3271 /* 3272 ** Do various sanity checks on a single page of a tree. Return 3273 ** the tree depth. Root pages return 0. Parents of root pages 3274 ** return 1, and so forth. 3275 ** 3276 ** These checks are done: 3277 ** 3278 ** 1. Make sure that cells and freeblocks do not overlap 3279 ** but combine to completely cover the page. 3280 ** 2. Make sure cell keys are in order. 3281 ** 3. Make sure no key is less than or equal to zLowerBound. 3282 ** 4. Make sure no key is greater than or equal to zUpperBound. 3283 ** 5. Check the integrity of overflow pages. 3284 ** 6. Recursively call checkTreePage on all children. 3285 ** 7. Verify that the depth of all children is the same. 3286 ** 8. Make sure this page is at least 33% full or else it is 3287 ** the root of the tree. 3288 */ 3289 static int checkTreePage( 3290 IntegrityCk *pCheck, /* Context for the sanity check */ 3291 int iPage, /* Page number of the page to check */ 3292 MemPage *pParent, /* Parent page */ 3293 char *zParentContext, /* Parent context */ 3294 char *zLowerBound, /* All keys should be greater than this, if not NULL */ 3295 int nLower, /* Number of characters in zLowerBound */ 3296 char *zUpperBound, /* All keys should be less than this, if not NULL */ 3297 int nUpper /* Number of characters in zUpperBound */ 3298 ){ 3299 MemPage *pPage; 3300 int i, rc, depth, d2, pgno; 3301 char *zKey1, *zKey2; 3302 int nKey1, nKey2; 3303 BtCursor cur; 3304 Btree *pBt; 3305 char zMsg[100]; 3306 char zContext[100]; 3307 char hit[SQLITE_USABLE_SIZE]; 3308 3309 /* Check that the page exists 3310 */ 3311 cur.pBt = pBt = pCheck->pBt; 3312 if( iPage==0 ) return 0; 3313 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 3314 sprintf(zContext, "On tree page %d: ", iPage); 3315 if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){ 3316 sprintf(zMsg, "unable to get the page. error code=%d", rc); 3317 checkAppendMsg(pCheck, zContext, zMsg); 3318 return 0; 3319 } 3320 if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){ 3321 sprintf(zMsg, "initPage() returns error code %d", rc); 3322 checkAppendMsg(pCheck, zContext, zMsg); 3323 sqlitepager_unref(pPage); 3324 return 0; 3325 } 3326 3327 /* Check out all the cells. 3328 */ 3329 depth = 0; 3330 if( zLowerBound ){ 3331 zKey1 = sqliteMalloc( nLower+1 ); 3332 memcpy(zKey1, zLowerBound, nLower); 3333 zKey1[nLower] = 0; 3334 }else{ 3335 zKey1 = 0; 3336 } 3337 nKey1 = nLower; 3338 cur.pPage = pPage; 3339 for(i=0; i<pPage->nCell; i++){ 3340 Cell *pCell = pPage->apCell[i]; 3341 int sz; 3342 3343 /* Check payload overflow pages 3344 */ 3345 nKey2 = NKEY(pBt, pCell->h); 3346 sz = nKey2 + NDATA(pBt, pCell->h); 3347 sprintf(zContext, "On page %d cell %d: ", iPage, i); 3348 if( sz>MX_LOCAL_PAYLOAD ){ 3349 int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE; 3350 checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext); 3351 } 3352 3353 /* Check that keys are in the right order 3354 */ 3355 cur.idx = i; 3356 zKey2 = sqliteMallocRaw( nKey2+1 ); 3357 getPayload(&cur, 0, nKey2, zKey2); 3358 if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){ 3359 checkAppendMsg(pCheck, zContext, "Key is out of order"); 3360 } 3361 3362 /* Check sanity of left child page. 3363 */ 3364 pgno = SWAB32(pBt, pCell->h.leftChild); 3365 d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2); 3366 if( i>0 && d2!=depth ){ 3367 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 3368 } 3369 depth = d2; 3370 sqliteFree(zKey1); 3371 zKey1 = zKey2; 3372 nKey1 = nKey2; 3373 } 3374 pgno = SWAB32(pBt, pPage->u.hdr.rightChild); 3375 sprintf(zContext, "On page %d at right child: ", iPage); 3376 checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper); 3377 sqliteFree(zKey1); 3378 3379 /* Check for complete coverage of the page 3380 */ 3381 memset(hit, 0, sizeof(hit)); 3382 memset(hit, 1, sizeof(PageHdr)); 3383 for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){ 3384 Cell *pCell = (Cell*)&pPage->u.aDisk[i]; 3385 int j; 3386 for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++; 3387 i = SWAB16(pBt, pCell->h.iNext); 3388 } 3389 for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){ 3390 FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i]; 3391 int j; 3392 for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++; 3393 i = SWAB16(pBt,pFBlk->iNext); 3394 } 3395 for(i=0; i<SQLITE_USABLE_SIZE; i++){ 3396 if( hit[i]==0 ){ 3397 sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage); 3398 checkAppendMsg(pCheck, zMsg, 0); 3399 break; 3400 }else if( hit[i]>1 ){ 3401 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage); 3402 checkAppendMsg(pCheck, zMsg, 0); 3403 break; 3404 } 3405 } 3406 3407 /* Check that free space is kept to a minimum 3408 */ 3409 #if 0 3410 if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_USABLE_SIZE/4 ){ 3411 sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree, 3412 SQLITE_USABLE_SIZE/3); 3413 checkAppendMsg(pCheck, zContext, zMsg); 3414 } 3415 #endif 3416 3417 sqlitepager_unref(pPage); 3418 return depth; 3419 } 3420 3421 /* 3422 ** This routine does a complete check of the given BTree file. aRoot[] is 3423 ** an array of pages numbers were each page number is the root page of 3424 ** a table. nRoot is the number of entries in aRoot. 3425 ** 3426 ** If everything checks out, this routine returns NULL. If something is 3427 ** amiss, an error message is written into memory obtained from malloc() 3428 ** and a pointer to that error message is returned. The calling function 3429 ** is responsible for freeing the error message when it is done. 3430 */ 3431 char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){ 3432 int i; 3433 int nRef; 3434 IntegrityCk sCheck; 3435 3436 nRef = *sqlitepager_stats(pBt->pPager); 3437 if( lockBtree(pBt)!=SQLITE_OK ){ 3438 return sqliteStrDup("Unable to acquire a read lock on the database"); 3439 } 3440 sCheck.pBt = pBt; 3441 sCheck.pPager = pBt->pPager; 3442 sCheck.nPage = sqlitepager_pagecount(sCheck.pPager); 3443 if( sCheck.nPage==0 ){ 3444 unlockBtreeIfUnused(pBt); 3445 return 0; 3446 } 3447 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 3448 sCheck.anRef[1] = 1; 3449 for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 3450 sCheck.zErrMsg = 0; 3451 3452 /* Check the integrity of the freelist 3453 */ 3454 checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList), 3455 SWAB32(pBt, pBt->page1->nFree), "Main freelist: "); 3456 3457 /* Check all the tables. 3458 */ 3459 for(i=0; i<nRoot; i++){ 3460 if( aRoot[i]==0 ) continue; 3461 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0); 3462 } 3463 3464 /* Make sure every page in the file is referenced 3465 */ 3466 for(i=1; i<=sCheck.nPage; i++){ 3467 if( sCheck.anRef[i]==0 ){ 3468 char zBuf[100]; 3469 sprintf(zBuf, "Page %d is never used", i); 3470 checkAppendMsg(&sCheck, zBuf, 0); 3471 } 3472 } 3473 3474 /* Make sure this analysis did not leave any unref() pages 3475 */ 3476 unlockBtreeIfUnused(pBt); 3477 if( nRef != *sqlitepager_stats(pBt->pPager) ){ 3478 char zBuf[100]; 3479 sprintf(zBuf, 3480 "Outstanding page count goes from %d to %d during this analysis", 3481 nRef, *sqlitepager_stats(pBt->pPager) 3482 ); 3483 checkAppendMsg(&sCheck, zBuf, 0); 3484 } 3485 3486 /* Clean up and report errors. 3487 */ 3488 sqliteFree(sCheck.anRef); 3489 return sCheck.zErrMsg; 3490 } 3491 3492 /* 3493 ** Return the full pathname of the underlying database file. 3494 */ 3495 static const char *fileBtreeGetFilename(Btree *pBt){ 3496 assert( pBt->pPager!=0 ); 3497 return sqlitepager_filename(pBt->pPager); 3498 } 3499 3500 /* 3501 ** Copy the complete content of pBtFrom into pBtTo. A transaction 3502 ** must be active for both files. 3503 ** 3504 ** The size of file pBtFrom may be reduced by this operation. 3505 ** If anything goes wrong, the transaction on pBtFrom is rolled back. 3506 */ 3507 static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){ 3508 int rc = SQLITE_OK; 3509 Pgno i, nPage, nToPage; 3510 3511 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR; 3512 if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR; 3513 if( pBtTo->pCursor ) return SQLITE_BUSY; 3514 memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_USABLE_SIZE); 3515 rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1); 3516 nToPage = sqlitepager_pagecount(pBtTo->pPager); 3517 nPage = sqlitepager_pagecount(pBtFrom->pPager); 3518 for(i=2; rc==SQLITE_OK && i<=nPage; i++){ 3519 void *pPage; 3520 rc = sqlitepager_get(pBtFrom->pPager, i, &pPage); 3521 if( rc ) break; 3522 rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage); 3523 if( rc ) break; 3524 sqlitepager_unref(pPage); 3525 } 3526 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){ 3527 void *pPage; 3528 rc = sqlitepager_get(pBtTo->pPager, i, &pPage); 3529 if( rc ) break; 3530 rc = sqlitepager_write(pPage); 3531 sqlitepager_unref(pPage); 3532 sqlitepager_dont_write(pBtTo->pPager, i); 3533 } 3534 if( !rc && nPage<nToPage ){ 3535 rc = sqlitepager_truncate(pBtTo->pPager, nPage); 3536 } 3537 if( rc ){ 3538 fileBtreeRollback(pBtTo); 3539 } 3540 return rc; 3541 } 3542 3543 /* 3544 ** The following tables contain pointers to all of the interface 3545 ** routines for this implementation of the B*Tree backend. To 3546 ** substitute a different implemention of the backend, one has merely 3547 ** to provide pointers to alternative functions in similar tables. 3548 */ 3549 static BtOps sqliteBtreeOps = { 3550 fileBtreeClose, 3551 fileBtreeSetCacheSize, 3552 fileBtreeSetSafetyLevel, 3553 fileBtreeBeginTrans, 3554 fileBtreeCommit, 3555 fileBtreeRollback, 3556 fileBtreeBeginCkpt, 3557 fileBtreeCommitCkpt, 3558 fileBtreeRollbackCkpt, 3559 fileBtreeCreateTable, 3560 fileBtreeCreateTable, /* Really sqliteBtreeCreateIndex() */ 3561 fileBtreeDropTable, 3562 fileBtreeClearTable, 3563 fileBtreeCursor, 3564 fileBtreeGetMeta, 3565 fileBtreeUpdateMeta, 3566 fileBtreeIntegrityCheck, 3567 fileBtreeGetFilename, 3568 fileBtreeCopyFile, 3569 fileBtreePager, 3570 #ifdef SQLITE_TEST 3571 fileBtreePageDump, 3572 #endif 3573 }; 3574 static BtCursorOps sqliteBtreeCursorOps = { 3575 fileBtreeMoveto, 3576 fileBtreeDelete, 3577 fileBtreeInsert, 3578 fileBtreeFirst, 3579 fileBtreeLast, 3580 fileBtreeNext, 3581 fileBtreePrevious, 3582 fileBtreeKeySize, 3583 fileBtreeKey, 3584 fileBtreeKeyCompare, 3585 fileBtreeDataSize, 3586 fileBtreeData, 3587 fileBtreeCloseCursor, 3588 #ifdef SQLITE_TEST 3589 fileBtreeCursorDump, 3590 #endif 3591 }; 3592