1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 8 #include "config.h" 9 10 #ifndef lint 11 static const char sccsid[] = "@(#)db_region.c 10.53 (Sleepycat) 11/10/98"; 12 #endif /* not lint */ 13 14 #ifndef NO_SYSTEM_INCLUDES 15 #include <sys/types.h> 16 17 #include <errno.h> 18 #include <string.h> 19 #include <unistd.h> 20 #endif 21 22 #include "db_int.h" 23 #include "common_ext.h" 24 25 static int __db_growregion __P((REGINFO *, size_t)); 26 27 /* 28 * __db_rattach -- 29 * Optionally create and attach to a shared memory region. 30 * 31 * PUBLIC: int __db_rattach __P((REGINFO *)); 32 */ 33 int 34 __db_rattach(infop) 35 REGINFO *infop; 36 { 37 RLAYOUT *rlp, rl; 38 size_t grow_region, size; 39 ssize_t nr, nw; 40 u_int32_t flags, mbytes, bytes; 41 u_int8_t *p; 42 int malloc_possible, ret, retry_cnt; 43 44 grow_region = 0; 45 malloc_possible = 1; 46 ret = retry_cnt = 0; 47 48 /* Round off the requested size to the next page boundary. */ 49 DB_ROUNDOFF(infop->size, DB_VMPAGESIZE); 50 51 /* Some architectures have hard limits on the maximum region size. */ 52 #ifdef DB_REGIONSIZE_MAX 53 if (infop->size > DB_REGIONSIZE_MAX) { 54 __db_err(infop->dbenv, "__db_rattach: cache size too large"); 55 return (EINVAL); 56 } 57 #endif 58 59 /* Intialize the return information in the REGINFO structure. */ 60 loop: infop->addr = NULL; 61 infop->fd = -1; 62 infop->segid = INVALID_SEGID; 63 if (infop->name != NULL) { 64 __os_freestr(infop->name); 65 infop->name = NULL; 66 } 67 F_CLR(infop, REGION_CANGROW | REGION_CREATED); 68 69 #ifndef HAVE_SPINLOCKS 70 /* 71 * XXX 72 * Lacking spinlocks, we must have a file descriptor for fcntl(2) 73 * locking, which implies using mmap(2) to map in a regular file. 74 * (Theoretically, we could probably get a file descriptor to lock 75 * other types of shared regions, but I don't see any reason to 76 * bother.) 77 * 78 * Since we may be using shared memory regions, e.g., shmget(2), 79 * and not mmap of regular files, the backing file may be only a 80 * few tens of bytes in length. So, this depends on the ability 81 * to fcntl lock file offsets much larger than the physical file. 82 */ 83 malloc_possible = 0; 84 #endif 85 86 #ifdef __hppa 87 /* 88 * XXX 89 * HP-UX won't permit mutexes to live in anything but shared memory. 90 * Instantiate a shared region file on that architecture, regardless. 91 */ 92 malloc_possible = 0; 93 #endif 94 /* 95 * If a region is truly private, malloc the memory. That's faster 96 * than either anonymous memory or a shared file. 97 */ 98 if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) { 99 if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0) 100 return (ret); 101 102 /* 103 * It's sometimes significantly faster to page-fault in all of 104 * the region's pages before we run the application, as we see 105 * nasty side-effects when we page-fault while holding various 106 * locks, i.e., the lock takes a long time to acquire because 107 * of the underlying page fault, and the other threads convoy 108 * behind the lock holder. 109 */ 110 if (DB_GLOBAL(db_region_init)) 111 for (p = infop->addr; 112 p < (u_int8_t *)infop->addr + infop->size; 113 p += DB_VMPAGESIZE) 114 p[0] = '\0'; 115 116 F_SET(infop, REGION_CREATED | REGION_MALLOC); 117 goto region_init; 118 } 119 120 /* 121 * Get the name of the region (creating the file if a temporary file 122 * is being used). The dbenv contains the current DB environment, 123 * including naming information. The path argument may be a file or 124 * a directory. If path is a directory, it must exist and file is the 125 * file name to be created inside the directory. If path is a file, 126 * then file must be NULL. 127 */ 128 if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path, 129 infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0) 130 return (ret); 131 if (infop->fd != -1) 132 F_SET(infop, REGION_CREATED); 133 134 /* 135 * Try to create the file, if we have authority. We have to make sure 136 * that multiple threads/processes attempting to simultaneously create 137 * the region are properly ordered, so we open it using DB_CREATE and 138 * DB_EXCL, so two attempts to create the region will return failure in 139 * one. 140 */ 141 if (infop->fd == -1 && infop->dbflags & DB_CREATE) { 142 flags = infop->dbflags; 143 LF_SET(DB_EXCL); 144 if ((ret = __db_open(infop->name, 145 flags, flags, infop->mode, &infop->fd)) == 0) 146 F_SET(infop, REGION_CREATED); 147 else 148 if (ret != EEXIST) 149 goto errmsg; 150 } 151 152 /* If we couldn't create the file, try and open it. */ 153 if (infop->fd == -1) { 154 flags = infop->dbflags; 155 LF_CLR(DB_CREATE | DB_EXCL); 156 if ((ret = __db_open(infop->name, 157 flags, flags, infop->mode, &infop->fd)) != 0) 158 goto errmsg; 159 } 160 161 /* 162 * There are three cases we support: 163 * 1. Named anonymous memory (shmget(2)). 164 * 2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS). 165 * 3. Memory backed by a regular file (mmap(2)). 166 * 167 * We instantiate a backing file in all cases, which contains at least 168 * the RLAYOUT structure, and in case #3, contains the actual region. 169 * This is necessary for a couple of reasons: 170 * 171 * First, the mpool region uses temporary files to name regions, and 172 * since you may have multiple regions in the same directory, we need 173 * a filesystem name to ensure that they don't collide. 174 * 175 * Second, applications are allowed to forcibly remove regions, even 176 * if they don't know anything about them other than the name. If a 177 * region is backed by anonymous memory, there has to be some way for 178 * the application to find out that information, and, in some cases, 179 * determine ID information for the anonymous memory. 180 */ 181 if (F_ISSET(infop, REGION_CREATED)) { 182 /* 183 * If we're using anonymous memory to back this region, set 184 * the flag. 185 */ 186 if (DB_GLOBAL(db_region_anon)) 187 F_SET(infop, REGION_ANONYMOUS); 188 189 /* 190 * If we're using a regular file to back a region we created, 191 * grow it to the specified size. 192 */ 193 if (!DB_GLOBAL(db_region_anon) && 194 (ret = __db_growregion(infop, infop->size)) != 0) 195 goto err; 196 } else { 197 /* 198 * If we're joining a region, figure out what it looks like. 199 * 200 * XXX 201 * We have to figure out if the file is a regular file backing 202 * a region that we want to map into our address space, or a 203 * file with the information we need to find a shared anonymous 204 * region that we want to map into our address space. 205 * 206 * All this noise is because some systems don't have a coherent 207 * VM and buffer cache, and worse, if you mix operations on the 208 * VM and buffer cache, half the time you hang the system. 209 * 210 * There are two possibilities. If the file is the size of an 211 * RLAYOUT structure, then we know that the real region is in 212 * shared memory, because otherwise it would be bigger. (As 213 * the RLAYOUT structure size is smaller than a disk sector, 214 * the only way it can be this size is if deliberately written 215 * that way.) In which case, retrieve the information we need 216 * from the RLAYOUT structure and use it to acquire the shared 217 * memory. 218 * 219 * If the structure is larger than an RLAYOUT structure, then 220 * the file is backing the shared memory region, and we use 221 * the current size of the file without reading any information 222 * from the file itself so that we don't confuse the VM. 223 * 224 * And yes, this makes me want to take somebody and kill them, 225 * but I can't think of any other solution. 226 */ 227 if ((ret = __os_ioinfo(infop->name, 228 infop->fd, &mbytes, &bytes, NULL)) != 0) 229 goto errmsg; 230 size = mbytes * MEGABYTE + bytes; 231 232 if (size <= sizeof(RLAYOUT)) { 233 /* 234 * If the size is too small, the read fails or the 235 * valid flag is incorrect, assume it's because the 236 * RLAYOUT information hasn't been written out yet, 237 * and retry. 238 */ 239 if (size < sizeof(RLAYOUT)) 240 goto retry; 241 if ((ret = 242 __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0) 243 goto retry; 244 if (rl.valid != DB_REGIONMAGIC) 245 goto retry; 246 247 /* Copy the size, memory id and characteristics. */ 248 size = rl.size; 249 infop->segid = rl.segid; 250 if (F_ISSET(&rl, REGION_ANONYMOUS)) 251 F_SET(infop, REGION_ANONYMOUS); 252 } 253 254 /* 255 * If the region is larger than we think, that's okay, use the 256 * current size. If it's smaller than we think, and we were 257 * just using the default size, that's okay, use the current 258 * size. If it's smaller than we think and we really care, 259 * save the size and we'll catch that further down -- we can't 260 * correct it here because we have to have a lock to grow the 261 * region. 262 */ 263 if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF)) 264 grow_region = infop->size; 265 infop->size = size; 266 } 267 268 /* 269 * Map the region into our address space. If we're creating it, the 270 * underlying routines will make it the right size. 271 * 272 * There are at least two cases where we can "reasonably" fail when 273 * we attempt to map in the region. On Windows/95, closing the last 274 * reference to a region causes it to be zeroed out. On UNIX, when 275 * using the shmget(2) interfaces, the region will no longer exist 276 * if the system was rebooted. In these cases, the underlying map call 277 * returns EAGAIN, and we *remove* our file and try again. There are 278 * obvious races in doing this, but it should eventually settle down 279 * to a winner and then things should proceed normally. 280 */ 281 if ((ret = __db_mapregion(infop->name, infop)) != 0) 282 if (ret == EAGAIN) { 283 /* 284 * Pretend we created the region even if we didn't so 285 * that our error processing unlinks it. 286 */ 287 F_SET(infop, REGION_CREATED); 288 ret = 0; 289 goto retry; 290 } else 291 goto err; 292 293 region_init: 294 /* 295 * Initialize the common region information. 296 * 297 * !!! 298 * We have to order the region creates so that two processes don't try 299 * to simultaneously create the region. This is handled by using the 300 * DB_CREATE and DB_EXCL flags when we create the "backing" region file. 301 * 302 * We also have to order region joins so that processes joining regions 303 * never see inconsistent data. We'd like to play permissions games 304 * with the backing file, but we can't because WNT filesystems won't 305 * open a file mode 0. 306 */ 307 rlp = (RLAYOUT *)infop->addr; 308 if (F_ISSET(infop, REGION_CREATED)) { 309 /* 310 * The process creating the region acquires a lock before it 311 * sets the valid flag. Any processes joining the region will 312 * check the valid flag before acquiring the lock. 313 * 314 * Check the return of __db_mutex_init() and __db_mutex_lock(), 315 * even though we don't usually check elsewhere. This is the 316 * first lock we initialize and acquire, and we have to know if 317 * it fails. (It CAN fail, e.g., SunOS, when using fcntl(2) 318 * for locking, with an in-memory filesystem specified as the 319 * database home.) 320 */ 321 if ((ret = __db_mutex_init(&rlp->lock, 322 MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 || 323 (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0) 324 goto err; 325 326 /* Initialize the remaining region information. */ 327 rlp->refcnt = 1; 328 rlp->size = infop->size; 329 db_version(&rlp->majver, &rlp->minver, &rlp->patch); 330 rlp->panic = 0; 331 rlp->segid = infop->segid; 332 rlp->flags = 0; 333 if (F_ISSET(infop, REGION_ANONYMOUS)) 334 F_SET(rlp, REGION_ANONYMOUS); 335 336 /* 337 * Fill in the valid field last -- use a magic number, memory 338 * may not be zero-filled, and we want to minimize the chance 339 * for collision. 340 */ 341 rlp->valid = DB_REGIONMAGIC; 342 343 /* 344 * If the region is anonymous, write the RLAYOUT information 345 * into the backing file so that future region join and unlink 346 * calls can find it. 347 * 348 * XXX 349 * We MUST do the seek before we do the write. On Win95, while 350 * closing the last reference to an anonymous shared region 351 * doesn't discard the region, it does zero it out. So, the 352 * REGION_CREATED may be set, but the file may have already 353 * been written and the file descriptor may be at the end of 354 * the file. 355 */ 356 if (F_ISSET(infop, REGION_ANONYMOUS)) { 357 if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0) 358 goto err; 359 if ((ret = 360 __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0) 361 goto err; 362 } 363 } else { 364 /* Check to see if the region has had catastrophic failure. */ 365 if (rlp->panic) { 366 ret = DB_RUNRECOVERY; 367 goto err; 368 } 369 370 /* 371 * Check the valid flag to ensure the region is initialized. 372 * If the valid flag has not been set, the mutex may not have 373 * been initialized, and an attempt to get it could lead to 374 * random behavior. 375 */ 376 if (rlp->valid != DB_REGIONMAGIC) 377 goto retry; 378 379 /* Get the region lock. */ 380 (void)__db_mutex_lock(&rlp->lock, infop->fd); 381 382 /* 383 * We now own the region. There are a couple of things that 384 * may have gone wrong, however. 385 * 386 * Problem #1: while we were waiting for the lock, the region 387 * was deleted. Detected by re-checking the valid flag, since 388 * it's cleared by the delete region routines. 389 */ 390 if (rlp->valid != DB_REGIONMAGIC) { 391 (void)__db_mutex_unlock(&rlp->lock, infop->fd); 392 goto retry; 393 } 394 395 /* 396 * Problem #3: when we checked the size of the file, it was 397 * still growing as part of creation. Detected by the fact 398 * that infop->size isn't the same size as the region. 399 */ 400 if (infop->size != rlp->size) { 401 (void)__db_mutex_unlock(&rlp->lock, infop->fd); 402 goto retry; 403 } 404 405 /* Increment the reference count. */ 406 ++rlp->refcnt; 407 } 408 409 /* Return the region in a locked condition. */ 410 411 if (0) { 412 errmsg: __db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret)); 413 414 err: 415 retry: /* Discard the region. */ 416 if (infop->addr != NULL) { 417 (void)__db_unmapregion(infop); 418 infop->addr = NULL; 419 } 420 421 /* Discard the backing file. */ 422 if (infop->fd != -1) { 423 (void)__os_close(infop->fd); 424 infop->fd = -1; 425 426 if (F_ISSET(infop, REGION_CREATED)) 427 (void)__os_unlink(infop->name); 428 } 429 430 /* Discard the name. */ 431 if (infop->name != NULL) { 432 __os_freestr(infop->name); 433 infop->name = NULL; 434 } 435 436 /* 437 * If we had a temporary error, wait a few seconds and 438 * try again. 439 */ 440 if (ret == 0) { 441 if (++retry_cnt <= 3) { 442 __os_sleep(retry_cnt * 2, 0); 443 goto loop; 444 } 445 ret = EAGAIN; 446 } 447 } 448 449 /* 450 * XXX 451 * HP-UX won't permit mutexes to live in anything but shared memory. 452 * Instantiate a shared region file on that architecture, regardless. 453 * 454 * XXX 455 * There's a problem in cleaning this up on application exit, or on 456 * application failure. If an application opens a database without 457 * an environment, we create a temporary backing mpool region for it. 458 * That region is marked REGION_PRIVATE, but as HP-UX won't permit 459 * mutexes to live in anything but shared memory, we instantiate a 460 * real file plus a memory region of some form. If the application 461 * crashes, the necessary information to delete the backing file and 462 * any system region (e.g., the shmget(2) segment ID) is no longer 463 * available. We can't completely fix the problem, but we try. 464 * 465 * The underlying UNIX __db_mapregion() code preferentially uses the 466 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions 467 * that are marked REGION_PRIVATE. This means that we normally aren't 468 * holding any system resources when we get here, in which case we can 469 * delete the backing file. This results in a short race, from the 470 * __db_open() call above to here. 471 * 472 * If, for some reason, we are holding system resources when we get 473 * here, we don't have any choice -- we can't delete the backing file 474 * because we may need it to detach from the resources. Set the 475 * REGION_LASTDETACH flag, so that we do all necessary cleanup when 476 * the application closes the region. 477 */ 478 if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC)) 479 if (F_ISSET(infop, REGION_HOLDINGSYS)) 480 F_SET(infop, REGION_LASTDETACH); 481 else { 482 F_SET(infop, REGION_REMOVED); 483 F_CLR(infop, REGION_CANGROW); 484 485 (void)__os_close(infop->fd); 486 (void)__os_unlink(infop->name); 487 } 488 489 return (ret); 490 } 491 492 /* 493 * __db_rdetach -- 494 * De-attach from a shared memory region. 495 * 496 * PUBLIC: int __db_rdetach __P((REGINFO *)); 497 */ 498 int 499 __db_rdetach(infop) 500 REGINFO *infop; 501 { 502 RLAYOUT *rlp; 503 int detach, ret, t_ret; 504 505 ret = 0; 506 507 /* 508 * If the region was removed when it was created, no further action 509 * is required. 510 */ 511 if (F_ISSET(infop, REGION_REMOVED)) 512 goto done; 513 /* 514 * If the region was created in memory returned by malloc, the only 515 * action required is freeing the memory. 516 */ 517 if (F_ISSET(infop, REGION_MALLOC)) { 518 __os_free(infop->addr, 0); 519 goto done; 520 } 521 522 /* Otherwise, attach to the region and optionally delete it. */ 523 rlp = infop->addr; 524 525 /* Get the lock. */ 526 (void)__db_mutex_lock(&rlp->lock, infop->fd); 527 528 /* Decrement the reference count. */ 529 if (rlp->refcnt == 0) 530 __db_err(infop->dbenv, 531 "region rdetach: reference count went to zero!"); 532 else 533 --rlp->refcnt; 534 535 /* 536 * If we're going to remove the region, clear the valid flag so 537 * that any region join that's blocked waiting for us will know 538 * what happened. 539 */ 540 detach = 0; 541 if (F_ISSET(infop, REGION_LASTDETACH)) 542 if (rlp->refcnt == 0) { 543 detach = 1; 544 rlp->valid = 0; 545 } else 546 ret = EBUSY; 547 548 /* Release the lock. */ 549 (void)__db_mutex_unlock(&rlp->lock, infop->fd); 550 551 /* Close the backing file descriptor. */ 552 (void)__os_close(infop->fd); 553 infop->fd = -1; 554 555 /* Discard our mapping of the region. */ 556 if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0) 557 ret = t_ret; 558 559 /* Discard the region itself. */ 560 if (detach) { 561 if ((t_ret = 562 __db_unlinkregion(infop->name, infop) != 0) && ret == 0) 563 ret = t_ret; 564 if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0) 565 ret = t_ret; 566 } 567 568 done: /* Discard the name. */ 569 if (infop->name != NULL) { 570 __os_freestr(infop->name); 571 infop->name = NULL; 572 } 573 574 return (ret); 575 } 576 577 /* 578 * __db_runlink -- 579 * Remove a region. 580 * 581 * PUBLIC: int __db_runlink __P((REGINFO *, int)); 582 */ 583 int 584 __db_runlink(infop, force) 585 REGINFO *infop; 586 int force; 587 { 588 RLAYOUT rl, *rlp; 589 size_t size; 590 ssize_t nr; 591 u_int32_t mbytes, bytes; 592 int fd, ret, t_ret; 593 char *name; 594 595 /* 596 * XXX 597 * We assume that we've created a new REGINFO structure for this 598 * call, not used one that was already initialized. Regardless, 599 * if anyone is planning to use it after we're done, they're going 600 * to be sorely disappointed. 601 * 602 * If force isn't set, we attach to the region, set a flag to delete 603 * the region on last close, and let the region delete code do the 604 * work. 605 */ 606 if (!force) { 607 if ((ret = __db_rattach(infop)) != 0) 608 return (ret); 609 610 rlp = (RLAYOUT *)infop->addr; 611 (void)__db_mutex_unlock(&rlp->lock, infop->fd); 612 613 F_SET(infop, REGION_LASTDETACH); 614 615 return (__db_rdetach(infop)); 616 } 617 618 /* 619 * Otherwise, we don't want to attach to the region. We may have been 620 * called to clean up if a process died leaving a region locked and/or 621 * corrupted, which could cause the attach to hang. 622 */ 623 if ((ret = __db_appname(infop->dbenv, infop->appname, 624 infop->path, infop->file, infop->dbflags, NULL, &name)) != 0) 625 return (ret); 626 627 /* 628 * An underlying file is created for all regions other than private 629 * (REGION_PRIVATE) ones, regardless of whether or not it's used to 630 * back the region. If that file doesn't exist, we're done. 631 */ 632 if (__os_exists(name, NULL) != 0) { 633 __os_freestr(name); 634 return (0); 635 } 636 637 /* 638 * See the comments in __db_rattach -- figure out if this is a regular 639 * file backing a region or if it's a regular file with information 640 * about a region. 641 */ 642 if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0) 643 goto errmsg; 644 if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) 645 goto errmsg; 646 size = mbytes * MEGABYTE + bytes; 647 648 if (size <= sizeof(RLAYOUT)) { 649 if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0) 650 goto errmsg; 651 if (rl.valid != DB_REGIONMAGIC) { 652 __db_err(infop->dbenv, 653 "%s: illegal region magic number", name); 654 ret = EINVAL; 655 goto err; 656 } 657 658 /* Set the size, memory id and characteristics. */ 659 infop->size = rl.size; 660 infop->segid = rl.segid; 661 if (F_ISSET(&rl, REGION_ANONYMOUS)) 662 F_SET(infop, REGION_ANONYMOUS); 663 } else { 664 infop->size = size; 665 infop->segid = INVALID_SEGID; 666 } 667 668 /* Remove the underlying region. */ 669 ret = __db_unlinkregion(name, infop); 670 671 /* 672 * Unlink the backing file. Close the open file descriptor first, 673 * because some architectures (e.g., Win32) won't unlink a file if 674 * open file descriptors remain. 675 */ 676 (void)__os_close(fd); 677 if ((t_ret = __os_unlink(name)) != 0 && ret == 0) 678 ret = t_ret; 679 680 if (0) { 681 errmsg: __db_err(infop->dbenv, "%s: %s", name, strerror(ret)); 682 err: (void)__os_close(fd); 683 } 684 685 __os_freestr(name); 686 return (ret); 687 } 688 689 /* 690 * __db_rgrow -- 691 * Extend a region. 692 * 693 * PUBLIC: int __db_rgrow __P((REGINFO *, size_t)); 694 */ 695 int 696 __db_rgrow(infop, new_size) 697 REGINFO *infop; 698 size_t new_size; 699 { 700 RLAYOUT *rlp; 701 size_t increment; 702 int ret; 703 704 /* 705 * !!! 706 * This routine MUST be called with the region already locked. 707 */ 708 709 /* The underlying routines have flagged if this region can grow. */ 710 if (!F_ISSET(infop, REGION_CANGROW)) 711 return (EINVAL); 712 713 /* 714 * Round off the requested size to the next page boundary, and 715 * determine the additional space required. 716 */ 717 rlp = (RLAYOUT *)infop->addr; 718 DB_ROUNDOFF(new_size, DB_VMPAGESIZE); 719 increment = new_size - rlp->size; 720 721 if ((ret = __db_growregion(infop, increment)) != 0) 722 return (ret); 723 724 /* Update the on-disk region size. */ 725 rlp->size = new_size; 726 727 /* Detach from and reattach to the region. */ 728 return (__db_rreattach(infop, new_size)); 729 } 730 731 /* 732 * __db_growregion -- 733 * Grow a shared memory region. 734 */ 735 static int 736 __db_growregion(infop, increment) 737 REGINFO *infop; 738 size_t increment; 739 { 740 db_pgno_t pages; 741 size_t i; 742 ssize_t nr, nw; 743 u_int32_t relative; 744 int ret; 745 char buf[DB_VMPAGESIZE]; 746 747 /* Seek to the end of the region. */ 748 if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0) 749 goto err; 750 751 /* Write nuls to the new bytes. */ 752 memset(buf, 0, sizeof(buf)); 753 754 /* 755 * Some systems require that all of the bytes of the region be 756 * written before it can be mapped and accessed randomly, and 757 * other systems don't zero out the pages. 758 */ 759 if (__db_mapinit()) 760 /* Extend the region by writing each new page. */ 761 for (i = 0; i < increment; i += DB_VMPAGESIZE) { 762 if ((ret = 763 __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0) 764 goto err; 765 if (nw != sizeof(buf)) 766 goto eio; 767 } 768 else { 769 /* 770 * Extend the region by writing the last page. If the region 771 * is >4Gb, increment may be larger than the maximum possible 772 * seek "relative" argument, as it's an unsigned 32-bit value. 773 * Break the offset into pages of 1MB each so that we don't 774 * overflow (2^20 + 2^32 is bigger than any memory I expect 775 * to see for awhile). 776 */ 777 pages = (increment - DB_VMPAGESIZE) / MEGABYTE; 778 relative = (increment - DB_VMPAGESIZE) % MEGABYTE; 779 if ((ret = __os_seek(infop->fd, 780 MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0) 781 goto err; 782 if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0) 783 goto err; 784 if (nw != sizeof(buf)) 785 goto eio; 786 787 /* 788 * It's sometimes significantly faster to page-fault in all of 789 * the region's pages before we run the application, as we see 790 * nasty side-effects when we page-fault while holding various 791 * locks, i.e., the lock takes a long time to acquire because 792 * of the underlying page fault, and the other threads convoy 793 * behind the lock holder. 794 * 795 * We also use REGION_INIT to guarantee that there is enough 796 * disk space for the region, so we also write a byte to each 797 * page. Reading the byte is insufficient as some systems 798 * (e.g., Solaris) do not instantiate disk pages to satisfy 799 * a read, and so we don't know if there is enough disk space 800 * or not. 801 */ 802 if (DB_GLOBAL(db_region_init)) { 803 pages = increment / MEGABYTE; 804 relative = increment % MEGABYTE; 805 if ((ret = __os_seek(infop->fd, 806 MEGABYTE, pages, relative, 1, SEEK_END)) != 0) 807 goto err; 808 809 /* Write a byte to each page. */ 810 for (i = 0; i < increment; i += DB_VMPAGESIZE) { 811 if ((ret = 812 __os_write(infop->fd, buf, 1, &nr)) != 0) 813 goto err; 814 if (nr != 1) 815 goto eio; 816 if ((ret = __os_seek(infop->fd, 817 0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0) 818 goto err; 819 } 820 } 821 } 822 return (0); 823 824 eio: ret = EIO; 825 err: __db_err(infop->dbenv, "region grow: %s", strerror(ret)); 826 return (ret); 827 } 828 829 /* 830 * __db_rreattach -- 831 * Detach from and reattach to a region. 832 * 833 * PUBLIC: int __db_rreattach __P((REGINFO *, size_t)); 834 */ 835 int 836 __db_rreattach(infop, new_size) 837 REGINFO *infop; 838 size_t new_size; 839 { 840 int ret; 841 842 #ifdef DIAGNOSTIC 843 if (infop->name == NULL) { 844 __db_err(infop->dbenv, "__db_rreattach: name was NULL"); 845 return (EINVAL); 846 } 847 #endif 848 /* 849 * If we're growing an already mapped region, we have to unmap it 850 * and get it back. We have it locked, so nobody else can get in, 851 * which makes it fairly straight-forward to do, as everybody else 852 * is going to block while we do the unmap/remap. NB: if we fail 853 * to get it back, the pooch is genuinely screwed, because we can 854 * never release the lock we're holding. 855 * 856 * Detach from the region. We have to do this first so architectures 857 * that don't permit a file to be mapped into different places in the 858 * address space simultaneously, e.g., HP's PaRisc, will work. 859 */ 860 if ((ret = __db_unmapregion(infop)) != 0) 861 return (ret); 862 863 /* Update the caller's REGINFO size to the new map size. */ 864 infop->size = new_size; 865 866 /* Attach to the region. */ 867 ret = __db_mapregion(infop->name, infop); 868 869 return (ret); 870 } 871