xref: /titanic_44/usr/src/cmd/sendmail/db/db/db_region.c (revision 70ab954a5d6c4d36858fd6e7e3dd4498d06d2c40)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 
8 #include "config.h"
9 
10 #ifndef lint
11 static const char sccsid[] = "@(#)db_region.c	10.53 (Sleepycat) 11/10/98";
12 #endif /* not lint */
13 
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16 
17 #include <errno.h>
18 #include <string.h>
19 #include <unistd.h>
20 #endif
21 
22 #include "db_int.h"
23 #include "common_ext.h"
24 
25 static int __db_growregion __P((REGINFO *, size_t));
26 
27 /*
28  * __db_rattach --
29  *	Optionally create and attach to a shared memory region.
30  *
31  * PUBLIC: int __db_rattach __P((REGINFO *));
32  */
33 int
34 __db_rattach(infop)
35 	REGINFO *infop;
36 {
37 	RLAYOUT *rlp, rl;
38 	size_t grow_region, size;
39 	ssize_t nr, nw;
40 	u_int32_t flags, mbytes, bytes;
41 	u_int8_t *p;
42 	int malloc_possible, ret, retry_cnt;
43 
44 	grow_region = 0;
45 	malloc_possible = 1;
46 	ret = retry_cnt = 0;
47 
48 	/* Round off the requested size to the next page boundary. */
49 	DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
50 
51 	/* Some architectures have hard limits on the maximum region size. */
52 #ifdef DB_REGIONSIZE_MAX
53 	if (infop->size > DB_REGIONSIZE_MAX) {
54 		__db_err(infop->dbenv, "__db_rattach: cache size too large");
55 		return (EINVAL);
56 	}
57 #endif
58 
59 	/* Intialize the return information in the REGINFO structure. */
60 loop:	infop->addr = NULL;
61 	infop->fd = -1;
62 	infop->segid = INVALID_SEGID;
63 	if (infop->name != NULL) {
64 		__os_freestr(infop->name);
65 		infop->name = NULL;
66 	}
67 	F_CLR(infop, REGION_CANGROW | REGION_CREATED);
68 
69 #ifndef HAVE_SPINLOCKS
70 	/*
71 	 * XXX
72 	 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
73 	 * locking, which implies using mmap(2) to map in a regular file.
74 	 * (Theoretically, we could probably get a file descriptor to lock
75 	 * other types of shared regions, but I don't see any reason to
76 	 * bother.)
77 	 *
78 	 * Since we may be using shared memory regions, e.g., shmget(2),
79 	 * and not mmap of regular files, the backing file may be only a
80 	 * few tens of bytes in length.  So, this depends on the ability
81 	 * to fcntl lock file offsets much larger than the physical file.
82 	 */
83 	malloc_possible = 0;
84 #endif
85 
86 #ifdef __hppa
87 	/*
88 	 * XXX
89 	 * HP-UX won't permit mutexes to live in anything but shared memory.
90 	 * Instantiate a shared region file on that architecture, regardless.
91 	 */
92 	malloc_possible = 0;
93 #endif
94 	/*
95 	 * If a region is truly private, malloc the memory.  That's faster
96 	 * than either anonymous memory or a shared file.
97 	 */
98 	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
99 		if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
100 			return (ret);
101 
102 		/*
103 		 * It's sometimes significantly faster to page-fault in all of
104 		 * the region's pages before we run the application, as we see
105 		 * nasty side-effects when we page-fault while holding various
106 		 * locks, i.e., the lock takes a long time to acquire because
107 		 * of the underlying page fault, and the other threads convoy
108 		 * behind the lock holder.
109 		 */
110 		if (DB_GLOBAL(db_region_init))
111 			for (p = infop->addr;
112 			    p < (u_int8_t *)infop->addr + infop->size;
113 			    p += DB_VMPAGESIZE)
114 				p[0] = '\0';
115 
116 		F_SET(infop, REGION_CREATED | REGION_MALLOC);
117 		goto region_init;
118 	}
119 
120 	/*
121 	 * Get the name of the region (creating the file if a temporary file
122 	 * is being used).  The dbenv contains the current DB environment,
123 	 * including naming information.  The path argument may be a file or
124 	 * a directory.  If path is a directory, it must exist and file is the
125 	 * file name to be created inside the directory.  If path is a file,
126 	 * then file must be NULL.
127 	 */
128 	if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
129 	    infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
130 		return (ret);
131 	if (infop->fd != -1)
132 		F_SET(infop, REGION_CREATED);
133 
134 	/*
135 	 * Try to create the file, if we have authority.  We have to make sure
136 	 * that multiple threads/processes attempting to simultaneously create
137 	 * the region are properly ordered, so we open it using DB_CREATE and
138 	 * DB_EXCL, so two attempts to create the region will return failure in
139 	 * one.
140 	 */
141 	if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
142 		flags = infop->dbflags;
143 		LF_SET(DB_EXCL);
144 		if ((ret = __db_open(infop->name,
145 		    flags, flags, infop->mode, &infop->fd)) == 0)
146 			F_SET(infop, REGION_CREATED);
147 		else
148 			if (ret != EEXIST)
149 				goto errmsg;
150 	}
151 
152 	/* If we couldn't create the file, try and open it. */
153 	if (infop->fd == -1) {
154 		flags = infop->dbflags;
155 		LF_CLR(DB_CREATE | DB_EXCL);
156 		if ((ret = __db_open(infop->name,
157 		    flags, flags, infop->mode, &infop->fd)) != 0)
158 			goto errmsg;
159 	}
160 
161 	/*
162 	 * There are three cases we support:
163 	 *    1. Named anonymous memory (shmget(2)).
164 	 *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
165 	 *    3. Memory backed by a regular file (mmap(2)).
166 	 *
167 	 * We instantiate a backing file in all cases, which contains at least
168 	 * the RLAYOUT structure, and in case #3, contains the actual region.
169 	 * This is necessary for a couple of reasons:
170 	 *
171 	 * First, the mpool region uses temporary files to name regions, and
172 	 * since you may have multiple regions in the same directory, we need
173 	 * a filesystem name to ensure that they don't collide.
174 	 *
175 	 * Second, applications are allowed to forcibly remove regions, even
176 	 * if they don't know anything about them other than the name.  If a
177 	 * region is backed by anonymous memory, there has to be some way for
178 	 * the application to find out that information, and, in some cases,
179 	 * determine ID information for the anonymous memory.
180 	 */
181 	if (F_ISSET(infop, REGION_CREATED)) {
182 		/*
183 		 * If we're using anonymous memory to back this region, set
184 		 * the flag.
185 		 */
186 		if (DB_GLOBAL(db_region_anon))
187 			F_SET(infop, REGION_ANONYMOUS);
188 
189 		/*
190 		 * If we're using a regular file to back a region we created,
191 		 * grow it to the specified size.
192 		 */
193 		if (!DB_GLOBAL(db_region_anon) &&
194 		    (ret = __db_growregion(infop, infop->size)) != 0)
195 			goto err;
196 	} else {
197 		/*
198 		 * If we're joining a region, figure out what it looks like.
199 		 *
200 		 * XXX
201 		 * We have to figure out if the file is a regular file backing
202 		 * a region that we want to map into our address space, or a
203 		 * file with the information we need to find a shared anonymous
204 		 * region that we want to map into our address space.
205 		 *
206 		 * All this noise is because some systems don't have a coherent
207 		 * VM and buffer cache, and worse, if you mix operations on the
208 		 * VM and buffer cache, half the time you hang the system.
209 		 *
210 		 * There are two possibilities.  If the file is the size of an
211 		 * RLAYOUT structure, then we know that the real region is in
212 		 * shared memory, because otherwise it would be bigger.  (As
213 		 * the RLAYOUT structure size is smaller than a disk sector,
214 		 * the only way it can be this size is if deliberately written
215 		 * that way.)  In which case, retrieve the information we need
216 		 * from the RLAYOUT structure and use it to acquire the shared
217 		 * memory.
218 		 *
219 		 * If the structure is larger than an RLAYOUT structure, then
220 		 * the file is backing the shared memory region, and we use
221 		 * the current size of the file without reading any information
222 		 * from the file itself so that we don't confuse the VM.
223 		 *
224 		 * And yes, this makes me want to take somebody and kill them,
225 		 * but I can't think of any other solution.
226 		 */
227 		if ((ret = __os_ioinfo(infop->name,
228 		    infop->fd, &mbytes, &bytes, NULL)) != 0)
229 			goto errmsg;
230 		size = mbytes * MEGABYTE + bytes;
231 
232 		if (size <= sizeof(RLAYOUT)) {
233 			/*
234 			 * If the size is too small, the read fails or the
235 			 * valid flag is incorrect, assume it's because the
236 			 * RLAYOUT information hasn't been written out yet,
237 			 * and retry.
238 			 */
239 			if (size < sizeof(RLAYOUT))
240 				goto retry;
241 			if ((ret =
242 			    __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
243 				goto retry;
244 			if (rl.valid != DB_REGIONMAGIC)
245 				goto retry;
246 
247 			/* Copy the size, memory id and characteristics. */
248 			size = rl.size;
249 			infop->segid = rl.segid;
250 			if (F_ISSET(&rl, REGION_ANONYMOUS))
251 				F_SET(infop, REGION_ANONYMOUS);
252 		}
253 
254 		/*
255 		 * If the region is larger than we think, that's okay, use the
256 		 * current size.  If it's smaller than we think, and we were
257 		 * just using the default size, that's okay, use the current
258 		 * size.  If it's smaller than we think and we really care,
259 		 * save the size and we'll catch that further down -- we can't
260 		 * correct it here because we have to have a lock to grow the
261 		 * region.
262 		 */
263 		if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
264 			grow_region = infop->size;
265 		infop->size = size;
266 	}
267 
268 	/*
269 	 * Map the region into our address space.  If we're creating it, the
270 	 * underlying routines will make it the right size.
271 	 *
272 	 * There are at least two cases where we can "reasonably" fail when
273 	 * we attempt to map in the region.  On Windows/95, closing the last
274 	 * reference to a region causes it to be zeroed out.  On UNIX, when
275 	 * using the shmget(2) interfaces, the region will no longer exist
276 	 * if the system was rebooted.  In these cases, the underlying map call
277 	 * returns EAGAIN, and we *remove* our file and try again.  There are
278 	 * obvious races in doing this, but it should eventually settle down
279 	 * to a winner and then things should proceed normally.
280 	 */
281 	if ((ret = __db_mapregion(infop->name, infop)) != 0)
282 		if (ret == EAGAIN) {
283 			/*
284 			 * Pretend we created the region even if we didn't so
285 			 * that our error processing unlinks it.
286 			 */
287 			F_SET(infop, REGION_CREATED);
288 			ret = 0;
289 			goto retry;
290 		} else
291 			goto err;
292 
293 region_init:
294 	/*
295 	 * Initialize the common region information.
296 	 *
297 	 * !!!
298 	 * We have to order the region creates so that two processes don't try
299 	 * to simultaneously create the region.  This is handled by using the
300 	 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
301 	 *
302 	 * We also have to order region joins so that processes joining regions
303 	 * never see inconsistent data.  We'd like to play permissions games
304 	 * with the backing file, but we can't because WNT filesystems won't
305 	 * open a file mode 0.
306 	 */
307 	rlp = (RLAYOUT *)infop->addr;
308 	if (F_ISSET(infop, REGION_CREATED)) {
309 		/*
310 		 * The process creating the region acquires a lock before it
311 		 * sets the valid flag.  Any processes joining the region will
312 		 * check the valid flag before acquiring the lock.
313 		 *
314 		 * Check the return of __db_mutex_init() and __db_mutex_lock(),
315 		 * even though we don't usually check elsewhere.  This is the
316 		 * first lock we initialize and acquire, and we have to know if
317 		 * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
318 		 * for locking, with an in-memory filesystem specified as the
319 		 * database home.)
320 		 */
321 		if ((ret = __db_mutex_init(&rlp->lock,
322 		    MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
323 		    (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
324 			goto err;
325 
326 		/* Initialize the remaining region information. */
327 		rlp->refcnt = 1;
328 		rlp->size = infop->size;
329 		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
330 		rlp->panic = 0;
331 		rlp->segid = infop->segid;
332 		rlp->flags = 0;
333 		if (F_ISSET(infop, REGION_ANONYMOUS))
334 			F_SET(rlp, REGION_ANONYMOUS);
335 
336 		/*
337 		 * Fill in the valid field last -- use a magic number, memory
338 		 * may not be zero-filled, and we want to minimize the chance
339 		 * for collision.
340 		 */
341 		rlp->valid = DB_REGIONMAGIC;
342 
343 		/*
344 		 * If the region is anonymous, write the RLAYOUT information
345 		 * into the backing file so that future region join and unlink
346 		 * calls can find it.
347 		 *
348 		 * XXX
349 		 * We MUST do the seek before we do the write.  On Win95, while
350 		 * closing the last reference to an anonymous shared region
351 		 * doesn't discard the region, it does zero it out.  So, the
352 		 * REGION_CREATED may be set, but the file may have already
353 		 * been written and the file descriptor may be at the end of
354 		 * the file.
355 		 */
356 		if (F_ISSET(infop, REGION_ANONYMOUS)) {
357 			if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
358 				goto err;
359 			if ((ret =
360 			    __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
361 				goto err;
362 		}
363 	} else {
364 		/* Check to see if the region has had catastrophic failure. */
365 		if (rlp->panic) {
366 			ret = DB_RUNRECOVERY;
367 			goto err;
368 		}
369 
370 		/*
371 		 * Check the valid flag to ensure the region is initialized.
372 		 * If the valid flag has not been set, the mutex may not have
373 		 * been initialized, and an attempt to get it could lead to
374 		 * random behavior.
375 		 */
376 		if (rlp->valid != DB_REGIONMAGIC)
377 			goto retry;
378 
379 		/* Get the region lock. */
380 		(void)__db_mutex_lock(&rlp->lock, infop->fd);
381 
382 		/*
383 		 * We now own the region.  There are a couple of things that
384 		 * may have gone wrong, however.
385 		 *
386 		 * Problem #1: while we were waiting for the lock, the region
387 		 * was deleted.  Detected by re-checking the valid flag, since
388 		 * it's cleared by the delete region routines.
389 		 */
390 		if (rlp->valid != DB_REGIONMAGIC) {
391 			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
392 			goto retry;
393 		}
394 
395 		/*
396 		 * Problem #3: when we checked the size of the file, it was
397 		 * still growing as part of creation.  Detected by the fact
398 		 * that infop->size isn't the same size as the region.
399 		 */
400 		if (infop->size != rlp->size) {
401 			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
402 			goto retry;
403 		}
404 
405 		/* Increment the reference count. */
406 		++rlp->refcnt;
407 	}
408 
409 	/* Return the region in a locked condition. */
410 
411 	if (0) {
412 errmsg:		__db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
413 
414 err:
415 retry:		/* Discard the region. */
416 		if (infop->addr != NULL) {
417 			(void)__db_unmapregion(infop);
418 			infop->addr = NULL;
419 		}
420 
421 		/* Discard the backing file. */
422 		if (infop->fd != -1) {
423 			(void)__os_close(infop->fd);
424 			infop->fd = -1;
425 
426 			if (F_ISSET(infop, REGION_CREATED))
427 				(void)__os_unlink(infop->name);
428 		}
429 
430 		/* Discard the name. */
431 		if (infop->name != NULL) {
432 			__os_freestr(infop->name);
433 			infop->name = NULL;
434 		}
435 
436 		/*
437 		 * If we had a temporary error, wait a few seconds and
438 		 * try again.
439 		 */
440 		if (ret == 0) {
441 			if (++retry_cnt <= 3) {
442 				__os_sleep(retry_cnt * 2, 0);
443 				goto loop;
444 			}
445 			ret = EAGAIN;
446 		}
447 	}
448 
449 	/*
450 	 * XXX
451 	 * HP-UX won't permit mutexes to live in anything but shared memory.
452 	 * Instantiate a shared region file on that architecture, regardless.
453 	 *
454 	 * XXX
455 	 * There's a problem in cleaning this up on application exit, or on
456 	 * application failure.  If an application opens a database without
457 	 * an environment, we create a temporary backing mpool region for it.
458 	 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
459 	 * mutexes to live in anything but shared memory, we instantiate a
460 	 * real file plus a memory region of some form.  If the application
461 	 * crashes, the necessary information to delete the backing file and
462 	 * any system region (e.g., the shmget(2) segment ID) is no longer
463 	 * available.  We can't completely fix the problem, but we try.
464 	 *
465 	 * The underlying UNIX __db_mapregion() code preferentially uses the
466 	 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
467 	 * that are marked REGION_PRIVATE.  This means that we normally aren't
468 	 * holding any system resources when we get here, in which case we can
469 	 * delete the backing file.  This results in a short race, from the
470 	 * __db_open() call above to here.
471 	 *
472 	 * If, for some reason, we are holding system resources when we get
473 	 * here, we don't have any choice -- we can't delete the backing file
474 	 * because we may need it to detach from the resources.  Set the
475 	 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
476 	 * the application closes the region.
477 	 */
478 	if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
479 		if (F_ISSET(infop, REGION_HOLDINGSYS))
480 			F_SET(infop, REGION_LASTDETACH);
481 		else {
482 			F_SET(infop, REGION_REMOVED);
483 			F_CLR(infop, REGION_CANGROW);
484 
485 			(void)__os_close(infop->fd);
486 			(void)__os_unlink(infop->name);
487 		}
488 
489 	return (ret);
490 }
491 
492 /*
493  * __db_rdetach --
494  *	De-attach from a shared memory region.
495  *
496  * PUBLIC: int __db_rdetach __P((REGINFO *));
497  */
498 int
499 __db_rdetach(infop)
500 	REGINFO *infop;
501 {
502 	RLAYOUT *rlp;
503 	int detach, ret, t_ret;
504 
505 	ret = 0;
506 
507 	/*
508 	 * If the region was removed when it was created, no further action
509 	 * is required.
510 	 */
511 	if (F_ISSET(infop, REGION_REMOVED))
512 		goto done;
513 	/*
514 	 * If the region was created in memory returned by malloc, the only
515 	 * action required is freeing the memory.
516 	 */
517 	if (F_ISSET(infop, REGION_MALLOC)) {
518 		__os_free(infop->addr, 0);
519 		goto done;
520 	}
521 
522 	/* Otherwise, attach to the region and optionally delete it. */
523 	rlp = infop->addr;
524 
525 	/* Get the lock. */
526 	(void)__db_mutex_lock(&rlp->lock, infop->fd);
527 
528 	/* Decrement the reference count. */
529 	if (rlp->refcnt == 0)
530 		__db_err(infop->dbenv,
531 		    "region rdetach: reference count went to zero!");
532 	else
533 		--rlp->refcnt;
534 
535 	/*
536 	 * If we're going to remove the region, clear the valid flag so
537 	 * that any region join that's blocked waiting for us will know
538 	 * what happened.
539 	 */
540 	detach = 0;
541 	if (F_ISSET(infop, REGION_LASTDETACH))
542 		if (rlp->refcnt == 0) {
543 			detach = 1;
544 			rlp->valid = 0;
545 		} else
546 			ret = EBUSY;
547 
548 	/* Release the lock. */
549 	(void)__db_mutex_unlock(&rlp->lock, infop->fd);
550 
551 	/* Close the backing file descriptor. */
552 	(void)__os_close(infop->fd);
553 	infop->fd = -1;
554 
555 	/* Discard our mapping of the region. */
556 	if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
557 		ret = t_ret;
558 
559 	/* Discard the region itself. */
560 	if (detach) {
561 		if ((t_ret =
562 		    __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
563 			ret = t_ret;
564 		if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
565 			ret = t_ret;
566 	}
567 
568 done:	/* Discard the name. */
569 	if (infop->name != NULL) {
570 		__os_freestr(infop->name);
571 		infop->name = NULL;
572 	}
573 
574 	return (ret);
575 }
576 
577 /*
578  * __db_runlink --
579  *	Remove a region.
580  *
581  * PUBLIC: int __db_runlink __P((REGINFO *, int));
582  */
583 int
584 __db_runlink(infop, force)
585 	REGINFO *infop;
586 	int force;
587 {
588 	RLAYOUT rl, *rlp;
589 	size_t size;
590 	ssize_t nr;
591 	u_int32_t mbytes, bytes;
592 	int fd, ret, t_ret;
593 	char *name;
594 
595 	/*
596 	 * XXX
597 	 * We assume that we've created a new REGINFO structure for this
598 	 * call, not used one that was already initialized.  Regardless,
599 	 * if anyone is planning to use it after we're done, they're going
600 	 * to be sorely disappointed.
601 	 *
602 	 * If force isn't set, we attach to the region, set a flag to delete
603 	 * the region on last close, and let the region delete code do the
604 	 * work.
605 	 */
606 	if (!force) {
607 		if ((ret = __db_rattach(infop)) != 0)
608 			return (ret);
609 
610 		rlp = (RLAYOUT *)infop->addr;
611 		(void)__db_mutex_unlock(&rlp->lock, infop->fd);
612 
613 		F_SET(infop, REGION_LASTDETACH);
614 
615 		return (__db_rdetach(infop));
616 	}
617 
618 	/*
619 	 * Otherwise, we don't want to attach to the region.  We may have been
620 	 * called to clean up if a process died leaving a region locked and/or
621 	 * corrupted, which could cause the attach to hang.
622 	 */
623 	if ((ret = __db_appname(infop->dbenv, infop->appname,
624 	    infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
625 		return (ret);
626 
627 	/*
628 	 * An underlying file is created for all regions other than private
629 	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
630 	 * back the region.  If that file doesn't exist, we're done.
631 	 */
632 	if (__os_exists(name, NULL) != 0) {
633 		__os_freestr(name);
634 		return (0);
635 	}
636 
637 	/*
638 	 * See the comments in __db_rattach -- figure out if this is a regular
639 	 * file backing a region or if it's a regular file with information
640 	 * about a region.
641 	 */
642 	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
643 		goto errmsg;
644 	if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
645 		goto errmsg;
646 	size = mbytes * MEGABYTE + bytes;
647 
648 	if (size <= sizeof(RLAYOUT)) {
649 		if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
650 			goto errmsg;
651 		if (rl.valid != DB_REGIONMAGIC) {
652 			__db_err(infop->dbenv,
653 			    "%s: illegal region magic number", name);
654 			ret = EINVAL;
655 			goto err;
656 		}
657 
658 		/* Set the size, memory id and characteristics. */
659 		infop->size = rl.size;
660 		infop->segid = rl.segid;
661 		if (F_ISSET(&rl, REGION_ANONYMOUS))
662 			F_SET(infop, REGION_ANONYMOUS);
663 	} else {
664 		infop->size = size;
665 		infop->segid = INVALID_SEGID;
666 	}
667 
668 	/* Remove the underlying region. */
669 	ret = __db_unlinkregion(name, infop);
670 
671 	/*
672 	 * Unlink the backing file.  Close the open file descriptor first,
673 	 * because some architectures (e.g., Win32) won't unlink a file if
674 	 * open file descriptors remain.
675 	 */
676 	(void)__os_close(fd);
677 	if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
678 		ret = t_ret;
679 
680 	if (0) {
681 errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
682 err:		(void)__os_close(fd);
683 	}
684 
685 	__os_freestr(name);
686 	return (ret);
687 }
688 
689 /*
690  * __db_rgrow --
691  *	Extend a region.
692  *
693  * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
694  */
695 int
696 __db_rgrow(infop, new_size)
697 	REGINFO *infop;
698 	size_t new_size;
699 {
700 	RLAYOUT *rlp;
701 	size_t increment;
702 	int ret;
703 
704 	/*
705 	 * !!!
706 	 * This routine MUST be called with the region already locked.
707 	 */
708 
709 	/* The underlying routines have flagged if this region can grow. */
710 	if (!F_ISSET(infop, REGION_CANGROW))
711 		return (EINVAL);
712 
713 	/*
714 	 * Round off the requested size to the next page boundary, and
715 	 * determine the additional space required.
716 	 */
717 	rlp = (RLAYOUT *)infop->addr;
718 	DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
719 	increment = new_size - rlp->size;
720 
721 	if ((ret = __db_growregion(infop, increment)) != 0)
722 		return (ret);
723 
724 	/* Update the on-disk region size. */
725 	rlp->size = new_size;
726 
727 	/* Detach from and reattach to the region. */
728 	return (__db_rreattach(infop, new_size));
729 }
730 
731 /*
732  * __db_growregion --
733  *	Grow a shared memory region.
734  */
735 static int
736 __db_growregion(infop, increment)
737 	REGINFO *infop;
738 	size_t increment;
739 {
740 	db_pgno_t pages;
741 	size_t i;
742 	ssize_t nr, nw;
743 	u_int32_t relative;
744 	int ret;
745 	char buf[DB_VMPAGESIZE];
746 
747 	/* Seek to the end of the region. */
748 	if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
749 		goto err;
750 
751 	/* Write nuls to the new bytes. */
752 	memset(buf, 0, sizeof(buf));
753 
754 	/*
755 	 * Some systems require that all of the bytes of the region be
756 	 * written before it can be mapped and accessed randomly, and
757 	 * other systems don't zero out the pages.
758 	 */
759 	if (__db_mapinit())
760 		/* Extend the region by writing each new page. */
761 		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
762 			if ((ret =
763 			    __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
764 				goto err;
765 			if (nw != sizeof(buf))
766 				goto eio;
767 		}
768 	else {
769 		/*
770 		 * Extend the region by writing the last page.  If the region
771 		 * is >4Gb, increment may be larger than the maximum possible
772 		 * seek "relative" argument, as it's an unsigned 32-bit value.
773 		 * Break the offset into pages of 1MB each so that we don't
774 		 * overflow (2^20 + 2^32 is bigger than any memory I expect
775 		 * to see for awhile).
776 		 */
777 		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
778 		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
779 		if ((ret = __os_seek(infop->fd,
780 		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
781 			goto err;
782 		if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
783 			goto err;
784 		if (nw != sizeof(buf))
785 			goto eio;
786 
787 		/*
788 		 * It's sometimes significantly faster to page-fault in all of
789 		 * the region's pages before we run the application, as we see
790 		 * nasty side-effects when we page-fault while holding various
791 		 * locks, i.e., the lock takes a long time to acquire because
792 		 * of the underlying page fault, and the other threads convoy
793 		 * behind the lock holder.
794 		 *
795 		 * We also use REGION_INIT to guarantee that there is enough
796 		 * disk space for the region, so we also write a byte to each
797 		 * page.  Reading the byte is insufficient as some systems
798 		 * (e.g., Solaris) do not instantiate disk pages to satisfy
799 		 * a read, and so we don't know if there is enough disk space
800 		 * or not.
801 		 */
802 		if (DB_GLOBAL(db_region_init)) {
803 			pages = increment / MEGABYTE;
804 			relative = increment % MEGABYTE;
805 			if ((ret = __os_seek(infop->fd,
806 			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
807 				goto err;
808 
809 			/* Write a byte to each page. */
810 			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
811 				if ((ret =
812 				    __os_write(infop->fd, buf, 1, &nr)) != 0)
813 					goto err;
814 				if (nr != 1)
815 					goto eio;
816 				if ((ret = __os_seek(infop->fd,
817 				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
818 					goto err;
819 			}
820 		}
821 	}
822 	return (0);
823 
824 eio:	ret = EIO;
825 err:	__db_err(infop->dbenv, "region grow: %s", strerror(ret));
826 	return (ret);
827 }
828 
829 /*
830  * __db_rreattach --
831  *	Detach from and reattach to a region.
832  *
833  * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
834  */
835 int
836 __db_rreattach(infop, new_size)
837 	REGINFO *infop;
838 	size_t new_size;
839 {
840 	int ret;
841 
842 #ifdef DIAGNOSTIC
843 	if (infop->name == NULL) {
844 		__db_err(infop->dbenv, "__db_rreattach: name was NULL");
845 		return (EINVAL);
846 	}
847 #endif
848 	/*
849 	 * If we're growing an already mapped region, we have to unmap it
850 	 * and get it back.  We have it locked, so nobody else can get in,
851 	 * which makes it fairly straight-forward to do, as everybody else
852 	 * is going to block while we do the unmap/remap.  NB: if we fail
853 	 * to get it back, the pooch is genuinely screwed, because we can
854 	 * never release the lock we're holding.
855 	 *
856 	 * Detach from the region.  We have to do this first so architectures
857 	 * that don't permit a file to be mapped into different places in the
858 	 * address space simultaneously, e.g., HP's PaRisc, will work.
859 	 */
860 	if ((ret = __db_unmapregion(infop)) != 0)
861 		return (ret);
862 
863 	/* Update the caller's REGINFO size to the new map size. */
864 	infop->size = new_size;
865 
866 	/* Attach to the region. */
867 	ret = __db_mapregion(infop->name, infop);
868 
869 	return (ret);
870 }
871