xref: /titanic_52/usr/src/cmd/sendmail/db/mp/mp_fopen.c (revision 430b4c467020edf2445feb0c21db01c88b86243a)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 #include "config.h"
8 
9 #ifndef lint
10 static const char sccsid[] = "@(#)mp_fopen.c	10.60 (Sleepycat) 1/1/99";
11 #endif /* not lint */
12 
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
15 
16 #include <errno.h>
17 #include <string.h>
18 #endif
19 
20 #include "db_int.h"
21 #include "shqueue.h"
22 #include "db_shash.h"
23 #include "mp.h"
24 #include "common_ext.h"
25 
26 static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *));
27 static int __memp_mf_open __P((DB_MPOOL *,
28     const char *, size_t, db_pgno_t, DB_MPOOL_FINFO *, MPOOLFILE **));
29 
30 /*
31  * memp_fopen --
32  *	Open a backing file for the memory pool.
33  */
34 int
35 memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp)
36 	DB_MPOOL *dbmp;
37 	const char *path;
38 	u_int32_t flags;
39 	int mode;
40 	size_t pagesize;
41 	DB_MPOOL_FINFO *finfop;
42 	DB_MPOOLFILE **retp;
43 {
44 	int ret;
45 
46 	MP_PANIC_CHECK(dbmp);
47 
48 	/* Validate arguments. */
49 	if ((ret = __db_fchk(dbmp->dbenv,
50 	    "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0)
51 		return (ret);
52 
53 	/* Require a non-zero pagesize. */
54 	if (pagesize == 0) {
55 		__db_err(dbmp->dbenv, "memp_fopen: pagesize not specified");
56 		return (EINVAL);
57 	}
58 	if (finfop != NULL && finfop->clear_len > pagesize)
59 		return (EINVAL);
60 
61 	return (__memp_fopen(dbmp,
62 	    NULL, path, flags, mode, pagesize, 1, finfop, retp));
63 }
64 
65 /*
66  * __memp_fopen --
67  *	Open a backing file for the memory pool; internal version.
68  *
69  * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *,
70  * PUBLIC:    u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **));
71  */
72 int
73 __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
74 	DB_MPOOL *dbmp;
75 	MPOOLFILE *mfp;
76 	const char *path;
77 	u_int32_t flags;
78 	int mode, needlock;
79 	size_t pagesize;
80 	DB_MPOOL_FINFO *finfop;
81 	DB_MPOOLFILE **retp;
82 {
83 	DB_ENV *dbenv;
84 	DB_MPOOLFILE *dbmfp;
85 	DB_MPOOL_FINFO finfo;
86 	db_pgno_t last_pgno;
87 	size_t maxmap;
88 	u_int32_t mbytes, bytes;
89 	int ret;
90 	u_int8_t idbuf[DB_FILE_ID_LEN];
91 	char *rpath;
92 
93 	dbenv = dbmp->dbenv;
94 	ret = 0;
95 	rpath = NULL;
96 
97 	/*
98 	 * If mfp is provided, we take the DB_MPOOL_FINFO information from
99 	 * the mfp.  We don't bother initializing everything, because some
100 	 * of them are expensive to acquire.  If no mfp is provided and the
101 	 * finfop argument is NULL, we default the values.
102 	 */
103 	if (finfop == NULL) {
104 		memset(&finfo, 0, sizeof(finfo));
105 		if (mfp != NULL) {
106 			finfo.ftype = mfp->ftype;
107 			finfo.pgcookie = NULL;
108 			finfo.fileid = NULL;
109 			finfo.lsn_offset = mfp->lsn_off;
110 			finfo.clear_len = mfp->clear_len;
111 		} else {
112 			finfo.ftype = 0;
113 			finfo.pgcookie = NULL;
114 			finfo.fileid = NULL;
115 			finfo.lsn_offset = -1;
116 			finfo.clear_len = 0;
117 		}
118 		finfop = &finfo;
119 	}
120 
121 	/* Allocate and initialize the per-process structure. */
122 	if ((ret = __os_calloc(1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
123 		return (ret);
124 	dbmfp->dbmp = dbmp;
125 	dbmfp->fd = -1;
126 	dbmfp->ref = 1;
127 	if (LF_ISSET(DB_RDONLY))
128 		F_SET(dbmfp, MP_READONLY);
129 
130 	if (path == NULL) {
131 		if (LF_ISSET(DB_RDONLY)) {
132 			__db_err(dbenv,
133 			    "memp_fopen: temporary files can't be readonly");
134 			ret = EINVAL;
135 			goto err;
136 		}
137 		last_pgno = 0;
138 	} else {
139 		/* Get the real name for this file and open it. */
140 		if ((ret = __db_appname(dbenv,
141 		    DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0)
142 			goto err;
143 		if ((ret = __db_open(rpath,
144 		   LF_ISSET(DB_CREATE | DB_RDONLY),
145 		   DB_CREATE | DB_RDONLY, mode, &dbmfp->fd)) != 0) {
146 			__db_err(dbenv, "%s: %s", rpath, strerror(ret));
147 			goto err;
148 		}
149 
150 		/*
151 		 * Don't permit files that aren't a multiple of the pagesize,
152 		 * and find the number of the last page in the file, all the
153 		 * time being careful not to overflow 32 bits.
154 		 *
155 		 * !!!
156 		 * We can't use off_t's here, or in any code in the mainline
157 		 * library for that matter.  (We have to use them in the os
158 		 * stubs, of course, as there are system calls that take them
159 		 * as arguments.)  The reason is that some customers build in
160 		 * environments where an off_t is 32-bits, but still run where
161 		 * offsets are 64-bits, and they pay us a lot of money.
162 		 */
163 		if ((ret = __os_ioinfo(rpath,
164 		    dbmfp->fd, &mbytes, &bytes, NULL)) != 0) {
165 			__db_err(dbenv, "%s: %s", rpath, strerror(ret));
166 			goto err;
167 		}
168 
169 		/* Page sizes have to be a power-of-two, ignore mbytes. */
170 		if (bytes % pagesize != 0) {
171 			__db_err(dbenv,
172 			    "%s: file size not a multiple of the pagesize",
173 			    rpath);
174 			ret = EINVAL;
175 			goto err;
176 		}
177 
178 		last_pgno = mbytes * (MEGABYTE / pagesize);
179 		last_pgno += bytes / pagesize;
180 
181 		/* Correction: page numbers are zero-based, not 1-based. */
182 		if (last_pgno != 0)
183 			--last_pgno;
184 
185 		/*
186 		 * Get the file id if we weren't given one.  Generated file id's
187 		 * don't use timestamps, otherwise there'd be no chance of any
188 		 * other process joining the party.
189 		 */
190 		if (finfop->fileid == NULL) {
191 			if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
192 				goto err;
193 			finfop->fileid = idbuf;
194 		}
195 	}
196 
197 	/*
198 	 * If we weren't provided an underlying shared object to join with,
199 	 * find/allocate the shared file objects.  Also allocate space for
200 	 * for the per-process thread lock.
201 	 */
202 	if (needlock)
203 		LOCKREGION(dbmp);
204 
205 	if (mfp == NULL)
206 		ret = __memp_mf_open(dbmp,
207 		    path, pagesize, last_pgno, finfop, &mfp);
208 	else {
209 		++mfp->ref;
210 		ret = 0;
211 	}
212 	if (ret == 0 &&
213 	    F_ISSET(dbmp, MP_LOCKHANDLE) && (ret =
214 	    __memp_alloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0)
215 		LOCKINIT(dbmp, dbmfp->mutexp);
216 
217 	if (needlock)
218 		UNLOCKREGION(dbmp);
219 	if (ret != 0)
220 		goto err;
221 
222 	dbmfp->mfp = mfp;
223 
224 	/*
225 	 * If a file:
226 	 *	+ is read-only
227 	 *	+ isn't temporary
228 	 *	+ doesn't require any pgin/pgout support
229 	 *	+ the DB_NOMMAP flag wasn't set
230 	 *	+ and is less than mp_mmapsize bytes in size
231 	 *
232 	 * we can mmap it instead of reading/writing buffers.  Don't do error
233 	 * checking based on the mmap call failure.  We want to do normal I/O
234 	 * on the file if the reason we failed was because the file was on an
235 	 * NFS mounted partition, and we can fail in buffer I/O just as easily
236 	 * as here.
237 	 *
238 	 * XXX
239 	 * We'd like to test to see if the file is too big to mmap.  Since we
240 	 * don't know what size or type off_t's or size_t's are, or the largest
241 	 * unsigned integral type is, or what random insanity the local C
242 	 * compiler will perpetrate, doing the comparison in a portable way is
243 	 * flatly impossible.  Hope that mmap fails if the file is too large.
244 	 */
245 #define	DB_MAXMMAPSIZE	(10 * 1024 * 1024)	/* 10 Mb. */
246 	if (F_ISSET(mfp, MP_CAN_MMAP)) {
247 		if (!F_ISSET(dbmfp, MP_READONLY))
248 			F_CLR(mfp, MP_CAN_MMAP);
249 		if (path == NULL)
250 			F_CLR(mfp, MP_CAN_MMAP);
251 		if (finfop->ftype != 0)
252 			F_CLR(mfp, MP_CAN_MMAP);
253 		if (LF_ISSET(DB_NOMMAP))
254 			F_CLR(mfp, MP_CAN_MMAP);
255 		maxmap = dbenv == NULL || dbenv->mp_mmapsize == 0 ?
256 		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
257 		if (mbytes > maxmap / MEGABYTE ||
258 		    (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE))
259 			F_CLR(mfp, MP_CAN_MMAP);
260 	}
261 	dbmfp->addr = NULL;
262 	if (F_ISSET(mfp, MP_CAN_MMAP)) {
263 		dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
264 		if (__db_mapfile(rpath,
265 		    dbmfp->fd, dbmfp->len, 1, &dbmfp->addr) != 0) {
266 			dbmfp->addr = NULL;
267 			F_CLR(mfp, MP_CAN_MMAP);
268 		}
269 	}
270 	if (rpath != NULL)
271 		__os_freestr(rpath);
272 
273 	LOCKHANDLE(dbmp, dbmp->mutexp);
274 	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
275 	UNLOCKHANDLE(dbmp, dbmp->mutexp);
276 
277 	*retp = dbmfp;
278 	return (0);
279 
280 err:	/*
281 	 * Note that we do not have to free the thread mutex, because we
282 	 * never get to here after we have successfully allocated it.
283 	 */
284 	if (rpath != NULL)
285 		__os_freestr(rpath);
286 	if (dbmfp->fd != -1)
287 		(void)__os_close(dbmfp->fd);
288 	if (dbmfp != NULL)
289 		__os_free(dbmfp, sizeof(DB_MPOOLFILE));
290 	return (ret);
291 }
292 
293 /*
294  * __memp_mf_open --
295  *	Open an MPOOLFILE.
296  */
297 static int
298 __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp)
299 	DB_MPOOL *dbmp;
300 	const char *path;
301 	size_t pagesize;
302 	db_pgno_t last_pgno;
303 	DB_MPOOL_FINFO *finfop;
304 	MPOOLFILE **retp;
305 {
306 	MPOOLFILE *mfp;
307 	int ret;
308 	void *p;
309 
310 #define	ISTEMPORARY	(path == NULL)
311 
312 	/*
313 	 * Walk the list of MPOOLFILE's, looking for a matching file.
314 	 * Temporary files can't match previous files.
315 	 */
316 	if (!ISTEMPORARY)
317 		for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
318 		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
319 			if (F_ISSET(mfp, MP_TEMP))
320 				continue;
321 			if (!memcmp(finfop->fileid,
322 			    R_ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) {
323 				if (finfop->clear_len != mfp->clear_len ||
324 				    finfop->ftype != mfp->ftype ||
325 				    pagesize != mfp->stat.st_pagesize) {
326 					__db_err(dbmp->dbenv,
327 			    "%s: ftype, clear length or pagesize changed",
328 					    path);
329 					return (EINVAL);
330 				}
331 
332 				/* Found it: increment the reference count. */
333 				++mfp->ref;
334 				*retp = mfp;
335 				return (0);
336 			}
337 		}
338 
339 	/* Allocate a new MPOOLFILE. */
340 	if ((ret = __memp_alloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
341 		return (ret);
342 	*retp = mfp;
343 
344 	/* Initialize the structure. */
345 	memset(mfp, 0, sizeof(MPOOLFILE));
346 	mfp->ref = 1;
347 	mfp->ftype = finfop->ftype;
348 	mfp->lsn_off = finfop->lsn_offset;
349 	mfp->clear_len = finfop->clear_len;
350 
351 	/*
352 	 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget,
353 	 * we have to know the last page in the file.  Figure it out and save
354 	 * it away.
355 	 */
356 	mfp->stat.st_pagesize = pagesize;
357 	mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
358 
359 	if (ISTEMPORARY)
360 		F_SET(mfp, MP_TEMP);
361 	else {
362 		/* Copy the file path into shared memory. */
363 		if ((ret = __memp_alloc(dbmp,
364 		    strlen(path) + 1, &mfp->path_off, &p)) != 0)
365 			goto err;
366 		memcpy(p, path, strlen(path) + 1);
367 
368 		/* Copy the file identification string into shared memory. */
369 		if ((ret = __memp_alloc(dbmp,
370 		    DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
371 			goto err;
372 		memcpy(p, finfop->fileid, DB_FILE_ID_LEN);
373 
374 		F_SET(mfp, MP_CAN_MMAP);
375 	}
376 
377 	/* Copy the page cookie into shared memory. */
378 	if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) {
379 		mfp->pgcookie_len = 0;
380 		mfp->pgcookie_off = 0;
381 	} else {
382 		if ((ret = __memp_alloc(dbmp,
383 		    finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
384 			goto err;
385 		memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size);
386 		mfp->pgcookie_len = finfop->pgcookie->size;
387 	}
388 
389 	/* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
390 	SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile);
391 
392 	if (0) {
393 err:		if (mfp->path_off != 0)
394 			__db_shalloc_free(dbmp->addr,
395 			    R_ADDR(dbmp, mfp->path_off));
396 		if (mfp->fileid_off != 0)
397 			__db_shalloc_free(dbmp->addr,
398 			    R_ADDR(dbmp, mfp->fileid_off));
399 		if (mfp != NULL)
400 			__db_shalloc_free(dbmp->addr, mfp);
401 		mfp = NULL;
402 	}
403 	return (0);
404 }
405 
406 /*
407  * memp_fclose --
408  *	Close a backing file for the memory pool.
409  */
410 int
411 memp_fclose(dbmfp)
412 	DB_MPOOLFILE *dbmfp;
413 {
414 	DB_MPOOL *dbmp;
415 	int ret, t_ret;
416 
417 	dbmp = dbmfp->dbmp;
418 	ret = 0;
419 
420 	MP_PANIC_CHECK(dbmp);
421 
422 	for (;;) {
423 		LOCKHANDLE(dbmp, dbmp->mutexp);
424 
425 		/*
426 		 * We have to reference count DB_MPOOLFILE structures as other
427 		 * threads may be using them.  The problem only happens if the
428 		 * application makes a bad design choice.  Here's the path:
429 		 *
430 		 * Thread A opens a database.
431 		 * Thread B uses thread A's DB_MPOOLFILE to write a buffer
432 		 *    in order to free up memory in the mpool cache.
433 		 * Thread A closes the database while thread B is using the
434 		 *    DB_MPOOLFILE structure.
435 		 *
436 		 * By opening all databases before creating the threads, and
437 		 * closing them after the threads have exited, applications
438 		 * get better performance and avoid the problem path entirely.
439 		 *
440 		 * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer
441 		 * is a short-term lock, even in worst case, since we better be
442 		 * the only thread of control using the DB_MPOOLFILE structure
443 		 * to read pages *into* the cache.  Wait until we're the only
444 		 * reference holder and remove the DB_MPOOLFILE structure from
445 		 * the list, so nobody else can even find it.
446 		 */
447 		if (dbmfp->ref == 1) {
448 			TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
449 			break;
450 		}
451 		UNLOCKHANDLE(dbmp, dbmp->mutexp);
452 
453 		(void)__os_sleep(1, 0);
454 	}
455 	UNLOCKHANDLE(dbmp, dbmp->mutexp);
456 
457 	/* Complain if pinned blocks never returned. */
458 	if (dbmfp->pinref != 0)
459 		__db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned",
460 		    __memp_fn(dbmfp), (u_long)dbmfp->pinref);
461 
462 	/* Close the underlying MPOOLFILE. */
463 	(void)__memp_mf_close(dbmp, dbmfp);
464 
465 	/* Discard any mmap information. */
466 	if (dbmfp->addr != NULL &&
467 	    (ret = __db_unmapfile(dbmfp->addr, dbmfp->len)) != 0)
468 		__db_err(dbmp->dbenv,
469 		    "%s: %s", __memp_fn(dbmfp), strerror(ret));
470 
471 	/* Close the file; temporary files may not yet have been created. */
472 	if (dbmfp->fd != -1 && (t_ret = __os_close(dbmfp->fd)) != 0) {
473 		__db_err(dbmp->dbenv,
474 		    "%s: %s", __memp_fn(dbmfp), strerror(t_ret));
475 		if (ret != 0)
476 			t_ret = ret;
477 	}
478 
479 	/* Free memory. */
480 	if (dbmfp->mutexp != NULL) {
481 		LOCKREGION(dbmp);
482 		__db_shalloc_free(dbmp->addr, dbmfp->mutexp);
483 		UNLOCKREGION(dbmp);
484 	}
485 
486 	/* Discard the DB_MPOOLFILE structure. */
487 	__os_free(dbmfp, sizeof(DB_MPOOLFILE));
488 
489 	return (ret);
490 }
491 
492 /*
493  * __memp_mf_close --
494  *	Close down an MPOOLFILE.
495  */
496 static int
497 __memp_mf_close(dbmp, dbmfp)
498 	DB_MPOOL *dbmp;
499 	DB_MPOOLFILE *dbmfp;
500 {
501 	BH *bhp, *nbhp;
502 	MPOOL *mp;
503 	MPOOLFILE *mfp;
504 	size_t mf_offset;
505 
506 	mp = dbmp->mp;
507 	mfp = dbmfp->mfp;
508 
509 	LOCKREGION(dbmp);
510 
511 	/* If more than a single reference, simply decrement. */
512 	if (mfp->ref > 1) {
513 		--mfp->ref;
514 		goto ret1;
515 	}
516 
517 	/*
518 	 * Move any BH's held by the file to the free list.  We don't free the
519 	 * memory itself because we may be discarding the memory pool, and it's
520 	 * fairly expensive to reintegrate the buffers back into the region for
521 	 * no purpose.
522 	 */
523 	mf_offset = R_OFFSET(dbmp, mfp);
524 	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
525 		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
526 
527 #ifdef DEBUG_NO_DIRTY
528 		/* Complain if we find any blocks that were left dirty. */
529 		if (F_ISSET(bhp, BH_DIRTY))
530 			__db_err(dbmp->dbenv,
531 			    "%s: close: pgno %lu left dirty; ref %lu",
532 			    __memp_fn(dbmfp),
533 			    (u_long)bhp->pgno, (u_long)bhp->ref);
534 #endif
535 
536 		if (bhp->mf_offset == mf_offset) {
537 			if (F_ISSET(bhp, BH_DIRTY)) {
538 				++mp->stat.st_page_clean;
539 				--mp->stat.st_page_dirty;
540 			}
541 			__memp_bhfree(dbmp, mfp, bhp, 0);
542 			SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh);
543 		}
544 	}
545 
546 	/* Delete from the list of MPOOLFILEs. */
547 	SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile);
548 
549 	/* Free the space. */
550 	if (mfp->path_off != 0)
551 		__db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->path_off));
552 	if (mfp->fileid_off != 0)
553 		__db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->fileid_off));
554 	if (mfp->pgcookie_off != 0)
555 		__db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->pgcookie_off));
556 	__db_shalloc_free(dbmp->addr, mfp);
557 
558 ret1:	UNLOCKREGION(dbmp);
559 	return (0);
560 }
561