xref: /titanic_41/usr/src/cmd/sendmail/db/mp/mp_bh.c (revision 7c2fbfb345896881c631598ee3852ce9ce33fb07)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 #include "config.h"
8 
9 #ifndef lint
10 static const char sccsid[] = "@(#)mp_bh.c	10.45 (Sleepycat) 11/25/98";
11 #endif /* not lint */
12 
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
15 
16 #include <errno.h>
17 #include <string.h>
18 #include <unistd.h>
19 #endif
20 
21 #include "db_int.h"
22 #include "shqueue.h"
23 #include "db_shash.h"
24 #include "mp.h"
25 #include "common_ext.h"
26 
27 static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
28 
29 /*
30  * __memp_bhwrite --
31  *	Write the page associated with a given bucket header.
32  *
33  * PUBLIC: int __memp_bhwrite
34  * PUBLIC:     __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
35  */
36 int
37 __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
38 	DB_MPOOL *dbmp;
39 	MPOOLFILE *mfp;
40 	BH *bhp;
41 	int *restartp, *wrotep;
42 {
43 	DB_MPOOLFILE *dbmfp;
44 	DB_MPREG *mpreg;
45 	int incremented, ret;
46 
47 	if (restartp != NULL)
48 		*restartp = 0;
49 	if (wrotep != NULL)
50 		*wrotep = 0;
51 	incremented = 0;
52 
53 	/*
54 	 * Walk the process' DB_MPOOLFILE list and find a file descriptor for
55 	 * the file.  We also check that the descriptor is open for writing.
56 	 * If we find a descriptor on the file that's not open for writing, we
57 	 * try and upgrade it to make it writeable.  If that fails, we're done.
58 	 */
59 	LOCKHANDLE(dbmp, dbmp->mutexp);
60 	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
61 	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
62 		if (dbmfp->mfp == mfp) {
63 			if (F_ISSET(dbmfp, MP_READONLY) &&
64 			    __memp_upgrade(dbmp, dbmfp, mfp)) {
65 				UNLOCKHANDLE(dbmp, dbmp->mutexp);
66 				return (0);
67 			}
68 
69 			/*
70 			 * Increment the reference count -- see the comment in
71 			 * memp_fclose().
72 			 */
73 			++dbmfp->ref;
74 			incremented = 1;
75 			break;
76 		}
77 	UNLOCKHANDLE(dbmp, dbmp->mutexp);
78 	if (dbmfp != NULL)
79 		goto found;
80 
81 	/*
82 	 * It's not a page from a file we've opened.  If the file requires
83 	 * input/output processing, see if this process has ever registered
84 	 * information as to how to write this type of file.  If not, there's
85 	 * nothing we can do.
86 	 */
87 	if (mfp->ftype != 0) {
88 		LOCKHANDLE(dbmp, dbmp->mutexp);
89 		for (mpreg = LIST_FIRST(&dbmp->dbregq);
90 		    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
91 			if (mpreg->ftype == mfp->ftype)
92 				break;
93 		UNLOCKHANDLE(dbmp, dbmp->mutexp);
94 		if (mpreg == NULL)
95 			return (0);
96 	}
97 
98 	/*
99 	 * Try and open the file, attaching to the underlying shared area.
100 	 *
101 	 * XXX
102 	 * Don't try to attach to temporary files.  There are two problems in
103 	 * trying to do that.  First, if we have different privileges than the
104 	 * process that "owns" the temporary file, we might create the backing
105 	 * disk file such that the owning process couldn't read/write its own
106 	 * buffers, e.g., memp_trickle() running as root creating a file owned
107 	 * as root, mode 600.  Second, if the temporary file has already been
108 	 * created, we don't have any way of finding out what its real name is,
109 	 * and, even if we did, it was already unlinked (so that it won't be
110 	 * left if the process dies horribly).  This decision causes a problem,
111 	 * however: if the temporary file consumes the entire buffer cache,
112 	 * and the owner doesn't flush the buffers to disk, we could end up
113 	 * with resource starvation, and the memp_trickle() thread couldn't do
114 	 * anything about it.  That's a pretty unlikely scenario, though.
115 	 *
116 	 * XXX
117 	 * There's no negative cache, so we may repeatedly try and open files
118 	 * that we have previously tried (and failed) to open.
119 	 *
120 	 * Ignore any error, assume it's a permissions problem.
121 	 */
122 	if (F_ISSET(mfp, MP_TEMP))
123 		return (0);
124 
125 	if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp, mfp->path_off),
126 	    0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
127 		return (0);
128 
129 found:	ret = __memp_pgwrite(dbmfp, bhp, restartp, wrotep);
130 
131 	if (incremented) {
132 		LOCKHANDLE(dbmp, dbmp->mutexp);
133 		--dbmfp->ref;
134 		UNLOCKHANDLE(dbmp, dbmp->mutexp);
135 	}
136 
137 	return (ret);
138 }
139 
140 /*
141  * __memp_pgread --
142  *	Read a page from a file.
143  *
144  * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
145  */
146 int
147 __memp_pgread(dbmfp, bhp, can_create)
148 	DB_MPOOLFILE *dbmfp;
149 	BH *bhp;
150 	int can_create;
151 {
152 	DB_IO db_io;
153 	DB_MPOOL *dbmp;
154 	MPOOLFILE *mfp;
155 	size_t len, pagesize;
156 	ssize_t nr;
157 	int created, ret;
158 
159 	dbmp = dbmfp->dbmp;
160 	mfp = dbmfp->mfp;
161 	pagesize = mfp->stat.st_pagesize;
162 
163 	F_SET(bhp, BH_LOCKED | BH_TRASH);
164 	LOCKBUFFER(dbmp, bhp);
165 	UNLOCKREGION(dbmp);
166 
167 	/*
168 	 * Temporary files may not yet have been created.  We don't create
169 	 * them now, we create them when the pages have to be flushed.
170 	 */
171 	nr = 0;
172 	if (dbmfp->fd == -1)
173 		ret = 0;
174 	else {
175 		/*
176 		 * Ignore read errors if we have permission to create the page.
177 		 * Assume that the page doesn't exist, and that we'll create it
178 		 * when we write it out.
179 		 */
180 		db_io.fd_io = dbmfp->fd;
181 		db_io.fd_lock = dbmp->reginfo.fd;
182 		db_io.mutexp =
183 		    F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL;
184 		db_io.pagesize = db_io.bytes = pagesize;
185 		db_io.pgno = bhp->pgno;
186 		db_io.buf = bhp->buf;
187 
188 		ret = __os_io(&db_io, DB_IO_READ, &nr);
189 	}
190 
191 	created = 0;
192 	if (nr < (ssize_t)pagesize)
193 		if (can_create)
194 			created = 1;
195 		else {
196 			/* If we had a short read, ret may be 0. */
197 			if (ret == 0)
198 				ret = EIO;
199 			__db_err(dbmp->dbenv,
200 			    "%s: page %lu doesn't exist, create flag not set",
201 			    __memp_fn(dbmfp), (u_long)bhp->pgno);
202 			goto err;
203 		}
204 
205 	/*
206 	 * Clear any bytes we didn't read that need to be cleared.  If we're
207 	 * running in diagnostic mode, smash any bytes on the page that are
208 	 * unknown quantities for the caller.
209 	 */
210 	if (nr != (ssize_t)pagesize) {
211 		len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
212 		if (nr < (ssize_t)len)
213 			memset(bhp->buf + nr, 0, len - nr);
214 #ifdef DIAGNOSTIC
215 		if (nr > (ssize_t)len)
216 			len = nr;
217 		if (len < pagesize)
218 			memset(bhp->buf + len, 0xdb, pagesize - len);
219 #endif
220 	}
221 
222 	/* Call any pgin function. */
223 	ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
224 
225 	/* Unlock the buffer and reacquire the region lock. */
226 err:	UNLOCKBUFFER(dbmp, bhp);
227 	LOCKREGION(dbmp);
228 
229 	/*
230 	 * If no errors occurred, the data is now valid, clear the BH_TRASH
231 	 * flag; regardless, clear the lock bit and let other threads proceed.
232 	 */
233 	F_CLR(bhp, BH_LOCKED);
234 	if (ret == 0) {
235 		F_CLR(bhp, BH_TRASH);
236 
237 		/* Update the statistics. */
238 		if (created) {
239 			++dbmp->mp->stat.st_page_create;
240 			++mfp->stat.st_page_create;
241 		} else {
242 			++dbmp->mp->stat.st_page_in;
243 			++mfp->stat.st_page_in;
244 		}
245 	}
246 
247 	return (ret);
248 }
249 
250 /*
251  * __memp_pgwrite --
252  *	Write a page to a file.
253  *
254  * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));
255  */
256 int
257 __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
258 	DB_MPOOLFILE *dbmfp;
259 	BH *bhp;
260 	int *restartp, *wrotep;
261 {
262 	DB_ENV *dbenv;
263 	DB_IO db_io;
264 	DB_LOG *lg_info;
265 	DB_LSN lsn;
266 	DB_MPOOL *dbmp;
267 	MPOOL *mp;
268 	MPOOLFILE *mfp;
269 	ssize_t nw;
270 	int callpgin, dosync, ret, syncfail;
271 	const char *fail;
272 
273 	dbmp = dbmfp->dbmp;
274 	dbenv = dbmp->dbenv;
275 	mp = dbmp->mp;
276 	mfp = dbmfp->mfp;
277 
278 	if (restartp != NULL)
279 		*restartp = 0;
280 	if (wrotep != NULL)
281 		*wrotep = 0;
282 	callpgin = 0;
283 
284 	/*
285 	 * Check the dirty bit -- this buffer may have been written since we
286 	 * decided to write it.
287 	 */
288 	if (!F_ISSET(bhp, BH_DIRTY)) {
289 		if (wrotep != NULL)
290 			*wrotep = 1;
291 		return (0);
292 	}
293 
294 	LOCKBUFFER(dbmp, bhp);
295 
296 	/*
297 	 * If there were two writers, we may have just been waiting while the
298 	 * other writer completed I/O on this buffer.  Check the dirty bit one
299 	 * more time.
300 	 */
301 	if (!F_ISSET(bhp, BH_DIRTY)) {
302 		UNLOCKBUFFER(dbmp, bhp);
303 
304 		if (wrotep != NULL)
305 			*wrotep = 1;
306 		return (0);
307 	}
308 
309 	F_SET(bhp, BH_LOCKED);
310 	UNLOCKREGION(dbmp);
311 
312 	if (restartp != NULL)
313 		*restartp = 1;
314 
315 	/* Copy the LSN off the page if we're going to need it. */
316 	lg_info = dbenv->lg_info;
317 	if (lg_info != NULL || F_ISSET(bhp, BH_WRITE))
318 		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
319 
320 	/* Ensure the appropriate log records are on disk. */
321 	if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0)
322 		goto err;
323 
324 	/*
325 	 * Call any pgout function.  We set the callpgin flag so that we flag
326 	 * that the contents of the buffer will need to be passed through pgin
327 	 * before they are reused.
328 	 */
329 	if (mfp->ftype == 0)
330 		ret = 0;
331 	else {
332 		callpgin = 1;
333 		if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
334 			goto err;
335 	}
336 
337 	/* Temporary files may not yet have been created. */
338 	if (dbmfp->fd == -1) {
339 		LOCKHANDLE(dbmp, dbmfp->mutexp);
340 		if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv,
341 		    DB_APP_TMP, NULL, NULL, DB_CREATE | DB_EXCL | DB_TEMPORARY,
342 		    &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) {
343 			UNLOCKHANDLE(dbmp, dbmfp->mutexp);
344 			__db_err(dbenv,
345 			    "unable to create temporary backing file");
346 			goto err;
347 		}
348 		UNLOCKHANDLE(dbmp, dbmfp->mutexp);
349 	}
350 
351 	/* Write the page. */
352 	db_io.fd_io = dbmfp->fd;
353 	db_io.fd_lock = dbmp->reginfo.fd;
354 	db_io.mutexp = F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL;
355 	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
356 	db_io.pgno = bhp->pgno;
357 	db_io.buf = bhp->buf;
358 	if ((ret = __os_io(&db_io, DB_IO_WRITE, &nw)) != 0) {
359 		__db_panic(dbenv, ret);
360 		fail = "write";
361 		goto syserr;
362 	}
363 	if (nw != (ssize_t)mfp->stat.st_pagesize) {
364 		ret = EIO;
365 		fail = "write";
366 		goto syserr;
367 	}
368 
369 	if (wrotep != NULL)
370 		*wrotep = 1;
371 
372 	/* Unlock the buffer and reacquire the region lock. */
373 	UNLOCKBUFFER(dbmp, bhp);
374 	LOCKREGION(dbmp);
375 
376 	/*
377 	 * Clean up the flags based on a successful write.
378 	 *
379 	 * If we rewrote the page, it will need processing by the pgin
380 	 * routine before reuse.
381 	 */
382 	if (callpgin)
383 		F_SET(bhp, BH_CALLPGIN);
384 	F_CLR(bhp, BH_DIRTY | BH_LOCKED);
385 
386 	/*
387 	 * If we write a buffer for which a checkpoint is waiting, update
388 	 * the count of pending buffers (both in the mpool as a whole and
389 	 * for this file).  If the count for this file goes to zero, set a
390 	 * flag so we flush the writes.
391 	 */
392 	if (F_ISSET(bhp, BH_WRITE)) {
393 		F_CLR(bhp, BH_WRITE);
394 
395 		--mp->lsn_cnt;
396 		dosync = --mfp->lsn_cnt == 0 ? 1 : 0;
397 	} else
398 		dosync = 0;
399 
400 	/* Update the page clean/dirty statistics. */
401 	++mp->stat.st_page_clean;
402 	--mp->stat.st_page_dirty;
403 
404 	/* Update I/O statistics. */
405 	++mp->stat.st_page_out;
406 	++mfp->stat.st_page_out;
407 
408 	/*
409 	 * Do the sync after everything else has been updated, so any incoming
410 	 * checkpoint doesn't see inconsistent information.
411 	 *
412 	 * XXX:
413 	 * Don't lock the region around the sync, fsync(2) has no atomicity
414 	 * issues.
415 	 *
416 	 * XXX:
417 	 * We ignore errors from the sync -- it makes no sense to return an
418 	 * error to the calling process, so set a flag causing the checkpoint
419 	 * to be retried later.  There is a possibility, of course, that a
420 	 * subsequent checkpoint was started and that we're going to force it
421 	 * to fail.  That should be unlikely, and fixing it would be difficult.
422 	 */
423 	if (dosync) {
424 		UNLOCKREGION(dbmp);
425 		syncfail = __os_fsync(dbmfp->fd) != 0;
426 		LOCKREGION(dbmp);
427 		if (syncfail)
428 			F_SET(mp, MP_LSN_RETRY);
429 	}
430 
431 	return (0);
432 
433 syserr:	__db_err(dbenv, "%s: %s failed for page %lu",
434 	    __memp_fn(dbmfp), fail, (u_long)bhp->pgno);
435 
436 err:	/* Unlock the buffer and reacquire the region lock. */
437 	UNLOCKBUFFER(dbmp, bhp);
438 	LOCKREGION(dbmp);
439 
440 	/*
441 	 * Clean up the flags based on a failure.
442 	 *
443 	 * The page remains dirty but we remove our lock.  If we rewrote the
444 	 * page, it will need processing by the pgin routine before reuse.
445 	 */
446 	if (callpgin)
447 		F_SET(bhp, BH_CALLPGIN);
448 	F_CLR(bhp, BH_LOCKED);
449 
450 	return (ret);
451 }
452 
453 /*
454  * __memp_pg --
455  *	Call the pgin/pgout routine.
456  *
457  * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
458  */
459 int
460 __memp_pg(dbmfp, bhp, is_pgin)
461 	DB_MPOOLFILE *dbmfp;
462 	BH *bhp;
463 	int is_pgin;
464 {
465 	DBT dbt, *dbtp;
466 	DB_MPOOL *dbmp;
467 	DB_MPREG *mpreg;
468 	MPOOLFILE *mfp;
469 	int ftype, ret;
470 
471 	dbmp = dbmfp->dbmp;
472 	mfp = dbmfp->mfp;
473 
474 	LOCKHANDLE(dbmp, dbmp->mutexp);
475 
476 	ftype = mfp->ftype;
477 	for (mpreg = LIST_FIRST(&dbmp->dbregq);
478 	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) {
479 		if (ftype != mpreg->ftype)
480 			continue;
481 		if (mfp->pgcookie_len == 0)
482 			dbtp = NULL;
483 		else {
484 			dbt.size = mfp->pgcookie_len;
485 			dbt.data = R_ADDR(dbmp, mfp->pgcookie_off);
486 			dbtp = &dbt;
487 		}
488 		UNLOCKHANDLE(dbmp, dbmp->mutexp);
489 
490 		if (is_pgin) {
491 			if (mpreg->pgin != NULL && (ret =
492 			    mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0)
493 				goto err;
494 		} else
495 			if (mpreg->pgout != NULL && (ret =
496 			    mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0)
497 				goto err;
498 		break;
499 	}
500 
501 	if (mpreg == NULL)
502 		UNLOCKHANDLE(dbmp, dbmp->mutexp);
503 
504 	return (0);
505 
506 err:	UNLOCKHANDLE(dbmp, dbmp->mutexp);
507 	__db_err(dbmp->dbenv, "%s: %s failed for page %lu",
508 	    __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
509 	return (ret);
510 }
511 
512 /*
513  * __memp_bhfree --
514  *	Free a bucket header and its referenced data.
515  *
516  * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
517  */
518 void
519 __memp_bhfree(dbmp, mfp, bhp, free_mem)
520 	DB_MPOOL *dbmp;
521 	MPOOLFILE *mfp;
522 	BH *bhp;
523 	int free_mem;
524 {
525 	size_t off;
526 
527 	/* Delete the buffer header from the hash bucket queue. */
528 	off = BUCKET(dbmp->mp, R_OFFSET(dbmp, mfp), bhp->pgno);
529 	SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, hq, __bh);
530 
531 	/* Delete the buffer header from the LRU queue. */
532 	SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
533 
534 	/*
535 	 * If we're not reusing it immediately, free the buffer header
536 	 * and data for real.
537 	 */
538 	if (free_mem) {
539 		__db_shalloc_free(dbmp->addr, bhp);
540 		--dbmp->mp->stat.st_page_clean;
541 	}
542 }
543 
544 /*
545  * __memp_upgrade --
546  *	Upgrade a file descriptor from readonly to readwrite.
547  */
548 static int
549 __memp_upgrade(dbmp, dbmfp, mfp)
550 	DB_MPOOL *dbmp;
551 	DB_MPOOLFILE *dbmfp;
552 	MPOOLFILE *mfp;
553 {
554 	int fd, ret;
555 	char *rpath;
556 
557 	/*
558 	 * !!!
559 	 * We expect the handle to already be locked.
560 	 */
561 
562 	/* Check to see if we've already upgraded. */
563 	if (F_ISSET(dbmfp, MP_UPGRADE))
564 		return (0);
565 
566 	/* Check to see if we've already failed. */
567 	if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
568 		return (1);
569 
570 	/*
571 	 * Calculate the real name for this file and try to open it read/write.
572 	 * We know we have a valid pathname for the file because it's the only
573 	 * way we could have gotten a file descriptor of any kind.
574 	 */
575 	if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
576 	    NULL, R_ADDR(dbmp, mfp->path_off), 0, NULL, &rpath)) != 0)
577 		return (ret);
578 	if (__db_open(rpath, 0, 0, 0, &fd) != 0) {
579 		F_SET(dbmfp, MP_UPGRADE_FAIL);
580 		ret = 1;
581 	} else {
582 		/* Swap the descriptors and set the upgrade flag. */
583 		(void)__os_close(dbmfp->fd);
584 		dbmfp->fd = fd;
585 		F_SET(dbmfp, MP_UPGRADE);
586 		ret = 0;
587 	}
588 	__os_freestr(rpath);
589 	return (ret);
590 }
591