xref: /freebsd/sys/kern/vfs_bio.c (revision a8445737e740901f5f2c8d24c12ef7fc8b00134e)
1 /*
2  * Copyright (c) 1994,1997 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Absolutely no warranty of function or purpose is made by the author
12  *		John S. Dyson.
13  *
14  * $Id: vfs_bio.c,v 1.175 1998/09/05 14:13:06 phk Exp $
15  */
16 
17 /*
18  * this file contains a new buffer I/O scheme implementing a coherent
19  * VM object and buffer cache scheme.  Pains have been taken to make
20  * sure that the performance degradation associated with schemes such
21  * as this is not realized.
22  *
23  * Author:  John S. Dyson
24  * Significant help during the development and debugging phases
25  * had been provided by David Greenman, also of the FreeBSD core team.
26  */
27 
28 #include "opt_bounce.h"
29 
30 #define VMIO
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysproto.h>
34 #include <sys/kernel.h>
35 #include <sys/sysctl.h>
36 #include <sys/proc.h>
37 #include <sys/vnode.h>
38 #include <sys/vmmeter.h>
39 #include <sys/lock.h>
40 #include <miscfs/specfs/specdev.h>
41 #include <vm/vm.h>
42 #include <vm/vm_param.h>
43 #include <vm/vm_prot.h>
44 #include <vm/vm_kern.h>
45 #include <vm/vm_pageout.h>
46 #include <vm/vm_page.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_extern.h>
49 #include <vm/vm_map.h>
50 #include <sys/buf.h>
51 #include <sys/mount.h>
52 #include <sys/malloc.h>
53 #include <sys/resourcevar.h>
54 
55 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
56 
57 struct	bio_ops bioops;		/* I/O operation notification */
58 
59 #if 0 	/* replaced by sched_sync */
60 static void vfs_update __P((void));
61 static struct	proc *updateproc;
62 static struct kproc_desc up_kp = {
63 	"update",
64 	vfs_update,
65 	&updateproc
66 };
67 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68 #endif
69 
70 struct buf *buf;		/* buffer header pool */
71 struct swqueue bswlist;
72 
73 static int count_lock_queue __P((void));
74 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
75 		vm_offset_t to);
76 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
77 		vm_offset_t to);
78 static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
79 			      vm_offset_t off, vm_offset_t size,
80 			      vm_page_t m);
81 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
82 			       int pageno, vm_page_t m);
83 static void vfs_clean_pages(struct buf * bp);
84 static void vfs_setdirty(struct buf *bp);
85 static void vfs_vmio_release(struct buf *bp);
86 static void flushdirtybuffers(int slpflag, int slptimeo);
87 
88 int needsbuffer;
89 
90 /*
91  * Internal update daemon, process 3
92  *	The variable vfs_update_wakeup allows for internal syncs.
93  */
94 int vfs_update_wakeup;
95 
96 
97 /*
98  * buffers base kva
99  */
100 
101 /*
102  * bogus page -- for I/O to/from partially complete buffers
103  * this is a temporary solution to the problem, but it is not
104  * really that bad.  it would be better to split the buffer
105  * for input in the case of buffers partially already in memory,
106  * but the code is intricate enough already.
107  */
108 vm_page_t bogus_page;
109 static vm_offset_t bogus_offset;
110 
111 static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
112 	bufmallocspace, maxbufmallocspace;
113 int numdirtybuffers;
114 static int lodirtybuffers, hidirtybuffers;
115 static int numfreebuffers, lofreebuffers, hifreebuffers;
116 static int kvafreespace;
117 
118 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
119 	&numdirtybuffers, 0, "");
120 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
121 	&lodirtybuffers, 0, "");
122 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
123 	&hidirtybuffers, 0, "");
124 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
125 	&numfreebuffers, 0, "");
126 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
127 	&lofreebuffers, 0, "");
128 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
129 	&hifreebuffers, 0, "");
130 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
131 	&maxbufspace, 0, "");
132 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
133 	&bufspace, 0, "");
134 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
135 	&maxvmiobufspace, 0, "");
136 SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
137 	&vmiospace, 0, "");
138 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
139 	&maxbufmallocspace, 0, "");
140 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
141 	&bufmallocspace, 0, "");
142 SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
143 	&kvafreespace, 0, "");
144 
145 static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
146 struct bqueues bufqueues[BUFFER_QUEUES] = {0};
147 
148 extern int vm_swap_size;
149 
150 #define BUF_MAXUSE 24
151 
152 #define VFS_BIO_NEED_ANY 1
153 #define VFS_BIO_NEED_LOWLIMIT 2
154 #define VFS_BIO_NEED_FREE 4
155 
156 /*
157  * Initialize buffer headers and related structures.
158  */
159 void
160 bufinit()
161 {
162 	struct buf *bp;
163 	int i;
164 
165 	TAILQ_INIT(&bswlist);
166 	LIST_INIT(&invalhash);
167 
168 	/* first, make a null hash table */
169 	for (i = 0; i < BUFHSZ; i++)
170 		LIST_INIT(&bufhashtbl[i]);
171 
172 	/* next, make a null set of free lists */
173 	for (i = 0; i < BUFFER_QUEUES; i++)
174 		TAILQ_INIT(&bufqueues[i]);
175 
176 	/* finally, initialize each buffer header and stick on empty q */
177 	for (i = 0; i < nbuf; i++) {
178 		bp = &buf[i];
179 		bzero(bp, sizeof *bp);
180 		bp->b_flags = B_INVAL;	/* we're just an empty header */
181 		bp->b_dev = NODEV;
182 		bp->b_rcred = NOCRED;
183 		bp->b_wcred = NOCRED;
184 		bp->b_qindex = QUEUE_EMPTY;
185 		bp->b_vnbufs.le_next = NOLIST;
186 		LIST_INIT(&bp->b_dep);
187 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
188 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
189 	}
190 /*
191  * maxbufspace is currently calculated to support all filesystem blocks
192  * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
193  * cache is still the same as it would be for 8K filesystems.  This
194  * keeps the size of the buffer cache "in check" for big block filesystems.
195  */
196 	maxbufspace = (nbuf + 8) * DFLTBSIZE;
197 /*
198  * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
199  */
200 	maxvmiobufspace = 2 * maxbufspace / 3;
201 /*
202  * Limit the amount of malloc memory since it is wired permanently into
203  * the kernel space.  Even though this is accounted for in the buffer
204  * allocation, we don't want the malloced region to grow uncontrolled.
205  * The malloc scheme improves memory utilization significantly on average
206  * (small) directories.
207  */
208 	maxbufmallocspace = maxbufspace / 20;
209 
210 /*
211  * Reduce the probability of deadlock conditions by limiting the
212  * number of dirty buffers.
213  */
214 	hidirtybuffers = nbuf / 8 + 20;
215 	lodirtybuffers = nbuf / 16 + 10;
216 	numdirtybuffers = 0;
217 	lofreebuffers = nbuf / 18 + 5;
218 	hifreebuffers = 2 * lofreebuffers;
219 	numfreebuffers = nbuf;
220 	kvafreespace = 0;
221 
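	/*
	 * Allocate kva and a page for bogus_page, which stands in for
	 * already-valid pages when a partially valid buffer is sent down
	 * for I/O (see the comment above its declaration).
	 */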
222 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
223 	bogus_page = vm_page_alloc(kernel_object,
224 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
225 			VM_ALLOC_NORMAL);
226 
227 }
228 
229 /*
230  * Free the kva allocation for a buffer
231  * Must be called only at splbio or higher,
232  *  as this is the only locking for buffer_map.
233  */
234 static void
235 bfreekva(struct buf * bp)
236 {
237 	if (bp->b_kvasize == 0)
238 		return;
239 
240 	vm_map_delete(buffer_map,
241 		(vm_offset_t) bp->b_kvabase,
242 		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
243 
244 	bp->b_kvasize = 0;
245 
246 }
247 
248 /*
249  * remove the buffer from the appropriate free list
250  */
251 void
252 bremfree(struct buf * bp)
253 {
254 	int s = splbio();
255 
256 	if (bp->b_qindex != QUEUE_NONE) {
257 		if (bp->b_qindex == QUEUE_EMPTY) {
258 			kvafreespace -= bp->b_kvasize;
259 		}
260 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
261 		bp->b_qindex = QUEUE_NONE;
262 	} else {
263 #if !defined(MAX_PERF)
264 		panic("bremfree: removing a buffer when not on a queue");
265 #endif
266 	}
267 	if ((bp->b_flags & B_INVAL) ||
268 		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
269 		--numfreebuffers;
270 	splx(s);
271 }
272 
273 
274 /*
275  * Get a buffer with the specified data.  Look in the cache first.
276  */
277 int
278 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
279     struct buf ** bpp)
280 {
281 	struct buf *bp;
282 
283 	bp = getblk(vp, blkno, size, 0, 0);
284 	*bpp = bp;
285 
286 	/* if not found in cache, do some I/O */
287 	if ((bp->b_flags & B_CACHE) == 0) {
288 		if (curproc != NULL)
289 			curproc->p_stats->p_ru.ru_inblock++;
290 		bp->b_flags |= B_READ;
291 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
292 		if (bp->b_rcred == NOCRED) {
293 			if (cred != NOCRED)
294 				crhold(cred);
295 			bp->b_rcred = cred;
296 		}
297 		vfs_busy_pages(bp, 0);
298 		VOP_STRATEGY(vp, bp);
299 		return (biowait(bp));
300 	}
301 	return (0);
302 }
303 
304 /*
305  * Operates like bread, but also starts asynchronous I/O on
306  * read-ahead blocks.
307  */
308 int
309 breadn(struct vnode * vp, daddr_t blkno, int size,
310     daddr_t * rablkno, int *rabsize,
311     int cnt, struct ucred * cred, struct buf ** bpp)
312 {
313 	struct buf *bp, *rabp;
314 	int i;
315 	int rv = 0, readwait = 0;
316 
317 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
318 
319 	/* if not found in cache, do some I/O */
320 	if ((bp->b_flags & B_CACHE) == 0) {
321 		if (curproc != NULL)
322 			curproc->p_stats->p_ru.ru_inblock++;
323 		bp->b_flags |= B_READ;
324 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
325 		if (bp->b_rcred == NOCRED) {
326 			if (cred != NOCRED)
327 				crhold(cred);
328 			bp->b_rcred = cred;
329 		}
330 		vfs_busy_pages(bp, 0);
331 		VOP_STRATEGY(vp, bp);
332 		++readwait;
333 	}
334 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
335 		if (inmem(vp, *rablkno))
336 			continue;
337 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
338 
339 		if ((rabp->b_flags & B_CACHE) == 0) {
340 			if (curproc != NULL)
341 				curproc->p_stats->p_ru.ru_inblock++;
342 			rabp->b_flags |= B_READ | B_ASYNC;
343 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
344 			if (rabp->b_rcred == NOCRED) {
345 				if (cred != NOCRED)
346 					crhold(cred);
347 				rabp->b_rcred = cred;
348 			}
349 			vfs_busy_pages(rabp, 0);
350 			VOP_STRATEGY(vp, rabp);
351 		} else {
352 			brelse(rabp);
353 		}
354 	}
355 
356 	if (readwait) {
357 		rv = biowait(bp);
358 	}
359 	return (rv);
360 }
361 
362 /*
363  * Write, release buffer on completion.  (Done by iodone
364  * if async.)
365  */
366 int
367 bwrite(struct buf * bp)
368 {
369 	int oldflags, s;
370 	struct vnode *vp;
371 	struct mount *mp;
372 
373 
374 	if (bp->b_flags & B_INVAL) {
375 		brelse(bp);
376 		return (0);
377 	}
378 
379 	oldflags = bp->b_flags;
380 
381 #if !defined(MAX_PERF)
382 	if ((bp->b_flags & B_BUSY) == 0)
383 		panic("bwrite: buffer is not busy???");
384 #endif
385 
386 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
387 	bp->b_flags |= B_WRITEINPROG;
388 
389 	s = splbio();
390 	if ((oldflags & B_DELWRI) == B_DELWRI) {
391 		--numdirtybuffers;
392 		reassignbuf(bp, bp->b_vp);
393 	}
394 
395 	bp->b_vp->v_numoutput++;
396 	vfs_busy_pages(bp, 1);
397 	if (curproc != NULL)
398 		curproc->p_stats->p_ru.ru_oublock++;
399 	splx(s);
400 	VOP_STRATEGY(bp->b_vp, bp);
401 
402 	/*
403 	 * Collect statistics on synchronous and asynchronous writes.
404 	 * Writes to block devices are charged to their associated
405 	 * filesystem (if any).
406 	 */
407 	if ((vp = bp->b_vp) != NULL) {
408 		if (vp->v_type == VBLK)
409 			mp = vp->v_specmountpoint;
410 		else
411 			mp = vp->v_mount;
412 		if (mp != NULL)
413 			if ((oldflags & B_ASYNC) == 0)
414 				mp->mnt_stat.f_syncwrites++;
415 			else
416 				mp->mnt_stat.f_asyncwrites++;
417 	}
418 
419 	if ((oldflags & B_ASYNC) == 0) {
420 		int rtval = biowait(bp);
421 		brelse(bp);
422 		return (rtval);
423 	}
424 	return (0);
425 }
426 
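/*
 * Note that a buffer has become free and wake up anyone sleeping in
 * needsbuffer once the corresponding thresholds have been satisfied.
 */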
427 __inline void
428 vfs_bio_need_satisfy(void) {
429 	++numfreebuffers;
430 	if (!needsbuffer)
431 		return;
432 	if (numdirtybuffers < lodirtybuffers) {
433 		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
434 	} else {
435 		needsbuffer &= ~VFS_BIO_NEED_ANY;
436 	}
437 	if (numfreebuffers >= hifreebuffers) {
438 		needsbuffer &= ~VFS_BIO_NEED_FREE;
439 	}
440 	wakeup(&needsbuffer);
441 }
442 
443 /*
444  * Delayed write. (Buffer is marked dirty).
445  */
446 void
447 bdwrite(struct buf * bp)
448 {
449 	int s;
450 	struct vnode *vp;
451 
452 #if !defined(MAX_PERF)
453 	if ((bp->b_flags & B_BUSY) == 0) {
454 		panic("bdwrite: buffer is not busy");
455 	}
456 #endif
457 
458 	if (bp->b_flags & B_INVAL) {
459 		brelse(bp);
460 		return;
461 	}
462 	bp->b_flags &= ~(B_READ|B_RELBUF);
463 	if ((bp->b_flags & B_DELWRI) == 0) {
464 		bp->b_flags |= B_DONE | B_DELWRI;
465 		reassignbuf(bp, bp->b_vp);
466 		++numdirtybuffers;
467 	}
468 
469 	/*
470 	 * This bmap keeps the system from needing to do the bmap later,
471 	 * perhaps when the system is attempting to do a sync.  Since it
472 	 * is likely that the indirect block -- or whatever other datastructure
473 	 * that the filesystem needs is still in memory now, it is a good
474 	 * thing to do this.  Note also, that if the pageout daemon is
475 	 * requesting a sync -- there might not be enough memory to do
476 	 * the bmap then...  So, this is important to do.
477 	 */
478 	if (bp->b_lblkno == bp->b_blkno) {
479 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
480 	}
481 
482 	/*
483 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
484 	 */
485 	vfs_setdirty(bp);
486 
487 	/*
488 	 * We need to do this here to satisfy the vnode_pager and the
489 	 * pageout daemon, so that it thinks that the pages have been
490 	 * "cleaned".  Note that since the pages are in a delayed write
491 	 * buffer -- the VFS layer "will" see that the pages get written
492 	 * out on the next sync, or perhaps the cluster will be completed.
493 	 */
494 	vfs_clean_pages(bp);
495 	bqrelse(bp);
496 
497 	/*
498 	 * XXX The soft dependency code is not prepared to
499 	 * have I/O done when a bdwrite is requested. For
500 	 * now we just let the write be delayed if it is
501 	 * requested by the soft dependency code.
502 	 */
503 	if ((vp = bp->b_vp) &&
504 	    ((vp->v_type == VBLK && vp->v_specmountpoint &&
505 	    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
506 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
507 		return;
508 
509 	if (numdirtybuffers >= hidirtybuffers)
510 		flushdirtybuffers(0, 0);
511 
512 	return;
513 }
514 
515 
516 /*
517  * Same as first half of bdwrite, mark buffer dirty, but do not release it.
518  * Check how this compares with vfs_setdirty(); XXX [JRE]
519  */
520 void
521 bdirty(bp)
522       struct buf *bp;
523 {
524 	int s;
525 
526 	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
527 	if ((bp->b_flags & B_DELWRI) == 0) {
528 		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
529 		reassignbuf(bp, bp->b_vp);
530 		++numdirtybuffers;
531 	}
532 }
533 
534 /*
535  * Asynchronous write.
536  * Start output on a buffer, but do not wait for it to complete.
537  * The buffer is released when the output completes.
538  */
539 void
540 bawrite(struct buf * bp)
541 {
542 	bp->b_flags |= B_ASYNC;
543 	(void) VOP_BWRITE(bp);
544 }
545 
546 /*
547  * Ordered write.
548  * Start output on a buffer, and flag it so that the device will write
549  * it in the order it was queued.  The buffer is released when the output
550  * completes.
551  */
552 int
553 bowrite(struct buf * bp)
554 {
555 	bp->b_flags |= B_ORDERED|B_ASYNC;
556 	return (VOP_BWRITE(bp));
557 }
558 
559 /*
560  * Release a buffer.
561  */
562 void
563 brelse(struct buf * bp)
564 {
565 	int s;
566 
567 	if (bp->b_flags & B_CLUSTER) {
568 		relpbuf(bp);
569 		return;
570 	}
571 
572 	s = splbio();
573 
574 	/* anyone need this block? */
575 	if (bp->b_flags & B_WANTED) {
576 		bp->b_flags &= ~(B_WANTED | B_AGE);
577 		wakeup(bp);
578 	}
579 
580 	if (bp->b_flags & B_LOCKED)
581 		bp->b_flags &= ~B_ERROR;
582 
583 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
584 	    (bp->b_bufsize <= 0)) {
585 		bp->b_flags |= B_INVAL;
586 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
587 			(*bioops.io_deallocate)(bp);
588 		if (bp->b_flags & B_DELWRI)
589 			--numdirtybuffers;
590 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
591 		if ((bp->b_flags & B_VMIO) == 0) {
592 			if (bp->b_bufsize)
593 				allocbuf(bp, 0);
594 			if (bp->b_vp)
595 				brelvp(bp);
596 		}
597 	}
598 
599 	/*
600 	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
601 	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
602 	 * but the VM object is kept around.  The B_NOCACHE flag is used to
603 	 * invalidate the pages in the VM object.
604 	 *
605 	 * If the buffer is a partially filled NFS buffer, keep it
606 	 * since invalidating it now will lose information.  The valid
607 	 * flags in the vm_pages have only DEV_BSIZE resolution but
608 	 * the b_validoff, b_validend fields have byte resolution.
609 	 * This can avoid unnecessary re-reads of the buffer.
610 	 * XXX this seems to cause performance problems.
611 	 */
612 	if ((bp->b_flags & B_VMIO)
613 	    && !(bp->b_vp->v_tag == VT_NFS &&
614 		 bp->b_vp->v_type != VBLK &&
615 		 (bp->b_flags & B_DELWRI) != 0)
616 #ifdef notdef
617 	    && (bp->b_vp->v_tag != VT_NFS
618 		|| bp->b_vp->v_type == VBLK
619 		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
620 		|| bp->b_validend == 0
621 		|| (bp->b_validoff == 0
622 		    && bp->b_validend == bp->b_bufsize))
623 #endif
624 	    ) {
625 
626 		int i, j, resid;
627 		vm_page_t m;
628 		off_t foff;
629 		vm_pindex_t poff;
630 		vm_object_t obj;
631 		struct vnode *vp;
632 
633 		vp = bp->b_vp;
634 
635 		resid = bp->b_bufsize;
636 		foff = bp->b_offset;
637 
638 		for (i = 0; i < bp->b_npages; i++) {
639 			m = bp->b_pages[i];
640 			vm_page_flag_clear(m, PG_ZERO);
641 			if (m == bogus_page) {
642 
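				/*
				 * Replace any bogus pages in the buffer with
				 * the real pages from the backing VM object
				 * before the buffer is torn down.
				 */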
643 				obj = (vm_object_t) vp->v_object;
644 				poff = OFF_TO_IDX(bp->b_offset);
645 
646 				for (j = i; j < bp->b_npages; j++) {
647 					m = bp->b_pages[j];
648 					if (m == bogus_page) {
649 						m = vm_page_lookup(obj, poff + j);
650 #if !defined(MAX_PERF)
651 						if (!m) {
652 							panic("brelse: page missing\n");
653 						}
654 #endif
655 						bp->b_pages[j] = m;
656 					}
657 				}
658 
659 				if ((bp->b_flags & B_INVAL) == 0) {
660 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
661 				}
662 			}
663 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
664 				int poffset = foff & PAGE_MASK;
665 				int presid = resid > (PAGE_SIZE - poffset) ?
666 					(PAGE_SIZE - poffset) : resid;
667 				vm_page_set_invalid(m, poffset, presid);
668 			}
669 			resid -= PAGE_SIZE;
670 		}
671 
672 		if (bp->b_flags & (B_INVAL | B_RELBUF))
673 			vfs_vmio_release(bp);
674 
675 	} else if (bp->b_flags & B_VMIO) {
676 
677 		if (bp->b_flags & (B_INVAL | B_RELBUF))
678 			vfs_vmio_release(bp);
679 
680 	}
681 
682 #if !defined(MAX_PERF)
683 	if (bp->b_qindex != QUEUE_NONE)
684 		panic("brelse: free buffer onto another queue???");
685 #endif
686 
687 	/* enqueue */
688 	/* buffers with no memory */
689 	if (bp->b_bufsize == 0) {
690 		bp->b_flags |= B_INVAL;
691 		bp->b_qindex = QUEUE_EMPTY;
692 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
693 		LIST_REMOVE(bp, b_hash);
694 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
695 		bp->b_dev = NODEV;
696 		kvafreespace += bp->b_kvasize;
697 
698 	/* buffers with junk contents */
699 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
700 		bp->b_flags |= B_INVAL;
701 		bp->b_qindex = QUEUE_AGE;
702 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
703 		LIST_REMOVE(bp, b_hash);
704 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
705 		bp->b_dev = NODEV;
706 
707 	/* buffers that are locked */
708 	} else if (bp->b_flags & B_LOCKED) {
709 		bp->b_qindex = QUEUE_LOCKED;
710 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
711 
712 	/* buffers with stale but valid contents */
713 	} else if (bp->b_flags & B_AGE) {
714 		bp->b_qindex = QUEUE_AGE;
715 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
716 
717 	/* buffers with valid and quite potentially reusable contents */
718 	} else {
719 		bp->b_qindex = QUEUE_LRU;
720 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
721 	}
722 
723 	if ((bp->b_flags & B_INVAL) ||
724 		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
725 		if (bp->b_flags & B_DELWRI) {
726 			--numdirtybuffers;
727 			bp->b_flags &= ~B_DELWRI;
728 		}
729 		vfs_bio_need_satisfy();
730 	}
731 
732 	/* unlock */
733 	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
734 		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
735 	splx(s);
736 }
737 
738 /*
739  * Release a buffer.
740  */
741 void
742 bqrelse(struct buf * bp)
743 {
744 	int s;
745 
746 	s = splbio();
747 
748 	/* anyone need this block? */
749 	if (bp->b_flags & B_WANTED) {
750 		bp->b_flags &= ~(B_WANTED | B_AGE);
751 		wakeup(bp);
752 	}
753 
754 #if !defined(MAX_PERF)
755 	if (bp->b_qindex != QUEUE_NONE)
756 		panic("bqrelse: free buffer onto another queue???");
757 #endif
758 
759 	if (bp->b_flags & B_LOCKED) {
760 		bp->b_flags &= ~B_ERROR;
761 		bp->b_qindex = QUEUE_LOCKED;
762 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
763 		/* buffers with stale but valid contents */
764 	} else {
765 		bp->b_qindex = QUEUE_LRU;
766 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
767 	}
768 
769 	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
770 		vfs_bio_need_satisfy();
771 	}
772 
773 	/* unlock */
774 	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
775 		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
776 	splx(s);
777 }
778 
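/*
 * Release the VM pages backing a VMIO buffer.  Pages are unwired and,
 * for synchronous releases, valid pages are cached or deactivated while
 * pages with no valid contents are freed.  The buffer is then unmapped,
 * emptied of pages, and disassociated from its vnode.
 */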
779 static void
780 vfs_vmio_release(bp)
781 	struct buf *bp;
782 {
783 	int i;
784 	vm_page_t m;
785 
786 	for (i = 0; i < bp->b_npages; i++) {
787 		m = bp->b_pages[i];
788 		bp->b_pages[i] = NULL;
789 		vm_page_unwire(m);
790 
791 		/*
792 		 * We don't mess with busy pages, it is
793 		 * the responsibility of the process that
794 		 * busied the pages to deal with them.
795 		 */
796 		if ((m->flags & PG_BUSY) || (m->busy != 0))
797 			continue;
798 
799 		if (m->wire_count == 0) {
800 
801 			/*
802 			 * If this is an async free, we cannot place pages
803 			 * onto the cache queue, so we don't modify any
804 			 * queues at all.
805 			 * This is probably in error (for perf reasons),
806 			 * and we will eventually need to build
807 			 * a more complete infrastructure to support I/O
808 			 * rundown.
809 			 */
810 			if ((bp->b_flags & B_ASYNC) == 0) {
811 
812 			/*
813 			 * In the case of sync buffer frees, we can do pretty much
814 			 * anything to any of the memory queues.  Specifically,
815 			 * the cache queue is okay to be modified.
816 			 */
817 				if (m->valid) {
818 					if(m->dirty == 0)
819 						vm_page_test_dirty(m);
820 					/*
821 					 * this keeps pressure off of the process memory
822 					 */
823 					if (m->dirty == 0 && m->hold_count == 0)
824 						vm_page_cache(m);
825 					else
826 						vm_page_deactivate(m);
827 					vm_page_flag_clear(m, PG_ZERO);
828 				} else if (m->hold_count == 0) {
829 					vm_page_busy(m);
830 					vm_page_protect(m, VM_PROT_NONE);
831 					vm_page_free(m);
832 				}
833 			} else {
834 				/*
835 				 * If async, then at least we clear the
836 				 * act_count.
837 				 */
838 				m->act_count = 0;
839 				vm_page_flag_clear(m, PG_ZERO);
840 			}
841 		}
842 	}
843 	bufspace -= bp->b_bufsize;
844 	vmiospace -= bp->b_bufsize;
845 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
846 	bp->b_npages = 0;
847 	bp->b_bufsize = 0;
848 	bp->b_flags &= ~B_VMIO;
849 	if (bp->b_vp)
850 		brelvp(bp);
851 }
852 
853 /*
854  * Check to see if a block is currently memory resident.
855  */
856 struct buf *
857 gbincore(struct vnode * vp, daddr_t blkno)
858 {
859 	struct buf *bp;
860 	struct bufhashhdr *bh;
861 
862 	bh = BUFHASH(vp, blkno);
863 	bp = bh->lh_first;
864 
865 	/* Search hash chain */
866 	while (bp != NULL) {
867 		/* hit */
868 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
869 		    (bp->b_flags & B_INVAL) == 0) {
870 			break;
871 		}
872 		bp = bp->b_hash.le_next;
873 	}
874 	return (bp);
875 }
876 
877 /*
878  * this routine implements clustered async writes for
879  * clearing out B_DELWRI buffers...  This is much better
880  * than the old way of writing only one buffer at a time.
881  */
882 int
883 vfs_bio_awrite(struct buf * bp)
884 {
885 	int i;
886 	daddr_t lblkno = bp->b_lblkno;
887 	struct vnode *vp = bp->b_vp;
888 	int s;
889 	int ncl;
890 	struct buf *bpa;
891 	int nwritten;
892 	int size;
893 	int maxcl;
894 
895 	s = splbio();
896 	/*
897 	 * right now we support clustered writing only to regular files
898 	 */
899 	if ((vp->v_type == VREG) &&
900 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
901 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
902 
903 		size = vp->v_mount->mnt_stat.f_iosize;
904 		maxcl = MAXPHYS / size;
905 
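		/*
		 * Count how many of the following logically contiguous
		 * buffers are also dirty, clusterable, idle, the same size,
		 * and physically contiguous on disk.
		 */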
906 		for (i = 1; i < maxcl; i++) {
907 			if ((bpa = gbincore(vp, lblkno + i)) &&
908 			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
909 			    (B_DELWRI | B_CLUSTEROK)) &&
910 			    (bpa->b_bufsize == size)) {
911 				if ((bpa->b_blkno == bpa->b_lblkno) ||
912 				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
913 					break;
914 			} else {
915 				break;
916 			}
917 		}
918 		ncl = i;
919 		/*
920 		 * this is a possible cluster write
921 		 */
922 		if (ncl != 1) {
923 			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
924 			splx(s);
925 			return nwritten;
926 		}
927 	}
928 
929 	bremfree(bp);
930 	bp->b_flags |= B_BUSY | B_ASYNC;
931 
932 	splx(s);
933 	/*
934 	 * default (old) behavior, writing out only one block
935 	 */
936 	nwritten = bp->b_bufsize;
937 	(void) VOP_BWRITE(bp);
938 	return nwritten;
939 }
940 
941 
942 /*
943  * Find a buffer header which is available for use.
944  */
945 static struct buf *
946 getnewbuf(struct vnode *vp, daddr_t blkno,
947 	int slpflag, int slptimeo, int size, int maxsize)
948 {
949 	struct buf *bp, *bp1;
950 	int nbyteswritten = 0;
951 	vm_offset_t addr;
952 	static int writerecursion = 0;
953 
954 start:
955 	if (bufspace >= maxbufspace)
956 		goto trytofreespace;
957 
958 	/* can we constitute a new buffer? */
959 	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
960 #if !defined(MAX_PERF)
961 		if (bp->b_qindex != QUEUE_EMPTY)
962 			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
963 			    bp->b_qindex);
964 #endif
965 		bp->b_flags |= B_BUSY;
966 		bremfree(bp);
967 		goto fillbuf;
968 	}
969 trytofreespace:
970 	/*
971 	 * We keep the file I/O from hogging metadata I/O
972 	 * This is desirable because file data is cached in the
973 	 * VM/Buffer cache even if a buffer is freed.
974 	 */
975 	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
976 #if !defined(MAX_PERF)
977 		if (bp->b_qindex != QUEUE_AGE)
978 			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
979 			    bp->b_qindex);
980 #endif
981 	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
982 #if !defined(MAX_PERF)
983 		if (bp->b_qindex != QUEUE_LRU)
984 			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
985 			    bp->b_qindex);
986 #endif
987 	}
988 	if (!bp) {
989 		/* wait for a free buffer of any kind */
990 		needsbuffer |= VFS_BIO_NEED_ANY;
991 		do
992 			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
993 			    slptimeo);
994 		while (needsbuffer & VFS_BIO_NEED_ANY);
995 		return (0);
996 	}
997 
998 #if defined(DIAGNOSTIC)
999 	if (bp->b_flags & B_BUSY) {
1000 		panic("getnewbuf: busy buffer on free list\n");
1001 	}
1002 #endif
1003 
1004 	/*
1005 	 * We are fairly aggressive about freeing VMIO buffers, but since
1006 	 * the buffering is intact without buffer headers, there is not
1007 	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
1008 	 */
1009 	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1010 		if ((bp->b_flags & B_VMIO) == 0 ||
1011 			(vmiospace < maxvmiobufspace)) {
1012 			--bp->b_usecount;
1013 			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1014 			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1015 				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1016 				goto start;
1017 			}
1018 			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1019 		}
1020 	}
1021 
1022 
1023 	/* if we are a delayed write, convert to an async write */
1024 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1025 
1026 		/*
1027 		 * If our delayed write is likely to be used soon, then
1028 		 * recycle back onto the LRU queue.
1029 		 */
1030 		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1031 			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1032 
1033 			if (bp->b_usecount > 0) {
1034 				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1035 
1036 					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1037 
1038 					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1039 						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1040 						bp->b_usecount--;
1041 						goto start;
1042 					}
1043 					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1044 				}
1045 			}
1046 		}
1047 
1048 		/*
1049 		 * Certain layered filesystems can recursively re-enter the vfs_bio
1050 		 * code, due to delayed writes.  This helps keep the system from
1051 		 * deadlocking.
1052 		 */
1053 		if (writerecursion > 0) {
1054 			if (writerecursion > 5) {
1055 				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1056 				while (bp) {
1057 					if ((bp->b_flags & B_DELWRI) == 0)
1058 						break;
1059 					bp = TAILQ_NEXT(bp, b_freelist);
1060 				}
1061 				if (bp == NULL) {
1062 					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1063 					while (bp) {
1064 						if ((bp->b_flags & B_DELWRI) == 0)
1065 							break;
1066 						bp = TAILQ_NEXT(bp, b_freelist);
1067 					}
1068 				}
1069 				if (bp == NULL)
1070 					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1071 			} else {
1072 				bremfree(bp);
1073 				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1074 				nbyteswritten += bp->b_bufsize;
1075 				++writerecursion;
1076 				VOP_BWRITE(bp);
1077 				--writerecursion;
1078 				if (!slpflag && !slptimeo) {
1079 					return (0);
1080 				}
1081 				goto start;
1082 			}
1083 		} else {
1084 			++writerecursion;
1085 			nbyteswritten += vfs_bio_awrite(bp);
1086 			--writerecursion;
1087 			if (!slpflag && !slptimeo) {
1088 				return (0);
1089 			}
1090 			goto start;
1091 		}
1092 	}
1093 
1094 	if (bp->b_flags & B_WANTED) {
1095 		bp->b_flags &= ~B_WANTED;
1096 		wakeup(bp);
1097 	}
1098 	bremfree(bp);
1099 	bp->b_flags |= B_BUSY;
1100 
1101 	if (bp->b_flags & B_VMIO) {
1102 		bp->b_flags &= ~B_ASYNC;
1103 		vfs_vmio_release(bp);
1104 	}
1105 
1106 	if (bp->b_vp)
1107 		brelvp(bp);
1108 
1109 fillbuf:
1110 
1111 	/* we are not free, nor do we contain interesting data */
1112 	if (bp->b_rcred != NOCRED) {
1113 		crfree(bp->b_rcred);
1114 		bp->b_rcred = NOCRED;
1115 	}
1116 	if (bp->b_wcred != NOCRED) {
1117 		crfree(bp->b_wcred);
1118 		bp->b_wcred = NOCRED;
1119 	}
1120 	if (LIST_FIRST(&bp->b_dep) != NULL &&
1121 	    bioops.io_deallocate)
1122 		(*bioops.io_deallocate)(bp);
1123 
1124 	LIST_REMOVE(bp, b_hash);
1125 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1126 	if (bp->b_bufsize) {
1127 		allocbuf(bp, 0);
1128 	}
1129 	bp->b_flags = B_BUSY;
1130 	bp->b_dev = NODEV;
1131 	bp->b_vp = NULL;
1132 	bp->b_blkno = bp->b_lblkno = 0;
1133 	bp->b_offset = NOOFFSET;
1134 	bp->b_iodone = 0;
1135 	bp->b_error = 0;
1136 	bp->b_resid = 0;
1137 	bp->b_bcount = 0;
1138 	bp->b_npages = 0;
1139 	bp->b_dirtyoff = bp->b_dirtyend = 0;
1140 	bp->b_validoff = bp->b_validend = 0;
1141 	bp->b_usecount = 5;
1142 	/* Here, not kern_physio.c, is where this should be done */
1143 	LIST_INIT(&bp->b_dep);
1144 
1145 	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1146 
1147 	/*
1148 	 * we assume that buffer_map is not at address 0
1149 	 */
1150 	addr = 0;
1151 	if (maxsize != bp->b_kvasize) {
1152 		bfreekva(bp);
1153 
1154 findkvaspace:
1155 		/*
1156 		 * See if we have buffer kva space
1157 		 */
1158 		if (vm_map_findspace(buffer_map,
1159 			vm_map_min(buffer_map), maxsize, &addr)) {
1160 			if (kvafreespace > 0) {
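				/*
				 * Reclaim kva held by empty buffer headers,
				 * then retry the search for buffer kva space.
				 */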
1161 				int totfree = 0, freed;
1162 				do {
1163 					freed = 0;
1164 					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1165 						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1166 						if (bp1->b_kvasize != 0) {
1167 							totfree += bp1->b_kvasize;
1168 							freed = bp1->b_kvasize;
1169 							bremfree(bp1);
1170 							bfreekva(bp1);
1171 							brelse(bp1);
1172 							break;
1173 						}
1174 					}
1175 				} while (freed);
1176 				/*
1177 				 * if we found free space, then retry with the same buffer.
1178 				 */
1179 				if (totfree)
1180 					goto findkvaspace;
1181 			}
1182 			bp->b_flags |= B_INVAL;
1183 			brelse(bp);
1184 			goto trytofreespace;
1185 		}
1186 	}
1187 
1188 	/*
1189 	 * See if we have exceeded our buffer space allocation
1190 	 */
1191 	if (bufspace >= (maxbufspace + nbyteswritten)) {
1192 		bp->b_flags |= B_INVAL;
1193 		brelse(bp);
1194 		goto trytofreespace;
1195 	}
1196 
1197 	/*
1198 	 * create a map entry for the buffer -- in essence
1199 	 * reserving the kva space.
1200 	 */
1201 	if (addr) {
1202 		vm_map_insert(buffer_map, NULL, 0,
1203 			addr, addr + maxsize,
1204 			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1205 
1206 		bp->b_kvabase = (caddr_t) addr;
1207 		bp->b_kvasize = maxsize;
1208 	}
1209 	bp->b_data = bp->b_kvabase;
1210 
1211 	return (bp);
1212 }
1213 
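/*
 * Wait for the free buffer count to recover, flushing dirty buffers
 * to help it along.
 */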
1214 static void
1215 waitfreebuffers(int slpflag, int slptimeo) {
1216 	while (numfreebuffers < hifreebuffers) {
1217 		flushdirtybuffers(slpflag, slptimeo);
1218 		if (numfreebuffers < hifreebuffers)
1219 			break;
1220 		needsbuffer |= VFS_BIO_NEED_FREE;
1221 		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1222 			break;
1223 	}
1224 }
1225 
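/*
 * Flush dirty buffers from the AGE and LRU queues until the dirty
 * buffer count drops to lodirtybuffers.  Only one process flushes at
 * a time; late arrivals wait for it (or return if they are already
 * the flusher).
 */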
1226 static void
1227 flushdirtybuffers(int slpflag, int slptimeo) {
1228 	int s;
1229 	static pid_t flushing = 0;
1230 
1231 	s = splbio();
1232 
1233 	if (flushing) {
1234 		if (flushing == curproc->p_pid) {
1235 			splx(s);
1236 			return;
1237 		}
1238 		while (flushing) {
1239 			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1240 				splx(s);
1241 				return;
1242 			}
1243 		}
1244 	}
1245 	flushing = curproc->p_pid;
1246 
1247 	while (numdirtybuffers > lodirtybuffers) {
1248 		struct buf *bp;
1249 		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1250 		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1251 		if (bp == NULL)
1252 			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1253 
1254 		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1255 			bp = TAILQ_NEXT(bp, b_freelist);
1256 		}
1257 
1258 		if (bp) {
1259 			vfs_bio_awrite(bp);
1260 			continue;
1261 		}
1262 		break;
1263 	}
1264 
1265 	flushing = 0;
1266 	wakeup(&flushing);
1267 	splx(s);
1268 }
1269 
1270 /*
1271  * Check to see if a block is currently memory resident.
1272  */
1273 struct buf *
1274 incore(struct vnode * vp, daddr_t blkno)
1275 {
1276 	struct buf *bp;
1277 
1278 	int s = splbio();
1279 	bp = gbincore(vp, blkno);
1280 	splx(s);
1281 	return (bp);
1282 }
1283 
1284 /*
1285  * Returns true if no I/O is needed to access the
1286  * associated VM object.  This is like incore except
1287  * it also hunts around in the VM system for the data.
1288  */
1289 
1290 int
1291 inmem(struct vnode * vp, daddr_t blkno)
1292 {
1293 	vm_object_t obj;
1294 	vm_offset_t toff, tinc;
1295 	vm_page_t m;
1296 	vm_ooffset_t off;
1297 
1298 	if (incore(vp, blkno))
1299 		return 1;
1300 	if (vp->v_mount == NULL)
1301 		return 0;
1302 	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1303 		return 0;
1304 
1305 	obj = vp->v_object;
1306 	tinc = PAGE_SIZE;
1307 	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1308 		tinc = vp->v_mount->mnt_stat.f_iosize;
1309 	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1310 
1311 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1312 
1313 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1314 		if (!m)
1315 			return 0;
1316 		if (vm_page_is_valid(m,
1317 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1318 			return 0;
1319 	}
1320 	return 1;
1321 }
1322 
1323 /*
1324  * now we set the dirty range for the buffer --
1325  * for NFS -- if the file is mapped and pages have
1326  * been written to, let it know.  We want the
1327  * entire range of the buffer to be marked dirty if
1328  * any of the pages have been written to for consistency
1329  * with the b_validoff, b_validend set in the nfs write
1330  * code, and used by the nfs read code.
1331  */
1332 static void
1333 vfs_setdirty(struct buf *bp) {
1334 	int i;
1335 	vm_object_t object;
1336 	vm_offset_t boffset, offset;
1337 	/*
1338 	 * We qualify the scan for modified pages on whether the
1339 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1340 	 * is not cleared simply by protecting pages off.
1341 	 */
1342 	if ((bp->b_flags & B_VMIO) &&
1343 		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1344 		/*
1345 		 * test the pages to see if they have been modified directly
1346 		 * by users through the VM system.
1347 		 */
1348 		for (i = 0; i < bp->b_npages; i++) {
1349 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1350 			vm_page_test_dirty(bp->b_pages[i]);
1351 		}
1352 
1353 		/*
1354 		 * scan forwards for the first page modified
1355 		 */
1356 		for (i = 0; i < bp->b_npages; i++) {
1357 			if (bp->b_pages[i]->dirty) {
1358 				break;
1359 			}
1360 		}
1361 		boffset = (i << PAGE_SHIFT);
1362 		if (boffset < bp->b_dirtyoff) {
1363 			bp->b_dirtyoff = boffset;
1364 		}
1365 
1366 		/*
1367 		 * scan backwards for the last page modified
1368 		 */
1369 		for (i = bp->b_npages - 1; i >= 0; --i) {
1370 			if (bp->b_pages[i]->dirty) {
1371 				break;
1372 			}
1373 		}
1374 		boffset = (i + 1);
1375 		offset = boffset + bp->b_pages[0]->pindex;
1376 		if (offset >= object->size)
1377 			boffset = object->size - bp->b_pages[0]->pindex;
1378 		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1379 			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1380 	}
1381 }
1382 
1383 /*
1384  * Get a block given a specified block and offset into a file/device.
1385  */
1386 struct buf *
1387 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1388 {
1389 	struct buf *bp;
1390 	int i, s;
1391 	struct bufhashhdr *bh;
1392 	int maxsize;
1393 	int generation;
1394 	int checksize;
1395 
1396 	if (vp->v_mount) {
1397 		maxsize = vp->v_mount->mnt_stat.f_iosize;
1398 		/*
1399 		 * This happens on mount points.
1400 		 */
1401 		if (maxsize < size)
1402 			maxsize = size;
1403 	} else {
1404 		maxsize = size;
1405 	}
1406 
1407 #if !defined(MAX_PERF)
1408 	if (size > MAXBSIZE)
1409 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1410 #endif
1411 
1412 	s = splbio();
1413 loop:
1414 	if (numfreebuffers < lofreebuffers) {
1415 		waitfreebuffers(slpflag, slptimeo);
1416 	}
1417 
1418 	if ((bp = gbincore(vp, blkno))) {
1419 loop1:
1420 		if (bp->b_flags & B_BUSY) {
1421 
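			/*
			 * The buffer is busy; mark it wanted and sleep until
			 * it is released, then retry the lookup.
			 */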
1422 			bp->b_flags |= B_WANTED;
1423 			if (bp->b_usecount < BUF_MAXUSE)
1424 				++bp->b_usecount;
1425 
1426 			if (!tsleep(bp,
1427 				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1428 				goto loop;
1429 			}
1430 
1431 			splx(s);
1432 			return (struct buf *) NULL;
1433 		}
1434 		bp->b_flags |= B_BUSY | B_CACHE;
1435 		bremfree(bp);
1436 
1437 		/*
1438 		 * check for size inconsistencies (note that they shouldn't
1439 		 * happen but do when filesystems don't handle the size changes
1440 		 * correctly.) We are conservative on metadata and don't just
1441 		 * extend the buffer but write (if needed) and re-constitute it.
1442 		 */
1443 
1444 		if (bp->b_bcount != size) {
1445 			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1446 				allocbuf(bp, size);
1447 			} else {
1448 				if (bp->b_flags & B_DELWRI) {
1449 					bp->b_flags |= B_NOCACHE;
1450 					VOP_BWRITE(bp);
1451 				} else {
1452 					if ((bp->b_flags & B_VMIO) &&
1453 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
1454 						bp->b_flags |= B_RELBUF;
1455 						brelse(bp);
1456 					} else {
1457 						bp->b_flags |= B_NOCACHE;
1458 						VOP_BWRITE(bp);
1459 					}
1460 				}
1461 				goto loop;
1462 			}
1463 		}
1464 
1465 #ifdef DIAGNOSTIC
1466 		if (bp->b_offset == NOOFFSET)
1467 			panic("getblk: no buffer offset");
1468 #endif
1469 
1470 		/*
1471 		 * Check that the constituted buffer really deserves to have the
1472 		 * B_CACHE bit set.  B_VMIO type buffers might not
1473 		 * contain fully valid pages.  Normal (old-style) buffers
1474 		 * should be fully valid.
1475 		 */
1476 		if (bp->b_flags & B_VMIO) {
1477 			checksize = bp->b_bufsize;
1478 			for (i = 0; i < bp->b_npages; i++) {
1479 				int resid;
1480 				int poffset;
1481 				poffset = bp->b_offset & PAGE_MASK;
1482 				resid = (checksize > (PAGE_SIZE - poffset)) ?
1483 					(PAGE_SIZE - poffset) : checksize;
1484 				if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1485 					bp->b_flags &= ~(B_CACHE | B_DONE);
1486 					break;
1487 				}
1488 				checksize -= resid;
1489 			}
1490 		}
1491 
1492 		if (bp->b_usecount < BUF_MAXUSE)
1493 			++bp->b_usecount;
1494 		splx(s);
1495 		return (bp);
1496 	} else {
1497 		vm_object_t obj;
1498 
1499 		if ((bp = getnewbuf(vp, blkno,
1500 			slpflag, slptimeo, size, maxsize)) == 0) {
1501 			if (slpflag || slptimeo) {
1502 				splx(s);
1503 				return NULL;
1504 			}
1505 			goto loop;
1506 		}
1507 
1508 		/*
1509 		 * This code is used to make sure that a buffer is not
1510 		 * created while the getnewbuf routine is blocked.
1511 		 * Normally the vnode is locked so this isn't a problem.
1512 		 * VBLK type I/O requests, however, don't lock the vnode.
1513 		 */
1514 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
1515 			bp->b_flags |= B_INVAL;
1516 			brelse(bp);
1517 			goto loop;
1518 		}
1519 
1520 		/*
1521 		 * Insert the buffer into the hash, so that it can
1522 		 * be found by incore.
1523 		 */
1524 		bp->b_blkno = bp->b_lblkno = blkno;
1525 
1526 		if (vp->v_type != VBLK)
1527 			bp->b_offset = (off_t) blkno * maxsize;
1528 		else
1529 			bp->b_offset = (off_t) blkno * DEV_BSIZE;
1530 
1531 		bgetvp(vp, bp);
1532 		LIST_REMOVE(bp, b_hash);
1533 		bh = BUFHASH(vp, blkno);
1534 		LIST_INSERT_HEAD(bh, bp, b_hash);
1535 
1536 		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1537 			bp->b_flags |= (B_VMIO | B_CACHE);
1538 #if defined(VFS_BIO_DEBUG)
1539 			if (vp->v_type != VREG && vp->v_type != VBLK)
1540 				printf("getblk: vmioing file type %d???\n", vp->v_type);
1541 #endif
1542 		} else {
1543 			bp->b_flags &= ~B_VMIO;
1544 		}
1545 
1546 		allocbuf(bp, size);
1547 
1548 		splx(s);
1549 		return (bp);
1550 	}
1551 }
1552 
1553 /*
1554  * Get an empty, disassociated buffer of given size.
1555  */
1556 struct buf *
1557 geteblk(int size)
1558 {
1559 	struct buf *bp;
1560 	int s;
1561 
1562 	s = splbio();
1563 	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1564 	splx(s);
1565 	allocbuf(bp, size);
1566 	bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1567 	return (bp);
1568 }
1569 
1570 
1571 /*
1572  * This code constitutes the buffer memory from either anonymous system
1573  * memory (in the case of non-VMIO operations) or from an associated
1574  * VM object (in the case of VMIO operations).
1575  *
1576  * Note that this code is tricky, and has many complications to resolve
1577  * deadlock or inconsistent data situations.  Tread lightly!!!
1578  *
1579  * Modify the length of a buffer's underlying buffer storage without
1580  * destroying information (unless, of course the buffer is shrinking).
1581  */
1582 int
1583 allocbuf(struct buf * bp, int size)
1584 {
1585 
1586 	int s;
1587 	int newbsize, mbsize;
1588 	int i;
1589 
1590 #if !defined(MAX_PERF)
1591 	if (!(bp->b_flags & B_BUSY))
1592 		panic("allocbuf: buffer not busy");
1593 
1594 	if (bp->b_kvasize < size)
1595 		panic("allocbuf: buffer too small");
1596 #endif
1597 
1598 	if ((bp->b_flags & B_VMIO) == 0) {
1599 		caddr_t origbuf;
1600 		int origbufsize;
1601 		/*
1602 		 * Just get anonymous memory from the kernel
1603 		 */
1604 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1605 #if !defined(NO_B_MALLOC)
1606 		if (bp->b_flags & B_MALLOC)
1607 			newbsize = mbsize;
1608 		else
1609 #endif
1610 			newbsize = round_page(size);
1611 
1612 		if (newbsize < bp->b_bufsize) {
1613 #if !defined(NO_B_MALLOC)
1614 			/*
1615 			 * malloced buffers are not shrunk
1616 			 */
1617 			if (bp->b_flags & B_MALLOC) {
1618 				if (newbsize) {
1619 					bp->b_bcount = size;
1620 				} else {
1621 					free(bp->b_data, M_BIOBUF);
1622 					bufspace -= bp->b_bufsize;
1623 					bufmallocspace -= bp->b_bufsize;
1624 					bp->b_data = bp->b_kvabase;
1625 					bp->b_bufsize = 0;
1626 					bp->b_bcount = 0;
1627 					bp->b_flags &= ~B_MALLOC;
1628 				}
1629 				return 1;
1630 			}
1631 #endif
1632 			vm_hold_free_pages(
1633 			    bp,
1634 			    (vm_offset_t) bp->b_data + newbsize,
1635 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1636 		} else if (newbsize > bp->b_bufsize) {
1637 #if !defined(NO_B_MALLOC)
1638 			/*
1639 			 * We only use malloced memory on the first allocation,
1640 			 * and revert to page-allocated memory when the buffer grows.
1641 			 */
1642 			if ( (bufmallocspace < maxbufmallocspace) &&
1643 				(bp->b_bufsize == 0) &&
1644 				(mbsize <= PAGE_SIZE/2)) {
1645 
1646 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1647 				bp->b_bufsize = mbsize;
1648 				bp->b_bcount = size;
1649 				bp->b_flags |= B_MALLOC;
1650 				bufspace += mbsize;
1651 				bufmallocspace += mbsize;
1652 				return 1;
1653 			}
1654 #endif
1655 			origbuf = NULL;
1656 			origbufsize = 0;
1657 #if !defined(NO_B_MALLOC)
1658 			/*
1659 			 * If the buffer is growing on its other-than-first allocation,
1660 			 * then we revert to the page-allocation scheme.
1661 			 */
1662 			if (bp->b_flags & B_MALLOC) {
1663 				origbuf = bp->b_data;
1664 				origbufsize = bp->b_bufsize;
1665 				bp->b_data = bp->b_kvabase;
1666 				bufspace -= bp->b_bufsize;
1667 				bufmallocspace -= bp->b_bufsize;
1668 				bp->b_bufsize = 0;
1669 				bp->b_flags &= ~B_MALLOC;
1670 				newbsize = round_page(newbsize);
1671 			}
1672 #endif
1673 			vm_hold_load_pages(
1674 			    bp,
1675 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1676 			    (vm_offset_t) bp->b_data + newbsize);
1677 #if !defined(NO_B_MALLOC)
1678 			if (origbuf) {
1679 				bcopy(origbuf, bp->b_data, origbufsize);
1680 				free(origbuf, M_BIOBUF);
1681 			}
1682 #endif
1683 		}
1684 	} else {
1685 		vm_page_t m;
1686 		int desiredpages;
1687 
1688 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1689 		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1690 
1691 #if !defined(NO_B_MALLOC)
1692 		if (bp->b_flags & B_MALLOC)
1693 			panic("allocbuf: VMIO buffer can't be malloced");
1694 #endif
1695 
1696 		if (newbsize < bp->b_bufsize) {
1697 			if (desiredpages < bp->b_npages) {
1698 				for (i = desiredpages; i < bp->b_npages; i++) {
1699 					/*
1700 					 * the page is not freed here -- it
1701 					 * is the responsibility of vnode_pager_setsize
1702 					 */
1703 					m = bp->b_pages[i];
1704 #if defined(DIAGNOSTIC)
1705 					if (m == bogus_page)
1706 						panic("allocbuf: bogus page found");
1707 #endif
1708 					vm_page_sleep(m, "biodep", &m->busy);
1709 
1710 					bp->b_pages[i] = NULL;
1711 					vm_page_unwire(m);
1712 				}
1713 				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1714 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1715 				bp->b_npages = desiredpages;
1716 			}
1717 		} else if (newbsize > bp->b_bufsize) {
1718 			vm_object_t obj;
1719 			vm_offset_t tinc, toff;
1720 			vm_ooffset_t off;
1721 			vm_pindex_t objoff;
1722 			int pageindex, curbpnpages;
1723 			struct vnode *vp;
1724 			int bsize;
1725 			int orig_validoff = bp->b_validoff;
1726 			int orig_validend = bp->b_validend;
1727 
1728 			vp = bp->b_vp;
1729 
1730 			if (vp->v_type == VBLK)
1731 				bsize = DEV_BSIZE;
1732 			else
1733 				bsize = vp->v_mount->mnt_stat.f_iosize;
1734 
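			/*
			 * Grow the buffer: look up or allocate the VM pages
			 * backing the new range, wire them, and track how
			 * much of the result is already valid (B_CACHE).
			 */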
1735 			if (bp->b_npages < desiredpages) {
1736 				obj = vp->v_object;
1737 				tinc = PAGE_SIZE;
1738 				if (tinc > bsize)
1739 					tinc = bsize;
1740 
1741 				off = bp->b_offset;
1742 #ifdef DIAGNOSTIC
1743 				if (bp->b_offset == NOOFFSET)
1744 					panic("allocbuf: no buffer offset");
1745 #endif
1746 
1747 				curbpnpages = bp->b_npages;
1748 		doretry:
1749 				bp->b_validoff = orig_validoff;
1750 				bp->b_validend = orig_validend;
1751 				bp->b_flags |= B_CACHE;
1752 				for (toff = 0; toff < newbsize; toff += tinc) {
1753 					int bytesinpage;
1754 
1755 					pageindex = toff >> PAGE_SHIFT;
1756 					objoff = OFF_TO_IDX(off + toff);
1757 					if (pageindex < curbpnpages) {
1758 
1759 						m = bp->b_pages[pageindex];
1760 #ifdef VFS_BIO_DIAG
1761 						if (m->pindex != objoff)
1762 							panic("allocbuf: page changed offset??!!!?");
1763 #endif
1764 						bytesinpage = tinc;
1765 						if (tinc > (newbsize - toff))
1766 							bytesinpage = newbsize - toff;
1767 						if (bp->b_flags & B_CACHE)
1768 							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1769 						continue;
1770 					}
1771 					m = vm_page_lookup(obj, objoff);
1772 					if (!m) {
1773 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1774 						if (!m) {
1775 							VM_WAIT;
1776 							vm_pageout_deficit += (desiredpages - bp->b_npages);
1777 							goto doretry;
1778 						}
1779 
1780 						vm_page_wire(m);
1781 						vm_page_flag_clear(m, PG_BUSY);
1782 						bp->b_flags &= ~B_CACHE;
1783 
1784 					} else if (m->flags & PG_BUSY) {
1785 						s = splvm();
1786 						if (m->flags & PG_BUSY) {
1787 							vm_page_flag_set(m, PG_WANTED);
1788 							tsleep(m, PVM, "pgtblk", 0);
1789 						}
1790 						splx(s);
1791 						goto doretry;
1792 					} else {
1793 						if ((curproc != pageproc) &&
1794 							((m->queue - m->pc) == PQ_CACHE) &&
1795 						    ((cnt.v_free_count + cnt.v_cache_count) <
1796 								(cnt.v_free_min + cnt.v_cache_min))) {
1797 							pagedaemon_wakeup();
1798 						}
1799 						bytesinpage = tinc;
1800 						if (tinc > (newbsize - toff))
1801 							bytesinpage = newbsize - toff;
1802 						if (bp->b_flags & B_CACHE)
1803 							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1804 						vm_page_flag_clear(m, PG_ZERO);
1805 						vm_page_wire(m);
1806 					}
1807 					bp->b_pages[pageindex] = m;
1808 					curbpnpages = pageindex + 1;
1809 				}
1810 				if (vp->v_tag == VT_NFS &&
1811 				    vp->v_type != VBLK) {
1812 					if (bp->b_dirtyend > 0) {
1813 						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1814 						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1815 					}
1816 					if (bp->b_validend == 0)
1817 						bp->b_flags &= ~B_CACHE;
1818 				}
1819 				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1820 				bp->b_npages = curbpnpages;
1821 				pmap_qenter((vm_offset_t) bp->b_data,
1822 					bp->b_pages, bp->b_npages);
1823 				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1824 			}
1825 		}
1826 	}
1827 	if (bp->b_flags & B_VMIO)
1828 		vmiospace += (newbsize - bp->b_bufsize);
1829 	bufspace += (newbsize - bp->b_bufsize);
1830 	bp->b_bufsize = newbsize;
1831 	bp->b_bcount = size;
1832 	return 1;
1833 }
1834 
1835 /*
1836  * Wait for buffer I/O completion, returning error status.
1837  */
1838 int
1839 biowait(register struct buf * bp)
1840 {
1841 	int s;
1842 
1843 	s = splbio();
1844 	while ((bp->b_flags & B_DONE) == 0)
1845 #if defined(NO_SCHEDULE_MODS)
1846 		tsleep(bp, PRIBIO, "biowait", 0);
1847 #else
1848 		if (bp->b_flags & B_READ)
1849 			tsleep(bp, PRIBIO, "biord", 0);
1850 		else
1851 			tsleep(bp, PRIBIO, "biowr", 0);
1852 #endif
1853 	splx(s);
1854 	if (bp->b_flags & B_EINTR) {
1855 		bp->b_flags &= ~B_EINTR;
1856 		return (EINTR);
1857 	}
1858 	if (bp->b_flags & B_ERROR) {
1859 		return (bp->b_error ? bp->b_error : EIO);
1860 	} else {
1861 		return (0);
1862 	}
1863 }
1864 
1865 /*
1866  * Finish I/O on a buffer, calling an optional function.
1867  * This is usually called from interrupt level, so process blocking
1868  * is not *a good idea*.
1869  */
1870 void
1871 biodone(register struct buf * bp)
1872 {
1873 	int s;
1874 
1875 	s = splbio();
1876 
1877 #if !defined(MAX_PERF)
1878 	if (!(bp->b_flags & B_BUSY))
1879 		panic("biodone: buffer not busy");
1880 #endif
1881 
1882 	if (bp->b_flags & B_DONE) {
1883 		splx(s);
1884 #if !defined(MAX_PERF)
1885 		printf("biodone: buffer already done\n");
1886 #endif
1887 		return;
1888 	}
1889 	bp->b_flags |= B_DONE;
1890 
1891 	if (bp->b_flags & B_FREEBUF) {
1892 		brelse(bp);
1893 		splx(s);
1894 		return;
1895 	}
1896 
1897 	if ((bp->b_flags & B_READ) == 0) {
1898 		vwakeup(bp);
1899 	}
1900 
1901 #ifdef BOUNCE_BUFFERS
1902 	if (bp->b_flags & B_BOUNCE) {
1903 		vm_bounce_free(bp);
1904 	}
1905 #endif
1906 
1907 	/* call optional completion function if requested */
1908 	if (bp->b_flags & B_CALL) {
1909 		bp->b_flags &= ~B_CALL;
1910 		(*bp->b_iodone) (bp);
1911 		splx(s);
1912 		return;
1913 	}
1914 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1915 		(*bioops.io_complete)(bp);
1916 
1917 	if (bp->b_flags & B_VMIO) {
1918 		int i, resid;
1919 		vm_ooffset_t foff;
1920 		vm_page_t m;
1921 		vm_object_t obj;
1922 		int iosize;
1923 		struct vnode *vp = bp->b_vp;
1924 
1925 		obj = vp->v_object;
1926 
1927 #if defined(VFS_BIO_DEBUG)
1928 		if (vp->v_usecount == 0) {
1929 			panic("biodone: zero vnode ref count");
1930 		}
1931 
1932 		if (vp->v_object == NULL) {
1933 			panic("biodone: missing VM object");
1934 		}
1935 
1936 		if ((vp->v_flag & VOBJBUF) == 0) {
1937 			panic("biodone: vnode is not setup for merged cache");
1938 		}
1939 #endif
1940 
1941 		foff = bp->b_offset;
1942 #ifdef DIAGNOSTIC
1943 		if (bp->b_offset == NOOFFSET)
1944 			panic("biodone: no buffer offset");
1945 #endif
1946 
1947 #if !defined(MAX_PERF)
1948 		if (!obj) {
1949 			panic("biodone: no object");
1950 		}
1951 #endif
1952 #if defined(VFS_BIO_DEBUG)
1953 		if (obj->paging_in_progress < bp->b_npages) {
1954 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1955 			    obj->paging_in_progress, bp->b_npages);
1956 		}
1957 #endif
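		/*
		 * Walk the pages backing the buffer: restore any bogus
		 * pages, mark the covered ranges valid for reads, and
		 * finish the per-page I/O accounting.
		 */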
1958 		iosize = bp->b_bufsize;
1959 		for (i = 0; i < bp->b_npages; i++) {
1960 			int bogusflag = 0;
1961 			m = bp->b_pages[i];
1962 			if (m == bogus_page) {
1963 				bogusflag = 1;
1964 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1965 				if (!m) {
1966 #if defined(VFS_BIO_DEBUG)
1967 					printf("biodone: page disappeared\n");
1968 #endif
1969 					vm_object_pip_subtract(obj, 1);
1970 					continue;
1971 				}
1972 				bp->b_pages[i] = m;
1973 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1974 			}
1975 #if defined(VFS_BIO_DEBUG)
1976 			if (OFF_TO_IDX(foff) != m->pindex) {
1977 				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1978 			}
1979 #endif
1980 			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1981 			if (resid > iosize)
1982 				resid = iosize;
1983 
1984 			/*
1985 			 * In the write case, the valid and clean bits are
1986 			 * already changed correctly, so we only need to do this
1987 			 * here in the read case.
1988 			 */
1989 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1990 				vfs_page_set_valid(bp, foff, i, m);
1991 			}
1992 			vm_page_flag_clear(m, PG_ZERO);
1993 
1994 			/*
1995 			 * when debugging new filesystems or buffer I/O methods, this
1996 			 * is the most common error that pops up.  if you see this, you
1997 			 * have not set the page busy flag correctly!!!
1998 			 */
1999 			if (m->busy == 0) {
2000 #if !defined(MAX_PERF)
2001 				printf("biodone: page is not busy, "
2002 				    "pindex: %d, foff: 0x(%x,%x), "
2003 				    "resid: %d, index: %d\n",
2004 				    (int) m->pindex, (int)(foff >> 32),
2005 				    (int)(foff & 0xffffffff), resid, i);
2007 				if (vp->v_type != VBLK)
2009 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2010 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
2011 					    (int) bp->b_lblkno,
2012 					    bp->b_flags, bp->b_npages);
2013 				else
2014 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2015 					    (int) bp->b_lblkno,
2016 					    bp->b_flags, bp->b_npages);
2017 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2018 				    m->valid, m->dirty, m->wire_count);
2019 #endif
2020 				panic("biodone: page is not busy");
2021 			}
2022 			vm_page_io_finish(m);
2023 			vm_object_pip_subtract(obj, 1);
2024 			foff += resid;
2025 			iosize -= resid;
2026 		}
2027 		if (obj &&
2028 			(obj->paging_in_progress == 0) &&
2029 		    (obj->flags & OBJ_PIPWNT)) {
2030 			vm_object_clear_flag(obj, OBJ_PIPWNT);
2031 			wakeup(obj);
2032 		}
2033 	}
2034 	/*
2035 	 * For asynchronous completions, release the buffer now.  brelse()
2036 	 * checks for B_WANTED and does the wakeup there if necessary, so
2037 	 * there is no need to do a wakeup here in the async case.
2038 	 */
2039 
2040 	if (bp->b_flags & B_ASYNC) {
2041 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2042 			brelse(bp);
2043 		else
2044 			bqrelse(bp);
2045 	} else {
2046 		bp->b_flags &= ~B_WANTED;
2047 		wakeup(bp);
2048 	}
2049 	splx(s);
2050 }
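/*
 * Usage sketch (illustrative, not compiled): a driver's interrupt
 * handler typically completes a transfer by recording any error and
 * the residual count on the buffer and then calling biodone().  The
 * names "xxintr" and "xx_softc" below are hypothetical placeholders,
 * not part of this file.
 */
#if 0
static void
xxintr(void *arg)
{
	struct xx_softc *sc = arg;
	struct buf *bp = sc->sc_bp;		/* buffer being completed */

	if (sc->sc_error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	bp->b_resid = sc->sc_resid;		/* bytes not transferred */
	biodone(bp);				/* wake waiters, release async bufs */
}
#endif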
2051 
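/*
 * Return the number of buffers currently on the locked buffer queue.
 */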
2052 static int
2053 count_lock_queue()
2054 {
2055 	int count;
2056 	struct buf *bp;
2057 
2058 	count = 0;
2059 	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
2060 	    bp != NULL;
2061 	    bp = TAILQ_NEXT(bp, b_freelist))
2062 		count++;
2063 	return (count);
2064 }
2065 
2066 #if 0	/* not with Kirk's code */
2067 static int vfs_update_interval = 30;
2068 
2069 static void
2070 vfs_update()
2071 {
2072 	while (1) {
2073 		tsleep(&vfs_update_wakeup, PUSER, "update",
2074 		    hz * vfs_update_interval);
2075 		vfs_update_wakeup = 0;
2076 		sync(curproc, NULL);
2077 	}
2078 }
2079 
2080 static int
2081 sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2082 {
2083 	int error = sysctl_handle_int(oidp,
2084 		oidp->oid_arg1, oidp->oid_arg2, req);
2085 	if (!error)
2086 		wakeup(&vfs_update_wakeup);
2087 	return error;
2088 }
2089 
2090 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2091 	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2092 
2093 #endif
2094 
2095 
2096 /*
2097  * This routine is called in lieu of biodone in the case of
2098  * incomplete I/O.  This keeps the busy status for the pages
2099  * consistent.
2100  */
2101 void
2102 vfs_unbusy_pages(struct buf * bp)
2103 {
2104 	int i;
2105 
2106 	if (bp->b_flags & B_VMIO) {
2107 		struct vnode *vp = bp->b_vp;
2108 		vm_object_t obj = vp->v_object;
2109 
2110 		for (i = 0; i < bp->b_npages; i++) {
2111 			vm_page_t m = bp->b_pages[i];
2112 
2113 			if (m == bogus_page) {
2114 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2115 #if !defined(MAX_PERF)
2116 				if (!m) {
2117 					panic("vfs_unbusy_pages: page missing\n");
2118 				}
2119 #endif
2120 				bp->b_pages[i] = m;
2121 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2122 			}
2123 			vm_object_pip_subtract(obj, 1);
2124 			vm_page_flag_clear(m, PG_ZERO);
2125 			vm_page_io_finish(m);
2126 		}
2127 		if (obj->paging_in_progress == 0 &&
2128 		    (obj->flags & OBJ_PIPWNT)) {
2129 			vm_object_clear_flag(obj, OBJ_PIPWNT);
2130 			wakeup(obj);
2131 		}
2132 	}
2133 }
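/*
 * Usage sketch (illustrative, not compiled): vfs_busy_pages() and
 * vfs_unbusy_pages() bracket a VMIO transfer.  If the I/O cannot be
 * issued after the pages have been busied, they are unbusied here
 * instead of waiting for biodone().  "xx_start" and the failure test
 * are hypothetical placeholders.
 */
#if 0
static int
xx_start(struct buf *bp)
{
	vfs_busy_pages(bp, 0);			/* busy pages for a read */
	if (device_has_gone_away(bp)) {		/* hypothetical failure check */
		vfs_unbusy_pages(bp);		/* undo busy/PIP accounting */
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
		biodone(bp);
		return (ENXIO);
	}
	/* ...otherwise hand bp to the device strategy routine... */
	return (0);
}
#endif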
2134 
2135 /*
2136  * Set NFS' b_validoff and b_validend fields from the valid bits
2137  * of a page.  If the consumer is not NFS, and the page is not
2138  * valid for the entire range, clear the B_CACHE flag to force
2139  * the consumer to re-read the page.
2140  */
2141 static void
2142 vfs_buf_set_valid(struct buf *bp,
2143 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2144 		  vm_page_t m)
2145 {
2146 	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2147 		vm_offset_t svalid, evalid;
2148 		int validbits = m->valid;
2149 
2150 		/*
2151 		 * This only bothers with the first valid range in the
2152 		 * page.
2153 		 */
2154 		svalid = off;
2155 		while (validbits && !(validbits & 1)) {
2156 			svalid += DEV_BSIZE;
2157 			validbits >>= 1;
2158 		}
2159 		evalid = svalid;
2160 		while (validbits & 1) {
2161 			evalid += DEV_BSIZE;
2162 			validbits >>= 1;
2163 		}
2164 		/*
2165 		 * Make sure this range is contiguous with the range
2166 		 * built up from previous pages.  If not, then we will
2167 		 * just use the range from the previous pages.
2168 		 */
2169 		if (svalid == bp->b_validend) {
2170 			bp->b_validoff = min(bp->b_validoff, svalid);
2171 			bp->b_validend = max(bp->b_validend, evalid);
2172 		}
2173 	} else if (!vm_page_is_valid(m,
2174 				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2175 				     size)) {
2176 		bp->b_flags &= ~B_CACHE;
2177 	}
2178 }
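/*
 * Worked example (illustrative): with DEV_BSIZE == 512 and m->valid ==
 * 0x3c (blocks 2-5 of the page valid), the two loops above yield
 * svalid == off + 1024 and evalid == off + 3072, i.e. the first
 * contiguous run of valid blocks expressed as a byte range.
 */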
2179 
2180 /*
2181  * Set the valid bits in a page, taking care of the b_validoff,
2182  * b_validend fields which NFS uses to optimise small reads.  Off is
2183  * the offset within the file and pageno is the page index within the buf.
2184  */
2185 static void
2186 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2187 {
2188 	struct vnode *vp = bp->b_vp;
2189 	vm_ooffset_t soff, eoff;
2190 
2191 	soff = off;
2192 	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2193 	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2194 		vm_ooffset_t sv, ev;
2195 		vm_page_set_invalid(m,
2196 		    (vm_offset_t) (soff & PAGE_MASK),
2197 		    (vm_offset_t) (eoff - soff));
2198 		off = off - pageno * PAGE_SIZE;
2199 		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2200 		ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2201 		soff = qmax(sv, soff);
2202 		eoff = qmin(ev, eoff);
2203 	}
2204 	if (eoff > soff)
2205 		vm_page_set_validclean(m,
2206 	       (vm_offset_t) (soff & PAGE_MASK),
2207 	       (vm_offset_t) (eoff - soff));
2208 }
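/*
 * Worked example (illustrative): with PAGE_SIZE == 4096, DEV_BSIZE ==
 * 512, pageno == 0, off == 8192, b_validoff == 100, b_validend == 3000
 * and b_bufsize >= PAGE_SIZE, the rounding above gives sv == 8704 and
 * ev == 11264, so vm_page_set_validclean() is applied to bytes
 * [512, 3072) of the page only.
 */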
2209 
2210 /*
2211  * This routine is called before a device strategy routine.
2212  * It is used to tell the VM system that paging I/O is in
2213  * progress, and treat the pages associated with the buffer
2214  * almost as being PG_BUSY.  Also, the object's paging_in_progress
2215  * count is maintained so that the object doesn't become
2216  * inconsistent.
2217  */
2218 void
2219 vfs_busy_pages(struct buf * bp, int clear_modify)
2220 {
2221 	int i;
2222 
2223 	if (bp->b_flags & B_VMIO) {
2224 		struct vnode *vp = bp->b_vp;
2225 		vm_object_t obj = vp->v_object;
2226 		vm_ooffset_t foff;
2227 
2228 		foff = bp->b_offset;
2229 #ifdef DIAGNOSTIC
2230 		if (bp->b_offset == NOOFFSET)
2231 			panic("vfs_busy_pages: no buffer offset");
2232 #endif
2233 
2234 		vfs_setdirty(bp);
2235 
2236 retry:
2237 		for (i = 0; i < bp->b_npages; i++) {
2238 			vm_page_t m = bp->b_pages[i];
2239 			if (vm_page_sleep(m, "vbpage", NULL))
2240 				goto retry;
2241 		}
2242 
2243 		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2244 			vm_page_t m = bp->b_pages[i];
2245 
2246 			vm_page_flag_clear(m, PG_ZERO);
2247 			if ((bp->b_flags & B_CLUSTER) == 0) {
2248 				vm_object_pip_add(obj, 1);
2249 				vm_page_io_start(m);
2250 			}
2251 
2252 			vm_page_protect(m, VM_PROT_NONE);
2253 			if (clear_modify)
2254 				vfs_page_set_valid(bp, foff, i, m);
2255 			else if (bp->b_bcount >= PAGE_SIZE) {
2256 				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2257 					bp->b_pages[i] = bogus_page;
2258 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2259 				}
2260 			}
2261 		}
2262 	}
2263 }
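/*
 * Usage sketch (illustrative, not compiled): the clear_modify argument
 * selects write vs. read behaviour.  A write path marks the pages
 * valid and clean up front; a read path instead substitutes bogus_page
 * for pages that are already valid so the device cannot overwrite good
 * data.  "xx_io" is a hypothetical placeholder.
 */
#if 0
static void
xx_io(struct buf *bp, int iswrite)
{
	if (iswrite)
		vfs_busy_pages(bp, 1);	/* clear_modify: pages become clean */
	else
		vfs_busy_pages(bp, 0);	/* read: shield already-valid pages */
	/* ...hand bp to the device strategy routine... */
}
#endif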
2264 
2265 /*
2266  * Tell the VM system that the pages associated with this buffer
2267  * are clean.  This is used for delayed writes where the data is
2268  * going to reach the disk eventually without additional VM intervention.
2269  */
2270 void
2271 vfs_clean_pages(struct buf * bp)
2272 {
2273 	int i;
2274 
2275 	if (bp->b_flags & B_VMIO) {
2276 		struct vnode *vp = bp->b_vp;
2277 		vm_ooffset_t foff;
2278 		foff = bp->b_offset;
2279 
2280 #ifdef DIAGNOSTIC
2281 		if (bp->b_offset == NOOFFSET)
2282 			panic("vfs_clean_pages: no buffer offset");
2283 #endif
2284 
2285 		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2286 			vm_page_t m = bp->b_pages[i];
2287 			vfs_page_set_valid(bp, foff, i, m);
2288 		}
2289 	}
2290 }
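/*
 * Usage sketch (illustrative, not compiled): a minimal fragment of the
 * delayed-write idea.  The pages are marked clean at the VM level
 * because the data will reach the disk from the buffer itself later.
 */
#if 0
	bp->b_flags |= B_DELWRI;	/* write happens later */
	if (bp->b_flags & B_VMIO)
		vfs_clean_pages(bp);	/* VM need not page these out */
	bqrelse(bp);			/* requeue without starting I/O */
#endif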
2291 
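/*
 * Zero the invalid portions of a VMIO buffer, using the per-DEV_BSIZE
 * valid bits of the backing pages (and the PG_ZERO hint) to avoid
 * redundant bzero work; non-VMIO or malloc-backed buffers are simply
 * cleared with clrbuf().  For example, a one-page buffer with
 * b_bufsize == 2048 covers four DEV_BSIZE blocks, so the mask built
 * below is 0x0f and the page is only zeroed when its valid bits
 * differ from that mask and it is not already known to be zero.
 */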
2292 void
2293 vfs_bio_clrbuf(struct buf *bp) {
2294 	int i;
2295 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2296 		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2297 			int mask;
2298 			mask = 0;
2299 			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2300 				mask |= (1 << (i / DEV_BSIZE));
2301 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2302 			    (bp->b_pages[0]->valid != mask)) {
2303 				bzero(bp->b_data, bp->b_bufsize);
2304 			}
2305 			bp->b_pages[0]->valid = mask;
2306 			bp->b_resid = 0;
2307 			return;
2308 		}
2309 		for (i = 0; i < bp->b_npages; i++) {
2310 			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2311 				continue;
2312 			if (bp->b_pages[i]->valid == 0) {
2313 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2314 					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2315 				}
2316 			} else {
2317 				int j;
2318 				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2319 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2320 					    (bp->b_pages[i]->valid & (1 << j)) == 0)
2321 						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2322 				}
2323 			}
2324 			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
2325 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2326 		}
2327 		bp->b_resid = 0;
2328 	} else {
2329 		clrbuf(bp);
2330 	}
2331 }
2332 
2333 /*
2334  * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
2335  * a buffer's address space.  The pages are anonymous and are
2336  * not associated with a file object.
2337  */
2338 void
2339 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2340 {
2341 	vm_offset_t pg;
2342 	vm_page_t p;
2343 	int index;
2344 
2345 	to = round_page(to);
2346 	from = round_page(from);
2347 	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2348 
2349 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2350 
2351 tryagain:
2352 
2353 		p = vm_page_alloc(kernel_object,
2354 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2355 		    VM_ALLOC_NORMAL);
2356 		if (!p) {
2357 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2358 			VM_WAIT;
2359 			goto tryagain;
2360 		}
2361 		vm_page_wire(p);
2362 		p->valid = VM_PAGE_BITS_ALL;
2363 		vm_page_flag_clear(p, PG_ZERO);
2364 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2365 		bp->b_pages[index] = p;
2366 		vm_page_wakeup(p);
2367 	}
2368 	bp->b_npages = index;
2369 }
2370 
2371 void
2372 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2373 {
2374 	vm_offset_t pg;
2375 	vm_page_t p;
2376 	int index, newnpages;
2377 
2378 	from = round_page(from);
2379 	to = round_page(to);
2380 	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2381 
2382 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2383 		p = bp->b_pages[index];
2384 		if (p && (index < bp->b_npages)) {
2385 #if !defined(MAX_PERF)
2386 			if (p->busy) {
2387 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2388 					bp->b_blkno, bp->b_lblkno);
2389 			}
2390 #endif
2391 			bp->b_pages[index] = NULL;
2392 			pmap_kremove(pg);
2393 			vm_page_busy(p);
2394 			vm_page_unwire(p);
2395 			vm_page_free(p);
2396 		}
2397 	}
2398 	bp->b_npages = newnpages;
2399 }
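/*
 * Usage sketch (illustrative, not compiled): these two routines back,
 * or unback, a range of a buffer's kva with anonymous wired pages,
 * e.g. when a non-VMIO buffer changes size.  "newbsize" is a
 * hypothetical variable.
 */
#if 0
	if (newbsize > bp->b_bufsize)
		vm_hold_load_pages(bp,
		    (vm_offset_t)bp->b_data + bp->b_bufsize,
		    (vm_offset_t)bp->b_data + newbsize);
	else if (newbsize < bp->b_bufsize)
		vm_hold_free_pages(bp,
		    (vm_offset_t)bp->b_data + newbsize,
		    (vm_offset_t)bp->b_data + bp->b_bufsize);
#endif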
2400 
2401 
2402 #include "opt_ddb.h"
2403 #ifdef DDB
2404 #include <ddb/ddb.h>
2405 
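/*
 * From the in-kernel debugger this command is invoked as
 *	show buffer <address-of-struct-buf>
 * and dumps the buffer's flags, sizes, block numbers and backing pages.
 */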
2406 DB_SHOW_COMMAND(buffer, db_show_buffer)
2407 {
2408 	/* get args */
2409 	struct buf *bp = (struct buf *)addr;
2410 
2411 	if (!have_addr) {
2412 		db_printf("usage: show buffer <addr>\n");
2413 		return;
2414 	}
2415 
2416 	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2417 		  (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2418 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2419 		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2420 		  "b_blkno = %d, b_pblkno = %d\n",
2421 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2422 		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2423 	if (bp->b_npages) {
2424 		int i;
2425 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2426 		for (i = 0; i < bp->b_npages; i++) {
2427 			vm_page_t m;
2428 			m = bp->b_pages[i];
2429 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2430 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2431 			if ((i + 1) < bp->b_npages)
2432 				db_printf(",");
2433 		}
2434 		db_printf("\n");
2435 	}
2436 }
2437 #endif /* DDB */
2438