xref: /freebsd/sys/kern/vfs_bio.c (revision 380a989b3223d455375b4fae70fd0b9bdd43bafb)
1 /*
2  * Copyright (c) 1994,1997 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Absolutely no warranty of function or purpose is made by the author
12  *		John S. Dyson.
13  *
14  * $Id: vfs_bio.c,v 1.189 1998/12/22 18:57:30 dillon Exp $
15  */
16 
17 /*
18  * this file contains a new buffer I/O scheme implementing a coherent
19  * VM object and buffer cache scheme.  Pains have been taken to make
20  * sure that the performance degradation associated with schemes such
21  * as this is not realized.
22  *
23  * Author:  John S. Dyson
24  * Significant help during the development and debugging phases
25  * was provided by David Greenman, also of the FreeBSD core team.
26  *
27  * see man buf(9) for more info.
28  */
29 
30 #define VMIO
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysproto.h>
34 #include <sys/kernel.h>
35 #include <sys/sysctl.h>
36 #include <sys/proc.h>
37 #include <sys/vnode.h>
38 #include <sys/vmmeter.h>
39 #include <sys/lock.h>
40 #include <miscfs/specfs/specdev.h>
41 #include <vm/vm.h>
42 #include <vm/vm_param.h>
43 #include <vm/vm_prot.h>
44 #include <vm/vm_kern.h>
45 #include <vm/vm_pageout.h>
46 #include <vm/vm_page.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_extern.h>
49 #include <vm/vm_map.h>
50 #include <sys/buf.h>
51 #include <sys/mount.h>
52 #include <sys/malloc.h>
53 #include <sys/resourcevar.h>
54 
55 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
56 
57 struct	bio_ops bioops;		/* I/O operation notification */
58 
59 #if 0 	/* replaced by sched_sync */
60 static void vfs_update __P((void));
61 static struct	proc *updateproc;
62 static struct kproc_desc up_kp = {
63 	"update",
64 	vfs_update,
65 	&updateproc
66 };
67 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68 #endif
69 
70 struct buf *buf;		/* buffer header pool */
71 struct swqueue bswlist;
72 
73 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
74 		vm_offset_t to);
75 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
76 		vm_offset_t to);
77 static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
78 			      vm_offset_t off, vm_offset_t size,
79 			      vm_page_t m);
80 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
81 			       int pageno, vm_page_t m);
82 static void vfs_clean_pages(struct buf * bp);
83 static void vfs_setdirty(struct buf *bp);
84 static void vfs_vmio_release(struct buf *bp);
85 static void flushdirtybuffers(int slpflag, int slptimeo);
86 
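/* VFS_BIO_NEED_* flags recording which buffer shortages sleepers are waiting on */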
87 int needsbuffer;
88 
89 /*
90  * Internal update daemon, process 3
91  *	The variable vfs_update_wakeup allows for internal syncs.
92  */
93 int vfs_update_wakeup;
94 
95 
96 /*
97  * buffers base kva
98  */
99 
100 /*
101  * bogus page -- for I/O to/from partially complete buffers
102  * this is a temporary solution to the problem, but it is not
103  * really that bad.  it would be better to split the buffer
104  * for input in the case of buffers partially already in memory,
105  * but the code is intricate enough already.
106  */
107 vm_page_t bogus_page;
108 static vm_offset_t bogus_offset;
109 
110 static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
111 	bufmallocspace, maxbufmallocspace;
112 int numdirtybuffers;
113 static int lodirtybuffers, hidirtybuffers;
114 static int numfreebuffers, lofreebuffers, hifreebuffers;
115 static int kvafreespace;
116 
117 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
118 	&numdirtybuffers, 0, "");
119 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
120 	&lodirtybuffers, 0, "");
121 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
122 	&hidirtybuffers, 0, "");
123 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
124 	&numfreebuffers, 0, "");
125 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
126 	&lofreebuffers, 0, "");
127 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
128 	&hifreebuffers, 0, "");
129 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
130 	&maxbufspace, 0, "");
131 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
132 	&bufspace, 0, "");
133 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
134 	&maxvmiobufspace, 0, "");
135 SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
136 	&vmiospace, 0, "");
137 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
138 	&maxbufmallocspace, 0, "");
139 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
140 	&bufmallocspace, 0, "");
141 SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
142 	&kvafreespace, 0, "");
143 
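/*
 * Buffer hash chains (used by gbincore()/incore() lookups) and the
 * per-state free-list queues (QUEUE_EMPTY, QUEUE_AGE, QUEUE_LRU, QUEUE_LOCKED).
 */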
144 static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
145 struct bqueues bufqueues[BUFFER_QUEUES] = {0};
146 
147 extern int vm_swap_size;
148 
149 #define BUF_MAXUSE 24
150 
151 #define VFS_BIO_NEED_ANY 1
152 #define VFS_BIO_NEED_LOWLIMIT 2
153 #define VFS_BIO_NEED_FREE 4
154 
155 /*
156  * Initialize buffer headers and related structures.
157  */
158 void
159 bufinit()
160 {
161 	struct buf *bp;
162 	int i;
163 
164 	TAILQ_INIT(&bswlist);
165 	LIST_INIT(&invalhash);
166 
167 	/* first, make a null hash table */
168 	for (i = 0; i < BUFHSZ; i++)
169 		LIST_INIT(&bufhashtbl[i]);
170 
171 	/* next, make a null set of free lists */
172 	for (i = 0; i < BUFFER_QUEUES; i++)
173 		TAILQ_INIT(&bufqueues[i]);
174 
175 	/* finally, initialize each buffer header and stick on empty q */
176 	for (i = 0; i < nbuf; i++) {
177 		bp = &buf[i];
178 		bzero(bp, sizeof *bp);
179 		bp->b_flags = B_INVAL;	/* we're just an empty header */
180 		bp->b_dev = NODEV;
181 		bp->b_rcred = NOCRED;
182 		bp->b_wcred = NOCRED;
183 		bp->b_qindex = QUEUE_EMPTY;
184 		bp->b_xflags = 0;
185 		LIST_INIT(&bp->b_dep);
186 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
187 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
188 	}
189 /*
190  * maxbufspace is currently calculated to support all filesystem blocks
191  * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
192  * cache is still the same as it would be for 8K filesystems.  This
193  * keeps the size of the buffer cache "in check" for big block filesystems.
194  */
195 	maxbufspace = (nbuf + 8) * DFLTBSIZE;
196 /*
197  * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
198  */
199 	maxvmiobufspace = 2 * maxbufspace / 3;
200 /*
201  * Limit the amount of malloc memory since it is wired permanently into
202  * the kernel space.  Even though this is accounted for in the buffer
203  * allocation, we don't want the malloced region to grow uncontrolled.
204  * The malloc scheme improves memory utilization significantly on average
205  * (small) directories.
206  */
207 	maxbufmallocspace = maxbufspace / 20;
208 
209 /*
210  * Remove the probability of deadlock conditions by limiting the
211  * number of dirty buffers.
212  */
213 	hidirtybuffers = nbuf / 8 + 20;
214 	lodirtybuffers = nbuf / 16 + 10;
215 	numdirtybuffers = 0;
216 	lofreebuffers = nbuf / 18 + 5;
217 	hifreebuffers = 2 * lofreebuffers;
218 	numfreebuffers = nbuf;
219 	kvafreespace = 0;
220 
221 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
222 	bogus_page = vm_page_alloc(kernel_object,
223 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
224 			VM_ALLOC_NORMAL);
225 
226 }
227 
228 /*
229  * Free the kva allocation for a buffer
230  * Must be called only at splbio or higher,
231  *  as this is the only locking for buffer_map.
232  */
233 static void
234 bfreekva(struct buf * bp)
235 {
236 	if (bp->b_kvasize == 0)
237 		return;
238 
239 	vm_map_delete(buffer_map,
240 		(vm_offset_t) bp->b_kvabase,
241 		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
242 
243 	bp->b_kvasize = 0;
244 
245 }
246 
247 /*
248  * remove the buffer from the appropriate free list
249  */
250 void
251 bremfree(struct buf * bp)
252 {
253 	int s = splbio();
254 
255 	if (bp->b_qindex != QUEUE_NONE) {
256 		if (bp->b_qindex == QUEUE_EMPTY) {
257 			kvafreespace -= bp->b_kvasize;
258 		}
259 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
260 		bp->b_qindex = QUEUE_NONE;
261 	} else {
262 #if !defined(MAX_PERF)
263 		panic("bremfree: removing a buffer when not on a queue");
264 #endif
265 	}
266 	if ((bp->b_flags & B_INVAL) ||
267 		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
268 		--numfreebuffers;
269 	splx(s);
270 }
271 
272 
273 /*
274  * Get a buffer with the specified data.  Look in the cache first.
275  */
276 int
277 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
278     struct buf ** bpp)
279 {
280 	struct buf *bp;
281 
282 	bp = getblk(vp, blkno, size, 0, 0);
283 	*bpp = bp;
284 
285 	/* if not found in cache, do some I/O */
286 	if ((bp->b_flags & B_CACHE) == 0) {
287 		if (curproc != NULL)
288 			curproc->p_stats->p_ru.ru_inblock++;
289 		bp->b_flags |= B_READ;
290 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
291 		if (bp->b_rcred == NOCRED) {
292 			if (cred != NOCRED)
293 				crhold(cred);
294 			bp->b_rcred = cred;
295 		}
296 		vfs_busy_pages(bp, 0);
297 		VOP_STRATEGY(vp, bp);
298 		return (biowait(bp));
299 	}
300 	return (0);
301 }
302 
303 /*
304  * Operates like bread, but also starts asynchronous I/O on
305  * read-ahead blocks.
306  */
307 int
308 breadn(struct vnode * vp, daddr_t blkno, int size,
309     daddr_t * rablkno, int *rabsize,
310     int cnt, struct ucred * cred, struct buf ** bpp)
311 {
312 	struct buf *bp, *rabp;
313 	int i;
314 	int rv = 0, readwait = 0;
315 
316 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
317 
318 	/* if not found in cache, do some I/O */
319 	if ((bp->b_flags & B_CACHE) == 0) {
320 		if (curproc != NULL)
321 			curproc->p_stats->p_ru.ru_inblock++;
322 		bp->b_flags |= B_READ;
323 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
324 		if (bp->b_rcred == NOCRED) {
325 			if (cred != NOCRED)
326 				crhold(cred);
327 			bp->b_rcred = cred;
328 		}
329 		vfs_busy_pages(bp, 0);
330 		VOP_STRATEGY(vp, bp);
331 		++readwait;
332 	}
333 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
334 		if (inmem(vp, *rablkno))
335 			continue;
336 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
337 
338 		if ((rabp->b_flags & B_CACHE) == 0) {
339 			if (curproc != NULL)
340 				curproc->p_stats->p_ru.ru_inblock++;
341 			rabp->b_flags |= B_READ | B_ASYNC;
342 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
343 			if (rabp->b_rcred == NOCRED) {
344 				if (cred != NOCRED)
345 					crhold(cred);
346 				rabp->b_rcred = cred;
347 			}
348 			vfs_busy_pages(rabp, 0);
349 			VOP_STRATEGY(vp, rabp);
350 		} else {
351 			brelse(rabp);
352 		}
353 	}
354 
355 	if (readwait) {
356 		rv = biowait(bp);
357 	}
358 	return (rv);
359 }
360 
361 /*
362  * Write, release buffer on completion.  (Done by iodone
363  * if async.)
364  */
365 int
366 bwrite(struct buf * bp)
367 {
368 	int oldflags, s;
369 	struct vnode *vp;
370 	struct mount *mp;
371 
372 
373 	if (bp->b_flags & B_INVAL) {
374 		brelse(bp);
375 		return (0);
376 	}
377 
378 	oldflags = bp->b_flags;
379 
380 #if !defined(MAX_PERF)
381 	if ((bp->b_flags & B_BUSY) == 0)
382 		panic("bwrite: buffer is not busy???");
383 #endif
384 
385 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
386 	bp->b_flags |= B_WRITEINPROG;
387 
388 	s = splbio();
389 	if ((oldflags & B_DELWRI) == B_DELWRI) {
390 		--numdirtybuffers;
391 		reassignbuf(bp, bp->b_vp);
392 	}
393 
394 	bp->b_vp->v_numoutput++;
395 	vfs_busy_pages(bp, 1);
396 	if (curproc != NULL)
397 		curproc->p_stats->p_ru.ru_oublock++;
398 	splx(s);
399 	VOP_STRATEGY(bp->b_vp, bp);
400 
401 	/*
402 	 * Collect statistics on synchronous and asynchronous writes.
403 	 * Writes to block devices are charged to their associated
404 	 * filesystem (if any).
405 	 */
406 	if ((vp = bp->b_vp) != NULL) {
407 		if (vp->v_type == VBLK)
408 			mp = vp->v_specmountpoint;
409 		else
410 			mp = vp->v_mount;
411 		if (mp != NULL)
412 			if ((oldflags & B_ASYNC) == 0)
413 				mp->mnt_stat.f_syncwrites++;
414 			else
415 				mp->mnt_stat.f_asyncwrites++;
416 	}
417 
418 	if ((oldflags & B_ASYNC) == 0) {
419 		int rtval = biowait(bp);
420 		brelse(bp);
421 		return (rtval);
422 	}
423 	return (0);
424 }
425 
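/*
 * A buffer has become available for reuse: bump the free-buffer count and,
 * if anyone is sleeping in getnewbuf()/waitfreebuffers(), clear the need
 * flags that are now satisfied and wake them up.
 */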
426 void
427 vfs_bio_need_satisfy(void) {
428 	++numfreebuffers;
429 	if (!needsbuffer)
430 		return;
431 	if (numdirtybuffers < lodirtybuffers) {
432 		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
433 	} else {
434 		needsbuffer &= ~VFS_BIO_NEED_ANY;
435 	}
436 	if (numfreebuffers >= hifreebuffers) {
437 		needsbuffer &= ~VFS_BIO_NEED_FREE;
438 	}
439 	wakeup(&needsbuffer);
440 }
441 
442 /*
443  * Delayed write. (Buffer is marked dirty).
444  */
445 void
446 bdwrite(struct buf * bp)
447 {
448 	struct vnode *vp;
449 
450 #if !defined(MAX_PERF)
451 	if ((bp->b_flags & B_BUSY) == 0) {
452 		panic("bdwrite: buffer is not busy");
453 	}
454 #endif
455 
456 	if (bp->b_flags & B_INVAL) {
457 		brelse(bp);
458 		return;
459 	}
460 	bp->b_flags &= ~(B_READ|B_RELBUF);
461 	if ((bp->b_flags & B_DELWRI) == 0) {
462 		bp->b_flags |= B_DONE | B_DELWRI;
463 		reassignbuf(bp, bp->b_vp);
464 		++numdirtybuffers;
465 	}
466 
467 	/*
468 	 * This bmap keeps the system from needing to do the bmap later,
469 	 * perhaps when the system is attempting to do a sync.  Since it
470 	 * is likely that the indirect block -- or whatever other data structure
471 	 * that the filesystem needs is still in memory now, it is a good
472 	 * thing to do this.  Note also that if the pageout daemon is
473 	 * requesting a sync -- there might not be enough memory to do
474 	 * the bmap then...  So, this is important to do.
475 	 */
476 	if (bp->b_lblkno == bp->b_blkno) {
477 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
478 	}
479 
480 	/*
481 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
482 	 */
483 	vfs_setdirty(bp);
484 
485 	/*
486 	 * We need to do this here to satisfy the vnode_pager and the
487 	 * pageout daemon, so that it thinks that the pages have been
488 	 * "cleaned".  Note that since the pages are in a delayed write
489 	 * buffer -- the VFS layer "will" see that the pages get written
490 	 * out on the next sync, or perhaps the cluster will be completed.
491 	 */
492 	vfs_clean_pages(bp);
493 	bqrelse(bp);
494 
495 	/*
496 	 * XXX The soft dependency code is not prepared to
497 	 * have I/O done when a bdwrite is requested. For
498 	 * now we just let the write be delayed if it is
499 	 * requested by the soft dependency code.
500 	 */
501 	if ((vp = bp->b_vp) &&
502 	    ((vp->v_type == VBLK && vp->v_specmountpoint &&
503 		  (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
504 		 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
505 		return;
506 
507 	if (numdirtybuffers >= hidirtybuffers)
508 		flushdirtybuffers(0, 0);
509 
510 	return;
511 }
512 
513 
514 /*
515  * Same as first half of bdwrite, mark buffer dirty, but do not release it.
516  * Check how this compares with vfs_setdirty(); XXX [JRE]
517  */
518 void
519 bdirty(bp)
520       struct buf *bp;
521 {
522 
523 	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
524 	if ((bp->b_flags & B_DELWRI) == 0) {
525 		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
526 		reassignbuf(bp, bp->b_vp);
527 		++numdirtybuffers;
528 	}
529 }
530 
531 /*
532  * Asynchronous write.
533  * Start output on a buffer, but do not wait for it to complete.
534  * The buffer is released when the output completes.
535  */
536 void
537 bawrite(struct buf * bp)
538 {
539 	bp->b_flags |= B_ASYNC;
540 	(void) VOP_BWRITE(bp);
541 }
542 
543 /*
544  * Ordered write.
545  * Start output on a buffer, and flag it so that the device will write
546  * it in the order it was queued.  The buffer is released when the output
547  * completes.
548  */
549 int
550 bowrite(struct buf * bp)
551 {
552 	bp->b_flags |= B_ORDERED|B_ASYNC;
553 	return (VOP_BWRITE(bp));
554 }
555 
556 /*
557  * Release a buffer.
558  */
559 void
560 brelse(struct buf * bp)
561 {
562 	int s;
563 
564 	if (bp->b_flags & B_CLUSTER) {
565 		relpbuf(bp);
566 		return;
567 	}
568 
569 	s = splbio();
570 
571 	/* anyone need this block? */
572 	if (bp->b_flags & B_WANTED) {
573 		bp->b_flags &= ~(B_WANTED | B_AGE);
574 		wakeup(bp);
575 	}
576 
577 	if (bp->b_flags & B_LOCKED)
578 		bp->b_flags &= ~B_ERROR;
579 
580 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
581 	    (bp->b_bufsize <= 0)) {
582 		bp->b_flags |= B_INVAL;
583 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
584 			(*bioops.io_deallocate)(bp);
585 		if (bp->b_flags & B_DELWRI)
586 			--numdirtybuffers;
587 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
588 		if ((bp->b_flags & B_VMIO) == 0) {
589 			if (bp->b_bufsize)
590 				allocbuf(bp, 0);
591 			if (bp->b_vp)
592 				brelvp(bp);
593 		}
594 	}
595 
596 	/*
597 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
598 	 * is called with B_DELWRI set, the underlying pages may wind up
599 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
600 	 * because pages associated with a B_DELWRI bp are marked clean.
601 	 *
602 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
603 	 * if B_DELWRI is set.
604 	 */
605 
606 	if (bp->b_flags & B_DELWRI)
607 		bp->b_flags &= ~B_RELBUF;
608 
609 	/*
610 	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
611 	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
612 	 * but the VM object is kept around.  The B_NOCACHE flag is used to
613 	 * invalidate the pages in the VM object.
614 	 *
615 	 * The b_{validoff,validend,dirtyoff,dirtyend} values are relative
616 	 * to b_offset and currently have byte granularity, whereas the
617 	 * valid flags in the vm_pages have only DEV_BSIZE resolution.
618 	 * The byte resolution fields are used to avoid unnecessary re-reads
619 	 * of the buffer but the code really needs to be genericized so
620 	 * other filesystem modules can take advantage of these fields.
621 	 *
622 	 * XXX this seems to cause performance problems.
623 	 */
624 	if ((bp->b_flags & B_VMIO)
625 	    && !(bp->b_vp->v_tag == VT_NFS &&
626 		 bp->b_vp->v_type != VBLK &&
627 		 (bp->b_flags & B_DELWRI) != 0)
628 #ifdef notdef
629 	    && (bp->b_vp->v_tag != VT_NFS
630 		|| bp->b_vp->v_type == VBLK
631 		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
632 		|| bp->b_validend == 0
633 		|| (bp->b_validoff == 0
634 		    && bp->b_validend == bp->b_bufsize))
635 #endif
636 	    ) {
637 
638 		int i, j, resid;
639 		vm_page_t m;
640 		off_t foff;
641 		vm_pindex_t poff;
642 		vm_object_t obj;
643 		struct vnode *vp;
644 
645 		vp = bp->b_vp;
646 
647 		/*
648 		 * Get the base offset and length of the buffer.  Note that
649 		 * for block sizes that are less than PAGE_SIZE, the b_data
650 		 * base of the buffer does not represent exactly b_offset and
651 		 * neither b_offset nor b_size are necessarily page aligned.
652 		 * Instead, the starting position of b_offset is:
653 		 *
654 		 * 	b_data + (b_offset & PAGE_MASK)
655 		 *
656 		 * block sizes less than DEV_BSIZE (usually 512) are not
657 		 * supported due to the page granularity bits (m->valid,
658 		 * m->dirty, etc...).
659 		 *
660 		 * See man buf(9) for more information
661 		 */
662 
663 		resid = bp->b_bufsize;
664 		foff = bp->b_offset;
665 
666 		for (i = 0; i < bp->b_npages; i++) {
667 			m = bp->b_pages[i];
668 			vm_page_flag_clear(m, PG_ZERO);
669 			if (m == bogus_page) {
670 
671 				obj = (vm_object_t) vp->v_object;
672 				poff = OFF_TO_IDX(bp->b_offset);
673 
674 				for (j = i; j < bp->b_npages; j++) {
675 					m = bp->b_pages[j];
676 					if (m == bogus_page) {
677 						m = vm_page_lookup(obj, poff + j);
678 #if !defined(MAX_PERF)
679 						if (!m) {
680 							panic("brelse: page missing\n");
681 						}
682 #endif
683 						bp->b_pages[j] = m;
684 					}
685 				}
686 
687 				if ((bp->b_flags & B_INVAL) == 0) {
688 					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
689 				}
690 			}
691 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
692 				int poffset = foff & PAGE_MASK;
693 				int presid = resid > (PAGE_SIZE - poffset) ?
694 					(PAGE_SIZE - poffset) : resid;
695 				KASSERT(presid >= 0, ("brelse: extra page"));
696 				vm_page_set_invalid(m, poffset, presid);
697 			}
698 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
699 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
700 		}
701 
702 		if (bp->b_flags & (B_INVAL | B_RELBUF))
703 			vfs_vmio_release(bp);
704 
705 	} else if (bp->b_flags & B_VMIO) {
706 
707 		if (bp->b_flags & (B_INVAL | B_RELBUF))
708 			vfs_vmio_release(bp);
709 
710 	}
711 
712 #if !defined(MAX_PERF)
713 	if (bp->b_qindex != QUEUE_NONE)
714 		panic("brelse: free buffer onto another queue???");
715 #endif
716 
717 	/* enqueue */
718 	/* buffers with no memory */
719 	if (bp->b_bufsize == 0) {
720 		bp->b_flags |= B_INVAL;
721 		bp->b_qindex = QUEUE_EMPTY;
722 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
723 		LIST_REMOVE(bp, b_hash);
724 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
725 		bp->b_dev = NODEV;
726 		kvafreespace += bp->b_kvasize;
727 
728 	/* buffers with junk contents */
729 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
730 		bp->b_flags |= B_INVAL;
731 		bp->b_qindex = QUEUE_AGE;
732 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
733 		LIST_REMOVE(bp, b_hash);
734 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
735 		bp->b_dev = NODEV;
736 
737 	/* buffers that are locked */
738 	} else if (bp->b_flags & B_LOCKED) {
739 		bp->b_qindex = QUEUE_LOCKED;
740 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
741 
742 	/* buffers with stale but valid contents */
743 	} else if (bp->b_flags & B_AGE) {
744 		bp->b_qindex = QUEUE_AGE;
745 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
746 
747 	/* buffers with valid and quite potentially reusable contents */
748 	} else {
749 		bp->b_qindex = QUEUE_LRU;
750 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
751 	}
752 
753 	if ((bp->b_flags & B_INVAL) ||
754 		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
755 		if (bp->b_flags & B_DELWRI) {
756 			--numdirtybuffers;
757 			bp->b_flags &= ~B_DELWRI;
758 		}
759 		vfs_bio_need_satisfy();
760 	}
761 
762 	/* unlock */
763 	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
764 		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
765 	splx(s);
766 }
767 
768 /*
769  * Release a buffer, requeuing it without the invalidation and VM page rundown done by brelse().
770  */
771 void
772 bqrelse(struct buf * bp)
773 {
774 	int s;
775 
776 	s = splbio();
777 
778 	/* anyone need this block? */
779 	if (bp->b_flags & B_WANTED) {
780 		bp->b_flags &= ~(B_WANTED | B_AGE);
781 		wakeup(bp);
782 	}
783 
784 #if !defined(MAX_PERF)
785 	if (bp->b_qindex != QUEUE_NONE)
786 		panic("bqrelse: free buffer onto another queue???");
787 #endif
788 
789 	if (bp->b_flags & B_LOCKED) {
790 		bp->b_flags &= ~B_ERROR;
791 		bp->b_qindex = QUEUE_LOCKED;
792 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
793 		/* buffers with stale but valid contents */
794 	} else {
795 		bp->b_qindex = QUEUE_LRU;
796 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
797 	}
798 
799 	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
800 		vfs_bio_need_satisfy();
801 	}
802 
803 	/* unlock */
804 	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
805 		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
806 	splx(s);
807 }
808 
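/*
 * Release the VM pages backing a VMIO buffer: unwire each page (freeing
 * those with no valid data when it is safe to do so), unmap the buffer's
 * kva and dissociate the buffer from its vnode.
 */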
809 static void
810 vfs_vmio_release(bp)
811 	struct buf *bp;
812 {
813 	int i, s;
814 	vm_page_t m;
815 
816 	s = splvm();
817 	for (i = 0; i < bp->b_npages; i++) {
818 		m = bp->b_pages[i];
819 		bp->b_pages[i] = NULL;
820 		/*
821 		 * In order to keep page LRU ordering consistent, put
822 		 * everything on the inactive queue.
823 		 */
824 		vm_page_unwire(m, 0);
825 		/*
826 		 * We don't mess with busy pages, it is
827 		 * the responsibility of the process that
828 		 * busied the pages to deal with them.
829 		 */
830 		if ((m->flags & PG_BUSY) || (m->busy != 0))
831 			continue;
832 
833 		if (m->wire_count == 0) {
834 			vm_page_flag_clear(m, PG_ZERO);
835 			/*
836 			 * Might as well free the page if we can and it has
837 			 * no valid data.
838 			 */
839 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
840 				vm_page_busy(m);
841 				vm_page_protect(m, VM_PROT_NONE);
842 				vm_page_free(m);
843 			}
844 		}
845 	}
846 	splx(s);
847 	bufspace -= bp->b_bufsize;
848 	vmiospace -= bp->b_bufsize;
849 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
850 	bp->b_npages = 0;
851 	bp->b_bufsize = 0;
852 	bp->b_flags &= ~B_VMIO;
853 	if (bp->b_vp)
854 		brelvp(bp);
855 }
856 
857 /*
858  * Check to see if a block is currently memory resident.
859  */
860 struct buf *
861 gbincore(struct vnode * vp, daddr_t blkno)
862 {
863 	struct buf *bp;
864 	struct bufhashhdr *bh;
865 
866 	bh = BUFHASH(vp, blkno);
867 	bp = bh->lh_first;
868 
869 	/* Search hash chain */
870 	while (bp != NULL) {
871 		/* hit */
872 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
873 		    (bp->b_flags & B_INVAL) == 0) {
874 			break;
875 		}
876 		bp = bp->b_hash.le_next;
877 	}
878 	return (bp);
879 }
880 
881 /*
882  * this routine implements clustered async writes for
883  * clearing out B_DELWRI buffers...  This is much better
884  * than the old way of writing only one buffer at a time.
885  */
886 int
887 vfs_bio_awrite(struct buf * bp)
888 {
889 	int i;
890 	daddr_t lblkno = bp->b_lblkno;
891 	struct vnode *vp = bp->b_vp;
892 	int s;
893 	int ncl;
894 	struct buf *bpa;
895 	int nwritten;
896 	int size;
897 	int maxcl;
898 
899 	s = splbio();
900 	/*
901 	 * right now we support clustered writing only to regular files
902 	 */
903 	if ((vp->v_type == VREG) &&
904 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
905 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
906 
907 		size = vp->v_mount->mnt_stat.f_iosize;
908 		maxcl = MAXPHYS / size;
909 
910 		for (i = 1; i < maxcl; i++) {
911 			if ((bpa = gbincore(vp, lblkno + i)) &&
912 			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
913 			    (B_DELWRI | B_CLUSTEROK)) &&
914 			    (bpa->b_bufsize == size)) {
915 				if ((bpa->b_blkno == bpa->b_lblkno) ||
916 				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
917 					break;
918 			} else {
919 				break;
920 			}
921 		}
922 		ncl = i;
923 		/*
924 		 * this is a possible cluster write
925 		 */
926 		if (ncl != 1) {
927 			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
928 			splx(s);
929 			return nwritten;
930 		}
931 	}
932 
933 	bremfree(bp);
934 	bp->b_flags |= B_BUSY | B_ASYNC;
935 
936 	splx(s);
937 	/*
938 	 * default (old) behavior, writing out only one block
939 	 */
940 	nwritten = bp->b_bufsize;
941 	(void) VOP_BWRITE(bp);
942 	return nwritten;
943 }
944 
945 
946 /*
947  * Find a buffer header which is available for use.
948  */
949 static struct buf *
950 getnewbuf(struct vnode *vp, daddr_t blkno,
951 	int slpflag, int slptimeo, int size, int maxsize)
952 {
953 	struct buf *bp, *bp1;
954 	int nbyteswritten = 0;
955 	vm_offset_t addr;
956 	static int writerecursion = 0;
957 
958 start:
959 	if (bufspace >= maxbufspace)
960 		goto trytofreespace;
961 
962 	/* can we constitute a new buffer? */
963 	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
964 #if !defined(MAX_PERF)
965 		if (bp->b_qindex != QUEUE_EMPTY)
966 			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
967 			    bp->b_qindex);
968 #endif
969 		bp->b_flags |= B_BUSY;
970 		bremfree(bp);
971 		goto fillbuf;
972 	}
973 trytofreespace:
974 	/*
975 	 * We keep the file I/O from hogging metadata I/O.
976 	 * This is desirable because file data is cached in the
977 	 * VM/Buffer cache even if a buffer is freed.
978 	 */
979 	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
980 #if !defined(MAX_PERF)
981 		if (bp->b_qindex != QUEUE_AGE)
982 			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
983 			    bp->b_qindex);
984 #endif
985 	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
986 #if !defined(MAX_PERF)
987 		if (bp->b_qindex != QUEUE_LRU)
988 			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
989 			    bp->b_qindex);
990 #endif
991 	}
992 	if (!bp) {
993 		/* wait for a free buffer of any kind */
994 		needsbuffer |= VFS_BIO_NEED_ANY;
995 		do
996 			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
997 			    slptimeo);
998 		while (needsbuffer & VFS_BIO_NEED_ANY);
999 		return (0);
1000 	}
1001 
1002 	KASSERT(!(bp->b_flags & B_BUSY),
1003 		("getnewbuf: busy buffer on free list\n"));
1004 
1005 	/*
1006 	 * We are fairly aggressive about freeing VMIO buffers, but since
1007 	 * the buffering is intact without buffer headers, there is not
1008 	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
1009 	 */
1010 	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1011 		if ((bp->b_flags & B_VMIO) == 0 ||
1012 			(vmiospace < maxvmiobufspace)) {
1013 			--bp->b_usecount;
1014 			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1015 			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1016 				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1017 				goto start;
1018 			}
1019 			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1020 		}
1021 	}
1022 
1023 
1024 	/* if we are a delayed write, convert to an async write */
1025 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1026 
1027 		/*
1028 		 * If our delayed write is likely to be used soon, then
1029 		 * recycle back onto the LRU queue.
1030 		 */
1031 		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1032 			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1033 
1034 			if (bp->b_usecount > 0) {
1035 				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1036 
1037 					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1038 
1039 					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1040 						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1041 						bp->b_usecount--;
1042 						goto start;
1043 					}
1044 					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1045 				}
1046 			}
1047 		}
1048 
1049 		/*
1050 		 * Certain layered filesystems can recursively re-enter the vfs_bio
1051 		 * code, due to delayed writes.  This helps keep the system from
1052 		 * deadlocking.
1053 		 */
1054 		if (writerecursion > 0) {
1055 			if (writerecursion > 5) {
1056 				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1057 				while (bp) {
1058 					if ((bp->b_flags & B_DELWRI) == 0)
1059 						break;
1060 					bp = TAILQ_NEXT(bp, b_freelist);
1061 				}
1062 				if (bp == NULL) {
1063 					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1064 					while (bp) {
1065 						if ((bp->b_flags & B_DELWRI) == 0)
1066 							break;
1067 						bp = TAILQ_NEXT(bp, b_freelist);
1068 					}
1069 				}
1070 				if (bp == NULL)
1071 					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1072 			} else {
1073 				bremfree(bp);
1074 				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1075 				nbyteswritten += bp->b_bufsize;
1076 				++writerecursion;
1077 				VOP_BWRITE(bp);
1078 				--writerecursion;
1079 				if (!slpflag && !slptimeo) {
1080 					return (0);
1081 				}
1082 				goto start;
1083 			}
1084 		} else {
1085 			++writerecursion;
1086 			nbyteswritten += vfs_bio_awrite(bp);
1087 			--writerecursion;
1088 			if (!slpflag && !slptimeo) {
1089 				return (0);
1090 			}
1091 			goto start;
1092 		}
1093 	}
1094 
1095 	if (bp->b_flags & B_WANTED) {
1096 		bp->b_flags &= ~B_WANTED;
1097 		wakeup(bp);
1098 	}
1099 	bremfree(bp);
1100 	bp->b_flags |= B_BUSY;
1101 
1102 	if (bp->b_flags & B_VMIO) {
1103 		bp->b_flags &= ~B_ASYNC;
1104 		vfs_vmio_release(bp);
1105 	}
1106 
1107 	if (bp->b_vp)
1108 		brelvp(bp);
1109 
1110 fillbuf:
1111 
1112 	/* we are not free, nor do we contain interesting data */
1113 	if (bp->b_rcred != NOCRED) {
1114 		crfree(bp->b_rcred);
1115 		bp->b_rcred = NOCRED;
1116 	}
1117 	if (bp->b_wcred != NOCRED) {
1118 		crfree(bp->b_wcred);
1119 		bp->b_wcred = NOCRED;
1120 	}
1121 	if (LIST_FIRST(&bp->b_dep) != NULL &&
1122 	    bioops.io_deallocate)
1123 		(*bioops.io_deallocate)(bp);
1124 
1125 	LIST_REMOVE(bp, b_hash);
1126 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1127 	if (bp->b_bufsize) {
1128 		allocbuf(bp, 0);
1129 	}
1130 	bp->b_flags = B_BUSY;
1131 	bp->b_dev = NODEV;
1132 	bp->b_vp = NULL;
1133 	bp->b_blkno = bp->b_lblkno = 0;
1134 	bp->b_offset = NOOFFSET;
1135 	bp->b_iodone = 0;
1136 	bp->b_error = 0;
1137 	bp->b_resid = 0;
1138 	bp->b_bcount = 0;
1139 	bp->b_npages = 0;
1140 	bp->b_dirtyoff = bp->b_dirtyend = 0;
1141 	bp->b_validoff = bp->b_validend = 0;
1142 	bp->b_usecount = 5;
1143 	/* Here, not kern_physio.c, is where this should be done */
1144 	LIST_INIT(&bp->b_dep);
1145 
1146 	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1147 
1148 	/*
1149 	 * we assume that buffer_map is not at address 0
1150 	 */
1151 	addr = 0;
1152 	if (maxsize != bp->b_kvasize) {
1153 		bfreekva(bp);
1154 
1155 findkvaspace:
1156 		/*
1157 		 * See if we have buffer kva space
1158 		 */
1159 		if (vm_map_findspace(buffer_map,
1160 			vm_map_min(buffer_map), maxsize, &addr)) {
1161 			if (kvafreespace > 0) {
1162 				int totfree = 0, freed;
1163 				do {
1164 					freed = 0;
1165 					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1166 						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1167 						if (bp1->b_kvasize != 0) {
1168 							totfree += bp1->b_kvasize;
1169 							freed = bp1->b_kvasize;
1170 							bremfree(bp1);
1171 							bfreekva(bp1);
1172 							brelse(bp1);
1173 							break;
1174 						}
1175 					}
1176 				} while (freed);
1177 				/*
1178 				 * if we found free space, then retry with the same buffer.
1179 				 */
1180 				if (totfree)
1181 					goto findkvaspace;
1182 			}
1183 			bp->b_flags |= B_INVAL;
1184 			brelse(bp);
1185 			goto trytofreespace;
1186 		}
1187 	}
1188 
1189 	/*
1190 	 * See if we have exceeded our allocated buffer space
1191 	 */
1192 	if (bufspace >= (maxbufspace + nbyteswritten)) {
1193 		bp->b_flags |= B_INVAL;
1194 		brelse(bp);
1195 		goto trytofreespace;
1196 	}
1197 
1198 	/*
1199 	 * create a map entry for the buffer -- in essence
1200 	 * reserving the kva space.
1201 	 */
1202 	if (addr) {
1203 		vm_map_insert(buffer_map, NULL, 0,
1204 			addr, addr + maxsize,
1205 			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1206 
1207 		bp->b_kvabase = (caddr_t) addr;
1208 		bp->b_kvasize = maxsize;
1209 	}
1210 	bp->b_data = bp->b_kvabase;
1211 
1212 	return (bp);
1213 }
1214 
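/*
 * Called from getblk() when the free-buffer count drops too low: push out
 * dirty buffers and possibly sleep on needsbuffer waiting for buffers to
 * be released.
 */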
1215 static void
1216 waitfreebuffers(int slpflag, int slptimeo) {
1217 	while (numfreebuffers < hifreebuffers) {
1218 		flushdirtybuffers(slpflag, slptimeo);
1219 		if (numfreebuffers < hifreebuffers)
1220 			break;
1221 		needsbuffer |= VFS_BIO_NEED_FREE;
1222 		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1223 			break;
1224 	}
1225 }
1226 
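/*
 * Write out delayed-write buffers until numdirtybuffers drops back to
 * lodirtybuffers.  Only one process flushes at a time: a recursive call
 * by the flusher returns immediately, and other callers wait for the
 * flush in progress before taking over.
 */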
1227 static void
1228 flushdirtybuffers(int slpflag, int slptimeo) {
1229 	int s;
1230 	static pid_t flushing = 0;
1231 
1232 	s = splbio();
1233 
1234 	if (flushing) {
1235 		if (flushing == curproc->p_pid) {
1236 			splx(s);
1237 			return;
1238 		}
1239 		while (flushing) {
1240 			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1241 				splx(s);
1242 				return;
1243 			}
1244 		}
1245 	}
1246 	flushing = curproc->p_pid;
1247 
1248 	while (numdirtybuffers > lodirtybuffers) {
1249 		struct buf *bp;
1250 		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1251 		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1252 		if (bp == NULL)
1253 			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1254 
1255 		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1256 			bp = TAILQ_NEXT(bp, b_freelist);
1257 		}
1258 
1259 		if (bp) {
1260 			vfs_bio_awrite(bp);
1261 			continue;
1262 		}
1263 		break;
1264 	}
1265 
1266 	flushing = 0;
1267 	wakeup(&flushing);
1268 	splx(s);
1269 }
1270 
1271 /*
1272  * Check to see if a block is currently memory resident.
1273  */
1274 struct buf *
1275 incore(struct vnode * vp, daddr_t blkno)
1276 {
1277 	struct buf *bp;
1278 
1279 	int s = splbio();
1280 	bp = gbincore(vp, blkno);
1281 	splx(s);
1282 	return (bp);
1283 }
1284 
1285 /*
1286  * Returns true if no I/O is needed to access the
1287  * associated VM object.  This is like incore except
1288  * it also hunts around in the VM system for the data.
1289  */
1290 
1291 int
1292 inmem(struct vnode * vp, daddr_t blkno)
1293 {
1294 	vm_object_t obj;
1295 	vm_offset_t toff, tinc, size;
1296 	vm_page_t m;
1297 	vm_ooffset_t off;
1298 
1299 	if (incore(vp, blkno))
1300 		return 1;
1301 	if (vp->v_mount == NULL)
1302 		return 0;
1303 	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1304 		return 0;
1305 
1306 	obj = vp->v_object;
1307 	size = PAGE_SIZE;
1308 	if (size > vp->v_mount->mnt_stat.f_iosize)
1309 		size = vp->v_mount->mnt_stat.f_iosize;
1310 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1311 
1312 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1313 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1314 		if (!m)
1315 			return 0;
1316 		tinc = size;
1317 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1318 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1319 		if (vm_page_is_valid(m,
1320 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1321 			return 0;
1322 	}
1323 	return 1;
1324 }
1325 
1326 /*
1327  * now we set the dirty range for the buffer --
1328  * for NFS -- if the file is mapped and pages have
1329  * been written to, let it know.  We want the
1330  * entire range of the buffer to be marked dirty if
1331  * any of the pages have been written to for consistency
1332  * with the b_validoff, b_validend set in the nfs write
1333  * code, and used by the nfs read code.
1334  */
1335 static void
1336 vfs_setdirty(struct buf *bp) {
1337 	int i;
1338 	vm_object_t object;
1339 	vm_offset_t boffset, offset;
1340 	/*
1341 	 * We qualify the scan for modified pages on whether the
1342 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1343 	 * is not cleared simply by protecting pages off.
1344 	 */
1345 	if ((bp->b_flags & B_VMIO) &&
1346 		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1347 		/*
1348 		 * test the pages to see if they have been modified directly
1349 		 * by users through the VM system.
1350 		 */
1351 		for (i = 0; i < bp->b_npages; i++) {
1352 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1353 			vm_page_test_dirty(bp->b_pages[i]);
1354 		}
1355 
1356 		/*
1357 		 * scan forwards for the first page modified
1358 		 */
1359 		for (i = 0; i < bp->b_npages; i++) {
1360 			if (bp->b_pages[i]->dirty) {
1361 				break;
1362 			}
1363 		}
1364 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1365 		if (boffset < bp->b_dirtyoff) {
1366 			bp->b_dirtyoff = max(boffset, 0);
1367 		}
1368 
1369 		/*
1370 		 * scan backwards for the last page modified
1371 		 */
1372 		for (i = bp->b_npages - 1; i >= 0; --i) {
1373 			if (bp->b_pages[i]->dirty) {
1374 				break;
1375 			}
1376 		}
1377 		boffset = (i + 1);
1378 #if 0
1379 		offset = boffset + bp->b_pages[0]->pindex;
1380 		if (offset >= object->size)
1381 			boffset = object->size - bp->b_pages[0]->pindex;
1382 #endif
1383 		boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1384 		if (bp->b_dirtyend < boffset)
1385 			bp->b_dirtyend = min(boffset, bp->b_bufsize);
1386 	}
1387 }
1388 
1389 /*
1390  * Get a block given a specified block and offset into a file/device.
1391  */
1392 struct buf *
1393 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1394 {
1395 	struct buf *bp;
1396 	int i, s;
1397 	struct bufhashhdr *bh;
1398 	int maxsize;
1399 
1400 #if !defined(MAX_PERF)
1401 	if (size > MAXBSIZE)
1402 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1403 #endif
1404 
1405 	s = splbio();
1406 loop:
1407 	if (numfreebuffers < lofreebuffers) {
1408 		waitfreebuffers(slpflag, slptimeo);
1409 	}
1410 
1411 	if ((bp = gbincore(vp, blkno))) {
1412 		if (bp->b_flags & B_BUSY) {
1413 
1414 			bp->b_flags |= B_WANTED;
1415 			if (bp->b_usecount < BUF_MAXUSE)
1416 				++bp->b_usecount;
1417 
1418 			if (!tsleep(bp,
1419 				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1420 				goto loop;
1421 			}
1422 
1423 			splx(s);
1424 			return (struct buf *) NULL;
1425 		}
1426 		bp->b_flags |= B_BUSY | B_CACHE;
1427 		bremfree(bp);
1428 
1429 		/*
1430 		 * check for size inconsistencies (note that they shouldn't
1431 		 * happen but do when filesystems don't handle the size changes
1432 		 * correctly.) We are conservative on metadata and don't just
1433 		 * extend the buffer but write (if needed) and re-constitute it.
1434 		 */
1435 
1436 		if (bp->b_bcount != size) {
1437 			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1438 				allocbuf(bp, size);
1439 			} else {
1440 				if (bp->b_flags & B_DELWRI) {
1441 					bp->b_flags |= B_NOCACHE;
1442 					VOP_BWRITE(bp);
1443 				} else {
1444 					if ((bp->b_flags & B_VMIO) &&
1445 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
1446 						bp->b_flags |= B_RELBUF;
1447 						brelse(bp);
1448 					} else {
1449 						bp->b_flags |= B_NOCACHE;
1450 						VOP_BWRITE(bp);
1451 					}
1452 				}
1453 				goto loop;
1454 			}
1455 		}
1456 
1457 		KASSERT(bp->b_offset != NOOFFSET,
1458 			("getblk: no buffer offset"));
1459 
1460 		/*
1461 		 * Check that the constituted buffer really deserves to have the
1462 		 * B_CACHE bit set.  B_VMIO type buffers might not
1463 		 * contain fully valid pages.  Normal (old-style) buffers
1464 		 * should be fully valid.
1465 		 */
1466 		if (bp->b_flags & B_VMIO) {
1467 			int checksize = bp->b_bufsize;
1468 			int poffset = bp->b_offset & PAGE_MASK;
1469 			int resid;
1470 			for (i = 0; i < bp->b_npages; i++) {
1471 				resid = (checksize > (PAGE_SIZE - poffset)) ?
1472 					(PAGE_SIZE - poffset) : checksize;
1473 				if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1474 					bp->b_flags &= ~(B_CACHE | B_DONE);
1475 					break;
1476 				}
1477 				checksize -= resid;
1478 				poffset = 0;
1479 			}
1480 		}
1481 
1482 		if (bp->b_usecount < BUF_MAXUSE)
1483 			++bp->b_usecount;
1484 		splx(s);
1485 		return (bp);
1486 	} else {
1487 		int bsize, maxsize, vmio;
1488 		off_t offset;
1489 
1490 		if (vp->v_type == VBLK)
1491 			bsize = DEV_BSIZE;
1492 		else if (vp->v_mountedhere)
1493 			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
1494 		else if (vp->v_mount)
1495 			bsize = vp->v_mount->mnt_stat.f_iosize;
1496 		else
1497 			bsize = size;
1498 
1499 		offset = (off_t)blkno * bsize;
1500 		vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
1501 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
1502 		maxsize = imax(maxsize, bsize);
1503 
1504 		if ((bp = getnewbuf(vp, blkno,
1505 			slpflag, slptimeo, size, maxsize)) == 0) {
1506 			if (slpflag || slptimeo) {
1507 				splx(s);
1508 				return NULL;
1509 			}
1510 			goto loop;
1511 		}
1512 
1513 		/*
1514 		 * This code is used to make sure that a buffer is not
1515 		 * created while the getnewbuf routine is blocked.
1516 		 * Normally the vnode is locked so this isn't a problem.
1517 		 * VBLK type I/O requests, however, don't lock the vnode.
1518 		 */
1519 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
1520 			bp->b_flags |= B_INVAL;
1521 			brelse(bp);
1522 			goto loop;
1523 		}
1524 
1525 		/*
1526 		 * Insert the buffer into the hash, so that it can
1527 		 * be found by incore.
1528 		 */
1529 		bp->b_blkno = bp->b_lblkno = blkno;
1530 		bp->b_offset = offset;
1531 
1532 		bgetvp(vp, bp);
1533 		LIST_REMOVE(bp, b_hash);
1534 		bh = BUFHASH(vp, blkno);
1535 		LIST_INSERT_HEAD(bh, bp, b_hash);
1536 
1537 		if (vmio) {
1538 			bp->b_flags |= (B_VMIO | B_CACHE);
1539 #if defined(VFS_BIO_DEBUG)
1540 			if (vp->v_type != VREG && vp->v_type != VBLK)
1541 				printf("getblk: vmioing file type %d???\n", vp->v_type);
1542 #endif
1543 		} else {
1544 			bp->b_flags &= ~B_VMIO;
1545 		}
1546 
1547 		allocbuf(bp, size);
1548 
1549 		splx(s);
1550 		return (bp);
1551 	}
1552 }
1553 
1554 /*
1555  * Get an empty, disassociated buffer of given size.
1556  */
1557 struct buf *
1558 geteblk(int size)
1559 {
1560 	struct buf *bp;
1561 	int s;
1562 
1563 	s = splbio();
1564 	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1565 	splx(s);
1566 	allocbuf(bp, size);
1567 	bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1568 	return (bp);
1569 }
1570 
1571 
1572 /*
1573  * This code constitutes the buffer memory from either anonymous system
1574  * memory (in the case of non-VMIO operations) or from an associated
1575  * VM object (in the case of VMIO operations).
1576  *
1577  * Note that this code is tricky, and has many complications to resolve
1578  * deadlock or inconsistent data situations.  Tread lightly!!!
1579  *
1580  * Modify the length of a buffer's underlying buffer storage without
1581  * destroying information (unless, of course the buffer is shrinking).
1582  */
1583 int
1584 allocbuf(struct buf * bp, int size)
1585 {
1586 
1587 	int s;
1588 	int newbsize, mbsize;
1589 	int i;
1590 
1591 #if !defined(MAX_PERF)
1592 	if (!(bp->b_flags & B_BUSY))
1593 		panic("allocbuf: buffer not busy");
1594 
1595 	if (bp->b_kvasize < size)
1596 		panic("allocbuf: buffer too small");
1597 #endif
1598 
1599 	if ((bp->b_flags & B_VMIO) == 0) {
1600 		caddr_t origbuf;
1601 		int origbufsize;
1602 		/*
1603 		 * Just get anonymous memory from the kernel
1604 		 */
1605 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1606 #if !defined(NO_B_MALLOC)
1607 		if (bp->b_flags & B_MALLOC)
1608 			newbsize = mbsize;
1609 		else
1610 #endif
1611 			newbsize = round_page(size);
1612 
1613 		if (newbsize < bp->b_bufsize) {
1614 #if !defined(NO_B_MALLOC)
1615 			/*
1616 			 * malloced buffers are not shrunk
1617 			 */
1618 			if (bp->b_flags & B_MALLOC) {
1619 				if (newbsize) {
1620 					bp->b_bcount = size;
1621 				} else {
1622 					free(bp->b_data, M_BIOBUF);
1623 					bufspace -= bp->b_bufsize;
1624 					bufmallocspace -= bp->b_bufsize;
1625 					bp->b_data = bp->b_kvabase;
1626 					bp->b_bufsize = 0;
1627 					bp->b_bcount = 0;
1628 					bp->b_flags &= ~B_MALLOC;
1629 				}
1630 				return 1;
1631 			}
1632 #endif
1633 			vm_hold_free_pages(
1634 			    bp,
1635 			    (vm_offset_t) bp->b_data + newbsize,
1636 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1637 		} else if (newbsize > bp->b_bufsize) {
1638 #if !defined(NO_B_MALLOC)
1639 			/*
1640 			 * We only use malloced memory on the first allocation,
1641 			 * and revert to page-allocated memory when the buffer grows.
1642 			 */
1643 			if ( (bufmallocspace < maxbufmallocspace) &&
1644 				(bp->b_bufsize == 0) &&
1645 				(mbsize <= PAGE_SIZE/2)) {
1646 
1647 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1648 				bp->b_bufsize = mbsize;
1649 				bp->b_bcount = size;
1650 				bp->b_flags |= B_MALLOC;
1651 				bufspace += mbsize;
1652 				bufmallocspace += mbsize;
1653 				return 1;
1654 			}
1655 #endif
1656 			origbuf = NULL;
1657 			origbufsize = 0;
1658 #if !defined(NO_B_MALLOC)
1659 			/*
1660 			 * If the buffer is growing on its other-than-first allocation,
1661 			 * then we revert to the page-allocation scheme.
1662 			 */
1663 			if (bp->b_flags & B_MALLOC) {
1664 				origbuf = bp->b_data;
1665 				origbufsize = bp->b_bufsize;
1666 				bp->b_data = bp->b_kvabase;
1667 				bufspace -= bp->b_bufsize;
1668 				bufmallocspace -= bp->b_bufsize;
1669 				bp->b_bufsize = 0;
1670 				bp->b_flags &= ~B_MALLOC;
1671 				newbsize = round_page(newbsize);
1672 			}
1673 #endif
1674 			vm_hold_load_pages(
1675 			    bp,
1676 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1677 			    (vm_offset_t) bp->b_data + newbsize);
1678 #if !defined(NO_B_MALLOC)
1679 			if (origbuf) {
1680 				bcopy(origbuf, bp->b_data, origbufsize);
1681 				free(origbuf, M_BIOBUF);
1682 			}
1683 #endif
1684 		}
1685 	} else {
1686 		vm_page_t m;
1687 		int desiredpages;
1688 
1689 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1690 		desiredpages = (size == 0) ? 0 :
1691 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
1692 
1693 #if !defined(NO_B_MALLOC)
1694 		if (bp->b_flags & B_MALLOC)
1695 			panic("allocbuf: VMIO buffer can't be malloced");
1696 #endif
1697 
1698 		if (newbsize < bp->b_bufsize) {
1699 			if (desiredpages < bp->b_npages) {
1700 				for (i = desiredpages; i < bp->b_npages; i++) {
1701 					/*
1702 					 * the page is not freed here -- it
1703 					 * is the responsibility of vnode_pager_setsize
1704 					 */
1705 					m = bp->b_pages[i];
1706 					KASSERT(m != bogus_page,
1707 						("allocbuf: bogus page found"));
1708 					vm_page_sleep(m, "biodep", &m->busy);
1709 
1710 					bp->b_pages[i] = NULL;
1711 					vm_page_unwire(m, 0);
1712 				}
1713 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
1714 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1715 				bp->b_npages = desiredpages;
1716 			}
1717 		} else if (newbsize > bp->b_bufsize) {
1718 			vm_object_t obj;
1719 			vm_offset_t tinc, toff;
1720 			vm_ooffset_t off;
1721 			vm_pindex_t objoff;
1722 			int pageindex, curbpnpages;
1723 			struct vnode *vp;
1724 			int bsize;
1725 			int orig_validoff = bp->b_validoff;
1726 			int orig_validend = bp->b_validend;
1727 
1728 			vp = bp->b_vp;
1729 
1730 			if (vp->v_type == VBLK)
1731 				bsize = DEV_BSIZE;
1732 			else
1733 				bsize = vp->v_mount->mnt_stat.f_iosize;
1734 
1735 			if (bp->b_npages < desiredpages) {
1736 				obj = vp->v_object;
1737 				tinc = PAGE_SIZE;
1738 
1739 				off = bp->b_offset;
1740 				KASSERT(bp->b_offset != NOOFFSET,
1741 					("allocbuf: no buffer offset"));
1742 
1743 				curbpnpages = bp->b_npages;
1744 		doretry:
1745 				bp->b_validoff = orig_validoff;
1746 				bp->b_validend = orig_validend;
1747 				bp->b_flags |= B_CACHE;
1748 				for (toff = 0; toff < newbsize; toff += tinc) {
1749 					objoff = OFF_TO_IDX(off + toff);
1750 					pageindex = objoff - OFF_TO_IDX(off);
1751 					tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
1752 					if (pageindex < curbpnpages) {
1753 
1754 						m = bp->b_pages[pageindex];
1755 #ifdef VFS_BIO_DIAG
1756 						if (m->pindex != objoff)
1757 							panic("allocbuf: page changed offset?!!!?");
1758 #endif
1759 						if (tinc > (newbsize - toff))
1760 							tinc = newbsize - toff;
1761 						if (bp->b_flags & B_CACHE)
1762 							vfs_buf_set_valid(bp, off, toff, tinc, m);
1763 						continue;
1764 					}
1765 					m = vm_page_lookup(obj, objoff);
1766 					if (!m) {
1767 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1768 						if (!m) {
1769 							VM_WAIT;
1770 							vm_pageout_deficit += (desiredpages - curbpnpages);
1771 							goto doretry;
1772 						}
1773 
1774 						vm_page_wire(m);
1775 						vm_page_flag_clear(m, PG_BUSY);
1776 						bp->b_flags &= ~B_CACHE;
1777 
1778 					} else if (m->flags & PG_BUSY) {
1779 						s = splvm();
1780 						if (m->flags & PG_BUSY) {
1781 							vm_page_flag_set(m, PG_WANTED);
1782 							tsleep(m, PVM, "pgtblk", 0);
1783 						}
1784 						splx(s);
1785 						goto doretry;
1786 					} else {
1787 						if ((curproc != pageproc) &&
1788 							((m->queue - m->pc) == PQ_CACHE) &&
1789 						    ((cnt.v_free_count + cnt.v_cache_count) <
1790 								(cnt.v_free_min + cnt.v_cache_min))) {
1791 							pagedaemon_wakeup();
1792 						}
1793 						if (tinc > (newbsize - toff))
1794 							tinc = newbsize - toff;
1795 						if (bp->b_flags & B_CACHE)
1796 							vfs_buf_set_valid(bp, off, toff, tinc, m);
1797 						vm_page_flag_clear(m, PG_ZERO);
1798 						vm_page_wire(m);
1799 					}
1800 					bp->b_pages[pageindex] = m;
1801 					curbpnpages = pageindex + 1;
1802 				}
1803 				if (vp->v_tag == VT_NFS &&
1804 				    vp->v_type != VBLK) {
1805 					if (bp->b_dirtyend > 0) {
1806 						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1807 						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1808 					}
1809 					if (bp->b_validend == 0)
1810 						bp->b_flags &= ~B_CACHE;
1811 				}
1812 				bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
1813 				bp->b_npages = curbpnpages;
1814 				pmap_qenter((vm_offset_t) bp->b_data,
1815 					bp->b_pages, bp->b_npages);
1816 				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1817 			}
1818 		}
1819 	}
1820 	if (bp->b_flags & B_VMIO)
1821 		vmiospace += (newbsize - bp->b_bufsize);
1822 	bufspace += (newbsize - bp->b_bufsize);
1823 	bp->b_bufsize = newbsize;
1824 	bp->b_bcount = size;
1825 	return 1;
1826 }
1827 
1828 /*
1829  * Wait for buffer I/O completion, returning error status.
1830  */
1831 int
1832 biowait(register struct buf * bp)
1833 {
1834 	int s;
1835 
1836 	s = splbio();
1837 	while ((bp->b_flags & B_DONE) == 0)
1838 #if defined(NO_SCHEDULE_MODS)
1839 		tsleep(bp, PRIBIO, "biowait", 0);
1840 #else
1841 		if (bp->b_flags & B_READ)
1842 			tsleep(bp, PRIBIO, "biord", 0);
1843 		else
1844 			tsleep(bp, PRIBIO, "biowr", 0);
1845 #endif
1846 	splx(s);
1847 	if (bp->b_flags & B_EINTR) {
1848 		bp->b_flags &= ~B_EINTR;
1849 		return (EINTR);
1850 	}
1851 	if (bp->b_flags & B_ERROR) {
1852 		return (bp->b_error ? bp->b_error : EIO);
1853 	} else {
1854 		return (0);
1855 	}
1856 }
1857 
1858 /*
1859  * Finish I/O on a buffer, calling an optional function.
1860  * This is usually called from interrupt level, so process blocking
1861  * is not *a good idea*.
1862  */
1863 void
1864 biodone(register struct buf * bp)
1865 {
1866 	int s;
1867 
1868 	s = splbio();
1869 
1870 #if !defined(MAX_PERF)
1871 	if (!(bp->b_flags & B_BUSY))
1872 		panic("biodone: buffer not busy");
1873 #endif
1874 
1875 	if (bp->b_flags & B_DONE) {
1876 		splx(s);
1877 #if !defined(MAX_PERF)
1878 		printf("biodone: buffer already done\n");
1879 #endif
1880 		return;
1881 	}
1882 	bp->b_flags |= B_DONE;
1883 
1884 	if (bp->b_flags & B_FREEBUF) {
1885 		brelse(bp);
1886 		splx(s);
1887 		return;
1888 	}
1889 
1890 	if ((bp->b_flags & B_READ) == 0) {
1891 		vwakeup(bp);
1892 	}
1893 
1894 	/* call optional completion function if requested */
1895 	if (bp->b_flags & B_CALL) {
1896 		bp->b_flags &= ~B_CALL;
1897 		(*bp->b_iodone) (bp);
1898 		splx(s);
1899 		return;
1900 	}
1901 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1902 		(*bioops.io_complete)(bp);
1903 
1904 	if (bp->b_flags & B_VMIO) {
1905 		int i, resid;
1906 		vm_ooffset_t foff;
1907 		vm_page_t m;
1908 		vm_object_t obj;
1909 		int iosize;
1910 		struct vnode *vp = bp->b_vp;
1911 
1912 		obj = vp->v_object;
1913 
1914 #if defined(VFS_BIO_DEBUG)
1915 		if (vp->v_usecount == 0) {
1916 			panic("biodone: zero vnode ref count");
1917 		}
1918 
1919 		if (vp->v_object == NULL) {
1920 			panic("biodone: missing VM object");
1921 		}
1922 
1923 		if ((vp->v_flag & VOBJBUF) == 0) {
1924 			panic("biodone: vnode is not setup for merged cache");
1925 		}
1926 #endif
1927 
1928 		foff = bp->b_offset;
1929 		KASSERT(bp->b_offset != NOOFFSET,
1930 			("biodone: no buffer offset"));
1931 
1932 #if !defined(MAX_PERF)
1933 		if (!obj) {
1934 			panic("biodone: no object");
1935 		}
1936 #endif
1937 #if defined(VFS_BIO_DEBUG)
1938 		if (obj->paging_in_progress < bp->b_npages) {
1939 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1940 			    obj->paging_in_progress, bp->b_npages);
1941 		}
1942 #endif
1943 		iosize = bp->b_bufsize;
1944 		for (i = 0; i < bp->b_npages; i++) {
1945 			int bogusflag = 0;
1946 			m = bp->b_pages[i];
1947 			if (m == bogus_page) {
1948 				bogusflag = 1;
1949 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1950 				if (!m) {
1951 #if defined(VFS_BIO_DEBUG)
1952 					printf("biodone: page disappeared\n");
1953 #endif
1954 					vm_object_pip_subtract(obj, 1);
1955 					continue;
1956 				}
1957 				bp->b_pages[i] = m;
1958 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
1959 			}
1960 #if defined(VFS_BIO_DEBUG)
1961 			if (OFF_TO_IDX(foff) != m->pindex) {
1962 				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1963 			}
1964 #endif
1965 			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1966 			if (resid > iosize)
1967 				resid = iosize;
1968 
1969 			/*
1970 			 * In the write case, the valid and clean bits are
1971 			 * already changed correctly, so we only need to do this
1972 			 * here in the read case.
1973 			 */
1974 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1975 				vfs_page_set_valid(bp, foff, i, m);
1976 			}
1977 			vm_page_flag_clear(m, PG_ZERO);
1978 
1979 			/*
1980 			 * when debugging new filesystems or buffer I/O methods, this
1981 			 * is the most common error that pops up.  if you see this, you
1982 			 * have not set the page busy flag correctly!!!
1983 			 */
1984 			if (m->busy == 0) {
1985 #if !defined(MAX_PERF)
1986 				printf("biodone: page is not busy, "
1987 				    "pindex: %d, foff: 0x(%x,%x), "
1988 				    "resid: %d, index: %d\n",
1989 				    (int) m->pindex, (int)(foff >> 32),
1990 				    (int)(foff & 0xffffffff), resid, i);
1992 				if (vp->v_type != VBLK)
1994 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1995 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1996 					    (int) bp->b_lblkno,
1997 					    bp->b_flags, bp->b_npages);
1998 				else
1999 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2000 					    (int) bp->b_lblkno,
2001 					    bp->b_flags, bp->b_npages);
2002 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2003 				    m->valid, m->dirty, m->wire_count);
2004 #endif
2005 				panic("biodone: page is not busy");
2006 			}
2007 			vm_page_io_finish(m);
2008 			vm_object_pip_subtract(obj, 1);
2009 			foff += resid;
2010 			iosize -= resid;
2011 		}
2012 		if (obj &&
2013 			(obj->paging_in_progress == 0) &&
2014 		    (obj->flags & OBJ_PIPWNT)) {
2015 			vm_object_clear_flag(obj, OBJ_PIPWNT);
2016 			wakeup(obj);
2017 		}
2018 	}
2019 	/*
2020 	 * For asynchronous completions, release the buffer now.  brelse()
2021 	 * checks for B_WANTED and does the wakeup there if necessary, so
2022 	 * there is no need to do a wakeup here in the async case.
2023 	 */
2024 
2025 	if (bp->b_flags & B_ASYNC) {
2026 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2027 			brelse(bp);
2028 		else
2029 			bqrelse(bp);
2030 	} else {
2031 		bp->b_flags &= ~B_WANTED;
2032 		wakeup(bp);
2033 	}
2034 	splx(s);
2035 }
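
/*
 * Example (illustrative sketch, not part of the original source):
 * asynchronous completion through B_CALL.  biodone() clears B_CALL and
 * invokes b_iodone at splbio(), so the callback must not block and is
 * responsible for releasing the buffer itself.  example_iodone() and
 * example_strategy() are hypothetical names.
 */
#if 0
static void
example_iodone(struct buf *bp)
{
	if (bp->b_flags & B_ERROR)
		printf("example: async write error %d\n", bp->b_error);
	brelse(bp);		/* biodone() returned right after calling us */
}

static void
example_start_async_write(struct buf *bp)
{
	bp->b_flags &= ~(B_READ | B_DONE);
	bp->b_flags |= B_ASYNC | B_CALL;
	bp->b_iodone = example_iodone;
	example_strategy(bp);	/* hypothetical driver entry point */
}
#endif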
2036 
2037 #if 0	/* not with Kirk's code */
2038 static int vfs_update_interval = 30;
2039 
2040 static void
2041 vfs_update()
2042 {
2043 	while (1) {
2044 		tsleep(&vfs_update_wakeup, PUSER, "update",
2045 		    hz * vfs_update_interval);
2046 		vfs_update_wakeup = 0;
2047 		sync(curproc, NULL);
2048 	}
2049 }
2050 
2051 static int
2052 sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2053 {
2054 	int error = sysctl_handle_int(oidp,
2055 		oidp->oid_arg1, oidp->oid_arg2, req);
2056 	if (!error)
2057 		wakeup(&vfs_update_wakeup);
2058 	return error;
2059 }
2060 
2061 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2062 	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2063 
2064 #endif
2065 
2066 
2067 /*
2068  * This routine is called in lieu of biodone in the case of
2069  * incomplete I/O.  This keeps the busy status for pages
2070  * consistent.
2071  */
2072 void
2073 vfs_unbusy_pages(struct buf * bp)
2074 {
2075 	int i;
2076 
2077 	if (bp->b_flags & B_VMIO) {
2078 		struct vnode *vp = bp->b_vp;
2079 		vm_object_t obj = vp->v_object;
2080 
2081 		for (i = 0; i < bp->b_npages; i++) {
2082 			vm_page_t m = bp->b_pages[i];
2083 
2084 			if (m == bogus_page) {
2085 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2086 #if !defined(MAX_PERF)
2087 				if (!m) {
2088 					panic("vfs_unbusy_pages: page missing");
2089 				}
2090 #endif
2091 				bp->b_pages[i] = m;
2092 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2093 			}
2094 			vm_object_pip_subtract(obj, 1);
2095 			vm_page_flag_clear(m, PG_ZERO);
2096 			vm_page_io_finish(m);
2097 		}
2098 		if (obj->paging_in_progress == 0 &&
2099 		    (obj->flags & OBJ_PIPWNT)) {
2100 			vm_object_clear_flag(obj, OBJ_PIPWNT);
2101 			wakeup(obj);
2102 		}
2103 	}
2104 }
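
/*
 * Example (illustrative sketch, not part of the original source): how
 * vfs_busy_pages() and vfs_unbusy_pages() pair up.  If the I/O is never
 * handed to the driver, the caller backs the pages out itself instead of
 * going through biodone().  example_device_queue() is a hypothetical
 * driver hook.
 */
#if 0
static int
example_start_read(struct buf *bp)
{
	int error;

	bp->b_flags |= B_READ;
	vfs_busy_pages(bp, 0);			/* 0 == read, don't revalidate */
	error = example_device_queue(bp);
	if (error) {
		vfs_unbusy_pages(bp);		/* in lieu of biodone() */
		bp->b_flags |= B_ERROR | B_INVAL;
		bp->b_error = error;
		brelse(bp);
	}
	return (error);
}
#endif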
2105 
2106 /*
2107  * Set NFS' b_validoff and b_validend fields from the valid bits
2108  * of a page.  If the consumer is not NFS, and the page is not
2109  * valid for the entire range, clear the B_CACHE flag to force
2110  * the consumer to re-read the page.
2111  */
2112 static void
2113 vfs_buf_set_valid(struct buf *bp,
2114 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2115 		  vm_page_t m)
2116 {
2117 	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2118 		vm_offset_t svalid, evalid;
2119 		int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
2120 
2121 		/*
2122 		 * This only bothers with the first valid range in the
2123 		 * page.
2124 		 */
2125 		svalid = off;
2126 		while (validbits && !(validbits & 1)) {
2127 			svalid += DEV_BSIZE;
2128 			validbits >>= 1;
2129 		}
2130 		evalid = svalid;
2131 		while (validbits & 1) {
2132 			evalid += DEV_BSIZE;
2133 			validbits >>= 1;
2134 		}
2135 		evalid = min(evalid, off + size);
2136 		/*
2137 		 * Make sure this range is contiguous with the range
2138 		 * built up from previous pages.  If not, then we will
2139 		 * just use the range from the previous pages.
2140 		 */
2141 		if (svalid == bp->b_validend) {
2142 			bp->b_validoff = min(bp->b_validoff, svalid);
2143 			bp->b_validend = max(bp->b_validend, evalid);
2144 		}
2145 	} else if (!vm_page_is_valid(m,
2146 				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2147 				     size)) {
2148 		bp->b_flags &= ~B_CACHE;
2149 	}
2150 }
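
/*
 * Worked example for the loops above (illustrative, assuming
 * PAGE_SIZE == 4096 and DEV_BSIZE == 512, i.e. 8 valid bits per page):
 * with m->valid == 0x3c (blocks 2-5 of the page valid) and the buffer
 * starting at the page boundary (off == 0), the first loop skips the two
 * clear low bits giving svalid == 2 * DEV_BSIZE, the second loop runs over
 * the four set bits giving evalid == 6 * DEV_BSIZE, and b_validoff /
 * b_validend are only extended when svalid continues the range built up
 * from the previous pages.
 */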
2151 
2152 /*
2153  * Set the valid bits in a page, taking care of the b_validoff,
2154  * b_validend fields which NFS uses to optimise small reads.  Off is
2155  * the offset within the file and pageno is the page index within the buf.
2156  */
2157 static void
2158 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2159 {
2160 	struct vnode *vp = bp->b_vp;
2161 	vm_ooffset_t soff, eoff;
2162 
2163 	soff = off;
2164 	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2165 	if (eoff > bp->b_offset + bp->b_bufsize)
2166 		eoff = bp->b_offset + bp->b_bufsize;
2167 	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2168 		vm_ooffset_t sv, ev;
2169 		vm_page_set_invalid(m,
2170 		    (vm_offset_t) (soff & PAGE_MASK),
2171 		    (vm_offset_t) (eoff - soff));
2172 		sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2173 		ev = (bp->b_offset + bp->b_validend) & ~(DEV_BSIZE - 1);
2174 		soff = qmax(sv, soff);
2175 		eoff = qmin(ev, eoff);
2176 	}
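	/*
	 * Note: sv and ev above round the NFS-valid byte range inward to
	 * DEV_BSIZE boundaries (start up, end down), so only device blocks
	 * that are completely covered by b_validoff..b_validend are marked
	 * valid and clean below.
	 */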
2177 	if (eoff > soff)
2178 		vm_page_set_validclean(m,
2179 	       (vm_offset_t) (soff & PAGE_MASK),
2180 	       (vm_offset_t) (eoff - soff));
2181 }
2182 
2183 /*
2184  * This routine is called before a device strategy routine.
2185  * It is used to tell the VM system that paging I/O is in
2186  * progress, and treat the pages associated with the buffer
2187  * almost as being PG_BUSY.  Also the object's paging_in_progress
2188  * counter is bumped to make sure that the object doesn't become
2189  * inconsistent.
2190  */
2191 void
2192 vfs_busy_pages(struct buf * bp, int clear_modify)
2193 {
2194 	int i, bogus;
2195 
2196 	if (bp->b_flags & B_VMIO) {
2197 		struct vnode *vp = bp->b_vp;
2198 		vm_object_t obj = vp->v_object;
2199 		vm_ooffset_t foff;
2200 
2201 		foff = bp->b_offset;
2202 		KASSERT(bp->b_offset != NOOFFSET,
2203 			("vfs_busy_pages: no buffer offset"));
2204 
2205 		vfs_setdirty(bp);
2206 
2207 retry:
2208 		for (i = 0; i < bp->b_npages; i++) {
2209 			vm_page_t m = bp->b_pages[i];
2210 			if (vm_page_sleep(m, "vbpage", NULL))
2211 				goto retry;
2212 		}
2213 
2214 		bogus = 0;
2215 		for (i = 0; i < bp->b_npages; i++) {
2216 			vm_page_t m = bp->b_pages[i];
2217 
2218 			vm_page_flag_clear(m, PG_ZERO);
2219 			if ((bp->b_flags & B_CLUSTER) == 0) {
2220 				vm_object_pip_add(obj, 1);
2221 				vm_page_io_start(m);
2222 			}
2223 
2224 			vm_page_protect(m, VM_PROT_NONE);
2225 			if (clear_modify)
2226 				vfs_page_set_valid(bp, foff, i, m);
2227 			else if (m->valid == VM_PAGE_BITS_ALL &&
2228 				(bp->b_flags & B_CACHE) == 0) {
2229 				bp->b_pages[i] = bogus_page;
2230 				bogus++;
2231 			}
2232 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2233 		}
2234 		if (bogus)
2235 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2236 	}
2237 }
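
/*
 * Usage note (illustrative): clear_modify is non-zero on write paths
 * (e.g. bwrite()-style callers), where the pages can be marked valid and
 * clean up front via vfs_page_set_valid().  It is zero on read paths,
 * where pages that are already fully valid but not covered by B_CACHE are
 * swapped for bogus_page so the device cannot clobber good data; biodone()
 * puts the real pages back when the I/O finishes.
 */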
2238 
2239 /*
2240  * Tell the VM system that the pages associated with this buffer
2241  * are clean.  This is used for delayed writes where the data is
2242  * going to go to disk eventually without additional VM intervention.
2243  */
2244 void
2245 vfs_clean_pages(struct buf * bp)
2246 {
2247 	int i;
2248 
2249 	if (bp->b_flags & B_VMIO) {
2250 		vm_ooffset_t foff;
2251 		foff = bp->b_offset;
2252 
2253 		KASSERT(bp->b_offset != NOOFFSET,
2254 			("vfs_clean_pages: no buffer offset"));
2255 
2256 		for (i = 0; i < bp->b_npages; i++) {
2257 			vm_page_t m = bp->b_pages[i];
2258 			vfs_page_set_valid(bp, foff, i, m);
2259 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2260 		}
2261 	}
2262 }
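
/*
 * Usage note (illustrative): the expected caller is the delayed-write
 * path (bdwrite()-style), where the dirty data stays in the buffer cache.
 * Marking the pages valid and clean here keeps the pageout daemon from
 * trying to write pages that the buffer cache has already promised to
 * push to disk itself.
 */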
2263 
2264 void
2265 vfs_bio_clrbuf(struct buf *bp) {
2266 	int i, mask = 0;
2267 	caddr_t sa, ea;
2268 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2269 		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
2270 		    (bp->b_offset & PAGE_MASK) == 0) {
2271 			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
2272 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2273 			    ((bp->b_pages[0]->valid & mask) != mask)) {
2274 				bzero(bp->b_data, bp->b_bufsize);
2275 			}
2276 			bp->b_pages[0]->valid |= mask;
2277 			bp->b_resid = 0;
2278 			return;
2279 		}
2280 		ea = sa = bp->b_data;
2281 		for (i = 0; i < bp->b_npages; i++, sa = ea) {
2282 			int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE;
2283 			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
2284 			ea = (caddr_t)ulmin((u_long)ea,
2285 				(u_long)bp->b_data + bp->b_bufsize);
2286 			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
2287 			if ((bp->b_pages[i]->valid & mask) == mask)
2288 				continue;
2289 			if ((bp->b_pages[i]->valid & mask) == 0) {
2290 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2291 					bzero(sa, ea - sa);
2292 				}
2293 			} else {
2294 				for (; sa < ea; sa += DEV_BSIZE, j++) {
2295 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2296 						(bp->b_pages[i]->valid & (1<<j)) == 0)
2297 						bzero(sa, DEV_BSIZE);
2298 				}
2299 			}
2300 			bp->b_pages[i]->valid |= mask;
2301 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2302 		}
2303 		bp->b_resid = 0;
2304 	} else {
2305 		clrbuf(bp);
2306 	}
2307 }
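
/*
 * Example (illustrative sketch, not part of the original source): zeroing
 * a freshly allocated block.  For a B_VMIO buffer vfs_bio_clrbuf() only
 * zeroes the DEV_BSIZE pieces that are not already valid (or PG_ZERO),
 * which is much cheaper than clrbuf() when most of the buffer is already
 * resident.  example_alloc_block() is a hypothetical caller.
 */
#if 0
static void
example_alloc_block(struct vnode *vp, daddr_t lbn, int size)
{
	struct buf *bp;

	bp = getblk(vp, lbn, size, 0, 0);
	vfs_bio_clrbuf(bp);	/* instead of clrbuf(bp) for VMIO buffers */
	bdwrite(bp);
}
#endif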
2308 
2309 /*
2310  * vm_hold_load_pages and vm_hold_free_pages get pages into
2311  * a buffer's address space.  The pages are anonymous and are
2312  * not associated with a file object.
2313  */
2314 void
2315 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2316 {
2317 	vm_offset_t pg;
2318 	vm_page_t p;
2319 	int index;
2320 
2321 	to = round_page(to);
2322 	from = round_page(from);
2323 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2324 
2325 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2326 
2327 tryagain:
2328 
2329 		p = vm_page_alloc(kernel_object,
2330 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2331 		    VM_ALLOC_NORMAL);
2332 		if (!p) {
2333 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2334 			VM_WAIT;
2335 			goto tryagain;
2336 		}
2337 		vm_page_wire(p);
2338 		p->valid = VM_PAGE_BITS_ALL;
2339 		vm_page_flag_clear(p, PG_ZERO);
2340 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2341 		bp->b_pages[index] = p;
2342 		vm_page_wakeup(p);
2343 	}
2344 	bp->b_npages = index;
2345 }
2346 
2347 void
2348 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2349 {
2350 	vm_offset_t pg;
2351 	vm_page_t p;
2352 	int index, newnpages;
2353 
2354 	from = round_page(from);
2355 	to = round_page(to);
2356 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2357 
2358 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2359 		p = bp->b_pages[index];
2360 		if (p && (index < bp->b_npages)) {
2361 #if !defined(MAX_PERF)
2362 			if (p->busy) {
2363 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2364 					bp->b_blkno, bp->b_lblkno);
2365 			}
2366 #endif
2367 			bp->b_pages[index] = NULL;
2368 			pmap_kremove(pg);
2369 			vm_page_busy(p);
2370 			vm_page_unwire(p, 0);
2371 			vm_page_free(p);
2372 		}
2373 	}
2374 	bp->b_npages = newnpages;
2375 }
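
/*
 * Example (illustrative sketch, not part of the original source): the two
 * helpers above are meant for resizing the kva of a non-VMIO buffer, the
 * way allocbuf() grows and shrinks b_data.  Only the page range between
 * the old and the new end of the buffer is touched; the helpers round to
 * page boundaries themselves.  example_resize_kva() is hypothetical.
 */
#if 0
static void
example_resize_kva(struct buf *bp, int oldsize, int newsize)
{
	if (newsize > oldsize)
		vm_hold_load_pages(bp,
		    (vm_offset_t)bp->b_data + oldsize,
		    (vm_offset_t)bp->b_data + newsize);
	else if (newsize < oldsize)
		vm_hold_free_pages(bp,
		    (vm_offset_t)bp->b_data + newsize,
		    (vm_offset_t)bp->b_data + oldsize);
}
#endif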
2376 
2377 
2378 #include "opt_ddb.h"
2379 #ifdef DDB
2380 #include <ddb/ddb.h>
2381 
2382 DB_SHOW_COMMAND(buffer, db_show_buffer)
2383 {
2384 	/* get args */
2385 	struct buf *bp = (struct buf *)addr;
2386 
2387 	if (!have_addr) {
2388 		db_printf("usage: show buffer <addr>\n");
2389 		return;
2390 	}
2391 
2392 	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2393 		  (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2394 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2395 		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2396 		  "b_blkno = %d, b_pblkno = %d\n",
2397 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2398 		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2399 	if (bp->b_npages) {
2400 		int i;
2401 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2402 		for (i = 0; i < bp->b_npages; i++) {
2403 			vm_page_t m;
2404 			m = bp->b_pages[i];
2405 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2406 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2407 			if ((i + 1) < bp->b_npages)
2408 				db_printf(",");
2409 		}
2410 		db_printf("\n");
2411 	}
2412 }
2413 #endif /* DDB */
2414