xref: /freebsd/sys/kern/vfs_bio.c (revision 952d112864d8008aa87278a30a539d888a8493cd)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17  *    is allowed if this notation is included.
18  * 5. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $Id: vfs_bio.c,v 1.112 1997/02/22 09:39:30 peter Exp $
22  */
23 
24 /*
25  * This file implements a buffer I/O scheme that keeps the VM object
26  * (page) cache and the buffer cache coherent.  Pains have been taken
27  * to make sure that the performance degradation usually associated
28  * with such schemes is not realized.
29  *
30  * Author:  John S. Dyson
31  * Significant help during the development and debugging phases
32  * was provided by David Greenman, also of the FreeBSD core team.
33  */
34 
35 #include "opt_bounce.h"
36 
37 #define VMIO
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/sysproto.h>
41 #include <sys/kernel.h>
42 #include <sys/sysctl.h>
43 #include <sys/proc.h>
44 #include <sys/vnode.h>
45 #include <sys/vmmeter.h>
46 #include <vm/vm.h>
47 #include <vm/vm_param.h>
48 #include <vm/vm_prot.h>
49 #include <vm/vm_kern.h>
50 #include <vm/vm_pageout.h>
51 #include <vm/vm_page.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_extern.h>
54 #include <vm/vm_map.h>
55 #include <sys/buf.h>
56 #include <sys/mount.h>
57 #include <sys/malloc.h>
58 #include <sys/resourcevar.h>
60 
61 #include <miscfs/specfs/specdev.h>
62 
63 static void vfs_update __P((void));
64 static struct	proc *updateproc;
65 static struct kproc_desc up_kp = {
66 	"update",
67 	vfs_update,
68 	&updateproc
69 };
70 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
71 
72 struct buf *buf;		/* buffer header pool */
73 struct swqueue bswlist;
74 
75 int count_lock_queue __P((void));
76 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
77 		vm_offset_t to);
78 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
79 		vm_offset_t to);
80 static void vfs_clean_pages(struct buf * bp);
81 static void vfs_setdirty(struct buf *bp);
82 static void vfs_vmio_release(struct buf *bp);
83 
84 int needsbuffer;
85 
86 /*
87  * Internal update daemon, process 3
88  *	The variable vfs_update_wakeup allows for internal syncs.
89  */
90 int vfs_update_wakeup;
91 
92 
93 /*
94  * buffers base kva
95  */
96 
97 /*
98  * bogus page -- for I/O to/from partially complete buffers
99  * This is a temporary solution to the problem, but it is not
100  * really that bad.  It would be better to split the buffer
101  * for input in the case of buffers partially already in memory,
102  * but the code is intricate enough already.
103  */
104 vm_page_t bogus_page;
105 static vm_offset_t bogus_offset;
106 
107 static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
108 	bufmallocspace, maxbufmallocspace;
109 
110 static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
111 static struct bqueues bufqueues[BUFFER_QUEUES];
112 
113 extern int vm_swap_size;
114 
115 #define BUF_MAXUSE 16
116 
117 /*
118  * Initialize buffer headers and related structures.
119  */
120 void
121 bufinit()
122 {
123 	struct buf *bp;
124 	int i;
125 
126 	TAILQ_INIT(&bswlist);
127 	LIST_INIT(&invalhash);
128 
129 	/* first, make a null hash table */
130 	for (i = 0; i < BUFHSZ; i++)
131 		LIST_INIT(&bufhashtbl[i]);
132 
133 	/* next, make a null set of free lists */
134 	for (i = 0; i < BUFFER_QUEUES; i++)
135 		TAILQ_INIT(&bufqueues[i]);
136 
137 	/* finally, initialize each buffer header and stick on empty q */
138 	for (i = 0; i < nbuf; i++) {
139 		bp = &buf[i];
140 		bzero(bp, sizeof *bp);
141 		bp->b_flags = B_INVAL;	/* we're just an empty header */
142 		bp->b_dev = NODEV;
143 		bp->b_rcred = NOCRED;
144 		bp->b_wcred = NOCRED;
145 		bp->b_qindex = QUEUE_EMPTY;
146 		bp->b_vnbufs.le_next = NOLIST;
147 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
148 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
149 	}
150 /*
151  * maxbufspace is currently calculated to support all filesystem blocks
152  * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
153  * cache is still the same as it would be for 8K filesystems.  This
154  * keeps the size of the buffer cache "in check" for big block filesystems.
155  */
156 	maxbufspace = (nbuf + 8) * DFLTBSIZE;
157 /*
158  * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
159  */
160 	maxvmiobufspace = 2 * maxbufspace / 3;
161 /*
162  * Limit the amount of malloc memory since it is wired permanently into
163  * the kernel space.  Even though this is accounted for in the buffer
164  * allocation, we don't want the malloced region to grow uncontrolled.
165  * The malloc scheme improves memory utilization significantly for average
166  * (small) directories.
167  */
168 	maxbufmallocspace = maxbufspace / 20;
169 
170 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
171 	bogus_page = vm_page_alloc(kernel_object,
172 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
173 			VM_ALLOC_NORMAL);
174 
175 }
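
/*
 * Worked example of the sizing above (a sketch only -- the real values
 * depend on this machine's nbuf, and on DFLTBSIZE, which the comment
 * above implies is 8K):  with nbuf = 1024 and DFLTBSIZE = 8192,
 *
 *	maxbufspace       = (1024 + 8) * 8192 = 8454144 bytes (~8.1MB)
 *	maxvmiobufspace   = 2 * 8454144 / 3   = 5636096 bytes (~5.4MB)
 *	maxbufmallocspace = 8454144 / 20      =  422707 bytes (~413KB)
 */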
176 
177 /*
178  * Free the kva allocation for a buffer
179  * Must be called only at splbio or higher,
180  *  as this is the only locking for buffer_map.
181  */
182 static void
183 bfreekva(struct buf * bp)
184 {
185 	if (bp->b_kvasize == 0)
186 		return;
187 
188 	vm_map_delete(buffer_map,
189 		(vm_offset_t) bp->b_kvabase,
190 		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
191 
192 	bp->b_kvasize = 0;
193 
194 }
195 
196 /*
197  * remove the buffer from the appropriate free list
198  */
199 void
200 bremfree(struct buf * bp)
201 {
202 	int s = splbio();
203 
204 	if (bp->b_qindex != QUEUE_NONE) {
205 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
206 		bp->b_qindex = QUEUE_NONE;
207 	} else {
208 		panic("bremfree: removing a buffer when not on a queue");
209 	}
210 	splx(s);
211 }
212 
213 /*
214  * Get a buffer with the specified data.  Look in the cache first.
215  */
216 int
217 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
218     struct buf ** bpp)
219 {
220 	struct buf *bp;
221 
222 	bp = getblk(vp, blkno, size, 0, 0);
223 	*bpp = bp;
224 
225 	/* if not found in cache, do some I/O */
226 	if ((bp->b_flags & B_CACHE) == 0) {
227 		if (curproc != NULL)
228 			curproc->p_stats->p_ru.ru_inblock++;
229 		bp->b_flags |= B_READ;
230 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
231 		if (bp->b_rcred == NOCRED) {
232 			if (cred != NOCRED)
233 				crhold(cred);
234 			bp->b_rcred = cred;
235 		}
236 		vfs_busy_pages(bp, 0);
237 		VOP_STRATEGY(bp);
238 		return (biowait(bp));
239 	}
240 	return (0);
241 }
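
/*
 * Illustrative use of bread() (a hedged sketch, not code from this file):
 * a filesystem read path obtains a cached or freshly read buffer, uses
 * bp->b_data, and releases the buffer.  The names "vp", "lbn", "bsize"
 * and "dst" are hypothetical caller-supplied values.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_data, dst, bsize);
 *	bqrelse(bp);		releases the buffer, keeping its contents cached
 */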
242 
243 /*
244  * Operates like bread, but also starts asynchronous I/O on
245  * read-ahead blocks.
246  */
247 int
248 breadn(struct vnode * vp, daddr_t blkno, int size,
249     daddr_t * rablkno, int *rabsize,
250     int cnt, struct ucred * cred, struct buf ** bpp)
251 {
252 	struct buf *bp, *rabp;
253 	int i;
254 	int rv = 0, readwait = 0;
255 
256 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
257 
258 	/* if not found in cache, do some I/O */
259 	if ((bp->b_flags & B_CACHE) == 0) {
260 		if (curproc != NULL)
261 			curproc->p_stats->p_ru.ru_inblock++;
262 		bp->b_flags |= B_READ;
263 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
264 		if (bp->b_rcred == NOCRED) {
265 			if (cred != NOCRED)
266 				crhold(cred);
267 			bp->b_rcred = cred;
268 		}
269 		vfs_busy_pages(bp, 0);
270 		VOP_STRATEGY(bp);
271 		++readwait;
272 	}
273 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
274 		if (inmem(vp, *rablkno))
275 			continue;
276 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
277 
278 		if ((rabp->b_flags & B_CACHE) == 0) {
279 			if (curproc != NULL)
280 				curproc->p_stats->p_ru.ru_inblock++;
281 			rabp->b_flags |= B_READ | B_ASYNC;
282 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
283 			if (rabp->b_rcred == NOCRED) {
284 				if (cred != NOCRED)
285 					crhold(cred);
286 				rabp->b_rcred = cred;
287 			}
288 			vfs_busy_pages(rabp, 0);
289 			VOP_STRATEGY(rabp);
290 		} else {
291 			brelse(rabp);
292 		}
293 	}
294 
295 	if (readwait) {
296 		rv = biowait(bp);
297 	}
298 	return (rv);
299 }
300 
301 /*
302  * Write, release buffer on completion.  (Done by iodone
303  * if async.)
304  */
305 int
306 bwrite(struct buf * bp)
307 {
308 	int oldflags = bp->b_flags;
309 
310 	if (bp->b_flags & B_INVAL) {
311 		brelse(bp);
312 		return (0);
313 	}
314 	if (!(bp->b_flags & B_BUSY))
315 		panic("bwrite: buffer is not busy???");
316 
317 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
318 	bp->b_flags |= B_WRITEINPROG;
319 
320 	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
321 		reassignbuf(bp, bp->b_vp);
322 	}
323 
324 	bp->b_vp->v_numoutput++;
325 	vfs_busy_pages(bp, 1);
326 	if (curproc != NULL)
327 		curproc->p_stats->p_ru.ru_oublock++;
328 	VOP_STRATEGY(bp);
329 
330 	/*
331 	 * Handle ordered writes here.
332 	 * If the write was originally flagged as ordered,
333 	 * then we check to see if it was converted to async.
334 	 * If it was converted to async, and is done now, then
335 	 * we release the buffer.  Otherwise we clear the
336 	 * ordered flag because it is not needed anymore.
337 	 *
338  	 * Note that biodone has been modified so that it does
339 	 * not release ordered buffers.  This allows us to have
340 	 * a chance to determine whether or not the driver
341 	 * has set the async flag in the strategy routine.  Otherwise
342 	 * if biodone was not modified, then the buffer may have been
343 	 * reused before we have had a chance to check the flag.
344 	 */
345 
346 	if ((oldflags & B_ORDERED) == B_ORDERED) {
347 		int s;
348 		s = splbio();
349 		if (bp->b_flags & B_ASYNC)  {
350 			if ((bp->b_flags & B_DONE)) {
351 				if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
352 					brelse(bp);
353 				else
354 					bqrelse(bp);
355 			}
356 			splx(s);
357 			return (0);
358 		} else {
359 			bp->b_flags &= ~B_ORDERED;
360 		}
361 		splx(s);
362 	}
363 
364 	if ((oldflags & B_ASYNC) == 0) {
365 		int rtval = biowait(bp);
366 
367 		if (oldflags & B_DELWRI) {
368 			reassignbuf(bp, bp->b_vp);
369 		}
370 		brelse(bp);
371 		return (rtval);
372 	}
373 	return (0);
374 }
375 
376 int
377 vn_bwrite(ap)
378 	struct vop_bwrite_args *ap;
379 {
380 	return (bwrite(ap->a_bp));
381 }
382 
383 /*
384  * Delayed write. (Buffer is marked dirty).
385  */
386 void
387 bdwrite(struct buf * bp)
388 {
389 
390 	if ((bp->b_flags & B_BUSY) == 0) {
391 		panic("bdwrite: buffer is not busy");
392 	}
393 	if (bp->b_flags & B_INVAL) {
394 		brelse(bp);
395 		return;
396 	}
397 	if (bp->b_flags & B_TAPE) {
398 		bawrite(bp);
399 		return;
400 	}
401 	bp->b_flags &= ~(B_READ|B_RELBUF);
402 	if ((bp->b_flags & B_DELWRI) == 0) {
403 		bp->b_flags |= B_DONE | B_DELWRI;
404 		reassignbuf(bp, bp->b_vp);
405 	}
406 
407 	/*
408 	 * Doing the bmap here keeps the system from needing to do it later,
409 	 * perhaps when the system is attempting to do a sync.  Since it is
410 	 * likely that the indirect block -- or whatever other data structure
411 	 * the filesystem needs -- is still in memory now, this is a good
412 	 * time to do it.  Note also that if the pageout daemon is requesting
413 	 * a sync, there might not be enough memory to do the bmap then, so
414 	 * doing it now is important.
415 	 */
416 	if( bp->b_lblkno == bp->b_blkno) {
417 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
418 	}
419 
420 	/*
421 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
422 	 */
423 	vfs_setdirty(bp);
424 
425 	/*
426 	 * We need to do this here to satisfy the vnode_pager and the
427 	 * pageout daemon, so that they think that the pages have been
428 	 * "cleaned".  Note that since the pages are in a delayed write
429 	 * buffer -- the VFS layer "will" see that the pages get written
430 	 * out on the next sync, or perhaps the cluster will be completed.
431 	 */
432 	vfs_clean_pages(bp);
433 	bqrelse(bp);
434 	return;
435 }
436 
437 /*
438  * Asynchronous write.
439  * Start output on a buffer, but do not wait for it to complete.
440  * The buffer is released when the output completes.
441  */
442 void
443 bawrite(struct buf * bp)
444 {
445 	bp->b_flags |= B_ASYNC;
446 	(void) VOP_BWRITE(bp);
447 }
448 
449 /*
450  * Ordered write.
451  * Start output on a buffer, but only wait for it to complete if the
452  * output device cannot guarantee ordering in some other way.  Devices
453  * that can perform asynchronous ordered writes will set the B_ASYNC
454  * flag in their strategy routine.
455  * The buffer is released when the output completes.
456  */
457 int
458 bowrite(struct buf * bp)
459 {
460 	bp->b_flags |= B_ORDERED;
461 	return (VOP_BWRITE(bp));
462 }
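
/*
 * The write flavors above differ only in whether the caller waits and
 * when the buffer is given back.  A hedged sketch of a caller choosing
 * among them ("bp" came from an earlier bread()/getblk(); the flags
 * guiding the choice are hypothetical):
 *
 *	... modify bp->b_data ...
 *	if (must_be_on_disk_now)
 *		error = bwrite(bp);	synchronous; waits, then releases
 *	else if (start_io_now)
 *		bawrite(bp);		async; released when the I/O completes
 *	else
 *		bdwrite(bp);		delayed; marked B_DELWRI and requeued
 */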
463 
464 /*
465  * Release a buffer.
466  */
467 void
468 brelse(struct buf * bp)
469 {
470 	int s;
471 
472 	if (bp->b_flags & B_CLUSTER) {
473 		relpbuf(bp);
474 		return;
475 	}
476 	/* anyone need a "free" block? */
477 	s = splbio();
478 
479 	/* anyone need this block? */
480 	if (bp->b_flags & B_WANTED) {
481 		bp->b_flags &= ~(B_WANTED | B_AGE);
482 		wakeup(bp);
483 	}
484 
485 	if (bp->b_flags & B_LOCKED)
486 		bp->b_flags &= ~B_ERROR;
487 
488 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
489 	    (bp->b_bufsize <= 0)) {
490 		bp->b_flags |= B_INVAL;
491 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
492 		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
493 			if (bp->b_bufsize)
494 				allocbuf(bp, 0);
495 			brelvp(bp);
496 		}
497 	}
498 
499 	/*
500 	 * VMIO buffer rundown.  It is not necessary to keep a VMIO buffer
501 	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
502 	 * but the VM object is kept around.  The B_NOCACHE flag is used to
503 	 * invalidate the pages in the VM object.
504 	 */
505 	if (bp->b_flags & B_VMIO) {
506 		vm_ooffset_t foff;
507 		vm_object_t obj;
508 		int i, resid;
509 		vm_page_t m;
510 		struct vnode *vp;
511 		int iototal = bp->b_bufsize;
512 
513 		vp = bp->b_vp;
514 		if (!vp)
515 			panic("brelse: missing vp");
516 
517 		if (bp->b_npages) {
518 			vm_pindex_t poff;
519 			obj = (vm_object_t) vp->v_object;
520 			if (vp->v_type == VBLK)
521 				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
522 			else
523 				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
524 			poff = OFF_TO_IDX(foff);
525 			for (i = 0; i < bp->b_npages; i++) {
526 				m = bp->b_pages[i];
527 				if (m == bogus_page) {
528 					m = vm_page_lookup(obj, poff + i);
529 					if (!m) {
530 						panic("brelse: page missing\n");
531 					}
532 					bp->b_pages[i] = m;
533 					pmap_qenter(trunc_page(bp->b_data),
534 						bp->b_pages, bp->b_npages);
535 				}
536 				resid = IDX_TO_OFF(m->pindex+1) - foff;
537 				if (resid > iototal)
538 					resid = iototal;
539 				if (resid > 0) {
540 					/*
541 					 * Don't invalidate the page if the local machine has already
542 					 * modified it.  This is the lesser of two evils, and should
543 					 * be fixed.
544 					 */
545 					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
546 						vm_page_test_dirty(m);
547 						if (m->dirty == 0) {
548 							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
549 							if (m->valid == 0)
550 								vm_page_protect(m, VM_PROT_NONE);
551 						}
552 					}
553 					if (resid >= PAGE_SIZE) {
554 						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
555 							bp->b_flags |= B_INVAL;
556 						}
557 					} else {
558 						if (!vm_page_is_valid(m,
559 							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
560 							bp->b_flags |= B_INVAL;
561 						}
562 					}
563 				}
564 				foff += resid;
565 				iototal -= resid;
566 			}
567 		}
568 		if (bp->b_flags & (B_INVAL | B_RELBUF))
569 			vfs_vmio_release(bp);
570 	}
571 	if (bp->b_qindex != QUEUE_NONE)
572 		panic("brelse: free buffer onto another queue???");
573 
574 	/* enqueue */
575 	/* buffers with no memory */
576 	if (bp->b_bufsize == 0) {
577 		bp->b_qindex = QUEUE_EMPTY;
578 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
579 		LIST_REMOVE(bp, b_hash);
580 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
581 		bp->b_dev = NODEV;
582 		/*
583 		 * Get rid of the kva allocation *now*
584 		 */
585 		bfreekva(bp);
586 		if (needsbuffer) {
587 			wakeup(&needsbuffer);
588 			needsbuffer=0;
589 		}
590 		/* buffers with junk contents */
591 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
592 		bp->b_qindex = QUEUE_AGE;
593 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
594 		LIST_REMOVE(bp, b_hash);
595 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
596 		bp->b_dev = NODEV;
597 		if (needsbuffer) {
598 			wakeup(&needsbuffer);
599 			needsbuffer=0;
600 		}
601 		/* buffers that are locked */
602 	} else if (bp->b_flags & B_LOCKED) {
603 		bp->b_qindex = QUEUE_LOCKED;
604 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
605 		/* buffers with stale but valid contents */
606 	} else if (bp->b_flags & B_AGE) {
607 		bp->b_qindex = QUEUE_AGE;
608 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
609 		if (needsbuffer) {
610 			wakeup(&needsbuffer);
611 			needsbuffer=0;
612 		}
613 		/* buffers with valid and quite potentially reusable contents */
614 	} else {
615 		bp->b_qindex = QUEUE_LRU;
616 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
617 		if (needsbuffer) {
618 			wakeup(&needsbuffer);
619 			needsbuffer=0;
620 		}
621 	}
622 
623 	/* unlock */
624 	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
625 				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
626 	splx(s);
627 }
628 
629 /*
630  * Release a buffer without invalidating its contents.
631  */
632 void
633 bqrelse(struct buf * bp)
634 {
635 	int s;
636 
637 	s = splbio();
638 
639 
640 	/* anyone need this block? */
641 	if (bp->b_flags & B_WANTED) {
642 		bp->b_flags &= ~(B_WANTED | B_AGE);
643 		wakeup(bp);
644 	}
645 
646 	if (bp->b_qindex != QUEUE_NONE)
647 		panic("bqrelse: free buffer onto another queue???");
648 
649 	if (bp->b_flags & B_LOCKED) {
650 		bp->b_flags &= ~B_ERROR;
651 		bp->b_qindex = QUEUE_LOCKED;
652 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
653 		/* buffers with valid and reusable contents */
654 	} else {
655 		bp->b_qindex = QUEUE_LRU;
656 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
657 		if (needsbuffer) {
658 			wakeup(&needsbuffer);
659 			needsbuffer=0;
660 		}
661 	}
662 
663 	/* unlock */
664 	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
665 		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
666 	splx(s);
667 }
668 
669 static void
670 vfs_vmio_release(bp)
671 	struct buf *bp;
672 {
673 	int i;
674 	vm_page_t m;
675 
676 	for (i = 0; i < bp->b_npages; i++) {
677 		m = bp->b_pages[i];
678 		bp->b_pages[i] = NULL;
679 		vm_page_unwire(m);
680 		/*
681 		 * We don't mess with busy pages; it is
682 		 * the responsibility of the process that
683 		 * busied the pages to deal with them.
684 		 */
685 		if ((m->flags & PG_BUSY) || (m->busy != 0))
686 			continue;
687 
688 		if (m->wire_count == 0) {
689 
690 			if (m->flags & PG_WANTED) {
691 				m->flags &= ~PG_WANTED;
692 				wakeup(m);
693 			}
694 
695 			/*
696 			 * If this is an async free, we cannot place pages
697 			 * onto the cache queue; our policy for such
698 			 * buffers is therefore to modify only the active
699 			 * queue or the free queue.
700 			 */
701 			if ((bp->b_flags & B_ASYNC) == 0) {
702 
703 			/*
704 			 * In the case of sync buffer frees, we can do pretty much
705 			 * anything to any of the memory queues.  Specifically,
706 			 * the cache queue is free to be modified.
707 			 */
708 				if (m->valid) {
709 					if(m->dirty == 0)
710 						vm_page_test_dirty(m);
711 					/*
712 					 * this keeps pressure off of the process memory
713 					 */
714 					if ((vm_swap_size == 0) ||
715 						(cnt.v_free_count < cnt.v_free_min)) {
716 						if ((m->dirty == 0) &&
717 							(m->hold_count == 0))
718 							vm_page_cache(m);
719 						else
720 							vm_page_deactivate(m);
721 					}
722 				} else if (m->hold_count == 0) {
723 					vm_page_protect(m, VM_PROT_NONE);
724 					vm_page_free(m);
725 				}
726 			} else {
727 				/*
728 				 * If async, then at least we clear the
729 				 * act_count.
730 				 */
731 				m->act_count = 0;
732 			}
733 		}
734 	}
735 	bufspace -= bp->b_bufsize;
736 	vmiospace -= bp->b_bufsize;
737 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
738 	bp->b_npages = 0;
739 	bp->b_bufsize = 0;
740 	bp->b_flags &= ~B_VMIO;
741 	if (bp->b_vp)
742 		brelvp(bp);
743 }
744 
745 /*
746  * Check to see if a block is currently memory resident.
747  */
748 struct buf *
749 gbincore(struct vnode * vp, daddr_t blkno)
750 {
751 	struct buf *bp;
752 	struct bufhashhdr *bh;
753 
754 	bh = BUFHASH(vp, blkno);
755 	bp = bh->lh_first;
756 
757 	/* Search hash chain */
758 	while (bp != NULL) {
759 		/* hit */
760 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
761 		    (bp->b_flags & B_INVAL) == 0) {
762 			break;
763 		}
764 		bp = bp->b_hash.le_next;
765 	}
766 	return (bp);
767 }
768 
769 /*
770  * this routine implements clustered async writes for
771  * clearing out B_DELWRI buffers...  This is much better
772  * than the old way of writing only one buffer at a time.
773  */
774 int
775 vfs_bio_awrite(struct buf * bp)
776 {
777 	int i;
778 	daddr_t lblkno = bp->b_lblkno;
779 	struct vnode *vp = bp->b_vp;
780 	int s;
781 	int ncl;
782 	struct buf *bpa;
783 	int nwritten;
784 
785 	s = splbio();
786 	/*
787 	 * right now we support clustered writing only to regular files
788 	 */
789 	if ((vp->v_type == VREG) &&
790 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
791 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
792 		int size;
793 		int maxcl;
794 
795 		size = vp->v_mount->mnt_stat.f_iosize;
796 		maxcl = MAXPHYS / size;
797 
798 		for (i = 1; i < maxcl; i++) {
799 			if ((bpa = gbincore(vp, lblkno + i)) &&
800 			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
801 			    (B_DELWRI | B_CLUSTEROK)) &&
802 			    (bpa->b_bufsize == size)) {
803 				if ((bpa->b_blkno == bpa->b_lblkno) ||
804 				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
805 					break;
806 			} else {
807 				break;
808 			}
809 		}
810 		ncl = i;
811 		/*
812 		 * this is a possible cluster write
813 		 */
814 		if (ncl != 1) {
815 			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
816 			splx(s);
817 			return nwritten;
818 		}
819 	}
820 	bremfree(bp);
821 	splx(s);
822 	/*
823 	 * default (old) behavior, writing out only one block
824 	 */
825 	bp->b_flags |= B_BUSY | B_ASYNC;
826 	nwritten = bp->b_bufsize;
827 	(void) VOP_BWRITE(bp);
828 	return nwritten;
829 }
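
/*
 * Worked example of the contiguity test above, assuming an 8K block size
 * and 512-byte device sectors (so (i * size) >> DEV_BSHIFT is 16 sectors
 * per block):  the buffer at lblkno + 1 only joins the cluster if its
 * b_blkno is exactly bp->b_blkno + 16, the one at lblkno + 2 only if its
 * b_blkno is bp->b_blkno + 32, and so on; the first gap, busy buffer,
 * non-delayed buffer, or size mismatch ends the run handed to
 * cluster_wbuild().
 */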
830 
831 
832 /*
833  * Find a buffer header which is available for use.
834  */
835 static struct buf *
836 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
837 {
838 	struct buf *bp;
839 	int nbyteswritten = 0;
840 	vm_offset_t addr;
841 
842 start:
843 	if (bufspace >= maxbufspace)
844 		goto trytofreespace;
845 
846 	/* can we constitute a new buffer? */
847 	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
848 		if (bp->b_qindex != QUEUE_EMPTY)
849 			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
850 			    bp->b_qindex);
851 		bp->b_flags |= B_BUSY;
852 		bremfree(bp);
853 		goto fillbuf;
854 	}
855 trytofreespace:
856 	/*
857 	 * We keep the file I/O from hogging metadata I/O
858 	 * This is desirable because file data is cached in the
859 	 * VM/Buffer cache even if a buffer is freed.
860 	 */
861 	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
862 		if (bp->b_qindex != QUEUE_AGE)
863 			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
864 			    bp->b_qindex);
865 	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
866 		if (bp->b_qindex != QUEUE_LRU)
867 			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
868 			    bp->b_qindex);
869 	}
870 	if (!bp) {
871 		/* wait for a free buffer of any kind */
872 		needsbuffer = 1;
873 		tsleep(&needsbuffer,
874 			(PRIBIO + 1) | slpflag, "newbuf", slptimeo);
875 		return (0);
876 	}
877 
878 #if defined(DIAGNOSTIC)
879 	if (bp->b_flags & B_BUSY) {
880 		panic("getnewbuf: busy buffer on free list\n");
881 	}
882 #endif
883 
884 	/*
885 	 * We are fairly aggressive about freeing VMIO buffers, but since
886 	 * the buffering is intact without buffer headers, there is not
887 	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
888 	 */
889 	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
890 		if ((bp->b_flags & B_VMIO) == 0 ||
891 			(vmiospace < maxvmiobufspace)) {
892 			--bp->b_usecount;
893 			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
894 			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
895 				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
896 				goto start;
897 			}
898 			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
899 		}
900 	}
901 
902 	/* if this is a delayed write, convert it to an async write */
903 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
904 		nbyteswritten += vfs_bio_awrite(bp);
905 		if (!slpflag && !slptimeo) {
906 			return (0);
907 		}
908 		goto start;
909 	}
910 
911 	if (bp->b_flags & B_WANTED) {
912 		bp->b_flags &= ~B_WANTED;
913 		wakeup(bp);
914 	}
915 	bremfree(bp);
916 	bp->b_flags |= B_BUSY;
917 
918 	if (bp->b_flags & B_VMIO) {
919 		bp->b_flags &= ~B_ASYNC;
920 		vfs_vmio_release(bp);
921 	}
922 
923 	if (bp->b_vp)
924 		brelvp(bp);
925 
926 fillbuf:
927 	/* we are not free, nor do we contain interesting data */
928 	if (bp->b_rcred != NOCRED) {
929 		crfree(bp->b_rcred);
930 		bp->b_rcred = NOCRED;
931 	}
932 	if (bp->b_wcred != NOCRED) {
933 		crfree(bp->b_wcred);
934 		bp->b_wcred = NOCRED;
935 	}
936 
937 	LIST_REMOVE(bp, b_hash);
938 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
939 	if (bp->b_bufsize) {
940 		allocbuf(bp, 0);
941 	}
942 	bp->b_flags = B_BUSY;
943 	bp->b_dev = NODEV;
944 	bp->b_vp = NULL;
945 	bp->b_blkno = bp->b_lblkno = 0;
946 	bp->b_iodone = 0;
947 	bp->b_error = 0;
948 	bp->b_resid = 0;
949 	bp->b_bcount = 0;
950 	bp->b_npages = 0;
951 	bp->b_dirtyoff = bp->b_dirtyend = 0;
952 	bp->b_validoff = bp->b_validend = 0;
953 	bp->b_usecount = 4;
954 
955 	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
956 
957 	/*
958 	 * we assume that buffer_map is not at address 0
959 	 */
960 	addr = 0;
961 	if (maxsize != bp->b_kvasize) {
962 		bfreekva(bp);
963 
964 		/*
965 		 * See if we have buffer kva space
966 		 */
967 		if (vm_map_findspace(buffer_map,
968 			vm_map_min(buffer_map), maxsize, &addr)) {
969 			bp->b_flags |= B_INVAL;
970 			brelse(bp);
971 			goto trytofreespace;
972 		}
973 	}
974 
975 	/*
976 	 * See if we have exceeded our allocated buffer space
977 	 */
978 	if (bufspace >= (maxbufspace + nbyteswritten)) {
979 		bp->b_flags |= B_INVAL;
980 		brelse(bp);
981 		goto trytofreespace;
982 	}
983 
984 	/*
985 	 * create a map entry for the buffer -- in essence
986 	 * reserving the kva space.
987 	 */
988 	if (addr) {
989 		vm_map_insert(buffer_map, NULL, 0,
990 			addr, addr + maxsize,
991 			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
992 
993 		bp->b_kvabase = (caddr_t) addr;
994 		bp->b_kvasize = maxsize;
995 	}
996 	bp->b_data = bp->b_kvabase;
997 
998 	return (bp);
999 }
1000 
1001 /*
1002  * Check to see if a block is currently memory resident.
1003  */
1004 struct buf *
1005 incore(struct vnode * vp, daddr_t blkno)
1006 {
1007 	struct buf *bp;
1008 
1009 	int s = splbio();
1010 	bp = gbincore(vp, blkno);
1011 	splx(s);
1012 	return (bp);
1013 }
1014 
1015 /*
1016  * Returns true if no I/O is needed to access the
1017  * associated VM object.  This is like incore except
1018  * it also hunts around in the VM system for the data.
1019  */
1020 
1021 int
1022 inmem(struct vnode * vp, daddr_t blkno)
1023 {
1024 	vm_object_t obj;
1025 	vm_offset_t toff, tinc;
1026 	vm_page_t m;
1027 	vm_ooffset_t off;
1028 
1029 	if (incore(vp, blkno))
1030 		return 1;
1031 	if (vp->v_mount == NULL)
1032 		return 0;
1033 	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
1034 		return 0;
1035 
1036 	obj = vp->v_object;
1037 	tinc = PAGE_SIZE;
1038 	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1039 		tinc = vp->v_mount->mnt_stat.f_iosize;
1040 	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1041 
1042 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1043 
1044 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1045 		if (!m)
1046 			return 0;
1047 		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1048 			return 0;
1049 	}
1050 	return 1;
1051 }
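
/*
 * Sketch of how incore()/inmem() are typically consulted (compare the
 * read-ahead loop in breadn() above): only blocks that are neither held
 * in a buffer nor already valid in the VM object are worth scheduling.
 * "rabn", "bsize" and "rabp" are hypothetical caller-supplied names, and
 * credential handling is omitted.
 *
 *	if (!inmem(vp, rabn)) {
 *		rabp = getblk(vp, rabn, bsize, 0, 0);
 *		if ((rabp->b_flags & B_CACHE) == 0) {
 *			rabp->b_flags |= B_READ | B_ASYNC;
 *			vfs_busy_pages(rabp, 0);
 *			VOP_STRATEGY(rabp);
 *		} else
 *			brelse(rabp);
 *	}
 */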
1052 
1053 /*
1054  * Set the dirty range for the buffer -- for NFS, if
1055  * the file is mapped and pages have been written to,
1056  * let it know.  We want the entire range of the buffer
1057  * to be marked dirty if any of the pages have been
1058  * written to, for consistency with the b_validoff and
1059  * b_validend set in the NFS write code and used by the
1060  * NFS read code.
1061  */
1062 static void
1063 vfs_setdirty(struct buf *bp) {
1064 	int i;
1065 	vm_object_t object;
1066 	vm_offset_t boffset, offset;
1067 	/*
1068 	 * We qualify the scan for modified pages on whether the
1069 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1070 	 * is not cleared simply by protecting pages off.
1071 	 */
1072 	if ((bp->b_flags & B_VMIO) &&
1073 		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1074 		/*
1075 		 * test the pages to see if they have been modified directly
1076 		 * by users through the VM system.
1077 		 */
1078 		for (i = 0; i < bp->b_npages; i++)
1079 			vm_page_test_dirty(bp->b_pages[i]);
1080 
1081 		/*
1082 		 * scan forwards for the first page modified
1083 		 */
1084 		for (i = 0; i < bp->b_npages; i++) {
1085 			if (bp->b_pages[i]->dirty) {
1086 				break;
1087 			}
1088 		}
1089 		boffset = (i << PAGE_SHIFT);
1090 		if (boffset < bp->b_dirtyoff) {
1091 			bp->b_dirtyoff = boffset;
1092 		}
1093 
1094 		/*
1095 		 * scan backwards for the last page modified
1096 		 */
1097 		for (i = bp->b_npages - 1; i >= 0; --i) {
1098 			if (bp->b_pages[i]->dirty) {
1099 				break;
1100 			}
1101 		}
1102 		boffset = (i + 1);
1103 		offset = boffset + bp->b_pages[0]->pindex;
1104 		if (offset >= object->size)
1105 			boffset = object->size - bp->b_pages[0]->pindex;
1106 		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1107 			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1108 	}
1109 }
1110 
1111 /*
1112  * Get a block given a specified block and offset into a file/device.
1113  */
1114 struct buf *
1115 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1116 {
1117 	struct buf *bp;
1118 	int s;
1119 	struct bufhashhdr *bh;
1120 	int maxsize;
1121 
1122 	if (vp->v_mount) {
1123 		maxsize = vp->v_mount->mnt_stat.f_iosize;
1124 		/*
1125 		 * This happens on mount points.
1126 		 */
1127 		if (maxsize < size)
1128 			maxsize = size;
1129 	} else {
1130 		maxsize = size;
1131 	}
1132 
1133 	if (size > MAXBSIZE)
1134 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1135 
1136 	s = splbio();
1137 loop:
1138 	if ((bp = gbincore(vp, blkno))) {
1139 		if (bp->b_flags & B_BUSY) {
1140 			bp->b_flags |= B_WANTED;
1141 			if (bp->b_usecount < BUF_MAXUSE)
1142 				++bp->b_usecount;
1143 			if (!tsleep(bp,
1144 				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
1145 				goto loop;
1146 
1147 			splx(s);
1148 			return (struct buf *) NULL;
1149 		}
1150 		bp->b_flags |= B_BUSY | B_CACHE;
1151 		bremfree(bp);
1152 
1153 		/*
1154 		 * check for size inconsistencies (note that they shouldn't happen
1155 		 * but do when filesystems don't handle the size changes correctly.)
1156 		 * We are conservative on metadata and don't just extend the buffer
1157 		 * but write and re-constitute it.
1158 		 */
1159 
1160 		if (bp->b_bcount != size) {
1161 			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1162 				allocbuf(bp, size);
1163 			} else {
1164 				bp->b_flags |= B_NOCACHE;
1165 				VOP_BWRITE(bp);
1166 				goto loop;
1167 			}
1168 		}
1169 
1170 		if (bp->b_usecount < BUF_MAXUSE)
1171 			++bp->b_usecount;
1172 		splx(s);
1173 		return (bp);
1174 	} else {
1175 		vm_object_t obj;
1176 
1177 		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) {
1178 			if (slpflag || slptimeo) {
1179 				splx(s);
1180 				return NULL;
1181 			}
1182 			goto loop;
1183 		}
1184 
1185 		/*
1186 		 * This code is used to make sure that a buffer is not
1187 		 * created while the getnewbuf routine is blocked.
1188 		 * Normally the vnode is locked so this isn't a problem.
1189 		 * VBLK type I/O requests, however, don't lock the vnode.
1190 		 */
1191 		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1192 			bp->b_flags |= B_INVAL;
1193 			brelse(bp);
1194 			goto loop;
1195 		}
1196 
1197 		/*
1198 		 * Insert the buffer into the hash, so that it can
1199 		 * be found by incore.
1200 		 */
1201 		bp->b_blkno = bp->b_lblkno = blkno;
1202 		bgetvp(vp, bp);
1203 		LIST_REMOVE(bp, b_hash);
1204 		bh = BUFHASH(vp, blkno);
1205 		LIST_INSERT_HEAD(bh, bp, b_hash);
1206 
1207 		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
1208 			bp->b_flags |= (B_VMIO | B_CACHE);
1209 #if defined(VFS_BIO_DEBUG)
1210 			if (vp->v_type != VREG && vp->v_type != VBLK)
1211 				printf("getblk: vmioing file type %d???\n", vp->v_type);
1212 #endif
1213 		} else {
1214 			bp->b_flags &= ~B_VMIO;
1215 		}
1216 		splx(s);
1217 
1218 		allocbuf(bp, size);
1219 #ifdef	PC98
1220 		/*
1221 		 * 1024byte/sector support
1222 		 */
1223 #define B_XXX2 0x8000000
1224 		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1225 #endif
1226 		return (bp);
1227 	}
1228 }
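
/*
 * Hedged sketch of getblk() use for a block that will be completely
 * overwritten, so no read is needed (contrast with bread(), which is the
 * right call when the existing contents matter).  "vp", "lbn" and
 * "bsize" are hypothetical.
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	vfs_bio_clrbuf(bp);		zero any portions not already valid
 *	... fill bp->b_data ...
 *	bdwrite(bp);			or bwrite(bp) for synchronous metadata
 */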
1229 
1230 /*
1231  * Get an empty, disassociated buffer of given size.
1232  */
1233 struct buf *
1234 geteblk(int size)
1235 {
1236 	struct buf *bp;
1237 	int s;
1238 
1239 	s = splbio();
1240 	while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
1241 	splx(s);
1242 	allocbuf(bp, size);
1243 	bp->b_flags |= B_INVAL;
1244 	return (bp);
1245 }
1246 
1247 
1248 /*
1249  * This code constitutes the buffer memory from either anonymous system
1250  * memory (in the case of non-VMIO operations) or from an associated
1251  * VM object (in the case of VMIO operations).
1252  *
1253  * Note that this code is tricky, and has many complications to resolve
1254  * deadlock or inconsistent data situations.  Tread lightly!!!
1255  *
1256  * Modify the length of a buffer's underlying buffer storage without
1257  * destroying information (unless, of course, the buffer is shrinking).
1258  */
1259 int
1260 allocbuf(struct buf * bp, int size)
1261 {
1262 
1263 	int s;
1264 	int newbsize, mbsize;
1265 	int i;
1266 
1267 	if (!(bp->b_flags & B_BUSY))
1268 		panic("allocbuf: buffer not busy");
1269 
1270 	if (bp->b_kvasize < size)
1271 		panic("allocbuf: buffer too small");
1272 
1273 	if ((bp->b_flags & B_VMIO) == 0) {
1274 		caddr_t origbuf;
1275 		int origbufsize;
1276 		/*
1277 		 * Just get anonymous memory from the kernel
1278 		 */
1279 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1280 #if !defined(NO_B_MALLOC)
1281 		if (bp->b_flags & B_MALLOC)
1282 			newbsize = mbsize;
1283 		else
1284 #endif
1285 			newbsize = round_page(size);
1286 
1287 		if (newbsize < bp->b_bufsize) {
1288 #if !defined(NO_B_MALLOC)
1289 			/*
1290 			 * malloced buffers are not shrunk
1291 			 */
1292 			if (bp->b_flags & B_MALLOC) {
1293 				if (newbsize) {
1294 					bp->b_bcount = size;
1295 				} else {
1296 					free(bp->b_data, M_BIOBUF);
1297 					bufspace -= bp->b_bufsize;
1298 					bufmallocspace -= bp->b_bufsize;
1299 					bp->b_data = bp->b_kvabase;
1300 					bp->b_bufsize = 0;
1301 					bp->b_bcount = 0;
1302 					bp->b_flags &= ~B_MALLOC;
1303 				}
1304 				return 1;
1305 			}
1306 #endif
1307 			vm_hold_free_pages(
1308 			    bp,
1309 			    (vm_offset_t) bp->b_data + newbsize,
1310 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1311 		} else if (newbsize > bp->b_bufsize) {
1312 #if !defined(NO_B_MALLOC)
1313 			/*
1314 			 * We only use malloced memory on the first allocation,
1315 			 * and revert to page-allocated memory when the buffer grows.
1316 			 */
1317 			if ( (bufmallocspace < maxbufmallocspace) &&
1318 				(bp->b_bufsize == 0) &&
1319 				(mbsize <= PAGE_SIZE/2)) {
1320 
1321 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1322 				bp->b_bufsize = mbsize;
1323 				bp->b_bcount = size;
1324 				bp->b_flags |= B_MALLOC;
1325 				bufspace += mbsize;
1326 				bufmallocspace += mbsize;
1327 				return 1;
1328 			}
1329 #endif
1330 			origbuf = NULL;
1331 			origbufsize = 0;
1332 #if !defined(NO_B_MALLOC)
1333 			/*
1334 			 * If the buffer is growing on its other-than-first allocation,
1335 			 * then we revert to the page-allocation scheme.
1336 			 */
1337 			if (bp->b_flags & B_MALLOC) {
1338 				origbuf = bp->b_data;
1339 				origbufsize = bp->b_bufsize;
1340 				bp->b_data = bp->b_kvabase;
1341 				bufspace -= bp->b_bufsize;
1342 				bufmallocspace -= bp->b_bufsize;
1343 				bp->b_bufsize = 0;
1344 				bp->b_flags &= ~B_MALLOC;
1345 				newbsize = round_page(newbsize);
1346 			}
1347 #endif
1348 			vm_hold_load_pages(
1349 			    bp,
1350 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1351 			    (vm_offset_t) bp->b_data + newbsize);
1352 #if !defined(NO_B_MALLOC)
1353 			if (origbuf) {
1354 				bcopy(origbuf, bp->b_data, origbufsize);
1355 				free(origbuf, M_BIOBUF);
1356 			}
1357 #endif
1358 		}
1359 	} else {
1360 		vm_page_t m;
1361 		int desiredpages;
1362 
1363 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1364 		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1365 
1366 #if !defined(NO_B_MALLOC)
1367 		if (bp->b_flags & B_MALLOC)
1368 			panic("allocbuf: VMIO buffer can't be malloced");
1369 #endif
1370 
1371 		if (newbsize < bp->b_bufsize) {
1372 			if (desiredpages < bp->b_npages) {
1373 				for (i = desiredpages; i < bp->b_npages; i++) {
1374 					/*
1375 					 * the page is not freed here -- it
1376 					 * is the responsibility of vnode_pager_setsize
1377 					 */
1378 					m = bp->b_pages[i];
1379 #if defined(DIAGNOSTIC)
1380 					if (m == bogus_page)
1381 						panic("allocbuf: bogus page found");
1382 #endif
1383 					s = splvm();
1384 					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1385 						m->flags |= PG_WANTED;
1386 						tsleep(m, PVM, "biodep", 0);
1387 					}
1388 					splx(s);
1389 
1390 					bp->b_pages[i] = NULL;
1391 					vm_page_unwire(m);
1392 				}
1393 				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1394 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1395 				bp->b_npages = desiredpages;
1396 			}
1397 		} else if (newbsize > bp->b_bufsize) {
1398 			vm_object_t obj;
1399 			vm_offset_t tinc, toff;
1400 			vm_ooffset_t off;
1401 			vm_pindex_t objoff;
1402 			int pageindex, curbpnpages;
1403 			struct vnode *vp;
1404 			int bsize;
1405 
1406 			vp = bp->b_vp;
1407 
1408 			if (vp->v_type == VBLK)
1409 				bsize = DEV_BSIZE;
1410 			else
1411 				bsize = vp->v_mount->mnt_stat.f_iosize;
1412 
1413 			if (bp->b_npages < desiredpages) {
1414 				obj = vp->v_object;
1415 				tinc = PAGE_SIZE;
1416 				if (tinc > bsize)
1417 					tinc = bsize;
1418 				off = (vm_ooffset_t) bp->b_lblkno * bsize;
1419 				curbpnpages = bp->b_npages;
1420 		doretry:
1421 				bp->b_flags |= B_CACHE;
1422 				for (toff = 0; toff < newbsize; toff += tinc) {
1423 					int bytesinpage;
1424 
1425 					pageindex = toff >> PAGE_SHIFT;
1426 					objoff = OFF_TO_IDX(off + toff);
1427 					if (pageindex < curbpnpages) {
1428 
1429 						m = bp->b_pages[pageindex];
1430 #ifdef VFS_BIO_DIAG
1431 						if (m->pindex != objoff)
1432 							panic("allocbuf: page changed offset??!!!?");
1433 #endif
1434 						bytesinpage = tinc;
1435 						if (tinc > (newbsize - toff))
1436 							bytesinpage = newbsize - toff;
1437 						if ((bp->b_flags & B_CACHE) &&
1438 							!vm_page_is_valid(m,
1439 							(vm_offset_t) ((toff + off) & PAGE_MASK),
1440 							bytesinpage)) {
1441 							bp->b_flags &= ~B_CACHE;
1442 						}
1443 						continue;
1444 					}
1445 					m = vm_page_lookup(obj, objoff);
1446 					if (!m) {
1447 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1448 						if (!m) {
1449 							VM_WAIT;
1450 							goto doretry;
1451 						}
1452 						/*
1453 						 * Normally it is unwise to clear PG_BUSY without
1454 						 * PAGE_WAKEUP -- but it is okay here, as there is
1455 						 * no chance for blocking between here and vm_page_alloc
1456 						 */
1457 						m->flags &= ~PG_BUSY;
1458 						vm_page_wire(m);
1459 						bp->b_flags &= ~B_CACHE;
1460 					} else if (m->flags & PG_BUSY) {
1461 						s = splvm();
1462 						if (m->flags & PG_BUSY) {
1463 							m->flags |= PG_WANTED;
1464 							tsleep(m, PVM, "pgtblk", 0);
1465 						}
1466 						splx(s);
1467 						goto doretry;
1468 					} else {
1469 						if ((curproc != pageproc) &&
1470 							((m->queue - m->pc) == PQ_CACHE) &&
1471 						    ((cnt.v_free_count + cnt.v_cache_count) <
1472 								(cnt.v_free_min + cnt.v_cache_min))) {
1473 							pagedaemon_wakeup();
1474 						}
1475 						bytesinpage = tinc;
1476 						if (tinc > (newbsize - toff))
1477 							bytesinpage = newbsize - toff;
1478 						if ((bp->b_flags & B_CACHE) &&
1479 							!vm_page_is_valid(m,
1480 							(vm_offset_t) ((toff + off) & PAGE_MASK),
1481 							bytesinpage)) {
1482 							bp->b_flags &= ~B_CACHE;
1483 						}
1484 						vm_page_wire(m);
1485 					}
1486 					bp->b_pages[pageindex] = m;
1487 					curbpnpages = pageindex + 1;
1488 				}
1489 				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1490 				bp->b_npages = curbpnpages;
1491 				pmap_qenter((vm_offset_t) bp->b_data,
1492 					bp->b_pages, bp->b_npages);
1493 				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1494 			}
1495 		}
1496 	}
1497 	if (bp->b_flags & B_VMIO)
1498 		vmiospace += bp->b_bufsize;
1499 	bufspace += (newbsize - bp->b_bufsize);
1500 	bp->b_bufsize = newbsize;
1501 	bp->b_bcount = size;
1502 	return 1;
1503 }
1504 
1505 /*
1506  * Wait for buffer I/O completion, returning error status.
1507  */
1508 int
1509 biowait(register struct buf * bp)
1510 {
1511 	int s;
1512 
1513 	s = splbio();
1514 	while ((bp->b_flags & B_DONE) == 0)
1515 		tsleep(bp, PRIBIO, "biowait", 0);
1516 	splx(s);
1517 	if (bp->b_flags & B_EINTR) {
1518 		bp->b_flags &= ~B_EINTR;
1519 		return (EINTR);
1520 	}
1521 	if (bp->b_flags & B_ERROR) {
1522 		return (bp->b_error ? bp->b_error : EIO);
1523 	} else {
1524 		return (0);
1525 	}
1526 }
1527 
1528 /*
1529  * Finish I/O on a buffer, calling an optional function.
1530  * This is usually called from interrupt level, so process blocking
1531  * is not *a good idea*.
1532  */
1533 void
1534 biodone(register struct buf * bp)
1535 {
1536 	int s;
1537 
1538 	s = splbio();
1539 	if (!(bp->b_flags & B_BUSY))
1540 		panic("biodone: buffer not busy");
1541 
1542 	if (bp->b_flags & B_DONE) {
1543 		splx(s);
1544 		printf("biodone: buffer already done\n");
1545 		return;
1546 	}
1547 	bp->b_flags |= B_DONE;
1548 
1549 	if ((bp->b_flags & B_READ) == 0) {
1550 		vwakeup(bp);
1551 	}
1552 #ifdef BOUNCE_BUFFERS
1553 	if (bp->b_flags & B_BOUNCE)
1554 		vm_bounce_free(bp);
1555 #endif
1556 
1557 	/* call optional completion function if requested */
1558 	if (bp->b_flags & B_CALL) {
1559 		bp->b_flags &= ~B_CALL;
1560 		(*bp->b_iodone) (bp);
1561 		splx(s);
1562 		return;
1563 	}
1564 	if (bp->b_flags & B_VMIO) {
1565 		int i, resid;
1566 		vm_ooffset_t foff;
1567 		vm_page_t m;
1568 		vm_object_t obj;
1569 		int iosize;
1570 		struct vnode *vp = bp->b_vp;
1571 
1572 		if (vp->v_type == VBLK)
1573 			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1574 		else
1575 			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1576 		obj = vp->v_object;
1577 		if (!obj) {
1578 			panic("biodone: no object");
1579 		}
1580 #if defined(VFS_BIO_DEBUG)
1581 		if (obj->paging_in_progress < bp->b_npages) {
1582 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1583 			    obj->paging_in_progress, bp->b_npages);
1584 		}
1585 #endif
1586 		iosize = bp->b_bufsize;
1587 		for (i = 0; i < bp->b_npages; i++) {
1588 			int bogusflag = 0;
1589 			m = bp->b_pages[i];
1590 			if (m == bogus_page) {
1591 				bogusflag = 1;
1592 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1593 				if (!m) {
1594 #if defined(VFS_BIO_DEBUG)
1595 					printf("biodone: page disappeared\n");
1596 #endif
1597 					--obj->paging_in_progress;
1598 					continue;
1599 				}
1600 				bp->b_pages[i] = m;
1601 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1602 			}
1603 #if defined(VFS_BIO_DEBUG)
1604 			if (OFF_TO_IDX(foff) != m->pindex) {
1605 				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1606 			}
1607 #endif
1608 			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1609 			if (resid > iosize)
1610 				resid = iosize;
1611 			/*
1612 			 * In the write case, the valid and clean bits are
1613 			 * already changed correctly, so we only need to do this
1614 			 * here in the read case.
1615 			 */
1616 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1617 				vm_page_set_validclean(m,
1618 					(vm_offset_t) (foff & PAGE_MASK), resid);
1619 			}
1620 
1621 			/*
1622 			 * When debugging new filesystems or buffer I/O methods, this
1623 			 * is the most common error that pops up.  If you see this, you
1624 			 * have not set the page busy flag correctly!!!
1625 			 */
1626 			if (m->busy == 0) {
1627 				printf("biodone: page busy < 0, "
1628 				    "pindex: %d, foff: 0x(%x,%x), "
1629 				    "resid: %d, index: %d\n",
1630 				    (int) m->pindex, (int)(foff >> 32),
1631 						(int) foff & 0xffffffff, resid, i);
1632 				if (vp->v_type != VBLK)
1633 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1634 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1635 					    (int) bp->b_lblkno,
1636 					    bp->b_flags, bp->b_npages);
1637 				else
1638 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1639 					    (int) bp->b_lblkno,
1640 					    bp->b_flags, bp->b_npages);
1641 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1642 				    m->valid, m->dirty, m->wire_count);
1643 				panic("biodone: page busy < 0\n");
1644 			}
1645 			--m->busy;
1646 			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1647 				m->flags &= ~PG_WANTED;
1648 				wakeup(m);
1649 			}
1650 			--obj->paging_in_progress;
1651 			foff += resid;
1652 			iosize -= resid;
1653 		}
1654 		if (obj && obj->paging_in_progress == 0 &&
1655 		    (obj->flags & OBJ_PIPWNT)) {
1656 			obj->flags &= ~OBJ_PIPWNT;
1657 			wakeup(obj);
1658 		}
1659 	}
1660 	/*
1661 	 * For asynchronous completions, release the buffer now. The brelse
1662 	 * checks for B_WANTED and will do the wakeup there if necessary - so
1663 	 * no need to do a wakeup here in the async case.
1664 	 */
1665 
1666 	if (bp->b_flags & B_ASYNC) {
1667 		if ((bp->b_flags & B_ORDERED) == 0) {
1668 			if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1669 				brelse(bp);
1670 			else
1671 				bqrelse(bp);
1672 		}
1673 	} else {
1674 		bp->b_flags &= ~B_WANTED;
1675 		wakeup(bp);
1676 	}
1677 	splx(s);
1678 }
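
/*
 * Sketch of the B_CALL completion path that biodone() implements above:
 * a caller may register a completion function in b_iodone, in which case
 * biodone() hands the buffer to that function instead of releasing it.
 * "my_done" is a hypothetical handler; it becomes responsible for
 * eventually calling brelse()/bqrelse() on the buffer.
 *
 *	bp->b_iodone = my_done;
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	VOP_STRATEGY(bp);
 *		... later, from the driver's completion path ...
 *	biodone(bp);			invokes my_done(bp) because B_CALL is set
 */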
1679 
1680 int
1681 count_lock_queue()
1682 {
1683 	int count;
1684 	struct buf *bp;
1685 
1686 	count = 0;
1687 	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1688 	    bp != NULL;
1689 	    bp = TAILQ_NEXT(bp, b_freelist))
1690 		count++;
1691 	return (count);
1692 }
1693 
1694 int vfs_update_interval = 30;
1695 
1696 static void
1697 vfs_update()
1698 {
1699 	while (1) {
1700 		tsleep(&vfs_update_wakeup, PUSER, "update",
1701 		    hz * vfs_update_interval);
1702 		vfs_update_wakeup = 0;
1703 		sync(curproc, NULL, NULL);
1704 	}
1705 }
1706 
1707 static int
1708 sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1709 {
1710 	int error = sysctl_handle_int(oidp,
1711 		oidp->oid_arg1, oidp->oid_arg2, req);
1712 	if (!error)
1713 		wakeup(&vfs_update_wakeup);
1714 	return error;
1715 }
1716 
1717 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1718 	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
1719 
1720 
1721 /*
1722  * This routine is called in lieu of iodone in the case of
1723  * incomplete I/O.  This keeps the busy status for pages
1724  * consistent.
1725  */
1726 void
1727 vfs_unbusy_pages(struct buf * bp)
1728 {
1729 	int i;
1730 
1731 	if (bp->b_flags & B_VMIO) {
1732 		struct vnode *vp = bp->b_vp;
1733 		vm_object_t obj = vp->v_object;
1734 		vm_ooffset_t foff;
1735 
1736 		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1737 
1738 		for (i = 0; i < bp->b_npages; i++) {
1739 			vm_page_t m = bp->b_pages[i];
1740 
1741 			if (m == bogus_page) {
1742 				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1743 				if (!m) {
1744 					panic("vfs_unbusy_pages: page missing\n");
1745 				}
1746 				bp->b_pages[i] = m;
1747 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1748 			}
1749 			--obj->paging_in_progress;
1750 			--m->busy;
1751 			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1752 				m->flags &= ~PG_WANTED;
1753 				wakeup(m);
1754 			}
1755 		}
1756 		if (obj->paging_in_progress == 0 &&
1757 		    (obj->flags & OBJ_PIPWNT)) {
1758 			obj->flags &= ~OBJ_PIPWNT;
1759 			wakeup(obj);
1760 		}
1761 	}
1762 }
1763 
1764 /*
1765  * This routine is called before a device strategy routine.
1766  * It is used to tell the VM system that paging I/O is in
1767  * progress, and treat the pages associated with the buffer
1768  * almost as being PG_BUSY.  Also the object paging_in_progress
1769  * count is handled to make sure that the object doesn't become
1770  * inconsistent.
1771  */
1772 void
1773 vfs_busy_pages(struct buf * bp, int clear_modify)
1774 {
1775 	int i;
1776 
1777 	if (bp->b_flags & B_VMIO) {
1778 		vm_object_t obj = bp->b_vp->v_object;
1779 		vm_ooffset_t foff;
1780 		int iocount = bp->b_bufsize;
1781 
1782 		if (bp->b_vp->v_type == VBLK)
1783 			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1784 		else
1785 			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1786 		vfs_setdirty(bp);
1787 		for (i = 0; i < bp->b_npages; i++) {
1788 			vm_page_t m = bp->b_pages[i];
1789 			int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1790 
1791 			if (resid > iocount)
1792 				resid = iocount;
1793 			if ((bp->b_flags & B_CLUSTER) == 0) {
1794 				obj->paging_in_progress++;
1795 				m->busy++;
1796 			}
1797 			vm_page_protect(m, VM_PROT_NONE);
1798 			if (clear_modify) {
1799 				vm_page_set_validclean(m,
1800 					(vm_offset_t) (foff & PAGE_MASK), resid);
1801 			} else if (bp->b_bcount >= PAGE_SIZE) {
1802 				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1803 					bp->b_pages[i] = bogus_page;
1804 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1805 				}
1806 			}
1807 			foff += resid;
1808 			iocount -= resid;
1809 		}
1810 	}
1811 }
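
/*
 * Pairing sketch: vfs_busy_pages() is called just before a VMIO buffer is
 * handed to the driver (see bread() and bwrite() above); the pages are
 * unbusied either by biodone() on completion or by vfs_unbusy_pages() if
 * the I/O is never carried out.
 *
 *	vfs_busy_pages(bp, 0);		before VOP_STRATEGY for a read
 *	VOP_STRATEGY(bp);
 *	error = biowait(bp);		biodone() has unbusied the pages
 */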
1812 
1813 /*
1814  * Tell the VM system that the pages associated with this buffer
1815  * are clean.  This is used for delayed writes where the data is
1816  * going to go to disk eventually without additional VM intervention.
1817  */
1818 void
1819 vfs_clean_pages(struct buf * bp)
1820 {
1821 	int i;
1822 
1823 	if (bp->b_flags & B_VMIO) {
1824 		vm_ooffset_t foff;
1825 		int iocount = bp->b_bufsize;
1826 
1827 		if (bp->b_vp->v_type == VBLK)
1828 			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1829 		else
1830 			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1831 
1832 		for (i = 0; i < bp->b_npages; i++) {
1833 			vm_page_t m = bp->b_pages[i];
1834 			int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1835 
1836 			if (resid > iocount)
1837 				resid = iocount;
1838 			if (resid > 0) {
1839 				vm_page_set_validclean(m,
1840 					((vm_offset_t) foff & PAGE_MASK), resid);
1841 			}
1842 			foff += resid;
1843 			iocount -= resid;
1844 		}
1845 	}
1846 }
1847 
1848 void
1849 vfs_bio_clrbuf(struct buf *bp) {
1850 	int i;
1851 	if( bp->b_flags & B_VMIO) {
1852 		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1853 			int mask;
1854 			mask = 0;
1855 			for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1856 				mask |= (1 << (i/DEV_BSIZE));
1857 			if( bp->b_pages[0]->valid != mask) {
1858 				bzero(bp->b_data, bp->b_bufsize);
1859 			}
1860 			bp->b_pages[0]->valid = mask;
1861 			bp->b_resid = 0;
1862 			return;
1863 		}
1864 		for(i=0;i<bp->b_npages;i++) {
1865 			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1866 				continue;
1867 			if( bp->b_pages[i]->valid == 0) {
1868 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
1869 					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1870 				}
1871 			} else {
1872 				int j;
1873 				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1874 					if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1875 						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1876 				}
1877 			}
1878 			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
1879 		}
1880 		bp->b_resid = 0;
1881 	} else {
1882 		clrbuf(bp);
1883 	}
1884 }
1885 
1886 /*
1887  * vm_hold_load_pages and vm_hold_free_pages get pages into
1888  * a buffer's address space.  The pages are anonymous and are
1889  * not associated with a file object.
1890  */
1891 void
1892 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
1893 {
1894 	vm_offset_t pg;
1895 	vm_page_t p;
1896 	int index;
1897 
1898 	to = round_page(to);
1899 	from = round_page(from);
1900 	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
1901 
1902 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
1903 
1904 tryagain:
1905 
1906 		p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1907 		    VM_ALLOC_NORMAL);
1908 		if (!p) {
1909 			VM_WAIT;
1910 			goto tryagain;
1911 		}
1912 		vm_page_wire(p);
1913 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1914 		bp->b_pages[index] = p;
1915 		PAGE_WAKEUP(p);
1916 	}
1917 	bp->b_npages = to >> PAGE_SHIFT;
1918 }
1919 
1920 void
1921 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
1922 {
1923 	vm_offset_t pg;
1924 	vm_page_t p;
1925 	int index;
1926 
1927 	from = round_page(from);
1928 	to = round_page(to);
1929 	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
1930 
1931 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
1932 		p = bp->b_pages[index];
1933 		if (p && (index < bp->b_npages)) {
1934 			if (p->busy) {
1935 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
1936 					bp->b_blkno, bp->b_lblkno);
1937 			}
1938 			bp->b_pages[index] = NULL;
1939 			pmap_kremove(pg);
1940 			vm_page_unwire(p);
1941 			vm_page_free(p);
1942 		}
1943 	}
1944 	bp->b_npages = from >> PAGE_SHIFT;
1945 }
1946