xref: /freebsd/sys/kern/vfs_bio.c (revision a316b26e50bbed7cf655fbba726ab87d8ab7599d)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17  *    is allowed if this notation is included.
18  * 5. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $Id: vfs_bio.c,v 1.19 1995/01/10 09:20:34 davidg Exp $
22  */
23 
24 /*
25  * This file contains a new buffer I/O scheme implementing a coherent
26  * VM object and buffer cache scheme.  Pains have been taken to make
27  * sure that the performance degradation associated with such schemes
28  * is not realized.
29  *
30  * Author:  John S. Dyson
31  * Significant help during the development and debugging phases
32  * has been provided by David Greenman, also of the FreeBSD core team.
33  */
34 
35 #define VMIO
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/proc.h>
40 #include <sys/vnode.h>
41 #include <vm/vm.h>
42 #include <vm/vm_pageout.h>
43 #include <vm/vm_page.h>
44 #include <vm/vm_object.h>
45 #include <sys/buf.h>
46 #include <sys/mount.h>
47 #include <sys/malloc.h>
48 #include <sys/resourcevar.h>
50 
51 #include <miscfs/specfs/specdev.h>
52 
53 struct buf *buf;		/* buffer header pool */
54 int nbuf;			/* number of buffer headers calculated
55 				 * elsewhere */
56 struct swqueue bswlist;
57 int nvmio, nlru;
58 
59 extern vm_map_t buffer_map, io_map, kernel_map, pager_map;
60 
61 void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
62 void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
63 void vfs_dirty_pages(struct buf * bp);
64 void vfs_busy_pages(struct buf *, int clear_modify);
65 
66 int needsbuffer;
67 
68 /*
69  * Internal update daemon, process 3
70  *	The variable vfs_update_wakeup allows for internal syncs.
71  */
72 int vfs_update_wakeup;
73 
74 
75 /*
76  * buffers base kva
77  */
78 caddr_t buffers_kva;
79 
80 /*
81  * bogus page -- for I/O to/from partially complete buffers
82  */
83 vm_page_t bogus_page;
84 vm_offset_t bogus_offset;
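
/*
 * As used below, vfs_busy_pages() substitutes bogus_page for pages that are
 * already valid when a non-B_CACHE buffer starts I/O (so the transfer cannot
 * clobber good data); biodone() and vfs_unbusy_pages() then look the real
 * pages back up and restore the buffer's mappings.
 */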
85 
86 /*
87  * Initialize buffer headers and related structures.
88  */
89 void
90 bufinit()
91 {
92 	struct buf *bp;
93 	int i;
94 
95 	TAILQ_INIT(&bswlist);
96 	LIST_INIT(&invalhash);
97 
98 	/* first, make a null hash table */
99 	for (i = 0; i < BUFHSZ; i++)
100 		LIST_INIT(&bufhashtbl[i]);
101 
102 	/* next, make a null set of free lists */
103 	for (i = 0; i < BUFFER_QUEUES; i++)
104 		TAILQ_INIT(&bufqueues[i]);
105 
106 	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
107 	/* finally, initialize each buffer header and stick on empty q */
108 	for (i = 0; i < nbuf; i++) {
109 		bp = &buf[i];
110 		bzero(bp, sizeof *bp);
111 		bp->b_flags = B_INVAL;	/* we're just an empty header */
112 		bp->b_dev = NODEV;
113 		bp->b_vp = NULL;
114 		bp->b_rcred = NOCRED;
115 		bp->b_wcred = NOCRED;
116 		bp->b_qindex = QUEUE_EMPTY;
117 		bp->b_vnbufs.le_next = NOLIST;
118 		bp->b_data = buffers_kva + i * MAXBSIZE;
119 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
120 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
121 	}
122 
123 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
124 	bogus_page = vm_page_alloc(kernel_object, bogus_offset - VM_MIN_KERNEL_ADDRESS, 0);
125 
126 }
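
/*
 * Note: each buffer header i owns MAXBSIZE bytes of kva starting at
 * buffers_kva + i * MAXBSIZE; getnewbuf() and allocbuf() recompute b_data
 * from the header's index the same way, i.e.
 *
 *	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
 */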
127 
128 /*
129  * remove the buffer from the appropriate free list
130  */
131 void
132 bremfree(struct buf * bp)
133 {
134 	int s = splbio();
135 
136 	if (bp->b_qindex != QUEUE_NONE) {
137 		if (bp->b_qindex == QUEUE_LRU)
138 			--nlru;
139 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
140 		bp->b_qindex = QUEUE_NONE;
141 	} else {
142 		panic("bremfree: removing a buffer when not on a queue");
143 	}
144 	splx(s);
145 }
146 
147 /*
148  * Get a buffer with the specified data.  Look in the cache first.
149  */
150 int
151 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
152     struct buf ** bpp)
153 {
154 	struct buf *bp;
155 
156 	bp = getblk(vp, blkno, size, 0, 0);
157 	*bpp = bp;
158 
159 	/* if not found in cache, do some I/O */
160 	if ((bp->b_flags & B_CACHE) == 0) {
161 		if (curproc && curproc->p_stats)	/* count block I/O */
162 			curproc->p_stats->p_ru.ru_inblock++;
163 		bp->b_flags |= B_READ;
164 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
165 		if (bp->b_rcred == NOCRED) {
166 			if (cred != NOCRED)
167 				crhold(cred);
168 			bp->b_rcred = cred;
169 		}
170 		vfs_busy_pages(bp, 0);
171 		VOP_STRATEGY(bp);
172 		return (biowait(bp));
173 	} else if (bp->b_lblkno == bp->b_blkno) {
174 		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
175 		    &bp->b_blkno, (int *) 0);
176 	}
177 	return (0);
178 }
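
/*
 * Example of typical use (a sketch, not code from this file; "lbn" and
 * "bsize" stand for the caller's logical block number and block size):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_data ...
 *	brelse(bp);
 */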
179 
180 /*
181  * Operates like bread, but also starts asynchronous I/O on
182  * read-ahead blocks.
183  */
184 int
185 breadn(struct vnode * vp, daddr_t blkno, int size,
186     daddr_t * rablkno, int *rabsize,
187     int cnt, struct ucred * cred, struct buf ** bpp)
188 {
189 	struct buf *bp, *rabp;
190 	int i;
191 	int rv = 0, readwait = 0;
192 
193 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
194 
195 	/* if not found in cache, do some I/O */
196 	if ((bp->b_flags & B_CACHE) == 0) {
197 		if (curproc && curproc->p_stats)	/* count block I/O */
198 			curproc->p_stats->p_ru.ru_inblock++;
199 		bp->b_flags |= B_READ;
200 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
201 		if (bp->b_rcred == NOCRED) {
202 			if (cred != NOCRED)
203 				crhold(cred);
204 			bp->b_rcred = cred;
205 		}
206 		vfs_busy_pages(bp, 0);
207 		VOP_STRATEGY(bp);
208 		++readwait;
209 	} else if (bp->b_lblkno == bp->b_blkno) {
210 		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
211 		    &bp->b_blkno, (int *) 0);
212 	}
213 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
214 		if (inmem(vp, *rablkno))
215 			continue;
216 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
217 
218 		if ((rabp->b_flags & B_CACHE) == 0) {
219 			if (curproc && curproc->p_stats)
220 				curproc->p_stats->p_ru.ru_inblock++;
221 			rabp->b_flags |= B_READ | B_ASYNC;
222 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
223 			if (rabp->b_rcred == NOCRED) {
224 				if (cred != NOCRED)
225 					crhold(cred);
226 				rabp->b_rcred = cred;
227 			}
228 			vfs_busy_pages(rabp, 0);
229 			VOP_STRATEGY(rabp);
230 		} else {
231 			brelse(rabp);
232 		}
233 	}
234 
235 	if (readwait) {
236 		rv = biowait(bp);
237 	}
238 	return (rv);
239 }
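
/*
 * Example (a sketch): read block "lbn" and start an asynchronous read-ahead
 * of the following block; rablkno/rabsize are caller-supplied arrays with one
 * entry per read-ahead block ("lbn" and "bsize" are placeholders):
 *
 *	daddr_t rablk = lbn + 1;
 *	int rasize = bsize;
 *
 *	error = breadn(vp, lbn, bsize, &rablk, &rasize, 1, NOCRED, &bp);
 */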
240 
241 /*
242  * This routine is used by filesystems to get at pages in the PG_CACHE
243  * queue.  It is also used to read pages that are currently being
244  * written out by the file I/O routines.
245  */
246 int
247 vfs_read_bypass(struct vnode * vp, struct uio * uio, int maxread, daddr_t lbn)
248 {
249 	vm_page_t m;
250 	vm_offset_t kv;
251 	int nread;
252 	int error;
253 	struct buf *bp, *bpa;
254 	vm_object_t obj;
255 	int off;
256 	int nrest;
257 	int flags;
258 	int s;
259 
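
	/*
	 * XXX the unconditional return below disables this bypass path;
	 * the remainder of the routine is effectively dead code.
	 */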
260 	return 0;
261 	/*
262 	 * don't use the bypass mechanism for non-vmio vnodes
263 	 */
264 	if ((vp->v_flag & VVMIO) == 0)
265 		return 0;
266 	/*
267 	 * get the VM object (it has the pages)
268 	 */
269 	obj = (vm_object_t) vp->v_vmdata;
270 	if (obj == NULL)
271 		return 0;
272 
273 	/*
274 	 * if there is a buffer that is not busy, it is faster to use it.
275 	 * This way, read-ahead, etc. work better.
276 	 */
277 
278 	s = splbio();
279 	if ((bp = incore(vp, lbn)) &&
280 	    (((bp->b_flags & B_READ) && (bp->b_flags & B_BUSY))
281 		|| (bp->b_flags & B_BUSY) == 0)) {
282 		splx(s);
283 		return 0;
284 	}
285 	splx(s);
286 
287 	/*
288 	 * get a pbuf --> we just use the kva
289 	 */
290 	kv = kmem_alloc_wait(pager_map, PAGE_SIZE);
291 	nread = 0;
292 	error = 0;
293 
294 	while (!error && uio->uio_resid && maxread > 0) {
295 		int po;
296 		int count;
297 		int s;
298 
299 relookup:
300 		/*
301 		 * lookup the page
302 		 */
303 		m = vm_page_lookup(obj, trunc_page(uio->uio_offset));
304 		if (!m)
305 			break;
306 		/*
307 		 * get the offset into the page, and the amount to read in the
308 		 * page
309 		 */
310 		nrest = round_page(uio->uio_offset) - uio->uio_offset;
311 		if (nrest > uio->uio_resid)
312 			nrest = uio->uio_resid;
313 
314 		/*
315 		 * check the valid bits for the page (DEV_BSIZE chunks)
316 		 */
317 		if (!vm_page_is_valid(m, uio->uio_offset, nrest))
318 			break;
319 
320 		/*
321 		 * if the page is busy, wait for it
322 		 */
323 		s = splhigh();
324 		if (!m->valid || (m->flags & PG_BUSY)) {
325 			m->flags |= PG_WANTED;
326 			tsleep((caddr_t) m, PVM, "vnibyp", 0);
327 			splx(s);
328 			goto relookup;
329 		}
330 		/*
331 		 * if the page is on the cache queue, remove it -- cache queue
332 		 * pages should be freeable by vm_page_alloc anytime.
333 		 */
334 		if (m->flags & PG_CACHE) {
335 			if (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_reserved) {
336 				VM_WAIT;
337 				goto relookup;
338 			}
339 			vm_page_unqueue(m);
340 		}
341 		/*
342 		 * add a buffer mapping (essentially wires the page too).
343 		 */
344 		m->bmapped++;
345 		splx(s);
346 
347 		/*
348 		 * enter it into the kva
349 		 */
350 		pmap_qenter(kv, &m, 1);
351 
352 		/*
353 		 * do the copy
354 		 */
355 		po = uio->uio_offset & (PAGE_SIZE - 1);
356 		count = PAGE_SIZE - po;
357 		if (count > maxread)
358 			count = maxread;
359 		if (count > uio->uio_resid)
360 			count = uio->uio_resid;
361 
362 		error = uiomove((caddr_t) kv + po, count, uio);
363 		if (!error) {
364 			nread += count;
365 			maxread -= count;
366 		}
367 		/*
368 		 * remove from kva
369 		 */
370 		pmap_qremove(kv, 1);
371 		PAGE_WAKEUP(m);	/* XXX probably unnecessary */
372 		/*
373 		 * If the page was on the cache queue, then by definition
374 		 * bmapped was 0. Thus the following case will also take care
375 		 * of the page being removed from the cache queue above.
376 		 * Also, it is possible that the page was already entered onto
377 		 * another queue (or was already there), so we don't put it
378 		 * onto the cache queue...
379 		 */
380 		m->bmapped--;
381 		if (m->bmapped == 0 &&
382 		    (m->flags & (PG_CACHE | PG_ACTIVE | PG_INACTIVE)) == 0 &&
383 		    m->wire_count == 0) {
384 			vm_page_test_dirty(m);
385 
386 			/*
387 			 * make sure that the darned page is on a queue
388 			 * somewhere...
389 			 */
390 			if ((m->dirty & m->valid) == 0) {
391 				vm_page_cache(m);
392 			} else if (m->hold_count == 0) {
393 				vm_page_deactivate(m);
394 			} else {
395 				vm_page_activate(m);
396 			}
397 		}
398 	}
399 	/*
400 	 * release our buffer(kva).
401 	 */
402 	kmem_free_wakeup(pager_map, kv, PAGE_SIZE);
403 	return nread;
404 }
405 
406 
407 /*
408  * Write, release buffer on completion.  (Done by iodone
409  * if async.)
410  */
411 int
412 bwrite(struct buf * bp)
413 {
414 	int oldflags = bp->b_flags;
415 
416 	if (bp->b_flags & B_INVAL) {
417 		brelse(bp);
418 		return (0);
419 	}
420 	if (!(bp->b_flags & B_BUSY))
421 		panic("bwrite: buffer is not busy???");
422 
423 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
424 	bp->b_flags |= B_WRITEINPROG;
425 
426 	if (oldflags & B_ASYNC) {
427 		if (oldflags & B_DELWRI) {
428 			reassignbuf(bp, bp->b_vp);
429 		} else if (curproc) {
430 			++curproc->p_stats->p_ru.ru_oublock;
431 		}
432 	}
433 	bp->b_vp->v_numoutput++;
434 	vfs_busy_pages(bp, 1);
435 	VOP_STRATEGY(bp);
436 
437 	if ((oldflags & B_ASYNC) == 0) {
438 		int rtval = biowait(bp);
439 
440 		if (oldflags & B_DELWRI) {
441 			reassignbuf(bp, bp->b_vp);
442 		} else if (curproc) {
443 			++curproc->p_stats->p_ru.ru_oublock;
444 		}
445 		brelse(bp);
446 		return (rtval);
447 	}
448 	return (0);
449 }
450 
451 int
452 vn_bwrite(ap)
453 	struct vop_bwrite_args *ap;
454 {
455 	return (bwrite(ap->a_bp));
456 }
457 
458 /*
459  * Delayed write. (Buffer is marked dirty).
460  */
461 void
462 bdwrite(struct buf * bp)
463 {
464 
465 	if ((bp->b_flags & B_BUSY) == 0) {
466 		panic("bdwrite: buffer is not busy");
467 	}
468 	if (bp->b_flags & B_INVAL) {
469 		brelse(bp);
470 		return;
471 	}
472 	if (bp->b_flags & B_TAPE) {
473 		bawrite(bp);
474 		return;
475 	}
476 	bp->b_flags &= ~B_READ;
477 	vfs_dirty_pages(bp);
478 	if ((bp->b_flags & B_DELWRI) == 0) {
479 		if (curproc)
480 			++curproc->p_stats->p_ru.ru_oublock;
481 		bp->b_flags |= B_DONE | B_DELWRI;
482 		reassignbuf(bp, bp->b_vp);
483 	}
484 	brelse(bp);
485 	return;
486 }
487 
488 /*
489  * Asynchronous write.
490  * Start output on a buffer, but do not wait for it to complete.
491  * The buffer is released when the output completes.
492  */
493 void
494 bawrite(struct buf * bp)
495 {
496 	if (((bp->b_flags & B_DELWRI) == 0) && (bp->b_vp->v_numoutput > 24)) {
497 		int s = splbio();
498 
499 		while (bp->b_vp->v_numoutput > 16) {
500 			bp->b_vp->v_flag |= VBWAIT;
501 			tsleep((caddr_t) &bp->b_vp->v_numoutput, PRIBIO, "bawnmo", 0);
502 		}
503 		splx(s);
504 	}
505 	bp->b_flags |= B_ASYNC;
506 	(void) bwrite(bp);
507 }
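
/*
 * Summary of the write interfaces above (typical use, as a sketch):
 *
 *	bwrite(bp);	synchronous: waits for completion, returns the error
 *	bawrite(bp);	asynchronous: starts the I/O (throttling when the
 *			vnode's v_numoutput gets large), buffer is released
 *			when the write completes
 *	bdwrite(bp);	delayed: marks the buffer B_DELWRI and releases it;
 *			the data is written later, e.g. by vfs_update() or
 *			when getnewbuf() reclaims it via vfs_bio_awrite()
 */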
508 
509 /*
510  * Release a buffer.
511  */
512 void
513 brelse(struct buf * bp)
514 {
515 	int s;
516 
517 	if (bp->b_flags & B_CLUSTER) {
518 		relpbuf(bp);
519 		return;
520 	}
521 	/* anyone need a "free" block? */
522 	s = splbio();
523 
524 	if (needsbuffer) {
525 		needsbuffer = 0;
526 		wakeup((caddr_t) &needsbuffer);
527 	}
528 	/* anyone need this block? */
529 	if (bp->b_flags & B_WANTED) {
530 		bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_AGE);
531 		wakeup((caddr_t) bp);
532 	} else if (bp->b_flags & B_VMIO) {
533 		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
534 		wakeup((caddr_t) bp);
535 	}
536 	if (bp->b_flags & B_LOCKED)
537 		bp->b_flags &= ~B_ERROR;
538 
539 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
540 	    (bp->b_bufsize <= 0)) {
541 		bp->b_flags |= B_INVAL;
542 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
543 		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
544 			brelvp(bp);
545 	}
546 	if (bp->b_flags & B_VMIO) {
547 		vm_offset_t foff;
548 		vm_object_t obj;
549 		int i, resid;
550 		vm_page_t m;
551 		int iototal = bp->b_bufsize;
552 
553 		foff = 0;
554 		obj = 0;
555 		if (bp->b_npages) {
556 			if (bp->b_vp && bp->b_vp->v_mount) {
557 				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
558 			} else {
559 				/*
560 				 * vnode pointer has been ripped away --
561 				 * probably file gone...
562 				 */
563 				foff = bp->b_pages[0]->offset;
564 			}
565 		}
566 		for (i = 0; i < bp->b_npages; i++) {
567 			m = bp->b_pages[i];
568 			if (m == bogus_page) {
569 				panic("brelse: bogus page found");
570 			}
571 			resid = (m->offset + PAGE_SIZE) - foff;
572 			if (resid > iototal)
573 				resid = iototal;
574 			if (resid > 0) {
575 				if (bp->b_flags & (B_ERROR | B_NOCACHE)) {
576 					vm_page_set_invalid(m, foff, resid);
577 				} else if ((bp->b_flags & B_DELWRI) == 0) {
578 					vm_page_set_clean(m, foff, resid);
579 					vm_page_set_valid(m, foff, resid);
580 				}
581 			} else {
582 				vm_page_test_dirty(m);
583 			}
584 			if (bp->b_flags & B_INVAL) {
585 				if (m->bmapped == 0) {
586 					panic("brelse: bmapped is zero for page\n");
587 				}
588 				--m->bmapped;
589 				if (m->bmapped == 0) {
590 					PAGE_WAKEUP(m);
591 					if ((m->dirty & m->valid) == 0)
592 						vm_page_cache(m);
593 				}
594 			}
595 			foff += resid;
596 			iototal -= resid;
597 		}
598 
599 		if (bp->b_flags & B_INVAL) {
600 			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
601 			bp->b_npages = 0;
602 			bp->b_bufsize = 0;
603 			bp->b_flags &= ~B_VMIO;
604 			if (bp->b_vp)
605 				brelvp(bp);
606 			--nvmio;
607 		}
608 	}
609 	if (bp->b_qindex != QUEUE_NONE)
610 		panic("brelse: free buffer onto another queue???");
611 
612 	/* enqueue */
613 	/* buffers with no memory */
614 	if (bp->b_bufsize == 0) {
615 		bp->b_qindex = QUEUE_EMPTY;
616 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
617 		LIST_REMOVE(bp, b_hash);
618 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
619 		bp->b_dev = NODEV;
620 		/* buffers with junk contents */
621 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
622 		bp->b_qindex = QUEUE_AGE;
623 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
624 		LIST_REMOVE(bp, b_hash);
625 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
626 		bp->b_dev = NODEV;
627 		/* buffers that are locked */
628 	} else if (bp->b_flags & B_LOCKED) {
629 		bp->b_qindex = QUEUE_LOCKED;
630 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
631 		/* buffers with stale but valid contents */
632 	} else if (bp->b_flags & B_AGE) {
633 		bp->b_qindex = QUEUE_AGE;
634 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
635 		/* buffers with valid and quite potentially reusable contents */
636 	} else {
637 		if (bp->b_flags & B_VMIO)
638 			bp->b_qindex = QUEUE_VMIO;
639 		else {
640 			bp->b_qindex = QUEUE_LRU;
641 			++nlru;
642 		}
643 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
644 	}
645 
646 	/* unlock */
647 	bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
648 	splx(s);
649 }
650 
651 /*
652  * this routine implements clustered async writes for
653  * clearing out B_DELWRI buffers...
654  */
655 void
656 vfs_bio_awrite(struct buf * bp)
657 {
658 	int i;
659 	daddr_t lblkno = bp->b_lblkno;
660 	struct vnode *vp = bp->b_vp;
661 	int s;
662 	int ncl;
663 	struct buf *bpa;
664 
665 	s = splbio();
666 	if( vp->v_mount && (vp->v_flag & VVMIO) &&
667 		(bp->b_flags & (B_CLUSTEROK|B_INVAL)) == B_CLUSTEROK) {
668 		int size  = vp->v_mount->mnt_stat.f_iosize;
669 		for (i = 1; i < MAXPHYS / size; i++) {
670 			if ((bpa = incore(vp, lblkno + i)) &&
671 			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) &&
672 			    (bpa->b_bufsize == size)) {
673 				if ((bpa->b_blkno == bpa->b_lblkno) ||
674 				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
675 					break;
676 			} else {
677 				break;
678 			}
679 		}
680 		ncl = i;
681 		/*
682 		 * this is a possible cluster write
683 		 */
684 		if (ncl != 1) {
685 			cluster_wbuild(vp, NULL, size, lblkno, ncl, -1);
686 			splx(s);
687 			return;
688 		}
689 	}
690 	/*
691 	 * default (old) behavior, writing out only one block
692 	 */
693 	bremfree(bp);
694 	bp->b_flags |= B_BUSY | B_ASYNC;
695 	bwrite(bp);
696 	splx(s);
697 }
698 
699 int freebufspace;
700 int allocbufspace;
701 
702 /*
703  * Find a buffer header which is available for use.
704  */
705 struct buf *
706 getnewbuf(int slpflag, int slptimeo, int doingvmio)
707 {
708 	struct buf *bp;
709 	int s;
710 	int firstbp = 1;
711 
712 	s = splbio();
713 start:
714 	/* can we constitute a new buffer? */
715 	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
716 		if (bp->b_qindex != QUEUE_EMPTY)
717 			panic("getnewbuf: inconsistent EMPTY queue");
718 		bremfree(bp);
719 		goto fillbuf;
720 	}
721 	/*
722 	 * we keep the file I/O from hogging metadata I/O
723 	 */
724 	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
725 		if (bp->b_qindex != QUEUE_AGE)
726 			panic("getnewbuf: inconsistent AGE queue");
727 	} else if ((nvmio > (2 * nbuf / 3))
728 	    && (bp = bufqueues[QUEUE_VMIO].tqh_first)) {
729 		if (bp->b_qindex != QUEUE_VMIO)
730 			panic("getnewbuf: inconsistent VMIO queue");
731 	} else if ((!doingvmio || (nlru > (2 * nbuf / 3))) &&
732 	    (bp = bufqueues[QUEUE_LRU].tqh_first)) {
733 		if (bp->b_qindex != QUEUE_LRU)
734 			panic("getnewbuf: inconsistent LRU queue");
735 	}
736 	if (!bp) {
737 		if (doingvmio) {
738 			if ((bp = bufqueues[QUEUE_VMIO].tqh_first)) {
739 				if (bp->b_qindex != QUEUE_VMIO)
740 					panic("getnewbuf: inconsistent VMIO queue");
741 			} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
742 				if (bp->b_qindex != QUEUE_LRU)
743 					panic("getnewbuf: inconsistent LRU queue");
744 			}
745 		} else {
746 			if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
747 				if (bp->b_qindex != QUEUE_LRU)
748 					panic("getnewbuf: inconsistent LRU queue");
749 			} else if ((bp = bufqueues[QUEUE_VMIO].tqh_first)) {
750 				if (bp->b_qindex != QUEUE_VMIO)
751 					panic("getnewbuf: inconsistent VMIO queue");
752 			}
753 		}
754 	}
755 	if (!bp) {
756 		/* wait for a free buffer of any kind */
757 		needsbuffer = 1;
758 		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
759 		splx(s);
760 		return (0);
761 	}
762 	/* if we are a delayed write, convert to an async write */
763 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
764 		vfs_bio_awrite(bp);
765 		if (!slpflag && !slptimeo) {
766 			splx(s);
767 			return (0);
768 		}
769 		goto start;
770 	}
771 	bremfree(bp);
772 
773 	if (bp->b_flags & B_VMIO) {
774 		bp->b_flags |= B_INVAL | B_BUSY;
775 		brelse(bp);
776 		bremfree(bp);
777 	}
778 	if (bp->b_vp)
779 		brelvp(bp);
780 
781 	/* we are not free, nor do we contain interesting data */
782 	if (bp->b_rcred != NOCRED)
783 		crfree(bp->b_rcred);
784 	if (bp->b_wcred != NOCRED)
785 		crfree(bp->b_wcred);
786 fillbuf:
787 	bp->b_flags = B_BUSY;
788 	LIST_REMOVE(bp, b_hash);
789 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
790 	splx(s);
791 	if (bp->b_bufsize) {
792 		allocbuf(bp, 0, 0);
793 	}
794 	bp->b_dev = NODEV;
795 	bp->b_vp = NULL;
796 	bp->b_blkno = bp->b_lblkno = 0;
797 	bp->b_iodone = 0;
798 	bp->b_error = 0;
799 	bp->b_resid = 0;
800 	bp->b_bcount = 0;
801 	bp->b_npages = 0;
802 	bp->b_wcred = bp->b_rcred = NOCRED;
803 	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
804 	bp->b_dirtyoff = bp->b_dirtyend = 0;
805 	bp->b_validoff = bp->b_validend = 0;
806 	return (bp);
807 }
808 
809 /*
810  * Check to see if a block is currently memory resident.
811  */
812 struct buf *
813 incore(struct vnode * vp, daddr_t blkno)
814 {
815 	struct buf *bp;
816 	struct bufhashhdr *bh;
817 
818 	int s = splbio();
819 
820 	bh = BUFHASH(vp, blkno);
821 	bp = bh->lh_first;
822 
823 	/* Search hash chain */
824 	while (bp) {
825 		/* hit */
826 		if (bp->b_lblkno == blkno && bp->b_vp == vp
827 		    && (bp->b_flags & B_INVAL) == 0) {
828 			splx(s);
829 			return (bp);
830 		}
831 		bp = bp->b_hash.le_next;
832 	}
833 	splx(s);
834 
835 	return (0);
836 }
837 
838 /*
839  * returns true if no I/O is needed to access the
840  * associated VM object.
841  */
842 
843 int
844 inmem(struct vnode * vp, daddr_t blkno)
845 {
846 	vm_object_t obj;
847 	vm_offset_t off, toff, tinc;
848 	vm_page_t m;
849 
850 	if (incore(vp, blkno))
851 		return 1;
852 	if (vp->v_mount == 0)
853 		return 0;
854 	if (vp->v_vmdata == 0)
855 		return 0;
856 
857 	obj = (vm_object_t) vp->v_vmdata;
858 	tinc = PAGE_SIZE;
859 	if (tinc > vp->v_mount->mnt_stat.f_iosize)
860 		tinc = vp->v_mount->mnt_stat.f_iosize;
861 	off = blkno * vp->v_mount->mnt_stat.f_iosize;
862 
863 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
864 		int mask;
865 
866 		m = vm_page_lookup(obj, trunc_page(toff + off));
867 		if (!m)
868 			return 0;
869 		if (vm_page_is_valid(m, toff + off, tinc) == 0)
870 			return 0;
871 	}
872 	return 1;
873 }
874 
875 /*
876  * Get a block given a specified block and offset into a file/device.
877  */
878 struct buf *
879 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
880 {
881 	struct buf *bp;
882 	int s;
883 	struct bufhashhdr *bh;
884 	vm_offset_t off;
885 	int nleft;
886 
887 	s = splbio();
888 loop:
889 	if ((cnt.v_free_count + cnt.v_cache_count) <
890 	    cnt.v_free_reserved + MAXBSIZE / PAGE_SIZE)
891 		wakeup((caddr_t) &vm_pages_needed);
892 	if ((bp = incore(vp, blkno))) {
893 		if (bp->b_flags & B_BUSY) {
894 			bp->b_flags |= B_WANTED;
895 			if (curproc == pageproc) {
896 				bp->b_flags |= B_PDWANTED;
897 				wakeup((caddr_t) &cnt.v_free_count);
898 			}
899 			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
900 				goto loop;
901 			splx(s);
902 			return (struct buf *) NULL;
903 		}
904 		bp->b_flags |= B_BUSY | B_CACHE;
905 		bremfree(bp);
906 		/*
907 		 * check for size inconsistencies
908 		 */
909 		if (bp->b_bcount != size) {
910 #if defined(VFS_BIO_DEBUG)
911 			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
912 #endif
913 			bp->b_flags |= B_INVAL;
914 			bwrite(bp);
915 			goto loop;
916 		}
917 		splx(s);
918 		return (bp);
919 	} else {
920 		vm_object_t obj;
921 		int doingvmio;
922 
923 		if ((obj = (vm_object_t) vp->v_vmdata) &&
924 		    (vp->v_flag & VVMIO) /* && (blkno >= 0) */ ) {
925 			doingvmio = 1;
926 		} else {
927 			doingvmio = 0;
928 		}
929 		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
930 			if (slpflag || slptimeo) {
				splx(s);
931 				return NULL;
			}
932 			goto loop;
933 		}
934 		if (incore(vp, blkno)) {
935 			bp->b_flags |= B_INVAL;
936 			brelse(bp);
937 			goto loop;
938 		}
939 		bp->b_blkno = bp->b_lblkno = blkno;
940 		bgetvp(vp, bp);
941 		LIST_REMOVE(bp, b_hash);
942 		bh = BUFHASH(vp, blkno);
943 		LIST_INSERT_HEAD(bh, bp, b_hash);
944 		if (doingvmio) {
945 			bp->b_flags |= (B_VMIO | B_CACHE);
946 #if defined(VFS_BIO_DEBUG)
947 			if (vp->v_type != VREG)
948 				printf("getblk: vmioing file type %d???\n", vp->v_type);
949 #endif
950 			++nvmio;
951 		} else {
952 			if (bp->b_flags & B_VMIO)
953 				--nvmio;
954 			bp->b_flags &= ~B_VMIO;
955 		}
956 		splx(s);
957 		if (!allocbuf(bp, size, 1)) {
958 			s = splbio();
959 			goto loop;
960 		}
961 		return (bp);
962 	}
963 }
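
/*
 * Example (a sketch): a caller that will overwrite an entire block can skip
 * bread() and use getblk() directly; B_CACHE only indicates that the old
 * contents are valid ("lbn" and "bsize" are placeholders):
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	... fill all of bp->b_data ...
 *	bwrite(bp);		(or bdwrite(bp)/bawrite(bp))
 */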
964 
965 /*
966  * Get an empty, disassociated buffer of given size.
967  */
968 struct buf *
969 geteblk(int size)
970 {
971 	struct buf *bp;
972 
973 	while ((bp = getnewbuf(0, 0, 0)) == 0);
974 	allocbuf(bp, size, 0);
975 	bp->b_flags |= B_INVAL;
976 	return (bp);
977 }
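
/*
 * Example (a sketch): geteblk() buffers are anonymous scratch space; since
 * B_INVAL is set, a plain brelse() sends them back to the age queue for
 * quick reuse:
 *
 *	bp = geteblk(bsize);
 *	... use bp->b_data ...
 *	brelse(bp);
 */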
978 
979 /*
980  * Modify the length of a buffer's underlying buffer storage without
981  * destroying information (unless, of course, the buffer is shrinking).
982  */
983 int
984 allocbuf(struct buf * bp, int size, int vmio)
985 {
986 
987 	int s;
988 	int newbsize;
989 	int i;
990 
991 	if ((bp->b_flags & B_VMIO) == 0) {
992 		newbsize = round_page(size);
993 		if (newbsize == bp->b_bufsize) {
994 			bp->b_bcount = size;
995 			return 1;
996 		} else if (newbsize < bp->b_bufsize) {
997 			if (bp->b_flags & B_MALLOC) {
998 				bp->b_bcount = size;
999 				return 1;
1000 			}
1001 			vm_hold_free_pages(
1002 			    bp,
1003 			    (vm_offset_t) bp->b_data + newbsize,
1004 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1005 		} else if (newbsize > bp->b_bufsize) {
1006 			if (bp->b_flags & B_MALLOC) {
1007 				vm_offset_t bufaddr;
1008 
1009 				bufaddr = (vm_offset_t) bp->b_data;
1010 				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1011 				vm_hold_load_pages(
1012 				    bp,
1013 				    (vm_offset_t) bp->b_data,
1014 				    (vm_offset_t) bp->b_data + newbsize);
1015 				bcopy((caddr_t) bufaddr, bp->b_data, bp->b_bcount);
1016 				free((caddr_t) bufaddr, M_TEMP);
1017 			} else if ((newbsize <= PAGE_SIZE / 2) && (bp->b_bufsize == 0)) {
1018 				bp->b_flags |= B_MALLOC;
1019 				bp->b_data = malloc(newbsize, M_TEMP, M_WAITOK);
1020 				bp->b_npages = 0;
1021 			} else {
1022 				vm_hold_load_pages(
1023 				    bp,
1024 				    (vm_offset_t) bp->b_data + bp->b_bufsize,
1025 				    (vm_offset_t) bp->b_data + newbsize);
1026 			}
1027 		}
1028 		/*
1029 		 * adjust buffer cache's idea of memory allocated to buffer
1030 		 * contents
1031 		 */
1032 		freebufspace -= newbsize - bp->b_bufsize;
1033 		allocbufspace += newbsize - bp->b_bufsize;
1034 	} else {
1035 		vm_page_t m;
1036 		int desiredpages;
1037 
1038 		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
1039 		desiredpages = round_page(newbsize) / PAGE_SIZE;
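		/*
		 * For example, with DEV_BSIZE 512 and 4096-byte pages (typical
		 * values, not definitions made here): size 3000 gives newbsize
		 * 3072 and desiredpages 1; size 6000 gives newbsize 6144 and
		 * desiredpages 2.
		 */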
1040 
1041 		if (newbsize == bp->b_bufsize) {
1042 			bp->b_bcount = size;
1043 			return 1;
1044 		} else if (newbsize < bp->b_bufsize) {
1045 			if (desiredpages < bp->b_npages) {
1046 				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1047 				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1048 				for (i = desiredpages; i < bp->b_npages; i++) {
1049 					m = bp->b_pages[i];
1050 					s = splhigh();
1051 					if ((m->flags & PG_BUSY) || (m->busy != 0)) {
1052 						m->flags |= PG_WANTED;
1053 						tsleep(m, PVM, "biodep", 0);
1054 					}
1055 					splx(s);
1056 
1057 					if (m->bmapped == 0) {
1058 						printf("allocbuf: bmapped is zero for page %d\n", i);
1059 						panic("allocbuf: error");
1060 					}
1061 					--m->bmapped;
1062 					if (m->bmapped == 0) {
1063 						PAGE_WAKEUP(m);
1064 						pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
1065 						vm_page_free(m);
1066 					}
1067 					bp->b_pages[i] = NULL;
1068 				}
1069 				bp->b_npages = desiredpages;
1070 			}
1071 		} else {
1072 			vm_object_t obj;
1073 			vm_offset_t tinc, off, toff, objoff;
1074 			int pageindex, curbpnpages;
1075 			struct vnode *vp;
1076 			int bsize;
1077 
1078 			vp = bp->b_vp;
1079 			bsize = vp->v_mount->mnt_stat.f_iosize;
1080 
1081 			if (bp->b_npages < desiredpages) {
1082 				obj = (vm_object_t) vp->v_vmdata;
1083 				tinc = PAGE_SIZE;
1084 				if (tinc > bsize)
1085 					tinc = bsize;
1086 				off = bp->b_lblkno * bsize;
1087 				curbpnpages = bp->b_npages;
1088 		doretry:
1089 				for (toff = 0; toff < newbsize; toff += tinc) {
1090 					int mask;
1091 					int bytesinpage;
1092 
1093 					pageindex = toff / PAGE_SIZE;
1094 					objoff = trunc_page(toff + off);
1095 					if (pageindex < curbpnpages) {
1096 						int pb;
1097 
1098 						m = bp->b_pages[pageindex];
1099 						if (m->offset != objoff)
1100 							panic("allocbuf: page changed offset??!!!?");
1101 						bytesinpage = tinc;
1102 						if (tinc > (newbsize - toff))
1103 							bytesinpage = newbsize - toff;
1104 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1105 							bp->b_flags &= ~B_CACHE;
1106 						}
1107 						if ((m->flags & PG_ACTIVE) == 0)
1108 							vm_page_activate(m);
1109 						continue;
1110 					}
1111 					m = vm_page_lookup(obj, objoff);
1112 					if (!m) {
1113 						m = vm_page_alloc(obj, objoff, 0);
1114 						if (!m) {
1115 							int j;
1116 
1117 							for (j = bp->b_npages; j < pageindex; j++) {
1118 								vm_page_t mt = bp->b_pages[j];
1119 
1120 								PAGE_WAKEUP(mt);
1121 								if (!mt->valid) {
1122 									vm_page_free(mt);
1123 								}
1124 							}
1125 							VM_WAIT;
1126 							if (vmio && (bp->b_flags & B_PDWANTED)) {
1127 								--nvmio;
1128 								bp->b_flags &= ~B_VMIO;
1129 								bp->b_flags |= B_INVAL;
1130 								brelse(bp);
1131 								return 0;
1132 							}
1133 							curbpnpages = bp->b_npages;
1134 							goto doretry;
1135 						}
1136 						m->valid = 0;
1137 						vm_page_activate(m);
1138 					} else if ((m->valid == 0) || (m->flags & PG_BUSY)) {
1139 						int j;
1140 						int bufferdestroyed = 0;
1141 
1142 						for (j = bp->b_npages; j < pageindex; j++) {
1143 							vm_page_t mt = bp->b_pages[j];
1144 
1145 							PAGE_WAKEUP(mt);
1146 							if (mt->valid == 0) {
1147 								vm_page_free(mt);
1148 							}
1149 						}
1150 						if (vmio && (bp->b_flags & B_PDWANTED)) {
1151 							--nvmio;
1152 							bp->b_flags &= ~B_VMIO;
1153 							bp->b_flags |= B_INVAL;
1154 							brelse(bp);
1155 							VM_WAIT;
1156 							bufferdestroyed = 1;
1157 						}
1158 						s = splbio();
1159 						if (m) {
1160 							m->flags |= PG_WANTED;
1161 							tsleep(m, PRIBIO, "pgtblk", 0);
1162 						}
1163 						splx(s);
1164 						if (bufferdestroyed)
1165 							return 0;
1166 						curbpnpages = bp->b_npages;
1167 						goto doretry;
1168 					} else {
1169 						int pb;
1170 
1171 						if ((m->flags & PG_CACHE) &&
1172 						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_reserved) {
1173 							int j;
1174 
1175 							for (j = bp->b_npages; j < pageindex; j++) {
1176 								vm_page_t mt = bp->b_pages[j];
1177 
1178 								PAGE_WAKEUP(mt);
1179 								if (mt->valid == 0) {
1180 									vm_page_free(mt);
1181 								}
1182 							}
1183 							VM_WAIT;
1184 							if (vmio && (bp->b_flags & B_PDWANTED)) {
1185 								--nvmio;
1186 								bp->b_flags &= ~B_VMIO;
1187 								bp->b_flags |= B_INVAL;
1188 								brelse(bp);
1189 								return 0;
1190 							}
1191 							curbpnpages = bp->b_npages;
1192 							goto doretry;
1193 						}
1194 						bytesinpage = tinc;
1195 						if (tinc > (newbsize - toff))
1196 							bytesinpage = newbsize - toff;
1197 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1198 							bp->b_flags &= ~B_CACHE;
1199 						}
1200 						if ((m->flags & PG_ACTIVE) == 0)
1201 							vm_page_activate(m);
1202 						m->flags |= PG_BUSY;
1203 					}
1204 					bp->b_pages[pageindex] = m;
1205 					curbpnpages = pageindex + 1;
1206 				}
1207 				if (bsize >= PAGE_SIZE) {
1208 					for (i = bp->b_npages; i < curbpnpages; i++) {
1209 						m = bp->b_pages[i];
1210 						if (m->valid == 0) {
1211 							bp->b_flags &= ~B_CACHE;
1212 						}
1213 						m->bmapped++;
1214 						PAGE_WAKEUP(m);
1215 					}
1216 				} else {
1217 					if (!vm_page_is_valid(bp->b_pages[0], off, bsize))
1218 						bp->b_flags &= ~B_CACHE;
1219 					bp->b_pages[0]->bmapped++;
1220 					PAGE_WAKEUP(bp->b_pages[0]);
1221 				}
1222 				bp->b_npages = curbpnpages;
1223 				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1224 				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1225 				bp->b_data += off % PAGE_SIZE;
1226 			}
1227 		}
1228 	}
1229 	bp->b_bufsize = newbsize;
1230 	bp->b_bcount = size;
1231 	return 1;
1232 }
1233 
1234 /*
1235  * Wait for buffer I/O completion, returning error status.
1236  */
1237 int
1238 biowait(register struct buf * bp)
1239 {
1240 	int s;
1241 
1242 	s = splbio();
1243 	while ((bp->b_flags & B_DONE) == 0)
1244 		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
1245 	if ((bp->b_flags & B_ERROR) || bp->b_error) {
1246 		if ((bp->b_flags & B_INVAL) == 0) {
1247 			bp->b_flags |= B_INVAL;
1248 			bp->b_dev = NODEV;
1249 			LIST_REMOVE(bp, b_hash);
1250 			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1251 			wakeup((caddr_t) bp);
1252 		}
1253 		if (!bp->b_error)
1254 			bp->b_error = EIO;
1255 		else
1256 			bp->b_flags |= B_ERROR;
1257 		splx(s);
1258 		return (bp->b_error);
1259 	} else {
1260 		splx(s);
1261 		return (0);
1262 	}
1263 }
1264 
1265 /*
1266  * Finish I/O on a buffer, calling an optional function.
1267  * This is usually called from interrupt level, so process blocking
1268  * is not *a good idea*.
1269  */
1270 void
1271 biodone(register struct buf * bp)
1272 {
1273 	int s;
1274 
1275 	s = splbio();
1276 	if (bp->b_flags & B_DONE)
1277 		printf("biodone: buffer already done\n");
1278 	bp->b_flags |= B_DONE;
1279 
1280 	if ((bp->b_flags & B_READ) == 0) {
1281 		vwakeup(bp);
1282 	}
1283 #ifdef BOUNCE_BUFFERS
1284 	if (bp->b_flags & B_BOUNCE)
1285 		vm_bounce_free(bp);
1286 #endif
1287 
1288 	/* call optional completion function if requested */
1289 	if (bp->b_flags & B_CALL) {
1290 		bp->b_flags &= ~B_CALL;
1291 		(*bp->b_iodone) (bp);
1292 		splx(s);
1293 		return;
1294 	}
1295 	if (bp->b_flags & B_VMIO) {
1296 		int i, resid;
1297 		vm_offset_t foff;
1298 		vm_page_t m;
1299 		vm_object_t obj;
1300 		int iosize;
1301 		struct vnode *vp = bp->b_vp;
1302 
1303 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1304 		obj = (vm_object_t) vp->v_vmdata;
1305 		if (!obj) {
			splx(s);
1306 			return;
1307 		}
1308 #if defined(VFS_BIO_DEBUG)
1309 		if (obj->paging_in_progress < bp->b_npages) {
1310 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1311 			    obj->paging_in_progress, bp->b_npages);
1312 		}
1313 #endif
1314 		iosize = bp->b_bufsize;
1315 		for (i = 0; i < bp->b_npages; i++) {
1316 			m = bp->b_pages[i];
1317 			if (m == bogus_page) {
1318 				m = vm_page_lookup(obj, foff);
1319 				if (!m) {
1320 #if defined(VFS_BIO_DEBUG)
1321 					printf("biodone: page disappeared\n");
1322 #endif
1323 					--obj->paging_in_progress;
1324 					continue;
1325 				}
1326 				bp->b_pages[i] = m;
1327 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1328 			}
1329 #if defined(VFS_BIO_DEBUG)
1330 			if (trunc_page(foff) != m->offset) {
1331 				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1332 			}
1333 #endif
1334 			resid = (m->offset + PAGE_SIZE) - foff;
1335 			if (resid > iosize)
1336 				resid = iosize;
1337 			if (resid > 0) {
1338 				vm_page_set_valid(m, foff, resid);
1339 				vm_page_set_clean(m, foff, resid);
1340 			}
1341 			if (m->busy == 0) {
1342 				printf("biodone: page busy < 0, off: %d, foff: %d, resid: %d, index: %d\n",
1343 				    m->offset, foff, resid, i);
1344 				printf(" iosize: %d, lblkno: %d\n",
1345 				    bp->b_vp->v_mount->mnt_stat.f_iosize, bp->b_lblkno);
1346 				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1347 				    m->valid, m->dirty, m->bmapped);
1348 				panic("biodone: page busy < 0\n");
1349 			}
1350 			--m->busy;
1351 			PAGE_WAKEUP(m);
1352 			--obj->paging_in_progress;
1353 			foff += resid;
1354 			iosize -= resid;
1355 		}
1356 		if (obj && obj->paging_in_progress == 0)
1357 			wakeup((caddr_t) obj);
1358 	}
1359 	/*
1360 	 * For asynchronous completions, release the buffer now. The brelse
1361 	 * checks for B_WANTED and will do the wakeup there if necessary - so
1362 	 * no need to do a wakeup here in the async case.
1363 	 */
1364 
1365 	if (bp->b_flags & B_ASYNC) {
1366 		brelse(bp);
1367 	} else {
1368 		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
1369 		wakeup((caddr_t) bp);
1370 	}
1371 	splx(s);
1372 }
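
/*
 * Example (a sketch): a caller that wants its own completion handler rather
 * than the default wakeup/brelse arranges for biodone() to call it; the
 * handler is then responsible for releasing the buffer.  ("my_iodone" is a
 * hypothetical function taking a struct buf *.)
 *
 *	bp->b_flags |= B_CALL;
 *	bp->b_iodone = my_iodone;
 *	VOP_STRATEGY(bp);
 */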
1373 
1374 int
1375 count_lock_queue()
1376 {
1377 	int count;
1378 	struct buf *bp;
1379 
1380 	count = 0;
1381 	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1382 	    bp != NULL;
1383 	    bp = bp->b_freelist.tqe_next)
1384 		count++;
1385 	return (count);
1386 }
1387 
1388 int vfs_update_interval = 30;
1389 
1390 void
1391 vfs_update()
1392 {
1393 	(void) spl0();
1394 	while (1) {
1395 		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
1396 		    hz * vfs_update_interval);
1397 		vfs_update_wakeup = 0;
1398 		sync(curproc, NULL, NULL);
1399 	}
1400 }
1401 
1402 void
1403 vfs_unbusy_pages(struct buf * bp)
1404 {
1405 	int i;
1406 
1407 	if (bp->b_flags & B_VMIO) {
1408 		struct vnode *vp = bp->b_vp;
1409 		vm_object_t obj = (vm_object_t) vp->v_vmdata;
1410 		vm_offset_t foff;
1411 
1412 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1413 
1414 		for (i = 0; i < bp->b_npages; i++) {
1415 			vm_page_t m = bp->b_pages[i];
1416 
1417 			if (m == bogus_page) {
1418 				m = vm_page_lookup(obj, foff);
1419 				if (!m) {
1420 					panic("vfs_unbusy_pages: page missing\n");
1421 				}
1422 				bp->b_pages[i] = m;
1423 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1424 			}
1425 			--obj->paging_in_progress;
1426 			--m->busy;
1427 			PAGE_WAKEUP(m);
1428 		}
1429 		if (obj->paging_in_progress == 0)
1430 			wakeup((caddr_t) obj);
1431 	}
1432 }
1433 
1434 void
1435 vfs_busy_pages(struct buf * bp, int clear_modify)
1436 {
1437 	int i;
1438 
1439 	if (bp->b_flags & B_VMIO) {
1440 		vm_object_t obj = (vm_object_t) bp->b_vp->v_vmdata;
1441 		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1442 		int iocount = bp->b_bufsize;
1443 
1444 		for (i = 0; i < bp->b_npages; i++) {
1445 			vm_page_t m = bp->b_pages[i];
1446 			int resid = (m->offset + PAGE_SIZE) - foff;
1447 
1448 			if (resid > iocount)
1449 				resid = iocount;
1450 			obj->paging_in_progress++;
1451 			m->busy++;
1452 			if (clear_modify) {
1453 				vm_page_test_dirty(m);
1454 				pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ);
1455 			} else if (bp->b_bcount >= PAGE_SIZE) {
1456 				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1457 					bp->b_pages[i] = bogus_page;
1458 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1459 				}
1460 			}
1461 			foff += resid;
1462 			iocount -= resid;
1463 		}
1464 	}
1465 }
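
/*
 * In this file vfs_busy_pages() is called just before VOP_STRATEGY() (see
 * bread(), breadn() and bwrite()), and biodone() un-busies the pages when
 * the I/O finishes; vfs_unbusy_pages() above provides the explicit undo for
 * callers that need to back the pages out without going through biodone().
 */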
1466 
1467 void
1468 vfs_dirty_pages(struct buf * bp)
1469 {
1470 	int i;
1471 
1472 	if (bp->b_flags & B_VMIO) {
1473 		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1474 		int iocount = bp->b_bufsize;
1475 
1476 		for (i = 0; i < bp->b_npages; i++) {
1477 			vm_page_t m = bp->b_pages[i];
1478 			int resid = (m->offset + PAGE_SIZE) - foff;
1479 
1480 			if (resid > iocount)
1481 				resid = iocount;
1482 			if (resid > 0) {
1483 				vm_page_set_valid(m, foff, resid);
1484 				vm_page_set_dirty(m, foff, resid);
1485 			}
1486 			PAGE_WAKEUP(m);
1487 			foff += resid;
1488 			iocount -= resid;
1489 		}
1490 	}
1491 }
1492 /*
1493  * These routines are not in the correct place (yet).
1494  * Also, they work *ONLY* for kernel_pmap!!!
1495  */
1496 void
1497 vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1498 {
1499 	vm_offset_t pg;
1500 	vm_page_t p;
1501 	vm_offset_t from = round_page(froma);
1502 	vm_offset_t to = round_page(toa);
1503 
1504 tryagain0:
1505 	if ((curproc != pageproc) && ((cnt.v_free_count + cnt.v_cache_count) <=
1506 		cnt.v_free_reserved + (toa - froma) / PAGE_SIZE)) {
1507 		VM_WAIT;
1508 		goto tryagain0;
1509 	}
1510 	for (pg = from; pg < to; pg += PAGE_SIZE) {
1511 
1512 tryagain:
1513 
1514 		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS, 0);
1515 		if (!p) {
1516 			VM_WAIT;
1517 			goto tryagain;
1518 		}
1519 		vm_page_wire(p);
1520 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1521 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1522 		PAGE_WAKEUP(p);
1523 		bp->b_npages++;
1524 	}
1525 }
1526 
1527 void
1528 vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1529 {
1530 	vm_offset_t pg;
1531 	vm_page_t p;
1532 	vm_offset_t from = round_page(froma);
1533 	vm_offset_t to = round_page(toa);
1534 
1535 	for (pg = from; pg < to; pg += PAGE_SIZE) {
1536 		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1537 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1538 		pmap_kremove(pg);
1539 		vm_page_free(p);
1540 		--bp->b_npages;
1541 	}
1542 }
1543 
1544 void
1545 bufstats()
1546 {
1547 }
1548