xref: /freebsd/sys/kern/vfs_bio.c (revision 17ee9d00bc1ae1e598c38f25826f861e4bc6c3ce)
1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17  *    is allowed if this notation is included.
18  * 5. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $Id: vfs_bio.c,v 1.29 1995/02/22 09:16:07 davidg Exp $
22  */
23 
/*
 * This file implements a buffer I/O scheme that keeps the buffer cache
 * coherent with VM objects.  Pains have been taken to make sure that
 * the performance degradation usually associated with such schemes is
 * not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * was provided by David Greenman, also of the FreeBSD core team.
 */
34 
35 #define VMIO
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/proc.h>
40 #include <sys/vnode.h>
41 #include <vm/vm.h>
42 #include <vm/vm_pageout.h>
43 #include <vm/vm_page.h>
44 #include <vm/vm_object.h>
45 #include <sys/buf.h>
46 #include <sys/mount.h>
47 #include <sys/malloc.h>
48 #include <sys/resourcevar.h>
50 
51 #include <miscfs/specfs/specdev.h>
52 
53 struct buf *buf;		/* buffer header pool */
54 int nbuf;			/* number of buffer headers calculated
55 				 * elsewhere */
struct swqueue bswlist;		/* free swap I/O buffer headers */
int nvmio, nlru;		/* number of buffers on the VMIO and LRU queues */
58 
59 extern vm_map_t buffer_map, io_map, kernel_map, pager_map;
60 
61 void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
62 void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
63 void vfs_dirty_pages(struct buf * bp);
64 void vfs_busy_pages(struct buf *, int clear_modify);
65 
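/*
 * Set while a process is waiting for a buffer header to be released;
 * cleared (with a wakeup) by brelse().
 */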
66 int needsbuffer;
67 
68 /*
69  * Internal update daemon, process 3
70  *	The variable vfs_update_wakeup allows for internal syncs.
71  */
72 int vfs_update_wakeup;
73 
74 
75 /*
76  * buffers base kva
77  */
78 caddr_t buffers_kva;
79 
/*
 * bogus page -- for I/O to/from partially complete buffers.
 * This is a temporary solution to the problem, but it is not
 * really that bad.  It would be better to split the buffer
 * for input when part of it is already in memory,
 * but the code is intricate enough already.
 */
87 vm_page_t bogus_page;
88 vm_offset_t bogus_offset;
89 
90 int bufspace, maxbufspace;
91 
92 /*
93  * advisory minimum for size of LRU queue or VMIO queue
94  */
95 int minbuf;
96 
97 /*
98  * Initialize buffer headers and related structures.
99  */
100 void
bufinit(void)
102 {
103 	struct buf *bp;
104 	int i;
105 
106 	TAILQ_INIT(&bswlist);
107 	LIST_INIT(&invalhash);
108 
109 	/* first, make a null hash table */
110 	for (i = 0; i < BUFHSZ; i++)
111 		LIST_INIT(&bufhashtbl[i]);
112 
113 	/* next, make a null set of free lists */
114 	for (i = 0; i < BUFFER_QUEUES; i++)
115 		TAILQ_INIT(&bufqueues[i]);
116 
117 	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
118 	/* finally, initialize each buffer header and stick on empty q */
119 	for (i = 0; i < nbuf; i++) {
120 		bp = &buf[i];
121 		bzero(bp, sizeof *bp);
122 		bp->b_flags = B_INVAL;	/* we're just an empty header */
123 		bp->b_dev = NODEV;
124 		bp->b_vp = NULL;
125 		bp->b_rcred = NOCRED;
126 		bp->b_wcred = NOCRED;
127 		bp->b_qindex = QUEUE_EMPTY;
128 		bp->b_vnbufs.le_next = NOLIST;
129 		bp->b_data = buffers_kva + i * MAXBSIZE;
130 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
131 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
132 	}
	/*
	 * This will change later!!!
	 */
136 	minbuf = nbuf / 3;
137 	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
138 
139 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
140 	bogus_page = vm_page_alloc(kernel_object,
141 			bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);
142 
143 }
144 
145 /*
146  * remove the buffer from the appropriate free list
147  */
148 void
149 bremfree(struct buf * bp)
150 {
151 	int s = splbio();
152 
153 	if (bp->b_qindex != QUEUE_NONE) {
154 		if (bp->b_qindex == QUEUE_LRU)
155 			--nlru;
156 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
157 		bp->b_qindex = QUEUE_NONE;
158 	} else {
159 		panic("bremfree: removing a buffer when not on a queue");
160 	}
161 	splx(s);
162 }
163 
164 /*
165  * Get a buffer with the specified data.  Look in the cache first.
166  */
167 int
168 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
169     struct buf ** bpp)
170 {
171 	struct buf *bp;
172 
173 	bp = getblk(vp, blkno, size, 0, 0);
174 	*bpp = bp;
175 
176 	/* if not found in cache, do some I/O */
177 	if ((bp->b_flags & B_CACHE) == 0) {
178 		if (curproc && curproc->p_stats)	/* count block I/O */
179 			curproc->p_stats->p_ru.ru_inblock++;
180 		bp->b_flags |= B_READ;
181 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
182 		if (bp->b_rcred == NOCRED) {
183 			if (cred != NOCRED)
184 				crhold(cred);
185 			bp->b_rcred = cred;
186 		}
187 		vfs_busy_pages(bp, 0);
188 		VOP_STRATEGY(bp);
189 		return (biowait(bp));
190 	} else if (bp->b_lblkno == bp->b_blkno) {
191 		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
192 		    &bp->b_blkno, (int *) 0);
193 	}
194 	return (0);
195 }
196 
197 /*
198  * Operates like bread, but also starts asynchronous I/O on
199  * read-ahead blocks.
200  */
201 int
202 breadn(struct vnode * vp, daddr_t blkno, int size,
203     daddr_t * rablkno, int *rabsize,
204     int cnt, struct ucred * cred, struct buf ** bpp)
205 {
206 	struct buf *bp, *rabp;
207 	int i;
208 	int rv = 0, readwait = 0;
209 
210 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
211 
212 	/* if not found in cache, do some I/O */
213 	if ((bp->b_flags & B_CACHE) == 0) {
214 		if (curproc && curproc->p_stats)	/* count block I/O */
215 			curproc->p_stats->p_ru.ru_inblock++;
216 		bp->b_flags |= B_READ;
217 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
218 		if (bp->b_rcred == NOCRED) {
219 			if (cred != NOCRED)
220 				crhold(cred);
221 			bp->b_rcred = cred;
222 		}
223 		vfs_busy_pages(bp, 0);
224 		VOP_STRATEGY(bp);
225 		++readwait;
226 	} else if (bp->b_lblkno == bp->b_blkno) {
227 		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
228 		    &bp->b_blkno, (int *) 0);
229 	}
230 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
231 		if (inmem(vp, *rablkno))
232 			continue;
233 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
234 
235 		if ((rabp->b_flags & B_CACHE) == 0) {
236 			if (curproc && curproc->p_stats)
237 				curproc->p_stats->p_ru.ru_inblock++;
238 			rabp->b_flags |= B_READ | B_ASYNC;
239 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
240 			if (rabp->b_rcred == NOCRED) {
241 				if (cred != NOCRED)
242 					crhold(cred);
243 				rabp->b_rcred = cred;
244 			}
245 			vfs_busy_pages(rabp, 0);
246 			VOP_STRATEGY(rabp);
247 		} else {
248 			brelse(rabp);
249 		}
250 	}
251 
252 	if (readwait) {
253 		rv = biowait(bp);
254 	}
255 	return (rv);
256 }
257 
258 /*
259  * Write, release buffer on completion.  (Done by iodone
260  * if async.)
261  */
262 int
263 bwrite(struct buf * bp)
264 {
265 	int oldflags = bp->b_flags;
266 
267 	if (bp->b_flags & B_INVAL) {
268 		brelse(bp);
269 		return (0);
270 	}
271 	if (!(bp->b_flags & B_BUSY))
272 		panic("bwrite: buffer is not busy???");
273 
274 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
275 	bp->b_flags |= B_WRITEINPROG;
276 
277 	if (oldflags & B_ASYNC) {
278 		if (oldflags & B_DELWRI) {
279 			reassignbuf(bp, bp->b_vp);
280 		} else if (curproc) {
281 			++curproc->p_stats->p_ru.ru_oublock;
282 		}
283 	}
284 	bp->b_vp->v_numoutput++;
285 	vfs_busy_pages(bp, 1);
286 	VOP_STRATEGY(bp);
287 
288 	if ((oldflags & B_ASYNC) == 0) {
289 		int rtval = biowait(bp);
290 
291 		if (oldflags & B_DELWRI) {
292 			reassignbuf(bp, bp->b_vp);
293 		} else if (curproc) {
294 			++curproc->p_stats->p_ru.ru_oublock;
295 		}
296 		brelse(bp);
297 		return (rtval);
298 	}
299 	return (0);
300 }
301 
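/*
 * VOP_BWRITE entry point:  simply hand the buffer to bwrite().
 */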
302 int
vn_bwrite(struct vop_bwrite_args *ap)
305 {
306 	return (bwrite(ap->a_bp));
307 }
308 
309 /*
310  * Delayed write. (Buffer is marked dirty).
311  */
312 void
313 bdwrite(struct buf * bp)
314 {
315 
316 	if ((bp->b_flags & B_BUSY) == 0) {
317 		panic("bdwrite: buffer is not busy");
318 	}
319 	if (bp->b_flags & B_INVAL) {
320 		brelse(bp);
321 		return;
322 	}
	if (bp->b_flags & B_TAPE) {
		/* delayed writes are pointless on sequential devices */
		bawrite(bp);
		return;
	}
327 	bp->b_flags &= ~B_READ;
328 	vfs_dirty_pages(bp);
329 	if ((bp->b_flags & B_DELWRI) == 0) {
330 		if (curproc)
331 			++curproc->p_stats->p_ru.ru_oublock;
332 		bp->b_flags |= B_DONE | B_DELWRI;
333 		reassignbuf(bp, bp->b_vp);
334 	}
335 	brelse(bp);
336 	return;
337 }
338 
339 /*
340  * Asynchronous write.
341  * Start output on a buffer, but do not wait for it to complete.
342  * The buffer is released when the output completes.
343  */
344 void
345 bawrite(struct buf * bp)
346 {
347 	struct vnode *vp;
348 	vp = bp->b_vp;
349 	bp->b_flags |= B_ASYNC;
350 	(void) bwrite(bp);
	/*
	 * This code limits the amount of write I/O outstanding to a
	 * single file.  It keeps one file's writes from overwhelming
	 * the buffer cache and starving I/O to other files.
	 */
357 	if (vp->v_numoutput > (nbuf/2)) {
358 		int s = splbio();
359 
360 		while (vp->v_numoutput > (nbuf/4)) {
361 			vp->v_flag |= VBWAIT;
362 			tsleep((caddr_t) &vp->v_numoutput, PRIBIO, "bawnmo", 0);
363 		}
364 		splx(s);
365 	}
366 }
367 
368 /*
369  * Release a buffer.
370  */
371 void
372 brelse(struct buf * bp)
373 {
374 	int s;
375 
376 	if (bp->b_flags & B_CLUSTER) {
377 		relpbuf(bp);
378 		return;
379 	}
380 	/* anyone need a "free" block? */
381 	s = splbio();
382 
383 	if (needsbuffer) {
384 		needsbuffer = 0;
385 		wakeup((caddr_t) &needsbuffer);
386 	}
387 
388 	/* anyone need this block? */
389 	if (bp->b_flags & B_WANTED) {
390 		bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_AGE);
391 		wakeup((caddr_t) bp);
392 	} else if (bp->b_flags & B_VMIO) {
393 		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
394 		wakeup((caddr_t) bp);
395 	}
396 	if (bp->b_flags & B_LOCKED)
397 		bp->b_flags &= ~B_ERROR;
398 
399 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
400 	    (bp->b_bufsize <= 0)) {
401 		bp->b_flags |= B_INVAL;
402 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
403 		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
404 			brelvp(bp);
405 	}
406 
407 	/*
408 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
409 	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
410 	 * but the VM object is kept around.  The B_NOCACHE flag is used to
411 	 * invalidate the pages in the VM object.
412 	 */
413 	if (bp->b_flags & B_VMIO) {
414 		vm_offset_t foff;
415 		vm_object_t obj;
416 		int i, resid;
417 		vm_page_t m;
418 		int iototal = bp->b_bufsize;
419 
420 		foff = 0;
421 		obj = 0;
422 		if (bp->b_npages) {
423 			if (bp->b_vp && bp->b_vp->v_mount) {
424 				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
425 			} else {
426 				/*
427 				 * vnode pointer has been ripped away --
428 				 * probably file gone...
429 				 */
430 				foff = bp->b_pages[0]->offset;
431 			}
432 		}
433 		for (i = 0; i < bp->b_npages; i++) {
434 			m = bp->b_pages[i];
435 			if (m == bogus_page) {
436 				panic("brelse: bogus page found");
437 			}
438 			resid = (m->offset + PAGE_SIZE) - foff;
439 			if (resid > iototal)
440 				resid = iototal;
441 			if (resid > 0) {
442 				if (bp->b_flags & (B_ERROR | B_NOCACHE)) {
443 					vm_page_set_invalid(m, foff, resid);
444 				} else if ((bp->b_flags & B_DELWRI) == 0) {
445 					vm_page_set_clean(m, foff, resid);
446 					vm_page_set_valid(m, foff, resid);
447 				}
448 			} else {
449 				vm_page_test_dirty(m);
450 			}
451 			foff += resid;
452 			iototal -= resid;
453 		}
454 
455 		if (bp->b_flags & B_INVAL) {
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				--m->bmapped;
				if (m->bmapped == 0) {
					PAGE_WAKEUP(m);
					if (m->valid == 0) {
						pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
						vm_page_free(m);
					} else if ((m->dirty & m->valid) == 0 &&
					    (m->flags & PG_REFERENCED) == 0 &&
					    !pmap_is_referenced(VM_PAGE_TO_PHYS(m)))
						vm_page_cache(m);
					else if ((m->flags & PG_ACTIVE) == 0)
						vm_page_activate(m);
				}
			}
472 			bufspace -= bp->b_bufsize;
473 			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
474 			bp->b_npages = 0;
475 			bp->b_bufsize = 0;
476 			bp->b_flags &= ~B_VMIO;
477 			if (bp->b_vp)
478 				brelvp(bp);
479 			--nvmio;
480 		}
481 	}
482 	if (bp->b_qindex != QUEUE_NONE)
483 		panic("brelse: free buffer onto another queue???");
484 
485 	/* enqueue */
486 	/* buffers with no memory */
487 	if (bp->b_bufsize == 0) {
488 		bp->b_qindex = QUEUE_EMPTY;
489 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
490 		LIST_REMOVE(bp, b_hash);
491 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
492 		bp->b_dev = NODEV;
493 		/* buffers with junk contents */
494 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
495 		bp->b_qindex = QUEUE_AGE;
496 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
497 		LIST_REMOVE(bp, b_hash);
498 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
499 		bp->b_dev = NODEV;
500 		/* buffers that are locked */
501 	} else if (bp->b_flags & B_LOCKED) {
502 		bp->b_qindex = QUEUE_LOCKED;
503 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
504 		/* buffers with stale but valid contents */
505 	} else if (bp->b_flags & B_AGE) {
506 		bp->b_qindex = QUEUE_AGE;
507 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
		/* buffers with valid and quite potentially reusable contents */
509 	} else {
510 		if (bp->b_flags & B_VMIO)
511 			bp->b_qindex = QUEUE_VMIO;
512 		else {
513 			bp->b_qindex = QUEUE_LRU;
514 			++nlru;
515 		}
516 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
517 	}
518 
519 	/* unlock */
520 	bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
521 	splx(s);
522 }
523 
/*
 * This routine implements clustered async writes for
 * clearing out B_DELWRI buffers.  This is much better
 * than the old way of writing only one buffer at a time.
 */
529 void
530 vfs_bio_awrite(struct buf * bp)
531 {
532 	int i;
533 	daddr_t lblkno = bp->b_lblkno;
534 	struct vnode *vp = bp->b_vp;
535 	int s;
536 	int ncl;
537 	struct buf *bpa;
538 
539 	s = splbio();
	if (vp->v_mount && (vp->v_flag & VVMIO) &&
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size = vp->v_mount->mnt_stat.f_iosize;

		for (i = 1; i < MAXPHYS / size; i++) {
			if ((bpa = incore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
				(B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
548 				if ((bpa->b_blkno == bpa->b_lblkno) ||
549 				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
550 					break;
551 			} else {
552 				break;
553 			}
554 		}
555 		ncl = i;
556 		/*
557 		 * this is a possible cluster write
558 		 */
559 		if (ncl != 1) {
560 			cluster_wbuild(vp, NULL, size, lblkno, ncl, -1);
561 			splx(s);
562 			return;
563 		}
564 	}
565 	/*
566 	 * default (old) behavior, writing out only one block
567 	 */
568 	bremfree(bp);
569 	bp->b_flags |= B_BUSY | B_ASYNC;
570 	bwrite(bp);
571 	splx(s);
572 }
573 
574 
575 /*
576  * Find a buffer header which is available for use.
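 *
 * Preference order:  the EMPTY queue, then the AGE queue, and then
 * whichever of the VMIO and LRU queues has grown past its share.
 * May return NULL after sleeping for a buffer or flushing a delayed
 * write; the caller is expected to retry.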
577  */
578 struct buf *
579 getnewbuf(int slpflag, int slptimeo, int doingvmio)
580 {
581 	struct buf *bp;
582 	int s;
583 	int firstbp = 1;
584 
585 	s = splbio();
586 start:
587 	if (bufspace >= maxbufspace)
588 		goto trytofreespace;
589 
590 	/* can we constitute a new buffer? */
591 	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
592 		if (bp->b_qindex != QUEUE_EMPTY)
593 			panic("getnewbuf: inconsistent EMPTY queue");
594 		bremfree(bp);
595 		goto fillbuf;
596 	}
597 trytofreespace:
598 	/*
599 	 * We keep the file I/O from hogging metadata I/O
600 	 * This is desirable because file data is cached in the
601 	 * VM/Buffer cache even if a buffer is freed.
602 	 */
	if ((bp = bufqueues[QUEUE_AGE].tqh_first) != NULL) {
604 		if (bp->b_qindex != QUEUE_AGE)
605 			panic("getnewbuf: inconsistent AGE queue");
606 	} else if ((nvmio > nbuf - minbuf)
607 	    && (bp = bufqueues[QUEUE_VMIO].tqh_first)) {
608 		if (bp->b_qindex != QUEUE_VMIO)
609 			panic("getnewbuf: inconsistent VMIO queue");
610 	} else if ((!doingvmio || (nlru > nbuf - minbuf)) &&
611 	    (bp = bufqueues[QUEUE_LRU].tqh_first)) {
612 		if (bp->b_qindex != QUEUE_LRU)
613 			panic("getnewbuf: inconsistent LRU queue");
614 	}
615 	if (!bp) {
		if (doingvmio) {
			if ((bp = bufqueues[QUEUE_VMIO].tqh_first) != NULL) {
				if (bp->b_qindex != QUEUE_VMIO)
					panic("getnewbuf: inconsistent VMIO queue");
			} else if ((bp = bufqueues[QUEUE_LRU].tqh_first) != NULL) {
				if (bp->b_qindex != QUEUE_LRU)
					panic("getnewbuf: inconsistent LRU queue");
			}
		} else {
			if ((bp = bufqueues[QUEUE_LRU].tqh_first) != NULL) {
				if (bp->b_qindex != QUEUE_LRU)
					panic("getnewbuf: inconsistent LRU queue");
			} else if ((bp = bufqueues[QUEUE_VMIO].tqh_first) != NULL) {
				if (bp->b_qindex != QUEUE_VMIO)
					panic("getnewbuf: inconsistent VMIO queue");
			}
		}
633 	}
634 	if (!bp) {
635 		/* wait for a free buffer of any kind */
636 		needsbuffer = 1;
637 		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
638 		splx(s);
639 		return (0);
640 	}
641 	/* if we are a delayed write, convert to an async write */
642 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
643 		vfs_bio_awrite(bp);
644 		if (!slpflag && !slptimeo) {
645 			splx(s);
646 			return (0);
647 		}
648 		goto start;
649 	}
650 	bremfree(bp);
651 
652 	if (bp->b_flags & B_VMIO) {
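		/*
		 * brelse() runs down the VM pages backing the buffer and
		 * requeues the now-empty header, so pull it back off the
		 * free list before reuse.
		 */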
653 		bp->b_flags |= B_INVAL | B_BUSY;
654 		brelse(bp);
655 		bremfree(bp);
656 	}
657 	if (bp->b_vp)
658 		brelvp(bp);
659 
660 	/* we are not free, nor do we contain interesting data */
661 	if (bp->b_rcred != NOCRED)
662 		crfree(bp->b_rcred);
663 	if (bp->b_wcred != NOCRED)
664 		crfree(bp->b_wcred);
665 fillbuf:
666 	bp->b_flags |= B_BUSY;
667 	LIST_REMOVE(bp, b_hash);
668 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
669 	splx(s);
670 	if (bp->b_bufsize) {
671 		allocbuf(bp, 0, 0);
672 	}
673 	bp->b_flags = B_BUSY;
674 	bp->b_dev = NODEV;
675 	bp->b_vp = NULL;
676 	bp->b_blkno = bp->b_lblkno = 0;
677 	bp->b_iodone = 0;
678 	bp->b_error = 0;
679 	bp->b_resid = 0;
680 	bp->b_bcount = 0;
681 	bp->b_npages = 0;
682 	bp->b_wcred = bp->b_rcred = NOCRED;
683 	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
684 	bp->b_dirtyoff = bp->b_dirtyend = 0;
685 	bp->b_validoff = bp->b_validend = 0;
686 	if (bufspace >= maxbufspace) {
687 		s = splbio();
688 		bp->b_flags |= B_INVAL;
689 		brelse(bp);
690 		goto trytofreespace;
691 	}
692 	return (bp);
693 }
694 
695 /*
696  * Check to see if a block is currently memory resident.
697  */
698 struct buf *
699 incore(struct vnode * vp, daddr_t blkno)
700 {
701 	struct buf *bp;
702 	struct bufhashhdr *bh;
703 
704 	int s = splbio();
705 
706 	bh = BUFHASH(vp, blkno);
707 	bp = bh->lh_first;
708 
709 	/* Search hash chain */
710 	while (bp) {
711 		/* hit */
712 		if (bp->b_lblkno == blkno && bp->b_vp == vp
713 		    && (bp->b_flags & B_INVAL) == 0) {
714 			splx(s);
715 			return (bp);
716 		}
717 		bp = bp->b_hash.le_next;
718 	}
719 	splx(s);
720 
721 	return (0);
722 }
723 
724 /*
725  * Returns true if no I/O is needed to access the
726  * associated VM object.  This is like incore except
727  * it also hunts around in the VM system for the data.
728  */
729 
730 int
731 inmem(struct vnode * vp, daddr_t blkno)
732 {
733 	vm_object_t obj;
734 	vm_offset_t off, toff, tinc;
735 	vm_page_t m;
736 
737 	if (incore(vp, blkno))
738 		return 1;
739 	if (vp->v_mount == 0)
740 		return 0;
741 	if ((vp->v_vmdata == 0) || (vp->v_flag & VVMIO) == 0)
742 		return 0;
743 
744 	obj = (vm_object_t) vp->v_vmdata;
745 	tinc = PAGE_SIZE;
746 	if (tinc > vp->v_mount->mnt_stat.f_iosize)
747 		tinc = vp->v_mount->mnt_stat.f_iosize;
748 	off = blkno * vp->v_mount->mnt_stat.f_iosize;
749 
750 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
753 		m = vm_page_lookup(obj, trunc_page(toff + off));
754 		if (!m)
755 			return 0;
756 		if (vm_page_is_valid(m, toff + off, tinc) == 0)
757 			return 0;
758 	}
759 	return 1;
760 }
761 
762 /*
763  * Get a block given a specified block and offset into a file/device.
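 * Checks the cache first; on a miss, a fresh buffer is obtained from
 * getnewbuf() and its backing store is constituted by allocbuf().
 * The buffer is returned busy; B_CACHE is set if its contents are valid.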
764  */
765 struct buf *
766 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
767 {
768 	struct buf *bp;
769 	int s;
770 	struct bufhashhdr *bh;
771 	vm_offset_t off;
772 	int nleft;
773 
774 	s = splbio();
775 loop:
776 	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_cache_min)
777 		wakeup((caddr_t) &vm_pages_needed);
778 
	if ((bp = incore(vp, blkno)) != NULL) {
780 		if (bp->b_flags & B_BUSY) {
781 			bp->b_flags |= B_WANTED;
782 			if (curproc == pageproc) {
783 				bp->b_flags |= B_PDWANTED;
784 				wakeup((caddr_t) &cnt.v_free_count);
785 			}
786 			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
787 				goto loop;
788 			splx(s);
789 			return (struct buf *) NULL;
790 		}
791 		bp->b_flags |= B_BUSY | B_CACHE;
792 		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
796 		if (bp->b_bcount != size) {
797 #if defined(VFS_BIO_DEBUG)
798 			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
799 #endif
800 			bp->b_flags |= B_INVAL;
801 			bwrite(bp);
802 			goto loop;
803 		}
804 		splx(s);
805 		return (bp);
806 	} else {
807 		vm_object_t obj;
808 		int doingvmio;
809 
810 		if ((obj = (vm_object_t) vp->v_vmdata) && (vp->v_flag & VVMIO)) {
811 			doingvmio = 1;
812 		} else {
813 			doingvmio = 0;
814 		}
815 		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
816 			if (slpflag || slptimeo)
817 				return NULL;
818 			goto loop;
819 		}
820 		/*
821 		 * It is possible that another buffer has been constituted
822 		 * during the time that getnewbuf is blocked.  This checks
823 		 * for this possibility, and handles it.
824 		 */
825 		if (incore(vp, blkno)) {
826 			bp->b_flags |= B_INVAL;
827 			brelse(bp);
828 			goto loop;
829 		}
830 		/*
831 		 * Insert the buffer into the hash, so that it can
832 		 * be found by incore.
833 		 */
834 		bp->b_blkno = bp->b_lblkno = blkno;
835 		bgetvp(vp, bp);
836 		LIST_REMOVE(bp, b_hash);
837 		bh = BUFHASH(vp, blkno);
838 		LIST_INSERT_HEAD(bh, bp, b_hash);
839 
840 		if (doingvmio) {
841 			bp->b_flags |= (B_VMIO | B_CACHE);
842 #if defined(VFS_BIO_DEBUG)
843 			if (vp->v_type != VREG)
844 				printf("getblk: vmioing file type %d???\n", vp->v_type);
845 #endif
846 			++nvmio;
847 		} else {
848 			if (bp->b_flags & B_VMIO)
849 				--nvmio;
850 			bp->b_flags &= ~B_VMIO;
851 		}
852 		splx(s);
853 
854 		if (!allocbuf(bp, size, 1)) {
855 			s = splbio();
856 			goto loop;
857 		}
858 		return (bp);
859 	}
860 }
861 
862 /*
863  * Get an empty, disassociated buffer of given size.
864  */
865 struct buf *
866 geteblk(int size)
867 {
868 	struct buf *bp;
869 
	while ((bp = getnewbuf(0, 0, 0)) == 0)
		continue;
871 	allocbuf(bp, size, 0);
872 	bp->b_flags |= B_INVAL;
873 	return (bp);
874 }
875 
876 /*
877  * This code constitutes the buffer memory from either anonymous system
878  * memory (in the case of non-VMIO operations) or from an associated
879  * VM object (in the case of VMIO operations).
880  *
881  * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
883  *
884  * Modify the length of a buffer's underlying buffer storage without
885  * destroying information (unless, of course the buffer is shrinking).
886  */
887 int
888 allocbuf(struct buf * bp, int size, int vmio)
889 {
890 
891 	int s;
892 	int newbsize, mbsize;
893 	int i;
894 
895 	if ((bp->b_flags & B_VMIO) == 0) {
896 		/*
897 		 * Just get anonymous memory from the kernel
898 		 */
899 		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
900 		newbsize = round_page(size);
901 
902 		if (newbsize == bp->b_bufsize) {
903 			bp->b_bcount = size;
904 			return 1;
905 		} else if (newbsize < bp->b_bufsize) {
906 			vm_hold_free_pages(
907 			    bp,
908 			    (vm_offset_t) bp->b_data + newbsize,
909 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
910 			bufspace -= (bp->b_bufsize - newbsize);
911 		} else if (newbsize > bp->b_bufsize) {
912 			vm_hold_load_pages(
913 			    bp,
914 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
915 			    (vm_offset_t) bp->b_data + newbsize);
916 			bufspace += (newbsize - bp->b_bufsize);
917 		}
918 	} else {
919 		vm_page_t m;
920 		int desiredpages;
921 
922 		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
923 		desiredpages = round_page(newbsize) / PAGE_SIZE;
924 
925 		if (newbsize == bp->b_bufsize) {
926 			bp->b_bcount = size;
927 			return 1;
928 		} else if (newbsize < bp->b_bufsize) {
929 			if (desiredpages < bp->b_npages) {
930 				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
931 				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
932 				for (i = desiredpages; i < bp->b_npages; i++) {
933 					m = bp->b_pages[i];
934 					s = splhigh();
935 					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
936 						m->flags |= PG_WANTED;
937 						tsleep(m, PVM, "biodep", 0);
938 					}
939 					splx(s);
940 
941 					if (m->bmapped == 0) {
942 						printf("allocbuf: bmapped is zero for page %d\n", i);
943 						panic("allocbuf: error");
944 					}
945 					--m->bmapped;
946 					if (m->bmapped == 0) {
947 						PAGE_WAKEUP(m);
948 						if (m->valid == 0) {
949 							pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
950 							vm_page_free(m);
951 						}
952 					}
953 					bp->b_pages[i] = NULL;
954 				}
955 				bp->b_npages = desiredpages;
956 				bufspace -= (bp->b_bufsize - newbsize);
957 			}
958 		} else {
959 			vm_object_t obj;
960 			vm_offset_t tinc, off, toff, objoff;
961 			int pageindex, curbpnpages;
962 			struct vnode *vp;
963 			int bsize;
964 
965 			vp = bp->b_vp;
966 			bsize = vp->v_mount->mnt_stat.f_iosize;
967 
968 			if (bp->b_npages < desiredpages) {
969 				obj = (vm_object_t) vp->v_vmdata;
970 				tinc = PAGE_SIZE;
971 				if (tinc > bsize)
972 					tinc = bsize;
973 				off = bp->b_lblkno * bsize;
974 				curbpnpages = bp->b_npages;
975 		doretry:
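				/*
				 * Walk the buffer a page at a time.  If a needed
				 * page cannot be obtained, back out the pages
				 * gathered so far, wait, and retry from the top.
				 */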
976 				for (toff = 0; toff < newbsize; toff += tinc) {
					int bytesinpage;
979 
980 					pageindex = toff / PAGE_SIZE;
981 					objoff = trunc_page(toff + off);
982 					if (pageindex < curbpnpages) {
985 						m = bp->b_pages[pageindex];
986 						if (m->offset != objoff)
987 							panic("allocbuf: page changed offset??!!!?");
988 						bytesinpage = tinc;
989 						if (tinc > (newbsize - toff))
990 							bytesinpage = newbsize - toff;
991 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
992 							bp->b_flags &= ~B_CACHE;
993 						}
994 						if ((m->flags & PG_ACTIVE) == 0)
995 							vm_page_activate(m);
996 						continue;
997 					}
998 					m = vm_page_lookup(obj, objoff);
999 					if (!m) {
1000 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1001 						if (!m) {
1002 							int j;
1003 
1004 							for (j = bp->b_npages; j < pageindex; j++) {
1005 								vm_page_t mt = bp->b_pages[j];
1006 
1007 								PAGE_WAKEUP(mt);
1008 								if (mt->valid == 0 && mt->bmapped == 0) {
1009 									vm_page_free(mt);
1010 								}
1011 							}
1012 							VM_WAIT;
1013 							if (vmio && (bp->b_flags & B_PDWANTED)) {
1014 								bp->b_flags |= B_INVAL;
1015 								brelse(bp);
1016 								return 0;
1017 							}
1018 							curbpnpages = bp->b_npages;
1019 							goto doretry;
1020 						}
1021 						m->valid = 0;
1022 						vm_page_activate(m);
1023 					} else if ((m->valid == 0) || (m->flags & PG_BUSY)) {
1024 						int j;
1025 						int bufferdestroyed = 0;
1026 
1027 						for (j = bp->b_npages; j < pageindex; j++) {
1028 							vm_page_t mt = bp->b_pages[j];
1029 
1030 							PAGE_WAKEUP(mt);
1031 							if (mt->valid == 0 && mt->bmapped == 0) {
1032 								vm_page_free(mt);
1033 							}
1034 						}
1035 						if (vmio && (bp->b_flags & B_PDWANTED)) {
1036 							bp->b_flags |= B_INVAL;
1037 							brelse(bp);
1038 							VM_WAIT;
1039 							bufferdestroyed = 1;
1040 						}
1041 						s = splbio();
1042 						if (m->flags & PG_BUSY) {
1043 							m->flags |= PG_WANTED;
1044 							tsleep(m, PRIBIO, "pgtblk", 0);
						} else if (m->valid == 0 && m->bmapped == 0) {
1046 							vm_page_free(m);
1047 						}
1048 						splx(s);
1049 						if (bufferdestroyed)
1050 							return 0;
1051 						curbpnpages = bp->b_npages;
1052 						goto doretry;
1053 					} else {
1056 						if ((m->flags & PG_CACHE) &&
1057 						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1058 							int j;
1059 
1060 							for (j = bp->b_npages; j < pageindex; j++) {
1061 								vm_page_t mt = bp->b_pages[j];
1062 
1063 								PAGE_WAKEUP(mt);
1064 								if (mt->valid == 0 && mt->bmapped == 0) {
1065 									vm_page_free(mt);
1066 								}
1067 							}
1068 							VM_WAIT;
1069 							if (vmio && (bp->b_flags & B_PDWANTED)) {
1070 								bp->b_flags |= B_INVAL;
1071 								brelse(bp);
1072 								return 0;
1073 							}
1074 							curbpnpages = bp->b_npages;
1075 							goto doretry;
1076 						}
1077 						bytesinpage = tinc;
1078 						if (tinc > (newbsize - toff))
1079 							bytesinpage = newbsize - toff;
1080 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1081 							bp->b_flags &= ~B_CACHE;
1082 						}
1083 						if ((m->flags & PG_ACTIVE) == 0)
1084 							vm_page_activate(m);
1085 						m->flags |= PG_BUSY;
1086 					}
1087 					bp->b_pages[pageindex] = m;
1088 					curbpnpages = pageindex + 1;
1089 				}
1090 				if (bsize >= PAGE_SIZE) {
1091 					for (i = bp->b_npages; i < curbpnpages; i++) {
1092 						m = bp->b_pages[i];
1093 						if (m->valid == 0) {
1094 							bp->b_flags &= ~B_CACHE;
1095 						}
1096 						m->bmapped++;
1097 						PAGE_WAKEUP(m);
1098 					}
1099 				} else {
1100 					if (!vm_page_is_valid(bp->b_pages[0], off, bsize))
1101 						bp->b_flags &= ~B_CACHE;
1102 					bp->b_pages[0]->bmapped++;
1103 					PAGE_WAKEUP(bp->b_pages[0]);
1104 				}
1105 				bp->b_npages = curbpnpages;
1106 				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1107 				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1108 				bp->b_data += off % PAGE_SIZE;
1109 			}
1110 			bufspace += (newbsize - bp->b_bufsize);
1111 		}
1112 	}
1113 	bp->b_bufsize = newbsize;
1114 	bp->b_bcount = size;
1115 	return 1;
1116 }
1117 
1118 /*
1119  * Wait for buffer I/O completion, returning error status.
1120  */
1121 int
1122 biowait(register struct buf * bp)
1123 {
1124 	int s;
1125 
1126 	s = splbio();
1127 	while ((bp->b_flags & B_DONE) == 0)
1128 		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
1129 	if ((bp->b_flags & B_ERROR) || bp->b_error) {
1130 		if ((bp->b_flags & B_INVAL) == 0) {
1131 			bp->b_flags |= B_INVAL;
1132 			bp->b_dev = NODEV;
1133 			LIST_REMOVE(bp, b_hash);
1134 			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1135 			wakeup((caddr_t) bp);
1136 		}
1137 		if (!bp->b_error)
1138 			bp->b_error = EIO;
1139 		else
1140 			bp->b_flags |= B_ERROR;
1141 		splx(s);
1142 		return (bp->b_error);
1143 	} else {
1144 		splx(s);
1145 		return (0);
1146 	}
1147 }
1148 
1149 /*
1150  * Finish I/O on a buffer, calling an optional function.
1151  * This is usually called from interrupt level, so process blocking
1152  * is not *a good idea*.
1153  */
1154 void
1155 biodone(register struct buf * bp)
1156 {
1157 	int s;
1158 
1159 	s = splbio();
1160 	if (bp->b_flags & B_DONE)
1161 		printf("biodone: buffer already done\n");
1162 	bp->b_flags |= B_DONE;
1163 
1164 	if ((bp->b_flags & B_READ) == 0) {
1165 		struct vnode *vp = bp->b_vp;
1166 		vwakeup(bp);
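		/* wake up a bawrite() sleeper once output drains to nbuf/4 */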
1167 		if (vp && (vp->v_numoutput == (nbuf/4)) && (vp->v_flag & VBWAIT)) {
1168 			vp->v_flag &= ~VBWAIT;
1169 			wakeup((caddr_t) &vp->v_numoutput);
1170 		}
1171 	}
1172 #ifdef BOUNCE_BUFFERS
1173 	if (bp->b_flags & B_BOUNCE)
1174 		vm_bounce_free(bp);
1175 #endif
1176 
1177 	/* call optional completion function if requested */
1178 	if (bp->b_flags & B_CALL) {
1179 		bp->b_flags &= ~B_CALL;
1180 		(*bp->b_iodone) (bp);
1181 		splx(s);
1182 		return;
1183 	}
1184 	if (bp->b_flags & B_VMIO) {
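		/*
		 * VMIO completion:  replace any bogus_page entries with the
		 * real pages, mark the transferred range valid and clean,
		 * then unbusy the pages and the object.
		 */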
1185 		int i, resid;
1186 		vm_offset_t foff;
1187 		vm_page_t m;
1188 		vm_object_t obj;
1189 		int iosize;
1190 		struct vnode *vp = bp->b_vp;
1191 
1192 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1193 		obj = (vm_object_t) vp->v_vmdata;
		if (!obj) {
			splx(s);	/* don't leak the elevated spl */
			return;
		}
1197 #if defined(VFS_BIO_DEBUG)
1198 		if (obj->paging_in_progress < bp->b_npages) {
1199 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1200 			    obj->paging_in_progress, bp->b_npages);
1201 		}
1202 #endif
1203 		iosize = bp->b_bufsize;
1204 		for (i = 0; i < bp->b_npages; i++) {
1205 			m = bp->b_pages[i];
1206 			if (m == bogus_page) {
1207 				m = vm_page_lookup(obj, foff);
1208 				if (!m) {
1209 #if defined(VFS_BIO_DEBUG)
1210 					printf("biodone: page disappeared\n");
1211 #endif
1212 					--obj->paging_in_progress;
1213 					continue;
1214 				}
1215 				bp->b_pages[i] = m;
1216 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1217 			}
1218 #if defined(VFS_BIO_DEBUG)
1219 			if (trunc_page(foff) != m->offset) {
1220 				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1221 			}
1222 #endif
1223 			resid = (m->offset + PAGE_SIZE) - foff;
1224 			if (resid > iosize)
1225 				resid = iosize;
1226 			if (resid > 0) {
1227 				vm_page_set_valid(m, foff, resid);
1228 				vm_page_set_clean(m, foff, resid);
1229 			}
1230 
1231 			/*
1232 			 * when debugging new filesystems or buffer I/O methods, this
1233 			 * is the most common error that pops up.  if you see this, you
1234 			 * have not set the page busy flag correctly!!!
1235 			 */
1236 			if (m->busy == 0) {
1237 				printf("biodone: page busy < 0, off: %d, foff: %d, resid: %d, index: %d\n",
1238 				    m->offset, foff, resid, i);
1239 				printf(" iosize: %d, lblkno: %d\n",
1240 				    bp->b_vp->v_mount->mnt_stat.f_iosize, bp->b_lblkno);
1241 				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1242 				    m->valid, m->dirty, m->bmapped);
				panic("biodone: page busy < 0");
1244 			}
1245 			--m->busy;
1246 			PAGE_WAKEUP(m);
1247 			--obj->paging_in_progress;
1248 			foff += resid;
1249 			iosize -= resid;
1250 		}
1251 		if (obj && obj->paging_in_progress == 0 &&
1252 		    (obj->flags & OBJ_PIPWNT)) {
1253 			obj->flags &= ~OBJ_PIPWNT;
1254 			wakeup((caddr_t) obj);
1255 		}
1256 	}
1257 	/*
1258 	 * For asynchronous completions, release the buffer now. The brelse
1259 	 * checks for B_WANTED and will do the wakeup there if necessary - so
1260 	 * no need to do a wakeup here in the async case.
1261 	 */
1262 
1263 	if (bp->b_flags & B_ASYNC) {
1264 		brelse(bp);
1265 	} else {
1266 		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
1267 		wakeup((caddr_t) bp);
1268 	}
1269 	splx(s);
1270 }
1271 
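/*
 * Count the number of buffers on the locked queue.
 */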
1272 int
count_lock_queue(void)
1274 {
1275 	int count;
1276 	struct buf *bp;
1277 
1278 	count = 0;
1279 	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1280 	    bp != NULL;
1281 	    bp = bp->b_freelist.tqe_next)
1282 		count++;
1283 	return (count);
1284 }
1285 
1286 int vfs_update_interval = 30;
1287 
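/*
 * The update daemon:  sync the file systems every vfs_update_interval
 * seconds, or sooner when kicked via vfs_update_wakeup.
 */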
1288 void
vfs_update(void)
1290 {
1291 	(void) spl0();
1292 	while (1) {
1293 		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
1294 		    hz * vfs_update_interval);
1295 		vfs_update_wakeup = 0;
1296 		sync(curproc, NULL, NULL);
1297 	}
1298 }
1299 
1300 /*
1301  * This routine is called in lieu of iodone in the case of
1302  * incomplete I/O.  This keeps the busy status for pages
 * consistent.
1304  */
1305 void
1306 vfs_unbusy_pages(struct buf * bp)
1307 {
1308 	int i;
1309 
1310 	if (bp->b_flags & B_VMIO) {
1311 		struct vnode *vp = bp->b_vp;
1312 		vm_object_t obj = (vm_object_t) vp->v_vmdata;
1313 		vm_offset_t foff;
1314 
1315 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1316 
1317 		for (i = 0; i < bp->b_npages; i++) {
1318 			vm_page_t m = bp->b_pages[i];
1319 
1320 			if (m == bogus_page) {
1321 				m = vm_page_lookup(obj, foff);
1322 				if (!m) {
					panic("vfs_unbusy_pages: page missing");
1324 				}
1325 				bp->b_pages[i] = m;
1326 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1327 			}
1328 			--obj->paging_in_progress;
1329 			--m->busy;
1330 			PAGE_WAKEUP(m);
1331 		}
1332 		if (obj->paging_in_progress == 0 &&
1333 		    (obj->flags & OBJ_PIPWNT)) {
1334 			obj->flags &= ~OBJ_PIPWNT;
1335 			wakeup((caddr_t) obj);
1336 		}
1337 	}
1338 }
1339 
1340 /*
1341  * This routine is called before a device strategy routine.
1342  * It is used to tell the VM system that paging I/O is in
1343  * progress, and treat the pages associated with the buffer
1344  * almost as being PG_BUSY.  Also the object paging_in_progress
1345  * flag is handled to make sure that the object doesn't become
 * inconsistent.
1347  */
1348 void
1349 vfs_busy_pages(struct buf * bp, int clear_modify)
1350 {
1351 	int i;
1352 
1353 	if (bp->b_flags & B_VMIO) {
1354 		vm_object_t obj = (vm_object_t) bp->b_vp->v_vmdata;
1355 		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1356 		int iocount = bp->b_bufsize;
1357 
1358 		for (i = 0; i < bp->b_npages; i++) {
1359 			vm_page_t m = bp->b_pages[i];
1360 			int resid = (m->offset + PAGE_SIZE) - foff;
1361 
1362 			if (resid > iocount)
1363 				resid = iocount;
1364 			obj->paging_in_progress++;
1365 			m->busy++;
1366 			if (clear_modify) {
1367 				vm_page_test_dirty(m);
1368 				pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ);
1369 			} else if (bp->b_bcount >= PAGE_SIZE) {
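				/*
				 * On a read, map bogus_page in place of any page
				 * that already holds valid data, so the device
				 * transfer cannot clobber its contents.
				 */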
1370 				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1371 					bp->b_pages[i] = bogus_page;
1372 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1373 				}
1374 			}
1375 			foff += resid;
1376 			iocount -= resid;
1377 		}
1378 	}
1379 }
1380 
1381 /*
1382  * Tell the VM system that the pages associated with this buffer
1383  * are dirty.  This is in case of the unlikely circumstance that
1384  * a buffer has to be destroyed before it is flushed.
1385  */
1386 void
1387 vfs_dirty_pages(struct buf * bp)
1388 {
1389 	int i;
1390 
1391 	if (bp->b_flags & B_VMIO) {
1392 		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1393 		int iocount = bp->b_bufsize;
1394 
1395 		for (i = 0; i < bp->b_npages; i++) {
1396 			vm_page_t m = bp->b_pages[i];
1397 			int resid = (m->offset + PAGE_SIZE) - foff;
1398 
1399 			if (resid > iocount)
1400 				resid = iocount;
1401 			if (resid > 0) {
1402 				vm_page_set_valid(m, foff, resid);
1403 				vm_page_set_dirty(m, foff, resid);
1404 			}
1405 			PAGE_WAKEUP(m);
1406 			foff += resid;
1407 			iocount -= resid;
1408 		}
1409 	}
1410 }

/*
 * vm_hold_load_pages and vm_hold_free_pages move anonymous pages into
 * and out of a buffer's address space.  The pages are not associated
 * with any VM object.
 */
1416 void
1417 vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1418 {
1419 	vm_offset_t pg;
1420 	vm_page_t p;
1421 	vm_offset_t from = round_page(froma);
1422 	vm_offset_t to = round_page(toa);
1423 
1424 	for (pg = from; pg < to; pg += PAGE_SIZE) {
1425 
1426 tryagain:
1427 
1428 		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1429 		    VM_ALLOC_NORMAL);
1430 		if (!p) {
1431 			VM_WAIT;
1432 			goto tryagain;
1433 		}
1434 		vm_page_wire(p);
1435 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1436 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1437 		PAGE_WAKEUP(p);
1438 		bp->b_npages++;
1439 	}
1440 }
1441 
1442 void
1443 vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1444 {
1445 	vm_offset_t pg;
1446 	vm_page_t p;
1447 	vm_offset_t from = round_page(froma);
1448 	vm_offset_t to = round_page(toa);
1449 
1450 	for (pg = from; pg < to; pg += PAGE_SIZE) {
1451 		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1452 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1453 		pmap_kremove(pg);
1454 		vm_page_free(p);
1455 		--bp->b_npages;
1456 	}
1457 }
1458 
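/*
 * Placeholder for buffer cache statistics reporting.
 */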
1459 void
bufstats(void)
1461 {
1462 }
1463