1 /*
2  * Copyright (c) 1994 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17  *    is allowed if this notation is included.
18  * 5. Modifications may be freely made to this file if the above conditions
19  *    are met.
20  *
21  * $Id: vfs_bio.c,v 1.44 1995/05/11 19:26:29 rgrimes Exp $
22  */
23 
24 /*
25  * This file implements a buffer I/O scheme built around a coherent
26  * VM object and buffer cache.  Care has been taken to ensure that the
27  * performance degradation usually associated with such schemes is not
28  * realized here.
29  *
30  * Author:  John S. Dyson
31  * Significant help during the development and debugging phases
32  * was provided by David Greenman, also of the FreeBSD core team.
33  */
34 
35 #define VMIO
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/proc.h>
40 #include <sys/vnode.h>
41 #include <vm/vm.h>
42 #include <vm/vm_kern.h>
43 #include <vm/vm_pageout.h>
44 #include <vm/vm_page.h>
45 #include <vm/vm_object.h>
46 #include <sys/buf.h>
47 #include <sys/mount.h>
48 #include <sys/malloc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/proc.h>
51 
52 #include <miscfs/specfs/specdev.h>
53 
54 struct buf *buf;		/* buffer header pool */
55 int nbuf;			/* number of buffer headers calculated
56 				 * elsewhere */
57 struct swqueue bswlist;
58 
59 void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
60 void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
61 void vfs_clean_pages(struct buf * bp);
62 static void vfs_setdirty(struct buf *bp);
63 
64 int needsbuffer;
65 
66 /*
67  * Internal update daemon, process 3
68  *	The variable vfs_update_wakeup allows for internal syncs.
69  */
70 int vfs_update_wakeup;
71 
72 
73 /*
74  * buffers base kva
75  */
76 caddr_t buffers_kva;
77 
78 /*
79  * bogus page -- for I/O to/from partially complete buffers.
80  * This is a temporary solution to the problem, but it is not
81  * really that bad.  It would be better to split the buffer
82  * for input in the case of buffers partially already in memory,
83  * but the code is intricate enough already.
84  */
85 vm_page_t bogus_page;
86 vm_offset_t bogus_offset;
87 
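/*
 * bufspace tracks the amount of memory currently backing buffer contents;
 * maxbufspace is the ceiling enforced by getnewbuf() (set in bufinit()).
 */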
88 int bufspace, maxbufspace;
89 
90 /*
91  * advisory minimum for size of LRU queue or VMIO queue
92  */
93 int minbuf;
94 
95 /*
96  * Initialize buffer headers and related structures.
97  */
98 void
99 bufinit()
100 {
101 	struct buf *bp;
102 	int i;
103 
104 	TAILQ_INIT(&bswlist);
105 	LIST_INIT(&invalhash);
106 
107 	/* first, make a null hash table */
108 	for (i = 0; i < BUFHSZ; i++)
109 		LIST_INIT(&bufhashtbl[i]);
110 
111 	/* next, make a null set of free lists */
112 	for (i = 0; i < BUFFER_QUEUES; i++)
113 		TAILQ_INIT(&bufqueues[i]);
114 
115 	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
116 	/* finally, initialize each buffer header and stick on empty q */
117 	for (i = 0; i < nbuf; i++) {
118 		bp = &buf[i];
119 		bzero(bp, sizeof *bp);
120 		bp->b_flags = B_INVAL;	/* we're just an empty header */
121 		bp->b_dev = NODEV;
122 		bp->b_rcred = NOCRED;
123 		bp->b_wcred = NOCRED;
124 		bp->b_qindex = QUEUE_EMPTY;
125 		bp->b_vnbufs.le_next = NOLIST;
126 		bp->b_data = buffers_kva + i * MAXBSIZE;
127 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
128 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
129 	}
130 /*
131  * maxbufspace is currently calculated assuming that all filesystem blocks
132  * are 8K.  If you happen to use a 16K filesystem, the size of the buffer
133  * cache is still the same as it would be for 8K filesystems.  This
134  * keeps the size of the buffer cache "in check" for big block filesystems.
135  */
136 	minbuf = nbuf / 3;
137 	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
138 
139 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
140 	bogus_page = vm_page_alloc(kernel_object,
141 			bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);
142 
143 }
144 
145 /*
146  * remove the buffer from the appropriate free list
147  */
148 void
149 bremfree(struct buf * bp)
150 {
151 	int s = splbio();
152 
153 	if (bp->b_qindex != QUEUE_NONE) {
154 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
155 		bp->b_qindex = QUEUE_NONE;
156 	} else {
157 		panic("bremfree: removing a buffer when not on a queue");
158 	}
159 	splx(s);
160 }
161 
162 /*
163  * Get a buffer with the specified data.  Look in the cache first.
164  */
165 int
166 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
167     struct buf ** bpp)
168 {
169 	struct buf *bp;
170 
171 	bp = getblk(vp, blkno, size, 0, 0);
172 	*bpp = bp;
173 
174 	/* if not found in cache, do some I/O */
175 	if ((bp->b_flags & B_CACHE) == 0) {
176 		if (curproc != NULL)
177 			curproc->p_stats->p_ru.ru_inblock++;
178 		bp->b_flags |= B_READ;
179 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
180 		if (bp->b_rcred == NOCRED) {
181 			if (cred != NOCRED)
182 				crhold(cred);
183 			bp->b_rcred = cred;
184 		}
185 		vfs_busy_pages(bp, 0);
186 		VOP_STRATEGY(bp);
187 		return (biowait(bp));
188 	}
189 	return (0);
190 }
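
/*
 * Typical calling pattern for bread() (illustrative sketch only; error
 * handling and the block size depend on the calling filesystem):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);	(or bdwrite()/bwrite() if the data was modified)
 */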
191 
192 /*
193  * Operates like bread, but also starts asynchronous I/O on
194  * read-ahead blocks.
195  */
196 int
197 breadn(struct vnode * vp, daddr_t blkno, int size,
198     daddr_t * rablkno, int *rabsize,
199     int cnt, struct ucred * cred, struct buf ** bpp)
200 {
201 	struct buf *bp, *rabp;
202 	int i;
203 	int rv = 0, readwait = 0;
204 
205 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
206 
207 	/* if not found in cache, do some I/O */
208 	if ((bp->b_flags & B_CACHE) == 0) {
209 		if (curproc != NULL)
210 			curproc->p_stats->p_ru.ru_inblock++;
211 		bp->b_flags |= B_READ;
212 		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
213 		if (bp->b_rcred == NOCRED) {
214 			if (cred != NOCRED)
215 				crhold(cred);
216 			bp->b_rcred = cred;
217 		}
218 		vfs_busy_pages(bp, 0);
219 		VOP_STRATEGY(bp);
220 		++readwait;
221 	}
222 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
223 		if (inmem(vp, *rablkno))
224 			continue;
225 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
226 
227 		if ((rabp->b_flags & B_CACHE) == 0) {
228 			if (curproc != NULL)
229 				curproc->p_stats->p_ru.ru_inblock++;
230 			rabp->b_flags |= B_READ | B_ASYNC;
231 			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
232 			if (rabp->b_rcred == NOCRED) {
233 				if (cred != NOCRED)
234 					crhold(cred);
235 				rabp->b_rcred = cred;
236 			}
237 			vfs_busy_pages(rabp, 0);
238 			VOP_STRATEGY(rabp);
239 		} else {
240 			brelse(rabp);
241 		}
242 	}
243 
244 	if (readwait) {
245 		rv = biowait(bp);
246 	}
247 	return (rv);
248 }
249 
250 /*
251  * Write, release buffer on completion.  (Done by iodone
252  * if async.)
253  */
254 int
255 bwrite(struct buf * bp)
256 {
257 	int oldflags = bp->b_flags;
258 
259 	if (bp->b_flags & B_INVAL) {
260 		brelse(bp);
261 		return (0);
262 	}
263 	if (!(bp->b_flags & B_BUSY))
264 		panic("bwrite: buffer is not busy???");
265 
266 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
267 	bp->b_flags |= B_WRITEINPROG;
268 
269 	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
270 		reassignbuf(bp, bp->b_vp);
271 	}
272 
273 	bp->b_vp->v_numoutput++;
274 	vfs_busy_pages(bp, 1);
275 	if (curproc != NULL)
276 		curproc->p_stats->p_ru.ru_oublock++;
277 	VOP_STRATEGY(bp);
278 
279 	if ((oldflags & B_ASYNC) == 0) {
280 		int rtval = biowait(bp);
281 
282 		if (oldflags & B_DELWRI) {
283 			reassignbuf(bp, bp->b_vp);
284 		}
285 		brelse(bp);
286 		return (rtval);
287 	}
288 	return (0);
289 }
290 
291 int
292 vn_bwrite(ap)
293 	struct vop_bwrite_args *ap;
294 {
295 	return (bwrite(ap->a_bp));
296 }
297 
298 /*
299  * Delayed write. (Buffer is marked dirty).
300  */
301 void
302 bdwrite(struct buf * bp)
303 {
304 
305 	if ((bp->b_flags & B_BUSY) == 0) {
306 		panic("bdwrite: buffer is not busy");
307 	}
308 	if (bp->b_flags & B_INVAL) {
309 		brelse(bp);
310 		return;
311 	}
312 	if (bp->b_flags & B_TAPE) {
313 		bawrite(bp);
314 		return;
315 	}
316 	bp->b_flags &= ~(B_READ|B_RELBUF);
317 	if ((bp->b_flags & B_DELWRI) == 0) {
318 		bp->b_flags |= B_DONE | B_DELWRI;
319 		reassignbuf(bp, bp->b_vp);
320 	}
321 
322 	/*
323 	 * Doing the bmap here keeps the system from needing to do it later,
324 	 * perhaps when the system is attempting to do a sync.  Since it
325 	 * is likely that the indirect block -- or whatever other data structure
326 	 * the filesystem needs -- is still in memory now, this is a good
327 	 * time to do it.  Note also that if the pageout daemon is
328 	 * requesting a sync, there might not be enough memory to do
329 	 * the bmap then, so it is important to do it here.
330 	 */
331 	if (bp->b_lblkno == bp->b_blkno) {
332 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
333 	}
334 
335 	/*
336 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
337 	 */
338 	vfs_setdirty(bp);
339 
340 	/*
341 	 * We need to do this here to satisfy the vnode_pager and the
342 	 * pageout daemon, so that they think the pages have been
343 	 * "cleaned".  Since the pages are in a delayed-write buffer,
344 	 * the VFS layer will see that they get written out on the next
345 	 * sync, or perhaps the cluster will be completed.
346 	 */
347 	vfs_clean_pages(bp);
348 	brelse(bp);
349 	return;
350 }
351 
352 /*
353  * Asynchronous write.
354  * Start output on a buffer, but do not wait for it to complete.
355  * The buffer is released when the output completes.
356  */
357 void
358 bawrite(struct buf * bp)
359 {
360 	bp->b_flags |= B_ASYNC;
361 	(void) VOP_BWRITE(bp);
362 }
363 
364 /*
365  * Release a buffer.
366  */
367 void
368 brelse(struct buf * bp)
369 {
370 	int s;
371 
372 	if (bp->b_flags & B_CLUSTER) {
373 		relpbuf(bp);
374 		return;
375 	}
376 	/* anyone need a "free" block? */
377 	s = splbio();
378 
379 	if (needsbuffer) {
380 		needsbuffer = 0;
381 		wakeup((caddr_t) &needsbuffer);
382 	}
383 
384 	/* anyone need this block? */
385 	if (bp->b_flags & B_WANTED) {
386 		bp->b_flags &= ~B_WANTED;
387 		wakeup((caddr_t) bp);
388 	} else if (bp->b_flags & B_VMIO) {
389 		bp->b_flags &= ~B_WANTED;
390 		wakeup((caddr_t) bp);
391 	}
392 	if (bp->b_flags & B_LOCKED)
393 		bp->b_flags &= ~B_ERROR;
394 
395 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
396 	    (bp->b_bufsize <= 0)) {
397 		bp->b_flags |= B_INVAL;
398 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
399 		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
400 			brelvp(bp);
401 	}
402 
403 	/*
404 	 * VMIO buffer rundown.  There is little need to keep a VMIO buffer
405 	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
406 	 * while the VM object is kept around.  The B_NOCACHE flag is used to
407 	 * invalidate the pages in the VM object.
408 	 */
409 	if (bp->b_flags & B_VMIO) {
410 		vm_offset_t foff;
411 		vm_object_t obj;
412 		int i, resid;
413 		vm_page_t m;
414 		int iototal = bp->b_bufsize;
415 
416 		foff = 0;
417 		obj = 0;
418 		if (bp->b_npages) {
419 			if (bp->b_vp && bp->b_vp->v_mount) {
420 				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
421 			} else {
422 				/*
423 				 * vnode pointer has been ripped away --
424 				 * probably file gone...
425 				 */
426 				foff = bp->b_pages[0]->offset;
427 			}
428 		}
429 		for (i = 0; i < bp->b_npages; i++) {
430 			m = bp->b_pages[i];
431 			if (m == bogus_page) {
432 				m = vm_page_lookup(obj, foff);
433 				if (!m) {
434 					panic("brelse: page missing\n");
435 				}
436 				bp->b_pages[i] = m;
437 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
438 			}
439 			resid = (m->offset + PAGE_SIZE) - foff;
440 			if (resid > iototal)
441 				resid = iototal;
442 			if (resid > 0) {
443 				/*
444 				 * Don't invalidate the page if the local machine has already
445 				 * modified it.  This is the lesser of two evils, and should
446 				 * be fixed.
447 				 */
448 				if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
449 					vm_page_test_dirty(m);
450 					if (m->dirty == 0) {
451 						vm_page_set_invalid(m, foff, resid);
452 						if (m->valid == 0)
453 							vm_page_protect(m, VM_PROT_NONE);
454 					}
455 				}
456 			}
457 			foff += resid;
458 			iototal -= resid;
459 		}
460 
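		/*
		 * If the buffer is being invalidated or its pages released,
		 * drop this buffer's reference (bmapped) on each page.  Pages
		 * with no remaining buffer references are moved to the cache
		 * or activated, and the buffer's kva mapping is torn down.
		 */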
461 		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
462 			for (i = 0; i < bp->b_npages; i++) {
463 				m = bp->b_pages[i];
464 				--m->bmapped;
465 				if (m->bmapped == 0) {
466 					if (m->flags & PG_WANTED) {
467 						wakeup((caddr_t) m);
468 						m->flags &= ~PG_WANTED;
469 					}
470 					vm_page_test_dirty(m);
471 					if ((m->dirty & m->valid) == 0 &&
472 						(m->flags & PG_REFERENCED) == 0 &&
473 							!pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
474 						vm_page_cache(m);
475 					} else if ((m->flags & PG_ACTIVE) == 0) {
476 						vm_page_activate(m);
477 						m->act_count = 0;
478 					}
479 				}
480 			}
481 			bufspace -= bp->b_bufsize;
482 			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
483 			bp->b_npages = 0;
484 			bp->b_bufsize = 0;
485 			bp->b_flags &= ~B_VMIO;
486 			if (bp->b_vp)
487 				brelvp(bp);
488 		}
489 	}
490 	if (bp->b_qindex != QUEUE_NONE)
491 		panic("brelse: free buffer onto another queue???");
492 
493 	/* enqueue */
494 	/* buffers with no memory */
495 	if (bp->b_bufsize == 0) {
496 		bp->b_qindex = QUEUE_EMPTY;
497 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
498 		LIST_REMOVE(bp, b_hash);
499 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
500 		bp->b_dev = NODEV;
501 		/* buffers with junk contents */
502 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
503 		bp->b_qindex = QUEUE_AGE;
504 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
505 		LIST_REMOVE(bp, b_hash);
506 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
507 		bp->b_dev = NODEV;
508 		/* buffers that are locked */
509 	} else if (bp->b_flags & B_LOCKED) {
510 		bp->b_qindex = QUEUE_LOCKED;
511 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
512 		/* buffers with stale but valid contents */
513 	} else if (bp->b_flags & B_AGE) {
514 		bp->b_qindex = QUEUE_AGE;
515 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
516 		/* buffers with valid and quite potentially reusable contents */
517 	} else {
518 		bp->b_qindex = QUEUE_LRU;
519 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
520 	}
521 
522 	/* unlock */
523 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
524 	splx(s);
525 }
526 
527 /*
528  * this routine implements clustered async writes for
529  * clearing out B_DELWRI buffers...  This is much better
530  * than the old way of writing only one buffer at a time.
531  */
532 void
533 vfs_bio_awrite(struct buf * bp)
534 {
535 	int i;
536 	daddr_t lblkno = bp->b_lblkno;
537 	struct vnode *vp = bp->b_vp;
538 	int s;
539 	int ncl;
540 	struct buf *bpa;
541 
542 	s = splbio();
543 	if (vp->v_mount && (vp->v_flag & VVMIO) &&
544 	    	(bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
545 		int size = vp->v_mount->mnt_stat.f_iosize;
546 
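		/*
		 * Scan forward from this block for contiguous buffers that
		 * are also delayed-write, clusterable, not busy, and of the
		 * same size, so that they can be written as a single cluster.
		 */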
547 		for (i = 1; i < MAXPHYS / size; i++) {
548 			if ((bpa = incore(vp, lblkno + i)) &&
549 			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) &&
550 			    (bpa->b_bufsize == size)) {
551 				if ((bpa->b_blkno == bpa->b_lblkno) ||
552 				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
553 					break;
554 			} else {
555 				break;
556 			}
557 		}
558 		ncl = i;
559 		/*
560 		 * this is a possible cluster write
561 		 */
562 		if (ncl != 1) {
563 			bremfree(bp);
564 			cluster_wbuild(vp, bp, size, lblkno, ncl, -1);
565 			splx(s);
566 			return;
567 		}
568 	}
569 	/*
570 	 * default (old) behavior, writing out only one block
571 	 */
572 	bremfree(bp);
573 	bp->b_flags |= B_BUSY | B_ASYNC;
574 	(void) VOP_BWRITE(bp);
575 	splx(s);
576 }
577 
578 
579 /*
580  * Find a buffer header which is available for use.
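 *
 * The EMPTY queue is tried first; failing that, buffers are reclaimed from
 * the AGE queue and then the LRU queue.  A reclaimed delayed-write buffer
 * is pushed out asynchronously (vfs_bio_awrite) rather than reused directly.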
581  */
582 static struct buf *
583 getnewbuf(int slpflag, int slptimeo, int doingvmio)
584 {
585 	struct buf *bp;
586 	int s;
587 	int firstbp = 1;
588 
589 	s = splbio();
590 start:
591 	if (bufspace >= maxbufspace)
592 		goto trytofreespace;
593 
594 	/* can we constitute a new buffer? */
595 	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
596 		if (bp->b_qindex != QUEUE_EMPTY)
597 			panic("getnewbuf: inconsistent EMPTY queue");
598 		bremfree(bp);
599 		goto fillbuf;
600 	}
601 trytofreespace:
602 	/*
603 	 * We keep file I/O from hogging metadata I/O.
604 	 * This is desirable because file data is cached in the
605 	 * VM/Buffer cache even if a buffer is freed.
606 	 */
607 	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
608 		if (bp->b_qindex != QUEUE_AGE)
609 			panic("getnewbuf: inconsistent AGE queue");
610 	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
611 		if (bp->b_qindex != QUEUE_LRU)
612 			panic("getnewbuf: inconsistent LRU queue");
613 	}
614 	if (!bp) {
615 		/* wait for a free buffer of any kind */
616 		needsbuffer = 1;
617 		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
618 		splx(s);
619 		return (0);
620 	}
621 
622 	/* if we are a delayed write, convert to an async write */
623 	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
624 		vfs_bio_awrite(bp);
625 		if (!slpflag && !slptimeo) {
626 			splx(s);
627 			return (0);
628 		}
629 		goto start;
630 	}
631 
632 	if (bp->b_flags & B_WANTED) {
633 		bp->b_flags &= ~B_WANTED;
634 		wakeup((caddr_t) bp);
635 	}
636 	bremfree(bp);
637 
638 	if (bp->b_flags & B_VMIO) {
639 		bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
640 		brelse(bp);
641 		bremfree(bp);
642 	}
643 
644 	if (bp->b_vp)
645 		brelvp(bp);
646 
647 	/* we are not free, nor do we contain interesting data */
648 	if (bp->b_rcred != NOCRED)
649 		crfree(bp->b_rcred);
650 	if (bp->b_wcred != NOCRED)
651 		crfree(bp->b_wcred);
652 fillbuf:
653 	bp->b_flags |= B_BUSY;
654 	LIST_REMOVE(bp, b_hash);
655 	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
656 	splx(s);
657 	if (bp->b_bufsize) {
658 		allocbuf(bp, 0);
659 	}
660 	bp->b_flags = B_BUSY;
661 	bp->b_dev = NODEV;
662 	bp->b_vp = NULL;
663 	bp->b_blkno = bp->b_lblkno = 0;
664 	bp->b_iodone = 0;
665 	bp->b_error = 0;
666 	bp->b_resid = 0;
667 	bp->b_bcount = 0;
668 	bp->b_npages = 0;
669 	bp->b_wcred = bp->b_rcred = NOCRED;
670 	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
671 	bp->b_dirtyoff = bp->b_dirtyend = 0;
672 	bp->b_validoff = bp->b_validend = 0;
673 	if (bufspace >= maxbufspace) {
674 		s = splbio();
675 		bp->b_flags |= B_INVAL;
676 		brelse(bp);
677 		goto trytofreespace;
678 	}
679 	return (bp);
680 }
681 
682 /*
683  * Check to see if a block is currently memory resident.
684  */
685 struct buf *
686 incore(struct vnode * vp, daddr_t blkno)
687 {
688 	struct buf *bp;
689 	struct bufhashhdr *bh;
690 
691 	int s = splbio();
692 
693 	bh = BUFHASH(vp, blkno);
694 	bp = bh->lh_first;
695 
696 	/* Search hash chain */
697 	while (bp) {
698 		/* hit */
699 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
700 		    (bp->b_flags & B_INVAL) == 0) {
701 			splx(s);
702 			return (bp);
703 		}
704 		bp = bp->b_hash.le_next;
705 	}
706 	splx(s);
707 
708 	return (0);
709 }
710 
711 /*
712  * Returns true if no I/O is needed to access the
713  * associated VM object.  This is like incore except
714  * it also hunts around in the VM system for the data.
715  */
716 
717 int
718 inmem(struct vnode * vp, daddr_t blkno)
719 {
720 	vm_object_t obj;
721 	vm_offset_t off, toff, tinc;
722 	vm_page_t m;
723 
724 	if (incore(vp, blkno))
725 		return 1;
726 	if (vp->v_mount == 0)
727 		return 0;
728 	if ((vp->v_vmdata == 0) || (vp->v_flag & VVMIO) == 0)
729 		return 0;
730 
731 	obj = (vm_object_t) vp->v_vmdata;
732 	tinc = PAGE_SIZE;
733 	if (tinc > vp->v_mount->mnt_stat.f_iosize)
734 		tinc = vp->v_mount->mnt_stat.f_iosize;
735 	off = blkno * vp->v_mount->mnt_stat.f_iosize;
736 
737 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
738 		int mask;
739 
740 		m = vm_page_lookup(obj, trunc_page(toff + off));
741 		if (!m)
742 			return 0;
743 		if (vm_page_is_valid(m, toff + off, tinc) == 0)
744 			return 0;
745 	}
746 	return 1;
747 }
748 
749 /*
750  * Set the dirty range for the buffer --
751  * for NFS: if the file is mapped and pages have
752  * been written to, let it know.  We want the
753  * entire range of the buffer to be marked dirty if
754  * any of the pages have been written to, for consistency
755  * with the b_validoff, b_validend set in the NFS write
756  * code and used by the NFS read code.
757  */
758 static void
759 vfs_setdirty(struct buf *bp) {
760 	int i;
761 	vm_object_t object;
762 	vm_offset_t boffset, offset;
763 	/*
764 	 * We qualify the scan for modified pages on whether the
765 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
766 	 * is not cleared simply by protecting pages off.
767 	 */
768 	if ((bp->b_flags & B_VMIO) &&
769 		((object = bp->b_pages[0]->object)->flags & OBJ_WRITEABLE)) {
770 		/*
771 		 * test the pages to see if they have been modified directly
772 		 * by users through the VM system.
773 		 */
774 		for (i = 0; i < bp->b_npages; i++)
775 			vm_page_test_dirty(bp->b_pages[i]);
776 
777 		/*
778 		 * scan forwards for the first page modified
779 		 */
780 		for (i = 0; i < bp->b_npages; i++) {
781 			if (bp->b_pages[i]->dirty) {
782 				break;
783 			}
784 		}
785 		boffset = i * PAGE_SIZE;
786 		if (boffset < bp->b_dirtyoff) {
787 			bp->b_dirtyoff = boffset;
788 		}
789 
790 		/*
791 		 * scan backwards for the last page modified
792 		 */
793 		for (i = bp->b_npages - 1; i >= 0; --i) {
794 			if (bp->b_pages[i]->dirty) {
795 				break;
796 			}
797 		}
798 		boffset = (i + 1) * PAGE_SIZE;
799 		offset = boffset + bp->b_pages[0]->offset;
800 		if (offset >= object->size) {
801 			boffset = object->size - bp->b_pages[0]->offset;
802 		}
803 		if (bp->b_dirtyend < boffset) {
804 			bp->b_dirtyend = boffset;
805 		}
806 	}
807 }
808 
809 /*
810  * Get a block given a specified block and offset into a file/device.
811  */
812 struct buf *
813 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
814 {
815 	struct buf *bp;
816 	int s;
817 	struct bufhashhdr *bh;
818 	vm_offset_t off;
819 	int nleft;
820 
821 	s = splbio();
822 loop:
823 	if ((bp = incore(vp, blkno)) != NULL) {
824 		if (bp->b_flags & B_BUSY) {
825 			bp->b_flags |= B_WANTED;
826 			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
827 				goto loop;
828 
829 			splx(s);
830 			return (struct buf *) NULL;
831 		}
832 		bp->b_flags |= B_BUSY | B_CACHE;
833 		bremfree(bp);
834 		/*
835 		 * check for size inconsistencies
836 		 */
837 		if (bp->b_bcount != size) {
838 #if defined(VFS_BIO_DEBUG)
839 			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
840 #endif
841 			bp->b_flags |= B_NOCACHE;
842 			(void) VOP_BWRITE(bp);
843 			goto loop;
844 		}
845 		splx(s);
846 		return (bp);
847 	} else {
848 		vm_object_t obj;
849 		int doingvmio;
850 
851 		if ((obj = (vm_object_t) vp->v_vmdata) && (vp->v_flag & VVMIO)) {
852 			doingvmio = 1;
853 		} else {
854 			doingvmio = 0;
855 		}
856 		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
857 			if (slpflag || slptimeo)
858 				return NULL;
859 			goto loop;
860 		}
861 
862 		/*
863 		 * This code is used to make sure that a buffer is not
864 		 * created while the getnewbuf routine is blocked.
865 		 * Normally the vnode is locked so this isn't a problem.
866 		 * VBLK type I/O requests, however, don't lock the vnode.
867 		 * VOP_ISLOCKED would be much better but is also much
868 		 * slower.
869 		 */
870 		if ((vp->v_type == VBLK) && incore(vp, blkno)) {
871 			bp->b_flags |= B_INVAL;
872 			brelse(bp);
873 			goto loop;
874 		}
875 
876 		/*
877 		 * Insert the buffer into the hash, so that it can
878 		 * be found by incore.
879 		 */
880 		bp->b_blkno = bp->b_lblkno = blkno;
881 		bgetvp(vp, bp);
882 		LIST_REMOVE(bp, b_hash);
883 		bh = BUFHASH(vp, blkno);
884 		LIST_INSERT_HEAD(bh, bp, b_hash);
885 
886 		if (doingvmio) {
887 			bp->b_flags |= (B_VMIO | B_CACHE);
888 #if defined(VFS_BIO_DEBUG)
889 			if (vp->v_type != VREG)
890 				printf("getblk: vmioing file type %d???\n", vp->v_type);
891 #endif
892 		} else {
893 			bp->b_flags &= ~B_VMIO;
894 		}
895 		splx(s);
896 
897 		allocbuf(bp, size);
898 		return (bp);
899 	}
900 }
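
/*
 * A common write pattern built on getblk() (illustrative sketch only):
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0);
 *	... fill in bp->b_data ...
 *	bdwrite(bp);	(delayed write -- or bwrite(bp) for a synchronous one)
 */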
901 
902 /*
903  * Get an empty, disassociated buffer of given size.
904  */
905 struct buf *
906 geteblk(int size)
907 {
908 	struct buf *bp;
909 
910 	while ((bp = getnewbuf(0, 0, 0)) == 0);
911 	allocbuf(bp, size);
912 	bp->b_flags |= B_INVAL;
913 	return (bp);
914 }
915 
916 /*
917  * This code constitutes the buffer memory from either anonymous system
918  * memory (in the case of non-VMIO operations) or from an associated
919  * VM object (in the case of VMIO operations).
920  *
921  * Note that this code is tricky, and has many complications to resolve
922  * deadlock or inconsistent data situations.  Tread lightly!!!
923  *
924  * Modify the length of a buffer's underlying buffer storage without
925  * destroying information (unless, of course, the buffer is shrinking).
926  */
927 int
928 allocbuf(struct buf * bp, int size)
929 {
930 
931 	int s;
932 	int newbsize, mbsize;
933 	int i;
934 
935 	if ((bp->b_flags & B_VMIO) == 0) {
936 		/*
937 		 * Just get anonymous memory from the kernel
938 		 */
939 		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
940 		newbsize = round_page(size);
941 
942 		if (newbsize == bp->b_bufsize) {
943 			bp->b_bcount = size;
944 			return 1;
945 		} else if (newbsize < bp->b_bufsize) {
946 			vm_hold_free_pages(
947 			    bp,
948 			    (vm_offset_t) bp->b_data + newbsize,
949 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
950 			bufspace -= (bp->b_bufsize - newbsize);
951 		} else if (newbsize > bp->b_bufsize) {
952 			vm_hold_load_pages(
953 			    bp,
954 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
955 			    (vm_offset_t) bp->b_data + newbsize);
956 			bufspace += (newbsize - bp->b_bufsize);
957 		}
958 	} else {
959 		vm_page_t m;
960 		int desiredpages;
961 
962 		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
963 		desiredpages = round_page(newbsize) / PAGE_SIZE;
964 
965 		if (newbsize == bp->b_bufsize) {
966 			bp->b_bcount = size;
967 			return 1;
968 		} else if (newbsize < bp->b_bufsize) {
969 			if (desiredpages < bp->b_npages) {
970 				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
971 				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
972 				for (i = desiredpages; i < bp->b_npages; i++) {
973 					m = bp->b_pages[i];
974 					s = splhigh();
975 					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
976 						m->flags |= PG_WANTED;
977 						tsleep(m, PVM, "biodep", 0);
978 					}
979 					splx(s);
980 
981 					if (m->bmapped == 0) {
982 						printf("allocbuf: bmapped is zero for page %d\n", i);
983 						panic("allocbuf: error");
984 					}
985 					--m->bmapped;
986 					if (m->bmapped == 0) {
987 						vm_page_protect(m, VM_PROT_NONE);
988 						vm_page_free(m);
989 					}
990 					bp->b_pages[i] = NULL;
991 				}
992 				bp->b_npages = desiredpages;
993 				bufspace -= (bp->b_bufsize - newbsize);
994 			}
995 		} else {
996 			vm_object_t obj;
997 			vm_offset_t tinc, off, toff, objoff;
998 			int pageindex, curbpnpages;
999 			struct vnode *vp;
1000 			int bsize;
1001 
1002 			vp = bp->b_vp;
1003 			bsize = vp->v_mount->mnt_stat.f_iosize;
1004 
1005 			if (bp->b_npages < desiredpages) {
1006 				obj = (vm_object_t) vp->v_vmdata;
1007 				tinc = PAGE_SIZE;
1008 				if (tinc > bsize)
1009 					tinc = bsize;
1010 				off = bp->b_lblkno * bsize;
1011 				curbpnpages = bp->b_npages;
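				/*
				 * Walk the buffer's range a page at a time.  Pages already
				 * attached to the buffer are revalidated; the rest are looked
				 * up in (or allocated from) the backing VM object.  If an
				 * allocation fails or a busy page forces a sleep, the scan
				 * restarts from the buffer's current page count (doretry).
				 */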
1012 		doretry:
1013 				bp->b_flags |= B_CACHE;
1014 				for (toff = 0; toff < newbsize; toff += tinc) {
1015 					int mask;
1016 					int bytesinpage;
1017 
1018 					pageindex = toff / PAGE_SIZE;
1019 					objoff = trunc_page(toff + off);
1020 					if (pageindex < curbpnpages) {
1021 						int pb;
1022 
1023 						m = bp->b_pages[pageindex];
1024 						if (m->offset != objoff)
1025 							panic("allocbuf: page changed offset??!!!?");
1026 						bytesinpage = tinc;
1027 						if (tinc > (newbsize - toff))
1028 							bytesinpage = newbsize - toff;
1029 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1030 							bp->b_flags &= ~B_CACHE;
1031 						}
1032 						if ((m->flags & PG_ACTIVE) == 0) {
1033 							vm_page_activate(m);
1034 							m->act_count = 0;
1035 						}
1036 						continue;
1037 					}
1038 					m = vm_page_lookup(obj, objoff);
1039 					if (!m) {
1040 						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1041 						if (!m) {
1042 							int j;
1043 
1044 							for (j = bp->b_npages; j < pageindex; j++) {
1045 								PAGE_WAKEUP(bp->b_pages[j]);
1046 							}
1047 							VM_WAIT;
1048 							curbpnpages = bp->b_npages;
1049 							goto doretry;
1050 						}
1051 						vm_page_activate(m);
1052 						m->act_count = 0;
1053 						m->valid = 0;
1054 					} else if (m->flags & PG_BUSY) {
1055 						int j;
1056 
1057 						for (j = bp->b_npages; j < pageindex; j++) {
1058 							PAGE_WAKEUP(bp->b_pages[j]);
1059 						}
1060 
1061 						s = splbio();
1062 						m->flags |= PG_WANTED;
1063 						tsleep(m, PRIBIO, "pgtblk", 0);
1064 						splx(s);
1065 
1066 						curbpnpages = bp->b_npages;
1067 						goto doretry;
1068 					} else {
1069 						int pb;
1070 						if ((curproc != pageproc) &&
1071 							(m->flags & PG_CACHE) &&
1072 						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1073 							pagedaemon_wakeup();
1074 						}
1075 						bytesinpage = tinc;
1076 						if (tinc > (newbsize - toff))
1077 							bytesinpage = newbsize - toff;
1078 						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1079 							bp->b_flags &= ~B_CACHE;
1080 						}
1081 						if ((m->flags & PG_ACTIVE) == 0) {
1082 							vm_page_activate(m);
1083 							m->act_count = 0;
1084 						}
1085 						m->flags |= PG_BUSY;
1086 					}
1087 					bp->b_pages[pageindex] = m;
1088 					curbpnpages = pageindex + 1;
1089 				}
1090 				if (bsize >= PAGE_SIZE) {
1091 					for (i = bp->b_npages; i < curbpnpages; i++) {
1092 						m = bp->b_pages[i];
1093 						if (m->valid == 0) {
1094 							bp->b_flags &= ~B_CACHE;
1095 						}
1096 						m->bmapped++;
1097 						PAGE_WAKEUP(m);
1098 					}
1099 				} else {
1100 					if (!vm_page_is_valid(bp->b_pages[0], off, bsize))
1101 						bp->b_flags &= ~B_CACHE;
1102 					bp->b_pages[0]->bmapped++;
1103 					PAGE_WAKEUP(bp->b_pages[0]);
1104 				}
1105 				bp->b_npages = curbpnpages;
1106 				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1107 				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1108 				bp->b_data += off % PAGE_SIZE;
1109 			}
1110 			bufspace += (newbsize - bp->b_bufsize);
1111 		}
1112 	}
1113 	bp->b_bufsize = newbsize;
1114 	bp->b_bcount = size;
1115 	return 1;
1116 }
1117 
1118 /*
1119  * Wait for buffer I/O completion, returning error status.
1120  */
1121 int
1122 biowait(register struct buf * bp)
1123 {
1124 	int s;
1125 
1126 	s = splbio();
1127 	while ((bp->b_flags & B_DONE) == 0)
1128 		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
1129 	splx(s);
1130 	if (bp->b_flags & B_EINTR) {
1131 		bp->b_flags &= ~B_EINTR;
1132 		return (EINTR);
1133 	}
1134 	if (bp->b_flags & B_ERROR) {
1135 		return (bp->b_error ? bp->b_error : EIO);
1136 	} else {
1137 		return (0);
1138 	}
1139 }
1140 
1141 /*
1142  * Finish I/O on a buffer, calling an optional function.
1143  * This is usually called from interrupt level, so process blocking
1144  * is not *a good idea*.
1145  */
1146 void
1147 biodone(register struct buf * bp)
1148 {
1149 	int s;
1150 
1151 	s = splbio();
1152 	if (bp->b_flags & B_DONE) {
1153 		splx(s);
1154 		printf("biodone: buffer already done\n");
1155 		return;
1156 	}
1157 	bp->b_flags |= B_DONE;
1158 
1159 	if ((bp->b_flags & B_READ) == 0) {
1160 		struct vnode *vp = bp->b_vp;
1161 		vwakeup(bp);
1162 	}
1163 #ifdef BOUNCE_BUFFERS
1164 	if (bp->b_flags & B_BOUNCE)
1165 		vm_bounce_free(bp);
1166 #endif
1167 
1168 	/* call optional completion function if requested */
1169 	if (bp->b_flags & B_CALL) {
1170 		bp->b_flags &= ~B_CALL;
1171 		(*bp->b_iodone) (bp);
1172 		splx(s);
1173 		return;
1174 	}
1175 	if (bp->b_flags & B_VMIO) {
1176 		int i, resid;
1177 		vm_offset_t foff;
1178 		vm_page_t m;
1179 		vm_object_t obj;
1180 		int iosize;
1181 		struct vnode *vp = bp->b_vp;
1182 
1183 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1184 		obj = (vm_object_t) vp->v_vmdata;
1185 		if (!obj) {
			splx(s);
1186 			return;
1187 		}
1188 #if defined(VFS_BIO_DEBUG)
1189 		if (obj->paging_in_progress < bp->b_npages) {
1190 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1191 			    obj->paging_in_progress, bp->b_npages);
1192 		}
1193 #endif
1194 		iosize = bp->b_bufsize;
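		/*
		 * Walk the pages covered by this buffer: restore any bogus_page
		 * entries to the real pages, mark the data valid and clean for
		 * reads, and drop the per-page busy count and the object's
		 * paging_in_progress count.
		 */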
1195 		for (i = 0; i < bp->b_npages; i++) {
1196 			int bogusflag = 0;
1197 			m = bp->b_pages[i];
1198 			if (m == bogus_page) {
1199 				bogusflag = 1;
1200 				m = vm_page_lookup(obj, foff);
1201 				if (!m) {
1202 #if defined(VFS_BIO_DEBUG)
1203 					printf("biodone: page disappeared\n");
1204 #endif
1205 					--obj->paging_in_progress;
1206 					continue;
1207 				}
1208 				bp->b_pages[i] = m;
1209 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1210 			}
1211 #if defined(VFS_BIO_DEBUG)
1212 			if (trunc_page(foff) != m->offset) {
1213 				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1214 			}
1215 #endif
1216 			resid = (m->offset + PAGE_SIZE) - foff;
1217 			if (resid > iosize)
1218 				resid = iosize;
1219 			/*
1220 			 * In the write case, the valid and clean bits are
1221 			 * already changed correctly, so we only need to do this
1222 			 * here in the read case.
1223 			 */
1224 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1225 				vm_page_set_valid(m, foff & (PAGE_SIZE-1), resid);
1226 				vm_page_set_clean(m, foff & (PAGE_SIZE-1), resid);
1227 			}
1228 
1229 			/*
1230 			 * When debugging new filesystems or buffer I/O methods, this
1231 			 * is the most common error that pops up.  If you see this, you
1232 			 * have not set the page busy flag correctly!!!
1233 			 */
1234 			if (m->busy == 0) {
1235 				printf("biodone: page busy < 0, "
1236 				    "off: %ld, foff: %ld, "
1237 				    "resid: %d, index: %d\n",
1238 				    m->offset, foff, resid, i);
1239 				printf(" iosize: %ld, lblkno: %ld\n",
1240 				    bp->b_vp->v_mount->mnt_stat.f_iosize,
1241 				    bp->b_lblkno);
1242 				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1243 				    m->valid, m->dirty, m->bmapped);
1244 				panic("biodone: page busy < 0\n");
1245 			}
1246 			--m->busy;
1247 			if ((m->busy == 0) && (m->flags & PG_WANTED))
1248 				wakeup((caddr_t) m);
1249 			--obj->paging_in_progress;
1250 			foff += resid;
1251 			iosize -= resid;
1252 		}
1253 		if (obj && obj->paging_in_progress == 0 &&
1254 		    (obj->flags & OBJ_PIPWNT)) {
1255 			obj->flags &= ~OBJ_PIPWNT;
1256 			wakeup((caddr_t) obj);
1257 		}
1258 	}
1259 	/*
1260 	 * For asynchronous completions, release the buffer now.  brelse
1261 	 * checks for B_WANTED and will do the wakeup there if necessary --
1262 	 * so there is no need to do a wakeup here in the async case.
1263 	 */
1264 
1265 	if (bp->b_flags & B_ASYNC) {
1266 		brelse(bp);
1267 	} else {
1268 		bp->b_flags &= ~B_WANTED;
1269 		wakeup((caddr_t) bp);
1270 	}
1271 	splx(s);
1272 }
1273 
1274 int
1275 count_lock_queue()
1276 {
1277 	int count;
1278 	struct buf *bp;
1279 
1280 	count = 0;
1281 	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1282 	    bp != NULL;
1283 	    bp = bp->b_freelist.tqe_next)
1284 		count++;
1285 	return (count);
1286 }
1287 
1288 int vfs_update_interval = 30;
1289 
1290 void
1291 vfs_update()
1292 {
1293 	(void) spl0();
1294 	while (1) {
1295 		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
1296 		    hz * vfs_update_interval);
1297 		vfs_update_wakeup = 0;
1298 		sync(curproc, NULL, NULL);
1299 	}
1300 }
1301 
1302 /*
1303  * This routine is called in lieu of iodone in the case of
1304  * incomplete I/O.  This keeps the busy status for pages
1305  * consistent.
1306  */
1307 void
1308 vfs_unbusy_pages(struct buf * bp)
1309 {
1310 	int i;
1311 
1312 	if (bp->b_flags & B_VMIO) {
1313 		struct vnode *vp = bp->b_vp;
1314 		vm_object_t obj = (vm_object_t) vp->v_vmdata;
1315 		vm_offset_t foff;
1316 
1317 		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1318 
1319 		for (i = 0; i < bp->b_npages; i++) {
1320 			vm_page_t m = bp->b_pages[i];
1321 
1322 			if (m == bogus_page) {
1323 				m = vm_page_lookup(obj, foff);
1324 				if (!m) {
1325 					panic("vfs_unbusy_pages: page missing\n");
1326 				}
1327 				bp->b_pages[i] = m;
1328 				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1329 			}
1330 			--obj->paging_in_progress;
1331 			--m->busy;
1332 			if ((m->busy == 0) && (m->flags & PG_WANTED))
1333 				wakeup((caddr_t) m);
1334 		}
1335 		if (obj->paging_in_progress == 0 &&
1336 		    (obj->flags & OBJ_PIPWNT)) {
1337 			obj->flags &= ~OBJ_PIPWNT;
1338 			wakeup((caddr_t) obj);
1339 		}
1340 	}
1341 }
1342 
1343 /*
1344  * This routine is called before a device strategy routine.
1345  * It is used to tell the VM system that paging I/O is in
1346  * progress, and to treat the pages associated with the buffer
1347  * almost as being PG_BUSY.  The object's paging_in_progress
1348  * count is also updated to make sure that the object doesn't
1349  * become inconsistent.
1350  */
1351 void
1352 vfs_busy_pages(struct buf * bp, int clear_modify)
1353 {
1354 	int i;
1355 
1356 	if (bp->b_flags & B_VMIO) {
1357 		vm_object_t obj = (vm_object_t) bp->b_vp->v_vmdata;
1358 		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1359 		int iocount = bp->b_bufsize;
1360 
1361 		vfs_setdirty(bp);
1362 		for (i = 0; i < bp->b_npages; i++) {
1363 			vm_page_t m = bp->b_pages[i];
1364 			int resid = (m->offset + PAGE_SIZE) - foff;
1365 
1366 			if (resid > iocount)
1367 				resid = iocount;
1368 			obj->paging_in_progress++;
1369 			m->busy++;
1370 			if (clear_modify) {
1371 				vm_page_protect(m, VM_PROT_READ);
1372 				vm_page_set_valid(m,
1373 					foff & (PAGE_SIZE-1), resid);
1374 				vm_page_set_clean(m,
1375 					foff & (PAGE_SIZE-1), resid);
1376 			} else if (bp->b_bcount >= PAGE_SIZE) {
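				/*
				 * For a read into a buffer that is not fully cached,
				 * substitute bogus_page for any page that already has
				 * valid data, so the device I/O cannot clobber it.  The
				 * real page is restored in biodone()/vfs_unbusy_pages().
				 */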
1377 				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1378 					bp->b_pages[i] = bogus_page;
1379 					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1380 				}
1381 			}
1382 			foff += resid;
1383 			iocount -= resid;
1384 		}
1385 	}
1386 }
1387 
1388 /*
1389  * Tell the VM system that the pages associated with this buffer
1390  * are clean.  This is used for delayed writes where the data is
1391  * going to go to disk eventually without additional VM intervention.
1392  */
1393 void
1394 vfs_clean_pages(struct buf * bp)
1395 {
1396 	int i;
1397 
1398 	if (bp->b_flags & B_VMIO) {
1399 		vm_offset_t foff =
1400 			bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1401 		int iocount = bp->b_bufsize;
1402 
1403 		for (i = 0; i < bp->b_npages; i++) {
1404 			vm_page_t m = bp->b_pages[i];
1405 			int resid = (m->offset + PAGE_SIZE) - foff;
1406 
1407 			if (resid > iocount)
1408 				resid = iocount;
1409 			if (resid > 0) {
1410 				vm_page_set_valid(m,
1411 					foff & (PAGE_SIZE-1), resid);
1412 				vm_page_set_clean(m,
1413 					foff & (PAGE_SIZE-1), resid);
1414 			}
1415 			foff += resid;
1416 			iocount -= resid;
1417 		}
1418 	}
1419 }
1420 
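/*
 * Clear a buffer's data area, zeroing only those portions that are not
 * already valid in the underlying VM pages (VMIO case).  Non-VMIO buffers
 * are simply cleared with clrbuf().
 */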
1421 void
1422 vfs_bio_clrbuf(struct buf *bp) {
1423 	int i;
1424 	if (bp->b_flags & B_VMIO) {
1425 		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1426 			int j;
1427 			if (bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
1428 				for (j = 0; j < bp->b_bufsize / DEV_BSIZE; j++) {
1429 					bzero(bp->b_data + j * DEV_BSIZE, DEV_BSIZE);
1430 				}
1431 			}
1432 			bp->b_resid = 0;
1433 			return;
1434 		}
1435 		for (i = 0; i < bp->b_npages; i++) {
1436 			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1437 				continue;
1438 			if (bp->b_pages[i]->valid == 0) {
1439 				bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
1440 			} else {
1441 				int j;
1442 				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
1443 					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
1444 						bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
1445 				}
1446 			}
1447 			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1448 		}
1449 		bp->b_resid = 0;
1450 	} else {
1451 		clrbuf(bp);
1452 	}
1453 }
1454 
1455 /*
1456  * vm_hold_load_pages and vm_hold_free_pages get pages into
1457  * a buffer's address space.  The pages are anonymous and are
1458  * not associated with a file object.
1459  */
1460 void
1461 vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1462 {
1463 	vm_offset_t pg;
1464 	vm_page_t p;
1465 	vm_offset_t from = round_page(froma);
1466 	vm_offset_t to = round_page(toa);
1467 
1468 	for (pg = from; pg < to; pg += PAGE_SIZE) {
1469 
1470 tryagain:
1471 
1472 		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1473 		    VM_ALLOC_NORMAL);
1474 		if (!p) {
1475 			VM_WAIT;
1476 			goto tryagain;
1477 		}
1478 		vm_page_wire(p);
1479 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1480 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1481 		PAGE_WAKEUP(p);
1482 		bp->b_npages++;
1483 	}
1484 }
1485 
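/*
 * Release the anonymous pages backing the given kva range of the buffer
 * and remove their kernel mappings.
 */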
1486 void
1487 vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1488 {
1489 	vm_offset_t pg;
1490 	vm_page_t p;
1491 	vm_offset_t from = round_page(froma);
1492 	vm_offset_t to = round_page(toa);
1493 
1494 	for (pg = from; pg < to; pg += PAGE_SIZE) {
1495 		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1496 		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1497 		pmap_kremove(pg);
1498 		vm_page_free(p);
1499 		--bp->b_npages;
1500 	}
1501 }
1502