xref: /freebsd/sys/kern/vfs_bio.c (revision a14a0223ae1b172e96dd2a1d849e22026a98b692)
1 /*
2  * Copyright (c) 1994,1997 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Absolutely no warranty of function or purpose is made by the author
12  *		John S. Dyson.
13  *
14  * $FreeBSD$
15  */
16 
17 /*
18  * this file contains a new buffer I/O scheme implementing a coherent
19  * VM object and buffer cache scheme.  Pains have been taken to make
20  * sure that the performance degradation associated with schemes such
21  * as this is not realized.
22  *
23  * Author:  John S. Dyson
24  * Significant help during the development and debugging phases
25  * had been provided by David Greenman, also of the FreeBSD core team.
26  *
27  * see man buf(9) for more info.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/sysctl.h>
34 #include <sys/proc.h>
35 #include <sys/kthread.h>
36 #include <sys/vnode.h>
37 #include <sys/vmmeter.h>
38 #include <sys/lock.h>
39 #include <vm/vm.h>
40 #include <vm/vm_param.h>
41 #include <vm/vm_kern.h>
42 #include <vm/vm_pageout.h>
43 #include <vm/vm_page.h>
44 #include <vm/vm_object.h>
45 #include <vm/vm_extern.h>
46 #include <vm/vm_map.h>
47 #include <sys/buf.h>
48 #include <sys/mount.h>
49 #include <sys/malloc.h>
50 #include <sys/resourcevar.h>
51 #include <sys/conf.h>
52 
53 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
54 
55 struct	bio_ops bioops;		/* I/O operation notification */
56 
57 struct buf *buf;		/* buffer header pool */
58 struct swqueue bswlist;
59 
60 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
61 		vm_offset_t to);
62 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
63 		vm_offset_t to);
64 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
65 			       int pageno, vm_page_t m);
66 static void vfs_clean_pages(struct buf * bp);
67 static void vfs_setdirty(struct buf *bp);
68 static void vfs_vmio_release(struct buf *bp);
69 static int flushbufqueues(void);
70 
71 static int bd_request;
72 
73 static void buf_daemon __P((void));
74 /*
75  * bogus page -- for I/O to/from partially complete buffers
76  * this is a temporary solution to the problem, but it is not
77  * really that bad.  it would be better to split the buffer
78  * for input in the case of buffers partially already in memory,
79  * but the code is intricate enough already.
80  */
81 vm_page_t bogus_page;
82 int runningbufspace;
83 int vmiodirenable = FALSE;
84 int buf_maxio = DFLTPHYS;
85 static vm_offset_t bogus_offset;
86 
87 static int bufspace, maxbufspace, vmiospace,
88 	bufmallocspace, maxbufmallocspace, hibufspace;
89 static int maxbdrun;
90 static int needsbuffer;
91 static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
92 static int numfreebuffers, lofreebuffers, hifreebuffers;
93 static int getnewbufcalls;
94 static int getnewbufrestarts;
95 static int kvafreespace;
96 
97 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
98 	&numdirtybuffers, 0, "");
99 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
100 	&lodirtybuffers, 0, "");
101 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
102 	&hidirtybuffers, 0, "");
103 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
104 	&numfreebuffers, 0, "");
105 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
106 	&lofreebuffers, 0, "");
107 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
108 	&hifreebuffers, 0, "");
109 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
110 	&runningbufspace, 0, "");
111 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
112 	&maxbufspace, 0, "");
113 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
114 	&hibufspace, 0, "");
115 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
116 	&bufspace, 0, "");
117 SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
118 	&maxbdrun, 0, "");
119 SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
120 	&vmiospace, 0, "");
121 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
122 	&maxbufmallocspace, 0, "");
123 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
124 	&bufmallocspace, 0, "");
125 SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
126 	&kvafreespace, 0, "");
127 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
128 	&getnewbufcalls, 0, "");
129 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
130 	&getnewbufrestarts, 0, "");
131 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
132 	&vmiodirenable, 0, "");
133 
134 
135 static int bufhashmask;
136 static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
137 struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
138 char *buf_wmesg = BUF_WMESG;
139 
140 extern int vm_swap_size;
141 
142 #define BUF_MAXUSE		24
143 
144 #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
145 #define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
146 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
147 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
148 #define VFS_BIO_NEED_KVASPACE	0x10	/* wait for buffer_map space, emerg  */
149 
150 /*
151  * Buffer hash table code.  Note that the logical block scans linearly, which
152  * gives us some L1 cache locality.
153  */
154 
155 static __inline
156 struct bufhashhdr *
157 bufhash(struct vnode *vnp, daddr_t bn)
158 {
159 	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
160 }
161 
162 /*
163  *	kvaspacewakeup:
164  *
165  *	Called when kva space is potential available for recovery or when
166  *	kva space is recovered in the buffer_map.  This function wakes up
167  *	anyone waiting for buffer_map kva space.  Even though the buffer_map
168  *	is larger then maxbufspace, this situation will typically occur
169  *	when the buffer_map gets fragmented.
170  */
171 
172 static __inline void
173 kvaspacewakeup(void)
174 {
175 	/*
176 	 * If someone is waiting for KVA space, wake them up.  Even
177 	 * though we haven't freed the kva space yet, the waiting
178 	 * process will be able to now.
179 	 */
180 	if (needsbuffer & VFS_BIO_NEED_KVASPACE) {
181 		needsbuffer &= ~VFS_BIO_NEED_KVASPACE;
182 		wakeup(&needsbuffer);
183 	}
184 }
185 
186 /*
187  *	numdirtywakeup:
188  *
189  *	If someone is blocked due to there being too many dirty buffers,
190  *	and numdirtybuffers is now reasonable, wake them up.
191  */
192 
193 static __inline void
194 numdirtywakeup(void)
195 {
196 	if (numdirtybuffers < hidirtybuffers) {
197 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
198 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
199 			wakeup(&needsbuffer);
200 		}
201 	}
202 }
203 
204 /*
205  *	bufspacewakeup:
206  *
207  *	Called when buffer space is potentially available for recovery or when
208  *	buffer space is recovered.  getnewbuf() will block on this flag when
209  *	it is unable to free sufficient buffer space.  Buffer space becomes
210  *	recoverable when bp's get placed back in the queues.
211  */
212 
213 static __inline void
214 bufspacewakeup(void)
215 {
216 	/*
217 	 * If someone is waiting for BUF space, wake them up.  Even
218 	 * though we haven't freed the kva space yet, the waiting
219 	 * process will be able to now.
220 	 */
221 	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
222 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
223 		wakeup(&needsbuffer);
224 	}
225 }
226 
227 /*
228  *	bufcountwakeup:
229  *
230  *	Called when a buffer has been added to one of the free queues to
231  *	account for the buffer and to wakeup anyone waiting for free buffers.
232  *	This typically occurs when large amounts of metadata are being handled
233  *	by the buffer cache ( else buffer space runs out first, usually ).
234  */
235 
236 static __inline void
237 bufcountwakeup(void)
238 {
239 	++numfreebuffers;
240 	if (needsbuffer) {
241 		needsbuffer &= ~VFS_BIO_NEED_ANY;
242 		if (numfreebuffers >= hifreebuffers)
243 			needsbuffer &= ~VFS_BIO_NEED_FREE;
244 		wakeup(&needsbuffer);
245 	}
246 }
247 
248 /*
249  *	vfs_buf_test_cache:
250  *
251  *	Called when a buffer is extended.  This function clears the B_CACHE
252  *	bit if the newly extended portion of the buffer does not contain
253  *	valid data.
254  */
255 static __inline__
256 void
257 vfs_buf_test_cache(struct buf *bp,
258 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
259 		  vm_page_t m)
260 {
261 	if (bp->b_flags & B_CACHE) {
262 		int base = (foff + off) & PAGE_MASK;
263 		if (vm_page_is_valid(m, base, size) == 0)
264 			bp->b_flags &= ~B_CACHE;
265 	}
266 }
267 
268 static __inline__
269 void
270 bd_wakeup(int dirtybuflevel)
271 {
272 	if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
273 		bd_request = 1;
274 		wakeup(&bd_request);
275 	}
276 }
277 
278 
279 /*
280  * Initialize buffer headers and related structures.
281  */
282 
283 caddr_t
284 bufhashinit(caddr_t vaddr)
285 {
286 	/* first, make a null hash table */
287 	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
288 		;
289 	bufhashtbl = (void *)vaddr;
290 	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
291 	--bufhashmask;
292 	return(vaddr);
293 }
294 
295 void
296 bufinit(void)
297 {
298 	struct buf *bp;
299 	int i;
300 
301 	TAILQ_INIT(&bswlist);
302 	LIST_INIT(&invalhash);
303 	simple_lock_init(&buftimelock);
304 
305 	for (i = 0; i <= bufhashmask; i++)
306 		LIST_INIT(&bufhashtbl[i]);
307 
308 	/* next, make a null set of free lists */
309 	for (i = 0; i < BUFFER_QUEUES; i++)
310 		TAILQ_INIT(&bufqueues[i]);
311 
312 	/* finally, initialize each buffer header and stick on empty q */
313 	for (i = 0; i < nbuf; i++) {
314 		bp = &buf[i];
315 		bzero(bp, sizeof *bp);
316 		bp->b_flags = B_INVAL;	/* we're just an empty header */
317 		bp->b_dev = NODEV;
318 		bp->b_rcred = NOCRED;
319 		bp->b_wcred = NOCRED;
320 		bp->b_qindex = QUEUE_EMPTY;
321 		bp->b_xflags = 0;
322 		LIST_INIT(&bp->b_dep);
323 		BUF_LOCKINIT(bp);
324 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
325 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
326 	}
327 
328 	/*
329 	 * maxbufspace is currently calculated to be maximally efficient
330 	 * when the filesystem block size is DFLTBSIZE or DFLTBSIZE*2
331 	 * (4K or 8K).  To reduce the number of stall points our calculation
332 	 * is based on DFLTBSIZE which should reduce the chances of actually
333 	 * running out of buffer headers.  The maxbufspace calculation is also
334 	 * based on DFLTBSIZE (4K) instead of BKVASIZE (8K) in order to
335 	 * reduce the chance that a KVA allocation will fail due to
336 	 * fragmentation.  While this does not usually create a stall,
337 	 * the KVA map allocation/free functions are O(N) rather then O(1)
338 	 * so running them constantly would result in inefficient O(N*M)
339 	 * buffer cache operation.
340 	 */
341 	maxbufspace = (nbuf + 8) * DFLTBSIZE;
342 	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 5);
343 /*
344  * Limit the amount of malloc memory since it is wired permanently into
345  * the kernel space.  Even though this is accounted for in the buffer
346  * allocation, we don't want the malloced region to grow uncontrolled.
347  * The malloc scheme improves memory utilization significantly on average
348  * (small) directories.
349  */
350 	maxbufmallocspace = hibufspace / 20;
351 
352 /*
353  * Reduce the chance of a deadlock occuring by limiting the number
354  * of delayed-write dirty buffers we allow to stack up.
355  */
356 	lodirtybuffers = nbuf / 7 + 10;
357 	hidirtybuffers = nbuf / 4 + 20;
358 	numdirtybuffers = 0;
359 /*
360  * To support extreme low-memory systems, make sure hidirtybuffers cannot
361  * eat up all available buffer space.  This occurs when our minimum cannot
362  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
363  * BKVASIZE'd (8K) buffers.  We also reduce buf_maxio in this case (used
364  * by the clustering code) in an attempt to further reduce the load on
365  * the buffer cache.
366  */
367 	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
368 		lodirtybuffers >>= 1;
369 		hidirtybuffers >>= 1;
370 		buf_maxio >>= 1;
371 	}
372 	if (lodirtybuffers < 2) {
373 		lodirtybuffers = 2;
374 		hidirtybuffers = 4;
375 	}
376 
377 	/*
378 	 * Temporary, BKVASIZE may be manipulated soon, make sure we don't
379 	 * do something illegal. XXX
380 	 */
381 #if BKVASIZE < MAXBSIZE
382 	if (buf_maxio < BKVASIZE * 2)
383 		buf_maxio = BKVASIZE * 2;
384 #else
385 	if (buf_maxio < MAXBSIZE)
386 		buf_maxio = MAXBSIZE;
387 #endif
388 
389 /*
390  * Try to keep the number of free buffers in the specified range,
391  * and give the syncer access to an emergency reserve.
392  */
393 	lofreebuffers = nbuf / 18 + 5;
394 	hifreebuffers = 2 * lofreebuffers;
395 	numfreebuffers = nbuf;
396 
397 /*
398  * Maximum number of async ops initiated per buf_daemon loop.  This is
399  * somewhat of a hack at the moment, we really need to limit ourselves
400  * based on the number of bytes of I/O in-transit that were initiated
401  * from buf_daemon.
402  */
403 	if ((maxbdrun = nswbuf / 4) < 4)
404 		maxbdrun = 4;
405 
406 	kvafreespace = 0;
407 
408 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
409 	bogus_page = vm_page_alloc(kernel_object,
410 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
411 			VM_ALLOC_NORMAL);
412 	cnt.v_wire_count++;
413 
414 }
415 
416 /*
417  * Free the kva allocation for a buffer
418  * Must be called only at splbio or higher,
419  *  as this is the only locking for buffer_map.
420  */
421 static void
422 bfreekva(struct buf * bp)
423 {
424 	if (bp->b_kvasize) {
425 		vm_map_delete(buffer_map,
426 		    (vm_offset_t) bp->b_kvabase,
427 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
428 		);
429 		bp->b_kvasize = 0;
430 		kvaspacewakeup();
431 	}
432 }
433 
434 /*
435  *	bremfree:
436  *
437  *	Remove the buffer from the appropriate free list.
438  */
439 void
440 bremfree(struct buf * bp)
441 {
442 	int s = splbio();
443 	int old_qindex = bp->b_qindex;
444 
445 	if (bp->b_qindex != QUEUE_NONE) {
446 		if (bp->b_qindex == QUEUE_EMPTYKVA) {
447 			kvafreespace -= bp->b_kvasize;
448 		}
449 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
450 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
451 		bp->b_qindex = QUEUE_NONE;
452 		runningbufspace += bp->b_bufsize;
453 	} else {
454 #if !defined(MAX_PERF)
455 		if (BUF_REFCNT(bp) <= 1)
456 			panic("bremfree: removing a buffer not on a queue");
457 #endif
458 	}
459 
460 	/*
461 	 * Fixup numfreebuffers count.  If the buffer is invalid or not
462 	 * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
463 	 * the buffer was free and we must decrement numfreebuffers.
464 	 */
465 	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
466 		switch(old_qindex) {
467 		case QUEUE_DIRTY:
468 		case QUEUE_CLEAN:
469 		case QUEUE_EMPTY:
470 		case QUEUE_EMPTYKVA:
471 			--numfreebuffers;
472 			break;
473 		default:
474 			break;
475 		}
476 	}
477 	splx(s);
478 }
479 
480 
481 /*
482  * Get a buffer with the specified data.  Look in the cache first.  We
483  * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
484  * is set, the buffer is valid and we do not have to do anything ( see
485  * getblk() ).
486  */
487 int
488 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
489     struct buf ** bpp)
490 {
491 	struct buf *bp;
492 
493 	bp = getblk(vp, blkno, size, 0, 0);
494 	*bpp = bp;
495 
496 	/* if not found in cache, do some I/O */
497 	if ((bp->b_flags & B_CACHE) == 0) {
498 		if (curproc != NULL)
499 			curproc->p_stats->p_ru.ru_inblock++;
500 		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
501 		bp->b_flags |= B_READ;
502 		bp->b_flags &= ~(B_ERROR | B_INVAL);
503 		if (bp->b_rcred == NOCRED) {
504 			if (cred != NOCRED)
505 				crhold(cred);
506 			bp->b_rcred = cred;
507 		}
508 		vfs_busy_pages(bp, 0);
509 		VOP_STRATEGY(vp, bp);
510 		return (biowait(bp));
511 	}
512 	return (0);
513 }
514 
515 /*
516  * Operates like bread, but also starts asynchronous I/O on
517  * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
518  * to initiating I/O . If B_CACHE is set, the buffer is valid
519  * and we do not have to do anything.
520  */
521 int
522 breadn(struct vnode * vp, daddr_t blkno, int size,
523     daddr_t * rablkno, int *rabsize,
524     int cnt, struct ucred * cred, struct buf ** bpp)
525 {
526 	struct buf *bp, *rabp;
527 	int i;
528 	int rv = 0, readwait = 0;
529 
530 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
531 
532 	/* if not found in cache, do some I/O */
533 	if ((bp->b_flags & B_CACHE) == 0) {
534 		if (curproc != NULL)
535 			curproc->p_stats->p_ru.ru_inblock++;
536 		bp->b_flags |= B_READ;
537 		bp->b_flags &= ~(B_ERROR | B_INVAL);
538 		if (bp->b_rcred == NOCRED) {
539 			if (cred != NOCRED)
540 				crhold(cred);
541 			bp->b_rcred = cred;
542 		}
543 		vfs_busy_pages(bp, 0);
544 		VOP_STRATEGY(vp, bp);
545 		++readwait;
546 	}
547 
548 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
549 		if (inmem(vp, *rablkno))
550 			continue;
551 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
552 
553 		if ((rabp->b_flags & B_CACHE) == 0) {
554 			if (curproc != NULL)
555 				curproc->p_stats->p_ru.ru_inblock++;
556 			rabp->b_flags |= B_READ | B_ASYNC;
557 			rabp->b_flags &= ~(B_ERROR | B_INVAL);
558 			if (rabp->b_rcred == NOCRED) {
559 				if (cred != NOCRED)
560 					crhold(cred);
561 				rabp->b_rcred = cred;
562 			}
563 			vfs_busy_pages(rabp, 0);
564 			BUF_KERNPROC(rabp);
565 			VOP_STRATEGY(vp, rabp);
566 		} else {
567 			brelse(rabp);
568 		}
569 	}
570 
571 	if (readwait) {
572 		rv = biowait(bp);
573 	}
574 	return (rv);
575 }
576 
577 /*
578  * Write, release buffer on completion.  (Done by iodone
579  * if async).  Do not bother writing anything if the buffer
580  * is invalid.
581  *
582  * Note that we set B_CACHE here, indicating that buffer is
583  * fully valid and thus cacheable.  This is true even of NFS
584  * now so we set it generally.  This could be set either here
585  * or in biodone() since the I/O is synchronous.  We put it
586  * here.
587  */
588 int
589 bwrite(struct buf * bp)
590 {
591 	int oldflags, s;
592 	struct vnode *vp;
593 	struct mount *mp;
594 
595 	if (bp->b_flags & B_INVAL) {
596 		brelse(bp);
597 		return (0);
598 	}
599 
600 	oldflags = bp->b_flags;
601 
602 #if !defined(MAX_PERF)
603 	if (BUF_REFCNT(bp) == 0)
604 		panic("bwrite: buffer is not busy???");
605 #endif
606 	s = splbio();
607 	bundirty(bp);
608 
609 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
610 	bp->b_flags |= B_WRITEINPROG | B_CACHE;
611 
612 	bp->b_vp->v_numoutput++;
613 	vfs_busy_pages(bp, 1);
614 	if (curproc != NULL)
615 		curproc->p_stats->p_ru.ru_oublock++;
616 	splx(s);
617 	if (oldflags & B_ASYNC)
618 		BUF_KERNPROC(bp);
619 	VOP_STRATEGY(bp->b_vp, bp);
620 
621 	/*
622 	 * Collect statistics on synchronous and asynchronous writes.
623 	 * Writes to block devices are charged to their associated
624 	 * filesystem (if any).
625 	 */
626 	if ((vp = bp->b_vp) != NULL) {
627 		if (vp->v_type == VBLK)
628 			mp = vp->v_specmountpoint;
629 		else
630 			mp = vp->v_mount;
631 		if (mp != NULL) {
632 			if ((oldflags & B_ASYNC) == 0)
633 				mp->mnt_stat.f_syncwrites++;
634 			else
635 				mp->mnt_stat.f_asyncwrites++;
636 		}
637 	}
638 
639 	if ((oldflags & B_ASYNC) == 0) {
640 		int rtval = biowait(bp);
641 		brelse(bp);
642 		return (rtval);
643 	}
644 
645 	return (0);
646 }
647 
648 /*
649  * Delayed write. (Buffer is marked dirty).  Do not bother writing
650  * anything if the buffer is marked invalid.
651  *
652  * Note that since the buffer must be completely valid, we can safely
653  * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
654  * biodone() in order to prevent getblk from writing the buffer
655  * out synchronously.
656  */
657 void
658 bdwrite(struct buf * bp)
659 {
660 #if !defined(MAX_PERF)
661 	if (BUF_REFCNT(bp) == 0)
662 		panic("bdwrite: buffer is not busy");
663 #endif
664 
665 	if (bp->b_flags & B_INVAL) {
666 		brelse(bp);
667 		return;
668 	}
669 	bdirty(bp);
670 
671 	/*
672 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
673 	 * true even of NFS now.
674 	 */
675 	bp->b_flags |= B_CACHE;
676 
677 	/*
678 	 * This bmap keeps the system from needing to do the bmap later,
679 	 * perhaps when the system is attempting to do a sync.  Since it
680 	 * is likely that the indirect block -- or whatever other datastructure
681 	 * that the filesystem needs is still in memory now, it is a good
682 	 * thing to do this.  Note also, that if the pageout daemon is
683 	 * requesting a sync -- there might not be enough memory to do
684 	 * the bmap then...  So, this is important to do.
685 	 */
686 	if (bp->b_lblkno == bp->b_blkno) {
687 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
688 	}
689 
690 	/*
691 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
692 	 */
693 	vfs_setdirty(bp);
694 
695 	/*
696 	 * We need to do this here to satisfy the vnode_pager and the
697 	 * pageout daemon, so that it thinks that the pages have been
698 	 * "cleaned".  Note that since the pages are in a delayed write
699 	 * buffer -- the VFS layer "will" see that the pages get written
700 	 * out on the next sync, or perhaps the cluster will be completed.
701 	 */
702 	vfs_clean_pages(bp);
703 	bqrelse(bp);
704 
705 	/*
706 	 * Wakeup the buffer flushing daemon if we have saturated the
707 	 * buffer cache.
708 	 */
709 
710 	bd_wakeup(hidirtybuffers);
711 
712 	/*
713 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
714 	 * due to the softdep code.
715 	 */
716 }
717 
718 /*
719  *	bdirty:
720  *
721  *	Turn buffer into delayed write request.  We must clear B_READ and
722  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
723  *	itself to properly update it in the dirty/clean lists.  We mark it
724  *	B_DONE to ensure that any asynchronization of the buffer properly
725  *	clears B_DONE ( else a panic will occur later ).
726  *
727  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
728  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
729  *	should only be called if the buffer is known-good.
730  *
731  *	Since the buffer is not on a queue, we do not update the numfreebuffers
732  *	count.
733  *
734  *	Must be called at splbio().
735  *	The buffer must be on QUEUE_NONE.
736  */
737 void
738 bdirty(bp)
739 	struct buf *bp;
740 {
741 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
742 	bp->b_flags &= ~(B_READ|B_RELBUF);
743 
744 	if ((bp->b_flags & B_DELWRI) == 0) {
745 		bp->b_flags |= B_DONE | B_DELWRI;
746 		reassignbuf(bp, bp->b_vp);
747 		++numdirtybuffers;
748 		bd_wakeup(hidirtybuffers);
749 	}
750 }
751 
752 /*
753  *	bundirty:
754  *
755  *	Clear B_DELWRI for buffer.
756  *
757  *	Since the buffer is not on a queue, we do not update the numfreebuffers
758  *	count.
759  *
760  *	Must be called at splbio().
761  *	The buffer must be on QUEUE_NONE.
762  */
763 
764 void
765 bundirty(bp)
766 	struct buf *bp;
767 {
768 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
769 
770 	if (bp->b_flags & B_DELWRI) {
771 		bp->b_flags &= ~B_DELWRI;
772 		reassignbuf(bp, bp->b_vp);
773 		--numdirtybuffers;
774 		numdirtywakeup();
775 	}
776 }
777 
778 /*
779  *	bawrite:
780  *
781  *	Asynchronous write.  Start output on a buffer, but do not wait for
782  *	it to complete.  The buffer is released when the output completes.
783  *
784  *	bwrite() ( or the VOP routine anyway ) is responsible for handling
785  *	B_INVAL buffers.  Not us.
786  */
787 void
788 bawrite(struct buf * bp)
789 {
790 	bp->b_flags |= B_ASYNC;
791 	(void) VOP_BWRITE(bp->b_vp, bp);
792 }
793 
794 /*
795  *	bowrite:
796  *
797  *	Ordered write.  Start output on a buffer, and flag it so that the
798  *	device will write it in the order it was queued.  The buffer is
799  *	released when the output completes.  bwrite() ( or the VOP routine
800  *	anyway ) is responsible for handling B_INVAL buffers.
801  */
802 int
803 bowrite(struct buf * bp)
804 {
805 	bp->b_flags |= B_ORDERED | B_ASYNC;
806 	return (VOP_BWRITE(bp->b_vp, bp));
807 }
808 
809 /*
810  *	bwillwrite:
811  *
812  *	Called prior to the locking of any vnodes when we are expecting to
813  *	write.  We do not want to starve the buffer cache with too many
814  *	dirty buffers so we block here.  By blocking prior to the locking
815  *	of any vnodes we attempt to avoid the situation where a locked vnode
816  *	prevents the various system daemons from flushing related buffers.
817  */
818 
819 void
820 bwillwrite(void)
821 {
822 	int twenty = (hidirtybuffers - lodirtybuffers) / 5;
823 
824 	if (numdirtybuffers > hidirtybuffers + twenty) {
825 		int s;
826 
827 		s = splbio();
828 		while (numdirtybuffers > hidirtybuffers) {
829 			bd_wakeup(hidirtybuffers);
830 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
831 			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
832 		}
833 		splx(s);
834 	}
835 }
836 
837 /*
838  *	brelse:
839  *
840  *	Release a busy buffer and, if requested, free its resources.  The
841  *	buffer will be stashed in the appropriate bufqueue[] allowing it
842  *	to be accessed later as a cache entity or reused for other purposes.
843  */
844 void
845 brelse(struct buf * bp)
846 {
847 	int s;
848 	int kvawakeup = 0;
849 
850 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
851 
852 	s = splbio();
853 
854 	if (bp->b_flags & B_LOCKED)
855 		bp->b_flags &= ~B_ERROR;
856 
857 	if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
858 		/*
859 		 * Failed write, redirty.  Must clear B_ERROR to prevent
860 		 * pages from being scrapped.  If B_INVAL is set then
861 		 * this case is not run and the next case is run to
862 		 * destroy the buffer.  B_INVAL can occur if the buffer
863 		 * is outside the range supported by the underlying device.
864 		 */
865 		bp->b_flags &= ~B_ERROR;
866 		bdirty(bp);
867 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
868 	    (bp->b_bufsize <= 0)) {
869 		/*
870 		 * Either a failed I/O or we were asked to free or not
871 		 * cache the buffer.
872 		 */
873 		bp->b_flags |= B_INVAL;
874 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
875 			(*bioops.io_deallocate)(bp);
876 		if (bp->b_flags & B_DELWRI) {
877 			--numdirtybuffers;
878 			numdirtywakeup();
879 		}
880 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
881 		if ((bp->b_flags & B_VMIO) == 0) {
882 			if (bp->b_bufsize)
883 				allocbuf(bp, 0);
884 			if (bp->b_vp)
885 				brelvp(bp);
886 		}
887 	}
888 
889 	/*
890 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
891 	 * is called with B_DELWRI set, the underlying pages may wind up
892 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
893 	 * because pages associated with a B_DELWRI bp are marked clean.
894 	 *
895 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
896 	 * if B_DELWRI is set.
897 	 */
898 
899 	if (bp->b_flags & B_DELWRI)
900 		bp->b_flags &= ~B_RELBUF;
901 
902 	/*
903 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
904 	 * constituted, not even NFS buffers now.  Two flags effect this.  If
905 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
906 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
907 	 *
908 	 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
909 	 * invalidated.  B_ERROR cannot be set for a failed write unless the
910 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
911 	 *
912 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
913 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
914 	 * the commit state and we cannot afford to lose the buffer.
915 	 */
916 	if ((bp->b_flags & B_VMIO)
917 	    && !(bp->b_vp->v_tag == VT_NFS &&
918 		 bp->b_vp->v_type != VBLK &&
919 		 (bp->b_flags & B_DELWRI))
920 	    ) {
921 
922 		int i, j, resid;
923 		vm_page_t m;
924 		off_t foff;
925 		vm_pindex_t poff;
926 		vm_object_t obj;
927 		struct vnode *vp;
928 
929 		vp = bp->b_vp;
930 
931 		/*
932 		 * Get the base offset and length of the buffer.  Note that
933 		 * for block sizes that are less then PAGE_SIZE, the b_data
934 		 * base of the buffer does not represent exactly b_offset and
935 		 * neither b_offset nor b_size are necessarily page aligned.
936 		 * Instead, the starting position of b_offset is:
937 		 *
938 		 * 	b_data + (b_offset & PAGE_MASK)
939 		 *
940 		 * block sizes less then DEV_BSIZE (usually 512) are not
941 		 * supported due to the page granularity bits (m->valid,
942 		 * m->dirty, etc...).
943 		 *
944 		 * See man buf(9) for more information
945 		 */
946 
947 		resid = bp->b_bufsize;
948 		foff = bp->b_offset;
949 
950 		for (i = 0; i < bp->b_npages; i++) {
951 			m = bp->b_pages[i];
952 			vm_page_flag_clear(m, PG_ZERO);
953 			if (m == bogus_page) {
954 
955 				obj = (vm_object_t) vp->v_object;
956 				poff = OFF_TO_IDX(bp->b_offset);
957 
958 				for (j = i; j < bp->b_npages; j++) {
959 					m = bp->b_pages[j];
960 					if (m == bogus_page) {
961 						m = vm_page_lookup(obj, poff + j);
962 #if !defined(MAX_PERF)
963 						if (!m) {
964 							panic("brelse: page missing\n");
965 						}
966 #endif
967 						bp->b_pages[j] = m;
968 					}
969 				}
970 
971 				if ((bp->b_flags & B_INVAL) == 0) {
972 					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
973 				}
974 			}
975 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
976 				int poffset = foff & PAGE_MASK;
977 				int presid = resid > (PAGE_SIZE - poffset) ?
978 					(PAGE_SIZE - poffset) : resid;
979 
980 				KASSERT(presid >= 0, ("brelse: extra page"));
981 				vm_page_set_invalid(m, poffset, presid);
982 			}
983 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
984 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
985 		}
986 
987 		if (bp->b_flags & (B_INVAL | B_RELBUF))
988 			vfs_vmio_release(bp);
989 
990 	} else if (bp->b_flags & B_VMIO) {
991 
992 		if (bp->b_flags & (B_INVAL | B_RELBUF))
993 			vfs_vmio_release(bp);
994 
995 	}
996 
997 #if !defined(MAX_PERF)
998 	if (bp->b_qindex != QUEUE_NONE)
999 		panic("brelse: free buffer onto another queue???");
1000 #endif
1001 	if (BUF_REFCNT(bp) > 1) {
1002 		/* Temporary panic to verify exclusive locking */
1003 		/* This panic goes away when we allow shared refs */
1004 		panic("brelse: multiple refs");
1005 		/* do not release to free list */
1006 		BUF_UNLOCK(bp);
1007 		splx(s);
1008 		return;
1009 	}
1010 
1011 	/* enqueue */
1012 
1013 	/* buffers with no memory */
1014 	if (bp->b_bufsize == 0) {
1015 		bp->b_flags |= B_INVAL;
1016 		if (bp->b_kvasize) {
1017 			bp->b_qindex = QUEUE_EMPTYKVA;
1018 			kvawakeup = 1;
1019 		} else {
1020 			bp->b_qindex = QUEUE_EMPTY;
1021 		}
1022 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1023 		LIST_REMOVE(bp, b_hash);
1024 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1025 		bp->b_dev = NODEV;
1026 		kvafreespace += bp->b_kvasize;
1027 	/* buffers with junk contents */
1028 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
1029 		bp->b_flags |= B_INVAL;
1030 		bp->b_qindex = QUEUE_CLEAN;
1031 		if (bp->b_kvasize)
1032 			kvawakeup = 1;
1033 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1034 		LIST_REMOVE(bp, b_hash);
1035 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1036 		bp->b_dev = NODEV;
1037 
1038 	/* buffers that are locked */
1039 	} else if (bp->b_flags & B_LOCKED) {
1040 		bp->b_qindex = QUEUE_LOCKED;
1041 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1042 
1043 	/* remaining buffers */
1044 	} else {
1045 		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
1046 		case B_DELWRI | B_AGE:
1047 		    bp->b_qindex = QUEUE_DIRTY;
1048 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1049 		    break;
1050 		case B_DELWRI:
1051 		    bp->b_qindex = QUEUE_DIRTY;
1052 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1053 		    break;
1054 		case B_AGE:
1055 		    bp->b_qindex = QUEUE_CLEAN;
1056 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1057 		    if (bp->b_kvasize)
1058 			    kvawakeup = 1;
1059 		    break;
1060 		default:
1061 		    bp->b_qindex = QUEUE_CLEAN;
1062 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1063 		    if (bp->b_kvasize)
1064 			    kvawakeup = 1;
1065 		    break;
1066 		}
1067 	}
1068 
1069 	/*
1070 	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
1071 	 * on the correct queue.
1072 	 */
1073 	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
1074 		bp->b_flags &= ~B_DELWRI;
1075 		--numdirtybuffers;
1076 		numdirtywakeup();
1077 	}
1078 
1079 	runningbufspace -= bp->b_bufsize;
1080 
1081 	/*
1082 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
1083 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
1084 	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1085 	 * if B_INVAL is set ).
1086 	 */
1087 
1088 	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
1089 		bufcountwakeup();
1090 
1091 	/*
1092 	 * Something we can maybe free.
1093 	 */
1094 
1095 	if (bp->b_bufsize)
1096 		bufspacewakeup();
1097 	if (kvawakeup)
1098 		kvaspacewakeup();
1099 
1100 	/* unlock */
1101 	BUF_UNLOCK(bp);
1102 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1103 	splx(s);
1104 }
1105 
1106 /*
1107  * Release a buffer back to the appropriate queue but do not try to free
1108  * it.
1109  *
1110  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1111  * biodone() to requeue an async I/O on completion.  It is also used when
1112  * known good buffers need to be requeued but we think we may need the data
1113  * again soon.
1114  */
1115 void
1116 bqrelse(struct buf * bp)
1117 {
1118 	int s;
1119 
1120 	s = splbio();
1121 
1122 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1123 
1124 #if !defined(MAX_PERF)
1125 	if (bp->b_qindex != QUEUE_NONE)
1126 		panic("bqrelse: free buffer onto another queue???");
1127 #endif
1128 	if (BUF_REFCNT(bp) > 1) {
1129 		/* do not release to free list */
1130 		panic("bqrelse: multiple refs");
1131 		BUF_UNLOCK(bp);
1132 		splx(s);
1133 		return;
1134 	}
1135 	if (bp->b_flags & B_LOCKED) {
1136 		bp->b_flags &= ~B_ERROR;
1137 		bp->b_qindex = QUEUE_LOCKED;
1138 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1139 		/* buffers with stale but valid contents */
1140 	} else if (bp->b_flags & B_DELWRI) {
1141 		bp->b_qindex = QUEUE_DIRTY;
1142 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1143 	} else {
1144 		bp->b_qindex = QUEUE_CLEAN;
1145 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1146 	}
1147 
1148 	runningbufspace -= bp->b_bufsize;
1149 
1150 	if ((bp->b_flags & B_LOCKED) == 0 &&
1151 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
1152 		bufcountwakeup();
1153 	}
1154 
1155 	/*
1156 	 * Something we can maybe wakeup
1157 	 */
1158 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1159 		bufspacewakeup();
1160 
1161 	/* unlock */
1162 	BUF_UNLOCK(bp);
1163 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1164 	splx(s);
1165 }
1166 
1167 static void
1168 vfs_vmio_release(bp)
1169 	struct buf *bp;
1170 {
1171 	int i, s;
1172 	vm_page_t m;
1173 
1174 	s = splvm();
1175 	for (i = 0; i < bp->b_npages; i++) {
1176 		m = bp->b_pages[i];
1177 		bp->b_pages[i] = NULL;
1178 		/*
1179 		 * In order to keep page LRU ordering consistent, put
1180 		 * everything on the inactive queue.
1181 		 */
1182 		vm_page_unwire(m, 0);
1183 		/*
1184 		 * We don't mess with busy pages, it is
1185 		 * the responsibility of the process that
1186 		 * busied the pages to deal with them.
1187 		 */
1188 		if ((m->flags & PG_BUSY) || (m->busy != 0))
1189 			continue;
1190 
1191 		if (m->wire_count == 0) {
1192 			vm_page_flag_clear(m, PG_ZERO);
1193 			/*
1194 			 * Might as well free the page if we can and it has
1195 			 * no valid data.
1196 			 */
1197 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
1198 				vm_page_busy(m);
1199 				vm_page_protect(m, VM_PROT_NONE);
1200 				vm_page_free(m);
1201 			}
1202 		}
1203 	}
1204 	bufspace -= bp->b_bufsize;
1205 	vmiospace -= bp->b_bufsize;
1206 	runningbufspace -= bp->b_bufsize;
1207 	splx(s);
1208 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
1209 	if (bp->b_bufsize)
1210 		bufspacewakeup();
1211 	bp->b_npages = 0;
1212 	bp->b_bufsize = 0;
1213 	bp->b_flags &= ~B_VMIO;
1214 	if (bp->b_vp)
1215 		brelvp(bp);
1216 }
1217 
1218 /*
1219  * Check to see if a block is currently memory resident.
1220  */
1221 struct buf *
1222 gbincore(struct vnode * vp, daddr_t blkno)
1223 {
1224 	struct buf *bp;
1225 	struct bufhashhdr *bh;
1226 
1227 	bh = bufhash(vp, blkno);
1228 	bp = bh->lh_first;
1229 
1230 	/* Search hash chain */
1231 	while (bp != NULL) {
1232 		/* hit */
1233 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
1234 		    (bp->b_flags & B_INVAL) == 0) {
1235 			break;
1236 		}
1237 		bp = bp->b_hash.le_next;
1238 	}
1239 	return (bp);
1240 }
1241 
1242 /*
1243  *	vfs_bio_awrite:
1244  *
1245  *	Implement clustered async writes for clearing out B_DELWRI buffers.
1246  *	This is much better then the old way of writing only one buffer at
1247  *	a time.  Note that we may not be presented with the buffers in the
1248  *	correct order, so we search for the cluster in both directions.
1249  */
1250 int
1251 vfs_bio_awrite(struct buf * bp)
1252 {
1253 	int i;
1254 	int j;
1255 	daddr_t lblkno = bp->b_lblkno;
1256 	struct vnode *vp = bp->b_vp;
1257 	int s;
1258 	int ncl;
1259 	struct buf *bpa;
1260 	int nwritten;
1261 	int size;
1262 	int maxcl;
1263 
1264 	s = splbio();
1265 	/*
1266 	 * right now we support clustered writing only to regular files.  If
1267 	 * we find a clusterable block we could be in the middle of a cluster
1268 	 * rather then at the beginning.
1269 	 */
1270 	if ((vp->v_type == VREG) &&
1271 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1272 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1273 
1274 		size = vp->v_mount->mnt_stat.f_iosize;
1275 		maxcl = MAXPHYS / size;
1276 
1277 		for (i = 1; i < maxcl; i++) {
1278 			if ((bpa = gbincore(vp, lblkno + i)) &&
1279 			    BUF_REFCNT(bpa) == 0 &&
1280 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1281 			    (B_DELWRI | B_CLUSTEROK)) &&
1282 			    (bpa->b_bufsize == size)) {
1283 				if ((bpa->b_blkno == bpa->b_lblkno) ||
1284 				    (bpa->b_blkno !=
1285 				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
1286 					break;
1287 			} else {
1288 				break;
1289 			}
1290 		}
1291 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
1292 			if ((bpa = gbincore(vp, lblkno - j)) &&
1293 			    BUF_REFCNT(bpa) == 0 &&
1294 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1295 			    (B_DELWRI | B_CLUSTEROK)) &&
1296 			    (bpa->b_bufsize == size)) {
1297 				if ((bpa->b_blkno == bpa->b_lblkno) ||
1298 				    (bpa->b_blkno !=
1299 				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
1300 					break;
1301 			} else {
1302 				break;
1303 			}
1304 		}
1305 		--j;
1306 		ncl = i + j;
1307 		/*
1308 		 * this is a possible cluster write
1309 		 */
1310 		if (ncl != 1) {
1311 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
1312 			splx(s);
1313 			return nwritten;
1314 		}
1315 	}
1316 
1317 	BUF_LOCK(bp, LK_EXCLUSIVE);
1318 	bremfree(bp);
1319 	bp->b_flags |= B_ASYNC;
1320 
1321 	splx(s);
1322 	/*
1323 	 * default (old) behavior, writing out only one block
1324 	 *
1325 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1326 	 */
1327 	nwritten = bp->b_bufsize;
1328 	(void) VOP_BWRITE(bp->b_vp, bp);
1329 
1330 	return nwritten;
1331 }
1332 
1333 /*
1334  *	getnewbuf:
1335  *
1336  *	Find and initialize a new buffer header, freeing up existing buffers
1337  *	in the bufqueues as necessary.  The new buffer is returned locked.
1338  *
1339  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
1340  *	buffer away, the caller must set B_INVAL prior to calling brelse().
1341  *
1342  *	We block if:
1343  *		We have insufficient buffer headers
1344  *		We have insufficient buffer space
1345  *		buffer_map is too fragmented ( space reservation fails )
1346  *		If we have to flush dirty buffers ( but we try to avoid this )
1347  *
1348  *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1349  *	Instead we ask the buf daemon to do it for us.  We attempt to
1350  *	avoid piecemeal wakeups of the pageout daemon.
1351  */
1352 
1353 static struct buf *
1354 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
1355 {
1356 	struct buf *bp;
1357 	struct buf *nbp;
1358 	struct buf *dbp;
1359 	int outofspace;
1360 	int nqindex;
1361 	int defrag = 0;
1362 
1363 	++getnewbufcalls;
1364 	--getnewbufrestarts;
1365 restart:
1366 	++getnewbufrestarts;
1367 
1368 	/*
1369 	 * Calculate whether we are out of buffer space.  This state is
1370 	 * recalculated on every restart.  If we are out of space, we
1371 	 * have to turn off defragmentation.  Setting defrag to -1 when
1372 	 * outofspace is positive means "defrag while freeing buffers".
1373 	 * The looping conditional will be muffed up if defrag is left
1374 	 * positive when outofspace is positive.
1375 	 */
1376 
1377 	dbp = NULL;
1378 	outofspace = 0;
1379 	if (bufspace >= hibufspace) {
1380 		if ((curproc && (curproc->p_flag & P_BUFEXHAUST) == 0) ||
1381 		    bufspace >= maxbufspace) {
1382 			outofspace = 1;
1383 			if (defrag > 0)
1384 				defrag = -1;
1385 		}
1386 	}
1387 
1388 	/*
1389 	 * defrag state is semi-persistant.  1 means we are flagged for
1390 	 * defragging.  -1 means we actually defragged something.
1391 	 */
1392 	/* nop */
1393 
1394 	/*
1395 	 * Setup for scan.  If we do not have enough free buffers,
1396 	 * we setup a degenerate case that immediately fails.  Note
1397 	 * that if we are specially marked process, we are allowed to
1398 	 * dip into our reserves.
1399 	 *
1400 	 * Normally we want to find an EMPTYKVA buffer.  That is, a
1401 	 * buffer with kva already allocated.  If there are no EMPTYKVA
1402 	 * buffers we back up to the truely EMPTY buffers.  When defragging
1403 	 * we do not bother backing up since we have to locate buffers with
1404 	 * kva to defrag.  If we are out of space we skip both EMPTY and
1405 	 * EMPTYKVA and dig right into the CLEAN queue.
1406 	 *
1407 	 * In this manner we avoid scanning unnecessary buffers.  It is very
1408 	 * important for us to do this because the buffer cache is almost
1409 	 * constantly out of space or in need of defragmentation.
1410 	 */
1411 
1412 	if (curproc && (curproc->p_flag & P_BUFEXHAUST) == 0 &&
1413 	    numfreebuffers < lofreebuffers) {
1414 		nqindex = QUEUE_CLEAN;
1415 		nbp = NULL;
1416 	} else {
1417 		nqindex = QUEUE_EMPTYKVA;
1418 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
1419 		if (nbp == NULL) {
1420 			if (defrag <= 0) {
1421 				nqindex = QUEUE_EMPTY;
1422 				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1423 			}
1424 		}
1425 		if (outofspace || nbp == NULL) {
1426 			nqindex = QUEUE_CLEAN;
1427 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
1428 		}
1429 	}
1430 
1431 	/*
1432 	 * Run scan, possibly freeing data and/or kva mappings on the fly
1433 	 * depending.
1434 	 */
1435 
1436 	while ((bp = nbp) != NULL) {
1437 		int qindex = nqindex;
1438 
1439 		/*
1440 		 * Calculate next bp ( we can only use it if we do not block
1441 		 * or do other fancy things ).
1442 		 */
1443 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1444 			switch(qindex) {
1445 			case QUEUE_EMPTY:
1446 				nqindex = QUEUE_EMPTYKVA;
1447 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
1448 					break;
1449 				/* fall through */
1450 			case QUEUE_EMPTYKVA:
1451 				nqindex = QUEUE_CLEAN;
1452 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
1453 					break;
1454 				/* fall through */
1455 			case QUEUE_CLEAN:
1456 				/*
1457 				 * nbp is NULL.
1458 				 */
1459 				break;
1460 			}
1461 		}
1462 
1463 		/*
1464 		 * Sanity Checks
1465 		 */
1466 		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
1467 
1468 		/*
1469 		 * Note: we no longer distinguish between VMIO and non-VMIO
1470 		 * buffers.
1471 		 */
1472 
1473 		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1474 
1475 		/*
1476 		 * If we are defragging and the buffer isn't useful for fixing
1477 		 * that problem we continue.  If we are out of space and the
1478 		 * buffer isn't useful for fixing that problem we continue.
1479 		 */
1480 
1481 		if (defrag > 0 && bp->b_kvasize == 0)
1482 			continue;
1483 		if (outofspace > 0 && bp->b_bufsize == 0)
1484 			continue;
1485 
1486 		/*
1487 		 * Start freeing the bp.  This is somewhat involved.  nbp
1488 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
1489 		 */
1490 
1491 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1492 			panic("getnewbuf: locked buf");
1493 		bremfree(bp);
1494 
1495 		if (qindex == QUEUE_CLEAN) {
1496 			if (bp->b_flags & B_VMIO) {
1497 				bp->b_flags &= ~B_ASYNC;
1498 				vfs_vmio_release(bp);
1499 			}
1500 			if (bp->b_vp)
1501 				brelvp(bp);
1502 		}
1503 
1504 		/*
1505 		 * NOTE:  nbp is now entirely invalid.  We can only restart
1506 		 * the scan from this point on.
1507 		 *
1508 		 * Get the rest of the buffer freed up.  b_kva* is still
1509 		 * valid after this operation.
1510 		 */
1511 
1512 		if (bp->b_rcred != NOCRED) {
1513 			crfree(bp->b_rcred);
1514 			bp->b_rcred = NOCRED;
1515 		}
1516 		if (bp->b_wcred != NOCRED) {
1517 			crfree(bp->b_wcred);
1518 			bp->b_wcred = NOCRED;
1519 		}
1520 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
1521 			(*bioops.io_deallocate)(bp);
1522 		LIST_REMOVE(bp, b_hash);
1523 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1524 
1525 		if (bp->b_bufsize)
1526 			allocbuf(bp, 0);
1527 
1528 		bp->b_flags = 0;
1529 		bp->b_dev = NODEV;
1530 		bp->b_vp = NULL;
1531 		bp->b_blkno = bp->b_lblkno = 0;
1532 		bp->b_offset = NOOFFSET;
1533 		bp->b_iodone = 0;
1534 		bp->b_error = 0;
1535 		bp->b_resid = 0;
1536 		bp->b_bcount = 0;
1537 		bp->b_npages = 0;
1538 		bp->b_dirtyoff = bp->b_dirtyend = 0;
1539 
1540 		LIST_INIT(&bp->b_dep);
1541 
1542 		/*
1543 		 * Ok, now that we have a free buffer, if we are defragging
1544 		 * we have to recover the kvaspace.  If we are out of space
1545 		 * we have to free the buffer (which we just did), but we
1546 		 * do not have to recover kva space unless we hit a defrag
1547 		 * hicup.  Being able to avoid freeing the kva space leads
1548 		 * to a significant reduction in overhead.
1549 		 */
1550 
1551 		if (defrag > 0) {
1552 			defrag = -1;
1553 			bp->b_flags |= B_INVAL;
1554 			bfreekva(bp);
1555 			brelse(bp);
1556 			goto restart;
1557 		}
1558 
1559 		if (outofspace > 0) {
1560 			outofspace = -1;
1561 			bp->b_flags |= B_INVAL;
1562 			if (defrag < 0)
1563 				bfreekva(bp);
1564 			brelse(bp);
1565 			goto restart;
1566 		}
1567 
1568 		/*
1569 		 * We are done
1570 		 */
1571 		break;
1572 	}
1573 
1574 	/*
1575 	 * If we exhausted our list, sleep as appropriate.  We may have to
1576 	 * wakeup various daemons and write out some dirty buffers.
1577 	 *
1578 	 * Generally we are sleeping due to insufficient buffer space.
1579 	 */
1580 
1581 	if (bp == NULL) {
1582 		int flags;
1583 		char *waitmsg;
1584 
1585 		if (defrag > 0) {
1586 			flags = VFS_BIO_NEED_KVASPACE;
1587 			waitmsg = "nbufkv";
1588 		} else if (outofspace > 0) {
1589 			waitmsg = "nbufbs";
1590 			flags = VFS_BIO_NEED_BUFSPACE;
1591 		} else {
1592 			waitmsg = "newbuf";
1593 			flags = VFS_BIO_NEED_ANY;
1594 		}
1595 
1596 		/* XXX */
1597 
1598 		(void) speedup_syncer();
1599 		needsbuffer |= flags;
1600 		while (needsbuffer & flags) {
1601 			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
1602 			    waitmsg, slptimeo))
1603 				return (NULL);
1604 		}
1605 	} else {
1606 		/*
1607 		 * We finally have a valid bp.  We aren't quite out of the
1608 		 * woods, we still have to reserve kva space.
1609 		 */
1610 		vm_offset_t addr = 0;
1611 
1612 		maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1613 
1614 		if (maxsize != bp->b_kvasize) {
1615 			bfreekva(bp);
1616 
1617 			if (vm_map_findspace(buffer_map,
1618 				vm_map_min(buffer_map), maxsize, &addr)) {
1619 				/*
1620 				 * Uh oh.  Buffer map is to fragmented.  Try
1621 				 * to defragment.
1622 				 */
1623 				if (defrag <= 0) {
1624 					defrag = 1;
1625 					bp->b_flags |= B_INVAL;
1626 					brelse(bp);
1627 					goto restart;
1628 				}
1629 				/*
1630 				 * Uh oh.  We couldn't seem to defragment
1631 				 */
1632 				panic("getnewbuf: unreachable code reached");
1633 			}
1634 		}
1635 		if (addr) {
1636 			vm_map_insert(buffer_map, NULL, 0,
1637 				addr, addr + maxsize,
1638 				VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1639 
1640 			bp->b_kvabase = (caddr_t) addr;
1641 			bp->b_kvasize = maxsize;
1642 		}
1643 		bp->b_data = bp->b_kvabase;
1644 	}
1645 	return(bp);
1646 }
1647 
1648 /*
1649  *	waitfreebuffers:
1650  *
1651  *	Wait for sufficient free buffers.  Only called from normal processes.
1652  */
1653 
1654 static void
1655 waitfreebuffers(int slpflag, int slptimeo)
1656 {
1657 	while (numfreebuffers < hifreebuffers) {
1658 		if (numfreebuffers >= hifreebuffers)
1659 			break;
1660 		needsbuffer |= VFS_BIO_NEED_FREE;
1661 		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1662 			break;
1663 	}
1664 }
1665 
1666 /*
1667  *	buf_daemon:
1668  *
1669  *	buffer flushing daemon.  Buffers are normally flushed by the
1670  *	update daemon but if it cannot keep up this process starts to
1671  *	take the load in an attempt to prevent getnewbuf() from blocking.
1672  */
1673 
1674 static struct proc *bufdaemonproc;
1675 static int bd_interval;
1676 static int bd_flushto;
1677 
1678 static struct kproc_desc buf_kp = {
1679 	"bufdaemon",
1680 	buf_daemon,
1681 	&bufdaemonproc
1682 };
1683 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
1684 
1685 static void
1686 buf_daemon()
1687 {
1688 	int s;
1689 	/*
1690 	 * This process is allowed to take the buffer cache to the limit
1691 	 */
1692 	curproc->p_flag |= P_BUFEXHAUST;
1693 	s = splbio();
1694 
1695 	bd_interval = 5 * hz;	/* dynamically adjusted */
1696 	bd_flushto = hidirtybuffers;	/* dynamically adjusted */
1697 
1698 	while (TRUE) {
1699 		bd_request = 0;
1700 
1701 		/*
1702 		 * Do the flush.  Limit the number of buffers we flush in one
1703 		 * go.  The failure condition occurs when processes are writing
1704 		 * buffers faster then we can dispose of them.  In this case
1705 		 * we may be flushing so often that the previous set of flushes
1706 		 * have not had time to complete, causing us to run out of
1707 		 * physical buffers and block.
1708 		 */
1709 		{
1710 			int runcount = maxbdrun;
1711 
1712 			while (numdirtybuffers > bd_flushto && runcount) {
1713 				--runcount;
1714 				if (flushbufqueues() == 0)
1715 					break;
1716 			}
1717 		}
1718 
1719 		/*
1720 		 * If nobody is requesting anything we sleep
1721 		 */
1722 		if (bd_request == 0)
1723 			tsleep(&bd_request, PVM, "psleep", bd_interval);
1724 
1725 		/*
1726 		 * We calculate how much to add or subtract from bd_flushto
1727 		 * and bd_interval based on how far off we are from the
1728 		 * optimal number of dirty buffers, which is 20% below the
1729 		 * hidirtybuffers mark.  We cannot use hidirtybuffers straight
1730 		 * because being right on the mark will cause getnewbuf()
1731 		 * to oscillate our wakeup.
1732 		 *
1733 		 * The larger the error in either direction, the more we adjust
1734 		 * bd_flushto and bd_interval.  The time interval is adjusted
1735 		 * by 2 seconds per whole-buffer-range of error.  This is an
1736 		 * exponential convergence algorithm, with large errors
1737 		 * producing large changes and small errors producing small
1738 		 * changes.
1739 		 */
1740 
1741 		{
1742 			int brange = hidirtybuffers - lodirtybuffers;
1743 			int middb = hidirtybuffers - brange / 5;
1744 			int deltabuf = middb - numdirtybuffers;
1745 
1746 			bd_flushto += deltabuf / 20;
1747 			bd_interval += deltabuf * (2 * hz) / (brange * 1);
1748 		}
1749 		if (bd_flushto < lodirtybuffers)
1750 			bd_flushto = lodirtybuffers;
1751 		if (bd_flushto > hidirtybuffers)
1752 			bd_flushto = hidirtybuffers;
1753 		if (bd_interval < hz / 10)
1754 			bd_interval = hz / 10;
1755 		if (bd_interval > 5 * hz)
1756 			bd_interval = 5 * hz;
1757 	}
1758 }
1759 
1760 /*
1761  *	flushbufqueues:
1762  *
1763  *	Try to flush a buffer in the dirty queue.  We must be careful to
1764  *	free up B_INVAL buffers instead of write them, which NFS is
1765  *	particularly sensitive to.
1766  */
1767 
1768 static int
1769 flushbufqueues(void)
1770 {
1771 	struct buf *bp;
1772 	int r = 0;
1773 
1774 	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
1775 
1776 	while (bp) {
1777 		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
1778 		if ((bp->b_flags & B_DELWRI) != 0) {
1779 			if (bp->b_flags & B_INVAL) {
1780 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1781 					panic("flushbufqueues: locked buf");
1782 				bremfree(bp);
1783 				brelse(bp);
1784 				++r;
1785 				break;
1786 			}
1787 			vfs_bio_awrite(bp);
1788 			++r;
1789 			break;
1790 		}
1791 		bp = TAILQ_NEXT(bp, b_freelist);
1792 	}
1793 	return(r);
1794 }
1795 
1796 /*
1797  * Check to see if a block is currently memory resident.
1798  */
1799 struct buf *
1800 incore(struct vnode * vp, daddr_t blkno)
1801 {
1802 	struct buf *bp;
1803 
1804 	int s = splbio();
1805 	bp = gbincore(vp, blkno);
1806 	splx(s);
1807 	return (bp);
1808 }
1809 
1810 /*
1811  * Returns true if no I/O is needed to access the
1812  * associated VM object.  This is like incore except
1813  * it also hunts around in the VM system for the data.
1814  */
1815 
1816 int
1817 inmem(struct vnode * vp, daddr_t blkno)
1818 {
1819 	vm_object_t obj;
1820 	vm_offset_t toff, tinc, size;
1821 	vm_page_t m;
1822 	vm_ooffset_t off;
1823 
1824 	if (incore(vp, blkno))
1825 		return 1;
1826 	if (vp->v_mount == NULL)
1827 		return 0;
1828 	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1829 		return 0;
1830 
1831 	obj = vp->v_object;
1832 	size = PAGE_SIZE;
1833 	if (size > vp->v_mount->mnt_stat.f_iosize)
1834 		size = vp->v_mount->mnt_stat.f_iosize;
1835 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1836 
1837 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1838 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1839 		if (!m)
1840 			return 0;
1841 		tinc = size;
1842 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1843 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1844 		if (vm_page_is_valid(m,
1845 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1846 			return 0;
1847 	}
1848 	return 1;
1849 }
1850 
1851 /*
1852  *	vfs_setdirty:
1853  *
1854  *	Sets the dirty range for a buffer based on the status of the dirty
1855  *	bits in the pages comprising the buffer.
1856  *
1857  *	The range is limited to the size of the buffer.
1858  *
1859  *	This routine is primarily used by NFS, but is generalized for the
1860  *	B_VMIO case.
1861  */
1862 static void
1863 vfs_setdirty(struct buf *bp)
1864 {
1865 	int i;
1866 	vm_object_t object;
1867 
1868 	/*
1869 	 * Degenerate case - empty buffer
1870 	 */
1871 
1872 	if (bp->b_bufsize == 0)
1873 		return;
1874 
1875 	/*
1876 	 * We qualify the scan for modified pages on whether the
1877 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1878 	 * is not cleared simply by protecting pages off.
1879 	 */
1880 
1881 	if ((bp->b_flags & B_VMIO) == 0)
1882 		return;
1883 
1884 	object = bp->b_pages[0]->object;
1885 
1886 	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
1887 		printf("Warning: object %p writeable but not mightbedirty\n", object);
1888 	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
1889 		printf("Warning: object %p mightbedirty but not writeable\n", object);
1890 
1891 	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
1892 		vm_offset_t boffset;
1893 		vm_offset_t eoffset;
1894 
1895 		/*
1896 		 * test the pages to see if they have been modified directly
1897 		 * by users through the VM system.
1898 		 */
1899 		for (i = 0; i < bp->b_npages; i++) {
1900 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1901 			vm_page_test_dirty(bp->b_pages[i]);
1902 		}
1903 
1904 		/*
1905 		 * Calculate the encompassing dirty range, boffset and eoffset,
1906 		 * (eoffset - boffset) bytes.
1907 		 */
1908 
1909 		for (i = 0; i < bp->b_npages; i++) {
1910 			if (bp->b_pages[i]->dirty)
1911 				break;
1912 		}
1913 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1914 
1915 		for (i = bp->b_npages - 1; i >= 0; --i) {
1916 			if (bp->b_pages[i]->dirty) {
1917 				break;
1918 			}
1919 		}
1920 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1921 
1922 		/*
1923 		 * Fit it to the buffer.
1924 		 */
1925 
1926 		if (eoffset > bp->b_bcount)
1927 			eoffset = bp->b_bcount;
1928 
1929 		/*
1930 		 * If we have a good dirty range, merge with the existing
1931 		 * dirty range.
1932 		 */
1933 
1934 		if (boffset < eoffset) {
1935 			if (bp->b_dirtyoff > boffset)
1936 				bp->b_dirtyoff = boffset;
1937 			if (bp->b_dirtyend < eoffset)
1938 				bp->b_dirtyend = eoffset;
1939 		}
1940 	}
1941 }
1942 
1943 /*
1944  *	getblk:
1945  *
1946  *	Get a block given a specified block and offset into a file/device.
1947  *	The buffers B_DONE bit will be cleared on return, making it almost
1948  * 	ready for an I/O initiation.  B_INVAL may or may not be set on
1949  *	return.  The caller should clear B_INVAL prior to initiating a
1950  *	READ.
1951  *
1952  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
1953  *	an existing buffer.
1954  *
1955  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
1956  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
1957  *	and then cleared based on the backing VM.  If the previous buffer is
1958  *	non-0-sized but invalid, B_CACHE will be cleared.
1959  *
1960  *	If getblk() must create a new buffer, the new buffer is returned with
1961  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
1962  *	case it is returned with B_INVAL clear and B_CACHE set based on the
1963  *	backing VM.
1964  *
1965  *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos
1966  *	B_CACHE bit is clear.
1967  *
1968  *	What this means, basically, is that the caller should use B_CACHE to
1969  *	determine whether the buffer is fully valid or not and should clear
1970  *	B_INVAL prior to issuing a read.  If the caller intends to validate
1971  *	the buffer by loading its data area with something, the caller needs
1972  *	to clear B_INVAL.  If the caller does this without issuing an I/O,
1973  *	the caller should set B_CACHE ( as an optimization ), else the caller
1974  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
1975  *	a write attempt or if it was a successfull read.  If the caller
1976  *	intends to issue a READ, the caller must clear B_INVAL and B_ERROR
1977  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
1978  */
1979 struct buf *
1980 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1981 {
1982 	struct buf *bp;
1983 	int s;
1984 	struct bufhashhdr *bh;
1985 
1986 #if !defined(MAX_PERF)
1987 	if (size > MAXBSIZE)
1988 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1989 #endif
1990 
1991 	s = splbio();
1992 loop:
1993 	/*
1994 	 * Block if we are low on buffers.   Certain processes are allowed
1995 	 * to completely exhaust the buffer cache.
1996          *
1997          * If this check ever becomes a bottleneck it may be better to
1998          * move it into the else, when gbincore() fails.  At the moment
1999          * it isn't a problem.
2000          */
2001 	if (!curproc || (curproc->p_flag & P_BUFEXHAUST)) {
2002 		if (numfreebuffers == 0) {
2003 			if (!curproc)
2004 				return NULL;
2005 			needsbuffer |= VFS_BIO_NEED_ANY;
2006 			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
2007 			    slptimeo);
2008 		}
2009 	} else if (numfreebuffers < lofreebuffers) {
2010 		waitfreebuffers(slpflag, slptimeo);
2011 	}
2012 
2013 	if ((bp = gbincore(vp, blkno))) {
2014 		/*
2015 		 * Buffer is in-core.  If the buffer is not busy, it must
2016 		 * be on a queue.
2017 		 */
2018 
2019 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
2020 			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
2021 			    "getblk", slpflag, slptimeo) == ENOLCK)
2022 				goto loop;
2023 			splx(s);
2024 			return (struct buf *) NULL;
2025 		}
2026 
2027 		/*
2028 		 * The buffer is locked.  B_CACHE is cleared if the buffer is
2029 		 * invalid.  Ohterwise, for a non-VMIO buffer, B_CACHE is set
2030 		 * and for a VMIO buffer B_CACHE is adjusted according to the
2031 		 * backing VM cache.
2032 		 */
2033 		if (bp->b_flags & B_INVAL)
2034 			bp->b_flags &= ~B_CACHE;
2035 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
2036 			bp->b_flags |= B_CACHE;
2037 		bremfree(bp);
2038 
2039 		/*
2040 		 * check for size inconsistancies for non-VMIO case.
2041 		 */
2042 
2043 		if (bp->b_bcount != size) {
2044 			if ((bp->b_flags & B_VMIO) == 0 ||
2045 			    (size > bp->b_kvasize)) {
2046 				if (bp->b_flags & B_DELWRI) {
2047 					bp->b_flags |= B_NOCACHE;
2048 					VOP_BWRITE(bp->b_vp, bp);
2049 				} else {
2050 					if ((bp->b_flags & B_VMIO) &&
2051 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
2052 						bp->b_flags |= B_RELBUF;
2053 						brelse(bp);
2054 					} else {
2055 						bp->b_flags |= B_NOCACHE;
2056 						VOP_BWRITE(bp->b_vp, bp);
2057 					}
2058 				}
2059 				goto loop;
2060 			}
2061 		}
2062 
2063 		/*
2064 		 * If the size is inconsistant in the VMIO case, we can resize
2065 		 * the buffer.  This might lead to B_CACHE getting set or
2066 		 * cleared.  If the size has not changed, B_CACHE remains
2067 		 * unchanged from its previous state.
2068 		 */
2069 
2070 		if (bp->b_bcount != size)
2071 			allocbuf(bp, size);
2072 
2073 		KASSERT(bp->b_offset != NOOFFSET,
2074 		    ("getblk: no buffer offset"));
2075 
2076 		/*
2077 		 * A buffer with B_DELWRI set and B_CACHE clear must
2078 		 * be committed before we can return the buffer in
2079 		 * order to prevent the caller from issuing a read
2080 		 * ( due to B_CACHE not being set ) and overwriting
2081 		 * it.
2082 		 *
2083 		 * Most callers, including NFS and FFS, need this to
2084 		 * operate properly either because they assume they
2085 		 * can issue a read if B_CACHE is not set, or because
2086 		 * ( for example ) an uncached B_DELWRI might loop due
2087 		 * to softupdates re-dirtying the buffer.  In the latter
2088 		 * case, B_CACHE is set after the first write completes,
2089 		 * preventing further loops.
2090 		 */
2091 
2092 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2093 			VOP_BWRITE(bp->b_vp, bp);
2094 			goto loop;
2095 		}
2096 
2097 		splx(s);
2098 		bp->b_flags &= ~B_DONE;
2099 	} else {
2100 		/*
2101 		 * Buffer is not in-core, create new buffer.  The buffer
2102 		 * returned by getnewbuf() is locked.  Note that the returned
2103 		 * buffer is also considered valid (not marked B_INVAL).
2104 		 */
2105 		int bsize, maxsize, vmio;
2106 		off_t offset;
2107 
2108 		if (vp->v_type == VBLK)
2109 			bsize = DEV_BSIZE;
2110 		else if (vp->v_mountedhere)
2111 			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
2112 		else if (vp->v_mount)
2113 			bsize = vp->v_mount->mnt_stat.f_iosize;
2114 		else
2115 			bsize = size;
2116 
2117 		offset = (off_t)blkno * bsize;
2118 		vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
2119 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
2120 		maxsize = imax(maxsize, bsize);
2121 
2122 		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
2123 			if (slpflag || slptimeo) {
2124 				splx(s);
2125 				return NULL;
2126 			}
2127 			goto loop;
2128 		}
2129 
2130 		/*
2131 		 * This code is used to make sure that a buffer is not
2132 		 * created while the getnewbuf routine is blocked.
2133 		 * This can be a problem whether the vnode is locked or not.
2134 		 * If the buffer is created out from under us, we have to
2135 		 * throw away the one we just created.  There is now window
2136 		 * race because we are safely running at splbio() from the
2137 		 * point of the duplicate buffer creation through to here,
2138 		 * and we've locked the buffer.
2139 		 */
2140 		if (gbincore(vp, blkno)) {
2141 			bp->b_flags |= B_INVAL;
2142 			brelse(bp);
2143 			goto loop;
2144 		}
2145 
2146 		/*
2147 		 * Insert the buffer into the hash, so that it can
2148 		 * be found by incore.
2149 		 */
2150 		bp->b_blkno = bp->b_lblkno = blkno;
2151 		bp->b_offset = offset;
2152 
2153 		bgetvp(vp, bp);
2154 		LIST_REMOVE(bp, b_hash);
2155 		bh = bufhash(vp, blkno);
2156 		LIST_INSERT_HEAD(bh, bp, b_hash);
2157 
2158 		/*
2159 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
2160 		 * buffer size starts out as 0, B_CACHE will be set by
2161 		 * allocbuf() for the VMIO case prior to it testing the
2162 		 * backing store for validity.
2163 		 */
2164 
2165 		if (vmio) {
2166 			bp->b_flags |= B_VMIO;
2167 #if defined(VFS_BIO_DEBUG)
2168 			if (vp->v_type != VREG && vp->v_type != VBLK)
2169 				printf("getblk: vmioing file type %d???\n", vp->v_type);
2170 #endif
2171 		} else {
2172 			bp->b_flags &= ~B_VMIO;
2173 		}
2174 
2175 		allocbuf(bp, size);
2176 
2177 		splx(s);
2178 		bp->b_flags &= ~B_DONE;
2179 	}
2180 	return (bp);
2181 }
2182 
2183 /*
2184  * Get an empty, disassociated buffer of given size.  The buffer is initially
2185  * set to B_INVAL.
2186  */
2187 struct buf *
2188 geteblk(int size)
2189 {
2190 	struct buf *bp;
2191 	int s;
2192 
2193 	s = splbio();
2194 	while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
2195 	splx(s);
2196 	allocbuf(bp, size);
2197 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
2198 	return (bp);
2199 }
2200 
2201 
2202 /*
2203  * This code constitutes the buffer memory from either anonymous system
2204  * memory (in the case of non-VMIO operations) or from an associated
2205  * VM object (in the case of VMIO operations).  This code is able to
2206  * resize a buffer up or down.
2207  *
2208  * Note that this code is tricky, and has many complications to resolve
2209  * deadlock or inconsistant data situations.  Tread lightly!!!
2210  * There are B_CACHE and B_DELWRI interactions that must be dealt with by
2211  * the caller.  Calling this code willy nilly can result in the loss of data.
2212  *
2213  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
2214  * B_CACHE for the non-VMIO case.
2215  */
2216 
2217 int
2218 allocbuf(struct buf *bp, int size)
2219 {
2220 	int newbsize, mbsize;
2221 	int i;
2222 
2223 #if !defined(MAX_PERF)
2224 	if (BUF_REFCNT(bp) == 0)
2225 		panic("allocbuf: buffer not busy");
2226 
2227 	if (bp->b_kvasize < size)
2228 		panic("allocbuf: buffer too small");
2229 #endif
2230 
2231 	if ((bp->b_flags & B_VMIO) == 0) {
2232 		caddr_t origbuf;
2233 		int origbufsize;
2234 		/*
2235 		 * Just get anonymous memory from the kernel.  Don't
2236 		 * mess with B_CACHE.
2237 		 */
2238 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2239 #if !defined(NO_B_MALLOC)
2240 		if (bp->b_flags & B_MALLOC)
2241 			newbsize = mbsize;
2242 		else
2243 #endif
2244 			newbsize = round_page(size);
2245 
2246 		if (newbsize < bp->b_bufsize) {
2247 #if !defined(NO_B_MALLOC)
2248 			/*
2249 			 * malloced buffers are not shrunk
2250 			 */
2251 			if (bp->b_flags & B_MALLOC) {
2252 				if (newbsize) {
2253 					bp->b_bcount = size;
2254 				} else {
2255 					free(bp->b_data, M_BIOBUF);
2256 					bufspace -= bp->b_bufsize;
2257 					bufmallocspace -= bp->b_bufsize;
2258 					runningbufspace -= bp->b_bufsize;
2259 					if (bp->b_bufsize)
2260 						bufspacewakeup();
2261 					bp->b_data = bp->b_kvabase;
2262 					bp->b_bufsize = 0;
2263 					bp->b_bcount = 0;
2264 					bp->b_flags &= ~B_MALLOC;
2265 				}
2266 				return 1;
2267 			}
2268 #endif
2269 			vm_hold_free_pages(
2270 			    bp,
2271 			    (vm_offset_t) bp->b_data + newbsize,
2272 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
2273 		} else if (newbsize > bp->b_bufsize) {
2274 #if !defined(NO_B_MALLOC)
2275 			/*
2276 			 * We only use malloced memory on the first allocation.
2277 			 * and revert to page-allocated memory when the buffer
2278 			 * grows.
2279 			 */
2280 			if ( (bufmallocspace < maxbufmallocspace) &&
2281 				(bp->b_bufsize == 0) &&
2282 				(mbsize <= PAGE_SIZE/2)) {
2283 
2284 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
2285 				bp->b_bufsize = mbsize;
2286 				bp->b_bcount = size;
2287 				bp->b_flags |= B_MALLOC;
2288 				bufspace += mbsize;
2289 				bufmallocspace += mbsize;
2290 				runningbufspace += bp->b_bufsize;
2291 				return 1;
2292 			}
2293 #endif
2294 			origbuf = NULL;
2295 			origbufsize = 0;
2296 #if !defined(NO_B_MALLOC)
2297 			/*
2298 			 * If the buffer is growing on its other-than-first allocation,
2299 			 * then we revert to the page-allocation scheme.
2300 			 */
2301 			if (bp->b_flags & B_MALLOC) {
2302 				origbuf = bp->b_data;
2303 				origbufsize = bp->b_bufsize;
2304 				bp->b_data = bp->b_kvabase;
2305 				bufspace -= bp->b_bufsize;
2306 				bufmallocspace -= bp->b_bufsize;
2307 				runningbufspace -= bp->b_bufsize;
2308 				if (bp->b_bufsize)
2309 					bufspacewakeup();
2310 				bp->b_bufsize = 0;
2311 				bp->b_flags &= ~B_MALLOC;
2312 				newbsize = round_page(newbsize);
2313 			}
2314 #endif
2315 			vm_hold_load_pages(
2316 			    bp,
2317 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
2318 			    (vm_offset_t) bp->b_data + newbsize);
2319 #if !defined(NO_B_MALLOC)
2320 			if (origbuf) {
2321 				bcopy(origbuf, bp->b_data, origbufsize);
2322 				free(origbuf, M_BIOBUF);
2323 			}
2324 #endif
2325 		}
2326 	} else {
2327 		vm_page_t m;
2328 		int desiredpages;
2329 
2330 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2331 		desiredpages = (size == 0) ? 0 :
2332 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
2333 
2334 #if !defined(NO_B_MALLOC)
2335 		if (bp->b_flags & B_MALLOC)
2336 			panic("allocbuf: VMIO buffer can't be malloced");
2337 #endif
2338 		/*
2339 		 * Set B_CACHE initially if buffer is 0 length or will become
2340 		 * 0-length.
2341 		 */
2342 		if (size == 0 || bp->b_bufsize == 0)
2343 			bp->b_flags |= B_CACHE;
2344 
2345 		if (newbsize < bp->b_bufsize) {
2346 			/*
2347 			 * DEV_BSIZE aligned new buffer size is less then the
2348 			 * DEV_BSIZE aligned existing buffer size.  Figure out
2349 			 * if we have to remove any pages.
2350 			 */
2351 			if (desiredpages < bp->b_npages) {
2352 				for (i = desiredpages; i < bp->b_npages; i++) {
2353 					/*
2354 					 * the page is not freed here -- it
2355 					 * is the responsibility of
2356 					 * vnode_pager_setsize
2357 					 */
2358 					m = bp->b_pages[i];
2359 					KASSERT(m != bogus_page,
2360 					    ("allocbuf: bogus page found"));
2361 					while (vm_page_sleep_busy(m, TRUE, "biodep"))
2362 						;
2363 
2364 					bp->b_pages[i] = NULL;
2365 					vm_page_unwire(m, 0);
2366 				}
2367 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2368 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
2369 				bp->b_npages = desiredpages;
2370 			}
2371 		} else if (size > bp->b_bcount) {
2372 			/*
2373 			 * We are growing the buffer, possibly in a
2374 			 * byte-granular fashion.
2375 			 */
2376 			struct vnode *vp;
2377 			vm_object_t obj;
2378 			vm_offset_t toff;
2379 			vm_offset_t tinc;
2380 
2381 			/*
2382 			 * Step 1, bring in the VM pages from the object,
2383 			 * allocating them if necessary.  We must clear
2384 			 * B_CACHE if these pages are not valid for the
2385 			 * range covered by the buffer.
2386 			 */
2387 
2388 			vp = bp->b_vp;
2389 			obj = vp->v_object;
2390 
2391 			while (bp->b_npages < desiredpages) {
2392 				vm_page_t m;
2393 				vm_pindex_t pi;
2394 
2395 				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
2396 				if ((m = vm_page_lookup(obj, pi)) == NULL) {
2397 					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
2398 					if (m == NULL) {
2399 						VM_WAIT;
2400 						vm_pageout_deficit += desiredpages - bp->b_npages;
2401 					} else {
2402 						vm_page_wire(m);
2403 						vm_page_wakeup(m);
2404 						bp->b_flags &= ~B_CACHE;
2405 						bp->b_pages[bp->b_npages] = m;
2406 						++bp->b_npages;
2407 					}
2408 					continue;
2409 				}
2410 
2411 				/*
2412 				 * We found a page.  If we have to sleep on it,
2413 				 * retry because it might have gotten freed out
2414 				 * from under us.
2415 				 *
2416 				 * We can only test PG_BUSY here.  Blocking on
2417 				 * m->busy might lead to a deadlock:
2418 				 *
2419 				 *  vm_fault->getpages->cluster_read->allocbuf
2420 				 *
2421 				 */
2422 
2423 				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
2424 					continue;
2425 
2426 				/*
2427 				 * We have a good page.  Should we wakeup the
2428 				 * page daemon?
2429 				 */
2430 				if ((curproc != pageproc) &&
2431 				    ((m->queue - m->pc) == PQ_CACHE) &&
2432 				    ((cnt.v_free_count + cnt.v_cache_count) <
2433 					(cnt.v_free_min + cnt.v_cache_min))) {
2434 					pagedaemon_wakeup();
2435 				}
2436 				vm_page_flag_clear(m, PG_ZERO);
2437 				vm_page_wire(m);
2438 				bp->b_pages[bp->b_npages] = m;
2439 				++bp->b_npages;
2440 			}
2441 
2442 			/*
2443 			 * Step 2.  We've loaded the pages into the buffer,
2444 			 * we have to figure out if we can still have B_CACHE
2445 			 * set.  Note that B_CACHE is set according to the
2446 			 * byte-granular range ( bcount and size ), new the
2447 			 * aligned range ( newbsize ).
2448 			 *
2449 			 * The VM test is against m->valid, which is DEV_BSIZE
2450 			 * aligned.  Needless to say, the validity of the data
2451 			 * needs to also be DEV_BSIZE aligned.  Note that this
2452 			 * fails with NFS if the server or some other client
2453 			 * extends the file's EOF.  If our buffer is resized,
2454 			 * B_CACHE may remain set! XXX
2455 			 */
2456 
2457 			toff = bp->b_bcount;
2458 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2459 
2460 			while ((bp->b_flags & B_CACHE) && toff < size) {
2461 				vm_pindex_t pi;
2462 
2463 				if (tinc > (size - toff))
2464 					tinc = size - toff;
2465 
2466 				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
2467 				    PAGE_SHIFT;
2468 
2469 				vfs_buf_test_cache(
2470 				    bp,
2471 				    bp->b_offset,
2472 				    toff,
2473 				    tinc,
2474 				    bp->b_pages[pi]
2475 				);
2476 				toff += tinc;
2477 				tinc = PAGE_SIZE;
2478 			}
2479 
2480 			/*
2481 			 * Step 3, fixup the KVM pmap.  Remember that
2482 			 * bp->b_data is relative to bp->b_offset, but
2483 			 * bp->b_offset may be offset into the first page.
2484 			 */
2485 
2486 			bp->b_data = (caddr_t)
2487 			    trunc_page((vm_offset_t)bp->b_data);
2488 			pmap_qenter(
2489 			    (vm_offset_t)bp->b_data,
2490 			    bp->b_pages,
2491 			    bp->b_npages
2492 			);
2493 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
2494 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
2495 		}
2496 	}
2497 	if (bp->b_flags & B_VMIO)
2498 		vmiospace += (newbsize - bp->b_bufsize);
2499 	bufspace += (newbsize - bp->b_bufsize);
2500 	runningbufspace += (newbsize - bp->b_bufsize);
2501 	if (newbsize < bp->b_bufsize)
2502 		bufspacewakeup();
2503 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
2504 	bp->b_bcount = size;		/* requested buffer size	*/
2505 	return 1;
2506 }
2507 
2508 /*
2509  *	biowait:
2510  *
2511  *	Wait for buffer I/O completion, returning error status.  The buffer
2512  *	is left locked and B_DONE on return.  B_EINTR is converted into a EINTR
2513  *	error and cleared.
2514  */
2515 int
2516 biowait(register struct buf * bp)
2517 {
2518 	int s;
2519 
2520 	s = splbio();
2521 	while ((bp->b_flags & B_DONE) == 0) {
2522 #if defined(NO_SCHEDULE_MODS)
2523 		tsleep(bp, PRIBIO, "biowait", 0);
2524 #else
2525 		if (bp->b_flags & B_READ)
2526 			tsleep(bp, PRIBIO, "biord", 0);
2527 		else
2528 			tsleep(bp, PRIBIO, "biowr", 0);
2529 #endif
2530 	}
2531 	splx(s);
2532 	if (bp->b_flags & B_EINTR) {
2533 		bp->b_flags &= ~B_EINTR;
2534 		return (EINTR);
2535 	}
2536 	if (bp->b_flags & B_ERROR) {
2537 		return (bp->b_error ? bp->b_error : EIO);
2538 	} else {
2539 		return (0);
2540 	}
2541 }
2542 
2543 /*
2544  *	biodone:
2545  *
2546  *	Finish I/O on a buffer, optionally calling a completion function.
2547  *	This is usually called from an interrupt so process blocking is
2548  *	not allowed.
2549  *
2550  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
2551  *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
2552  *	assuming B_INVAL is clear.
2553  *
2554  *	For the VMIO case, we set B_CACHE if the op was a read and no
2555  *	read error occured, or if the op was a write.  B_CACHE is never
2556  *	set if the buffer is invalid or otherwise uncacheable.
2557  *
2558  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
2559  *	initiator to leave B_INVAL set to brelse the buffer out of existance
2560  *	in the biodone routine.
2561  */
2562 void
2563 biodone(register struct buf * bp)
2564 {
2565 	int s;
2566 
2567 	s = splbio();
2568 
2569 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
2570 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
2571 
2572 	bp->b_flags |= B_DONE;
2573 
2574 	if (bp->b_flags & B_FREEBUF) {
2575 		brelse(bp);
2576 		splx(s);
2577 		return;
2578 	}
2579 
2580 	if ((bp->b_flags & B_READ) == 0) {
2581 		vwakeup(bp);
2582 	}
2583 
2584 	/* call optional completion function if requested */
2585 	if (bp->b_flags & B_CALL) {
2586 		bp->b_flags &= ~B_CALL;
2587 		(*bp->b_iodone) (bp);
2588 		splx(s);
2589 		return;
2590 	}
2591 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
2592 		(*bioops.io_complete)(bp);
2593 
2594 	if (bp->b_flags & B_VMIO) {
2595 		int i, resid;
2596 		vm_ooffset_t foff;
2597 		vm_page_t m;
2598 		vm_object_t obj;
2599 		int iosize;
2600 		struct vnode *vp = bp->b_vp;
2601 
2602 		obj = vp->v_object;
2603 
2604 #if defined(VFS_BIO_DEBUG)
2605 		if (vp->v_usecount == 0) {
2606 			panic("biodone: zero vnode ref count");
2607 		}
2608 
2609 		if (vp->v_object == NULL) {
2610 			panic("biodone: missing VM object");
2611 		}
2612 
2613 		if ((vp->v_flag & VOBJBUF) == 0) {
2614 			panic("biodone: vnode is not setup for merged cache");
2615 		}
2616 #endif
2617 
2618 		foff = bp->b_offset;
2619 		KASSERT(bp->b_offset != NOOFFSET,
2620 		    ("biodone: no buffer offset"));
2621 
2622 #if !defined(MAX_PERF)
2623 		if (!obj) {
2624 			panic("biodone: no object");
2625 		}
2626 #endif
2627 #if defined(VFS_BIO_DEBUG)
2628 		if (obj->paging_in_progress < bp->b_npages) {
2629 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
2630 			    obj->paging_in_progress, bp->b_npages);
2631 		}
2632 #endif
2633 
2634 		/*
2635 		 * Set B_CACHE if the op was a normal read and no error
2636 		 * occured.  B_CACHE is set for writes in the b*write()
2637 		 * routines.
2638 		 */
2639 		iosize = bp->b_bcount - bp->b_resid;
2640 		if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
2641 			bp->b_flags |= B_CACHE;
2642 		}
2643 
2644 		for (i = 0; i < bp->b_npages; i++) {
2645 			int bogusflag = 0;
2646 			m = bp->b_pages[i];
2647 			if (m == bogus_page) {
2648 				bogusflag = 1;
2649 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2650 				if (!m) {
2651 #if defined(VFS_BIO_DEBUG)
2652 					printf("biodone: page disappeared\n");
2653 #endif
2654 					vm_object_pip_subtract(obj, 1);
2655 					bp->b_flags &= ~B_CACHE;
2656 					continue;
2657 				}
2658 				bp->b_pages[i] = m;
2659 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2660 			}
2661 #if defined(VFS_BIO_DEBUG)
2662 			if (OFF_TO_IDX(foff) != m->pindex) {
2663 				printf(
2664 "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
2665 				    (unsigned long)foff, m->pindex);
2666 			}
2667 #endif
2668 			resid = IDX_TO_OFF(m->pindex + 1) - foff;
2669 			if (resid > iosize)
2670 				resid = iosize;
2671 
2672 			/*
2673 			 * In the write case, the valid and clean bits are
2674 			 * already changed correctly ( see bdwrite() ), so we
2675 			 * only need to do this here in the read case.
2676 			 */
2677 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
2678 				vfs_page_set_valid(bp, foff, i, m);
2679 			}
2680 			vm_page_flag_clear(m, PG_ZERO);
2681 
2682 			/*
2683 			 * when debugging new filesystems or buffer I/O methods, this
2684 			 * is the most common error that pops up.  if you see this, you
2685 			 * have not set the page busy flag correctly!!!
2686 			 */
2687 			if (m->busy == 0) {
2688 #if !defined(MAX_PERF)
2689 				printf("biodone: page busy < 0, "
2690 				    "pindex: %d, foff: 0x(%x,%x), "
2691 				    "resid: %d, index: %d\n",
2692 				    (int) m->pindex, (int)(foff >> 32),
2693 						(int) foff & 0xffffffff, resid, i);
2694 #endif
2695 				if (vp->v_type != VBLK)
2696 #if !defined(MAX_PERF)
2697 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2698 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
2699 					    (int) bp->b_lblkno,
2700 					    bp->b_flags, bp->b_npages);
2701 				else
2702 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2703 					    (int) bp->b_lblkno,
2704 					    bp->b_flags, bp->b_npages);
2705 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2706 				    m->valid, m->dirty, m->wire_count);
2707 #endif
2708 				panic("biodone: page busy < 0\n");
2709 			}
2710 			vm_page_io_finish(m);
2711 			vm_object_pip_subtract(obj, 1);
2712 			foff += resid;
2713 			iosize -= resid;
2714 		}
2715 		if (obj)
2716 			vm_object_pip_wakeupn(obj, 0);
2717 	}
2718 	/*
2719 	 * For asynchronous completions, release the buffer now. The brelse
2720 	 * will do a wakeup there if necessary - so no need to do a wakeup
2721 	 * here in the async case. The sync case always needs to do a wakeup.
2722 	 */
2723 
2724 	if (bp->b_flags & B_ASYNC) {
2725 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2726 			brelse(bp);
2727 		else
2728 			bqrelse(bp);
2729 	} else {
2730 		wakeup(bp);
2731 	}
2732 	splx(s);
2733 }
2734 
2735 /*
2736  * This routine is called in lieu of iodone in the case of
2737  * incomplete I/O.  This keeps the busy status for pages
2738  * consistant.
2739  */
2740 void
2741 vfs_unbusy_pages(struct buf * bp)
2742 {
2743 	int i;
2744 
2745 	if (bp->b_flags & B_VMIO) {
2746 		struct vnode *vp = bp->b_vp;
2747 		vm_object_t obj = vp->v_object;
2748 
2749 		for (i = 0; i < bp->b_npages; i++) {
2750 			vm_page_t m = bp->b_pages[i];
2751 
2752 			if (m == bogus_page) {
2753 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2754 #if !defined(MAX_PERF)
2755 				if (!m) {
2756 					panic("vfs_unbusy_pages: page missing\n");
2757 				}
2758 #endif
2759 				bp->b_pages[i] = m;
2760 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2761 			}
2762 			vm_object_pip_subtract(obj, 1);
2763 			vm_page_flag_clear(m, PG_ZERO);
2764 			vm_page_io_finish(m);
2765 		}
2766 		vm_object_pip_wakeupn(obj, 0);
2767 	}
2768 }
2769 
2770 /*
2771  * vfs_page_set_valid:
2772  *
2773  *	Set the valid bits in a page based on the supplied offset.   The
2774  *	range is restricted to the buffer's size.
2775  *
2776  *	This routine is typically called after a read completes.
2777  */
2778 static void
2779 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2780 {
2781 	vm_ooffset_t soff, eoff;
2782 
2783 	/*
2784 	 * Start and end offsets in buffer.  eoff - soff may not cross a
2785 	 * page boundry or cross the end of the buffer.  The end of the
2786 	 * buffer, in this case, is our file EOF, not the allocation size
2787 	 * of the buffer.
2788 	 */
2789 	soff = off;
2790 	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2791 	if (eoff > bp->b_offset + bp->b_bcount)
2792 		eoff = bp->b_offset + bp->b_bcount;
2793 
2794 	/*
2795 	 * Set valid range.  This is typically the entire buffer and thus the
2796 	 * entire page.
2797 	 */
2798 	if (eoff > soff) {
2799 		vm_page_set_validclean(
2800 		    m,
2801 		   (vm_offset_t) (soff & PAGE_MASK),
2802 		   (vm_offset_t) (eoff - soff)
2803 		);
2804 	}
2805 }
2806 
2807 /*
2808  * This routine is called before a device strategy routine.
2809  * It is used to tell the VM system that paging I/O is in
2810  * progress, and treat the pages associated with the buffer
2811  * almost as being PG_BUSY.  Also the object paging_in_progress
2812  * flag is handled to make sure that the object doesn't become
2813  * inconsistant.
2814  *
2815  * Since I/O has not been initiated yet, certain buffer flags
2816  * such as B_ERROR or B_INVAL may be in an inconsistant state
2817  * and should be ignored.
2818  */
2819 void
2820 vfs_busy_pages(struct buf * bp, int clear_modify)
2821 {
2822 	int i, bogus;
2823 
2824 	if (bp->b_flags & B_VMIO) {
2825 		struct vnode *vp = bp->b_vp;
2826 		vm_object_t obj = vp->v_object;
2827 		vm_ooffset_t foff;
2828 
2829 		foff = bp->b_offset;
2830 		KASSERT(bp->b_offset != NOOFFSET,
2831 		    ("vfs_busy_pages: no buffer offset"));
2832 		vfs_setdirty(bp);
2833 
2834 retry:
2835 		for (i = 0; i < bp->b_npages; i++) {
2836 			vm_page_t m = bp->b_pages[i];
2837 			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
2838 				goto retry;
2839 		}
2840 
2841 		bogus = 0;
2842 		for (i = 0; i < bp->b_npages; i++) {
2843 			vm_page_t m = bp->b_pages[i];
2844 
2845 			vm_page_flag_clear(m, PG_ZERO);
2846 			if ((bp->b_flags & B_CLUSTER) == 0) {
2847 				vm_object_pip_add(obj, 1);
2848 				vm_page_io_start(m);
2849 			}
2850 
2851 			/*
2852 			 * When readying a buffer for a read ( i.e
2853 			 * clear_modify == 0 ), it is important to do
2854 			 * bogus_page replacement for valid pages in
2855 			 * partially instantiated buffers.  Partially
2856 			 * instantiated buffers can, in turn, occur when
2857 			 * reconstituting a buffer from its VM backing store
2858 			 * base.  We only have to do this if B_CACHE is
2859 			 * clear ( which causes the I/O to occur in the
2860 			 * first place ).  The replacement prevents the read
2861 			 * I/O from overwriting potentially dirty VM-backed
2862 			 * pages.  XXX bogus page replacement is, uh, bogus.
2863 			 * It may not work properly with small-block devices.
2864 			 * We need to find a better way.
2865 			 */
2866 
2867 			vm_page_protect(m, VM_PROT_NONE);
2868 			if (clear_modify)
2869 				vfs_page_set_valid(bp, foff, i, m);
2870 			else if (m->valid == VM_PAGE_BITS_ALL &&
2871 				(bp->b_flags & B_CACHE) == 0) {
2872 				bp->b_pages[i] = bogus_page;
2873 				bogus++;
2874 			}
2875 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2876 		}
2877 		if (bogus)
2878 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2879 	}
2880 }
2881 
2882 /*
2883  * Tell the VM system that the pages associated with this buffer
2884  * are clean.  This is used for delayed writes where the data is
2885  * going to go to disk eventually without additional VM intevention.
2886  *
2887  * Note that while we only really need to clean through to b_bcount, we
2888  * just go ahead and clean through to b_bufsize.
2889  */
2890 static void
2891 vfs_clean_pages(struct buf * bp)
2892 {
2893 	int i;
2894 
2895 	if (bp->b_flags & B_VMIO) {
2896 		vm_ooffset_t foff;
2897 
2898 		foff = bp->b_offset;
2899 		KASSERT(bp->b_offset != NOOFFSET,
2900 		    ("vfs_clean_pages: no buffer offset"));
2901 		for (i = 0; i < bp->b_npages; i++) {
2902 			vm_page_t m = bp->b_pages[i];
2903 			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2904 			vm_ooffset_t eoff = noff;
2905 
2906 			if (eoff > bp->b_offset + bp->b_bufsize)
2907 				eoff = bp->b_offset + bp->b_bufsize;
2908 			vfs_page_set_valid(bp, foff, i, m);
2909 			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2910 			foff = noff;
2911 		}
2912 	}
2913 }
2914 
2915 /*
2916  *	vfs_bio_set_validclean:
2917  *
2918  *	Set the range within the buffer to valid and clean.  The range is
2919  *	relative to the beginning of the buffer, b_offset.  Note that b_offset
2920  *	itself may be offset from the beginning of the first page.
2921  */
2922 
2923 void
2924 vfs_bio_set_validclean(struct buf *bp, int base, int size)
2925 {
2926 	if (bp->b_flags & B_VMIO) {
2927 		int i;
2928 		int n;
2929 
2930 		/*
2931 		 * Fixup base to be relative to beginning of first page.
2932 		 * Set initial n to be the maximum number of bytes in the
2933 		 * first page that can be validated.
2934 		 */
2935 
2936 		base += (bp->b_offset & PAGE_MASK);
2937 		n = PAGE_SIZE - (base & PAGE_MASK);
2938 
2939 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
2940 			vm_page_t m = bp->b_pages[i];
2941 
2942 			if (n > size)
2943 				n = size;
2944 
2945 			vm_page_set_validclean(m, base & PAGE_MASK, n);
2946 			base += n;
2947 			size -= n;
2948 			n = PAGE_SIZE;
2949 		}
2950 	}
2951 }
2952 
2953 /*
2954  *	vfs_bio_clrbuf:
2955  *
2956  *	clear a buffer.  This routine essentially fakes an I/O, so we need
2957  *	to clear B_ERROR and B_INVAL.
2958  *
2959  *	Note that while we only theoretically need to clear through b_bcount,
2960  *	we go ahead and clear through b_bufsize.
2961  */
2962 
2963 void
2964 vfs_bio_clrbuf(struct buf *bp) {
2965 	int i, mask = 0;
2966 	caddr_t sa, ea;
2967 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2968 		bp->b_flags &= ~(B_INVAL|B_ERROR);
2969 		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
2970 		    (bp->b_offset & PAGE_MASK) == 0) {
2971 			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
2972 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2973 			    ((bp->b_pages[0]->valid & mask) != mask)) {
2974 				bzero(bp->b_data, bp->b_bufsize);
2975 			}
2976 			bp->b_pages[0]->valid |= mask;
2977 			bp->b_resid = 0;
2978 			return;
2979 		}
2980 		ea = sa = bp->b_data;
2981 		for(i=0;i<bp->b_npages;i++,sa=ea) {
2982 			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
2983 			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
2984 			ea = (caddr_t)(vm_offset_t)ulmin(
2985 			    (u_long)(vm_offset_t)ea,
2986 			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
2987 			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
2988 			if ((bp->b_pages[i]->valid & mask) == mask)
2989 				continue;
2990 			if ((bp->b_pages[i]->valid & mask) == 0) {
2991 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2992 					bzero(sa, ea - sa);
2993 				}
2994 			} else {
2995 				for (; sa < ea; sa += DEV_BSIZE, j++) {
2996 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2997 						(bp->b_pages[i]->valid & (1<<j)) == 0)
2998 						bzero(sa, DEV_BSIZE);
2999 				}
3000 			}
3001 			bp->b_pages[i]->valid |= mask;
3002 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
3003 		}
3004 		bp->b_resid = 0;
3005 	} else {
3006 		clrbuf(bp);
3007 	}
3008 }
3009 
3010 /*
3011  * vm_hold_load_pages and vm_hold_unload pages get pages into
3012  * a buffers address space.  The pages are anonymous and are
3013  * not associated with a file object.
3014  */
3015 void
3016 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3017 {
3018 	vm_offset_t pg;
3019 	vm_page_t p;
3020 	int index;
3021 
3022 	to = round_page(to);
3023 	from = round_page(from);
3024 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3025 
3026 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3027 
3028 tryagain:
3029 
3030 		p = vm_page_alloc(kernel_object,
3031 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
3032 		    VM_ALLOC_NORMAL);
3033 		if (!p) {
3034 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
3035 			VM_WAIT;
3036 			goto tryagain;
3037 		}
3038 		vm_page_wire(p);
3039 		p->valid = VM_PAGE_BITS_ALL;
3040 		vm_page_flag_clear(p, PG_ZERO);
3041 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
3042 		bp->b_pages[index] = p;
3043 		vm_page_wakeup(p);
3044 	}
3045 	bp->b_npages = index;
3046 }
3047 
3048 void
3049 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3050 {
3051 	vm_offset_t pg;
3052 	vm_page_t p;
3053 	int index, newnpages;
3054 
3055 	from = round_page(from);
3056 	to = round_page(to);
3057 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3058 
3059 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3060 		p = bp->b_pages[index];
3061 		if (p && (index < bp->b_npages)) {
3062 #if !defined(MAX_PERF)
3063 			if (p->busy) {
3064 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
3065 					bp->b_blkno, bp->b_lblkno);
3066 			}
3067 #endif
3068 			bp->b_pages[index] = NULL;
3069 			pmap_kremove(pg);
3070 			vm_page_busy(p);
3071 			vm_page_unwire(p, 0);
3072 			vm_page_free(p);
3073 		}
3074 	}
3075 	bp->b_npages = newnpages;
3076 }
3077 
3078 
3079 #include "opt_ddb.h"
3080 #ifdef DDB
3081 #include <ddb/ddb.h>
3082 
3083 DB_SHOW_COMMAND(buffer, db_show_buffer)
3084 {
3085 	/* get args */
3086 	struct buf *bp = (struct buf *)addr;
3087 
3088 	if (!have_addr) {
3089 		db_printf("usage: show buffer <addr>\n");
3090 		return;
3091 	}
3092 
3093 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
3094 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
3095 		  "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
3096 		  "b_blkno = %d, b_pblkno = %d\n",
3097 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
3098 		  major(bp->b_dev), minor(bp->b_dev),
3099 		  bp->b_data, bp->b_blkno, bp->b_pblkno);
3100 	if (bp->b_npages) {
3101 		int i;
3102 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
3103 		for (i = 0; i < bp->b_npages; i++) {
3104 			vm_page_t m;
3105 			m = bp->b_pages[i];
3106 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
3107 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
3108 			if ((i + 1) < bp->b_npages)
3109 				db_printf(",");
3110 		}
3111 		db_printf("\n");
3112 	}
3113 }
3114 #endif /* DDB */
3115