xref: /freebsd/sys/kern/vfs_bio.c (revision 4cf49a43559ed9fdad601bdcccd2c55963008675)
1 /*
2  * Copyright (c) 1994,1997 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Absolutely no warranty of function or purpose is made by the author
12  *		John S. Dyson.
13  *
14  * $FreeBSD$
15  */
16 
17 /*
18  * This file contains a new buffer I/O scheme implementing a coherent
19  * VM object and buffer cache scheme.  Pains have been taken to make
20  * sure that the performance degradation associated with schemes such
21  * as this is not realized.
22  *
23  * Author:  John S. Dyson
24  * Significant help during the development and debugging phases
25  * had been provided by David Greenman, also of the FreeBSD core team.
26  *
27  * see man buf(9) for more info.
28  */
29 
30 #define VMIO
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/proc.h>
36 #include <sys/kthread.h>
37 #include <sys/vnode.h>
38 #include <sys/vmmeter.h>
39 #include <sys/lock.h>
40 #include <vm/vm.h>
41 #include <vm/vm_param.h>
42 #include <vm/vm_prot.h>
43 #include <vm/vm_kern.h>
44 #include <vm/vm_pageout.h>
45 #include <vm/vm_page.h>
46 #include <vm/vm_object.h>
47 #include <vm/vm_extern.h>
48 #include <vm/vm_map.h>
49 #include <sys/buf.h>
50 #include <sys/mount.h>
51 #include <sys/malloc.h>
52 #include <sys/resourcevar.h>
53 #include <sys/conf.h>
54 
55 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
56 
57 struct	bio_ops bioops;		/* I/O operation notification */
58 
59 struct buf *buf;		/* buffer header pool */
60 struct swqueue bswlist;
61 
62 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
63 		vm_offset_t to);
64 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
65 		vm_offset_t to);
66 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
67 			       int pageno, vm_page_t m);
68 static void vfs_clean_pages(struct buf * bp);
69 static void vfs_setdirty(struct buf *bp);
70 static void vfs_vmio_release(struct buf *bp);
71 static int flushbufqueues(void);
72 
73 static int bd_request;
74 
75 static void buf_daemon __P((void));
76 /*
77  * Bogus page -- for I/O to/from partially complete buffers.
78  * This is a temporary solution to the problem, but it is not
79  * really that bad.  It would be better to split the buffer
80  * for input in the case of buffers partially already in memory,
81  * but the code is intricate enough already.
82  */
83 vm_page_t bogus_page;
84 int runningbufspace;
85 int vmiodirenable = FALSE;
86 static vm_offset_t bogus_offset;
87 
88 static int bufspace, maxbufspace, vmiospace,
89 	bufmallocspace, maxbufmallocspace, hibufspace;
90 #if 0
91 static int maxvmiobufspace;
92 #endif
93 static int maxbdrun;
94 static int needsbuffer;
95 static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
96 static int numfreebuffers, lofreebuffers, hifreebuffers;
97 static int getnewbufcalls;
98 static int getnewbufrestarts;
99 static int kvafreespace;
100 
101 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
102 	&numdirtybuffers, 0, "");
103 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
104 	&lodirtybuffers, 0, "");
105 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
106 	&hidirtybuffers, 0, "");
107 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
108 	&numfreebuffers, 0, "");
109 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
110 	&lofreebuffers, 0, "");
111 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
112 	&hifreebuffers, 0, "");
113 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
114 	&runningbufspace, 0, "");
115 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
116 	&maxbufspace, 0, "");
117 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
118 	&hibufspace, 0, "");
119 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
120 	&bufspace, 0, "");
121 SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
122 	&maxbdrun, 0, "");
123 #if 0
124 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
125 	&maxvmiobufspace, 0, "");
126 #endif
127 SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
128 	&vmiospace, 0, "");
129 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
130 	&maxbufmallocspace, 0, "");
131 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
132 	&bufmallocspace, 0, "");
133 SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
134 	&kvafreespace, 0, "");
135 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
136 	&getnewbufcalls, 0, "");
137 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
138 	&getnewbufrestarts, 0, "");
139 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
140 	&vmiodirenable, 0, "");
141 
142 
143 static int bufhashmask;
144 static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
145 struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
146 char *buf_wmesg = BUF_WMESG;
147 
148 extern int vm_swap_size;
149 
150 #define BUF_MAXUSE		24
151 
152 #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
153 #define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
154 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
155 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
156 #define VFS_BIO_NEED_KVASPACE	0x10	/* wait for buffer_map space, emerg  */
157 
158 /*
159  * Buffer hash table code.  Note that consecutive logical block numbers hash
160  * to adjacent buckets, which gives us some L1 cache locality.
161  */
162 
163 static __inline
164 struct bufhashhdr *
165 bufhash(struct vnode *vnp, daddr_t bn)
166 {
167 	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
168 }
169 
170 /*
171  *	kvaspacewakeup:
172  *
173  *	Called when kva space is potentially available for recovery or when
174  *	kva space is recovered in the buffer_map.  This function wakes up
175  *	anyone waiting for buffer_map kva space.  Even though the buffer_map
176  *	is larger than maxbufspace, this situation will typically occur
177  *	when the buffer_map gets fragmented.
178  */
179 
180 static __inline void
181 kvaspacewakeup(void)
182 {
183 	/*
184 	 * If someone is waiting for KVA space, wake them up.  Even
185 	 * though we haven't freed the kva space yet, the waiting
186 	 * process will be able to now.
187 	 */
188 	if (needsbuffer & VFS_BIO_NEED_KVASPACE) {
189 		needsbuffer &= ~VFS_BIO_NEED_KVASPACE;
190 		wakeup(&needsbuffer);
191 	}
192 }
193 
194 /*
195  *	numdirtywakeup:
196  *
197  *	If someone is blocked due to there being too many dirty buffers,
198  *	and numdirtybuffers is now reasonable, wake them up.
199  */
200 
201 static __inline void
202 numdirtywakeup(void)
203 {
204 	if (numdirtybuffers < hidirtybuffers) {
205 		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
206 			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
207 			wakeup(&needsbuffer);
208 		}
209 	}
210 }
211 
212 /*
213  *	bufspacewakeup:
214  *
215  *	Called when buffer space is potentially available for recovery or when
216  *	buffer space is recovered.  getnewbuf() will block on this flag when
217  *	it is unable to free sufficient buffer space.  Buffer space becomes
218  *	recoverable when bp's get placed back in the queues.
219  */
220 
221 static __inline void
222 bufspacewakeup(void)
223 {
224 	/*
225 	 * If someone is waiting for BUF space, wake them up.  Even
226 	 * though we haven't freed the buffer space yet, the waiting
227 	 * process will be able to now.
228 	 */
229 	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
230 		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
231 		wakeup(&needsbuffer);
232 	}
233 }
234 
235 /*
236  *	bufcountwakeup:
237  *
238  *	Called when a buffer has been added to one of the free queues to
239  *	account for the buffer and to wakeup anyone waiting for free buffers.
240  *	This typically occurs when large amounts of metadata are being handled
241  *	by the buffer cache ( else buffer space runs out first, usually ).
242  */
243 
244 static __inline void
245 bufcountwakeup(void)
246 {
247 	++numfreebuffers;
248 	if (needsbuffer) {
249 		needsbuffer &= ~VFS_BIO_NEED_ANY;
250 		if (numfreebuffers >= hifreebuffers)
251 			needsbuffer &= ~VFS_BIO_NEED_FREE;
252 		wakeup(&needsbuffer);
253 	}
254 }
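
/*
 * The wakeup routines above form one half of a simple handshake on the
 * needsbuffer flag word.  As an illustrative sketch (this mirrors what
 * getnewbuf() and bwillwrite() below actually do, simplified), a waiter
 * looks roughly like:
 *
 *	needsbuffer |= VFS_BIO_NEED_ANY;
 *	while (needsbuffer & VFS_BIO_NEED_ANY)
 *		tsleep(&needsbuffer, PRIBIO + 4, "newbuf", 0);
 *
 * and the wakeup side clears the flag before calling wakeup(&needsbuffer),
 * as the inlines above do.
 */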
255 
256 /*
257  *	vfs_buf_test_cache:
258  *
259  *	Called when a buffer is extended.  This function clears the B_CACHE
260  *	bit if the newly extended portion of the buffer does not contain
261  *	valid data.
262  */
263 static __inline__
264 void
265 vfs_buf_test_cache(struct buf *bp,
266 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
267 		  vm_page_t m)
268 {
269 	if (bp->b_flags & B_CACHE) {
270 		int base = (foff + off) & PAGE_MASK;
271 		if (vm_page_is_valid(m, base, size) == 0)
272 			bp->b_flags &= ~B_CACHE;
273 	}
274 }
275 
276 static __inline__
277 void
278 bd_wakeup(int dirtybuflevel)
279 {
280 	if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
281 		bd_request = 1;
282 		wakeup(&bd_request);
283 	}
284 }
285 
286 
287 /*
288  * Initialize buffer headers and related structures.
289  */
290 
291 caddr_t
292 bufhashinit(caddr_t vaddr)
293 {
294 	/* first, make a null hash table */
295 	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
296 		;
297 	bufhashtbl = (void *)vaddr;
298 	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
299 	--bufhashmask;
300 	return(vaddr);
301 }
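
/*
 * A worked example of the sizing above (numbers chosen for illustration
 * only): with nbuf = 4096, the loop grows bufhashmask from 8 until it
 * reaches nbuf / 4 = 1024, so 1024 list heads are carved out of the
 * preallocated KVA and bufhashmask is then decremented to 1023 for use
 * as a bit mask in bufhash().
 */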
302 
303 void
304 bufinit(void)
305 {
306 	struct buf *bp;
307 	int i;
308 
309 	TAILQ_INIT(&bswlist);
310 	LIST_INIT(&invalhash);
311 	simple_lock_init(&buftimelock);
312 
313 	for (i = 0; i <= bufhashmask; i++)
314 		LIST_INIT(&bufhashtbl[i]);
315 
316 	/* next, make a null set of free lists */
317 	for (i = 0; i < BUFFER_QUEUES; i++)
318 		TAILQ_INIT(&bufqueues[i]);
319 
320 	/* finally, initialize each buffer header and stick on empty q */
321 	for (i = 0; i < nbuf; i++) {
322 		bp = &buf[i];
323 		bzero(bp, sizeof *bp);
324 		bp->b_flags = B_INVAL;	/* we're just an empty header */
325 		bp->b_dev = NODEV;
326 		bp->b_rcred = NOCRED;
327 		bp->b_wcred = NOCRED;
328 		bp->b_qindex = QUEUE_EMPTY;
329 		bp->b_xflags = 0;
330 		LIST_INIT(&bp->b_dep);
331 		BUF_LOCKINIT(bp);
332 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
333 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
334 	}
335 
336 	/*
337 	 * maxbufspace is currently calculated to support all filesystem
338 	 * blocks to be 8K.  If you happen to use a 16K filesystem, the size
339 	 * of the buffer cache is still the same as it would be for 8K
340 	 * filesystems.  This keeps the size of the buffer cache "in check"
341 	 * for big block filesystems.
342 	 *
343 	 * maxbufspace is calculated as around 50% of the KVA available in
344 	 * the buffer_map ( DFLTSIZE vs BKVASIZE ), presumably to reduce the
345 	 * effect of fragmentation.
346 	 */
347 	maxbufspace = (nbuf + 8) * DFLTBSIZE;
348 	if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE)
349 		hibufspace = 3 * maxbufspace / 4;
350 #if 0
351 /*
352  * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
353  */
354 	maxvmiobufspace = 2 * hibufspace / 3;
355 #endif
356 /*
357  * Limit the amount of malloc memory since it is wired permanently into
358  * the kernel space.  Even though this is accounted for in the buffer
359  * allocation, we don't want the malloced region to grow uncontrolled.
360  * The malloc scheme improves memory utilization significantly on average
361  * (small) directories.
362  */
363 	maxbufmallocspace = hibufspace / 20;
364 
365 /*
366  * Reduce the chance of a deadlock occurring by limiting the number
367  * of delayed-write dirty buffers we allow to stack up.
368  */
369 	lodirtybuffers = nbuf / 7 + 10;
370 	hidirtybuffers = nbuf / 4 + 20;
371 	numdirtybuffers = 0;
372 
373 /*
374  * Try to keep the number of free buffers in the specified range,
375  * and give the syncer access to an emergency reserve.
376  */
377 	lofreebuffers = nbuf / 18 + 5;
378 	hifreebuffers = 2 * lofreebuffers;
379 	numfreebuffers = nbuf;
380 
381 /*
382  * Maximum number of async ops initiated per buf_daemon loop.  This is
383  * somewhat of a hack at the moment; we really need to limit ourselves
384  * based on the number of bytes of I/O in-transit that were initiated
385  * from buf_daemon.
386  */
387 	if ((maxbdrun = nswbuf / 4) < 4)
388 		maxbdrun = 4;
389 
390 	kvafreespace = 0;
391 
392 	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
393 	bogus_page = vm_page_alloc(kernel_object,
394 			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
395 			VM_ALLOC_NORMAL);
396 	cnt.v_wire_count++;
397 
398 }
399 
400 /*
401  * Free the kva allocation for a buffer
402  * Must be called only at splbio or higher,
403  *  as this is the only locking for buffer_map.
404  */
405 static void
406 bfreekva(struct buf * bp)
407 {
408 	if (bp->b_kvasize) {
409 		vm_map_delete(buffer_map,
410 		    (vm_offset_t) bp->b_kvabase,
411 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
412 		);
413 		bp->b_kvasize = 0;
414 		kvaspacewakeup();
415 	}
416 }
417 
418 /*
419  *	bremfree:
420  *
421  *	Remove the buffer from the appropriate free list.
422  */
423 void
424 bremfree(struct buf * bp)
425 {
426 	int s = splbio();
427 	int old_qindex = bp->b_qindex;
428 
429 	if (bp->b_qindex != QUEUE_NONE) {
430 		if (bp->b_qindex == QUEUE_EMPTYKVA) {
431 			kvafreespace -= bp->b_kvasize;
432 		}
433 		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
434 		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
435 		bp->b_qindex = QUEUE_NONE;
436 		runningbufspace += bp->b_bufsize;
437 	} else {
438 #if !defined(MAX_PERF)
439 		if (BUF_REFCNT(bp) <= 1)
440 			panic("bremfree: removing a buffer not on a queue");
441 #endif
442 	}
443 
444 	/*
445 	 * Fixup numfreebuffers count.  If the buffer is invalid or not
446 	 * delayed-write, and it was on one of the free queues counted below,
447 	 * the buffer was free and we must decrement numfreebuffers.
448 	 */
449 	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
450 		switch(old_qindex) {
451 		case QUEUE_DIRTY:
452 		case QUEUE_CLEAN:
453 		case QUEUE_EMPTY:
454 		case QUEUE_EMPTYKVA:
455 			--numfreebuffers;
456 			break;
457 		default:
458 			break;
459 		}
460 	}
461 	splx(s);
462 }
463 
464 
465 /*
466  * Get a buffer with the specified data.  Look in the cache first.  We
467  * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
468  * is set, the buffer is valid and we do not have to do anything ( see
469  * getblk() ).
470  */
471 int
472 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
473     struct buf ** bpp)
474 {
475 	struct buf *bp;
476 
477 	bp = getblk(vp, blkno, size, 0, 0);
478 	*bpp = bp;
479 
480 	/* if not found in cache, do some I/O */
481 	if ((bp->b_flags & B_CACHE) == 0) {
482 		if (curproc != NULL)
483 			curproc->p_stats->p_ru.ru_inblock++;
484 		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
485 		bp->b_flags |= B_READ;
486 		bp->b_flags &= ~(B_ERROR | B_INVAL);
487 		if (bp->b_rcred == NOCRED) {
488 			if (cred != NOCRED)
489 				crhold(cred);
490 			bp->b_rcred = cred;
491 		}
492 		vfs_busy_pages(bp, 0);
493 		VOP_STRATEGY(vp, bp);
494 		return (biowait(bp));
495 	}
496 	return (0);
497 }
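
/*
 * Illustrative use of bread() (a sketch only; vp, lblkno and bsize are
 * hypothetical names, not taken from this file):
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_data ...
 *	bqrelse(bp);	(or brelse() if the data is not wanted again)
 *
 * bread() hands a buffer back through *bpp even when the I/O fails, so
 * the caller releases it on both paths.
 */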
498 
499 /*
500  * Operates like bread, but also starts asynchronous I/O on
501  * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
502  * to initiating I/O . If B_CACHE is set, the buffer is valid
503  * and we do not have to do anything.
504  */
505 int
506 breadn(struct vnode * vp, daddr_t blkno, int size,
507     daddr_t * rablkno, int *rabsize,
508     int cnt, struct ucred * cred, struct buf ** bpp)
509 {
510 	struct buf *bp, *rabp;
511 	int i;
512 	int rv = 0, readwait = 0;
513 
514 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
515 
516 	/* if not found in cache, do some I/O */
517 	if ((bp->b_flags & B_CACHE) == 0) {
518 		if (curproc != NULL)
519 			curproc->p_stats->p_ru.ru_inblock++;
520 		bp->b_flags |= B_READ;
521 		bp->b_flags &= ~(B_ERROR | B_INVAL);
522 		if (bp->b_rcred == NOCRED) {
523 			if (cred != NOCRED)
524 				crhold(cred);
525 			bp->b_rcred = cred;
526 		}
527 		vfs_busy_pages(bp, 0);
528 		VOP_STRATEGY(vp, bp);
529 		++readwait;
530 	}
531 
532 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
533 		if (inmem(vp, *rablkno))
534 			continue;
535 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
536 
537 		if ((rabp->b_flags & B_CACHE) == 0) {
538 			if (curproc != NULL)
539 				curproc->p_stats->p_ru.ru_inblock++;
540 			rabp->b_flags |= B_READ | B_ASYNC;
541 			rabp->b_flags &= ~(B_ERROR | B_INVAL);
542 			if (rabp->b_rcred == NOCRED) {
543 				if (cred != NOCRED)
544 					crhold(cred);
545 				rabp->b_rcred = cred;
546 			}
547 			vfs_busy_pages(rabp, 0);
548 			BUF_KERNPROC(rabp);
549 			VOP_STRATEGY(vp, rabp);
550 		} else {
551 			brelse(rabp);
552 		}
553 	}
554 
555 	if (readwait) {
556 		rv = biowait(bp);
557 	}
558 	return (rv);
559 }
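
/*
 * Illustrative use of breadn() (a sketch; the names and block numbers are
 * hypothetical): to read logical block N and prefetch the two following
 * blocks, a caller might set up
 *
 *	daddr_t rablks[2] = { N + 1, N + 2 };
 *	int rasizes[2] = { bsize, bsize };
 *
 *	error = breadn(vp, N, bsize, rablks, rasizes, 2, NOCRED, &bp);
 *
 * Only the block actually requested is waited for; the read-ahead blocks
 * are issued asynchronously and released when their I/O completes.
 */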
560 
561 /*
562  * Write, release buffer on completion.  (Done by iodone
563  * if async).  Do not bother writing anything if the buffer
564  * is invalid.
565  *
566  * Note that we set B_CACHE here, indicating that buffer is
567  * fully valid and thus cacheable.  This is true even of NFS
568  * now so we set it generally.  This could be set either here
569  * or in biodone() since the I/O is synchronous.  We put it
570  * here.
571  */
572 int
573 bwrite(struct buf * bp)
574 {
575 	int oldflags, s;
576 	struct vnode *vp;
577 	struct mount *mp;
578 
579 	if (bp->b_flags & B_INVAL) {
580 		brelse(bp);
581 		return (0);
582 	}
583 
584 	oldflags = bp->b_flags;
585 
586 #if !defined(MAX_PERF)
587 	if (BUF_REFCNT(bp) == 0)
588 		panic("bwrite: buffer is not busy???");
589 #endif
590 	s = splbio();
591 	bundirty(bp);
592 
593 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
594 	bp->b_flags |= B_WRITEINPROG | B_CACHE;
595 
596 	bp->b_vp->v_numoutput++;
597 	vfs_busy_pages(bp, 1);
598 	if (curproc != NULL)
599 		curproc->p_stats->p_ru.ru_oublock++;
600 	splx(s);
601 	if (oldflags & B_ASYNC)
602 		BUF_KERNPROC(bp);
603 	VOP_STRATEGY(bp->b_vp, bp);
604 
605 	/*
606 	 * Collect statistics on synchronous and asynchronous writes.
607 	 * Writes to block devices are charged to their associated
608 	 * filesystem (if any).
609 	 */
610 	if ((vp = bp->b_vp) != NULL) {
611 		if (vp->v_type == VBLK)
612 			mp = vp->v_specmountpoint;
613 		else
614 			mp = vp->v_mount;
615 		if (mp != NULL) {
616 			if ((oldflags & B_ASYNC) == 0)
617 				mp->mnt_stat.f_syncwrites++;
618 			else
619 				mp->mnt_stat.f_asyncwrites++;
620 		}
621 	}
622 
623 	if ((oldflags & B_ASYNC) == 0) {
624 		int rtval = biowait(bp);
625 		brelse(bp);
626 		return (rtval);
627 	}
628 
629 	return (0);
630 }
631 
632 /*
633  * Delayed write. (Buffer is marked dirty).  Do not bother writing
634  * anything if the buffer is marked invalid.
635  *
636  * Note that since the buffer must be completely valid, we can safely
637  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
638  * biodone() in order to prevent getblk from writing the buffer
639  * out synchronously.
640  */
641 void
642 bdwrite(struct buf * bp)
643 {
644 #if 0
645 	struct vnode *vp;
646 #endif
647 
648 #if !defined(MAX_PERF)
649 	if (BUF_REFCNT(bp) == 0)
650 		panic("bdwrite: buffer is not busy");
651 #endif
652 
653 	if (bp->b_flags & B_INVAL) {
654 		brelse(bp);
655 		return;
656 	}
657 	bdirty(bp);
658 
659 	/*
660 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
661 	 * true even of NFS now.
662 	 */
663 	bp->b_flags |= B_CACHE;
664 
665 	/*
666 	 * Doing the bmap here keeps the system from needing to do it
667 	 * later, perhaps when the system is attempting to do a sync.
668 	 * Since the indirect block -- or whatever other data structure
669 	 * the filesystem needs -- is likely still in memory now, this
670 	 * is a good time to do it.  Note also that if the pageout
671 	 * daemon is requesting a sync, there might not be enough memory
672 	 * to do the bmap then, so it is important to do it now.
673 	 */
674 	if (bp->b_lblkno == bp->b_blkno) {
675 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
676 	}
677 
678 	/*
679 	 * Set the *dirty* buffer range based upon the VM system dirty pages.
680 	 */
681 	vfs_setdirty(bp);
682 
683 	/*
684 	 * We need to do this here to satisfy the vnode_pager and the
685 	 * pageout daemon, so that it thinks that the pages have been
686 	 * "cleaned".  Note that since the pages are in a delayed write
687 	 * buffer -- the VFS layer "will" see that the pages get written
688 	 * out on the next sync, or perhaps the cluster will be completed.
689 	 */
690 	vfs_clean_pages(bp);
691 	bqrelse(bp);
692 
693 	/*
694 	 * Wakeup the buffer flushing daemon if we have saturated the
695 	 * buffer cache.
696 	 */
697 
698 	bd_wakeup(hidirtybuffers);
699 
700 	/*
701 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
702 	 * due to the softdep code.
703 	 */
704 #if 0
705 	/*
706 	 * XXX The soft dependency code is not prepared to
707 	 * have I/O done when a bdwrite is requested. For
708 	 * now we just let the write be delayed if it is
709 	 * requested by the soft dependency code.
710 	 */
711 	if ((vp = bp->b_vp) &&
712 	    ((vp->v_type == VBLK && vp->v_specmountpoint &&
713 		  (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
714 		 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
715 		return;
716 #endif
717 }
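
/*
 * Illustrative pattern (a sketch, not code from this file): a filesystem
 * that has modified a buffer but does not need the write to complete
 * immediately typically does
 *
 *	... modify bp->b_data ...
 *	bdwrite(bp);
 *
 * instead of bwrite(bp), leaving the actual output to the syncer or to
 * the buf_daemon below.
 */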
718 
719 /*
720  *	bdirty:
721  *
722  *	Turn buffer into delayed write request.  We must clear B_READ and
723  *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
724  *	itself to properly update it in the dirty/clean lists.  We mark it
725  *	B_DONE to ensure that any asynchronization of the buffer properly
726  *	clears B_DONE ( else a panic will occur later ).
727  *
728  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
729  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
730  *	should only be called if the buffer is known-good.
731  *
732  *	Since the buffer is not on a queue, we do not update the numfreebuffers
733  *	count.
734  *
735  *	Must be called at splbio().
736  *	The buffer must be on QUEUE_NONE.
737  */
738 void
739 bdirty(bp)
740 	struct buf *bp;
741 {
742 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
743 	bp->b_flags &= ~(B_READ|B_RELBUF);
744 
745 	if ((bp->b_flags & B_DELWRI) == 0) {
746 		bp->b_flags |= B_DONE | B_DELWRI;
747 		reassignbuf(bp, bp->b_vp);
748 		++numdirtybuffers;
749 		bd_wakeup(hidirtybuffers);
750 	}
751 }
752 
753 /*
754  *	bundirty:
755  *
756  *	Clear B_DELWRI for buffer.
757  *
758  *	Since the buffer is not on a queue, we do not update the numfreebuffers
759  *	count.
760  *
761  *	Must be called at splbio().
762  *	The buffer must be on QUEUE_NONE.
763  */
764 
765 void
766 bundirty(bp)
767 	struct buf *bp;
768 {
769 	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
770 
771 	if (bp->b_flags & B_DELWRI) {
772 		bp->b_flags &= ~B_DELWRI;
773 		reassignbuf(bp, bp->b_vp);
774 		--numdirtybuffers;
775 		numdirtywakeup();
776 	}
777 }
778 
779 /*
780  *	bawrite:
781  *
782  *	Asynchronous write.  Start output on a buffer, but do not wait for
783  *	it to complete.  The buffer is released when the output completes.
784  *
785  *	bwrite() ( or the VOP routine anyway ) is responsible for handling
786  *	B_INVAL buffers.  Not us.
787  */
788 void
789 bawrite(struct buf * bp)
790 {
791 	bp->b_flags |= B_ASYNC;
792 	(void) VOP_BWRITE(bp->b_vp, bp);
793 }
794 
795 /*
796  *	bowrite:
797  *
798  *	Ordered write.  Start output on a buffer, and flag it so that the
799  *	device will write it in the order it was queued.  The buffer is
800  *	released when the output completes.  bwrite() ( or the VOP routine
801  *	anyway ) is responsible for handling B_INVAL buffers.
802  */
803 int
804 bowrite(struct buf * bp)
805 {
806 	bp->b_flags |= B_ORDERED | B_ASYNC;
807 	return (VOP_BWRITE(bp->b_vp, bp));
808 }
809 
810 /*
811  *	bwillwrite:
812  *
813  *	Called prior to the locking of any vnodes when we are expecting to
814  *	write.  We do not want to starve the buffer cache with too many
815  *	dirty buffers so we block here.  By blocking prior to the locking
816  *	of any vnodes we attempt to avoid the situation where a locked vnode
817  *	prevents the various system daemons from flushing related buffers.
818  */
819 
820 void
821 bwillwrite(void)
822 {
823 	int twenty = (hidirtybuffers - lodirtybuffers) / 5;
824 
825 	if (numdirtybuffers > hidirtybuffers + twenty) {
826 		int s;
827 
828 		s = splbio();
829 		while (numdirtybuffers > hidirtybuffers) {
830 			bd_wakeup(hidirtybuffers);
831 			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
832 			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
833 		}
834 		splx(s);
835 	}
836 }
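
/*
 * Illustrative call site (a sketch; the surrounding code is hypothetical):
 * a write path calls this before acquiring any vnode locks, e.g.
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *	... generate dirty buffers ...
 *
 * so that any throttling happens while no vnode is held locked.
 */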
837 
838 /*
839  *	brelse:
840  *
841  *	Release a busy buffer and, if requested, free its resources.  The
842  *	buffer will be stashed in the appropriate bufqueue[] allowing it
843  *	to be accessed later as a cache entity or reused for other purposes.
844  */
845 void
846 brelse(struct buf * bp)
847 {
848 	int s;
849 
850 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
851 
852 #if 0
853 	if (bp->b_flags & B_CLUSTER) {
854 		relpbuf(bp, NULL);
855 		return;
856 	}
857 #endif
858 
859 	s = splbio();
860 
861 	if (bp->b_flags & B_LOCKED)
862 		bp->b_flags &= ~B_ERROR;
863 
864 	if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
865 		/*
866 		 * Failed write, redirty.  Must clear B_ERROR to prevent
867 		 * pages from being scrapped.  If B_INVAL is set then
868 		 * this case is not run and the next case is run to
869 		 * destroy the buffer.  B_INVAL can occur if the buffer
870 		 * is outside the range supported by the underlying device.
871 		 */
872 		bp->b_flags &= ~B_ERROR;
873 		bdirty(bp);
874 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
875 	    (bp->b_bufsize <= 0)) {
876 		/*
877 		 * Either a failed I/O or we were asked to free or not
878 		 * cache the buffer.
879 		 */
880 		bp->b_flags |= B_INVAL;
881 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
882 			(*bioops.io_deallocate)(bp);
883 		if (bp->b_flags & B_DELWRI) {
884 			--numdirtybuffers;
885 			numdirtywakeup();
886 		}
887 		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
888 		if ((bp->b_flags & B_VMIO) == 0) {
889 			if (bp->b_bufsize)
890 				allocbuf(bp, 0);
891 			if (bp->b_vp)
892 				brelvp(bp);
893 		}
894 	}
895 
896 	/*
897 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
898 	 * is called with B_DELWRI set, the underlying pages may wind up
899 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
900 	 * because pages associated with a B_DELWRI bp are marked clean.
901 	 *
902 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
903 	 * if B_DELWRI is set.
904 	 */
905 
906 	if (bp->b_flags & B_DELWRI)
907 		bp->b_flags &= ~B_RELBUF;
908 
909 	/*
910 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
911 	 * constituted, not even NFS buffers now.  Two flags affect this.  If
912 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
913 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
914 	 *
915 	 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
916 	 * invalidated.  B_ERROR cannot be set for a failed write unless the
917 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
918 	 *
919 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
920 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
921 	 * the commit state and we cannot afford to lose the buffer.
922 	 */
923 	if ((bp->b_flags & B_VMIO)
924 	    && !(bp->b_vp->v_tag == VT_NFS &&
925 		 bp->b_vp->v_type != VBLK &&
926 		 (bp->b_flags & B_DELWRI))
927 	    ) {
928 
929 		int i, j, resid;
930 		vm_page_t m;
931 		off_t foff;
932 		vm_pindex_t poff;
933 		vm_object_t obj;
934 		struct vnode *vp;
935 
936 		vp = bp->b_vp;
937 
938 		/*
939 		 * Get the base offset and length of the buffer.  Note that
940 		 * for block sizes that are less than PAGE_SIZE, the b_data
941 		 * base of the buffer does not represent exactly b_offset and
942 		 * neither b_offset nor b_size are necessarily page aligned.
943 		 * Instead, the starting position of b_offset is:
944 		 *
945 		 * 	b_data + (b_offset & PAGE_MASK)
946 		 *
947 		 * block sizes less than DEV_BSIZE (usually 512) are not
948 		 * supported due to the page granularity bits (m->valid,
949 		 * m->dirty, etc...).
950 		 *
951 		 * See man buf(9) for more information
952 		 */
953 
954 		resid = bp->b_bufsize;
955 		foff = bp->b_offset;
956 
957 		for (i = 0; i < bp->b_npages; i++) {
958 			m = bp->b_pages[i];
959 			vm_page_flag_clear(m, PG_ZERO);
960 			if (m == bogus_page) {
961 
962 				obj = (vm_object_t) vp->v_object;
963 				poff = OFF_TO_IDX(bp->b_offset);
964 
965 				for (j = i; j < bp->b_npages; j++) {
966 					m = bp->b_pages[j];
967 					if (m == bogus_page) {
968 						m = vm_page_lookup(obj, poff + j);
969 #if !defined(MAX_PERF)
970 						if (!m) {
971 							panic("brelse: page missing\n");
972 						}
973 #endif
974 						bp->b_pages[j] = m;
975 					}
976 				}
977 
978 				if ((bp->b_flags & B_INVAL) == 0) {
979 					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
980 				}
981 			}
982 			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
983 				int poffset = foff & PAGE_MASK;
984 				int presid = resid > (PAGE_SIZE - poffset) ?
985 					(PAGE_SIZE - poffset) : resid;
986 
987 				KASSERT(presid >= 0, ("brelse: extra page"));
988 				vm_page_set_invalid(m, poffset, presid);
989 			}
990 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
991 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
992 		}
993 
994 		if (bp->b_flags & (B_INVAL | B_RELBUF))
995 			vfs_vmio_release(bp);
996 
997 	} else if (bp->b_flags & B_VMIO) {
998 
999 		if (bp->b_flags & (B_INVAL | B_RELBUF))
1000 			vfs_vmio_release(bp);
1001 
1002 	}
1003 
1004 #if !defined(MAX_PERF)
1005 	if (bp->b_qindex != QUEUE_NONE)
1006 		panic("brelse: free buffer onto another queue???");
1007 #endif
1008 	if (BUF_REFCNT(bp) > 1) {
1009 		/* Temporary panic to verify exclusive locking */
1010 		/* This panic goes away when we allow shared refs */
1011 		panic("brelse: multiple refs");
1012 		/* do not release to free list */
1013 		BUF_UNLOCK(bp);
1014 		splx(s);
1015 		return;
1016 	}
1017 
1018 	/* enqueue */
1019 
1020 	/* buffers with no memory */
1021 	if (bp->b_bufsize == 0) {
1022 		bp->b_flags |= B_INVAL;
1023 		if (bp->b_kvasize)
1024 			bp->b_qindex = QUEUE_EMPTYKVA;
1025 		else
1026 			bp->b_qindex = QUEUE_EMPTY;
1027 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1028 		LIST_REMOVE(bp, b_hash);
1029 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1030 		bp->b_dev = NODEV;
1031 		kvafreespace += bp->b_kvasize;
1032 		if (bp->b_kvasize)
1033 			kvaspacewakeup();
1034 	/* buffers with junk contents */
1035 	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
1036 		bp->b_flags |= B_INVAL;
1037 		bp->b_qindex = QUEUE_CLEAN;
1038 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1039 		LIST_REMOVE(bp, b_hash);
1040 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1041 		bp->b_dev = NODEV;
1042 
1043 	/* buffers that are locked */
1044 	} else if (bp->b_flags & B_LOCKED) {
1045 		bp->b_qindex = QUEUE_LOCKED;
1046 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1047 
1048 	/* remaining buffers */
1049 	} else {
1050 		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
1051 		case B_DELWRI | B_AGE:
1052 		    bp->b_qindex = QUEUE_DIRTY;
1053 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1054 		    break;
1055 		case B_DELWRI:
1056 		    bp->b_qindex = QUEUE_DIRTY;
1057 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1058 		    break;
1059 		case B_AGE:
1060 		    bp->b_qindex = QUEUE_CLEAN;
1061 		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1062 		    break;
1063 		default:
1064 		    bp->b_qindex = QUEUE_CLEAN;
1065 		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1066 		    break;
1067 		}
1068 	}
1069 
1070 	/*
1071 	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
1072 	 * on the correct queue.
1073 	 */
1074 	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
1075 		bp->b_flags &= ~B_DELWRI;
1076 		--numdirtybuffers;
1077 		numdirtywakeup();
1078 	}
1079 
1080 	runningbufspace -= bp->b_bufsize;
1081 
1082 	/*
1083 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
1084 	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
1085 	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1086 	 * if B_INVAL is set ).
1087 	 */
1088 
1089 	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
1090 		bufcountwakeup();
1091 
1092 	/*
1093 	 * Something we can maybe free.
1094 	 */
1095 
1096 	if (bp->b_bufsize)
1097 		bufspacewakeup();
1098 
1099 	/* unlock */
1100 	BUF_UNLOCK(bp);
1101 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1102 	splx(s);
1103 }
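
/*
 * Illustrative sketch (this mirrors what getnewbuf() below actually does):
 * a caller that wants the buffer and its backing store thrown away rather
 * than cached marks it before releasing:
 *
 *	bp->b_flags |= B_INVAL;		(optionally B_RELBUF as well)
 *	brelse(bp);
 *
 * whereas a plain brelse() of a clean, valid buffer simply requeues it on
 * QUEUE_CLEAN for later reuse as a cache entity.
 */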
1104 
1105 /*
1106  * Release a buffer back to the appropriate queue but do not try to free
1107  * it.
1108  *
1109  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1110  * biodone() to requeue an async I/O on completion.  It is also used when
1111  * known good buffers need to be requeued but we think we may need the data
1112  * again soon.
1113  */
1114 void
1115 bqrelse(struct buf * bp)
1116 {
1117 	int s;
1118 
1119 	s = splbio();
1120 
1121 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1122 
1123 #if !defined(MAX_PERF)
1124 	if (bp->b_qindex != QUEUE_NONE)
1125 		panic("bqrelse: free buffer onto another queue???");
1126 #endif
1127 	if (BUF_REFCNT(bp) > 1) {
1128 		/* do not release to free list */
1129 		panic("bqrelse: multiple refs");
1130 		BUF_UNLOCK(bp);
1131 		splx(s);
1132 		return;
1133 	}
1134 	if (bp->b_flags & B_LOCKED) {
1135 		bp->b_flags &= ~B_ERROR;
1136 		bp->b_qindex = QUEUE_LOCKED;
1137 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1138 		/* buffers with stale but valid contents */
1139 	} else if (bp->b_flags & B_DELWRI) {
1140 		bp->b_qindex = QUEUE_DIRTY;
1141 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1142 	} else {
1143 		bp->b_qindex = QUEUE_CLEAN;
1144 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1145 	}
1146 
1147 	runningbufspace -= bp->b_bufsize;
1148 
1149 	if ((bp->b_flags & B_LOCKED) == 0 &&
1150 	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
1151 		bufcountwakeup();
1152 	}
1153 
1154 	/*
1155 	 * Something we can maybe wakeup
1156 	 */
1157 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1158 		bufspacewakeup();
1159 
1160 	/* unlock */
1161 	BUF_UNLOCK(bp);
1162 	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1163 	splx(s);
1164 }
1165 
1166 static void
1167 vfs_vmio_release(bp)
1168 	struct buf *bp;
1169 {
1170 	int i, s;
1171 	vm_page_t m;
1172 
1173 	s = splvm();
1174 	for (i = 0; i < bp->b_npages; i++) {
1175 		m = bp->b_pages[i];
1176 		bp->b_pages[i] = NULL;
1177 		/*
1178 		 * In order to keep page LRU ordering consistent, put
1179 		 * everything on the inactive queue.
1180 		 */
1181 		vm_page_unwire(m, 0);
1182 		/*
1183 		 * We don't mess with busy pages, it is
1184 		 * the responsibility of the process that
1185 		 * busied the pages to deal with them.
1186 		 */
1187 		if ((m->flags & PG_BUSY) || (m->busy != 0))
1188 			continue;
1189 
1190 		if (m->wire_count == 0) {
1191 			vm_page_flag_clear(m, PG_ZERO);
1192 			/*
1193 			 * Might as well free the page if we can and it has
1194 			 * no valid data.
1195 			 */
1196 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
1197 				vm_page_busy(m);
1198 				vm_page_protect(m, VM_PROT_NONE);
1199 				vm_page_free(m);
1200 			}
1201 		}
1202 	}
1203 	bufspace -= bp->b_bufsize;
1204 	vmiospace -= bp->b_bufsize;
1205 	runningbufspace -= bp->b_bufsize;
1206 	splx(s);
1207 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
1208 	if (bp->b_bufsize)
1209 		bufspacewakeup();
1210 	bp->b_npages = 0;
1211 	bp->b_bufsize = 0;
1212 	bp->b_flags &= ~B_VMIO;
1213 	if (bp->b_vp)
1214 		brelvp(bp);
1215 }
1216 
1217 /*
1218  * Check to see if a block is currently memory resident.
1219  */
1220 struct buf *
1221 gbincore(struct vnode * vp, daddr_t blkno)
1222 {
1223 	struct buf *bp;
1224 	struct bufhashhdr *bh;
1225 
1226 	bh = bufhash(vp, blkno);
1227 	bp = bh->lh_first;
1228 
1229 	/* Search hash chain */
1230 	while (bp != NULL) {
1231 		/* hit */
1232 		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
1233 		    (bp->b_flags & B_INVAL) == 0) {
1234 			break;
1235 		}
1236 		bp = bp->b_hash.le_next;
1237 	}
1238 	return (bp);
1239 }
1240 
1241 /*
1242  *	vfs_bio_awrite:
1243  *
1244  *	Implement clustered async writes for clearing out B_DELWRI buffers.
1245  *	This is much better than the old way of writing only one buffer at
1246  *	a time.  Note that we may not be presented with the buffers in the
1247  *	correct order, so we search for the cluster in both directions.
1248  */
1249 int
1250 vfs_bio_awrite(struct buf * bp)
1251 {
1252 	int i;
1253 	int j;
1254 	daddr_t lblkno = bp->b_lblkno;
1255 	struct vnode *vp = bp->b_vp;
1256 	int s;
1257 	int ncl;
1258 	struct buf *bpa;
1259 	int nwritten;
1260 	int size;
1261 	int maxcl;
1262 
1263 	s = splbio();
1264 	/*
1265 	 * right now we support clustered writing only to regular files.  If
1266 	 * we find a clusterable block we could be in the middle of a cluster
1267 	 * rather than at the beginning.
1268 	 */
1269 	if ((vp->v_type == VREG) &&
1270 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1271 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1272 
1273 		size = vp->v_mount->mnt_stat.f_iosize;
1274 		maxcl = MAXPHYS / size;
1275 
1276 		for (i = 1; i < maxcl; i++) {
1277 			if ((bpa = gbincore(vp, lblkno + i)) &&
1278 			    BUF_REFCNT(bpa) == 0 &&
1279 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1280 			    (B_DELWRI | B_CLUSTEROK)) &&
1281 			    (bpa->b_bufsize == size)) {
1282 				if ((bpa->b_blkno == bpa->b_lblkno) ||
1283 				    (bpa->b_blkno !=
1284 				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
1285 					break;
1286 			} else {
1287 				break;
1288 			}
1289 		}
1290 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
1291 			if ((bpa = gbincore(vp, lblkno - j)) &&
1292 			    BUF_REFCNT(bpa) == 0 &&
1293 			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1294 			    (B_DELWRI | B_CLUSTEROK)) &&
1295 			    (bpa->b_bufsize == size)) {
1296 				if ((bpa->b_blkno == bpa->b_lblkno) ||
1297 				    (bpa->b_blkno !=
1298 				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
1299 					break;
1300 			} else {
1301 				break;
1302 			}
1303 		}
1304 		--j;
1305 		ncl = i + j;
1306 		/*
1307 		 * this is a possible cluster write
1308 		 */
1309 		if (ncl != 1) {
1310 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
1311 			splx(s);
1312 			return nwritten;
1313 		}
1314 	}
1315 
1316 	BUF_LOCK(bp, LK_EXCLUSIVE);
1317 	bremfree(bp);
1318 	bp->b_flags |= B_ASYNC;
1319 
1320 	splx(s);
1321 	/*
1322 	 * default (old) behavior, writing out only one block
1323 	 *
1324 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1325 	 */
1326 	nwritten = bp->b_bufsize;
1327 	(void) VOP_BWRITE(bp->b_vp, bp);
1328 
1329 	return nwritten;
1330 }
1331 
1332 /*
1333  *	getnewbuf:
1334  *
1335  *	Find and initialize a new buffer header, freeing up existing buffers
1336  *	in the bufqueues as necessary.  The new buffer is returned locked.
1337  *
1338  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
1339  *	buffer away, the caller must set B_INVAL prior to calling brelse().
1340  *
1341  *	We block if:
1342  *		We have insufficient buffer headers
1343  *		We have insufficient buffer space
1344  *		buffer_map is too fragmented ( space reservation fails )
1345  *		If we have to flush dirty buffers ( but we try to avoid this )
1346  *
1347  *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1348  *	Instead we ask the buf daemon to do it for us.  We attempt to
1349  *	avoid piecemeal wakeups of the pageout daemon.
1350  */
1351 
1352 static struct buf *
1353 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
1354 {
1355 	struct buf *bp;
1356 	struct buf *nbp;
1357 	struct buf *dbp;
1358 	int outofspace;
1359 	int nqindex;
1360 	int defrag = 0;
1361 
1362 	++getnewbufcalls;
1363 	--getnewbufrestarts;
1364 restart:
1365 	++getnewbufrestarts;
1366 
1367 	/*
1368 	 * Calculate whether we are out of buffer space.  This state is
1369 	 * recalculated on every restart.  If we are out of space, we
1370 	 * have to turn off defragmentation.  Setting defrag to -1 when
1371 	 * outofspace is positive means "defrag while freeing buffers".
1372 	 * The looping conditional will be muffed up if defrag is left
1373 	 * positive when outofspace is positive.
1374 	 */
1375 
1376 	dbp = NULL;
1377 	outofspace = 0;
1378 	if (bufspace >= hibufspace) {
1379 		if ((curproc && (curproc->p_flag & P_BUFEXHAUST) == 0) ||
1380 		    bufspace >= maxbufspace) {
1381 			outofspace = 1;
1382 			if (defrag > 0)
1383 				defrag = -1;
1384 		}
1385 	}
1386 
1387 	/*
1388 	 * defrag state is semi-persistent.  1 means we are flagged for
1389 	 * defragging.  -1 means we actually defragged something.
1390 	 */
1391 	/* nop */
1392 
1393 	/*
1394 	 * Setup for scan.  If we do not have enough free buffers,
1395 	 * we setup a degenerate case that immediately fails.  Note
1396 	 * that if we are specially marked process, we are allowed to
1397 	 * dip into our reserves.
1398 	 *
1399 	 * Normally we want to find an EMPTYKVA buffer.  That is, a
1400 	 * buffer with kva already allocated.  If there are no EMPTYKVA
1401 	 * buffers we back up to the truly EMPTY buffers.  When defragging
1402 	 * we do not bother backing up since we have to locate buffers with
1403 	 * kva to defrag.  If we are out of space we skip both EMPTY and
1404 	 * EMPTYKVA and dig right into the CLEAN queue.
1405 	 *
1406 	 * In this manner we avoid scanning unnecessary buffers.  It is very
1407 	 * important for us to do this because the buffer cache is almost
1408 	 * constantly out of space or in need of defragmentation.
1409 	 */
1410 
1411 	if (curproc && (curproc->p_flag & P_BUFEXHAUST) == 0 &&
1412 	    numfreebuffers < lofreebuffers) {
1413 		nqindex = QUEUE_CLEAN;
1414 		nbp = NULL;
1415 	} else {
1416 		nqindex = QUEUE_EMPTYKVA;
1417 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
1418 		if (nbp == NULL) {
1419 			if (defrag <= 0) {
1420 				nqindex = QUEUE_EMPTY;
1421 				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1422 			}
1423 		}
1424 		if (outofspace || nbp == NULL) {
1425 			nqindex = QUEUE_CLEAN;
1426 			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
1427 		}
1428 	}
1429 
1430 	/*
1431 	 * Run scan, possibly freeing data and/or kva mappings on the fly
1432 	 * depending.
1433 	 */
1434 
1435 	while ((bp = nbp) != NULL) {
1436 		int qindex = nqindex;
1437 
1438 		/*
1439 		 * Calculate next bp ( we can only use it if we do not block
1440 		 * or do other fancy things ).
1441 		 */
1442 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1443 			switch(qindex) {
1444 			case QUEUE_EMPTY:
1445 				nqindex = QUEUE_EMPTYKVA;
1446 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
1447 					break;
1448 				/* fall through */
1449 			case QUEUE_EMPTYKVA:
1450 				nqindex = QUEUE_CLEAN;
1451 				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
1452 					break;
1453 				/* fall through */
1454 			case QUEUE_CLEAN:
1455 				/*
1456 				 * nbp is NULL.
1457 				 */
1458 				break;
1459 			}
1460 		}
1461 
1462 		/*
1463 		 * Sanity Checks
1464 		 */
1465 		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1466 
1467 		/*
1468 		 * Note: we no longer distinguish between VMIO and non-VMIO
1469 		 * buffers.
1470 		 */
1471 
1472 		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1473 
1474 		/*
1475 		 * If we are defragging and the buffer isn't useful for fixing
1476 		 * that problem we continue.  If we are out of space and the
1477 		 * buffer isn't useful for fixing that problem we continue.
1478 		 */
1479 
1480 		if (defrag > 0 && bp->b_kvasize == 0)
1481 			continue;
1482 		if (outofspace > 0 && bp->b_bufsize == 0)
1483 			continue;
1484 
1485 		/*
1486 		 * Start freeing the bp.  This is somewhat involved.  nbp
1487 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
1488 		 */
1489 
1490 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1491 			panic("getnewbuf: locked buf");
1492 		bremfree(bp);
1493 
1494 		if (qindex == QUEUE_CLEAN) {
1495 			if (bp->b_flags & B_VMIO) {
1496 				bp->b_flags &= ~B_ASYNC;
1497 				vfs_vmio_release(bp);
1498 			}
1499 			if (bp->b_vp)
1500 				brelvp(bp);
1501 		}
1502 
1503 		/*
1504 		 * NOTE:  nbp is now entirely invalid.  We can only restart
1505 		 * the scan from this point on.
1506 		 *
1507 		 * Get the rest of the buffer freed up.  b_kva* is still
1508 		 * valid after this operation.
1509 		 */
1510 
1511 		if (bp->b_rcred != NOCRED) {
1512 			crfree(bp->b_rcred);
1513 			bp->b_rcred = NOCRED;
1514 		}
1515 		if (bp->b_wcred != NOCRED) {
1516 			crfree(bp->b_wcred);
1517 			bp->b_wcred = NOCRED;
1518 		}
1519 		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
1520 			(*bioops.io_deallocate)(bp);
1521 		LIST_REMOVE(bp, b_hash);
1522 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1523 
1524 		if (bp->b_bufsize)
1525 			allocbuf(bp, 0);
1526 
1527 		bp->b_flags = 0;
1528 		bp->b_dev = NODEV;
1529 		bp->b_vp = NULL;
1530 		bp->b_blkno = bp->b_lblkno = 0;
1531 		bp->b_offset = NOOFFSET;
1532 		bp->b_iodone = 0;
1533 		bp->b_error = 0;
1534 		bp->b_resid = 0;
1535 		bp->b_bcount = 0;
1536 		bp->b_npages = 0;
1537 		bp->b_dirtyoff = bp->b_dirtyend = 0;
1538 
1539 		LIST_INIT(&bp->b_dep);
1540 
1541 		/*
1542 		 * Ok, now that we have a free buffer, if we are defragging
1543 		 * we have to recover the kvaspace.  If we are out of space
1544 		 * we have to free the buffer (which we just did), but we
1545 		 * do not have to recover kva space unless we hit a defrag
1546 		 * hiccup.  Being able to avoid freeing the kva space leads
1547 		 * to a significant reduction in overhead.
1548 		 */
1549 
1550 		if (defrag > 0) {
1551 			defrag = -1;
1552 			bp->b_flags |= B_INVAL;
1553 			bfreekva(bp);
1554 			brelse(bp);
1555 			goto restart;
1556 		}
1557 
1558 		if (outofspace > 0) {
1559 			outofspace = -1;
1560 			bp->b_flags |= B_INVAL;
1561 			if (defrag < 0)
1562 				bfreekva(bp);
1563 			brelse(bp);
1564 			goto restart;
1565 		}
1566 
1567 		/*
1568 		 * We are done
1569 		 */
1570 		break;
1571 	}
1572 
1573 	/*
1574 	 * If we exhausted our list, sleep as appropriate.  We may have to
1575 	 * wakeup various daemons and write out some dirty buffers.
1576 	 *
1577 	 * Generally we are sleeping due to insufficient buffer space.
1578 	 */
1579 
1580 	if (bp == NULL) {
1581 		int flags;
1582 		char *waitmsg;
1583 
1584 dosleep:
1585 		if (defrag > 0) {
1586 			flags = VFS_BIO_NEED_KVASPACE;
1587 			waitmsg = "nbufkv";
1588 		} else if (outofspace > 0) {
1589 			waitmsg = "nbufbs";
1590 			flags = VFS_BIO_NEED_BUFSPACE;
1591 		} else {
1592 			waitmsg = "newbuf";
1593 			flags = VFS_BIO_NEED_ANY;
1594 		}
1595 
1596 		/* XXX */
1597 
1598 		(void) speedup_syncer();
1599 		needsbuffer |= flags;
1600 		while (needsbuffer & flags) {
1601 			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
1602 			    waitmsg, slptimeo))
1603 				return (NULL);
1604 		}
1605 	} else {
1606 		/*
1607 		 * We finally have a valid bp.  We aren't quite out of the
1608 		 * woods, we still have to reserve kva space.
1609 		 */
1610 		vm_offset_t addr = 0;
1611 
1612 		maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1613 
1614 		if (maxsize != bp->b_kvasize) {
1615 			bfreekva(bp);
1616 
1617 			if (vm_map_findspace(buffer_map,
1618 				vm_map_min(buffer_map), maxsize, &addr)) {
1619 				/*
1620 				 * Uh oh.  Buffer map is too fragmented.  Try
1621 				 * to defragment.
1622 				 */
1623 				if (defrag <= 0) {
1624 					defrag = 1;
1625 					bp->b_flags |= B_INVAL;
1626 					brelse(bp);
1627 					goto restart;
1628 				}
1629 				/*
1630 				 * Uh oh.  We couldn't seem to defragment
1631 				 */
1632 				bp = NULL;
1633 				goto dosleep;
1634 			}
1635 		}
1636 		if (addr) {
1637 			vm_map_insert(buffer_map, NULL, 0,
1638 				addr, addr + maxsize,
1639 				VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1640 
1641 			bp->b_kvabase = (caddr_t) addr;
1642 			bp->b_kvasize = maxsize;
1643 		}
1644 		bp->b_data = bp->b_kvabase;
1645 	}
1646 	return(bp);
1647 }
1648 
1649 /*
1650  *	waitfreebuffers:
1651  *
1652  *	Wait for sufficient free buffers.  Only called from normal processes.
1653  */
1654 
1655 static void
1656 waitfreebuffers(int slpflag, int slptimeo)
1657 {
1658 	while (numfreebuffers < hifreebuffers) {
1659 		if (numfreebuffers >= hifreebuffers)
1660 			break;
1661 		needsbuffer |= VFS_BIO_NEED_FREE;
1662 		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1663 			break;
1664 	}
1665 }
1666 
1667 /*
1668  *	buf_daemon:
1669  *
1670  *	buffer flushing daemon.  Buffers are normally flushed by the
1671  *	update daemon but if it cannot keep up this process starts to
1672  *	take the load in an attempt to prevent getnewbuf() from blocking.
1673  */
1674 
1675 static struct proc *bufdaemonproc;
1676 static int bd_interval;
1677 static int bd_flushto;
1678 
1679 static struct kproc_desc buf_kp = {
1680 	"bufdaemon",
1681 	buf_daemon,
1682 	&bufdaemonproc
1683 };
1684 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
1685 
1686 static void
1687 buf_daemon()
1688 {
1689 	int s;
1690 	/*
1691 	 * This process is allowed to take the buffer cache to the limit
1692 	 */
1693 	curproc->p_flag |= P_BUFEXHAUST;
1694 	s = splbio();
1695 
1696 	bd_interval = 5 * hz;	/* dynamically adjusted */
1697 	bd_flushto = hidirtybuffers;	/* dynamically adjusted */
1698 
1699 	while (TRUE) {
1700 		bd_request = 0;
1701 
1702 		/*
1703 		 * Do the flush.  Limit the number of buffers we flush in one
1704 		 * go.  The failure condition occurs when processes are writing
1705 		 * buffers faster than we can dispose of them.  In this case
1706 		 * we may be flushing so often that the previous set of flushes
1707 		 * have not had time to complete, causing us to run out of
1708 		 * physical buffers and block.
1709 		 */
1710 		{
1711 			int runcount = maxbdrun;
1712 
1713 			while (numdirtybuffers > bd_flushto && runcount) {
1714 				--runcount;
1715 				if (flushbufqueues() == 0)
1716 					break;
1717 			}
1718 		}
1719 
1720 		/*
1721 		 * If nobody is requesting anything we sleep
1722 		 */
1723 		if (bd_request == 0)
1724 			tsleep(&bd_request, PVM, "psleep", bd_interval);
1725 
1726 		/*
1727 		 * We calculate how much to add or subtract from bd_flushto
1728 		 * and bd_interval based on how far off we are from the
1729 		 * optimal number of dirty buffers: 20% of the dirty-buffer range
1730 		 * below the hidirtybuffers mark.  We cannot use hidirtybuffers straight
1731 		 * because being right on the mark will cause getnewbuf()
1732 		 * to oscillate our wakeup.
1733 		 *
1734 		 * The larger the error in either direction, the more we adjust
1735 		 * bd_flushto and bd_interval.  The time interval is adjusted
1736 		 * by 2 seconds per whole-buffer-range of error.  This is an
1737 		 * exponential convergence algorithm, with large errors
1738 		 * producing large changes and small errors producing small
1739 		 * changes.
1740 		 */
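		/*
		 * A worked example with illustrative numbers (not values taken
		 * from this file): if lodirtybuffers = 100 and hidirtybuffers =
		 * 200, then brange = 100 and middb = 200 - 100 / 5 = 180.  With
		 * numdirtybuffers = 230, deltabuf = -50, so bd_flushto drops by
		 * 50 / 20 = 2 and bd_interval shrinks by 50 * 2 * hz / 100, i.e.
		 * one second, before both are clamped by the checks below.
		 */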
1741 
1742 		{
1743 			int brange = hidirtybuffers - lodirtybuffers;
1744 			int middb = hidirtybuffers - brange / 5;
1745 			int deltabuf = middb - numdirtybuffers;
1746 
1747 			bd_flushto += deltabuf / 20;
1748 			bd_interval += deltabuf * (2 * hz) / (brange * 1);
1749 		}
1750 		if (bd_flushto < lodirtybuffers)
1751 			bd_flushto = lodirtybuffers;
1752 		if (bd_flushto > hidirtybuffers)
1753 			bd_flushto = hidirtybuffers;
1754 		if (bd_interval < hz / 10)
1755 			bd_interval = hz / 10;
1756 		if (bd_interval > 5 * hz)
1757 			bd_interval = 5 * hz;
1758 	}
1759 }
1760 
1761 /*
1762  *	flushbufqueues:
1763  *
1764  *	Try to flush a buffer in the dirty queue.  We must be careful to
1765  *	free up B_INVAL buffers instead of write them, which NFS is
1766  *	free up B_INVAL buffers instead of writing them, which NFS is
1767  */
1768 
1769 static int
1770 flushbufqueues(void)
1771 {
1772 	struct buf *bp;
1773 	int r = 0;
1774 
1775 	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
1776 
1777 	while (bp) {
1778 		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
1779 		if ((bp->b_flags & B_DELWRI) != 0) {
1780 			if (bp->b_flags & B_INVAL) {
1781 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1782 					panic("flushbufqueues: locked buf");
1783 				bremfree(bp);
1784 				brelse(bp);
1785 				++r;
1786 				break;
1787 			}
1788 			vfs_bio_awrite(bp);
1789 			++r;
1790 			break;
1791 		}
1792 		bp = TAILQ_NEXT(bp, b_freelist);
1793 	}
1794 	return(r);
1795 }
1796 
1797 /*
1798  * Check to see if a block is currently memory resident.
1799  */
1800 struct buf *
1801 incore(struct vnode * vp, daddr_t blkno)
1802 {
1803 	struct buf *bp;
1804 
1805 	int s = splbio();
1806 	bp = gbincore(vp, blkno);
1807 	splx(s);
1808 	return (bp);
1809 }
1810 
1811 /*
1812  * Returns true if no I/O is needed to access the
1813  * associated VM object.  This is like incore except
1814  * it also hunts around in the VM system for the data.
1815  */
1816 
1817 int
1818 inmem(struct vnode * vp, daddr_t blkno)
1819 {
1820 	vm_object_t obj;
1821 	vm_offset_t toff, tinc, size;
1822 	vm_page_t m;
1823 	vm_ooffset_t off;
1824 
1825 	if (incore(vp, blkno))
1826 		return 1;
1827 	if (vp->v_mount == NULL)
1828 		return 0;
1829 	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1830 		return 0;
1831 
1832 	obj = vp->v_object;
1833 	size = PAGE_SIZE;
1834 	if (size > vp->v_mount->mnt_stat.f_iosize)
1835 		size = vp->v_mount->mnt_stat.f_iosize;
1836 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1837 
1838 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1839 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1840 		if (!m)
1841 			return 0;
1842 		tinc = size;
1843 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1844 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1845 		if (vm_page_is_valid(m,
1846 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1847 			return 0;
1848 	}
1849 	return 1;
1850 }
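
/*
 * A worked example (illustrative numbers): with an 8K filesystem block
 * size and 4K pages, off = blkno * 8192 and the loop above examines the
 * two pages backing the block, requiring all tinc = 4096 bytes of each
 * page to be valid before inmem() reports a hit.
 */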
1851 
1852 /*
1853  *	vfs_setdirty:
1854  *
1855  *	Sets the dirty range for a buffer based on the status of the dirty
1856  *	bits in the pages comprising the buffer.
1857  *
1858  *	The range is limited to the size of the buffer.
1859  *
1860  *	This routine is primarily used by NFS, but is generalized for the
1861  *	B_VMIO case.
1862  */
1863 static void
1864 vfs_setdirty(struct buf *bp)
1865 {
1866 	int i;
1867 	vm_object_t object;
1868 
1869 	/*
1870 	 * Degenerate case - empty buffer
1871 	 */
1872 
1873 	if (bp->b_bufsize == 0)
1874 		return;
1875 
1876 	/*
1877 	 * We qualify the scan for modified pages on whether the
1878 	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1879 	 * is not cleared simply by protecting pages off.
1880 	 */
1881 
1882 	if ((bp->b_flags & B_VMIO) == 0)
1883 		return;
1884 
1885 	object = bp->b_pages[0]->object;
1886 
1887 	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
1888 		printf("Warning: object %p writeable but not mightbedirty\n", object);
1889 	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
1890 		printf("Warning: object %p mightbedirty but not writeable\n", object);
1891 
1892 	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
1893 		vm_offset_t boffset;
1894 		vm_offset_t eoffset;
1895 
1896 		/*
1897 		 * test the pages to see if they have been modified directly
1898 		 * by users through the VM system.
1899 		 */
1900 		for (i = 0; i < bp->b_npages; i++) {
1901 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1902 			vm_page_test_dirty(bp->b_pages[i]);
1903 		}
1904 
1905 		/*
1906 		 * Calculate the encompassing dirty range, boffset and eoffset,
1907 		 * (eoffset - boffset) bytes.
1908 		 */
1909 
1910 		for (i = 0; i < bp->b_npages; i++) {
1911 			if (bp->b_pages[i]->dirty)
1912 				break;
1913 		}
1914 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1915 
1916 		for (i = bp->b_npages - 1; i >= 0; --i) {
1917 			if (bp->b_pages[i]->dirty) {
1918 				break;
1919 			}
1920 		}
1921 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1922 
1923 		/*
1924 		 * Fit it to the buffer.
1925 		 */
1926 
1927 		if (eoffset > bp->b_bcount)
1928 			eoffset = bp->b_bcount;
1929 
1930 		/*
1931 		 * If we have a good dirty range, merge with the existing
1932 		 * dirty range.
1933 		 */
1934 
1935 		if (boffset < eoffset) {
1936 			if (bp->b_dirtyoff > boffset)
1937 				bp->b_dirtyoff = boffset;
1938 			if (bp->b_dirtyend < eoffset)
1939 				bp->b_dirtyend = eoffset;
1940 		}
1941 	}
1942 }
1943 
1944 /*
1945  *	getblk:
1946  *
1947  *	Get a block given a specified block and offset into a file/device.
1948  *	The buffer's B_DONE bit will be cleared on return, making it almost
1949  * 	ready for an I/O initiation.  B_INVAL may or may not be set on
1950  *	return.  The caller should clear B_INVAL prior to initiating a
1951  *	READ.
1952  *
1953  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
1954  *	an existing buffer.
1955  *
1956  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
1957  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
1958  *	and then cleared based on the backing VM.  If the previous buffer is
1959  *	non-0-sized but invalid, B_CACHE will be cleared.
1960  *
1961  *	If getblk() must create a new buffer, the new buffer is returned with
1962  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
1963  *	case it is returned with B_INVAL clear and B_CACHE set based on the
1964  *	backing VM.
1965  *
1966  *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
1967  *	B_CACHE bit is clear.
1968  *
1969  *	What this means, basically, is that the caller should use B_CACHE to
1970  *	determine whether the buffer is fully valid or not and should clear
1971  *	B_INVAL prior to issuing a read.  If the caller intends to validate
1972  *	the buffer by loading its data area with something, the caller needs
1973  *	to clear B_INVAL.  If the caller does this without issuing an I/O,
1974  *	the caller should set B_CACHE ( as an optimization ), else the caller
1975  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
1976  *	a write attempt or if it was a successful read.  If the caller
1977  *	intends to issue a READ, the caller must clear B_INVAL and B_ERROR
1978  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
1979  */
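/*
 * Editorial usage sketch (it mirrors the bread()-style pattern used by
 * callers elsewhere in the kernel; error handling elided):
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags |= B_READ;
 *		bp->b_flags &= ~(B_INVAL | B_ERROR);
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(vp, bp);
 *		error = biowait(bp);
 *	}
 */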
1980 struct buf *
1981 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1982 {
1983 	struct buf *bp;
1984 	int s;
1985 	struct bufhashhdr *bh;
1986 
1987 #if !defined(MAX_PERF)
1988 	if (size > MAXBSIZE)
1989 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1990 #endif
1991 
1992 	s = splbio();
1993 loop:
1994 	/*
1995 	 * Block if we are low on buffers.   Certain processes are allowed
1996 	 * to completely exhaust the buffer cache.
1997 	 *
1998 	 * If this check ever becomes a bottleneck it may be better to
1999 	 * move it into the else, when gbincore() fails.  At the moment
2000 	 * it isn't a problem.
2001 	 */
2002 	if (!curproc || (curproc->p_flag & P_BUFEXHAUST)) {
2003 		if (numfreebuffers == 0) {
2004 			if (!curproc)
2005 				return NULL;
2006 			needsbuffer |= VFS_BIO_NEED_ANY;
2007 			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
2008 			    slptimeo);
2009 		}
2010 	} else if (numfreebuffers < lofreebuffers) {
2011 		waitfreebuffers(slpflag, slptimeo);
2012 	}
2013 
2014 	if ((bp = gbincore(vp, blkno))) {
2015 		/*
2016 		 * Buffer is in-core.  If the buffer is not busy, it must
2017 		 * be on a queue.
2018 		 */
2019 
2020 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
2021 			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
2022 			    "getblk", slpflag, slptimeo) == ENOLCK)
2023 				goto loop;
2024 			splx(s);
2025 			return (struct buf *) NULL;
2026 		}
2027 
2028 		/*
2029 		 * The buffer is locked.  B_CACHE is cleared if the buffer is
2030 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
2031 		 * and for a VMIO buffer B_CACHE is adjusted according to the
2032 		 * backing VM cache.
2033 		 */
2034 		if (bp->b_flags & B_INVAL)
2035 			bp->b_flags &= ~B_CACHE;
2036 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
2037 			bp->b_flags |= B_CACHE;
2038 		bremfree(bp);
2039 
2040 		/*
2041 		 * check for size inconsistencies for the non-VMIO case.
2042 		 */
2043 
2044 		if (bp->b_bcount != size) {
2045 			if ((bp->b_flags & B_VMIO) == 0 ||
2046 			    (size > bp->b_kvasize)) {
2047 				if (bp->b_flags & B_DELWRI) {
2048 					bp->b_flags |= B_NOCACHE;
2049 					VOP_BWRITE(bp->b_vp, bp);
2050 				} else {
2051 					if ((bp->b_flags & B_VMIO) &&
2052 					   (LIST_FIRST(&bp->b_dep) == NULL)) {
2053 						bp->b_flags |= B_RELBUF;
2054 						brelse(bp);
2055 					} else {
2056 						bp->b_flags |= B_NOCACHE;
2057 						VOP_BWRITE(bp->b_vp, bp);
2058 					}
2059 				}
2060 				goto loop;
2061 			}
2062 		}
2063 
2064 		/*
2065 		 * If the size is inconsistent in the VMIO case, we can resize
2066 		 * the buffer.  This might lead to B_CACHE getting set or
2067 		 * cleared.  If the size has not changed, B_CACHE remains
2068 		 * unchanged from its previous state.
2069 		 */
2070 
2071 		if (bp->b_bcount != size)
2072 			allocbuf(bp, size);
2073 
2074 		KASSERT(bp->b_offset != NOOFFSET,
2075 		    ("getblk: no buffer offset"));
2076 
2077 		/*
2078 		 * A buffer with B_DELWRI set and B_CACHE clear must
2079 		 * be committed before we can return the buffer in
2080 		 * order to prevent the caller from issuing a read
2081 		 * ( due to B_CACHE not being set ) and overwriting
2082 		 * it.
2083 		 *
2084 		 * Most callers, including NFS and FFS, need this to
2085 		 * operate properly either because they assume they
2086 		 * can issue a read if B_CACHE is not set, or because
2087 		 * ( for example ) an uncached B_DELWRI might loop due
2088 		 * to softupdates re-dirtying the buffer.  In the latter
2089 		 * case, B_CACHE is set after the first write completes,
2090 		 * preventing further loops.
2091 		 */
2092 
2093 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2094 			VOP_BWRITE(bp->b_vp, bp);
2095 			goto loop;
2096 		}
2097 
2098 		splx(s);
2099 		bp->b_flags &= ~B_DONE;
2100 	} else {
2101 		/*
2102 		 * Buffer is not in-core, create new buffer.  The buffer
2103 		 * returned by getnewbuf() is locked.  Note that the returned
2104 		 * buffer is also considered valid (not marked B_INVAL).
2105 		 */
2106 		int bsize, maxsize, vmio;
2107 		off_t offset;
2108 
2109 		if (vp->v_type == VBLK)
2110 			bsize = DEV_BSIZE;
2111 		else if (vp->v_mountedhere)
2112 			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
2113 		else if (vp->v_mount)
2114 			bsize = vp->v_mount->mnt_stat.f_iosize;
2115 		else
2116 			bsize = size;
2117 
2118 		offset = (off_t)blkno * bsize;
2119 		vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
2120 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
2121 		maxsize = imax(maxsize, bsize);
2122 
2123 		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
2124 			if (slpflag || slptimeo) {
2125 				splx(s);
2126 				return NULL;
2127 			}
2128 			goto loop;
2129 		}
2130 
2131 		/*
2132 		 * This code is used to make sure that a buffer is not
2133 		 * created while the getnewbuf routine is blocked.
2134 		 * This can be a problem whether the vnode is locked or not.
2135 		 * If the buffer is created out from under us, we have to
2136 		 * throw away the one we just created.  There is no window
2137 		 * race because we are safely running at splbio() from the
2138 		 * point of the duplicate buffer creation through to here,
2139 		 * and we've locked the buffer.
2140 		 */
2141 		if (gbincore(vp, blkno)) {
2142 			bp->b_flags |= B_INVAL;
2143 			brelse(bp);
2144 			goto loop;
2145 		}
2146 
2147 		/*
2148 		 * Insert the buffer into the hash, so that it can
2149 		 * be found by incore.
2150 		 */
2151 		bp->b_blkno = bp->b_lblkno = blkno;
2152 		bp->b_offset = offset;
2153 
2154 		bgetvp(vp, bp);
2155 		LIST_REMOVE(bp, b_hash);
2156 		bh = bufhash(vp, blkno);
2157 		LIST_INSERT_HEAD(bh, bp, b_hash);
2158 
2159 		/*
2160 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
2161 		 * buffer size starts out as 0, B_CACHE will be set by
2162 		 * allocbuf() for the VMIO case prior to it testing the
2163 		 * backing store for validity.
2164 		 */
2165 
2166 		if (vmio) {
2167 			bp->b_flags |= B_VMIO;
2168 #if defined(VFS_BIO_DEBUG)
2169 			if (vp->v_type != VREG && vp->v_type != VBLK)
2170 				printf("getblk: vmioing file type %d???\n", vp->v_type);
2171 #endif
2172 		} else {
2173 			bp->b_flags &= ~B_VMIO;
2174 		}
2175 
2176 		allocbuf(bp, size);
2177 
2178 		splx(s);
2179 		bp->b_flags &= ~B_DONE;
2180 	}
2181 	return (bp);
2182 }
2183 
2184 /*
2185  * Get an empty, disassociated buffer of given size.  The buffer is initially
2186  * set to B_INVAL.
2187  */
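/*
 * Editorial usage sketch (src and len are hypothetical): geteblk() is
 * handy for transient scratch buffers with no vnode association:
 *
 *	bp = geteblk(len);
 *	bcopy(src, bp->b_data, len);
 *	... operate on bp->b_data ...
 *	brelse(bp);	(B_INVAL is already set, so the buffer is discarded)
 */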
2188 struct buf *
2189 geteblk(int size)
2190 {
2191 	struct buf *bp;
2192 	int s;
2193 
2194 	s = splbio();
2195 	while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
2196 	splx(s);
2197 	allocbuf(bp, size);
2198 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
2199 	return (bp);
2200 }
2201 
2202 
2203 /*
2204  * This code constructs the buffer's memory from either anonymous system
2205  * memory (in the case of non-VMIO operations) or an associated
2206  * VM object (in the case of VMIO operations).  This code is able to
2207  * resize a buffer up or down.
2208  *
2209  * Note that this code is tricky, and has many complications to resolve
2210  * deadlock or inconsistant data situations.  Tread lightly!!!
2211  * deadlock or inconsistent data situations.  Tread lightly!!!
2212  * the caller.  Calling this code willy nilly can result in the loss of data.
2213  * the caller.  Calling this code willy-nilly can result in the loss of data.
2214  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
2215  * B_CACHE for the non-VMIO case.
2216  */
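/*
 * Worked example (editorial, assuming PAGE_SIZE 4096 and DEV_BSIZE 512):
 * growing a VMIO buffer whose b_offset is 512 bytes into a page to
 * size 6144 gives
 *
 *	newbsize     = (6144 + 511) & ~511 = 6144
 *	desiredpages = num_pages(512 + 6144) = 2
 *
 * so two pages are wired in from the backing object and B_CACHE is
 * re-evaluated over the byte-granular range [0, 6144).
 */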
2217 
2218 int
2219 allocbuf(struct buf *bp, int size)
2220 {
2221 	int newbsize, mbsize;
2222 	int i;
2223 
2224 #if !defined(MAX_PERF)
2225 	if (BUF_REFCNT(bp) == 0)
2226 		panic("allocbuf: buffer not busy");
2227 
2228 	if (bp->b_kvasize < size)
2229 		panic("allocbuf: buffer too small");
2230 #endif
2231 
2232 	if ((bp->b_flags & B_VMIO) == 0) {
2233 		caddr_t origbuf;
2234 		int origbufsize;
2235 		/*
2236 		 * Just get anonymous memory from the kernel.  Don't
2237 		 * mess with B_CACHE.
2238 		 */
2239 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2240 #if !defined(NO_B_MALLOC)
2241 		if (bp->b_flags & B_MALLOC)
2242 			newbsize = mbsize;
2243 		else
2244 #endif
2245 			newbsize = round_page(size);
2246 
2247 		if (newbsize < bp->b_bufsize) {
2248 #if !defined(NO_B_MALLOC)
2249 			/*
2250 			 * malloced buffers are not shrunk
2251 			 */
2252 			if (bp->b_flags & B_MALLOC) {
2253 				if (newbsize) {
2254 					bp->b_bcount = size;
2255 				} else {
2256 					free(bp->b_data, M_BIOBUF);
2257 					bufspace -= bp->b_bufsize;
2258 					bufmallocspace -= bp->b_bufsize;
2259 					runningbufspace -= bp->b_bufsize;
2260 					if (bp->b_bufsize)
2261 						bufspacewakeup();
2262 					bp->b_data = bp->b_kvabase;
2263 					bp->b_bufsize = 0;
2264 					bp->b_bcount = 0;
2265 					bp->b_flags &= ~B_MALLOC;
2266 				}
2267 				return 1;
2268 			}
2269 #endif
2270 			vm_hold_free_pages(
2271 			    bp,
2272 			    (vm_offset_t) bp->b_data + newbsize,
2273 			    (vm_offset_t) bp->b_data + bp->b_bufsize);
2274 		} else if (newbsize > bp->b_bufsize) {
2275 #if !defined(NO_B_MALLOC)
2276 			/*
2277 			 * We only use malloced memory on the first allocation,
2278 			 * and revert to page-allocated memory when the buffer
2279 			 * grows.
2280 			 */
2281 			if ( (bufmallocspace < maxbufmallocspace) &&
2282 				(bp->b_bufsize == 0) &&
2283 				(mbsize <= PAGE_SIZE/2)) {
2284 
2285 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
2286 				bp->b_bufsize = mbsize;
2287 				bp->b_bcount = size;
2288 				bp->b_flags |= B_MALLOC;
2289 				bufspace += mbsize;
2290 				bufmallocspace += mbsize;
2291 				runningbufspace += bp->b_bufsize;
2292 				return 1;
2293 			}
2294 #endif
2295 			origbuf = NULL;
2296 			origbufsize = 0;
2297 #if !defined(NO_B_MALLOC)
2298 			/*
2299 			 * If the buffer is growing on its other-than-first allocation,
2300 			 * then we revert to the page-allocation scheme.
2301 			 */
2302 			if (bp->b_flags & B_MALLOC) {
2303 				origbuf = bp->b_data;
2304 				origbufsize = bp->b_bufsize;
2305 				bp->b_data = bp->b_kvabase;
2306 				bufspace -= bp->b_bufsize;
2307 				bufmallocspace -= bp->b_bufsize;
2308 				runningbufspace -= bp->b_bufsize;
2309 				if (bp->b_bufsize)
2310 					bufspacewakeup();
2311 				bp->b_bufsize = 0;
2312 				bp->b_flags &= ~B_MALLOC;
2313 				newbsize = round_page(newbsize);
2314 			}
2315 #endif
2316 			vm_hold_load_pages(
2317 			    bp,
2318 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
2319 			    (vm_offset_t) bp->b_data + newbsize);
2320 #if !defined(NO_B_MALLOC)
2321 			if (origbuf) {
2322 				bcopy(origbuf, bp->b_data, origbufsize);
2323 				free(origbuf, M_BIOBUF);
2324 			}
2325 #endif
2326 		}
2327 	} else {
2328 		vm_page_t m;
2329 		int desiredpages;
2330 
2331 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2332 		desiredpages = (size == 0) ? 0 :
2333 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
2334 
2335 #if !defined(NO_B_MALLOC)
2336 		if (bp->b_flags & B_MALLOC)
2337 			panic("allocbuf: VMIO buffer can't be malloced");
2338 #endif
2339 		/*
2340 		 * Set B_CACHE initially if buffer is 0 length or will become
2341 		 * 0-length.
2342 		 */
2343 		if (size == 0 || bp->b_bufsize == 0)
2344 			bp->b_flags |= B_CACHE;
2345 
2346 		if (newbsize < bp->b_bufsize) {
2347 			/*
2348 			 * DEV_BSIZE aligned new buffer size is less than the
2349 			 * DEV_BSIZE aligned existing buffer size.  Figure out
2350 			 * if we have to remove any pages.
2351 			 */
2352 			if (desiredpages < bp->b_npages) {
2353 				for (i = desiredpages; i < bp->b_npages; i++) {
2354 					/*
2355 					 * the page is not freed here -- it
2356 					 * is the responsibility of
2357 					 * vnode_pager_setsize
2358 					 */
2359 					m = bp->b_pages[i];
2360 					KASSERT(m != bogus_page,
2361 					    ("allocbuf: bogus page found"));
2362 					while (vm_page_sleep_busy(m, TRUE, "biodep"))
2363 						;
2364 
2365 					bp->b_pages[i] = NULL;
2366 					vm_page_unwire(m, 0);
2367 				}
2368 				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2369 				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
2370 				bp->b_npages = desiredpages;
2371 			}
2372 		} else if (size > bp->b_bcount) {
2373 			/*
2374 			 * We are growing the buffer, possibly in a
2375 			 * byte-granular fashion.
2376 			 */
2377 			struct vnode *vp;
2378 			vm_object_t obj;
2379 			vm_offset_t toff;
2380 			vm_offset_t tinc;
2381 
2382 			/*
2383 			 * Step 1, bring in the VM pages from the object,
2384 			 * allocating them if necessary.  We must clear
2385 			 * B_CACHE if these pages are not valid for the
2386 			 * range covered by the buffer.
2387 			 */
2388 
2389 			vp = bp->b_vp;
2390 			obj = vp->v_object;
2391 
2392 			while (bp->b_npages < desiredpages) {
2393 				vm_page_t m;
2394 				vm_pindex_t pi;
2395 
2396 				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
2397 				if ((m = vm_page_lookup(obj, pi)) == NULL) {
2398 					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
2399 					if (m == NULL) {
2400 						VM_WAIT;
2401 						vm_pageout_deficit += desiredpages - bp->b_npages;
2402 					} else {
2403 						vm_page_wire(m);
2404 						vm_page_wakeup(m);
2405 						bp->b_flags &= ~B_CACHE;
2406 						bp->b_pages[bp->b_npages] = m;
2407 						++bp->b_npages;
2408 					}
2409 					continue;
2410 				}
2411 
2412 				/*
2413 				 * We found a page.  If we have to sleep on it,
2414 				 * retry because it might have gotten freed out
2415 				 * from under us.
2416 				 *
2417 				 * We can only test PG_BUSY here.  Blocking on
2418 				 * m->busy might lead to a deadlock:
2419 				 *
2420 				 *  vm_fault->getpages->cluster_read->allocbuf
2421 				 *
2422 				 */
2423 
2424 				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
2425 					continue;
2426 
2427 				/*
2428 				 * We have a good page.  Should we wakeup the
2429 				 * page daemon?
2430 				 */
2431 				if ((curproc != pageproc) &&
2432 				    ((m->queue - m->pc) == PQ_CACHE) &&
2433 				    ((cnt.v_free_count + cnt.v_cache_count) <
2434 					(cnt.v_free_min + cnt.v_cache_min))) {
2435 					pagedaemon_wakeup();
2436 				}
2437 				vm_page_flag_clear(m, PG_ZERO);
2438 				vm_page_wire(m);
2439 				bp->b_pages[bp->b_npages] = m;
2440 				++bp->b_npages;
2441 			}
2442 
2443 			/*
2444 			 * Step 2.  We've loaded the pages into the buffer,
2445 			 * we have to figure out if we can still have B_CACHE
2446 			 * set.  Note that B_CACHE is set according to the
2447 			 * byte-granular range ( bcount and size ), new the
2448 			 * byte-granular range ( bcount and size ), not the
2449 			 *
2450 			 * The VM test is against m->valid, which is DEV_BSIZE
2451 			 * aligned.  Needless to say, the validity of the data
2452 			 * needs to also be DEV_BSIZE aligned.  Note that this
2453 			 * fails with NFS if the server or some other client
2454 			 * extends the file's EOF.  If our buffer is resized,
2455 			 * B_CACHE may remain set! XXX
2456 			 */
2457 
2458 			toff = bp->b_bcount;
2459 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2460 
2461 			while ((bp->b_flags & B_CACHE) && toff < size) {
2462 				vm_pindex_t pi;
2463 
2464 				if (tinc > (size - toff))
2465 					tinc = size - toff;
2466 
2467 				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
2468 				    PAGE_SHIFT;
2469 
2470 				vfs_buf_test_cache(
2471 				    bp,
2472 				    bp->b_offset,
2473 				    toff,
2474 				    tinc,
2475 				    bp->b_pages[pi]
2476 				);
2477 				toff += tinc;
2478 				tinc = PAGE_SIZE;
2479 			}
2480 
2481 			/*
2482 			 * Step 3, fixup the KVM pmap.  Remember that
2483 			 * bp->b_data is relative to bp->b_offset, but
2484 			 * bp->b_offset may be offset into the first page.
2485 			 */
2486 
2487 			bp->b_data = (caddr_t)
2488 			    trunc_page((vm_offset_t)bp->b_data);
2489 			pmap_qenter(
2490 			    (vm_offset_t)bp->b_data,
2491 			    bp->b_pages,
2492 			    bp->b_npages
2493 			);
2494 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
2495 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
2496 		}
2497 	}
2498 	if (bp->b_flags & B_VMIO)
2499 		vmiospace += (newbsize - bp->b_bufsize);
2500 	bufspace += (newbsize - bp->b_bufsize);
2501 	runningbufspace += (newbsize - bp->b_bufsize);
2502 	if (newbsize < bp->b_bufsize)
2503 		bufspacewakeup();
2504 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
2505 	bp->b_bcount = size;		/* requested buffer size	*/
2506 	return 1;
2507 }
2508 
2509 /*
2510  *	biowait:
2511  *
2512  *	Wait for buffer I/O completion, returning error status.  The buffer
2513  *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
2514  *	error and cleared.
2515  */
2516 int
2517 biowait(register struct buf * bp)
2518 {
2519 	int s;
2520 
2521 	s = splbio();
2522 	while ((bp->b_flags & B_DONE) == 0) {
2523 #if defined(NO_SCHEDULE_MODS)
2524 		tsleep(bp, PRIBIO, "biowait", 0);
2525 #else
2526 		if (bp->b_flags & B_READ)
2527 			tsleep(bp, PRIBIO, "biord", 0);
2528 		else
2529 			tsleep(bp, PRIBIO, "biowr", 0);
2530 #endif
2531 	}
2532 	splx(s);
2533 	if (bp->b_flags & B_EINTR) {
2534 		bp->b_flags &= ~B_EINTR;
2535 		return (EINTR);
2536 	}
2537 	if (bp->b_flags & B_ERROR) {
2538 		return (bp->b_error ? bp->b_error : EIO);
2539 	} else {
2540 		return (0);
2541 	}
2542 }
2543 
2544 /*
2545  *	biodone:
2546  *
2547  *	Finish I/O on a buffer, optionally calling a completion function.
2548  *	This is usually called from an interrupt so process blocking is
2549  *	not allowed.
2550  *
2551  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
2552  *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
2553  *	assuming B_INVAL is clear.
2554  *
2555  *	For the VMIO case, we set B_CACHE if the op was a read and no
2556  *	read error occurred, or if the op was a write.  B_CACHE is never
2557  *	set if the buffer is invalid or otherwise uncacheable.
2558  *
2559  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
2560  *	initiator to leave B_INVAL set to brelse the buffer out of existence
2561  *	in the biodone routine.
2562  */
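/*
 * Editorial sketch of the B_CALL hand-off (my_done is a hypothetical
 * callback): an initiator that prefers a callback to sleeping in
 * biowait() can set
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = my_done;
 *	VOP_STRATEGY(vp, bp);
 *
 * and biodone() will invoke my_done(bp) at splbio() when the I/O
 * finishes.  The callback then owns the buffer and must brelse() or
 * bqrelse() it itself; biodone() returns without touching it further.
 */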
2563 void
2564 biodone(register struct buf * bp)
2565 {
2566 	int s;
2567 
2568 	s = splbio();
2569 
2570 	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
2571 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
2572 
2573 	bp->b_flags |= B_DONE;
2574 
2575 	if (bp->b_flags & B_FREEBUF) {
2576 		brelse(bp);
2577 		splx(s);
2578 		return;
2579 	}
2580 
2581 	if ((bp->b_flags & B_READ) == 0) {
2582 		vwakeup(bp);
2583 	}
2584 
2585 	/* call optional completion function if requested */
2586 	if (bp->b_flags & B_CALL) {
2587 		bp->b_flags &= ~B_CALL;
2588 		(*bp->b_iodone) (bp);
2589 		splx(s);
2590 		return;
2591 	}
2592 	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
2593 		(*bioops.io_complete)(bp);
2594 
2595 	if (bp->b_flags & B_VMIO) {
2596 		int i, resid;
2597 		vm_ooffset_t foff;
2598 		vm_page_t m;
2599 		vm_object_t obj;
2600 		int iosize;
2601 		struct vnode *vp = bp->b_vp;
2602 
2603 		obj = vp->v_object;
2604 
2605 #if defined(VFS_BIO_DEBUG)
2606 		if (vp->v_usecount == 0) {
2607 			panic("biodone: zero vnode ref count");
2608 		}
2609 
2610 		if (vp->v_object == NULL) {
2611 			panic("biodone: missing VM object");
2612 		}
2613 
2614 		if ((vp->v_flag & VOBJBUF) == 0) {
2615 			panic("biodone: vnode is not setup for merged cache");
2616 		}
2617 #endif
2618 
2619 		foff = bp->b_offset;
2620 		KASSERT(bp->b_offset != NOOFFSET,
2621 		    ("biodone: no buffer offset"));
2622 
2623 #if !defined(MAX_PERF)
2624 		if (!obj) {
2625 			panic("biodone: no object");
2626 		}
2627 #endif
2628 #if defined(VFS_BIO_DEBUG)
2629 		if (obj->paging_in_progress < bp->b_npages) {
2630 			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
2631 			    obj->paging_in_progress, bp->b_npages);
2632 		}
2633 #endif
2634 
2635 		/*
2636 		 * Set B_CACHE if the op was a normal read and no error
2637 		 * occurred.  B_CACHE is set for writes in the b*write()
2638 		 * routines.
2639 		 */
2640 		iosize = bp->b_bcount;
2641 		if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
2642 			bp->b_flags |= B_CACHE;
2643 		}
2644 
2645 		for (i = 0; i < bp->b_npages; i++) {
2646 			int bogusflag = 0;
2647 			m = bp->b_pages[i];
2648 			if (m == bogus_page) {
2649 				bogusflag = 1;
2650 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2651 				if (!m) {
2652 #if defined(VFS_BIO_DEBUG)
2653 					printf("biodone: page disappeared\n");
2654 #endif
2655 					vm_object_pip_subtract(obj, 1);
2656 					bp->b_flags &= ~B_CACHE;
2657 					continue;
2658 				}
2659 				bp->b_pages[i] = m;
2660 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2661 			}
2662 #if defined(VFS_BIO_DEBUG)
2663 			if (OFF_TO_IDX(foff) != m->pindex) {
2664 				printf(
2665 "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
2666 				    (unsigned long)foff, m->pindex);
2667 			}
2668 #endif
2669 			resid = IDX_TO_OFF(m->pindex + 1) - foff;
2670 			if (resid > iosize)
2671 				resid = iosize;
2672 
2673 			/*
2674 			 * In the write case, the valid and clean bits are
2675 			 * already changed correctly ( see bdwrite() ), so we
2676 			 * only need to do this here in the read case.
2677 			 */
2678 			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
2679 				vfs_page_set_valid(bp, foff, i, m);
2680 			}
2681 			vm_page_flag_clear(m, PG_ZERO);
2682 
2683 			/*
2684 			 * when debugging new filesystems or buffer I/O methods, this
2685 			 * is the most common error that pops up.  if you see this, you
2686 			 * have not set the page busy flag correctly!!!
2687 			 */
2688 			if (m->busy == 0) {
2689 #if !defined(MAX_PERF)
2690 				printf("biodone: page busy < 0, "
2691 				    "pindex: %d, foff: 0x(%x,%x), "
2692 				    "resid: %d, index: %d\n",
2693 				    (int) m->pindex, (int)(foff >> 32),
2694 						(int) foff & 0xffffffff, resid, i);
2695 #endif
2696 				if (vp->v_type != VBLK)
2697 #if !defined(MAX_PERF)
2698 					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2699 					    bp->b_vp->v_mount->mnt_stat.f_iosize,
2700 					    (int) bp->b_lblkno,
2701 					    bp->b_flags, bp->b_npages);
2702 				else
2703 					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2704 					    (int) bp->b_lblkno,
2705 					    bp->b_flags, bp->b_npages);
2706 				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2707 				    m->valid, m->dirty, m->wire_count);
2708 #endif
2709 				panic("biodone: page busy < 0\n");
2710 			}
2711 			vm_page_io_finish(m);
2712 			vm_object_pip_subtract(obj, 1);
2713 			foff += resid;
2714 			iosize -= resid;
2715 		}
2716 		if (obj)
2717 			vm_object_pip_wakeupn(obj, 0);
2718 	}
2719 	/*
2720 	 * For asynchronous completions, release the buffer now. The brelse
2721 	 * will do a wakeup there if necessary - so no need to do a wakeup
2722 	 * here in the async case. The sync case always needs to do a wakeup.
2723 	 */
2724 
2725 	if (bp->b_flags & B_ASYNC) {
2726 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2727 			brelse(bp);
2728 		else
2729 			bqrelse(bp);
2730 	} else {
2731 		wakeup(bp);
2732 	}
2733 	splx(s);
2734 }
2735 
2736 /*
2737  * This routine is called in lieu of biodone in the case of
2738  * incomplete I/O.  This keeps the busy status for pages
2739  * consistent.
2740  */
2741 void
2742 vfs_unbusy_pages(struct buf * bp)
2743 {
2744 	int i;
2745 
2746 	if (bp->b_flags & B_VMIO) {
2747 		struct vnode *vp = bp->b_vp;
2748 		vm_object_t obj = vp->v_object;
2749 
2750 		for (i = 0; i < bp->b_npages; i++) {
2751 			vm_page_t m = bp->b_pages[i];
2752 
2753 			if (m == bogus_page) {
2754 				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2755 #if !defined(MAX_PERF)
2756 				if (!m) {
2757 					panic("vfs_unbusy_pages: page missing\n");
2758 				}
2759 #endif
2760 				bp->b_pages[i] = m;
2761 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2762 			}
2763 			vm_object_pip_subtract(obj, 1);
2764 			vm_page_flag_clear(m, PG_ZERO);
2765 			vm_page_io_finish(m);
2766 		}
2767 		vm_object_pip_wakeupn(obj, 0);
2768 	}
2769 }
2770 
2771 /*
2772  * vfs_page_set_valid:
2773  *
2774  *	Set the valid bits in a page based on the supplied offset.   The
2775  *	range is restricted to the buffer's size.
2776  *
2777  *	This routine is typically called after a read completes.
2778  */
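/*
 * Worked example (editorial, PAGE_SIZE 4096): for a 2048-byte buffer
 * with b_offset 0x3000 and off == 0x3000, eoff is first rounded to
 * 0x4000 (the end of that page) and then clamped to b_offset +
 * b_bcount == 0x3800, so vm_page_set_validclean() covers bytes
 * 0x000-0x7ff of the page.
 */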
2779 static void
2780 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2781 {
2782 	vm_ooffset_t soff, eoff;
2783 
2784 	/*
2785 	 * Start and end offsets in buffer.  eoff - soff may not cross a
2786 	 * page boundary or cross the end of the buffer.  The end of the
2787 	 * buffer, in this case, is our file EOF, not the allocation size
2788 	 * of the buffer.
2789 	 */
2790 	soff = off;
2791 	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2792 	if (eoff > bp->b_offset + bp->b_bcount)
2793 		eoff = bp->b_offset + bp->b_bcount;
2794 
2795 	/*
2796 	 * Set valid range.  This is typically the entire buffer and thus the
2797 	 * entire page.
2798 	 */
2799 	if (eoff > soff) {
2800 		vm_page_set_validclean(
2801 		    m,
2802 		   (vm_offset_t) (soff & PAGE_MASK),
2803 		   (vm_offset_t) (eoff - soff)
2804 		);
2805 	}
2806 }
2807 
2808 /*
2809  * This routine is called before a device strategy routine.
2810  * It is used to tell the VM system that paging I/O is in
2811  * progress, and treat the pages associated with the buffer
2812  * almost as being PG_BUSY.  Also the object paging_in_progress
2813  * flag is handled to make sure that the object doesn't become
2814  * inconsistent.
2815  *
2816  * Since I/O has not been initiated yet, certain buffer flags
2817  * such as B_ERROR or B_INVAL may be in an inconsistent state
2818  * and should be ignored.
2819  */
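/*
 * Editorial sketch of the usual sequence around a strategy call:
 *
 *	vfs_busy_pages(bp, 0);		read: busy the pages, maybe bogus them
 *	VOP_STRATEGY(vp, bp);
 *	error = biowait(bp);		or let biodone() finish async I/O
 *
 * For a write the caller passes a nonzero clear_modify, so the pages
 * are marked valid and clean up front instead of being substituted
 * with bogus_page.
 */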
2820 void
2821 vfs_busy_pages(struct buf * bp, int clear_modify)
2822 {
2823 	int i, bogus;
2824 
2825 	if (bp->b_flags & B_VMIO) {
2826 		struct vnode *vp = bp->b_vp;
2827 		vm_object_t obj = vp->v_object;
2828 		vm_ooffset_t foff;
2829 
2830 		foff = bp->b_offset;
2831 		KASSERT(bp->b_offset != NOOFFSET,
2832 		    ("vfs_busy_pages: no buffer offset"));
2833 		vfs_setdirty(bp);
2834 
2835 retry:
2836 		for (i = 0; i < bp->b_npages; i++) {
2837 			vm_page_t m = bp->b_pages[i];
2838 			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
2839 				goto retry;
2840 		}
2841 
2842 		bogus = 0;
2843 		for (i = 0; i < bp->b_npages; i++) {
2844 			vm_page_t m = bp->b_pages[i];
2845 
2846 			vm_page_flag_clear(m, PG_ZERO);
2847 			if ((bp->b_flags & B_CLUSTER) == 0) {
2848 				vm_object_pip_add(obj, 1);
2849 				vm_page_io_start(m);
2850 			}
2851 
2852 			/*
2853 			 * When readying a buffer for a read ( i.e.
2854 			 * clear_modify == 0 ), it is important to do
2855 			 * bogus_page replacement for valid pages in
2856 			 * partially instantiated buffers.  Partially
2857 			 * instantiated buffers can, in turn, occur when
2858 			 * reconstituting a buffer from its VM backing store
2859 			 * base.  We only have to do this if B_CACHE is
2860 			 * clear ( which causes the I/O to occur in the
2861 			 * first place ).  The replacement prevents the read
2862 			 * I/O from overwriting potentially dirty VM-backed
2863 			 * pages.  XXX bogus page replacement is, uh, bogus.
2864 			 * It may not work properly with small-block devices.
2865 			 * We need to find a better way.
2866 			 */
2867 
2868 			vm_page_protect(m, VM_PROT_NONE);
2869 			if (clear_modify)
2870 				vfs_page_set_valid(bp, foff, i, m);
2871 			else if (m->valid == VM_PAGE_BITS_ALL &&
2872 				(bp->b_flags & B_CACHE) == 0) {
2873 				bp->b_pages[i] = bogus_page;
2874 				bogus++;
2875 			}
2876 			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2877 		}
2878 		if (bogus)
2879 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2880 	}
2881 }
2882 
2883 /*
2884  * Tell the VM system that the pages associated with this buffer
2885  * are clean.  This is used for delayed writes where the data is
2886  * going to go to disk eventually without additional VM intervention.
2887  *
2888  * Note that while we only really need to clean through to b_bcount, we
2889  * just go ahead and clean through to b_bufsize.
2890  */
2891 static void
2892 vfs_clean_pages(struct buf * bp)
2893 {
2894 	int i;
2895 
2896 	if (bp->b_flags & B_VMIO) {
2897 		vm_ooffset_t foff;
2898 
2899 		foff = bp->b_offset;
2900 		KASSERT(bp->b_offset != NOOFFSET,
2901 		    ("vfs_clean_pages: no buffer offset"));
2902 		for (i = 0; i < bp->b_npages; i++) {
2903 			vm_page_t m = bp->b_pages[i];
2904 			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2905 			vm_ooffset_t eoff = noff;
2906 
2907 			if (eoff > bp->b_offset + bp->b_bufsize)
2908 				eoff = bp->b_offset + bp->b_bufsize;
2909 			vfs_page_set_valid(bp, foff, i, m);
2910 			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2911 			foff = noff;
2912 		}
2913 	}
2914 }
2915 
2916 /*
2917  *	vfs_bio_set_validclean:
2918  *
2919  *	Set the range within the buffer to valid and clean.  The range is
2920  *	relative to the beginning of the buffer, b_offset.  Note that b_offset
2921  *	itself may be offset from the beginning of the first page.
2922  */
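/*
 * Worked example (editorial, PAGE_SIZE 4096): with b_offset 0xe00,
 * base 0x300 and size 0x600, base is fixed up to 0x1100, so the whole
 * range lies in page 1 of the buffer and vm_page_set_validclean() is
 * called once with offset 0x100 and length 0x600.
 */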
2923 
2924 void
2925 vfs_bio_set_validclean(struct buf *bp, int base, int size)
2926 {
2927 	if (bp->b_flags & B_VMIO) {
2928 		int i;
2929 		int n;
2930 
2931 		/*
2932 		 * Fixup base to be relative to beginning of first page.
2933 		 * Set initial n to be the maximum number of bytes in the
2934 		 * first page that can be validated.
2935 		 */
2936 
2937 		base += (bp->b_offset & PAGE_MASK);
2938 		n = PAGE_SIZE - (base & PAGE_MASK);
2939 
2940 		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
2941 			vm_page_t m = bp->b_pages[i];
2942 
2943 			if (n > size)
2944 				n = size;
2945 
2946 			vm_page_set_validclean(m, base & PAGE_MASK, n);
2947 			base += n;
2948 			size -= n;
2949 			n = PAGE_SIZE;
2950 		}
2951 	}
2952 }
2953 
2954 /*
2955  *	vfs_bio_clrbuf:
2956  *
2957  *	clear a buffer.  This routine essentially fakes an I/O, so we need
2958  *	to clear B_ERROR and B_INVAL.
2959  *
2960  *	Note that while we only theoretically need to clear through b_bcount,
2961  *	we go ahead and clear through b_bufsize.
2962  */
2963 
2964 void
2965 vfs_bio_clrbuf(struct buf *bp) {
2966 	int i, mask = 0;
2967 	caddr_t sa, ea;
2968 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2969 		bp->b_flags &= ~(B_INVAL|B_ERROR);
2970 		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
2971 		    (bp->b_offset & PAGE_MASK) == 0) {
2972 			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
2973 			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2974 			    ((bp->b_pages[0]->valid & mask) != mask)) {
2975 				bzero(bp->b_data, bp->b_bufsize);
2976 			}
2977 			bp->b_pages[0]->valid |= mask;
2978 			bp->b_resid = 0;
2979 			return;
2980 		}
2981 		ea = sa = bp->b_data;
2982 		for(i=0;i<bp->b_npages;i++,sa=ea) {
2983 			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
2984 			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
2985 			ea = (caddr_t)(vm_offset_t)ulmin(
2986 			    (u_long)(vm_offset_t)ea,
2987 			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
2988 			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
2989 			if ((bp->b_pages[i]->valid & mask) == mask)
2990 				continue;
2991 			if ((bp->b_pages[i]->valid & mask) == 0) {
2992 				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2993 					bzero(sa, ea - sa);
2994 				}
2995 			} else {
2996 				for (; sa < ea; sa += DEV_BSIZE, j++) {
2997 					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2998 						(bp->b_pages[i]->valid & (1<<j)) == 0)
2999 						bzero(sa, DEV_BSIZE);
3000 				}
3001 			}
3002 			bp->b_pages[i]->valid |= mask;
3003 			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
3004 		}
3005 		bp->b_resid = 0;
3006 	} else {
3007 		clrbuf(bp);
3008 	}
3009 }
3010 
3011 /*
3012  * vm_hold_load_pages and vm_hold_free_pages get pages into and out
3013  * of a buffer's address space.  The pages are anonymous and are
3014  * not associated with a file object.
3015  */
3016 void
3017 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3018 {
3019 	vm_offset_t pg;
3020 	vm_page_t p;
3021 	int index;
3022 
3023 	to = round_page(to);
3024 	from = round_page(from);
3025 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3026 
3027 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3028 
3029 tryagain:
3030 
3031 		p = vm_page_alloc(kernel_object,
3032 			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
3033 		    VM_ALLOC_NORMAL);
3034 		if (!p) {
3035 			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
3036 			VM_WAIT;
3037 			goto tryagain;
3038 		}
3039 		vm_page_wire(p);
3040 		p->valid = VM_PAGE_BITS_ALL;
3041 		vm_page_flag_clear(p, PG_ZERO);
3042 		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
3043 		bp->b_pages[index] = p;
3044 		vm_page_wakeup(p);
3045 	}
3046 	bp->b_npages = index;
3047 }
3048 
3049 void
3050 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3051 {
3052 	vm_offset_t pg;
3053 	vm_page_t p;
3054 	int index, newnpages;
3055 
3056 	from = round_page(from);
3057 	to = round_page(to);
3058 	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3059 
3060 	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3061 		p = bp->b_pages[index];
3062 		if (p && (index < bp->b_npages)) {
3063 #if !defined(MAX_PERF)
3064 			if (p->busy) {
3065 				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
3066 					bp->b_blkno, bp->b_lblkno);
3067 			}
3068 #endif
3069 			bp->b_pages[index] = NULL;
3070 			pmap_kremove(pg);
3071 			vm_page_busy(p);
3072 			vm_page_unwire(p, 0);
3073 			vm_page_free(p);
3074 		}
3075 	}
3076 	bp->b_npages = newnpages;
3077 }
3078 
3079 
3080 #include "opt_ddb.h"
3081 #ifdef DDB
3082 #include <ddb/ddb.h>
3083 
3084 DB_SHOW_COMMAND(buffer, db_show_buffer)
3085 {
3086 	/* get args */
3087 	struct buf *bp = (struct buf *)addr;
3088 
3089 	if (!have_addr) {
3090 		db_printf("usage: show buffer <addr>\n");
3091 		return;
3092 	}
3093 
3094 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
3095 	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
3096 		  "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
3097 		  "b_blkno = %d, b_pblkno = %d\n",
3098 		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
3099 		  major(bp->b_dev), minor(bp->b_dev),
3100 		  bp->b_data, bp->b_blkno, bp->b_pblkno);
3101 	if (bp->b_npages) {
3102 		int i;
3103 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
3104 		for (i = 0; i < bp->b_npages; i++) {
3105 			vm_page_t m;
3106 			m = bp->b_pages[i];
3107 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
3108 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
3109 			if ((i + 1) < bp->b_npages)
3110 				db_printf(",");
3111 		}
3112 		db_printf("\n");
3113 	}
3114 }
3115 #endif /* DDB */
3116