xref: /freebsd/sys/kern/vfs_cluster.c (revision 05c7a37afb48ddd5ee1bd921a5d46fe59cc70b15)
1 /*-
2  * Copyright (c) 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
36  * $Id: vfs_cluster.c,v 1.34 1996/01/28 18:25:54 dyson Exp $
37  */
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/buf.h>
43 #include <sys/vnode.h>
44 #include <sys/mount.h>
45 #include <sys/malloc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/vmmeter.h>
48 #include <miscfs/specfs/specdev.h>
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/vm_prot.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 
55 #ifdef notyet_block_reallocation_enabled
56 #ifdef DEBUG
57 #include <sys/sysctl.h>
58 #include <sys/kernel.h>
59 
60 static int	doreallocblks = 0;
61 SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
62 #else
63 #define	doreallocblks 0
64 #endif
65 #endif /* notyet_block_reallocation_enabled */
66 
67 #ifdef notyet_block_reallocation_enabled
68 static struct cluster_save *
69 	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
70 #endif
71 static struct buf *
72 	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
73 			    daddr_t blkno, long size, int run));
74 
75 static int	totreads;
76 static int	totreadblocks;
77 extern vm_page_t	bogus_page;
78 
79 #ifdef DIAGNOSTIC
80 /*
81  * Set to 1 if reads of block zero should cause readahead to be done.
82  * Set to 0 to treat a read of block zero as a non-sequential read.
83  *
84  * Setting to one assumes that most reads of block zero of files are due to
85  * sequential passes over the files (e.g. cat, sum) where additional blocks
86  * will soon be needed.  Setting to zero assumes that the majority are
87  * surgical strikes to get particular info (e.g. size, file) where readahead
88  * blocks will not be used and, in fact, push out other potentially useful
89  * blocks from the cache.  The former seems intuitive, but some quick tests
90  * showed that the latter performed better from a system-wide point of view.
91  */
92 	int doclusterraz = 0;
93 
94 #define ISSEQREAD(vp, blk) \
95 	(((blk) != 0 || doclusterraz) && \
96 	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
97 #else
98 #define ISSEQREAD(vp, blk) \
99 	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
100 #endif
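/*
 * Worked example (illustrative only): if the last read recorded in
 * vp->v_lastr was logical block 41, then ISSEQREAD(vp, 42) and
 * ISSEQREAD(vp, 41) are both true (sequential or repeated access),
 * while ISSEQREAD(vp, 7) is false and read-ahead is suppressed.
 */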
101 
102 /*
103  * Allow for up to three entire read-aheads; the system will
104  * adjust downwards rapidly if needed.
105  */
106 #define RA_MULTIPLE_FAST	2
107 #define RA_MULTIPLE_SLOW	3
108 #define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
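/*
 * Worked example of these limits (illustrative; assumes an 8K filesystem
 * block size and a 64K MAXPHYS): MAXPHYS / size == 8, so the read-ahead
 * length v_ralen is clamped at roughly 16 blocks by the RA_MULTIPLE_FAST
 * test and roughly 24 blocks by the RA_MULTIPLE_SLOW test in cluster_read()
 * below, and each back-off halves it via v_ralen >>= RA_SHIFTDOWN.
 */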
109 /*
110  * This replaces bread.  If this is a bread at the beginning of a file and
111  * lastr is 0, we assume this is the first read and we'll read up to two
112  * blocks if they are sequential.  After that, we'll do regular read ahead
113  * in clustered chunks.
114  * 	bp is the block requested.
115  *	rbp is the read-ahead block.
116  *	If either is NULL, then you don't have to do the I/O.
117  */
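/*
 * For illustration, the FFS read path drives this interface roughly as
 * follows ("ip" being the in-core inode, a name not used in this file):
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *
 * and then copies out of bp->b_data and brelse()s bp; on error the buffer
 * is released without being used.
 */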
118 int
119 cluster_read(vp, filesize, lblkno, size, cred, bpp)
120 	struct vnode *vp;
121 	u_quad_t filesize;
122 	daddr_t lblkno;
123 	long size;
124 	struct ucred *cred;
125 	struct buf **bpp;
126 {
127 	struct buf *bp, *rbp;
128 	daddr_t blkno, rablkno, origlblkno;
129 	int error, num_ra, alreadyincore;
130 	int i;
131 	int seq;
132 
133 	error = 0;
134 	/*
135 	 * get the requested block
136 	 */
137 	origlblkno = lblkno;
138 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
139 
140 	seq = ISSEQREAD(vp, lblkno);
141 	/*
142 	 * if it is in the cache, then check to see if the reads have been
143 	 * sequential.  If they have, then try some read-ahead, otherwise
144 	 * back-off on prospective read-aheads.
145 	 */
146 	if (bp->b_flags & B_CACHE) {
147 		if (!seq) {
148 			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
149 			vp->v_ralen >>= RA_SHIFTDOWN;
150 			return 0;
151 		} else if (vp->v_maxra > lblkno) {
152 			if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
153 				++vp->v_ralen;
154 			if (vp->v_maxra > lblkno + vp->v_ralen) {
155 				return 0;
156 			}
157 			lblkno = vp->v_maxra;
158 		} else {
159 			lblkno += 1;
160 		}
161 		bp = NULL;
162 	} else {
163 		/*
164 		 * if it isn't in the cache, then get a chunk from disk if
165 		 * sequential, otherwise just get the block.
166 		 */
167 		bp->b_flags |= B_READ;
168 		lblkno += 1;
169 		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
170 		vp->v_ralen = 0;
171 	}
172 	/*
173 	 * assume no read-ahead
174 	 */
175 	alreadyincore = 1;
176 	rablkno = lblkno;
177 
178 	/*
179 	 * if we have been doing sequential I/O, then do some read-ahead
180 	 */
181 	if (seq) {
182 		alreadyincore = 0;
183 
184 	/*
185 	 * bump ralen a bit...
186 	 */
187 		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
188 			++vp->v_ralen;
189 		/*
190 		 * this code makes sure that the stuff that we have read-ahead
191 		 * is still in the cache.  If it isn't, we have been reading
192 		 * ahead too much, and we need to back-off, otherwise we might
193 		 * try to read more.
194 		 */
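		/*
		 * Example of the back-off (illustrative): if v_ralen was 12
		 * and one of the blocks we read ahead earlier has already
		 * been recycled out of the cache, v_maxra is pulled back to
		 * that block and v_ralen drops to 6 via the RA_SHIFTDOWN
		 * halving.
		 */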
195 		for (i = 0; i < vp->v_maxra - lblkno; i++) {
196 			rablkno = lblkno + i;
197 			alreadyincore = (int) incore(vp, rablkno);
198 			if (!alreadyincore) {
199 				vp->v_maxra = rablkno;
200 				vp->v_ralen >>= RA_SHIFTDOWN;
201 				alreadyincore = 1;
202 			}
203 		}
204 	}
205 	/*
206 	 * we now build the read-ahead buffer if it is desirable.
207 	 */
208 	rbp = NULL;
209 	if (!alreadyincore &&
210 	    ((u_quad_t)(rablkno + 1) * size) <= filesize &&
211 	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
212 	    blkno != -1) {
213 		if (num_ra > vp->v_ralen)
214 			num_ra = vp->v_ralen;
215 
216 		if (num_ra) {
217 			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
218 				num_ra + 1);
219 		} else {
220 			rbp = getblk(vp, rablkno, size, 0, 0);
221 			rbp->b_flags |= B_READ | B_ASYNC;
222 			rbp->b_blkno = blkno;
223 		}
224 	}
225 
226 	/*
227 	 * handle the synchronous read
228 	 */
229 	if (bp) {
230 		if (bp->b_flags & (B_DONE | B_DELWRI))
231 			panic("cluster_read: DONE bp");
232 		else {
233 			vfs_busy_pages(bp, 0);
234 			error = VOP_STRATEGY(bp);
235 			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
236 			totreads++;
237 			totreadblocks += bp->b_bcount / size;
238 			curproc->p_stats->p_ru.ru_inblock++;
239 		}
240 	}
241 	/*
242 	 * and if we have read-aheads, do them too
243 	 */
244 	if (rbp) {
245 		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
246 		if (error) {
247 			rbp->b_flags &= ~(B_ASYNC | B_READ);
248 			brelse(rbp);
249 		} else if (rbp->b_flags & B_CACHE) {
250 			rbp->b_flags &= ~(B_ASYNC | B_READ);
251 			bqrelse(rbp);
252 		} else {
253 			if ((rbp->b_flags & B_CLUSTER) == 0)
254 				vfs_busy_pages(rbp, 0);
255 			(void) VOP_STRATEGY(rbp);
256 			totreads++;
257 			totreadblocks += rbp->b_bcount / size;
258 			curproc->p_stats->p_ru.ru_inblock++;
259 		}
260 	}
261 	if (bp && ((bp->b_flags & B_ASYNC) == 0))
262 		return (biowait(bp));
263 	return (error);
264 }
265 
266 /*
267  * If blocks are contiguous on disk, use this to provide clustered
268  * read ahead.  We will read as many blocks as possible sequentially
269  * and then parcel them up into logical blocks in the buffer hash table.
270  */
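/*
 * Example (illustrative, assuming an 8K block size): a call with lbn == 16,
 * a valid blkno, size == 8192 and run == 7 builds one pseudo-buffer
 * describing a single transfer of up to 56K; cluster_callback() later hands
 * the data back to the individual component buffers and biodone()s them.
 */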
271 static struct buf *
272 cluster_rbuild(vp, filesize, lbn, blkno, size, run)
273 	struct vnode *vp;
274 	u_quad_t filesize;
275 	daddr_t lbn;
276 	daddr_t blkno;
277 	long size;
278 	int run;
279 {
280 	struct buf *bp, *tbp;
281 	daddr_t bn;
282 	int i, inc, j;
283 
284 #ifdef DIAGNOSTIC
285 	if (size != vp->v_mount->mnt_stat.f_iosize)
286 		panic("cluster_rbuild: size %ld != f_iosize %ld\n",
287 		    size, vp->v_mount->mnt_stat.f_iosize);
288 #endif
289 	/*
290 	 * avoid a division: trim run so the cluster does not extend past EOF
291 	 */
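	/*
	 * e.g. (illustrative): with size == 8192, lbn == 10, run == 5 and a
	 * 100000 byte file, run is trimmed to 2, since only blocks 10 and 11
	 * lie entirely below the end of the file.
	 */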
292 	while ((u_quad_t) size * (lbn + run) > filesize) {
293 		--run;
294 	}
295 
296 	tbp = getblk(vp, lbn, size, 0, 0);
297 	if (tbp->b_flags & (B_CACHE|B_MALLOC))
298 		return tbp;
299 
300 	tbp->b_blkno = blkno;
301 	tbp->b_flags |= B_ASYNC | B_READ;
302 	if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
303 		return tbp;
304 
305 	bp = trypbuf();
306 	if (bp == NULL)
307 		return tbp;
308 
309 	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
310 	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
311 	bp->b_iodone = cluster_callback;
312 	bp->b_blkno = blkno;
313 	bp->b_lblkno = lbn;
314 	pbgetvp(vp, bp);
315 
316 	TAILQ_INIT(&bp->b_cluster.cluster_head);
317 
318 	bp->b_bcount = 0;
319 	bp->b_bufsize = 0;
320 	bp->b_npages = 0;
321 
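	/*
	 * Distance between consecutive block numbers in DEV_BSIZE (512-byte)
	 * units; e.g. btodb(8192) == 16.
	 */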
322 	inc = btodb(size);
323 	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
324 		if (i != 0) {
325 			if ((bp->b_npages * PAGE_SIZE) +
326 				round_page(size) > MAXPHYS)
327 				break;
328 
329 			if (incore(vp, lbn + i))
330 				break;
331 
332 			tbp = getblk(vp, lbn + i, size, 0, 0);
333 
334 			if ((tbp->b_flags & B_CACHE) ||
335 				(tbp->b_flags & B_VMIO) == 0) {
336 				bqrelse(tbp);
337 				break;
338 			}
339 
340 			for (j = 0; j < tbp->b_npages; j++) {
341 				if (tbp->b_pages[j]->valid) {
342 					break;
343 				}
344 			}
345 
346 			if (j != tbp->b_npages) {
347 				/*
348 				 * force buffer to be re-constituted later
349 				 */
350 				tbp->b_flags |= B_RELBUF;
351 				brelse(tbp);
352 				break;
353 			}
354 
355 			tbp->b_flags |= B_READ | B_ASYNC;
356 			if (tbp->b_blkno == tbp->b_lblkno) {
357 				tbp->b_blkno = bn;
358 			} else if (tbp->b_blkno != bn) {
359 				brelse(tbp);
360 				break;
361 			}
362 		}
363 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
364 			tbp, b_cluster.cluster_entry);
365 		for (j = 0; j < tbp->b_npages; j += 1) {
366 			vm_page_t m;
367 			m = tbp->b_pages[j];
368 			++m->busy;
369 			++m->object->paging_in_progress;
370 			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
371 				m = bogus_page;
372 			}
373 			if ((bp->b_npages == 0) ||
374 				(bp->b_pages[bp->b_npages-1] != m)) {
375 				bp->b_pages[bp->b_npages] = m;
376 				bp->b_npages++;
377 			}
378 		}
379 		bp->b_bcount += tbp->b_bcount;
380 		bp->b_bufsize += tbp->b_bufsize;
381 	}
382 	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
383 		(vm_page_t *)bp->b_pages, bp->b_npages);
384 	return (bp);
385 }
386 
387 /*
388  * Cleanup after a clustered read or write.
389  * This is complicated by the fact that any of the buffers might have
390  * extra memory (if there were no empty buffer headers at allocbuf time)
391  * that we will need to shift around.
392  */
393 void
394 cluster_callback(bp)
395 	struct buf *bp;
396 {
397 	struct buf *nbp, *tbp;
398 	int error = 0;
399 
400 	/*
401 	 * Must propagate errors to all the components.
402 	 */
403 	if (bp->b_flags & B_ERROR)
404 		error = bp->b_error;
405 
406 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
407 	/*
408 	 * Move memory from the large cluster buffer into the component
409 	 * buffers and mark IO as done on these.
410 	 */
411 	for (tbp = bp->b_cluster.cluster_head.tqh_first;
412 		tbp; tbp = nbp) {
413 		nbp = tbp->b_cluster.cluster_entry.tqe_next;
414 		if (error) {
415 			tbp->b_flags |= B_ERROR;
416 			tbp->b_error = error;
417 		}
418 		biodone(tbp);
419 	}
420 	relpbuf(bp);
421 }
422 
423 /*
424  * Do clustered write for FFS.
425  *
426  * Four cases:
427  *	1. Write is not sequential (write asynchronously)
428  *	Write is sequential:
429  *	2.	beginning of cluster - begin cluster
430  *	3.	middle of a cluster - add to cluster
431  *	4.	end of a cluster - asynchronously write cluster
432  */
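/*
 * Worked example (illustrative, 8K blocks, maxclen == 7): a process writing
 * blocks 0..7 in order starts a cluster at lbn 0 (case 2), blocks 1..6 are
 * simply delayed (case 3), and block 7, which equals v_cstart + v_clen,
 * causes the whole 64K cluster to be pushed out via cluster_wbuild()
 * (case 4).  A non-sequential write in between would instead flush the
 * partial cluster asynchronously (case 1).
 */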
433 void
434 cluster_write(bp, filesize)
435 	struct buf *bp;
436 	u_quad_t filesize;
437 {
438 	struct vnode *vp;
439 	daddr_t lbn;
440 	int maxclen, cursize;
441 	int lblocksize;
442 	int async;
443 
444 	vp = bp->b_vp;
445 	async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC));
446 	lblocksize = vp->v_mount->mnt_stat.f_iosize;
447 	lbn = bp->b_lblkno;
448 
449 	/* Initialize vnode to beginning of file. */
450 	if (lbn == 0)
451 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
452 
453 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
454 	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
455 		maxclen = MAXPHYS / lblocksize - 1;
456 		if (vp->v_clen != 0) {
457 			/*
458 			 * Next block is not sequential.
459 			 *
460 			 * If we are not writing at end of file, the process
461 			 * has repositioned within the file since its last
462 			 * write, or we have reached our maximum cluster size,
463 			 * then push the previous cluster. Otherwise try
464 			 * reallocating to make it sequential.
465 			 */
466 			cursize = vp->v_lastw - vp->v_cstart + 1;
467 #ifndef notyet_block_reallocation_enabled
468 			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
469 				lbn != vp->v_lastw + 1 ||
470 				vp->v_clen <= cursize) {
471 				if (!async)
472 					cluster_wbuild(vp, lblocksize,
473 						vp->v_cstart, cursize);
474 			}
475 #else
476 			if (!doreallocblks ||
477 			    (lbn + 1) * lblocksize != filesize ||
478 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
479 				if (!async)
480 					cluster_wbuild(vp, lblocksize,
481 						vp->v_cstart, cursize);
482 			} else {
483 				struct buf **bpp, **endbp;
484 				struct cluster_save *buflist;
485 
486 				buflist = cluster_collectbufs(vp, bp);
487 				endbp = &buflist->bs_children
488 				    [buflist->bs_nchildren - 1];
489 				if (VOP_REALLOCBLKS(vp, buflist)) {
490 					/*
491 					 * Failed, push the previous cluster.
492 					 */
493 					for (bpp = buflist->bs_children;
494 					     bpp < endbp; bpp++)
495 						brelse(*bpp);
496 					free(buflist, M_SEGMENT);
497 					cluster_wbuild(vp, lblocksize,
498 					    vp->v_cstart, cursize);
499 				} else {
500 					/*
501 					 * Succeeded, keep building cluster.
502 					 */
503 					for (bpp = buflist->bs_children;
504 					     bpp <= endbp; bpp++)
505 						bdwrite(*bpp);
506 					free(buflist, M_SEGMENT);
507 					vp->v_lastw = lbn;
508 					vp->v_lasta = bp->b_blkno;
509 					return;
510 				}
511 			}
512 #endif /* notyet_block_reallocation_enabled */
513 		}
514 		/*
515 		 * Consider beginning a cluster. If at end of file, make
516 		 * cluster as large as possible, otherwise find size of
517 		 * existing cluster.
518 		 */
519 		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
520 		    (bp->b_blkno == bp->b_lblkno) &&
521 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
522 		     bp->b_blkno == -1)) {
523 			bawrite(bp);
524 			vp->v_clen = 0;
525 			vp->v_lasta = bp->b_blkno;
526 			vp->v_cstart = lbn + 1;
527 			vp->v_lastw = lbn;
528 			return;
529 		}
530 		vp->v_clen = maxclen;
531 		if (!async && maxclen == 0) {	/* I/O not contiguous */
532 			vp->v_cstart = lbn + 1;
533 			bawrite(bp);
534 		} else {	/* Wait for rest of cluster */
535 			vp->v_cstart = lbn;
536 			bdwrite(bp);
537 		}
538 	} else if (lbn == vp->v_cstart + vp->v_clen) {
539 		/*
540 		 * At end of cluster, write it out.
541 		 */
542 		bdwrite(bp);
543 		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
544 		vp->v_clen = 0;
545 		vp->v_cstart = lbn + 1;
546 	} else
547 		/*
548 		 * In the middle of a cluster, so just delay the I/O for now.
549 		 */
550 		bdwrite(bp);
551 	vp->v_lastw = lbn;
552 	vp->v_lasta = bp->b_blkno;
553 }
554 
555 
556 /*
557  * This is an awful lot like cluster_rbuild...wish they could be combined.
558  * Starting at start_lbn, gather up to len delayed-write buffers that are
559  * contiguous on disk into one pbuf and issue them as a single asynchronous
560  * write; buffers that cannot be clustered are pushed out individually.
561  */
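/*
 * Typical use, as in cluster_write() above:
 *
 *	cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
 *
 * The return value is the number of bytes actually issued.
 */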
562 int
563 cluster_wbuild(vp, size, start_lbn, len)
564 	struct vnode *vp;
565 	long size;
566 	daddr_t start_lbn;
567 	int len;
568 {
569 	struct buf *bp, *tbp;
570 	int i, j, s;
571 	int totalwritten = 0;
572 	int dbsize = btodb(size);
573 	while (len > 0) {
574 		s = splbio();
575 		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
576 			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
577 			++start_lbn;
578 			--len;
579 			splx(s);
580 			continue;
581 		}
582 		bremfree(tbp);
583 		tbp->b_flags |= B_BUSY;
584 		tbp->b_flags &= ~B_DONE;
585 		splx(s);
586 
587 	/*
588 	 * Extra memory in the buffer, punt on this buffer. XXX we could
589 	 * handle this in most cases, but we would have to push the extra
590 	 * memory down to after our max possible cluster size and then
591 	 * potentially pull it back up if the cluster was terminated
592 	 * prematurely--too much hassle.
593 	 */
594 		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
595 			(tbp->b_bcount != tbp->b_bufsize) ||
596 			(tbp->b_bcount != size) ||
597 			len == 1) {
598 			totalwritten += tbp->b_bufsize;
599 			bawrite(tbp);
600 			++start_lbn;
601 			--len;
602 			continue;
603 		}
604 
605 		bp = trypbuf();
606 		if (bp == NULL) {
607 			totalwritten += tbp->b_bufsize;
608 			bawrite(tbp);
609 			++start_lbn;
610 			--len;
611 			continue;
612 		}
613 
614 		TAILQ_INIT(&bp->b_cluster.cluster_head);
615 		bp->b_bcount = 0;
616 		bp->b_bufsize = 0;
617 		bp->b_npages = 0;
618 
619 		bp->b_blkno = tbp->b_blkno;
620 		bp->b_lblkno = tbp->b_lblkno;
621 		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
622 		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & B_VMIO);
623 		bp->b_iodone = cluster_callback;
624 		pbgetvp(vp, bp);
625 
626 		for (i = 0; i < len; ++i, ++start_lbn) {
627 			if (i != 0) {
628 				s = splbio();
629 				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
630 					splx(s);
631 					break;
632 				}
633 
634 				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & B_VMIO))) {
635 					splx(s);
636 					break;
637 				}
638 
639 				if ((tbp->b_bcount != size) ||
640 					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
641 					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
642 					splx(s);
643 					break;
644 				}
645 				bremfree(tbp);
646 				tbp->b_flags |= B_BUSY;
647 				tbp->b_flags &= ~B_DONE;
648 				splx(s);
649 			}
650 			if (tbp->b_flags & B_VMIO) {
651 				for (j = 0; j < tbp->b_npages; j += 1) {
652 					vm_page_t m;
653 					m = tbp->b_pages[j];
654 					++m->busy;
655 					++m->object->paging_in_progress;
656 					if ((bp->b_npages == 0) ||
657 						(bp->b_pages[bp->b_npages - 1] != m)) {
658 						bp->b_pages[bp->b_npages] = m;
659 						bp->b_npages++;
660 					}
661 				}
662 			}
663 			bp->b_bcount += size;
664 			bp->b_bufsize += size;
665 
666 			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
667 			tbp->b_flags |= B_ASYNC;
668 			s = splbio();
669 			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
670 			++tbp->b_vp->v_numoutput;
671 			splx(s);
672 			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
673 				tbp, b_cluster.cluster_entry);
674 		}
675 		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
676 			(vm_page_t *) bp->b_pages, bp->b_npages);
677 		totalwritten += bp->b_bufsize;
678 		bawrite(bp);
679 
680 		len -= i;
681 	}
682 	return totalwritten;
683 }
684 
685 #ifdef notyet_block_reallocation_enabled
686 /*
687  * Collect together all the buffers in a cluster.
688  * Plus add one additional buffer.
689  */
690 static struct cluster_save *
691 cluster_collectbufs(vp, last_bp)
692 	struct vnode *vp;
693 	struct buf *last_bp;
694 {
695 	struct cluster_save *buflist;
696 	daddr_t lbn;
697 	int i, len;
698 
699 	len = vp->v_lastw - vp->v_cstart + 1;
700 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
701 	    M_SEGMENT, M_WAITOK);
702 	buflist->bs_nchildren = 0;
703 	buflist->bs_children = (struct buf **) (buflist + 1);
704 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
705 		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
706 		    &buflist->bs_children[i]);
707 	buflist->bs_children[i] = last_bp;
708 	buflist->bs_nchildren = i + 1;
709 	return (buflist);
710 }
711 #endif /* notyet_block_reallocation_enabled */
712