xref: /freebsd/sys/kern/vfs_cluster.c (revision 9ee40678bbdcedc6a3ac1e311abe740018911cf1)
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.37 1996/07/27 18:49:18 dyson Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#ifdef notyet_block_reallocation_enabled
#ifdef DEBUG
#include <sys/sysctl.h>
#include <sys/kernel.h>

static int	doreallocblks = 0;
SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
#else
#define	doreallocblks 0
#endif
#endif /* notyet_block_reallocation_enabled */

#ifdef notyet_block_reallocation_enabled
static struct cluster_save *
	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
#endif
static struct buf *
	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
			    daddr_t blkno, long size, int run));

static int	totreads;
static int	totreadblocks;
extern vm_page_t	bogus_page;

#ifdef DIAGNOSTIC
/*
 * Set to 1 to have a read of block zero trigger read-ahead; set to 0 to
 * treat a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and will, in fact, push out other potentially
 * useful blocks from the cache.  The former seems intuitive, but some quick
 * tests showed that the latter performed better from a system-wide point of
 * view.
 */
	int doclusterraz = 0;

#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
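/*
 * ISSEQREAD(vp, blk) treats a read as sequential when it is for the block
 * immediately following the last block read on the vnode (v_lastr + 1) or
 * for the same block again; e.g. after reading block 7, a request for
 * block 7 or block 8 counts as sequential and keeps read-ahead going.
 */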

/*
 * Allow for up to three entire read-aheads' worth of blocks.  The system
 * adjusts the read-ahead window downwards rapidly if needed.
 */
#define RA_MULTIPLE_FAST	2
#define RA_MULTIPLE_SLOW	3
#define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
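/*
 * For example, with an 8K filesystem block size and a 64K MAXPHYS (values
 * that are only illustrative), RA_MULTIPLE_SLOW lets v_ralen grow toward
 * 3 * (65536 / 8192) = 24 blocks while reads stay sequential, and each
 * RA_SHIFTDOWN halves the window when read-ahead has overshot.
 */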
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, rablkno, origlblkno;
	int error, num_ra, alreadyincore;
	int i;
	int seq;

	error = 0;
	/*
	 * get the requested block
	 */
	origlblkno = lblkno;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);

	seq = ISSEQREAD(vp, lblkno);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seq) {
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			vp->v_ralen >>= RA_SHIFTDOWN;
			return 0;
		} else if (vp->v_maxra > lblkno) {
			if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
				++vp->v_ralen;
			if (vp->v_maxra > lblkno + vp->v_ralen) {
				return 0;
			}
			lblkno = vp->v_maxra;
		} else {
			lblkno += 1;
		}
		bp = NULL;
	} else {
		/*
		 * if it isn't in the cache, then get a chunk from disk if
		 * sequential, otherwise just get the block.
		 */
		bp->b_flags |= B_READ;
		lblkno += 1;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		vp->v_ralen = 0;
	}
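	/*
	 * At this point lblkno is the first candidate block for read-ahead:
	 * either one past the requested block, or the first block beyond
	 * the region already brought in by previous read-ahead (v_maxra).
	 */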
	/*
	 * assume no read-ahead
	 */
	alreadyincore = 1;
	rablkno = lblkno;

	/*
	 * if we have been doing sequential I/O, then do some read-ahead
	 */
	if (seq) {
		alreadyincore = 0;

		/*
		 * bump ralen a bit...
		 */
		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW * (MAXPHYS / size))
			++vp->v_ralen;
		/*
		 * this code makes sure that the blocks that we have read
		 * ahead are still in the cache.  If they aren't, we have
		 * been reading ahead too much and need to back off;
		 * otherwise we might try to read more.
		 */
		for (i = 0; i < vp->v_maxra - lblkno; i++) {
			rablkno = lblkno + i;
			alreadyincore = (int) incore(vp, rablkno);
			if (!alreadyincore) {
				vp->v_maxra = rablkno;
				vp->v_ralen >>= RA_SHIFTDOWN;
				alreadyincore = 1;
			}
		}
	}
	/*
	 * we now build the read-ahead buffer if it is desirable.
	 */
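	/*
	 * When VOP_BMAP reports that the next num_ra blocks are physically
	 * contiguous, the read-ahead is issued as one clustered transfer
	 * built by cluster_rbuild(); otherwise only a single asynchronous
	 * read-ahead block is issued.
	 */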
	rbp = NULL;
	if (!alreadyincore &&
	    ((u_quad_t)(rablkno + 1) * size) <= filesize &&
	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
	    blkno != -1) {
		if (num_ra > vp->v_ralen)
			num_ra = vp->v_ralen;

		if (num_ra) {
			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
				num_ra + 1);
		} else {
			rbp = getblk(vp, rablkno, size, 0, 0);
			rbp->b_flags |= B_READ | B_ASYNC;
			rbp->b_blkno = blkno;
		}
	}

	/*
	 * handle the synchronous read
	 */
	if (bp) {
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else {
			vfs_busy_pages(bp, 0);
			error = VOP_STRATEGY(bp);
			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
			totreads++;
			totreadblocks += bp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	/*
	 * and if we have read-aheads, do them too
	 */
	if (rbp) {
		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
		if (error) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			bqrelse(rbp);
		} else {
			if ((rbp->b_flags & B_CLUSTER) == 0)
				vfs_busy_pages(rbp, 0);
			(void) VOP_STRATEGY(rbp);
			totreads++;
			totreadblocks += rbp->b_bcount / size;
			curproc->p_stats->p_ru.ru_inblock++;
		}
	}
	if (bp && ((bp->b_flags & B_ASYNC) == 0))
		return (biowait(bp));
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
{
	struct buf *bp, *tbp;
	daddr_t bn;
	int i, inc, j;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %ld != f_iosize %ld\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	tbp = getblk(vp, lbn, size, 0, 0);
	if (tbp->b_flags & B_CACHE)
		return tbp;

	tbp->b_blkno = blkno;
	tbp->b_flags |= B_ASYNC | B_READ;
	if ((tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf();
	if (bp == 0)
		return tbp;

	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

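	/*
	 * btodb() converts the filesystem block size in bytes into
	 * DEV_BSIZE-sized (normally 512-byte) device blocks, so bn advances
	 * by one filesystem block's worth of sectors on each pass below.
	 */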
	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i != 0) {
			if ((bp->b_npages * PAGE_SIZE) +
				round_page(size) > MAXPHYS)
				break;

			if (incore(vp, lbn + i))
				break;

			tbp = getblk(vp, lbn + i, size, 0, 0);

			if ((tbp->b_flags & B_CACHE) ||
				(tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			for (j = 0; j < tbp->b_npages; j++) {
				if (tbp->b_pages[j]->valid) {
					break;
				}
			}

			if (j != tbp->b_npages) {
				/*
				 * force buffer to be re-constituted later
				 */
				tbp->b_flags |= B_RELBUF;
				brelse(tbp);
				break;
			}

			tbp->b_flags |= B_READ | B_ASYNC;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				brelse(tbp);
				break;
			}
		}
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;
			m = tbp->b_pages[j];
			++m->busy;
			++m->object->paging_in_progress;
			if ((bp->b_npages == 0) ||
				(bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		bp->b_bcount += tbp->b_bcount;
		bp->b_bufsize += tbp->b_bufsize;
	}

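	/*
	 * Pages that are already completely valid do not need to be read
	 * again.  Map the global bogus_page in their place so the device
	 * scribbles into a throwaway page instead of the valid data, while
	 * the transfer itself stays contiguous.
	 */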
	for (j = 0; j < bp->b_npages; j++) {
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
			VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		(vm_page_t *)bp->b_pages, bp->b_npages);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

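	/*
	 * Tear down the temporary mapping that cluster_rbuild() or
	 * cluster_wbuild() established with pmap_qenter() before completion
	 * is handed to the individual component buffers.
	 */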
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = bp->b_cluster.cluster_head.tqh_first;
		tbp; tbp = nbp) {
		nbp = tbp->b_cluster.cluster_entry.tqe_next;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
	}
	relpbuf(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	vp = bp->b_vp;
	async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC));
	lblocksize = vp->v_mount->mnt_stat.f_iosize;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

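	/*
	 * The write is treated as non-sequential when no cluster is being
	 * collected (v_clen == 0), the logical block does not immediately
	 * follow the last block written, or the physical block is not
	 * contiguous with the last physical block written.
	 */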
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = MAXPHYS / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * has seeked to another point in the file since its
			 * last write, or we have reached our maximum cluster
			 * size, then push the previous cluster.  Otherwise
			 * try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
#ifndef notyet_block_reallocation_enabled
			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
				lbn != vp->v_lastw + 1 ||
				vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			}
#else
			if (!doreallocblks ||
			    (lbn + 1) * lblocksize != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async)
					cluster_wbuild(vp, lblocksize,
						vp->v_cstart, cursize);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, lblocksize,
					    vp->v_cstart, cursize);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
#endif /* notyet_block_reallocation_enabled */
		}
		/*
		 * Consider beginning a cluster. If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		bdwrite(bp);
		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
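	/*
	 * Remember the last logical and physical blocks written so that the
	 * next call can recognize a sequential continuation.
	 */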
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * Gather up to "len" delayed-write buffers starting at "start_lbn" and
 * write the physically contiguous ones out as larger clustered transfers.
 */
int
cluster_wbuild(vp, size, start_lbn, len)
	struct vnode *vp;
	long size;
	daddr_t start_lbn;
	int len;
{
	struct buf *bp, *tbp;
	int i, j, s;
	int totalwritten = 0;
	int dbsize = btodb(size);

	while (len > 0) {
		s = splbio();
		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
			++start_lbn;
			--len;
			splx(s);
			continue;
		}
		bremfree(tbp);
		tbp->b_flags |= B_BUSY;
		tbp->b_flags &= ~B_DONE;
		splx(s);

		/*
		 * Extra memory in the buffer, punt on this buffer. XXX we
		 * could handle this in most cases, but we would have to push
		 * the extra memory down to after our max possible cluster
		 * size and then potentially pull it back up if the cluster
		 * was terminated prematurely--too much hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
			(tbp->b_bcount != tbp->b_bufsize) ||
			(tbp->b_bcount != size) ||
			len == 1) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		bp = trypbuf();
		if (bp == NULL) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}

		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED) {
			bp->b_wcred = tbp->b_wcred;
			crhold(bp->b_wcred);
		}

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);

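		/*
		 * Absorb the remaining candidate blocks into the cluster,
		 * stopping at the first one that is missing, busy, not a
		 * clusterable delayed write, not physically contiguous, or
		 * that would push the transfer past MAXPHYS.
		 */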
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) {
				s = splbio();
				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
					splx(s);
					break;
				}

				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|
				    B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|
				    (bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
					splx(s);
					break;
				}

				if (tbp->b_wcred != bp->b_wcred) {
					splx(s);
					break;
				}

				if ((tbp->b_bcount != size) ||
					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
					splx(s);
					break;
				}
				bremfree(tbp);
				tbp->b_flags |= B_BUSY;
				tbp->b_flags &= ~B_DONE;
				splx(s);
			}
			if (tbp->b_flags & B_VMIO) {
				for (j = 0; j < tbp->b_npages; j += 1) {
					vm_page_t m;
					m = tbp->b_pages[j];
					++m->busy;
					++m->object->paging_in_progress;
					if ((bp->b_npages == 0) ||
						(bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;

			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
			tbp->b_flags |= B_ASYNC;
			s = splbio();
			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
			++tbp->b_vp->v_numoutput;
			splx(s);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
				tbp, b_cluster.cluster_entry);
		}
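		/*
		 * All component buffers have been gathered; map their pages
		 * into the pbuf's kernel address space and start the whole
		 * cluster with a single asynchronous write.
		 */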
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			(vm_page_t *) bp->b_pages, bp->b_npages);
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}

#ifdef notyet_block_reallocation_enabled
/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed in by the caller.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}
#endif /* notyet_block_reallocation_enabled */