xref: /freebsd/sys/kern/vfs_cluster.c (revision e627b39baccd1ec9129690167cf5e6d860509655)
1 /*-
2  * Copyright (c) 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
36  * $Id: vfs_cluster.c,v 1.36 1996/06/03 04:40:35 dyson Exp $
37  */
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/buf.h>
43 #include <sys/vnode.h>
44 #include <sys/mount.h>
45 #include <sys/malloc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/vmmeter.h>
48 #include <miscfs/specfs/specdev.h>
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/vm_prot.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 
55 #ifdef notyet_block_reallocation_enabled
56 #ifdef DEBUG
57 #include <sys/sysctl.h>
58 #include <sys/kernel.h>
59 
60 static int	doreallocblks = 0;
61 SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
62 #else
63 #define	doreallocblks 0
64 #endif
65 #endif /* notyet_block_reallocation_enabled */
66 
67 #ifdef notyet_block_reallocation_enabled
68 static struct cluster_save *
69 	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
70 #endif
71 static struct buf *
72 	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
73 			    daddr_t blkno, long size, int run));
74 
75 static int	totreads;
76 static int	totreadblocks;
77 extern vm_page_t	bogus_page;
78 
79 #ifdef DIAGNOSTIC
80 /*
81  * Set to 1 to have reads of block zero cause read-ahead to be done.
82  * Set to 0 to treat a read of block zero as a non-sequential read.
83  *
84  * Setting to one assumes that most reads of block zero of files are due to
85  * sequential passes over the files (e.g. cat, sum) where additional blocks
86  * will soon be needed.  Setting to zero assumes that the majority are
87  * surgical strikes to get particular info (e.g. size, file) where readahead
88  * blocks will not be used and, in fact, push out other potentially useful
89  * blocks from the cache.  The former seems intuitive, but some quick tests
90  * showed that the latter performed better from a system-wide point of view.
91  */
92 	int doclusterraz = 0;
93 
94 #define ISSEQREAD(vp, blk) \
95 	(((blk) != 0 || doclusterraz) && \
96 	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
97 #else
98 #define ISSEQREAD(vp, blk) \
99 	(/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
100 #endif
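
/*
 * For example, with v_lastr == 7 a read of logical block 7 or 8 is treated
 * as sequential by ISSEQREAD(); any other block is treated as non-sequential
 * and makes cluster_read() scale back its read-ahead length (v_ralen).
 */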
101 
102 /*
103  * allow for three entire read-aheads...  The system will
104  * adjust downwards rapidly if needed...
105  */
106 #define RA_MULTIPLE_FAST	2
107 #define RA_MULTIPLE_SLOW	3
108 #define RA_SHIFTDOWN	1	/* approx lg2(RA_MULTIPLE) */
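/*
 * Worked example (a sketch assuming 8K filesystem blocks and a 64K MAXPHYS;
 * the real MAXPHYS is machine and configuration dependent): while the
 * read-ahead keeps being satisfied from the cache, v_ralen may grow to about
 * RA_MULTIPLE_FAST * (MAXPHYS / size) = 2 * 8 = 16 blocks; when new
 * read-ahead actually has to be issued it may grow to
 * RA_MULTIPLE_SLOW * (MAXPHYS / size) = 3 * 8 = 24 blocks.  Whenever
 * previously read-ahead blocks turn out to have been evicted, v_ralen is
 * halved (>>= RA_SHIFTDOWN).
 */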
109 /*
110  * This replaces bread.  If this is a bread at the beginning of a file and
111  * lastr is 0, we assume this is the first read and we'll read up to two
112  * blocks if they are sequential.  After that, we'll do regular read ahead
113  * in clustered chunks.
114  * 	bp is the block requested.
115  *	rbp is the read-ahead block.
116  *	If either is NULL, then you don't have to do the I/O.
117  */
118 int
119 cluster_read(vp, filesize, lblkno, size, cred, bpp)
120 	struct vnode *vp;
121 	u_quad_t filesize;
122 	daddr_t lblkno;
123 	long size;
124 	struct ucred *cred;
125 	struct buf **bpp;
126 {
127 	struct buf *bp, *rbp;
128 	daddr_t blkno, rablkno, origlblkno;
129 	int error, num_ra, alreadyincore;
130 	int i;
131 	int seq;
132 
133 	error = 0;
134 	/*
135 	 * get the requested block
136 	 */
137 	origlblkno = lblkno;
138 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
139 
140 	seq = ISSEQREAD(vp, lblkno);
141 	/*
142 	 * if it is in the cache, then check to see if the reads have been
143 	 * sequential.  If they have, then try some read-ahead, otherwise
144 	 * back-off on prospective read-aheads.
145 	 */
146 	if (bp->b_flags & B_CACHE) {
147 		if (!seq) {
148 			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
149 			vp->v_ralen >>= RA_SHIFTDOWN;
150 			return 0;
151 		} else if (vp->v_maxra > lblkno) {
152 			if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST * (MAXPHYS / size))
153 				++vp->v_ralen;
154 			if (vp->v_maxra > lblkno + vp->v_ralen) {
155 				return 0;
156 			}
157 			lblkno = vp->v_maxra;
158 		} else {
159 			lblkno += 1;
160 		}
161 		bp = NULL;
162 	} else {
163 		/*
164 		 * if it isn't in the cache, then get a chunk from disk if
165 		 * sequential, otherwise just get the block.
166 		 */
167 		bp->b_flags |= B_READ;
168 		lblkno += 1;
169 		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
170 		vp->v_ralen = 0;
171 	}
172 	/*
173 	 * assume no read-ahead
174 	 */
175 	alreadyincore = 1;
176 	rablkno = lblkno;
177 
178 	/*
179 	 * if we have been doing sequential I/O, then do some read-ahead
180 	 */
181 	if (seq) {
182 		alreadyincore = 0;
183 
184 	/*
185 	 * bump ralen a bit...
186 	 */
187 		if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size))
188 			++vp->v_ralen;
189 		/*
190 		 * this code makes sure that the stuff that we have read-ahead
191 		 * is still in the cache.  If it isn't, we have been reading
192 		 * ahead too much, and we need to back-off, otherwise we might
193 		 * try to read more.
194 		 */
195 		for (i = 0; i < vp->v_maxra - lblkno; i++) {
196 			rablkno = lblkno + i;
197 			alreadyincore = (int) incore(vp, rablkno);
198 			if (!alreadyincore) {
199 				vp->v_maxra = rablkno;
200 				vp->v_ralen >>= RA_SHIFTDOWN;
201 				alreadyincore = 1;
202 			}
203 		}
204 	}
205 	/*
206 	 * we now build the read-ahead buffer if it is desirable.
207 	 */
208 	rbp = NULL;
209 	if (!alreadyincore &&
210 	    ((u_quad_t)(rablkno + 1) * size) <= filesize &&
211 	    !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) &&
212 	    blkno != -1) {
213 		if (num_ra > vp->v_ralen)
214 			num_ra = vp->v_ralen;
215 
216 		if (num_ra) {
217 			rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size,
218 				num_ra + 1);
219 		} else {
220 			rbp = getblk(vp, rablkno, size, 0, 0);
221 			rbp->b_flags |= B_READ | B_ASYNC;
222 			rbp->b_blkno = blkno;
223 		}
224 	}
225 
226 	/*
227 	 * handle the synchronous read
228 	 */
229 	if (bp) {
230 		if (bp->b_flags & (B_DONE | B_DELWRI))
231 			panic("cluster_read: DONE bp");
232 		else {
233 			vfs_busy_pages(bp, 0);
234 			error = VOP_STRATEGY(bp);
235 			vp->v_maxra = bp->b_lblkno + bp->b_bcount / size;
236 			totreads++;
237 			totreadblocks += bp->b_bcount / size;
238 			curproc->p_stats->p_ru.ru_inblock++;
239 		}
240 	}
241 	/*
242 	 * and if we have read-aheads, do them too
243 	 */
244 	if (rbp) {
245 		vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size;
246 		if (error) {
247 			rbp->b_flags &= ~(B_ASYNC | B_READ);
248 			brelse(rbp);
249 		} else if (rbp->b_flags & B_CACHE) {
250 			rbp->b_flags &= ~(B_ASYNC | B_READ);
251 			bqrelse(rbp);
252 		} else {
253 			if ((rbp->b_flags & B_CLUSTER) == 0)
254 				vfs_busy_pages(rbp, 0);
255 			(void) VOP_STRATEGY(rbp);
256 			totreads++;
257 			totreadblocks += rbp->b_bcount / size;
258 			curproc->p_stats->p_ru.ru_inblock++;
259 		}
260 	}
261 	if (bp && ((bp->b_flags & B_ASYNC) == 0))
262 		return (biowait(bp));
263 	return (error);
264 }
265 
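/*
 * A minimal sketch (not part of this file) of how a filesystem read path
 * might use cluster_read() in place of bread(); ip, fs, lbn, nextlbn, size
 * and bp are hypothetical caller state, and lblktosize() is the FFS macro
 * giving the byte offset of a logical block:
 *
 *	if (lblktosize(fs, nextlbn) >= ip->i_size)
 *		error = bread(vp, lbn, size, NOCRED, &bp);
 *	else
 *		error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *
 * That is, cluster_read() is a drop-in replacement for bread() whenever
 * more of the file remains beyond the requested block.
 */
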
266 /*
267  * If blocks are contiguous on disk, use this to provide clustered
268  * read ahead.  We will read as many blocks as possible sequentially
269  * and then parcel them up into logical blocks in the buffer hash table.
270  */
271 static struct buf *
272 cluster_rbuild(vp, filesize, lbn, blkno, size, run)
273 	struct vnode *vp;
274 	u_quad_t filesize;
275 	daddr_t lbn;
276 	daddr_t blkno;
277 	long size;
278 	int run;
279 {
280 	struct buf *bp, *tbp;
281 	daddr_t bn;
282 	int i, inc, j;
283 
284 #ifdef DIAGNOSTIC
285 	if (size != vp->v_mount->mnt_stat.f_iosize)
286 		panic("cluster_rbuild: size %ld != f_iosize %ld\n",
287 		    size, vp->v_mount->mnt_stat.f_iosize);
288 #endif
289 	/*
290 	 * avoid a division
291 	 */
292 	while ((u_quad_t) size * (lbn + run) > filesize) {
293 		--run;
294 	}
295 
296 	tbp = getblk(vp, lbn, size, 0, 0);
297 	if (tbp->b_flags & B_CACHE)
298 		return tbp;
299 
300 	tbp->b_blkno = blkno;
301 	tbp->b_flags |= B_ASYNC | B_READ;
302 	if ((tbp->b_flags & B_MALLOC) ||
303 	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
304 		return tbp;
305 
306 	bp = trypbuf();
307 	if (bp == 0)
308 		return tbp;
309 
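	/*
	 * Give the cluster buffer's data pointer the same page offset as the
	 * first component buffer, so the pages entered by pmap_qenter()
	 * below line up with where that buffer's data lives in its pages.
	 */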
310 	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
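	/*
	 * B_CALL makes biodone() call cluster_callback() once the clustered
	 * transfer completes; the callback then finishes each component
	 * buffer on the cluster_head list.
	 */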
311 	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
312 	bp->b_iodone = cluster_callback;
313 	bp->b_blkno = blkno;
314 	bp->b_lblkno = lbn;
315 	pbgetvp(vp, bp);
316 
317 	TAILQ_INIT(&bp->b_cluster.cluster_head);
318 
319 	bp->b_bcount = 0;
320 	bp->b_bufsize = 0;
321 	bp->b_npages = 0;
322 
323 	inc = btodb(size);
324 	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
325 		if (i != 0) {
326 			if ((bp->b_npages * PAGE_SIZE) +
327 				round_page(size) > MAXPHYS)
328 				break;
329 
330 			if (incore(vp, lbn + i))
331 				break;
332 
333 			tbp = getblk(vp, lbn + i, size, 0, 0);
334 
335 			if ((tbp->b_flags & B_CACHE) ||
336 				(tbp->b_flags & B_VMIO) == 0) {
337 				bqrelse(tbp);
338 				break;
339 			}
340 
341 			for (j = 0; j < tbp->b_npages; j++) {
342 				if (tbp->b_pages[j]->valid) {
343 					break;
344 				}
345 			}
346 
347 			if (j != tbp->b_npages) {
348 				/*
349 				 * force buffer to be re-constituted later
350 				 */
351 				tbp->b_flags |= B_RELBUF;
352 				brelse(tbp);
353 				break;
354 			}
355 
356 			tbp->b_flags |= B_READ | B_ASYNC;
357 			if (tbp->b_blkno == tbp->b_lblkno) {
358 				tbp->b_blkno = bn;
359 			} else if (tbp->b_blkno != bn) {
360 				brelse(tbp);
361 				break;
362 			}
363 		}
364 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
365 			tbp, b_cluster.cluster_entry);
366 		for (j = 0; j < tbp->b_npages; j += 1) {
367 			vm_page_t m;
368 			m = tbp->b_pages[j];
369 			++m->busy;
370 			++m->object->paging_in_progress;
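			/*
			 * A completely valid page must not be overwritten by
			 * this read, so substitute the throw-away bogus_page
			 * in the transfer map; tbp->b_pages still holds the
			 * real page for biodone().
			 */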
371 			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
372 				m = bogus_page;
373 			}
374 			if ((bp->b_npages == 0) ||
375 				(bp->b_pages[bp->b_npages-1] != m)) {
376 				bp->b_pages[bp->b_npages] = m;
377 				bp->b_npages++;
378 			}
379 		}
380 		bp->b_bcount += tbp->b_bcount;
381 		bp->b_bufsize += tbp->b_bufsize;
382 	}
383 	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
384 		(vm_page_t *)bp->b_pages, bp->b_npages);
385 	return (bp);
386 }
387 
388 /*
389  * Cleanup after a clustered read or write.
390  * This is complicated by the fact that any of the buffers might have
391  * extra memory (if there were no empty buffer headers at allocbuf time)
392  * that we will need to shift around.
393  */
394 void
395 cluster_callback(bp)
396 	struct buf *bp;
397 {
398 	struct buf *nbp, *tbp;
399 	int error = 0;
400 
401 	/*
402 	 * Must propagate errors to all the components.
403 	 */
404 	if (bp->b_flags & B_ERROR)
405 		error = bp->b_error;
406 
407 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
408 	/*
409 	 * Move memory from the large cluster buffer into the component
410 	 * buffers and mark IO as done on these.
411 	 */
412 	for (tbp = bp->b_cluster.cluster_head.tqh_first;
413 		tbp; tbp = nbp) {
414 		nbp = tbp->b_cluster.cluster_entry.tqe_next;
415 		if (error) {
416 			tbp->b_flags |= B_ERROR;
417 			tbp->b_error = error;
418 		}
419 		biodone(tbp);
420 	}
421 	relpbuf(bp);
422 }
423 
424 /*
425  * Do clustered write for FFS.
426  *
427  * Four cases:
428  *	1. Write is not sequential (write asynchronously)
429  *	Write is sequential:
430  *	2.	beginning of cluster - begin cluster
431  *	3.	middle of a cluster - add to cluster
432  *	4.	end of a cluster - asynchronously write cluster
433  */
434 void
435 cluster_write(bp, filesize)
436 	struct buf *bp;
437 	u_quad_t filesize;
438 {
439 	struct vnode *vp;
440 	daddr_t lbn;
441 	int maxclen, cursize;
442 	int lblocksize;
443 	int async;
444 
445 	vp = bp->b_vp;
446 	async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC));
447 	lblocksize = vp->v_mount->mnt_stat.f_iosize;
448 	lbn = bp->b_lblkno;
449 
450 	/* Initialize vnode to beginning of file. */
451 	if (lbn == 0)
452 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
453 
454 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
455 	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
456 		maxclen = MAXPHYS / lblocksize - 1;
457 		if (vp->v_clen != 0) {
458 			/*
459 			 * Next block is not sequential.
460 			 *
461 			 * If we are not writing at end of file, the process has
462 			 * seeked to another point in the file since its last
463 			 * write, or we have reached our maximum cluster size,
464 			 * then push the previous cluster. Otherwise try
465 			 * reallocating to make it sequential.
466 			 */
467 			cursize = vp->v_lastw - vp->v_cstart + 1;
468 #ifndef notyet_block_reallocation_enabled
469 			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
470 				lbn != vp->v_lastw + 1 ||
471 				vp->v_clen <= cursize) {
472 				if (!async)
473 					cluster_wbuild(vp, lblocksize,
474 						vp->v_cstart, cursize);
475 			}
476 #else
477 			if (!doreallocblks ||
478 			    (lbn + 1) * lblocksize != filesize ||
479 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
480 				if (!async)
481 					cluster_wbuild(vp, lblocksize,
482 						vp->v_cstart, cursize);
483 			} else {
484 				struct buf **bpp, **endbp;
485 				struct cluster_save *buflist;
486 
487 				buflist = cluster_collectbufs(vp, bp);
488 				endbp = &buflist->bs_children
489 				    [buflist->bs_nchildren - 1];
490 				if (VOP_REALLOCBLKS(vp, buflist)) {
491 					/*
492 					 * Failed, push the previous cluster.
493 					 */
494 					for (bpp = buflist->bs_children;
495 					     bpp < endbp; bpp++)
496 						brelse(*bpp);
497 					free(buflist, M_SEGMENT);
498 					cluster_wbuild(vp, lblocksize,
499 					    vp->v_cstart, cursize);
500 				} else {
501 					/*
502 					 * Succeeded, keep building cluster.
503 					 */
504 					for (bpp = buflist->bs_children;
505 					     bpp <= endbp; bpp++)
506 						bdwrite(*bpp);
507 					free(buflist, M_SEGMENT);
508 					vp->v_lastw = lbn;
509 					vp->v_lasta = bp->b_blkno;
510 					return;
511 				}
512 			}
513 #endif /* notyet_block_reallocation_enabled */
514 		}
515 		/*
516 		 * Consider beginning a cluster. If at end of file, make
517 		 * cluster as large as possible, otherwise find size of
518 		 * existing cluster.
519 		 */
520 		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
521 		    (bp->b_blkno == bp->b_lblkno) &&
522 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
523 		     bp->b_blkno == -1)) {
524 			bawrite(bp);
525 			vp->v_clen = 0;
526 			vp->v_lasta = bp->b_blkno;
527 			vp->v_cstart = lbn + 1;
528 			vp->v_lastw = lbn;
529 			return;
530 		}
531 		vp->v_clen = maxclen;
532 		if (!async && maxclen == 0) {	/* I/O not contiguous */
533 			vp->v_cstart = lbn + 1;
534 			bawrite(bp);
535 		} else {	/* Wait for rest of cluster */
536 			vp->v_cstart = lbn;
537 			bdwrite(bp);
538 		}
539 	} else if (lbn == vp->v_cstart + vp->v_clen) {
540 		/*
541 		 * At end of cluster, write it out.
542 		 */
543 		bdwrite(bp);
544 		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
545 		vp->v_clen = 0;
546 		vp->v_cstart = lbn + 1;
547 	} else
548 		/*
549 		 * In the middle of a cluster, so just delay the I/O for now.
550 		 */
551 		bdwrite(bp);
552 	vp->v_lastw = lbn;
553 	vp->v_lasta = bp->b_blkno;
554 }
555 
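/*
 * A minimal sketch (not part of this file) of how a filesystem write path
 * might hand blocks to cluster_write(); ioflag, fs, ip, xfersize, blkoffset
 * and bp are hypothetical caller state:
 *
 *	if (ioflag & IO_SYNC)
 *		(void) bwrite(bp);
 *	else if (xfersize + blkoffset == fs->fs_bsize)
 *		cluster_write(bp, ip->i_size);
 *	else
 *		bdwrite(bp);
 *
 * Only full blocks are candidates for clustering; cluster_write() then
 * chooses among bawrite(), bdwrite() and a clustered push as described in
 * the cases above.
 */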
556 
557 /*
558  * This is an awful lot like cluster_rbuild...wish they could be combined.
559  * Gather up to `len' delayed-write buffers for `vp', starting at logical
560  * block `start_lbn', and issue them as one or more clustered writes of at
561  * most MAXPHYS bytes each.  Returns the total number of buffer bytes written.
562  */
563 int
564 cluster_wbuild(vp, size, start_lbn, len)
565 	struct vnode *vp;
566 	long size;
567 	daddr_t start_lbn;
568 	int len;
569 {
570 	struct buf *bp, *tbp;
571 	int i, j, s;
572 	int totalwritten = 0;
573 	int dbsize = btodb(size);
574 	while (len > 0) {
575 		s = splbio();
576 		if ( ((tbp = gbincore(vp, start_lbn)) == NULL) ||
577 			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
578 			++start_lbn;
579 			--len;
580 			splx(s);
581 			continue;
582 		}
583 		bremfree(tbp);
584 		tbp->b_flags |= B_BUSY;
585 		tbp->b_flags &= ~B_DONE;
586 		splx(s);
587 
588 	/*
589 	 * Extra memory in the buffer, punt on this buffer. XXX we could
590 	 * handle this in most cases, but we would have to push the extra
591 	 * memory down to after our max possible cluster size and then
592 	 * potentially pull it back up if the cluster was terminated
593 	 * prematurely--too much hassle.
594 	 */
595 		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
596 			(tbp->b_bcount != tbp->b_bufsize) ||
597 			(tbp->b_bcount != size) ||
598 			len == 1) {
599 			totalwritten += tbp->b_bufsize;
600 			bawrite(tbp);
601 			++start_lbn;
602 			--len;
603 			continue;
604 		}
605 
606 		bp = trypbuf();
607 		if (bp == NULL) {
608 			totalwritten += tbp->b_bufsize;
609 			bawrite(tbp);
610 			++start_lbn;
611 			--len;
612 			continue;
613 		}
614 
615 		TAILQ_INIT(&bp->b_cluster.cluster_head);
616 		bp->b_bcount = 0;
617 		bp->b_bufsize = 0;
618 		bp->b_npages = 0;
619 		if (tbp->b_wcred != NOCRED) {
620 		    bp->b_wcred = tbp->b_wcred;
621 		    crhold(bp->b_wcred);
622 		}
623 
624 		bp->b_blkno = tbp->b_blkno;
625 		bp->b_lblkno = tbp->b_lblkno;
626 		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
627 		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
628 		bp->b_iodone = cluster_callback;
629 		pbgetvp(vp, bp);
630 
631 		for (i = 0; i < len; ++i, ++start_lbn) {
632 			if (i != 0) {
633 				s = splbio();
634 				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
635 					splx(s);
636 					break;
637 				}
638 
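				/*
				 * The candidate buffer must be a clusterable
				 * delayed write, neither busy nor invalid, and
				 * its B_VMIO/B_NEEDCOMMIT state must match
				 * that of the cluster being built.
				 */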
639 				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
640 					splx(s);
641 					break;
642 				}
643 
644 				if (tbp->b_wcred != bp->b_wcred) {
645 					splx(s);
646 					break;
647 				}
648 
649 				if ((tbp->b_bcount != size) ||
650 					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
651 					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
652 					splx(s);
653 					break;
654 				}
655 				bremfree(tbp);
656 				tbp->b_flags |= B_BUSY;
657 				tbp->b_flags &= ~B_DONE;
658 				splx(s);
659 			}
660 			if (tbp->b_flags & B_VMIO) {
661 				for (j = 0; j < tbp->b_npages; j += 1) {
662 					vm_page_t m;
663 					m = tbp->b_pages[j];
664 					++m->busy;
665 					++m->object->paging_in_progress;
666 					if ((bp->b_npages == 0) ||
667 						(bp->b_pages[bp->b_npages - 1] != m)) {
668 						bp->b_pages[bp->b_npages] = m;
669 						bp->b_npages++;
670 					}
671 				}
672 			}
673 			bp->b_bcount += size;
674 			bp->b_bufsize += size;
675 
676 			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
677 			tbp->b_flags |= B_ASYNC;
678 			s = splbio();
679 			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
680 			++tbp->b_vp->v_numoutput;
681 			splx(s);
682 			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
683 				tbp, b_cluster.cluster_entry);
684 		}
685 		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
686 			(vm_page_t *) bp->b_pages, bp->b_npages);
687 		totalwritten += bp->b_bufsize;
688 		bp->b_dirtyoff = 0;
689 		bp->b_dirtyend = bp->b_bufsize;
690 		bawrite(bp);
691 
692 		len -= i;
693 	}
694 	return totalwritten;
695 }
696 
697 #ifdef notyet_block_reallocation_enabled
698 /*
699  * Collect together all the buffers in a cluster.
700  * Plus add one additional buffer.
701  */
702 static struct cluster_save *
703 cluster_collectbufs(vp, last_bp)
704 	struct vnode *vp;
705 	struct buf *last_bp;
706 {
707 	struct cluster_save *buflist;
708 	daddr_t lbn;
709 	int i, len;
710 
711 	len = vp->v_lastw - vp->v_cstart + 1;
712 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
713 	    M_SEGMENT, M_WAITOK);
714 	buflist->bs_nchildren = 0;
715 	buflist->bs_children = (struct buf **) (buflist + 1);
716 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
717 		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
718 		    &buflist->bs_children[i]);
719 	buflist->bs_children[i] = last_bp;
720 	buflist->bs_nchildren = i + 1;
721 	return (buflist);
722 }
723 #endif /* notyet_block_reallocation_enabled */
724