xref: /freebsd/sys/kern/vfs_cluster.c (revision df7f5d4de4592a8948a25ce01e5bddfbb7ce39dc)
1 /*-
2  * Copyright (c) 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
36  * $Id: vfs_cluster.c,v 1.42 1997/02/22 09:39:31 peter Exp $
37  */
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/buf.h>
43 #include <sys/vnode.h>
44 #include <sys/mount.h>
45 #include <sys/malloc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/vmmeter.h>
48 #include <miscfs/specfs/specdev.h>
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/vm_prot.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_page.h>
54 
55 #if defined(CLUSTERDEBUG)
56 #include <sys/sysctl.h>
57 #include <sys/kernel.h>
58 static int	rcluster = 0;
59 SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, "");
60 #endif
61 
62 #ifdef notyet_block_reallocation_enabled
63 #ifdef DEBUG
64 #include <sys/sysctl.h>
65 #include <sys/kernel.h>
66 
67 static int	doreallocblks = 0;
68 SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
69 #else
70 #define	doreallocblks 0
71 #endif
72 #endif /* notyet_block_reallocation_enabled */
73 
74 #ifdef notyet_block_reallocation_enabled
75 static struct cluster_save *
76 	cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
77 #endif
78 static struct buf *
79 	cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
80 			    daddr_t blkno, long size, int run, struct buf *fbp));
81 
82 extern vm_page_t	bogus_page;
83 
84 /*
85  * Maximum number of blocks for read-ahead.
86  */
87 #define MAXRA 32
88 
89 /*
90  * This replaces bread, adding clustered read-ahead for sequential access.
91  */
92 int
93 cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
94 	struct vnode *vp;
95 	u_quad_t filesize;
96 	daddr_t lblkno;
97 	long size;
98 	struct ucred *cred;
99 	long totread;
100 	int seqcount;
101 	struct buf **bpp;
102 {
103 	struct buf *bp, *rbp, *reqbp;
104 	daddr_t blkno, rablkno, origblkno;
105 	int error, num_ra;
106 	int i;
107 	int maxra, racluster;
108 	long origtotread;
109 
110 	error = 0;
111 
112 	/*
113 	 * Try to limit the amount of read-ahead by a few
114 	 * ad-hoc parameters.  This needs work!!!
115 	 */
116 	racluster = MAXPHYS/size;
117 	maxra = 2 * racluster + (totread / size);
118 	if (maxra > MAXRA)
119 		maxra = MAXRA;
120 	if (maxra > nbuf/8)
121 		maxra = nbuf/8;
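	/*
	 * Example (illustrative; assumes a MAXPHYS of 64K and an 8K block
	 * size): racluster = 64K/8K = 8, so a 32K request gives
	 * maxra = 2 * 8 + 32K/8K = 20, which is then clamped to MAXRA (32)
	 * and to nbuf/8.
	 */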
122 
123 	/*
124 	 * get the requested block
125 	 */
126 	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
127 	origblkno = lblkno;
128 	origtotread = totread;
129 
130 	/*
131 	 * if it is in the cache, then check to see if the reads have been
132 	 * sequential.  If they have, then try some read-ahead, otherwise
133 	 * back-off on prospective read-aheads.
134 	 */
135 	if (bp->b_flags & B_CACHE) {
136 		if (!seqcount) {
137 			return 0;
138 		} else if ((bp->b_flags & B_RAM) == 0) {
139 			return 0;
140 		} else {
141 			int s;
142 			struct buf *tbp;
143 			bp->b_flags &= ~B_RAM;
144 			/*
145 			 * We do the spl here so that there is no window
146 			 * between the incore and the (currently #if 0'ed)
147 			 * b_usecount handling below.  We opt to keep the
148 			 * spl out of the loop for efficiency.
149 			 */
150 			s = splbio();
151 			for (i = 1; i < maxra; i++) {
152 
153 				if (!(tbp = incore(vp, lblkno+i))) {
154 					break;
155 				}
156 
157 				/*
158 				 * Set another read-ahead mark so we know to check
159 				 * again.
160 				 */
161 				if (((i % racluster) == (racluster - 1)) ||
162 					(i == (maxra - 1)))
163 					tbp->b_flags |= B_RAM;
164 
165 #if 0
166 				if (tbp->b_usecount == 0) {
167 					/*
168 					 * Make sure that the soon-to-be used readaheads
169 					 * are still there.  The getblk/bqrelse pair will
170 					 * boost the priority of the buffer.
171 					 */
172 					tbp = getblk(vp, lblkno+i, size, 0, 0);
173 					bqrelse(tbp);
174 				}
175 #endif
176 			}
177 			splx(s);
178 			if (i >= maxra) {
179 				return 0;
180 			}
181 			lblkno += i;
182 		}
183 		reqbp = bp = NULL;
184 	} else {
185 		u_quad_t firstread;
186 		firstread = (u_quad_t) lblkno * size;
187 		if (firstread + totread > filesize)
188 			totread = filesize - firstread;
189 		if (totread > size) {
190 			int nblks = 0;
191 			int ncontigafter;
192 			while (totread > 0) {
193 				nblks++;
194 				totread -= size;
195 			}
196 			if (nblks == 1)
197 				goto single_block_read;
198 			if (nblks > racluster)
199 				nblks = racluster;
200 
201 			error = VOP_BMAP(vp, lblkno, NULL,
202 				&blkno, &ncontigafter, NULL);
203 			if (error)
204 				goto single_block_read;
205 			if (blkno == -1)
206 				goto single_block_read;
207 			if (ncontigafter == 0)
208 				goto single_block_read;
209 			if (ncontigafter + 1 < nblks)
210 				nblks = ncontigafter + 1;
211 
212 			bp = cluster_rbuild(vp, filesize, lblkno,
213 				blkno, size, nblks, bp);
214 			lblkno += nblks;
215 		} else {
216 single_block_read:
217 			/*
218 			 * if it isn't in the cache, then get a chunk from
219 			 * disk if sequential, otherwise just get the block.
220 			 */
221 			bp->b_flags |= B_READ | B_RAM;
222 			lblkno += 1;
223 		}
224 	}
225 
226 	/*
227 	 * if we have been doing sequential I/O, then do some read-ahead
228 	 */
229 	rbp = NULL;
230 	/* if (seqcount && (lblkno < (origblkno + maxra))) { */
231 	if (seqcount && (lblkno < (origblkno + seqcount))) {
232 		/*
233 		 * we now build the read-ahead buffer if it is desirable.
234 		 */
235 		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
236 		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
237 		    blkno != -1) {
238 			int nblksread;
239 			int ntoread = num_ra + 1;
240 			nblksread = (origtotread + size - 1) / size;
241 			if (seqcount < nblksread)
242 				seqcount = nblksread;
243 			if (seqcount < ntoread)
244 				ntoread = seqcount;
245 			if (num_ra) {
246 				rbp = cluster_rbuild(vp, filesize, lblkno,
247 					blkno, size, ntoread, NULL);
248 			} else {
249 				rbp = getblk(vp, lblkno, size, 0, 0);
250 				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
251 				rbp->b_blkno = blkno;
252 			}
253 		}
254 	}
255 
256 	/*
257 	 * handle the synchronous read
258 	 */
259 	if (bp) {
260 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
261 			panic("cluster_read: DONE bp");
262 		} else {
263 #if defined(CLUSTERDEBUG)
264 			if (rcluster)
265 				printf("S(%d,%d,%d) ",
266 					bp->b_lblkno, bp->b_bcount, seqcount);
267 #endif
268 			if ((bp->b_flags & B_CLUSTER) == 0)
269 				vfs_busy_pages(bp, 0);
270 			error = VOP_STRATEGY(bp);
271 			curproc->p_stats->p_ru.ru_inblock++;
272 		}
273 	}
274 	/*
275 	 * and if we have read-aheads, do them too
276 	 */
277 	if (rbp) {
278 		if (error) {
279 			rbp->b_flags &= ~(B_ASYNC | B_READ);
280 			brelse(rbp);
281 		} else if (rbp->b_flags & B_CACHE) {
282 			rbp->b_flags &= ~(B_ASYNC | B_READ);
283 			bqrelse(rbp);
284 		} else {
285 #if defined(CLUSTERDEBUG)
286 			if (rcluster) {
287 				if (bp)
288 					printf("A+(%d,%d,%d,%d) ",
289 					rbp->b_lblkno, rbp->b_bcount,
290 					rbp->b_lblkno - origblkno,
291 					seqcount);
292 				else
293 					printf("A(%d,%d,%d,%d) ",
294 					rbp->b_lblkno, rbp->b_bcount,
295 					rbp->b_lblkno - origblkno,
296 					seqcount);
297 			}
298 #endif
299 
300 			if ((rbp->b_flags & B_CLUSTER) == 0)
301 				vfs_busy_pages(rbp, 0);
302 			(void) VOP_STRATEGY(rbp);
303 			curproc->p_stats->p_ru.ru_inblock++;
304 		}
305 	}
306 	if (reqbp)
307 		return (biowait(reqbp));
308 	else
309 		return (error);
310 }
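
/*
 * Sketch of a typical caller (illustrative only; not part of this file).
 * A filesystem read routine is expected to call cluster_read() in place
 * of bread() when clustering is enabled.  The names "doclusterread",
 * "ip", "lbn", "bsize", "uio" and "seqcount" below are assumptions that
 * stand in for the caller's own state; real callers derive seqcount from
 * their sequential-access heuristic.
 */
#if 0
	if (doclusterread)
		error = cluster_read(vp, ip->i_size, lbn, bsize, NOCRED,
		    uio->uio_resid, seqcount, &bp);
	else
		error = bread(vp, lbn, bsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
#endif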
311 
312 /*
313  * If blocks are contiguous on disk, use this to provide clustered
314  * read ahead.  We will read as many blocks as possible sequentially
315  * and then parcel them up into logical blocks in the buffer hash table.
316  */
317 static struct buf *
318 cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
319 	struct vnode *vp;
320 	u_quad_t filesize;
321 	daddr_t lbn;
322 	daddr_t blkno;
323 	long size;
324 	int run;
325 	struct buf *fbp;
326 {
327 	struct buf *bp, *tbp;
328 	daddr_t bn;
329 	int i, inc, j;
330 
331 #ifdef DIAGNOSTIC
332 	if (size != vp->v_mount->mnt_stat.f_iosize)
333 		panic("cluster_rbuild: size %d != f_iosize %d\n",
334 		    size, vp->v_mount->mnt_stat.f_iosize);
335 #endif
336 	/*
337 	 * avoid a division
338 	 */
339 	while ((u_quad_t) size * (lbn + run) > filesize) {
340 		--run;
341 	}
342 
343 	if (fbp) {
344 		tbp = fbp;
345 		tbp->b_flags |= B_READ;
346 	} else {
347 		tbp = getblk(vp, lbn, size, 0, 0);
348 		if (tbp->b_flags & B_CACHE)
349 			return tbp;
350 		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
351 	}
352 
353 	tbp->b_blkno = blkno;
354 	if ((tbp->b_flags & B_MALLOC) ||
355 		((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
356 		return tbp;
357 
358 	bp = trypbuf();
359 	if (bp == NULL)
360 		return tbp;
361 
362 	(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
363 	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
364 	bp->b_iodone = cluster_callback;
365 	bp->b_blkno = blkno;
366 	bp->b_lblkno = lbn;
367 	pbgetvp(vp, bp);
368 
369 	TAILQ_INIT(&bp->b_cluster.cluster_head);
370 
371 	bp->b_bcount = 0;
372 	bp->b_bufsize = 0;
373 	bp->b_npages = 0;
374 
375 	inc = btodb(size);
376 	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
377 		if (i != 0) {
378 			if ((bp->b_npages * PAGE_SIZE) +
379 				round_page(size) > MAXPHYS)
380 				break;
381 
382 			if (incore(vp, lbn + i))
383 				break;
384 
385 			tbp = getblk(vp, lbn + i, size, 0, 0);
386 
387 			if ((tbp->b_flags & B_CACHE) ||
388 				(tbp->b_flags & B_VMIO) == 0) {
389 				bqrelse(tbp);
390 				break;
391 			}
392 
393 			for (j = 0; j < tbp->b_npages; j++) {
394 				if (tbp->b_pages[j]->valid) {
395 					break;
396 				}
397 			}
398 
399 			if (j != tbp->b_npages) {
400 				/*
401 				 * force buffer to be re-constituted later
402 				 */
403 				tbp->b_flags |= B_RELBUF;
404 				brelse(tbp);
405 				break;
406 			}
407 
408 			if ((fbp && (i == 1)) || (i == (run - 1)))
409 				tbp->b_flags |= B_RAM;
410 			tbp->b_flags |= B_READ | B_ASYNC;
411 			if (tbp->b_blkno == tbp->b_lblkno) {
412 				tbp->b_blkno = bn;
413 			} else if (tbp->b_blkno != bn) {
414 				brelse(tbp);
415 				break;
416 			}
417 		}
418 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
419 			tbp, b_cluster.cluster_entry);
420 		for (j = 0; j < tbp->b_npages; j += 1) {
421 			vm_page_t m;
422 			m = tbp->b_pages[j];
423 			++m->busy;
424 			++m->object->paging_in_progress;
425 			if ((bp->b_npages == 0) ||
426 				(bp->b_pages[bp->b_npages-1] != m)) {
427 				bp->b_pages[bp->b_npages] = m;
428 				bp->b_npages++;
429 			}
430 			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
431 				tbp->b_pages[j] = bogus_page;
432 		}
433 		bp->b_bcount += tbp->b_bcount;
434 		bp->b_bufsize += tbp->b_bufsize;
435 	}
436 
437 	for (j = 0; j < bp->b_npages; j++) {
438 		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
439 			VM_PAGE_BITS_ALL)
440 			bp->b_pages[j] = bogus_page;
441 	}
442 	if (bp->b_bufsize > bp->b_kvasize)
443 		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
444 			bp->b_bufsize, bp->b_kvasize);
445 	bp->b_kvasize = bp->b_bufsize;
446 
447 	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
448 		(vm_page_t *)bp->b_pages, bp->b_npages);
449 	return (bp);
450 }
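
/*
 * Example (illustrative; assumes an 8K-block filesystem and a MAXPHYS of
 * at least 64K): for a contiguous 8-block run starting at lbn 10, one
 * pbuf covering 64K is built, the eight component buffers are queued on
 * b_cluster.cluster_head, any page that is already fully valid is
 * replaced by bogus_page so the transfer cannot overwrite it, and
 * cluster_callback() later biodone()s each component buffer.
 */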
451 
452 /*
453  * Cleanup after a clustered read or write.
454  * This is complicated by the fact that any of the buffers might have
455  * extra memory (if there were no empty buffer headers at allocbuf time)
456  * that we will need to shift around.
457  */
458 void
459 cluster_callback(bp)
460 	struct buf *bp;
461 {
462 	struct buf *nbp, *tbp;
463 	int error = 0;
464 
465 	/*
466 	 * Must propagate errors to all the components.
467 	 */
468 	if (bp->b_flags & B_ERROR)
469 		error = bp->b_error;
470 
471 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
472 	/*
473 	 * Move memory from the large cluster buffer into the component
474 	 * buffers and mark IO as done on these.
475 	 */
476 	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
477 		tbp; tbp = nbp) {
478 		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
479 		if (error) {
480 			tbp->b_flags |= B_ERROR;
481 			tbp->b_error = error;
482 		}
483 		biodone(tbp);
484 	}
485 	relpbuf(bp);
486 }
487 
488 /*
489  * Do clustered write for FFS.
490  *
491  * Four cases:
492  *	1. Write is not sequential (write asynchronously)
493  *	Write is sequential:
494  *	2.	beginning of cluster - begin cluster
495  *	3.	middle of a cluster - add to cluster
496  *	4.	end of a cluster - asynchronously write cluster
497  */
498 void
499 cluster_write(bp, filesize)
500 	struct buf *bp;
501 	u_quad_t filesize;
502 {
503 	struct vnode *vp;
504 	daddr_t lbn;
505 	int maxclen, cursize;
506 	int lblocksize;
507 	int async;
508 
509 	vp = bp->b_vp;
510 	async = vp->v_mount->mnt_flag & MNT_ASYNC;
511 	lblocksize = vp->v_mount->mnt_stat.f_iosize;
512 	lbn = bp->b_lblkno;
513 
514 	/* Initialize vnode to beginning of file. */
515 	if (lbn == 0)
516 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
517 
518 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
519 	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
520 		maxclen = MAXPHYS / lblocksize - 1;
521 		if (vp->v_clen != 0) {
522 			/*
523 			 * Next block is not sequential.
524 			 *
525 			 * If we are not writing at end of file, the process
526 			 * seeked to another point in the file since its last
527 			 * write, or we have reached our maximum cluster size,
528 			 * then push the previous cluster. Otherwise try
529 			 * reallocating to make it sequential.
530 			 */
531 			cursize = vp->v_lastw - vp->v_cstart + 1;
532 #ifndef notyet_block_reallocation_enabled
533 			if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
534 				lbn != vp->v_lastw + 1 ||
535 				vp->v_clen <= cursize) {
536 				if (!async)
537 					cluster_wbuild(vp, lblocksize,
538 						vp->v_cstart, cursize);
539 			}
540 #else
541 			if (!doreallocblks ||
542 			    (lbn + 1) * lblocksize != filesize ||
543 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
544 				if (!async)
545 					cluster_wbuild(vp, lblocksize,
546 						vp->v_cstart, cursize);
547 			} else {
548 				struct buf **bpp, **endbp;
549 				struct cluster_save *buflist;
550 
551 				buflist = cluster_collectbufs(vp, bp);
552 				endbp = &buflist->bs_children
553 				    [buflist->bs_nchildren - 1];
554 				if (VOP_REALLOCBLKS(vp, buflist)) {
555 					/*
556 					 * Failed, push the previous cluster.
557 					 */
558 					for (bpp = buflist->bs_children;
559 					     bpp < endbp; bpp++)
560 						brelse(*bpp);
561 					free(buflist, M_SEGMENT);
562 					cluster_wbuild(vp, lblocksize,
563 					    vp->v_cstart, cursize);
564 				} else {
565 					/*
566 					 * Succeeded, keep building cluster.
567 					 */
568 					for (bpp = buflist->bs_children;
569 					     bpp <= endbp; bpp++)
570 						bdwrite(*bpp);
571 					free(buflist, M_SEGMENT);
572 					vp->v_lastw = lbn;
573 					vp->v_lasta = bp->b_blkno;
574 					return;
575 				}
576 			}
577 #endif /* notyet_block_reallocation_enabled */
578 		}
579 		/*
580 		 * Consider beginning a cluster. If at end of file, make
581 		 * cluster as large as possible, otherwise find size of
582 		 * existing cluster.
583 		 */
584 		if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
585 		    (bp->b_blkno == bp->b_lblkno) &&
586 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
587 		     bp->b_blkno == -1)) {
588 			bawrite(bp);
589 			vp->v_clen = 0;
590 			vp->v_lasta = bp->b_blkno;
591 			vp->v_cstart = lbn + 1;
592 			vp->v_lastw = lbn;
593 			return;
594 		}
595 		vp->v_clen = maxclen;
596 		if (!async && maxclen == 0) {	/* I/O not contiguous */
597 			vp->v_cstart = lbn + 1;
598 			bawrite(bp);
599 		} else {	/* Wait for rest of cluster */
600 			vp->v_cstart = lbn;
601 			bdwrite(bp);
602 		}
603 	} else if (lbn == vp->v_cstart + vp->v_clen) {
604 		/*
605 		 * At end of cluster, write it out.
606 		 */
607 		bdwrite(bp);
608 		cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
609 		vp->v_clen = 0;
610 		vp->v_cstart = lbn + 1;
611 	} else
612 		/*
613 		 * In the middle of a cluster, so just delay the I/O for now.
614 		 */
615 		bdwrite(bp);
616 	vp->v_lastw = lbn;
617 	vp->v_lasta = bp->b_blkno;
618 }
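
/*
 * Example (illustrative; assumes an 8K-block filesystem, a MAXPHYS of
 * 64K and contiguous allocation): the write of lbn 0 starts a cluster
 * with v_cstart = 0 and v_clen = 7, writes of lbns 1-6 fall in the
 * middle of the cluster and are simply bdwrite()n, and the write of
 * lbn 7 (v_cstart + v_clen) pushes all eight blocks through
 * cluster_wbuild() as a single 64K transfer.  A non-sequential write
 * instead flushes the previous cluster immediately.
 */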
619 
620 
621 /*
622  * This is an awful lot like cluster_rbuild...wish they could be combined.
623  * Gather up to "len" delayed-write buffers on vp, starting at start_lbn,
624  * combine the disk-contiguous ones into clusters and start the writes
625  * asynchronously.  Returns the total number of bytes written.
626  */
627 int
628 cluster_wbuild(vp, size, start_lbn, len)
629 	struct vnode *vp;
630 	long size;
631 	daddr_t start_lbn;
632 	int len;
633 {
634 	struct buf *bp, *tbp;
635 	int i, j, s;
636 	int totalwritten = 0;
637 	int dbsize = btodb(size);
638 	while (len > 0) {
639 		s = splbio();
640 		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
641 			((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
642 			++start_lbn;
643 			--len;
644 			splx(s);
645 			continue;
646 		}
647 		bremfree(tbp);
648 		tbp->b_flags |= B_BUSY;
649 		tbp->b_flags &= ~B_DONE;
650 		splx(s);
651 
652 		/*
653 		 * Extra memory in the buffer, punt on this buffer. XXX we could
654 		 * handle this in most cases, but we would have to push the extra
655 		 * memory down to after our max possible cluster size and then
656 		 * potentially pull it back up if the cluster was terminated
657 		 * prematurely--too much hassle.
658 		 */
659 		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
660 			(tbp->b_bcount != tbp->b_bufsize) ||
661 			(tbp->b_bcount != size) ||
662 			len == 1) {
663 			totalwritten += tbp->b_bufsize;
664 			bawrite(tbp);
665 			++start_lbn;
666 			--len;
667 			continue;
668 		}
669 
670 		bp = trypbuf();
671 		if (bp == NULL) {
672 			totalwritten += tbp->b_bufsize;
673 			bawrite(tbp);
674 			++start_lbn;
675 			--len;
676 			continue;
677 		}
678 
679 		TAILQ_INIT(&bp->b_cluster.cluster_head);
680 		bp->b_bcount = 0;
681 		bp->b_bufsize = 0;
682 		bp->b_npages = 0;
683 		if (tbp->b_wcred != NOCRED) {
684 		    bp->b_wcred = tbp->b_wcred;
685 		    crhold(bp->b_wcred);
686 		}
687 
688 		bp->b_blkno = tbp->b_blkno;
689 		bp->b_lblkno = tbp->b_lblkno;
690 		(vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
691 		bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
692 		bp->b_iodone = cluster_callback;
693 		pbgetvp(vp, bp);
694 
695 		for (i = 0; i < len; ++i, ++start_lbn) {
696 			if (i != 0) {
697 				s = splbio();
698 				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
699 					splx(s);
700 					break;
701 				}
702 
703 				if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
704 					splx(s);
705 					break;
706 				}
707 
708 				if (tbp->b_wcred != bp->b_wcred) {
709 					splx(s);
710 					break;
711 				}
712 
713 				if ((tbp->b_bcount != size) ||
714 					((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
715 					((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
716 					splx(s);
717 					break;
718 				}
719 				bremfree(tbp);
720 				tbp->b_flags |= B_BUSY;
721 				tbp->b_flags &= ~B_DONE;
722 				splx(s);
723 			}
724 			if (tbp->b_flags & B_VMIO) {
725 				for (j = 0; j < tbp->b_npages; j += 1) {
726 					vm_page_t m;
727 					m = tbp->b_pages[j];
728 					++m->busy;
729 					++m->object->paging_in_progress;
730 					if ((bp->b_npages == 0) ||
731 						(bp->b_pages[bp->b_npages - 1] != m)) {
732 						bp->b_pages[bp->b_npages] = m;
733 						bp->b_npages++;
734 					}
735 				}
736 			}
737 			bp->b_bcount += size;
738 			bp->b_bufsize += size;
739 
740 			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
741 			tbp->b_flags |= B_ASYNC;
742 			s = splbio();
743 			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
744 			++tbp->b_vp->v_numoutput;
745 			splx(s);
746 			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
747 				tbp, b_cluster.cluster_entry);
748 		}
749 		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
750 			(vm_page_t *) bp->b_pages, bp->b_npages);
751 		if (bp->b_bufsize > bp->b_kvasize)
752 			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
753 				bp->b_bufsize, bp->b_kvasize);
754 		bp->b_kvasize = bp->b_bufsize;
755 		totalwritten += bp->b_bufsize;
756 		bp->b_dirtyoff = 0;
757 		bp->b_dirtyend = bp->b_bufsize;
758 		bawrite(bp);
759 
760 		len -= i;
761 	}
762 	return totalwritten;
763 }
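
/*
 * Sketch of the usual call site (illustrative only): this is how
 * cluster_write() above pushes a completed cluster.  The return value,
 * which that caller ignores, is the total number of bytes handed off
 * for writing.
 */
#if 0
	cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
#endif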
764 
765 #ifdef notyet_block_reallocation_enabled
766 /*
767  * Collect together all the buffers in a cluster.
768  * Plus add one additional buffer.
769  */
770 static struct cluster_save *
771 cluster_collectbufs(vp, last_bp)
772 	struct vnode *vp;
773 	struct buf *last_bp;
774 {
775 	struct cluster_save *buflist;
776 	daddr_t lbn;
777 	int i, len;
778 
779 	len = vp->v_lastw - vp->v_cstart + 1;
780 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
781 	    M_SEGMENT, M_WAITOK);
782 	buflist->bs_nchildren = 0;
783 	buflist->bs_children = (struct buf **) (buflist + 1);
784 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
785 		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
786 		    &buflist->bs_children[i]);
787 	buflist->bs_children[i] = last_bp;
788 	buflist->bs_nchildren = i + 1;
789 	return (buflist);
790 }
791 #endif /* notyet_block_reallocation_enabled */
792