/*-
 * Copyright (c) 1993
 *    The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 *    Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *    @(#)vfs_cluster.c    8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.82 1999/06/16 15:54:30 dg Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");

static struct cluster_save *
    cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
static struct buf *
    cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
        daddr_t blkno, long size, int run, struct buf *fbp));

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * Maximum number of blocks for read-ahead.
 */
#define MAXRA 32

/*
 * This replaces bread.
 */
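/*
 * Usage sketch (illustrative only, not part of this file): a filesystem's
 * read path would typically call cluster_read() in place of bread() when
 * the file is being read sequentially, passing the file size, the number
 * of bytes the caller still wants, and a sequential-access hint.  The
 * names ip->i_size, uio and seqcount below stand in for whatever the
 * caller actually has on hand:
 *
 *    error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
 *        uio->uio_resid, seqcount, &bp);
 *
 * A caller that is not reading sequentially would simply use
 * bread(vp, lbn, size, NOCRED, &bp) instead.
 */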
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
    struct vnode *vp;
    u_quad_t filesize;
    daddr_t lblkno;
    long size;
    struct ucred *cred;
    long totread;
    int seqcount;
    struct buf **bpp;
{
    struct buf *bp, *rbp, *reqbp;
    daddr_t blkno, origblkno;
    int error, num_ra;
    int i;
    int maxra, racluster;
    long origtotread;

    error = 0;
    if (vp->v_maxio == 0)
        vp->v_maxio = DFLTPHYS;

    /*
     * Try to limit the amount of read-ahead by a few
     * ad-hoc parameters.  This needs work!!!
     */
    racluster = vp->v_maxio / size;
    maxra = 2 * racluster + (totread / size);
    if (maxra > MAXRA)
        maxra = MAXRA;
    if (maxra > nbuf / 8)
        maxra = nbuf / 8;

    /*
     * get the requested block
     */
    *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
    origblkno = lblkno;
    origtotread = totread;

    /*
     * if it is in the cache, then check to see if the reads have been
     * sequential.  If they have, then try some read-ahead, otherwise
     * back-off on prospective read-aheads.
     */
    if (bp->b_flags & B_CACHE) {
        if (!seqcount) {
            return 0;
        } else if ((bp->b_flags & B_RAM) == 0) {
            return 0;
        } else {
            int s;
            struct buf *tbp;
            bp->b_flags &= ~B_RAM;
            /*
             * We hold splbio across the whole loop so that there is
             * no window between the incore check and the buffer
             * queue manipulation below; taking the spl once outside
             * the loop is cheaper than doing it per iteration.
             */
            s = splbio();
            for (i = 1; i < maxra; i++) {
                if (!(tbp = incore(vp, lblkno + i))) {
                    break;
                }

                /*
                 * Set another read-ahead mark so we know to check
                 * again.
                 */
                if (((i % racluster) == (racluster - 1)) ||
                    (i == (maxra - 1)))
                    tbp->b_flags |= B_RAM;

                if ((tbp->b_usecount < 1) &&
                    ((tbp->b_flags & B_BUSY) == 0) &&
                    (tbp->b_qindex == QUEUE_LRU)) {
                    TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist);
                    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist);
                }
            }
            splx(s);
            if (i >= maxra) {
                return 0;
            }
            lblkno += i;
        }
        reqbp = bp = NULL;
    } else {
        off_t firstread = bp->b_offset;

        KASSERT(bp->b_offset != NOOFFSET,
            ("cluster_read: no buffer offset"));
        if (firstread + totread > filesize)
            totread = filesize - firstread;
        if (totread > size) {
            int nblks = 0;
            int ncontigafter;
            while (totread > 0) {
                nblks++;
                totread -= size;
            }
            if (nblks == 1)
                goto single_block_read;
            if (nblks > racluster)
                nblks = racluster;

            error = VOP_BMAP(vp, lblkno, NULL,
                &blkno, &ncontigafter, NULL);
            if (error)
                goto single_block_read;
            if (blkno == -1)
                goto single_block_read;
            if (ncontigafter == 0)
                goto single_block_read;
            if (ncontigafter + 1 < nblks)
                nblks = ncontigafter + 1;

            bp = cluster_rbuild(vp, filesize, lblkno,
                blkno, size, nblks, bp);
            lblkno += (bp->b_bufsize / size);
        } else {
single_block_read:
            /*
             * if it isn't in the cache, then get a chunk from
             * disk if sequential, otherwise just get the block.
             */
            bp->b_flags |= B_READ | B_RAM;
            lblkno += 1;
        }
    }

    /*
     * if we have been doing sequential I/O, then do some read-ahead
     */
    rbp = NULL;
    if (seqcount && (lblkno < (origblkno + seqcount))) {
        /*
         * we now build the read-ahead buffer if it is desirable.
         */
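        /*
         * VOP_BMAP reports in num_ra how many blocks beyond lblkno are
         * physically contiguous on disk.  Below, seqcount is first raised
         * to cover at least the blocks spanned by the original request,
         * and the read-ahead size (ntoread) is then clamped to both the
         * observed sequentiality and the available contiguity.
         */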
        if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
            !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
            blkno != -1) {
            int nblksread;
            int ntoread = num_ra + 1;
            nblksread = (origtotread + size - 1) / size;
            if (seqcount < nblksread)
                seqcount = nblksread;
            if (seqcount < ntoread)
                ntoread = seqcount;
            if (num_ra) {
                rbp = cluster_rbuild(vp, filesize, lblkno,
                    blkno, size, ntoread, NULL);
            } else {
                rbp = getblk(vp, lblkno, size, 0, 0);
                rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
                rbp->b_blkno = blkno;
            }
        }
    }

    /*
     * handle the synchronous read
     */
    if (bp) {
#if defined(CLUSTERDEBUG)
        if (rcluster)
            printf("S(%ld,%ld,%d) ",
                (long)bp->b_lblkno, bp->b_bcount, seqcount);
#endif
        if ((bp->b_flags & B_CLUSTER) == 0)
            vfs_busy_pages(bp, 0);
        bp->b_flags &= ~(B_ERROR|B_INVAL);
        error = VOP_STRATEGY(vp, bp);
        curproc->p_stats->p_ru.ru_inblock++;
    }

    /*
     * and if we have read-aheads, do them too
     */
    if (rbp) {
        if (error) {
            rbp->b_flags &= ~(B_ASYNC | B_READ);
            brelse(rbp);
        } else if (rbp->b_flags & B_CACHE) {
            rbp->b_flags &= ~(B_ASYNC | B_READ);
            bqrelse(rbp);
        } else {
#if defined(CLUSTERDEBUG)
            if (rcluster) {
                if (bp)
                    printf("A+(%ld,%ld,%ld,%d) ",
                        (long)rbp->b_lblkno, rbp->b_bcount,
                        (long)(rbp->b_lblkno - origblkno),
                        seqcount);
                else
                    printf("A(%ld,%ld,%ld,%d) ",
                        (long)rbp->b_lblkno, rbp->b_bcount,
                        (long)(rbp->b_lblkno - origblkno),
                        seqcount);
            }
#endif

            if ((rbp->b_flags & B_CLUSTER) == 0)
                vfs_busy_pages(rbp, 0);
            rbp->b_flags &= ~(B_ERROR|B_INVAL);
            (void) VOP_STRATEGY(vp, rbp);
            curproc->p_stats->p_ru.ru_inblock++;
        }
    }
    if (reqbp)
        return (biowait(reqbp));
    else
        return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
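/*
 * Mechanically, cluster_rbuild() borrows a pbuf from the cluster pbuf pool,
 * chains the component buffers onto b_cluster.cluster_head, maps their VM
 * pages back-to-back into the pbuf's KVA with pmap_qenter(), and issues the
 * whole run as a single transfer.  Pages that are already fully valid are
 * replaced with bogus_page so the device read cannot clobber data that is
 * newer in memory than on disk; cluster_callback() later breaks the cluster
 * back into its component buffers.
 */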
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
    struct vnode *vp;
    u_quad_t filesize;
    daddr_t lbn;
    daddr_t blkno;
    long size;
    int run;
    struct buf *fbp;
{
    struct buf *bp, *tbp;
    daddr_t bn;
    int i, inc, j;

    KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
        ("cluster_rbuild: size %ld != f_iosize %ld\n",
        size, vp->v_mount->mnt_stat.f_iosize));

    /*
     * avoid a division
     */
    while ((u_quad_t) size * (lbn + run) > filesize) {
        --run;
    }

    if (fbp) {
        tbp = fbp;
        tbp->b_flags |= B_READ;
    } else {
        tbp = getblk(vp, lbn, size, 0, 0);
        if (tbp->b_flags & B_CACHE)
            return tbp;
        tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
    }

    tbp->b_blkno = blkno;
    if ((tbp->b_flags & B_MALLOC) ||
        ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
        return tbp;

    bp = trypbuf(&cluster_pbuf_freecnt);
    if (bp == NULL)
        return tbp;

    bp->b_data = (char *)((vm_offset_t)bp->b_data |
        ((vm_offset_t)tbp->b_data & PAGE_MASK));
    bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
    bp->b_iodone = cluster_callback;
    bp->b_blkno = blkno;
    bp->b_lblkno = lbn;
    bp->b_offset = tbp->b_offset;
    KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
    pbgetvp(vp, bp);

    TAILQ_INIT(&bp->b_cluster.cluster_head);

    bp->b_bcount = 0;
    bp->b_bufsize = 0;
    bp->b_npages = 0;

    if (vp->v_maxio == 0)
        vp->v_maxio = DFLTPHYS;
    inc = btodb(size);
    for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
        if (i != 0) {
            if ((bp->b_npages * PAGE_SIZE) +
                round_page(size) > vp->v_maxio)
                break;

            if ((tbp = incore(vp, lbn + i)) != NULL) {
                if (tbp->b_flags & B_BUSY)
                    break;

                for (j = 0; j < tbp->b_npages; j++)
                    if (tbp->b_pages[j]->valid)
                        break;

                if (j != tbp->b_npages)
                    break;

                if (tbp->b_bcount != size)
                    break;
            }

            tbp = getblk(vp, lbn + i, size, 0, 0);

            if ((tbp->b_flags & B_CACHE) ||
                (tbp->b_flags & B_VMIO) == 0) {
                bqrelse(tbp);
                break;
            }

            for (j = 0; j < tbp->b_npages; j++)
                if (tbp->b_pages[j]->valid)
                    break;

            if (j != tbp->b_npages) {
                bqrelse(tbp);
                break;
            }

            if ((fbp && (i == 1)) || (i == (run - 1)))
                tbp->b_flags |= B_RAM;
            tbp->b_flags |= B_READ | B_ASYNC;
            if (tbp->b_blkno == tbp->b_lblkno) {
                tbp->b_blkno = bn;
            } else if (tbp->b_blkno != bn) {
                brelse(tbp);
                break;
            }
        }
        TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
            tbp, b_cluster.cluster_entry);
        for (j = 0; j < tbp->b_npages; j += 1) {
            vm_page_t m;
            m = tbp->b_pages[j];
            vm_page_io_start(m);
            vm_object_pip_add(m->object, 1);
            if ((bp->b_npages == 0) ||
                (bp->b_pages[bp->b_npages - 1] != m)) {
                bp->b_pages[bp->b_npages] = m;
                bp->b_npages++;
            }
            if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
                tbp->b_pages[j] = bogus_page;
        }
        bp->b_bcount += tbp->b_bcount;
        bp->b_bufsize += tbp->b_bufsize;
    }

    for (j = 0; j < bp->b_npages; j++) {
        if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
            VM_PAGE_BITS_ALL)
            bp->b_pages[j] = bogus_page;
    }
    if (bp->b_bufsize > bp->b_kvasize)
        panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
            bp->b_bufsize, bp->b_kvasize);
    bp->b_kvasize = bp->b_bufsize;

    pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
        (vm_page_t *)bp->b_pages, bp->b_npages);

    return (bp);
}
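
/*
 * When a clustered read completes, biodone() on the pbuf sees B_CALL and
 * invokes cluster_callback() below.  Only once the component buffers have
 * been marked done does the biowait() in cluster_read() return to the
 * caller.
 */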

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
    struct buf *bp;
{
    struct buf *nbp, *tbp;
    int error = 0;

    /*
     * Must propagate errors to all the components.
     */
    if (bp->b_flags & B_ERROR)
        error = bp->b_error;

    pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
    /*
     * Move memory from the large cluster buffer into the component
     * buffers and mark IO as done on these.
     */
    for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
        tbp; tbp = nbp) {
        nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
        if (error) {
            tbp->b_flags |= B_ERROR;
            tbp->b_error = error;
        } else {
            tbp->b_dirtyoff = tbp->b_dirtyend = 0;
            tbp->b_flags &= ~(B_ERROR|B_INVAL);
        }
        biodone(tbp);
    }
    relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *  1. Write is not sequential (write asynchronously)
 *  Write is sequential:
 *  2.  beginning of cluster - begin cluster
 *  3.  middle of a cluster - add to cluster
 *  4.  end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
    struct buf *bp;
    u_quad_t filesize;
{
    struct vnode *vp;
    daddr_t lbn;
    int maxclen, cursize;
    int lblocksize;
    int async;

    vp = bp->b_vp;
    if (vp->v_maxio == 0)
        vp->v_maxio = DFLTPHYS;
    if (vp->v_type == VREG) {
        async = vp->v_mount->mnt_flag & MNT_ASYNC;
        lblocksize = vp->v_mount->mnt_stat.f_iosize;
    } else {
        async = 0;
        lblocksize = bp->b_bufsize;
    }
    lbn = bp->b_lblkno;
    KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));

    /* Initialize vnode to beginning of file. */
    if (lbn == 0)
        vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

    if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
        (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
        maxclen = vp->v_maxio / lblocksize - 1;
        if (vp->v_clen != 0) {
            /*
             * Next block is not sequential.
             *
             * If we are not writing at end of file, the process
             * seeked to another point in the file since its last
             * write, or we have reached our maximum cluster size,
             * then push the previous cluster.  Otherwise try
             * reallocating to make it sequential.
             */
            cursize = vp->v_lastw - vp->v_cstart + 1;
            if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
                lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
                if (!async)
                    cluster_wbuild(vp, lblocksize,
                        vp->v_cstart, cursize);
            } else {
                struct buf **bpp, **endbp;
                struct cluster_save *buflist;

                buflist = cluster_collectbufs(vp, bp);
                endbp = &buflist->bs_children
                    [buflist->bs_nchildren - 1];
                if (VOP_REALLOCBLKS(vp, buflist)) {
                    /*
                     * Failed, push the previous cluster.
                     */
                    for (bpp = buflist->bs_children;
                        bpp < endbp; bpp++)
                        brelse(*bpp);
                    free(buflist, M_SEGMENT);
                    cluster_wbuild(vp, lblocksize,
                        vp->v_cstart, cursize);
                } else {
                    /*
                     * Succeeded, keep building cluster.
                     */
564 */ 565 for (bpp = buflist->bs_children; 566 bpp <= endbp; bpp++) 567 bdwrite(*bpp); 568 free(buflist, M_SEGMENT); 569 vp->v_lastw = lbn; 570 vp->v_lasta = bp->b_blkno; 571 return; 572 } 573 } 574 } 575 /* 576 * Consider beginning a cluster. If at end of file, make 577 * cluster as large as possible, otherwise find size of 578 * existing cluster. 579 */ 580 if ((vp->v_type == VREG) && 581 ((u_quad_t) bp->b_offset + lblocksize) != filesize && 582 (bp->b_blkno == bp->b_lblkno) && 583 (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || 584 bp->b_blkno == -1)) { 585 bawrite(bp); 586 vp->v_clen = 0; 587 vp->v_lasta = bp->b_blkno; 588 vp->v_cstart = lbn + 1; 589 vp->v_lastw = lbn; 590 return; 591 } 592 vp->v_clen = maxclen; 593 if (!async && maxclen == 0) { /* I/O not contiguous */ 594 vp->v_cstart = lbn + 1; 595 bawrite(bp); 596 } else { /* Wait for rest of cluster */ 597 vp->v_cstart = lbn; 598 bdwrite(bp); 599 } 600 } else if (lbn == vp->v_cstart + vp->v_clen) { 601 /* 602 * At end of cluster, write it out. 603 */ 604 bdwrite(bp); 605 cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); 606 vp->v_clen = 0; 607 vp->v_cstart = lbn + 1; 608 } else 609 /* 610 * In the middle of a cluster, so just delay the I/O for now. 611 */ 612 bdwrite(bp); 613 vp->v_lastw = lbn; 614 vp->v_lasta = bp->b_blkno; 615 } 616 617 618 /* 619 * This is an awful lot like cluster_rbuild...wish they could be combined. 620 * The last lbn argument is the current block on which I/O is being 621 * performed. Check to see that it doesn't fall in the middle of 622 * the current block (if last_bp == NULL). 623 */ 624 int 625 cluster_wbuild(vp, size, start_lbn, len) 626 struct vnode *vp; 627 long size; 628 daddr_t start_lbn; 629 int len; 630 { 631 struct buf *bp, *tbp; 632 int i, j, s; 633 int totalwritten = 0; 634 int dbsize = btodb(size); 635 636 if (vp->v_maxio == 0) 637 vp->v_maxio = DFLTPHYS; 638 while (len > 0) { 639 s = splbio(); 640 if (((tbp = gbincore(vp, start_lbn)) == NULL) || 641 ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { 642 ++start_lbn; 643 --len; 644 splx(s); 645 continue; 646 } 647 bremfree(tbp); 648 tbp->b_flags |= B_BUSY; 649 tbp->b_flags &= ~B_DONE; 650 splx(s); 651 652 /* 653 * Extra memory in the buffer, punt on this buffer. 654 * XXX we could handle this in most cases, but we would 655 * have to push the extra memory down to after our max 656 * possible cluster size and then potentially pull it back 657 * up if the cluster was terminated prematurely--too much 658 * hassle. 659 */ 660 if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || 661 (tbp->b_bcount != tbp->b_bufsize) || 662 (tbp->b_bcount != size) || 663 (len == 1) || 664 ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) { 665 totalwritten += tbp->b_bufsize; 666 bawrite(tbp); 667 ++start_lbn; 668 --len; 669 continue; 670 } 671 672 /* 673 * We got a pbuf to make the cluster in. 674 * so initialise it. 
        TAILQ_INIT(&bp->b_cluster.cluster_head);
        bp->b_bcount = 0;
        bp->b_bufsize = 0;
        bp->b_npages = 0;
        if (tbp->b_wcred != NOCRED) {
            bp->b_wcred = tbp->b_wcred;
            crhold(bp->b_wcred);
        }

        bp->b_blkno = tbp->b_blkno;
        bp->b_lblkno = tbp->b_lblkno;
        bp->b_offset = tbp->b_offset;
        bp->b_data = (char *)((vm_offset_t)bp->b_data |
            ((vm_offset_t)tbp->b_data & PAGE_MASK));
        bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER |
            (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
        bp->b_iodone = cluster_callback;
        pbgetvp(vp, bp);
        /*
         * From this location in the file, scan forward to see
         * if there are buffers with adjacent data that need to
         * be written as well.
         */
        for (i = 0; i < len; ++i, ++start_lbn) {
            if (i != 0) {    /* If not the first buffer */
                s = splbio();
                /*
                 * If the adjacent data is not even in core it
                 * can't need to be written.
                 */
                if ((tbp = gbincore(vp, start_lbn)) == NULL) {
                    splx(s);
                    break;
                }

                /*
                 * If it IS in core, but has different
                 * characteristics, don't cluster with it.
                 */
                if ((tbp->b_flags &
                    (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY |
                    B_DELWRI | B_NEEDCOMMIT))
                    != (B_DELWRI | B_CLUSTEROK |
                    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) {
                    splx(s);
                    break;
                }

                if (tbp->b_wcred != bp->b_wcred) {
                    splx(s);
                    break;
                }

                /*
                 * Check that the combined cluster
                 * would make sense with regard to pages
                 * and would not be too large.
                 */
                if ((tbp->b_bcount != size) ||
                    ((bp->b_blkno + (dbsize * i)) !=
                    tbp->b_blkno) ||
                    ((tbp->b_npages + bp->b_npages) >
                    (vp->v_maxio / PAGE_SIZE))) {
                    splx(s);
                    break;
                }
                /*
                 * Ok, it's passed all the tests,
                 * so remove it from the free list
                 * and mark it busy.  We will use it.
                 */
                bremfree(tbp);
                tbp->b_flags |= B_BUSY;
                tbp->b_flags &= ~B_DONE;
                splx(s);
            }    /* end of code for non-first buffers only */
            /* check for latent dependencies to be handled */
            if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
                bioops.io_start)
                (*bioops.io_start)(tbp);
            /*
             * If the IO is via the VM then we do some
             * special VM hackery (yuck).
             */
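            /*
             * For B_VMIO buffers the data lives in VM pages rather than
             * separately allocated buffer memory, so clustering means
             * collecting each component buffer's pages into bp->b_pages
             * (busying them for I/O) and mapping them into the pbuf's KVA
             * at finishcluster.  If a later buffer's page is already busy
             * we stop growing the cluster rather than wait for it.
             */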
            if (tbp->b_flags & B_VMIO) {
                vm_page_t m;

                if (i != 0) {    /* if not first buffer */
                    for (j = 0; j < tbp->b_npages; j += 1) {
                        m = tbp->b_pages[j];
                        if (m->flags & PG_BUSY)
                            goto finishcluster;
                    }
                }

                for (j = 0; j < tbp->b_npages; j += 1) {
                    m = tbp->b_pages[j];
                    vm_page_io_start(m);
                    vm_object_pip_add(m->object, 1);
                    if ((bp->b_npages == 0) ||
                        (bp->b_pages[bp->b_npages - 1] != m)) {
                        bp->b_pages[bp->b_npages] = m;
                        bp->b_npages++;
                    }
                }
            }
            bp->b_bcount += size;
            bp->b_bufsize += size;

            s = splbio();
            bundirty(tbp);
            tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
            tbp->b_flags |= B_ASYNC;
            reassignbuf(tbp, tbp->b_vp);    /* put on clean list */
            ++tbp->b_vp->v_numoutput;
            splx(s);
            TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
                tbp, b_cluster.cluster_entry);
        }
    finishcluster:
        pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
            (vm_page_t *) bp->b_pages, bp->b_npages);
        if (bp->b_bufsize > bp->b_kvasize)
            panic(
                "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
                bp->b_bufsize, bp->b_kvasize);
        bp->b_kvasize = bp->b_bufsize;
        totalwritten += bp->b_bufsize;
        bp->b_dirtyoff = 0;
        bp->b_dirtyend = bp->b_bufsize;
        bawrite(bp);

        len -= i;
    }
    return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
    struct vnode *vp;
    struct buf *last_bp;
{
    struct cluster_save *buflist;
    struct buf *bp;
    daddr_t lbn;
    int i, len;

    len = vp->v_lastw - vp->v_cstart + 1;
    buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
        M_SEGMENT, M_WAITOK);
    buflist->bs_nchildren = 0;
    buflist->bs_children = (struct buf **) (buflist + 1);
    for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
        (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
        buflist->bs_children[i] = bp;
        if (bp->b_blkno == bp->b_lblkno)
            VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
                NULL, NULL);
    }
    buflist->bs_children[i] = bp = last_bp;
    if (bp->b_blkno == bp->b_lblkno)
        VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
            NULL, NULL);
    buflist->bs_nchildren = i + 1;
    return (buflist);
}