/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
					 lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
						min(num_ra,vp->v_ralen<<1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					    bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}