/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $Id: vfs_cluster.c,v 1.4 1994/08/08 09:11:31 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 0;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 0
#endif

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
    daddr_t, daddr_t, long, int, long));
void cluster_wbuild __P((struct vnode *, struct buf *, long,
    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
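/*
 * For example, with vp->v_lastr == 7 a read of logical block 7 or 8
 * satisfies ISSEQREAD() and keeps the readahead window open, while a read
 * of block 12, or of block 0 unless doclusterraz is set, does not and
 * causes cluster_read() below to reset the window.
 */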
int doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
        (((blk) != 0 || doclusterraz) && \
         ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
        ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *    Desired block is in the cache:
 *        1 Not sequential access (0 I/Os).
 *        2 Access is sequential, do read-ahead (1 ASYNC).
 *    Desired block is not in cache:
 *        3 Not sequential access (1 SYNC).
 *        4 Sequential access, next block is contiguous (1 SYNC).
 *        5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *    bp is the block requested.
 *    rbp is the read-ahead block.
 *    If either is NULL, then you don't have to do the I/O.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
        struct vnode *vp;
        u_quad_t filesize;
        daddr_t lblkno;
        long size;
        struct ucred *cred;
        struct buf **bpp;
{
        struct buf *bp, *rbp;
        daddr_t blkno, ioblkno;
        long flags;
        int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
        if (size == 0)
                panic("cluster_read: size = 0");
#endif

        error = 0;
        flags = B_READ;
        *bpp = bp = getblk(vp, lblkno, size, 0, 0);
        if (bp->b_flags & B_CACHE) {
                /*
                 * Desired block is in cache; do any readahead ASYNC.
                 * Case 1, 2.
                 */
                trace(TR_BREADHIT, pack(vp, size), lblkno);
                flags |= B_ASYNC;
                ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
                alreadyincore = (int)incore(vp, ioblkno);
                bp = NULL;
        } else {
                /* Block wasn't in cache, case 3, 4, 5. */
                trace(TR_BREADMISS, pack(vp, size), lblkno);
                bp->b_flags |= B_READ;
                ioblkno = lblkno;
                alreadyincore = 0;
                curproc->p_stats->p_ru.ru_inblock++;    /* XXX */
        }
        /*
         * XXX
         * Replace 1 with a window size based on some permutation of
         * maxcontig and rot_delay.  This will let you figure out how
         * many blocks you should read-ahead (case 2, 4, 5).
         *
         * If the access isn't sequential, reset the window to 1.
         * Note that a read to the same block is considered sequential.
         * This catches the case where the file is being read sequentially,
         * but at smaller than the filesystem block size.
         */
        rbp = NULL;
        if (!ISSEQREAD(vp, lblkno)) {
                vp->v_ralen = 0;
                vp->v_maxra = lblkno;
        } else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
            !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
            blkno != -1) {
                /*
                 * Reading sequentially, and the next block is not in the
                 * cache.  We are going to try reading ahead.
                 */
                if (num_ra) {
                        /*
                         * If our desired readahead block had been read
                         * in a previous readahead but is no longer in
                         * core, then we may be reading ahead too far
                         * or are not using our readahead very rapidly.
                         * In this case we scale back the window.
                         */
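                        /* E.g., a readahead window (v_ralen) of 8 is halved
                         * to 4 here; it never drops below 1. */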
                        if (!alreadyincore && ioblkno <= vp->v_maxra)
                                vp->v_ralen = max(vp->v_ralen >> 1, 1);
                        /*
                         * There are more sequential blocks than our current
                         * window allows, scale up.  Ideally we want to get
                         * in sync with the filesystem maxcontig value.
                         */
                        else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
                                vp->v_ralen = vp->v_ralen ?
                                    min(num_ra, vp->v_ralen << 1) : 1;

                        if (num_ra > vp->v_ralen)
                                num_ra = vp->v_ralen;
                }

                if (num_ra)                             /* case 2, 4 */
                        rbp = cluster_rbuild(vp, filesize,
                            bp, ioblkno, blkno, size, num_ra, flags);
                else if (ioblkno == lblkno) {
                        bp->b_blkno = blkno;
                        /* Case 5: check how many blocks to read ahead */
                        ++ioblkno;
                        if ((ioblkno + 1) * size > filesize ||
                            incore(vp, ioblkno) || (error = VOP_BMAP(vp,
                            ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
                                goto skip_readahead;
                        /*
                         * Adjust readahead as above
                         */
                        if (num_ra) {
                                if (!alreadyincore && ioblkno <= vp->v_maxra)
                                        vp->v_ralen = max(vp->v_ralen >> 1, 1);
                                else if (num_ra > vp->v_ralen &&
                                    lblkno != vp->v_lastr)
                                        vp->v_ralen = vp->v_ralen ?
                                            min(num_ra, vp->v_ralen << 1) : 1;
                                if (num_ra > vp->v_ralen)
                                        num_ra = vp->v_ralen;
                        }
                        flags |= B_ASYNC;
                        if (num_ra)
                                rbp = cluster_rbuild(vp, filesize,
                                    NULL, ioblkno, blkno, size, num_ra, flags);
                        else {
                                rbp = getblk(vp, ioblkno, size, 0, 0);
                                rbp->b_flags |= flags;
                                rbp->b_blkno = blkno;
                        }
                } else {
                        /* case 2; read ahead single block */
                        rbp = getblk(vp, ioblkno, size, 0, 0);
                        rbp->b_flags |= flags;
                        rbp->b_blkno = blkno;
                }

                if (rbp == bp)                  /* case 4 */
                        rbp = NULL;
                else if (rbp) {                 /* case 2, 5 */
                        trace(TR_BREADMISSRA,
                            pack(vp, (num_ra + 1) * size), ioblkno);
                        curproc->p_stats->p_ru.ru_inblock++;    /* XXX */
                }
        }

        /* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
        if (bp)
                if (bp->b_flags & (B_DONE | B_DELWRI))
                        panic("cluster_read: DONE bp");
                else
                        error = VOP_STRATEGY(bp);

        if (rbp)
                if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
                        rbp->b_flags &= ~(B_ASYNC | B_READ);
                        brelse(rbp);
                } else
                        (void) VOP_STRATEGY(rbp);

        /*
         * Recalculate our maximum readahead
         */
        if (rbp == NULL)
                rbp = bp;
        if (rbp)
                vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

        if (bp)
                return(biowait(bp));
        return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
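/*
 * For example (illustrative numbers, assuming 8K filesystem blocks and
 * 512-byte device blocks): a call with lbn == 10, blkno == 800, size == 8192
 * and run == 3 builds one cluster covering logical blocks 10 through 13 at
 * disk blocks 800, 816, 832 and 848 (inc == btodb(8192) == 16).  Blocks 11
 * through 13 become child buffers recorded in bs_children, which
 * cluster_callback() later fills from the big cluster buffer.
 */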
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
        struct vnode *vp;
        u_quad_t filesize;
        struct buf *bp;
        daddr_t lbn;
        daddr_t blkno;
        long size;
        int run;
        long flags;
{
        struct cluster_save *b_save;
        struct buf *tbp;
        daddr_t bn;
        int i, inc;

#ifdef DIAGNOSTIC
        if (size != vp->v_mount->mnt_stat.f_iosize)
                panic("cluster_rbuild: size %d != filesize %d\n",
                    size, vp->v_mount->mnt_stat.f_iosize);
#endif
        if (size * (lbn + run + 1) > filesize)
                --run;
        if (run == 0) {
                if (!bp) {
                        bp = getblk(vp, lbn, size, 0, 0);
                        bp->b_blkno = blkno;
                        bp->b_flags |= flags;
                }
                return(bp);
        }

        bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
        if (bp->b_flags & (B_DONE | B_DELWRI))
                return (bp);

        b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
            M_SEGMENT, M_WAITOK);
        b_save->bs_bufsize = b_save->bs_bcount = size;
        b_save->bs_nchildren = 0;
        b_save->bs_children = (struct buf **)(b_save + 1);
        b_save->bs_saveaddr = bp->b_saveaddr;
        bp->b_saveaddr = (caddr_t) b_save;

        inc = btodb(size);
        for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
                if (incore(vp, lbn + i)) {
                        if (i == 1) {
                                bp->b_saveaddr = b_save->bs_saveaddr;
                                bp->b_flags &= ~B_CALL;
                                bp->b_iodone = NULL;
                                allocbuf(bp, size);
                                free(b_save, M_SEGMENT);
                        } else
                                allocbuf(bp, size * i);
                        break;
                }
                tbp = getblk(vp, lbn + i, 0, 0, 0);
                /*
                 * getblk may return some memory in the buffer if there were
                 * no empty buffers to shed it to.  If there is currently
                 * memory in the buffer, we move it down size bytes to make
                 * room for the valid pages that cluster_callback will insert.
                 * We do this now so we don't have to do it at interrupt time
                 * in the callback routine.
                 */
                if (tbp->b_bufsize != 0) {
                        caddr_t bdata = (char *)tbp->b_data;

                        if (tbp->b_bufsize + size > MAXBSIZE)
                                panic("cluster_rbuild: too much memory");
                        if (tbp->b_bufsize > size) {
                                /*
                                 * XXX if the source and destination regions
                                 * overlap we have to copy backward to avoid
                                 * clobbering any valid pages (i.e. pagemove
                                 * implementations typically can't handle
                                 * overlap).
                                 */
                                bdata += tbp->b_bufsize;
                                while (bdata > (char *)tbp->b_data) {
                                        bdata -= CLBYTES;
                                        pagemove(bdata, bdata + size, CLBYTES);
                                }
                        } else
                                pagemove(bdata, bdata + size, tbp->b_bufsize);
                }
                tbp->b_blkno = bn;
                tbp->b_flags |= flags | B_READ | B_ASYNC;
                ++b_save->bs_nchildren;
                b_save->bs_children[i - 1] = tbp;
        }
        return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
        struct vnode *vp;
        struct buf *bp;
        long flags;
        daddr_t blkno;
        daddr_t lblkno;
        long size;
        int run;
{
        if (!bp) {
                bp = getblk(vp, lblkno, size, 0, 0);
                if (bp->b_flags & (B_DONE | B_DELWRI)) {
                        bp->b_blkno = blkno;
                        return(bp);
                }
        }
        allocbuf(bp, run * size);
        bp->b_blkno = blkno;
        bp->b_iodone = cluster_callback;
        bp->b_flags |= flags | B_CALL;
        return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
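/*
 * The cluster_save consumed here was set up by cluster_rbuild() or
 * cluster_wbuild(): the structure and its bs_children array come from a
 * single allocation (the array sits directly after the structure), and
 * bs_saveaddr preserves the buffer's original b_saveaddr, which is
 * restored below before the structure is freed.
 */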
void
cluster_callback(bp)
        struct buf *bp;
{
        struct cluster_save *b_save;
        struct buf **bpp, *tbp;
        long bsize;
        caddr_t cp;
        int error = 0;

        /*
         * Must propagate errors to all the components.
         */
        if (bp->b_flags & B_ERROR)
                error = bp->b_error;

        b_save = (struct cluster_save *)(bp->b_saveaddr);
        bp->b_saveaddr = b_save->bs_saveaddr;

        bsize = b_save->bs_bufsize;
        cp = (char *)bp->b_data + bsize;
        /*
         * Move memory from the large cluster buffer into the component
         * buffers and mark IO as done on these.
         */
        for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
                tbp = *bpp;
                pagemove(cp, tbp->b_data, bsize);
                tbp->b_bufsize += bsize;
                tbp->b_bcount = bsize;
                if (error) {
                        tbp->b_flags |= B_ERROR;
                        tbp->b_error = error;
                }
                biodone(tbp);
                bp->b_bufsize -= bsize;
                cp += bsize;
        }
        /*
         * If there was excess memory in the cluster buffer,
         * slide it up adjacent to the remaining valid data.
         */
        if (bp->b_bufsize != bsize) {
                if (bp->b_bufsize < bsize)
                        panic("cluster_callback: too little memory");
                pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
        }
        bp->b_bcount = bsize;
        bp->b_iodone = NULL;
        free(b_save, M_SEGMENT);
        if (bp->b_flags & B_ASYNC)
                brelse(bp);
        else {
                bp->b_flags &= ~B_WANTED;
                wakeup((caddr_t)bp);
        }
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *    1. Write is not sequential (write asynchronously)
 *    Write is sequential:
 *    2.  beginning of cluster - begin cluster
 *    3.  middle of a cluster - add to cluster
 *    4.  end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
        struct buf *bp;
        u_quad_t filesize;
{
        struct vnode *vp;
        daddr_t lbn;
        int maxclen, cursize;

        vp = bp->b_vp;
        lbn = bp->b_lblkno;

        /* Initialize vnode to beginning of file. */
        if (lbn == 0)
                vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

        if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
            (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
                maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
                if (vp->v_clen != 0) {
                        /*
                         * Next block is not sequential.
                         *
                         * If we are not writing at end of file, the process
                         * seeked to another point in the file since its
                         * last write, or we have reached our maximum
                         * cluster size, then push the previous cluster.
                         * Otherwise try reallocating to make it sequential.
                         */
                        cursize = vp->v_lastw - vp->v_cstart + 1;
                        if (!doreallocblks ||
                            (lbn + 1) * bp->b_bcount != filesize ||
                            lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
                                cluster_wbuild(vp, NULL, bp->b_bcount,
                                    vp->v_cstart, cursize, lbn);
                        } else {
                                struct buf **bpp, **endbp;
                                struct cluster_save *buflist;

                                buflist = cluster_collectbufs(vp, bp);
                                endbp = &buflist->bs_children
                                    [buflist->bs_nchildren - 1];
                                if (VOP_REALLOCBLKS(vp, buflist)) {
                                        /*
                                         * Failed, push the previous cluster.
                                         */
                                        for (bpp = buflist->bs_children;
                                            bpp < endbp; bpp++)
                                                brelse(*bpp);
                                        free(buflist, M_SEGMENT);
                                        cluster_wbuild(vp, NULL, bp->b_bcount,
                                            vp->v_cstart, cursize, lbn);
                                } else {
                                        /*
                                         * Succeeded, keep building cluster.
                                         */
                                        for (bpp = buflist->bs_children;
                                            bpp <= endbp; bpp++)
                                                bdwrite(*bpp);
                                        free(buflist, M_SEGMENT);
                                        vp->v_lastw = lbn;
                                        vp->v_lasta = bp->b_blkno;
                                        return;
                                }
                        }
                }
                /*
                 * Consider beginning a cluster.
                 * If at end of file, make cluster as large as possible,
                 * otherwise find size of existing cluster.
                 */
                if ((lbn + 1) * bp->b_bcount != filesize &&
                    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
                     bp->b_blkno == -1)) {
                        bawrite(bp);
                        vp->v_clen = 0;
                        vp->v_lasta = bp->b_blkno;
                        vp->v_cstart = lbn + 1;
                        vp->v_lastw = lbn;
                        return;
                }
                vp->v_clen = maxclen;
                if (maxclen == 0) {             /* I/O not contiguous */
                        vp->v_cstart = lbn + 1;
                        bawrite(bp);
                } else {                        /* Wait for rest of cluster */
                        vp->v_cstart = lbn;
                        bdwrite(bp);
                }
        } else if (lbn == vp->v_cstart + vp->v_clen) {
                /*
                 * At end of cluster, write it out.
                 */
                cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
                    vp->v_clen + 1, lbn);
                vp->v_clen = 0;
                vp->v_cstart = lbn + 1;
        } else
                /*
                 * In the middle of a cluster, so just delay the
                 * I/O for now.
                 */
                bdwrite(bp);
        vp->v_lastw = lbn;
        vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
        struct vnode *vp;
        struct buf *last_bp;
        long size;
        daddr_t start_lbn;
        int len;
        daddr_t lbn;
{
        struct cluster_save *b_save;
        struct buf *bp, *tbp;
        caddr_t cp;
        int i, s;

#ifdef DIAGNOSTIC
        if (size != vp->v_mount->mnt_stat.f_iosize)
                panic("cluster_wbuild: size %d != filesize %d\n",
                    size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
        while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
                ++start_lbn;
                --len;
        }

        /* Get more memory for current buffer */
        if (len <= 1) {
                if (last_bp) {
                        bawrite(last_bp);
                } else if (len) {
                        bp = getblk(vp, start_lbn, size, 0, 0);
                        bawrite(bp);
                }
                return;
        }

        bp = getblk(vp, start_lbn, size, 0, 0);
        if (!(bp->b_flags & B_DELWRI)) {
                ++start_lbn;
                --len;
                brelse(bp);
                goto redo;
        }

        /*
         * Extra memory in the buffer, punt on this buffer.
         * XXX we could handle this in most cases, but we would have to
         * push the extra memory down to after our max possible cluster
         * size and then potentially pull it back up if the cluster was
         * terminated prematurely--too much hassle.
         */
        if (bp->b_bcount != bp->b_bufsize) {
                ++start_lbn;
                --len;
                bawrite(bp);
                goto redo;
        }

        --len;
        b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
            M_SEGMENT, M_WAITOK);
        b_save->bs_bcount = bp->b_bcount;
        b_save->bs_bufsize = bp->b_bufsize;
        b_save->bs_nchildren = 0;
        b_save->bs_children = (struct buf **)(b_save + 1);
        b_save->bs_saveaddr = bp->b_saveaddr;
        bp->b_saveaddr = (caddr_t) b_save;

        bp->b_flags |= B_CALL;
        bp->b_iodone = cluster_callback;
        cp = (char *)bp->b_data + size;
        for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
                /*
                 * Block is not in core or the non-sequential block
                 * ending our cluster was part of the cluster (in which
                 * case we don't want to write it twice).
                 */
                if (!incore(vp, start_lbn) ||
                    (last_bp == NULL && start_lbn == lbn))
                        break;

                /*
                 * Get the desired block buffer (unless it is the final
                 * sequential block whose buffer was passed in explicitly
                 * as last_bp).
                 */
                if (last_bp == NULL || start_lbn != lbn) {
                        tbp = getblk(vp, start_lbn, size, 0, 0);
                        if (!(tbp->b_flags & B_DELWRI)) {
                                brelse(tbp);
                                break;
                        }
                } else
                        tbp = last_bp;

                ++b_save->bs_nchildren;

                /* Move memory from children to parent */
                if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
                        printf("Clustered Block: %d addr %x bufsize: %d\n",
                            bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
                        printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
                            tbp->b_blkno);
                        panic("Clustered write to wrong blocks");
                }

                pagemove(tbp->b_data, cp, size);
                bp->b_bcount += size;
                bp->b_bufsize += size;

                tbp->b_bufsize -= size;
                tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
                tbp->b_flags |= B_ASYNC;
                s = splbio();
                reassignbuf(tbp, tbp->b_vp);    /* put on clean list */
                ++tbp->b_vp->v_numoutput;
                splx(s);
                b_save->bs_children[i] = tbp;

                cp += size;
        }

        if (i == 0) {
                /* None to cluster */
                bp->b_saveaddr = b_save->bs_saveaddr;
                bp->b_flags &= ~B_CALL;
                bp->b_iodone = NULL;
                free(b_save, M_SEGMENT);
        }
        bawrite(bp);
        if (i < len) {
                len -= i + 1;
                start_lbn += 1;
                goto redo;
        }
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
        struct vnode *vp;
        struct buf *last_bp;
{
        struct cluster_save *buflist;
        daddr_t lbn;
        int i, len;

        len = vp->v_lastw - vp->v_cstart + 1;
        buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
            M_SEGMENT, M_WAITOK);
        buflist->bs_nchildren = 0;
        buflist->bs_children = (struct buf **)(buflist + 1);
        for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
                (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
                    &buflist->bs_children[i]);
        buflist->bs_children[i] = last_bp;
        buflist->bs_nchildren = i + 1;
        return (buflist);
}