/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
static uma_zone_t cluster_pbuf_zone;

static void cluster_init(void *);
static struct cluster_save *cluster_collectbufs(struct vnode *vp,
	    struct vn_clusterw *vnc, struct buf *last_bp, int gbflags);
static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
	    daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
	    struct buf *fbp);
static void cluster_callback(struct buf *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind; 0: disable, 1: enable, 2: backed off");

static int read_max = 64;
SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
    "Cluster read-ahead max block count");

static int read_min = 1;
SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
    "Cluster read min block count");

SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL);

static void
cluster_init(void *dummy)
{

	cluster_pbuf_zone = pbuf_zsecond_create("clpbuf", nswbuf / 2);
}

/*
 * Read data to a buf, including read-ahead if we find this to be beneficial.
 * cluster_read replaces bread.
 */
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
    struct ucred *cred, long totread, int seqcount, int gbflags,
    struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	struct bufobj *bo;
	struct thread *td;
	daddr_t blkno, origblkno;
	int maxra, racluster;
	int error, ncontig;
	int i;

	error = 0;
	td = curthread;
	bo = &vp->v_bufobj;
	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = seqcount;
	maxra = min(read_max, maxra);
	maxra = min(nbuf/8, maxra);
	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
		maxra = (filesize / size) - lblkno;

	/*
	 * get the requested block
	 */
	error = getblkx(vp, lblkno, lblkno, size, 0, 0, gbflags, &bp);
	if (error != 0) {
		*bpp = NULL;
		return (error);
	}
	gbflags &= ~GB_NOSPARSE;
	origblkno = lblkno;
	*bpp = reqbp = bp;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			bp->b_flags &= ~B_RAM;
			BO_RLOCK(bo);
			for (i = 1; i < maxra; i++) {
				/*
				 * Stop if the buffer does not exist or it
				 * is invalid (about to go away?)
				 */
				rbp = gbincore(&vp->v_bufobj, lblkno+i);
				if (rbp == NULL || (rbp->b_flags & B_INVAL))
					break;

				/*
				 * Set another read-ahead mark so we know
				 * to check again. (If we can lock the
				 * buffer without waiting)
				 */
				if ((((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1)))
				    && (0 == BUF_LOCK(rbp,
					LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
					rbp->b_flags |= B_RAM;
					BUF_UNLOCK(rbp);
				}
			}
			BO_RUNLOCK(bo);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	/*
	 * If it isn't in the cache, then get a chunk from
	 * disk if sequential, otherwise just get the block.
	 */
	} else {
		off_t firstread = bp->b_offset;
		int nblks;
		long minread;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		ncontig = 0;

		/*
		 * Adjust totread if needed
		 */
		minread = read_min * size;
		if (minread > totread)
			totread = minread;

		/*
		 * Compute the total number of blocks that we should read
		 * synchronously.
		 */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		nblks = howmany(totread, size);
		if (nblks > racluster)
			nblks = racluster;

		/*
		 * Now compute the number of contiguous blocks.
		 */
		if (nblks > 1) {
			error = VOP_BMAP(vp, lblkno, NULL,
			    &blkno, &ncontig, NULL);
			/*
			 * If this failed to map just do the original block.
			 */
			if (error || blkno == -1)
				ncontig = 0;
		}

		/*
		 * If we have contiguous data available do a cluster
		 * otherwise just read the requested block.
		 */
		if (ncontig) {
			/* Account for our first block. */
			ncontig = min(ncontig + 1, nblks);
			if (ncontig < nblks)
				nblks = ncontig;
			bp = cluster_rbuild(vp, filesize, lblkno,
			    blkno, size, nblks, gbflags, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * handle the synchronous read so that it is available ASAP.
	 */
	if (bp) {
		if ((bp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(bp, 0);
		}
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(td->td_proc);
			racct_add_buf(td->td_proc, bp, 0);
			PROC_UNLOCK(td->td_proc);
		}
#endif /* RACCT */
		td->td_ru.ru_inblock++;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 */
	while (lblkno < (origblkno + maxra)) {
		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
		if (error)
			break;

		if (blkno == -1)
			break;

		/*
		 * We could throttle ncontig here by maxra but we might as
		 * well read the data if it is contiguous.  We're throttled
		 * by racluster anyway.
		 */
		if (ncontig) {
			ncontig = min(ncontig + 1, racluster);
			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
			    size, ncontig, gbflags, NULL);
			lblkno += (rbp->b_bufsize / size);
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
		} else {
			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
			lblkno += 1;
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
			rbp->b_flags |= B_ASYNC | B_RAM;
			rbp->b_iocmd = BIO_READ;
			rbp->b_blkno = blkno;
		}
		if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
			continue;
		}
		if ((rbp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(rbp, 0);
		}
		rbp->b_flags &= ~B_INVAL;
		rbp->b_ioflags &= ~BIO_ERROR;
		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
			BUF_KERNPROC(rbp);
		rbp->b_iooffset = dbtob(rbp->b_blkno);
		bstrategy(rbp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(td->td_proc);
			racct_add_buf(td->td_proc, rbp, 0);
			PROC_UNLOCK(td->td_proc);
		}
#endif /* RACCT */
		td->td_ru.ru_inblock++;
	}

	if (reqbp) {
		/*
		 * Like bread, always brelse() the buffer when
		 * returning an error.
		 */
		error = bufwait(reqbp);
		if (error != 0) {
			brelse(reqbp);
			*bpp = NULL;
		}
	}
	return (error);
}
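
/*
 * Illustrative sketch (not part of this file): roughly how a filesystem
 * read path might choose between cluster_read() and a plain bread_gb(),
 * assuming it tracks a sequential-access hint (seqcount).  The names
 * my_ip->i_size, my_bsize, blkoff, xfersize and bflag are hypothetical
 * placeholders, not interfaces defined here.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (seqcount > 1)
 *		error = cluster_read(vp, my_ip->i_size, lbn, my_bsize,
 *		    NOCRED, uio->uio_resid, seqcount, bflag, &bp);
 *	else
 *		error = bread_gb(vp, lbn, my_bsize, NOCRED, bflag, &bp);
 *	if (error != 0)
 *		return (error);
 *	error = uiomove((char *)bp->b_data + blkoff, xfersize, uio);
 *	bqrelse(bp);
 */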

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
    daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
	struct buf *bp, *tbp;
	daddr_t bn;
	off_t off;
	long tinc, tsize;
	int i, inc, j, k, toff;

	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != f_iosize %jd\n",
	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));

	/*
	 * avoid a division
	 */
	while ((u_quad_t) size * (lbn + run) > filesize) {
		--run;
	}

	if (fbp) {
		tbp = fbp;
		tbp->b_iocmd = BIO_READ;
	} else {
		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
		if (tbp->b_flags & B_CACHE)
			return tbp;
		tbp->b_flags |= B_ASYNC | B_RAM;
		tbp->b_iocmd = BIO_READ;
	}
	tbp->b_blkno = blkno;
	if ( (tbp->b_flags & B_MALLOC) ||
		((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
		return tbp;

	bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT);
	if (bp == NULL)
		return tbp;
	MPASS((bp->b_flags & B_MAXPHYS) != 0);

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	if ((gbflags & GB_UNMAPPED) != 0) {
		bp->b_data = unmapped_buf;
	} else {
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	}
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_blkno = blkno;
	bp->b_lblkno = lbn;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
	pbgetvp(vp, bp);

	TAILQ_INIT(&bp->b_cluster.cluster_head);

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_npages = 0;

	inc = btodb(size);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
		if (i == 0) {
			vm_object_pip_add(tbp->b_bufobj->bo_object,
			    tbp->b_npages);
			vfs_busy_pages_acquire(tbp);
		} else {
			if ((bp->b_npages * PAGE_SIZE) +
			    round_page(size) > vp->v_mount->mnt_iosize_max) {
				break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
			    (gbflags & GB_UNMAPPED));

			/* Don't wait around for locked bufs. */
			if (tbp == NULL)
				break;

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.  The bo lock is not
			 * required for the BKGRDINPROG check since it
			 * can not be set without the buf lock.
			 */
			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
			    (tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			off = tbp->b_offset;
			tsize = size;
			for (j = 0; tsize > 0; j++) {
				toff = off & PAGE_MASK;
				tinc = tsize;
				if (toff + tinc > PAGE_SIZE)
					tinc = PAGE_SIZE - toff;
				if (vm_page_trysbusy(tbp->b_pages[j]) == 0)
					break;
				if ((tbp->b_pages[j]->valid &
				    vm_page_bits(toff, tinc)) != 0) {
					vm_page_sunbusy(tbp->b_pages[j]);
					break;
				}
				vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
				off += tinc;
				tsize -= tinc;
			}
			if (tsize > 0) {
clean_sbusy:
				vm_object_pip_wakeupn(tbp->b_bufobj->bo_object,
				    j);
				for (k = 0; k < j; k++)
					vm_page_sunbusy(tbp->b_pages[k]);
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;

			/*
			 * Set the buffer up for an async read (XXX should
			 * we do this only if we do not wind up brelse()ing?).
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
				goto clean_sbusy;
			}
		}
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		BUF_KERNPROC(tbp);
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_page_t m;

			m = tbp->b_pages[j];
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if (vm_page_all_valid(m))
				tbp->b_pages[j] = bogus_page;
		}

		/*
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != size)
			printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
		if (tbp->b_bufsize != size)
			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
	for (j = 0; j < bp->b_npages; j++) {
		if (vm_page_all_valid(bp->b_pages[j]))
			bp->b_pages[j] = bogus_page;
	}
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);

	if (buf_mapped(bp)) {
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		    (vm_page_t *)bp->b_pages, bp->b_npages);
	}
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
static void
cluster_callback(struct buf *bp)
{
	struct buf *nbp, *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	if (buf_mapped(bp)) {
		pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
		    bp->b_npages);
	}
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
	    tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_ioflags |= BIO_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~B_INVAL;
			tbp->b_ioflags &= ~BIO_ERROR;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		bufdone(tbp);
	}
	pbrelvp(bp);
	uma_zfree(cluster_pbuf_zone, bp);
}

/*
 *	cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *		write_behind = 0	write behind disabled
 *		write_behind = 1	write behind normal (default)
 *		write_behind = 2	write behind backed-off
 */
static __inline int
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
    int gbflags)
{
	int r = 0;

	switch (write_behind) {
	case 2:
		if (start_lbn < len)
			break;
		start_lbn -= len;
		/* FALLTHROUGH */
	case 1:
		r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
		/* FALLTHROUGH */
	default:
		/* FALLTHROUGH */
		break;
	}
	return(r);
}
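
/*
 * Worked example of the write_behind modes above (illustrative only; the
 * numbers are made up).  Suppose size = 16384, start_lbn = 64, len = 16:
 *
 *	vfs.write_behind = 1: cluster_wbuild(vp, 16384, 64, 16, gbflags),
 *	    i.e. the cluster that was just completed is pushed immediately.
 *	vfs.write_behind = 2: start_lbn is first backed off by len, so the
 *	    call becomes cluster_wbuild(vp, 16384, 48, 16, gbflags) and the
 *	    push trails the current position by one full cluster; if
 *	    start_lbn < len there is nothing behind us yet and nothing is
 *	    written.
 *	vfs.write_behind = 0: no cluster_wbuild() call at all.
 */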

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct vnode *vp, struct vn_clusterw *vnc, struct buf *bp,
    u_quad_t filesize, int seqcount, int gbflags)
{
	daddr_t lbn, pbn;
	int maxclen, cursize;
	int lblocksize;
	int async;

	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	if (vp->v_type == VREG) {
		async = DOINGASYNC(vp);
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	lbn = bp->b_lblkno;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vnc->v_lasta = vnc->v_clen = vnc->v_cstart = vnc->v_lastw = 0;

	if (vnc->v_clen == 0 || lbn != vnc->v_lastw + 1 ||
	    (bp->b_blkno != vnc->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
		if (vnc->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vnc->v_lastw - vnc->v_cstart + 1;
			if ((u_quad_t)bp->b_offset + lblocksize != filesize ||
			    lbn != vnc->v_lastw + 1 || vnc->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, lblocksize,
					    vnc->v_cstart, cursize, gbflags);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, vnc, bp,
				    gbflags);
				if (buflist == NULL) {
					/*
					 * Cluster build failed so just write
					 * it now.
					 */
					bawrite(bp);
					return;
				}
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    lblocksize, vnc->v_cstart,
						    cursize, gbflags);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vnc->v_lastw = lbn;
					vnc->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if (vp->v_type == VREG &&
		    (u_quad_t) bp->b_offset + lblocksize != filesize &&
		    bp->b_blkno == bp->b_lblkno &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen,
		     NULL) != 0 || bp->b_blkno == -1)) {
			pbn = bp->b_blkno;
			bawrite(bp);
			vnc->v_clen = 0;
			vnc->v_lasta = pbn;
			vnc->v_cstart = lbn + 1;
			vnc->v_lastw = lbn;
			return;
		}
		vnc->v_clen = maxclen;
		pbn = bp->b_blkno;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vnc->v_cstart = lbn + 1;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vnc->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vnc->v_cstart + vnc->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		pbn = bp->b_blkno;
		bdwrite(bp);
		if (seqcount > 1) {
			cluster_wbuild_wb(vp, lblocksize, vnc->v_cstart,
			    vnc->v_clen + 1, gbflags);
		}
		vnc->v_clen = 0;
		vnc->v_cstart = lbn + 1;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		pbn = bp->b_blkno;
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		pbn = bp->b_blkno;
		bdwrite(bp);
	}
	vnc->v_lastw = lbn;
	vnc->v_lasta = pbn;
}
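
/*
 * Illustrative sketch (not part of this file): the shape of a filesystem
 * write path that hands full, delayed-write blocks to cluster_write().
 * The per-inode names my_ip->i_clusterw and my_ip->i_size, and the
 * block_is_full test, are hypothetical placeholders.
 *
 *	if (ioflag & IO_SYNC) {
 *		bwrite(bp);
 *	} else if (block_is_full &&
 *	    (vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 *		bp->b_flags |= B_CLUSTEROK;
 *		cluster_write(vp, &my_ip->i_clusterw, bp, my_ip->i_size,
 *		    seqcount, gbflags);
 *	} else {
 *		bawrite(bp);
 *	}
 */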

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
int
cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
    int gbflags)
{
	struct buf *bp, *tbp;
	struct bufobj *bo;
	int i, j;
	int totalwritten = 0;
	int dbsize = btodb(size);

	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	bo = &vp->v_bufobj;
	while (len > 0) {
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		BO_LOCK(bo);
		if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
		    (tbp->b_vflags & BV_BKGRDINPROG)) {
			BO_UNLOCK(bo);
			++start_lbn;
			--len;
			continue;
		}
		if (BUF_LOCK(tbp,
		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
			++start_lbn;
			--len;
			continue;
		}
		if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
			BUF_UNLOCK(tbp);
			++start_lbn;
			--len;
			continue;
		}
		bremfree(tbp);
		tbp->b_flags &= ~B_DONE;

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
		    (B_CLUSTEROK | B_VMIO)) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != size) ||
		    (len == 1) ||
		    ((bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			++start_lbn;
			--len;
			continue;
		}
		MPASS((bp->b_flags & B_MAXPHYS) != 0);

		/*
		 * We got a pbuf to make the cluster in,
		 * so initialise it.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_npages = 0;
		if (tbp->b_wcred != NOCRED)
			bp->b_wcred = crhold(tbp->b_wcred);

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		if ((gbflags & GB_UNMAPPED) == 0 ||
		    (tbp->b_flags & B_VMIO) == 0) {
			bp->b_data = (char *)((vm_offset_t)bp->b_data |
			    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		} else {
			bp->b_data = unmapped_buf;
		}
		bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
		    B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		pbgetvp(vp, bp);
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				BO_LOCK(bo);
				if ((tbp = gbincore(bo, start_lbn)) == NULL ||
				    (tbp->b_vflags & BV_BKGRDINPROG)) {
					BO_UNLOCK(bo);
					break;
				}

				/*
				 * If it IS in core, but has different
				 * characteristics, or is locked (which
				 * means it could be undergoing a background
				 * I/O or be in a weird state), then don't
				 * cluster with it.
				 */
				if (BUF_LOCK(tbp,
				    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
				    BO_LOCKPTR(bo)))
					break;

				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    tbp->b_wcred != bp->b_wcred) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != size) ||
				    ((bp->b_blkno + (dbsize * i)) !=
				     tbp->b_blkno) ||
				    ((tbp->b_npages + bp->b_npages) >
				     (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				tbp->b_flags &= ~B_DONE;
			} /* end of code for non-first buffers only */
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i == 0) {
					vfs_busy_pages_acquire(tbp);
				} else { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (vm_page_trysbusy(m) == 0) {
							for (j--; j >= 0; j--)
								vm_page_sunbusy(
								    tbp->b_pages[j]);
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}
				vm_object_pip_add(tbp->b_bufobj->bo_object,
				    tbp->b_npages);
				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					if ((bp->b_npages == 0) ||
					    (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
			}
			bp->b_bcount += size;
			bp->b_bufsize += size;
			/*
			 * If any of the clustered buffers have their
			 * B_BARRIER flag set, transfer that request to
			 * the cluster.
			 */
			bp->b_flags |= (tbp->b_flags & B_BARRIER);
			tbp->b_flags &= ~(B_DONE | B_BARRIER);
			tbp->b_flags |= B_ASYNC;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_iocmd = BIO_WRITE;
			bundirty(tbp);
			reassignbuf(tbp);		/* put on clean list */
			bufobj_wref(tbp->b_bufobj);
			BUF_KERNPROC(tbp);
			buf_track(tbp, __func__);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			    tbp, b_cluster.cluster_entry);
		}
finishcluster:
		if (buf_mapped(bp)) {
			pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
			    (vm_page_t *)bp->b_pages, bp->b_npages);
		}
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bawrite(bp);

		len -= i;
	}
	return totalwritten;
}
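
/*
 * Illustrative sketch (not part of this file): besides cluster_wbuild_wb()
 * above, a caller that has found a run of clusterable dirty blocks can push
 * them directly.  first_lbn and nclusterblks are placeholder names; the
 * return value is the total number of bytes queued for writing.
 *
 *	int written;
 *
 *	written = cluster_wbuild(vp, vp->v_mount->mnt_stat.f_iosize,
 *	    first_lbn, nclusterblks, 0);
 */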

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct vn_clusterw *vnc,
    struct buf *last_bp, int gbflags)
{
	struct cluster_save *buflist;
	struct buf *bp;
	daddr_t lbn;
	int i, j, len, error;

	len = vnc->v_lastw - vnc->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vnc->v_cstart, i = 0; i < len; lbn++, i++) {
		error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
		    gbflags, &bp);
		if (error != 0) {
			/*
			 * If read fails, release collected buffers
			 * and return failure.
			 */
			for (j = 0; j < i; j++)
				brelse(buflist->bs_children[j]);
			free(buflist, M_SEGMENT);
			return (NULL);
		}
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
			    NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);
}

void
cluster_init_vn(struct vn_clusterw *vnc)
{
	vnc->v_lasta = 0;
	vnc->v_clen = 0;
	vnc->v_cstart = 0;
	vnc->v_lastw = 0;
}
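
/*
 * Illustrative sketch (not part of this file): a filesystem keeps one
 * struct vn_clusterw per vnode for the write-clustering state used by
 * cluster_write(), and resets it with cluster_init_vn() when the vnode
 * is instantiated.  my_ip->i_clusterw is a hypothetical placeholder.
 *
 *	cluster_init_vn(&my_ip->i_clusterw);
 */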