/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1993
 *	The Regents of the University of California. All rights reserved.
 * Modifications/enhancements:
 *	Copyright (c) 1995 John S. Dyson. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
static uma_zone_t cluster_pbuf_zone;

static void cluster_init(void *);
static struct cluster_save *cluster_collectbufs(struct vnode *vp,
    struct vn_clusterw *vnc, struct buf *last_bp, int gbflags);
static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
    daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
    struct buf *fbp);
static void cluster_callback(struct buf *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind; 0: disable, 1: enable, 2: backed off");

static int read_max = 64;
SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
    "Cluster read-ahead max block count");

static int read_min = 1;
SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
    "Cluster read min block count");

SYSINIT(cluster, SI_SUB_CPU, SI_ORDER_ANY, cluster_init, NULL);

static void
cluster_init(void *dummy)
{

	cluster_pbuf_zone = pbuf_zsecond_create("clpbuf", nswbuf / 2);
}

/*
 * Read data to a buf, including read-ahead if we find this to be beneficial.
 * cluster_read replaces bread.
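 *
 * lblkno is the logical block to read, totread is the caller's estimate of
 * the total transfer size (used to size the synchronous cluster), and
 * seqcount is the sequential-access heuristic that bounds how much
 * read-ahead is attempted.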
 */
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
    struct ucred *cred, long totread, int seqcount, int gbflags,
    struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	struct bufobj *bo;
	struct thread *td;
	daddr_t blkno, origblkno;
	int maxra, racluster;
	int error, ncontig;
	int i;

	error = 0;
	td = curthread;
	bo = &vp->v_bufobj;
	if (!unmapped_buf_allowed)
		gbflags &= ~GB_UNMAPPED;

	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters. This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = seqcount;
	maxra = min(read_max, maxra);
	maxra = min(nbuf/8, maxra);
	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
		maxra = (filesize / size) - lblkno;

	/*
	 * get the requested block
	 */
	error = getblkx(vp, lblkno, lblkno, size, 0, 0, gbflags, &bp);
	if (error != 0) {
		*bpp = NULL;
		return (error);
	}
	gbflags &= ~GB_NOSPARSE;
	origblkno = lblkno;
	*bpp = reqbp = bp;

	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential. If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		if (!seqcount) {
			return 0;
		} else if ((bp->b_flags & B_RAM) == 0) {
			return 0;
		} else {
			bp->b_flags &= ~B_RAM;
			BO_RLOCK(bo);
			for (i = 1; i < maxra; i++) {
				/*
				 * Stop if the buffer does not exist or it
				 * is invalid (about to go away?)
				 */
				rbp = gbincore(&vp->v_bufobj, lblkno+i);
				if (rbp == NULL || (rbp->b_flags & B_INVAL))
					break;

				/*
				 * Set another read-ahead mark so we know
				 * to check again. (If we can lock the
				 * buffer without waiting)
				 */
				if ((((i % racluster) == (racluster - 1)) ||
				    (i == (maxra - 1)))
				    && (0 == BUF_LOCK(rbp,
				    LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
					rbp->b_flags |= B_RAM;
					BUF_UNLOCK(rbp);
				}
			}
			BO_RUNLOCK(bo);
			if (i >= maxra) {
				return 0;
			}
			lblkno += i;
		}
		reqbp = bp = NULL;
	/*
	 * If it isn't in the cache, then get a chunk from
	 * disk if sequential, otherwise just get the block.
	 */
	} else {
		off_t firstread = bp->b_offset;
		int nblks;
		long minread;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));

		ncontig = 0;

		/*
		 * Adjust totread if needed
		 */
		minread = read_min * size;
		if (minread > totread)
			totread = minread;

		/*
		 * Compute the total number of blocks that we should read
		 * synchronously.
		 */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		nblks = howmany(totread, size);
		if (nblks > racluster)
			nblks = racluster;

		/*
		 * Now compute the number of contiguous blocks.
		 */
		if (nblks > 1) {
			error = VOP_BMAP(vp, lblkno, NULL,
			    &blkno, &ncontig, NULL);
			/*
			 * If this failed to map just do the original block.
			 */
			if (error || blkno == -1)
				ncontig = 0;
		}

		/*
		 * If we have contiguous data available do a cluster
		 * otherwise just read the requested block.
		 */
		if (ncontig) {
			/*
			 * Account for our first block.
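			 * The run length returned by VOP_BMAP() counts only
			 * the blocks that follow lblkno, so add one here for
			 * the block being read itself.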
			 */
			ncontig = min(ncontig + 1, nblks);
			if (ncontig < nblks)
				nblks = ncontig;
			bp = cluster_rbuild(vp, filesize, lblkno,
			    blkno, size, nblks, gbflags, bp);
			lblkno += (bp->b_bufsize / size);
		} else {
			bp->b_flags |= B_RAM;
			bp->b_iocmd = BIO_READ;
			lblkno += 1;
		}
	}

	/*
	 * handle the synchronous read so that it is available ASAP.
	 */
	if (bp) {
		if ((bp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(bp, 0);
		}
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
			BUF_KERNPROC(bp);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(td->td_proc);
			racct_add_buf(td->td_proc, bp, 0);
			PROC_UNLOCK(td->td_proc);
		}
#endif /* RACCT */
		td->td_ru.ru_inblock++;
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 */
	while (lblkno < (origblkno + maxra)) {
		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
		if (error)
			break;

		if (blkno == -1)
			break;

		/*
		 * We could throttle ncontig here by maxra but we might as
		 * well read the data if it is contiguous. We're throttled
		 * by racluster anyway.
		 */
		if (ncontig) {
			ncontig = min(ncontig + 1, racluster);
			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
			    size, ncontig, gbflags, NULL);
			lblkno += (rbp->b_bufsize / size);
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
		} else {
			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
			lblkno += 1;
			if (rbp->b_flags & B_DELWRI) {
				bqrelse(rbp);
				continue;
			}
			rbp->b_flags |= B_ASYNC | B_RAM;
			rbp->b_iocmd = BIO_READ;
			rbp->b_blkno = blkno;
		}
		if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
			bqrelse(rbp);
			continue;
		}
		if ((rbp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(rbp, 0);
		}
		rbp->b_flags &= ~B_INVAL;
		rbp->b_ioflags &= ~BIO_ERROR;
		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
			BUF_KERNPROC(rbp);
		rbp->b_iooffset = dbtob(rbp->b_blkno);
		bstrategy(rbp);
#ifdef RACCT
		if (racct_enable) {
			PROC_LOCK(td->td_proc);
			racct_add_buf(td->td_proc, rbp, 0);
			PROC_UNLOCK(td->td_proc);
		}
#endif /* RACCT */
		td->td_ru.ru_inblock++;
	}

	if (reqbp) {
		/*
		 * Like bread, always brelse() the buffer when
		 * returning an error.
		 */
		error = bufwait(reqbp);
		if (error != 0) {
			brelse(reqbp);
			*bpp = NULL;
		}
	}
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead. We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
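 *
 * The cluster is issued as a single pbuf that borrows the component
 * buffers' pages; on completion cluster_callback() propagates the result
 * to each component buffer.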
336 */ 337 static struct buf * 338 cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, 339 daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) 340 { 341 struct buf *bp, *tbp; 342 daddr_t bn; 343 off_t off; 344 long tinc, tsize; 345 int i, inc, j, k, toff; 346 347 KASSERT(size == vp->v_mount->mnt_stat.f_iosize, 348 ("cluster_rbuild: size %ld != f_iosize %jd\n", 349 size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); 350 351 /* 352 * avoid a division 353 */ 354 while ((u_quad_t) size * (lbn + run) > filesize) { 355 --run; 356 } 357 358 if (fbp) { 359 tbp = fbp; 360 tbp->b_iocmd = BIO_READ; 361 } else { 362 tbp = getblk(vp, lbn, size, 0, 0, gbflags); 363 if (tbp->b_flags & B_CACHE) 364 return tbp; 365 tbp->b_flags |= B_ASYNC | B_RAM; 366 tbp->b_iocmd = BIO_READ; 367 } 368 tbp->b_blkno = blkno; 369 if ( (tbp->b_flags & B_MALLOC) || 370 ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) 371 return tbp; 372 373 bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT); 374 if (bp == NULL) 375 return tbp; 376 MPASS((bp->b_flags & B_MAXPHYS) != 0); 377 378 /* 379 * We are synthesizing a buffer out of vm_page_t's, but 380 * if the block size is not page aligned then the starting 381 * address may not be either. Inherit the b_data offset 382 * from the original buffer. 383 */ 384 bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; 385 if ((gbflags & GB_UNMAPPED) != 0) { 386 bp->b_data = unmapped_buf; 387 } else { 388 bp->b_data = (char *)((vm_offset_t)bp->b_data | 389 ((vm_offset_t)tbp->b_data & PAGE_MASK)); 390 } 391 bp->b_iocmd = BIO_READ; 392 bp->b_iodone = cluster_callback; 393 bp->b_blkno = blkno; 394 bp->b_lblkno = lbn; 395 bp->b_offset = tbp->b_offset; 396 KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); 397 pbgetvp(vp, bp); 398 399 TAILQ_INIT(&bp->b_cluster.cluster_head); 400 401 bp->b_bcount = 0; 402 bp->b_bufsize = 0; 403 bp->b_npages = 0; 404 405 inc = btodb(size); 406 for (bn = blkno, i = 0; i < run; ++i, bn += inc) { 407 if (i == 0) { 408 vm_object_pip_add(tbp->b_bufobj->bo_object, 409 tbp->b_npages); 410 vfs_busy_pages_acquire(tbp); 411 } else { 412 if ((bp->b_npages * PAGE_SIZE) + 413 round_page(size) > vp->v_mount->mnt_iosize_max) { 414 break; 415 } 416 417 tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | 418 (gbflags & GB_UNMAPPED)); 419 420 /* Don't wait around for locked bufs. */ 421 if (tbp == NULL) 422 break; 423 424 /* 425 * Stop scanning if the buffer is fully valid 426 * (marked B_CACHE), or locked (may be doing a 427 * background write), or if the buffer is not 428 * VMIO backed. The clustering code can only deal 429 * with VMIO-backed buffers. The bo lock is not 430 * required for the BKGRDINPROG check since it 431 * can not be set without the buf lock. 432 */ 433 if ((tbp->b_vflags & BV_BKGRDINPROG) || 434 (tbp->b_flags & B_CACHE) || 435 (tbp->b_flags & B_VMIO) == 0) { 436 bqrelse(tbp); 437 break; 438 } 439 440 /* 441 * The buffer must be completely invalid in order to 442 * take part in the cluster. If it is partially valid 443 * then we stop. 
444 */ 445 off = tbp->b_offset; 446 tsize = size; 447 for (j = 0; tsize > 0; j++) { 448 toff = off & PAGE_MASK; 449 tinc = tsize; 450 if (toff + tinc > PAGE_SIZE) 451 tinc = PAGE_SIZE - toff; 452 if (vm_page_trysbusy(tbp->b_pages[j]) == 0) 453 break; 454 if ((tbp->b_pages[j]->valid & 455 vm_page_bits(toff, tinc)) != 0) { 456 vm_page_sunbusy(tbp->b_pages[j]); 457 break; 458 } 459 vm_object_pip_add(tbp->b_bufobj->bo_object, 1); 460 off += tinc; 461 tsize -= tinc; 462 } 463 if (tsize > 0) { 464 clean_sbusy: 465 vm_object_pip_wakeupn(tbp->b_bufobj->bo_object, 466 j); 467 for (k = 0; k < j; k++) 468 vm_page_sunbusy(tbp->b_pages[k]); 469 bqrelse(tbp); 470 break; 471 } 472 473 /* 474 * Set a read-ahead mark as appropriate 475 */ 476 if ((fbp && (i == 1)) || (i == (run - 1))) 477 tbp->b_flags |= B_RAM; 478 479 /* 480 * Set the buffer up for an async read (XXX should 481 * we do this only if we do not wind up brelse()ing?). 482 * Set the block number if it isn't set, otherwise 483 * if it is make sure it matches the block number we 484 * expect. 485 */ 486 tbp->b_flags |= B_ASYNC; 487 tbp->b_iocmd = BIO_READ; 488 if (tbp->b_blkno == tbp->b_lblkno) { 489 tbp->b_blkno = bn; 490 } else if (tbp->b_blkno != bn) { 491 goto clean_sbusy; 492 } 493 } 494 /* 495 * XXX fbp from caller may not be B_ASYNC, but we are going 496 * to biodone() it in cluster_callback() anyway 497 */ 498 BUF_KERNPROC(tbp); 499 TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 500 tbp, b_cluster.cluster_entry); 501 for (j = 0; j < tbp->b_npages; j += 1) { 502 vm_page_t m; 503 504 m = tbp->b_pages[j]; 505 if ((bp->b_npages == 0) || 506 (bp->b_pages[bp->b_npages-1] != m)) { 507 bp->b_pages[bp->b_npages] = m; 508 bp->b_npages++; 509 } 510 if (vm_page_all_valid(m)) 511 tbp->b_pages[j] = bogus_page; 512 } 513 514 /* 515 * Don't inherit tbp->b_bufsize as it may be larger due to 516 * a non-page-aligned size. Instead just aggregate using 517 * 'size'. 518 */ 519 if (tbp->b_bcount != size) 520 printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); 521 if (tbp->b_bufsize != size) 522 printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); 523 bp->b_bcount += size; 524 bp->b_bufsize += size; 525 } 526 527 /* 528 * Fully valid pages in the cluster are already good and do not need 529 * to be re-read from disk. Replace the page with bogus_page 530 */ 531 for (j = 0; j < bp->b_npages; j++) { 532 if (vm_page_all_valid(bp->b_pages[j])) 533 bp->b_pages[j] = bogus_page; 534 } 535 if (bp->b_bufsize > bp->b_kvasize) 536 panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 537 bp->b_bufsize, bp->b_kvasize); 538 539 if (buf_mapped(bp)) { 540 pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 541 (vm_page_t *)bp->b_pages, bp->b_npages); 542 } 543 return (bp); 544 } 545 546 /* 547 * Cleanup after a clustered read or write. 548 * This is complicated by the fact that any of the buffers might have 549 * extra memory (if there were no empty buffer headers at allocbuf time) 550 * that we will need to shift around. 551 */ 552 static void 553 cluster_callback(struct buf *bp) 554 { 555 struct buf *nbp, *tbp; 556 int error = 0; 557 558 /* 559 * Must propagate errors to all the components. 560 */ 561 if (bp->b_ioflags & BIO_ERROR) 562 error = bp->b_error; 563 564 if (buf_mapped(bp)) { 565 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), 566 bp->b_npages); 567 } 568 /* 569 * Move memory from the large cluster buffer into the component 570 * buffers and mark IO as done on these. 
571 */ 572 for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); 573 tbp; tbp = nbp) { 574 nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); 575 if (error) { 576 tbp->b_ioflags |= BIO_ERROR; 577 tbp->b_error = error; 578 } else { 579 tbp->b_dirtyoff = tbp->b_dirtyend = 0; 580 tbp->b_flags &= ~B_INVAL; 581 tbp->b_ioflags &= ~BIO_ERROR; 582 /* 583 * XXX the bdwrite()/bqrelse() issued during 584 * cluster building clears B_RELBUF (see bqrelse() 585 * comment). If direct I/O was specified, we have 586 * to restore it here to allow the buffer and VM 587 * to be freed. 588 */ 589 if (tbp->b_flags & B_DIRECT) 590 tbp->b_flags |= B_RELBUF; 591 } 592 bufdone(tbp); 593 } 594 pbrelvp(bp); 595 uma_zfree(cluster_pbuf_zone, bp); 596 } 597 598 /* 599 * cluster_wbuild_wb: 600 * 601 * Implement modified write build for cluster. 602 * 603 * write_behind = 0 write behind disabled 604 * write_behind = 1 write behind normal (default) 605 * write_behind = 2 write behind backed-off 606 */ 607 608 static __inline int 609 cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, 610 int gbflags) 611 { 612 int r = 0; 613 614 switch (write_behind) { 615 case 2: 616 if (start_lbn < len) 617 break; 618 start_lbn -= len; 619 /* FALLTHROUGH */ 620 case 1: 621 r = cluster_wbuild(vp, size, start_lbn, len, gbflags); 622 /* FALLTHROUGH */ 623 default: 624 /* FALLTHROUGH */ 625 break; 626 } 627 return(r); 628 } 629 630 /* 631 * Do clustered write for FFS. 632 * 633 * Three cases: 634 * 1. Write is not sequential (write asynchronously) 635 * Write is sequential: 636 * 2. beginning of cluster - begin cluster 637 * 3. middle of a cluster - add to cluster 638 * 4. end of a cluster - asynchronously write cluster 639 */ 640 void 641 cluster_write(struct vnode *vp, struct vn_clusterw *vnc, struct buf *bp, 642 u_quad_t filesize, int seqcount, int gbflags) 643 { 644 daddr_t lbn, pbn; 645 int maxclen, cursize; 646 int lblocksize; 647 int async; 648 649 if (!unmapped_buf_allowed) 650 gbflags &= ~GB_UNMAPPED; 651 652 if (vp->v_type == VREG) { 653 async = DOINGASYNC(vp); 654 lblocksize = vp->v_mount->mnt_stat.f_iosize; 655 } else { 656 async = 0; 657 lblocksize = bp->b_bufsize; 658 } 659 lbn = bp->b_lblkno; 660 KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); 661 662 /* Initialize vnode to beginning of file. */ 663 if (lbn == 0) 664 vnc->v_lasta = vnc->v_clen = vnc->v_cstart = vnc->v_lastw = 0; 665 666 if (vnc->v_clen == 0 || lbn != vnc->v_lastw + 1 || 667 (bp->b_blkno != vnc->v_lasta + btodb(lblocksize))) { 668 maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; 669 if (vnc->v_clen != 0) { 670 /* 671 * Next block is not sequential. 672 * 673 * If we are not writing at end of file, the process 674 * seeked to another point in the file since its last 675 * write, or we have reached our maximum cluster size, 676 * then push the previous cluster. Otherwise try 677 * reallocating to make it sequential. 678 * 679 * Change to algorithm: only push previous cluster if 680 * it was sequential from the point of view of the 681 * seqcount heuristic, otherwise leave the buffer 682 * intact so we can potentially optimize the I/O 683 * later on in the buf_daemon or update daemon 684 * flush. 
685 */ 686 cursize = vnc->v_lastw - vnc->v_cstart + 1; 687 if ((u_quad_t)bp->b_offset + lblocksize != filesize || 688 lbn != vnc->v_lastw + 1 || vnc->v_clen <= cursize) { 689 if (!async && seqcount > 0) { 690 cluster_wbuild_wb(vp, lblocksize, 691 vnc->v_cstart, cursize, gbflags); 692 } 693 } else { 694 struct buf **bpp, **endbp; 695 struct cluster_save *buflist; 696 697 buflist = cluster_collectbufs(vp, vnc, bp, 698 gbflags); 699 if (buflist == NULL) { 700 /* 701 * Cluster build failed so just write 702 * it now. 703 */ 704 bawrite(bp); 705 return; 706 } 707 endbp = &buflist->bs_children 708 [buflist->bs_nchildren - 1]; 709 if (VOP_REALLOCBLKS(vp, buflist)) { 710 /* 711 * Failed, push the previous cluster 712 * if *really* writing sequentially 713 * in the logical file (seqcount > 1), 714 * otherwise delay it in the hopes that 715 * the low level disk driver can 716 * optimize the write ordering. 717 */ 718 for (bpp = buflist->bs_children; 719 bpp < endbp; bpp++) 720 brelse(*bpp); 721 free(buflist, M_SEGMENT); 722 if (seqcount > 1) { 723 cluster_wbuild_wb(vp, 724 lblocksize, vnc->v_cstart, 725 cursize, gbflags); 726 } 727 } else { 728 /* 729 * Succeeded, keep building cluster. 730 */ 731 for (bpp = buflist->bs_children; 732 bpp <= endbp; bpp++) 733 bdwrite(*bpp); 734 free(buflist, M_SEGMENT); 735 vnc->v_lastw = lbn; 736 vnc->v_lasta = bp->b_blkno; 737 return; 738 } 739 } 740 } 741 /* 742 * Consider beginning a cluster. If at end of file, make 743 * cluster as large as possible, otherwise find size of 744 * existing cluster. 745 */ 746 if (vp->v_type == VREG && 747 (u_quad_t) bp->b_offset + lblocksize != filesize && 748 bp->b_blkno == bp->b_lblkno && 749 (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, 750 NULL) != 0 || bp->b_blkno == -1)) { 751 pbn = bp->b_blkno; 752 bawrite(bp); 753 vnc->v_clen = 0; 754 vnc->v_lasta = pbn; 755 vnc->v_cstart = lbn + 1; 756 vnc->v_lastw = lbn; 757 return; 758 } 759 vnc->v_clen = maxclen; 760 pbn = bp->b_blkno; 761 if (!async && maxclen == 0) { /* I/O not contiguous */ 762 vnc->v_cstart = lbn + 1; 763 bawrite(bp); 764 } else { /* Wait for rest of cluster */ 765 vnc->v_cstart = lbn; 766 bdwrite(bp); 767 } 768 } else if (lbn == vnc->v_cstart + vnc->v_clen) { 769 /* 770 * At end of cluster, write it out if seqcount tells us we 771 * are operating sequentially, otherwise let the buf or 772 * update daemon handle it. 773 */ 774 pbn = bp->b_blkno; 775 bdwrite(bp); 776 if (seqcount > 1) { 777 cluster_wbuild_wb(vp, lblocksize, vnc->v_cstart, 778 vnc->v_clen + 1, gbflags); 779 } 780 vnc->v_clen = 0; 781 vnc->v_cstart = lbn + 1; 782 } else if (vm_page_count_severe()) { 783 /* 784 * We are low on memory, get it going NOW 785 */ 786 pbn = bp->b_blkno; 787 bawrite(bp); 788 } else { 789 /* 790 * In the middle of a cluster, so just delay the I/O for now. 791 */ 792 pbn = bp->b_blkno; 793 bdwrite(bp); 794 } 795 vnc->v_lastw = lbn; 796 vnc->v_lasta = pbn; 797 } 798 799 /* 800 * This is an awful lot like cluster_rbuild...wish they could be combined. 801 * The last lbn argument is the current block on which I/O is being 802 * performed. Check to see that it doesn't fall in the middle of 803 * the current block (if last_bp == NULL). 
804 */ 805 int 806 cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, 807 int gbflags) 808 { 809 struct buf *bp, *tbp; 810 struct bufobj *bo; 811 int i, j; 812 int totalwritten = 0; 813 int dbsize = btodb(size); 814 815 if (!unmapped_buf_allowed) 816 gbflags &= ~GB_UNMAPPED; 817 818 bo = &vp->v_bufobj; 819 while (len > 0) { 820 /* 821 * If the buffer is not delayed-write (i.e. dirty), or it 822 * is delayed-write but either locked or inval, it cannot 823 * partake in the clustered write. 824 */ 825 BO_LOCK(bo); 826 if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || 827 (tbp->b_vflags & BV_BKGRDINPROG)) { 828 BO_UNLOCK(bo); 829 ++start_lbn; 830 --len; 831 continue; 832 } 833 if (BUF_LOCK(tbp, 834 LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { 835 ++start_lbn; 836 --len; 837 continue; 838 } 839 if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { 840 BUF_UNLOCK(tbp); 841 ++start_lbn; 842 --len; 843 continue; 844 } 845 bremfree(tbp); 846 tbp->b_flags &= ~B_DONE; 847 848 /* 849 * Extra memory in the buffer, punt on this buffer. 850 * XXX we could handle this in most cases, but we would 851 * have to push the extra memory down to after our max 852 * possible cluster size and then potentially pull it back 853 * up if the cluster was terminated prematurely--too much 854 * hassle. 855 */ 856 if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != 857 (B_CLUSTEROK | B_VMIO)) || 858 (tbp->b_bcount != tbp->b_bufsize) || 859 (tbp->b_bcount != size) || 860 (len == 1) || 861 ((bp = uma_zalloc(cluster_pbuf_zone, M_NOWAIT)) == NULL)) { 862 totalwritten += tbp->b_bufsize; 863 bawrite(tbp); 864 ++start_lbn; 865 --len; 866 continue; 867 } 868 MPASS((bp->b_flags & B_MAXPHYS) != 0); 869 870 /* 871 * We got a pbuf to make the cluster in. 872 * so initialise it. 873 */ 874 TAILQ_INIT(&bp->b_cluster.cluster_head); 875 bp->b_bcount = 0; 876 bp->b_bufsize = 0; 877 bp->b_npages = 0; 878 if (tbp->b_wcred != NOCRED) 879 bp->b_wcred = crhold(tbp->b_wcred); 880 881 bp->b_blkno = tbp->b_blkno; 882 bp->b_lblkno = tbp->b_lblkno; 883 bp->b_offset = tbp->b_offset; 884 885 /* 886 * We are synthesizing a buffer out of vm_page_t's, but 887 * if the block size is not page aligned then the starting 888 * address may not be either. Inherit the b_data offset 889 * from the original buffer. 890 */ 891 if ((gbflags & GB_UNMAPPED) == 0 || 892 (tbp->b_flags & B_VMIO) == 0) { 893 bp->b_data = (char *)((vm_offset_t)bp->b_data | 894 ((vm_offset_t)tbp->b_data & PAGE_MASK)); 895 } else { 896 bp->b_data = unmapped_buf; 897 } 898 bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | 899 B_NEEDCOMMIT)); 900 bp->b_iodone = cluster_callback; 901 pbgetvp(vp, bp); 902 /* 903 * From this location in the file, scan forward to see 904 * if there are buffers with adjacent data that need to 905 * be written as well. 906 */ 907 for (i = 0; i < len; ++i, ++start_lbn) { 908 if (i != 0) { /* If not the first buffer */ 909 /* 910 * If the adjacent data is not even in core it 911 * can't need to be written. 912 */ 913 BO_LOCK(bo); 914 if ((tbp = gbincore(bo, start_lbn)) == NULL || 915 (tbp->b_vflags & BV_BKGRDINPROG)) { 916 BO_UNLOCK(bo); 917 break; 918 } 919 920 /* 921 * If it IS in core, but has different 922 * characteristics, or is locked (which 923 * means it could be undergoing a background 924 * I/O or be in a weird state), then don't 925 * cluster with it. 
926 */ 927 if (BUF_LOCK(tbp, 928 LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, 929 BO_LOCKPTR(bo))) 930 break; 931 932 if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | 933 B_INVAL | B_DELWRI | B_NEEDCOMMIT)) 934 != (B_DELWRI | B_CLUSTEROK | 935 (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || 936 tbp->b_wcred != bp->b_wcred) { 937 BUF_UNLOCK(tbp); 938 break; 939 } 940 941 /* 942 * Check that the combined cluster 943 * would make sense with regard to pages 944 * and would not be too large 945 */ 946 if ((tbp->b_bcount != size) || 947 ((bp->b_blkno + (dbsize * i)) != 948 tbp->b_blkno) || 949 ((tbp->b_npages + bp->b_npages) > 950 (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { 951 BUF_UNLOCK(tbp); 952 break; 953 } 954 955 /* 956 * Ok, it's passed all the tests, 957 * so remove it from the free list 958 * and mark it busy. We will use it. 959 */ 960 bremfree(tbp); 961 tbp->b_flags &= ~B_DONE; 962 } /* end of code for non-first buffers only */ 963 /* 964 * If the IO is via the VM then we do some 965 * special VM hackery (yuck). Since the buffer's 966 * block size may not be page-aligned it is possible 967 * for a page to be shared between two buffers. We 968 * have to get rid of the duplication when building 969 * the cluster. 970 */ 971 if (tbp->b_flags & B_VMIO) { 972 vm_page_t m; 973 974 if (i == 0) { 975 vfs_busy_pages_acquire(tbp); 976 } else { /* if not first buffer */ 977 for (j = 0; j < tbp->b_npages; j += 1) { 978 m = tbp->b_pages[j]; 979 if (vm_page_trysbusy(m) == 0) { 980 for (j--; j >= 0; j--) 981 vm_page_sunbusy( 982 tbp->b_pages[j]); 983 bqrelse(tbp); 984 goto finishcluster; 985 } 986 } 987 } 988 vm_object_pip_add(tbp->b_bufobj->bo_object, 989 tbp->b_npages); 990 for (j = 0; j < tbp->b_npages; j += 1) { 991 m = tbp->b_pages[j]; 992 if ((bp->b_npages == 0) || 993 (bp->b_pages[bp->b_npages - 1] != m)) { 994 bp->b_pages[bp->b_npages] = m; 995 bp->b_npages++; 996 } 997 } 998 } 999 bp->b_bcount += size; 1000 bp->b_bufsize += size; 1001 /* 1002 * If any of the clustered buffers have their 1003 * B_BARRIER flag set, transfer that request to 1004 * the cluster. 1005 */ 1006 bp->b_flags |= (tbp->b_flags & B_BARRIER); 1007 tbp->b_flags &= ~(B_DONE | B_BARRIER); 1008 tbp->b_flags |= B_ASYNC; 1009 tbp->b_ioflags &= ~BIO_ERROR; 1010 tbp->b_iocmd = BIO_WRITE; 1011 bundirty(tbp); 1012 reassignbuf(tbp); /* put on clean list */ 1013 bufobj_wref(tbp->b_bufobj); 1014 BUF_KERNPROC(tbp); 1015 buf_track(tbp, __func__); 1016 TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, 1017 tbp, b_cluster.cluster_entry); 1018 } 1019 finishcluster: 1020 if (buf_mapped(bp)) { 1021 pmap_qenter(trunc_page((vm_offset_t) bp->b_data), 1022 (vm_page_t *)bp->b_pages, bp->b_npages); 1023 } 1024 if (bp->b_bufsize > bp->b_kvasize) 1025 panic( 1026 "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", 1027 bp->b_bufsize, bp->b_kvasize); 1028 totalwritten += bp->b_bufsize; 1029 bp->b_dirtyoff = 0; 1030 bp->b_dirtyend = bp->b_bufsize; 1031 bawrite(bp); 1032 1033 len -= i; 1034 } 1035 return totalwritten; 1036 } 1037 1038 /* 1039 * Collect together all the buffers in a cluster. 1040 * Plus add one additional buffer. 
1041 */ 1042 static struct cluster_save * 1043 cluster_collectbufs(struct vnode *vp, struct vn_clusterw *vnc, 1044 struct buf *last_bp, int gbflags) 1045 { 1046 struct cluster_save *buflist; 1047 struct buf *bp; 1048 daddr_t lbn; 1049 int i, j, len, error; 1050 1051 len = vnc->v_lastw - vnc->v_cstart + 1; 1052 buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), 1053 M_SEGMENT, M_WAITOK); 1054 buflist->bs_nchildren = 0; 1055 buflist->bs_children = (struct buf **) (buflist + 1); 1056 for (lbn = vnc->v_cstart, i = 0; i < len; lbn++, i++) { 1057 error = bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, 1058 gbflags, &bp); 1059 if (error != 0) { 1060 /* 1061 * If read fails, release collected buffers 1062 * and return failure. 1063 */ 1064 for (j = 0; j < i; j++) 1065 brelse(buflist->bs_children[j]); 1066 free(buflist, M_SEGMENT); 1067 return (NULL); 1068 } 1069 buflist->bs_children[i] = bp; 1070 if (bp->b_blkno == bp->b_lblkno) 1071 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, 1072 NULL, NULL); 1073 } 1074 buflist->bs_children[i] = bp = last_bp; 1075 if (bp->b_blkno == bp->b_lblkno) 1076 VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 1077 buflist->bs_nchildren = i + 1; 1078 return (buflist); 1079 } 1080 1081 void 1082 cluster_init_vn(struct vn_clusterw *vnc) 1083 { 1084 vnc->v_lasta = 0; 1085 vnc->v_clen = 0; 1086 vnc->v_cstart = 0; 1087 vnc->v_lastw = 0; 1088 } 1089