/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;
	kstat_named_t	phys_reads;
	kstat_named_t	hole_reads;
	kstat_named_t	nread;
	kstat_named_t	logical_writes;
	kstat_named_t	phys_writes;
	kstat_named_t	nwritten;
	kstat_named_t	nflushes;
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

kstat_t	*ufs_directio_kstatsp;

/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 */
struct directio_buf {
	struct directio_buf	*next;
	char		*addr;
	size_t		nbytes;
	struct buf	buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
	return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
	    sizeof (struct directio_buf), 0,
	    directio_buf_constructor, directio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs", 0,
	    "directio", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*bp;
	int	error;

	/*
	 * Wait for IO to finish
	 */
	bp = &dbp->buf;
	error = biowait(bp);

	/*
	 * bytes_io will be used to figure out a resid
	 * for the caller. The resid is approximated by reporting
	 * the bytes following the first failed IO as the residual.
	 *
	 * I am cautious about using b_resid because I
	 * am not sure how well the disk drivers maintain it.
	 */
	if (error)
		if (bp->b_resid)
			*bytes_iop = bp->b_bcount - bp->b_resid;
		else
			*bytes_iop = 0;
	else
		*bytes_iop += bp->b_bcount;
	/*
	 * Release direct IO resources
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);
	return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */

uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;
	uint_t	kpri_req_save;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	/*
	 * This is the k_pri_req hack. Large numbers of threads
	 * sleeping with kernel priority will cause scheduler thrashing
	 * on an MP machine. This can be seen running Oracle using
	 * directio to ufs files. Sleep at normal priority here to
	 * more closely mimic physio to a device partition. This
	 * workaround is disabled by default as a niced thread could
	 * be starved from running while holding i_rwlock and i_contents.
	 */
	if (ufs_directio_drop_kpri) {
		kpri_req_save = curthread->t_kpri_req;
		curthread->t_kpri_req = 0;
	}
	while ((dbp = tail) != NULL) {
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	if (ufs_directio_drop_kpri)
		curthread->t_kpri_req = kpri_req_save;
	return (error);
}
/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
	struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 *	Note - list is maintained in reverse order.
	 *	directio_wait_one() depends on this fact when
	 *	adjusting the ``bytes_io'' param. bytes_io
	 *	is used to compute a residual in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads.value.ui64++;
		ufs_directio_kstats.nread.value.ui64 += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes.value.ui64++;
		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
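	/*
	 * If a snapshot is active on this filesystem, hand the buf to
	 * fssnap_strategy() so the snapshot code sees the request first;
	 * otherwise issue it directly to the underlying block device.
	 */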
319 */ 320 ufsvfsp->vfs_iotstamp = lbolt; 321 if (ufsvfsp->vfs_snapshot) 322 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 323 else 324 (void) bdev_strategy(bp); 325 326 if (rw == S_WRITE) 327 lwp_stat_update(LWP_STAT_OUBLK, 1); 328 else 329 lwp_stat_update(LWP_STAT_INBLK, 1); 330 331 } 332 333 uint32_t ufs_shared_writes; /* writes done w/ lock shared */ 334 uint32_t ufs_cur_writes; /* # concurrent writes */ 335 uint32_t ufs_maxcur_writes; /* high water concurrent writes */ 336 uint32_t ufs_posix_hits; /* writes done /w lock excl. */ 337 338 /* 339 * Force POSIX syncronous data integrity on all writes for testing. 340 */ 341 uint32_t ufs_force_posix_sdi = 0; 342 343 /* 344 * Direct Write 345 */ 346 347 int 348 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite, 349 cred_t *cr, int *statusp) 350 { 351 long resid, bytes_written; 352 u_offset_t size, uoff; 353 uio_t *uio = arg_uio; 354 rlim64_t limit = uio->uio_llimit; 355 int on, n, error, newerror, len, has_holes; 356 daddr_t bn; 357 size_t nbytes; 358 struct fs *fs; 359 vnode_t *vp; 360 iovec_t *iov; 361 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 362 struct proc *procp; 363 struct as *as; 364 struct directio_buf *tail; 365 int exclusive, ncur, bmap_peek; 366 uio_t copy_uio; 367 iovec_t copy_iov; 368 char *copy_base; 369 long copy_resid; 370 371 /* 372 * assume that directio isn't possible (normal case) 373 */ 374 *statusp = DIRECTIO_FAILURE; 375 376 /* 377 * Don't go direct 378 */ 379 if (ufs_directio_enabled == 0) 380 return (0); 381 382 /* 383 * mapped file; nevermind 384 */ 385 if (ip->i_mapcnt) 386 return (0); 387 388 /* 389 * CAN WE DO DIRECT IO? 390 */ 391 uoff = uio->uio_loffset; 392 resid = uio->uio_resid; 393 394 /* 395 * beyond limit 396 */ 397 if (uoff + resid > limit) 398 return (0); 399 400 /* 401 * must be sector aligned 402 */ 403 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1))) 404 return (0); 405 406 /* 407 * SHOULD WE DO DIRECT IO? 408 */ 409 size = ip->i_size; 410 has_holes = -1; 411 412 /* 413 * only on regular files; no metadata 414 */ 415 if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip) 416 return (0); 417 418 /* 419 * Synchronous, allocating writes run very slow in Direct-Mode 420 * XXX - can be fixed with bmap_write changes for large writes!!! 421 * XXX - can be fixed for updates to "almost-full" files 422 * XXX - WARNING - system hangs if bmap_write() has to 423 * allocate lots of pages since pageout 424 * suspends on locked inode 425 */ 426 if (!rewrite && (ip->i_flag & ISYNC)) { 427 if ((uoff + resid) > size) 428 return (0); 429 has_holes = bmap_has_holes(ip); 430 if (has_holes) 431 return (0); 432 } 433 434 /* 435 * Each iovec must be short aligned and sector aligned. If 436 * one is not, then kmem_alloc a new buffer and copy all of 437 * the smaller buffers into the new buffer. This new 438 * buffer will be short aligned and sector aligned. 
439 */ 440 iov = uio->uio_iov; 441 nbytes = uio->uio_iovcnt; 442 while (nbytes--) { 443 if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 || 444 (intptr_t)(iov->iov_base) & 1) { 445 copy_resid = uio->uio_resid; 446 copy_base = kmem_alloc(copy_resid, KM_NOSLEEP); 447 if (copy_base == NULL) 448 return (0); 449 copy_iov.iov_base = copy_base; 450 copy_iov.iov_len = copy_resid; 451 copy_uio.uio_iov = ©_iov; 452 copy_uio.uio_iovcnt = 1; 453 copy_uio.uio_segflg = UIO_SYSSPACE; 454 copy_uio.uio_extflg = UIO_COPY_DEFAULT; 455 copy_uio.uio_loffset = uio->uio_loffset; 456 copy_uio.uio_resid = uio->uio_resid; 457 copy_uio.uio_llimit = uio->uio_llimit; 458 error = uiomove(copy_base, copy_resid, UIO_WRITE, uio); 459 if (error) { 460 kmem_free(copy_base, copy_resid); 461 return (0); 462 } 463 uio = ©_uio; 464 break; 465 } 466 iov++; 467 } 468 469 /* 470 * From here on down, all error exits must go to errout and 471 * not simply return a 0. 472 */ 473 474 /* 475 * DIRECTIO 476 */ 477 478 fs = ip->i_fs; 479 480 /* 481 * POSIX check. If attempting a concurrent re-write, make sure 482 * that this will be a single request to the driver to meet 483 * POSIX synchronous data integrity requirements. 484 */ 485 bmap_peek = 0; 486 if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) { 487 int upgrade = 0; 488 489 /* check easy conditions first */ 490 if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) { 491 upgrade = 1; 492 } else { 493 /* now look for contiguous allocation */ 494 len = (ssize_t)blkroundup(fs, resid); 495 error = bmap_read(ip, uoff, &bn, &len); 496 if (error || bn == UFS_HOLE || len == 0) 497 goto errout; 498 /* save a call to bmap_read later */ 499 bmap_peek = 1; 500 if (len < resid) 501 upgrade = 1; 502 } 503 if (upgrade) { 504 rw_exit(&ip->i_contents); 505 rw_enter(&ip->i_contents, RW_WRITER); 506 ufs_posix_hits++; 507 } 508 } 509 510 511 /* 512 * allocate space 513 */ 514 515 /* 516 * If attempting a re-write, there is no allocation to do. 517 * bmap_write would trip an ASSERT if i_contents is held shared. 
518 */ 519 if (rewrite) 520 goto skip_alloc; 521 522 do { 523 on = (int)blkoff(fs, uoff); 524 n = (int)MIN(fs->fs_bsize - on, resid); 525 if ((uoff + n) > ip->i_size) { 526 error = bmap_write(ip, uoff, (int)(on + n), 527 (int)(uoff & (offset_t)MAXBOFFSET) == 0, 528 NULL, cr); 529 /* Caller is responsible for updating i_seq if needed */ 530 if (error) 531 break; 532 ip->i_size = uoff + n; 533 ip->i_flag |= IATTCHG; 534 } else if (n == MAXBSIZE) { 535 error = bmap_write(ip, uoff, (int)(on + n), 536 BI_ALLOC_ONLY, NULL, cr); 537 /* Caller is responsible for updating i_seq if needed */ 538 } else { 539 if (has_holes < 0) 540 has_holes = bmap_has_holes(ip); 541 if (has_holes) { 542 uint_t blk_size; 543 u_offset_t offset; 544 545 offset = uoff & (offset_t)fs->fs_bmask; 546 blk_size = (int)blksize(fs, ip, 547 (daddr_t)lblkno(fs, offset)); 548 error = bmap_write(ip, uoff, blk_size, 549 BI_NORMAL, NULL, cr); 550 /* 551 * Caller is responsible for updating 552 * i_seq if needed 553 */ 554 } else 555 error = 0; 556 } 557 if (error) 558 break; 559 uoff += n; 560 resid -= n; 561 /* 562 * if file has grown larger than 2GB, set flag 563 * in superblock if not already set 564 */ 565 if ((ip->i_size > MAXOFF32_T) && 566 !(fs->fs_flags & FSLARGEFILES)) { 567 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 568 mutex_enter(&ufsvfsp->vfs_lock); 569 fs->fs_flags |= FSLARGEFILES; 570 ufs_sbwrite(ufsvfsp); 571 mutex_exit(&ufsvfsp->vfs_lock); 572 } 573 } while (resid); 574 575 if (error) { 576 /* 577 * restore original state 578 */ 579 if (resid) { 580 if (size == ip->i_size) 581 goto errout; 582 (void) ufs_itrunc(ip, size, 0, cr); 583 } 584 /* 585 * try non-directio path 586 */ 587 goto errout; 588 } 589 skip_alloc: 590 591 /* 592 * get rid of cached pages 593 */ 594 vp = ITOV(ip); 595 exclusive = rw_write_held(&ip->i_contents); 596 if (vn_has_cached_data(vp)) { 597 if (!exclusive) { 598 /* 599 * Still holding i_rwlock, so no allocations 600 * can happen after dropping contents. 601 */ 602 rw_exit(&ip->i_contents); 603 rw_enter(&ip->i_contents, RW_WRITER); 604 } 605 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr); 606 if (vn_has_cached_data(vp)) 607 goto errout; 608 if (!exclusive) 609 rw_downgrade(&ip->i_contents); 610 ufs_directio_kstats.nflushes.value.ui64++; 611 } 612 613 /* 614 * Direct Writes 615 */ 616 617 if (!exclusive) { 618 ufs_shared_writes++; 619 ncur = atomic_add_32_nv(&ufs_cur_writes, 1); 620 if (ncur > ufs_maxcur_writes) 621 ufs_maxcur_writes = ncur; 622 } 623 624 /* 625 * proc and as are for VM operations in directio_start() 626 */ 627 if (uio->uio_segflg == UIO_USERSPACE) { 628 procp = ttoproc(curthread); 629 as = procp->p_as; 630 } else { 631 procp = NULL; 632 as = &kas; 633 } 634 *statusp = DIRECTIO_SUCCESS; 635 error = 0; 636 newerror = 0; 637 resid = uio->uio_resid; 638 bytes_written = 0; 639 ufs_directio_kstats.logical_writes.value.ui64++; 640 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) { 641 size_t pglck_len, pglck_size; 642 caddr_t pglck_base; 643 page_t **pplist, **spplist; 644 645 tail = NULL; 646 647 /* 648 * Adjust number of bytes 649 */ 650 iov = uio->uio_iov; 651 pglck_len = (size_t)MIN(iov->iov_len, resid); 652 pglck_base = iov->iov_base; 653 if (pglck_len == 0) { 654 uio->uio_iov++; 655 uio->uio_iovcnt--; 656 continue; 657 } 658 659 /* 660 * Try to Lock down the largest chunck of pages possible. 
661 */ 662 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz); 663 error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ); 664 665 if (error) 666 break; 667 668 pglck_size = pglck_len; 669 while (pglck_len) { 670 671 nbytes = pglck_len; 672 uoff = uio->uio_loffset; 673 674 if (!bmap_peek) { 675 676 /* 677 * Re-adjust number of bytes to contiguous 678 * range. May have already called bmap_read 679 * in the case of a concurrent rewrite. 680 */ 681 len = (ssize_t)blkroundup(fs, nbytes); 682 error = bmap_read(ip, uoff, &bn, &len); 683 if (error) 684 break; 685 if (bn == UFS_HOLE || len == 0) 686 break; 687 } 688 nbytes = (size_t)MIN(nbytes, len); 689 bmap_peek = 0; 690 691 /* 692 * Get the pagelist pointer for this offset to be 693 * passed to directio_start. 694 */ 695 696 if (pplist != NULL) 697 spplist = pplist + 698 btop((uintptr_t)iov->iov_base - 699 ((uintptr_t)pglck_base & PAGEMASK)); 700 else 701 spplist = NULL; 702 703 /* 704 * Kick off the direct write requests 705 */ 706 directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn), 707 iov->iov_base, S_READ, procp, &tail, spplist); 708 709 /* 710 * Adjust pointers and counters 711 */ 712 iov->iov_len -= nbytes; 713 iov->iov_base += nbytes; 714 uio->uio_loffset += nbytes; 715 resid -= nbytes; 716 pglck_len -= nbytes; 717 } 718 719 /* 720 * Wait for outstanding requests 721 */ 722 newerror = directio_wait(tail, &bytes_written); 723 724 /* 725 * Release VM resources 726 */ 727 as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ); 728 729 } 730 731 if (!exclusive) { 732 atomic_add_32(&ufs_cur_writes, -1); 733 /* 734 * If this write was done shared, readers may 735 * have pulled in unmodified pages. Get rid of 736 * these potentially stale pages. 737 */ 738 if (vn_has_cached_data(vp)) { 739 rw_exit(&ip->i_contents); 740 rw_enter(&ip->i_contents, RW_WRITER); 741 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, 742 B_INVAL, cr); 743 ufs_directio_kstats.nflushes.value.ui64++; 744 rw_downgrade(&ip->i_contents); 745 } 746 } 747 748 /* 749 * If error, adjust resid to begin at the first 750 * un-writable byte. 
751 */ 752 if (error == 0) 753 error = newerror; 754 if (error) 755 resid = uio->uio_resid - bytes_written; 756 arg_uio->uio_resid = resid; 757 758 if (!rewrite) { 759 ip->i_flag |= IUPD | ICHG; 760 /* Caller will update i_seq */ 761 TRANS_INODE(ip->i_ufsvfs, ip); 762 } 763 /* 764 * If there is a residual; adjust the EOF if necessary 765 */ 766 if (resid) { 767 if (size != ip->i_size) { 768 if (uio->uio_loffset > size) 769 size = uio->uio_loffset; 770 (void) ufs_itrunc(ip, size, 0, cr); 771 } 772 } 773 774 if (uio == ©_uio) 775 kmem_free(copy_base, copy_resid); 776 777 return (error); 778 779 errout: 780 if (uio == ©_uio) 781 kmem_free(copy_base, copy_resid); 782 783 return (0); 784 } 785 /* 786 * Direct read of a hole 787 */ 788 static int 789 directio_hole(struct uio *uio, size_t nbytes) 790 { 791 int error = 0, nzero; 792 uio_t phys_uio; 793 iovec_t phys_iov; 794 795 ufs_directio_kstats.hole_reads.value.ui64++; 796 ufs_directio_kstats.nread.value.ui64 += nbytes; 797 798 phys_iov.iov_base = uio->uio_iov->iov_base; 799 phys_iov.iov_len = nbytes; 800 801 phys_uio.uio_iov = &phys_iov; 802 phys_uio.uio_iovcnt = 1; 803 phys_uio.uio_resid = phys_iov.iov_len; 804 phys_uio.uio_segflg = uio->uio_segflg; 805 phys_uio.uio_extflg = uio->uio_extflg; 806 while (error == 0 && phys_uio.uio_resid) { 807 nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len); 808 error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ, 809 &phys_uio); 810 } 811 return (error); 812 } 813 814 /* 815 * Direct Read 816 */ 817 int 818 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp) 819 { 820 ssize_t resid, bytes_read; 821 u_offset_t size, uoff; 822 int error, newerror, len; 823 size_t nbytes; 824 struct fs *fs; 825 vnode_t *vp; 826 daddr_t bn; 827 iovec_t *iov; 828 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 829 struct proc *procp; 830 struct as *as; 831 struct directio_buf *tail; 832 833 /* 834 * assume that directio isn't possible (normal case) 835 */ 836 *statusp = DIRECTIO_FAILURE; 837 838 /* 839 * Don't go direct 840 */ 841 if (ufs_directio_enabled == 0) 842 return (0); 843 844 /* 845 * mapped file; nevermind 846 */ 847 if (ip->i_mapcnt) 848 return (0); 849 850 /* 851 * CAN WE DO DIRECT IO? 852 */ 853 /* 854 * must be sector aligned 855 */ 856 uoff = uio->uio_loffset; 857 resid = uio->uio_resid; 858 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1))) 859 return (0); 860 /* 861 * must be short aligned and sector aligned 862 */ 863 iov = uio->uio_iov; 864 nbytes = uio->uio_iovcnt; 865 while (nbytes--) { 866 if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0) 867 return (0); 868 if ((intptr_t)(iov++->iov_base) & 1) 869 return (0); 870 } 871 872 /* 873 * DIRECTIO 874 */ 875 fs = ip->i_fs; 876 877 /* 878 * don't read past EOF 879 */ 880 size = ip->i_size; 881 882 /* 883 * The file offset is past EOF so bail out here; we don't want 884 * to update uio_resid and make it look like we read something. 885 * We say that direct I/O was a success to avoid having rdip() 886 * go through the same "read past EOF logic". 887 */ 888 if (uoff >= size) { 889 *statusp = DIRECTIO_SUCCESS; 890 return (0); 891 } 892 893 /* 894 * The read would extend past EOF so make it smaller. 895 */ 896 if ((uoff + resid) > size) { 897 resid = size - uoff; 898 /* 899 * recheck sector alignment 900 */ 901 if (resid & (DEV_BSIZE - 1)) 902 return (0); 903 } 904 905 /* 906 * At this point, we know there is some real work to do. 
907 */ 908 ASSERT(resid); 909 910 /* 911 * get rid of cached pages 912 */ 913 vp = ITOV(ip); 914 if (vn_has_cached_data(vp)) { 915 rw_exit(&ip->i_contents); 916 rw_enter(&ip->i_contents, RW_WRITER); 917 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr); 918 if (vn_has_cached_data(vp)) 919 return (0); 920 rw_downgrade(&ip->i_contents); 921 ufs_directio_kstats.nflushes.value.ui64++; 922 } 923 /* 924 * Direct Reads 925 */ 926 927 /* 928 * proc and as are for VM operations in directio_start() 929 */ 930 if (uio->uio_segflg == UIO_USERSPACE) { 931 procp = ttoproc(curthread); 932 as = procp->p_as; 933 } else { 934 procp = NULL; 935 as = &kas; 936 } 937 938 *statusp = DIRECTIO_SUCCESS; 939 error = 0; 940 newerror = 0; 941 bytes_read = 0; 942 ufs_directio_kstats.logical_reads.value.ui64++; 943 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) { 944 size_t pglck_len, pglck_size; 945 caddr_t pglck_base; 946 page_t **pplist, **spplist; 947 948 tail = NULL; 949 950 /* 951 * Adjust number of bytes 952 */ 953 iov = uio->uio_iov; 954 pglck_len = (size_t)MIN(iov->iov_len, resid); 955 pglck_base = iov->iov_base; 956 if (pglck_len == 0) { 957 uio->uio_iov++; 958 uio->uio_iovcnt--; 959 continue; 960 } 961 962 /* 963 * Try to Lock down the largest chunck of pages possible. 964 */ 965 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz); 966 error = as_pagelock(as, &pplist, pglck_base, 967 pglck_len, S_WRITE); 968 969 if (error) 970 break; 971 972 pglck_size = pglck_len; 973 while (pglck_len) { 974 975 nbytes = pglck_len; 976 uoff = uio->uio_loffset; 977 978 /* 979 * Re-adjust number of bytes to contiguous range 980 */ 981 len = (ssize_t)blkroundup(fs, nbytes); 982 error = bmap_read(ip, uoff, &bn, &len); 983 if (error) 984 break; 985 986 if (bn == UFS_HOLE) { 987 nbytes = (size_t)MIN(fs->fs_bsize - 988 (long)blkoff(fs, uoff), nbytes); 989 error = directio_hole(uio, nbytes); 990 /* 991 * Hole reads are not added to the list 992 * processed by directio_wait() below so 993 * account for bytes read here. 994 */ 995 if (!error) 996 bytes_read += nbytes; 997 } else { 998 nbytes = (size_t)MIN(nbytes, len); 999 1000 /* 1001 * Get the pagelist pointer for this offset 1002 * to be passed to directio_start. 1003 */ 1004 if (pplist != NULL) 1005 spplist = pplist + 1006 btop((uintptr_t)iov->iov_base - 1007 ((uintptr_t)pglck_base & PAGEMASK)); 1008 else 1009 spplist = NULL; 1010 1011 /* 1012 * Kick off the direct read requests 1013 */ 1014 directio_start(ufsvfsp, ip->i_dev, nbytes, 1015 ldbtob(bn), iov->iov_base, 1016 S_WRITE, procp, &tail, spplist); 1017 } 1018 1019 if (error) 1020 break; 1021 1022 /* 1023 * Adjust pointers and counters 1024 */ 1025 iov->iov_len -= nbytes; 1026 iov->iov_base += nbytes; 1027 uio->uio_loffset += nbytes; 1028 resid -= nbytes; 1029 pglck_len -= nbytes; 1030 } 1031 1032 /* 1033 * Wait for outstanding requests 1034 */ 1035 newerror = directio_wait(tail, &bytes_read); 1036 /* 1037 * Release VM resources 1038 */ 1039 as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE); 1040 1041 } 1042 1043 /* 1044 * If error, adjust resid to begin at the first 1045 * un-read byte. 1046 */ 1047 if (error == 0) 1048 error = newerror; 1049 uio->uio_resid -= bytes_read; 1050 return (error); 1051 } 1052