/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len = 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	uint_t	logical_reads;
	uint_t	phys_reads;
	uint_t	hole_reads;
	uint_t	nread;
	uint_t	logical_writes;
	uint_t	phys_writes;
	uint_t	nwritten;
	uint_t	nflushes;
} ufs_directio_kstats;

kstat_t	*ufs_directio_kstatsp;

/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack.
 * It also avoids semaphore construction/deconstruction
 * per request
 */
struct directio_buf {
	struct directio_buf	*next;
	char		*addr;
	size_t		nbytes;
	struct buf	buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
	return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
	    sizeof (struct directio_buf), 0,
	    directio_buf_constructor, directio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs directio", 0,
	    "UFS DirectIO Stats", "ufs directio",
	    KSTAT_TYPE_RAW, sizeof (ufs_directio_kstats),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*bp;
	int	error;

	/*
	 * Wait for IO to finish
	 */
	bp = &dbp->buf;
	error = biowait(bp);

	/*
	 * bytes_io will be used to figure out a resid
	 * for the caller. The resid is approximated by reporting
	 * the bytes following the first failed IO as the residual.
	 *
	 * I am cautious about using b_resid because I
	 * am not sure how well the disk drivers maintain it.
	 */
	if (error)
		if (bp->b_resid)
			*bytes_iop = bp->b_bcount - bp->b_resid;
		else
			*bytes_iop = 0;
	else
		*bytes_iop += bp->b_bcount;
	/*
	 * Release direct IO resources
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);
	return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */

uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;
	uint_t	kpri_req_save;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	/*
	 * This is the k_pri_req hack. Large numbers of threads
	 * sleeping with kernel priority will cause scheduler thrashing
	 * on an MP machine. This can be seen running Oracle using
	 * directio to ufs files. Sleep at normal priority here to
	 * more closely mimic physio to a device partition. This
	 * workaround is disabled by default as a niced thread could
	 * be starved from running while holding i_rwlock and i_contents.
	 */
	if (ufs_directio_drop_kpri) {
		kpri_req_save = curthread->t_kpri_req;
		curthread->t_kpri_req = 0;
	}
	while ((dbp = tail) != NULL) {
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	if (ufs_directio_drop_kpri)
		curthread->t_kpri_req = kpri_req_save;
	return (error);
}
/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
	struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 *	Note - list is maintained in reverse order.
	 *	directio_wait_one() depends on this fact when
	 *	adjusting the ``bytes_io'' param. bytes_io
	 *	is used to compute a residual in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads++;
		ufs_directio_kstats.nread += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes++;
		ufs_directio_kstats.nwritten += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
	ufsvfsp->vfs_iotstamp = lbolt;
	if (ufsvfsp->vfs_snapshot)
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	if (rw == S_WRITE)
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	else
		lwp_stat_update(LWP_STAT_INBLK, 1);

}

uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done /w lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
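 * Setting ufs_force_posix_sdi to a non-zero value makes every
 * concurrent re-write take the POSIX data-integrity path below,
 * just as if the caller had requested FDSYNC.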
 */
uint32_t	ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
	cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 */
	size = ip->i_size;
	has_holes = -1;

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slow in Direct-Mode
	 *	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 *			allocate lots of pages since pageout
	 *			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned. If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer. This new
	 * buffer will be short aligned and sector aligned.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			uio = &copy_uio;
			break;
		}
		iov++;
	}

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
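	 * If the transfer spans more than one iovec, is larger than the
	 * I/O cluster size, or is not backed by a single contiguous
	 * allocation on disk, i_contents is dropped and re-acquired as
	 * a writer so the re-write is issued with the lock held
	 * exclusively.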
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;
			if (len < resid)
				upgrade = 1;
		}
		if (upgrade) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}


	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
			    NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    BI_ALLOC_ONLY, NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size,
				    BI_NORMAL, NULL, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 */
		if (resid) {
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
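			 * i_contents is re-acquired as a writer for the
			 * page invalidation below and is downgraded again
			 * once the flush has succeeded.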
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */

			if (pplist != NULL)
				spplist = pplist +
				    btop((uintptr_t)iov->iov_base -
				    ((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn),
			    iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_add_32(&ufs_cur_writes, -1);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
			    B_INVAL, cr);
			ufs_directio_kstats.nflushes++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-writable byte.
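	 * bytes_written is maintained by directio_wait_one() and only
	 * approximates the data actually transferred, so the residual
	 * computed here is itself an approximation.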
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;
	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual; adjust the EOF if necessary
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}
/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
	int		error = 0, nzero;
	uio_t		phys_uio;
	iovec_t		phys_iov;

	ufs_directio_kstats.hole_reads++;
	ufs_directio_kstats.nread += nbytes;

	phys_iov.iov_base = uio->uio_iov->iov_base;
	phys_iov.iov_len = nbytes;

	phys_uio.uio_iov = &phys_iov;
	phys_uio.uio_iovcnt = 1;
	phys_uio.uio_resid = phys_iov.iov_len;
	phys_uio.uio_segflg = uio->uio_segflg;
	phys_uio.uio_extflg = uio->uio_extflg;
	while (error == 0 && phys_uio.uio_resid) {
		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
		    &phys_uio);
	}
	return (error);
}

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF logic".
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
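	 * resid is now known to be non-zero, sector aligned, and to lie
	 * entirely below EOF.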
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_WRITE);

		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				nbytes = (size_t)MIN(fs->fs_bsize -
				    (long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					    btop((uintptr_t)iov->iov_base -
					    ((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip->i_dev, nbytes,
				    ldbtob(bn), iov->iov_base,
				    S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}