1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #include <sys/types.h> 37 #include <sys/t_lock.h> 38 #include <sys/param.h> 39 #include <sys/time.h> 40 #include <sys/systm.h> 41 #include <sys/sysmacros.h> 42 #include <sys/resource.h> 43 #include <sys/signal.h> 44 #include <sys/cred.h> 45 #include <sys/user.h> 46 #include <sys/buf.h> 47 #include <sys/vfs.h> 48 #include <sys/vnode.h> 49 #include <sys/proc.h> 50 #include <sys/disp.h> 51 #include <sys/file.h> 52 #include <sys/fcntl.h> 53 #include <sys/flock.h> 54 #include <sys/kmem.h> 55 #include <sys/uio.h> 56 #include <sys/dnlc.h> 57 #include <sys/conf.h> 58 #include <sys/mman.h> 59 #include <sys/pathname.h> 60 #include <sys/debug.h> 61 #include <sys/vmsystm.h> 62 #include <sys/cmn_err.h> 63 #include <sys/filio.h> 64 #include <sys/atomic.h> 65 66 #include <sys/fssnap_if.h> 67 #include <sys/fs/ufs_fs.h> 68 #include <sys/fs/ufs_lockfs.h> 69 #include <sys/fs/ufs_filio.h> 70 #include <sys/fs/ufs_inode.h> 71 #include <sys/fs/ufs_fsdir.h> 72 #include <sys/fs/ufs_quota.h> 73 #include <sys/fs/ufs_trans.h> 74 #include <sys/fs/ufs_panic.h> 75 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */ 76 #include <sys/errno.h> 77 78 #include <sys/filio.h> /* _FIOIO */ 79 80 #include <vm/hat.h> 81 #include <vm/page.h> 82 #include <vm/pvn.h> 83 #include <vm/as.h> 84 #include <vm/seg.h> 85 #include <vm/seg_map.h> 86 #include <vm/seg_vn.h> 87 #include <vm/seg_kmem.h> 88 #include <vm/rm.h> 89 #include <sys/swap.h> 90 #include <sys/epm.h> 91 92 #include <fs/fs_subr.h> 93 94 static void *ufs_directio_zero_buf; 95 static int ufs_directio_zero_len = 8192; 96 97 int ufs_directio_enabled = 1; /* feature is enabled */ 98 99 /* 100 * for kstats reader 101 */ 102 struct ufs_directio_kstats { 103 kstat_named_t logical_reads; 104 kstat_named_t phys_reads; 105 kstat_named_t hole_reads; 106 kstat_named_t nread; 107 kstat_named_t logical_writes; 108 kstat_named_t phys_writes; 109 kstat_named_t nwritten; 110 kstat_named_t nflushes; 111 } ufs_directio_kstats = { 112 { "logical_reads", KSTAT_DATA_UINT64 }, 113 { "phys_reads", KSTAT_DATA_UINT64 }, 114 { "hole_reads", KSTAT_DATA_UINT64 }, 115 { "nread", KSTAT_DATA_UINT64 }, 116 { "logical_writes", KSTAT_DATA_UINT64 }, 117 { "phys_writes", KSTAT_DATA_UINT64 }, 118 { "nwritten", KSTAT_DATA_UINT64 }, 119 { "nflushes", KSTAT_DATA_UINT64 }, 120 }; 121 122 kstat_t *ufs_directio_kstatsp; 123 124 /* 125 * use kmem_cache_create for direct-physio buffers. This has shown 126 * a better cache distribution compared to buffers on the 127 * stack. It also avoids semaphore construction/deconstruction 128 * per request 129 */ 130 struct directio_buf { 131 struct directio_buf *next; 132 char *addr; 133 size_t nbytes; 134 struct buf buf; 135 }; 136 static struct kmem_cache *directio_buf_cache; 137 138 139 /* ARGSUSED */ 140 static int 141 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags) 142 { 143 bioinit((struct buf *)&((struct directio_buf *)dbp)->buf); 144 return (0); 145 } 146 147 /* ARGSUSED */ 148 static void 149 directio_buf_destructor(void *dbp, void *cdrarg) 150 { 151 biofini((struct buf *)&((struct directio_buf *)dbp)->buf); 152 } 153 154 void 155 directio_bufs_init(void) 156 { 157 directio_buf_cache = kmem_cache_create("directio_buf_cache", 158 sizeof (struct directio_buf), 0, 159 directio_buf_constructor, directio_buf_destructor, 160 NULL, NULL, NULL, 0); 161 } 162 163 void 164 ufs_directio_init(void) 165 { 166 /* 167 * kstats 168 */ 169 ufs_directio_kstatsp = kstat_create("ufs", 0, 170 "directio", "ufs", KSTAT_TYPE_NAMED, 171 sizeof (ufs_directio_kstats) / sizeof (kstat_named_t), 172 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 173 if (ufs_directio_kstatsp) { 174 ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats; 175 kstat_install(ufs_directio_kstatsp); 176 } 177 /* 178 * kzero is broken so we have to use a private buf of zeroes 179 */ 180 ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP); 181 directio_bufs_init(); 182 } 183 184 /* 185 * Wait for the first direct IO operation to finish 186 */ 187 static int 188 directio_wait_one(struct directio_buf *dbp, long *bytes_iop) 189 { 190 buf_t *bp; 191 int error; 192 193 /* 194 * Wait for IO to finish 195 */ 196 bp = &dbp->buf; 197 error = biowait(bp); 198 199 /* 200 * bytes_io will be used to figure out a resid 201 * for the caller. The resid is approximated by reporting 202 * the bytes following the first failed IO as the residual. 203 * 204 * I am cautious about using b_resid because I 205 * am not sure how well the disk drivers maintain it. 206 */ 207 if (error) 208 if (bp->b_resid) 209 *bytes_iop = bp->b_bcount - bp->b_resid; 210 else 211 *bytes_iop = 0; 212 else 213 *bytes_iop += bp->b_bcount; 214 /* 215 * Release direct IO resources 216 */ 217 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW); 218 kmem_cache_free(directio_buf_cache, dbp); 219 return (error); 220 } 221 222 /* 223 * Wait for all of the direct IO operations to finish 224 */ 225 226 uint32_t ufs_directio_drop_kpri = 0; /* enable kpri hack */ 227 228 static int 229 directio_wait(struct directio_buf *tail, long *bytes_iop) 230 { 231 int error = 0, newerror; 232 struct directio_buf *dbp; 233 uint_t kpri_req_save; 234 235 /* 236 * The linked list of directio buf structures is maintained 237 * in reverse order (tail->last request->penultimate request->...) 238 */ 239 /* 240 * This is the k_pri_req hack. Large numbers of threads 241 * sleeping with kernel priority will cause scheduler thrashing 242 * on an MP machine. This can be seen running Oracle using 243 * directio to ufs files. Sleep at normal priority here to 244 * more closely mimic physio to a device partition. This 245 * workaround is disabled by default as a niced thread could 246 * be starved from running while holding i_rwlock and i_contents. 247 */ 248 if (ufs_directio_drop_kpri) { 249 kpri_req_save = curthread->t_kpri_req; 250 curthread->t_kpri_req = 0; 251 } 252 while ((dbp = tail) != NULL) { 253 tail = dbp->next; 254 newerror = directio_wait_one(dbp, bytes_iop); 255 if (error == 0) 256 error = newerror; 257 } 258 if (ufs_directio_drop_kpri) 259 curthread->t_kpri_req = kpri_req_save; 260 return (error); 261 } 262 /* 263 * Initiate direct IO request 264 */ 265 static void 266 directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes, 267 offset_t offset, char *addr, enum seg_rw rw, struct proc *procp, 268 struct directio_buf **tailp, page_t **pplist) 269 { 270 buf_t *bp; 271 struct directio_buf *dbp; 272 273 /* 274 * Allocate a directio buf header 275 * Note - list is maintained in reverse order. 276 * directio_wait_one() depends on this fact when 277 * adjusting the ``bytes_io'' param. bytes_io 278 * is used to compute a residual in the case of error. 279 */ 280 dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP); 281 dbp->next = *tailp; 282 *tailp = dbp; 283 284 /* 285 * Initialize buf header 286 */ 287 dbp->addr = addr; 288 dbp->nbytes = nbytes; 289 bp = &dbp->buf; 290 bp->b_edev = dev; 291 bp->b_lblkno = btodt(offset); 292 bp->b_bcount = nbytes; 293 bp->b_un.b_addr = addr; 294 bp->b_proc = procp; 295 296 /* 297 * Note that S_WRITE implies B_READ and vice versa: a read(2) 298 * will B_READ data from the filesystem and S_WRITE it into 299 * the user's buffer; a write(2) will S_READ data from the 300 * user's buffer and B_WRITE it to the filesystem. 301 */ 302 if (rw == S_WRITE) { 303 bp->b_flags = B_BUSY | B_PHYS | B_READ; 304 ufs_directio_kstats.phys_reads.value.ui64++; 305 ufs_directio_kstats.nread.value.ui64 += nbytes; 306 } else { 307 bp->b_flags = B_BUSY | B_PHYS | B_WRITE; 308 ufs_directio_kstats.phys_writes.value.ui64++; 309 ufs_directio_kstats.nwritten.value.ui64 += nbytes; 310 } 311 bp->b_shadow = pplist; 312 if (pplist != NULL) 313 bp->b_flags |= B_SHADOW; 314 315 /* 316 * Issue I/O request. 317 */ 318 ufsvfsp->vfs_iotstamp = lbolt; 319 if (ufsvfsp->vfs_snapshot) 320 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp); 321 else 322 (void) bdev_strategy(bp); 323 324 if (rw == S_WRITE) 325 lwp_stat_update(LWP_STAT_OUBLK, 1); 326 else 327 lwp_stat_update(LWP_STAT_INBLK, 1); 328 329 } 330 331 uint32_t ufs_shared_writes; /* writes done w/ lock shared */ 332 uint32_t ufs_cur_writes; /* # concurrent writes */ 333 uint32_t ufs_maxcur_writes; /* high water concurrent writes */ 334 uint32_t ufs_posix_hits; /* writes done /w lock excl. */ 335 336 /* 337 * Force POSIX syncronous data integrity on all writes for testing. 338 */ 339 uint32_t ufs_force_posix_sdi = 0; 340 341 /* 342 * Direct Write 343 */ 344 345 int 346 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite, 347 cred_t *cr, int *statusp) 348 { 349 long resid, bytes_written; 350 u_offset_t size, uoff; 351 uio_t *uio = arg_uio; 352 rlim64_t limit = uio->uio_llimit; 353 int on, n, error, newerror, len, has_holes; 354 daddr_t bn; 355 size_t nbytes; 356 struct fs *fs; 357 vnode_t *vp; 358 iovec_t *iov; 359 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 360 struct proc *procp; 361 struct as *as; 362 struct directio_buf *tail; 363 int exclusive, ncur, bmap_peek; 364 uio_t copy_uio; 365 iovec_t copy_iov; 366 char *copy_base; 367 long copy_resid; 368 369 /* 370 * assume that directio isn't possible (normal case) 371 */ 372 *statusp = DIRECTIO_FAILURE; 373 374 /* 375 * Don't go direct 376 */ 377 if (ufs_directio_enabled == 0) 378 return (0); 379 380 /* 381 * mapped file; nevermind 382 */ 383 if (ip->i_mapcnt) 384 return (0); 385 386 /* 387 * CAN WE DO DIRECT IO? 388 */ 389 uoff = uio->uio_loffset; 390 resid = uio->uio_resid; 391 392 /* 393 * beyond limit 394 */ 395 if (uoff + resid > limit) 396 return (0); 397 398 /* 399 * must be sector aligned 400 */ 401 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1))) 402 return (0); 403 404 /* 405 * SHOULD WE DO DIRECT IO? 406 */ 407 size = ip->i_size; 408 has_holes = -1; 409 410 /* 411 * only on regular files; no metadata 412 */ 413 if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip) 414 return (0); 415 416 /* 417 * Synchronous, allocating writes run very slow in Direct-Mode 418 * XXX - can be fixed with bmap_write changes for large writes!!! 419 * XXX - can be fixed for updates to "almost-full" files 420 * XXX - WARNING - system hangs if bmap_write() has to 421 * allocate lots of pages since pageout 422 * suspends on locked inode 423 */ 424 if (!rewrite && (ip->i_flag & ISYNC)) { 425 if ((uoff + resid) > size) 426 return (0); 427 has_holes = bmap_has_holes(ip); 428 if (has_holes) 429 return (0); 430 } 431 432 /* 433 * Each iovec must be short aligned and sector aligned. If 434 * one is not, then kmem_alloc a new buffer and copy all of 435 * the smaller buffers into the new buffer. This new 436 * buffer will be short aligned and sector aligned. 437 */ 438 iov = uio->uio_iov; 439 nbytes = uio->uio_iovcnt; 440 while (nbytes--) { 441 if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 || 442 (intptr_t)(iov->iov_base) & 1) { 443 copy_resid = uio->uio_resid; 444 copy_base = kmem_alloc(copy_resid, KM_NOSLEEP); 445 if (copy_base == NULL) 446 return (0); 447 copy_iov.iov_base = copy_base; 448 copy_iov.iov_len = copy_resid; 449 copy_uio.uio_iov = ©_iov; 450 copy_uio.uio_iovcnt = 1; 451 copy_uio.uio_segflg = UIO_SYSSPACE; 452 copy_uio.uio_extflg = UIO_COPY_DEFAULT; 453 copy_uio.uio_loffset = uio->uio_loffset; 454 copy_uio.uio_resid = uio->uio_resid; 455 copy_uio.uio_llimit = uio->uio_llimit; 456 error = uiomove(copy_base, copy_resid, UIO_WRITE, uio); 457 if (error) { 458 kmem_free(copy_base, copy_resid); 459 return (0); 460 } 461 uio = ©_uio; 462 break; 463 } 464 iov++; 465 } 466 467 /* 468 * From here on down, all error exits must go to errout and 469 * not simply return a 0. 470 */ 471 472 /* 473 * DIRECTIO 474 */ 475 476 fs = ip->i_fs; 477 478 /* 479 * POSIX check. If attempting a concurrent re-write, make sure 480 * that this will be a single request to the driver to meet 481 * POSIX synchronous data integrity requirements. 482 */ 483 bmap_peek = 0; 484 if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) { 485 int upgrade = 0; 486 487 /* check easy conditions first */ 488 if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) { 489 upgrade = 1; 490 } else { 491 /* now look for contiguous allocation */ 492 len = (ssize_t)blkroundup(fs, resid); 493 error = bmap_read(ip, uoff, &bn, &len); 494 if (error || bn == UFS_HOLE || len == 0) 495 goto errout; 496 /* save a call to bmap_read later */ 497 bmap_peek = 1; 498 if (len < resid) 499 upgrade = 1; 500 } 501 if (upgrade) { 502 rw_exit(&ip->i_contents); 503 rw_enter(&ip->i_contents, RW_WRITER); 504 ufs_posix_hits++; 505 } 506 } 507 508 509 /* 510 * allocate space 511 */ 512 513 /* 514 * If attempting a re-write, there is no allocation to do. 515 * bmap_write would trip an ASSERT if i_contents is held shared. 516 */ 517 if (rewrite) 518 goto skip_alloc; 519 520 do { 521 on = (int)blkoff(fs, uoff); 522 n = (int)MIN(fs->fs_bsize - on, resid); 523 if ((uoff + n) > ip->i_size) { 524 error = bmap_write(ip, uoff, (int)(on + n), 525 (int)(uoff & (offset_t)MAXBOFFSET) == 0, 526 NULL, cr); 527 /* Caller is responsible for updating i_seq if needed */ 528 if (error) 529 break; 530 ip->i_size = uoff + n; 531 ip->i_flag |= IATTCHG; 532 } else if (n == MAXBSIZE) { 533 error = bmap_write(ip, uoff, (int)(on + n), 534 BI_ALLOC_ONLY, NULL, cr); 535 /* Caller is responsible for updating i_seq if needed */ 536 } else { 537 if (has_holes < 0) 538 has_holes = bmap_has_holes(ip); 539 if (has_holes) { 540 uint_t blk_size; 541 u_offset_t offset; 542 543 offset = uoff & (offset_t)fs->fs_bmask; 544 blk_size = (int)blksize(fs, ip, 545 (daddr_t)lblkno(fs, offset)); 546 error = bmap_write(ip, uoff, blk_size, 547 BI_NORMAL, NULL, cr); 548 /* 549 * Caller is responsible for updating 550 * i_seq if needed 551 */ 552 } else 553 error = 0; 554 } 555 if (error) 556 break; 557 uoff += n; 558 resid -= n; 559 /* 560 * if file has grown larger than 2GB, set flag 561 * in superblock if not already set 562 */ 563 if ((ip->i_size > MAXOFF32_T) && 564 !(fs->fs_flags & FSLARGEFILES)) { 565 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 566 mutex_enter(&ufsvfsp->vfs_lock); 567 fs->fs_flags |= FSLARGEFILES; 568 ufs_sbwrite(ufsvfsp); 569 mutex_exit(&ufsvfsp->vfs_lock); 570 } 571 } while (resid); 572 573 if (error) { 574 /* 575 * restore original state 576 */ 577 if (resid) { 578 if (size == ip->i_size) 579 goto errout; 580 (void) ufs_itrunc(ip, size, 0, cr); 581 } 582 /* 583 * try non-directio path 584 */ 585 goto errout; 586 } 587 skip_alloc: 588 589 /* 590 * get rid of cached pages 591 */ 592 vp = ITOV(ip); 593 exclusive = rw_write_held(&ip->i_contents); 594 if (vn_has_cached_data(vp)) { 595 if (!exclusive) { 596 /* 597 * Still holding i_rwlock, so no allocations 598 * can happen after dropping contents. 599 */ 600 rw_exit(&ip->i_contents); 601 rw_enter(&ip->i_contents, RW_WRITER); 602 } 603 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr); 604 if (vn_has_cached_data(vp)) 605 goto errout; 606 if (!exclusive) 607 rw_downgrade(&ip->i_contents); 608 ufs_directio_kstats.nflushes.value.ui64++; 609 } 610 611 /* 612 * Direct Writes 613 */ 614 615 if (!exclusive) { 616 ufs_shared_writes++; 617 ncur = atomic_add_32_nv(&ufs_cur_writes, 1); 618 if (ncur > ufs_maxcur_writes) 619 ufs_maxcur_writes = ncur; 620 } 621 622 /* 623 * proc and as are for VM operations in directio_start() 624 */ 625 if (uio->uio_segflg == UIO_USERSPACE) { 626 procp = ttoproc(curthread); 627 as = procp->p_as; 628 } else { 629 procp = NULL; 630 as = &kas; 631 } 632 *statusp = DIRECTIO_SUCCESS; 633 error = 0; 634 newerror = 0; 635 resid = uio->uio_resid; 636 bytes_written = 0; 637 ufs_directio_kstats.logical_writes.value.ui64++; 638 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) { 639 size_t pglck_len, pglck_size; 640 caddr_t pglck_base; 641 page_t **pplist, **spplist; 642 643 tail = NULL; 644 645 /* 646 * Adjust number of bytes 647 */ 648 iov = uio->uio_iov; 649 pglck_len = (size_t)MIN(iov->iov_len, resid); 650 pglck_base = iov->iov_base; 651 if (pglck_len == 0) { 652 uio->uio_iov++; 653 uio->uio_iovcnt--; 654 continue; 655 } 656 657 /* 658 * Try to Lock down the largest chunck of pages possible. 659 */ 660 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz); 661 error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ); 662 663 if (error) 664 break; 665 666 pglck_size = pglck_len; 667 while (pglck_len) { 668 669 nbytes = pglck_len; 670 uoff = uio->uio_loffset; 671 672 if (!bmap_peek) { 673 674 /* 675 * Re-adjust number of bytes to contiguous 676 * range. May have already called bmap_read 677 * in the case of a concurrent rewrite. 678 */ 679 len = (ssize_t)blkroundup(fs, nbytes); 680 error = bmap_read(ip, uoff, &bn, &len); 681 if (error) 682 break; 683 if (bn == UFS_HOLE || len == 0) 684 break; 685 } 686 nbytes = (size_t)MIN(nbytes, len); 687 bmap_peek = 0; 688 689 /* 690 * Get the pagelist pointer for this offset to be 691 * passed to directio_start. 692 */ 693 694 if (pplist != NULL) 695 spplist = pplist + 696 btop((uintptr_t)iov->iov_base - 697 ((uintptr_t)pglck_base & PAGEMASK)); 698 else 699 spplist = NULL; 700 701 /* 702 * Kick off the direct write requests 703 */ 704 directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn), 705 iov->iov_base, S_READ, procp, &tail, spplist); 706 707 /* 708 * Adjust pointers and counters 709 */ 710 iov->iov_len -= nbytes; 711 iov->iov_base += nbytes; 712 uio->uio_loffset += nbytes; 713 resid -= nbytes; 714 pglck_len -= nbytes; 715 } 716 717 /* 718 * Wait for outstanding requests 719 */ 720 newerror = directio_wait(tail, &bytes_written); 721 722 /* 723 * Release VM resources 724 */ 725 as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ); 726 727 } 728 729 if (!exclusive) { 730 atomic_add_32(&ufs_cur_writes, -1); 731 /* 732 * If this write was done shared, readers may 733 * have pulled in unmodified pages. Get rid of 734 * these potentially stale pages. 735 */ 736 if (vn_has_cached_data(vp)) { 737 rw_exit(&ip->i_contents); 738 rw_enter(&ip->i_contents, RW_WRITER); 739 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, 740 B_INVAL, cr); 741 ufs_directio_kstats.nflushes.value.ui64++; 742 rw_downgrade(&ip->i_contents); 743 } 744 } 745 746 /* 747 * If error, adjust resid to begin at the first 748 * un-writable byte. 749 */ 750 if (error == 0) 751 error = newerror; 752 if (error) 753 resid = uio->uio_resid - bytes_written; 754 arg_uio->uio_resid = resid; 755 756 if (!rewrite) { 757 ip->i_flag |= IUPD | ICHG; 758 /* Caller will update i_seq */ 759 TRANS_INODE(ip->i_ufsvfs, ip); 760 } 761 /* 762 * If there is a residual; adjust the EOF if necessary 763 */ 764 if (resid) { 765 if (size != ip->i_size) { 766 if (uio->uio_loffset > size) 767 size = uio->uio_loffset; 768 (void) ufs_itrunc(ip, size, 0, cr); 769 } 770 } 771 772 if (uio == ©_uio) 773 kmem_free(copy_base, copy_resid); 774 775 return (error); 776 777 errout: 778 if (uio == ©_uio) 779 kmem_free(copy_base, copy_resid); 780 781 return (0); 782 } 783 /* 784 * Direct read of a hole 785 */ 786 static int 787 directio_hole(struct uio *uio, size_t nbytes) 788 { 789 int error = 0, nzero; 790 uio_t phys_uio; 791 iovec_t phys_iov; 792 793 ufs_directio_kstats.hole_reads.value.ui64++; 794 ufs_directio_kstats.nread.value.ui64 += nbytes; 795 796 phys_iov.iov_base = uio->uio_iov->iov_base; 797 phys_iov.iov_len = nbytes; 798 799 phys_uio.uio_iov = &phys_iov; 800 phys_uio.uio_iovcnt = 1; 801 phys_uio.uio_resid = phys_iov.iov_len; 802 phys_uio.uio_segflg = uio->uio_segflg; 803 phys_uio.uio_extflg = uio->uio_extflg; 804 while (error == 0 && phys_uio.uio_resid) { 805 nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len); 806 error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ, 807 &phys_uio); 808 } 809 return (error); 810 } 811 812 /* 813 * Direct Read 814 */ 815 int 816 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp) 817 { 818 ssize_t resid, bytes_read; 819 u_offset_t size, uoff; 820 int error, newerror, len; 821 size_t nbytes; 822 struct fs *fs; 823 vnode_t *vp; 824 daddr_t bn; 825 iovec_t *iov; 826 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 827 struct proc *procp; 828 struct as *as; 829 struct directio_buf *tail; 830 831 /* 832 * assume that directio isn't possible (normal case) 833 */ 834 *statusp = DIRECTIO_FAILURE; 835 836 /* 837 * Don't go direct 838 */ 839 if (ufs_directio_enabled == 0) 840 return (0); 841 842 /* 843 * mapped file; nevermind 844 */ 845 if (ip->i_mapcnt) 846 return (0); 847 848 /* 849 * CAN WE DO DIRECT IO? 850 */ 851 /* 852 * must be sector aligned 853 */ 854 uoff = uio->uio_loffset; 855 resid = uio->uio_resid; 856 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1))) 857 return (0); 858 /* 859 * must be short aligned and sector aligned 860 */ 861 iov = uio->uio_iov; 862 nbytes = uio->uio_iovcnt; 863 while (nbytes--) { 864 if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0) 865 return (0); 866 if ((intptr_t)(iov++->iov_base) & 1) 867 return (0); 868 } 869 870 /* 871 * DIRECTIO 872 */ 873 fs = ip->i_fs; 874 875 /* 876 * don't read past EOF 877 */ 878 size = ip->i_size; 879 880 /* 881 * The file offset is past EOF so bail out here; we don't want 882 * to update uio_resid and make it look like we read something. 883 * We say that direct I/O was a success to avoid having rdip() 884 * go through the same "read past EOF logic". 885 */ 886 if (uoff >= size) { 887 *statusp = DIRECTIO_SUCCESS; 888 return (0); 889 } 890 891 /* 892 * The read would extend past EOF so make it smaller. 893 */ 894 if ((uoff + resid) > size) { 895 resid = size - uoff; 896 /* 897 * recheck sector alignment 898 */ 899 if (resid & (DEV_BSIZE - 1)) 900 return (0); 901 } 902 903 /* 904 * At this point, we know there is some real work to do. 905 */ 906 ASSERT(resid); 907 908 /* 909 * get rid of cached pages 910 */ 911 vp = ITOV(ip); 912 if (vn_has_cached_data(vp)) { 913 rw_exit(&ip->i_contents); 914 rw_enter(&ip->i_contents, RW_WRITER); 915 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr); 916 if (vn_has_cached_data(vp)) 917 return (0); 918 rw_downgrade(&ip->i_contents); 919 ufs_directio_kstats.nflushes.value.ui64++; 920 } 921 /* 922 * Direct Reads 923 */ 924 925 /* 926 * proc and as are for VM operations in directio_start() 927 */ 928 if (uio->uio_segflg == UIO_USERSPACE) { 929 procp = ttoproc(curthread); 930 as = procp->p_as; 931 } else { 932 procp = NULL; 933 as = &kas; 934 } 935 936 *statusp = DIRECTIO_SUCCESS; 937 error = 0; 938 newerror = 0; 939 bytes_read = 0; 940 ufs_directio_kstats.logical_reads.value.ui64++; 941 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) { 942 size_t pglck_len, pglck_size; 943 caddr_t pglck_base; 944 page_t **pplist, **spplist; 945 946 tail = NULL; 947 948 /* 949 * Adjust number of bytes 950 */ 951 iov = uio->uio_iov; 952 pglck_len = (size_t)MIN(iov->iov_len, resid); 953 pglck_base = iov->iov_base; 954 if (pglck_len == 0) { 955 uio->uio_iov++; 956 uio->uio_iovcnt--; 957 continue; 958 } 959 960 /* 961 * Try to Lock down the largest chunck of pages possible. 962 */ 963 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz); 964 error = as_pagelock(as, &pplist, pglck_base, 965 pglck_len, S_WRITE); 966 967 if (error) 968 break; 969 970 pglck_size = pglck_len; 971 while (pglck_len) { 972 973 nbytes = pglck_len; 974 uoff = uio->uio_loffset; 975 976 /* 977 * Re-adjust number of bytes to contiguous range 978 */ 979 len = (ssize_t)blkroundup(fs, nbytes); 980 error = bmap_read(ip, uoff, &bn, &len); 981 if (error) 982 break; 983 984 if (bn == UFS_HOLE) { 985 nbytes = (size_t)MIN(fs->fs_bsize - 986 (long)blkoff(fs, uoff), nbytes); 987 error = directio_hole(uio, nbytes); 988 /* 989 * Hole reads are not added to the list 990 * processed by directio_wait() below so 991 * account for bytes read here. 992 */ 993 if (!error) 994 bytes_read += nbytes; 995 } else { 996 nbytes = (size_t)MIN(nbytes, len); 997 998 /* 999 * Get the pagelist pointer for this offset 1000 * to be passed to directio_start. 1001 */ 1002 if (pplist != NULL) 1003 spplist = pplist + 1004 btop((uintptr_t)iov->iov_base - 1005 ((uintptr_t)pglck_base & PAGEMASK)); 1006 else 1007 spplist = NULL; 1008 1009 /* 1010 * Kick off the direct read requests 1011 */ 1012 directio_start(ufsvfsp, ip->i_dev, nbytes, 1013 ldbtob(bn), iov->iov_base, 1014 S_WRITE, procp, &tail, spplist); 1015 } 1016 1017 if (error) 1018 break; 1019 1020 /* 1021 * Adjust pointers and counters 1022 */ 1023 iov->iov_len -= nbytes; 1024 iov->iov_base += nbytes; 1025 uio->uio_loffset += nbytes; 1026 resid -= nbytes; 1027 pglck_len -= nbytes; 1028 } 1029 1030 /* 1031 * Wait for outstanding requests 1032 */ 1033 newerror = directio_wait(tail, &bytes_read); 1034 /* 1035 * Release VM resources 1036 */ 1037 as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE); 1038 1039 } 1040 1041 /* 1042 * If error, adjust resid to begin at the first 1043 * un-read byte. 1044 */ 1045 if (error == 0) 1046 error = newerror; 1047 uio->uio_resid -= bytes_read; 1048 return (error); 1049 } 1050