/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include			/* must be AFTER ! */
#include
#include			/* _FIOIO */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	uint_t	logical_reads;
	uint_t	phys_reads;
	uint_t	hole_reads;
	uint_t	nread;
	uint_t	logical_writes;
	uint_t	phys_writes;
	uint_t	nwritten;
	uint_t	nflushes;
} ufs_directio_kstats;

kstat_t	*ufs_directio_kstatsp;

/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack.
 * It also avoids semaphore construction/destruction
 * per request.
 */
struct directio_buf {
	struct directio_buf	*next;
	char		*addr;
	size_t		nbytes;
	struct buf	buf;
};
static struct kmem_cache *directio_buf_cache;

/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
	return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
	    sizeof (struct directio_buf), 0,
	    directio_buf_constructor, directio_buf_destructor,
	    NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs directio", 0,
	    "UFS DirectIO Stats", "ufs directio",
	    KSTAT_TYPE_RAW, sizeof (ufs_directio_kstats),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for the first direct IO operation to finish
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*bp;
	int	error;

	/*
	 * Wait for IO to finish
	 */
	bp = &dbp->buf;
	error = biowait(bp);

	/*
	 * bytes_io will be used to figure out a resid
	 * for the caller. The resid is approximated by reporting
	 * the bytes following the first failed IO as the residual.
	 *
	 * I am cautious about using b_resid because I
	 * am not sure how well the disk drivers maintain it.
	 */
	if (error)
		if (bp->b_resid)
			*bytes_iop = bp->b_bcount - bp->b_resid;
		else
			*bytes_iop = 0;
	else
		*bytes_iop += bp->b_bcount;

	/*
	 * Release direct IO resources
	 */
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);
	return (error);
}

/*
 * Wait for all of the direct IO operations to finish
 */

uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */

static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;
	uint_t	kpri_req_save;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	/*
	 * This is the k_pri_req hack. Large numbers of threads
	 * sleeping with kernel priority will cause scheduler thrashing
	 * on an MP machine. This can be seen running Oracle using
	 * directio to ufs files. Sleep at normal priority here to
	 * more closely mimic physio to a device partition. This
	 * workaround is disabled by default as a niced thread could
	 * be starved from running while holding i_rwlock and i_contents.
	 */
	if (ufs_directio_drop_kpri) {
		kpri_req_save = curthread->t_kpri_req;
		curthread->t_kpri_req = 0;
	}
	while ((dbp = tail) != NULL) {
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	if (ufs_directio_drop_kpri)
		curthread->t_kpri_req = kpri_req_save;
	return (error);
}

/*
 * Initiate direct IO request
 */
static void
directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
	struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 *	Note - list is maintained in reverse order.
	 *	directio_wait_one() depends on this fact when
	 *	adjusting the ``bytes_io'' param.
	 *	bytes_io is used to compute a residual
	 *	in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads++;
		ufs_directio_kstats.nread += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes++;
		ufs_directio_kstats.nwritten += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
	ufsvfsp->vfs_iotstamp = lbolt;
	if (ufsvfsp->vfs_snapshot)
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	if (rw == S_WRITE)
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	else
		lwp_stat_update(LWP_STAT_INBLK, 1);
}

uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;

/*
 * Direct Write
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
	cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 */
	size = ip->i_size;
	has_holes = -1;

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slow in Direct-Mode
	 *	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 *			allocate lots of pages since pageout
	 *			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned. If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer. This new
	 * buffer will be short aligned and sector aligned.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			uio = &copy_uio;
			break;
		}
		iov++;
	}

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;

			if (len < resid)
				upgrade = 1;
		}

		if (upgrade) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}

	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			error = bmap_write(ip, uoff, (int)(on + n), 1, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size, 0, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 */
		if (resid) {
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_READ);
		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */
			if (pplist != NULL)
				spplist = pplist +
				    btop((uintptr_t)iov->iov_base -
				    ((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn),
			    iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_add_32(&ufs_cur_writes, -1);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
			    B_INVAL, cr);
			ufs_directio_kstats.nflushes++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-writable byte.
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;

	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual, adjust the EOF if necessary
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}

/*
 * Direct read of a hole
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
	int		error = 0, nzero;
	uio_t		phys_uio;
	iovec_t		phys_iov;

	ufs_directio_kstats.hole_reads++;
	ufs_directio_kstats.nread += nbytes;

	phys_iov.iov_base = uio->uio_iov->iov_base;
	phys_iov.iov_len = nbytes;

	phys_uio.uio_iov = &phys_iov;
	phys_uio.uio_iovcnt = 1;
	phys_uio.uio_resid = phys_iov.iov_len;
	phys_uio.uio_segflg = uio->uio_segflg;
	phys_uio.uio_extflg = uio->uio_extflg;
	while (error == 0 && phys_uio.uio_resid) {
		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
		    &phys_uio);
	}
	return (error);
}

/*
 * Direct Read
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * must be short aligned and sector aligned
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF logic".
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads++;
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_WRITE);
		if (error)
			break;

		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				nbytes = (size_t)MIN(fs->fs_bsize -
				    (long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					    btop((uintptr_t)iov->iov_base -
					    ((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip->i_dev, nbytes,
				    ldbtob(bn), iov->iov_base, S_WRITE,
				    procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}