/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int32_t udf_open(struct vnode **, int32_t, struct cred *, caller_context_t *); static int32_t udf_close(struct vnode *, int32_t, int32_t, offset_t, struct cred *, caller_context_t *); static int32_t udf_read(struct vnode *, struct uio *, int32_t, struct cred *, caller_context_t *); static int32_t udf_write(struct vnode *, struct uio *, int32_t, struct cred *, caller_context_t *); static int32_t udf_ioctl(struct vnode *, int32_t, intptr_t, int32_t, struct cred *, int32_t *, caller_context_t *); static int32_t udf_getattr(struct vnode *, struct vattr *, int32_t, struct cred *, caller_context_t *); static int32_t 
udf_setattr(struct vnode *, struct vattr *, int32_t, struct cred *, caller_context_t *); static int32_t udf_access(struct vnode *, int32_t, int32_t, struct cred *, caller_context_t *); static int32_t udf_lookup(struct vnode *, char *, struct vnode **, struct pathname *, int32_t, struct vnode *, struct cred *, caller_context_t *, int *, pathname_t *); static int32_t udf_create(struct vnode *, char *, struct vattr *, enum vcexcl, int32_t, struct vnode **, struct cred *, int32_t, caller_context_t *, vsecattr_t *); static int32_t udf_remove(struct vnode *, char *, struct cred *, caller_context_t *, int); static int32_t udf_link(struct vnode *, struct vnode *, char *, struct cred *, caller_context_t *, int); static int32_t udf_rename(struct vnode *, char *, struct vnode *, char *, struct cred *, caller_context_t *, int); static int32_t udf_mkdir(struct vnode *, char *, struct vattr *, struct vnode **, struct cred *, caller_context_t *, int, vsecattr_t *); static int32_t udf_rmdir(struct vnode *, char *, struct vnode *, struct cred *, caller_context_t *, int); static int32_t udf_readdir(struct vnode *, struct uio *, struct cred *, int32_t *, caller_context_t *, int); static int32_t udf_symlink(struct vnode *, char *, struct vattr *, char *, struct cred *, caller_context_t *, int); static int32_t udf_readlink(struct vnode *, struct uio *, struct cred *, caller_context_t *); static int32_t udf_fsync(struct vnode *, int32_t, struct cred *, caller_context_t *); static void udf_inactive(struct vnode *, struct cred *, caller_context_t *); static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *); static int udf_rwlock(struct vnode *, int32_t, caller_context_t *); static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *); static int32_t udf_seek(struct vnode *, offset_t, offset_t *, caller_context_t *); static int32_t udf_frlock(struct vnode *, int32_t, struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *, caller_context_t *); static 
int32_t udf_space(struct vnode *, int32_t, struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *); static int32_t udf_getpage(struct vnode *, offset_t, size_t, uint32_t *, struct page **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *); static int32_t udf_putpage(struct vnode *, offset_t, size_t, int32_t, struct cred *, caller_context_t *); static int32_t udf_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *, caller_context_t *); static int32_t udf_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *, caller_context_t *); static int32_t udf_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *, caller_context_t *); static int32_t udf_l_pathconf(struct vnode *, int32_t, ulong_t *, struct cred *, caller_context_t *); static int32_t udf_pageio(struct vnode *, struct page *, u_offset_t, size_t, int32_t, struct cred *, caller_context_t *); int32_t ud_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *, caddr_t, page_t *pl[], size_t, enum seg_rw, int32_t); void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *); int32_t ud_page_fill(struct ud_inode *, page_t *, u_offset_t, uint32_t, u_offset_t *); int32_t ud_iodone(struct buf *); int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *); int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *); int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t); int32_t ud_slave_done(struct buf *); /* * Structures to control multiple IO operations to get or put pages * that are backed by discontiguous blocks. The master struct is * a dummy that holds the original bp from pageio_setup. The * slave struct holds the working bp's to do the actual IO. Once * all the slave IOs complete. 
The master is processed as if a single * IO op has completed. */ uint32_t master_index = 0; typedef struct mio_master { kmutex_t mm_mutex; /* protect the fields below */ int32_t mm_size; buf_t *mm_bp; /* original bp */ int32_t mm_resid; /* bytes remaining to transfer */ int32_t mm_error; /* accumulated error from slaves */ int32_t mm_index; /* XXX debugging */ } mio_master_t; typedef struct mio_slave { buf_t ms_buf; /* working buffer for this IO chunk */ mio_master_t *ms_ptr; /* pointer to master */ } mio_slave_t; struct vnodeops *udf_vnodeops; const fs_operation_def_t udf_vnodeops_template[] = { VOPNAME_OPEN, { .vop_open = udf_open }, VOPNAME_CLOSE, { .vop_close = udf_close }, VOPNAME_READ, { .vop_read = udf_read }, VOPNAME_WRITE, { .vop_write = udf_write }, VOPNAME_IOCTL, { .vop_ioctl = udf_ioctl }, VOPNAME_GETATTR, { .vop_getattr = udf_getattr }, VOPNAME_SETATTR, { .vop_setattr = udf_setattr }, VOPNAME_ACCESS, { .vop_access = udf_access }, VOPNAME_LOOKUP, { .vop_lookup = udf_lookup }, VOPNAME_CREATE, { .vop_create = udf_create }, VOPNAME_REMOVE, { .vop_remove = udf_remove }, VOPNAME_LINK, { .vop_link = udf_link }, VOPNAME_RENAME, { .vop_rename = udf_rename }, VOPNAME_MKDIR, { .vop_mkdir = udf_mkdir }, VOPNAME_RMDIR, { .vop_rmdir = udf_rmdir }, VOPNAME_READDIR, { .vop_readdir = udf_readdir }, VOPNAME_SYMLINK, { .vop_symlink = udf_symlink }, VOPNAME_READLINK, { .vop_readlink = udf_readlink }, VOPNAME_FSYNC, { .vop_fsync = udf_fsync }, VOPNAME_INACTIVE, { .vop_inactive = udf_inactive }, VOPNAME_FID, { .vop_fid = udf_fid }, VOPNAME_RWLOCK, { .vop_rwlock = udf_rwlock }, VOPNAME_RWUNLOCK, { .vop_rwunlock = udf_rwunlock }, VOPNAME_SEEK, { .vop_seek = udf_seek }, VOPNAME_FRLOCK, { .vop_frlock = udf_frlock }, VOPNAME_SPACE, { .vop_space = udf_space }, VOPNAME_GETPAGE, { .vop_getpage = udf_getpage }, VOPNAME_PUTPAGE, { .vop_putpage = udf_putpage }, VOPNAME_MAP, { .vop_map = udf_map }, VOPNAME_ADDMAP, { .vop_addmap = udf_addmap }, VOPNAME_DELMAP, { .vop_delmap = udf_delmap 
}, VOPNAME_PATHCONF, { .vop_pathconf = udf_l_pathconf }, VOPNAME_PAGEIO, { .vop_pageio = udf_pageio }, VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, NULL, NULL }; /* ARGSUSED */ static int32_t udf_open( struct vnode **vpp, int32_t flag, struct cred *cr, caller_context_t *ct) { ud_printf("udf_open\n"); return (0); } /* ARGSUSED */ static int32_t udf_close( struct vnode *vp, int32_t flag, int32_t count, offset_t offset, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_close\n"); ITIMES(ip); cleanlocks(vp, ttoproc(curthread)->p_pid, 0); cleanshares(vp, ttoproc(curthread)->p_pid); /* * Push partially filled cluster at last close. * ``last close'' is approximated because the dnlc * may have a hold on the vnode. */ if (vp->v_count <= 2 && vp->v_type != VBAD) { struct ud_inode *ip = VTOI(vp); if (ip->i_delaylen) { (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen, B_ASYNC | B_FREE, cr); ip->i_delaylen = 0; } } return (0); } /* ARGSUSED */ static int32_t udf_read( struct vnode *vp, struct uio *uiop, int32_t ioflag, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); int32_t error; ud_printf("udf_read\n"); #ifdef __lock_lint rw_enter(&ip->i_rwlock, RW_READER); #endif ASSERT(RW_READ_HELD(&ip->i_rwlock)); if (MANDLOCK(vp, ip->i_char)) { /* * udf_getattr ends up being called by chklock */ error = chklock(vp, FREAD, uiop->uio_loffset, uiop->uio_resid, uiop->uio_fmode, ct); if (error) { goto end; } } rw_enter(&ip->i_contents, RW_READER); error = ud_rdip(ip, uiop, ioflag, cr); rw_exit(&ip->i_contents); end: #ifdef __lock_lint rw_exit(&ip->i_rwlock); #endif return (error); } int32_t ud_WRITES = 1; int32_t ud_HW = 96 * 1024; int32_t ud_LW = 64 * 1024; int32_t ud_throttles = 0; /* ARGSUSED */ static int32_t udf_write( struct vnode *vp, struct uio *uiop, int32_t ioflag, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); int32_t error = 0; ud_printf("udf_write\n"); #ifdef __lock_lint 
rw_enter(&ip->i_rwlock, RW_WRITER); #endif ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); if (MANDLOCK(vp, ip->i_char)) { /* * ud_getattr ends up being called by chklock */ error = chklock(vp, FWRITE, uiop->uio_loffset, uiop->uio_resid, uiop->uio_fmode, ct); if (error) { goto end; } } /* * Throttle writes. */ mutex_enter(&ip->i_tlock); if (ud_WRITES && (ip->i_writes > ud_HW)) { while (ip->i_writes > ud_HW) { ud_throttles++; cv_wait(&ip->i_wrcv, &ip->i_tlock); } } mutex_exit(&ip->i_tlock); /* * Write to the file */ rw_enter(&ip->i_contents, RW_WRITER); if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) { /* * In append mode start at end of file. */ uiop->uio_loffset = ip->i_size; } error = ud_wrip(ip, uiop, ioflag, cr); rw_exit(&ip->i_contents); end: #ifdef __lock_lint rw_exit(&ip->i_rwlock); #endif return (error); } /* ARGSUSED */ static int32_t udf_ioctl( struct vnode *vp, int32_t cmd, intptr_t arg, int32_t flag, struct cred *cr, int32_t *rvalp, caller_context_t *ct) { return (ENOTTY); } /* ARGSUSED */ static int32_t udf_getattr( struct vnode *vp, struct vattr *vap, int32_t flags, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_getattr\n"); if (vap->va_mask == AT_SIZE) { /* * for performance, if only the size is requested don't bother * with anything else. 
*/ vap->va_size = ip->i_size; return (0); } rw_enter(&ip->i_contents, RW_READER); vap->va_type = vp->v_type; vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_fsid = ip->i_dev; vap->va_nodeid = ip->i_icb_lbano; vap->va_nlink = ip->i_nlink; vap->va_size = ip->i_size; vap->va_seq = ip->i_seq; if (vp->v_type == VCHR || vp->v_type == VBLK) { vap->va_rdev = ip->i_rdev; } else { vap->va_rdev = 0; } mutex_enter(&ip->i_tlock); ITIMES_NOLOCK(ip); /* mark correct time in inode */ vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; vap->va_atime.tv_nsec = ip->i_atime.tv_nsec; vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec; vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec; mutex_exit(&ip->i_tlock); switch (ip->i_type) { case VBLK: vap->va_blksize = MAXBSIZE; break; case VCHR: vap->va_blksize = MAXBSIZE; break; default: vap->va_blksize = ip->i_udf->udf_lbsize; break; } vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift; rw_exit(&ip->i_contents); return (0); } static int ud_iaccess_vmode(void *ip, int mode, struct cred *cr) { return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0)); } /*ARGSUSED4*/ static int32_t udf_setattr( struct vnode *vp, struct vattr *vap, int32_t flags, struct cred *cr, caller_context_t *ct) { int32_t error = 0; uint32_t mask = vap->va_mask; struct ud_inode *ip; timestruc_t now; struct vattr ovap; ud_printf("udf_setattr\n"); ip = VTOI(vp); /* * not updates allowed to 4096 files */ if (ip->i_astrat == STRAT_TYPE4096) { return (EINVAL); } /* * Cannot set these attributes */ if (mask & AT_NOSET) { return (EINVAL); } rw_enter(&ip->i_rwlock, RW_WRITER); rw_enter(&ip->i_contents, RW_WRITER); ovap.va_uid = ip->i_uid; ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char; error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags, ud_iaccess_vmode, ip); if (error) goto update_inode; mask = vap->va_mask; /* * Change 
file access modes. */ if (mask & AT_MODE) { ip->i_perm = VA2UD_PERM(vap->va_mode); ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX); mutex_enter(&ip->i_tlock); ip->i_flag |= ICHG; mutex_exit(&ip->i_tlock); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { ip->i_uid = vap->va_uid; } if (mask & AT_GID) { ip->i_gid = vap->va_gid; } mutex_enter(&ip->i_tlock); ip->i_flag |= ICHG; mutex_exit(&ip->i_tlock); } /* * Truncate file. Must have write permission and not be a directory. */ if (mask & AT_SIZE) { if (vp->v_type == VDIR) { error = EISDIR; goto update_inode; } if (error = ud_iaccess(ip, IWRITE, cr, 0)) { goto update_inode; } if (vap->va_size > MAXOFFSET_T) { error = EFBIG; goto update_inode; } if (error = ud_itrunc(ip, vap->va_size, 0, cr)) { goto update_inode; } } /* * Change file access or modified times. */ if (mask & (AT_ATIME|AT_MTIME)) { mutex_enter(&ip->i_tlock); if (mask & AT_ATIME) { ip->i_atime.tv_sec = vap->va_atime.tv_sec; ip->i_atime.tv_nsec = vap->va_atime.tv_nsec; ip->i_flag &= ~IACC; } if (mask & AT_MTIME) { ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec; gethrestime(&now); ip->i_ctime.tv_sec = now.tv_sec; ip->i_ctime.tv_nsec = now.tv_nsec; ip->i_flag &= ~(IUPD|ICHG); ip->i_flag |= IMODTIME; } ip->i_flag |= IMOD; mutex_exit(&ip->i_tlock); } update_inode: if (curthread->t_flag & T_DONTPEND) { ud_iupdat(ip, 1); } else { ITIMES_NOLOCK(ip); } rw_exit(&ip->i_contents); rw_exit(&ip->i_rwlock); return (error); } /* ARGSUSED */ static int32_t udf_access( struct vnode *vp, int32_t mode, int32_t flags, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_access\n"); if (ip->i_udf == NULL) { return (EIO); } return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1)); } int32_t udfs_stickyhack = 1; /* ARGSUSED */ static int32_t udf_lookup( struct vnode *dvp, char *nm, struct vnode **vpp, struct pathname *pnp, int32_t flags, struct vnode *rdir, struct cred *cr, caller_context_t *ct, int 
*direntflags, pathname_t *realpnp) { int32_t error; struct vnode *vp; struct ud_inode *ip, *xip; ud_printf("udf_lookup\n"); /* * Null component name is a synonym for directory being searched. */ if (*nm == '\0') { VN_HOLD(dvp); *vpp = dvp; error = 0; goto out; } /* * Fast path: Check the directory name lookup cache. */ ip = VTOI(dvp); if (vp = dnlc_lookup(dvp, nm)) { /* * Check accessibility of directory. */ if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) { VN_RELE(vp); } xip = VTOI(vp); } else { error = ud_dirlook(ip, nm, &xip, cr, 1); ITIMES(ip); } if (error == 0) { ip = xip; *vpp = ITOV(ip); if ((ip->i_type != VDIR) && (ip->i_char & ISVTX) && ((ip->i_perm & IEXEC) == 0) && udfs_stickyhack) { mutex_enter(&(*vpp)->v_lock); (*vpp)->v_flag |= VISSWAP; mutex_exit(&(*vpp)->v_lock); } ITIMES(ip); /* * If vnode is a device return special vnode instead. */ if (IS_DEVVP(*vpp)) { struct vnode *newvp; newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); VN_RELE(*vpp); if (newvp == NULL) { error = ENOSYS; } else { *vpp = newvp; } } } out: return (error); } /* ARGSUSED */ static int32_t udf_create( struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl, int32_t mode, struct vnode **vpp, struct cred *cr, int32_t flag, caller_context_t *ct, vsecattr_t *vsecp) { int32_t error; struct ud_inode *ip = VTOI(dvp), *xip; ud_printf("udf_create\n"); if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) vap->va_mode &= ~VSVTX; if (*name == '\0') { /* * Null component name refers to the directory itself. 
*/
		VN_HOLD(dvp);
		ITIMES(ip);
		error = EEXIST;
	} else {
		xip = NULL;
		rw_enter(&ip->i_rwlock, RW_WRITER);
		error = ud_direnter(ip, name, DE_CREATE,
		    (struct ud_inode *)0, (struct ud_inode *)0,
		    vap, &xip, cr, ct);
		rw_exit(&ip->i_rwlock);
		ITIMES(ip);
		/* from here on ip is the created/found inode, or NULL */
		ip = xip;
	}
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#else
	if (ip != NULL) {
		rw_enter(&ip->i_contents, RW_WRITER);
	}
#endif

	/*
	 * If the file already exists and this is a non-exclusive create,
	 * check permissions and allow access for non-directories.
	 * Read-only create of an existing directory is also allowed.
	 * We fail an exclusive create of anything which already exists.
	 */
	if (error == EEXIST) {
		if (excl == NONEXCL) {
			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
				error = EISDIR;
			} else if (mode) {
				error = ud_iaccess(ip,
				    UD_UPERM2DPERM(mode), cr, 0);
			} else {
				error = 0;
			}
		}
		if (error) {
			rw_exit(&ip->i_contents);
			VN_RELE(ITOV(ip));
			goto out;
		} else if ((ip->i_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
			/*
			 * Truncate regular files, if requested by caller.
			 * Grab i_rwlock to make sure no one else is
			 * currently writing to the file (we promised
			 * bmap we would do this).
			 * Must get the locks in the correct order.
			 */
			if (ip->i_size == 0) {
				ip->i_flag |= ICHG | IUPD;
			} else {
				rw_exit(&ip->i_contents);
				rw_enter(&ip->i_rwlock, RW_WRITER);
				rw_enter(&ip->i_contents, RW_WRITER);
				(void) ud_itrunc(ip, 0, 0, cr);
				rw_exit(&ip->i_rwlock);
			}
			vnevent_create(ITOV(ip), ct);
		}
	}

	if (error == 0) {
		*vpp = ITOV(ip);
		ITIMES(ip);
	}

#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (ip != NULL) {
		rw_exit(&ip->i_contents);
	}
#endif
	if (error) {
		goto out;
	}

	/*
	 * If vnode is a device return special vnode instead.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
			goto out;
		}
		*vpp = newvp;
	}

out:
	return (error);
}

/*
 * Remove entry point.  Removes entry nm from directory vp through
 * ud_dirremove(DR_REMOVE) with the directory's i_rwlock held as writer,
 * then updates the directory time stamps.  Returns the ud_dirremove()
 * result.
 */
/* ARGSUSED */
static int32_t
udf_remove(
	struct vnode *vp,
	char *nm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error;
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_remove\n");

	rw_enter(&ip->i_rwlock, RW_WRITER);
	error = ud_dirremove(ip, nm,
	    (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
	rw_exit(&ip->i_rwlock);
	ITIMES(ip);

	return (error);
}

/*
 * Link entry point.  Creates entry tnm in directory tdvp referring to svp
 * (or to its real vnode when VOP_REALVP() supplies one).  Returns EPERM
 * for directories and for callers that neither own the source nor pass
 * secpolicy_basic_link(); otherwise returns the ud_direnter(DE_LINK)
 * result and posts vnevent_link() on success.
 */
/* ARGSUSED */
static int32_t
udf_link(
	struct vnode *tdvp,
	struct vnode *svp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error;
	struct vnode *realvp;
	struct ud_inode *sip;
	struct ud_inode *tdp;

	ud_printf("udf_link\n");
	if (VOP_REALVP(svp, &realvp, ct) == 0) {
		svp = realvp;
	}

	/*
	 * Do not allow links to directories
	 */
	if (svp->v_type == VDIR) {
		return (EPERM);
	}

	sip = VTOI(svp);

	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
		return (EPERM);

	tdp = VTOI(tdvp);

	rw_enter(&tdp->i_rwlock, RW_WRITER);
	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
	    sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
	rw_exit(&tdp->i_rwlock);
	ITIMES(sip);
	ITIMES(tdp);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	return (error);
}

/*
 * Rename entry point.  Renames snm in directory sdvp to tnm in directory
 * tdvp.  The whole operation runs under the per-filesystem
 * udf_rename_lck mutex.
 */
/* ARGSUSED */
static int32_t
udf_rename(
	struct vnode *sdvp,
	char *snm,
	struct vnode *tdvp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0;
	struct udf_vfs *udf_vfsp;
	struct ud_inode *sip;		/* source inode */
	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
	struct vnode *realvp;

	ud_printf("udf_rename\n");

	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
		tdvp = realvp;
	}

	sdp = VTOI(sdvp);
	tdp = VTOI(tdvp);

	udf_vfsp = sdp->i_udf;

	mutex_enter(&udf_vfsp->udf_rename_lck);
	/*
	 * Look up inode of file we're supposed to rename.
	 */
	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
		mutex_exit(&udf_vfsp->udf_rename_lck);
		return (error);
	}
	/*
	 * be sure this is not a directory with another file system mounted
	 * over it. If it is just give up the locks, and return with
	 * EBUSY
	 */
	if (vn_mountedvfs(ITOV(sip)) != NULL) {
		error = EBUSY;
		goto errout;
	}
	/*
	 * Make sure we can delete the source entry. This requires
	 * write permission on the containing directory. If that
	 * directory is "sticky" it further requires (except for
	 * privileged users) that the user own the directory or the
	 * source entry, or else have permission to write the source
	 * entry.
	 */
	rw_enter(&sdp->i_contents, RW_READER);
	rw_enter(&sip->i_contents, RW_READER);
	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		ITIMES(sip);
		goto errout;
	}

	/*
	 * Check for renaming '.' or '..' or alias of '.'
	 */
	if ((strcmp(snm, ".") == 0) ||
	    (strcmp(snm, "..") == 0) ||
	    (sdp == sip)) {
		error = EINVAL;
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		goto errout;
	}
	rw_exit(&sip->i_contents);
	rw_exit(&sdp->i_contents);

	/*
	 * Link source to the target.
	 */
	rw_enter(&tdp->i_rwlock, RW_WRITER);
	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */
		if (error == ESAME) {
			error = 0;
		}
		rw_exit(&tdp->i_rwlock);
		goto errout;
	}
	vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
	rw_exit(&tdp->i_rwlock);

	rw_enter(&sdp->i_rwlock, RW_WRITER);
	/*
	 * Unlink the source.
	 * Remove the source entry. ud_dirremove() checks that the entry
	 * still reflects sip, and returns an error if it doesn't.
	 * If the entry has changed just forget about it. Release
	 * the source inode.
*/ if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0, DR_RENAME, cr, ct)) == ENOENT) { error = 0; } rw_exit(&sdp->i_rwlock); errout: ITIMES(sdp); ITIMES(tdp); VN_RELE(ITOV(sip)); mutex_exit(&udf_vfsp->udf_rename_lck); return (error); } /* ARGSUSED */ static int32_t udf_mkdir( struct vnode *dvp, char *dirname, struct vattr *vap, struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) { int32_t error; struct ud_inode *ip; struct ud_inode *xip; ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); ud_printf("udf_mkdir\n"); ip = VTOI(dvp); rw_enter(&ip->i_rwlock, RW_WRITER); error = ud_direnter(ip, dirname, DE_MKDIR, (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct); rw_exit(&ip->i_rwlock); ITIMES(ip); if (error == 0) { ip = xip; *vpp = ITOV(ip); ITIMES(ip); } else if (error == EEXIST) { ITIMES(xip); VN_RELE(ITOV(xip)); } return (error); } /* ARGSUSED */ static int32_t udf_rmdir( struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr, caller_context_t *ct, int flags) { int32_t error; struct ud_inode *ip = VTOI(vp); ud_printf("udf_rmdir\n"); rw_enter(&ip->i_rwlock, RW_WRITER); error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR, cr, ct); rw_exit(&ip->i_rwlock); ITIMES(ip); return (error); } /* ARGSUSED */ static int32_t udf_readdir( struct vnode *vp, struct uio *uiop, struct cred *cr, int32_t *eofp, caller_context_t *ct, int flags) { struct ud_inode *ip; struct dirent64 *nd; struct udf_vfs *udf_vfsp; int32_t error = 0, len, outcount = 0; uint32_t dirsiz, offset; uint32_t bufsize, ndlen, dummy; caddr_t outbuf; caddr_t outb, end_outb; struct iovec *iovp; uint8_t *dname; int32_t length; uint8_t *buf = NULL; struct fbuf *fbp = NULL; struct file_id *fid; uint8_t *name; ud_printf("udf_readdir\n"); ip = VTOI(vp); udf_vfsp = ip->i_udf; dirsiz = ip->i_size; if ((uiop->uio_offset >= dirsiz) || (ip->i_nlink <= 0)) { if (eofp) { *eofp = 1; } return (0); } offset = uiop->uio_offset; iovp = 
uiop->uio_iov; bufsize = iovp->iov_len; outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP); end_outb = outb + bufsize; nd = (struct dirent64 *)outbuf; dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP); buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP); if (offset == 0) { len = DIRENT64_RECLEN(1); if (((caddr_t)nd + len) >= end_outb) { error = EINVAL; goto end; } nd->d_ino = ip->i_icb_lbano; nd->d_reclen = (uint16_t)len; nd->d_off = 0x10; nd->d_name[0] = '.'; bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1); nd = (struct dirent64 *)((char *)nd + nd->d_reclen); outcount++; } else if (offset == 0x10) { offset = 0; } while (offset < dirsiz) { error = ud_get_next_fid(ip, &fbp, offset, &fid, &name, buf); if (error != 0) { break; } if ((fid->fid_flags & FID_DELETED) == 0) { if (fid->fid_flags & FID_PARENT) { len = DIRENT64_RECLEN(2); if (((caddr_t)nd + len) >= end_outb) { error = EINVAL; break; } nd->d_ino = ip->i_icb_lbano; nd->d_reclen = (uint16_t)len; nd->d_off = offset + FID_LEN(fid); nd->d_name[0] = '.'; nd->d_name[1] = '.'; bzero(&nd->d_name[2], DIRENT64_NAMELEN(len) - 2); nd = (struct dirent64 *) ((char *)nd + nd->d_reclen); } else { if ((error = ud_uncompress(fid->fid_idlen, &length, name, dname)) != 0) { break; } if (length == 0) { offset += FID_LEN(fid); continue; } len = DIRENT64_RECLEN(length); if (((caddr_t)nd + len) >= end_outb) { if (!outcount) { error = EINVAL; } break; } (void) strncpy(nd->d_name, (caddr_t)dname, length); bzero(&nd->d_name[length], DIRENT64_NAMELEN(len) - length); nd->d_ino = ud_xlate_to_daddr(udf_vfsp, SWAP_16(fid->fid_icb.lad_ext_prn), SWAP_32(fid->fid_icb.lad_ext_loc), 1, &dummy); nd->d_reclen = (uint16_t)len; nd->d_off = offset + FID_LEN(fid); nd = (struct dirent64 *) ((char *)nd + nd->d_reclen); } outcount++; } offset += FID_LEN(fid); } end: if (fbp != NULL) { fbrelse(fbp, S_OTHER); } ndlen = ((char *)nd - outbuf); /* * In case of error do not call uiomove. * Return the error to the caller. 
*/ if ((error == 0) && (ndlen != 0)) { error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop); uiop->uio_offset = offset; } kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize); kmem_free((caddr_t)dname, 1024); kmem_free(outbuf, (uint32_t)bufsize); if (eofp && error == 0) { *eofp = (uiop->uio_offset >= dirsiz); } return (error); } /* ARGSUSED */ static int32_t udf_symlink( struct vnode *dvp, char *linkname, struct vattr *vap, char *target, struct cred *cr, caller_context_t *ct, int flags) { int32_t error = 0, outlen; uint32_t ioflag = 0; struct ud_inode *ip, *dip = VTOI(dvp); struct path_comp *pc; int8_t *dname = NULL, *uname = NULL, *sp; ud_printf("udf_symlink\n"); ip = (struct ud_inode *)0; vap->va_type = VLNK; vap->va_rdev = 0; rw_enter(&dip->i_rwlock, RW_WRITER); error = ud_direnter(dip, linkname, DE_CREATE, (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct); rw_exit(&dip->i_rwlock); if (error == 0) { dname = kmem_zalloc(1024, KM_SLEEP); uname = kmem_zalloc(PAGESIZE, KM_SLEEP); pc = (struct path_comp *)uname; /* * If the first character in target is "/" * then skip it and create entry for it */ if (*target == '/') { pc->pc_type = 2; pc->pc_len = 0; pc = (struct path_comp *)(((char *)pc) + 4); while (*target == '/') { target++; } } while (*target != NULL) { sp = target; while ((*target != '/') && (*target != '\0')) { target ++; } /* * We got the next component of the * path name. Create path_comp of * appropriate type */ if (((target - sp) == 1) && (*sp == '.')) { /* * Dot entry. */ pc->pc_type = 4; pc = (struct path_comp *)(((char *)pc) + 4); } else if (((target - sp) == 2) && (*sp == '.') && ((*(sp + 1)) == '.')) { /* * DotDot entry. 
*/ pc->pc_type = 3; pc = (struct path_comp *)(((char *)pc) + 4); } else { /* * convert the user given name * into appropriate form to be put * on the media */ outlen = 1024; /* set to size of dname */ if (error = ud_compress(target - sp, &outlen, (uint8_t *)sp, (uint8_t *)dname)) { break; } pc->pc_type = 5; /* LINTED */ pc->pc_len = outlen; dname[outlen] = '\0'; (void) strcpy((char *)pc->pc_id, dname); pc = (struct path_comp *) (((char *)pc) + 4 + outlen); } while (*target == '/') { target++; } if (*target == NULL) { break; } } rw_enter(&ip->i_contents, RW_WRITER); if (error == 0) { ioflag = FWRITE; if (curthread->t_flag & T_DONTPEND) { ioflag |= FDSYNC; } error = ud_rdwri(UIO_WRITE, ioflag, ip, uname, ((int8_t *)pc) - uname, (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr); } if (error) { ud_idrop(ip); rw_exit(&ip->i_contents); rw_enter(&dip->i_rwlock, RW_WRITER); (void) ud_dirremove(dip, linkname, (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct); rw_exit(&dip->i_rwlock); goto update_inode; } rw_exit(&ip->i_contents); } if ((error == 0) || (error == EEXIST)) { VN_RELE(ITOV(ip)); } update_inode: ITIMES(VTOI(dvp)); if (uname != NULL) { kmem_free(uname, PAGESIZE); } if (dname != NULL) { kmem_free(dname, 1024); } return (error); } /* ARGSUSED */ static int32_t udf_readlink( struct vnode *vp, struct uio *uiop, struct cred *cr, caller_context_t *ct) { int32_t error = 0, off, id_len, size, len; int8_t *dname = NULL, *uname = NULL; struct ud_inode *ip; struct fbuf *fbp = NULL; struct path_comp *pc; ud_printf("udf_readlink\n"); if (vp->v_type != VLNK) { return (EINVAL); } ip = VTOI(vp); size = ip->i_size; if (size > PAGESIZE) { return (EIO); } if (size == 0) { return (0); } dname = kmem_zalloc(1024, KM_SLEEP); uname = kmem_zalloc(PAGESIZE, KM_SLEEP); rw_enter(&ip->i_contents, RW_READER); if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) { goto end; } off = 0; while (off < size) { pc = (struct path_comp *)(fbp->fb_addr + off); switch (pc->pc_type) { case 1 : 
(void) strcpy(uname, ip->i_udf->udf_fsmnt); (void) strcat(uname, "/"); break; case 2 : if (pc->pc_len != 0) { goto end; } uname[0] = '/'; uname[1] = '\0'; break; case 3 : (void) strcat(uname, "../"); break; case 4 : (void) strcat(uname, "./"); break; case 5 : if ((error = ud_uncompress(pc->pc_len, &id_len, pc->pc_id, (uint8_t *)dname)) != 0) { break; } dname[id_len] = '\0'; (void) strcat(uname, dname); (void) strcat(uname, "/"); break; default : error = EINVAL; goto end; } off += 4 + pc->pc_len; } len = strlen(uname) - 1; if (uname[len] == '/') { if (len == 0) { /* * special case link to / */ len = 1; } else { uname[len] = '\0'; } } error = uiomove(uname, len, UIO_READ, uiop); ITIMES(ip); end: if (fbp != NULL) { fbrelse(fbp, S_OTHER); } rw_exit(&ip->i_contents); if (uname != NULL) { kmem_free(uname, PAGESIZE); } if (dname != NULL) { kmem_free(dname, 1024); } return (error); } /* ARGSUSED */ static int32_t udf_fsync( struct vnode *vp, int32_t syncflag, struct cred *cr, caller_context_t *ct) { int32_t error = 0; struct ud_inode *ip = VTOI(vp); ud_printf("udf_fsync\n"); rw_enter(&ip->i_contents, RW_WRITER); if (!(IS_SWAPVP(vp))) { error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */ } if (error == 0) { error = ud_sync_indir(ip); } ITIMES(ip); /* XXX: is this necessary ??? 
*/ rw_exit(&ip->i_contents); return (error); } /* ARGSUSED */ static void udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct) { ud_printf("udf_iinactive\n"); ud_iinactive(VTOI(vp), cr); } /* ARGSUSED */ static int32_t udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct) { struct udf_fid *udfidp; struct ud_inode *ip = VTOI(vp); ud_printf("udf_fid\n"); if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) { fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t); return (ENOSPC); } udfidp = (struct udf_fid *)fidp; bzero((char *)udfidp, sizeof (struct udf_fid)); rw_enter(&ip->i_contents, RW_READER); udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t); udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff; udfidp->udfid_prn = ip->i_icb_prn; udfidp->udfid_icb_lbn = ip->i_icb_block; rw_exit(&ip->i_contents); return (0); } /* ARGSUSED2 */ static int udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_rwlock\n"); if (write_lock) { rw_enter(&ip->i_rwlock, RW_WRITER); } else { rw_enter(&ip->i_rwlock, RW_READER); } #ifdef __lock_lint rw_exit(&ip->i_rwlock); #endif return (write_lock); } /* ARGSUSED */ static void udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_rwunlock\n"); #ifdef __lock_lint rw_enter(&ip->i_rwlock, RW_WRITER); #endif rw_exit(&ip->i_rwlock); } /* ARGSUSED */ static int32_t udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) { return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); } static int32_t udf_frlock( struct vnode *vp, int32_t cmd, struct flock64 *bfp, int32_t flag, offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_frlock\n"); /* * If file is being mapped, disallow frlock. 
 * XXX I am not holding tlock while checking i_mapcnt because the
	 * current locking strategy drops all locks before calling fs_frlock.
	 * So, mapcnt could change before we enter fs_frlock making is
	 * meaningless to have held tlock in the first place.
	 */
	if ((ip->i_mapcnt > 0) && (MANDLOCK(vp, ip->i_char))) {
		return (EAGAIN);
	}

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}

/*
 * Space operation.  Only F_FREESP is accepted; any other cmd yields
 * EINVAL.  The request is first passed through convoff() and, if that
 * succeeds, handed to ud_freesp().
 */
/*ARGSUSED6*/
static int32_t
udf_space(
	struct vnode *vp,
	int32_t cmd,
	struct flock64 *bfp,
	int32_t flag,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int32_t error = 0;

	ud_printf("udf_space\n");

	if (cmd != F_FREESP) {
		error = EINVAL;
	} else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
		error = ud_freesp(vp, bfp, flag, cr);
	}

	return (error);
}

/*
 * getpage operation for UDF vnodes.  As far as the visible code shows:
 *
 *  - *protp (when supplied) starts out as PROT_ALL and loses PROT_WRITE
 *    for non-write faults on a file that has holes;
 *  - a VNOMAP vnode gets ENOSYS;
 *  - a request reaching past i_size (rounded up to a page) on any
 *    segment other than segkmap gets EFAULT;
 *  - for S_WRITE/S_CREATE on a file with holes, or past EOF, blocks are
 *    allocated with ud_bmap_write() under the writer lock;
 *  - pages found or created are stored through plarr and the list is
 *    NULL terminated.
 *
 * i_contents is only taken (and later dropped) here when the calling
 * thread does not already own it; see dolock.
 */
/* ARGSUSED */
static int32_t
udf_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint32_t *protp,
	struct page **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error, has_holes, beyond_eof, seqmode, dolock;
	int32_t pgsize = PAGESIZE;
	struct udf_vfs *udf_vfsp = ip->i_udf;
	page_t **pl;
	u_offset_t pgoff, eoff, uoff;
	krw_t rwtype;
	caddr_t pgaddr;

	ud_printf("udf_getpage\n");

	uoff = (u_offset_t)off; /* type conversion */
	if (protp) {
		*protp = PROT_ALL;
	}
	if (vp->v_flag & VNOMAP) {
		return (ENOSYS);
	}
	/* Sequential if this request starts where the last one ended. */
	seqmode = ip->i_nextr == uoff && rw != S_CREATE;

	rwtype = RW_READER;
	/* Do not re-enter i_contents if this thread is already the owner. */
	dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef __lock_lint
	rw_enter(&ip->i_contents, rwtype);
#else
	if (dolock) {
		rw_enter(&ip->i_contents, rwtype);
	}
#endif

	/*
	 * We may be getting called as a side effect of a bmap using
	 * fbread() when the blocks might be being allocated and the
	 * size has not yet been up'ed.  In this case we want to be
	 * able to return zero pages if we get back UDF_HOLE from
	 * calling bmap for a non write case here.
 We also might have
	 * to read some frags from the disk into a page if we are
	 * extending the number of frags for a given lbn in bmap().
	 */
	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
	if (beyond_eof && seg != segkmap) {
#ifdef __lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (EFAULT);
	}

	/*
	 * Must hold i_contents lock throughout the call to pvn_getpages
	 * since locked pages are returned from each call to ud_getapage.
	 * Must *not* return locked pages and then try for contents lock
	 * due to lock ordering requirements (inode > page)
	 */

	has_holes = ud_bmap_has_holes(ip);

	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
		int32_t	blk_size, count;
		u_offset_t offset;

		/*
		 * We must acquire the RW_WRITER lock in order to
		 * call bmap_write().
		 */
		if (dolock && rwtype == RW_READER) {
			rwtype = RW_WRITER;
			/*
			 * If the in-place upgrade fails, drop the lock and
			 * start over; rwtype is now RW_WRITER, so the retry
			 * takes the lock as writer from the outset.
			 */
			if (!rw_tryupgrade(&ip->i_contents)) {
				rw_exit(&ip->i_contents);
				goto retrylock;
			}
		}

		/*
		 * May be allocating disk blocks for holes here as
		 * a result of mmap faults. write(2) does the bmap_write
		 * in rdip/wrip, not here. We are not dealing with frags
		 * in this case.
		 */
		offset = uoff;
		while ((offset < uoff + len) &&
		    (offset < ip->i_size)) {
			/*
			 * the variable "bnp" is to simplify the expression for
			 * the compiler; * just passing in &bn to bmap_write
			 * causes a compiler "loop"
			 */

			/* One logical block at a time, clipped at i_size. */
			blk_size = udf_vfsp->udf_lbsize;
			if ((offset + blk_size) > ip->i_size) {
				count = ip->i_size - offset;
			} else {
				count = blk_size;
			}
			error = ud_bmap_write(ip, offset, count, 0, cr);
			if (error) {
				goto update_inode;
			}
			offset += count; /* XXX - make this contig */
		}
	}

	/*
	 * Can be a reader from now on.
	 */
#ifdef __lock_lint
	if (rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#else
	if (dolock && rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#endif

	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't want to call bmap_read() to check each
	 * page if it is backed with a disk block.
 */
	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
		*protp &= ~PROT_WRITE;
	}

	error = 0;

	/*
	 * NOTE(review): the comment that opens below appears to have lost
	 * text in this copy of the file: it runs straight into statements,
	 * and the loop header plus the declarations of pp, se and nextrio
	 * used further down are not visible.  Confirm against the upstream
	 * source before changing anything in this region.
	 */

	/*
	 * The loop looks up pages in the range i_nextrio = pgoff;
			ud_getpage_ra(vp, pgoff, seg, pgaddr);
			pgoff += pgsize;
			pgaddr += pgsize;
			continue;
		}

		/*
		 * Check if we should initiate read ahead of next cluster.
		 * We call page_exists only when we need to confirm that
		 * we have the current page before we initiate the read ahead.
		 */
		nextrio = ip->i_nextrio;
		if (seqmode &&
		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
		    nextrio < ip->i_size && page_exists(vp, pgoff))
			ud_getpage_ra(vp, pgoff, seg, pgaddr);

		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

			/*
			 * We found the page in the page cache.
			 */
			*pl++ = pp;
			pgoff += pgsize;
			pgaddr += pgsize;
			len -= pgsize;
			plsz -= pgsize;
		} else {

			/*
			 * We have to create the page, or read it from disk.
			 */
			if (error = ud_getpage_miss(vp, pgoff, len,
			    seg, pgaddr, pl, plsz, rw, seqmode)) {
				goto error_out;
			}

			/* Step past every page ud_getpage_miss() put in pl. */
			while (*pl != NULL) {
				pl++;
				pgoff += pgsize;
				pgaddr += pgsize;
				len -= pgsize;
				plsz -= pgsize;
			}
		}
	}

	/*
	 * Return pages up to plsz if they are in the page cache.
	 * We cannot return pages if there is a chance that they are
	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
	 */
	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

		ASSERT((protp == NULL) ||
		    !(has_holes && (*protp & PROT_WRITE)));

		eoff = pgoff + plsz;
		while (pgoff < eoff) {
			page_t *pp;

			/* Stop at the first page that is not readily had. */
			if ((pp = page_lookup_nowait(vp, pgoff,
			    SE_SHARED)) == NULL)
				break;

			*pl++ = pp;
			pgoff += pgsize;
			plsz -= pgsize;
		}
	}

	if (plarr)
		*pl = NULL;			/* Terminate page list */
	/* Remember where this request ended; seqmode compares against it. */
	ip->i_nextr = pgoff;

error_out:
	if (error && plarr) {

		/*
		 * Release any pages we have locked.
 */
		while (pl > &plarr[0])
			page_unlock(*--pl);

		plarr[0] = NULL;
	}

update_inode:
#ifdef __lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif

	/*
	 * If the inode is not already marked for IACC (in rwip() for read)
	 * and the inode is not marked for no access time update (in rwip()
	 * for write) then update the inode access time and mod time now.
	 */
	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & (IACC | INOACC)) == 0) {
		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
			ip->i_flag |= IACC;
		}
		if (rw == S_WRITE) {
			ip->i_flag |= IUPD;
		}
		ITIMES_NOLOCK(ip);
	}
	mutex_exit(&ip->i_tlock);

	return (error);
}

/*
 * Non-zero enables the delayed-write clustering done by udf_putpage()
 * for B_ASYNC requests.
 */
int32_t ud_delay = 1;

/*
 * putpage operation for UDF vnodes.
 *
 * Fails with EINVAL when v_count is 0 and with ENOSYS for VNOMAP
 * vnodes.  A B_ASYNC request with non-zero len and no flags other than
 * B_ASYNC/B_DONTNEED/B_FREE is clustered when ud_delay is set: the range
 * is merged into i_delayoff/i_delaylen under i_tlock, and the previously
 * accumulated range is pushed with ud_putpages() only once it has reached
 * WR_CLUSTSZ(ip) or the new request is not contiguous with it.  Every
 * other request goes straight to ud_putpages().
 */
/* ARGSUSED */
static int32_t
udf_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip;
	int32_t error = 0;

	ud_printf("udf_putpage\n");

	ip = VTOI(vp);
#ifdef __lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#endif

	if (vp->v_count == 0) {
		cmn_err(CE_WARN, "ud_putpage : bad v_count");
		error = EINVAL;
		goto out;
	}

	if (vp->v_flag & VNOMAP) {
		error = ENOSYS;
		goto out;
	}

	if (flags & B_ASYNC) {
		if (ud_delay && len &&
		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
			mutex_enter(&ip->i_tlock);

			/*
			 * If nobody stalled, start a new cluster.
			 */
			if (ip->i_delaylen == 0) {
				ip->i_delayoff = off;
				ip->i_delaylen = len;
				mutex_exit(&ip->i_tlock);
				goto out;
			}

			/*
			 * If we have a full cluster or they are not contig,
			 * then push last cluster and start over.
			 */
			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
			    ip->i_delayoff + ip->i_delaylen != off) {
				u_offset_t doff;
				size_t dlen;

				/*
				 * Capture the old range and install the new
				 * one before dropping i_tlock for the push.
				 */
				doff = ip->i_delayoff;
				dlen = ip->i_delaylen;
				ip->i_delayoff = off;
				ip->i_delaylen = len;
				mutex_exit(&ip->i_tlock);
				error = ud_putpages(vp, doff, dlen, flags, cr);
				/* LMXXX - flags are new val, not old */
				goto out;
			}

			/*
			 * There is something there, it's not full, and
			 * it is contig.
			 */
			ip->i_delaylen += len;
			mutex_exit(&ip->i_tlock);
			goto out;
		}

		/*
		 * Must have weird flags or we are not clustering.
*/ } error = ud_putpages(vp, off, len, flags, cr); out: #ifdef __lock_lint rw_exit(&ip->i_contents); #endif return (error); } /* ARGSUSED */ static int32_t udf_map( struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uint8_t prot, uint8_t maxprot, uint32_t flags, struct cred *cr, caller_context_t *ct) { struct segvn_crargs vn_a; int32_t error = 0; ud_printf("udf_map\n"); if (vp->v_flag & VNOMAP) { error = ENOSYS; goto end; } if ((off < (offset_t)0) || ((off + len) < (offset_t)0)) { error = EINVAL; goto end; } if (vp->v_type != VREG) { error = ENODEV; goto end; } /* * If file is being locked, disallow mapping. */ if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) { error = EAGAIN; goto end; } as_rangelock(as); error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); if (error != 0) { as_rangeunlock(as); goto end; } vn_a.vp = vp; vn_a.offset = off; vn_a.type = flags & MAP_TYPE; vn_a.prot = prot; vn_a.maxprot = maxprot; vn_a.cred = cr; vn_a.amp = NULL; vn_a.flags = flags & ~MAP_TYPE; vn_a.szc = 0; vn_a.lgrp_mem_policy_flags = 0; error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a); as_rangeunlock(as); end: return (error); } /* ARGSUSED */ static int32_t udf_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uint8_t prot, uint8_t maxprot, uint32_t flags, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_addmap\n"); if (vp->v_flag & VNOMAP) { return (ENOSYS); } mutex_enter(&ip->i_tlock); ip->i_mapcnt += btopr(len); mutex_exit(&ip->i_tlock); return (0); } /* ARGSUSED */ static int32_t udf_delmap( struct vnode *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uint32_t prot, uint32_t maxprot, uint32_t flags, struct cred *cr, caller_context_t *ct) { struct ud_inode *ip = VTOI(vp); ud_printf("udf_delmap\n"); if (vp->v_flag & VNOMAP) { return (ENOSYS); } mutex_enter(&ip->i_tlock); ip->i_mapcnt -= btopr(len); /* Count released mappings */ ASSERT(ip->i_mapcnt >= 
0); mutex_exit(&ip->i_tlock); return (0); } /* ARGSUSED */ static int32_t udf_l_pathconf( struct vnode *vp, int32_t cmd, ulong_t *valp, struct cred *cr, caller_context_t *ct) { int32_t error = 0; ud_printf("udf_l_pathconf\n"); if (cmd == _PC_FILESIZEBITS) { /* * udf supports 64 bits as file size * but there are several other restrictions * it only supports 32-bit block numbers and * daddr32_t is only and int32_t so taking these * into account we can stay just as where ufs is */ *valp = 41; } else if (cmd == _PC_TIMESTAMP_RESOLUTION) { /* nanosecond timestamp resolution */ *valp = 1L; } else { error = fs_pathconf(vp, cmd, valp, cr, ct); } return (error); } uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0; #ifndef __lint _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads)) _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes)) #endif /* * Assumption is that there will not be a pageio request * to a enbedded file */ /* ARGSUSED */ static int32_t udf_pageio( struct vnode *vp, struct page *pp, u_offset_t io_off, size_t io_len, int32_t flags, struct cred *cr, caller_context_t *ct) { daddr_t bn; struct buf *bp; struct ud_inode *ip = VTOI(vp); int32_t dolock, error = 0, contig, multi_io; size_t done_len = 0, cur_len = 0; page_t *npp = NULL, *opp = NULL, *cpp = pp; if (pp == NULL) { return (EINVAL); } dolock = (rw_owner(&ip->i_contents) != curthread); /* * We need a better check. Ideally, we would use another * vnodeops so that hlocked and forcibly unmounted file * systems would return EIO where appropriate and w/o the * need for these checks. */ if (ip->i_udf == NULL) { return (EIO); } #ifdef __lock_lint rw_enter(&ip->i_contents, RW_READER); #else if (dolock) { rw_enter(&ip->i_contents, RW_READER); } #endif /* * Break the io request into chunks, one for each contiguous * stretch of disk blocks in the target file. 
*/ while (done_len < io_len) { ASSERT(cpp); bp = NULL; contig = 0; if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len), &bn, &contig)) { break; } if (bn == UDF_HOLE) { /* No holey swapfiles */ cmn_err(CE_WARN, "SWAP file has HOLES"); error = EINVAL; break; } cur_len = MIN(io_len - done_len, contig); /* * Check if more than one I/O is * required to complete the given * I/O operation */ if (ip->i_udf->udf_lbsize < PAGESIZE) { if (cur_len >= PAGESIZE) { multi_io = 0; cur_len &= PAGEMASK; } else { multi_io = 1; cur_len = MIN(io_len - done_len, PAGESIZE); } } page_list_break(&cpp, &npp, btop(cur_len)); bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); ASSERT(bp != NULL); bp->b_edev = ip->i_dev; bp->b_dev = cmpdev(ip->i_dev); bp->b_blkno = bn; bp->b_un.b_addr = (caddr_t)0; bp->b_file = vp; bp->b_offset = (offset_t)(io_off + done_len); /* * ub.ub_pageios.value.ul++; */ if (multi_io == 0) { (void) bdev_strategy(bp); } else { error = ud_multi_strat(ip, cpp, bp, (u_offset_t)(io_off + done_len)); if (error != 0) { pageio_done(bp); break; } } if (flags & B_READ) { ud_pageio_reads++; } else { ud_pageio_writes++; } /* * If the request is not B_ASYNC, wait for i/o to complete * and re-assemble the page list to return to the caller. * If it is B_ASYNC we leave the page list in pieces and * cleanup() will dispose of them. 
 */
		if ((flags & B_ASYNC) == 0) {
			error = biowait(bp);
			pageio_done(bp);
			if (error) {
				break;
			}
			page_list_concat(&opp, &cpp);
		}
		/* Move on to the pages split off for the next chunk. */
		cpp = npp;
		npp = NULL;
		done_len += cur_len;
	}

	ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
	if (error) {
		if (flags & B_ASYNC) {
			/* Cleanup unprocessed parts of list */
			page_list_concat(&cpp, &npp);
			if (flags & B_READ) {
				pvn_read_done(cpp, B_ERROR);
			} else {
				pvn_write_done(cpp, B_ERROR);
			}
		} else {
			/* Re-assemble list and let caller clean up */
			page_list_concat(&opp, &cpp);
			page_list_concat(&opp, &npp);
		}
	}

#ifdef __lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif
	return (error);
}




/* -------------------- local functions --------------------------- */



/*
 * Read or write len bytes at file offset "offset" of inode ip, to or
 * from the single buffer "base" living in address space "seg".
 *
 * A one-element uio is built on the stack and passed to ud_wrip() when
 * rw is UIO_WRITE, or to ud_rdip() otherwise.  The residual count is
 * stored through aresid when it is non-NULL; when aresid is NULL, any
 * residual turns the result into EIO.
 */
int32_t
ud_rdwri(enum uio_rw rw, int32_t ioflag,
	struct ud_inode *ip, caddr_t base, int32_t len,
	offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
{
	int32_t error;
	struct uio auio;
	struct iovec aiov;

	ud_printf("ud_rdwri\n");

	bzero((caddr_t)&auio, sizeof (uio_t));
	bzero((caddr_t)&aiov, sizeof (iovec_t));

	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_loffset = offset;
	auio.uio_segflg = (int16_t)seg;
	auio.uio_resid = len;

	if (rw == UIO_WRITE) {
		auio.uio_fmode = FWRITE;
		auio.uio_extflg = UIO_COPY_DEFAULT;
		auio.uio_llimit = curproc->p_fsz_ctl;
		error = ud_wrip(ip, &auio, ioflag, cr);
	} else {
		auio.uio_fmode = FREAD;
		auio.uio_extflg = UIO_COPY_CACHED;
		auio.uio_llimit = MAXOFFSET_T;
		error = ud_rdip(ip, &auio, ioflag, cr);
	}

	if (aresid) {
		*aresid = auio.uio_resid;
	} else if (auio.uio_resid) {
		/* Caller cannot learn of a short transfer; report failure. */
		error = EIO;
	}
	return (error);
}

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 */
/* Both are consulted by ud_rdip() when it computes its dofree flag. */
int32_t ud_freebehind = 1;
int32_t ud_smallfile = 32 * 1024;

/*
 * Obtain the page at offset off of vp when it was not found in the
 * page cache, and initialise the page list pl (at most plsz bytes) with
 * it via pvn_plist_init().
 *
 * For S_CREATE the page is simply created.  Otherwise it is set up with
 * pvn_read_kluster() and filled by ud_page_fill(); if another thread got
 * to the page first (pvn_read_kluster() returns NULL) the function
 * returns 0 with pl[0] == NULL so that the caller can retry its lookup.
 * When seq is set and more of the file remains, read ahead is started.
 *
 * Returns 0, EINVAL if the page could not be created, or the error from
 * ud_page_fill().
 */
/* ARGSUSED */
int32_t
ud_getpage_miss(struct vnode *vp, u_offset_t off,
	size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
	size_t plsz, enum seg_rw rw, int32_t seq)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t err = 0;
	size_t io_len;
	u_offset_t io_off;
	u_offset_t pgoff;
	page_t *pp;

	pl[0] = NULL;

	/*
	 * Figure out whether the page can be created, or must be
	 * read from the disk
	 */
	if (rw == S_CREATE) {
		if ((pp = page_create_va(vp, off,
		    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
			cmn_err(CE_WARN, "ud_getpage_miss: page_create");
			return (EINVAL);
		}
		io_len = PAGESIZE;
	} else {
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, PAGESIZE, 0);

		/*
		 * Some other thread has entered the page.
		 * ud_getpage will retry page_lookup.
		 */
		if (pp == NULL) {
			return (0);
		}

		/*
		 * Fill the page with as much data as we can from the file.
		 */
		err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
		if (err) {
			pvn_read_done(pp, B_ERROR);
			return (err);
		}

		/*
		 * XXX ??? ufs has io_len instead of pgoff below
		 */
		ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);

		/*
		 * If the file access is sequential, initiate read ahead
		 * of the next cluster.
		 */
		if (seq && ip->i_nextrio < ip->i_size) {
			ud_getpage_ra(vp, off, seg, addr);
		}
	}

outmiss:
	pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
	return (err);
}

/*
 * Start an asynchronous read ahead at ip->i_nextrio.  off and addr
 * describe the page currently being handled and are used only to derive
 * the address that corresponds to i_nextrio.  Nothing is done when that
 * address lies beyond the segment, when ud_bmap_read() fails or reports
 * a hole, or when another thread already has the page.  On success
 * i_nextrio is advanced past what was requested.
 */
/* ARGSUSED */
void
ud_getpage_ra(struct vnode *vp,
	u_offset_t off, struct seg *seg, caddr_t addr)
{
	page_t *pp;
	size_t io_len;
	struct ud_inode *ip = VTOI(vp);
	u_offset_t io_off = ip->i_nextrio, pgoff;
	caddr_t addr2 = addr + (io_off - off);
	daddr_t bn;
	int32_t contig = 0;

	/*
	 * Is this test needed?
	 */

	if (addr2 >= seg->s_base + seg->s_size) {
		return;
	}

	contig = 0;
	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
		return;
	}

	pp = pvn_read_kluster(vp, io_off, seg, addr2,
	    &io_off, &io_len, io_off, PAGESIZE, 1);

	/*
	 * Some other thread has entered the page.
 * So no read head done here (ie we will have to and wait
	 * for the read when needed).
	 */

	if (pp == NULL) {
		return;
	}

	/* Fire and forget: the result of the async fill is not checked. */
	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
	ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
}

/*
 * Fill page pp with file data starting at file offset off.
 *
 * For an embedded file (ICB_FLAG_ONE_AD) the data is copied out of the
 * file entry block read through ud_bread().  For other files the extent
 * at off is looked up with ud_bmap_read(), parts of the page that will
 * not be read are zeroed, and the I/O is issued either directly or, when
 * the page spans several extents, through ud_multi_strat().  Unless
 * B_ASYNC is set in bflgs the function waits for the I/O.
 *
 * *pg_off receives the number of bytes covered (clipped so as not to
 * pass i_size on the normal exit path).  Returns 0 or an error from
 * ud_bmap_read(), ud_multi_strat() or biowait().
 */
int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
	uint32_t bflgs, u_offset_t *pg_off)
{
	daddr_t bn;
	struct buf *bp;
	caddr_t kaddr, caddr;
	int32_t error = 0, contig = 0, multi_io = 0;
	int32_t lbsize = ip->i_udf->udf_lbsize;
	int32_t lbmask = ip->i_udf->udf_lbmask;
	uint64_t isize;

	/* File size rounded up to a whole logical block. */
	isize = (ip->i_size + lbmask) & (~lbmask);
	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {

		/*
		 * Embedded file read file_entry
		 * from buffer cache and copy the required
		 * portions
		 */
		bp = ud_bread(ip->i_dev,
		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
		/*
		 * Note that a failed read is not reported: nothing is
		 * copied and error keeps its initial value of 0.
		 */
		if ((bp->b_error == 0) &&
		    (bp->b_resid == 0)) {

			caddr = bp->b_un.b_addr + ip->i_data_off;

			/*
			 * mapin to kvm
			 */
			kaddr = (caddr_t)ppmapin(pp,
			    PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(caddr, kaddr, ip->i_size);

			/*
			 * mapout of kvm
			 */
			ppmapout(kaddr);
		}
		brelse(bp);
		contig = ip->i_size;
	} else {

		/*
		 * Get the continuous size and block number
		 * at offset "off"
		 */
		if (error = ud_bmap_read(ip, off, &bn, &contig))
			goto out;
		/* At most one page, rounded up to whole logical blocks. */
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		/*
		 * Zero part of the page which we are not
		 * going to read from the disk.
		 */

		if (bn == UDF_HOLE) {

			/*
			 * This is a HOLE.
 Just zero out
			 * the page
			 */
			if (((off + contig) == isize) ||
			    (contig == PAGESIZE)) {
				pagezero(pp->p_prev, 0, PAGESIZE);
				goto out;
			}
		}

		if (contig < PAGESIZE) {
			uint64_t count;

			/* Bytes left between off and the rounded-up EOF. */
			count = isize - off;
			if (contig != count) {
				/* More extents follow within this page. */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			} else {
				/* Last extent of the file: zero the rest. */
				pagezero(pp->p_prev, contig, PAGESIZE - contig);
			}
		}

		/*
		 * Get a bp and initialize it
		 */
		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = ip->i_vnode;

		/*
		 * Start I/O
		 */
		if (multi_io == 0) {

			/*
			 * Single I/O is sufficient for this page
			 */
			(void) bdev_strategy(bp);
		} else {

			/*
			 * We need to do the I/O in
			 * pieces
			 */
			error = ud_multi_strat(ip, pp, bp, off);
			/*
			 * NOTE(review): unlike udf_pageio(), this path does
			 * not call pageio_done(bp) before bailing out --
			 * confirm whether bp is released elsewhere.
			 */
			if (error != 0) {
				goto out;
			}
		}
		if ((bflgs & B_ASYNC) == 0) {

			/*
			 * Wait for i/o to complete.
			 */

			error = biowait(bp);
			pageio_done(bp);
			if (error) {
				goto out;
			}
		}
	}
	/* Do not report more than what lies before EOF. */
	if ((off + contig) >= ip->i_size) {
		contig = ip->i_size - off;
	}

out:
	*pg_off = contig;
	return (error);
}

/*
 * Write out (or otherwise dispose of, depending on flags) the pages of
 * vp in the range [off, off + len); len == 0 means every page at or
 * beyond off.
 *
 * Returns EINVAL when v_count is 0, 0 when the vnode has no cached data,
 * otherwise the result of pvn_vplist_dirty() (len == 0) or the first
 * error from ud_putapage().  i_contents is taken as reader for the
 * duration unless the calling thread already owns it.  When the whole
 * file was pushed successfully the IMODTIME flag is cleared.
 */
int32_t
ud_putpages(struct vnode *vp, offset_t off,
	size_t len, int32_t flags, struct cred *cr)
{
	struct ud_inode *ip;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	u_offset_t eoff;
	int32_t err = 0;
	int32_t dolock;

	ud_printf("ud_putpages\n");

	if (vp->v_count == 0) {
		cmn_err(CE_WARN, "ud_putpages: bad v_count");
		return (EINVAL);
	}

	ip = VTOI(vp);

	/*
	 * Acquire the readers/write inode lock before locking
	 * any pages in this inode.
	 * The inode lock is held during i/o.
 */
	if (len == 0) {
		/* A full flush supersedes any pending delayed cluster. */
		mutex_enter(&ip->i_tlock);
		ip->i_delayoff = ip->i_delaylen = 0;
		mutex_exit(&ip->i_tlock);
	}
#ifdef __lock_lint
	rw_enter(&ip->i_contents, RW_READER);
#else
	dolock = (rw_owner(&ip->i_contents) != curthread);
	if (dolock) {
		rw_enter(&ip->i_contents, RW_READER);
	}
#endif

	if (!vn_has_cached_data(vp)) {
#ifdef __lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (0);
	}

	if (len == 0) {

		/*
		 * Search the entire vp list for pages >= off.
		 */
		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
		    flags, cr);
	} else {

		/*
		 * Loop over all offsets in the range looking for
		 * pages to deal with.
		 */
		/* Clip the range at the block-rounded end of file. */
		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
			eoff = MIN(off + len, eoff);
		} else {
			eoff = off + len;
		}

		for (io_off = off; io_off < eoff; io_off += io_len) {

			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages, use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
			} else {
				pp = page_lookup_nowait(vp, io_off,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
			}

			/* Nothing to write here: step to the next page. */
			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
				io_len = PAGESIZE;
			} else {

				err = ud_putapage(vp, pp,
				    &io_off, &io_len, flags, cr);
				if (err != 0) {
					break;
				}

				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
		}
	}
	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {

		/*
		 * We have just sync'ed back all the pages on
		 * the inode, turn off the IMODTIME flag.
 */
		mutex_enter(&ip->i_tlock);
		ip->i_flag &= ~IMODTIME;
		mutex_exit(&ip->i_tlock);
	}
#ifdef __lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif
	return (err);
}

/*
 * Write out the dirty page pp of vp, together with whatever neighbouring
 * pages pvn_write_kluster() gathers.
 *
 * The caller must hold i_contents (asserted).  For an embedded file the
 * data is copied into the file entry block, the tag is regenerated with
 * ud_make_tag() and the block is written with bwrite(); for other files
 * a pageio buf is set up and issued, in pieces through ud_multi_strat()
 * when the page spans several extents.  When offp/lenp are non-NULL they
 * receive the io_off/io_len values on exit through the "out" label.
 *
 * Returns 0 or an error (EIO/EINVAL for a bad file entry block, or the
 * error from ud_bmap_read(), ud_multi_strat() or biowait()).
 */
/* ARGSUSED */
int32_t
ud_putapage(struct vnode *vp,
	page_t *pp, u_offset_t *offp,
	size_t *lenp, int32_t flags, struct cred *cr)
{
	daddr_t bn;
	size_t io_len;
	struct ud_inode *ip;
	int32_t error = 0, contig, multi_io = 0;
	struct udf_vfs *udf_vfsp;
	u_offset_t off, io_off;
	caddr_t kaddr, caddr;
	struct buf *bp = NULL;
	int32_t lbmask;
	uint64_t isize;
	int32_t crc_len;
	struct file_entry *fe;

	ud_printf("ud_putapage\n");

	ip = VTOI(vp);
	ASSERT(ip);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbmask = ip->i_udf->udf_lbmask;
	/* File size rounded up to a whole logical block. */
	isize = (ip->i_size + lbmask) & (~lbmask);

	udf_vfsp = ip->i_udf;
	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);

	/*
	 * If the modified time on the inode has not already been
	 * set elsewhere (e.g. for write/setattr) we set the time now.
	 * This gives us approximate modified times for mmap'ed files
	 * which are modified via stores in the user address space.
	 */
	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= IUPD;
		ITIMES_NOLOCK(ip);
		mutex_exit(&ip->i_tlock);
	}


	/*
	 * Align the request to a block boundry (for old file systems),
	 * and go ask bmap() how contiguous things are for this file.
 */
	off = pp->p_offset & ~(offset_t)lbmask;
				/* block align it */


	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
		ASSERT(ip->i_size <= ip->i_max_emb);

		pp = pvn_write_kluster(vp, pp, &io_off,
		    &io_len, off, PAGESIZE, flags);
		if (io_len == 0) {
			io_len = PAGESIZE;
		}

		bp = ud_bread(ip->i_dev,
		    ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
		    udf_vfsp->udf_lbsize);
		fe = (struct file_entry *)bp->b_un.b_addr;
		/* Refuse to touch a block that is not a valid file entry. */
		if ((bp->b_flags & B_ERROR) ||
		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
		    ip->i_icb_block,
		    1, udf_vfsp->udf_lbsize) != 0)) {
			if (pp != NULL)
				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
			if (bp->b_flags & B_ERROR) {
				error = EIO;
			} else {
				error = EINVAL;
			}
			brelse(bp);
			return (error);
		}
		if ((bp->b_error == 0) &&
		    (bp->b_resid == 0)) {

			caddr = bp->b_un.b_addr + ip->i_data_off;
			/*
			 * NOTE(review): pp is NULL-checked on the error path
			 * above but used unguarded here -- confirm that
			 * pvn_write_kluster() cannot return NULL for this
			 * call.
			 */
			kaddr = (caddr_t)ppmapin(pp,
			    PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(kaddr, caddr, ip->i_size);
			ppmapout(kaddr);
		}
		/*
		 * Length handed to ud_make_tag(): the offset of fe_spec
		 * within the file entry, plus fe_len_ear, plus the
		 * embedded file data.
		 */
		crc_len = ((uint32_t)&((struct file_entry *)0)->fe_spec) +
		    SWAP_32(fe->fe_len_ear);
		crc_len += ip->i_size;
		ud_make_tag(ip->i_udf, &fe->fe_tag,
		    UD_FILE_ENTRY, ip->i_icb_block, crc_len);

		bwrite(bp);

		if (flags & B_ASYNC) {
			pvn_write_done(pp, flags);
		}
		contig = ip->i_size;
	} else {

		/*
		 * NOTE(review): io_off and io_len have not been assigned
		 * when this lookup fails, yet the code at "out" copies them
		 * to *offp and *lenp.
		 */
		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
			goto out;
		}
		/* At most one page, rounded up to whole logical blocks. */
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		if (contig < PAGESIZE) {
			uint64_t count;

			count = isize - off;
			if (contig != count) {
				/* More extents follow within this page. */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			}
		}

		if ((off + contig) > isize) {
			contig = isize - off;
		}

		if (contig > PAGESIZE) {
			if (contig & PAGEOFFSET) {
				contig &= PAGEMASK;
			}
		}

		pp = pvn_write_kluster(vp, pp, &io_off,
		    &io_len, off, contig, flags);
		if (io_len == 0) {
			io_len = PAGESIZE;
		}

		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = vp;
		bp->b_offset = (offset_t)off;


		/*
		 * write throttle
		 */
		ASSERT(bp->b_iodone == NULL);
		bp->b_iodone =
 ud_iodone;
		/* Account for the bytes in flight; ud_iodone() undoes this. */
		mutex_enter(&ip->i_tlock);
		ip->i_writes += bp->b_bcount;
		mutex_exit(&ip->i_tlock);

		if (multi_io == 0) {

			(void) bdev_strategy(bp);
		} else {
			error = ud_multi_strat(ip, pp, bp, off);
			if (error != 0) {
				goto out;
			}
		}

		if ((flags & B_ASYNC) == 0) {
			/*
			 * Wait for i/o to complete.
			 */
			error = biowait(bp);
			pageio_done(bp);
		}
	}

	if ((flags & B_ASYNC) == 0) {
		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
	}

	/* The pages have been handed on; keep "out" from touching them. */
	pp = NULL;

out:
	if (error != 0 && pp != NULL) {
		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
	}

	if (offp) {
		*offp = io_off;
	}
	if (lenp) {
		*lenp = io_len;
	}

	return (error);
}


/*
 * b_iodone routine installed by ud_putapage() for page writes.
 *
 * Subtracts the buffer's byte count from the inode's i_writes under
 * i_tlock, broadcasting on i_wrcv when the count was at or above ud_LW,
 * has now dropped to ud_LW or below, and ud_WRITES is set.  Completion
 * is then passed on to iodone().  Always returns 0.
 */
int32_t
ud_iodone(struct buf *bp)
{
	struct ud_inode *ip;

	ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));

	/* Clear the hook so that iodone() below does not call back here. */
	bp->b_iodone = NULL;

	ip = VTOI(bp->b_pages->p_vnode);

	mutex_enter(&ip->i_tlock);
	if (ip->i_writes >= ud_LW) {
		if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
			if (ud_WRITES) {
				cv_broadcast(&ip->i_wrcv); /* wake all up */
			}
		}
	} else {
		ip->i_writes -= bp->b_bcount;
	}
	mutex_exit(&ip->i_tlock);
	iodone(bp);
	return (0);
}

/*
 * Read from inode ip into the buffers described by uio, through segmap.
 *
 * The caller must hold i_contents (asserted).  If it is held only as
 * reader it is dropped around each segmap access and re-taken afterwards.
 * Returns EIO for inodes that are not regular files, directories or
 * symlinks, EINVAL for a negative offset or offset + resid, and 0 for an
 * offset beyond MAXOFFSET_T or an empty request.  Once any data has been
 * transferred, a later error is not reported.
 */
/* ARGSUSED3 */
int32_t
ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
{
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	krw_t rwtype;
	caddr_t base;
	uint32_t flags;
	int32_t error, n, on, mapon, dofree;
	u_offset_t off;
	long oresid = uio->uio_resid;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
	    (ip->i_type != VDIR) &&
	    (ip->i_type != VLNK)) {
		return (EIO);
	}

	if (uio->uio_loffset > MAXOFFSET_T) {
		return (0);
	}

	if ((uio->uio_loffset < (offset_t)0) ||
	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	vp = ITOV(ip);
	udf_vfsp = ip->i_udf;
	mutex_enter(&ip->i_tlock);
	ip->i_flag |= IACC;
	mutex_exit(&ip->i_tlock);

	/* Remember how the caller holds the lock, to restore it later. */
	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);

	do {
		offset_t diff;
		u_offset_t uoff = uio->uio_loffset;
		/*
		 * off:   start of the MAXBSIZE window containing uoff
		 * mapon: offset of uoff within that window
		 * on:    offset of uoff within its logical block
		 * n:     bytes to move in this iteration
		 */
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n =
 (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		/* Stop at end of file; clip the final transfer to it. */
		diff = ip->i_size - uoff;

		if (diff <= (offset_t)0) {
			error = 0;
			goto out;
		}
		if (diff < (offset_t)n) {
			n = (int)diff;
		}
		dofree = ud_freebehind &&
		    ip->i_nextr == (off & PAGEMASK) &&
		    off > ud_smallfile;

#ifndef __lock_lint
		if (rwtype == RW_READER) {
			rw_exit(&ip->i_contents);
		}
#endif

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
		    (uint32_t)n, 1, S_READ);
		error = uiomove(base + mapon, (long)n, UIO_READ, uio);

		flags = 0;
		if (!error) {
			/*
			 * If read a whole block, or read to eof,
			 * won't need this buffer again soon.
			 */
			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
			    freemem < lotsfree + pages_before_pager) {
				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
			}
			/*
			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
			 * we want to make sure that the page which has
			 * been read, is written on disk if it is dirty.
			 * And corresponding indirect blocks should also
			 * be flushed out.
			 */
			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			error = segmap_release(segkmap, base, flags);
		} else    {
			/* Keep the uiomove() error; just drop the mapping. */
			(void) segmap_release(segkmap, base, flags);
		}

#ifndef __lock_lint
		if (rwtype == RW_READER) {
			rw_enter(&ip->i_contents, rwtype);
		}
#endif
	} while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
	/*
	 * Inode is updated according to this table if FRSYNC is set.
	 *
	 *	FSYNC	FDSYNC(posix.4)
	 *	--------------------------
	 *	always	IATTCHG|IBDWRITE
	 */
	if (ioflag & FRSYNC) {
		if ((ioflag & FSYNC) ||
		    ((ioflag & FDSYNC) &&
		    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
			/* Trade the lock for a writer hold to update ip. */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ud_iupdat(ip, 1);
		}
	}
	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
 */
	if (oresid != uio->uio_resid) {
		error = 0;
	}
	ITIMES(ip);

	return (error);
}

/*
 * Write the data described by uio to inode ip, through segmap.
 *
 * The caller must hold i_contents as writer (asserted); the lock is
 * dropped around each segmap access and re-taken before the inode is
 * updated.  Returns EIO for inodes that are not regular files,
 * directories or symlinks, EFBIG when the starting offset is at or
 * beyond MAXOFFSET_T or the (capped) file size limit, EINVAL for a
 * negative offset or offset + resid, and 0 for an empty request.  Once
 * any data has been transferred, a later error is not reported.
 */
int32_t
ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	caddr_t base;
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	uint32_t flags;
	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
	int32_t pagecreate, newpage;
	uint64_t old_i_size;
	u_offset_t off;
	long start_resid = uio->uio_resid, premove_resid;
	rlim64_t limit = uio->uio_limit;


	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
	    (ip->i_type != VDIR) &&
	    (ip->i_type != VLNK)) {
		return (EIO);
	}

	if (uio->uio_loffset >= MAXOFFSET_T) {
		return (EFBIG);
	}
	/*
	 * see udf_l_pathconf
	 */
	if (limit > (((uint64_t)1 << 40) - 1)) {
		limit = ((uint64_t)1 << 40) - 1;
	}
	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);
		return (EFBIG);
	}
	if ((uio->uio_loffset < (offset_t)0) ||
	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	mutex_enter(&ip->i_tlock);
	ip->i_flag |= INOACC;

	/*
	 * iupdat_flag is only assigned for synchronous writes; it is only
	 * read at "out" behind an (ioflag & FSYNC) test.
	 */
	if (ioflag & (FSYNC | FDSYNC)) {
		ip->i_flag |= ISYNC;
		iupdat_flag = 1;
	}
	mutex_exit(&ip->i_tlock);

	udf_vfsp = ip->i_udf;
	vp = ITOV(ip);

	do {
		u_offset_t uoff = uio->uio_loffset;
		/*
		 * off:   start of the MAXBSIZE window containing uoff
		 * mapon: offset of uoff within that window
		 * on:    offset of uoff within its logical block
		 * n:     bytes to move in this iteration
		 */
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		/* Regular files may not grow past the size limit. */
		if (ip->i_type == VREG && uoff + n >= limit) {
			if (uoff >= limit) {
				error = EFBIG;
				goto out;
			}
			n = (int)(limit - (rlim64_t)uoff);
		}
		if (uoff + n > ip->i_size) {
			/*
			 * We are extending the length of the file.
			 * bmap is used so that we are sure that
			 * if we need to allocate new blocks, that it
			 * is done here before we up the file size.
 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), mapon == 0, cr);
			if (error) {
				break;
			}
			/* Remember the old size so a failure can undo this. */
			i_size_changed = 1;
			old_i_size = ip->i_size;
			ip->i_size = uoff + n;
			/*
			 * If we are writing from the beginning of
			 * the mapping, we can just create the
			 * pages without having to read them.
			 */
			pagecreate = (mapon == 0);
		} else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mappings worth,
			 * so we can just create the pages w/o
			 * having to read them in.  But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), 1, cr);
			if (error) {
				break;
			}
			pagecreate = 1;
		} else {
			pagecreate = 0;
		}

		rw_exit(&ip->i_contents);

		/*
		 * Touch the page and fault it in if it is not in
		 * core before segmap_getmapflt can lock it. This
		 * is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to
		 * write to.
		 */
		uio_prefaultpages((long)n, uio);

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
		    (uint32_t)n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;
		if (pagecreate) {
			newpage = segmap_pagecreate(segkmap, base,
			    (size_t)n, 0);
		}

		/* Saved so a failed synchronous push can be unwound. */
		premove_resid = uio->uio_resid;
		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

		if (pagecreate &&
		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
 */
			int nzero, nmoved;

			/* Bytes uiomove() actually transferred this time. */
			nmoved = (int)(uio->uio_loffset - (off + mapon));
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
		}

		/*
		 * Unlock the pages allocated by page_create_va()
		 * in segmap_pagecreate()
		 */
		if (newpage) {
			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
		}

		if (error) {
			/*
			 * If we failed on a write, we may have already
			 * allocated file blocks as well as pages.  It's
			 * hard to undo the block allocation, but we must
			 * be sure to invalidate any pages that may have
			 * been allocated.
			 */
			(void) segmap_release(segkmap, base, SM_INVAL);
		} else {
			flags = 0;
			/*
			 * Force write back for synchronous write cases.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
				/*
				 * If the sticky bit is set but the
				 * execute bit is not set, we do a
				 * synchronous write back and free
				 * the page when done.  We set up swap
				 * files to be handled this way to
				 * prevent servers from keeping around
				 * the client's swap pages too long.
				 * XXX - there ought to be a better way.
				 */
				if (IS_SWAPVP(vp)) {
					flags = SM_WRITE | SM_FREE |
					    SM_DONTNEED;
					iupdat_flag = 0;
				} else {
					flags = SM_WRITE;
				}
			} else if (((mapon + n) == MAXBSIZE) ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 */
				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);

			/*
			 * If the operation failed and is synchronous,
			 * then we need to unwind what uiomove() last
			 * did so we can potentially return an error to
			 * the caller.  If this write operation was
			 * done in two pieces and the first succeeded,
			 * then we won't return an error for the second
			 * piece that failed.  However, we only want to
			 * return a resid value that reflects what was
			 * really done.
* * Failures for non-synchronous operations can * be ignored since the page subsystem will * retry the operation until it succeeds or the * file system is unmounted. */ if (error) { if ((ioflag & (FSYNC | FDSYNC)) || ip->i_type == VDIR) { uio->uio_resid = premove_resid; } else { error = 0; } } } /* * Re-acquire contents lock. */ rw_enter(&ip->i_contents, RW_WRITER); /* * If the uiomove() failed or if a synchronous * page push failed, fix up i_size. */ if (error) { if (i_size_changed) { /* * The uiomove failed, and we * allocated blocks,so get rid * of them. */ (void) ud_itrunc(ip, old_i_size, 0, cr); } } else { /* * XXX - Can this be out of the loop? */ ip->i_flag |= IUPD | ICHG; if (i_size_changed) { ip->i_flag |= IATTCHG; } if ((ip->i_perm & (IEXEC | (IEXEC >> 5) | (IEXEC >> 10))) != 0 && (ip->i_char & (ISUID | ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) { /* * Clear Set-UID & Set-GID bits on * successful write if not privileged * and at least one of the execute bits * is set. If we always clear Set-GID, * mandatory file and record locking is * unuseable. */ ip->i_char &= ~(ISUID | ISGID); } } } while (error == 0 && uio->uio_resid > 0 && n != 0); out: /* * Inode is updated according to this table - * * FSYNC FDSYNC(posix.4) * -------------------------- * always@ IATTCHG|IBDWRITE * * @ - If we are doing synchronous write the only time we should * not be sync'ing the ip here is if we have the stickyhack * activated, the file is marked with the sticky bit and * no exec bit, the file length has not been changed and * no new blocks have been allocated during this write. */ if ((ip->i_flag & ISYNC) != 0) { /* * we have eliminated nosync */ if ((ip->i_flag & (IATTCHG|IBDWRITE)) || ((ioflag & FSYNC) && iupdat_flag)) { ud_iupdat(ip, 1); } } /* * If we've already done a partial-write, terminate * the write but return no error. 
*/ if (start_resid != uio->uio_resid) { error = 0; } ip->i_flag &= ~(INOACC | ISYNC); ITIMES_NOLOCK(ip); return (error); } int32_t ud_multi_strat(struct ud_inode *ip, page_t *pp, struct buf *bp, u_offset_t start) { daddr_t bn; int32_t error = 0, io_count, contig, alloc_sz, i; uint32_t io_off; mio_master_t *mm = NULL; mio_slave_t *ms = NULL; struct buf *rbp; ASSERT(!(start & PAGEOFFSET)); /* * Figure out how many buffers to allocate */ io_count = 0; for (io_off = 0; io_off < bp->b_bcount; io_off += contig) { contig = 0; if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off), &bn, &contig)) { goto end; } if (contig == 0) { goto end; } contig = MIN(contig, PAGESIZE - io_off); if (bn != UDF_HOLE) { io_count ++; } else { /* * HOLE */ if (bp->b_flags & B_READ) { /* * This is a hole and is read * it should be filled with 0's */ pagezero(pp, io_off, contig); } } } if (io_count != 0) { /* * Allocate memory for all the * required number of buffers */ alloc_sz = sizeof (mio_master_t) + (sizeof (mio_slave_t) * io_count); mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP); if (mm == NULL) { error = ENOMEM; goto end; } /* * initialize master */ mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL); mm->mm_size = alloc_sz; mm->mm_bp = bp; mm->mm_resid = 0; mm->mm_error = 0; mm->mm_index = master_index++; ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t)); /* * Initialize buffers */ io_count = 0; for (io_off = 0; io_off < bp->b_bcount; io_off += contig) { contig = 0; if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off), &bn, &contig)) { goto end; } ASSERT(contig); if ((io_off + contig) > bp->b_bcount) { contig = bp->b_bcount - io_off; } if (bn != UDF_HOLE) { /* * Clone the buffer * and prepare to start I/O */ ms->ms_ptr = mm; bioinit(&ms->ms_buf); rbp = bioclone(bp, io_off, (size_t)contig, bp->b_edev, bn, ud_slave_done, &ms->ms_buf, KM_NOSLEEP); ASSERT(rbp == &ms->ms_buf); mm->mm_resid += contig; io_count++; ms ++; } } /* * Start I/O's */ ms = (mio_slave_t 
*)(((caddr_t)mm) + sizeof (mio_master_t)); for (i = 0; i < io_count; i++) { (void) bdev_strategy(&ms->ms_buf); ms ++; } } end: if (error != 0) { bp->b_flags |= B_ERROR; bp->b_error = error; if (mm != NULL) { mutex_destroy(&mm->mm_mutex); kmem_free(mm, mm->mm_size); } } return (error); } int32_t ud_slave_done(struct buf *bp) { mio_master_t *mm; int32_t resid; ASSERT(SEMA_HELD(&bp->b_sem)); ASSERT((bp->b_flags & B_DONE) == 0); mm = ((mio_slave_t *)bp)->ms_ptr; /* * Propagate error and byte count info from slave struct to * the master struct */ mutex_enter(&mm->mm_mutex); if (bp->b_flags & B_ERROR) { /* * If multiple slave buffers get * error we forget the old errors * this is ok because we any way * cannot return multiple errors */ mm->mm_error = bp->b_error; } mm->mm_resid -= bp->b_bcount; resid = mm->mm_resid; mutex_exit(&mm->mm_mutex); /* * free up the resources allocated to cloned buffers. */ bp_mapout(bp); biofini(bp); if (resid == 0) { /* * This is the last I/O operation * clean up and return the original buffer */ if (mm->mm_error) { mm->mm_bp->b_flags |= B_ERROR; mm->mm_bp->b_error = mm->mm_error; } biodone(mm->mm_bp); mutex_destroy(&mm->mm_mutex); kmem_free(mm, mm->mm_size); } return (0); }