/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Vnode operations for the High Sierra filesystem
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/fbuf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/dkio.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/page.h>
#include <sys/swap.h>
#include <sys/avl.h>
#include <sys/sunldi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>

/*
 * For struct modlinkage
 */
#include <sys/modctl.h>

#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>

#include <fs/fs_subr.h>

/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;

/*
 * This is the max number of taskq threads that will be created
 * if required. Since we are using a Dynamic TaskQ by default only
 * one thread is created initially.
 *
 * NOTE: In the usual hsfs use case this per fs instance number
 * of taskq threads should not place any undue load on a system.
 * Even on an unusual system with say 100 CDROM drives, 800 threads
 * will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If there is at
 * all a complaint of system load due to such an unusual case it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts with a nthreads of say 32.
 */
static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */

/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;

/*
 * Kmem caches for heavily used small allocations. Using these kmem
 * caches provides a factor of 3 reduction in system time and greatly
 * aids overall throughput esp. on SPARC.
 */
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;
/*
 * This tunable allows us to ignore inode numbers from rrip-1.12.
 * In this case, we fall back to our default inode algorithm.
 */
extern int use_rrip_inodes;

/*
 * Free behind logic from UFS to tame our thirst for
 * the page cache.
 * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
 * explanation.
 */
static int	freebehind = 1;
static int	smallfile = 0;
static int	cache_read_ahead = 0;
static u_offset_t smallfile64 = 32 * 1024;
#define	SMALLFILE1_D 1000
#define	SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;	/* when to recompute */
static uint_t smallfile1_d = SMALLFILE1_D;
static uint_t smallfile2_d = SMALLFILE2_D;

static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);

/* ARGSUSED */
static int
hsfs_fsync(vnode_t *cp,
    int syncflag,
    cred_t *cred,
    caller_context_t *ct)
{
	return (0);
}


/*ARGSUSED*/
static int
hsfs_read(struct vnode *vp,
    struct uio *uiop,
    int ioflag,
    struct cred *cred,
    struct caller_context *ct)
{
	caddr_t base;
	offset_t diff;
	int error;
	struct hsnode *hp;
	uint_t filesize;
	int dofree;

	hp = VTOH(vp);
	/*
	 * if vp is of type VDIR, make sure dirent
	 * is filled up with all info (because of ptbl)
	 */
	if (vp->v_type == VDIR) {
		if (hp->hs_dirent.ext_size == 0)
			hs_filldirent(vp, &hp->hs_dirent);
	}
	filesize = hp->hs_dirent.ext_size;

	/* Sanity checks. */
	if (uiop->uio_resid == 0 ||		/* No data wanted. */
	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
	    uiop->uio_loffset >= filesize)	/* Past EOF. */
		return (0);

	do {
		/*
		 * We want to ask for only the "right" amount of data.
		 * In this case that means:-
		 *
		 * We can't get data from beyond our EOF. If asked,
		 * we will give a short read.
		 *
		 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
		 * These buffers are always MAXBSIZE aligned.
		 * If our starting offset is not MAXBSIZE aligned,
		 * we can only ask for less than MAXBSIZE bytes.
		 *
		 * If our requested offset and length are such that
		 * they belong in different MAXBSIZE aligned slots
		 * then we'll be making more than one call on
		 * segmap_getmapflt.
		 *
		 * This diagram shows the variables we use and their
		 * relationships.
		 *
		 * |<-----MAXBSIZE----->|
		 * +--------------------------...+
		 * |.....mapon->|<--n-->|....*...|EOF
		 * +--------------------------...+
		 * uio_loffset->|
		 * uio_resid....|<---------->|
		 * diff.........|<-------------->|
		 *
		 * So, in this case our offset is not aligned
		 * and our request takes us outside of the
		 * MAXBSIZE window. We will break this up into
		 * two segmap_getmapflt calls.
		 */
		size_t nbytes;
		offset_t mapon;
		size_t n;
		uint_t flags;

		mapon = uiop->uio_loffset & MAXBOFFSET;
		diff = filesize - uiop->uio_loffset;
		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
		n = MIN(diff, nbytes);
		if (n <= 0) {
			/* EOF or request satisfied. */
			return (0);
		}
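
		/*
		 * Free-behind: when this read is detected as sequential
		 * (hs_prev_offset matches the current offset and read-ahead
		 * is active), the segmap slot is released below with
		 * SM_FREE | SM_ASYNC, and once the offset passes smallfile2
		 * (and cache_read_ahead is not set) SM_DONTNEED is added so
		 * large streaming reads do not flood the page cache.
		 * For example, with 8 GB free and 8 CPUs online,
		 * percpufreeb is 1 GB, giving a smallfile1 of about 1 MB
		 * and a smallfile2 of about 100 MB (illustrative numbers
		 * only).
		 */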
		/*
		 * Freebehind computation taken from:
		 * usr/src/uts/common/fs/ufs/ufs_vnops.c
		 */
		if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
			uint64_t percpufreeb;
			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
			smallfile1 = percpufreeb / smallfile1_d;
			smallfile2 = percpufreeb / smallfile2_d;
			smallfile1 = MAX(smallfile1, smallfile);
			smallfile1 = MAX(smallfile1, smallfile64);
			smallfile2 = MAX(smallfile1, smallfile2);
			smallfile_update = drv_hztousec(ddi_get_lbolt())
			    + 1000000;
		}

		dofree = freebehind &&
		    hp->hs_prev_offset == uiop->uio_loffset &&
		    hp->hs_ra_bytes > 0;

		base = segmap_getmapflt(segkmap, vp,
		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);

		error = uiomove(base + mapon, n, UIO_READ, uiop);

		if (error == 0) {
			/*
			 * if read a whole block, or read to eof,
			 * won't need this buffer again soon.
			 */
			if (n + mapon == MAXBSIZE ||
			    uiop->uio_loffset == filesize)
				flags = SM_DONTNEED;
			else
				flags = 0;

			if (dofree) {
				flags = SM_FREE | SM_ASYNC;
				if ((cache_read_ahead == 0) &&
				    uiop->uio_loffset > smallfile2)
					flags |= SM_DONTNEED;
			}

			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (error == 0 && uiop->uio_resid > 0);

	return (error);
}

/*ARGSUSED2*/
static int
hsfs_getattr(
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;
	struct vfs *vfsp;
	struct hsfs *fsp;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	vfsp = vp->v_vfsp;

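	/*
	 * Directory sizes are filled in lazily (see hsfs_read()), so make
	 * sure the dirent is complete before reporting attributes.
	 */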
	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
		hs_filldirent(vp, &hp->hs_dirent);
	}
	vap->va_type = IFTOVT(hp->hs_dirent.mode);
	vap->va_mode = hp->hs_dirent.mode;
	vap->va_uid = hp->hs_dirent.uid;
	vap->va_gid = hp->hs_dirent.gid;

	vap->va_fsid = vfsp->vfs_dev;
	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
	vap->va_nlink = hp->hs_dirent.nlink;
	vap->va_size = (offset_t)hp->hs_dirent.ext_size;

	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		vap->va_rdev = hp->hs_dirent.r_dev;
	else
		vap->va_rdev = 0;
	vap->va_blksize = vfsp->vfs_bsize;
	/* no. of blocks = no. of data blocks + no. of xar blocks */
	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
	vap->va_seq = hp->hs_seq;
	return (0);
}

/*ARGSUSED*/
static int
hsfs_readlink(struct vnode *vp,
    struct uio *uiop,
    struct cred *cred,
    caller_context_t *ct)
{
	struct hsnode *hp;

	if (vp->v_type != VLNK)
		return (EINVAL);

	hp = VTOH(vp);

	if (hp->hs_dirent.sym_link == (char *)NULL)
		return (ENOENT);

	return (uiomove(hp->hs_dirent.sym_link,
	    (size_t)MIN(hp->hs_dirent.ext_size,
	    uiop->uio_resid), UIO_READ, uiop));
}

/*ARGSUSED*/
static void
hsfs_inactive(struct vnode *vp,
    struct cred *cred,
    caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfs *fsp;

	int nopage;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	/*
	 * Note: acquiring and holding v_lock for quite a while
	 * here serializes on the vnode; this is unfortunate, but
	 * likely not to overly impact performance, as the underlying
	 * device (CDROM drive) is quite slow.
	 */
	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
	mutex_enter(&hp->hs_contents_lock);
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 1) {
		panic("hsfs_inactive: v_count < 1");
		/*NOTREACHED*/
	}

	if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
		vp->v_count--;	/* release hold from vn_rele */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		rw_exit(&fsp->hsfs_hash_lock);
		return;
	}
	vp->v_count--;	/* release hold from vn_rele */
	if (vp->v_count == 0) {
		/*
		 * Free the hsnode.
		 * If there are no pages associated with the
		 * hsnode, give it back to the kmem_cache,
		 * else put at the end of this file system's
		 * internal free list.
		 */
		nopage = !vn_has_cached_data(vp);
		hp->hs_flags = 0;
		/*
		 * exit these locks now, since hs_freenode may
		 * kmem_free the hsnode and embedded vnode
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		hs_freenode(vp, fsp, nopage);
	} else {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
	}
	rw_exit(&fsp->hsfs_hash_lock);
}


/*ARGSUSED*/
static int
hsfs_lookup(
	struct vnode *dvp,
	char *nm,
	struct vnode **vpp,
	struct pathname *pnp,
	int flags,
	struct vnode *rdir,
	struct cred *cred,
	caller_context_t *ct,
	int *direntflags,
	pathname_t *realpnp)
{
	int error;
	int namelen = (int)strlen(nm);

	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * If we're looking for ourself, life is simple.
	 */
	if (namelen == 1 && *nm == '.') {
		if (error = hs_access(dvp, (mode_t)VEXEC, cred))
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	return (hs_dirlook(dvp, nm, namelen, vpp, cred));
}


/*ARGSUSED*/
static int
hsfs_readdir(
	struct vnode	*vp,
	struct uio	*uiop,
	struct cred	*cred,
	int		*eofp,
	caller_context_t *ct,
	int		flags)
{
	struct hsnode	*dhp;
	struct hsfs	*fsp;
	struct hs_direntry hd;
	struct dirent64	*nd;
	int		error;
	uint_t		offset;		/* real offset in directory */
	uint_t		dirsiz;		/* real size of directory */
	uchar_t		*blkp;
	int		hdlen;		/* length of hs directory entry */
	long		ndlen;		/* length of dirent entry */
	int		bytes_wanted;
	size_t		bufsize;	/* size of dirent buffer */
	char		*outbuf;	/* ptr to dirent buffer */
	char		*dname;
	int		dnamelen;
	size_t		dname_size;
	struct fbuf	*fbp;
	uint_t		last_offset;	/* last index into current dir block */
	ino64_t		dirino;	/* temporary storage before storing in dirent */
	off_t		diroff;

	dhp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	if (dhp->hs_dirent.ext_size == 0)
		hs_filldirent(vp, &dhp->hs_dirent);
	dirsiz = dhp->hs_dirent.ext_size;
	if (uiop->uio_loffset >= dirsiz) {	/* at or beyond EOF */
		if (eofp)
			*eofp = 1;
		return (0);
	}
	ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
	offset = uiop->uio_loffset;

	dname_size = fsp->hsfs_namemax + 1;	/* 1 for the ending NUL */
	dname = kmem_alloc(dname_size, KM_SLEEP);
	bufsize = uiop->uio_resid + sizeof (struct dirent64);

	outbuf = kmem_alloc(bufsize, KM_SLEEP);
	nd = (struct dirent64 *)outbuf;

	while (offset < dirsiz) {
		bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));

		error = fbread(vp, (offset_t)(offset & MAXBMASK),
		    (unsigned int)bytes_wanted, S_READ, &fbp);
		if (error)
			goto done;

		blkp = (uchar_t *)fbp->fb_addr;
		last_offset = (offset & MAXBMASK) + fbp->fb_count;

#define	rel_offset(offset) ((offset) & MAXBOFFSET)	/* index into blkp */

		while (offset < last_offset) {
			/*
			 * Very similar validation code is found in
			 * process_dirblock(), hsfs_node.c.
			 * For an explanation, see there.
			 * It may make sense for the future to
			 * "consolidate" the code in hs_parsedir(),
			 * process_dirblock() and hsfs_readdir() into
			 * a single utility function.
			 */
			hdlen = (int)((uchar_t)
			    HDE_DIR_LEN(&blkp[rel_offset(offset)]));
			if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
			    offset + hdlen > last_offset) {
				/*
				 * advance to next sector boundary
				 */
				offset = roundup(offset + 1, HS_SECTOR_SIZE);
				if (hdlen)
					hs_log_bogus_disk_warning(fsp,
					    HSFS_ERR_TRAILING_JUNK, 0);

				continue;
			}

			bzero(&hd, sizeof (hd));

			/*
			 * Just ignore invalid directory entries.
			 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
			 */
			if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
			    &hd, dname, &dnamelen, last_offset - offset)) {
				/*
				 * Determine if there is enough room
				 */
				ndlen = (long)DIRENT64_RECLEN((dnamelen));

				if ((ndlen + ((char *)nd - outbuf)) >
				    uiop->uio_resid) {
					fbrelse(fbp, S_READ);
					goto done; /* output buffer full */
				}

				diroff = offset + hdlen;
				/*
				 * If the media carries rrip-v1.12 or newer,
				 * and we trust the inodes from the rrip data
				 * (use_rrip_inodes != 0), use that data. If
				 * the media has been created by a recent
				 * mkisofs version, we may trust all numbers
				 * in the starting extent number; otherwise,
				 * we cannot do this for zero sized files and
				 * symlinks, because if we did we'd end up
				 * mapping all of them to the same node.
				 * We use HS_DUMMY_INO in this case and make
				 * sure that we will not map all files to the
				 * same meta data.
				 */
				if (hd.inode != 0 && use_rrip_inodes) {
					dirino = hd.inode;
				} else if ((hd.ext_size == 0 ||
				    hd.sym_link != (char *)NULL) &&
				    (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
					dirino = HS_DUMMY_INO;
				} else {
					dirino = hd.ext_lbn;
				}

				/* strncpy(9f) will zero uninitialized bytes */

				ASSERT(strlen(dname) + 1 <=
				    DIRENT64_NAMELEN(ndlen));
				(void) strncpy(nd->d_name, dname,
				    DIRENT64_NAMELEN(ndlen));
				nd->d_reclen = (ushort_t)ndlen;
				nd->d_off = (offset_t)diroff;
				nd->d_ino = dirino;
				nd = (struct dirent64 *)((char *)nd + ndlen);

				/*
				 * free up space allocated for symlink
				 */
				if (hd.sym_link != (char *)NULL) {
					kmem_free(hd.sym_link,
					    (size_t)(hd.ext_size+1));
					hd.sym_link = (char *)NULL;
				}
			}
			offset += hdlen;
		}
		fbrelse(fbp, S_READ);
	}

	/*
	 * Got here for one of the following reasons:
	 * 1) outbuf is full (error == 0)
	 * 2) end of directory reached (error == 0)
	 * 3) error reading directory sector (error != 0)
	 * 4) directory entry crosses sector boundary (error == 0)
	 *
	 * If any directory entries have been copied, don't report
	 * case 4. Instead, return the valid directory entries.
	 *
	 * If no entries have been copied, report the error.
	 * If case 4, this will be indistinguishable from EOF.
	 */
done:
	ndlen = ((char *)nd - outbuf);
	if (ndlen != 0) {
		error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
		uiop->uio_loffset = offset;
	}
	kmem_free(dname, dname_size);
	kmem_free(outbuf, bufsize);
	if (eofp && error == 0)
		*eofp = (uiop->uio_loffset >= dirsiz);
	return (error);
}

/*ARGSUSED2*/
static int
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfid *fid;

	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
		return (ENOSPC);
	}

	fid = (struct hsfid *)fidp;
	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
	hp = VTOH(vp);
	mutex_enter(&hp->hs_contents_lock);
	fid->hf_dir_lbn = hp->hs_dir_lbn;
	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
	fid->hf_ino = hp->hs_nodeid;
	mutex_exit(&hp->hs_contents_lock);
	return (0);
}

/*ARGSUSED*/
static int
hsfs_open(struct vnode **vpp,
    int flag,
    struct cred *cred,
    caller_context_t *ct)
{
	return (0);
}

/*ARGSUSED*/
static int
hsfs_close(
	struct vnode *vp,
	int flag,
	int count,
	offset_t offset,
	struct cred *cred,
	caller_context_t *ct)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED2*/
static int
hsfs_access(struct vnode *vp,
    int mode,
    int flags,
    cred_t *cred,
    caller_context_t *ct)
{
	return (hs_access(vp, (mode_t)mode, cred));
}

/*
 * the seek time of a CD-ROM is very slow, and data transfer
 * rate is even worse (max. 150K per sec).  The design
 * decision is to reduce access to cd-rom as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * UFS style of read ahead one block at a time is not appropriate,
 * and is not supported.
 */

/*
 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage;	/* no. of times we lost original page */

/*
 * Used to prevent biodone() from releasing buf resources that
 * we didn't allocate in quite the usual way.
 */
/*ARGSUSED*/
int
hsfs_iodone(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

/*
 * The taskq thread that invokes the scheduling function to ensure
 * that all readaheads are complete and cleans up the associated
 * memory and releases the page lock.
 */
void
hsfs_ra_task(void *arg)
{
	struct hio_info *info = arg;
	uint_t count;
	struct buf *wbuf;

	ASSERT(info->pp != NULL);

	for (count = 0; count < info->bufsused; count++) {
		wbuf = &(info->bufs[count]);

		DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
		while (sema_tryp(&(info->sema[count])) == 0) {
			if (hsched_invoke_strategy(info->fsp)) {
				sema_p(&(info->sema[count]));
				break;
			}
		}
		sema_destroy(&(info->sema[count]));
		DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
		biofini(&(info->bufs[count]));
	}
	for (count = 0; count < info->bufsused; count++) {
		if (info->vas[count] != NULL) {
			ppmapout(info->vas[count]);
		}
	}
	kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
	kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
	kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));

	pvn_read_done(info->pp, 0);
	kmem_cache_free(hio_info_cache, info);
}

/*
 * Submit asynchronous readahead requests to the I/O scheduler
 * depending on the number of pages to read ahead. These requests
 * are asynchronous to the calling thread but I/O requests issued
 * subsequently by other threads with higher LBNs must wait for
 * these readaheads to complete since we have a single ordered
 * I/O pipeline. Thus these readaheads are semi-asynchronous.
 * A TaskQ handles waiting for the readaheads to complete.
 *
 * This function is mostly a copy of hsfs_getapage but somewhat
 * simpler. A readahead request is aborted if page allocation
 * fails.
 */
/*ARGSUSED*/
static int
hsfs_getpage_ra(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	struct hsnode *hp,
	struct hsfs *fsp,
	int xarsiz,
	offset_t bof,
	int chunk_lbn_count,
	int chunk_data_bytes)
{
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t	*fio_done;
	struct hio_info *info;
	size_t	len;

	ASSERT(fsp->hqueue != NULL);

	if (addr >= seg->s_base + seg->s_size) {
		return (-1);
	}

	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;	/* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	if (off >= filsiz)
		return (0);

	extension = 0;
	pp = NULL;

	extension += hp->hs_ra_bytes;

	/*
	 * Some CD writers (e.g. Kodak Photo CD writers)
	 * create CDs in TAO mode and reserve tracks that
	 * are not completely written. Some sectors remain
	 * unreadable for this reason and give I/O errors.
	 * Also, there's no point in reading sectors
	 * we'll never look at.  So, if we're asked to go
So, if we're asked to go 864f9ec9c5aSmg147109 * beyond the end of a file, truncate to the length 865f9ec9c5aSmg147109 * of that file. 86684b82766Smg147109 * 867f9ec9c5aSmg147109 * Additionally, this behaviour is required by section 868f9ec9c5aSmg147109 * 6.4.5 of ISO 9660:1988(E). 86984b82766Smg147109 */ 87084b82766Smg147109 len = MIN(extension ? extension : PAGESIZE, filsiz - off); 87184b82766Smg147109 87284b82766Smg147109 /* A little paranoia */ 87384b82766Smg147109 if (len <= 0) 87484b82766Smg147109 return (-1); 87584b82766Smg147109 87684b82766Smg147109 /* 87784b82766Smg147109 * After all that, make sure we're asking for things in units 87884b82766Smg147109 * that bdev_strategy() will understand (see bug 4202551). 87984b82766Smg147109 */ 88084b82766Smg147109 len = roundup(len, DEV_BSIZE); 88184b82766Smg147109 88284b82766Smg147109 pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp, 88384b82766Smg147109 &io_len_tmp, off, len, 1); 88484b82766Smg147109 88584b82766Smg147109 if (pp == NULL) { 88684b82766Smg147109 hp->hs_num_contig = 0; 88784b82766Smg147109 hp->hs_ra_bytes = 0; 88884b82766Smg147109 hp->hs_prev_offset = 0; 88984b82766Smg147109 return (-1); 89084b82766Smg147109 } 89184b82766Smg147109 89284b82766Smg147109 io_off = (uint_t)io_off_tmp; 89384b82766Smg147109 io_len = (uint_t)io_len_tmp; 89484b82766Smg147109 89584b82766Smg147109 /* check for truncation */ 89684b82766Smg147109 /* 89784b82766Smg147109 * xxx Clean up and return EIO instead? 89884b82766Smg147109 * xxx Ought to go to u_offset_t for everything, but we 89984b82766Smg147109 * xxx call lots of things that want uint_t arguments. 90084b82766Smg147109 */ 90184b82766Smg147109 ASSERT(io_off == io_off_tmp); 90284b82766Smg147109 90384b82766Smg147109 /* 90484b82766Smg147109 * get enough buffers for worst-case scenario 90584b82766Smg147109 * (i.e., no coalescing possible). 90684b82766Smg147109 */ 90784b82766Smg147109 bufcnt = (len + secsize - 1) / secsize; 90884b82766Smg147109 bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP); 90984b82766Smg147109 vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP); 91084b82766Smg147109 91184b82766Smg147109 /* 91284b82766Smg147109 * Allocate a array of semaphores since we are doing I/O 91384b82766Smg147109 * scheduling. 91484b82766Smg147109 */ 91584b82766Smg147109 fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP); 91684b82766Smg147109 91784b82766Smg147109 /* 91884b82766Smg147109 * If our filesize is not an integer multiple of PAGESIZE, 91984b82766Smg147109 * we zero that part of the last page that's between EOF and 92084b82766Smg147109 * the PAGESIZE boundary. 

	/*
	 * If our filesize is not an integer multiple of PAGESIZE,
	 * we zero that part of the last page that's between EOF and
	 * the PAGESIZE boundary.
	 */
	xlen = io_len & PAGEOFFSET;
	if (xlen != 0)
		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

	DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);

	va = NULL;
	lastp = NULL;
	searchp = pp;
	io_end = io_off + io_len;
	for (count = 0, byte_offset = io_off;
	    byte_offset < io_end;
	    count++) {
		ASSERT(count < bufcnt);

		bioinit(&bufs[count]);
		bufs[count].b_edev = devvp->v_rdev;
		bufs[count].b_dev = cmpdev(devvp->v_rdev);
		bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
		bufs[count].b_iodone = hsfs_iodone;
		bufs[count].b_vp = vp;
		bufs[count].b_file = vp;

		/* Compute disk address for interleaving. */

		/* considered without skips */
		which_chunk_lbn = byte_offset / chunk_data_bytes;

		/* factor in skips */
		offset_lbn = which_chunk_lbn * chunk_lbn_count;

		/* convert to physical byte offset for lbn */
		offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

		/* don't forget offset into lbn */
		offset_extra = byte_offset % chunk_data_bytes;

		/* get virtual block number for driver */
		driver_block = lbtodb(bof + xarsiz
		    + offset_bytes + offset_extra);

		if (lastp != searchp) {
			/* this branch taken first time through loop */
			va = vas[count] = ppmapin(searchp, PROT_WRITE,
			    (caddr_t)-1);
			/* ppmapin() guarantees not to return NULL */
		} else {
			vas[count] = NULL;
		}

		bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
		bufs[count].b_offset =
		    (offset_t)(byte_offset - io_off + off);

		/*
		 * We specifically use the b_lblkno member here
		 * as even in the 32 bit world driver_block can
		 * get very large in line with the ISO9660 spec.
		 */

		bufs[count].b_lblkno = driver_block;

		remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
		    - byte_offset;

		/*
		 * remaining_bytes can't be zero, as we derived
		 * which_chunk_lbn directly from byte_offset.
		 */
		if ((remaining_bytes + byte_offset) < (off + len)) {
			/* coalesce-read the rest of the chunk */
			bufs[count].b_bcount = remaining_bytes;
		} else {
			/* get the final bits */
			bufs[count].b_bcount = off + len - byte_offset;
		}

		remainder = PAGESIZE - (byte_offset % PAGESIZE);
		if (bufs[count].b_bcount > remainder) {
			bufs[count].b_bcount = remainder;
		}

		bufs[count].b_bufsize = bufs[count].b_bcount;
		if (((offset_t)byte_offset + bufs[count].b_bcount) >
		    HS_MAXFILEOFF) {
			break;
		}
		byte_offset += bufs[count].b_bcount;

		/*
		 * We are scheduling I/O so we need to enqueue
		 * requests rather than calling bdev_strategy
		 * here. A later invocation of the scheduling
		 * function will take care of doing the actual
		 * I/O as it selects requests from the queue as
		 * per the scheduling logic.
		 */
		struct hio *hsio = kmem_cache_alloc(hio_cache,
		    KM_SLEEP);

		sema_init(&fio_done[count], 0, NULL,
		    SEMA_DEFAULT, NULL);
		hsio->bp = &bufs[count];
		hsio->sema = &fio_done[count];
		hsio->io_lblkno = bufs[count].b_lblkno;
		hsio->nblocks = howmany(hsio->bp->b_bcount,
		    DEV_BSIZE);

		/* used for deadline */
		hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());

		/* for I/O coalescing */
		hsio->contig_chain = NULL;
		hsched_enqueue_io(fsp, hsio, 1);

		lwp_stat_update(LWP_STAT_INBLK, 1);
		lastp = searchp;
		if ((remainder - bufs[count].b_bcount) < 1) {
			searchp = searchp->p_next;
		}
	}

	bufsused = count;
	info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
	info->bufs = bufs;
	info->vas = vas;
	info->sema = fio_done;
	info->bufsused = bufsused;
	info->bufcnt = bufcnt;
	info->fsp = fsp;
	info->pp = pp;

	(void) taskq_dispatch(fsp->hqueue->ra_task,
	    hsfs_ra_task, info, KM_SLEEP);
	/*
	 * The I/O locked pages are unlocked in our taskq thread.
	 */
	return (0);
}

/*
 * Each file may have a different interleaving on disk. This makes
 * things somewhat interesting.  The gist is that there are some
 * number of contiguous data sectors, followed by some other number
 * of contiguous skip sectors. The sum of those two sets of sectors
 * defines the interleave size. Unfortunately, it means that we
 * generally can't simply read N sectors starting at a given offset
 * to satisfy any given request.
 *
 * What we do is get the relevant memory pages via pvn_read_kluster(),
 * then stride through the interleaves, setting up a buf for each
 * sector that needs to be brought in. Instead of kmem_alloc'ing
 * space for the sectors, though, we just point at the appropriate
 * spot in the relevant page for each of them. This saves us a bunch
 * of copying.
 *
 * NOTICE: The code below in hsfs_getapage is mostly the same as the
 *         code in hsfs_getpage_ra above (with some omissions). If you
 *         are making any change to this function, please also look at
 *         hsfs_getpage_ra.
 */
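/*
 * As an illustration of the interleave arithmetic: with 2048-byte
 * logical blocks, intlf_sz = 4 and intlf_sk = 2, each chunk holds
 * 4 * 2048 = 8192 data bytes followed by 2 skip blocks. A file byte
 * offset of 10000 then falls in chunk 1 (10000 / 8192), which starts
 * at logical block 1 * (4 + 2) = 6 of the extent, i.e. 12288 bytes in,
 * plus the remaining 10000 % 8192 = 1808 bytes within that chunk
 * (illustrative numbers only).
 */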
The gist is that there are some 10657c478bd9Sstevel@tonic-gate * number of contiguous data sectors, followed by some other number 10667c478bd9Sstevel@tonic-gate * of contiguous skip sectors. The sum of those two sets of sectors 10677c478bd9Sstevel@tonic-gate * defines the interleave size. Unfortunately, it means that we generally 10687c478bd9Sstevel@tonic-gate * can't simply read N sectors starting at a given offset to satisfy 10697c478bd9Sstevel@tonic-gate * any given request. 10707c478bd9Sstevel@tonic-gate * 10717c478bd9Sstevel@tonic-gate * What we do is get the relevant memory pages via pvn_read_kluster(), 10727c478bd9Sstevel@tonic-gate * then stride through the interleaves, setting up a buf for each 10737c478bd9Sstevel@tonic-gate * sector that needs to be brought in. Instead of kmem_alloc'ing 10747c478bd9Sstevel@tonic-gate * space for the sectors, though, we just point at the appropriate 10757c478bd9Sstevel@tonic-gate * spot in the relevant page for each of them. This saves us a bunch 10767c478bd9Sstevel@tonic-gate * of copying. 107784b82766Smg147109 * 107884b82766Smg147109 * NOTICE: The code below in hsfs_getapage is mostly same as the code 107984b82766Smg147109 * in hsfs_getpage_ra above (with some omissions). If you are 108084b82766Smg147109 * making any change to this function, please also look at 108184b82766Smg147109 * hsfs_getpage_ra. 10827c478bd9Sstevel@tonic-gate */ 10837c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 10847c478bd9Sstevel@tonic-gate static int 10857c478bd9Sstevel@tonic-gate hsfs_getapage( 10867c478bd9Sstevel@tonic-gate struct vnode *vp, 10877c478bd9Sstevel@tonic-gate u_offset_t off, 10887c478bd9Sstevel@tonic-gate size_t len, 10897c478bd9Sstevel@tonic-gate uint_t *protp, 10907c478bd9Sstevel@tonic-gate struct page *pl[], 10917c478bd9Sstevel@tonic-gate size_t plsz, 10927c478bd9Sstevel@tonic-gate struct seg *seg, 10937c478bd9Sstevel@tonic-gate caddr_t addr, 10947c478bd9Sstevel@tonic-gate enum seg_rw rw, 10957c478bd9Sstevel@tonic-gate struct cred *cred) 10967c478bd9Sstevel@tonic-gate { 10977c478bd9Sstevel@tonic-gate struct hsnode *hp; 10987c478bd9Sstevel@tonic-gate struct hsfs *fsp; 10997c478bd9Sstevel@tonic-gate int err; 11007c478bd9Sstevel@tonic-gate struct buf *bufs; 11017c478bd9Sstevel@tonic-gate caddr_t *vas; 11027c478bd9Sstevel@tonic-gate caddr_t va; 11037c478bd9Sstevel@tonic-gate struct page *pp, *searchp, *lastp; 11047c478bd9Sstevel@tonic-gate page_t *pagefound; 11057c478bd9Sstevel@tonic-gate offset_t bof; 11067c478bd9Sstevel@tonic-gate struct vnode *devvp; 11077c478bd9Sstevel@tonic-gate ulong_t byte_offset; 11087c478bd9Sstevel@tonic-gate size_t io_len_tmp; 11097c478bd9Sstevel@tonic-gate uint_t io_off, io_len; 11107c478bd9Sstevel@tonic-gate uint_t xlen; 11117c478bd9Sstevel@tonic-gate uint_t filsiz; 11127c478bd9Sstevel@tonic-gate uint_t secsize; 11137c478bd9Sstevel@tonic-gate uint_t bufcnt; 11147c478bd9Sstevel@tonic-gate uint_t bufsused; 11157c478bd9Sstevel@tonic-gate uint_t count; 11167c478bd9Sstevel@tonic-gate uint_t io_end; 11177c478bd9Sstevel@tonic-gate uint_t which_chunk_lbn; 11187c478bd9Sstevel@tonic-gate uint_t offset_lbn; 11197c478bd9Sstevel@tonic-gate uint_t offset_extra; 11207c478bd9Sstevel@tonic-gate offset_t offset_bytes; 11217c478bd9Sstevel@tonic-gate uint_t remaining_bytes; 11227c478bd9Sstevel@tonic-gate uint_t extension; 11237c478bd9Sstevel@tonic-gate int remainder; /* must be signed */ 11247c478bd9Sstevel@tonic-gate int chunk_lbn_count; 11257c478bd9Sstevel@tonic-gate int chunk_data_bytes; 11267c478bd9Sstevel@tonic-gate int xarsiz; 
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t *fio_done;
	int	calcdone;

	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it. If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;	/* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	/* disk addr for start of file */
	bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);

	/* xarsiz byte must be skipped for data */
	xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;

	/* how many logical blocks in an interleave (data+skip) */
	chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;

	if (chunk_lbn_count == 0) {
		chunk_lbn_count = 1;
	}

	/*
	 * Convert interleaving size into bytes. The zero case
	 * (no interleaving) optimization is handled as a side-
	 * effect of the read-ahead logic.
	 */
	if (hp->hs_dirent.intlf_sz == 0) {
		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
		/*
		 * Optimization: If our pagesize is a multiple of LBN
		 * bytes, we can avoid breaking up a page into individual
		 * lbn-sized requests.
		 */
		if (PAGESIZE % chunk_data_bytes == 0) {
			chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
			chunk_data_bytes = PAGESIZE;
		}
	} else {
		chunk_data_bytes =
		    LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
	}

reread:
	err = 0;
	pagefound = 0;
	calcdone = 0;

	/*
	 * Do some read-ahead. This mostly saves us a bit of
	 * system cpu time more than anything else when doing
	 * sequential reads.  At some point, could do the
At some point, could do the 11917c478bd9Sstevel@tonic-gate * read-ahead asynchronously which might gain us something 11927c478bd9Sstevel@tonic-gate * on wall time, but it seems unlikely.... 11937c478bd9Sstevel@tonic-gate * 11947c478bd9Sstevel@tonic-gate * We do the easy case here, which is to read through 11957c478bd9Sstevel@tonic-gate * the end of the chunk, minus whatever's at the end that 11967c478bd9Sstevel@tonic-gate * won't exactly fill a page. 11977c478bd9Sstevel@tonic-gate */ 119884b82766Smg147109 if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) { 11997c478bd9Sstevel@tonic-gate which_chunk_lbn = (off + len) / chunk_data_bytes; 12007c478bd9Sstevel@tonic-gate extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off; 12017c478bd9Sstevel@tonic-gate extension -= (extension % PAGESIZE); 12028cd7c4fcSpeterte } else { 120384b82766Smg147109 extension = roundup(len, PAGESIZE); 12047c478bd9Sstevel@tonic-gate } 12057c478bd9Sstevel@tonic-gate 120684b82766Smg147109 atomic_inc_64(&fsp->total_pages_requested); 12077c478bd9Sstevel@tonic-gate 12087c478bd9Sstevel@tonic-gate pp = NULL; 12097c478bd9Sstevel@tonic-gate again: 12107c478bd9Sstevel@tonic-gate /* search for page in buffer */ 12117c478bd9Sstevel@tonic-gate if ((pagefound = page_exists(vp, off)) == 0) { 12127c478bd9Sstevel@tonic-gate /* 12137c478bd9Sstevel@tonic-gate * Need to really do disk IO to get the page. 12147c478bd9Sstevel@tonic-gate */ 121584b82766Smg147109 if (!calcdone) { 121684b82766Smg147109 extension += hp->hs_ra_bytes; 121784b82766Smg147109 121884b82766Smg147109 /* 121984b82766Smg147109 * Some cd writers don't write sectors that aren't 122084b82766Smg147109 * used. Also, there's no point in reading sectors 122184b82766Smg147109 * we'll never look at. So, if we're asked to go 122284b82766Smg147109 * beyond the end of a file, truncate to the length 122384b82766Smg147109 * of that file. 122484b82766Smg147109 * 122584b82766Smg147109 * Additionally, this behaviour is required by section 122684b82766Smg147109 * 6.4.5 of ISO 9660:1988(E). 122784b82766Smg147109 */ 122884b82766Smg147109 len = MIN(extension ? extension : PAGESIZE, 122984b82766Smg147109 filsiz - off); 123084b82766Smg147109 123184b82766Smg147109 /* A little paranoia. */ 123284b82766Smg147109 ASSERT(len > 0); 123384b82766Smg147109 123484b82766Smg147109 /* 123584b82766Smg147109 * After all that, make sure we're asking for things 123684b82766Smg147109 * in units that bdev_strategy() will understand 123784b82766Smg147109 * (see bug 4202551). 123884b82766Smg147109 */ 123984b82766Smg147109 len = roundup(len, DEV_BSIZE); 124084b82766Smg147109 calcdone = 1; 124184b82766Smg147109 } 124284b82766Smg147109 12437c478bd9Sstevel@tonic-gate pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp, 12447c478bd9Sstevel@tonic-gate &io_len_tmp, off, len, 0); 12457c478bd9Sstevel@tonic-gate 124684b82766Smg147109 if (pp == NULL) { 124784b82766Smg147109 /* 124884b82766Smg147109 * Pressure on memory, roll back readahead 124984b82766Smg147109 */ 125084b82766Smg147109 hp->hs_num_contig = 0; 125184b82766Smg147109 hp->hs_ra_bytes = 0; 125284b82766Smg147109 hp->hs_prev_offset = 0; 12537c478bd9Sstevel@tonic-gate goto again; 125484b82766Smg147109 } 12557c478bd9Sstevel@tonic-gate 12567c478bd9Sstevel@tonic-gate io_off = (uint_t)io_off_tmp; 12577c478bd9Sstevel@tonic-gate io_len = (uint_t)io_len_tmp; 12587c478bd9Sstevel@tonic-gate 12597c478bd9Sstevel@tonic-gate /* check for truncation */ 12607c478bd9Sstevel@tonic-gate /* 12617c478bd9Sstevel@tonic-gate * xxx Clean up and return EIO instead? 
12627c478bd9Sstevel@tonic-gate * xxx Ought to go to u_offset_t for everything, but we
12637c478bd9Sstevel@tonic-gate * xxx call lots of things that want uint_t arguments.
12647c478bd9Sstevel@tonic-gate */
12657c478bd9Sstevel@tonic-gate ASSERT(io_off == io_off_tmp);
12667c478bd9Sstevel@tonic-gate
12677c478bd9Sstevel@tonic-gate /*
12687c478bd9Sstevel@tonic-gate * get enough buffers for worst-case scenario
12697c478bd9Sstevel@tonic-gate * (i.e., no coalescing possible).
12707c478bd9Sstevel@tonic-gate */
12717c478bd9Sstevel@tonic-gate bufcnt = (len + secsize - 1) / secsize;
12727c478bd9Sstevel@tonic-gate bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
12737c478bd9Sstevel@tonic-gate vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
127484b82766Smg147109
127584b82766Smg147109 /*
127684b82766Smg147109 * Allocate an array of semaphores if we are doing I/O
127784b82766Smg147109 * scheduling.
127884b82766Smg147109 */
127984b82766Smg147109 if (fsp->hqueue != NULL)
128084b82766Smg147109 fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
128184b82766Smg147109 KM_SLEEP);
12827c478bd9Sstevel@tonic-gate for (count = 0; count < bufcnt; count++) {
128384b82766Smg147109 bioinit(&bufs[count]);
12847c478bd9Sstevel@tonic-gate bufs[count].b_edev = devvp->v_rdev;
12857c478bd9Sstevel@tonic-gate bufs[count].b_dev = cmpdev(devvp->v_rdev);
12867c478bd9Sstevel@tonic-gate bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
12877c478bd9Sstevel@tonic-gate bufs[count].b_iodone = hsfs_iodone;
12887c478bd9Sstevel@tonic-gate bufs[count].b_vp = vp;
12897c478bd9Sstevel@tonic-gate bufs[count].b_file = vp;
12907c478bd9Sstevel@tonic-gate }
12917c478bd9Sstevel@tonic-gate
12928cd7c4fcSpeterte /*
12938cd7c4fcSpeterte * If our filesize is not an integer multiple of PAGESIZE,
12948cd7c4fcSpeterte * we zero that part of the last page that's between EOF and
12958cd7c4fcSpeterte * the PAGESIZE boundary.
12968cd7c4fcSpeterte */
12977c478bd9Sstevel@tonic-gate xlen = io_len & PAGEOFFSET;
12987c478bd9Sstevel@tonic-gate if (xlen != 0)
12997c478bd9Sstevel@tonic-gate pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
13007c478bd9Sstevel@tonic-gate
13017c478bd9Sstevel@tonic-gate va = NULL;
13027c478bd9Sstevel@tonic-gate lastp = NULL;
13037c478bd9Sstevel@tonic-gate searchp = pp;
13047c478bd9Sstevel@tonic-gate io_end = io_off + io_len;
13057c478bd9Sstevel@tonic-gate for (count = 0, byte_offset = io_off;
1306d10b6702Sfrankho byte_offset < io_end; count++) {
13077c478bd9Sstevel@tonic-gate ASSERT(count < bufcnt);
13087c478bd9Sstevel@tonic-gate
13097c478bd9Sstevel@tonic-gate /* Compute disk address for interleaving.
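 * As a purely illustrative example (the values are hypothetical, not
 * taken from any particular disc): with lbn_size 2048, intlf_sz 4 and
 * intlf_sk 2, chunk_data_bytes is 8192 and chunk_lbn_count is 6.
 * For byte_offset 10000: which_chunk_lbn = 10000 / 8192 = 1,
 * offset_lbn = 1 * 6 = 6, offset_bytes = 6 * 2048 = 12288 and
 * offset_extra = 10000 % 8192 = 1808, so the data is read from
 * bof + xarsiz + 12288 + 1808 on the device (converted to a DEV_BSIZE
 * block number via lbtodb() below).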
*/ 13107c478bd9Sstevel@tonic-gate 13117c478bd9Sstevel@tonic-gate /* considered without skips */ 13127c478bd9Sstevel@tonic-gate which_chunk_lbn = byte_offset / chunk_data_bytes; 13137c478bd9Sstevel@tonic-gate 13147c478bd9Sstevel@tonic-gate /* factor in skips */ 13157c478bd9Sstevel@tonic-gate offset_lbn = which_chunk_lbn * chunk_lbn_count; 13167c478bd9Sstevel@tonic-gate 13177c478bd9Sstevel@tonic-gate /* convert to physical byte offset for lbn */ 13187c478bd9Sstevel@tonic-gate offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp); 13197c478bd9Sstevel@tonic-gate 13207c478bd9Sstevel@tonic-gate /* don't forget offset into lbn */ 13217c478bd9Sstevel@tonic-gate offset_extra = byte_offset % chunk_data_bytes; 13227c478bd9Sstevel@tonic-gate 13237c478bd9Sstevel@tonic-gate /* get virtual block number for driver */ 1324d10b6702Sfrankho driver_block = 1325d10b6702Sfrankho lbtodb(bof + xarsiz + offset_bytes + offset_extra); 13267c478bd9Sstevel@tonic-gate 13277c478bd9Sstevel@tonic-gate if (lastp != searchp) { 13287c478bd9Sstevel@tonic-gate /* this branch taken first time through loop */ 1329d10b6702Sfrankho va = vas[count] = 1330d10b6702Sfrankho ppmapin(searchp, PROT_WRITE, (caddr_t)-1); 13317c478bd9Sstevel@tonic-gate /* ppmapin() guarantees not to return NULL */ 13327c478bd9Sstevel@tonic-gate } else { 13337c478bd9Sstevel@tonic-gate vas[count] = NULL; 13347c478bd9Sstevel@tonic-gate } 13357c478bd9Sstevel@tonic-gate 13367c478bd9Sstevel@tonic-gate bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE; 13377c478bd9Sstevel@tonic-gate bufs[count].b_offset = 13387c478bd9Sstevel@tonic-gate (offset_t)(byte_offset - io_off + off); 13397c478bd9Sstevel@tonic-gate 13407c478bd9Sstevel@tonic-gate /* 13417c478bd9Sstevel@tonic-gate * We specifically use the b_lblkno member here 13427c478bd9Sstevel@tonic-gate * as even in the 32 bit world driver_block can 13437c478bd9Sstevel@tonic-gate * get very large in line with the ISO9660 spec. 13447c478bd9Sstevel@tonic-gate */ 13457c478bd9Sstevel@tonic-gate 13467c478bd9Sstevel@tonic-gate bufs[count].b_lblkno = driver_block; 13477c478bd9Sstevel@tonic-gate 1348d10b6702Sfrankho remaining_bytes = 1349d10b6702Sfrankho ((which_chunk_lbn + 1) * chunk_data_bytes) 13507c478bd9Sstevel@tonic-gate - byte_offset; 13517c478bd9Sstevel@tonic-gate 13527c478bd9Sstevel@tonic-gate /* 13537c478bd9Sstevel@tonic-gate * remaining_bytes can't be zero, as we derived 13547c478bd9Sstevel@tonic-gate * which_chunk_lbn directly from byte_offset. 13557c478bd9Sstevel@tonic-gate */ 13567c478bd9Sstevel@tonic-gate if ((remaining_bytes + byte_offset) < (off + len)) { 13577c478bd9Sstevel@tonic-gate /* coalesce-read the rest of the chunk */ 13587c478bd9Sstevel@tonic-gate bufs[count].b_bcount = remaining_bytes; 13597c478bd9Sstevel@tonic-gate } else { 13607c478bd9Sstevel@tonic-gate /* get the final bits */ 13617c478bd9Sstevel@tonic-gate bufs[count].b_bcount = off + len - byte_offset; 13627c478bd9Sstevel@tonic-gate } 13637c478bd9Sstevel@tonic-gate 13647c478bd9Sstevel@tonic-gate /* 13657c478bd9Sstevel@tonic-gate * It would be nice to do multiple pages' 13667c478bd9Sstevel@tonic-gate * worth at once here when the opportunity 13677c478bd9Sstevel@tonic-gate * arises, as that has been shown to improve 13687c478bd9Sstevel@tonic-gate * our wall time. However, to do that 13697c478bd9Sstevel@tonic-gate * requires that we use the pageio subsystem, 13707c478bd9Sstevel@tonic-gate * which doesn't mix well with what we're 13717c478bd9Sstevel@tonic-gate * already using here. 
We can't use pageio 13727c478bd9Sstevel@tonic-gate * all the time, because that subsystem 13737c478bd9Sstevel@tonic-gate * assumes that a page is stored in N 13747c478bd9Sstevel@tonic-gate * contiguous blocks on the device. 13757c478bd9Sstevel@tonic-gate * Interleaving violates that assumption. 137684b82766Smg147109 * 137784b82766Smg147109 * Update: This is now not so big a problem 137884b82766Smg147109 * because of the I/O scheduler sitting below 137984b82766Smg147109 * that can re-order and coalesce I/O requests. 13807c478bd9Sstevel@tonic-gate */ 13817c478bd9Sstevel@tonic-gate 13827c478bd9Sstevel@tonic-gate remainder = PAGESIZE - (byte_offset % PAGESIZE); 13837c478bd9Sstevel@tonic-gate if (bufs[count].b_bcount > remainder) { 13847c478bd9Sstevel@tonic-gate bufs[count].b_bcount = remainder; 13857c478bd9Sstevel@tonic-gate } 13867c478bd9Sstevel@tonic-gate 13877c478bd9Sstevel@tonic-gate bufs[count].b_bufsize = bufs[count].b_bcount; 13889cbc422eSpeterte if (((offset_t)byte_offset + bufs[count].b_bcount) > 13899cbc422eSpeterte HS_MAXFILEOFF) { 13909cbc422eSpeterte break; 13919cbc422eSpeterte } 13927c478bd9Sstevel@tonic-gate byte_offset += bufs[count].b_bcount; 13937c478bd9Sstevel@tonic-gate 139484b82766Smg147109 if (fsp->hqueue == NULL) { 13957c478bd9Sstevel@tonic-gate (void) bdev_strategy(&bufs[count]); 13967c478bd9Sstevel@tonic-gate 139784b82766Smg147109 } else { 139884b82766Smg147109 /* 139984b82766Smg147109 * We are scheduling I/O so we need to enqueue 140084b82766Smg147109 * requests rather than calling bdev_strategy 140184b82766Smg147109 * here. A later invocation of the scheduling 140284b82766Smg147109 * function will take care of doing the actual 140384b82766Smg147109 * I/O as it selects requests from the queue as 140484b82766Smg147109 * per the scheduling logic. 
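 * Each queued request carries a pointer to its buf and to a
 * per-buf semaphore (fio_done[count] below); whichever thread
 * eventually issues the buf will sema_v() that semaphore from
 * its completion path, and that is what the wait loop further
 * down synchronizes on.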
140584b82766Smg147109 */ 140684b82766Smg147109 struct hio *hsio = kmem_cache_alloc(hio_cache, 140784b82766Smg147109 KM_SLEEP); 140884b82766Smg147109 140984b82766Smg147109 sema_init(&fio_done[count], 0, NULL, 141084b82766Smg147109 SEMA_DEFAULT, NULL); 141184b82766Smg147109 hsio->bp = &bufs[count]; 141284b82766Smg147109 hsio->sema = &fio_done[count]; 141384b82766Smg147109 hsio->io_lblkno = bufs[count].b_lblkno; 141484b82766Smg147109 hsio->nblocks = howmany(hsio->bp->b_bcount, 141584b82766Smg147109 DEV_BSIZE); 141684b82766Smg147109 141784b82766Smg147109 /* used for deadline */ 141884b82766Smg147109 hsio->io_timestamp = 141984b82766Smg147109 drv_hztousec(ddi_get_lbolt()); 142084b82766Smg147109 142184b82766Smg147109 /* for I/O coalescing */ 142284b82766Smg147109 hsio->contig_chain = NULL; 142384b82766Smg147109 hsched_enqueue_io(fsp, hsio, 0); 142484b82766Smg147109 } 142584b82766Smg147109 14267c478bd9Sstevel@tonic-gate lwp_stat_update(LWP_STAT_INBLK, 1); 14277c478bd9Sstevel@tonic-gate lastp = searchp; 14287c478bd9Sstevel@tonic-gate if ((remainder - bufs[count].b_bcount) < 1) { 14297c478bd9Sstevel@tonic-gate searchp = searchp->p_next; 14307c478bd9Sstevel@tonic-gate } 14317c478bd9Sstevel@tonic-gate } 14327c478bd9Sstevel@tonic-gate 14337c478bd9Sstevel@tonic-gate bufsused = count; 14347c478bd9Sstevel@tonic-gate /* Now wait for everything to come in */ 143584b82766Smg147109 if (fsp->hqueue == NULL) { 14367c478bd9Sstevel@tonic-gate for (count = 0; count < bufsused; count++) { 14377c478bd9Sstevel@tonic-gate if (err == 0) { 14387c478bd9Sstevel@tonic-gate err = biowait(&bufs[count]); 14397c478bd9Sstevel@tonic-gate } else 14407c478bd9Sstevel@tonic-gate (void) biowait(&bufs[count]); 14417c478bd9Sstevel@tonic-gate } 144284b82766Smg147109 } else { 144384b82766Smg147109 for (count = 0; count < bufsused; count++) { 144484b82766Smg147109 struct buf *wbuf; 144584b82766Smg147109 144684b82766Smg147109 /* 144784b82766Smg147109 * Invoke scheduling function till our buf 144884b82766Smg147109 * is processed. In doing this it might 144984b82766Smg147109 * process bufs enqueued by other threads 145084b82766Smg147109 * which is good. 145184b82766Smg147109 */ 145284b82766Smg147109 wbuf = &bufs[count]; 145384b82766Smg147109 DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf); 145484b82766Smg147109 while (sema_tryp(&fio_done[count]) == 0) { 145584b82766Smg147109 /* 145684b82766Smg147109 * hsched_invoke_strategy will return 1 145784b82766Smg147109 * if the I/O queue is empty. This means 145884b82766Smg147109 * that there is another thread who has 145984b82766Smg147109 * issued our buf and is waiting. So we 146084b82766Smg147109 * just block instead of spinning. 
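 * In effect each waiter alternates between driving the
 * scheduler on behalf of everyone and, once the queue has
 * been drained, sleeping until the thread that actually
 * issued its buf posts the semaphore.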
146184b82766Smg147109 */ 146284b82766Smg147109 if (hsched_invoke_strategy(fsp)) { 146384b82766Smg147109 sema_p(&fio_done[count]); 146484b82766Smg147109 break; 146584b82766Smg147109 } 146684b82766Smg147109 } 146784b82766Smg147109 sema_destroy(&fio_done[count]); 146884b82766Smg147109 DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf); 146984b82766Smg147109 147084b82766Smg147109 if (err == 0) { 147184b82766Smg147109 err = geterror(wbuf); 147284b82766Smg147109 } 147384b82766Smg147109 } 147484b82766Smg147109 kmem_free(fio_done, bufcnt * sizeof (ksema_t)); 147584b82766Smg147109 } 14767c478bd9Sstevel@tonic-gate 14777c478bd9Sstevel@tonic-gate /* Don't leak resources */ 14787c478bd9Sstevel@tonic-gate for (count = 0; count < bufcnt; count++) { 147984b82766Smg147109 biofini(&bufs[count]); 14807c478bd9Sstevel@tonic-gate if (count < bufsused && vas[count] != NULL) { 14817c478bd9Sstevel@tonic-gate ppmapout(vas[count]); 14827c478bd9Sstevel@tonic-gate } 14837c478bd9Sstevel@tonic-gate } 14847c478bd9Sstevel@tonic-gate 14857c478bd9Sstevel@tonic-gate kmem_free(vas, bufcnt * sizeof (caddr_t)); 14867c478bd9Sstevel@tonic-gate kmem_free(bufs, bufcnt * sizeof (struct buf)); 14877c478bd9Sstevel@tonic-gate } 14887c478bd9Sstevel@tonic-gate 14897c478bd9Sstevel@tonic-gate if (err) { 14907c478bd9Sstevel@tonic-gate pvn_read_done(pp, B_ERROR); 14917c478bd9Sstevel@tonic-gate return (err); 14927c478bd9Sstevel@tonic-gate } 14937c478bd9Sstevel@tonic-gate 14947c478bd9Sstevel@tonic-gate /* 14957c478bd9Sstevel@tonic-gate * Lock the requested page, and the one after it if possible. 14967c478bd9Sstevel@tonic-gate * Don't bother if our caller hasn't given us a place to stash 14977c478bd9Sstevel@tonic-gate * the page pointers, since otherwise we'd lock pages that would 14987c478bd9Sstevel@tonic-gate * never get unlocked. 14997c478bd9Sstevel@tonic-gate */ 15007c478bd9Sstevel@tonic-gate if (pagefound) { 15017c478bd9Sstevel@tonic-gate int index; 15027c478bd9Sstevel@tonic-gate ulong_t soff; 15037c478bd9Sstevel@tonic-gate 15047c478bd9Sstevel@tonic-gate /* 15057c478bd9Sstevel@tonic-gate * Make sure it's in memory before we say it's here. 15067c478bd9Sstevel@tonic-gate */ 15077c478bd9Sstevel@tonic-gate if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { 15087c478bd9Sstevel@tonic-gate hsfs_lostpage++; 15097c478bd9Sstevel@tonic-gate goto reread; 15107c478bd9Sstevel@tonic-gate } 15117c478bd9Sstevel@tonic-gate 15127c478bd9Sstevel@tonic-gate pl[0] = pp; 15137c478bd9Sstevel@tonic-gate index = 1; 151484b82766Smg147109 atomic_inc_64(&fsp->cache_read_pages); 15157c478bd9Sstevel@tonic-gate 15167c478bd9Sstevel@tonic-gate /* 15177c478bd9Sstevel@tonic-gate * Try to lock the next page, if it exists, without 15187c478bd9Sstevel@tonic-gate * blocking. 
15197c478bd9Sstevel@tonic-gate */ 15207c478bd9Sstevel@tonic-gate plsz -= PAGESIZE; 15217c478bd9Sstevel@tonic-gate /* LINTED (plsz is unsigned) */ 15227c478bd9Sstevel@tonic-gate for (soff = off + PAGESIZE; plsz > 0; 15237c478bd9Sstevel@tonic-gate soff += PAGESIZE, plsz -= PAGESIZE) { 15247c478bd9Sstevel@tonic-gate pp = page_lookup_nowait(vp, (u_offset_t)soff, 15257c478bd9Sstevel@tonic-gate SE_SHARED); 15267c478bd9Sstevel@tonic-gate if (pp == NULL) 15277c478bd9Sstevel@tonic-gate break; 15287c478bd9Sstevel@tonic-gate pl[index++] = pp; 15297c478bd9Sstevel@tonic-gate } 15307c478bd9Sstevel@tonic-gate pl[index] = NULL; 153184b82766Smg147109 153284b82766Smg147109 /* 153384b82766Smg147109 * Schedule a semi-asynchronous readahead if we are 153484b82766Smg147109 * accessing the last cached page for the current 153584b82766Smg147109 * file. 153684b82766Smg147109 * 153784b82766Smg147109 * Doing this here means that readaheads will be 153884b82766Smg147109 * issued only if cache-hits occur. This is an advantage 153984b82766Smg147109 * since cache-hits would mean that readahead is giving 154084b82766Smg147109 * the desired benefit. If cache-hits do not occur there 154184b82766Smg147109 * is no point in reading ahead of time - the system 154284b82766Smg147109 * is loaded anyway. 154384b82766Smg147109 */ 154484b82766Smg147109 if (fsp->hqueue != NULL && 154584b82766Smg147109 hp->hs_prev_offset - off == PAGESIZE && 154684b82766Smg147109 hp->hs_prev_offset < filsiz && 154784b82766Smg147109 hp->hs_ra_bytes > 0 && 154884b82766Smg147109 !page_exists(vp, hp->hs_prev_offset)) { 154984b82766Smg147109 (void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg, 155084b82766Smg147109 addr + PAGESIZE, hp, fsp, xarsiz, bof, 155184b82766Smg147109 chunk_lbn_count, chunk_data_bytes); 155284b82766Smg147109 } 155384b82766Smg147109 15547c478bd9Sstevel@tonic-gate return (0); 15557c478bd9Sstevel@tonic-gate } 15567c478bd9Sstevel@tonic-gate 15577c478bd9Sstevel@tonic-gate if (pp != NULL) { 15587c478bd9Sstevel@tonic-gate pvn_plist_init(pp, pl, plsz, off, io_len, rw); 15597c478bd9Sstevel@tonic-gate } 15607c478bd9Sstevel@tonic-gate 15617c478bd9Sstevel@tonic-gate return (err); 15627c478bd9Sstevel@tonic-gate } 15637c478bd9Sstevel@tonic-gate 1564da6c28aaSamw /*ARGSUSED*/ 15657c478bd9Sstevel@tonic-gate static int 15667c478bd9Sstevel@tonic-gate hsfs_getpage( 15677c478bd9Sstevel@tonic-gate struct vnode *vp, 15687c478bd9Sstevel@tonic-gate offset_t off, 15697c478bd9Sstevel@tonic-gate size_t len, 15707c478bd9Sstevel@tonic-gate uint_t *protp, 15717c478bd9Sstevel@tonic-gate struct page *pl[], 15727c478bd9Sstevel@tonic-gate size_t plsz, 15737c478bd9Sstevel@tonic-gate struct seg *seg, 15747c478bd9Sstevel@tonic-gate caddr_t addr, 15757c478bd9Sstevel@tonic-gate enum seg_rw rw, 1576da6c28aaSamw struct cred *cred, 1577da6c28aaSamw caller_context_t *ct) 15787c478bd9Sstevel@tonic-gate { 15797c478bd9Sstevel@tonic-gate uint_t filsiz; 158084b82766Smg147109 struct hsfs *fsp; 158184b82766Smg147109 struct hsnode *hp; 158284b82766Smg147109 158384b82766Smg147109 fsp = VFS_TO_HSFS(vp->v_vfsp); 158484b82766Smg147109 hp = VTOH(vp); 15857c478bd9Sstevel@tonic-gate 15867c478bd9Sstevel@tonic-gate /* does not support write */ 15877c478bd9Sstevel@tonic-gate if (rw == S_WRITE) { 1588de4ddf9cSKeith M Wesolowski return (EROFS); 15897c478bd9Sstevel@tonic-gate } 15907c478bd9Sstevel@tonic-gate 15917c478bd9Sstevel@tonic-gate if (vp->v_flag & VNOMAP) { 15927c478bd9Sstevel@tonic-gate return (ENOSYS); 15937c478bd9Sstevel@tonic-gate } 15947c478bd9Sstevel@tonic-gate 15959cbc422eSpeterte 
ASSERT(off <= HS_MAXFILEOFF); 15967c478bd9Sstevel@tonic-gate 15977c478bd9Sstevel@tonic-gate /* 15987c478bd9Sstevel@tonic-gate * Determine file data size for EOF check. 15997c478bd9Sstevel@tonic-gate */ 16007c478bd9Sstevel@tonic-gate filsiz = hp->hs_dirent.ext_size; 16017c478bd9Sstevel@tonic-gate if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap) 16027c478bd9Sstevel@tonic-gate return (EFAULT); /* beyond EOF */ 16037c478bd9Sstevel@tonic-gate 160484b82766Smg147109 /* 160584b82766Smg147109 * Async Read-ahead computation. 160684b82766Smg147109 * This attempts to detect sequential access pattern and 160784b82766Smg147109 * enables reading extra pages ahead of time. 160884b82766Smg147109 */ 160984b82766Smg147109 if (fsp->hqueue != NULL) { 161084b82766Smg147109 /* 161184b82766Smg147109 * This check for sequential access also takes into 161284b82766Smg147109 * account segmap weirdness when reading in chunks 161384b82766Smg147109 * less than the segmap size of 8K. 161484b82766Smg147109 */ 161584b82766Smg147109 if (hp->hs_prev_offset == off || (off < 161684b82766Smg147109 hp->hs_prev_offset && off + MAX(len, PAGESIZE) 161784b82766Smg147109 >= hp->hs_prev_offset)) { 161884b82766Smg147109 if (hp->hs_num_contig < 161984b82766Smg147109 (seq_contig_requests - 1)) { 162084b82766Smg147109 hp->hs_num_contig++; 162184b82766Smg147109 162284b82766Smg147109 } else { 162384b82766Smg147109 /* 162484b82766Smg147109 * We increase readahead quantum till 162584b82766Smg147109 * a predefined max. max_readahead_bytes 162684b82766Smg147109 * is a multiple of PAGESIZE. 162784b82766Smg147109 */ 162884b82766Smg147109 if (hp->hs_ra_bytes < 162984b82766Smg147109 fsp->hqueue->max_ra_bytes) { 163084b82766Smg147109 hp->hs_ra_bytes += PAGESIZE; 163184b82766Smg147109 } 163284b82766Smg147109 } 163384b82766Smg147109 } else { 163484b82766Smg147109 /* 163584b82766Smg147109 * Not contiguous so reduce read ahead counters. 163684b82766Smg147109 */ 163784b82766Smg147109 if (hp->hs_ra_bytes > 0) 163884b82766Smg147109 hp->hs_ra_bytes -= PAGESIZE; 163984b82766Smg147109 164084b82766Smg147109 if (hp->hs_ra_bytes <= 0) { 164184b82766Smg147109 hp->hs_ra_bytes = 0; 164284b82766Smg147109 if (hp->hs_num_contig > 0) 164384b82766Smg147109 hp->hs_num_contig--; 164484b82766Smg147109 } 164584b82766Smg147109 } 164684b82766Smg147109 /* 164784b82766Smg147109 * Length must be rounded up to page boundary. 164884b82766Smg147109 * since we read in units of pages. 164984b82766Smg147109 */ 165084b82766Smg147109 hp->hs_prev_offset = off + roundup(len, PAGESIZE); 165184b82766Smg147109 DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp); 165284b82766Smg147109 } 16537c478bd9Sstevel@tonic-gate if (protp != NULL) 16547c478bd9Sstevel@tonic-gate *protp = PROT_ALL; 16557c478bd9Sstevel@tonic-gate 1656*06e6833aSJosef 'Jeff' Sipek return (pvn_getpages(hsfs_getapage, vp, off, len, protp, pl, plsz, 1657*06e6833aSJosef 'Jeff' Sipek seg, addr, rw, cred)); 16587c478bd9Sstevel@tonic-gate } 16597c478bd9Sstevel@tonic-gate 16607c478bd9Sstevel@tonic-gate 16617c478bd9Sstevel@tonic-gate 16627c478bd9Sstevel@tonic-gate /* 16637c478bd9Sstevel@tonic-gate * This function should never be called. We need to have it to pass 16647c478bd9Sstevel@tonic-gate * it as an argument to other functions. 
16657c478bd9Sstevel@tonic-gate */ 16667c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 16677c478bd9Sstevel@tonic-gate int 16687c478bd9Sstevel@tonic-gate hsfs_putapage( 16697c478bd9Sstevel@tonic-gate vnode_t *vp, 16707c478bd9Sstevel@tonic-gate page_t *pp, 16717c478bd9Sstevel@tonic-gate u_offset_t *offp, 16727c478bd9Sstevel@tonic-gate size_t *lenp, 16737c478bd9Sstevel@tonic-gate int flags, 16747c478bd9Sstevel@tonic-gate cred_t *cr) 16757c478bd9Sstevel@tonic-gate { 16767c478bd9Sstevel@tonic-gate /* should never happen - just destroy it */ 16777c478bd9Sstevel@tonic-gate cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page"); 16787c478bd9Sstevel@tonic-gate pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags); 16797c478bd9Sstevel@tonic-gate return (0); 16807c478bd9Sstevel@tonic-gate } 16817c478bd9Sstevel@tonic-gate 16827c478bd9Sstevel@tonic-gate 16837c478bd9Sstevel@tonic-gate /* 16847c478bd9Sstevel@tonic-gate * The only flags we support are B_INVAL, B_FREE and B_DONTNEED. 16857c478bd9Sstevel@tonic-gate * B_INVAL is set by: 16867c478bd9Sstevel@tonic-gate * 16877c478bd9Sstevel@tonic-gate * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag. 16887c478bd9Sstevel@tonic-gate * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice 16897c478bd9Sstevel@tonic-gate * which translates to an MC_SYNC with the MS_INVALIDATE flag. 16907c478bd9Sstevel@tonic-gate * 16917c478bd9Sstevel@tonic-gate * The B_FREE (as well as the B_DONTNEED) flag is set when the 16927c478bd9Sstevel@tonic-gate * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked 16937c478bd9Sstevel@tonic-gate * from SEGVN to release pages behind a pagefault. 16947c478bd9Sstevel@tonic-gate */ 16957c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 16967c478bd9Sstevel@tonic-gate static int 16977c478bd9Sstevel@tonic-gate hsfs_putpage( 16987c478bd9Sstevel@tonic-gate struct vnode *vp, 16997c478bd9Sstevel@tonic-gate offset_t off, 17007c478bd9Sstevel@tonic-gate size_t len, 17017c478bd9Sstevel@tonic-gate int flags, 1702da6c28aaSamw struct cred *cr, 1703da6c28aaSamw caller_context_t *ct) 17047c478bd9Sstevel@tonic-gate { 17057c478bd9Sstevel@tonic-gate int error = 0; 17067c478bd9Sstevel@tonic-gate 17077c478bd9Sstevel@tonic-gate if (vp->v_count == 0) { 17087c478bd9Sstevel@tonic-gate panic("hsfs_putpage: bad v_count"); 17097c478bd9Sstevel@tonic-gate /*NOTREACHED*/ 17107c478bd9Sstevel@tonic-gate } 17117c478bd9Sstevel@tonic-gate 17127c478bd9Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 17137c478bd9Sstevel@tonic-gate return (ENOSYS); 17147c478bd9Sstevel@tonic-gate 17159cbc422eSpeterte ASSERT(off <= HS_MAXFILEOFF); 17167c478bd9Sstevel@tonic-gate 17177c478bd9Sstevel@tonic-gate if (!vn_has_cached_data(vp)) /* no pages mapped */ 17187c478bd9Sstevel@tonic-gate return (0); 17197c478bd9Sstevel@tonic-gate 1720d10b6702Sfrankho if (len == 0) { /* from 'off' to EOF */ 1721d10b6702Sfrankho error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr); 1722d10b6702Sfrankho } else { 17237c478bd9Sstevel@tonic-gate offset_t end_off = off + len; 17247c478bd9Sstevel@tonic-gate offset_t file_size = VTOH(vp)->hs_dirent.ext_size; 17257c478bd9Sstevel@tonic-gate offset_t io_off; 17267c478bd9Sstevel@tonic-gate 17277c478bd9Sstevel@tonic-gate file_size = (file_size + PAGESIZE - 1) & PAGEMASK; 17287c478bd9Sstevel@tonic-gate if (end_off > file_size) 17297c478bd9Sstevel@tonic-gate end_off = file_size; 17307c478bd9Sstevel@tonic-gate 17317c478bd9Sstevel@tonic-gate for (io_off = off; io_off < end_off; io_off += PAGESIZE) { 17327c478bd9Sstevel@tonic-gate page_t *pp; 
17337c478bd9Sstevel@tonic-gate
17347c478bd9Sstevel@tonic-gate /*
17357c478bd9Sstevel@tonic-gate * We insist on getting the page only if we are
17367c478bd9Sstevel@tonic-gate * about to invalidate, free or write it and
17377c478bd9Sstevel@tonic-gate * the B_ASYNC flag is not set.
17387c478bd9Sstevel@tonic-gate */
17397c478bd9Sstevel@tonic-gate if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
17407c478bd9Sstevel@tonic-gate pp = page_lookup(vp, io_off,
17417c478bd9Sstevel@tonic-gate (flags & (B_INVAL | B_FREE)) ?
17427c478bd9Sstevel@tonic-gate SE_EXCL : SE_SHARED);
17437c478bd9Sstevel@tonic-gate } else {
17447c478bd9Sstevel@tonic-gate pp = page_lookup_nowait(vp, io_off,
17457c478bd9Sstevel@tonic-gate (flags & B_FREE) ? SE_EXCL : SE_SHARED);
17467c478bd9Sstevel@tonic-gate }
17477c478bd9Sstevel@tonic-gate
17487c478bd9Sstevel@tonic-gate if (pp == NULL)
17497c478bd9Sstevel@tonic-gate continue;
175084b82766Smg147109
17517c478bd9Sstevel@tonic-gate /*
17527c478bd9Sstevel@tonic-gate * Normally pvn_getdirty() should return 0, which
17537c478bd9Sstevel@tonic-gate * implies that it has done the job for us.
17547c478bd9Sstevel@tonic-gate * The shouldn't-happen scenario is when it returns 1.
17557c478bd9Sstevel@tonic-gate * This means that the page has been modified and
17567c478bd9Sstevel@tonic-gate * needs to be put back.
17577c478bd9Sstevel@tonic-gate * Since we can't write on a CD, we fake a failed
17587c478bd9Sstevel@tonic-gate * I/O and force pvn_write_done() to destroy the page.
17597c478bd9Sstevel@tonic-gate */
17607c478bd9Sstevel@tonic-gate if (pvn_getdirty(pp, flags) == 1) {
17617c478bd9Sstevel@tonic-gate cmn_err(CE_NOTE,
17627c478bd9Sstevel@tonic-gate "hsfs_putpage: dirty HSFS page");
17637c478bd9Sstevel@tonic-gate pvn_write_done(pp, flags |
17647c478bd9Sstevel@tonic-gate B_ERROR | B_WRITE | B_INVAL | B_FORCE);
17657c478bd9Sstevel@tonic-gate }
17667c478bd9Sstevel@tonic-gate }
17677c478bd9Sstevel@tonic-gate }
17687c478bd9Sstevel@tonic-gate return (error);
17697c478bd9Sstevel@tonic-gate }
17707c478bd9Sstevel@tonic-gate
17717c478bd9Sstevel@tonic-gate
17727c478bd9Sstevel@tonic-gate /*ARGSUSED*/
17737c478bd9Sstevel@tonic-gate static int
17747c478bd9Sstevel@tonic-gate hsfs_map(
17757c478bd9Sstevel@tonic-gate struct vnode *vp,
17767c478bd9Sstevel@tonic-gate offset_t off,
17777c478bd9Sstevel@tonic-gate struct as *as,
17787c478bd9Sstevel@tonic-gate caddr_t *addrp,
17797c478bd9Sstevel@tonic-gate size_t len,
17807c478bd9Sstevel@tonic-gate uchar_t prot,
17817c478bd9Sstevel@tonic-gate uchar_t maxprot,
17827c478bd9Sstevel@tonic-gate uint_t flags,
1783da6c28aaSamw struct cred *cred,
1784da6c28aaSamw caller_context_t *ct)
17857c478bd9Sstevel@tonic-gate {
17867c478bd9Sstevel@tonic-gate struct segvn_crargs vn_a;
17877c478bd9Sstevel@tonic-gate int error;
17887c478bd9Sstevel@tonic-gate
17897c478bd9Sstevel@tonic-gate /* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
17907c478bd9Sstevel@tonic-gate
17917c478bd9Sstevel@tonic-gate if (vp->v_flag & VNOMAP)
17927c478bd9Sstevel@tonic-gate return (ENOSYS);
17937c478bd9Sstevel@tonic-gate
1794277b8dcbSHans Rosenfeld if ((prot & PROT_WRITE) && (flags & MAP_SHARED))
1795de4ddf9cSKeith M Wesolowski return (ENOSYS);
1796de4ddf9cSKeith M Wesolowski
17979cbc422eSpeterte if (off > HS_MAXFILEOFF || off < 0 ||
17989cbc422eSpeterte (off + len) < 0 || (off + len) > HS_MAXFILEOFF)
1799cfa55013Speterte return (ENXIO);
18007c478bd9Sstevel@tonic-gate
18017c478bd9Sstevel@tonic-gate if (vp->v_type != VREG) {
18027c478bd9Sstevel@tonic-gate return (ENODEV);
18037c478bd9Sstevel@tonic-gate }
18047c478bd9Sstevel@tonic-gate 18057c478bd9Sstevel@tonic-gate /* 18067c478bd9Sstevel@tonic-gate * If file is being locked, disallow mapping. 18077c478bd9Sstevel@tonic-gate */ 18087c478bd9Sstevel@tonic-gate if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode)) 18097c478bd9Sstevel@tonic-gate return (EAGAIN); 18107c478bd9Sstevel@tonic-gate 18117c478bd9Sstevel@tonic-gate as_rangelock(as); 181260946fe0Smec error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 181360946fe0Smec if (error != 0) { 18147c478bd9Sstevel@tonic-gate as_rangeunlock(as); 181560946fe0Smec return (error); 18167c478bd9Sstevel@tonic-gate } 18177c478bd9Sstevel@tonic-gate 18187c478bd9Sstevel@tonic-gate vn_a.vp = vp; 18197c478bd9Sstevel@tonic-gate vn_a.offset = off; 18207c478bd9Sstevel@tonic-gate vn_a.type = flags & MAP_TYPE; 18217c478bd9Sstevel@tonic-gate vn_a.prot = prot; 18227c478bd9Sstevel@tonic-gate vn_a.maxprot = maxprot; 18237c478bd9Sstevel@tonic-gate vn_a.flags = flags & ~MAP_TYPE; 18247c478bd9Sstevel@tonic-gate vn_a.cred = cred; 18257c478bd9Sstevel@tonic-gate vn_a.amp = NULL; 18267c478bd9Sstevel@tonic-gate vn_a.szc = 0; 18277c478bd9Sstevel@tonic-gate vn_a.lgrp_mem_policy_flags = 0; 18287c478bd9Sstevel@tonic-gate 18297c478bd9Sstevel@tonic-gate error = as_map(as, *addrp, len, segvn_create, &vn_a); 18307c478bd9Sstevel@tonic-gate as_rangeunlock(as); 18317c478bd9Sstevel@tonic-gate return (error); 18327c478bd9Sstevel@tonic-gate } 18337c478bd9Sstevel@tonic-gate 18347c478bd9Sstevel@tonic-gate /* ARGSUSED */ 18357c478bd9Sstevel@tonic-gate static int 18367c478bd9Sstevel@tonic-gate hsfs_addmap( 18377c478bd9Sstevel@tonic-gate struct vnode *vp, 18387c478bd9Sstevel@tonic-gate offset_t off, 18397c478bd9Sstevel@tonic-gate struct as *as, 18407c478bd9Sstevel@tonic-gate caddr_t addr, 18417c478bd9Sstevel@tonic-gate size_t len, 18427c478bd9Sstevel@tonic-gate uchar_t prot, 18437c478bd9Sstevel@tonic-gate uchar_t maxprot, 18447c478bd9Sstevel@tonic-gate uint_t flags, 1845da6c28aaSamw struct cred *cr, 1846da6c28aaSamw caller_context_t *ct) 18477c478bd9Sstevel@tonic-gate { 18487c478bd9Sstevel@tonic-gate struct hsnode *hp; 18497c478bd9Sstevel@tonic-gate 18507c478bd9Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 18517c478bd9Sstevel@tonic-gate return (ENOSYS); 18527c478bd9Sstevel@tonic-gate 18537c478bd9Sstevel@tonic-gate hp = VTOH(vp); 18547c478bd9Sstevel@tonic-gate mutex_enter(&hp->hs_contents_lock); 18557c478bd9Sstevel@tonic-gate hp->hs_mapcnt += btopr(len); 18567c478bd9Sstevel@tonic-gate mutex_exit(&hp->hs_contents_lock); 18577c478bd9Sstevel@tonic-gate return (0); 18587c478bd9Sstevel@tonic-gate } 18597c478bd9Sstevel@tonic-gate 18607c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 18617c478bd9Sstevel@tonic-gate static int 18627c478bd9Sstevel@tonic-gate hsfs_delmap( 18637c478bd9Sstevel@tonic-gate struct vnode *vp, 18647c478bd9Sstevel@tonic-gate offset_t off, 18657c478bd9Sstevel@tonic-gate struct as *as, 18667c478bd9Sstevel@tonic-gate caddr_t addr, 18677c478bd9Sstevel@tonic-gate size_t len, 18687c478bd9Sstevel@tonic-gate uint_t prot, 18697c478bd9Sstevel@tonic-gate uint_t maxprot, 18707c478bd9Sstevel@tonic-gate uint_t flags, 1871da6c28aaSamw struct cred *cr, 1872da6c28aaSamw caller_context_t *ct) 18737c478bd9Sstevel@tonic-gate { 18747c478bd9Sstevel@tonic-gate struct hsnode *hp; 18757c478bd9Sstevel@tonic-gate 18767c478bd9Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 18777c478bd9Sstevel@tonic-gate return (ENOSYS); 18787c478bd9Sstevel@tonic-gate 18797c478bd9Sstevel@tonic-gate hp = VTOH(vp); 18807c478bd9Sstevel@tonic-gate 
mutex_enter(&hp->hs_contents_lock); 18817c478bd9Sstevel@tonic-gate hp->hs_mapcnt -= btopr(len); /* Count released mappings */ 18827c478bd9Sstevel@tonic-gate ASSERT(hp->hs_mapcnt >= 0); 18837c478bd9Sstevel@tonic-gate mutex_exit(&hp->hs_contents_lock); 18847c478bd9Sstevel@tonic-gate return (0); 18857c478bd9Sstevel@tonic-gate } 18867c478bd9Sstevel@tonic-gate 18877c478bd9Sstevel@tonic-gate /* ARGSUSED */ 18887c478bd9Sstevel@tonic-gate static int 1889da6c28aaSamw hsfs_seek( 1890da6c28aaSamw struct vnode *vp, 1891da6c28aaSamw offset_t ooff, 1892da6c28aaSamw offset_t *noffp, 1893da6c28aaSamw caller_context_t *ct) 18947c478bd9Sstevel@tonic-gate { 18957c478bd9Sstevel@tonic-gate return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 18967c478bd9Sstevel@tonic-gate } 18977c478bd9Sstevel@tonic-gate 18987c478bd9Sstevel@tonic-gate /* ARGSUSED */ 18997c478bd9Sstevel@tonic-gate static int 19007c478bd9Sstevel@tonic-gate hsfs_frlock( 19017c478bd9Sstevel@tonic-gate struct vnode *vp, 19027c478bd9Sstevel@tonic-gate int cmd, 19037c478bd9Sstevel@tonic-gate struct flock64 *bfp, 19047c478bd9Sstevel@tonic-gate int flag, 19057c478bd9Sstevel@tonic-gate offset_t offset, 19067c478bd9Sstevel@tonic-gate struct flk_callback *flk_cbp, 1907da6c28aaSamw cred_t *cr, 1908da6c28aaSamw caller_context_t *ct) 19097c478bd9Sstevel@tonic-gate { 19107c478bd9Sstevel@tonic-gate struct hsnode *hp = VTOH(vp); 19117c478bd9Sstevel@tonic-gate 19127c478bd9Sstevel@tonic-gate /* 19137c478bd9Sstevel@tonic-gate * If the file is being mapped, disallow fs_frlock. 19147c478bd9Sstevel@tonic-gate * We are not holding the hs_contents_lock while checking 19157c478bd9Sstevel@tonic-gate * hs_mapcnt because the current locking strategy drops all 19167c478bd9Sstevel@tonic-gate * locks before calling fs_frlock. 19177c478bd9Sstevel@tonic-gate * So, hs_mapcnt could change before we enter fs_frlock making 19187c478bd9Sstevel@tonic-gate * it meaningless to have held hs_contents_lock in the first place. 
19197c478bd9Sstevel@tonic-gate */ 19207c478bd9Sstevel@tonic-gate if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode)) 19217c478bd9Sstevel@tonic-gate return (EAGAIN); 19227c478bd9Sstevel@tonic-gate 1923da6c28aaSamw return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 19247c478bd9Sstevel@tonic-gate } 19257c478bd9Sstevel@tonic-gate 192684b82766Smg147109 static int 192784b82766Smg147109 hsched_deadline_compare(const void *x1, const void *x2) 192884b82766Smg147109 { 192984b82766Smg147109 const struct hio *h1 = x1; 193084b82766Smg147109 const struct hio *h2 = x2; 193184b82766Smg147109 193284b82766Smg147109 if (h1->io_timestamp < h2->io_timestamp) 193384b82766Smg147109 return (-1); 193484b82766Smg147109 if (h1->io_timestamp > h2->io_timestamp) 193584b82766Smg147109 return (1); 193684b82766Smg147109 193784b82766Smg147109 if (h1->io_lblkno < h2->io_lblkno) 193884b82766Smg147109 return (-1); 193984b82766Smg147109 if (h1->io_lblkno > h2->io_lblkno) 194084b82766Smg147109 return (1); 194184b82766Smg147109 194284b82766Smg147109 if (h1 < h2) 194384b82766Smg147109 return (-1); 194484b82766Smg147109 if (h1 > h2) 194584b82766Smg147109 return (1); 194684b82766Smg147109 194784b82766Smg147109 return (0); 194884b82766Smg147109 } 194984b82766Smg147109 195084b82766Smg147109 static int 195184b82766Smg147109 hsched_offset_compare(const void *x1, const void *x2) 195284b82766Smg147109 { 195384b82766Smg147109 const struct hio *h1 = x1; 195484b82766Smg147109 const struct hio *h2 = x2; 195584b82766Smg147109 195684b82766Smg147109 if (h1->io_lblkno < h2->io_lblkno) 195784b82766Smg147109 return (-1); 195884b82766Smg147109 if (h1->io_lblkno > h2->io_lblkno) 195984b82766Smg147109 return (1); 196084b82766Smg147109 196184b82766Smg147109 if (h1 < h2) 196284b82766Smg147109 return (-1); 196384b82766Smg147109 if (h1 > h2) 196484b82766Smg147109 return (1); 196584b82766Smg147109 196684b82766Smg147109 return (0); 196784b82766Smg147109 } 196884b82766Smg147109 196984b82766Smg147109 void 197084b82766Smg147109 hsched_init_caches(void) 197184b82766Smg147109 { 197284b82766Smg147109 hio_cache = kmem_cache_create("hsfs_hio_cache", 197384b82766Smg147109 sizeof (struct hio), 0, NULL, 197484b82766Smg147109 NULL, NULL, NULL, NULL, 0); 197584b82766Smg147109 197684b82766Smg147109 hio_info_cache = kmem_cache_create("hsfs_hio_info_cache", 197784b82766Smg147109 sizeof (struct hio_info), 0, NULL, 197884b82766Smg147109 NULL, NULL, NULL, NULL, 0); 197984b82766Smg147109 } 198084b82766Smg147109 198184b82766Smg147109 void 198284b82766Smg147109 hsched_fini_caches(void) 198384b82766Smg147109 { 198484b82766Smg147109 kmem_cache_destroy(hio_cache); 198584b82766Smg147109 kmem_cache_destroy(hio_info_cache); 198684b82766Smg147109 } 198784b82766Smg147109 198884b82766Smg147109 /* 198984b82766Smg147109 * Initialize I/O scheduling structures. 
This is called via hsfs_mount
199084b82766Smg147109 */
199184b82766Smg147109 void
199284b82766Smg147109 hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
199384b82766Smg147109 {
199484b82766Smg147109 struct hsfs_queue *hqueue = fsp->hqueue;
199584b82766Smg147109 struct vnode *vp = fsp->hsfs_devvp;
199684b82766Smg147109
199784b82766Smg147109 /* TaskQ name of the form: hsched_task_ + stringof(int) */
199884b82766Smg147109 char namebuf[23];
199984b82766Smg147109 int error, err;
200084b82766Smg147109 struct dk_cinfo info;
200184b82766Smg147109 ldi_handle_t lh;
200284b82766Smg147109 ldi_ident_t li;
200384b82766Smg147109
200484b82766Smg147109 /*
200584b82766Smg147109 * Default maxtransfer = 16k chunk
200684b82766Smg147109 */
200784b82766Smg147109 hqueue->dev_maxtransfer = 16384;
200884b82766Smg147109
200984b82766Smg147109 /*
201084b82766Smg147109 * Try to fetch the maximum device transfer size. This is used to
201184b82766Smg147109 * ensure that a coalesced block does not exceed the maxtransfer.
201284b82766Smg147109 */
201384b82766Smg147109 err = ldi_ident_from_mod(modlinkage, &li);
201484b82766Smg147109 if (err) {
201584b82766Smg147109 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
201684b82766Smg147109 cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
201784b82766Smg147109 err);
201884b82766Smg147109 goto set_ra;
201984b82766Smg147109 }
202084b82766Smg147109
202184b82766Smg147109 err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
202284b82766Smg147109 ldi_ident_release(li);
202384b82766Smg147109 if (err) {
202484b82766Smg147109 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
202584b82766Smg147109 cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
202684b82766Smg147109 goto set_ra;
202784b82766Smg147109 }
202884b82766Smg147109
202984b82766Smg147109 error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
203084b82766Smg147109 CRED(), &err);
203184b82766Smg147109 err = ldi_close(lh, FREAD, CRED());
203284b82766Smg147109 if (err) {
203384b82766Smg147109 cmn_err(CE_NOTE, "hsched_init: Querying device failed");
203484b82766Smg147109 cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
203584b82766Smg147109 }
203684b82766Smg147109
203784b82766Smg147109 if (error == 0) {
203884b82766Smg147109 hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
203984b82766Smg147109 }
204084b82766Smg147109
204184b82766Smg147109 set_ra:
204284b82766Smg147109 /*
204384b82766Smg147109 * Max size of data to read ahead for sequential access pattern.
204484b82766Smg147109 * Conservative, to avoid letting the underlying CD drive spin
204584b82766Smg147109 * down in case the application is reading slowly.
204684b82766Smg147109 * We read ahead up to a max of 8 pages.
204784b82766Smg147109 */
204884b82766Smg147109 hqueue->max_ra_bytes = PAGESIZE * 8;
204984b82766Smg147109
205084b82766Smg147109 mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
205184b82766Smg147109 mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
205284b82766Smg147109 avl_create(&(hqueue->read_tree), hsched_offset_compare,
205384b82766Smg147109 sizeof (struct hio), offsetof(struct hio, io_offset_node));
205484b82766Smg147109 avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
205584b82766Smg147109 sizeof (struct hio), offsetof(struct hio, io_deadline_node));
205684b82766Smg147109
205784b82766Smg147109 (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
205884b82766Smg147109 hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
205984b82766Smg147109 minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
206084b82766Smg147109
206184b82766Smg147109 hqueue->next = NULL;
206284b82766Smg147109 hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
206384b82766Smg147109 }
206484b82766Smg147109
206584b82766Smg147109 void
206684b82766Smg147109 hsched_fini(struct hsfs_queue *hqueue)
206784b82766Smg147109 {
206884b82766Smg147109 if (hqueue != NULL) {
2069f9ec9c5aSmg147109 /*
2070f9ec9c5aSmg147109 * Remove the sentinel if there was one.
2071f9ec9c5aSmg147109 */
2072f9ec9c5aSmg147109 if (hqueue->next != NULL) {
2073f9ec9c5aSmg147109 avl_remove(&hqueue->read_tree, hqueue->next);
2074f9ec9c5aSmg147109 kmem_cache_free(hio_cache, hqueue->next);
2075f9ec9c5aSmg147109 }
207684b82766Smg147109 avl_destroy(&(hqueue->read_tree));
207784b82766Smg147109 avl_destroy(&(hqueue->deadline_tree));
207884b82766Smg147109 mutex_destroy(&(hqueue->hsfs_queue_lock));
207984b82766Smg147109 mutex_destroy(&(hqueue->strategy_lock));
208084b82766Smg147109
208184b82766Smg147109 /*
208284b82766Smg147109 * If there are any existing readahead threads running,
208384b82766Smg147109 * taskq_destroy will wait for them to finish.
208484b82766Smg147109 */
208584b82766Smg147109 taskq_destroy(hqueue->ra_task);
208684b82766Smg147109 kmem_free(hqueue->nbuf, sizeof (struct buf));
208784b82766Smg147109 }
208884b82766Smg147109 }
208984b82766Smg147109
209084b82766Smg147109 /*
209184b82766Smg147109 * Determine if two I/O requests are adjacent to each other so
209284b82766Smg147109 * that they can be coalesced.
209384b82766Smg147109 */
209484b82766Smg147109 #define IS_ADJACENT(io, nio) \
209584b82766Smg147109 (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
209684b82766Smg147109 (io)->bp->b_edev == (nio)->bp->b_edev)
209784b82766Smg147109
209884b82766Smg147109 /*
209984b82766Smg147109 * This performs the actual I/O scheduling logic. We use the Circular
210084b82766Smg147109 * Look algorithm here. Sort the I/O requests in ascending order of
210184b82766Smg147109 * logical block number and process them starting with the lowest
210284b82766Smg147109 * numbered block and progressing towards higher block numbers in the
210384b82766Smg147109 * queue. Once there are no more higher numbered blocks, start again
210484b82766Smg147109 * with the lowest one. This is good for CD/DVD as you keep moving
210584b82766Smg147109 * the head in one direction along the outward spiral track and avoid
210684b82766Smg147109 * seeks as much as possible. The re-ordering also allows
210784b82766Smg147109 * us to coalesce adjacent requests into one larger request.
210884b82766Smg147109 * This is thus essentially a 1-way Elevator with front merging.
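 * As a hypothetical example: if the read_tree holds requests at
 * blocks 8, 40, 96 and 100 and the previous pass ended with the
 * request at block 32, the following passes service 40, 96 and 100
 * and only then wrap around to 8; if the request at block 96 ends
 * exactly at block 100 (see IS_ADJACENT), it and the request at 100
 * are merged and issued as one larger transfer.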
210984b82766Smg147109 *
211084b82766Smg147109 * In addition each read request here has a deadline and will be
211184b82766Smg147109 * processed out of turn if the deadline (500ms) expires.
211284b82766Smg147109 *
211384b82766Smg147109 * This function is necessarily serialized via hqueue->strategy_lock.
211484b82766Smg147109 * This function sits just below hsfs_getapage and processes all read
211584b82766Smg147109 * requests originating from that function.
211684b82766Smg147109 */
211784b82766Smg147109 int
211884b82766Smg147109 hsched_invoke_strategy(struct hsfs *fsp)
211984b82766Smg147109 {
212084b82766Smg147109 struct hsfs_queue *hqueue;
212184b82766Smg147109 struct buf *nbuf;
212284b82766Smg147109 struct hio *fio, *nio, *tio, *prev, *last;
212384b82766Smg147109 size_t bsize, soffset, offset, data;
212484b82766Smg147109 int bioret, bufcount;
212584b82766Smg147109 struct vnode *fvp;
212684b82766Smg147109 ksema_t *io_done;
212784b82766Smg147109 caddr_t iodata;
212884b82766Smg147109
212984b82766Smg147109 hqueue = fsp->hqueue;
213084b82766Smg147109 mutex_enter(&hqueue->strategy_lock);
213184b82766Smg147109 mutex_enter(&hqueue->hsfs_queue_lock);
213284b82766Smg147109
213384b82766Smg147109 /*
213484b82766Smg147109 * Check for Deadline expiration first
213584b82766Smg147109 */
213684b82766Smg147109 fio = avl_first(&hqueue->deadline_tree);
213784b82766Smg147109
213884b82766Smg147109 /*
213984b82766Smg147109 * Paranoid check for empty I/O queue. Both deadline
214084b82766Smg147109 * and read trees contain the same data sorted in different
214184b82766Smg147109 * ways. So empty deadline tree = empty read tree.
214284b82766Smg147109 */
214384b82766Smg147109 if (fio == NULL) {
214484b82766Smg147109 /*
214584b82766Smg147109 * Remove the sentinel if there was one.
214684b82766Smg147109 */
214784b82766Smg147109 if (hqueue->next != NULL) {
214884b82766Smg147109 avl_remove(&hqueue->read_tree, hqueue->next);
214984b82766Smg147109 kmem_cache_free(hio_cache, hqueue->next);
215084b82766Smg147109 hqueue->next = NULL;
215184b82766Smg147109 }
215284b82766Smg147109 mutex_exit(&hqueue->hsfs_queue_lock);
215384b82766Smg147109 mutex_exit(&hqueue->strategy_lock);
215484b82766Smg147109 return (1);
215584b82766Smg147109 }
215684b82766Smg147109
215784b82766Smg147109 if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
215884b82766Smg147109 < HSFS_READ_DEADLINE) {
215984b82766Smg147109 /*
216084b82766Smg147109 * Apply standard scheduling logic. This uses the
216184b82766Smg147109 * C-LOOK approach. Process I/O requests in ascending
216284b82766Smg147109 * order of logical block address till no subsequent
216384b82766Smg147109 * higher numbered block request remains. Then start
216484b82766Smg147109 * again from the lowest numbered block in the queue.
216584b82766Smg147109 *
216684b82766Smg147109 * We do this cheaply here by means of a sentinel.
216784b82766Smg147109 * The last processed I/O structure from the previous
216884b82766Smg147109 * invocation of this func is left dangling in the
216984b82766Smg147109 * read_tree so that we can easily scan to the next
217084b82766Smg147109 * higher numbered request and remove the sentinel.
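 * For instance, if the previous pass stopped after the request at
 * block 200, that hio is still sitting in the read_tree as
 * hqueue->next; AVL_NEXT() from it below picks up the first request
 * above block 200, after which the stale sentinel is freed.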
217184b82766Smg147109 */
217284b82766Smg147109 fio = NULL;
217384b82766Smg147109 if (hqueue->next != NULL) {
217484b82766Smg147109 fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
217584b82766Smg147109 avl_remove(&hqueue->read_tree, hqueue->next);
217684b82766Smg147109 kmem_cache_free(hio_cache, hqueue->next);
217784b82766Smg147109 hqueue->next = NULL;
217884b82766Smg147109 }
217984b82766Smg147109 if (fio == NULL) {
218084b82766Smg147109 fio = avl_first(&hqueue->read_tree);
218184b82766Smg147109 }
218284b82766Smg147109 } else if (hqueue->next != NULL) {
218384b82766Smg147109 DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
218484b82766Smg147109
218584b82766Smg147109 avl_remove(&hqueue->read_tree, hqueue->next);
218684b82766Smg147109 kmem_cache_free(hio_cache, hqueue->next);
218784b82766Smg147109 hqueue->next = NULL;
218884b82766Smg147109 }
218984b82766Smg147109
219084b82766Smg147109 /*
219184b82766Smg147109 * In addition we try to coalesce contiguous
219284b82766Smg147109 * requests into one bigger request.
219384b82766Smg147109 */
219484b82766Smg147109 bufcount = 1;
219584b82766Smg147109 bsize = ldbtob(fio->nblocks);
219684b82766Smg147109 fvp = fio->bp->b_file;
219784b82766Smg147109 nio = AVL_NEXT(&hqueue->read_tree, fio);
219884b82766Smg147109 tio = fio;
219984b82766Smg147109 while (nio != NULL && IS_ADJACENT(tio, nio) &&
220084b82766Smg147109 bsize < hqueue->dev_maxtransfer) {
220184b82766Smg147109 avl_remove(&hqueue->deadline_tree, tio);
220284b82766Smg147109 avl_remove(&hqueue->read_tree, tio);
220384b82766Smg147109 tio->contig_chain = nio;
220484b82766Smg147109 bsize += ldbtob(nio->nblocks);
220584b82766Smg147109 prev = tio;
220684b82766Smg147109 tio = nio;
220784b82766Smg147109
220884b82766Smg147109 /*
220984b82766Smg147109 * This check is required to detect the case where
221084b82766Smg147109 * we are merging adjacent buffers belonging to
221184b82766Smg147109 * different files. fvp is used to set the b_file
221284b82766Smg147109 * parameter in the coalesced buf. b_file is used
221384b82766Smg147109 * by DTrace so we do not want DTrace to attribute
221484b82766Smg147109 * requests for two different files to any one file.
221584b82766Smg147109 */
221684b82766Smg147109 if (fvp && tio->bp->b_file != fvp) {
221784b82766Smg147109 fvp = NULL;
221884b82766Smg147109 }
221984b82766Smg147109
222084b82766Smg147109 nio = AVL_NEXT(&hqueue->read_tree, nio);
222184b82766Smg147109 bufcount++;
222284b82766Smg147109 }
222384b82766Smg147109
222484b82766Smg147109 /*
222584b82766Smg147109 * tio is not removed from the read_tree as it serves as a sentinel
222684b82766Smg147109 * to cheaply allow us to scan to the next higher numbered I/O
222784b82766Smg147109 * request.
222884b82766Smg147109 */
222984b82766Smg147109 hqueue->next = tio;
223084b82766Smg147109 avl_remove(&hqueue->deadline_tree, tio);
223184b82766Smg147109 mutex_exit(&hqueue->hsfs_queue_lock);
223284b82766Smg147109 DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
223384b82766Smg147109 size_t, bsize);
223484b82766Smg147109
223584b82766Smg147109 /*
223684b82766Smg147109 * The benefit of coalescing occurs if the savings in I/O outweigh
223784b82766Smg147109 * the cost of doing the additional work below.
223884b82766Smg147109 * It was observed that coalescing 2 buffers results in diminishing
223984b82766Smg147109 * returns, so we do coalescing if we have >2 adjacent bufs.
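 * For example (sizes are illustrative only): four adjacent 4k reads
 * at device blocks 100, 108, 116 and 124 give bufcount 4 and bsize
 * 16k, so instead of four separate bdev_strategy() calls a single
 * 16k buf backed by a kmem_alloc'ed buffer is issued (the merge loop
 * above stops once bsize reaches dev_maxtransfer) and the data is
 * copied back out to the original bufs afterwards.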
224084b82766Smg147109 */
224184b82766Smg147109 if (bufcount > hsched_coalesce_min) {
224284b82766Smg147109 /*
224384b82766Smg147109 * We have coalesced blocks. First allocate mem and buf for
224484b82766Smg147109 * the entire coalesced chunk.
224584b82766Smg147109 * Since we are guaranteed single-threaded here we pre-allocate
224684b82766Smg147109 * one buf at mount time and that is re-used every time. This
224784b82766Smg147109 * is a synthesized buf structure that uses a kmem_alloc'ed chunk.
224884b82766Smg147109 * Not quite a normal buf attached to pages.
224984b82766Smg147109 */
225084b82766Smg147109 fsp->coalesced_bytes += bsize;
225184b82766Smg147109 nbuf = hqueue->nbuf;
225284b82766Smg147109 bioinit(nbuf);
225384b82766Smg147109 nbuf->b_edev = fio->bp->b_edev;
225484b82766Smg147109 nbuf->b_dev = fio->bp->b_dev;
225584b82766Smg147109 nbuf->b_flags = fio->bp->b_flags;
225684b82766Smg147109 nbuf->b_iodone = fio->bp->b_iodone;
225784b82766Smg147109 iodata = kmem_alloc(bsize, KM_SLEEP);
225884b82766Smg147109 nbuf->b_un.b_addr = iodata;
225984b82766Smg147109 nbuf->b_lblkno = fio->bp->b_lblkno;
226084b82766Smg147109 nbuf->b_vp = fvp;
226184b82766Smg147109 nbuf->b_file = fvp;
226284b82766Smg147109 nbuf->b_bcount = bsize;
226384b82766Smg147109 nbuf->b_bufsize = bsize;
226484b82766Smg147109
226584b82766Smg147109 DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
226684b82766Smg147109 bufcount, size_t, bsize);
226784b82766Smg147109
226884b82766Smg147109 /*
226984b82766Smg147109 * Perform I/O for the coalesced block.
227084b82766Smg147109 */
227184b82766Smg147109 (void) bdev_strategy(nbuf);
227284b82766Smg147109
227384b82766Smg147109 /*
227484b82766Smg147109 * Duplicate the last IO node to leave the sentinel alone.
227584b82766Smg147109 * The sentinel is freed in the next invocation of this
227684b82766Smg147109 * function.
227784b82766Smg147109 */
227884b82766Smg147109 prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
227984b82766Smg147109 prev->contig_chain->bp = tio->bp;
228084b82766Smg147109 prev->contig_chain->sema = tio->sema;
228184b82766Smg147109 tio = prev->contig_chain;
228284b82766Smg147109 tio->contig_chain = NULL;
228384b82766Smg147109 soffset = ldbtob(fio->bp->b_lblkno);
228484b82766Smg147109 nio = fio;
228584b82766Smg147109
228684b82766Smg147109 bioret = biowait(nbuf);
228784b82766Smg147109 data = bsize - nbuf->b_resid;
228884b82766Smg147109 biofini(nbuf);
228984b82766Smg147109 mutex_exit(&hqueue->strategy_lock);
229084b82766Smg147109
229184b82766Smg147109 /*
229284b82766Smg147109 * We use the b_resid parameter to detect how much
229384b82766Smg147109 * data was successfully transferred. We will signal
229484b82766Smg147109 * success to all the original (pre-coalescing) bufs
229584b82766Smg147109 * that were fully retrieved; the rest, if any, are
229684b82766Smg147109 * signaled as errors.
229784b82766Smg147109 */
229884b82766Smg147109 tio = nio;
229984b82766Smg147109 DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
230084b82766Smg147109 int, bioret, size_t, data);
230184b82766Smg147109
230284b82766Smg147109 /*
230384b82766Smg147109 * Copy data and signal success to all the bufs
230484b82766Smg147109 * which can be fully satisfied from b_resid.
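 * Continuing the illustrative 16k example above: if the coalesced
 * buf comes back with b_resid of 6k, data is 10k, so the first two
 * 4k constituent bufs are copied out and signaled as successful,
 * while the remaining two are signaled with the biowait() status
 * and a non-zero b_resid, their buffers zeroed from the point where
 * the transferred data ran out.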
230584b82766Smg147109 */ 230684b82766Smg147109 while (nio != NULL && data >= nio->bp->b_bcount) { 230784b82766Smg147109 offset = ldbtob(nio->bp->b_lblkno) - soffset; 230884b82766Smg147109 bcopy(iodata + offset, nio->bp->b_un.b_addr, 230984b82766Smg147109 nio->bp->b_bcount); 231084b82766Smg147109 data -= nio->bp->b_bcount; 231184b82766Smg147109 bioerror(nio->bp, 0); 231284b82766Smg147109 biodone(nio->bp); 231384b82766Smg147109 sema_v(nio->sema); 231484b82766Smg147109 tio = nio; 231584b82766Smg147109 nio = nio->contig_chain; 231684b82766Smg147109 kmem_cache_free(hio_cache, tio); 231784b82766Smg147109 } 231884b82766Smg147109 231984b82766Smg147109 /* 232084b82766Smg147109 * Signal error to all the leftover bufs (if any) 232184b82766Smg147109 * after b_resid data is exhausted. 232284b82766Smg147109 */ 232384b82766Smg147109 while (nio != NULL) { 232484b82766Smg147109 nio->bp->b_resid = nio->bp->b_bcount - data; 232584b82766Smg147109 bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid); 232684b82766Smg147109 bioerror(nio->bp, bioret); 232784b82766Smg147109 biodone(nio->bp); 232884b82766Smg147109 sema_v(nio->sema); 232984b82766Smg147109 tio = nio; 233084b82766Smg147109 nio = nio->contig_chain; 233184b82766Smg147109 kmem_cache_free(hio_cache, tio); 233284b82766Smg147109 data = 0; 233384b82766Smg147109 } 233484b82766Smg147109 kmem_free(iodata, bsize); 233584b82766Smg147109 } else { 233684b82766Smg147109 233784b82766Smg147109 nbuf = tio->bp; 233884b82766Smg147109 io_done = tio->sema; 233984b82766Smg147109 nio = fio; 234084b82766Smg147109 last = tio; 234184b82766Smg147109 234284b82766Smg147109 while (nio != NULL) { 234384b82766Smg147109 (void) bdev_strategy(nio->bp); 234484b82766Smg147109 nio = nio->contig_chain; 234584b82766Smg147109 } 234684b82766Smg147109 nio = fio; 234784b82766Smg147109 mutex_exit(&hqueue->strategy_lock); 234884b82766Smg147109 234984b82766Smg147109 while (nio != NULL) { 235084b82766Smg147109 if (nio == last) { 235184b82766Smg147109 (void) biowait(nbuf); 235284b82766Smg147109 sema_v(io_done); 235384b82766Smg147109 break; 235484b82766Smg147109 /* sentinel last not freed. See above. */ 235584b82766Smg147109 } else { 235684b82766Smg147109 (void) biowait(nio->bp); 235784b82766Smg147109 sema_v(nio->sema); 235884b82766Smg147109 } 235984b82766Smg147109 tio = nio; 236084b82766Smg147109 nio = nio->contig_chain; 236184b82766Smg147109 kmem_cache_free(hio_cache, tio); 236284b82766Smg147109 } 236384b82766Smg147109 } 236484b82766Smg147109 return (0); 236584b82766Smg147109 } 236684b82766Smg147109 236784b82766Smg147109 /* 236884b82766Smg147109 * Insert an I/O request in the I/O scheduler's pipeline 236984b82766Smg147109 * Using AVL tree makes it easy to reorder the I/O request 237084b82766Smg147109 * based on logical block number. 
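 * Each hio is linked into both trees: the deadline_tree, ordered by
 * arrival timestamp, is what lets expired requests jump the queue,
 * while the read_tree, ordered by logical block number, provides the
 * C-LOOK ordering used by hsched_invoke_strategy().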
 */
static void
hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
{
	struct hsfs_queue *hqueue = fsp->hqueue;

	mutex_enter(&hqueue->hsfs_queue_lock);

	fsp->physical_read_bytes += hsio->bp->b_bcount;
	if (ra)
		fsp->readahead_bytes += hsio->bp->b_bcount;

	/*
	 * Each request is indexed in both trees: deadline_tree keeps
	 * arrival order (for fairness) and read_tree is ordered by
	 * starting block (for seek locality on dequeue).
	 */
	avl_add(&hqueue->deadline_tree, hsio);
	avl_add(&hqueue->read_tree, hsio);

	DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
	    struct hsfs_queue *, hqueue, int, ra);

	mutex_exit(&hqueue->hsfs_queue_lock);
}

/* ARGSUSED */
static int
hsfs_pathconf(struct vnode *vp,
	int cmd,
	ulong_t *valp,
	struct cred *cr,
	caller_context_t *ct)
{
	struct hsfs *fsp;

	int error = 0;

	switch (cmd) {

	case _PC_NAME_MAX:
		fsp = VFS_TO_HSFS(vp->v_vfsp);
		*valp = fsp->hsfs_namemax;
		break;

	case _PC_FILESIZEBITS:
		*valp = 33;	/* Without multi extent support: 4 GB - 2k */
		break;

	case _PC_TIMESTAMP_RESOLUTION:
		/*
		 * HSFS keeps, at best, 1/100 second timestamp resolution.
		 */
		*valp = 10000000L;
		break;

	default:
		error = fs_pathconf(vp, cmd, valp, cr, ct);
		break;
	}

	return (error);
}


const fs_operation_def_t hsfs_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = hsfs_open },
	VOPNAME_CLOSE,		{ .vop_close = hsfs_close },
	VOPNAME_READ,		{ .vop_read = hsfs_read },
	VOPNAME_GETATTR,	{ .vop_getattr = hsfs_getattr },
	VOPNAME_ACCESS,		{ .vop_access = hsfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = hsfs_lookup },
	VOPNAME_READDIR,	{ .vop_readdir = hsfs_readdir },
	VOPNAME_READLINK,	{ .vop_readlink = hsfs_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = hsfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = hsfs_inactive },
	VOPNAME_FID,		{ .vop_fid = hsfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = hsfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = hsfs_frlock },
	VOPNAME_GETPAGE,	{ .vop_getpage = hsfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = hsfs_putpage },
	VOPNAME_MAP,		{ .vop_map = hsfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = hsfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = hsfs_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = hsfs_pathconf },
	NULL,			NULL
};

struct vnodeops *hsfs_vnodeops;
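/*
 * Note for readers: hsfs_vnodeops is not filled in here; the template
 * above is turned into the live operations vector at initialization
 * time. A minimal sketch of the usual registration call, assuming the
 * standard VFS template interface (the actual call lives in the hsfs
 * VFS init path, e.g. hsfs_vfsops.c):
 *
 *	if (vn_make_ops("hsfs", hsfs_vnodeops_template,
 *	    &hsfs_vnodeops) != 0)
 *		cmn_err(CE_WARN, "hsfsinit: bad vnode ops template");
 */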