xref: /titanic_44/usr/src/uts/common/fs/nfs/nfs_vnops.c (revision f4427914317470f516ca69daaed90f7efb3d76dc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23  *
24  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
25  *	All rights reserved.
26  */
27 
28 /*
29  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/vfs_opreg.h>
40 #include <sys/file.h>
41 #include <sys/filio.h>
42 #include <sys/uio.h>
43 #include <sys/buf.h>
44 #include <sys/mman.h>
45 #include <sys/pathname.h>
46 #include <sys/dirent.h>
47 #include <sys/debug.h>
48 #include <sys/vmsystm.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/swap.h>
52 #include <sys/errno.h>
53 #include <sys/strsubr.h>
54 #include <sys/sysmacros.h>
55 #include <sys/kmem.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathconf.h>
58 #include <sys/utsname.h>
59 #include <sys/dnlc.h>
60 #include <sys/acl.h>
61 #include <sys/atomic.h>
62 #include <sys/policy.h>
63 #include <sys/sdt.h>
64 
65 #include <rpc/types.h>
66 #include <rpc/auth.h>
67 #include <rpc/clnt.h>
68 
69 #include <nfs/nfs.h>
70 #include <nfs/nfs_clnt.h>
71 #include <nfs/rnode.h>
72 #include <nfs/nfs_acl.h>
73 #include <nfs/lm.h>
74 
75 #include <vm/hat.h>
76 #include <vm/as.h>
77 #include <vm/page.h>
78 #include <vm/pvn.h>
79 #include <vm/seg.h>
80 #include <vm/seg_map.h>
81 #include <vm/seg_kpm.h>
82 #include <vm/seg_vn.h>
83 
84 #include <fs/fs_subr.h>
85 
86 #include <sys/ddi.h>
87 
/*
 * Forward declarations of the helper routines that are private to this
 * file (all of them are static).
 */
88 static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
89 			cred_t *);
90 static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
91 static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
92 static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
93 static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
94 static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
95 static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
96 			caller_context_t *);
97 static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
98 static int	nfs_bio(struct buf *, cred_t *);
99 static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
100 			page_t *[], size_t, struct seg *, caddr_t,
101 			enum seg_rw, cred_t *);
102 static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
103 			cred_t *);
104 static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
105 			int, cred_t *);
106 static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
107 			int, cred_t *);
108 static void	nfs_delmap_callback(struct as *, void *, uint_t);
109 
110 /*
111  * Error flags used to pass information about certain special errors
112  * which need to be handled specially.
 *
 * NOTE(review): the negative value presumably keeps NFS_EOF distinct from
 * the (positive) errno values returned elsewhere in this file; its users
 * are not visible in this part of the file -- confirm before relying on it.
113  */
114 #define	NFS_EOF			-98
115 
116 /*
117  * These are the vnode ops routines which implement the vnode interface to
118  * the networked file system.  These routines just take their parameters,
119  * make them look networkish by putting the right info into interface structs,
120  * and then calling the appropriate remote routine(s) to do the work.
121  *
122  * Note on directory name lookup caching:  If we detect a stale fhandle,
123  * we purge the directory cache relative to that vnode.  This way, the
124  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
125  * more details on rnode locking.
126  */
127 
/*
 * Forward declarations of the vnode operation entry points.  Every routine
 * declared here except nfs_accessx() is installed through
 * nfs_vnodeops_template[].
 */
128 static int	nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
129 static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *,
130 			caller_context_t *);
131 static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
132 			caller_context_t *);
133 static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
134 			caller_context_t *);
135 static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
136 			caller_context_t *);
137 static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
138 			caller_context_t *);
139 static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
140 			caller_context_t *);
141 static int	nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
/* Not a vnode operation itself: it does not appear in the template. */
142 static int	nfs_accessx(void *, int, cred_t *);
143 static int	nfs_readlink(vnode_t *, struct uio *, cred_t *,
144 			caller_context_t *);
145 static int	nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
146 static void	nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
147 static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
148 			int, vnode_t *, cred_t *, caller_context_t *,
149 			int *, pathname_t *);
150 static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
151 			int, vnode_t **, cred_t *, int, caller_context_t *,
152 			vsecattr_t *);
153 static int	nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
154 			int);
155 static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
156 			caller_context_t *, int);
157 static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
158 			caller_context_t *, int);
159 static int	nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
160 			cred_t *, caller_context_t *, int, vsecattr_t *);
161 static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
162 			caller_context_t *, int);
163 static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
164 			cred_t *, caller_context_t *, int);
165 static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
166 			caller_context_t *, int);
167 static int	nfs_fid(vnode_t *, fid_t *, caller_context_t *);
168 static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
169 static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
170 static int	nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
171 static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
172 			page_t *[], size_t, struct seg *, caddr_t,
173 			enum seg_rw, cred_t *, caller_context_t *);
174 static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
175 			caller_context_t *);
176 static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
177 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
178 static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
179 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
180 static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
181 			struct flk_callback *, cred_t *, caller_context_t *);
182 static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
183 			cred_t *, caller_context_t *);
184 static int	nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
185 static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
186 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
187 static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
188 			caller_context_t *);
189 static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
190 			cred_t *, caller_context_t *);
191 static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
192 			caller_context_t *);
193 static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
194 			caller_context_t *);
195 static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
196 			caller_context_t *);
197 
/*
 * Pointer to the vnode operations vector for this file system; it is
 * handed out by nfs_getvnodeops().  It is not assigned in this part of the
 * file.
 * NOTE(review): presumably it is built from nfs_vnodeops_template[] when
 * the file system is initialized -- confirm against the init code.
 */
198 struct vnodeops *nfs_vnodeops;
199 
/*
 * Table pairing each vnode operation name (VOPNAME_*) with the routine
 * that implements it for this file system.  The table ends with a
 * NULL, NULL entry.  nfs_dump and fs_vnevent_support are not declared in
 * this file's prototype list above and must come from the included headers.
 */
200 const fs_operation_def_t nfs_vnodeops_template[] = {
201 	VOPNAME_OPEN,		{ .vop_open = nfs_open },
202 	VOPNAME_CLOSE,		{ .vop_close = nfs_close },
203 	VOPNAME_READ,		{ .vop_read = nfs_read },
204 	VOPNAME_WRITE,		{ .vop_write = nfs_write },
205 	VOPNAME_IOCTL,		{ .vop_ioctl = nfs_ioctl },
206 	VOPNAME_GETATTR,	{ .vop_getattr = nfs_getattr },
207 	VOPNAME_SETATTR,	{ .vop_setattr = nfs_setattr },
208 	VOPNAME_ACCESS,		{ .vop_access = nfs_access },
209 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs_lookup },
210 	VOPNAME_CREATE,		{ .vop_create = nfs_create },
211 	VOPNAME_REMOVE,		{ .vop_remove = nfs_remove },
212 	VOPNAME_LINK,		{ .vop_link = nfs_link },
213 	VOPNAME_RENAME,		{ .vop_rename = nfs_rename },
214 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs_mkdir },
215 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs_rmdir },
216 	VOPNAME_READDIR,	{ .vop_readdir = nfs_readdir },
217 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs_symlink },
218 	VOPNAME_READLINK,	{ .vop_readlink = nfs_readlink },
219 	VOPNAME_FSYNC,		{ .vop_fsync = nfs_fsync },
220 	VOPNAME_INACTIVE,	{ .vop_inactive = nfs_inactive },
221 	VOPNAME_FID,		{ .vop_fid = nfs_fid },
222 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs_rwlock },
223 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs_rwunlock },
224 	VOPNAME_SEEK,		{ .vop_seek = nfs_seek },
225 	VOPNAME_FRLOCK,		{ .vop_frlock = nfs_frlock },
226 	VOPNAME_SPACE,		{ .vop_space = nfs_space },
227 	VOPNAME_REALVP,		{ .vop_realvp = nfs_realvp },
228 	VOPNAME_GETPAGE,	{ .vop_getpage = nfs_getpage },
229 	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs_putpage },
230 	VOPNAME_MAP,		{ .vop_map = nfs_map },
231 	VOPNAME_ADDMAP,		{ .vop_addmap = nfs_addmap },
232 	VOPNAME_DELMAP,		{ .vop_delmap = nfs_delmap },
233 	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
234 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs_pathconf },
235 	VOPNAME_PAGEIO,		{ .vop_pageio = nfs_pageio },
236 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs_setsecattr },
237 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs_getsecattr },
238 	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs_shrlock },
239 	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
240 	NULL,			NULL
241 };
242 
243 /*
 * Return the current value of nfs_vnodeops, the pointer to this file
 * system's vnode operations vector.
 *
244  * XXX:  This is referenced in modstubs.s
245  */
246 struct vnodeops *
247 nfs_getvnodeops(void)
248 {
249 	return (nfs_vnodeops);
250 }
251 
/*
 * nfs_open - open entry point (installed as .vop_open).
 *
 *	vpp	- pointer to the vnode being opened; the vnode is not replaced
 *	flag	- not used
 *	cr	- caller's credentials
 *	ct	- not used
 *
 * Fails with EIO when the calling zone (nfs_zone()) is not the zone of the
 * mount.  If the rnode has no credential yet, the caller's cred is held
 * and stored in r_cred.  The routine then decides how much cache
 * validation the open requires (see the comment in the body) and returns
 * 0 or the error from nfs_validate_caches() or nfs_getattr_otw().
 */
252 /* ARGSUSED */
253 static int
254 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
255 {
256 	int error;
257 	struct vattr va;
258 	rnode_t *rp;
259 	vnode_t *vp;
260 
261 	vp = *vpp;
262 	rp = VTOR(vp);
263 	if (nfs_zone() != VTOMI(vp)->mi_zone)
264 		return (EIO);
	/* Remember the first opener's credentials on the rnode. */
265 	mutex_enter(&rp->r_statelock);
266 	if (rp->r_cred == NULL) {
267 		crhold(cr);
268 		rp->r_cred = cr;
269 	}
270 	mutex_exit(&rp->r_statelock);
271 
272 	/*
273 	 * If there is no cached data or if close-to-open
274 	 * consistency checking is turned off, we can avoid
275 	 * the over the wire getattr.  Otherwise, if the
276 	 * file system is mounted readonly, then just verify
277 	 * the caches are up to date using the normal mechanism.
278 	 * Else, if the file is not mmap'd, then just mark
279 	 * the attributes as timed out.  They will be refreshed
280 	 * and the caches validated prior to being used.
281 	 * Else, the file system is mounted writeable so
282 	 * force an over the wire GETATTR in order to ensure
283 	 * that all cached data is valid.
	 *
	 * Note that a vnode with more than one reference always takes
	 * the validation path, whether or not it has cached data and
	 * whether or not MI_NOCTO is set; and the "mark as timed out"
	 * shortcut is taken only when this is the sole reference.
284 	 */
285 	if (vp->v_count > 1 ||
286 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
287 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
288 		if (vn_is_readonly(vp))
289 			error = nfs_validate_caches(vp, cr);
290 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
291 			PURGE_ATTRCACHE(vp);
292 			error = 0;
293 		} else {
294 			va.va_mask = AT_ALL;
295 			error = nfs_getattr_otw(vp, &va, cr);
296 		}
297 	} else
298 		error = 0;
299 
300 	return (error);
301 }
302 
/*
 * nfs_close - close entry point (installed as .vop_close).
 *
 *	vp	- file being closed
 *	flag	- open flags; FWRITE selects the page flush below
 *	count	- when greater than 1, only the locks are released
 *	offset	- passed through to nfs_lockrelease()
 *	cr	- caller's credentials
 *	ct	- passed through to nfs_putpage()
 *
 * Returns EIO without doing anything when called from a zone other than
 * the mount's.  Otherwise the locks are released on every call, and the
 * rest of the work is done only when count is not greater than 1: the
 * DNLC is purged for a file with r_unldvp set, dirty pages are flushed if
 * the file was open for writing, the error saved in r_error is picked up
 * and cleared, and the attributes are refetched over the wire if
 * RWRITEATTR is set.
 *
 * Returns 0, EIO, an error from nfs_putpage(), or the saved r_error.
 */
303 /* ARGSUSED */
304 static int
305 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
306 	caller_context_t *ct)
307 {
308 	rnode_t *rp;
309 	int error;
310 	struct vattr va;
311 
312 	/*
313 	 * zone_enter(2) prevents processes from changing zones with NFS files
314 	 * open; if we happen to get here from the wrong zone we can't do
315 	 * anything over the wire.
316 	 */
317 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
318 		/*
319 		 * We could attempt to clean up locks, except we're sure
320 		 * that the current process didn't acquire any locks on
321 		 * the file: any attempt to lock a file belonging to another
		 * zone
322 		 * will fail, and one can't lock an NFS file and then change
323 		 * zones, as that fails too.
324 		 *
325 		 * Returning an error here is the sane thing to do.  A
326 		 * subsequent call to VN_RELE() which translates to a
327 		 * nfs_inactive() will clean up state: if the zone of the
328 		 * vnode's origin is still alive and kicking, an async worker
329 		 * thread will handle the request (from the correct zone), and
330 		 * everything (minus the final nfs_getattr_otw() call) should
331 		 * be OK. If the zone is going away nfs_async_inactive() will
332 		 * throw away cached pages inline.
333 		 */
334 		return (EIO);
335 	}
336 
337 	/*
338 	 * If we are using local locking for this filesystem, then
339 	 * release all of the SYSV style record locks.  Otherwise,
340 	 * we are doing network locking and we need to release all
341 	 * of the network locks.  All of the locks held by this
342 	 * process on this file are released no matter what the
343 	 * incoming reference count is.
344 	 */
345 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
346 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
347 		cleanshares(vp, ttoproc(curthread)->p_pid);
348 	} else
349 		nfs_lockrelease(vp, flag, offset, cr);
350 
	/* Nothing more to do when count is greater than 1. */
351 	if (count > 1)
352 		return (0);
353 
354 	/*
355 	 * If the file has been `unlinked', then purge the
356 	 * DNLC so that this vnode will get recycled quicker
357 	 * and the .nfs* file on the server will get removed.
358 	 */
359 	rp = VTOR(vp);
360 	if (rp->r_unldvp != NULL)
361 		dnlc_purge_vp(vp);
362 
363 	/*
364 	 * If the file was open for write and there are pages,
365 	 * then if the file system was mounted using the "no-close-
366 	 *	to-open" semantics, then start an asynchronous flush
367 	 *	of the all of the pages in the file.
368 	 * else the file system was not mounted using the "no-close-
369 	 *	to-open" semantics, then do a synchronous flush and
370 	 *	commit of all of the dirty and uncommitted pages.
371 	 *
372 	 * The asynchronous flush of the pages in the "nocto" path
373 	 * mostly just associates a cred pointer with the rnode so
374 	 * writes which happen later will have a better chance of
375 	 * working.  It also starts the data being written to the
376 	 * server, but without unnecessarily delaying the application.
	 *
	 * In either case, if the flush itself reports an error, that
	 * error is returned and r_error is left untouched; otherwise
	 * the error saved in r_error is returned and cleared.
377 	 */
378 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
379 		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
380 			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
381 			    cr, ct);
			/* EAGAIN from the async flush is not an error. */
382 			if (error == EAGAIN)
383 				error = 0;
384 		} else
385 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
386 		if (!error) {
387 			mutex_enter(&rp->r_statelock);
388 			error = rp->r_error;
389 			rp->r_error = 0;
390 			mutex_exit(&rp->r_statelock);
391 		}
392 	} else {
393 		mutex_enter(&rp->r_statelock);
394 		error = rp->r_error;
395 		rp->r_error = 0;
396 		mutex_exit(&rp->r_statelock);
397 	}
398 
399 	/*
400 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
401 	 * refresh the attribute cache with a set of attributes which
402 	 * weren't returned from a WRITE.  This will enable the close-
403 	 * to-open processing to work.
	 *
	 * The result of the GETATTR is deliberately ignored.
	 * NOTE(review): va.va_mask is not set here, unlike the same
	 * call in nfs_open(); presumably nfs_getattr_otw() does not
	 * consult it -- confirm.
404 	 */
405 	if (rp->r_flags & RWRITEATTR)
406 		(void) nfs_getattr_otw(vp, &va, cr);
407 
408 	return (error);
409 }
410 
/*
 * nfs_read - read entry point (installed as .vop_read).
 *
 *	vp	- file to read; must be a regular file (EISDIR otherwise)
 *	uiop	- destination, starting offset and byte count
 *	ioflag	- not used
 *	cr	- credentials used for the over the wire operations
 *	ct	- not used
 *
 * The caller is expected to hold r_rwlock as reader (asserted).  A request
 * for zero bytes succeeds at once.  Offsets are confined to MAXOFF32_T: a
 * starting offset beyond it yields EFBIG; a negative offset, or a request
 * that would extend beyond it, yields EINVAL.
 *
 * The data is read in one of two ways:
 *   - uncached: nfsread() into a temporary kernel buffer which is then
 *     copied out with uiomove();
 *   - cached: through vpm or segmap, in pieces that never cross a
 *     MAXBSIZE boundary and never extend beyond the cached file size.
 *
 * Returns 0 or an errno value (EIO from the wrong zone, EINTR if the wait
 * for a cache purge is interrupted, or whatever the lower layers return).
 */
411 /* ARGSUSED */
412 static int
413 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
414 	caller_context_t *ct)
415 {
416 	rnode_t *rp;
417 	u_offset_t off;
418 	offset_t diff;
419 	int on;
420 	size_t n;
421 	caddr_t base;
422 	uint_t flags;
423 	int error;
424 	mntinfo_t *mi;
425 
426 	rp = VTOR(vp);
427 	mi = VTOMI(vp);
428 
429 	if (nfs_zone() != mi->mi_zone)
430 		return (EIO);
431 
432 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
433 
434 	if (vp->v_type != VREG)
435 		return (EISDIR);
436 
437 	if (uiop->uio_resid == 0)
438 		return (0);
439 
440 	if (uiop->uio_loffset > MAXOFF32_T)
441 		return (EFBIG);
442 
443 	if (uiop->uio_loffset < 0 ||
444 	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
445 		return (EINVAL);
446 
447 	/*
448 	 * Bypass VM if caching has been disabled (e.g., locking) or if
449 	 * using client-side direct I/O and the file is not mmap'd and
450 	 * there are no cached pages.
451 	 */
452 	if ((vp->v_flag & VNOCACHE) ||
453 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
454 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
455 	    !vn_has_cached_data(vp))) {
456 		size_t bufsize;
457 		size_t resid = 0;
458 
459 		/*
460 		 * Let's try to do read in as large a chunk as we can
461 		 * (Filesystem (NFS client) bsize if possible/needed).
462 		 * For V3, this is 32K and for V2, this is 8K.
463 		 */
464 		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
465 		base = kmem_alloc(bufsize, KM_SLEEP);
		/*
		 * After a successful nfsread(), n is reduced to the
		 * number of bytes actually read.  The loop ends on an
		 * error, when the request is satisfied, or when a read
		 * returns no data at all.
		 */
466 		do {
467 			n = MIN(uiop->uio_resid, bufsize);
468 			error = nfsread(vp, base, uiop->uio_offset, n,
469 			    &resid, cr);
470 			if (!error) {
471 				n -= resid;
472 				error = uiomove(base, n, UIO_READ, uiop);
473 			}
474 		} while (!error && uiop->uio_resid > 0 && n > 0);
475 		kmem_free(base, bufsize);
476 		return (error);
477 	}
478 
479 	error = 0;
480 
481 	do {
482 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
483 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
484 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
485 
486 		error = nfs_validate_caches(vp, cr);
487 		if (error)
488 			break;
489 
		/*
		 * Wait (interruptibly) while RINCACHEPURGE is set, then
		 * sample the cached file size under r_statelock.
		 */
490 		mutex_enter(&rp->r_statelock);
491 		while (rp->r_flags & RINCACHEPURGE) {
492 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
493 				mutex_exit(&rp->r_statelock);
494 				return (EINTR);
495 			}
496 		}
497 		diff = rp->r_size - uiop->uio_loffset;
498 		mutex_exit(&rp->r_statelock);
		/*
		 * diff is the distance from the current offset to the
		 * cached end of file: stop when at or beyond it,
		 * otherwise do not read past it.
		 */
499 		if (diff <= 0)
500 			break;
501 		if (diff < n)
502 			n = (size_t)diff;
503 
504 		if (vpm_enable) {
505 			/*
506 			 * Copy data.
507 			 */
508 			error = vpm_data_copy(vp, off + on, n, uiop,
509 			    1, NULL, 0, S_READ);
510 		} else {
511 			base = segmap_getmapflt(segkmap, vp, off + on, n,
512 			    1, S_READ);
513 			error = uiomove(base + on, n, UIO_READ, uiop);
514 		}
515 
516 		if (!error) {
517 			/*
518 			 * If read a whole block or read to eof,
519 			 * won't need this buffer again soon.
520 			 */
521 			mutex_enter(&rp->r_statelock);
522 			if (n + on == MAXBSIZE ||
523 			    uiop->uio_loffset == rp->r_size)
524 				flags = SM_DONTNEED;
525 			else
526 				flags = 0;
527 			mutex_exit(&rp->r_statelock);
528 			if (vpm_enable) {
529 				error = vpm_sync_pages(vp, off, n, flags);
530 			} else {
531 				error = segmap_release(segkmap, base, flags);
532 			}
533 		} else {
			/*
			 * The copy failed: still release the mapping, but
			 * keep the copy error rather than the release status.
			 */
534 			if (vpm_enable) {
535 				(void) vpm_sync_pages(vp, off, n, 0);
536 			} else {
537 				(void) segmap_release(segkmap, base, 0);
538 			}
539 		}
540 	} while (!error && uiop->uio_resid > 0);
541 
542 	return (error);
543 }
544 
/*
 * nfs_write - write entry point (installed as .vop_write).
 *
 *	vp	- file to write; must be a regular file (EISDIR otherwise)
 *	uiop	- source data, starting offset and file size limit
 *	ioflag	- FAPPEND, FSYNC and FDSYNC are examined
 *	cr	- credentials used for the over the wire operations
 *	ct	- not used
 *
 * After the zone, file type, offset and file size limit checks,
 * r_lkserlock is taken as reader for the rest of the call.  The data is
 * then written either directly over the wire with nfswrite() (caching
 * disabled, or direct I/O with no mappings and no cached pages) or through
 * the page cache with writerp(), in pieces that never cross a MAXBSIZE
 * boundary.
 *
 * A request that would cross the file size limit is shortened to end at
 * the limit; the part cut off ("remainder") is always added back to
 * uio_resid before returning, so that the caller sees it as unwritten.
 * When one of the write loops fails, uio_resid and uio_loffset are reset
 * to the values recorded at the start of the piece that failed.
 *
 * Returns 0 or an errno value.
 */
545 /* ARGSUSED */
546 static int
547 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
548 	caller_context_t *ct)
549 {
550 	rnode_t *rp;
551 	u_offset_t off;
552 	caddr_t base;
553 	uint_t flags;
554 	int remainder;
555 	size_t n;
556 	int on;
557 	int error;
558 	int resid;
559 	offset_t offset;
560 	rlim_t limit;
561 	mntinfo_t *mi;
562 
563 	rp = VTOR(vp);
564 
565 	mi = VTOMI(vp);
566 	if (nfs_zone() != mi->mi_zone)
567 		return (EIO);
568 	if (vp->v_type != VREG)
569 		return (EISDIR);
570 
571 	if (uiop->uio_resid == 0)
572 		return (0);
573 
574 	if (ioflag & FAPPEND) {
575 		struct vattr va;
576 
577 		/*
578 		 * Must serialize if appending.
		 *
		 * NOTE(review): if re-acquiring r_rwlock as writer is
		 * interrupted we return with the lock no longer held;
		 * confirm that the caller's unlock path tolerates this.
579 		 */
580 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
581 			nfs_rw_exit(&rp->r_rwlock);
582 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
583 			    INTR(vp)))
584 				return (EINTR);
585 		}
586 
		/* Appending: start the write at the current file size. */
587 		va.va_mask = AT_SIZE;
588 		error = nfsgetattr(vp, &va, cr);
589 		if (error)
590 			return (error);
591 		uiop->uio_loffset = va.va_size;
592 	}
593 
594 	if (uiop->uio_loffset > MAXOFF32_T)
595 		return (EFBIG);
596 
	/* offset is, for now, the offset just past the end of the request. */
597 	offset = uiop->uio_loffset + uiop->uio_resid;
598 
599 	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
600 		return (EINVAL);
601 
602 	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
603 		limit = MAXOFF32_T;
604 	} else {
605 		limit = (rlim_t)uiop->uio_llimit;
606 	}
607 
608 	/*
609 	 * Check to make sure that the process will not exceed
610 	 * its limit on file size.  It is okay to write up to
611 	 * the limit, but not beyond.  Thus, the write which
612 	 * reaches the limit will be short and the next write
613 	 * will return an error.
614 	 */
615 	remainder = 0;
616 	if (offset > limit) {
617 		remainder = offset - limit;
618 		uiop->uio_resid = limit - uiop->uio_offset;
619 		if (uiop->uio_resid <= 0) {
620 			proc_t *p = ttoproc(curthread);
621 
622 			uiop->uio_resid += remainder;
623 			mutex_enter(&p->p_lock);
624 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
625 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
626 			mutex_exit(&p->p_lock);
627 			return (EFBIG);
628 		}
629 	}
630 
	/*
	 * Nothing has been written yet, so if we are interrupted here the
	 * part of the request that was cut off above must be given back.
	 * Otherwise the caller would see a residual count smaller than the
	 * request and could take the difference for data written.
	 */
631 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
		uiop->uio_resid += remainder;
632 		return (EINTR);
	}
633 
634 	/*
635 	 * Bypass VM if caching has been disabled (e.g., locking) or if
636 	 * using client-side direct I/O and the file is not mmap'd and
637 	 * there are no cached pages.
638 	 */
639 	if ((vp->v_flag & VNOCACHE) ||
640 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
641 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
642 	    !vn_has_cached_data(vp))) {
643 		size_t bufsize;
644 		int count;
645 		uint_t org_offset;
646 
		/*
		 * The cached path below also jumps here when faulting in
		 * a page fails with EACCES.
		 */
647 nfs_fwrite:
648 		if (rp->r_flags & RSTALE) {
649 			resid = uiop->uio_resid;
650 			offset = uiop->uio_loffset;
651 			error = rp->r_error;
652 			/*
653 			 * A close may have cleared r_error, if so,
654 			 * propagate ESTALE error return properly
655 			 */
656 			if (error == 0)
657 				error = ESTALE;
658 			goto bottom;
659 		}
660 		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
661 		base = kmem_alloc(bufsize, KM_SLEEP);
		/*
		 * Copy in and write one buffer at a time.  resid and
		 * offset record the state before each piece so that it
		 * can be restored at "bottom" if the piece fails.
		 */
662 		do {
663 			resid = uiop->uio_resid;
664 			offset = uiop->uio_loffset;
665 			count = MIN(uiop->uio_resid, bufsize);
666 			org_offset = uiop->uio_offset;
667 			error = uiomove(base, count, UIO_WRITE, uiop);
668 			if (!error) {
669 				error = nfswrite(vp, base, org_offset,
670 				    count, cr);
671 			}
672 		} while (!error && uiop->uio_resid > 0);
673 		kmem_free(base, bufsize);
674 		goto bottom;
675 	}
676 
677 	do {
678 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
679 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
680 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
681 
		/* State to restore at "bottom" if this piece fails. */
682 		resid = uiop->uio_resid;
683 		offset = uiop->uio_loffset;
684 
685 		if (rp->r_flags & RSTALE) {
686 			error = rp->r_error;
687 			/*
688 			 * A close may have cleared r_error, if so,
689 			 * propagate ESTALE error return properly
690 			 */
691 			if (error == 0)
692 				error = ESTALE;
693 			break;
694 		}
695 
696 		/*
697 		 * Don't create dirty pages faster than they
698 		 * can be cleaned so that the system doesn't
699 		 * get imbalanced.  If the async queue is
700 		 * maxed out, then wait for it to drain before
701 		 * creating more dirty pages.  Also, wait for
702 		 * any threads doing pagewalks in the vop_getattr
703 		 * entry points so that they don't block for
704 		 * long periods.
		 *
		 * The wait is interruptible only when INTR(vp) is true;
		 * lwp_nostop is raised around the interruptible wait.
705 		 */
706 		mutex_enter(&rp->r_statelock);
707 		while ((mi->mi_max_threads != 0 &&
708 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
709 		    rp->r_gcount > 0) {
710 			if (INTR(vp)) {
711 				klwp_t *lwp = ttolwp(curthread);
712 
713 				if (lwp != NULL)
714 					lwp->lwp_nostop++;
715 				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
716 					mutex_exit(&rp->r_statelock);
717 					if (lwp != NULL)
718 						lwp->lwp_nostop--;
719 					error = EINTR;
720 					goto bottom;
721 				}
722 				if (lwp != NULL)
723 					lwp->lwp_nostop--;
724 			} else
725 				cv_wait(&rp->r_cv, &rp->r_statelock);
726 		}
727 		mutex_exit(&rp->r_statelock);
728 
729 		/*
730 		 * Touch the page and fault it in if it is not in core
731 		 * before segmap_getmapflt or vpm_data_copy can lock it.
732 		 * This is to avoid the deadlock if the buffer is mapped
733 		 * to the same file through mmap which we want to write.
734 		 */
735 		uio_prefaultpages((long)n, uiop);
736 
737 		if (vpm_enable) {
738 			/*
739 			 * It will use kpm mappings, so no need to
740 			 * pass an address.
741 			 */
742 			error = writerp(rp, NULL, n, uiop, 0);
743 		} else  {
744 			if (segmap_kpm) {
745 				int pon = uiop->uio_loffset & PAGEOFFSET;
746 				size_t pn = MIN(PAGESIZE - pon,
747 				    uiop->uio_resid);
748 				int pagecreate;
749 
				/*
				 * pagecreate is set when the write starts
				 * on a page boundary and either covers the
				 * whole page or reaches the cached end of
				 * file.
				 * NOTE(review): the !pagecreate argument
				 * presumably tells segmap whether to fault
				 * the existing page in -- confirm.
				 */
750 				mutex_enter(&rp->r_statelock);
751 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
752 				    uiop->uio_loffset + pn >= rp->r_size);
753 				mutex_exit(&rp->r_statelock);
754 
755 				base = segmap_getmapflt(segkmap, vp, off + on,
756 				    pn, !pagecreate, S_WRITE);
757 
758 				error = writerp(rp, base + pon, n, uiop,
759 				    pagecreate);
760 
761 			} else {
762 				base = segmap_getmapflt(segkmap, vp, off + on,
763 				    n, 0, S_READ);
764 				error = writerp(rp, base + on, n, uiop, 0);
765 			}
766 		}
767 
768 		if (!error) {
769 			if (mi->mi_flags & MI_NOAC)
770 				flags = SM_WRITE;
771 			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
772 				/*
773 				 * Have written a whole block.
774 				 * Start an asynchronous write
775 				 * and mark the buffer to
776 				 * indicate that it won't be
777 				 * needed again soon.
778 				 */
779 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
780 			} else
781 				flags = 0;
			/*
			 * Synchronous writes, and files that have run out
			 * of space, are written now rather than
			 * asynchronously.
			 */
782 			if ((ioflag & (FSYNC|FDSYNC)) ||
783 			    (rp->r_flags & ROUTOFSPACE)) {
784 				flags &= ~SM_ASYNC;
785 				flags |= SM_WRITE;
786 			}
787 			if (vpm_enable) {
788 				error = vpm_sync_pages(vp, off, n, flags);
789 			} else {
790 				error = segmap_release(segkmap, base, flags);
791 			}
792 		} else {
793 			if (vpm_enable) {
794 				(void) vpm_sync_pages(vp, off, n, 0);
795 			} else {
796 				(void) segmap_release(segkmap, base, 0);
797 			}
798 			/*
799 			 * In the event that we got an access error while
800 			 * faulting in a page for a write-only file just
801 			 * force a write.
802 			 */
803 			if (error == EACCES)
804 				goto nfs_fwrite;
805 		}
806 	} while (!error && uiop->uio_resid > 0);
807 
808 bottom:
	/*
	 * On failure, rewind to the start of the piece that failed.  In
	 * both cases give back the part cut off by the file size limit.
	 */
809 	if (error) {
810 		uiop->uio_resid = resid + remainder;
811 		uiop->uio_loffset = offset;
812 	} else
813 		uiop->uio_resid += remainder;
814 
815 	nfs_rw_exit(&rp->r_lkserlock);
816 
817 	return (error);
818 }
819 
820 /*
 * Perform I/O on the pages in the list pp: build a buf for them with
 * pageio_setup(), map it into the kernel, hand it to nfs_bio(), and then
 * unmap and release the buf.  Returns the value returned by nfs_bio().
 * Must be called from the zone of the mount (asserted).
 *
 *	vp	- file the pages belong to
 *	pp	- page list handed to pageio_setup()
 *	off	- file offset of the I/O
 *	len	- length of the I/O in bytes
 *	flags	- passed to pageio_setup(); see below
 *	cr	- credentials passed to nfs_bio()
 *
821  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
822  */
823 static int
824 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
825 	int flags, cred_t *cr)
826 {
827 	struct buf *bp;
828 	int error;
829 
830 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
831 	bp = pageio_setup(pp, len, vp, flags);
832 	ASSERT(bp != NULL);
833 
834 	/*
835 	 * pageio_setup should have set b_addr to 0.  This
836 	 * is correct since we want to do I/O on a page
837 	 * boundary.  bp_mapin will use this addr to calculate
838 	 * an offset, and then set b_addr to the kernel virtual
839 	 * address it allocated for us.
840 	 */
841 	ASSERT(bp->b_un.b_addr == 0);
842 
	/* There is no device behind the buf; it is identified by vp. */
843 	bp->b_edev = 0;
844 	bp->b_dev = 0;
845 	bp->b_lblkno = lbtodb(off);
846 	bp->b_file = vp;
847 	bp->b_offset = (offset_t)off;
848 	bp_mapin(bp);
849 
850 	error = nfs_bio(bp, cr);
851 
852 	bp_mapout(bp);
853 	pageio_done(bp);
854 
855 	return (error);
856 }
857 
858 /*
859  * Write to file.  Writes to remote server in largest size
860  * chunks that the server can handle.  Write is synchronous.
 *
 *	vp	- file to write
 *	base	- kernel buffer holding the data
 *	offset	- file offset of the first byte
 *	count	- number of bytes to write
 *	cr	- credentials for the RFS_WRITE calls
 *
 * Each call carries at most mi_curwrite bytes and is repeated for as long
 * as rfs2call() returns ENFS_TRYAGAIN.  After every successful call the
 * cached attributes are marked as timed out and RWRITEATTR is set on the
 * rnode.  Returns 0 once all count bytes have been written, otherwise the
 * error from rfs2call() or the errno derived from the reply status.
861  */
862 static int
863 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
864 {
865 	rnode_t *rp;
866 	mntinfo_t *mi;
867 	struct nfswriteargs wa;
868 	struct nfsattrstat ns;
869 	int error;
870 	int tsize;
871 	int douprintf;
872 
873 	douprintf = 1;
874 
875 	rp = VTOR(vp);
876 	mi = VTOMI(vp);
877 
878 	ASSERT(nfs_zone() == mi->mi_zone);
879 
880 	wa.wa_args = &wa.wa_args_buf;
881 	wa.wa_fhandle = *VTOFH(vp);
882 
883 	do {
		/* Size of this transfer: the rest, capped at mi_curwrite. */
884 		tsize = MIN(mi->mi_curwrite, count);
885 		wa.wa_data = base;
886 		wa.wa_begoff = offset;
887 		wa.wa_totcount = tsize;
888 		wa.wa_count = tsize;
889 		wa.wa_offset = offset;
890 
		/* Account the request in the I/O kstats, if they exist. */
891 		if (mi->mi_io_kstats) {
892 			mutex_enter(&mi->mi_lock);
893 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
894 			mutex_exit(&mi->mi_lock);
895 		}
896 		wa.wa_mblk = NULL;
897 		do {
898 			error = rfs2call(mi, RFS_WRITE,
899 			    xdr_writeargs, (caddr_t)&wa,
900 			    xdr_attrstat, (caddr_t)&ns, cr,
901 			    &douprintf, &ns.ns_status, 0, NULL);
902 		} while (error == ENFS_TRYAGAIN);
903 		if (mi->mi_io_kstats) {
904 			mutex_enter(&mi->mi_lock);
905 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
906 			mutex_exit(&mi->mi_lock);
907 		}
908 
909 		if (!error) {
			/* The call went through; check the reply status. */
910 			error = geterrno(ns.ns_status);
911 			/*
912 			 * Can't check for stale fhandle and purge caches
913 			 * here because pages are held by nfs_getpage.
914 			 * Just mark the attribute cache as timed out
915 			 * and set RWRITEATTR to indicate that the file
916 			 * was modified with a WRITE operation.
917 			 */
918 			if (!error) {
				/* Advance past the bytes just written. */
919 				count -= tsize;
920 				base += tsize;
921 				offset += tsize;
922 				if (mi->mi_io_kstats) {
923 					mutex_enter(&mi->mi_lock);
924 					KSTAT_IO_PTR(mi->mi_io_kstats)->
925 					    writes++;
926 					KSTAT_IO_PTR(mi->mi_io_kstats)->
927 					    nwritten += tsize;
928 					mutex_exit(&mi->mi_lock);
929 				}
930 				lwp_stat_update(LWP_STAT_OUBLK, 1);
931 				mutex_enter(&rp->r_statelock);
932 				PURGE_ATTRCACHE_LOCKED(rp);
933 				rp->r_flags |= RWRITEATTR;
934 				mutex_exit(&rp->r_statelock);
935 			}
936 		}
937 	} while (!error && count);
938 
939 	return (error);
940 }
941 
/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 *
 * vp     - vnode of the file being read
 * base   - buffer that receives the data; advanced as data arrives
 * offset - starting file offset; advanced as data arrives
 * count  - number of bytes requested
 * residp - out: number of requested bytes that were NOT read
 * cr     - credentials used for the over the wire calls
 *
 * Each RFS_READ asks for at most mi_curread bytes.  The loop ends on the
 * first error, when count reaches zero, or when the server returns fewer
 * bytes than were asked for (rr_count != tsize).
 *
 * Returns 0 on success or an errno value.  Note that a failure of
 * nattr_to_vattr() on the returned attributes is also reported to the
 * caller, because its result is stored in "error".
 */
static int
nfsread(vnode_t *vp, caddr_t base, uint_t offset,
    int count, size_t *residp, cred_t *cr)
{
	mntinfo_t *mi;
	struct nfsreadargs ra;
	struct nfsrdresult rr;
	int tsize;
	int error;
	int douprintf;
	failinfo_t fi;
	rnode_t *rp;
	struct vattr va;
	hrtime_t t;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	douprintf = 1;

	ra.ra_fhandle = *VTOFH(vp);

	/*
	 * Failover information handed to rfs2call(); fhp points at the
	 * copy of the file handle held in the request arguments.
	 */
	fi.vp = vp;
	fi.fhp = (caddr_t)&ra.ra_fhandle;
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Issue one READ; the request is rebuilt and reissued for
		 * as long as rfs2call() answers ENFS_TRYAGAIN.  tsize is
		 * recomputed each time from the current mi_curread.
		 */
		do {
			tsize = MIN(mi->mi_curread, count);
			rr.rr_data = base;
			ra.ra_offset = offset;
			ra.ra_totcount = tsize;
			ra.ra_count = tsize;
			ra.ra_data = base;
			/* timestamp taken just before the call goes out */
			t = gethrtime();
			error = rfs2call(mi, RFS_READ,
			    xdr_readargs, (caddr_t)&ra,
			    xdr_rdresult, (caddr_t)&rr, cr,
			    &douprintf, &rr.rr_status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			/* RPC succeeded; now look at the NFS status */
			error = geterrno(rr.rr_status);
			if (!error) {
				count -= rr.rr_count;
				base += rr.rr_count;
				offset += rr.rr_count;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
					    rr.rr_count;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_INBLK, 1);
			}
		}
	} while (!error && count && rr.rr_count == tsize);

	/* whatever is left of the request was not read */
	*residp = count;

	if (!error) {
		/*
		 * Since no error occurred, we have the current
		 * attributes and we need to do a cache check and then
		 * potentially update the cached attributes.  We can't
		 * use the normal attribute check and cache mechanisms
		 * because they might cause a cache flush which would
		 * deadlock.  Instead, we just check the cache to see
		 * if the attributes have changed.  If it is, then we
		 * just mark the attributes as out of date.  The next
		 * time that the attributes are checked, they will be
		 * out of date, new attributes will be fetched, and
		 * the page cache will be flushed.  If the attributes
		 * weren't changed, then we just update the cached
		 * attributes with these attributes.
		 */
		/*
		 * If NFS_ACL is supported on the server, then the
		 * attributes returned by server may have minimal
		 * permissions sometimes denying access to users having
		 * proper access.  To get the proper attributes, mark
		 * the attributes as expired so that they will be
		 * regotten via the NFS_ACL GETATTR2 procedure.
		 */
		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
		mutex_enter(&rp->r_statelock);
		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
		    (mi->mi_flags & MI_ACL)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			/*
			 * Only cache the attributes when r_mtime is not
			 * later than the time the last READ was issued.
			 * NOTE(review): r_mtime is presumably the hrtime
			 * of the last attribute cache update - confirm.
			 */
			if (rp->r_mtime <= t) {
				nfs_attrcache_va(vp, &va);
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	return (error);
}
1062 
1063 /* ARGSUSED */
1064 static int
1065 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1066 	caller_context_t *ct)
1067 {
1068 
1069 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1070 		return (EIO);
1071 	switch (cmd) {
1072 		case _FIODIRECTIO:
1073 			return (nfs_directio(vp, (int)arg, cr));
1074 		default:
1075 			return (ENOTTY);
1076 	}
1077 }
1078 
1079 /* ARGSUSED */
1080 static int
1081 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1082 	caller_context_t *ct)
1083 {
1084 	int error;
1085 	rnode_t *rp;
1086 
1087 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1088 		return (EIO);
1089 	/*
1090 	 * If it has been specified that the return value will
1091 	 * just be used as a hint, and we are only being asked
1092 	 * for size, fsid or rdevid, then return the client's
1093 	 * notion of these values without checking to make sure
1094 	 * that the attribute cache is up to date.
1095 	 * The whole point is to avoid an over the wire GETATTR
1096 	 * call.
1097 	 */
1098 	rp = VTOR(vp);
1099 	if (flags & ATTR_HINT) {
1100 		if (vap->va_mask ==
1101 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1102 			mutex_enter(&rp->r_statelock);
1103 			if (vap->va_mask | AT_SIZE)
1104 				vap->va_size = rp->r_size;
1105 			if (vap->va_mask | AT_FSID)
1106 				vap->va_fsid = rp->r_attr.va_fsid;
1107 			if (vap->va_mask | AT_RDEV)
1108 				vap->va_rdev = rp->r_attr.va_rdev;
1109 			mutex_exit(&rp->r_statelock);
1110 			return (0);
1111 		}
1112 	}
1113 
1114 	/*
1115 	 * Only need to flush pages if asking for the mtime
1116 	 * and if there any dirty pages or any outstanding
1117 	 * asynchronous (write) requests for this file.
1118 	 */
1119 	if (vap->va_mask & AT_MTIME) {
1120 		if (vn_has_cached_data(vp) &&
1121 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1122 			mutex_enter(&rp->r_statelock);
1123 			rp->r_gcount++;
1124 			mutex_exit(&rp->r_statelock);
1125 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1126 			mutex_enter(&rp->r_statelock);
1127 			if (error && (error == ENOSPC || error == EDQUOT)) {
1128 				if (!rp->r_error)
1129 					rp->r_error = error;
1130 			}
1131 			if (--rp->r_gcount == 0)
1132 				cv_broadcast(&rp->r_cv);
1133 			mutex_exit(&rp->r_statelock);
1134 		}
1135 	}
1136 
1137 	return (nfsgetattr(vp, vap, cr));
1138 }
1139 
1140 /*ARGSUSED4*/
1141 static int
1142 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1143 		caller_context_t *ct)
1144 {
1145 	int error;
1146 	uint_t mask;
1147 	struct vattr va;
1148 
1149 	mask = vap->va_mask;
1150 
1151 	if (mask & AT_NOSET)
1152 		return (EINVAL);
1153 
1154 	if ((mask & AT_SIZE) &&
1155 	    vap->va_type == VREG &&
1156 	    vap->va_size > MAXOFF32_T)
1157 		return (EFBIG);
1158 
1159 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1160 		return (EIO);
1161 
1162 	va.va_mask = AT_UID | AT_MODE;
1163 
1164 	error = nfsgetattr(vp, &va, cr);
1165 	if (error)
1166 		return (error);
1167 
1168 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1169 	    vp);
1170 
1171 	if (error)
1172 		return (error);
1173 
1174 	error = nfssetattr(vp, vap, flags, cr);
1175 
1176 	if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0)
1177 		vnevent_truncate(vp, ct);
1178 
1179 	return (error);
1180 }
1181 
/*
 * Send a SETATTR request to the server and bring the client caches in
 * line with the result.
 *
 * vp    - vnode of the file being changed
 * vap   - attributes to set; va_mask selects which ones.  va_mtime.tv_nsec
 *         is overwritten when AT_MTIME is set without ATTR_UTIME.
 * flags - ATTR_UTIME indicates that the caller supplied the times
 * cr    - credentials used for the over the wire calls
 *
 * Returns 0 on success or an errno value.  The function calls itself
 * once (with only AT_MODE set) when the server changed the mode as a
 * side effect of a uid/gid change; the result of that call is ignored.
 */
static int
nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	struct nfssaargs args;
	struct nfsattrstat ns;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	mntinfo_t *mi;
	vsecattr_t *vsp;
	hrtime_t t;

	mask = vap->va_mask;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		/*
		 * Only out-of-space errors are remembered in the rnode
		 * (first one wins); the flush result does not stop the
		 * SETATTR from being sent.
		 */
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * If the system call was utime(2) or utimes(2) and the
	 * application did not specify the times, then set the
	 * mtime nanosecond field to 1 billion.  This will get
	 * translated from 1 billion nanoseconds to 1 million
	 * microseconds in the over the wire request.  The
	 * server will use 1 million in the microsecond field
	 * to tell whether both the mtime and atime should be
	 * set to the server's current time.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
		vap->va_mtime.tv_nsec = 1000000000;
		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			error = vattr_to_sattr(vap, &args.saa_sa);
		} else {
			/*
			 * Use server times. vap time values will not be used.
			 * To ensure no time overflow, make sure vap has
			 * valid values, but retain the original values.
			 */
			timestruc_t	mtime = vap->va_mtime;
			timestruc_t	atime = vap->va_atime;
			time_t		now;

			now = gethrestime_sec();
			if (NFS_TIME_T_OK(now)) {
				/* Just in case server does not know of this */
				vap->va_mtime.tv_sec = now;
				vap->va_atime.tv_sec = now;
			} else {
				vap->va_mtime.tv_sec = 0;
				vap->va_atime.tv_sec = 0;
			}
			error = vattr_to_sattr(vap, &args.saa_sa);
			/* set vap times back on */
			vap->va_mtime = mtime;
			vap->va_atime = atime;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr(vap, &args.saa_sa);
	}
	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}
	args.saa_fh = *VTOFH(vp);

	/*
	 * Remember the mode as it is before the SETATTR; it is needed
	 * below to detect a server that altered the mode on its own.
	 */
	va.va_mask = AT_MODE;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

	mi = VTOMI(vp);

	douprintf = 1;

	/* timestamp taken just before the call, handed to nfs_cache_fattr */
	t = gethrtime();

	error = rfs2call(mi, RFS_SETATTR,
	    xdr_saargs, (caddr_t)&args,
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
		(void) nfs_access_purge_rp(rp);
		/*
		 * Unlocked peek first; the pointer is read again and
		 * detached under r_statelock, and freed only after the
		 * lock has been dropped.
		 */
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!error) {
		/* RPC succeeded; now look at the NFS status */
		error = geterrno(ns.ns_status);
		if (!error) {
			/*
			 * If changing the size of the file, invalidate
			 * any local cached data which is no longer part
			 * of the file.  We also possibly invalidate the
			 * last page in the file.  We could use
			 * pvn_vpzero(), but this would mark the page as
			 * modified and require it to be written back to
			 * the server for no particularly good reason.
			 * This way, if we access it, then we bring it
			 * back in.  A read should be cheaper than a
			 * write.
			 */
			if (mask & AT_SIZE) {
				nfs_invalidate_pages(vp,
				    (vap->va_size & PAGEMASK), cr);
			}
			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * regotten via the NFS_ACL GETATTR2 procedure.
			 */
			if (mi->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(vp);
			}
			/*
			 * This next check attempts to deal with NFS
			 * servers which can not handle increasing
			 * the size of the file via setattr.  Most
			 * of these servers do not return an error,
			 * but do not change the size of the file.
			 * Hence, this check and then attempt to set
			 * the file size by writing 1 byte at the
			 * offset of the end of the file that we need.
			 */
			if ((mask & AT_SIZE) &&
			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
				char zb = '\0';

				/* the result of this write is returned */
				error = nfswrite(vp, &zb,
				    vap->va_size - sizeof (zb),
				    sizeof (zb), cr);
			}
			/*
			 * Some servers will change the mode to clear the setuid
			 * and setgid bits when changing the uid or gid.  The
			 * client needs to compensate appropriately.
			 */
			if (mask & (AT_UID | AT_GID)) {
				int terror;

				va.va_mask = AT_MODE;
				terror = nfsgetattr(vp, &va, cr);
				/*
				 * The mode that should be in place is the
				 * requested one when AT_MODE was set, and
				 * the pre-call mode otherwise.  If the
				 * server reports something else, put it
				 * back with a mode-only SETATTR.
				 */
				if (!terror &&
				    (((mask & AT_MODE) &&
				    va.va_mode != vap->va_mode) ||
				    (!(mask & AT_MODE) &&
				    va.va_mode != omode))) {
					va.va_mask = AT_MODE;
					if (mask & AT_MODE)
						va.va_mode = vap->va_mode;
					else
						va.va_mode = omode;
					(void) nfssetattr(vp, &va, 0, cr);
				}
			}
		} else {
			/* server rejected the request */
			PURGE_ATTRCACHE(vp);
			PURGE_STALE_FH(error, vp, cr);
		}
	} else {
		/* the RPC itself failed */
		PURGE_ATTRCACHE(vp);
	}

	return (error);
}
1397 
1398 static int
1399 nfs_accessx(void *vp, int mode, cred_t *cr)
1400 {
1401 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1402 	return (nfs_access(vp, mode, 0, cr, NULL));
1403 }
1404 
1405 /* ARGSUSED */
1406 static int
1407 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1408 {
1409 	struct vattr va;
1410 	int error;
1411 	mntinfo_t *mi;
1412 	int shift = 0;
1413 
1414 	mi = VTOMI(vp);
1415 
1416 	if (nfs_zone() != mi->mi_zone)
1417 		return (EIO);
1418 	if (mi->mi_flags & MI_ACL) {
1419 		error = acl_access2(vp, mode, flags, cr);
1420 		if (mi->mi_flags & MI_ACL)
1421 			return (error);
1422 	}
1423 
1424 	va.va_mask = AT_MODE | AT_UID | AT_GID;
1425 	error = nfsgetattr(vp, &va, cr);
1426 	if (error)
1427 		return (error);
1428 
1429 	/*
1430 	 * Disallow write attempts on read-only
1431 	 * file systems, unless the file is a
1432 	 * device node.
1433 	 */
1434 	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1435 		return (EROFS);
1436 
1437 	/*
1438 	 * Disallow attempts to access mandatory lock files.
1439 	 */
1440 	if ((mode & (VWRITE | VREAD | VEXEC)) &&
1441 	    MANDLOCK(vp, va.va_mode))
1442 		return (EACCES);
1443 
1444 	/*
1445 	 * Access check is based on only
1446 	 * one of owner, group, public.
1447 	 * If not owner, then check group.
1448 	 * If not a member of the group,
1449 	 * then check public access.
1450 	 */
1451 	if (crgetuid(cr) != va.va_uid) {
1452 		shift += 3;
1453 		if (!groupmember(va.va_gid, cr))
1454 			shift += 3;
1455 	}
1456 
1457 	return (secpolicy_vnode_access2(cr, vp, va.va_uid,
1458 	    va.va_mode << shift, mode));
1459 }
1460 
/*
 * When non-zero, nfs_readlink() serves symlink contents from, and stores
 * them in, the rnode's r_symlink cache.
 */
static int nfs_do_symlink_cache = 1;

/*
 * Read the contents of a symbolic link into the caller's uio.
 *
 * vp   - vnode of the symbolic link
 * uiop - destination for the link text
 * cr   - credentials used for the over the wire call
 * ct   - caller context (unused)
 *
 * Returns 0 on success, EINVAL if vp is not a symlink (or if the result
 * would have been ENXIO), EIO from the wrong zone, or another errno.
 */
/* ARGSUSED */
static int
nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
{
	int error;
	struct nfsrdlnres rl;
	rnode_t *rp;
	int douprintf;
	failinfo_t fi;

	/*
	 * We want to be consistent with UFS semantics so we will return
	 * EINVAL instead of ENXIO. This violates the XNFS spec and
	 * the RFC 1094, which are wrong any way. BUGID 1138002.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	/*
	 * Try the cached copy first.  The caches are validated before it
	 * is used, and the pointer is looked at again under r_statelock
	 * because the unlocked test above may be out of date by then.
	 */
	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs_validate_caches(vp, cr);
		if (error)
			return (error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}


	/*
	 * Buffer for the reply.  From here on it must either be freed or
	 * handed over to the rnode's symlink cache on every path.
	 */
	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	error = rfs2call(VTOMI(vp), RFS_READLINK,
	    xdr_readlink, (caddr_t)VTOFH(vp),
	    xdr_rdlnres, (caddr_t)&rl, cr,
	    &douprintf, &rl.rl_status, 0, &fi);

	if (error) {

		/* the RPC itself failed */
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		return (error);
	}

	error = geterrno(rl.rl_status);
	if (!error) {
		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				/* the cache takes ownership of the buffer */
				rp->r_symlink.contents = rl.rl_data;
				rp->r_symlink.len = (int)rl.rl_count;
				rp->r_symlink.size = NFS_MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);

				/* someone else filled the cache meanwhile */
				kmem_free((void *)rl.rl_data,
				    NFS_MAXPATHLEN);
			}
		} else {

			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		}
	} else {
		PURGE_STALE_FH(error, vp, cr);

		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
	}

	/*
	 * Conform to UFS semantics (see comment above)
	 */
	return (error == ENXIO ? EINVAL : error);
}
1552 
1553 /*
1554  * Flush local dirty pages to stable storage on the server.
1555  *
1556  * If FNODSYNC is specified, then there is nothing to do because
1557  * metadata changes are not cached on the client before being
1558  * sent to the server.
1559  */
1560 /* ARGSUSED */
1561 static int
1562 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1563 {
1564 	int error;
1565 
1566 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1567 		return (0);
1568 
1569 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1570 		return (EIO);
1571 
1572 	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1573 	if (!error)
1574 		error = VTOR(vp)->r_error;
1575 	return (error);
1576 }
1577 
1578 
/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 *
 * vp - vnode that is going inactive
 * cr - credentials; used for the page flush and passed to rp_addfree()
 * ct - caller context, passed through to nfs_putpage()
 *
 * The remove itself is sent with the credentials saved in r_unlcred,
 * not with "cr".  The rnode is handed to rp_addfree() at the end.
 */
/* ARGSUSED */
static void
nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	/* unlocked peek; checked again below under r_statelock */
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way,
		 * the unl* fields could become inconsistent with respect
		 * to each other due to a race condition between this
		 * code and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			struct nfsdiropargs da;
			enum nfsstat status;
			int douprintf;
			int error;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs_putpage(vp, (offset_t)0, 0, 0,
				    cr, ct);
				/* keep the first error recorded in the rnode */
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs(&da, unlname, unldvp);

			douprintf = 1;

			/* the outcome of the remove is deliberately ignored */
			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
			    xdr_diropargs, (caddr_t)&da,
			    xdr_enum, (caddr_t)&status, unlcred,
			    &douprintf, &status, 0, NULL);

			/* the directory changed; drop what we cached of it */
			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
				nfs_purge_rddir_cache(unldvp);
			PURGE_ATTRCACHE(unldvp);

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			/* look again in case the unl* fields were set anew */
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}
1684 
1685 /*
1686  * Remote file system operations having to do with directory manipulation.
1687  */
1688 
1689 /* ARGSUSED */
1690 static int
1691 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1692 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1693 	int *direntflags, pathname_t *realpnp)
1694 {
1695 	int error;
1696 	vnode_t *vp;
1697 	vnode_t *avp = NULL;
1698 	rnode_t *drp;
1699 
1700 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1701 		return (EPERM);
1702 
1703 	drp = VTOR(dvp);
1704 
1705 	/*
1706 	 * Are we looking up extended attributes?  If so, "dvp" is
1707 	 * the file or directory for which we want attributes, and
1708 	 * we need a lookup of the hidden attribute directory
1709 	 * before we lookup the rest of the path.
1710 	 */
1711 	if (flags & LOOKUP_XATTR) {
1712 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1713 		mntinfo_t *mi;
1714 
1715 		mi = VTOMI(dvp);
1716 		if (!(mi->mi_flags & MI_EXTATTR))
1717 			return (EINVAL);
1718 
1719 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1720 			return (EINTR);
1721 
1722 		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1723 		if (avp == NULL)
1724 			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1725 		else
1726 			error = 0;
1727 
1728 		nfs_rw_exit(&drp->r_rwlock);
1729 
1730 		if (error) {
1731 			if (mi->mi_flags & MI_EXTATTR)
1732 				return (error);
1733 			return (EINVAL);
1734 		}
1735 		dvp = avp;
1736 		drp = VTOR(dvp);
1737 	}
1738 
1739 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1740 		error = EINTR;
1741 		goto out;
1742 	}
1743 
1744 	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1745 
1746 	nfs_rw_exit(&drp->r_rwlock);
1747 
1748 	/*
1749 	 * If vnode is a device, create special vnode.
1750 	 */
1751 	if (!error && IS_DEVVP(*vpp)) {
1752 		vp = *vpp;
1753 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1754 		VN_RELE(vp);
1755 	}
1756 
1757 out:
1758 	if (avp != NULL)
1759 		VN_RELE(avp);
1760 
1761 	return (error);
1762 }
1763 
/*
 * When non-zero, an over the wire lookup that fails with ENOENT is
 * entered in the DNLC as a negative entry (DNLC_NO_VNODE).
 */
static int nfs_lookup_neg_cache = 1;

#ifdef DEBUG
/*
 * DEBUG-only counters maintained by nfslookup_dnlc().
 */
static int nfs_lookup_dnlc_hits = 0;		/* positive entry returned */
static int nfs_lookup_dnlc_misses = 0;		/* nothing found at all */
static int nfs_lookup_dnlc_neg_hits = 0;	/* negative entry returned */
static int nfs_lookup_dnlc_disappears = 0;	/* gone after validation */
static int nfs_lookup_dnlc_lookups = 0;		/* calls made */
#endif
1773 
1774 /* ARGSUSED */
1775 int
1776 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1777 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1778 {
1779 	int error;
1780 
1781 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1782 
1783 	/*
1784 	 * If lookup is for "", just return dvp.  Don't need
1785 	 * to send it over the wire, look it up in the dnlc,
1786 	 * or perform any access checks.
1787 	 */
1788 	if (*nm == '\0') {
1789 		VN_HOLD(dvp);
1790 		*vpp = dvp;
1791 		return (0);
1792 	}
1793 
1794 	/*
1795 	 * Can't do lookups in non-directories.
1796 	 */
1797 	if (dvp->v_type != VDIR)
1798 		return (ENOTDIR);
1799 
1800 	/*
1801 	 * If we're called with RFSCALL_SOFT, it's important that
1802 	 * the only rfscall is one we make directly; if we permit
1803 	 * an access call because we're looking up "." or validating
1804 	 * a dnlc hit, we'll deadlock because that rfscall will not
1805 	 * have the RFSCALL_SOFT set.
1806 	 */
1807 	if (rfscall_flags & RFSCALL_SOFT)
1808 		goto callit;
1809 
1810 	/*
1811 	 * If lookup is for ".", just return dvp.  Don't need
1812 	 * to send it over the wire or look it up in the dnlc,
1813 	 * just need to check access.
1814 	 */
1815 	if (strcmp(nm, ".") == 0) {
1816 		error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1817 		if (error)
1818 			return (error);
1819 		VN_HOLD(dvp);
1820 		*vpp = dvp;
1821 		return (0);
1822 	}
1823 
1824 	/*
1825 	 * Lookup this name in the DNLC.  If there was a valid entry,
1826 	 * then return the results of the lookup.
1827 	 */
1828 	error = nfslookup_dnlc(dvp, nm, vpp, cr);
1829 	if (error || *vpp != NULL)
1830 		return (error);
1831 
1832 callit:
1833 	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1834 
1835 	return (error);
1836 }
1837 
/*
 * Look up "nm" in directory "dvp" using only the DNLC.
 *
 * Returns 0 with *vpp set to a held vnode on a positive hit, 0 with
 * *vpp set to NULL when the DNLC has no usable entry, ENOENT on a
 * negative hit, or the error from nfs_validate_caches() / nfs_access().
 * On the error returns *vpp is not written.
 */
static int
nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	int error;
	vnode_t *vp;

	ASSERT(*nm != '\0');
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * Lookup this name in the DNLC.  If successful, then validate
	 * the caches and then recheck the DNLC.  The DNLC is rechecked
	 * just in case this entry got invalidated during the call
	 * to nfs_validate_caches.
	 *
	 * An assumption is being made that it is safe to say that a
	 * file exists which may not on the server.  Any operations to
	 * the server will fail with ESTALE.
	 */
#ifdef DEBUG
	nfs_lookup_dnlc_lookups++;
#endif
	vp = dnlc_lookup(dvp, nm);
	if (vp != NULL) {
		/*
		 * The first result is only used as an indication (and
		 * compared by value below); its hold is dropped here.
		 */
		VN_RELE(vp);
		/*
		 * For a negative entry in a writable file system the
		 * directory's attribute cache is purged before the
		 * caches are validated.
		 */
		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
			PURGE_ATTRCACHE(dvp);
		}
		error = nfs_validate_caches(dvp, cr);
		if (error)
			return (error);
		vp = dnlc_lookup(dvp, nm);
		if (vp != NULL) {
			/* search permission on the directory is required */
			error = nfs_access(dvp, VEXEC, 0, cr, NULL);
			if (error) {
				VN_RELE(vp);
				return (error);
			}
			if (vp == DNLC_NO_VNODE) {
				VN_RELE(vp);
#ifdef DEBUG
				nfs_lookup_dnlc_neg_hits++;
#endif
				return (ENOENT);
			}
			/* positive hit; the hold is passed to the caller */
			*vpp = vp;
#ifdef DEBUG
			nfs_lookup_dnlc_hits++;
#endif
			return (0);
		}
#ifdef DEBUG
		nfs_lookup_dnlc_disappears++;
#endif
	}
#ifdef DEBUG
	else
		nfs_lookup_dnlc_misses++;
#endif

	/* no usable entry; the caller has to look elsewhere */
	*vpp = NULL;

	return (0);
}
1902 
1903 static int
1904 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1905 	int rfscall_flags)
1906 {
1907 	int error;
1908 	struct nfsdiropargs da;
1909 	struct nfsdiropres dr;
1910 	int douprintf;
1911 	failinfo_t fi;
1912 	hrtime_t t;
1913 
1914 	ASSERT(*nm != '\0');
1915 	ASSERT(dvp->v_type == VDIR);
1916 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1917 
1918 	setdiropargs(&da, nm, dvp);
1919 
1920 	fi.vp = dvp;
1921 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1922 	fi.copyproc = nfscopyfh;
1923 	fi.lookupproc = nfslookup;
1924 	fi.xattrdirproc = acl_getxattrdir2;
1925 
1926 	douprintf = 1;
1927 
1928 	t = gethrtime();
1929 
1930 	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1931 	    xdr_diropargs, (caddr_t)&da,
1932 	    xdr_diropres, (caddr_t)&dr, cr,
1933 	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
1934 
1935 	if (!error) {
1936 		error = geterrno(dr.dr_status);
1937 		if (!error) {
1938 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1939 			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1940 			/*
1941 			 * If NFS_ACL is supported on the server, then the
1942 			 * attributes returned by server may have minimal
1943 			 * permissions sometimes denying access to users having
1944 			 * proper access.  To get the proper attributes, mark
1945 			 * the attributes as expired so that they will be
1946 			 * regotten via the NFS_ACL GETATTR2 procedure.
1947 			 */
1948 			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1949 				PURGE_ATTRCACHE(*vpp);
1950 			}
1951 			if (!(rfscall_flags & RFSCALL_SOFT))
1952 				dnlc_update(dvp, nm, *vpp);
1953 		} else {
1954 			PURGE_STALE_FH(error, dvp, cr);
1955 			if (error == ENOENT && nfs_lookup_neg_cache)
1956 				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1957 		}
1958 	}
1959 
1960 	return (error);
1961 }
1962 
1963 /* ARGSUSED */
1964 static int
1965 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1966 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1967 	vsecattr_t *vsecp)
1968 {
1969 	int error;
1970 	struct nfscreatargs args;
1971 	struct nfsdiropres dr;
1972 	int douprintf;
1973 	vnode_t *vp;
1974 	rnode_t *rp;
1975 	struct vattr vattr;
1976 	rnode_t *drp;
1977 	vnode_t *tempvp;
1978 	hrtime_t t;
1979 
1980 	drp = VTOR(dvp);
1981 
1982 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1983 		return (EPERM);
1984 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1985 		return (EINTR);
1986 
1987 	/*
1988 	 * We make a copy of the attributes because the caller does not
1989 	 * expect us to change what va points to.
1990 	 */
1991 	vattr = *va;
1992 
1993 	/*
1994 	 * If the pathname is "", just use dvp.  Don't need
1995 	 * to send it over the wire, look it up in the dnlc,
1996 	 * or perform any access checks.
1997 	 */
1998 	if (*nm == '\0') {
1999 		error = 0;
2000 		VN_HOLD(dvp);
2001 		vp = dvp;
2002 	/*
2003 	 * If the pathname is ".", just use dvp.  Don't need
2004 	 * to send it over the wire or look it up in the dnlc,
2005 	 * just need to check access.
2006 	 */
2007 	} else if (strcmp(nm, ".") == 0) {
2008 		error = nfs_access(dvp, VEXEC, 0, cr, ct);
2009 		if (error) {
2010 			nfs_rw_exit(&drp->r_rwlock);
2011 			return (error);
2012 		}
2013 		VN_HOLD(dvp);
2014 		vp = dvp;
2015 	/*
2016 	 * We need to go over the wire, just to be sure whether the
2017 	 * file exists or not.  Using the DNLC can be dangerous in
2018 	 * this case when making a decision regarding existence.
2019 	 */
2020 	} else {
2021 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2022 	}
2023 	if (!error) {
2024 		if (exclusive == EXCL)
2025 			error = EEXIST;
2026 		else if (vp->v_type == VDIR && (mode & VWRITE))
2027 			error = EISDIR;
2028 		else {
2029 			/*
2030 			 * If vnode is a device, create special vnode.
2031 			 */
2032 			if (IS_DEVVP(vp)) {
2033 				tempvp = vp;
2034 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2035 				VN_RELE(tempvp);
2036 			}
2037 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2038 				if ((vattr.va_mask & AT_SIZE) &&
2039 				    vp->v_type == VREG) {
2040 					vattr.va_mask = AT_SIZE;
2041 					error = nfssetattr(vp, &vattr, 0, cr);
2042 
2043 					if (!error) {
2044 						/*
2045 						 * Existing file was truncated;
2046 						 * emit a create event.
2047 						 */
2048 						vnevent_create(vp, ct);
2049 					}
2050 				}
2051 			}
2052 		}
2053 		nfs_rw_exit(&drp->r_rwlock);
2054 		if (error) {
2055 			VN_RELE(vp);
2056 		} else {
2057 			*vpp = vp;
2058 		}
2059 		return (error);
2060 	}
2061 
2062 	ASSERT(vattr.va_mask & AT_TYPE);
2063 	if (vattr.va_type == VREG) {
2064 		ASSERT(vattr.va_mask & AT_MODE);
2065 		if (MANDMODE(vattr.va_mode)) {
2066 			nfs_rw_exit(&drp->r_rwlock);
2067 			return (EACCES);
2068 		}
2069 	}
2070 
2071 	dnlc_remove(dvp, nm);
2072 
2073 	setdiropargs(&args.ca_da, nm, dvp);
2074 
2075 	/*
2076 	 * Decide what the group-id of the created file should be.
2077 	 * Set it in attribute list as advisory...then do a setattr
2078 	 * if the server didn't get it right the first time.
2079 	 */
2080 	error = setdirgid(dvp, &vattr.va_gid, cr);
2081 	if (error) {
2082 		nfs_rw_exit(&drp->r_rwlock);
2083 		return (error);
2084 	}
2085 	vattr.va_mask |= AT_GID;
2086 
2087 	/*
2088 	 * This is a completely gross hack to make mknod
2089 	 * work over the wire until we can wack the protocol
2090 	 */
2091 #define	IFCHR		0020000		/* character special */
2092 #define	IFBLK		0060000		/* block special */
2093 #define	IFSOCK		0140000		/* socket */
2094 
2095 	/*
2096 	 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
2097 	 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
2098 	 * bits in the minor number where 4.x supports 8 bits.  If the 5.x
2099 	 * minor/major numbers <= 8 bits long, compress the device
2100 	 * number before sending it. Otherwise, the 4.x server will not
2101 	 * create the device with the correct device number and nothing can be
2102 	 * done about this.
2103 	 */
2104 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2105 		dev_t d = vattr.va_rdev;
2106 		dev32_t dev32;
2107 
2108 		if (vattr.va_type == VCHR)
2109 			vattr.va_mode |= IFCHR;
2110 		else
2111 			vattr.va_mode |= IFBLK;
2112 
2113 		(void) cmpldev(&dev32, d);
2114 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2115 			vattr.va_size = (u_offset_t)dev32;
2116 		else
2117 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2118 
2119 		vattr.va_mask |= AT_MODE|AT_SIZE;
2120 	} else if (vattr.va_type == VFIFO) {
2121 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
2122 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
2123 		vattr.va_mask |= AT_MODE|AT_SIZE;
2124 	} else if (vattr.va_type == VSOCK) {
2125 		vattr.va_mode |= IFSOCK;
2126 		/*
2127 		 * To avoid triggering bugs in the servers set AT_SIZE
2128 		 * (all other RFS_CREATE calls set this).
2129 		 */
2130 		vattr.va_size = 0;
2131 		vattr.va_mask |= AT_MODE|AT_SIZE;
2132 	}
2133 
2134 	args.ca_sa = &args.ca_sa_buf;
2135 	error = vattr_to_sattr(&vattr, args.ca_sa);
2136 	if (error) {
2137 		/* req time field(s) overflow - return immediately */
2138 		nfs_rw_exit(&drp->r_rwlock);
2139 		return (error);
2140 	}
2141 
2142 	douprintf = 1;
2143 
2144 	t = gethrtime();
2145 
2146 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2147 	    xdr_creatargs, (caddr_t)&args,
2148 	    xdr_diropres, (caddr_t)&dr, cr,
2149 	    &douprintf, &dr.dr_status, 0, NULL);
2150 
2151 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2152 
2153 	if (!error) {
2154 		error = geterrno(dr.dr_status);
2155 		if (!error) {
2156 			if (HAVE_RDDIR_CACHE(drp))
2157 				nfs_purge_rddir_cache(dvp);
2158 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2159 			    dvp->v_vfsp, t, cr, NULL, NULL);
2160 			/*
2161 			 * If NFS_ACL is supported on the server, then the
2162 			 * attributes returned by server may have minimal
2163 			 * permissions sometimes denying access to users having
2164 			 * proper access.  To get the proper attributes, mark
2165 			 * the attributes as expired so that they will be
2166 			 * regotten via the NFS_ACL GETATTR2 procedure.
2167 			 */
2168 			if (VTOMI(vp)->mi_flags & MI_ACL) {
2169 				PURGE_ATTRCACHE(vp);
2170 			}
2171 			dnlc_update(dvp, nm, vp);
2172 			rp = VTOR(vp);
2173 			if (vattr.va_size == 0) {
2174 				mutex_enter(&rp->r_statelock);
2175 				rp->r_size = 0;
2176 				mutex_exit(&rp->r_statelock);
2177 				if (vn_has_cached_data(vp)) {
2178 					ASSERT(vp->v_type != VCHR);
2179 					nfs_invalidate_pages(vp,
2180 					    (u_offset_t)0, cr);
2181 				}
2182 			}
2183 
2184 			/*
2185 			 * Make sure the gid was set correctly.
2186 			 * If not, try to set it (but don't lose
2187 			 * any sleep over it).
2188 			 */
2189 			if (vattr.va_gid != rp->r_attr.va_gid) {
2190 				vattr.va_mask = AT_GID;
2191 				(void) nfssetattr(vp, &vattr, 0, cr);
2192 			}
2193 
2194 			/*
2195 			 * If vnode is a device create special vnode
2196 			 */
2197 			if (IS_DEVVP(vp)) {
2198 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2199 				VN_RELE(vp);
2200 			} else
2201 				*vpp = vp;
2202 		} else {
2203 			PURGE_STALE_FH(error, dvp, cr);
2204 		}
2205 	}
2206 
2207 	nfs_rw_exit(&drp->r_rwlock);
2208 
2209 	return (error);
2210 }
2211 
2212 /*
2213  * Weirdness: if the vnode to be removed is open
2214  * we rename it instead of removing it and nfs_inactive
2215  * will remove the new name.
2216  */
2217 /* ARGSUSED */
2218 static int
2219 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2220 {
2221 	int error;
2222 	struct nfsdiropargs da;
2223 	enum nfsstat status;
2224 	vnode_t *vp;
2225 	char *tmpname;
2226 	int douprintf;
2227 	rnode_t *rp;
2228 	rnode_t *drp;
2229 
2230 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2231 		return (EPERM);
2232 	drp = VTOR(dvp);
2233 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2234 		return (EINTR);
2235 
2236 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2237 	if (error) {
2238 		nfs_rw_exit(&drp->r_rwlock);
2239 		return (error);
2240 	}
2241 
2242 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2243 		VN_RELE(vp);
2244 		nfs_rw_exit(&drp->r_rwlock);
2245 		return (EPERM);
2246 	}
2247 
2248 	/*
2249 	 * First just remove the entry from the name cache, as it
2250 	 * is most likely the only entry for this vp.
2251 	 */
2252 	dnlc_remove(dvp, nm);
2253 
2254 	/*
2255 	 * If the file has a v_count > 1 then there may be more than one
2256 	 * entry in the name cache due multiple links or an open file,
2257 	 * but we don't have the real reference count so flush all
2258 	 * possible entries.
2259 	 */
2260 	if (vp->v_count > 1)
2261 		dnlc_purge_vp(vp);
2262 
2263 	/*
2264 	 * Now we have the real reference count on the vnode
2265 	 */
2266 	rp = VTOR(vp);
2267 	mutex_enter(&rp->r_statelock);
2268 	if (vp->v_count > 1 &&
2269 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2270 		mutex_exit(&rp->r_statelock);
2271 		tmpname = newname();
2272 		error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2273 		if (error)
2274 			kmem_free(tmpname, MAXNAMELEN);
2275 		else {
2276 			mutex_enter(&rp->r_statelock);
2277 			if (rp->r_unldvp == NULL) {
2278 				VN_HOLD(dvp);
2279 				rp->r_unldvp = dvp;
2280 				if (rp->r_unlcred != NULL)
2281 					crfree(rp->r_unlcred);
2282 				crhold(cr);
2283 				rp->r_unlcred = cr;
2284 				rp->r_unlname = tmpname;
2285 			} else {
2286 				kmem_free(rp->r_unlname, MAXNAMELEN);
2287 				rp->r_unlname = tmpname;
2288 			}
2289 			mutex_exit(&rp->r_statelock);
2290 		}
2291 	} else {
2292 		mutex_exit(&rp->r_statelock);
2293 		/*
2294 		 * We need to flush any dirty pages which happen to
2295 		 * be hanging around before removing the file.  This
2296 		 * shouldn't happen very often and mostly on file
2297 		 * systems mounted "nocto".
2298 		 */
2299 		if (vn_has_cached_data(vp) &&
2300 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2301 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2302 			if (error && (error == ENOSPC || error == EDQUOT)) {
2303 				mutex_enter(&rp->r_statelock);
2304 				if (!rp->r_error)
2305 					rp->r_error = error;
2306 				mutex_exit(&rp->r_statelock);
2307 			}
2308 		}
2309 
2310 		setdiropargs(&da, nm, dvp);
2311 
2312 		douprintf = 1;
2313 
2314 		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2315 		    xdr_diropargs, (caddr_t)&da,
2316 		    xdr_enum, (caddr_t)&status, cr,
2317 		    &douprintf, &status, 0, NULL);
2318 
2319 		/*
2320 		 * The xattr dir may be gone after last attr is removed,
2321 		 * so flush it from dnlc.
2322 		 */
2323 		if (dvp->v_flag & V_XATTRDIR)
2324 			dnlc_purge_vp(dvp);
2325 
2326 		PURGE_ATTRCACHE(dvp);	/* mod time changed */
2327 		PURGE_ATTRCACHE(vp);	/* link count changed */
2328 
2329 		if (!error) {
2330 			error = geterrno(status);
2331 			if (!error) {
2332 				if (HAVE_RDDIR_CACHE(drp))
2333 					nfs_purge_rddir_cache(dvp);
2334 			} else {
2335 				PURGE_STALE_FH(error, dvp, cr);
2336 			}
2337 		}
2338 	}
2339 
2340 	if (error == 0) {
2341 		vnevent_remove(vp, dvp, nm, ct);
2342 	}
2343 	VN_RELE(vp);
2344 
2345 	nfs_rw_exit(&drp->r_rwlock);
2346 
2347 	return (error);
2348 }
2349 
2350 /* ARGSUSED */
2351 static int
2352 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2353 	caller_context_t *ct, int flags)
2354 {
2355 	int error;
2356 	struct nfslinkargs args;
2357 	enum nfsstat status;
2358 	vnode_t *realvp;
2359 	int douprintf;
2360 	rnode_t *tdrp;
2361 
2362 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2363 		return (EPERM);
2364 	if (VOP_REALVP(svp, &realvp, ct) == 0)
2365 		svp = realvp;
2366 
2367 	args.la_from = VTOFH(svp);
2368 	setdiropargs(&args.la_to, tnm, tdvp);
2369 
2370 	tdrp = VTOR(tdvp);
2371 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2372 		return (EINTR);
2373 
2374 	dnlc_remove(tdvp, tnm);
2375 
2376 	douprintf = 1;
2377 
2378 	error = rfs2call(VTOMI(svp), RFS_LINK,
2379 	    xdr_linkargs, (caddr_t)&args,
2380 	    xdr_enum, (caddr_t)&status, cr,
2381 	    &douprintf, &status, 0, NULL);
2382 
2383 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2384 	PURGE_ATTRCACHE(svp);	/* link count changed */
2385 
2386 	if (!error) {
2387 		error = geterrno(status);
2388 		if (!error) {
2389 			if (HAVE_RDDIR_CACHE(tdrp))
2390 				nfs_purge_rddir_cache(tdvp);
2391 		}
2392 	}
2393 
2394 	nfs_rw_exit(&tdrp->r_rwlock);
2395 
2396 	if (!error) {
2397 		/*
2398 		 * Notify the source file of this link operation.
2399 		 */
2400 		vnevent_link(svp, ct);
2401 	}
2402 	return (error);
2403 }
2404 
2405 /* ARGSUSED */
2406 static int
2407 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2408 	caller_context_t *ct, int flags)
2409 {
2410 	vnode_t *realvp;
2411 
2412 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2413 		return (EPERM);
2414 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2415 		ndvp = realvp;
2416 
2417 	return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2418 }
2419 
2420 /*
2421  * nfsrename does the real work of renaming in NFS Version 2.
2422  */
2423 static int
2424 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2425     caller_context_t *ct)
2426 {
2427 	int error;
2428 	enum nfsstat status;
2429 	struct nfsrnmargs args;
2430 	int douprintf;
2431 	vnode_t *nvp = NULL;
2432 	vnode_t *ovp = NULL;
2433 	char *tmpname;
2434 	rnode_t *rp;
2435 	rnode_t *odrp;
2436 	rnode_t *ndrp;
2437 
2438 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2439 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2440 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2441 		return (EINVAL);
2442 
2443 	odrp = VTOR(odvp);
2444 	ndrp = VTOR(ndvp);
2445 	if ((intptr_t)odrp < (intptr_t)ndrp) {
2446 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2447 			return (EINTR);
2448 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2449 			nfs_rw_exit(&odrp->r_rwlock);
2450 			return (EINTR);
2451 		}
2452 	} else {
2453 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2454 			return (EINTR);
2455 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2456 			nfs_rw_exit(&ndrp->r_rwlock);
2457 			return (EINTR);
2458 		}
2459 	}
2460 
2461 	/*
2462 	 * Lookup the target file.  If it exists, it needs to be
2463 	 * checked to see whether it is a mount point and whether
2464 	 * it is active (open).
2465 	 */
2466 	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2467 	if (!error) {
2468 		/*
2469 		 * If this file has been mounted on, then just
2470 		 * return busy because renaming to it would remove
2471 		 * the mounted file system from the name space.
2472 		 */
2473 		if (vn_mountedvfs(nvp) != NULL) {
2474 			VN_RELE(nvp);
2475 			nfs_rw_exit(&odrp->r_rwlock);
2476 			nfs_rw_exit(&ndrp->r_rwlock);
2477 			return (EBUSY);
2478 		}
2479 
2480 		/*
2481 		 * Purge the name cache of all references to this vnode
2482 		 * so that we can check the reference count to infer
2483 		 * whether it is active or not.
2484 		 */
2485 		/*
2486 		 * First just remove the entry from the name cache, as it
2487 		 * is most likely the only entry for this vp.
2488 		 */
2489 		dnlc_remove(ndvp, nnm);
2490 		/*
2491 		 * If the file has a v_count > 1 then there may be more
2492 		 * than one entry in the name cache due multiple links
2493 		 * or an open file, but we don't have the real reference
2494 		 * count so flush all possible entries.
2495 		 */
2496 		if (nvp->v_count > 1)
2497 			dnlc_purge_vp(nvp);
2498 
2499 		/*
2500 		 * If the vnode is active and is not a directory,
2501 		 * arrange to rename it to a
2502 		 * temporary file so that it will continue to be
2503 		 * accessible.  This implements the "unlink-open-file"
2504 		 * semantics for the target of a rename operation.
2505 		 * Before doing this though, make sure that the
2506 		 * source and target files are not already the same.
2507 		 */
2508 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2509 			/*
2510 			 * Lookup the source name.
2511 			 */
2512 			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2513 			    cr, 0);
2514 
2515 			/*
2516 			 * The source name *should* already exist.
2517 			 */
2518 			if (error) {
2519 				VN_RELE(nvp);
2520 				nfs_rw_exit(&odrp->r_rwlock);
2521 				nfs_rw_exit(&ndrp->r_rwlock);
2522 				return (error);
2523 			}
2524 
2525 			/*
2526 			 * Compare the two vnodes.  If they are the same,
2527 			 * just release all held vnodes and return success.
2528 			 */
2529 			if (ovp == nvp) {
2530 				VN_RELE(ovp);
2531 				VN_RELE(nvp);
2532 				nfs_rw_exit(&odrp->r_rwlock);
2533 				nfs_rw_exit(&ndrp->r_rwlock);
2534 				return (0);
2535 			}
2536 
2537 			/*
2538 			 * Can't mix and match directories and non-
2539 			 * directories in rename operations.  We already
2540 			 * know that the target is not a directory.  If
2541 			 * the source is a directory, return an error.
2542 			 */
2543 			if (ovp->v_type == VDIR) {
2544 				VN_RELE(ovp);
2545 				VN_RELE(nvp);
2546 				nfs_rw_exit(&odrp->r_rwlock);
2547 				nfs_rw_exit(&ndrp->r_rwlock);
2548 				return (ENOTDIR);
2549 			}
2550 
2551 			/*
2552 			 * The target file exists, is not the same as
2553 			 * the source file, and is active.  Link it
2554 			 * to a temporary filename to avoid having
2555 			 * the server removing the file completely.
2556 			 */
2557 			tmpname = newname();
2558 			error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2559 			if (error == EOPNOTSUPP) {
2560 				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2561 				    cr, NULL, 0);
2562 			}
2563 			if (error) {
2564 				kmem_free(tmpname, MAXNAMELEN);
2565 				VN_RELE(ovp);
2566 				VN_RELE(nvp);
2567 				nfs_rw_exit(&odrp->r_rwlock);
2568 				nfs_rw_exit(&ndrp->r_rwlock);
2569 				return (error);
2570 			}
2571 			rp = VTOR(nvp);
2572 			mutex_enter(&rp->r_statelock);
2573 			if (rp->r_unldvp == NULL) {
2574 				VN_HOLD(ndvp);
2575 				rp->r_unldvp = ndvp;
2576 				if (rp->r_unlcred != NULL)
2577 					crfree(rp->r_unlcred);
2578 				crhold(cr);
2579 				rp->r_unlcred = cr;
2580 				rp->r_unlname = tmpname;
2581 			} else {
2582 				kmem_free(rp->r_unlname, MAXNAMELEN);
2583 				rp->r_unlname = tmpname;
2584 			}
2585 			mutex_exit(&rp->r_statelock);
2586 		}
2587 	}
2588 
2589 	if (ovp == NULL) {
2590 		/*
2591 		 * When renaming directories to be a subdirectory of a
2592 		 * different parent, the dnlc entry for ".." will no
2593 		 * longer be valid, so it must be removed.
2594 		 *
2595 		 * We do a lookup here to determine whether we are renaming
2596 		 * a directory and we need to check if we are renaming
2597 		 * an unlinked file.  This might have already been done
2598 		 * in previous code, so we check ovp == NULL to avoid
2599 		 * doing it twice.
2600 		 */
2601 
2602 		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2603 
2604 		/*
2605 		 * The source name *should* already exist.
2606 		 */
2607 		if (error) {
2608 			nfs_rw_exit(&odrp->r_rwlock);
2609 			nfs_rw_exit(&ndrp->r_rwlock);
2610 			if (nvp) {
2611 				VN_RELE(nvp);
2612 			}
2613 			return (error);
2614 		}
2615 		ASSERT(ovp != NULL);
2616 	}
2617 
2618 	dnlc_remove(odvp, onm);
2619 	dnlc_remove(ndvp, nnm);
2620 
2621 	setdiropargs(&args.rna_from, onm, odvp);
2622 	setdiropargs(&args.rna_to, nnm, ndvp);
2623 
2624 	douprintf = 1;
2625 
2626 	error = rfs2call(VTOMI(odvp), RFS_RENAME,
2627 	    xdr_rnmargs, (caddr_t)&args,
2628 	    xdr_enum, (caddr_t)&status, cr,
2629 	    &douprintf, &status, 0, NULL);
2630 
2631 	PURGE_ATTRCACHE(odvp);	/* mod time changed */
2632 	PURGE_ATTRCACHE(ndvp);	/* mod time changed */
2633 
2634 	if (!error) {
2635 		error = geterrno(status);
2636 		if (!error) {
2637 			if (HAVE_RDDIR_CACHE(odrp))
2638 				nfs_purge_rddir_cache(odvp);
2639 			if (HAVE_RDDIR_CACHE(ndrp))
2640 				nfs_purge_rddir_cache(ndvp);
2641 			/*
2642 			 * when renaming directories to be a subdirectory of a
2643 			 * different parent, the dnlc entry for ".." will no
2644 			 * longer be valid, so it must be removed
2645 			 */
2646 			rp = VTOR(ovp);
2647 			if (ndvp != odvp) {
2648 				if (ovp->v_type == VDIR) {
2649 					dnlc_remove(ovp, "..");
2650 					if (HAVE_RDDIR_CACHE(rp))
2651 						nfs_purge_rddir_cache(ovp);
2652 				}
2653 			}
2654 
2655 			/*
2656 			 * If we are renaming the unlinked file, update the
2657 			 * r_unldvp and r_unlname as needed.
2658 			 */
2659 			mutex_enter(&rp->r_statelock);
2660 			if (rp->r_unldvp != NULL) {
2661 				if (strcmp(rp->r_unlname, onm) == 0) {
2662 					(void) strncpy(rp->r_unlname,
2663 					    nnm, MAXNAMELEN);
2664 					rp->r_unlname[MAXNAMELEN - 1] = '\0';
2665 
2666 					if (ndvp != rp->r_unldvp) {
2667 						VN_RELE(rp->r_unldvp);
2668 						rp->r_unldvp = ndvp;
2669 						VN_HOLD(ndvp);
2670 					}
2671 				}
2672 			}
2673 			mutex_exit(&rp->r_statelock);
2674 		} else {
2675 			/*
2676 			 * System V defines rename to return EEXIST, not
2677 			 * ENOTEMPTY if the target directory is not empty.
2678 			 * Over the wire, the error is NFSERR_ENOTEMPTY
2679 			 * which geterrno maps to ENOTEMPTY.
2680 			 */
2681 			if (error == ENOTEMPTY)
2682 				error = EEXIST;
2683 		}
2684 	}
2685 
2686 	if (error == 0) {
2687 		if (nvp)
2688 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
2689 
2690 		if (odvp != ndvp)
2691 			vnevent_rename_dest_dir(ndvp, ct);
2692 
2693 		ASSERT(ovp != NULL);
2694 		vnevent_rename_src(ovp, odvp, onm, ct);
2695 	}
2696 
2697 	if (nvp) {
2698 		VN_RELE(nvp);
2699 	}
2700 	VN_RELE(ovp);
2701 
2702 	nfs_rw_exit(&odrp->r_rwlock);
2703 	nfs_rw_exit(&ndrp->r_rwlock);
2704 
2705 	return (error);
2706 }
2707 
2708 /* ARGSUSED */
2709 static int
2710 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2711 	caller_context_t *ct, int flags, vsecattr_t *vsecp)
2712 {
2713 	int error;
2714 	struct nfscreatargs args;
2715 	struct nfsdiropres dr;
2716 	int douprintf;
2717 	rnode_t *drp;
2718 	hrtime_t t;
2719 
2720 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2721 		return (EPERM);
2722 
2723 	setdiropargs(&args.ca_da, nm, dvp);
2724 
2725 	/*
2726 	 * Decide what the group-id and set-gid bit of the created directory
2727 	 * should be.  May have to do a setattr to get the gid right.
2728 	 */
2729 	error = setdirgid(dvp, &va->va_gid, cr);
2730 	if (error)
2731 		return (error);
2732 	error = setdirmode(dvp, &va->va_mode, cr);
2733 	if (error)
2734 		return (error);
2735 	va->va_mask |= AT_MODE|AT_GID;
2736 
2737 	args.ca_sa = &args.ca_sa_buf;
2738 	error = vattr_to_sattr(va, args.ca_sa);
2739 	if (error) {
2740 		/* req time field(s) overflow - return immediately */
2741 		return (error);
2742 	}
2743 
2744 	drp = VTOR(dvp);
2745 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2746 		return (EINTR);
2747 
2748 	dnlc_remove(dvp, nm);
2749 
2750 	douprintf = 1;
2751 
2752 	t = gethrtime();
2753 
2754 	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2755 	    xdr_creatargs, (caddr_t)&args,
2756 	    xdr_diropres, (caddr_t)&dr, cr,
2757 	    &douprintf, &dr.dr_status, 0, NULL);
2758 
2759 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2760 
2761 	if (!error) {
2762 		error = geterrno(dr.dr_status);
2763 		if (!error) {
2764 			if (HAVE_RDDIR_CACHE(drp))
2765 				nfs_purge_rddir_cache(dvp);
2766 			/*
2767 			 * The attributes returned by RFS_MKDIR can not
2768 			 * be depended upon, so mark the attribute cache
2769 			 * as purged.  A subsequent GETATTR will get the
2770 			 * correct attributes from the server.
2771 			 */
2772 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2773 			    dvp->v_vfsp, t, cr, NULL, NULL);
2774 			PURGE_ATTRCACHE(*vpp);
2775 			dnlc_update(dvp, nm, *vpp);
2776 
2777 			/*
2778 			 * Make sure the gid was set correctly.
2779 			 * If not, try to set it (but don't lose
2780 			 * any sleep over it).
2781 			 */
2782 			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2783 				va->va_mask = AT_GID;
2784 				(void) nfssetattr(*vpp, va, 0, cr);
2785 			}
2786 		} else {
2787 			PURGE_STALE_FH(error, dvp, cr);
2788 		}
2789 	}
2790 
2791 	nfs_rw_exit(&drp->r_rwlock);
2792 
2793 	return (error);
2794 }
2795 
2796 /* ARGSUSED */
2797 static int
2798 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2799 	caller_context_t *ct, int flags)
2800 {
2801 	int error;
2802 	enum nfsstat status;
2803 	struct nfsdiropargs da;
2804 	vnode_t *vp;
2805 	int douprintf;
2806 	rnode_t *drp;
2807 
2808 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2809 		return (EPERM);
2810 	drp = VTOR(dvp);
2811 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2812 		return (EINTR);
2813 
2814 	/*
2815 	 * Attempt to prevent a rmdir(".") from succeeding.
2816 	 */
2817 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2818 	if (error) {
2819 		nfs_rw_exit(&drp->r_rwlock);
2820 		return (error);
2821 	}
2822 
2823 	if (vp == cdir) {
2824 		VN_RELE(vp);
2825 		nfs_rw_exit(&drp->r_rwlock);
2826 		return (EINVAL);
2827 	}
2828 
2829 	setdiropargs(&da, nm, dvp);
2830 
2831 	/*
2832 	 * First just remove the entry from the name cache, as it
2833 	 * is most likely an entry for this vp.
2834 	 */
2835 	dnlc_remove(dvp, nm);
2836 
2837 	/*
2838 	 * If there vnode reference count is greater than one, then
2839 	 * there may be additional references in the DNLC which will
2840 	 * need to be purged.  First, trying removing the entry for
2841 	 * the parent directory and see if that removes the additional
2842 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2843 	 * to completely remove any references to the directory which
2844 	 * might still exist in the DNLC.
2845 	 */
2846 	if (vp->v_count > 1) {
2847 		dnlc_remove(vp, "..");
2848 		if (vp->v_count > 1)
2849 			dnlc_purge_vp(vp);
2850 	}
2851 
2852 	douprintf = 1;
2853 
2854 	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2855 	    xdr_diropargs, (caddr_t)&da,
2856 	    xdr_enum, (caddr_t)&status, cr,
2857 	    &douprintf, &status, 0, NULL);
2858 
2859 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2860 
2861 	if (error) {
2862 		VN_RELE(vp);
2863 		nfs_rw_exit(&drp->r_rwlock);
2864 		return (error);
2865 	}
2866 
2867 	error = geterrno(status);
2868 	if (!error) {
2869 		if (HAVE_RDDIR_CACHE(drp))
2870 			nfs_purge_rddir_cache(dvp);
2871 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
2872 			nfs_purge_rddir_cache(vp);
2873 	} else {
2874 		PURGE_STALE_FH(error, dvp, cr);
2875 		/*
2876 		 * System V defines rmdir to return EEXIST, not
2877 		 * ENOTEMPTY if the directory is not empty.  Over
2878 		 * the wire, the error is NFSERR_ENOTEMPTY which
2879 		 * geterrno maps to ENOTEMPTY.
2880 		 */
2881 		if (error == ENOTEMPTY)
2882 			error = EEXIST;
2883 	}
2884 
2885 	if (error == 0) {
2886 		vnevent_rmdir(vp, dvp, nm, ct);
2887 	}
2888 	VN_RELE(vp);
2889 
2890 	nfs_rw_exit(&drp->r_rwlock);
2891 
2892 	return (error);
2893 }
2894 
2895 /* ARGSUSED */
2896 static int
2897 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2898 	caller_context_t *ct, int flags)
2899 {
2900 	int error;
2901 	struct nfsslargs args;
2902 	enum nfsstat status;
2903 	int douprintf;
2904 	rnode_t *drp;
2905 
2906 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2907 		return (EPERM);
2908 	setdiropargs(&args.sla_from, lnm, dvp);
2909 	args.sla_sa = &args.sla_sa_buf;
2910 	error = vattr_to_sattr(tva, args.sla_sa);
2911 	if (error) {
2912 		/* req time field(s) overflow - return immediately */
2913 		return (error);
2914 	}
2915 	args.sla_tnm = tnm;
2916 
2917 	drp = VTOR(dvp);
2918 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2919 		return (EINTR);
2920 
2921 	dnlc_remove(dvp, lnm);
2922 
2923 	douprintf = 1;
2924 
2925 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2926 	    xdr_slargs, (caddr_t)&args,
2927 	    xdr_enum, (caddr_t)&status, cr,
2928 	    &douprintf, &status, 0, NULL);
2929 
2930 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2931 
2932 	if (!error) {
2933 		error = geterrno(status);
2934 		if (!error) {
2935 			if (HAVE_RDDIR_CACHE(drp))
2936 				nfs_purge_rddir_cache(dvp);
2937 		} else {
2938 			PURGE_STALE_FH(error, dvp, cr);
2939 		}
2940 	}
2941 
2942 	nfs_rw_exit(&drp->r_rwlock);
2943 
2944 	return (error);
2945 }
2946 
#ifdef DEBUG
/*
 * Readdir cache statistics, kept on DEBUG kernels only.  File-scope
 * statics start out as zero, so no explicit initializers are needed.
 */
/* cache entry found complete, with no miss counted for this call */
static int nfs_readdir_cache_hits;
/* request matched the recorded end-of-directory cookie */
static int nfs_readdir_cache_shorts;
/* had to wait for another thread to finish filling a cache entry */
static int nfs_readdir_cache_waits;
/* NOTE(review): presumably counts lookups that missed; confirm at use */
static int nfs_readdir_cache_misses;
/* asynchronous readahead of the next directory block was started */
static int nfs_readdir_readahead;
#endif

/*
 * Compatibility switch: when nonzero, nfs_readdir limits its request
 * size to 0x400 bytes instead of NFS_MAXDATA.  Off (zero) by default.
 */
static int nfs_shrinkreaddir;
2956 
2957 /*
2958  * Read directory entries.
2959  * There are some weird things to look out for here.  The uio_offset
2960  * field is either 0 or it is the offset returned from a previous
2961  * readdir.  It is an opaque value used by the server to find the
2962  * correct directory block to read. The count field is the number
2963  * of blocks to read on the server.  This is advisory only, the server
2964  * may return only one block's worth of entries.  Entries may be compressed
2965  * on the server.
2966  */
2967 /* ARGSUSED */
2968 static int
2969 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2970 	caller_context_t *ct, int flags)
2971 {
2972 	int error;
2973 	size_t count;
2974 	rnode_t *rp;
2975 	rddir_cache *rdc;
2976 	rddir_cache *nrdc;
2977 	rddir_cache *rrdc;
2978 #ifdef DEBUG
2979 	int missed;
2980 #endif
2981 	rddir_cache srdc;
2982 	avl_index_t where;
2983 
2984 	rp = VTOR(vp);
2985 
2986 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2987 	if (nfs_zone() != VTOMI(vp)->mi_zone)
2988 		return (EIO);
2989 	/*
2990 	 * Make sure that the directory cache is valid.
2991 	 */
2992 	if (HAVE_RDDIR_CACHE(rp)) {
2993 		if (nfs_disable_rddir_cache) {
2994 			/*
2995 			 * Setting nfs_disable_rddir_cache in /etc/system
2996 			 * allows interoperability with servers that do not
2997 			 * properly update the attributes of directories.
2998 			 * Any cached information gets purged before an
2999 			 * access is made to it.
3000 			 */
3001 			nfs_purge_rddir_cache(vp);
3002 		} else {
3003 			error = nfs_validate_caches(vp, cr);
3004 			if (error)
3005 				return (error);
3006 		}
3007 	}
3008 
3009 	/*
3010 	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
3011 	 * RFS_READDIR request with rda_count set to more than 0x400. So
3012 	 * we reduce the request size here purely for compatibility.
3013 	 *
3014 	 * In general, this is no longer required.  However, if a server
3015 	 * is discovered which can not handle requests larger than 1024,
3016 	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
3017 	 * compatibility.
3018 	 *
3019 	 * In any case, the request size is limited to NFS_MAXDATA bytes.
3020 	 */
3021 	count = MIN(uiop->uio_iov->iov_len,
3022 	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
3023 
3024 	nrdc = NULL;
3025 #ifdef DEBUG
3026 	missed = 0;
3027 #endif
3028 top:
3029 	/*
3030 	 * Short circuit last readdir which always returns 0 bytes.
3031 	 * This can be done after the directory has been read through
3032 	 * completely at least once.  This will set r_direof which
3033 	 * can be used to find the value of the last cookie.
3034 	 */
3035 	mutex_enter(&rp->r_statelock);
3036 	if (rp->r_direof != NULL &&
3037 	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3038 		mutex_exit(&rp->r_statelock);
3039 #ifdef DEBUG
3040 		nfs_readdir_cache_shorts++;
3041 #endif
3042 		if (eofp)
3043 			*eofp = 1;
3044 		if (nrdc != NULL)
3045 			rddir_cache_rele(nrdc);
3046 		return (0);
3047 	}
3048 	/*
3049 	 * Look for a cache entry.  Cache entries are identified
3050 	 * by the NFS cookie value and the byte count requested.
3051 	 */
3052 	srdc.nfs_cookie = uiop->uio_offset;
3053 	srdc.buflen = count;
3054 	rdc = avl_find(&rp->r_dir, &srdc, &where);
3055 	if (rdc != NULL) {
3056 		rddir_cache_hold(rdc);
3057 		/*
3058 		 * If the cache entry is in the process of being
3059 		 * filled in, wait until this completes.  The
3060 		 * RDDIRWAIT bit is set to indicate that someone
3061 		 * is waiting and then the thread currently
3062 		 * filling the entry is done, it should do a
3063 		 * cv_broadcast to wakeup all of the threads
3064 		 * waiting for it to finish.
3065 		 */
3066 		if (rdc->flags & RDDIR) {
3067 			nfs_rw_exit(&rp->r_rwlock);
3068 			rdc->flags |= RDDIRWAIT;
3069 #ifdef DEBUG
3070 			nfs_readdir_cache_waits++;
3071 #endif
3072 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3073 				/*
3074 				 * We got interrupted, probably
3075 				 * the user typed ^C or an alarm
3076 				 * fired.  We free the new entry
3077 				 * if we allocated one.
3078 				 */
3079 				mutex_exit(&rp->r_statelock);
3080 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
3081 				    RW_READER, FALSE);
3082 				rddir_cache_rele(rdc);
3083 				if (nrdc != NULL)
3084 					rddir_cache_rele(nrdc);
3085 				return (EINTR);
3086 			}
3087 			mutex_exit(&rp->r_statelock);
3088 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
3089 			    RW_READER, FALSE);
3090 			rddir_cache_rele(rdc);
3091 			goto top;
3092 		}
3093 		/*
3094 		 * Check to see if a readdir is required to
3095 		 * fill the entry.  If so, mark this entry
3096 		 * as being filled, remove our reference,
3097 		 * and branch to the code to fill the entry.
3098 		 */
3099 		if (rdc->flags & RDDIRREQ) {
3100 			rdc->flags &= ~RDDIRREQ;
3101 			rdc->flags |= RDDIR;
3102 			if (nrdc != NULL)
3103 				rddir_cache_rele(nrdc);
3104 			nrdc = rdc;
3105 			mutex_exit(&rp->r_statelock);
3106 			goto bottom;
3107 		}
3108 #ifdef DEBUG
3109 		if (!missed)
3110 			nfs_readdir_cache_hits++;
3111 #endif
3112 		/*
3113 		 * If an error occurred while attempting
3114 		 * to fill the cache entry, just return it.
3115 		 */
3116 		if (rdc->error) {
3117 			error = rdc->error;
3118 			mutex_exit(&rp->r_statelock);
3119 			rddir_cache_rele(rdc);
3120 			if (nrdc != NULL)
3121 				rddir_cache_rele(nrdc);
3122 			return (error);
3123 		}
3124 
3125 		/*
3126 		 * The cache entry is complete and good,
3127 		 * copyout the dirent structs to the calling
3128 		 * thread.
3129 		 */
3130 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3131 
3132 		/*
3133 		 * If no error occurred during the copyout,
3134 		 * update the offset in the uio struct to
3135 		 * contain the value of the next cookie
3136 		 * and set the eof value appropriately.
3137 		 */
3138 		if (!error) {
3139 			uiop->uio_offset = rdc->nfs_ncookie;
3140 			if (eofp)
3141 				*eofp = rdc->eof;
3142 		}
3143 
3144 		/*
3145 		 * Decide whether to do readahead.  Don't if
3146 		 * have already read to the end of directory.
3147 		 */
3148 		if (rdc->eof) {
3149 			rp->r_direof = rdc;
3150 			mutex_exit(&rp->r_statelock);
3151 			rddir_cache_rele(rdc);
3152 			if (nrdc != NULL)
3153 				rddir_cache_rele(nrdc);
3154 			return (error);
3155 		}
3156 
3157 		/*
3158 		 * Check to see whether we found an entry
3159 		 * for the readahead.  If so, we don't need
3160 		 * to do anything further, so free the new
3161 		 * entry if one was allocated.  Otherwise,
3162 		 * allocate a new entry, add it to the cache,
3163 		 * and then initiate an asynchronous readdir
3164 		 * operation to fill it.
3165 		 */
3166 		srdc.nfs_cookie = rdc->nfs_ncookie;
3167 		srdc.buflen = count;
3168 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3169 		if (rrdc != NULL) {
3170 			if (nrdc != NULL)
3171 				rddir_cache_rele(nrdc);
3172 		} else {
3173 			if (nrdc != NULL)
3174 				rrdc = nrdc;
3175 			else {
3176 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3177 			}
3178 			if (rrdc != NULL) {
3179 				rrdc->nfs_cookie = rdc->nfs_ncookie;
3180 				rrdc->buflen = count;
3181 				avl_insert(&rp->r_dir, rrdc, where);
3182 				rddir_cache_hold(rrdc);
3183 				mutex_exit(&rp->r_statelock);
3184 				rddir_cache_rele(rdc);
3185 #ifdef DEBUG
3186 				nfs_readdir_readahead++;
3187 #endif
3188 				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3189 				return (error);
3190 			}
3191 		}
3192 
3193 		mutex_exit(&rp->r_statelock);
3194 		rddir_cache_rele(rdc);
3195 		return (error);
3196 	}
3197 
3198 	/*
3199 	 * Didn't find an entry in the cache.  Construct a new empty
3200 	 * entry and link it into the cache.  Other processes attempting
3201 	 * to access this entry will need to wait until it is filled in.
3202 	 *
3203 	 * Since kmem_alloc may block, another pass through the cache
3204 	 * will need to be taken to make sure that another process
3205 	 * hasn't already added an entry to the cache for this request.
3206 	 */
3207 	if (nrdc == NULL) {
3208 		mutex_exit(&rp->r_statelock);
3209 		nrdc = rddir_cache_alloc(KM_SLEEP);
3210 		nrdc->nfs_cookie = uiop->uio_offset;
3211 		nrdc->buflen = count;
3212 		goto top;
3213 	}
3214 
3215 	/*
3216 	 * Add this entry to the cache.
3217 	 */
3218 	avl_insert(&rp->r_dir, nrdc, where);
3219 	rddir_cache_hold(nrdc);
3220 	mutex_exit(&rp->r_statelock);
3221 
3222 bottom:
3223 #ifdef DEBUG
3224 	missed = 1;
3225 	nfs_readdir_cache_misses++;
3226 #endif
3227 	/*
3228 	 * Do the readdir.
3229 	 */
3230 	error = nfsreaddir(vp, nrdc, cr);
3231 
3232 	/*
3233 	 * If this operation failed, just return the error which occurred.
3234 	 */
3235 	if (error != 0)
3236 		return (error);
3237 
3238 	/*
3239 	 * Since the RPC operation will have taken sometime and blocked
3240 	 * this process, another pass through the cache will need to be
3241 	 * taken to find the correct cache entry.  It is possible that
3242 	 * the correct cache entry will not be there (although one was
3243 	 * added) because the directory changed during the RPC operation
3244 	 * and the readdir cache was flushed.  In this case, just start
3245 	 * over.  It is hoped that this will not happen too often... :-)
3246 	 */
3247 	nrdc = NULL;
3248 	goto top;
3249 	/* NOTREACHED */
3250 }
3251 
/*
 * Fill the readdir cache entry "rdc" for directory "vp" by issuing an
 * RFS_READDIR call that starts at rdc->nfs_cookie and asks for at most
 * rdc->buflen bytes (further capped by mi_curread on each attempt).
 *
 * The entry must already be flagged RDDIR (asserted below).  On success
 * rdc->entries, entlen, nfs_ncookie and eof are filled in and rdc->error
 * is cleared.  On failure rdc->entries is set to NULL, the error is saved
 * in rdc->error and RDDIRREQ is set on the entry.  In both cases RDDIR is
 * cleared, any thread that set RDDIRWAIT is woken, and one reference on
 * rdc is dropped with rddir_cache_rele() before returning.
 *
 * Returns 0 on success, otherwise the error from rfs2call() or the errno
 * that geterrno() derives from the reply status.
 */
3252 static int
3253 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3254 {
3255 	int error;
3256 	struct nfsrddirargs rda;
3257 	struct nfsrddirres rd;
3258 	rnode_t *rp;
3259 	mntinfo_t *mi;
3260 	uint_t count;
3261 	int douprintf;
3262 	failinfo_t fi, *fip;
3263 
3264 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3265 	count = rdc->buflen;
3266 
3267 	rp = VTOR(vp);
3268 	mi = VTOMI(vp);
3269 
3270 	rda.rda_fh = *VTOFH(vp);
3271 	rda.rda_offset = rdc->nfs_cookie;
3272 
3273 	/*
3274 	 * NFS client failover support
3275 	 * suppress failover unless we have a zero cookie
3276 	 */
3277 	if (rdc->nfs_cookie == (off_t)0) {
3278 		fi.vp = vp;
3279 		fi.fhp = (caddr_t)&rda.rda_fh;
3280 		fi.copyproc = nfscopyfh;
3281 		fi.lookupproc = nfslookup;
3282 		fi.xattrdirproc = acl_getxattrdir2;
3283 		fip = &fi;
3284 	} else {
3285 		fip = NULL;
3286 	}
3287 
	/*
	 * Scratch buffer that receives the reply through "rd"; it is
	 * freed below after any entries have been copied into the
	 * cache entry.
	 */
3288 	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3289 	rd.rd_size = count;
3290 	rd.rd_offset = rda.rda_offset;
3291 
3292 	douprintf = 1;
3293 
	/*
	 * Bracket the RPC with kstat_runq_enter()/kstat_runq_exit()
	 * when the mount has I/O kstats.
	 */
3294 	if (mi->mi_io_kstats) {
3295 		mutex_enter(&mi->mi_lock);
3296 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3297 		mutex_exit(&mi->mi_lock);
3298 	}
3299 
	/*
	 * Reissue the call for as long as rfs2call() reports
	 * ENFS_TRYAGAIN; rda_count is recomputed from the current
	 * mi_curread on every pass.
	 */
3300 	do {
3301 		rda.rda_count = MIN(count, mi->mi_curread);
3302 		error = rfs2call(mi, RFS_READDIR,
3303 		    xdr_rddirargs, (caddr_t)&rda,
3304 		    xdr_getrddirres, (caddr_t)&rd, cr,
3305 		    &douprintf, &rd.rd_status, 0, fip);
3306 	} while (error == ENFS_TRYAGAIN);
3307 
3308 	if (mi->mi_io_kstats) {
3309 		mutex_enter(&mi->mi_lock);
3310 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3311 		mutex_exit(&mi->mi_lock);
3312 	}
3313 
3314 	/*
3315 	 * Since we are actually doing a READDIR RPC, we must have
3316 	 * exclusive access to the cache entry being filled.  Thus,
3317 	 * it is safe to update all fields except for the flags
3318 	 * field.  The r_statelock in the rnode must be held to
3319 	 * prevent two different threads from simultaneously
3320 	 * attempting to update the flags field.  This can happen
3321 	 * if we are turning off RDDIR and the other thread is
3322 	 * trying to set RDDIRWAIT.
3323 	 */
3324 	ASSERT(rdc->flags & RDDIR);
3325 	if (!error) {
		/* The RPC itself worked; now look at the NFS status. */
3326 		error = geterrno(rd.rd_status);
3327 		if (!error) {
3328 			rdc->nfs_ncookie = rd.rd_offset;
3329 			rdc->eof = rd.rd_eof ? 1 : 0;
3330 			rdc->entlen = rd.rd_size;
3331 			ASSERT(rdc->entlen <= rdc->buflen);
			/*
			 * The cached buffer is always buflen bytes long,
			 * although only entlen bytes of it are filled in.
			 */
3332 #ifdef DEBUG
3333 			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3334 			    KM_SLEEP);
3335 #else
3336 			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3337 #endif
3338 			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3339 			rdc->error = 0;
3340 			if (mi->mi_io_kstats) {
3341 				mutex_enter(&mi->mi_lock);
3342 				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3343 				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3344 				    rd.rd_size;
3345 				mutex_exit(&mi->mi_lock);
3346 			}
3347 		} else {
3348 			PURGE_STALE_FH(error, vp, cr);
3349 		}
3350 	}
3351 	if (error) {
3352 		rdc->entries = NULL;
3353 		rdc->error = error;
3354 	}
3355 	kmem_free(rd.rd_entries, rdc->buflen);
3356 
	/*
	 * Flags may only change under r_statelock.  Clear RDDIR, wake
	 * any waiters, and on failure set RDDIRREQ so that a later cache
	 * lookup which finds this entry redoes the readdir.
	 */
3357 	mutex_enter(&rp->r_statelock);
3358 	rdc->flags &= ~RDDIR;
3359 	if (rdc->flags & RDDIRWAIT) {
3360 		rdc->flags &= ~RDDIRWAIT;
3361 		cv_broadcast(&rdc->cv);
3362 	}
3363 	if (error)
3364 		rdc->flags |= RDDIRREQ;
3365 	mutex_exit(&rp->r_statelock);
3366 
	/* Drop one reference on the cache entry. */
3367 	rddir_cache_rele(rdc);
3368 
3369 	return (error);
3370 }
3371 
/*
 * DEBUG-only knob: when non-zero, nfs_bio() calls debug_enter() right
 * after it has warned about a zero length write.
 */
3372 #ifdef DEBUG
3373 static int nfs_bio_do_stop = 0;
3374 #endif
3375 
3376 static int
3377 nfs_bio(struct buf *bp, cred_t *cr)
3378 {
3379 	rnode_t *rp = VTOR(bp->b_vp);
3380 	int count;
3381 	int error;
3382 	cred_t *cred;
3383 	uint_t offset;
3384 
3385 	DTRACE_IO1(start, struct buf *, bp);
3386 
3387 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3388 	offset = dbtob(bp->b_blkno);
3389 
3390 	if (bp->b_flags & B_READ) {
3391 		mutex_enter(&rp->r_statelock);
3392 		if (rp->r_cred != NULL) {
3393 			cred = rp->r_cred;
3394 			crhold(cred);
3395 		} else {
3396 			rp->r_cred = cr;
3397 			crhold(cr);
3398 			cred = cr;
3399 			crhold(cred);
3400 		}
3401 		mutex_exit(&rp->r_statelock);
3402 	read_again:
3403 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3404 		    offset, bp->b_bcount, &bp->b_resid, cred);
3405 
3406 		crfree(cred);
3407 		if (!error) {
3408 			if (bp->b_resid) {
3409 				/*
3410 				 * Didn't get it all because we hit EOF,
3411 				 * zero all the memory beyond the EOF.
3412 				 */
3413 				/* bzero(rdaddr + */
3414 				bzero(bp->b_un.b_addr +
3415 				    bp->b_bcount - bp->b_resid, bp->b_resid);
3416 			}
3417 			mutex_enter(&rp->r_statelock);
3418 			if (bp->b_resid == bp->b_bcount &&
3419 			    offset >= rp->r_size) {
3420 				/*
3421 				 * We didn't read anything at all as we are
3422 				 * past EOF.  Return an error indicator back
3423 				 * but don't destroy the pages (yet).
3424 				 */
3425 				error = NFS_EOF;
3426 			}
3427 			mutex_exit(&rp->r_statelock);
3428 		} else if (error == EACCES) {
3429 			mutex_enter(&rp->r_statelock);
3430 			if (cred != cr) {
3431 				if (rp->r_cred != NULL)
3432 					crfree(rp->r_cred);
3433 				rp->r_cred = cr;
3434 				crhold(cr);
3435 				cred = cr;
3436 				crhold(cred);
3437 				mutex_exit(&rp->r_statelock);
3438 				goto read_again;
3439 			}
3440 			mutex_exit(&rp->r_statelock);
3441 		}
3442 	} else {
3443 		if (!(rp->r_flags & RSTALE)) {
3444 			mutex_enter(&rp->r_statelock);
3445 			if (rp->r_cred != NULL) {
3446 				cred = rp->r_cred;
3447 				crhold(cred);
3448 			} else {
3449 				rp->r_cred = cr;
3450 				crhold(cr);
3451 				cred = cr;
3452 				crhold(cred);
3453 			}
3454 			mutex_exit(&rp->r_statelock);
3455 		write_again:
3456 			mutex_enter(&rp->r_statelock);
3457 			count = MIN(bp->b_bcount, rp->r_size - offset);
3458 			mutex_exit(&rp->r_statelock);
3459 			if (count < 0)
3460 				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3461 #ifdef DEBUG
3462 			if (count == 0) {
3463 				zcmn_err(getzoneid(), CE_WARN,
3464 				    "nfs_bio: zero length write at %d",
3465 				    offset);
3466 				nfs_printfhandle(&rp->r_fh);
3467 				if (nfs_bio_do_stop)
3468 					debug_enter("nfs_bio");
3469 			}
3470 #endif
3471 			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3472 			    count, cred);
3473 			if (error == EACCES) {
3474 				mutex_enter(&rp->r_statelock);
3475 				if (cred != cr) {
3476 					if (rp->r_cred != NULL)
3477 						crfree(rp->r_cred);
3478 					rp->r_cred = cr;
3479 					crhold(cr);
3480 					crfree(cred);
3481 					cred = cr;
3482 					crhold(cred);
3483 					mutex_exit(&rp->r_statelock);
3484 					goto write_again;
3485 				}
3486 				mutex_exit(&rp->r_statelock);
3487 			}
3488 			bp->b_error = error;
3489 			if (error && error != EINTR) {
3490 				/*
3491 				 * Don't print EDQUOT errors on the console.
3492 				 * Don't print asynchronous EACCES errors.
3493 				 * Don't print EFBIG errors.
3494 				 * Print all other write errors.
3495 				 */
3496 				if (error != EDQUOT && error != EFBIG &&
3497 				    (error != EACCES ||
3498 				    !(bp->b_flags & B_ASYNC)))
3499 					nfs_write_error(bp->b_vp, error, cred);
3500 				/*
3501 				 * Update r_error and r_flags as appropriate.
3502 				 * If the error was ESTALE, then mark the
3503 				 * rnode as not being writeable and save
3504 				 * the error status.  Otherwise, save any
3505 				 * errors which occur from asynchronous
3506 				 * page invalidations.  Any errors occurring
3507 				 * from other operations should be saved
3508 				 * by the caller.
3509 				 */
3510 				mutex_enter(&rp->r_statelock);
3511 				if (error == ESTALE) {
3512 					rp->r_flags |= RSTALE;
3513 					if (!rp->r_error)
3514 						rp->r_error = error;
3515 				} else if (!rp->r_error &&
3516 				    (bp->b_flags &
3517 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
3518 				    (B_INVAL|B_FORCE|B_ASYNC)) {
3519 					rp->r_error = error;
3520 				}
3521 				mutex_exit(&rp->r_statelock);
3522 			}
3523 			crfree(cred);
3524 		} else {
3525 			error = rp->r_error;
3526 			/*
3527 			 * A close may have cleared r_error, if so,
3528 			 * propagate ESTALE error return properly
3529 			 */
3530 			if (error == 0)
3531 				error = ESTALE;
3532 		}
3533 	}
3534 
3535 	if (error != 0 && error != NFS_EOF)
3536 		bp->b_flags |= B_ERROR;
3537 
3538 	DTRACE_IO1(done, struct buf *, bp);
3539 
3540 	return (error);
3541 }
3542 
3543 /* ARGSUSED */
3544 static int
3545 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3546 {
3547 	struct nfs_fid *fp;
3548 	rnode_t *rp;
3549 
3550 	rp = VTOR(vp);
3551 
3552 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3553 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3554 		return (ENOSPC);
3555 	}
3556 	fp = (struct nfs_fid *)fidp;
3557 	fp->nf_pad = 0;
3558 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3559 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3560 	return (0);
3561 }
3562 
3563 /* ARGSUSED2 */
3564 static int
3565 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3566 {
3567 	rnode_t *rp = VTOR(vp);
3568 
3569 	if (!write_lock) {
3570 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3571 		return (V_WRITELOCK_FALSE);
3572 	}
3573 
3574 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3575 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3576 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3577 			return (V_WRITELOCK_FALSE);
3578 		nfs_rw_exit(&rp->r_rwlock);
3579 	}
3580 
3581 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3582 	return (V_WRITELOCK_TRUE);
3583 }
3584 
3585 /* ARGSUSED */
3586 static void
3587 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3588 {
3589 	rnode_t *rp = VTOR(vp);
3590 
3591 	nfs_rw_exit(&rp->r_rwlock);
3592 }
3593 
3594 /* ARGSUSED */
3595 static int
3596 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3597 {
3598 
3599 	/*
3600 	 * Because we stuff the readdir cookie into the offset field
3601 	 * someone may attempt to do an lseek with the cookie which
3602 	 * we want to succeed.
3603 	 */
3604 	if (vp->v_type == VDIR)
3605 		return (0);
3606 	if (*noffp < 0 || *noffp > MAXOFF32_T)
3607 		return (EINVAL);
3608 	return (0);
3609 }
3610 
3611 /*
3612  * number of NFS_MAXDATA blocks to read ahead
3613  * optimized for 100 base-T.
3614  */
/*
 * nfs_getapage() uses this as the largest number of asynchronous
 * readahead requests it queues in one call.
 */
3615 static int nfs_nra = 4;
3616 
/* Incremented in nfs_getapage() when page_lookup() fails on a found page. */
3617 #ifdef DEBUG
3618 static int nfs_lostpage = 0;	/* number of times we lost original page */
3619 #endif
3620 
3621 /*
3622  * Return all the pages from [off..off+len) in file
3623  */
3624 /* ARGSUSED */
3625 static int
3626 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3627 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3628 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3629 {
3630 	rnode_t *rp;
3631 	int error;
3632 	mntinfo_t *mi;
3633 
3634 	if (vp->v_flag & VNOMAP)
3635 		return (ENOSYS);
3636 
3637 	ASSERT(off <= MAXOFF32_T);
3638 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3639 		return (EIO);
3640 	if (protp != NULL)
3641 		*protp = PROT_ALL;
3642 
3643 	/*
3644 	 * Now valididate that the caches are up to date.
3645 	 */
3646 	error = nfs_validate_caches(vp, cr);
3647 	if (error)
3648 		return (error);
3649 
3650 	rp = VTOR(vp);
3651 	mi = VTOMI(vp);
3652 retry:
3653 	mutex_enter(&rp->r_statelock);
3654 
3655 	/*
3656 	 * Don't create dirty pages faster than they
3657 	 * can be cleaned so that the system doesn't
3658 	 * get imbalanced.  If the async queue is
3659 	 * maxed out, then wait for it to drain before
3660 	 * creating more dirty pages.  Also, wait for
3661 	 * any threads doing pagewalks in the vop_getattr
3662 	 * entry points so that they don't block for
3663 	 * long periods.
3664 	 */
3665 	if (rw == S_CREATE) {
3666 		while ((mi->mi_max_threads != 0 &&
3667 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3668 		    rp->r_gcount > 0)
3669 			cv_wait(&rp->r_cv, &rp->r_statelock);
3670 	}
3671 
3672 	/*
3673 	 * If we are getting called as a side effect of an nfs_write()
3674 	 * operation the local file size might not be extended yet.
3675 	 * In this case we want to be able to return pages of zeroes.
3676 	 */
3677 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3678 		mutex_exit(&rp->r_statelock);
3679 		return (EFAULT);		/* beyond EOF */
3680 	}
3681 
3682 	mutex_exit(&rp->r_statelock);
3683 
3684 	if (len <= PAGESIZE) {
3685 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3686 		    seg, addr, rw, cr);
3687 	} else {
3688 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3689 		    pl, plsz, seg, addr, rw, cr);
3690 	}
3691 
3692 	switch (error) {
3693 	case NFS_EOF:
3694 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3695 		goto retry;
3696 	case ESTALE:
3697 		PURGE_STALE_FH(error, vp, cr);
3698 	}
3699 
3700 	return (error);
3701 }
3702 
/*
 * nfs_getapage: find, create or read in the page of "vp" at offset "off".
 *
 * - When "off" is aligned to vfs_bsize, rw is not S_CREATE and the vnode
 *   is not VNOCACHE, up to nfs_nra asynchronous readaheads are queued
 *   first, based on where r_nextr stands relative to this block.
 * - If the page is not cached and pl is NULL, an asynchronous readahead
 *   of the containing block is queued and nothing is returned in pl.
 * - If the page is not cached and rw is S_CREATE, a new page is created
 *   with page_create_va() and nothing is read from the server.
 * - Otherwise a block (or a single page) is klustered and read with
 *   nfs_bio(), or simply zeroed when the I/O starts at or beyond r_size
 *   and the request came through segkmap.  NFS_EOF from nfs_bio() is
 *   turned into 0 for segkmap and EFAULT for everyone else.
 *
 * On success with pl != NULL, a page that was already cached is returned
 * in pl[0] (locked SE_EXCL for S_CREATE, SE_SHARED otherwise) with
 * pl[1] = NULL; pages obtained here are handed to pvn_plist_init().
 *
 * Returns 0 or an errno value (EIO if called from the wrong zone).
 */
3703 /*
3704  * Called from pvn_getpages or nfs_getpage to get a particular page.
3705  */
3706 /* ARGSUSED */
3707 static int
3708 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3709 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3710 	enum seg_rw rw, cred_t *cr)
3711 {
3712 	rnode_t *rp;
3713 	uint_t bsize;
3714 	struct buf *bp;
3715 	page_t *pp;
3716 	u_offset_t lbn;
3717 	u_offset_t io_off;
3718 	u_offset_t blkoff;
3719 	u_offset_t rablkoff;
3720 	size_t io_len;
3721 	uint_t blksize;
3722 	int error;
3723 	int readahead;
3724 	int readahead_issued = 0;
3725 	int ra_window; /* readahead window */
3726 	page_t *pagefound;
3727 
3728 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3729 		return (EIO);
3730 	rp = VTOR(vp);
3731 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3732 
	/*
	 * Everything from here on is redone from scratch if a page that
	 * was seen in the cache cannot be locked later on.
	 */
3733 reread:
3734 	bp = NULL;
3735 	pp = NULL;
3736 	pagefound = NULL;
3737 
3738 	if (pl != NULL)
3739 		pl[0] = NULL;
3740 
3741 	error = 0;
	/* Block number and starting offset of the block containing off. */
3742 	lbn = off / bsize;
3743 	blkoff = lbn * bsize;
3744 
3745 	/*
3746 	 * Queueing up the readahead before doing the synchronous read
3747 	 * results in a significant increase in read throughput because
3748 	 * of the increased parallelism between the async threads and
3749 	 * the process context.
3750 	 */
3751 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3752 	    rw != S_CREATE &&
3753 	    !(vp->v_flag & VNOCACHE)) {
3754 		mutex_enter(&rp->r_statelock);
3755 
3756 		/*
3757 		 * Calculate the number of readaheads to do.
3758 		 * a) No readaheads at offset = 0.
3759 		 * b) Do maximum(nfs_nra) readaheads when the readahead
3760 		 *    window is closed.
3761 		 * c) Do readaheads between 1 to (nfs_nra - 1) depending
3762 		 *    upon how far the readahead window is open or close.
3763 		 * d) No readaheads if rp->r_nextr is not within the scope
3764 		 *    of the readahead window (random i/o).
3765 		 */
3766 
3767 		if (off == 0)
3768 			readahead = 0;
3769 		else if (blkoff == rp->r_nextr)
3770 			readahead = nfs_nra;
3771 		else if (rp->r_nextr > blkoff &&
3772 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
3773 		    <= (nfs_nra - 1)))
3774 			readahead = nfs_nra - ra_window;
3775 		else
3776 			readahead = 0;
3777 
		/*
		 * Queue readaheads for the blocks following r_nextr,
		 * stopping at the cached end of file or as soon as a
		 * request cannot be queued.  r_statelock is dropped
		 * around each call to nfs_async_readahead().
		 */
3778 		rablkoff = rp->r_nextr;
3779 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3780 			mutex_exit(&rp->r_statelock);
3781 			if (nfs_async_readahead(vp, rablkoff + bsize,
3782 			    addr + (rablkoff + bsize - off), seg, cr,
3783 			    nfs_readahead) < 0) {
3784 				mutex_enter(&rp->r_statelock);
3785 				break;
3786 			}
3787 			readahead--;
3788 			rablkoff += bsize;
3789 			/*
3790 			 * Indicate that we did a readahead so
3791 			 * readahead offset is not updated
3792 			 * by the synchronous read below.
3793 			 */
3794 			readahead_issued = 1;
3795 			mutex_enter(&rp->r_statelock);
3796 			/*
3797 			 * set readahead offset to
3798 			 * offset of last async readahead
3799 			 * request.
3800 			 */
3801 			rp->r_nextr = rablkoff;
3802 		}
3803 		mutex_exit(&rp->r_statelock);
3804 	}
3805 
3806 again:
3807 	if ((pagefound = page_exists(vp, off)) == NULL) {
3808 		if (pl == NULL) {
			/*
			 * The caller does not want the page back, so
			 * just queue an asynchronous read of the block.
			 */
3809 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3810 			    nfs_readahead);
3811 		} else if (rw == S_CREATE) {
3812 			/*
3813 			 * Block for this page is not allocated, or the offset
3814 			 * is beyond the current allocation size, or we're
3815 			 * allocating a swap slot and the page was not found,
3816 			 * so allocate it and return a zero page.
3817 			 */
3818 			if ((pp = page_create_va(vp, off,
3819 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3820 				cmn_err(CE_PANIC, "nfs_getapage: page_create");
3821 			io_len = PAGESIZE;
3822 			mutex_enter(&rp->r_statelock);
3823 			rp->r_nextr = off + PAGESIZE;
3824 			mutex_exit(&rp->r_statelock);
3825 		} else {
3826 			/*
3827 			 * Need to go to server to get a BLOCK, exception to
3828 			 * that being while reading at offset = 0 or doing
3829 			 * random i/o, in that case read only a PAGE.
3830 			 */
3831 			mutex_enter(&rp->r_statelock);
3832 			if (blkoff < rp->r_size &&
3833 			    blkoff + bsize >= rp->r_size) {
3834 				/*
3835 				 * If only a block or less is left in
3836 				 * the file, read all that is remaining.
3837 				 */
3838 				if (rp->r_size <= off) {
3839 					/*
3840 					 * Trying to access beyond EOF,
3841 					 * set up to get at least one page.
3842 					 */
3843 					blksize = off + PAGESIZE - blkoff;
3844 				} else
3845 					blksize = rp->r_size - blkoff;
3846 			} else if ((off == 0) ||
3847 			    (off != rp->r_nextr && !readahead_issued)) {
3848 				blksize = PAGESIZE;
3849 				blkoff = off; /* block = page here */
3850 			} else
3851 				blksize = bsize;
3852 			mutex_exit(&rp->r_statelock);
3853 
3854 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3855 			    &io_len, blkoff, blksize, 0);
3856 
3857 			/*
3858 			 * Some other thread has entered the page,
3859 			 * so just use it.
3860 			 */
3861 			if (pp == NULL)
3862 				goto again;
3863 
3864 			/*
3865 			 * Now round the request size up to page boundaries.
3866 			 * This ensures that the entire page will be
3867 			 * initialized to zeroes if EOF is encountered.
3868 			 */
3869 			io_len = ptob(btopr(io_len));
3870 
3871 			bp = pageio_setup(pp, io_len, vp, B_READ);
3872 			ASSERT(bp != NULL);
3873 
3874 			/*
3875 			 * pageio_setup should have set b_addr to 0.  This
3876 			 * is correct since we want to do I/O on a page
3877 			 * boundary.  bp_mapin will use this addr to calculate
3878 			 * an offset, and then set b_addr to the kernel virtual
3879 			 * address it allocated for us.
3880 			 */
3881 			ASSERT(bp->b_un.b_addr == 0);
3882 
3883 			bp->b_edev = 0;
3884 			bp->b_dev = 0;
3885 			bp->b_lblkno = lbtodb(io_off);
3886 			bp->b_file = vp;
3887 			bp->b_offset = (offset_t)off;
3888 			bp_mapin(bp);
3889 
3890 			/*
3891 			 * If doing a write beyond what we believe is EOF,
3892 			 * don't bother trying to read the pages from the
3893 			 * server, we'll just zero the pages here.  We
3894 			 * don't check that the rw flag is S_WRITE here
3895 			 * because some implementations may attempt a
3896 			 * read access to the buffer before copying data.
3897 			 */
3898 			mutex_enter(&rp->r_statelock);
3899 			if (io_off >= rp->r_size && seg == segkmap) {
3900 				mutex_exit(&rp->r_statelock);
3901 				bzero(bp->b_un.b_addr, io_len);
3902 			} else {
3903 				mutex_exit(&rp->r_statelock);
3904 				error = nfs_bio(bp, cr);
3905 			}
3906 
3907 			/*
3908 			 * Unmap the buffer before freeing it.
3909 			 */
3910 			bp_mapout(bp);
3911 			pageio_done(bp);
3912 
3913 			if (error == NFS_EOF) {
3914 				/*
3915 				 * If doing a write system call just return
3916 				 * zeroed pages, else user tried to get pages
3917 				 * beyond EOF, return error.  We don't check
3918 				 * that the rw flag is S_WRITE here because
3919 				 * some implementations may attempt a read
3920 				 * access to the buffer before copying data.
3921 				 */
3922 				if (seg == segkmap)
3923 					error = 0;
3924 				else
3925 					error = EFAULT;
3926 			}
3927 
			/*
			 * Without readahead in play, the next sequential
			 * read is expected right after this I/O.
			 */
3928 			if (!readahead_issued && !error) {
3929 				mutex_enter(&rp->r_statelock);
3930 				rp->r_nextr = io_off + io_len;
3931 				mutex_exit(&rp->r_statelock);
3932 			}
3933 		}
3934 	}
3935 
	/* NOTE(review): no goto in this function targets this label. */
3936 out:
3937 	if (pl == NULL)
3938 		return (error);
3939 
3940 	if (error) {
		/* Release the pages that were set up for the failed read. */
3941 		if (pp != NULL)
3942 			pvn_read_done(pp, B_ERROR);
3943 		return (error);
3944 	}
3945 
3946 	if (pagefound) {
3947 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3948 
3949 		/*
3950 		 * Page exists in the cache, acquire the appropriate lock.
3951 		 * If this fails, start all over again.
3952 		 */
3953 		if ((pp = page_lookup(vp, off, se)) == NULL) {
3954 #ifdef DEBUG
3955 			nfs_lostpage++;
3956 #endif
3957 			goto reread;
3958 		}
3959 		pl[0] = pp;
3960 		pl[1] = NULL;
3961 		return (0);
3962 	}
3963 
	/* Hand the pages created or read above back through pl. */
3964 	if (pp != NULL)
3965 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3966 
3967 	return (error);
3968 }
3969 
/*
 * Read ahead the block of "vp" that starts at "blkoff".
 *
 * This is the worker that nfs_getapage() hands to nfs_async_readahead().
 * It klusters the pages of the block, reads them with nfs_bio() (or just
 * zeroes them when the I/O starts at or beyond r_size and seg is segkmap)
 * and releases them with pvn_read_done().  Nothing is returned: if the
 * pages cannot be klustered the readahead is silently dropped, and if the
 * read fails r_nextr is pulled back to the start of the failed I/O.
 */
3970 static void
3971 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3972 	cred_t *cr)
3973 {
3974 	int error;
3975 	page_t *pp;
3976 	u_offset_t io_off;
3977 	size_t io_len;
3978 	struct buf *bp;
3979 	uint_t bsize, blksize;
3980 	rnode_t *rp = VTOR(vp);
3981 
3982 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3983 
3984 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3985 
3986 	mutex_enter(&rp->r_statelock);
3987 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3988 		/*
3989 		 * If less than a block left in file read less
3990 		 * than a block.
3991 		 */
3992 		blksize = rp->r_size - blkoff;
3993 	} else
3994 		blksize = bsize;
3995 	mutex_exit(&rp->r_statelock);
3996 
	/* Note that segkmap, not the caller's seg, is passed here. */
3997 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3998 	    &io_off, &io_len, blkoff, blksize, 1);
3999 	/*
4000 	 * The isra flag passed to the kluster function is 1, we may have
4001 	 * gotten a return value of NULL for a variety of reasons (# of free
4002 	 * pages < minfree, someone entered the page on the vnode etc). In all
4003 	 * cases, we want to punt on the readahead.
4004 	 */
4005 	if (pp == NULL)
4006 		return;
4007 
4008 	/*
4009 	 * Now round the request size up to page boundaries.
4010 	 * This ensures that the entire page will be
4011 	 * initialized to zeroes if EOF is encountered.
4012 	 */
4013 	io_len = ptob(btopr(io_len));
4014 
4015 	bp = pageio_setup(pp, io_len, vp, B_READ);
4016 	ASSERT(bp != NULL);
4017 
4018 	/*
4019 	 * pageio_setup should have set b_addr to 0.  This is correct since
4020 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
4021 	 * to calculate an offset, and then set b_addr to the kernel virtual
4022 	 * address it allocated for us.
4023 	 */
4024 	ASSERT(bp->b_un.b_addr == 0);
4025 
4026 	bp->b_edev = 0;
4027 	bp->b_dev = 0;
4028 	bp->b_lblkno = lbtodb(io_off);
4029 	bp->b_file = vp;
4030 	bp->b_offset = (offset_t)blkoff;
4031 	bp_mapin(bp);
4032 
4033 	/*
4034 	 * If doing a write beyond what we believe is EOF, don't bother trying
4035 	 * to read the pages from the server, we'll just zero the pages here.
4036 	 * We don't check that the rw flag is S_WRITE here because some
4037 	 * implementations may attempt a read access to the buffer before
4038 	 * copying data.
4039 	 */
4040 	mutex_enter(&rp->r_statelock);
4041 	if (io_off >= rp->r_size && seg == segkmap) {
4042 		mutex_exit(&rp->r_statelock);
4043 		bzero(bp->b_un.b_addr, io_len);
4044 		error = 0;
4045 	} else {
4046 		mutex_exit(&rp->r_statelock);
4047 		error = nfs_bio(bp, cr);
		/* Reading past EOF is not a failure for a readahead. */
4048 		if (error == NFS_EOF)
4049 			error = 0;
4050 	}
4051 
4052 	/*
4053 	 * Unmap the buffer before freeing it.
4054 	 */
4055 	bp_mapout(bp);
4056 	pageio_done(bp);
4057 
4058 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4059 
4060 	/*
4061 	 * In case of error set readahead offset
4062 	 * to the lowest offset.
4063 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
4064 	 */
	/*
	 * r_nextr is first tested without the lock and then tested again
	 * with r_statelock held before it is changed.
	 */
4065 	if (error && rp->r_nextr > io_off) {
4066 		mutex_enter(&rp->r_statelock);
4067 		if (rp->r_nextr > io_off)
4068 			rp->r_nextr = io_off;
4069 		mutex_exit(&rp->r_statelock);
4070 	}
4071 }
4072 
4073 /*
4074  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4075  * If len == 0, do from off to EOF.
4076  *
4077  * The normal cases should be len == 0 && off == 0 (entire vp list),
4078  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4079  * (from pageout).
4080  */
4081 /* ARGSUSED */
4082 static int
4083 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4084 	caller_context_t *ct)
4085 {
4086 	int error;
4087 	rnode_t *rp;
4088 
4089 	ASSERT(cr != NULL);
4090 
4091 	/*
4092 	 * XXX - Why should this check be made here?
4093 	 */
4094 	if (vp->v_flag & VNOMAP)
4095 		return (ENOSYS);
4096 
4097 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4098 		return (0);
4099 
4100 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4101 		return (EIO);
4102 	ASSERT(off <= MAXOFF32_T);
4103 
4104 	rp = VTOR(vp);
4105 	mutex_enter(&rp->r_statelock);
4106 	rp->r_count++;
4107 	mutex_exit(&rp->r_statelock);
4108 	error = nfs_putpages(vp, off, len, flags, cr);
4109 	mutex_enter(&rp->r_statelock);
4110 	rp->r_count--;
4111 	cv_broadcast(&rp->r_cv);
4112 	mutex_exit(&rp->r_statelock);
4113 
4114 	return (error);
4115 }
4116 
/*
 * nfs_putapage: write back the dirty page "pp" of "vp".
 *
 * The page is klustered with neighbouring dirty pages of the same block
 * and the resulting list is handed to nfs_async_putapage() when B_ASYNC
 * is set in flags, or to nfs_sync_putapage() otherwise.  If a write is in
 * progress on the rnode (RMODINPROGRESS) for an overlapping region, no
 * I/O is started; the pages are marked modified again and unlocked, the
 * rnode is flagged RDIRTY, and 0 is returned.
 *
 * When offp / lenp are not NULL they receive the offset and length of
 * the klustered range.  The caller must hold rp->r_count above zero
 * (asserted below).  Returns 0 or the error from the put routine used.
 */
4117 /*
4118  * Write out a single page, possibly klustering adjacent dirty pages.
4119  */
4120 int
4121 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4122 	int flags, cred_t *cr)
4123 {
4124 	u_offset_t io_off;
4125 	u_offset_t lbn_off;
4126 	u_offset_t lbn;
4127 	size_t io_len;
4128 	uint_t bsize;
4129 	int error;
4130 	rnode_t *rp;
4131 
4132 	ASSERT(!vn_is_readonly(vp));
4133 	ASSERT(pp != NULL);
4134 	ASSERT(cr != NULL);
4135 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4136 
4137 	rp = VTOR(vp);
4138 	ASSERT(rp->r_count > 0);
4139 
4140 	ASSERT(pp->p_offset <= MAXOFF32_T);
4141 
	/* Block number and starting offset of the block holding pp. */
4142 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4143 	lbn = pp->p_offset / bsize;
4144 	lbn_off = lbn * bsize;
4145 
4146 	/*
4147 	 * Find a kluster that fits in one block, or in
4148 	 * one page if pages are bigger than blocks.  If
4149 	 * there is less file space allocated than a whole
4150 	 * page, we'll shorten the i/o request below.
4151 	 */
4152 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4153 	    roundup(bsize, PAGESIZE), flags);
4154 
4155 	/*
4156 	 * pvn_write_kluster shouldn't have returned a page with offset
4157 	 * behind the original page we were given.  Verify that.
4158 	 */
4159 	ASSERT((pp->p_offset / bsize) >= lbn);
4160 
4161 	/*
4162 	 * Now pp will have the list of kept dirty pages marked for
4163 	 * write back.  It will also handle invalidation and freeing
4164 	 * of pages that are not dirty.  Check for page length rounding
4165 	 * problems.
4166 	 */
4167 	if (io_off + io_len > lbn_off + bsize) {
4168 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4169 		io_len = lbn_off + bsize - io_off;
4170 	}
4171 	/*
4172 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4173 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
4174 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
4175 	 * progress and the r_size has not been made consistent with the
4176 	 * new size of the file. When the uiomove() completes the r_size is
4177 	 * updated and the RMODINPROGRESS flag is cleared.
4178 	 *
4179 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4180 	 * consistent value of r_size. Without this handshaking, it is
4181 	 * possible that nfs(3)_bio() picks  up the old value of r_size
4182 	 * before the uiomove() in writerp() completes. This will result
4183 	 * in the write through nfs(3)_bio() being dropped.
4184 	 *
4185 	 * More precisely, there is a window between the time the uiomove()
4186 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
4187 	 * operation intervenes in this window, the page will be picked up,
4188 	 * because it is dirty (it will be unlocked, unless it was
4189 	 * pagecreate'd). When the page is picked up as dirty, the dirty
4190 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
4191 	 * checked. This will still be the old size. Therefore the page will
4192 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
4193 	 * the page will be found to be clean and the write will be dropped.
4194 	 */
	/*
	 * The flag is first tested without the lock and then tested again
	 * with r_statelock held, together with the overlap check between
	 * [r_modaddr, r_modaddr + MAXBSIZE) and [io_off, io_off + io_len).
	 */
4195 	if (rp->r_flags & RMODINPROGRESS) {
4196 		mutex_enter(&rp->r_statelock);
4197 		if ((rp->r_flags & RMODINPROGRESS) &&
4198 		    rp->r_modaddr + MAXBSIZE > io_off &&
4199 		    rp->r_modaddr < io_off + io_len) {
4200 			page_t *plist;
4201 			/*
4202 			 * A write is in progress for this region of the file.
4203 			 * If we did not detect RMODINPROGRESS here then this
4204 			 * path through nfs_putapage() would eventually go to
4205 			 * nfs(3)_bio() and may not write out all of the data
4206 			 * in the pages. We end up losing data. So we decide
4207 			 * to set the modified bit on each page in the page
4208 			 * list and mark the rnode with RDIRTY. This write
4209 			 * will be restarted at some later time.
4210 			 */
4211 			plist = pp;
4212 			while (plist != NULL) {
4213 				pp = plist;
4214 				page_sub(&plist, pp);
4215 				hat_setmod(pp);
4216 				page_io_unlock(pp);
4217 				page_unlock(pp);
4218 			}
4219 			rp->r_flags |= RDIRTY;
4220 			mutex_exit(&rp->r_statelock);
4221 			if (offp)
4222 				*offp = io_off;
4223 			if (lenp)
4224 				*lenp = io_len;
4225 			return (0);
4226 		}
4227 		mutex_exit(&rp->r_statelock);
4228 	}
4229 
	/* Queue the write, or do it in this thread, as the flags ask. */
4230 	if (flags & B_ASYNC) {
4231 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4232 		    nfs_sync_putapage);
4233 	} else
4234 		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4235 
4236 	if (offp)
4237 		*offp = io_off;
4238 	if (lenp)
4239 		*lenp = io_len;
4240 	return (error);
4241 }
4242 
4243 static int
4244 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4245 	int flags, cred_t *cr)
4246 {
4247 	int error;
4248 	rnode_t *rp;
4249 
4250 	flags |= B_WRITE;
4251 
4252 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4253 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4254 
4255 	rp = VTOR(vp);
4256 
4257 	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4258 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4259 		if (!(rp->r_flags & ROUTOFSPACE)) {
4260 			mutex_enter(&rp->r_statelock);
4261 			rp->r_flags |= ROUTOFSPACE;
4262 			mutex_exit(&rp->r_statelock);
4263 		}
4264 		flags |= B_ERROR;
4265 		pvn_write_done(pp, flags);
4266 		/*
4267 		 * If this was not an async thread, then try again to
4268 		 * write out the pages, but this time, also destroy
4269 		 * them whether or not the write is successful.  This
4270 		 * will prevent memory from filling up with these
4271 		 * pages and destroying them is the only alternative
4272 		 * if they can't be written out.
4273 		 *
4274 		 * Don't do this if this is an async thread because
4275 		 * when the pages are unlocked in pvn_write_done,
4276 		 * some other thread could have come along, locked
4277 		 * them, and queued for an async thread.  It would be
4278 		 * possible for all of the async threads to be tied
4279 		 * up waiting to lock the pages again and they would
4280 		 * all already be locked and waiting for an async
4281 		 * thread to handle them.  Deadlock.
4282 		 */
4283 		if (!(flags & B_ASYNC)) {
4284 			error = nfs_putpage(vp, io_off, io_len,
4285 			    B_INVAL | B_FORCE, cr, NULL);
4286 		}
4287 	} else {
4288 		if (error)
4289 			flags |= B_ERROR;
4290 		else if (rp->r_flags & ROUTOFSPACE) {
4291 			mutex_enter(&rp->r_statelock);
4292 			rp->r_flags &= ~ROUTOFSPACE;
4293 			mutex_exit(&rp->r_statelock);
4294 		}
4295 		pvn_write_done(pp, flags);
4296 	}
4297 
4298 	return (error);
4299 }
4300 
4301 /* ARGSUSED */
4302 static int
4303 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4304 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4305 	caller_context_t *ct)
4306 {
4307 	struct segvn_crargs vn_a;
4308 	int error;
4309 	rnode_t *rp;
4310 	struct vattr va;
4311 
4312 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4313 		return (EIO);
4314 
4315 	if (vp->v_flag & VNOMAP)
4316 		return (ENOSYS);
4317 
4318 	if (off > MAXOFF32_T)
4319 		return (EFBIG);
4320 
4321 	if (off < 0 || off + len < 0)
4322 		return (ENXIO);
4323 
4324 	if (vp->v_type != VREG)
4325 		return (ENODEV);
4326 
4327 	/*
4328 	 * If there is cached data and if close-to-open consistency
4329 	 * checking is not turned off and if the file system is not
4330 	 * mounted readonly, then force an over the wire getattr.
4331 	 * Otherwise, just invoke nfsgetattr to get a copy of the
4332 	 * attributes.  The attribute cache will be used unless it
4333 	 * is timed out and if it is, then an over the wire getattr
4334 	 * will be issued.
4335 	 */
4336 	va.va_mask = AT_ALL;
4337 	if (vn_has_cached_data(vp) &&
4338 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4339 		error = nfs_getattr_otw(vp, &va, cr);
4340 	else
4341 		error = nfsgetattr(vp, &va, cr);
4342 	if (error)
4343 		return (error);
4344 
4345 	/*
4346 	 * Check to see if the vnode is currently marked as not cachable.
4347 	 * This means portions of the file are locked (through VOP_FRLOCK).
4348 	 * In this case the map request must be refused.  We use
4349 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
4350 	 */
4351 	rp = VTOR(vp);
4352 
4353 	/*
4354 	 * Atomically increment r_inmap after acquiring r_rwlock. The
4355 	 * idea here is to acquire r_rwlock to block read/write and
4356 	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
4357 	 * that we are in nfs_map(). Now, r_rwlock is acquired in order
4358 	 * and we can prevent the deadlock that would have occurred
4359 	 * when nfs_addmap() would have acquired it out of order.
4360 	 *
4361 	 * Since we are not protecting r_inmap by any lock, we do not
4362 	 * hold any lock when we decrement it. We atomically decrement
4363 	 * r_inmap after we release r_lkserlock.
4364 	 */
4365 
4366 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4367 		return (EINTR);
4368 	atomic_inc_uint(&rp->r_inmap);
4369 	nfs_rw_exit(&rp->r_rwlock);
4370 
4371 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
4372 		atomic_dec_uint(&rp->r_inmap);
4373 		return (EINTR);
4374 	}
4375 	if (vp->v_flag & VNOCACHE) {
4376 		error = EAGAIN;
4377 		goto done;
4378 	}
4379 
4380 	/*
4381 	 * Don't allow concurrent locks and mapping if mandatory locking is
4382 	 * enabled.
4383 	 */
4384 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4385 	    MANDLOCK(vp, va.va_mode)) {
4386 		error = EAGAIN;
4387 		goto done;
4388 	}
4389 
4390 	as_rangelock(as);
4391 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4392 	if (error != 0) {
4393 		as_rangeunlock(as);
4394 		goto done;
4395 	}
4396 
4397 	vn_a.vp = vp;
4398 	vn_a.offset = off;
4399 	vn_a.type = (flags & MAP_TYPE);
4400 	vn_a.prot = (uchar_t)prot;
4401 	vn_a.maxprot = (uchar_t)maxprot;
4402 	vn_a.flags = (flags & ~MAP_TYPE);
4403 	vn_a.cred = cr;
4404 	vn_a.amp = NULL;
4405 	vn_a.szc = 0;
4406 	vn_a.lgrp_mem_policy_flags = 0;
4407 
4408 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4409 	as_rangeunlock(as);
4410 
4411 done:
4412 	nfs_rw_exit(&rp->r_lkserlock);
4413 	atomic_dec_uint(&rp->r_inmap);
4414 	return (error);
4415 }
4416 
4417 /* ARGSUSED */
4418 static int
4419 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4420 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4421 	caller_context_t *ct)
4422 {
4423 	rnode_t *rp;
4424 
4425 	if (vp->v_flag & VNOMAP)
4426 		return (ENOSYS);
4427 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4428 		return (EIO);
4429 
4430 	rp = VTOR(vp);
4431 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4432 
4433 	return (0);
4434 }
4435 
4436 /* ARGSUSED */
4437 static int
4438 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
4439 	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
4440 {
4441 	netobj lm_fh;
4442 	int rc;
4443 	u_offset_t start, end;
4444 	rnode_t *rp;
4445 	int error = 0, intr = INTR(vp);
4446 
4447 	/* check for valid cmd parameter */
4448 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4449 		return (EINVAL);
4450 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4451 		return (EIO);
4452 
4453 	/* Verify l_type. */
4454 	switch (bfp->l_type) {
4455 	case F_RDLCK:
4456 		if (cmd != F_GETLK && !(flag & FREAD))
4457 			return (EBADF);
4458 		break;
4459 	case F_WRLCK:
4460 		if (cmd != F_GETLK && !(flag & FWRITE))
4461 			return (EBADF);
4462 		break;
4463 	case F_UNLCK:
4464 		intr = 0;
4465 		break;
4466 
4467 	default:
4468 		return (EINVAL);
4469 	}
4470 
4471 	/* check the validity of the lock range */
4472 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4473 		return (rc);
4474 	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4475 		return (rc);
4476 
4477 	/*
4478 	 * If the filesystem is mounted using local locking, pass the
4479 	 * request off to the local locking code.
4480 	 */
4481 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4482 		if (offset > MAXOFF32_T)
4483 			return (EFBIG);
4484 		if (cmd == F_SETLK || cmd == F_SETLKW) {
4485 			/*
4486 			 * For complete safety, we should be holding
4487 			 * r_lkserlock.  However, we can't call
4488 			 * lm_safelock and then fs_frlock while
4489 			 * holding r_lkserlock, so just invoke
4490 			 * lm_safelock and expect that this will
4491 			 * catch enough of the cases.
4492 			 */
4493 			if (!lm_safelock(vp, bfp, cr))
4494 				return (EAGAIN);
4495 		}
4496 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4497 	}
4498 
4499 	rp = VTOR(vp);
4500 
4501 	/*
4502 	 * Check whether the given lock request can proceed, given the
4503 	 * current file mappings.
4504 	 */
4505 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4506 		return (EINTR);
4507 	if (cmd == F_SETLK || cmd == F_SETLKW) {
4508 		if (!lm_safelock(vp, bfp, cr)) {
4509 			rc = EAGAIN;
4510 			goto done;
4511 		}
4512 	}
4513 
4514 	/*
4515 	 * Flush the cache after waiting for async I/O to finish.  For new
4516 	 * locks, this is so that the process gets the latest bits from the
4517 	 * server.  For unlocks, this is so that other clients see the
4518 	 * latest bits once the file has been unlocked.  If currently dirty
4519 	 * pages can't be flushed, then don't allow a lock to be set.  But
4520 	 * allow unlocks to succeed, to avoid having orphan locks on the
4521 	 * server.
4522 	 */
4523 	if (cmd != F_GETLK) {
4524 		mutex_enter(&rp->r_statelock);
4525 		while (rp->r_count > 0) {
4526 			if (intr) {
4527 				klwp_t *lwp = ttolwp(curthread);
4528 
4529 				if (lwp != NULL)
4530 					lwp->lwp_nostop++;
4531 				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
4532 				    == 0) {
4533 					if (lwp != NULL)
4534 						lwp->lwp_nostop--;
4535 					rc = EINTR;
4536 					break;
4537 				}
4538 				if (lwp != NULL)
4539 					lwp->lwp_nostop--;
4540 			} else
4541 			cv_wait(&rp->r_cv, &rp->r_statelock);
4542 		}
4543 		mutex_exit(&rp->r_statelock);
4544 		if (rc != 0)
4545 			goto done;
4546 		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
4547 		if (error) {
4548 			if (error == ENOSPC || error == EDQUOT) {
4549 				mutex_enter(&rp->r_statelock);
4550 				if (!rp->r_error)
4551 					rp->r_error = error;
4552 				mutex_exit(&rp->r_statelock);
4553 			}
4554 			if (bfp->l_type != F_UNLCK) {
4555 				rc = ENOLCK;
4556 				goto done;
4557 			}
4558 		}
4559 	}
4560 
4561 	lm_fh.n_len = sizeof (fhandle_t);
4562 	lm_fh.n_bytes = (char *)VTOFH(vp);
4563 
4564 	/*
4565 	 * Call the lock manager to do the real work of contacting
4566 	 * the server and obtaining the lock.
4567 	 */
4568 	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4569 
4570 	if (rc == 0)
4571 		nfs_lockcompletion(vp, cmd);
4572 
4573 done:
4574 	nfs_rw_exit(&rp->r_lkserlock);
4575 	return (rc);
4576 }
4577 
4578 /*
4579  * Free storage space associated with the specified vnode.  The portion
4580  * to be freed is specified by bfp->l_start and bfp->l_len (already
4581  * normalized to a "whence" of 0).
4582  *
4583  * This is an experimental facility whose continued existence is not
4584  * guaranteed.  Currently, we only support the special case
4585  * of l_len == 0, meaning free to end of file.
4586  */
4587 /* ARGSUSED */
4588 static int
4589 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4590 	offset_t offset, cred_t *cr, caller_context_t *ct)
4591 {
4592 	int error;
4593 
4594 	ASSERT(vp->v_type == VREG);
4595 	if (cmd != F_FREESP)
4596 		return (EINVAL);
4597 
4598 	if (offset > MAXOFF32_T)
4599 		return (EFBIG);
4600 
4601 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4602 	    (bfp->l_len > MAXOFF32_T))
4603 		return (EFBIG);
4604 
4605 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4606 		return (EIO);
4607 
4608 	error = convoff(vp, bfp, 0, offset);
4609 	if (!error) {
4610 		ASSERT(bfp->l_start >= 0);
4611 		if (bfp->l_len == 0) {
4612 			struct vattr va;
4613 
4614 			/*
4615 			 * ftruncate should not change the ctime and
4616 			 * mtime if we truncate the file to its
4617 			 * previous size.
4618 			 */
4619 			va.va_mask = AT_SIZE;
4620 			error = nfsgetattr(vp, &va, cr);
4621 			if (error || va.va_size == bfp->l_start)
4622 				return (error);
4623 			va.va_mask = AT_SIZE;
4624 			va.va_size = bfp->l_start;
4625 			error = nfssetattr(vp, &va, 0, cr);
4626 
4627 			if (error == 0 && bfp->l_start == 0)
4628 				vnevent_truncate(vp, ct);
4629 		} else
4630 			error = EINVAL;
4631 	}
4632 
4633 	return (error);
4634 }
4635 
4636 /* ARGSUSED */
4637 static int
4638 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4639 {
4640 
4641 	return (EINVAL);
4642 }
4643 
4644 /*
4645  * Setup and add an address space callback to do the work of the delmap call.
4646  * The callback will (and must be) deleted in the actual callback function.
4647  *
4648  * This is done in order to take care of the problem that we have with holding
4649  * the address space's a_lock for a long period of time (e.g. if the NFS server
4650  * is down).  Callbacks will be executed in the address space code while the
4651  * a_lock is not held.	Holding the address space's a_lock causes things such
4652  * as ps and fork to hang because they are trying to acquire this lock as well.
4653  */
4654 /* ARGSUSED */
4655 static int
4656 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4657 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4658 	caller_context_t *ct)
4659 {
4660 	int			caller_found;
4661 	int			error;
4662 	rnode_t			*rp;
4663 	nfs_delmap_args_t	*dmapp;
4664 	nfs_delmapcall_t	*delmap_call;
4665 
4666 	if (vp->v_flag & VNOMAP)
4667 		return (ENOSYS);
4668 	/*
4669 	 * A process may not change zones if it has NFS pages mmap'ed
4670 	 * in, so we can't legitimately get here from the wrong zone.
4671 	 */
4672 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4673 
4674 	rp = VTOR(vp);
4675 
4676 	/*
4677 	 * The way that the address space of this process deletes its mapping
4678 	 * of this file is via the following call chains:
4679 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4680 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4681 	 *
4682 	 * With the use of address space callbacks we are allowed to drop the
4683 	 * address space lock, a_lock, while executing the NFS operations that
4684 	 * need to go over the wire.  Returning EAGAIN to the caller of this
4685 	 * function is what drives the execution of the callback that we add
4686 	 * below.  The callback will be executed by the address space code
4687 	 * after dropping the a_lock.  When the callback is finished, since
4688 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4689 	 * is called again on the same segment to finish the rest of the work
4690 	 * that needs to happen during unmapping.
4691 	 *
4692 	 * This action of calling back into the segment driver causes
4693 	 * nfs_delmap() to get called again, but since the callback was
4694 	 * already executed at this point, it already did the work and there
4695 	 * is nothing left for us to do.
4696 	 *
4697 	 * To Summarize:
4698 	 * - The first time nfs_delmap is called by the current thread is when
4699 	 * we add the caller associated with this delmap to the delmap caller
4700 	 * list, add the callback, and return EAGAIN.
4701 	 * - The second time in this call chain when nfs_delmap is called we
4702 	 * will find this caller in the delmap caller list and realize there
4703 	 * is no more work to do thus removing this caller from the list and
4704 	 * returning the error that was set in the callback execution.
4705 	 */
4706 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4707 	if (caller_found) {
4708 		/*
4709 		 * 'error' is from the actual delmap operations.  To avoid
4710 		 * hangs, we need to handle the return of EAGAIN differently
4711 		 * since this is what drives the callback execution.
4712 		 * In this case, we don't want to return EAGAIN and do the
4713 		 * callback execution because there are none to execute.
4714 		 */
4715 		if (error == EAGAIN)
4716 			return (0);
4717 		else
4718 			return (error);
4719 	}
4720 
4721 	/* current caller was not in the list */
4722 	delmap_call = nfs_init_delmapcall();
4723 
4724 	mutex_enter(&rp->r_statelock);
4725 	list_insert_tail(&rp->r_indelmap, delmap_call);
4726 	mutex_exit(&rp->r_statelock);
4727 
4728 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4729 
4730 	dmapp->vp = vp;
4731 	dmapp->off = off;
4732 	dmapp->addr = addr;
4733 	dmapp->len = len;
4734 	dmapp->prot = prot;
4735 	dmapp->maxprot = maxprot;
4736 	dmapp->flags = flags;
4737 	dmapp->cr = cr;
4738 	dmapp->caller = delmap_call;
4739 
4740 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4741 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4742 
4743 	return (error ? error : EAGAIN);
4744 }
4745 
4746 /*
4747  * Remove some pages from an mmap'd vnode.  Just update the
4748  * count of pages.  If doing close-to-open, then flush all
4749  * of the pages associated with this file.  Otherwise, start
4750  * an asynchronous page flush to write out any dirty pages.
4751  * This will also associate a credential with the rnode which
4752  * can be used to write the pages.
4753  */
4754 /* ARGSUSED */
4755 static void
4756 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4757 {
4758 	int			error;
4759 	rnode_t			*rp;
4760 	mntinfo_t		*mi;
4761 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4762 
4763 	rp = VTOR(dmapp->vp);
4764 	mi = VTOMI(dmapp->vp);
4765 
4766 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4767 	ASSERT(rp->r_mapcnt >= 0);
4768 
4769 	/*
4770 	 * Initiate a page flush if there are pages, the file system
4771 	 * was not mounted readonly, the segment was mapped shared, and
4772 	 * the pages themselves were writeable.
4773 	 */
4774 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4775 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4776 		mutex_enter(&rp->r_statelock);
4777 		rp->r_flags |= RDIRTY;
4778 		mutex_exit(&rp->r_statelock);
4779 		/*
4780 		 * If this is a cross-zone access a sync putpage won't work, so
4781 		 * the best we can do is try an async putpage.  That seems
4782 		 * better than something more draconian such as discarding the
4783 		 * dirty pages.
4784 		 */
4785 		if ((mi->mi_flags & MI_NOCTO) ||
4786 		    nfs_zone() != mi->mi_zone)
4787 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4788 			    B_ASYNC, dmapp->cr, NULL);
4789 		else
4790 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4791 			    0, dmapp->cr, NULL);
4792 		if (!error) {
4793 			mutex_enter(&rp->r_statelock);
4794 			error = rp->r_error;
4795 			rp->r_error = 0;
4796 			mutex_exit(&rp->r_statelock);
4797 		}
4798 	} else
4799 		error = 0;
4800 
4801 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4802 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4803 		    B_INVAL, dmapp->cr, NULL);
4804 
4805 	dmapp->caller->error = error;
4806 	(void) as_delete_callback(as, arg);
4807 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4808 }
4809 
4810 /* ARGSUSED */
4811 static int
4812 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4813 	caller_context_t *ct)
4814 {
4815 	int error = 0;
4816 
4817 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4818 		return (EIO);
4819 	/*
4820 	 * This looks a little weird because it's written in a general
4821 	 * manner but we make little use of cases.  If cntl() ever gets
4822 	 * widely used, the outer switch will make more sense.
4823 	 */
4824 
4825 	switch (cmd) {
4826 
4827 	/*
4828 	 * Large file spec - need to base answer new query with
4829 	 * hardcoded constant based on the protocol.
4830 	 */
4831 	case _PC_FILESIZEBITS:
4832 		*valp = 32;
4833 		return (0);
4834 
4835 	case _PC_LINK_MAX:
4836 	case _PC_NAME_MAX:
4837 	case _PC_PATH_MAX:
4838 	case _PC_SYMLINK_MAX:
4839 	case _PC_CHOWN_RESTRICTED:
4840 	case _PC_NO_TRUNC: {
4841 		mntinfo_t *mi;
4842 		struct pathcnf *pc;
4843 
4844 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4845 			return (EINVAL);
4846 		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4847 		switch (cmd) {
4848 		case _PC_LINK_MAX:
4849 			*valp = pc->pc_link_max;
4850 			break;
4851 		case _PC_NAME_MAX:
4852 			*valp = pc->pc_name_max;
4853 			break;
4854 		case _PC_PATH_MAX:
4855 		case _PC_SYMLINK_MAX:
4856 			*valp = pc->pc_path_max;
4857 			break;
4858 		case _PC_CHOWN_RESTRICTED:
4859 			/*
4860 			 * if we got here, error is really a boolean which
4861 			 * indicates whether cmd is set or not.
4862 			 */
4863 			*valp = error ? 1 : 0;	/* see above */
4864 			error = 0;
4865 			break;
4866 		case _PC_NO_TRUNC:
4867 			/*
4868 			 * if we got here, error is really a boolean which
4869 			 * indicates whether cmd is set or not.
4870 			 */
4871 			*valp = error ? 1 : 0;	/* see above */
4872 			error = 0;
4873 			break;
4874 		}
4875 		return (error ? EINVAL : 0);
4876 		}
4877 
4878 	case _PC_XATTR_EXISTS:
4879 		*valp = 0;
4880 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4881 			vnode_t *avp;
4882 			rnode_t *rp;
4883 			mntinfo_t *mi = VTOMI(vp);
4884 
4885 			if (!(mi->mi_flags & MI_EXTATTR))
4886 				return (0);
4887 
4888 			rp = VTOR(vp);
4889 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4890 			    INTR(vp)))
4891 				return (EINTR);
4892 
4893 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4894 			if (error || avp == NULL)
4895 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4896 
4897 			nfs_rw_exit(&rp->r_rwlock);
4898 
4899 			if (error == 0 && avp != NULL) {
4900 				error = do_xattr_exists_check(avp, valp, cr);
4901 				VN_RELE(avp);
4902 			}
4903 		}
4904 		return (error ? EINVAL : 0);
4905 
4906 	case _PC_ACL_ENABLED:
4907 		*valp = _ACL_ACLENT_ENABLED;
4908 		return (0);
4909 
4910 	default:
4911 		return (EINVAL);
4912 	}
4913 }
4914 
4915 /*
4916  * Called by async thread to do synchronous pageio. Do the i/o, wait
4917  * for it to complete, and cleanup the page list when done.
4918  */
4919 static int
4920 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4921 	int flags, cred_t *cr)
4922 {
4923 	int error;
4924 
4925 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4926 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4927 	if (flags & B_READ)
4928 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4929 	else
4930 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4931 	return (error);
4932 }
4933 
4934 /* ARGSUSED */
4935 static int
4936 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4937 	int flags, cred_t *cr, caller_context_t *ct)
4938 {
4939 	int error;
4940 	rnode_t *rp;
4941 
4942 	if (pp == NULL)
4943 		return (EINVAL);
4944 
4945 	if (io_off > MAXOFF32_T)
4946 		return (EFBIG);
4947 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4948 		return (EIO);
4949 	rp = VTOR(vp);
4950 	mutex_enter(&rp->r_statelock);
4951 	rp->r_count++;
4952 	mutex_exit(&rp->r_statelock);
4953 
4954 	if (flags & B_ASYNC) {
4955 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4956 		    nfs_sync_pageio);
4957 	} else
4958 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4959 	mutex_enter(&rp->r_statelock);
4960 	rp->r_count--;
4961 	cv_broadcast(&rp->r_cv);
4962 	mutex_exit(&rp->r_statelock);
4963 	return (error);
4964 }
4965 
4966 /* ARGSUSED */
4967 static int
4968 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4969 	caller_context_t *ct)
4970 {
4971 	int error;
4972 	mntinfo_t *mi;
4973 
4974 	mi = VTOMI(vp);
4975 
4976 	if (nfs_zone() != mi->mi_zone)
4977 		return (EIO);
4978 	if (mi->mi_flags & MI_ACL) {
4979 		error = acl_setacl2(vp, vsecattr, flag, cr);
4980 		if (mi->mi_flags & MI_ACL)
4981 			return (error);
4982 	}
4983 
4984 	return (ENOSYS);
4985 }
4986 
4987 /* ARGSUSED */
4988 static int
4989 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4990 	caller_context_t *ct)
4991 {
4992 	int error;
4993 	mntinfo_t *mi;
4994 
4995 	mi = VTOMI(vp);
4996 
4997 	if (nfs_zone() != mi->mi_zone)
4998 		return (EIO);
4999 	if (mi->mi_flags & MI_ACL) {
5000 		error = acl_getacl2(vp, vsecattr, flag, cr);
5001 		if (mi->mi_flags & MI_ACL)
5002 			return (error);
5003 	}
5004 
5005 	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
5006 }
5007 
5008 /* ARGSUSED */
5009 static int
5010 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
5011 	caller_context_t *ct)
5012 {
5013 	int error;
5014 	struct shrlock nshr;
5015 	struct nfs_owner nfs_owner;
5016 	netobj lm_fh;
5017 
5018 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5019 		return (EIO);
5020 
5021 	/*
5022 	 * check for valid cmd parameter
5023 	 */
5024 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
5025 		return (EINVAL);
5026 
5027 	/*
5028 	 * Check access permissions
5029 	 */
5030 	if (cmd == F_SHARE &&
5031 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
5032 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
5033 		return (EBADF);
5034 
5035 	/*
5036 	 * If the filesystem is mounted using local locking, pass the
5037 	 * request off to the local share code.
5038 	 */
5039 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
5040 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
5041 
5042 	switch (cmd) {
5043 	case F_SHARE:
5044 	case F_UNSHARE:
5045 		lm_fh.n_len = sizeof (fhandle_t);
5046 		lm_fh.n_bytes = (char *)VTOFH(vp);
5047 
5048 		/*
5049 		 * If passed an owner that is too large to fit in an
5050 		 * nfs_owner it is likely a recursive call from the
5051 		 * lock manager client and pass it straight through.  If
5052 		 * it is not a nfs_owner then simply return an error.
5053 		 */
5054 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
5055 			if (((struct nfs_owner *)shr->s_owner)->magic !=
5056 			    NFS_OWNER_MAGIC)
5057 				return (EINVAL);
5058 
5059 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
5060 				error = set_errno(error);
5061 			}
5062 			return (error);
5063 		}
5064 		/*
5065 		 * Remote share reservations owner is a combination of
5066 		 * a magic number, hostname, and the local owner
5067 		 */
5068 		bzero(&nfs_owner, sizeof (nfs_owner));
5069 		nfs_owner.magic = NFS_OWNER_MAGIC;
5070 		(void) strncpy(nfs_owner.hname, uts_nodename(),
5071 		    sizeof (nfs_owner.hname));
5072 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
5073 		nshr.s_access = shr->s_access;
5074 		nshr.s_deny = shr->s_deny;
5075 		nshr.s_sysid = 0;
5076 		nshr.s_pid = ttoproc(curthread)->p_pid;
5077 		nshr.s_own_len = sizeof (nfs_owner);
5078 		nshr.s_owner = (caddr_t)&nfs_owner;
5079 
5080 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
5081 			error = set_errno(error);
5082 		}
5083 
5084 		break;
5085 
5086 	case F_HASREMOTELOCKS:
5087 		/*
5088 		 * NFS client can't store remote locks itself
5089 		 */
5090 		shr->s_access = 0;
5091 		error = 0;
5092 		break;
5093 
5094 	default:
5095 		error = EINVAL;
5096 		break;
5097 	}
5098 
5099 	return (error);
5100 }
5101