xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_vnops.c (revision 992413f4053d9470046876b234fe094062b730b7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
26  *	All rights reserved.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/systm.h>
32 #include <sys/cred.h>
33 #include <sys/time.h>
34 #include <sys/vnode.h>
35 #include <sys/vfs.h>
36 #include <sys/vfs_opreg.h>
37 #include <sys/file.h>
38 #include <sys/filio.h>
39 #include <sys/uio.h>
40 #include <sys/buf.h>
41 #include <sys/mman.h>
42 #include <sys/pathname.h>
43 #include <sys/dirent.h>
44 #include <sys/debug.h>
45 #include <sys/vmsystm.h>
46 #include <sys/fcntl.h>
47 #include <sys/flock.h>
48 #include <sys/swap.h>
49 #include <sys/errno.h>
50 #include <sys/strsubr.h>
51 #include <sys/sysmacros.h>
52 #include <sys/kmem.h>
53 #include <sys/cmn_err.h>
54 #include <sys/pathconf.h>
55 #include <sys/utsname.h>
56 #include <sys/dnlc.h>
57 #include <sys/acl.h>
58 #include <sys/atomic.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61 
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/clnt.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/rnode.h>
69 #include <nfs/nfs_acl.h>
70 #include <nfs/lm.h>
71 
72 #include <vm/hat.h>
73 #include <vm/as.h>
74 #include <vm/page.h>
75 #include <vm/pvn.h>
76 #include <vm/seg.h>
77 #include <vm/seg_map.h>
78 #include <vm/seg_kpm.h>
79 #include <vm/seg_vn.h>
80 
81 #include <fs/fs_subr.h>
82 
83 #include <sys/ddi.h>
84 
/*
 * Forward declarations for the file-local helper routines: the over-the-wire
 * read/write/setattr/lookup/rename/readdir workers and the paging helpers
 * used by the vnode operations in this file.
 */
static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *);
static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
static int	nfs_bio(struct buf *, cred_t *);
static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static void	nfs_delmap_callback(struct as *, void *, uint_t);
106 
/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 *
 * NFS_EOF is a negative value, so it cannot be confused with a positive
 * errno.  NOTE(review): its users are outside this part of the file;
 * presumably it marks an end-of-file condition -- confirm against callers.
 * The replacement list is parenthesized so the macro expands safely inside
 * any larger expression.
 */
#define	NFS_EOF			(-98)
112 
113 /*
114  * These are the vnode ops routines which implement the vnode interface to
115  * the networked file system.  These routines just take their parameters,
116  * make them look networkish by putting the right info into interface structs,
117  * and then calling the appropriate remote routine(s) to do the work.
118  *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
120  * we purge the directory cache relative to that vnode.  This way, the
121  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
122  * more details on rnode locking.
123  */
124 
/*
 * Forward declarations for the static vnode operation routines defined in
 * this file.  Most of them are installed in nfs_vnodeops_template below;
 * nfs_accessx is declared here but does not appear in that table.
 */
static int	nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *,
			caller_context_t *);
static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
			caller_context_t *);
static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int	nfs_accessx(void *, int, cred_t *);
static int	nfs_readlink(vnode_t *, struct uio *, cred_t *,
			caller_context_t *);
static int	nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static void	nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
			int, vnode_t *, cred_t *, caller_context_t *,
			int *, pathname_t *);
static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int, caller_context_t *,
			vsecattr_t *);
static int	nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
			int);
static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, caller_context_t *, int, vsecattr_t *);
static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
			caller_context_t *, int);
static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *, caller_context_t *, int);
static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
			caller_context_t *, int);
static int	nfs_fid(vnode_t *, fid_t *, caller_context_t *);
static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *, caller_context_t *);
static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
			caller_context_t *);
static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *, caller_context_t *);
static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
			caller_context_t *);
static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *, caller_context_t *);
static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
			caller_context_t *);
194 
/*
 * Vnode operations vector for this file system; handed out by
 * nfs_getvnodeops().  It is not assigned in this part of the file --
 * presumably it is built from nfs_vnodeops_template during module
 * initialization (TODO confirm).
 */
struct vnodeops *nfs_vnodeops;

/*
 * Table pairing each vnode operation name with the routine that implements
 * it, terminated by a NULL/NULL entry.  All handlers are the nfs_* routines
 * from this file except VOPNAME_VNEVENT, which uses fs_vnevent_support.
 * nfs_dump has no prototype in this file's forward declarations, so it is
 * presumably declared in one of the included headers (verify).
 */
const fs_operation_def_t nfs_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs_close },
	VOPNAME_READ,		{ .vop_read = nfs_read },
	VOPNAME_WRITE,		{ .vop_write = nfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs_remove },
	VOPNAME_LINK,		{ .vop_link = nfs_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs_delmap },
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs_pageio },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs_shrlock },
	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
239 
/*
 * Return the current value of the nfs_vnodeops pointer.  This is the only
 * non-static function among the vnode-op definitions seen here, so other
 * modules can reach the vector through it.
 *
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs_getvnodeops(void)
{
	return (nfs_vnodeops);
}
248 
249 /* ARGSUSED */
250 static int
251 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
252 {
253 	int error;
254 	struct vattr va;
255 	rnode_t *rp;
256 	vnode_t *vp;
257 
258 	vp = *vpp;
259 	rp = VTOR(vp);
260 	if (nfs_zone() != VTOMI(vp)->mi_zone)
261 		return (EIO);
262 	mutex_enter(&rp->r_statelock);
263 	if (rp->r_cred == NULL) {
264 		crhold(cr);
265 		rp->r_cred = cr;
266 	}
267 	mutex_exit(&rp->r_statelock);
268 
269 	/*
270 	 * If there is no cached data or if close-to-open
271 	 * consistency checking is turned off, we can avoid
272 	 * the over the wire getattr.  Otherwise, if the
273 	 * file system is mounted readonly, then just verify
274 	 * the caches are up to date using the normal mechanism.
275 	 * Else, if the file is not mmap'd, then just mark
276 	 * the attributes as timed out.  They will be refreshed
277 	 * and the caches validated prior to being used.
278 	 * Else, the file system is mounted writeable so
279 	 * force an over the wire GETATTR in order to ensure
280 	 * that all cached data is valid.
281 	 */
282 	if (vp->v_count > 1 ||
283 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
284 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
285 		if (vn_is_readonly(vp))
286 			error = nfs_validate_caches(vp, cr);
287 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
288 			PURGE_ATTRCACHE(vp);
289 			error = 0;
290 		} else {
291 			va.va_mask = AT_ALL;
292 			error = nfs_getattr_otw(vp, &va, cr);
293 		}
294 	} else
295 		error = 0;
296 
297 	return (error);
298 }
299 
300 /* ARGSUSED */
301 static int
302 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
303 	caller_context_t *ct)
304 {
305 	rnode_t *rp;
306 	int error;
307 	struct vattr va;
308 
309 	/*
310 	 * zone_enter(2) prevents processes from changing zones with NFS files
311 	 * open; if we happen to get here from the wrong zone we can't do
312 	 * anything over the wire.
313 	 */
314 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
315 		/*
316 		 * We could attempt to clean up locks, except we're sure
317 		 * that the current process didn't acquire any locks on
318 		 * the file: any attempt to lock a file belong to another zone
319 		 * will fail, and one can't lock an NFS file and then change
320 		 * zones, as that fails too.
321 		 *
322 		 * Returning an error here is the sane thing to do.  A
323 		 * subsequent call to VN_RELE() which translates to a
324 		 * nfs_inactive() will clean up state: if the zone of the
325 		 * vnode's origin is still alive and kicking, an async worker
326 		 * thread will handle the request (from the correct zone), and
327 		 * everything (minus the final nfs_getattr_otw() call) should
328 		 * be OK. If the zone is going away nfs_async_inactive() will
329 		 * throw away cached pages inline.
330 		 */
331 		return (EIO);
332 	}
333 
334 	/*
335 	 * If we are using local locking for this filesystem, then
336 	 * release all of the SYSV style record locks.  Otherwise,
337 	 * we are doing network locking and we need to release all
338 	 * of the network locks.  All of the locks held by this
339 	 * process on this file are released no matter what the
340 	 * incoming reference count is.
341 	 */
342 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
343 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
344 		cleanshares(vp, ttoproc(curthread)->p_pid);
345 	} else
346 		nfs_lockrelease(vp, flag, offset, cr);
347 
348 	if (count > 1)
349 		return (0);
350 
351 	/*
352 	 * If the file has been `unlinked', then purge the
353 	 * DNLC so that this vnode will get reycled quicker
354 	 * and the .nfs* file on the server will get removed.
355 	 */
356 	rp = VTOR(vp);
357 	if (rp->r_unldvp != NULL)
358 		dnlc_purge_vp(vp);
359 
360 	/*
361 	 * If the file was open for write and there are pages,
362 	 * then if the file system was mounted using the "no-close-
363 	 *	to-open" semantics, then start an asynchronous flush
364 	 *	of the all of the pages in the file.
365 	 * else the file system was not mounted using the "no-close-
366 	 *	to-open" semantics, then do a synchronous flush and
367 	 *	commit of all of the dirty and uncommitted pages.
368 	 *
369 	 * The asynchronous flush of the pages in the "nocto" path
370 	 * mostly just associates a cred pointer with the rnode so
371 	 * writes which happen later will have a better chance of
372 	 * working.  It also starts the data being written to the
373 	 * server, but without unnecessarily delaying the application.
374 	 */
375 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
376 		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
377 			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
378 			    cr, ct);
379 			if (error == EAGAIN)
380 				error = 0;
381 		} else
382 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
383 		if (!error) {
384 			mutex_enter(&rp->r_statelock);
385 			error = rp->r_error;
386 			rp->r_error = 0;
387 			mutex_exit(&rp->r_statelock);
388 		}
389 	} else {
390 		mutex_enter(&rp->r_statelock);
391 		error = rp->r_error;
392 		rp->r_error = 0;
393 		mutex_exit(&rp->r_statelock);
394 	}
395 
396 	/*
397 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
398 	 * refresh the attribute cache with a set of attributes which
399 	 * weren't returned from a WRITE.  This will enable the close-
400 	 * to-open processing to work.
401 	 */
402 	if (rp->r_flags & RWRITEATTR)
403 		(void) nfs_getattr_otw(vp, &va, cr);
404 
405 	return (error);
406 }
407 
408 /* ARGSUSED */
409 static int
410 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
411 	caller_context_t *ct)
412 {
413 	rnode_t *rp;
414 	u_offset_t off;
415 	offset_t diff;
416 	int on;
417 	size_t n;
418 	caddr_t base;
419 	uint_t flags;
420 	int error;
421 	mntinfo_t *mi;
422 
423 	rp = VTOR(vp);
424 	mi = VTOMI(vp);
425 
426 	if (nfs_zone() != mi->mi_zone)
427 		return (EIO);
428 
429 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
430 
431 	if (vp->v_type != VREG)
432 		return (EISDIR);
433 
434 	if (uiop->uio_resid == 0)
435 		return (0);
436 
437 	if (uiop->uio_loffset > MAXOFF32_T)
438 		return (EFBIG);
439 
440 	if (uiop->uio_loffset < 0 ||
441 	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
442 		return (EINVAL);
443 
444 	/*
445 	 * Bypass VM if caching has been disabled (e.g., locking) or if
446 	 * using client-side direct I/O and the file is not mmap'd and
447 	 * there are no cached pages.
448 	 */
449 	if ((vp->v_flag & VNOCACHE) ||
450 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
451 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
452 	    !vn_has_cached_data(vp))) {
453 		size_t bufsize;
454 		size_t resid = 0;
455 
456 		/*
457 		 * Let's try to do read in as large a chunk as we can
458 		 * (Filesystem (NFS client) bsize if possible/needed).
459 		 * For V3, this is 32K and for V2, this is 8K.
460 		 */
461 		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
462 		base = kmem_alloc(bufsize, KM_SLEEP);
463 		do {
464 			n = MIN(uiop->uio_resid, bufsize);
465 			error = nfsread(vp, base, uiop->uio_offset, n,
466 			    &resid, cr);
467 			if (!error) {
468 				n -= resid;
469 				error = uiomove(base, n, UIO_READ, uiop);
470 			}
471 		} while (!error && uiop->uio_resid > 0 && n > 0);
472 		kmem_free(base, bufsize);
473 		return (error);
474 	}
475 
476 	error = 0;
477 
478 	do {
479 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
480 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
481 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
482 
483 		error = nfs_validate_caches(vp, cr);
484 		if (error)
485 			break;
486 
487 		mutex_enter(&rp->r_statelock);
488 		while (rp->r_flags & RINCACHEPURGE) {
489 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
490 				mutex_exit(&rp->r_statelock);
491 				return (EINTR);
492 			}
493 		}
494 		diff = rp->r_size - uiop->uio_loffset;
495 		mutex_exit(&rp->r_statelock);
496 		if (diff <= 0)
497 			break;
498 		if (diff < n)
499 			n = (size_t)diff;
500 
501 		if (vpm_enable) {
502 			/*
503 			 * Copy data.
504 			 */
505 			error = vpm_data_copy(vp, off + on, n, uiop,
506 			    1, NULL, 0, S_READ);
507 		} else {
508 			base = segmap_getmapflt(segkmap, vp, off + on, n,
509 			    1, S_READ);
510 			error = uiomove(base + on, n, UIO_READ, uiop);
511 		}
512 
513 		if (!error) {
514 			/*
515 			 * If read a whole block or read to eof,
516 			 * won't need this buffer again soon.
517 			 */
518 			mutex_enter(&rp->r_statelock);
519 			if (n + on == MAXBSIZE ||
520 			    uiop->uio_loffset == rp->r_size)
521 				flags = SM_DONTNEED;
522 			else
523 				flags = 0;
524 			mutex_exit(&rp->r_statelock);
525 			if (vpm_enable) {
526 				error = vpm_sync_pages(vp, off, n, flags);
527 			} else {
528 				error = segmap_release(segkmap, base, flags);
529 			}
530 		} else {
531 			if (vpm_enable) {
532 				(void) vpm_sync_pages(vp, off, n, 0);
533 			} else {
534 				(void) segmap_release(segkmap, base, 0);
535 			}
536 		}
537 	} while (!error && uiop->uio_resid > 0);
538 
539 	return (error);
540 }
541 
/*
 * nfs_write - write entry point for the vnode interface.
 *
 * Checks the request against the 32-bit offset range (MAXOFF32_T) and the
 * caller's file size limit (uio_llimit), then writes the data either
 * directly over the wire through a temporary kernel buffer (the nfs_fwrite
 * path) or through the page cache one MAXBSIZE window at a time.
 *
 * Returns 0 or an errno value.  Once r_lkserlock has been taken, every
 * exit goes through "bottom": on error uio_resid and uio_loffset are
 * rolled back to the values saved at the start of the failed pass, and in
 * all cases the part of the request that was trimmed off to honor the
 * size limit ("remainder") is added back to uio_resid.
 */
/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	rlim_t limit;
	mntinfo_t *mi;

	rp = VTOR(vp);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		/* Append at the file's current size. */
		va.va_mask = AT_SIZE;
		error = nfsgetattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	/* offset is (for now) the end of the requested range. */
	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
		return (EINVAL);

	/* Never allow the limit to exceed the 32-bit offset range. */
	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
		limit = MAXOFF32_T;
	} else {
		limit = (rlim_t)uiop->uio_llimit;
	}

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_offset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			/*
			 * Already at or beyond the limit: restore the
			 * residual count, take the RLIMIT_FSIZE resource
			 * control action and fail the write.
			 */
			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	/* Held as reader until "bottom"; no early returns past here. */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
	    !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		uint_t org_offset;

		/*
		 * Also entered by goto from the cached path below when
		 * faulting in a page fails with EACCES.
		 */
nfs_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			/*
			 * Save the uio position so that "bottom" can roll
			 * it back if this pass fails.
			 */
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_offset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfswrite(vp, base, org_offset,
				    count, cr);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* Saved for the rollback at "bottom" if this pass fails. */
		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0) {
			if (INTR(vp)) {
				klwp_t *lwp = ttolwp(curthread);

				/*
				 * lwp_nostop is raised only for the
				 * duration of the interruptible wait.
				 */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
					mutex_exit(&rp->r_statelock);
					if (lwp != NULL)
						lwp->lwp_nostop--;
					error = EINTR;
					goto bottom;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)n, uiop);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else  {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				/*
				 * pagecreate is set for a page-aligned
				 * write that covers a whole page or
				 * reaches r_size or beyond; the mapping
				 * is then requested without forcing a
				 * fault (!pagecreate).
				 */
				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			/*
			 * Synchronous writes, and files flagged
			 * ROUTOFSPACE, are always written without SM_ASYNC.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	/*
	 * On failure restore the uio to where the failed pass started;
	 * either way give back the bytes withheld for the size limit.
	 */
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}
816 
817 /*
818  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
819  */
820 static int
821 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
822 	int flags, cred_t *cr)
823 {
824 	struct buf *bp;
825 	int error;
826 
827 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
828 	bp = pageio_setup(pp, len, vp, flags);
829 	ASSERT(bp != NULL);
830 
831 	/*
832 	 * pageio_setup should have set b_addr to 0.  This
833 	 * is correct since we want to do I/O on a page
834 	 * boundary.  bp_mapin will use this addr to calculate
835 	 * an offset, and then set b_addr to the kernel virtual
836 	 * address it allocated for us.
837 	 */
838 	ASSERT(bp->b_un.b_addr == 0);
839 
840 	bp->b_edev = 0;
841 	bp->b_dev = 0;
842 	bp->b_lblkno = lbtodb(off);
843 	bp->b_file = vp;
844 	bp->b_offset = (offset_t)off;
845 	bp_mapin(bp);
846 
847 	error = nfs_bio(bp, cr);
848 
849 	bp_mapout(bp);
850 	pageio_done(bp);
851 
852 	return (error);
853 }
854 
855 /*
856  * Write to file.  Writes to remote server in largest size
857  * chunks that the server can handle.  Write is synchronous.
858  */
859 static int
860 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
861 {
862 	rnode_t *rp;
863 	mntinfo_t *mi;
864 	struct nfswriteargs wa;
865 	struct nfsattrstat ns;
866 	int error;
867 	int tsize;
868 	int douprintf;
869 
870 	douprintf = 1;
871 
872 	rp = VTOR(vp);
873 	mi = VTOMI(vp);
874 
875 	ASSERT(nfs_zone() == mi->mi_zone);
876 
877 	wa.wa_args = &wa.wa_args_buf;
878 	wa.wa_fhandle = *VTOFH(vp);
879 
880 	do {
881 		tsize = MIN(mi->mi_curwrite, count);
882 		wa.wa_data = base;
883 		wa.wa_begoff = offset;
884 		wa.wa_totcount = tsize;
885 		wa.wa_count = tsize;
886 		wa.wa_offset = offset;
887 
888 		if (mi->mi_io_kstats) {
889 			mutex_enter(&mi->mi_lock);
890 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
891 			mutex_exit(&mi->mi_lock);
892 		}
893 		wa.wa_mblk = NULL;
894 		do {
895 			error = rfs2call(mi, RFS_WRITE,
896 			    xdr_writeargs, (caddr_t)&wa,
897 			    xdr_attrstat, (caddr_t)&ns, cr,
898 			    &douprintf, &ns.ns_status, 0, NULL);
899 		} while (error == ENFS_TRYAGAIN);
900 		if (mi->mi_io_kstats) {
901 			mutex_enter(&mi->mi_lock);
902 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
903 			mutex_exit(&mi->mi_lock);
904 		}
905 
906 		if (!error) {
907 			error = geterrno(ns.ns_status);
908 			/*
909 			 * Can't check for stale fhandle and purge caches
910 			 * here because pages are held by nfs_getpage.
911 			 * Just mark the attribute cache as timed out
912 			 * and set RWRITEATTR to indicate that the file
913 			 * was modified with a WRITE operation.
914 			 */
915 			if (!error) {
916 				count -= tsize;
917 				base += tsize;
918 				offset += tsize;
919 				if (mi->mi_io_kstats) {
920 					mutex_enter(&mi->mi_lock);
921 					KSTAT_IO_PTR(mi->mi_io_kstats)->
922 					    writes++;
923 					KSTAT_IO_PTR(mi->mi_io_kstats)->
924 					    nwritten += tsize;
925 					mutex_exit(&mi->mi_lock);
926 				}
927 				lwp_stat_update(LWP_STAT_OUBLK, 1);
928 				mutex_enter(&rp->r_statelock);
929 				PURGE_ATTRCACHE_LOCKED(rp);
930 				rp->r_flags |= RWRITEATTR;
931 				mutex_exit(&rp->r_statelock);
932 			}
933 		}
934 	} while (!error && count);
935 
936 	return (error);
937 }
938 
939 /*
940  * Read from a file.  Reads data in largest chunks our interface can handle.
941  */
942 static int
943 nfsread(vnode_t *vp, caddr_t base, uint_t offset,
944     int count, size_t *residp, cred_t *cr)
945 {
946 	mntinfo_t *mi;
947 	struct nfsreadargs ra;
948 	struct nfsrdresult rr;
949 	int tsize;
950 	int error;
951 	int douprintf;
952 	failinfo_t fi;
953 	rnode_t *rp;
954 	struct vattr va;
955 	hrtime_t t;
956 
957 	rp = VTOR(vp);
958 	mi = VTOMI(vp);
959 
960 	ASSERT(nfs_zone() == mi->mi_zone);
961 
962 	douprintf = 1;
963 
964 	ra.ra_fhandle = *VTOFH(vp);
965 
966 	fi.vp = vp;
967 	fi.fhp = (caddr_t)&ra.ra_fhandle;
968 	fi.copyproc = nfscopyfh;
969 	fi.lookupproc = nfslookup;
970 	fi.xattrdirproc = acl_getxattrdir2;
971 
972 	do {
973 		if (mi->mi_io_kstats) {
974 			mutex_enter(&mi->mi_lock);
975 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
976 			mutex_exit(&mi->mi_lock);
977 		}
978 
979 		do {
980 			tsize = MIN(mi->mi_curread, count);
981 			rr.rr_data = base;
982 			ra.ra_offset = offset;
983 			ra.ra_totcount = tsize;
984 			ra.ra_count = tsize;
985 			ra.ra_data = base;
986 			t = gethrtime();
987 			error = rfs2call(mi, RFS_READ,
988 			    xdr_readargs, (caddr_t)&ra,
989 			    xdr_rdresult, (caddr_t)&rr, cr,
990 			    &douprintf, &rr.rr_status, 0, &fi);
991 		} while (error == ENFS_TRYAGAIN);
992 
993 		if (mi->mi_io_kstats) {
994 			mutex_enter(&mi->mi_lock);
995 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
996 			mutex_exit(&mi->mi_lock);
997 		}
998 
999 		if (!error) {
1000 			error = geterrno(rr.rr_status);
1001 			if (!error) {
1002 				count -= rr.rr_count;
1003 				base += rr.rr_count;
1004 				offset += rr.rr_count;
1005 				if (mi->mi_io_kstats) {
1006 					mutex_enter(&mi->mi_lock);
1007 					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1008 					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
1009 					    rr.rr_count;
1010 					mutex_exit(&mi->mi_lock);
1011 				}
1012 				lwp_stat_update(LWP_STAT_INBLK, 1);
1013 			}
1014 		}
1015 	} while (!error && count && rr.rr_count == tsize);
1016 
1017 	*residp = count;
1018 
1019 	if (!error) {
1020 		/*
1021 		 * Since no error occurred, we have the current
1022 		 * attributes and we need to do a cache check and then
1023 		 * potentially update the cached attributes.  We can't
1024 		 * use the normal attribute check and cache mechanisms
1025 		 * because they might cause a cache flush which would
1026 		 * deadlock.  Instead, we just check the cache to see
1027 		 * if the attributes have changed.  If it is, then we
1028 		 * just mark the attributes as out of date.  The next
1029 		 * time that the attributes are checked, they will be
1030 		 * out of date, new attributes will be fetched, and
1031 		 * the page cache will be flushed.  If the attributes
1032 		 * weren't changed, then we just update the cached
1033 		 * attributes with these attributes.
1034 		 */
1035 		/*
1036 		 * If NFS_ACL is supported on the server, then the
1037 		 * attributes returned by server may have minimal
1038 		 * permissions sometimes denying access to users having
1039 		 * proper access.  To get the proper attributes, mark
1040 		 * the attributes as expired so that they will be
1041 		 * regotten via the NFS_ACL GETATTR2 procedure.
1042 		 */
1043 		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
1044 		mutex_enter(&rp->r_statelock);
1045 		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
1046 		    (mi->mi_flags & MI_ACL)) {
1047 			mutex_exit(&rp->r_statelock);
1048 			PURGE_ATTRCACHE(vp);
1049 		} else {
1050 			if (rp->r_mtime <= t) {
1051 				nfs_attrcache_va(vp, &va);
1052 			}
1053 			mutex_exit(&rp->r_statelock);
1054 		}
1055 	}
1056 
1057 	return (error);
1058 }
1059 
1060 /* ARGSUSED */
1061 static int
1062 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1063 	caller_context_t *ct)
1064 {
1065 
1066 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1067 		return (EIO);
1068 	switch (cmd) {
1069 		case _FIODIRECTIO:
1070 			return (nfs_directio(vp, (int)arg, cr));
1071 		default:
1072 			return (ENOTTY);
1073 	}
1074 }
1075 
1076 /* ARGSUSED */
1077 static int
1078 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1079 	caller_context_t *ct)
1080 {
1081 	int error;
1082 	rnode_t *rp;
1083 
1084 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1085 		return (EIO);
1086 	/*
1087 	 * If it has been specified that the return value will
1088 	 * just be used as a hint, and we are only being asked
1089 	 * for size, fsid or rdevid, then return the client's
1090 	 * notion of these values without checking to make sure
1091 	 * that the attribute cache is up to date.
1092 	 * The whole point is to avoid an over the wire GETATTR
1093 	 * call.
1094 	 */
1095 	rp = VTOR(vp);
1096 	if (flags & ATTR_HINT) {
1097 		if (vap->va_mask ==
1098 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1099 			mutex_enter(&rp->r_statelock);
1100 			if (vap->va_mask | AT_SIZE)
1101 				vap->va_size = rp->r_size;
1102 			if (vap->va_mask | AT_FSID)
1103 				vap->va_fsid = rp->r_attr.va_fsid;
1104 			if (vap->va_mask | AT_RDEV)
1105 				vap->va_rdev = rp->r_attr.va_rdev;
1106 			mutex_exit(&rp->r_statelock);
1107 			return (0);
1108 		}
1109 	}
1110 
1111 	/*
1112 	 * Only need to flush pages if asking for the mtime
1113 	 * and if there any dirty pages or any outstanding
1114 	 * asynchronous (write) requests for this file.
1115 	 */
1116 	if (vap->va_mask & AT_MTIME) {
1117 		if (vn_has_cached_data(vp) &&
1118 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1119 			mutex_enter(&rp->r_statelock);
1120 			rp->r_gcount++;
1121 			mutex_exit(&rp->r_statelock);
1122 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1123 			mutex_enter(&rp->r_statelock);
1124 			if (error && (error == ENOSPC || error == EDQUOT)) {
1125 				if (!rp->r_error)
1126 					rp->r_error = error;
1127 			}
1128 			if (--rp->r_gcount == 0)
1129 				cv_broadcast(&rp->r_cv);
1130 			mutex_exit(&rp->r_statelock);
1131 		}
1132 	}
1133 
1134 	return (nfsgetattr(vp, vap, cr));
1135 }
1136 
1137 /*ARGSUSED4*/
1138 static int
1139 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1140 		caller_context_t *ct)
1141 {
1142 	int error;
1143 	uint_t mask;
1144 	struct vattr va;
1145 
1146 	mask = vap->va_mask;
1147 
1148 	if (mask & AT_NOSET)
1149 		return (EINVAL);
1150 
1151 	if ((mask & AT_SIZE) &&
1152 	    vap->va_type == VREG &&
1153 	    vap->va_size > MAXOFF32_T)
1154 		return (EFBIG);
1155 
1156 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1157 		return (EIO);
1158 
1159 	va.va_mask = AT_UID | AT_MODE;
1160 
1161 	error = nfsgetattr(vp, &va, cr);
1162 	if (error)
1163 		return (error);
1164 
1165 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1166 	    vp);
1167 
1168 	if (error)
1169 		return (error);
1170 
1171 	return (nfssetattr(vp, vap, flags, cr));
1172 }
1173 
1174 static int
1175 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1176 {
1177 	int error;
1178 	uint_t mask;
1179 	struct nfssaargs args;
1180 	struct nfsattrstat ns;
1181 	int douprintf;
1182 	rnode_t *rp;
1183 	struct vattr va;
1184 	mode_t omode;
1185 	mntinfo_t *mi;
1186 	vsecattr_t *vsp;
1187 	hrtime_t t;
1188 
1189 	mask = vap->va_mask;
1190 
1191 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1192 
1193 	rp = VTOR(vp);
1194 
1195 	/*
1196 	 * Only need to flush pages if there are any pages and
1197 	 * if the file is marked as dirty in some fashion.  The
1198 	 * file must be flushed so that we can accurately
1199 	 * determine the size of the file and the cached data
1200 	 * after the SETATTR returns.  A file is considered to
1201 	 * be dirty if it is either marked with RDIRTY, has
1202 	 * outstanding i/o's active, or is mmap'd.  In this
1203 	 * last case, we can't tell whether there are dirty
1204 	 * pages, so we flush just to be sure.
1205 	 */
1206 	if (vn_has_cached_data(vp) &&
1207 	    ((rp->r_flags & RDIRTY) ||
1208 	    rp->r_count > 0 ||
1209 	    rp->r_mapcnt > 0)) {
1210 		ASSERT(vp->v_type != VCHR);
1211 		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1212 		if (error && (error == ENOSPC || error == EDQUOT)) {
1213 			mutex_enter(&rp->r_statelock);
1214 			if (!rp->r_error)
1215 				rp->r_error = error;
1216 			mutex_exit(&rp->r_statelock);
1217 		}
1218 	}
1219 
1220 	/*
1221 	 * If the system call was utime(2) or utimes(2) and the
1222 	 * application did not specify the times, then set the
1223 	 * mtime nanosecond field to 1 billion.  This will get
1224 	 * translated from 1 billion nanoseconds to 1 million
1225 	 * microseconds in the over the wire request.  The
1226 	 * server will use 1 million in the microsecond field
1227 	 * to tell whether both the mtime and atime should be
1228 	 * set to the server's current time.
1229 	 *
1230 	 * This is an overload of the protocol and should be
1231 	 * documented in the NFS Version 2 protocol specification.
1232 	 */
1233 	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1234 		vap->va_mtime.tv_nsec = 1000000000;
1235 		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1236 		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1237 			error = vattr_to_sattr(vap, &args.saa_sa);
1238 		} else {
1239 			/*
1240 			 * Use server times. vap time values will not be used.
1241 			 * To ensure no time overflow, make sure vap has
1242 			 * valid values, but retain the original values.
1243 			 */
1244 			timestruc_t	mtime = vap->va_mtime;
1245 			timestruc_t	atime = vap->va_atime;
1246 			time_t		now;
1247 
1248 			now = gethrestime_sec();
1249 			if (NFS_TIME_T_OK(now)) {
1250 				/* Just in case server does not know of this */
1251 				vap->va_mtime.tv_sec = now;
1252 				vap->va_atime.tv_sec = now;
1253 			} else {
1254 				vap->va_mtime.tv_sec = 0;
1255 				vap->va_atime.tv_sec = 0;
1256 			}
1257 			error = vattr_to_sattr(vap, &args.saa_sa);
1258 			/* set vap times back on */
1259 			vap->va_mtime = mtime;
1260 			vap->va_atime = atime;
1261 		}
1262 	} else {
1263 		/* Either do not set times or use the client specified times */
1264 		error = vattr_to_sattr(vap, &args.saa_sa);
1265 	}
1266 	if (error) {
1267 		/* req time field(s) overflow - return immediately */
1268 		return (error);
1269 	}
1270 	args.saa_fh = *VTOFH(vp);
1271 
1272 	va.va_mask = AT_MODE;
1273 	error = nfsgetattr(vp, &va, cr);
1274 	if (error)
1275 		return (error);
1276 	omode = va.va_mode;
1277 
1278 	mi = VTOMI(vp);
1279 
1280 	douprintf = 1;
1281 
1282 	t = gethrtime();
1283 
1284 	error = rfs2call(mi, RFS_SETATTR,
1285 	    xdr_saargs, (caddr_t)&args,
1286 	    xdr_attrstat, (caddr_t)&ns, cr,
1287 	    &douprintf, &ns.ns_status, 0, NULL);
1288 
1289 	/*
1290 	 * Purge the access cache and ACL cache if changing either the
1291 	 * owner of the file, the group owner, or the mode.  These may
1292 	 * change the access permissions of the file, so purge old
1293 	 * information and start over again.
1294 	 */
1295 	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1296 		(void) nfs_access_purge_rp(rp);
1297 		if (rp->r_secattr != NULL) {
1298 			mutex_enter(&rp->r_statelock);
1299 			vsp = rp->r_secattr;
1300 			rp->r_secattr = NULL;
1301 			mutex_exit(&rp->r_statelock);
1302 			if (vsp != NULL)
1303 				nfs_acl_free(vsp);
1304 		}
1305 	}
1306 
1307 	if (!error) {
1308 		error = geterrno(ns.ns_status);
1309 		if (!error) {
1310 			/*
1311 			 * If changing the size of the file, invalidate
1312 			 * any local cached data which is no longer part
1313 			 * of the file.  We also possibly invalidate the
1314 			 * last page in the file.  We could use
1315 			 * pvn_vpzero(), but this would mark the page as
1316 			 * modified and require it to be written back to
1317 			 * the server for no particularly good reason.
1318 			 * This way, if we access it, then we bring it
1319 			 * back in.  A read should be cheaper than a
1320 			 * write.
1321 			 */
1322 			if (mask & AT_SIZE) {
1323 				nfs_invalidate_pages(vp,
1324 				    (vap->va_size & PAGEMASK), cr);
1325 			}
1326 			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1327 			/*
1328 			 * If NFS_ACL is supported on the server, then the
1329 			 * attributes returned by server may have minimal
1330 			 * permissions sometimes denying access to users having
1331 			 * proper access.  To get the proper attributes, mark
1332 			 * the attributes as expired so that they will be
1333 			 * regotten via the NFS_ACL GETATTR2 procedure.
1334 			 */
1335 			if (mi->mi_flags & MI_ACL) {
1336 				PURGE_ATTRCACHE(vp);
1337 			}
1338 			/*
1339 			 * This next check attempts to deal with NFS
1340 			 * servers which can not handle increasing
1341 			 * the size of the file via setattr.  Most
1342 			 * of these servers do not return an error,
1343 			 * but do not change the size of the file.
1344 			 * Hence, this check and then attempt to set
1345 			 * the file size by writing 1 byte at the
1346 			 * offset of the end of the file that we need.
1347 			 */
1348 			if ((mask & AT_SIZE) &&
1349 			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1350 				char zb = '\0';
1351 
1352 				error = nfswrite(vp, &zb,
1353 				    vap->va_size - sizeof (zb),
1354 				    sizeof (zb), cr);
1355 			}
1356 			/*
1357 			 * Some servers will change the mode to clear the setuid
1358 			 * and setgid bits when changing the uid or gid.  The
1359 			 * client needs to compensate appropriately.
1360 			 */
1361 			if (mask & (AT_UID | AT_GID)) {
1362 				int terror;
1363 
1364 				va.va_mask = AT_MODE;
1365 				terror = nfsgetattr(vp, &va, cr);
1366 				if (!terror &&
1367 				    (((mask & AT_MODE) &&
1368 				    va.va_mode != vap->va_mode) ||
1369 				    (!(mask & AT_MODE) &&
1370 				    va.va_mode != omode))) {
1371 					va.va_mask = AT_MODE;
1372 					if (mask & AT_MODE)
1373 						va.va_mode = vap->va_mode;
1374 					else
1375 						va.va_mode = omode;
1376 					(void) nfssetattr(vp, &va, 0, cr);
1377 				}
1378 			}
1379 		} else {
1380 			PURGE_ATTRCACHE(vp);
1381 			PURGE_STALE_FH(error, vp, cr);
1382 		}
1383 	} else {
1384 		PURGE_ATTRCACHE(vp);
1385 	}
1386 
1387 	return (error);
1388 }
1389 
1390 static int
1391 nfs_accessx(void *vp, int mode, cred_t *cr)
1392 {
1393 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1394 	return (nfs_access(vp, mode, 0, cr, NULL));
1395 }
1396 
1397 /* ARGSUSED */
1398 static int
1399 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1400 {
1401 	struct vattr va;
1402 	int error;
1403 	mntinfo_t *mi;
1404 	int shift = 0;
1405 
1406 	mi = VTOMI(vp);
1407 
1408 	if (nfs_zone() != mi->mi_zone)
1409 		return (EIO);
1410 	if (mi->mi_flags & MI_ACL) {
1411 		error = acl_access2(vp, mode, flags, cr);
1412 		if (mi->mi_flags & MI_ACL)
1413 			return (error);
1414 	}
1415 
1416 	va.va_mask = AT_MODE | AT_UID | AT_GID;
1417 	error = nfsgetattr(vp, &va, cr);
1418 	if (error)
1419 		return (error);
1420 
1421 	/*
1422 	 * Disallow write attempts on read-only
1423 	 * file systems, unless the file is a
1424 	 * device node.
1425 	 */
1426 	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1427 		return (EROFS);
1428 
1429 	/*
1430 	 * Disallow attempts to access mandatory lock files.
1431 	 */
1432 	if ((mode & (VWRITE | VREAD | VEXEC)) &&
1433 	    MANDLOCK(vp, va.va_mode))
1434 		return (EACCES);
1435 
1436 	/*
1437 	 * Access check is based on only
1438 	 * one of owner, group, public.
1439 	 * If not owner, then check group.
1440 	 * If not a member of the group,
1441 	 * then check public access.
1442 	 */
1443 	if (crgetuid(cr) != va.va_uid) {
1444 		shift += 3;
1445 		if (!groupmember(va.va_gid, cr))
1446 			shift += 3;
1447 	}
1448 found:
1449 	mode &= ~(va.va_mode << shift);
1450 	if (mode == 0)
1451 		return (0);
1452 
1453 	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
1454 }
1455 
1456 static int nfs_do_symlink_cache = 1;
1457 
1458 /* ARGSUSED */
1459 static int
1460 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1461 {
1462 	int error;
1463 	struct nfsrdlnres rl;
1464 	rnode_t *rp;
1465 	int douprintf;
1466 	failinfo_t fi;
1467 
1468 	/*
1469 	 * We want to be consistent with UFS semantics so we will return
1470 	 * EINVAL instead of ENXIO. This violates the XNFS spec and
1471 	 * the RFC 1094, which are wrong any way. BUGID 1138002.
1472 	 */
1473 	if (vp->v_type != VLNK)
1474 		return (EINVAL);
1475 
1476 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1477 		return (EIO);
1478 
1479 	rp = VTOR(vp);
1480 	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1481 		error = nfs_validate_caches(vp, cr);
1482 		if (error)
1483 			return (error);
1484 		mutex_enter(&rp->r_statelock);
1485 		if (rp->r_symlink.contents != NULL) {
1486 			error = uiomove(rp->r_symlink.contents,
1487 			    rp->r_symlink.len, UIO_READ, uiop);
1488 			mutex_exit(&rp->r_statelock);
1489 			return (error);
1490 		}
1491 		mutex_exit(&rp->r_statelock);
1492 	}
1493 
1494 
1495 	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1496 
1497 	fi.vp = vp;
1498 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1499 	fi.copyproc = nfscopyfh;
1500 	fi.lookupproc = nfslookup;
1501 	fi.xattrdirproc = acl_getxattrdir2;
1502 
1503 	douprintf = 1;
1504 
1505 	error = rfs2call(VTOMI(vp), RFS_READLINK,
1506 	    xdr_readlink, (caddr_t)VTOFH(vp),
1507 	    xdr_rdlnres, (caddr_t)&rl, cr,
1508 	    &douprintf, &rl.rl_status, 0, &fi);
1509 
1510 	if (error) {
1511 
1512 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1513 		return (error);
1514 	}
1515 
1516 	error = geterrno(rl.rl_status);
1517 	if (!error) {
1518 		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1519 		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1520 			mutex_enter(&rp->r_statelock);
1521 			if (rp->r_symlink.contents == NULL) {
1522 				rp->r_symlink.contents = rl.rl_data;
1523 				rp->r_symlink.len = (int)rl.rl_count;
1524 				rp->r_symlink.size = NFS_MAXPATHLEN;
1525 				mutex_exit(&rp->r_statelock);
1526 			} else {
1527 				mutex_exit(&rp->r_statelock);
1528 
1529 				kmem_free((void *)rl.rl_data,
1530 				    NFS_MAXPATHLEN);
1531 			}
1532 		} else {
1533 
1534 			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1535 		}
1536 	} else {
1537 		PURGE_STALE_FH(error, vp, cr);
1538 
1539 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1540 	}
1541 
1542 	/*
1543 	 * Conform to UFS semantics (see comment above)
1544 	 */
1545 	return (error == ENXIO ? EINVAL : error);
1546 }
1547 
1548 /*
1549  * Flush local dirty pages to stable storage on the server.
1550  *
1551  * If FNODSYNC is specified, then there is nothing to do because
1552  * metadata changes are not cached on the client before being
1553  * sent to the server.
1554  */
1555 /* ARGSUSED */
1556 static int
1557 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1558 {
1559 	int error;
1560 
1561 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1562 		return (0);
1563 
1564 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1565 		return (EIO);
1566 
1567 	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1568 	if (!error)
1569 		error = VTOR(vp)->r_error;
1570 	return (error);
1571 }
1572 
1573 
1574 /*
1575  * Weirdness: if the file was removed or the target of a rename
1576  * operation while it was open, it got renamed instead.  Here we
1577  * remove the renamed file.
1578  */
1579 /* ARGSUSED */
1580 static void
1581 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1582 {
1583 	rnode_t *rp;
1584 
1585 	ASSERT(vp != DNLC_NO_VNODE);
1586 
1587 	/*
1588 	 * If this is coming from the wrong zone, we let someone in the right
1589 	 * zone take care of it asynchronously.  We can get here due to
1590 	 * VN_RELE() being called from pageout() or fsflush().  This call may
1591 	 * potentially turn into an expensive no-op if, for instance, v_count
1592 	 * gets incremented in the meantime, but it's still correct.
1593 	 */
1594 	if (nfs_zone() != VTOMI(vp)->mi_zone) {
1595 		nfs_async_inactive(vp, cr, nfs_inactive);
1596 		return;
1597 	}
1598 
1599 	rp = VTOR(vp);
1600 redo:
1601 	if (rp->r_unldvp != NULL) {
1602 		/*
1603 		 * Save the vnode pointer for the directory where the
1604 		 * unlinked-open file got renamed, then set it to NULL
1605 		 * to prevent another thread from getting here before
1606 		 * we're done with the remove.  While we have the
1607 		 * statelock, make local copies of the pertinent rnode
1608 		 * fields.  If we weren't to do this in an atomic way, the
1609 		 * the unl* fields could become inconsistent with respect
1610 		 * to each other due to a race condition between this
1611 		 * code and nfs_remove().  See bug report 1034328.
1612 		 */
1613 		mutex_enter(&rp->r_statelock);
1614 		if (rp->r_unldvp != NULL) {
1615 			vnode_t *unldvp;
1616 			char *unlname;
1617 			cred_t *unlcred;
1618 			struct nfsdiropargs da;
1619 			enum nfsstat status;
1620 			int douprintf;
1621 			int error;
1622 
1623 			unldvp = rp->r_unldvp;
1624 			rp->r_unldvp = NULL;
1625 			unlname = rp->r_unlname;
1626 			rp->r_unlname = NULL;
1627 			unlcred = rp->r_unlcred;
1628 			rp->r_unlcred = NULL;
1629 			mutex_exit(&rp->r_statelock);
1630 
1631 			/*
1632 			 * If there are any dirty pages left, then flush
1633 			 * them.  This is unfortunate because they just
1634 			 * may get thrown away during the remove operation,
1635 			 * but we have to do this for correctness.
1636 			 */
1637 			if (vn_has_cached_data(vp) &&
1638 			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1639 				ASSERT(vp->v_type != VCHR);
1640 				error = nfs_putpage(vp, (offset_t)0, 0, 0,
1641 				    cr, ct);
1642 				if (error) {
1643 					mutex_enter(&rp->r_statelock);
1644 					if (!rp->r_error)
1645 						rp->r_error = error;
1646 					mutex_exit(&rp->r_statelock);
1647 				}
1648 			}
1649 
1650 			/*
1651 			 * Do the remove operation on the renamed file
1652 			 */
1653 			setdiropargs(&da, unlname, unldvp);
1654 
1655 			douprintf = 1;
1656 
1657 			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1658 			    xdr_diropargs, (caddr_t)&da,
1659 			    xdr_enum, (caddr_t)&status, unlcred,
1660 			    &douprintf, &status, 0, NULL);
1661 
1662 			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1663 				nfs_purge_rddir_cache(unldvp);
1664 			PURGE_ATTRCACHE(unldvp);
1665 
1666 			/*
1667 			 * Release stuff held for the remove
1668 			 */
1669 			VN_RELE(unldvp);
1670 			kmem_free(unlname, MAXNAMELEN);
1671 			crfree(unlcred);
1672 			goto redo;
1673 		}
1674 		mutex_exit(&rp->r_statelock);
1675 	}
1676 
1677 	rp_addfree(rp, cr);
1678 }
1679 
1680 /*
1681  * Remote file system operations having to do with directory manipulation.
1682  */
1683 
1684 /* ARGSUSED */
1685 static int
1686 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1687 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1688 	int *direntflags, pathname_t *realpnp)
1689 {
1690 	int error;
1691 	vnode_t *vp;
1692 	vnode_t *avp = NULL;
1693 	rnode_t *drp;
1694 
1695 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1696 		return (EPERM);
1697 
1698 	drp = VTOR(dvp);
1699 
1700 	/*
1701 	 * Are we looking up extended attributes?  If so, "dvp" is
1702 	 * the file or directory for which we want attributes, and
1703 	 * we need a lookup of the hidden attribute directory
1704 	 * before we lookup the rest of the path.
1705 	 */
1706 	if (flags & LOOKUP_XATTR) {
1707 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1708 		mntinfo_t *mi;
1709 
1710 		mi = VTOMI(dvp);
1711 		if (!(mi->mi_flags & MI_EXTATTR))
1712 			return (EINVAL);
1713 
1714 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1715 			return (EINTR);
1716 
1717 		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1718 		if (avp == NULL)
1719 			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1720 		else
1721 			error = 0;
1722 
1723 		nfs_rw_exit(&drp->r_rwlock);
1724 
1725 		if (error) {
1726 			if (mi->mi_flags & MI_EXTATTR)
1727 				return (error);
1728 			return (EINVAL);
1729 		}
1730 		dvp = avp;
1731 		drp = VTOR(dvp);
1732 	}
1733 
1734 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1735 		error = EINTR;
1736 		goto out;
1737 	}
1738 
1739 	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1740 
1741 	nfs_rw_exit(&drp->r_rwlock);
1742 
1743 	/*
1744 	 * If vnode is a device, create special vnode.
1745 	 */
1746 	if (!error && IS_DEVVP(*vpp)) {
1747 		vp = *vpp;
1748 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1749 		VN_RELE(vp);
1750 	}
1751 
1752 out:
1753 	if (avp != NULL)
1754 		VN_RELE(avp);
1755 
1756 	return (error);
1757 }
1758 
/*
 * When non-zero, a lookup which fails over the wire with ENOENT is
 * remembered in the DNLC as a negative entry (see nfslookup_otw()).
 */
static int nfs_lookup_neg_cache = 1;

#ifdef DEBUG
/*
 * Counters maintained by nfslookup_dnlc().
 */
static int nfs_lookup_dnlc_lookups = 0;		/* calls made */
static int nfs_lookup_dnlc_hits = 0;		/* vnode returned */
static int nfs_lookup_dnlc_neg_hits = 0;	/* negative entry, ENOENT */
static int nfs_lookup_dnlc_misses = 0;		/* name not in the DNLC */
static int nfs_lookup_dnlc_disappears = 0;	/* entry gone on recheck */
#endif
1768 
1769 /* ARGSUSED */
1770 int
1771 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1772 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1773 {
1774 	int error;
1775 
1776 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1777 
1778 	/*
1779 	 * If lookup is for "", just return dvp.  Don't need
1780 	 * to send it over the wire, look it up in the dnlc,
1781 	 * or perform any access checks.
1782 	 */
1783 	if (*nm == '\0') {
1784 		VN_HOLD(dvp);
1785 		*vpp = dvp;
1786 		return (0);
1787 	}
1788 
1789 	/*
1790 	 * Can't do lookups in non-directories.
1791 	 */
1792 	if (dvp->v_type != VDIR)
1793 		return (ENOTDIR);
1794 
1795 	/*
1796 	 * If we're called with RFSCALL_SOFT, it's important that
1797 	 * the only rfscall is one we make directly; if we permit
1798 	 * an access call because we're looking up "." or validating
1799 	 * a dnlc hit, we'll deadlock because that rfscall will not
1800 	 * have the RFSCALL_SOFT set.
1801 	 */
1802 	if (rfscall_flags & RFSCALL_SOFT)
1803 		goto callit;
1804 
1805 	/*
1806 	 * If lookup is for ".", just return dvp.  Don't need
1807 	 * to send it over the wire or look it up in the dnlc,
1808 	 * just need to check access.
1809 	 */
1810 	if (strcmp(nm, ".") == 0) {
1811 		error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1812 		if (error)
1813 			return (error);
1814 		VN_HOLD(dvp);
1815 		*vpp = dvp;
1816 		return (0);
1817 	}
1818 
1819 	/*
1820 	 * Lookup this name in the DNLC.  If there was a valid entry,
1821 	 * then return the results of the lookup.
1822 	 */
1823 	error = nfslookup_dnlc(dvp, nm, vpp, cr);
1824 	if (error || *vpp != NULL)
1825 		return (error);
1826 
1827 callit:
1828 	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1829 
1830 	return (error);
1831 }
1832 
1833 static int
1834 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1835 {
1836 	int error;
1837 	vnode_t *vp;
1838 
1839 	ASSERT(*nm != '\0');
1840 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1841 
1842 	/*
1843 	 * Lookup this name in the DNLC.  If successful, then validate
1844 	 * the caches and then recheck the DNLC.  The DNLC is rechecked
1845 	 * just in case this entry got invalidated during the call
1846 	 * to nfs_validate_caches.
1847 	 *
1848 	 * An assumption is being made that it is safe to say that a
1849 	 * file exists which may not on the server.  Any operations to
1850 	 * the server will fail with ESTALE.
1851 	 */
1852 #ifdef DEBUG
1853 	nfs_lookup_dnlc_lookups++;
1854 #endif
1855 	vp = dnlc_lookup(dvp, nm);
1856 	if (vp != NULL) {
1857 		VN_RELE(vp);
1858 		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1859 			PURGE_ATTRCACHE(dvp);
1860 		}
1861 		error = nfs_validate_caches(dvp, cr);
1862 		if (error)
1863 			return (error);
1864 		vp = dnlc_lookup(dvp, nm);
1865 		if (vp != NULL) {
1866 			error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1867 			if (error) {
1868 				VN_RELE(vp);
1869 				return (error);
1870 			}
1871 			if (vp == DNLC_NO_VNODE) {
1872 				VN_RELE(vp);
1873 #ifdef DEBUG
1874 				nfs_lookup_dnlc_neg_hits++;
1875 #endif
1876 				return (ENOENT);
1877 			}
1878 			*vpp = vp;
1879 #ifdef DEBUG
1880 			nfs_lookup_dnlc_hits++;
1881 #endif
1882 			return (0);
1883 		}
1884 #ifdef DEBUG
1885 		nfs_lookup_dnlc_disappears++;
1886 #endif
1887 	}
1888 #ifdef DEBUG
1889 	else
1890 		nfs_lookup_dnlc_misses++;
1891 #endif
1892 
1893 	*vpp = NULL;
1894 
1895 	return (0);
1896 }
1897 
1898 static int
1899 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1900 	int rfscall_flags)
1901 {
1902 	int error;
1903 	struct nfsdiropargs da;
1904 	struct nfsdiropres dr;
1905 	int douprintf;
1906 	failinfo_t fi;
1907 	hrtime_t t;
1908 
1909 	ASSERT(*nm != '\0');
1910 	ASSERT(dvp->v_type == VDIR);
1911 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1912 
1913 	setdiropargs(&da, nm, dvp);
1914 
1915 	fi.vp = dvp;
1916 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1917 	fi.copyproc = nfscopyfh;
1918 	fi.lookupproc = nfslookup;
1919 	fi.xattrdirproc = acl_getxattrdir2;
1920 
1921 	douprintf = 1;
1922 
1923 	t = gethrtime();
1924 
1925 	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1926 	    xdr_diropargs, (caddr_t)&da,
1927 	    xdr_diropres, (caddr_t)&dr, cr,
1928 	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
1929 
1930 	if (!error) {
1931 		error = geterrno(dr.dr_status);
1932 		if (!error) {
1933 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1934 			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1935 			/*
1936 			 * If NFS_ACL is supported on the server, then the
1937 			 * attributes returned by server may have minimal
1938 			 * permissions sometimes denying access to users having
1939 			 * proper access.  To get the proper attributes, mark
1940 			 * the attributes as expired so that they will be
1941 			 * regotten via the NFS_ACL GETATTR2 procedure.
1942 			 */
1943 			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1944 				PURGE_ATTRCACHE(*vpp);
1945 			}
1946 			if (!(rfscall_flags & RFSCALL_SOFT))
1947 				dnlc_update(dvp, nm, *vpp);
1948 		} else {
1949 			PURGE_STALE_FH(error, dvp, cr);
1950 			if (error == ENOENT && nfs_lookup_neg_cache)
1951 				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1952 		}
1953 	}
1954 
1955 	return (error);
1956 }
1957 
1958 /* ARGSUSED */
1959 static int
1960 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1961 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1962 	vsecattr_t *vsecp)
1963 {
1964 	int error;
1965 	struct nfscreatargs args;
1966 	struct nfsdiropres dr;
1967 	int douprintf;
1968 	vnode_t *vp;
1969 	rnode_t *rp;
1970 	struct vattr vattr;
1971 	rnode_t *drp;
1972 	vnode_t *tempvp;
1973 	hrtime_t t;
1974 
1975 	drp = VTOR(dvp);
1976 
1977 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1978 		return (EPERM);
1979 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1980 		return (EINTR);
1981 
1982 	/*
1983 	 * We make a copy of the attributes because the caller does not
1984 	 * expect us to change what va points to.
1985 	 */
1986 	vattr = *va;
1987 
1988 	/*
1989 	 * If the pathname is "", just use dvp.  Don't need
1990 	 * to send it over the wire, look it up in the dnlc,
1991 	 * or perform any access checks.
1992 	 */
1993 	if (*nm == '\0') {
1994 		error = 0;
1995 		VN_HOLD(dvp);
1996 		vp = dvp;
1997 	/*
1998 	 * If the pathname is ".", just use dvp.  Don't need
1999 	 * to send it over the wire or look it up in the dnlc,
2000 	 * just need to check access.
2001 	 */
2002 	} else if (strcmp(nm, ".") == 0) {
2003 		error = nfs_access(dvp, VEXEC, 0, cr, ct);
2004 		if (error) {
2005 			nfs_rw_exit(&drp->r_rwlock);
2006 			return (error);
2007 		}
2008 		VN_HOLD(dvp);
2009 		vp = dvp;
2010 	/*
2011 	 * We need to go over the wire, just to be sure whether the
2012 	 * file exists or not.  Using the DNLC can be dangerous in
2013 	 * this case when making a decision regarding existence.
2014 	 */
2015 	} else {
2016 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2017 	}
2018 	if (!error) {
2019 		if (exclusive == EXCL)
2020 			error = EEXIST;
2021 		else if (vp->v_type == VDIR && (mode & VWRITE))
2022 			error = EISDIR;
2023 		else {
2024 			/*
2025 			 * If vnode is a device, create special vnode.
2026 			 */
2027 			if (IS_DEVVP(vp)) {
2028 				tempvp = vp;
2029 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2030 				VN_RELE(tempvp);
2031 			}
2032 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2033 				if ((vattr.va_mask & AT_SIZE) &&
2034 				    vp->v_type == VREG) {
2035 					vattr.va_mask = AT_SIZE;
2036 					error = nfssetattr(vp, &vattr, 0, cr);
2037 				}
2038 			}
2039 		}
2040 		nfs_rw_exit(&drp->r_rwlock);
2041 		if (error) {
2042 			VN_RELE(vp);
2043 		} else {
2044 			/*
2045 			 * existing file got truncated, notify.
2046 			 */
2047 			vnevent_create(vp, ct);
2048 			*vpp = vp;
2049 		}
2050 		return (error);
2051 	}
2052 
2053 	ASSERT(vattr.va_mask & AT_TYPE);
2054 	if (vattr.va_type == VREG) {
2055 		ASSERT(vattr.va_mask & AT_MODE);
2056 		if (MANDMODE(vattr.va_mode)) {
2057 			nfs_rw_exit(&drp->r_rwlock);
2058 			return (EACCES);
2059 		}
2060 	}
2061 
2062 	dnlc_remove(dvp, nm);
2063 
2064 	setdiropargs(&args.ca_da, nm, dvp);
2065 
2066 	/*
2067 	 * Decide what the group-id of the created file should be.
2068 	 * Set it in attribute list as advisory...then do a setattr
2069 	 * if the server didn't get it right the first time.
2070 	 */
2071 	error = setdirgid(dvp, &vattr.va_gid, cr);
2072 	if (error) {
2073 		nfs_rw_exit(&drp->r_rwlock);
2074 		return (error);
2075 	}
2076 	vattr.va_mask |= AT_GID;
2077 
2078 	/*
2079 	 * This is a completely gross hack to make mknod
2080 	 * work over the wire until we can wack the protocol
2081 	 */
2082 #define	IFCHR		0020000		/* character special */
2083 #define	IFBLK		0060000		/* block special */
2084 #define	IFSOCK		0140000		/* socket */
2085 
2086 	/*
2087 	 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
2088 	 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
2089 	 * bits in the minor number where 4.x supports 8 bits.  If the 5.x
2090 	 * minor/major numbers <= 8 bits long, compress the device
2091 	 * number before sending it. Otherwise, the 4.x server will not
2092 	 * create the device with the correct device number and nothing can be
2093 	 * done about this.
2094 	 */
2095 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2096 		dev_t d = vattr.va_rdev;
2097 		dev32_t dev32;
2098 
2099 		if (vattr.va_type == VCHR)
2100 			vattr.va_mode |= IFCHR;
2101 		else
2102 			vattr.va_mode |= IFBLK;
2103 
2104 		(void) cmpldev(&dev32, d);
2105 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2106 			vattr.va_size = (u_offset_t)dev32;
2107 		else
2108 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2109 
2110 		vattr.va_mask |= AT_MODE|AT_SIZE;
2111 	} else if (vattr.va_type == VFIFO) {
2112 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
2113 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
2114 		vattr.va_mask |= AT_MODE|AT_SIZE;
2115 	} else if (vattr.va_type == VSOCK) {
2116 		vattr.va_mode |= IFSOCK;
2117 		/*
2118 		 * To avoid triggering bugs in the servers set AT_SIZE
2119 		 * (all other RFS_CREATE calls set this).
2120 		 */
2121 		vattr.va_size = 0;
2122 		vattr.va_mask |= AT_MODE|AT_SIZE;
2123 	}
2124 
2125 	args.ca_sa = &args.ca_sa_buf;
2126 	error = vattr_to_sattr(&vattr, args.ca_sa);
2127 	if (error) {
2128 		/* req time field(s) overflow - return immediately */
2129 		nfs_rw_exit(&drp->r_rwlock);
2130 		return (error);
2131 	}
2132 
2133 	douprintf = 1;
2134 
2135 	t = gethrtime();
2136 
2137 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2138 	    xdr_creatargs, (caddr_t)&args,
2139 	    xdr_diropres, (caddr_t)&dr, cr,
2140 	    &douprintf, &dr.dr_status, 0, NULL);
2141 
2142 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2143 
2144 	if (!error) {
2145 		error = geterrno(dr.dr_status);
2146 		if (!error) {
2147 			if (HAVE_RDDIR_CACHE(drp))
2148 				nfs_purge_rddir_cache(dvp);
2149 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2150 			    dvp->v_vfsp, t, cr, NULL, NULL);
2151 			/*
2152 			 * If NFS_ACL is supported on the server, then the
2153 			 * attributes returned by server may have minimal
2154 			 * permissions sometimes denying access to users having
2155 			 * proper access.  To get the proper attributes, mark
2156 			 * the attributes as expired so that they will be
2157 			 * regotten via the NFS_ACL GETATTR2 procedure.
2158 			 */
2159 			if (VTOMI(vp)->mi_flags & MI_ACL) {
2160 				PURGE_ATTRCACHE(vp);
2161 			}
2162 			dnlc_update(dvp, nm, vp);
2163 			rp = VTOR(vp);
2164 			if (vattr.va_size == 0) {
2165 				mutex_enter(&rp->r_statelock);
2166 				rp->r_size = 0;
2167 				mutex_exit(&rp->r_statelock);
2168 				if (vn_has_cached_data(vp)) {
2169 					ASSERT(vp->v_type != VCHR);
2170 					nfs_invalidate_pages(vp,
2171 					    (u_offset_t)0, cr);
2172 				}
2173 			}
2174 
2175 			/*
2176 			 * Make sure the gid was set correctly.
2177 			 * If not, try to set it (but don't lose
2178 			 * any sleep over it).
2179 			 */
2180 			if (vattr.va_gid != rp->r_attr.va_gid) {
2181 				vattr.va_mask = AT_GID;
2182 				(void) nfssetattr(vp, &vattr, 0, cr);
2183 			}
2184 
2185 			/*
2186 			 * If vnode is a device create special vnode
2187 			 */
2188 			if (IS_DEVVP(vp)) {
2189 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2190 				VN_RELE(vp);
2191 			} else
2192 				*vpp = vp;
2193 		} else {
2194 			PURGE_STALE_FH(error, dvp, cr);
2195 		}
2196 	}
2197 
2198 	nfs_rw_exit(&drp->r_rwlock);
2199 
2200 	return (error);
2201 }
2202 
2203 /*
2204  * Weirdness: if the vnode to be removed is open
2205  * we rename it instead of removing it and nfs_inactive
2206  * will remove the new name.
2207  */
2208 /* ARGSUSED */
2209 static int
2210 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2211 {
2212 	int error;
2213 	struct nfsdiropargs da;
2214 	enum nfsstat status;
2215 	vnode_t *vp;
2216 	char *tmpname;
2217 	int douprintf;
2218 	rnode_t *rp;
2219 	rnode_t *drp;
2220 
2221 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2222 		return (EPERM);
2223 	drp = VTOR(dvp);
2224 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2225 		return (EINTR);
2226 
2227 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2228 	if (error) {
2229 		nfs_rw_exit(&drp->r_rwlock);
2230 		return (error);
2231 	}
2232 
2233 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2234 		VN_RELE(vp);
2235 		nfs_rw_exit(&drp->r_rwlock);
2236 		return (EPERM);
2237 	}
2238 
2239 	/*
2240 	 * First just remove the entry from the name cache, as it
2241 	 * is most likely the only entry for this vp.
2242 	 */
2243 	dnlc_remove(dvp, nm);
2244 
2245 	/*
2246 	 * If the file has a v_count > 1 then there may be more than one
2247 	 * entry in the name cache due multiple links or an open file,
2248 	 * but we don't have the real reference count so flush all
2249 	 * possible entries.
2250 	 */
2251 	if (vp->v_count > 1)
2252 		dnlc_purge_vp(vp);
2253 
2254 	/*
2255 	 * Now we have the real reference count on the vnode
2256 	 */
2257 	rp = VTOR(vp);
2258 	mutex_enter(&rp->r_statelock);
2259 	if (vp->v_count > 1 &&
2260 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2261 		mutex_exit(&rp->r_statelock);
2262 		tmpname = newname();
2263 		error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2264 		if (error)
2265 			kmem_free(tmpname, MAXNAMELEN);
2266 		else {
2267 			mutex_enter(&rp->r_statelock);
2268 			if (rp->r_unldvp == NULL) {
2269 				VN_HOLD(dvp);
2270 				rp->r_unldvp = dvp;
2271 				if (rp->r_unlcred != NULL)
2272 					crfree(rp->r_unlcred);
2273 				crhold(cr);
2274 				rp->r_unlcred = cr;
2275 				rp->r_unlname = tmpname;
2276 			} else {
2277 				kmem_free(rp->r_unlname, MAXNAMELEN);
2278 				rp->r_unlname = tmpname;
2279 			}
2280 			mutex_exit(&rp->r_statelock);
2281 		}
2282 	} else {
2283 		mutex_exit(&rp->r_statelock);
2284 		/*
2285 		 * We need to flush any dirty pages which happen to
2286 		 * be hanging around before removing the file.  This
2287 		 * shouldn't happen very often and mostly on file
2288 		 * systems mounted "nocto".
2289 		 */
2290 		if (vn_has_cached_data(vp) &&
2291 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2292 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2293 			if (error && (error == ENOSPC || error == EDQUOT)) {
2294 				mutex_enter(&rp->r_statelock);
2295 				if (!rp->r_error)
2296 					rp->r_error = error;
2297 				mutex_exit(&rp->r_statelock);
2298 			}
2299 		}
2300 
2301 		setdiropargs(&da, nm, dvp);
2302 
2303 		douprintf = 1;
2304 
2305 		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2306 		    xdr_diropargs, (caddr_t)&da,
2307 		    xdr_enum, (caddr_t)&status, cr,
2308 		    &douprintf, &status, 0, NULL);
2309 
2310 		/*
2311 		 * The xattr dir may be gone after last attr is removed,
2312 		 * so flush it from dnlc.
2313 		 */
2314 		if (dvp->v_flag & V_XATTRDIR)
2315 			dnlc_purge_vp(dvp);
2316 
2317 		PURGE_ATTRCACHE(dvp);	/* mod time changed */
2318 		PURGE_ATTRCACHE(vp);	/* link count changed */
2319 
2320 		if (!error) {
2321 			error = geterrno(status);
2322 			if (!error) {
2323 				if (HAVE_RDDIR_CACHE(drp))
2324 					nfs_purge_rddir_cache(dvp);
2325 			} else {
2326 				PURGE_STALE_FH(error, dvp, cr);
2327 			}
2328 		}
2329 	}
2330 
2331 	if (error == 0) {
2332 		vnevent_remove(vp, dvp, nm, ct);
2333 	}
2334 	VN_RELE(vp);
2335 
2336 	nfs_rw_exit(&drp->r_rwlock);
2337 
2338 	return (error);
2339 }
2340 
2341 /* ARGSUSED */
2342 static int
2343 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2344 	caller_context_t *ct, int flags)
2345 {
2346 	int error;
2347 	struct nfslinkargs args;
2348 	enum nfsstat status;
2349 	vnode_t *realvp;
2350 	int douprintf;
2351 	rnode_t *tdrp;
2352 
2353 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2354 		return (EPERM);
2355 	if (VOP_REALVP(svp, &realvp, ct) == 0)
2356 		svp = realvp;
2357 
2358 	args.la_from = VTOFH(svp);
2359 	setdiropargs(&args.la_to, tnm, tdvp);
2360 
2361 	tdrp = VTOR(tdvp);
2362 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2363 		return (EINTR);
2364 
2365 	dnlc_remove(tdvp, tnm);
2366 
2367 	douprintf = 1;
2368 
2369 	error = rfs2call(VTOMI(svp), RFS_LINK,
2370 	    xdr_linkargs, (caddr_t)&args,
2371 	    xdr_enum, (caddr_t)&status, cr,
2372 	    &douprintf, &status, 0, NULL);
2373 
2374 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2375 	PURGE_ATTRCACHE(svp);	/* link count changed */
2376 
2377 	if (!error) {
2378 		error = geterrno(status);
2379 		if (!error) {
2380 			if (HAVE_RDDIR_CACHE(tdrp))
2381 				nfs_purge_rddir_cache(tdvp);
2382 		}
2383 	}
2384 
2385 	nfs_rw_exit(&tdrp->r_rwlock);
2386 
2387 	if (!error) {
2388 		/*
2389 		 * Notify the source file of this link operation.
2390 		 */
2391 		vnevent_link(svp, ct);
2392 	}
2393 	return (error);
2394 }
2395 
2396 /* ARGSUSED */
2397 static int
2398 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2399 	caller_context_t *ct, int flags)
2400 {
2401 	vnode_t *realvp;
2402 
2403 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2404 		return (EPERM);
2405 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2406 		ndvp = realvp;
2407 
2408 	return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2409 }
2410 
2411 /*
2412  * nfsrename does the real work of renaming in NFS Version 2.
2413  */
2414 static int
2415 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2416     caller_context_t *ct)
2417 {
2418 	int error;
2419 	enum nfsstat status;
2420 	struct nfsrnmargs args;
2421 	int douprintf;
2422 	vnode_t *nvp = NULL;
2423 	vnode_t *ovp = NULL;
2424 	char *tmpname;
2425 	rnode_t *rp;
2426 	rnode_t *odrp;
2427 	rnode_t *ndrp;
2428 
2429 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2430 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2431 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2432 		return (EINVAL);
2433 
2434 	odrp = VTOR(odvp);
2435 	ndrp = VTOR(ndvp);
2436 	if ((intptr_t)odrp < (intptr_t)ndrp) {
2437 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2438 			return (EINTR);
2439 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2440 			nfs_rw_exit(&odrp->r_rwlock);
2441 			return (EINTR);
2442 		}
2443 	} else {
2444 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2445 			return (EINTR);
2446 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2447 			nfs_rw_exit(&ndrp->r_rwlock);
2448 			return (EINTR);
2449 		}
2450 	}
2451 
2452 	/*
2453 	 * Lookup the target file.  If it exists, it needs to be
2454 	 * checked to see whether it is a mount point and whether
2455 	 * it is active (open).
2456 	 */
2457 	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2458 	if (!error) {
2459 		/*
2460 		 * If this file has been mounted on, then just
2461 		 * return busy because renaming to it would remove
2462 		 * the mounted file system from the name space.
2463 		 */
2464 		if (vn_mountedvfs(nvp) != NULL) {
2465 			VN_RELE(nvp);
2466 			nfs_rw_exit(&odrp->r_rwlock);
2467 			nfs_rw_exit(&ndrp->r_rwlock);
2468 			return (EBUSY);
2469 		}
2470 
2471 		/*
2472 		 * Purge the name cache of all references to this vnode
2473 		 * so that we can check the reference count to infer
2474 		 * whether it is active or not.
2475 		 */
2476 		/*
2477 		 * First just remove the entry from the name cache, as it
2478 		 * is most likely the only entry for this vp.
2479 		 */
2480 		dnlc_remove(ndvp, nnm);
2481 		/*
2482 		 * If the file has a v_count > 1 then there may be more
2483 		 * than one entry in the name cache due multiple links
2484 		 * or an open file, but we don't have the real reference
2485 		 * count so flush all possible entries.
2486 		 */
2487 		if (nvp->v_count > 1)
2488 			dnlc_purge_vp(nvp);
2489 
2490 		/*
2491 		 * If the vnode is active and is not a directory,
2492 		 * arrange to rename it to a
2493 		 * temporary file so that it will continue to be
2494 		 * accessible.  This implements the "unlink-open-file"
2495 		 * semantics for the target of a rename operation.
2496 		 * Before doing this though, make sure that the
2497 		 * source and target files are not already the same.
2498 		 */
2499 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2500 			/*
2501 			 * Lookup the source name.
2502 			 */
2503 			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2504 			    cr, 0);
2505 
2506 			/*
2507 			 * The source name *should* already exist.
2508 			 */
2509 			if (error) {
2510 				VN_RELE(nvp);
2511 				nfs_rw_exit(&odrp->r_rwlock);
2512 				nfs_rw_exit(&ndrp->r_rwlock);
2513 				return (error);
2514 			}
2515 
2516 			/*
2517 			 * Compare the two vnodes.  If they are the same,
2518 			 * just release all held vnodes and return success.
2519 			 */
2520 			if (ovp == nvp) {
2521 				VN_RELE(ovp);
2522 				VN_RELE(nvp);
2523 				nfs_rw_exit(&odrp->r_rwlock);
2524 				nfs_rw_exit(&ndrp->r_rwlock);
2525 				return (0);
2526 			}
2527 
2528 			/*
2529 			 * Can't mix and match directories and non-
2530 			 * directories in rename operations.  We already
2531 			 * know that the target is not a directory.  If
2532 			 * the source is a directory, return an error.
2533 			 */
2534 			if (ovp->v_type == VDIR) {
2535 				VN_RELE(ovp);
2536 				VN_RELE(nvp);
2537 				nfs_rw_exit(&odrp->r_rwlock);
2538 				nfs_rw_exit(&ndrp->r_rwlock);
2539 				return (ENOTDIR);
2540 			}
2541 
2542 			/*
2543 			 * The target file exists, is not the same as
2544 			 * the source file, and is active.  Link it
2545 			 * to a temporary filename to avoid having
2546 			 * the server removing the file completely.
2547 			 */
2548 			tmpname = newname();
2549 			error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2550 			if (error == EOPNOTSUPP) {
2551 				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2552 				    cr, NULL, 0);
2553 			}
2554 			if (error) {
2555 				kmem_free(tmpname, MAXNAMELEN);
2556 				VN_RELE(ovp);
2557 				VN_RELE(nvp);
2558 				nfs_rw_exit(&odrp->r_rwlock);
2559 				nfs_rw_exit(&ndrp->r_rwlock);
2560 				return (error);
2561 			}
2562 			rp = VTOR(nvp);
2563 			mutex_enter(&rp->r_statelock);
2564 			if (rp->r_unldvp == NULL) {
2565 				VN_HOLD(ndvp);
2566 				rp->r_unldvp = ndvp;
2567 				if (rp->r_unlcred != NULL)
2568 					crfree(rp->r_unlcred);
2569 				crhold(cr);
2570 				rp->r_unlcred = cr;
2571 				rp->r_unlname = tmpname;
2572 			} else {
2573 				kmem_free(rp->r_unlname, MAXNAMELEN);
2574 				rp->r_unlname = tmpname;
2575 			}
2576 			mutex_exit(&rp->r_statelock);
2577 		}
2578 	}
2579 
2580 	if (ovp == NULL) {
2581 		/*
2582 		 * When renaming directories to be a subdirectory of a
2583 		 * different parent, the dnlc entry for ".." will no
2584 		 * longer be valid, so it must be removed.
2585 		 *
2586 		 * We do a lookup here to determine whether we are renaming
2587 		 * a directory and we need to check if we are renaming
2588 		 * an unlinked file.  This might have already been done
2589 		 * in previous code, so we check ovp == NULL to avoid
2590 		 * doing it twice.
2591 		 */
2592 
2593 		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2594 
2595 		/*
2596 		 * The source name *should* already exist.
2597 		 */
2598 		if (error) {
2599 			nfs_rw_exit(&odrp->r_rwlock);
2600 			nfs_rw_exit(&ndrp->r_rwlock);
2601 			if (nvp) {
2602 				VN_RELE(nvp);
2603 			}
2604 			return (error);
2605 		}
2606 		ASSERT(ovp != NULL);
2607 	}
2608 
2609 	dnlc_remove(odvp, onm);
2610 	dnlc_remove(ndvp, nnm);
2611 
2612 	setdiropargs(&args.rna_from, onm, odvp);
2613 	setdiropargs(&args.rna_to, nnm, ndvp);
2614 
2615 	douprintf = 1;
2616 
2617 	error = rfs2call(VTOMI(odvp), RFS_RENAME,
2618 	    xdr_rnmargs, (caddr_t)&args,
2619 	    xdr_enum, (caddr_t)&status, cr,
2620 	    &douprintf, &status, 0, NULL);
2621 
2622 	PURGE_ATTRCACHE(odvp);	/* mod time changed */
2623 	PURGE_ATTRCACHE(ndvp);	/* mod time changed */
2624 
2625 	if (!error) {
2626 		error = geterrno(status);
2627 		if (!error) {
2628 			if (HAVE_RDDIR_CACHE(odrp))
2629 				nfs_purge_rddir_cache(odvp);
2630 			if (HAVE_RDDIR_CACHE(ndrp))
2631 				nfs_purge_rddir_cache(ndvp);
2632 			/*
2633 			 * when renaming directories to be a subdirectory of a
2634 			 * different parent, the dnlc entry for ".." will no
2635 			 * longer be valid, so it must be removed
2636 			 */
2637 			rp = VTOR(ovp);
2638 			if (ndvp != odvp) {
2639 				if (ovp->v_type == VDIR) {
2640 					dnlc_remove(ovp, "..");
2641 					if (HAVE_RDDIR_CACHE(rp))
2642 						nfs_purge_rddir_cache(ovp);
2643 				}
2644 			}
2645 
2646 			/*
2647 			 * If we are renaming the unlinked file, update the
2648 			 * r_unldvp and r_unlname as needed.
2649 			 */
2650 			mutex_enter(&rp->r_statelock);
2651 			if (rp->r_unldvp != NULL) {
2652 				if (strcmp(rp->r_unlname, onm) == 0) {
2653 					(void) strncpy(rp->r_unlname,
2654 					    nnm, MAXNAMELEN);
2655 					rp->r_unlname[MAXNAMELEN - 1] = '\0';
2656 
2657 					if (ndvp != rp->r_unldvp) {
2658 						VN_RELE(rp->r_unldvp);
2659 						rp->r_unldvp = ndvp;
2660 						VN_HOLD(ndvp);
2661 					}
2662 				}
2663 			}
2664 			mutex_exit(&rp->r_statelock);
2665 		} else {
2666 			/*
2667 			 * System V defines rename to return EEXIST, not
2668 			 * ENOTEMPTY if the target directory is not empty.
2669 			 * Over the wire, the error is NFSERR_ENOTEMPTY
2670 			 * which geterrno maps to ENOTEMPTY.
2671 			 */
2672 			if (error == ENOTEMPTY)
2673 				error = EEXIST;
2674 		}
2675 	}
2676 
2677 	if (error == 0) {
2678 		if (nvp)
2679 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
2680 
2681 		if (odvp != ndvp)
2682 			vnevent_rename_dest_dir(ndvp, ct);
2683 
2684 		ASSERT(ovp != NULL);
2685 		vnevent_rename_src(ovp, odvp, onm, ct);
2686 	}
2687 
2688 	if (nvp) {
2689 		VN_RELE(nvp);
2690 	}
2691 	VN_RELE(ovp);
2692 
2693 	nfs_rw_exit(&odrp->r_rwlock);
2694 	nfs_rw_exit(&ndrp->r_rwlock);
2695 
2696 	return (error);
2697 }
2698 
2699 /* ARGSUSED */
2700 static int
2701 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2702 	caller_context_t *ct, int flags, vsecattr_t *vsecp)
2703 {
2704 	int error;
2705 	struct nfscreatargs args;
2706 	struct nfsdiropres dr;
2707 	int douprintf;
2708 	rnode_t *drp;
2709 	hrtime_t t;
2710 
2711 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2712 		return (EPERM);
2713 
2714 	setdiropargs(&args.ca_da, nm, dvp);
2715 
2716 	/*
2717 	 * Decide what the group-id and set-gid bit of the created directory
2718 	 * should be.  May have to do a setattr to get the gid right.
2719 	 */
2720 	error = setdirgid(dvp, &va->va_gid, cr);
2721 	if (error)
2722 		return (error);
2723 	error = setdirmode(dvp, &va->va_mode, cr);
2724 	if (error)
2725 		return (error);
2726 	va->va_mask |= AT_MODE|AT_GID;
2727 
2728 	args.ca_sa = &args.ca_sa_buf;
2729 	error = vattr_to_sattr(va, args.ca_sa);
2730 	if (error) {
2731 		/* req time field(s) overflow - return immediately */
2732 		return (error);
2733 	}
2734 
2735 	drp = VTOR(dvp);
2736 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2737 		return (EINTR);
2738 
2739 	dnlc_remove(dvp, nm);
2740 
2741 	douprintf = 1;
2742 
2743 	t = gethrtime();
2744 
2745 	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2746 	    xdr_creatargs, (caddr_t)&args,
2747 	    xdr_diropres, (caddr_t)&dr, cr,
2748 	    &douprintf, &dr.dr_status, 0, NULL);
2749 
2750 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2751 
2752 	if (!error) {
2753 		error = geterrno(dr.dr_status);
2754 		if (!error) {
2755 			if (HAVE_RDDIR_CACHE(drp))
2756 				nfs_purge_rddir_cache(dvp);
2757 			/*
2758 			 * The attributes returned by RFS_MKDIR can not
2759 			 * be depended upon, so mark the attribute cache
2760 			 * as purged.  A subsequent GETATTR will get the
2761 			 * correct attributes from the server.
2762 			 */
2763 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2764 			    dvp->v_vfsp, t, cr, NULL, NULL);
2765 			PURGE_ATTRCACHE(*vpp);
2766 			dnlc_update(dvp, nm, *vpp);
2767 
2768 			/*
2769 			 * Make sure the gid was set correctly.
2770 			 * If not, try to set it (but don't lose
2771 			 * any sleep over it).
2772 			 */
2773 			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2774 				va->va_mask = AT_GID;
2775 				(void) nfssetattr(*vpp, va, 0, cr);
2776 			}
2777 		} else {
2778 			PURGE_STALE_FH(error, dvp, cr);
2779 		}
2780 	}
2781 
2782 	nfs_rw_exit(&drp->r_rwlock);
2783 
2784 	return (error);
2785 }
2786 
2787 /* ARGSUSED */
2788 static int
2789 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2790 	caller_context_t *ct, int flags)
2791 {
2792 	int error;
2793 	enum nfsstat status;
2794 	struct nfsdiropargs da;
2795 	vnode_t *vp;
2796 	int douprintf;
2797 	rnode_t *drp;
2798 
2799 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2800 		return (EPERM);
2801 	drp = VTOR(dvp);
2802 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2803 		return (EINTR);
2804 
2805 	/*
2806 	 * Attempt to prevent a rmdir(".") from succeeding.
2807 	 */
2808 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2809 	if (error) {
2810 		nfs_rw_exit(&drp->r_rwlock);
2811 		return (error);
2812 	}
2813 
2814 	if (vp == cdir) {
2815 		VN_RELE(vp);
2816 		nfs_rw_exit(&drp->r_rwlock);
2817 		return (EINVAL);
2818 	}
2819 
2820 	setdiropargs(&da, nm, dvp);
2821 
2822 	/*
2823 	 * First just remove the entry from the name cache, as it
2824 	 * is most likely an entry for this vp.
2825 	 */
2826 	dnlc_remove(dvp, nm);
2827 
2828 	/*
2829 	 * If there vnode reference count is greater than one, then
2830 	 * there may be additional references in the DNLC which will
2831 	 * need to be purged.  First, trying removing the entry for
2832 	 * the parent directory and see if that removes the additional
2833 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2834 	 * to completely remove any references to the directory which
2835 	 * might still exist in the DNLC.
2836 	 */
2837 	if (vp->v_count > 1) {
2838 		dnlc_remove(vp, "..");
2839 		if (vp->v_count > 1)
2840 			dnlc_purge_vp(vp);
2841 	}
2842 
2843 	douprintf = 1;
2844 
2845 	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2846 	    xdr_diropargs, (caddr_t)&da,
2847 	    xdr_enum, (caddr_t)&status, cr,
2848 	    &douprintf, &status, 0, NULL);
2849 
2850 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2851 
2852 	if (error) {
2853 		VN_RELE(vp);
2854 		nfs_rw_exit(&drp->r_rwlock);
2855 		return (error);
2856 	}
2857 
2858 	error = geterrno(status);
2859 	if (!error) {
2860 		if (HAVE_RDDIR_CACHE(drp))
2861 			nfs_purge_rddir_cache(dvp);
2862 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
2863 			nfs_purge_rddir_cache(vp);
2864 	} else {
2865 		PURGE_STALE_FH(error, dvp, cr);
2866 		/*
2867 		 * System V defines rmdir to return EEXIST, not
2868 		 * ENOTEMPTY if the directory is not empty.  Over
2869 		 * the wire, the error is NFSERR_ENOTEMPTY which
2870 		 * geterrno maps to ENOTEMPTY.
2871 		 */
2872 		if (error == ENOTEMPTY)
2873 			error = EEXIST;
2874 	}
2875 
2876 	if (error == 0) {
2877 		vnevent_rmdir(vp, dvp, nm, ct);
2878 	}
2879 	VN_RELE(vp);
2880 
2881 	nfs_rw_exit(&drp->r_rwlock);
2882 
2883 	return (error);
2884 }
2885 
2886 /* ARGSUSED */
2887 static int
2888 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2889 	caller_context_t *ct, int flags)
2890 {
2891 	int error;
2892 	struct nfsslargs args;
2893 	enum nfsstat status;
2894 	int douprintf;
2895 	rnode_t *drp;
2896 
2897 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2898 		return (EPERM);
2899 	setdiropargs(&args.sla_from, lnm, dvp);
2900 	args.sla_sa = &args.sla_sa_buf;
2901 	error = vattr_to_sattr(tva, args.sla_sa);
2902 	if (error) {
2903 		/* req time field(s) overflow - return immediately */
2904 		return (error);
2905 	}
2906 	args.sla_tnm = tnm;
2907 
2908 	drp = VTOR(dvp);
2909 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2910 		return (EINTR);
2911 
2912 	dnlc_remove(dvp, lnm);
2913 
2914 	douprintf = 1;
2915 
2916 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2917 	    xdr_slargs, (caddr_t)&args,
2918 	    xdr_enum, (caddr_t)&status, cr,
2919 	    &douprintf, &status, 0, NULL);
2920 
2921 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2922 
2923 	if (!error) {
2924 		error = geterrno(status);
2925 		if (!error) {
2926 			if (HAVE_RDDIR_CACHE(drp))
2927 				nfs_purge_rddir_cache(dvp);
2928 		} else {
2929 			PURGE_STALE_FH(error, dvp, cr);
2930 		}
2931 	}
2932 
2933 	nfs_rw_exit(&drp->r_rwlock);
2934 
2935 	return (error);
2936 }
2937 
#if defined(DEBUG)
/*
 * Statistics kept by nfs_readdir on DEBUG kernels.
 */
/* requests served from a complete cache entry without an earlier miss */
static int nfs_readdir_cache_hits = 0;
/* requests short circuited because the offset was the end of directory */
static int nfs_readdir_cache_shorts = 0;
/* requests which waited for a cache entry that was being filled */
static int nfs_readdir_cache_waits = 0;
/* requests which had to go through nfsreaddir to fill an entry */
static int nfs_readdir_cache_misses = 0;
/* asynchronous readahead requests which were started */
static int nfs_readdir_readahead = 0;
#endif	/* DEBUG */

/*
 * When nonzero, nfs_readdir limits the size of a readdir request to
 * 0x400 bytes instead of NFS_MAXDATA.
 */
static int nfs_shrinkreaddir = 0;
2947 
2948 /*
2949  * Read directory entries.
2950  * There are some weird things to look out for here.  The uio_offset
2951  * field is either 0 or it is the offset returned from a previous
2952  * readdir.  It is an opaque value used by the server to find the
2953  * correct directory block to read. The count field is the number
2954  * of blocks to read on the server.  This is advisory only, the server
2955  * may return only one block's worth of entries.  Entries may be compressed
2956  * on the server.
2957  */
2958 /* ARGSUSED */
2959 static int
2960 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2961 	caller_context_t *ct, int flags)
2962 {
2963 	int error;
2964 	size_t count;
2965 	rnode_t *rp;
2966 	rddir_cache *rdc;
2967 	rddir_cache *nrdc;
2968 	rddir_cache *rrdc;
2969 #ifdef DEBUG
2970 	int missed;
2971 #endif
2972 	rddir_cache srdc;
2973 	avl_index_t where;
2974 
2975 	rp = VTOR(vp);
2976 
2977 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2978 	if (nfs_zone() != VTOMI(vp)->mi_zone)
2979 		return (EIO);
2980 	/*
2981 	 * Make sure that the directory cache is valid.
2982 	 */
2983 	if (HAVE_RDDIR_CACHE(rp)) {
2984 		if (nfs_disable_rddir_cache) {
2985 			/*
2986 			 * Setting nfs_disable_rddir_cache in /etc/system
2987 			 * allows interoperability with servers that do not
2988 			 * properly update the attributes of directories.
2989 			 * Any cached information gets purged before an
2990 			 * access is made to it.
2991 			 */
2992 			nfs_purge_rddir_cache(vp);
2993 		} else {
2994 			error = nfs_validate_caches(vp, cr);
2995 			if (error)
2996 				return (error);
2997 		}
2998 	}
2999 
3000 	/*
3001 	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
3002 	 * RFS_READDIR request with rda_count set to more than 0x400. So
3003 	 * we reduce the request size here purely for compatibility.
3004 	 *
3005 	 * In general, this is no longer required.  However, if a server
3006 	 * is discovered which can not handle requests larger than 1024,
3007 	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
3008 	 * compatibility.
3009 	 *
3010 	 * In any case, the request size is limited to NFS_MAXDATA bytes.
3011 	 */
3012 	count = MIN(uiop->uio_iov->iov_len,
3013 	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
3014 
3015 	nrdc = NULL;
3016 #ifdef DEBUG
3017 	missed = 0;
3018 #endif
3019 top:
3020 	/*
3021 	 * Short circuit last readdir which always returns 0 bytes.
3022 	 * This can be done after the directory has been read through
3023 	 * completely at least once.  This will set r_direof which
3024 	 * can be used to find the value of the last cookie.
3025 	 */
3026 	mutex_enter(&rp->r_statelock);
3027 	if (rp->r_direof != NULL &&
3028 	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3029 		mutex_exit(&rp->r_statelock);
3030 #ifdef DEBUG
3031 		nfs_readdir_cache_shorts++;
3032 #endif
3033 		if (eofp)
3034 			*eofp = 1;
3035 		if (nrdc != NULL)
3036 			rddir_cache_rele(nrdc);
3037 		return (0);
3038 	}
3039 	/*
3040 	 * Look for a cache entry.  Cache entries are identified
3041 	 * by the NFS cookie value and the byte count requested.
3042 	 */
3043 	srdc.nfs_cookie = uiop->uio_offset;
3044 	srdc.buflen = count;
3045 	rdc = avl_find(&rp->r_dir, &srdc, &where);
3046 	if (rdc != NULL) {
3047 		rddir_cache_hold(rdc);
3048 		/*
3049 		 * If the cache entry is in the process of being
3050 		 * filled in, wait until this completes.  The
3051 		 * RDDIRWAIT bit is set to indicate that someone
3052 		 * is waiting and then the thread currently
3053 		 * filling the entry is done, it should do a
3054 		 * cv_broadcast to wakeup all of the threads
3055 		 * waiting for it to finish.
3056 		 */
3057 		if (rdc->flags & RDDIR) {
3058 			nfs_rw_exit(&rp->r_rwlock);
3059 			rdc->flags |= RDDIRWAIT;
3060 #ifdef DEBUG
3061 			nfs_readdir_cache_waits++;
3062 #endif
3063 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3064 				/*
3065 				 * We got interrupted, probably
3066 				 * the user typed ^C or an alarm
3067 				 * fired.  We free the new entry
3068 				 * if we allocated one.
3069 				 */
3070 				mutex_exit(&rp->r_statelock);
3071 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
3072 				    RW_READER, FALSE);
3073 				rddir_cache_rele(rdc);
3074 				if (nrdc != NULL)
3075 					rddir_cache_rele(nrdc);
3076 				return (EINTR);
3077 			}
3078 			mutex_exit(&rp->r_statelock);
3079 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
3080 			    RW_READER, FALSE);
3081 			rddir_cache_rele(rdc);
3082 			goto top;
3083 		}
3084 		/*
3085 		 * Check to see if a readdir is required to
3086 		 * fill the entry.  If so, mark this entry
3087 		 * as being filled, remove our reference,
3088 		 * and branch to the code to fill the entry.
3089 		 */
3090 		if (rdc->flags & RDDIRREQ) {
3091 			rdc->flags &= ~RDDIRREQ;
3092 			rdc->flags |= RDDIR;
3093 			if (nrdc != NULL)
3094 				rddir_cache_rele(nrdc);
3095 			nrdc = rdc;
3096 			mutex_exit(&rp->r_statelock);
3097 			goto bottom;
3098 		}
3099 #ifdef DEBUG
3100 		if (!missed)
3101 			nfs_readdir_cache_hits++;
3102 #endif
3103 		/*
3104 		 * If an error occurred while attempting
3105 		 * to fill the cache entry, just return it.
3106 		 */
3107 		if (rdc->error) {
3108 			error = rdc->error;
3109 			mutex_exit(&rp->r_statelock);
3110 			rddir_cache_rele(rdc);
3111 			if (nrdc != NULL)
3112 				rddir_cache_rele(nrdc);
3113 			return (error);
3114 		}
3115 
3116 		/*
3117 		 * The cache entry is complete and good,
3118 		 * copyout the dirent structs to the calling
3119 		 * thread.
3120 		 */
3121 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3122 
3123 		/*
3124 		 * If no error occurred during the copyout,
3125 		 * update the offset in the uio struct to
3126 		 * contain the value of the next cookie
3127 		 * and set the eof value appropriately.
3128 		 */
3129 		if (!error) {
3130 			uiop->uio_offset = rdc->nfs_ncookie;
3131 			if (eofp)
3132 				*eofp = rdc->eof;
3133 		}
3134 
3135 		/*
3136 		 * Decide whether to do readahead.  Don't if
3137 		 * have already read to the end of directory.
3138 		 */
3139 		if (rdc->eof) {
3140 			rp->r_direof = rdc;
3141 			mutex_exit(&rp->r_statelock);
3142 			rddir_cache_rele(rdc);
3143 			if (nrdc != NULL)
3144 				rddir_cache_rele(nrdc);
3145 			return (error);
3146 		}
3147 
3148 		/*
3149 		 * Check to see whether we found an entry
3150 		 * for the readahead.  If so, we don't need
3151 		 * to do anything further, so free the new
3152 		 * entry if one was allocated.  Otherwise,
3153 		 * allocate a new entry, add it to the cache,
3154 		 * and then initiate an asynchronous readdir
3155 		 * operation to fill it.
3156 		 */
3157 		srdc.nfs_cookie = rdc->nfs_ncookie;
3158 		srdc.buflen = count;
3159 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3160 		if (rrdc != NULL) {
3161 			if (nrdc != NULL)
3162 				rddir_cache_rele(nrdc);
3163 		} else {
3164 			if (nrdc != NULL)
3165 				rrdc = nrdc;
3166 			else {
3167 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3168 			}
3169 			if (rrdc != NULL) {
3170 				rrdc->nfs_cookie = rdc->nfs_ncookie;
3171 				rrdc->buflen = count;
3172 				avl_insert(&rp->r_dir, rrdc, where);
3173 				rddir_cache_hold(rrdc);
3174 				mutex_exit(&rp->r_statelock);
3175 				rddir_cache_rele(rdc);
3176 #ifdef DEBUG
3177 				nfs_readdir_readahead++;
3178 #endif
3179 				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3180 				return (error);
3181 			}
3182 		}
3183 
3184 		mutex_exit(&rp->r_statelock);
3185 		rddir_cache_rele(rdc);
3186 		return (error);
3187 	}
3188 
3189 	/*
3190 	 * Didn't find an entry in the cache.  Construct a new empty
3191 	 * entry and link it into the cache.  Other processes attempting
3192 	 * to access this entry will need to wait until it is filled in.
3193 	 *
3194 	 * Since kmem_alloc may block, another pass through the cache
3195 	 * will need to be taken to make sure that another process
3196 	 * hasn't already added an entry to the cache for this request.
3197 	 */
3198 	if (nrdc == NULL) {
3199 		mutex_exit(&rp->r_statelock);
3200 		nrdc = rddir_cache_alloc(KM_SLEEP);
3201 		nrdc->nfs_cookie = uiop->uio_offset;
3202 		nrdc->buflen = count;
3203 		goto top;
3204 	}
3205 
3206 	/*
3207 	 * Add this entry to the cache.
3208 	 */
3209 	avl_insert(&rp->r_dir, nrdc, where);
3210 	rddir_cache_hold(nrdc);
3211 	mutex_exit(&rp->r_statelock);
3212 
3213 bottom:
3214 #ifdef DEBUG
3215 	missed = 1;
3216 	nfs_readdir_cache_misses++;
3217 #endif
3218 	/*
3219 	 * Do the readdir.
3220 	 */
3221 	error = nfsreaddir(vp, nrdc, cr);
3222 
3223 	/*
3224 	 * If this operation failed, just return the error which occurred.
3225 	 */
3226 	if (error != 0)
3227 		return (error);
3228 
3229 	/*
3230 	 * Since the RPC operation will have taken sometime and blocked
3231 	 * this process, another pass through the cache will need to be
3232 	 * taken to find the correct cache entry.  It is possible that
3233 	 * the correct cache entry will not be there (although one was
3234 	 * added) because the directory changed during the RPC operation
3235 	 * and the readdir cache was flushed.  In this case, just start
3236 	 * over.  It is hoped that this will not happen too often... :-)
3237 	 */
3238 	nrdc = NULL;
3239 	goto top;
3240 	/* NOTREACHED */
3241 }
3242 
/*
 * Fill the readdir cache entry rdc by sending an RFS_READDIR request for
 * the directory vp, starting at the cookie rdc->nfs_cookie and asking for
 * at most rdc->buflen bytes.
 *
 * On success rdc receives a newly allocated copy of the returned entries
 * together with the next cookie, the eof indication and the length of the
 * entries, and rdc->error is cleared.  On failure rdc->entries is set to
 * NULL and rdc->error records the error.
 *
 * In both cases RDDIR is cleared in rdc->flags and any RDDIRWAIT waiters
 * are woken while holding r_statelock; RDDIRREQ is additionally set when
 * an error occurred.  Finally one reference on rdc is dropped with
 * rddir_cache_rele().
 *
 * Returns 0 on success, otherwise the error that was also stored in
 * rdc->error.
 */
3243 static int
3244 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3245 {
3246 	int error;
3247 	struct nfsrddirargs rda;
3248 	struct nfsrddirres rd;
3249 	rnode_t *rp;
3250 	mntinfo_t *mi;
3251 	uint_t count;
3252 	int douprintf;
3253 	failinfo_t fi, *fip;
3254
3255 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3256 	count = rdc->buflen;
3257
3258 	rp = VTOR(vp);
3259 	mi = VTOMI(vp);
3260
3261 	rda.rda_fh = *VTOFH(vp);
3262 	rda.rda_offset = rdc->nfs_cookie;
3263
3264 	/*
3265 	 * NFS client failover support
3266 	 * suppress failover unless we have a zero cookie
3267 	 */
3268 	if (rdc->nfs_cookie == (off_t)0) {
3269 		fi.vp = vp;
3270 		fi.fhp = (caddr_t)&rda.rda_fh;
3271 		fi.copyproc = nfscopyfh;
3272 		fi.lookupproc = nfslookup;
3273 		fi.xattrdirproc = acl_getxattrdir2;
3274 		fip = &fi;
3275 	} else {
3276 		fip = NULL;
3277 	}
3278
	/*
	 * Temporary receive buffer for the reply; it is freed below
	 * once the entries have been copied into the cache entry.
	 */
3279 	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3280 	rd.rd_size = count;
3281 	rd.rd_offset = rda.rda_offset;
3282
3283 	douprintf = 1;
3284
	/* Account for the request in the mount's I/O kstats, if any. */
3285 	if (mi->mi_io_kstats) {
3286 		mutex_enter(&mi->mi_lock);
3287 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3288 		mutex_exit(&mi->mi_lock);
3289 	}
3290
	/*
	 * Repeat the call for as long as rfs2call() asks us to try
	 * again; the request size is recomputed from mi_curread on
	 * every attempt.
	 */
3291 	do {
3292 		rda.rda_count = MIN(count, mi->mi_curread);
3293 		error = rfs2call(mi, RFS_READDIR,
3294 		    xdr_rddirargs, (caddr_t)&rda,
3295 		    xdr_getrddirres, (caddr_t)&rd, cr,
3296 		    &douprintf, &rd.rd_status, 0, fip);
3297 	} while (error == ENFS_TRYAGAIN);
3298
3299 	if (mi->mi_io_kstats) {
3300 		mutex_enter(&mi->mi_lock);
3301 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3302 		mutex_exit(&mi->mi_lock);
3303 	}
3304
3305 	/*
3306 	 * Since we are actually doing a READDIR RPC, we must have
3307 	 * exclusive access to the cache entry being filled.  Thus,
3308 	 * it is safe to update all fields except for the flags
3309 	 * field.  The r_statelock in the rnode must be held to
3310 	 * prevent two different threads from simultaneously
3311 	 * attempting to update the flags field.  This can happen
3312 	 * if we are turning off RDDIR and the other thread is
3313 	 * trying to set RDDIRWAIT.
3314 	 */
3315 	ASSERT(rdc->flags & RDDIR);
3316 	if (!error) {
		/* The RPC itself worked; now look at the NFS status. */
3317 		error = geterrno(rd.rd_status);
3318 		if (!error) {
3319 			rdc->nfs_ncookie = rd.rd_offset;
3320 			rdc->eof = rd.rd_eof ? 1 : 0;
3321 			rdc->entlen = rd.rd_size;
3322 			ASSERT(rdc->entlen <= rdc->buflen);
			/*
			 * The cache entry gets its own buflen sized
			 * buffer; only entlen bytes of it are filled in.
			 */
3323 #ifdef DEBUG
3324 			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3325 			    KM_SLEEP);
3326 #else
3327 			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3328 #endif
3329 			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3330 			rdc->error = 0;
3331 			if (mi->mi_io_kstats) {
3332 				mutex_enter(&mi->mi_lock);
3333 				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3334 				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3335 				    rd.rd_size;
3336 				mutex_exit(&mi->mi_lock);
3337 			}
3338 		} else {
3339 			PURGE_STALE_FH(error, vp, cr);
3340 		}
3341 	}
	/* Either the RPC or the NFS status failed: record it in the entry. */
3342 	if (error) {
3343 		rdc->entries = NULL;
3344 		rdc->error = error;
3345 	}
3346 	kmem_free(rd.rd_entries, rdc->buflen);
3347
	/*
	 * Publish the result: the entry is no longer being filled, so
	 * wake anyone waiting on it.  On error RDDIRREQ is set so that
	 * the entry is flagged as still needing to be filled.
	 */
3348 	mutex_enter(&rp->r_statelock);
3349 	rdc->flags &= ~RDDIR;
3350 	if (rdc->flags & RDDIRWAIT) {
3351 		rdc->flags &= ~RDDIRWAIT;
3352 		cv_broadcast(&rdc->cv);
3353 	}
3354 	if (error)
3355 		rdc->flags |= RDDIRREQ;
3356 	mutex_exit(&rp->r_statelock);
3357
3358 	rddir_cache_rele(rdc);
3359
3360 	return (error);
3361 }
3362 
/*
 * DEBUG kernels only: when set to a nonzero value, nfs_bio() calls
 * debug_enter() after warning about a zero length write.
 */
3363 #ifdef DEBUG
3364 static int nfs_bio_do_stop = 0;
3365 #endif
3366 
/*
 * Carry out the I/O described by bp against the server: a read through
 * nfsread() when B_READ is set in bp->b_flags, otherwise a write through
 * nfswrite().
 *
 * The credential cached in the rnode (r_cred) is tried first; if none is
 * cached, cr is installed as r_cred and used.  When the server answers
 * EACCES and the credential that was used is not cr, cr replaces r_cred
 * and the operation is retried once with cr.
 *
 * Returns 0, an errno value, or NFS_EOF when a read transferred nothing
 * and the offset is at or beyond r_size.  B_ERROR is set in bp->b_flags
 * for every error except NFS_EOF.
 */
3367 static int
3368 nfs_bio(struct buf *bp, cred_t *cr)
3369 {
3370 	rnode_t *rp = VTOR(bp->b_vp);
3371 	int count;
3372 	int error;
3373 	cred_t *cred;
3374 	uint_t offset;
3375
3376 	DTRACE_IO1(start, struct buf *, bp);
3377
3378 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3379 	offset = dbtob(bp->b_blkno);
3380
3381 	if (bp->b_flags & B_READ) {
		/*
		 * Pick the credential to use and take a hold on it for
		 * the duration of the call.  When r_cred is empty, cr is
		 * held twice: once for r_cred and once for our own use.
		 */
3382 		mutex_enter(&rp->r_statelock);
3383 		if (rp->r_cred != NULL) {
3384 			cred = rp->r_cred;
3385 			crhold(cred);
3386 		} else {
3387 			rp->r_cred = cr;
3388 			crhold(cr);
3389 			cred = cr;
3390 			crhold(cred);
3391 		}
3392 		mutex_exit(&rp->r_statelock);
3393 	read_again:
3394 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3395 		    offset, bp->b_bcount, &bp->b_resid, cred);
3396
		/*
		 * Our hold is dropped here; below, cred is only compared
		 * against cr and is not dereferenced.
		 */
3397 		crfree(cred);
3398 		if (!error) {
3399 			if (bp->b_resid) {
3400 				/*
3401 				 * Didn't get it all because we hit EOF,
3402 				 * zero all the memory beyond the EOF.
3403 				 */
3404 				/* bzero(rdaddr + */
3405 				bzero(bp->b_un.b_addr +
3406 				    bp->b_bcount - bp->b_resid, bp->b_resid);
3407 			}
3408 			mutex_enter(&rp->r_statelock);
3409 			if (bp->b_resid == bp->b_bcount &&
3410 			    offset >= rp->r_size) {
3411 				/*
3412 				 * We didn't read anything at all as we are
3413 				 * past EOF.  Return an error indicator back
3414 				 * but don't destroy the pages (yet).
3415 				 */
3416 				error = NFS_EOF;
3417 			}
3418 			mutex_exit(&rp->r_statelock);
3419 		} else if (error == EACCES) {
			/*
			 * The cached credential was refused.  If it was
			 * not the caller's, switch r_cred over to cr and
			 * try the read again.
			 */
3420 			mutex_enter(&rp->r_statelock);
3421 			if (cred != cr) {
3422 				if (rp->r_cred != NULL)
3423 					crfree(rp->r_cred);
3424 				rp->r_cred = cr;
3425 				crhold(cr);
3426 				cred = cr;
3427 				crhold(cred);
3428 				mutex_exit(&rp->r_statelock);
3429 				goto read_again;
3430 			}
3431 			mutex_exit(&rp->r_statelock);
3432 		}
3433 	} else {
		/* Writes are only attempted while the rnode is not stale. */
3434 		if (!(rp->r_flags & RSTALE)) {
3435 			mutex_enter(&rp->r_statelock);
3436 			if (rp->r_cred != NULL) {
3437 				cred = rp->r_cred;
3438 				crhold(cred);
3439 			} else {
3440 				rp->r_cred = cr;
3441 				crhold(cr);
3442 				cred = cr;
3443 				crhold(cred);
3444 			}
3445 			mutex_exit(&rp->r_statelock);
3446 		write_again:
			/*
			 * Never write past the file size recorded in the
			 * rnode; r_size is sampled under r_statelock.
			 */
3447 			mutex_enter(&rp->r_statelock);
3448 			count = MIN(bp->b_bcount, rp->r_size - offset);
3449 			mutex_exit(&rp->r_statelock);
3450 			if (count < 0)
3451 				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3452 #ifdef DEBUG
			/*
			 * NOTE(review): offset is a uint_t but is printed
			 * with %d below -- confirm whether %u was intended.
			 */
3453 			if (count == 0) {
3454 				zcmn_err(getzoneid(), CE_WARN,
3455 				    "nfs_bio: zero length write at %d",
3456 				    offset);
3457 				nfs_printfhandle(&rp->r_fh);
3458 				if (nfs_bio_do_stop)
3459 					debug_enter("nfs_bio");
3460 			}
3461 #endif
3462 			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3463 			    count, cred);
3464 			if (error == EACCES) {
				/*
				 * Same credential fallback as for reads,
				 * except that here our hold on the old
				 * credential is still outstanding and is
				 * dropped before switching to cr.
				 */
3465 				mutex_enter(&rp->r_statelock);
3466 				if (cred != cr) {
3467 					if (rp->r_cred != NULL)
3468 						crfree(rp->r_cred);
3469 					rp->r_cred = cr;
3470 					crhold(cr);
3471 					crfree(cred);
3472 					cred = cr;
3473 					crhold(cred);
3474 					mutex_exit(&rp->r_statelock);
3475 					goto write_again;
3476 				}
3477 				mutex_exit(&rp->r_statelock);
3478 			}
3479 			bp->b_error = error;
3480 			if (error && error != EINTR) {
3481 				/*
3482 				 * Don't print EDQUOT errors on the console.
3483 				 * Don't print asynchronous EACCES errors.
3484 				 * Don't print EFBIG errors.
3485 				 * Print all other write errors.
3486 				 */
3487 				if (error != EDQUOT && error != EFBIG &&
3488 				    (error != EACCES ||
3489 				    !(bp->b_flags & B_ASYNC)))
3490 					nfs_write_error(bp->b_vp, error, cred);
3491 				/*
3492 				 * Update r_error and r_flags as appropriate.
3493 				 * If the error was ESTALE, then mark the
3494 				 * rnode as not being writeable and save
3495 				 * the error status.  Otherwise, save any
3496 				 * errors which occur from asynchronous
3497 				 * page invalidations.  Any errors occurring
3498 				 * from other operations should be saved
3499 				 * by the caller.
3500 				 */
3501 				mutex_enter(&rp->r_statelock);
3502 				if (error == ESTALE) {
3503 					rp->r_flags |= RSTALE;
3504 					if (!rp->r_error)
3505 						rp->r_error = error;
3506 				} else if (!rp->r_error &&
3507 				    (bp->b_flags &
3508 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
3509 				    (B_INVAL|B_FORCE|B_ASYNC)) {
3510 					rp->r_error = error;
3511 				}
3512 				mutex_exit(&rp->r_statelock);
3513 			}
3514 			crfree(cred);
3515 		} else {
			/*
			 * NOTE(review): bp->b_error is not assigned on
			 * this path; only B_ERROR is set further down.
			 */
3516 			error = rp->r_error;
3517 			/*
3518 			 * A close may have cleared r_error, if so,
3519 			 * propagate ESTALE error return properly
3520 			 */
3521 			if (error == 0)
3522 				error = ESTALE;
3523 		}
3524 	}
3525
	/* NFS_EOF is an indication for the caller, not an I/O failure. */
3526 	if (error != 0 && error != NFS_EOF)
3527 		bp->b_flags |= B_ERROR;
3528
3529 	DTRACE_IO1(done, struct buf *, bp);
3530
3531 	return (error);
3532 }
3533 
3534 /* ARGSUSED */
3535 static int
3536 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3537 {
3538 	struct nfs_fid *fp;
3539 	rnode_t *rp;
3540 
3541 	rp = VTOR(vp);
3542 
3543 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3544 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3545 		return (ENOSPC);
3546 	}
3547 	fp = (struct nfs_fid *)fidp;
3548 	fp->nf_pad = 0;
3549 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3550 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3551 	return (0);
3552 }
3553 
3554 /* ARGSUSED2 */
3555 static int
3556 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3557 {
3558 	rnode_t *rp = VTOR(vp);
3559 
3560 	if (!write_lock) {
3561 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3562 		return (V_WRITELOCK_FALSE);
3563 	}
3564 
3565 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3566 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3567 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3568 			return (V_WRITELOCK_FALSE);
3569 		nfs_rw_exit(&rp->r_rwlock);
3570 	}
3571 
3572 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3573 	return (V_WRITELOCK_TRUE);
3574 }
3575 
3576 /* ARGSUSED */
3577 static void
3578 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3579 {
3580 	rnode_t *rp = VTOR(vp);
3581 
3582 	nfs_rw_exit(&rp->r_rwlock);
3583 }
3584 
3585 /* ARGSUSED */
3586 static int
3587 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3588 {
3589 
3590 	/*
3591 	 * Because we stuff the readdir cookie into the offset field
3592 	 * someone may attempt to do an lseek with the cookie which
3593 	 * we want to succeed.
3594 	 */
3595 	if (vp->v_type == VDIR)
3596 		return (0);
3597 	if (*noffp < 0 || *noffp > MAXOFF32_T)
3598 		return (EINVAL);
3599 	return (0);
3600 }
3601 
3602 /*
3603  * number of NFS_MAXDATA blocks to read ahead
3604  * optimized for 100 base-T.
3605  *
 * nfs_getapage() queues this many asynchronous readaheads when the
 * readahead window is closed, and fewer as the window opens.
 */
3606 static int nfs_nra = 4;
3607
/*
 * DEBUG kernels only: incremented by nfs_getapage() each time
 * page_lookup() fails for a page that page_exists() had just found.
 */
3608 #ifdef DEBUG
3609 static int nfs_lostpage = 0;	/* number of times we lost original page */
3610 #endif
3611 
3612 /*
3613  * Return all the pages from [off..off+len) in file
3614  */
3615 /* ARGSUSED */
3616 static int
3617 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3618 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3619 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3620 {
3621 	rnode_t *rp;
3622 	int error;
3623 	mntinfo_t *mi;
3624 
3625 	if (vp->v_flag & VNOMAP)
3626 		return (ENOSYS);
3627 
3628 	ASSERT(off <= MAXOFF32_T);
3629 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3630 		return (EIO);
3631 	if (protp != NULL)
3632 		*protp = PROT_ALL;
3633 
3634 	/*
3635 	 * Now valididate that the caches are up to date.
3636 	 */
3637 	error = nfs_validate_caches(vp, cr);
3638 	if (error)
3639 		return (error);
3640 
3641 	rp = VTOR(vp);
3642 	mi = VTOMI(vp);
3643 retry:
3644 	mutex_enter(&rp->r_statelock);
3645 
3646 	/*
3647 	 * Don't create dirty pages faster than they
3648 	 * can be cleaned so that the system doesn't
3649 	 * get imbalanced.  If the async queue is
3650 	 * maxed out, then wait for it to drain before
3651 	 * creating more dirty pages.  Also, wait for
3652 	 * any threads doing pagewalks in the vop_getattr
3653 	 * entry points so that they don't block for
3654 	 * long periods.
3655 	 */
3656 	if (rw == S_CREATE) {
3657 		while ((mi->mi_max_threads != 0 &&
3658 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3659 		    rp->r_gcount > 0)
3660 			cv_wait(&rp->r_cv, &rp->r_statelock);
3661 	}
3662 
3663 	/*
3664 	 * If we are getting called as a side effect of an nfs_write()
3665 	 * operation the local file size might not be extended yet.
3666 	 * In this case we want to be able to return pages of zeroes.
3667 	 */
3668 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3669 		mutex_exit(&rp->r_statelock);
3670 		return (EFAULT);		/* beyond EOF */
3671 	}
3672 
3673 	mutex_exit(&rp->r_statelock);
3674 
3675 	if (len <= PAGESIZE) {
3676 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3677 		    seg, addr, rw, cr);
3678 	} else {
3679 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3680 		    pl, plsz, seg, addr, rw, cr);
3681 	}
3682 
3683 	switch (error) {
3684 	case NFS_EOF:
3685 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3686 		goto retry;
3687 	case ESTALE:
3688 		PURGE_STALE_FH(error, vp, cr);
3689 	}
3690 
3691 	return (error);
3692 }
3693 
3694 /*
3695  * Called from pvn_getpages or nfs_getpage to get a particular page.
 *
 * Before looking for the page itself, asynchronous readaheads are queued
 * when the access looks sequential (see the readahead calculation
 * below).  Then:
 *
 *  - if the page is not present and pl is NULL, an asynchronous read of
 *    the block is queued and nothing is returned;
 *  - if the page is not present and rw is S_CREATE, a new page is created
 *    without going to the server;
 *  - if the page is not present otherwise, a kluster of pages is read
 *    from the server through nfs_bio(), or simply zeroed when the I/O
 *    starts at or beyond r_size and the request came through segkmap;
 *  - if the page is present, it is looked up and locked (exclusively for
 *    S_CREATE, shared otherwise).
 *
 * NFS_EOF from nfs_bio() is turned into success for segkmap and into
 * EFAULT for everyone else.  Returns 0 or an error; EIO is returned when
 * called from a zone other than the one owning the mount.
3696  */
3697 /* ARGSUSED */
3698 static int
3699 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3700 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3701 	enum seg_rw rw, cred_t *cr)
3702 {
3703 	rnode_t *rp;
3704 	uint_t bsize;
3705 	struct buf *bp;
3706 	page_t *pp;
3707 	u_offset_t lbn;
3708 	u_offset_t io_off;
3709 	u_offset_t blkoff;
3710 	u_offset_t rablkoff;
3711 	size_t io_len;
3712 	uint_t blksize;
3713 	int error;
3714 	int readahead;
3715 	int readahead_issued = 0;
3716 	int ra_window; /* readahead window */
3717 	page_t *pagefound;
3718
3719 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3720 		return (EIO);
3721 	rp = VTOR(vp);
3722 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3723
	/*
	 * Restart point used when a page that was found in the cache
	 * has disappeared by the time we try to lock it.
	 */
3724 reread:
3725 	bp = NULL;
3726 	pp = NULL;
3727 	pagefound = NULL;
3728
3729 	if (pl != NULL)
3730 		pl[0] = NULL;
3731
3732 	error = 0;
	/* Start of the file system block that contains off. */
3733 	lbn = off / bsize;
3734 	blkoff = lbn * bsize;
3735
3736 	/*
3737 	 * Queueing up the readahead before doing the synchronous read
3738 	 * results in a significant increase in read throughput because
3739 	 * of the increased parallelism between the async threads and
3740 	 * the process context.
3741 	 */
3742 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3743 	    rw != S_CREATE &&
3744 	    !(vp->v_flag & VNOCACHE)) {
3745 		mutex_enter(&rp->r_statelock);
3746
3747 		/*
3748 		 * Calculate the number of readaheads to do.
3749 		 * a) No readaheads at offset = 0.
3750 		 * b) Do maximum(nfs_nra) readaheads when the readahead
3751 		 *    window is closed.
3752 		 * c) Do readaheads between 1 to (nfs_nra - 1) depending
3753 		 *    upon how far the readahead window is open or close.
3754 		 * d) No readaheads if rp->r_nextr is not within the scope
3755 		 *    of the readahead window (random i/o).
3756 		 */
3757
3758 		if (off == 0)
3759 			readahead = 0;
3760 		else if (blkoff == rp->r_nextr)
3761 			readahead = nfs_nra;
3762 		else if (rp->r_nextr > blkoff &&
3763 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
3764 		    <= (nfs_nra - 1)))
3765 			readahead = nfs_nra - ra_window;
3766 		else
3767 			readahead = 0;
3768
		/*
		 * Queue the readaheads one block at a time, starting
		 * with the block after r_nextr and stopping at the file
		 * size.  r_statelock is dropped around each call to
		 * nfs_async_readahead() and retaken afterwards, so it is
		 * held again whenever the loop is left.
		 */
3769 		rablkoff = rp->r_nextr;
3770 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3771 			mutex_exit(&rp->r_statelock);
3772 			if (nfs_async_readahead(vp, rablkoff + bsize,
3773 			    addr + (rablkoff + bsize - off), seg, cr,
3774 			    nfs_readahead) < 0) {
3775 				mutex_enter(&rp->r_statelock);
3776 				break;
3777 			}
3778 			readahead--;
3779 			rablkoff += bsize;
3780 			/*
3781 			 * Indicate that we did a readahead so
3782 			 * readahead offset is not updated
3783 			 * by the synchronous read below.
3784 			 */
3785 			readahead_issued = 1;
3786 			mutex_enter(&rp->r_statelock);
3787 			/*
3788 			 * set readahead offset to
3789 			 * offset of last async readahead
3790 			 * request.
3791 			 */
3792 			rp->r_nextr = rablkoff;
3793 		}
3794 		mutex_exit(&rp->r_statelock);
3795 	}
3796
	/*
	 * Retry point used when pvn_read_kluster() returns no pages
	 * because another thread entered the page first.
	 */
3797 again:
3798 	if ((pagefound = page_exists(vp, off)) == NULL) {
3799 		if (pl == NULL) {
			/*
			 * The caller does not want the page back, so
			 * just start an asynchronous read of the block.
			 */
3800 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3801 			    nfs_readahead);
3802 		} else if (rw == S_CREATE) {
3803 			/*
3804 			 * Block for this page is not allocated, or the offset
3805 			 * is beyond the current allocation size, or we're
3806 			 * allocating a swap slot and the page was not found,
3807 			 * so allocate it and return a zero page.
3808 			 */
3809 			if ((pp = page_create_va(vp, off,
3810 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3811 				cmn_err(CE_PANIC, "nfs_getapage: page_create");
3812 			io_len = PAGESIZE;
3813 			mutex_enter(&rp->r_statelock);
3814 			rp->r_nextr = off + PAGESIZE;
3815 			mutex_exit(&rp->r_statelock);
3816 		} else {
3817 			/*
3818 			 * Need to go to server to get a BLOCK, exception to
3819 			 * that being while reading at offset = 0 or doing
3820 			 * random i/o, in that case read only a PAGE.
3821 			 */
3822 			mutex_enter(&rp->r_statelock);
3823 			if (blkoff < rp->r_size &&
3824 			    blkoff + bsize >= rp->r_size) {
3825 				/*
3826 				 * If only a block or less is left in
3827 				 * the file, read all that is remaining.
3828 				 */
3829 				if (rp->r_size <= off) {
3830 					/*
3831 					 * Trying to access beyond EOF,
3832 					 * set up to get at least one page.
3833 					 */
3834 					blksize = off + PAGESIZE - blkoff;
3835 				} else
3836 					blksize = rp->r_size - blkoff;
3837 			} else if ((off == 0) ||
3838 			    (off != rp->r_nextr && !readahead_issued)) {
3839 				blksize = PAGESIZE;
3840 				blkoff = off; /* block = page here */
3841 			} else
3842 				blksize = bsize;
3843 			mutex_exit(&rp->r_statelock);
3844
3845 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3846 			    &io_len, blkoff, blksize, 0);
3847
3848 			/*
3849 			 * Some other thread has entered the page,
3850 			 * so just use it.
3851 			 */
3852 			if (pp == NULL)
3853 				goto again;
3854
3855 			/*
3856 			 * Now round the request size up to page boundaries.
3857 			 * This ensures that the entire page will be
3858 			 * initialized to zeroes if EOF is encountered.
3859 			 */
3860 			io_len = ptob(btopr(io_len));
3861
3862 			bp = pageio_setup(pp, io_len, vp, B_READ);
3863 			ASSERT(bp != NULL);
3864
3865 			/*
3866 			 * pageio_setup should have set b_addr to 0.  This
3867 			 * is correct since we want to do I/O on a page
3868 			 * boundary.  bp_mapin will use this addr to calculate
3869 			 * an offset, and then set b_addr to the kernel virtual
3870 			 * address it allocated for us.
3871 			 */
3872 			ASSERT(bp->b_un.b_addr == 0);
3873
3874 			bp->b_edev = 0;
3875 			bp->b_dev = 0;
3876 			bp->b_lblkno = lbtodb(io_off);
3877 			bp->b_file = vp;
3878 			bp->b_offset = (offset_t)off;
3879 			bp_mapin(bp);
3880
3881 			/*
3882 			 * If doing a write beyond what we believe is EOF,
3883 			 * don't bother trying to read the pages from the
3884 			 * server, we'll just zero the pages here.  We
3885 			 * don't check that the rw flag is S_WRITE here
3886 			 * because some implementations may attempt a
3887 			 * read access to the buffer before copying data.
3888 			 */
3889 			mutex_enter(&rp->r_statelock);
3890 			if (io_off >= rp->r_size && seg == segkmap) {
3891 				mutex_exit(&rp->r_statelock);
3892 				bzero(bp->b_un.b_addr, io_len);
3893 			} else {
3894 				mutex_exit(&rp->r_statelock);
3895 				error = nfs_bio(bp, cr);
3896 			}
3897
3898 			/*
3899 			 * Unmap the buffer before freeing it.
3900 			 */
3901 			bp_mapout(bp);
3902 			pageio_done(bp);
3903
3904 			if (error == NFS_EOF) {
3905 				/*
3906 				 * If doing a write system call just return
3907 				 * zeroed pages, else user tried to get pages
3908 				 * beyond EOF, return error.  We don't check
3909 				 * that the rw flag is S_WRITE here because
3910 				 * some implementations may attempt a read
3911 				 * access to the buffer before copying data.
3912 				 */
3913 				if (seg == segkmap)
3914 					error = 0;
3915 				else
3916 					error = EFAULT;
3917 			}
3918
			/*
			 * Without readaheads in flight, the next
			 * sequential read is expected right after the
			 * range that was just read.
			 */
3919 			if (!readahead_issued && !error) {
3920 				mutex_enter(&rp->r_statelock);
3921 				rp->r_nextr = io_off + io_len;
3922 				mutex_exit(&rp->r_statelock);
3923 			}
3924 		}
3925 	}
3926
	/*
	 * NOTE(review): no goto in this function targets this label;
	 * control only reaches it by falling through.
	 */
3927 out:
3928 	if (pl == NULL)
3929 		return (error);
3930
	/* On failure, dispose of any pages that were set up for the read. */
3931 	if (error) {
3932 		if (pp != NULL)
3933 			pvn_read_done(pp, B_ERROR);
3934 		return (error);
3935 	}
3936
3937 	if (pagefound) {
3938 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3939
3940 		/*
3941 		 * Page exists in the cache, acquire the appropriate lock.
3942 		 * If this fails, start all over again.
3943 		 */
3944 		if ((pp = page_lookup(vp, off, se)) == NULL) {
3945 #ifdef DEBUG
3946 			nfs_lostpage++;
3947 #endif
3948 			goto reread;
3949 		}
3950 		pl[0] = pp;
3951 		pl[1] = NULL;
3952 		return (0);
3953 	}
3954
	/* Hand the pages that were created or read back through pl[]. */
3955 	if (pp != NULL)
3956 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3957
3958 	return (error);
3959 }
3960 
3961 static void
3962 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3963 	cred_t *cr)
3964 {
3965 	int error;
3966 	page_t *pp;
3967 	u_offset_t io_off;
3968 	size_t io_len;
3969 	struct buf *bp;
3970 	uint_t bsize, blksize;
3971 	rnode_t *rp = VTOR(vp);
3972 
3973 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3974 
3975 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3976 
3977 	mutex_enter(&rp->r_statelock);
3978 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3979 		/*
3980 		 * If less than a block left in file read less
3981 		 * than a block.
3982 		 */
3983 		blksize = rp->r_size - blkoff;
3984 	} else
3985 		blksize = bsize;
3986 	mutex_exit(&rp->r_statelock);
3987 
3988 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3989 	    &io_off, &io_len, blkoff, blksize, 1);
3990 	/*
3991 	 * The isra flag passed to the kluster function is 1, we may have
3992 	 * gotten a return value of NULL for a variety of reasons (# of free
3993 	 * pages < minfree, someone entered the page on the vnode etc). In all
3994 	 * cases, we want to punt on the readahead.
3995 	 */
3996 	if (pp == NULL)
3997 		return;
3998 
3999 	/*
4000 	 * Now round the request size up to page boundaries.
4001 	 * This ensures that the entire page will be
4002 	 * initialized to zeroes if EOF is encountered.
4003 	 */
4004 	io_len = ptob(btopr(io_len));
4005 
4006 	bp = pageio_setup(pp, io_len, vp, B_READ);
4007 	ASSERT(bp != NULL);
4008 
4009 	/*
4010 	 * pageio_setup should have set b_addr to 0.  This is correct since
4011 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
4012 	 * to calculate an offset, and then set b_addr to the kernel virtual
4013 	 * address it allocated for us.
4014 	 */
4015 	ASSERT(bp->b_un.b_addr == 0);
4016 
4017 	bp->b_edev = 0;
4018 	bp->b_dev = 0;
4019 	bp->b_lblkno = lbtodb(io_off);
4020 	bp->b_file = vp;
4021 	bp->b_offset = (offset_t)blkoff;
4022 	bp_mapin(bp);
4023 
4024 	/*
4025 	 * If doing a write beyond what we believe is EOF, don't bother trying
4026 	 * to read the pages from the server, we'll just zero the pages here.
4027 	 * We don't check that the rw flag is S_WRITE here because some
4028 	 * implementations may attempt a read access to the buffer before
4029 	 * copying data.
4030 	 */
4031 	mutex_enter(&rp->r_statelock);
4032 	if (io_off >= rp->r_size && seg == segkmap) {
4033 		mutex_exit(&rp->r_statelock);
4034 		bzero(bp->b_un.b_addr, io_len);
4035 		error = 0;
4036 	} else {
4037 		mutex_exit(&rp->r_statelock);
4038 		error = nfs_bio(bp, cr);
4039 		if (error == NFS_EOF)
4040 			error = 0;
4041 	}
4042 
4043 	/*
4044 	 * Unmap the buffer before freeing it.
4045 	 */
4046 	bp_mapout(bp);
4047 	pageio_done(bp);
4048 
4049 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4050 
4051 	/*
4052 	 * In case of error set readahead offset
4053 	 * to the lowest offset.
4054 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
4055 	 */
4056 	if (error && rp->r_nextr > io_off) {
4057 		mutex_enter(&rp->r_statelock);
4058 		if (rp->r_nextr > io_off)
4059 			rp->r_nextr = io_off;
4060 		mutex_exit(&rp->r_statelock);
4061 	}
4062 }
4063 
4064 /*
4065  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4066  * If len == 0, do from off to EOF.
4067  *
4068  * The normal cases should be len == 0 && off == 0 (entire vp list),
4069  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4070  * (from pageout).
4071  */
4072 /* ARGSUSED */
4073 static int
4074 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4075 	caller_context_t *ct)
4076 {
4077 	int error;
4078 	rnode_t *rp;
4079 
4080 	ASSERT(cr != NULL);
4081 
4082 	/*
4083 	 * XXX - Why should this check be made here?
4084 	 */
4085 	if (vp->v_flag & VNOMAP)
4086 		return (ENOSYS);
4087 
4088 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4089 		return (0);
4090 
4091 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4092 		return (EIO);
4093 	ASSERT(off <= MAXOFF32_T);
4094 
4095 	rp = VTOR(vp);
4096 	mutex_enter(&rp->r_statelock);
4097 	rp->r_count++;
4098 	mutex_exit(&rp->r_statelock);
4099 	error = nfs_putpages(vp, off, len, flags, cr);
4100 	mutex_enter(&rp->r_statelock);
4101 	rp->r_count--;
4102 	cv_broadcast(&rp->r_cv);
4103 	mutex_exit(&rp->r_statelock);
4104 
4105 	return (error);
4106 }
4107 
4108 /*
4109  * Write out a single page, possibly klustering adjacent dirty pages.
 *
 * The kluster is limited to the file system block that contains pp.
 * The write is handed to nfs_async_putapage() when B_ASYNC is set in
 * flags and is done directly by nfs_sync_putapage() otherwise.  If a
 * write to the same region is in progress (RMODINPROGRESS), nothing is
 * written: the pages are marked modified again and unlocked, the rnode
 * is marked RDIRTY and 0 is returned.
 *
 * If offp and lenp are not NULL they receive the offset and the length
 * of the range that was handled.  The caller must hold a count on the
 * rnode (r_count > 0).
4110  */
4111 int
4112 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4113 	int flags, cred_t *cr)
4114 {
4115 	u_offset_t io_off;
4116 	u_offset_t lbn_off;
4117 	u_offset_t lbn;
4118 	size_t io_len;
4119 	uint_t bsize;
4120 	int error;
4121 	rnode_t *rp;
4122
4123 	ASSERT(!vn_is_readonly(vp));
4124 	ASSERT(pp != NULL);
4125 	ASSERT(cr != NULL);
4126 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4127
4128 	rp = VTOR(vp);
4129 	ASSERT(rp->r_count > 0);
4130
4131 	ASSERT(pp->p_offset <= MAXOFF32_T);
4132
	/* Locate the start of the file system block that holds the page. */
4133 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4134 	lbn = pp->p_offset / bsize;
4135 	lbn_off = lbn * bsize;
4136
4137 	/*
4138 	 * Find a kluster that fits in one block, or in
4139 	 * one page if pages are bigger than blocks.  If
4140 	 * there is less file space allocated than a whole
4141 	 * page, we'll shorten the i/o request below.
4142 	 */
4143 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4144 	    roundup(bsize, PAGESIZE), flags);
4145
4146 	/*
4147 	 * pvn_write_kluster shouldn't have returned a page with offset
4148 	 * behind the original page we were given.  Verify that.
4149 	 */
4150 	ASSERT((pp->p_offset / bsize) >= lbn);
4151
4152 	/*
4153 	 * Now pp will have the list of kept dirty pages marked for
4154 	 * write back.  It will also handle invalidation and freeing
4155 	 * of pages that are not dirty.  Check for page length rounding
4156 	 * problems.
4157 	 */
4158 	if (io_off + io_len > lbn_off + bsize) {
4159 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4160 		io_len = lbn_off + bsize - io_off;
4161 	}
4162 	/*
4163 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4164 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
4165 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
4166 	 * progress and the r_size has not been made consistent with the
4167 	 * new size of the file. When the uiomove() completes the r_size is
4168 	 * updated and the RMODINPROGRESS flag is cleared.
4169 	 *
4170 	 * Without this handshaking, it is possible that nfs(3)_bio()
4171 	 * picks up the old value of r_size before the uiomove() in
4172 	 * writerp() completes. This will result in the write through
4173 	 * nfs(3)_bio() being dropped.
4175 	 *
4176 	 * More precisely, there is a window between the time the uiomove()
4177 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
4178 	 * operation intervenes in this window, the page will be picked up,
4179 	 * because it is dirty (it will be unlocked, unless it was
4180 	 * pagecreate'd). When the page is picked up as dirty, the dirty
4181 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
4182 	 * checked. This will still be the old size. Therefore the page will
4183 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
4184 	 * the page will be found to be clean and the write will be dropped.
4185 	 */
	/*
	 * The flag is tested once without the lock as a cheap filter and
	 * again, together with the overlap check, under r_statelock.
	 */
4186 	if (rp->r_flags & RMODINPROGRESS) {
4187 		mutex_enter(&rp->r_statelock);
4188 		if ((rp->r_flags & RMODINPROGRESS) &&
4189 		    rp->r_modaddr + MAXBSIZE > io_off &&
4190 		    rp->r_modaddr < io_off + io_len) {
4191 			page_t *plist;
4192 			/*
4193 			 * A write is in progress for this region of the file.
4194 			 * If we did not detect RMODINPROGRESS here then this
4195 			 * path through nfs_putapage() would eventually go to
4196 			 * nfs(3)_bio() and may not write out all of the data
4197 			 * in the pages. We end up losing data. So we decide
4198 			 * to set the modified bit on each page in the page
4199 			 * list and mark the rnode with RDIRTY. This write
4200 			 * will be restarted at some later time.
4201 			 */
4202 			plist = pp;
			/*
			 * Take the pages off the list one at a time; the
			 * loop ends once the list has become empty.
			 */
4203 			while (plist != NULL) {
4204 				pp = plist;
4205 				page_sub(&plist, pp);
4206 				hat_setmod(pp);
4207 				page_io_unlock(pp);
4208 				page_unlock(pp);
4209 			}
4210 			rp->r_flags |= RDIRTY;
4211 			mutex_exit(&rp->r_statelock);
4212 			if (offp)
4213 				*offp = io_off;
4214 			if (lenp)
4215 				*lenp = io_len;
4216 			return (0);
4217 		}
4218 		mutex_exit(&rp->r_statelock);
4219 	}
4220
	/*
	 * Asynchronous requests are handed to nfs_async_putapage() with
	 * nfs_sync_putapage() as the routine that does the write.
	 */
4221 	if (flags & B_ASYNC) {
4222 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4223 		    nfs_sync_putapage);
4224 	} else
4225 		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4226
4227 	if (offp)
4228 		*offp = io_off;
4229 	if (lenp)
4230 		*lenp = io_len;
4231 	return (error);
4232 }
4233 
4234 static int
4235 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4236 	int flags, cred_t *cr)
4237 {
4238 	int error;
4239 	rnode_t *rp;
4240 
4241 	flags |= B_WRITE;
4242 
4243 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4244 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4245 
4246 	rp = VTOR(vp);
4247 
4248 	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4249 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4250 		if (!(rp->r_flags & ROUTOFSPACE)) {
4251 			mutex_enter(&rp->r_statelock);
4252 			rp->r_flags |= ROUTOFSPACE;
4253 			mutex_exit(&rp->r_statelock);
4254 		}
4255 		flags |= B_ERROR;
4256 		pvn_write_done(pp, flags);
4257 		/*
4258 		 * If this was not an async thread, then try again to
4259 		 * write out the pages, but this time, also destroy
4260 		 * them whether or not the write is successful.  This
4261 		 * will prevent memory from filling up with these
4262 		 * pages and destroying them is the only alternative
4263 		 * if they can't be written out.
4264 		 *
4265 		 * Don't do this if this is an async thread because
4266 		 * when the pages are unlocked in pvn_write_done,
4267 		 * some other thread could have come along, locked
4268 		 * them, and queued for an async thread.  It would be
4269 		 * possible for all of the async threads to be tied
4270 		 * up waiting to lock the pages again and they would
4271 		 * all already be locked and waiting for an async
4272 		 * thread to handle them.  Deadlock.
4273 		 */
4274 		if (!(flags & B_ASYNC)) {
4275 			error = nfs_putpage(vp, io_off, io_len,
4276 			    B_INVAL | B_FORCE, cr, NULL);
4277 		}
4278 	} else {
4279 		if (error)
4280 			flags |= B_ERROR;
4281 		else if (rp->r_flags & ROUTOFSPACE) {
4282 			mutex_enter(&rp->r_statelock);
4283 			rp->r_flags &= ~ROUTOFSPACE;
4284 			mutex_exit(&rp->r_statelock);
4285 		}
4286 		pvn_write_done(pp, flags);
4287 	}
4288 
4289 	return (error);
4290 }
4291 
4292 /* ARGSUSED */
4293 static int
4294 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4295 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4296 	caller_context_t *ct)
4297 {
4298 	struct segvn_crargs vn_a;
4299 	int error;
4300 	rnode_t *rp;
4301 	struct vattr va;
4302 
4303 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4304 		return (EIO);
4305 
4306 	if (vp->v_flag & VNOMAP)
4307 		return (ENOSYS);
4308 
4309 	if (off > MAXOFF32_T)
4310 		return (EFBIG);
4311 
4312 	if (off < 0 || off + len < 0)
4313 		return (ENXIO);
4314 
4315 	if (vp->v_type != VREG)
4316 		return (ENODEV);
4317 
4318 	/*
4319 	 * If there is cached data and if close-to-open consistency
4320 	 * checking is not turned off and if the file system is not
4321 	 * mounted readonly, then force an over the wire getattr.
4322 	 * Otherwise, just invoke nfsgetattr to get a copy of the
4323 	 * attributes.  The attribute cache will be used unless it
4324 	 * is timed out and if it is, then an over the wire getattr
4325 	 * will be issued.
4326 	 */
4327 	va.va_mask = AT_ALL;
4328 	if (vn_has_cached_data(vp) &&
4329 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4330 		error = nfs_getattr_otw(vp, &va, cr);
4331 	else
4332 		error = nfsgetattr(vp, &va, cr);
4333 	if (error)
4334 		return (error);
4335 
4336 	/*
4337 	 * Check to see if the vnode is currently marked as not cachable.
4338 	 * This means portions of the file are locked (through VOP_FRLOCK).
4339 	 * In this case the map request must be refused.  We use
4340 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
4341 	 */
4342 	rp = VTOR(vp);
4343 
4344 	/*
4345 	 * Atomically increment r_inmap after acquiring r_rwlock. The
4346 	 * idea here is to acquire r_rwlock to block read/write and
4347 	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
4348 	 * that we are in nfs_map(). Now, r_rwlock is acquired in order
4349 	 * and we can prevent the deadlock that would have occurred
4350 	 * when nfs_addmap() would have acquired it out of order.
4351 	 *
4352 	 * Since we are not protecting r_inmap by any lock, we do not
4353 	 * hold any lock when we decrement it. We atomically decrement
4354 	 * r_inmap after we release r_lkserlock.
4355 	 */
4356 
4357 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4358 		return (EINTR);
4359 	atomic_add_int(&rp->r_inmap, 1);
4360 	nfs_rw_exit(&rp->r_rwlock);
4361 
4362 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
4363 		atomic_add_int(&rp->r_inmap, -1);
4364 		return (EINTR);
4365 	}
4366 	if (vp->v_flag & VNOCACHE) {
4367 		error = EAGAIN;
4368 		goto done;
4369 	}
4370 
4371 	/*
4372 	 * Don't allow concurrent locks and mapping if mandatory locking is
4373 	 * enabled.
4374 	 */
4375 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4376 	    MANDLOCK(vp, va.va_mode)) {
4377 		error = EAGAIN;
4378 		goto done;
4379 	}
4380 
4381 	as_rangelock(as);
4382 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4383 	if (error != 0) {
4384 		as_rangeunlock(as);
4385 		goto done;
4386 	}
4387 
4388 	vn_a.vp = vp;
4389 	vn_a.offset = off;
4390 	vn_a.type = (flags & MAP_TYPE);
4391 	vn_a.prot = (uchar_t)prot;
4392 	vn_a.maxprot = (uchar_t)maxprot;
4393 	vn_a.flags = (flags & ~MAP_TYPE);
4394 	vn_a.cred = cr;
4395 	vn_a.amp = NULL;
4396 	vn_a.szc = 0;
4397 	vn_a.lgrp_mem_policy_flags = 0;
4398 
4399 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4400 	as_rangeunlock(as);
4401 
4402 done:
4403 	nfs_rw_exit(&rp->r_lkserlock);
4404 	atomic_add_int(&rp->r_inmap, -1);
4405 	return (error);
4406 }
4407 
4408 /* ARGSUSED */
4409 static int
4410 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4411 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4412 	caller_context_t *ct)
4413 {
4414 	rnode_t *rp;
4415 
4416 	if (vp->v_flag & VNOMAP)
4417 		return (ENOSYS);
4418 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4419 		return (EIO);
4420 
4421 	rp = VTOR(vp);
4422 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4423 
4424 	return (0);
4425 }
4426 
4427 /* ARGSUSED */
4428 static int
4429 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
4430 	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
4431 {
4432 	netobj lm_fh;
4433 	int rc;
4434 	u_offset_t start, end;
4435 	rnode_t *rp;
4436 	int error = 0, intr = INTR(vp);
4437 
4438 	/* check for valid cmd parameter */
4439 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4440 		return (EINVAL);
4441 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4442 		return (EIO);
4443 
4444 	/* Verify l_type. */
4445 	switch (bfp->l_type) {
4446 	case F_RDLCK:
4447 		if (cmd != F_GETLK && !(flag & FREAD))
4448 			return (EBADF);
4449 		break;
4450 	case F_WRLCK:
4451 		if (cmd != F_GETLK && !(flag & FWRITE))
4452 			return (EBADF);
4453 		break;
4454 	case F_UNLCK:
4455 		intr = 0;
4456 		break;
4457 
4458 	default:
4459 		return (EINVAL);
4460 	}
4461 
4462 	/* check the validity of the lock range */
4463 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4464 		return (rc);
4465 	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4466 		return (rc);
4467 
4468 	/*
4469 	 * If the filesystem is mounted using local locking, pass the
4470 	 * request off to the local locking code.
4471 	 */
4472 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4473 		if (offset > MAXOFF32_T)
4474 			return (EFBIG);
4475 		if (cmd == F_SETLK || cmd == F_SETLKW) {
4476 			/*
4477 			 * For complete safety, we should be holding
4478 			 * r_lkserlock.  However, we can't call
4479 			 * lm_safelock and then fs_frlock while
4480 			 * holding r_lkserlock, so just invoke
4481 			 * lm_safelock and expect that this will
4482 			 * catch enough of the cases.
4483 			 */
4484 			if (!lm_safelock(vp, bfp, cr))
4485 				return (EAGAIN);
4486 		}
4487 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4488 	}
4489 
4490 	rp = VTOR(vp);
4491 
4492 	/*
4493 	 * Check whether the given lock request can proceed, given the
4494 	 * current file mappings.
4495 	 */
4496 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4497 		return (EINTR);
4498 	if (cmd == F_SETLK || cmd == F_SETLKW) {
4499 		if (!lm_safelock(vp, bfp, cr)) {
4500 			rc = EAGAIN;
4501 			goto done;
4502 		}
4503 	}
4504 
4505 	/*
4506 	 * Flush the cache after waiting for async I/O to finish.  For new
4507 	 * locks, this is so that the process gets the latest bits from the
4508 	 * server.  For unlocks, this is so that other clients see the
4509 	 * latest bits once the file has been unlocked.  If currently dirty
4510 	 * pages can't be flushed, then don't allow a lock to be set.  But
4511 	 * allow unlocks to succeed, to avoid having orphan locks on the
4512 	 * server.
4513 	 */
4514 	if (cmd != F_GETLK) {
4515 		mutex_enter(&rp->r_statelock);
4516 		while (rp->r_count > 0) {
4517 			if (intr) {
4518 				klwp_t *lwp = ttolwp(curthread);
4519 
4520 				if (lwp != NULL)
4521 					lwp->lwp_nostop++;
4522 				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
4523 				    == 0) {
4524 					if (lwp != NULL)
4525 						lwp->lwp_nostop--;
4526 					rc = EINTR;
4527 					break;
4528 				}
4529 				if (lwp != NULL)
4530 					lwp->lwp_nostop--;
4531 			} else
4532 			cv_wait(&rp->r_cv, &rp->r_statelock);
4533 		}
4534 		mutex_exit(&rp->r_statelock);
4535 		if (rc != 0)
4536 			goto done;
4537 		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
4538 		if (error) {
4539 			if (error == ENOSPC || error == EDQUOT) {
4540 				mutex_enter(&rp->r_statelock);
4541 				if (!rp->r_error)
4542 					rp->r_error = error;
4543 				mutex_exit(&rp->r_statelock);
4544 			}
4545 			if (bfp->l_type != F_UNLCK) {
4546 				rc = ENOLCK;
4547 				goto done;
4548 			}
4549 		}
4550 	}
4551 
4552 	lm_fh.n_len = sizeof (fhandle_t);
4553 	lm_fh.n_bytes = (char *)VTOFH(vp);
4554 
4555 	/*
4556 	 * Call the lock manager to do the real work of contacting
4557 	 * the server and obtaining the lock.
4558 	 */
4559 	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4560 
4561 	if (rc == 0)
4562 		nfs_lockcompletion(vp, cmd);
4563 
4564 done:
4565 	nfs_rw_exit(&rp->r_lkserlock);
4566 	return (rc);
4567 }
4568 
4569 /*
4570  * Free storage space associated with the specified vnode.  The portion
4571  * to be freed is specified by bfp->l_start and bfp->l_len (already
4572  * normalized to a "whence" of 0).
4573  *
4574  * This is an experimental facility whose continued existence is not
4575  * guaranteed.  Currently, we only support the special case
4576  * of l_len == 0, meaning free to end of file.
4577  */
4578 /* ARGSUSED */
4579 static int
4580 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4581 	offset_t offset, cred_t *cr, caller_context_t *ct)
4582 {
4583 	int error;
4584 
4585 	ASSERT(vp->v_type == VREG);
4586 	if (cmd != F_FREESP)
4587 		return (EINVAL);
4588 
4589 	if (offset > MAXOFF32_T)
4590 		return (EFBIG);
4591 
4592 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4593 	    (bfp->l_len > MAXOFF32_T))
4594 		return (EFBIG);
4595 
4596 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4597 		return (EIO);
4598 
4599 	error = convoff(vp, bfp, 0, offset);
4600 	if (!error) {
4601 		ASSERT(bfp->l_start >= 0);
4602 		if (bfp->l_len == 0) {
4603 			struct vattr va;
4604 
4605 			/*
4606 			 * ftruncate should not change the ctime and
4607 			 * mtime if we truncate the file to its
4608 			 * previous size.
4609 			 */
4610 			va.va_mask = AT_SIZE;
4611 			error = nfsgetattr(vp, &va, cr);
4612 			if (error || va.va_size == bfp->l_start)
4613 				return (error);
4614 			va.va_mask = AT_SIZE;
4615 			va.va_size = bfp->l_start;
4616 			error = nfssetattr(vp, &va, 0, cr);
4617 		} else
4618 			error = EINVAL;
4619 	}
4620 
4621 	return (error);
4622 }
4623 
4624 /* ARGSUSED */
4625 static int
4626 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4627 {
4628 
4629 	return (EINVAL);
4630 }
4631 
4632 /*
4633  * Setup and add an address space callback to do the work of the delmap call.
4634  * The callback will (and must be) deleted in the actual callback function.
4635  *
4636  * This is done in order to take care of the problem that we have with holding
4637  * the address space's a_lock for a long period of time (e.g. if the NFS server
4638  * is down).  Callbacks will be executed in the address space code while the
4639  * a_lock is not held.	Holding the address space's a_lock causes things such
4640  * as ps and fork to hang because they are trying to acquire this lock as well.
4641  */
4642 /* ARGSUSED */
4643 static int
4644 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4645 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4646 	caller_context_t *ct)
4647 {
4648 	int			caller_found;
4649 	int			error;
4650 	rnode_t			*rp;
4651 	nfs_delmap_args_t	*dmapp;
4652 	nfs_delmapcall_t	*delmap_call;
4653 
4654 	if (vp->v_flag & VNOMAP)
4655 		return (ENOSYS);
4656 	/*
4657 	 * A process may not change zones if it has NFS pages mmap'ed
4658 	 * in, so we can't legitimately get here from the wrong zone.
4659 	 */
4660 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4661 
4662 	rp = VTOR(vp);
4663 
4664 	/*
4665 	 * The way that the address space of this process deletes its mapping
4666 	 * of this file is via the following call chains:
4667 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4668 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4669 	 *
4670 	 * With the use of address space callbacks we are allowed to drop the
4671 	 * address space lock, a_lock, while executing the NFS operations that
4672 	 * need to go over the wire.  Returning EAGAIN to the caller of this
4673 	 * function is what drives the execution of the callback that we add
4674 	 * below.  The callback will be executed by the address space code
4675 	 * after dropping the a_lock.  When the callback is finished, since
4676 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4677 	 * is called again on the same segment to finish the rest of the work
4678 	 * that needs to happen during unmapping.
4679 	 *
4680 	 * This action of calling back into the segment driver causes
4681 	 * nfs_delmap() to get called again, but since the callback was
4682 	 * already executed at this point, it already did the work and there
4683 	 * is nothing left for us to do.
4684 	 *
4685 	 * To Summarize:
4686 	 * - The first time nfs_delmap is called by the current thread is when
4687 	 * we add the caller associated with this delmap to the delmap caller
4688 	 * list, add the callback, and return EAGAIN.
4689 	 * - The second time in this call chain when nfs_delmap is called we
4690 	 * will find this caller in the delmap caller list and realize there
4691 	 * is no more work to do thus removing this caller from the list and
4692 	 * returning the error that was set in the callback execution.
4693 	 */
4694 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4695 	if (caller_found) {
4696 		/*
4697 		 * 'error' is from the actual delmap operations.  To avoid
4698 		 * hangs, we need to handle the return of EAGAIN differently
4699 		 * since this is what drives the callback execution.
4700 		 * In this case, we don't want to return EAGAIN and do the
4701 		 * callback execution because there are none to execute.
4702 		 */
4703 		if (error == EAGAIN)
4704 			return (0);
4705 		else
4706 			return (error);
4707 	}
4708 
4709 	/* current caller was not in the list */
4710 	delmap_call = nfs_init_delmapcall();
4711 
4712 	mutex_enter(&rp->r_statelock);
4713 	list_insert_tail(&rp->r_indelmap, delmap_call);
4714 	mutex_exit(&rp->r_statelock);
4715 
4716 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4717 
4718 	dmapp->vp = vp;
4719 	dmapp->off = off;
4720 	dmapp->addr = addr;
4721 	dmapp->len = len;
4722 	dmapp->prot = prot;
4723 	dmapp->maxprot = maxprot;
4724 	dmapp->flags = flags;
4725 	dmapp->cr = cr;
4726 	dmapp->caller = delmap_call;
4727 
4728 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4729 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4730 
4731 	return (error ? error : EAGAIN);
4732 }
4733 
4734 /*
4735  * Remove some pages from an mmap'd vnode.  Just update the
4736  * count of pages.  If doing close-to-open, then flush all
4737  * of the pages associated with this file.  Otherwise, start
4738  * an asynchronous page flush to write out any dirty pages.
4739  * This will also associate a credential with the rnode which
4740  * can be used to write the pages.
4741  */
4742 /* ARGSUSED */
4743 static void
4744 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4745 {
4746 	int			error;
4747 	rnode_t			*rp;
4748 	mntinfo_t		*mi;
4749 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4750 
4751 	rp = VTOR(dmapp->vp);
4752 	mi = VTOMI(dmapp->vp);
4753 
4754 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4755 	ASSERT(rp->r_mapcnt >= 0);
4756 
4757 	/*
4758 	 * Initiate a page flush if there are pages, the file system
4759 	 * was not mounted readonly, the segment was mapped shared, and
4760 	 * the pages themselves were writeable.
4761 	 */
4762 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4763 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4764 		mutex_enter(&rp->r_statelock);
4765 		rp->r_flags |= RDIRTY;
4766 		mutex_exit(&rp->r_statelock);
4767 		/*
4768 		 * If this is a cross-zone access a sync putpage won't work, so
4769 		 * the best we can do is try an async putpage.  That seems
4770 		 * better than something more draconian such as discarding the
4771 		 * dirty pages.
4772 		 */
4773 		if ((mi->mi_flags & MI_NOCTO) ||
4774 		    nfs_zone() != mi->mi_zone)
4775 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4776 			    B_ASYNC, dmapp->cr, NULL);
4777 		else
4778 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4779 			    0, dmapp->cr, NULL);
4780 		if (!error) {
4781 			mutex_enter(&rp->r_statelock);
4782 			error = rp->r_error;
4783 			rp->r_error = 0;
4784 			mutex_exit(&rp->r_statelock);
4785 		}
4786 	} else
4787 		error = 0;
4788 
4789 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4790 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4791 		    B_INVAL, dmapp->cr, NULL);
4792 
4793 	dmapp->caller->error = error;
4794 	(void) as_delete_callback(as, arg);
4795 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4796 }
4797 
4798 /* ARGSUSED */
4799 static int
4800 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4801 	caller_context_t *ct)
4802 {
4803 	int error = 0;
4804 
4805 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4806 		return (EIO);
4807 	/*
4808 	 * This looks a little weird because it's written in a general
4809 	 * manner but we make little use of cases.  If cntl() ever gets
4810 	 * widely used, the outer switch will make more sense.
4811 	 */
4812 
4813 	switch (cmd) {
4814 
4815 	/*
4816 	 * Large file spec - need to base answer new query with
4817 	 * hardcoded constant based on the protocol.
4818 	 */
4819 	case _PC_FILESIZEBITS:
4820 		*valp = 32;
4821 		return (0);
4822 
4823 	case _PC_LINK_MAX:
4824 	case _PC_NAME_MAX:
4825 	case _PC_PATH_MAX:
4826 	case _PC_SYMLINK_MAX:
4827 	case _PC_CHOWN_RESTRICTED:
4828 	case _PC_NO_TRUNC: {
4829 		mntinfo_t *mi;
4830 		struct pathcnf *pc;
4831 
4832 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4833 			return (EINVAL);
4834 		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4835 		switch (cmd) {
4836 		case _PC_LINK_MAX:
4837 			*valp = pc->pc_link_max;
4838 			break;
4839 		case _PC_NAME_MAX:
4840 			*valp = pc->pc_name_max;
4841 			break;
4842 		case _PC_PATH_MAX:
4843 		case _PC_SYMLINK_MAX:
4844 			*valp = pc->pc_path_max;
4845 			break;
4846 		case _PC_CHOWN_RESTRICTED:
4847 			/*
4848 			 * if we got here, error is really a boolean which
4849 			 * indicates whether cmd is set or not.
4850 			 */
4851 			*valp = error ? 1 : 0;	/* see above */
4852 			error = 0;
4853 			break;
4854 		case _PC_NO_TRUNC:
4855 			/*
4856 			 * if we got here, error is really a boolean which
4857 			 * indicates whether cmd is set or not.
4858 			 */
4859 			*valp = error ? 1 : 0;	/* see above */
4860 			error = 0;
4861 			break;
4862 		}
4863 		return (error ? EINVAL : 0);
4864 		}
4865 
4866 	case _PC_XATTR_EXISTS:
4867 		*valp = 0;
4868 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4869 			vnode_t *avp;
4870 			rnode_t *rp;
4871 			mntinfo_t *mi = VTOMI(vp);
4872 
4873 			if (!(mi->mi_flags & MI_EXTATTR))
4874 				return (0);
4875 
4876 			rp = VTOR(vp);
4877 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4878 			    INTR(vp)))
4879 				return (EINTR);
4880 
4881 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4882 			if (error || avp == NULL)
4883 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4884 
4885 			nfs_rw_exit(&rp->r_rwlock);
4886 
4887 			if (error == 0 && avp != NULL) {
4888 				error = do_xattr_exists_check(avp, valp, cr);
4889 				VN_RELE(avp);
4890 			}
4891 		}
4892 		return (error ? EINVAL : 0);
4893 
4894 	case _PC_ACL_ENABLED:
4895 		*valp = _ACL_ACLENT_ENABLED;
4896 		return (0);
4897 
4898 	default:
4899 		return (EINVAL);
4900 	}
4901 }
4902 
4903 /*
4904  * Called by async thread to do synchronous pageio. Do the i/o, wait
4905  * for it to complete, and cleanup the page list when done.
4906  */
4907 static int
4908 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4909 	int flags, cred_t *cr)
4910 {
4911 	int error;
4912 
4913 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4914 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4915 	if (flags & B_READ)
4916 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4917 	else
4918 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4919 	return (error);
4920 }
4921 
4922 /* ARGSUSED */
4923 static int
4924 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4925 	int flags, cred_t *cr, caller_context_t *ct)
4926 {
4927 	int error;
4928 	rnode_t *rp;
4929 
4930 	if (pp == NULL)
4931 		return (EINVAL);
4932 
4933 	if (io_off > MAXOFF32_T)
4934 		return (EFBIG);
4935 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4936 		return (EIO);
4937 	rp = VTOR(vp);
4938 	mutex_enter(&rp->r_statelock);
4939 	rp->r_count++;
4940 	mutex_exit(&rp->r_statelock);
4941 
4942 	if (flags & B_ASYNC) {
4943 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4944 		    nfs_sync_pageio);
4945 	} else
4946 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4947 	mutex_enter(&rp->r_statelock);
4948 	rp->r_count--;
4949 	cv_broadcast(&rp->r_cv);
4950 	mutex_exit(&rp->r_statelock);
4951 	return (error);
4952 }
4953 
4954 /* ARGSUSED */
4955 static int
4956 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4957 	caller_context_t *ct)
4958 {
4959 	int error;
4960 	mntinfo_t *mi;
4961 
4962 	mi = VTOMI(vp);
4963 
4964 	if (nfs_zone() != mi->mi_zone)
4965 		return (EIO);
4966 	if (mi->mi_flags & MI_ACL) {
4967 		error = acl_setacl2(vp, vsecattr, flag, cr);
4968 		if (mi->mi_flags & MI_ACL)
4969 			return (error);
4970 	}
4971 
4972 	return (ENOSYS);
4973 }
4974 
4975 /* ARGSUSED */
4976 static int
4977 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4978 	caller_context_t *ct)
4979 {
4980 	int error;
4981 	mntinfo_t *mi;
4982 
4983 	mi = VTOMI(vp);
4984 
4985 	if (nfs_zone() != mi->mi_zone)
4986 		return (EIO);
4987 	if (mi->mi_flags & MI_ACL) {
4988 		error = acl_getacl2(vp, vsecattr, flag, cr);
4989 		if (mi->mi_flags & MI_ACL)
4990 			return (error);
4991 	}
4992 
4993 	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
4994 }
4995 
4996 /* ARGSUSED */
4997 static int
4998 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
4999 	caller_context_t *ct)
5000 {
5001 	int error;
5002 	struct shrlock nshr;
5003 	struct nfs_owner nfs_owner;
5004 	netobj lm_fh;
5005 
5006 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5007 		return (EIO);
5008 
5009 	/*
5010 	 * check for valid cmd parameter
5011 	 */
5012 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
5013 		return (EINVAL);
5014 
5015 	/*
5016 	 * Check access permissions
5017 	 */
5018 	if (cmd == F_SHARE &&
5019 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
5020 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
5021 		return (EBADF);
5022 
5023 	/*
5024 	 * If the filesystem is mounted using local locking, pass the
5025 	 * request off to the local share code.
5026 	 */
5027 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
5028 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
5029 
5030 	switch (cmd) {
5031 	case F_SHARE:
5032 	case F_UNSHARE:
5033 		lm_fh.n_len = sizeof (fhandle_t);
5034 		lm_fh.n_bytes = (char *)VTOFH(vp);
5035 
5036 		/*
5037 		 * If passed an owner that is too large to fit in an
5038 		 * nfs_owner it is likely a recursive call from the
5039 		 * lock manager client and pass it straight through.  If
5040 		 * it is not a nfs_owner then simply return an error.
5041 		 */
5042 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
5043 			if (((struct nfs_owner *)shr->s_owner)->magic !=
5044 			    NFS_OWNER_MAGIC)
5045 				return (EINVAL);
5046 
5047 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
5048 				error = set_errno(error);
5049 			}
5050 			return (error);
5051 		}
5052 		/*
5053 		 * Remote share reservations owner is a combination of
5054 		 * a magic number, hostname, and the local owner
5055 		 */
5056 		bzero(&nfs_owner, sizeof (nfs_owner));
5057 		nfs_owner.magic = NFS_OWNER_MAGIC;
5058 		(void) strncpy(nfs_owner.hname, uts_nodename(),
5059 		    sizeof (nfs_owner.hname));
5060 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
5061 		nshr.s_access = shr->s_access;
5062 		nshr.s_deny = shr->s_deny;
5063 		nshr.s_sysid = 0;
5064 		nshr.s_pid = ttoproc(curthread)->p_pid;
5065 		nshr.s_own_len = sizeof (nfs_owner);
5066 		nshr.s_owner = (caddr_t)&nfs_owner;
5067 
5068 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
5069 			error = set_errno(error);
5070 		}
5071 
5072 		break;
5073 
5074 	case F_HASREMOTELOCKS:
5075 		/*
5076 		 * NFS client can't store remote locks itself
5077 		 */
5078 		shr->s_access = 0;
5079 		error = 0;
5080 		break;
5081 
5082 	default:
5083 		error = EINVAL;
5084 		break;
5085 	}
5086 
5087 	return (error);
5088 }
5089