/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 *	All rights reserved.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/atomic.h>
#include <sys/policy.h>
#include <sys/sdt.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>

static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *);
static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
static int	nfs_bio(struct buf *, cred_t *);
static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static void	nfs_delmap_callback(struct as *, void *, uint_t);

/*
 * Error flags used to pass information about certain special errors
 * that must be handled specially.
 */
#define	NFS_EOF			-98

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */

static int	nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *,
			caller_context_t *);
static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
			caller_context_t *);
static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int	nfs_accessx(void *, int, cred_t *);
static int	nfs_readlink(vnode_t *, struct uio *, cred_t *,
			caller_context_t *);
static int	nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static void	nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
			int, vnode_t *, cred_t *, caller_context_t *,
			int *, pathname_t *);
static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int, caller_context_t *,
			vsecattr_t *);
static int	nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
			int);
static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, caller_context_t *, int, vsecattr_t *);
static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
			caller_context_t *, int);
static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *, caller_context_t *, int);
static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
			caller_context_t *, int);
static int	nfs_fid(vnode_t *, fid_t *, caller_context_t *);
static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *, caller_context_t *);
static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
			caller_context_t *);
static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *, caller_context_t *);
static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
			caller_context_t *);
static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *, caller_context_t *);
static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
			caller_context_t *);

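/*
 * Operations vector for NFS Version 2 vnodes; it is constructed from
 * the template below when the client module is initialized.
 */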
struct vnodeops *nfs_vnodeops;

const fs_operation_def_t nfs_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs_close },
	VOPNAME_READ,		{ .vop_read = nfs_read },
	VOPNAME_WRITE,		{ .vop_write = nfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs_remove },
	VOPNAME_LINK,		{ .vop_link = nfs_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs_delmap },
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs_pageio },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs_getvnodeops(void)
{
	return (nfs_vnodeops);
}

/* ARGSUSED */
static int
nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error;
	struct vattr va;
	rnode_t *rp;
	vnode_t *vp;

	vp = *vpp;
	rp = VTOR(vp);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
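	/*
	 * Stash a cred in the rnode if one is not already there so that
	 * later asynchronous operations, which may run without caller
	 * context, have credentials to use.
	 */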
	mutex_enter(&rp->r_statelock);
	if (rp->r_cred == NULL) {
		crhold(cr);
		rp->r_cred = cr;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If there is no cached data or if close-to-open
	 * consistency checking is turned off, we can avoid
	 * the over the wire getattr.  Otherwise, if the
	 * file system is mounted readonly, then just verify
	 * the caches are up to date using the normal mechanism.
	 * Else, if the file is not mmap'd, then just mark
	 * the attributes as timed out.  They will be refreshed
	 * and the caches validated prior to being used.
	 * Else, the file system is mounted writeable so
	 * force an over the wire GETATTR in order to ensure
	 * that all cached data is valid.
	 */
	if (vp->v_count > 1 ||
	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
		if (vn_is_readonly(vp))
			error = nfs_validate_caches(vp, cr);
		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
			PURGE_ATTRCACHE(vp);
			error = 0;
		} else {
			va.va_mask = AT_ALL;
			error = nfs_getattr_otw(vp, &va, cr);
		}
	} else
		error = 0;

	return (error);
}

/* ARGSUSED */
static int
nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	int error;
	struct vattr va;

	/*
	 * zone_enter(2) prevents processes from changing zones with NFS files
	 * open; if we happen to get here from the wrong zone we can't do
	 * anything over the wire.
	 */
	if (VTOMI(vp)->mi_zone != nfs_zone()) {
		/*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to
		 * another zone will fail, and one can't lock an NFS file
		 * and then change zones, as that fails too.
		 *
		 * Returning an error here is the sane thing to do.  A
		 * subsequent call to VN_RELE() which translates to a
		 * nfs_inactive() will clean up state: if the zone of the
		 * vnode's origin is still alive and kicking, an async worker
		 * thread will handle the request (from the correct zone), and
		 * everything (minus the final nfs_getattr_otw() call) should
		 * be OK. If the zone is going away nfs_async_inactive() will
		 * throw away cached pages inline.
		 */
		return (EIO);
	}

	/*
	 * If we are using local locking for this filesystem, then
	 * release all of the SYSV style record locks.  Otherwise,
	 * we are doing network locking and we need to release all
	 * of the network locks.  All of the locks held by this
	 * process on this file are released no matter what the
	 * incoming reference count is.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
	} else
		nfs_lockrelease(vp, flag, offset, cr);

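	/*
	 * If this is not the last close of the file, there is nothing
	 * further to do; the flush and commit work below is only done
	 * on last close.
	 */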
	if (count > 1)
		return (0);

	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled quicker
	 * and the .nfs* file on the server will get removed.
	 */
	rp = VTOR(vp);
	if (rp->r_unldvp != NULL)
		dnlc_purge_vp(vp);

	/*
	 * If the file was open for write and there are pages,
	 * then if the file system was mounted using the "no-close-
	 *	to-open" semantics, then start an asynchronous flush
	 *	of all of the pages in the file.
	 * else, if the file system was not mounted using the "no-close-
	 *	to-open" semantics, then do a synchronous flush and
	 *	commit of all of the dirty and uncommitted pages.
	 *
	 * The asynchronous flush of the pages in the "nocto" path
	 * mostly just associates a cred pointer with the rnode so
	 * writes which happen later will have a better chance of
	 * working.  It also starts the data being written to the
	 * server, but without unnecessarily delaying the application.
	 */
	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
			    cr, ct);
			if (error == EAGAIN)
				error = 0;
		} else
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
		if (!error) {
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		mutex_enter(&rp->r_statelock);
		error = rp->r_error;
		rp->r_error = 0;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
	 * refresh the attribute cache with a set of attributes which
	 * weren't returned from a WRITE.  This will enable the close-
	 * to-open processing to work.
	 */
	if (rp->r_flags & RWRITEATTR)
		(void) nfs_getattr_otw(vp, &va, cr);

	return (error);
}

/* ARGSUSED */
static int
nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	offset_t diff;
	int on;
	size_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo_t *mi;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	if (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
		return (EINVAL);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		size_t resid = 0;

		/*
		 * Let's try to do the read in as large a chunk as we can
		 * (Filesystem (NFS client) bsize if possible/needed).
		 * For V3, this is 32K and for V2, this is 8K.
		 */
		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			n = MIN(uiop->uio_resid, bufsize);
			error = nfsread(vp, base, uiop->uio_offset, n,
			    &resid, cr);
			if (!error) {
				n -= resid;
				error = uiomove(base, n, UIO_READ, uiop);
			}
		} while (!error && uiop->uio_resid > 0 && n > 0);
		kmem_free(base, bufsize);
		return (error);
	}

	error = 0;

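	/*
	 * Cached I/O path: validate the caches, then map the file a
	 * block at a time (via vpm or segmap) and copy the data out.
	 */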
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		error = nfs_validate_caches(vp, cr);
		if (error)
			break;

		mutex_enter(&rp->r_statelock);
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n,
			    1, S_READ);
			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}

/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	rlim_t limit;
	mntinfo_t *mi;

	rp = VTOR(vp);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		va.va_mask = AT_SIZE;
		error = nfsgetattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
		return (EINVAL);

	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
		limit = MAXOFF32_T;
	} else {
		limit = (rlim_t)uiop->uio_llimit;
	}

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_offset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

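	/*
	 * Take r_lkserlock as a reader to serialize this I/O with file
	 * locking operations on the same rnode.
	 */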
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		uint_t org_offset;

nfs_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_offset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfswrite(vp, base, org_offset,
				    count, cr);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

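	/*
	 * Cached write path: copy the data into the page cache one
	 * block at a time using vpm or segmap mappings.
	 */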
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
	int flags, cred_t *cr)
{
	struct buf *bp;
	int error;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	error = nfs_bio(bp, cr);

	bp_mapout(bp);
	pageio_done(bp);

	return (error);
}
812 
813 /*
814  * Write to file.  Writes to remote server in largest size
815  * chunks that the server can handle.  Write is synchronous.
816  */
817 static int
818 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
819 {
820 	rnode_t *rp;
821 	mntinfo_t *mi;
822 	struct nfswriteargs wa;
823 	struct nfsattrstat ns;
824 	int error;
825 	int tsize;
826 	int douprintf;
827 
828 	douprintf = 1;
829 
830 	rp = VTOR(vp);
831 	mi = VTOMI(vp);
832 
833 	ASSERT(nfs_zone() == mi->mi_zone);
834 
835 	wa.wa_args = &wa.wa_args_buf;
836 	wa.wa_fhandle = *VTOFH(vp);
837 
838 	do {
839 		tsize = MIN(mi->mi_curwrite, count);
840 		wa.wa_data = base;
841 		wa.wa_begoff = offset;
842 		wa.wa_totcount = tsize;
843 		wa.wa_count = tsize;
844 		wa.wa_offset = offset;
845 
846 		if (mi->mi_io_kstats) {
847 			mutex_enter(&mi->mi_lock);
848 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
849 			mutex_exit(&mi->mi_lock);
850 		}
851 		wa.wa_mblk = NULL;
852 		do {
853 			error = rfs2call(mi, RFS_WRITE,
854 			    xdr_writeargs, (caddr_t)&wa,
855 			    xdr_attrstat, (caddr_t)&ns, cr,
856 			    &douprintf, &ns.ns_status, 0, NULL);
857 		} while (error == ENFS_TRYAGAIN);
858 		if (mi->mi_io_kstats) {
859 			mutex_enter(&mi->mi_lock);
860 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
861 			mutex_exit(&mi->mi_lock);
862 		}
863 
864 		if (!error) {
865 			error = geterrno(ns.ns_status);
866 			/*
867 			 * Can't check for stale fhandle and purge caches
868 			 * here because pages are held by nfs_getpage.
869 			 * Just mark the attribute cache as timed out
870 			 * and set RWRITEATTR to indicate that the file
871 			 * was modified with a WRITE operation.
872 			 */
873 			if (!error) {
874 				count -= tsize;
875 				base += tsize;
876 				offset += tsize;
877 				if (mi->mi_io_kstats) {
878 					mutex_enter(&mi->mi_lock);
879 					KSTAT_IO_PTR(mi->mi_io_kstats)->
880 					    writes++;
881 					KSTAT_IO_PTR(mi->mi_io_kstats)->
882 					    nwritten += tsize;
883 					mutex_exit(&mi->mi_lock);
884 				}
885 				lwp_stat_update(LWP_STAT_OUBLK, 1);
886 				mutex_enter(&rp->r_statelock);
887 				PURGE_ATTRCACHE_LOCKED(rp);
888 				rp->r_flags |= RWRITEATTR;
889 				mutex_exit(&rp->r_statelock);
890 			}
891 		}
892 	} while (!error && count);
893 
894 	return (error);
895 }
896 
897 /*
898  * Read from a file.  Reads data in largest chunks our interface can handle.
899  */
900 static int
901 nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
902 	cred_t *cr)
903 {
904 	mntinfo_t *mi;
905 	struct nfsreadargs ra;
906 	struct nfsrdresult rr;
907 	int tsize;
908 	int error;
909 	int douprintf;
910 	failinfo_t fi;
911 	rnode_t *rp;
912 	struct vattr va;
913 	hrtime_t t;
914 
915 	rp = VTOR(vp);
916 	mi = VTOMI(vp);
917 
918 	ASSERT(nfs_zone() == mi->mi_zone);
919 
920 	douprintf = 1;
921 
922 	ra.ra_fhandle = *VTOFH(vp);
923 
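	/*
	 * Fill in the failover information so that the RPC layer can
	 * remap the filehandle if the request fails over to another
	 * server.
	 */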
	fi.vp = vp;
	fi.fhp = (caddr_t)&ra.ra_fhandle;
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		do {
			tsize = MIN(mi->mi_curread, count);
			rr.rr_data = base;
			ra.ra_offset = offset;
			ra.ra_totcount = tsize;
			ra.ra_count = tsize;
			t = gethrtime();
			error = rfs2call(mi, RFS_READ,
			    xdr_readargs, (caddr_t)&ra,
			    xdr_rdresult, (caddr_t)&rr, cr,
			    &douprintf, &rr.rr_status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(rr.rr_status);
			if (!error) {
				count -= rr.rr_count;
				base += rr.rr_count;
				offset += rr.rr_count;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
					    rr.rr_count;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_INBLK, 1);
			}
		}
	} while (!error && count && rr.rr_count == tsize);

	*residp = count;

	if (!error) {
		/*
		 * Since no error occurred, we have the current
		 * attributes and we need to do a cache check and then
		 * potentially update the cached attributes.  We can't
		 * use the normal attribute check and cache mechanisms
		 * because they might cause a cache flush which would
		 * deadlock.  Instead, we just check the cache to see
		 * if the attributes have changed.  If they have, then
		 * we just mark the attributes as out of date.  The
		 * next time that the attributes are checked, they will
		 * be out of date, new attributes will be fetched, and
		 * the page cache will be flushed.  If the attributes
		 * weren't changed, then we just update the cached
		 * attributes with these attributes.
		 */
		/*
		 * If NFS_ACL is supported on the server, then the
		 * attributes returned by the server may have minimal
		 * permissions sometimes denying access to users having
		 * proper access.  To get the proper attributes, mark
		 * the attributes as expired so that they will be
		 * refetched via the NFS_ACL GETATTR2 procedure.
		 */
		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
		mutex_enter(&rp->r_statelock);
		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
		    (mi->mi_flags & MI_ACL)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			if (rp->r_mtime <= t) {
				nfs_attrcache_va(vp, &va);
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	return (error);
}

/* ARGSUSED */
static int
nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
	caller_context_t *ct)
{

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	switch (cmd) {
		case _FIODIRECTIO:
			return (nfs_directio(vp, (int)arg, cr));
		default:
			return (ENOTTY);
	}
}

/* ARGSUSED */
static int
nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	rnode_t *rp;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	rp = VTOR(vp);
	if (flags & ATTR_HINT) {
		if (vap->va_mask ==
		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
			mutex_enter(&rp->r_statelock);
			if (vap->va_mask & AT_SIZE)
				vap->va_size = rp->r_size;
			if (vap->va_mask & AT_FSID)
				vap->va_fsid = rp->r_attr.va_fsid;
			if (vap->va_mask & AT_RDEV)
				vap->va_rdev = rp->r_attr.va_rdev;
			mutex_exit(&rp->r_statelock);
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there are any dirty pages or any outstanding
	 * asynchronous (write) requests for this file.
	 */
	if (vap->va_mask & AT_MTIME) {
		if (vn_has_cached_data(vp) &&
		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
			mutex_enter(&rp->r_statelock);
			rp->r_gcount++;
			mutex_exit(&rp->r_statelock);
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
			mutex_enter(&rp->r_statelock);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				if (!rp->r_error)
					rp->r_error = error;
			}
			if (--rp->r_gcount == 0)
				cv_broadcast(&rp->r_cv);
			mutex_exit(&rp->r_statelock);
		}
	}

	return (nfsgetattr(vp, vap, cr));
}

/*ARGSUSED4*/
static int
nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
		caller_context_t *ct)
{
	int error;
	uint_t mask;
	struct vattr va;

	mask = vap->va_mask;

	if (mask & AT_NOSET)
		return (EINVAL);

	if ((mask & AT_SIZE) &&
	    vap->va_type == VREG &&
	    vap->va_size > MAXOFF32_T)
		return (EFBIG);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	va.va_mask = AT_UID | AT_MODE;

	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
	    vp);

	if (error)
		return (error);

	return (nfssetattr(vp, vap, flags, cr));
}

static int
nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	struct nfssaargs args;
	struct nfsattrstat ns;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	mntinfo_t *mi;
	vsecattr_t *vsp;
	hrtime_t t;

	mask = vap->va_mask;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * If the system call was utime(2) or utimes(2) and the
	 * application did not specify the times, then set the
	 * mtime nanosecond field to 1 billion.  This will get
	 * translated from 1 billion nanoseconds to 1 million
	 * microseconds in the over the wire request.  The
	 * server will use 1 million in the microsecond field
	 * to tell whether both the mtime and atime should be
	 * set to the server's current time.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
		vap->va_mtime.tv_nsec = 1000000000;
		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			error = vattr_to_sattr(vap, &args.saa_sa);
		} else {
			/*
			 * Use server times. vap time values will not be used.
			 * To ensure no time overflow, make sure vap has
			 * valid values, but retain the original values.
			 */
			timestruc_t	mtime = vap->va_mtime;
			timestruc_t	atime = vap->va_atime;
			time_t		now;

			now = gethrestime_sec();
			if (NFS_TIME_T_OK(now)) {
				/* Just in case server does not know of this */
				vap->va_mtime.tv_sec = now;
				vap->va_atime.tv_sec = now;
			} else {
				vap->va_mtime.tv_sec = 0;
				vap->va_atime.tv_sec = 0;
			}
			error = vattr_to_sattr(vap, &args.saa_sa);
			/* set vap times back on */
			vap->va_mtime = mtime;
			vap->va_atime = atime;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr(vap, &args.saa_sa);
	}
	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}
	args.saa_fh = *VTOFH(vp);

	va.va_mask = AT_MODE;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

	mi = VTOMI(vp);

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_SETATTR,
	    xdr_saargs, (caddr_t)&args,
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error) {
			/*
			 * If changing the size of the file, invalidate
			 * any local cached data which is no longer part
			 * of the file.  We also possibly invalidate the
			 * last page in the file.  We could use
			 * pvn_vpzero(), but this would mark the page as
			 * modified and require it to be written back to
			 * the server for no particularly good reason.
			 * This way, if we access it, then we bring it
			 * back in.  A read should be cheaper than a
			 * write.
			 */
			if (mask & AT_SIZE) {
				nfs_invalidate_pages(vp,
				    (vap->va_size & PAGEMASK), cr);
			}
			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by the server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * refetched via the NFS_ACL GETATTR2 procedure.
			 */
			if (mi->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(vp);
			}
			/*
			 * This next check attempts to deal with NFS
			 * servers which cannot handle increasing
			 * the size of the file via setattr.  Most
			 * of these servers do not return an error,
			 * but do not change the size of the file.
			 * Hence, this check and then attempt to set
			 * the file size by writing 1 byte at the
			 * offset of the end of the file that we need.
			 */
			if ((mask & AT_SIZE) &&
			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
				char zb = '\0';

				error = nfswrite(vp, &zb,
				    vap->va_size - sizeof (zb),
				    sizeof (zb), cr);
			}
			/*
			 * Some servers will change the mode to clear the setuid
			 * and setgid bits when changing the uid or gid.  The
			 * client needs to compensate appropriately.
			 */
			if (mask & (AT_UID | AT_GID)) {
				int terror;

				va.va_mask = AT_MODE;
				terror = nfsgetattr(vp, &va, cr);
				if (!terror &&
				    (((mask & AT_MODE) &&
				    va.va_mode != vap->va_mode) ||
				    (!(mask & AT_MODE) &&
				    va.va_mode != omode))) {
					va.va_mask = AT_MODE;
					if (mask & AT_MODE)
						va.va_mode = vap->va_mode;
					else
						va.va_mode = omode;
					(void) nfssetattr(vp, &va, 0, cr);
				}
			}
		} else {
			PURGE_ATTRCACHE(vp);
			PURGE_STALE_FH(error, vp, cr);
		}
	} else {
		PURGE_ATTRCACHE(vp);
	}

	return (error);
}

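/*
 * Wrapper around nfs_access() which matches the access-checking
 * callback signature expected by secpolicy_vnode_setattr().
 */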
static int
nfs_accessx(void *vp, int mode, cred_t *cr)
{
	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
	return (nfs_access(vp, mode, 0, cr, NULL));
}

/* ARGSUSED */
static int
nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	struct vattr va;
	int error;
	mntinfo_t *mi;
	int shift = 0;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI_ACL) {
		error = acl_access2(vp, mode, flags, cr);
		/*
		 * acl_access2() may clear MI_ACL if it discovers that
		 * the server does not support the NFS_ACL protocol; in
		 * that case fall through to the mode bit check below.
		 */
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	va.va_mask = AT_MODE | AT_UID | AT_GID;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Disallow write attempts on read-only
	 * file systems, unless the file is a
	 * device node.
	 */
	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
		return (EROFS);

	/*
	 * Disallow attempts to access mandatory lock files.
	 */
	if ((mode & (VWRITE | VREAD | VEXEC)) &&
	    MANDLOCK(vp, va.va_mode))
		return (EACCES);

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group,
	 * then check public access.
	 */
	if (crgetuid(cr) != va.va_uid) {
		shift += 3;
		if (!groupmember(va.va_gid, cr))
			shift += 3;
	}
	mode &= ~(va.va_mode << shift);
	if (mode == 0)
		return (0);

	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
}

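/*
 * Tunable: when non-zero, symlink contents are cached in the rnode
 * so subsequent readlinks can be satisfied without an RPC.
 */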
static int nfs_do_symlink_cache = 1;

/* ARGSUSED */
static int
nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
{
	int error;
	struct nfsrdlnres rl;
	rnode_t *rp;
	int douprintf;
	failinfo_t fi;

	/*
	 * We want to be consistent with UFS semantics so we will return
	 * EINVAL instead of ENXIO. This violates the XNFS spec and
	 * RFC 1094, which are wrong anyway. BUGID 1138002.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs_validate_caches(vp, cr);
		if (error)
			return (error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}

	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	error = rfs2call(VTOMI(vp), RFS_READLINK,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_rdlnres, (caddr_t)&rl, cr,
	    &douprintf, &rl.rl_status, 0, &fi);

	if (error) {
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		return (error);
	}

	error = geterrno(rl.rl_status);
	if (!error) {
		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = rl.rl_data;
				rp->r_symlink.len = (int)rl.rl_count;
				rp->r_symlink.size = NFS_MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)rl.rl_data,
				    NFS_MAXPATHLEN);
			}
		} else {
			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		}
	} else {
		PURGE_STALE_FH(error, vp, cr);
		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
	}

	/*
	 * Conform to UFS semantics (see comment above)
	 */
	return (error == ENXIO ? EINVAL : error);
}

/*
 * Flush local dirty pages to stable storage on the server.
 *
 * If FNODSYNC is specified, then there is nothing to do because
 * metadata changes are not cached on the client before being
 * sent to the server.
 */
/* ARGSUSED */
static int
nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	int error;

	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
		return (0);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
	if (!error)
		error = VTOR(vp)->r_error;
	return (error);
}

/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
/* ARGSUSED */
static void
nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way,
		 * the unl* fields could become inconsistent with respect
		 * to each other due to a race condition between this
		 * code and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			struct nfsdiropargs da;
			enum nfsstat status;
			int douprintf;
			int error;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs_putpage(vp, (offset_t)0, 0, 0,
				    cr, ct);
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs(&da, unlname, unldvp);

			douprintf = 1;

			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
			    xdr_diropargs, (caddr_t)&da,
			    xdr_enum, (caddr_t)&status, unlcred,
			    &douprintf, &status, 0, NULL);

			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
				nfs_purge_rddir_cache(unldvp);
			PURGE_ATTRCACHE(unldvp);

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}

/*
 * Remote file system operations having to do with directory manipulation.
 */

/* ARGSUSED */
static int
nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
	int *direntflags, pathname_t *realpnp)
{
	int error;
	vnode_t *vp;
	vnode_t *avp = NULL;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	drp = VTOR(dvp);

	/*
	 * Are we looking up extended attributes?  If so, "dvp" is
	 * the file or directory for which we want attributes, and
	 * we need a lookup of the hidden attribute directory
	 * before we lookup the rest of the path.
	 */
	if (flags & LOOKUP_XATTR) {
		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
		mntinfo_t *mi;

		mi = VTOMI(dvp);
		if (!(mi->mi_flags & MI_EXTATTR))
			return (EINVAL);

		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
			return (EINTR);

		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
		if (avp == NULL)
			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
		else
			error = 0;

		nfs_rw_exit(&drp->r_rwlock);

		if (error) {
			if (mi->mi_flags & MI_EXTATTR)
				return (error);
			return (EINVAL);
		}
		dvp = avp;
		drp = VTOR(dvp);
	}

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
		error = EINTR;
		goto out;
	}

	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);

	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

out:
	if (avp != NULL)
		VN_RELE(avp);

	return (error);
}

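/*
 * Tunable: when non-zero, failed lookups are entered into the DNLC
 * as negative cache entries (DNLC_NO_VNODE).
 */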
static int nfs_lookup_neg_cache = 1;

#ifdef DEBUG
static int nfs_lookup_dnlc_hits = 0;
static int nfs_lookup_dnlc_misses = 0;
static int nfs_lookup_dnlc_neg_hits = 0;
static int nfs_lookup_dnlc_disappears = 0;
static int nfs_lookup_dnlc_lookups = 0;
#endif

/* ARGSUSED */
int
nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
{
	int error;

	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If we're called with RFSCALL_SOFT, it's important that
	 * the only rfscall is one we make directly; if we permit
	 * an access call because we're looking up "." or validating
	 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have the RFSCALL_SOFT set.
	 */
	if (rfscall_flags & RFSCALL_SOFT)
		goto callit;

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		error = nfs_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Lookup this name in the DNLC.  If there was a valid entry,
	 * then return the results of the lookup.
	 */
	error = nfslookup_dnlc(dvp, nm, vpp, cr);
	if (error || *vpp != NULL)
		return (error);

callit:
	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);

	return (error);
}

static int
nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	int error;
	vnode_t *vp;

	ASSERT(*nm != '\0');
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * Lookup this name in the DNLC.  If successful, then validate
	 * the caches and then recheck the DNLC.  The DNLC is rechecked
	 * just in case this entry got invalidated during the call
	 * to nfs_validate_caches.
	 *
	 * An assumption is being made that it is safe to say that a
	 * file exists which may not exist on the server.  Any
	 * operations to the server will fail with ESTALE.
	 */
#ifdef DEBUG
	nfs_lookup_dnlc_lookups++;
#endif
	vp = dnlc_lookup(dvp, nm);
	if (vp != NULL) {
		VN_RELE(vp);
		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
			PURGE_ATTRCACHE(dvp);
		}
		error = nfs_validate_caches(dvp, cr);
		if (error)
			return (error);
		vp = dnlc_lookup(dvp, nm);
		if (vp != NULL) {
			error = nfs_access(dvp, VEXEC, 0, cr, NULL);
			if (error) {
				VN_RELE(vp);
				return (error);
			}
			if (vp == DNLC_NO_VNODE) {
				VN_RELE(vp);
#ifdef DEBUG
				nfs_lookup_dnlc_neg_hits++;
#endif
				return (ENOENT);
			}
			*vpp = vp;
#ifdef DEBUG
			nfs_lookup_dnlc_hits++;
#endif
			return (0);
		}
#ifdef DEBUG
		nfs_lookup_dnlc_disappears++;
#endif
	}
#ifdef DEBUG
	else
		nfs_lookup_dnlc_misses++;
#endif

	*vpp = NULL;

	return (0);
}
1854 
1855 static int
1856 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1857 	int rfscall_flags)
1858 {
1859 	int error;
1860 	struct nfsdiropargs da;
1861 	struct nfsdiropres dr;
1862 	int douprintf;
1863 	failinfo_t fi;
1864 	hrtime_t t;
1865 
1866 	ASSERT(*nm != '\0');
1867 	ASSERT(dvp->v_type == VDIR);
1868 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1869 
1870 	setdiropargs(&da, nm, dvp);
1871 
1872 	fi.vp = dvp;
1873 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1874 	fi.copyproc = nfscopyfh;
1875 	fi.lookupproc = nfslookup;
1876 	fi.xattrdirproc = acl_getxattrdir2;
1877 
1878 	douprintf = 1;
1879 
1880 	t = gethrtime();
1881 
1882 	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1883 	    xdr_diropargs, (caddr_t)&da,
1884 	    xdr_diropres, (caddr_t)&dr, cr,
1885 	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
1886 
1887 	if (!error) {
1888 		error = geterrno(dr.dr_status);
1889 		if (!error) {
1890 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1891 			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1892 			/*
1893 			 * If NFS_ACL is supported on the server, then the
1894 			 * attributes returned by the server may carry minimal
1895 			 * permissions, sometimes denying access to users who
1896 			 * have proper access.  To get the proper attributes,
1897 			 * mark the attributes as expired so that they will be
1898 			 * fetched again via the NFS_ACL GETATTR2 procedure.
1899 			 */
1900 			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1901 				PURGE_ATTRCACHE(*vpp);
1902 			}
1903 			if (!(rfscall_flags & RFSCALL_SOFT))
1904 				dnlc_update(dvp, nm, *vpp);
1905 		} else {
1906 			PURGE_STALE_FH(error, dvp, cr);
1907 			if (error == ENOENT && nfs_lookup_neg_cache)
1908 				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1909 		}
1910 	}
1911 
1912 	return (error);
1913 }
1914 
1915 /* ARGSUSED */
1916 static int
1917 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1918 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1919 	vsecattr_t *vsecp)
1920 {
1921 	int error;
1922 	struct nfscreatargs args;
1923 	struct nfsdiropres dr;
1924 	int douprintf;
1925 	vnode_t *vp;
1926 	rnode_t *rp;
1927 	struct vattr vattr;
1928 	rnode_t *drp;
1929 	vnode_t *tempvp;
1930 	hrtime_t t;
1931 
1932 	drp = VTOR(dvp);
1933 
1934 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1935 		return (EPERM);
1936 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1937 		return (EINTR);
1938 
1939 	/*
1940 	 * We make a copy of the attributes because the caller does not
1941 	 * expect us to change what va points to.
1942 	 */
1943 	vattr = *va;
1944 
1945 	/*
1946 	 * If the pathname is "", just use dvp.  Don't need
1947 	 * to send it over the wire, look it up in the dnlc,
1948 	 * or perform any access checks.
1949 	 */
1950 	if (*nm == '\0') {
1951 		error = 0;
1952 		VN_HOLD(dvp);
1953 		vp = dvp;
1954 	/*
1955 	 * If the pathname is ".", just use dvp.  Don't need
1956 	 * to send it over the wire or look it up in the dnlc,
1957 	 * just need to check access.
1958 	 */
1959 	} else if (strcmp(nm, ".") == 0) {
1960 		error = nfs_access(dvp, VEXEC, 0, cr, ct);
1961 		if (error) {
1962 			nfs_rw_exit(&drp->r_rwlock);
1963 			return (error);
1964 		}
1965 		VN_HOLD(dvp);
1966 		vp = dvp;
1967 	/*
1968 	 * We need to go over the wire, just to be sure whether the
1969 	 * file exists or not.  Using the DNLC can be dangerous in
1970 	 * this case when making a decision regarding existence.
1971 	 */
1972 	} else {
1973 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
1974 	}
1975 	if (!error) {
1976 		if (exclusive == EXCL)
1977 			error = EEXIST;
1978 		else if (vp->v_type == VDIR && (mode & VWRITE))
1979 			error = EISDIR;
1980 		else {
1981 			/*
1982 			 * If vnode is a device, create special vnode.
1983 			 */
1984 			if (IS_DEVVP(vp)) {
1985 				tempvp = vp;
1986 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1987 				VN_RELE(tempvp);
1988 			}
1989 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
1990 				if ((vattr.va_mask & AT_SIZE) &&
1991 				    vp->v_type == VREG) {
1992 					vattr.va_mask = AT_SIZE;
1993 					error = nfssetattr(vp, &vattr, 0, cr);
1994 				}
1995 			}
1996 		}
1997 		nfs_rw_exit(&drp->r_rwlock);
1998 		if (error) {
1999 			VN_RELE(vp);
2000 		} else {
2001 			/*
2002 			 * An existing file may have been truncated; notify.
2003 			 */
2004 			vnevent_create(vp, ct);
2005 			*vpp = vp;
2006 		}
2007 		return (error);
2008 	}
2009 
2010 	ASSERT(vattr.va_mask & AT_TYPE);
2011 	if (vattr.va_type == VREG) {
2012 		ASSERT(vattr.va_mask & AT_MODE);
2013 		if (MANDMODE(vattr.va_mode)) {
2014 			nfs_rw_exit(&drp->r_rwlock);
2015 			return (EACCES);
2016 		}
2017 	}
2018 
2019 	dnlc_remove(dvp, nm);
2020 
2021 	setdiropargs(&args.ca_da, nm, dvp);
2022 
2023 	/*
2024 	 * Decide what the group-id of the created file should be.
2025 	 * Set it in the attribute list as advisory... then do a setattr
2026 	 * if the server didn't get it right the first time.
2027 	 */
2028 	error = setdirgid(dvp, &vattr.va_gid, cr);
2029 	if (error) {
2030 		nfs_rw_exit(&drp->r_rwlock);
2031 		return (error);
2032 	}
2033 	vattr.va_mask |= AT_GID;
2034 
2035 	/*
2036 	 * This is a completely gross hack to make mknod
2037 	 * work over the wire until we can whack the protocol
2038 	 */
2039 #define	IFCHR		0020000		/* character special */
2040 #define	IFBLK		0060000		/* block special */
2041 #define	IFSOCK		0140000		/* socket */
2042 
2043 	/*
2044 	 * dev_t is uint_t in 5.x and short in 4.x.  4.x supports
2045 	 * 8 bit majors and 8 bit minors; 5.x supports 14 bit majors
2046 	 * and 18 bit minors.  If the 5.x major and minor numbers
2047 	 * both fit in 8 bits, compress the device number before
2048 	 * sending it.  Otherwise, the 4.x server will not create
2049 	 * the device with the correct device number and nothing
2050 	 * can be done about this.
2051 	 */
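	/*
	 * For example (assuming the 4.x encoding of an 8-bit major
	 * above an 8-bit minor), a device with major 3 and minor 5
	 * compresses to (3 << 8) | 5 == 0x0305, which a 4.x server
	 * can store faithfully; a device whose minor is, say, 300
	 * does not fit and is sent as the expanded dev32_t instead.
	 */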
2052 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2053 		dev_t d = vattr.va_rdev;
2054 		dev32_t dev32;
2055 
2056 		if (vattr.va_type == VCHR)
2057 			vattr.va_mode |= IFCHR;
2058 		else
2059 			vattr.va_mode |= IFBLK;
2060 
2061 		(void) cmpldev(&dev32, d);
2062 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2063 			vattr.va_size = (u_offset_t)dev32;
2064 		else
2065 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2066 
2067 		vattr.va_mask |= AT_MODE|AT_SIZE;
2068 	} else if (vattr.va_type == VFIFO) {
2069 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
2070 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
2071 		vattr.va_mask |= AT_MODE|AT_SIZE;
2072 	} else if (vattr.va_type == VSOCK) {
2073 		vattr.va_mode |= IFSOCK;
2074 		/*
2075 		 * To avoid triggering bugs in servers, set AT_SIZE
2076 		 * (all other RFS_CREATE calls set this).
2077 		 */
2078 		vattr.va_size = 0;
2079 		vattr.va_mask |= AT_MODE|AT_SIZE;
2080 	}
2081 
2082 	args.ca_sa = &args.ca_sa_buf;
2083 	error = vattr_to_sattr(&vattr, args.ca_sa);
2084 	if (error) {
2085 		/* req time field(s) overflow - return immediately */
2086 		nfs_rw_exit(&drp->r_rwlock);
2087 		return (error);
2088 	}
2089 
2090 	douprintf = 1;
2091 
2092 	t = gethrtime();
2093 
2094 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2095 	    xdr_creatargs, (caddr_t)&args,
2096 	    xdr_diropres, (caddr_t)&dr, cr,
2097 	    &douprintf, &dr.dr_status, 0, NULL);
2098 
2099 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2100 
2101 	if (!error) {
2102 		error = geterrno(dr.dr_status);
2103 		if (!error) {
2104 			if (HAVE_RDDIR_CACHE(drp))
2105 				nfs_purge_rddir_cache(dvp);
2106 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2107 			    dvp->v_vfsp, t, cr, NULL, NULL);
2108 			/*
2109 			 * If NFS_ACL is supported on the server, then the
2110 			 * attributes returned by the server may carry minimal
2111 			 * permissions, sometimes denying access to users who
2112 			 * have proper access.  To get the proper attributes,
2113 			 * mark the attributes as expired so that they will be
2114 			 * fetched again via the NFS_ACL GETATTR2 procedure.
2115 			 */
2116 			if (VTOMI(vp)->mi_flags & MI_ACL) {
2117 				PURGE_ATTRCACHE(vp);
2118 			}
2119 			dnlc_update(dvp, nm, vp);
2120 			rp = VTOR(vp);
2121 			if (vattr.va_size == 0) {
2122 				mutex_enter(&rp->r_statelock);
2123 				rp->r_size = 0;
2124 				mutex_exit(&rp->r_statelock);
2125 				if (vn_has_cached_data(vp)) {
2126 					ASSERT(vp->v_type != VCHR);
2127 					nfs_invalidate_pages(vp,
2128 					    (u_offset_t)0, cr);
2129 				}
2130 			}
2131 
2132 			/*
2133 			 * Make sure the gid was set correctly.
2134 			 * If not, try to set it (but don't lose
2135 			 * any sleep over it).
2136 			 */
2137 			if (vattr.va_gid != rp->r_attr.va_gid) {
2138 				vattr.va_mask = AT_GID;
2139 				(void) nfssetattr(vp, &vattr, 0, cr);
2140 			}
2141 
2142 			/*
2143 			 * If vnode is a device create special vnode
2144 			 */
2145 			if (IS_DEVVP(vp)) {
2146 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2147 				VN_RELE(vp);
2148 			} else
2149 				*vpp = vp;
2150 		} else {
2151 			PURGE_STALE_FH(error, dvp, cr);
2152 		}
2153 	}
2154 
2155 	nfs_rw_exit(&drp->r_rwlock);
2156 
2157 	return (error);
2158 }
2159 
2160 /*
2161  * Weirdness: if the vnode to be removed is open,
2162  * we rename it instead of removing it, and nfs_inactive
2163  * will remove the new name.
2164  */
2165 /* ARGSUSED */
2166 static int
2167 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2168 {
2169 	int error;
2170 	struct nfsdiropargs da;
2171 	enum nfsstat status;
2172 	vnode_t *vp;
2173 	char *tmpname;
2174 	int douprintf;
2175 	rnode_t *rp;
2176 	rnode_t *drp;
2177 
2178 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2179 		return (EPERM);
2180 	drp = VTOR(dvp);
2181 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2182 		return (EINTR);
2183 
2184 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2185 	if (error) {
2186 		nfs_rw_exit(&drp->r_rwlock);
2187 		return (error);
2188 	}
2189 
2190 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2191 		VN_RELE(vp);
2192 		nfs_rw_exit(&drp->r_rwlock);
2193 		return (EPERM);
2194 	}
2195 
2196 	/*
2197 	 * First just remove the entry from the name cache, as it
2198 	 * is most likely the only entry for this vp.
2199 	 */
2200 	dnlc_remove(dvp, nm);
2201 
2202 	/*
2203 	 * If the file has a v_count > 1 then there may be more than one
2204 	 * entry in the name cache due to multiple links or an open file,
2205 	 * but we don't have the real reference count so flush all
2206 	 * possible entries.
2207 	 */
2208 	if (vp->v_count > 1)
2209 		dnlc_purge_vp(vp);
2210 
2211 	/*
2212 	 * Now we have the real reference count on the vnode
2213 	 */
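	/*
	 * If the file is still open (v_count > 1) and either no
	 * unlink is pending or we are removing the pending-unlink
	 * name itself, rename the file to a private temporary name
	 * instead of removing it, and record the name in
	 * r_unldvp/r_unlname so that nfs_inactive() can remove the
	 * temporary name once the last reference goes away.
	 */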
2214 	rp = VTOR(vp);
2215 	mutex_enter(&rp->r_statelock);
2216 	if (vp->v_count > 1 &&
2217 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2218 		mutex_exit(&rp->r_statelock);
2219 		tmpname = newname();
2220 		error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2221 		if (error)
2222 			kmem_free(tmpname, MAXNAMELEN);
2223 		else {
2224 			mutex_enter(&rp->r_statelock);
2225 			if (rp->r_unldvp == NULL) {
2226 				VN_HOLD(dvp);
2227 				rp->r_unldvp = dvp;
2228 				if (rp->r_unlcred != NULL)
2229 					crfree(rp->r_unlcred);
2230 				crhold(cr);
2231 				rp->r_unlcred = cr;
2232 				rp->r_unlname = tmpname;
2233 			} else {
2234 				kmem_free(rp->r_unlname, MAXNAMELEN);
2235 				rp->r_unlname = tmpname;
2236 			}
2237 			mutex_exit(&rp->r_statelock);
2238 		}
2239 	} else {
2240 		mutex_exit(&rp->r_statelock);
2241 		/*
2242 		 * We need to flush any dirty pages which happen to
2243 		 * be hanging around before removing the file.  This
2244 		 * shouldn't happen very often, and mostly on file
2245 		 * systems mounted "nocto".
2246 		 */
2247 		if (vn_has_cached_data(vp) &&
2248 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2249 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2250 			if (error && (error == ENOSPC || error == EDQUOT)) {
2251 				mutex_enter(&rp->r_statelock);
2252 				if (!rp->r_error)
2253 					rp->r_error = error;
2254 				mutex_exit(&rp->r_statelock);
2255 			}
2256 		}
2257 
2258 		setdiropargs(&da, nm, dvp);
2259 
2260 		douprintf = 1;
2261 
2262 		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2263 		    xdr_diropargs, (caddr_t)&da,
2264 		    xdr_enum, (caddr_t)&status, cr,
2265 		    &douprintf, &status, 0, NULL);
2266 
2267 		/*
2268 		 * The xattr dir may be gone after the last attr is removed,
2269 		 * so flush it from dnlc.
2270 		 */
2271 		if (dvp->v_flag & V_XATTRDIR)
2272 			dnlc_purge_vp(dvp);
2273 
2274 		PURGE_ATTRCACHE(dvp);	/* mod time changed */
2275 		PURGE_ATTRCACHE(vp);	/* link count changed */
2276 
2277 		if (!error) {
2278 			error = geterrno(status);
2279 			if (!error) {
2280 				if (HAVE_RDDIR_CACHE(drp))
2281 					nfs_purge_rddir_cache(dvp);
2282 			} else {
2283 				PURGE_STALE_FH(error, dvp, cr);
2284 			}
2285 		}
2286 	}
2287 
2288 	if (error == 0) {
2289 		vnevent_remove(vp, dvp, nm, ct);
2290 	}
2291 	VN_RELE(vp);
2292 
2293 	nfs_rw_exit(&drp->r_rwlock);
2294 
2295 	return (error);
2296 }
2297 
2298 /* ARGSUSED */
2299 static int
2300 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2301 	caller_context_t *ct, int flags)
2302 {
2303 	int error;
2304 	struct nfslinkargs args;
2305 	enum nfsstat status;
2306 	vnode_t *realvp;
2307 	int douprintf;
2308 	rnode_t *tdrp;
2309 
2310 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2311 		return (EPERM);
2312 	if (VOP_REALVP(svp, &realvp, ct) == 0)
2313 		svp = realvp;
2314 
2315 	args.la_from = VTOFH(svp);
2316 	setdiropargs(&args.la_to, tnm, tdvp);
2317 
2318 	tdrp = VTOR(tdvp);
2319 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2320 		return (EINTR);
2321 
2322 	dnlc_remove(tdvp, tnm);
2323 
2324 	douprintf = 1;
2325 
2326 	error = rfs2call(VTOMI(svp), RFS_LINK,
2327 	    xdr_linkargs, (caddr_t)&args,
2328 	    xdr_enum, (caddr_t)&status, cr,
2329 	    &douprintf, &status, 0, NULL);
2330 
2331 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2332 	PURGE_ATTRCACHE(svp);	/* link count changed */
2333 
2334 	if (!error) {
2335 		error = geterrno(status);
2336 		if (!error) {
2337 			if (HAVE_RDDIR_CACHE(tdrp))
2338 				nfs_purge_rddir_cache(tdvp);
2339 		}
2340 	}
2341 
2342 	nfs_rw_exit(&tdrp->r_rwlock);
2343 
2344 	if (!error) {
2345 		/*
2346 		 * Notify the source file of this link operation.
2347 		 */
2348 		vnevent_link(svp, ct);
2349 	}
2350 	return (error);
2351 }
2352 
2353 /* ARGSUSED */
2354 static int
2355 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2356 	caller_context_t *ct, int flags)
2357 {
2358 	vnode_t *realvp;
2359 
2360 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2361 		return (EPERM);
2362 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2363 		ndvp = realvp;
2364 
2365 	return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2366 }
2367 
2368 /*
2369  * nfsrename does the real work of renaming in NFS Version 2.
2370  */
2371 static int
2372 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2373     caller_context_t *ct)
2374 {
2375 	int error;
2376 	enum nfsstat status;
2377 	struct nfsrnmargs args;
2378 	int douprintf;
2379 	vnode_t *nvp = NULL;
2380 	vnode_t *ovp = NULL;
2381 	char *tmpname;
2382 	rnode_t *rp;
2383 	rnode_t *odrp;
2384 	rnode_t *ndrp;
2385 
2386 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2387 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2388 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2389 		return (EINVAL);
2390 
2391 	odrp = VTOR(odvp);
2392 	ndrp = VTOR(ndvp);
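	/*
	 * Acquire the r_rwlocks of the two directories in (rnode)
	 * address order so that two concurrent renames between the
	 * same pair of directories cannot deadlock on each other.
	 */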
2393 	if ((intptr_t)odrp < (intptr_t)ndrp) {
2394 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2395 			return (EINTR);
2396 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2397 			nfs_rw_exit(&odrp->r_rwlock);
2398 			return (EINTR);
2399 		}
2400 	} else {
2401 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2402 			return (EINTR);
2403 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2404 			nfs_rw_exit(&ndrp->r_rwlock);
2405 			return (EINTR);
2406 		}
2407 	}
2408 
2409 	/*
2410 	 * Lookup the target file.  If it exists, it needs to be
2411 	 * checked to see whether it is a mount point and whether
2412 	 * it is active (open).
2413 	 */
2414 	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2415 	if (!error) {
2416 		/*
2417 		 * If this file has been mounted on, then just
2418 		 * return busy because renaming to it would remove
2419 		 * the mounted file system from the name space.
2420 		 */
2421 		if (vn_mountedvfs(nvp) != NULL) {
2422 			VN_RELE(nvp);
2423 			nfs_rw_exit(&odrp->r_rwlock);
2424 			nfs_rw_exit(&ndrp->r_rwlock);
2425 			return (EBUSY);
2426 		}
2427 
2428 		/*
2429 		 * Purge the name cache of all references to this vnode
2430 		 * so that we can check the reference count to infer
2431 		 * whether it is active or not.
2432 		 */
2433 		/*
2434 		 * First just remove the entry from the name cache, as it
2435 		 * is most likely the only entry for this vp.
2436 		 */
2437 		dnlc_remove(ndvp, nnm);
2438 		/*
2439 		 * If the file has a v_count > 1 then there may be more
2440 		 * than one entry in the name cache due to multiple links
2441 		 * or an open file, but we don't have the real reference
2442 		 * count so flush all possible entries.
2443 		 */
2444 		if (nvp->v_count > 1)
2445 			dnlc_purge_vp(nvp);
2446 
2447 		/*
2448 		 * If the vnode is active and is not a directory,
2449 		 * arrange to rename it to a
2450 		 * temporary file so that it will continue to be
2451 		 * accessible.  This implements the "unlink-open-file"
2452 		 * semantics for the target of a rename operation.
2453 		 * Before doing this though, make sure that the
2454 		 * source and target files are not already the same.
2455 		 */
2456 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2457 			/*
2458 			 * Lookup the source name.
2459 			 */
2460 			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2461 			    cr, 0);
2462 
2463 			/*
2464 			 * The source name *should* already exist.
2465 			 */
2466 			if (error) {
2467 				VN_RELE(nvp);
2468 				nfs_rw_exit(&odrp->r_rwlock);
2469 				nfs_rw_exit(&ndrp->r_rwlock);
2470 				return (error);
2471 			}
2472 
2473 			/*
2474 			 * Compare the two vnodes.  If they are the same,
2475 			 * just release all held vnodes and return success.
2476 			 */
2477 			if (ovp == nvp) {
2478 				VN_RELE(ovp);
2479 				VN_RELE(nvp);
2480 				nfs_rw_exit(&odrp->r_rwlock);
2481 				nfs_rw_exit(&ndrp->r_rwlock);
2482 				return (0);
2483 			}
2484 
2485 			/*
2486 			 * Can't mix and match directories and non-
2487 			 * directories in rename operations.  We already
2488 			 * know that the target is not a directory.  If
2489 			 * the source is a directory, return an error.
2490 			 */
2491 			if (ovp->v_type == VDIR) {
2492 				VN_RELE(ovp);
2493 				VN_RELE(nvp);
2494 				nfs_rw_exit(&odrp->r_rwlock);
2495 				nfs_rw_exit(&ndrp->r_rwlock);
2496 				return (ENOTDIR);
2497 			}
2498 
2499 			/*
2500 			 * The target file exists, is not the same as
2501 			 * the source file, and is active.  Link it
2502 			 * to a temporary filename to avoid having
2503 			 * the server remove the file completely.
2504 			 */
2505 			tmpname = newname();
2506 			error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2507 			if (error == EOPNOTSUPP) {
2508 				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2509 				    cr, NULL, 0);
2510 			}
2511 			if (error) {
2512 				kmem_free(tmpname, MAXNAMELEN);
2513 				VN_RELE(ovp);
2514 				VN_RELE(nvp);
2515 				nfs_rw_exit(&odrp->r_rwlock);
2516 				nfs_rw_exit(&ndrp->r_rwlock);
2517 				return (error);
2518 			}
2519 			rp = VTOR(nvp);
2520 			mutex_enter(&rp->r_statelock);
2521 			if (rp->r_unldvp == NULL) {
2522 				VN_HOLD(ndvp);
2523 				rp->r_unldvp = ndvp;
2524 				if (rp->r_unlcred != NULL)
2525 					crfree(rp->r_unlcred);
2526 				crhold(cr);
2527 				rp->r_unlcred = cr;
2528 				rp->r_unlname = tmpname;
2529 			} else {
2530 				kmem_free(rp->r_unlname, MAXNAMELEN);
2531 				rp->r_unlname = tmpname;
2532 			}
2533 			mutex_exit(&rp->r_statelock);
2534 		}
2535 	}
2536 
2537 	if (ovp == NULL) {
2538 		/*
2539 		 * When renaming directories to be a subdirectory of a
2540 		 * different parent, the dnlc entry for ".." will no
2541 		 * longer be valid, so it must be removed.
2542 		 *
2543 		 * We do a lookup here to determine whether we are renaming
2544 		 * a directory and whether we are renaming an unlinked
2545 		 * file.  This lookup might already have been done by
2546 		 * previous code, so we check ovp == NULL to avoid
2547 		 * doing it twice.
2548 		 */
2549 
2550 		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2551 
2552 		/*
2553 		 * The source name *should* already exist.
2554 		 */
2555 		if (error) {
2556 			nfs_rw_exit(&odrp->r_rwlock);
2557 			nfs_rw_exit(&ndrp->r_rwlock);
2558 			if (nvp) {
2559 				VN_RELE(nvp);
2560 			}
2561 			return (error);
2562 		}
2563 		ASSERT(ovp != NULL);
2564 	}
2565 
2566 	dnlc_remove(odvp, onm);
2567 	dnlc_remove(ndvp, nnm);
2568 
2569 	setdiropargs(&args.rna_from, onm, odvp);
2570 	setdiropargs(&args.rna_to, nnm, ndvp);
2571 
2572 	douprintf = 1;
2573 
2574 	error = rfs2call(VTOMI(odvp), RFS_RENAME,
2575 	    xdr_rnmargs, (caddr_t)&args,
2576 	    xdr_enum, (caddr_t)&status, cr,
2577 	    &douprintf, &status, 0, NULL);
2578 
2579 	PURGE_ATTRCACHE(odvp);	/* mod time changed */
2580 	PURGE_ATTRCACHE(ndvp);	/* mod time changed */
2581 
2582 	if (!error) {
2583 		error = geterrno(status);
2584 		if (!error) {
2585 			if (HAVE_RDDIR_CACHE(odrp))
2586 				nfs_purge_rddir_cache(odvp);
2587 			if (HAVE_RDDIR_CACHE(ndrp))
2588 				nfs_purge_rddir_cache(ndvp);
2589 			/*
2590 			 * when renaming directories to be a subdirectory of a
2591 			 * different parent, the dnlc entry for ".." will no
2592 			 * longer be valid, so it must be removed
2593 			 */
2594 			rp = VTOR(ovp);
2595 			if (ndvp != odvp) {
2596 				if (ovp->v_type == VDIR) {
2597 					dnlc_remove(ovp, "..");
2598 					if (HAVE_RDDIR_CACHE(rp))
2599 						nfs_purge_rddir_cache(ovp);
2600 				}
2601 			}
2602 
2603 			/*
2604 			 * If we are renaming the unlinked file, update the
2605 			 * r_unldvp and r_unlname as needed.
2606 			 */
2607 			mutex_enter(&rp->r_statelock);
2608 			if (rp->r_unldvp != NULL) {
2609 				if (strcmp(rp->r_unlname, onm) == 0) {
2610 					(void) strncpy(rp->r_unlname,
2611 					    nnm, MAXNAMELEN);
2612 					rp->r_unlname[MAXNAMELEN - 1] = '\0';
2613 
2614 					if (ndvp != rp->r_unldvp) {
2615 						VN_RELE(rp->r_unldvp);
2616 						rp->r_unldvp = ndvp;
2617 						VN_HOLD(ndvp);
2618 					}
2619 				}
2620 			}
2621 			mutex_exit(&rp->r_statelock);
2622 		} else {
2623 			/*
2624 			 * System V defines rename to return EEXIST, not
2625 			 * ENOTEMPTY, if the target directory is not empty.
2626 			 * Over the wire, the error is NFSERR_ENOTEMPTY
2627 			 * which geterrno maps to ENOTEMPTY.
2628 			 */
2629 			if (error == ENOTEMPTY)
2630 				error = EEXIST;
2631 		}
2632 	}
2633 
2634 	if (error == 0) {
2635 		if (nvp)
2636 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
2637 
2638 		if (odvp != ndvp)
2639 			vnevent_rename_dest_dir(ndvp, ct);
2640 
2641 		ASSERT(ovp != NULL);
2642 		vnevent_rename_src(ovp, odvp, onm, ct);
2643 	}
2644 
2645 	if (nvp) {
2646 		VN_RELE(nvp);
2647 	}
2648 	VN_RELE(ovp);
2649 
2650 	nfs_rw_exit(&odrp->r_rwlock);
2651 	nfs_rw_exit(&ndrp->r_rwlock);
2652 
2653 	return (error);
2654 }
2655 
2656 /* ARGSUSED */
2657 static int
2658 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2659 	caller_context_t *ct, int flags, vsecattr_t *vsecp)
2660 {
2661 	int error;
2662 	struct nfscreatargs args;
2663 	struct nfsdiropres dr;
2664 	int douprintf;
2665 	rnode_t *drp;
2666 	hrtime_t t;
2667 
2668 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2669 		return (EPERM);
2670 
2671 	setdiropargs(&args.ca_da, nm, dvp);
2672 
2673 	/*
2674 	 * Decide what the group-id and set-gid bit of the created directory
2675 	 * should be.  May have to do a setattr to get the gid right.
2676 	 */
2677 	error = setdirgid(dvp, &va->va_gid, cr);
2678 	if (error)
2679 		return (error);
2680 	error = setdirmode(dvp, &va->va_mode, cr);
2681 	if (error)
2682 		return (error);
2683 	va->va_mask |= AT_MODE|AT_GID;
2684 
2685 	args.ca_sa = &args.ca_sa_buf;
2686 	error = vattr_to_sattr(va, args.ca_sa);
2687 	if (error) {
2688 		/* req time field(s) overflow - return immediately */
2689 		return (error);
2690 	}
2691 
2692 	drp = VTOR(dvp);
2693 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2694 		return (EINTR);
2695 
2696 	dnlc_remove(dvp, nm);
2697 
2698 	douprintf = 1;
2699 
2700 	t = gethrtime();
2701 
2702 	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2703 	    xdr_creatargs, (caddr_t)&args,
2704 	    xdr_diropres, (caddr_t)&dr, cr,
2705 	    &douprintf, &dr.dr_status, 0, NULL);
2706 
2707 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2708 
2709 	if (!error) {
2710 		error = geterrno(dr.dr_status);
2711 		if (!error) {
2712 			if (HAVE_RDDIR_CACHE(drp))
2713 				nfs_purge_rddir_cache(dvp);
2714 			/*
2715 			 * The attributes returned by RFS_MKDIR cannot
2716 			 * be depended upon, so mark the attribute cache
2717 			 * as purged.  A subsequent GETATTR will get the
2718 			 * correct attributes from the server.
2719 			 */
2720 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2721 			    dvp->v_vfsp, t, cr, NULL, NULL);
2722 			PURGE_ATTRCACHE(*vpp);
2723 			dnlc_update(dvp, nm, *vpp);
2724 
2725 			/*
2726 			 * Make sure the gid was set correctly.
2727 			 * If not, try to set it (but don't lose
2728 			 * any sleep over it).
2729 			 */
2730 			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2731 				va->va_mask = AT_GID;
2732 				(void) nfssetattr(*vpp, va, 0, cr);
2733 			}
2734 		} else {
2735 			PURGE_STALE_FH(error, dvp, cr);
2736 		}
2737 	}
2738 
2739 	nfs_rw_exit(&drp->r_rwlock);
2740 
2741 	return (error);
2742 }
2743 
2744 /* ARGSUSED */
2745 static int
2746 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2747 	caller_context_t *ct, int flags)
2748 {
2749 	int error;
2750 	enum nfsstat status;
2751 	struct nfsdiropargs da;
2752 	vnode_t *vp;
2753 	int douprintf;
2754 	rnode_t *drp;
2755 
2756 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2757 		return (EPERM);
2758 	drp = VTOR(dvp);
2759 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2760 		return (EINTR);
2761 
2762 	/*
2763 	 * Attempt to prevent a rmdir(".") from succeeding.
2764 	 */
2765 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2766 	if (error) {
2767 		nfs_rw_exit(&drp->r_rwlock);
2768 		return (error);
2769 	}
2770 
2771 	if (vp == cdir) {
2772 		VN_RELE(vp);
2773 		nfs_rw_exit(&drp->r_rwlock);
2774 		return (EINVAL);
2775 	}
2776 
2777 	setdiropargs(&da, nm, dvp);
2778 
2779 	/*
2780 	 * First just remove the entry from the name cache, as it
2781 	 * is most likely an entry for this vp.
2782 	 */
2783 	dnlc_remove(dvp, nm);
2784 
2785 	/*
2786 	 * If the vnode reference count is greater than one, then
2787 	 * there may be additional references in the DNLC which will
2788 	 * need to be purged.  First, try removing the entry for
2789 	 * the parent directory and see if that removes the additional
2790 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2791 	 * to completely remove any references to the directory which
2792 	 * might still exist in the DNLC.
2793 	 */
2794 	if (vp->v_count > 1) {
2795 		dnlc_remove(vp, "..");
2796 		if (vp->v_count > 1)
2797 			dnlc_purge_vp(vp);
2798 	}
2799 
2800 	douprintf = 1;
2801 
2802 	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2803 	    xdr_diropargs, (caddr_t)&da,
2804 	    xdr_enum, (caddr_t)&status, cr,
2805 	    &douprintf, &status, 0, NULL);
2806 
2807 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2808 
2809 	if (error) {
2810 		VN_RELE(vp);
2811 		nfs_rw_exit(&drp->r_rwlock);
2812 		return (error);
2813 	}
2814 
2815 	error = geterrno(status);
2816 	if (!error) {
2817 		if (HAVE_RDDIR_CACHE(drp))
2818 			nfs_purge_rddir_cache(dvp);
2819 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
2820 			nfs_purge_rddir_cache(vp);
2821 	} else {
2822 		PURGE_STALE_FH(error, dvp, cr);
2823 		/*
2824 		 * System V defines rmdir to return EEXIST, not
2825 		 * ENOTEMPTY, if the directory is not empty.  Over
2826 		 * the wire, the error is NFSERR_ENOTEMPTY which
2827 		 * geterrno maps to ENOTEMPTY.
2828 		 */
2829 		if (error == ENOTEMPTY)
2830 			error = EEXIST;
2831 	}
2832 
2833 	if (error == 0) {
2834 		vnevent_rmdir(vp, dvp, nm, ct);
2835 	}
2836 	VN_RELE(vp);
2837 
2838 	nfs_rw_exit(&drp->r_rwlock);
2839 
2840 	return (error);
2841 }
2842 
2843 /* ARGSUSED */
2844 static int
2845 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2846 	caller_context_t *ct, int flags)
2847 {
2848 	int error;
2849 	struct nfsslargs args;
2850 	enum nfsstat status;
2851 	int douprintf;
2852 	rnode_t *drp;
2853 
2854 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2855 		return (EPERM);
2856 	setdiropargs(&args.sla_from, lnm, dvp);
2857 	args.sla_sa = &args.sla_sa_buf;
2858 	error = vattr_to_sattr(tva, args.sla_sa);
2859 	if (error) {
2860 		/* req time field(s) overflow - return immediately */
2861 		return (error);
2862 	}
2863 	args.sla_tnm = tnm;
2864 
2865 	drp = VTOR(dvp);
2866 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2867 		return (EINTR);
2868 
2869 	dnlc_remove(dvp, lnm);
2870 
2871 	douprintf = 1;
2872 
2873 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2874 	    xdr_slargs, (caddr_t)&args,
2875 	    xdr_enum, (caddr_t)&status, cr,
2876 	    &douprintf, &status, 0, NULL);
2877 
2878 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2879 
2880 	if (!error) {
2881 		error = geterrno(status);
2882 		if (!error) {
2883 			if (HAVE_RDDIR_CACHE(drp))
2884 				nfs_purge_rddir_cache(dvp);
2885 		} else {
2886 			PURGE_STALE_FH(error, dvp, cr);
2887 		}
2888 	}
2889 
2890 	nfs_rw_exit(&drp->r_rwlock);
2891 
2892 	return (error);
2893 }
2894 
2895 #ifdef DEBUG
2896 static int nfs_readdir_cache_hits = 0;
2897 static int nfs_readdir_cache_shorts = 0;
2898 static int nfs_readdir_cache_waits = 0;
2899 static int nfs_readdir_cache_misses = 0;
2900 static int nfs_readdir_readahead = 0;
2901 #endif
2902 
2903 static int nfs_shrinkreaddir = 0;
2904 
2905 /*
2906  * Read directory entries.
2907  * There are some weird things to look out for here.  The uio_offset
2908  * field is either 0 or it is the offset returned from a previous
2909  * readdir.  It is an opaque value used by the server to find the
2910  * correct directory block to read. The count field is the number
2911  * of blocks to read on the server.  This is advisory only; the server
2912  * may return only one block's worth of entries.  Entries may be compressed
2913  * on the server.
2914  */
2915 /* ARGSUSED */
2916 static int
2917 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2918 	caller_context_t *ct, int flags)
2919 {
2920 	int error;
2921 	size_t count;
2922 	rnode_t *rp;
2923 	rddir_cache *rdc;
2924 	rddir_cache *nrdc;
2925 	rddir_cache *rrdc;
2926 #ifdef DEBUG
2927 	int missed;
2928 #endif
2929 	rddir_cache srdc;
2930 	avl_index_t where;
2931 
2932 	rp = VTOR(vp);
2933 
2934 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2935 	if (nfs_zone() != VTOMI(vp)->mi_zone)
2936 		return (EIO);
2937 	/*
2938 	 * Make sure that the directory cache is valid.
2939 	 */
2940 	if (HAVE_RDDIR_CACHE(rp)) {
2941 		if (nfs_disable_rddir_cache) {
2942 			/*
2943 			 * Setting nfs_disable_rddir_cache in /etc/system
2944 			 * allows interoperability with servers that do not
2945 			 * properly update the attributes of directories.
2946 			 * Any cached information gets purged before an
2947 			 * access is made to it.
2948 			 */
2949 			nfs_purge_rddir_cache(vp);
2950 		} else {
2951 			error = nfs_validate_caches(vp, cr);
2952 			if (error)
2953 				return (error);
2954 		}
2955 	}
2956 
2957 	/*
2958 	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
2959 	 * RFS_READDIR request with rda_count set to more than 0x400. So
2960 	 * we reduce the request size here purely for compatibility.
2961 	 *
2962 	 * In general, this is no longer required.  However, if a server
2963 	 * is discovered which cannot handle requests larger than 1024,
2964 	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
2965 	 * compatibility.
2966 	 *
2967 	 * In any case, the request size is limited to NFS_MAXDATA bytes.
2968 	 */
2969 	count = MIN(uiop->uio_iov->iov_len,
2970 	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
2971 
2972 	nrdc = NULL;
2973 #ifdef DEBUG
2974 	missed = 0;
2975 #endif
2976 top:
2977 	/*
2978 	 * Short-circuit the last readdir, which always returns 0 bytes.
2979 	 * This can be done once the directory has been read through
2980 	 * completely at least once, since that sets r_direof, which
2981 	 * can be used to find the value of the last cookie.
2982 	 */
2983 	mutex_enter(&rp->r_statelock);
2984 	if (rp->r_direof != NULL &&
2985 	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
2986 		mutex_exit(&rp->r_statelock);
2987 #ifdef DEBUG
2988 		nfs_readdir_cache_shorts++;
2989 #endif
2990 		if (eofp)
2991 			*eofp = 1;
2992 		if (nrdc != NULL)
2993 			rddir_cache_rele(nrdc);
2994 		return (0);
2995 	}
2996 	/*
2997 	 * Look for a cache entry.  Cache entries are identified
2998 	 * by the NFS cookie value and the byte count requested.
2999 	 */
3000 	srdc.nfs_cookie = uiop->uio_offset;
3001 	srdc.buflen = count;
3002 	rdc = avl_find(&rp->r_dir, &srdc, &where);
3003 	if (rdc != NULL) {
3004 		rddir_cache_hold(rdc);
3005 		/*
3006 		 * If the cache entry is in the process of being
3007 		 * filled in, wait until this completes.  The
3008 		 * RDDIRWAIT bit is set to indicate that someone
3009 		 * is waiting, so that when the thread currently
3010 		 * filling the entry is done, it will do a
3011 		 * cv_broadcast to wake up all of the threads
3012 		 * waiting for it to finish.
3013 		 */
3014 		if (rdc->flags & RDDIR) {
3015 			nfs_rw_exit(&rp->r_rwlock);
3016 			rdc->flags |= RDDIRWAIT;
3017 #ifdef DEBUG
3018 			nfs_readdir_cache_waits++;
3019 #endif
3020 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3021 				/*
3022 				 * We got interrupted, probably
3023 				 * the user typed ^C or an alarm
3024 				 * fired.  We free the new entry
3025 				 * if we allocated one.
3026 				 */
3027 				mutex_exit(&rp->r_statelock);
3028 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
3029 				    RW_READER, FALSE);
3030 				rddir_cache_rele(rdc);
3031 				if (nrdc != NULL)
3032 					rddir_cache_rele(nrdc);
3033 				return (EINTR);
3034 			}
3035 			mutex_exit(&rp->r_statelock);
3036 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
3037 			    RW_READER, FALSE);
3038 			rddir_cache_rele(rdc);
3039 			goto top;
3040 		}
3041 		/*
3042 		 * Check to see if a readdir is required to
3043 		 * fill the entry.  If so, mark this entry
3044 		 * as being filled, remove our reference,
3045 		 * and branch to the code to fill the entry.
3046 		 */
3047 		if (rdc->flags & RDDIRREQ) {
3048 			rdc->flags &= ~RDDIRREQ;
3049 			rdc->flags |= RDDIR;
3050 			if (nrdc != NULL)
3051 				rddir_cache_rele(nrdc);
3052 			nrdc = rdc;
3053 			mutex_exit(&rp->r_statelock);
3054 			goto bottom;
3055 		}
3056 #ifdef DEBUG
3057 		if (!missed)
3058 			nfs_readdir_cache_hits++;
3059 #endif
3060 		/*
3061 		 * If an error occurred while attempting
3062 		 * to fill the cache entry, just return it.
3063 		 */
3064 		if (rdc->error) {
3065 			error = rdc->error;
3066 			mutex_exit(&rp->r_statelock);
3067 			rddir_cache_rele(rdc);
3068 			if (nrdc != NULL)
3069 				rddir_cache_rele(nrdc);
3070 			return (error);
3071 		}
3072 
3073 		/*
3074 		 * The cache entry is complete and good;
3075 		 * copy out the dirent structs to the calling
3076 		 * thread.
3077 		 */
3078 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3079 
3080 		/*
3081 		 * If no error occurred during the copyout,
3082 		 * update the offset in the uio struct to
3083 		 * contain the value of the next cookie
3084 		 * and set the eof value appropriately.
3085 		 */
3086 		if (!error) {
3087 			uiop->uio_offset = rdc->nfs_ncookie;
3088 			if (eofp)
3089 				*eofp = rdc->eof;
3090 		}
3091 
3092 		/*
3093 		 * Decide whether to do readahead.  Don't if we
3094 		 * have already read to the end of the directory.
3095 		 */
3096 		if (rdc->eof) {
3097 			rp->r_direof = rdc;
3098 			mutex_exit(&rp->r_statelock);
3099 			rddir_cache_rele(rdc);
3100 			if (nrdc != NULL)
3101 				rddir_cache_rele(nrdc);
3102 			return (error);
3103 		}
3104 
3105 		/*
3106 		 * Check to see whether we found an entry
3107 		 * for the readahead.  If so, we don't need
3108 		 * to do anything further, so free the new
3109 		 * entry if one was allocated.  Otherwise,
3110 		 * allocate a new entry, add it to the cache,
3111 		 * and then initiate an asynchronous readdir
3112 		 * operation to fill it.
3113 		 */
3114 		srdc.nfs_cookie = rdc->nfs_ncookie;
3115 		srdc.buflen = count;
3116 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3117 		if (rrdc != NULL) {
3118 			if (nrdc != NULL)
3119 				rddir_cache_rele(nrdc);
3120 		} else {
3121 			if (nrdc != NULL)
3122 				rrdc = nrdc;
3123 			else {
3124 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3125 			}
3126 			if (rrdc != NULL) {
3127 				rrdc->nfs_cookie = rdc->nfs_ncookie;
3128 				rrdc->buflen = count;
3129 				avl_insert(&rp->r_dir, rrdc, where);
3130 				rddir_cache_hold(rrdc);
3131 				mutex_exit(&rp->r_statelock);
3132 				rddir_cache_rele(rdc);
3133 #ifdef DEBUG
3134 				nfs_readdir_readahead++;
3135 #endif
3136 				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3137 				return (error);
3138 			}
3139 		}
3140 
3141 		mutex_exit(&rp->r_statelock);
3142 		rddir_cache_rele(rdc);
3143 		return (error);
3144 	}
3145 
3146 	/*
3147 	 * Didn't find an entry in the cache.  Construct a new empty
3148 	 * entry and link it into the cache.  Other processes attempting
3149 	 * to access this entry will need to wait until it is filled in.
3150 	 *
3151 	 * Since kmem_alloc may block, another pass through the cache
3152 	 * will need to be taken to make sure that another process
3153 	 * hasn't already added an entry to the cache for this request.
3154 	 */
3155 	if (nrdc == NULL) {
3156 		mutex_exit(&rp->r_statelock);
3157 		nrdc = rddir_cache_alloc(KM_SLEEP);
3158 		nrdc->nfs_cookie = uiop->uio_offset;
3159 		nrdc->buflen = count;
3160 		goto top;
3161 	}
3162 
3163 	/*
3164 	 * Add this entry to the cache.
3165 	 */
3166 	avl_insert(&rp->r_dir, nrdc, where);
3167 	rddir_cache_hold(nrdc);
3168 	mutex_exit(&rp->r_statelock);
3169 
3170 bottom:
3171 #ifdef DEBUG
3172 	missed = 1;
3173 	nfs_readdir_cache_misses++;
3174 #endif
3175 	/*
3176 	 * Do the readdir.
3177 	 */
3178 	error = nfsreaddir(vp, nrdc, cr);
3179 
3180 	/*
3181 	 * If this operation failed, just return the error which occurred.
3182 	 */
3183 	if (error != 0)
3184 		return (error);
3185 
3186 	/*
3187 	 * Since the RPC operation will have taken some time and blocked
3188 	 * this process, another pass through the cache will need to be
3189 	 * taken to find the correct cache entry.  It is possible that
3190 	 * the correct cache entry will not be there (although one was
3191 	 * added) because the directory changed during the RPC operation
3192 	 * and the readdir cache was flushed.  In this case, just start
3193 	 * over.  It is hoped that this will not happen too often... :-)
3194 	 */
3195 	nrdc = NULL;
3196 	goto top;
3197 	/* NOTREACHED */
3198 }
3199 
3200 static int
3201 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3202 {
3203 	int error;
3204 	struct nfsrddirargs rda;
3205 	struct nfsrddirres rd;
3206 	rnode_t *rp;
3207 	mntinfo_t *mi;
3208 	uint_t count;
3209 	int douprintf;
3210 	failinfo_t fi, *fip;
3211 
3212 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3213 	count = rdc->buflen;
3214 
3215 	rp = VTOR(vp);
3216 	mi = VTOMI(vp);
3217 
3218 	rda.rda_fh = *VTOFH(vp);
3219 	rda.rda_offset = rdc->nfs_cookie;
3220 
3221 	/*
3222 	 * NFS client failover support: suppress failover unless we
3223 	 * have a zero cookie, since cookies are server-specific.
3224 	 */
3225 	if (rdc->nfs_cookie == (off_t)0) {
3226 		fi.vp = vp;
3227 		fi.fhp = (caddr_t)&rda.rda_fh;
3228 		fi.copyproc = nfscopyfh;
3229 		fi.lookupproc = nfslookup;
3230 		fi.xattrdirproc = acl_getxattrdir2;
3231 		fip = &fi;
3232 	} else {
3233 		fip = NULL;
3234 	}
3235 
3236 	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3237 	rd.rd_size = count;
3238 	rd.rd_offset = rda.rda_offset;
3239 
3240 	douprintf = 1;
3241 
3242 	if (mi->mi_io_kstats) {
3243 		mutex_enter(&mi->mi_lock);
3244 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3245 		mutex_exit(&mi->mi_lock);
3246 	}
3247 
3248 	do {
3249 		rda.rda_count = MIN(count, mi->mi_curread);
3250 		error = rfs2call(mi, RFS_READDIR,
3251 		    xdr_rddirargs, (caddr_t)&rda,
3252 		    xdr_getrddirres, (caddr_t)&rd, cr,
3253 		    &douprintf, &rd.rd_status, 0, fip);
3254 	} while (error == ENFS_TRYAGAIN);
3255 
3256 	if (mi->mi_io_kstats) {
3257 		mutex_enter(&mi->mi_lock);
3258 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3259 		mutex_exit(&mi->mi_lock);
3260 	}
3261 
3262 	/*
3263 	 * Since we are actually doing a READDIR RPC, we must have
3264 	 * exclusive access to the cache entry being filled.  Thus,
3265 	 * it is safe to update all fields except for the flags
3266 	 * field.  The r_statelock in the rnode must be held to
3267 	 * prevent two different threads from simultaneously
3268 	 * attempting to update the flags field.  This can happen
3269 	 * if we are turning off RDDIR and the other thread is
3270 	 * trying to set RDDIRWAIT.
3271 	 */
3272 	ASSERT(rdc->flags & RDDIR);
3273 	if (!error) {
3274 		error = geterrno(rd.rd_status);
3275 		if (!error) {
3276 			rdc->nfs_ncookie = rd.rd_offset;
3277 			rdc->eof = rd.rd_eof ? 1 : 0;
3278 			rdc->entlen = rd.rd_size;
3279 			ASSERT(rdc->entlen <= rdc->buflen);
3280 #ifdef DEBUG
3281 			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3282 			    KM_SLEEP);
3283 #else
3284 			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3285 #endif
3286 			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3287 			rdc->error = 0;
3288 			if (mi->mi_io_kstats) {
3289 				mutex_enter(&mi->mi_lock);
3290 				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3291 				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3292 				    rd.rd_size;
3293 				mutex_exit(&mi->mi_lock);
3294 			}
3295 		} else {
3296 			PURGE_STALE_FH(error, vp, cr);
3297 		}
3298 	}
3299 	if (error) {
3300 		rdc->entries = NULL;
3301 		rdc->error = error;
3302 	}
3303 	kmem_free(rd.rd_entries, rdc->buflen);
3304 
3305 	mutex_enter(&rp->r_statelock);
3306 	rdc->flags &= ~RDDIR;
3307 	if (rdc->flags & RDDIRWAIT) {
3308 		rdc->flags &= ~RDDIRWAIT;
3309 		cv_broadcast(&rdc->cv);
3310 	}
3311 	if (error)
3312 		rdc->flags |= RDDIRREQ;
3313 	mutex_exit(&rp->r_statelock);
3314 
3315 	rddir_cache_rele(rdc);
3316 
3317 	return (error);
3318 }
3319 
3320 #ifdef DEBUG
3321 static int nfs_bio_do_stop = 0;
3322 #endif
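/*
 * Under DEBUG, setting nfs_bio_do_stop causes a zero-length write
 * detected in nfs_bio() to drop into the kernel debugger via
 * debug_enter().
 */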
3323 
3324 static int
3325 nfs_bio(struct buf *bp, cred_t *cr)
3326 {
3327 	rnode_t *rp = VTOR(bp->b_vp);
3328 	int count;
3329 	int error;
3330 	cred_t *cred;
3331 	uint_t offset;
3332 
3333 	DTRACE_IO1(start, struct buf *, bp);
3334 
3335 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3336 	offset = dbtob(bp->b_blkno);
3337 
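	/*
	 * Use the credential cached in the rnode, if there is one, so
	 * that asynchronous i/o is done with credentials that are known
	 * to work.  If an operation fails with EACCES, it is retried
	 * once with the caller's credential, which then replaces the
	 * cached one.
	 */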
3338 	if (bp->b_flags & B_READ) {
3339 		mutex_enter(&rp->r_statelock);
3340 		if (rp->r_cred != NULL) {
3341 			cred = rp->r_cred;
3342 			crhold(cred);
3343 		} else {
3344 			rp->r_cred = cr;
3345 			crhold(cr);
3346 			cred = cr;
3347 			crhold(cred);
3348 		}
3349 		mutex_exit(&rp->r_statelock);
3350 	read_again:
3351 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3352 		    offset, bp->b_bcount, &bp->b_resid, cred);
3353 		crfree(cred);
3354 		if (!error) {
3355 			if (bp->b_resid) {
3356 				/*
3357 				 * Didn't get it all because we hit EOF;
3358 				 * zero all the memory beyond the EOF.
3359 				 */
3361 				bzero(bp->b_un.b_addr +
3362 				    bp->b_bcount - bp->b_resid, bp->b_resid);
3363 			}
3364 			mutex_enter(&rp->r_statelock);
3365 			if (bp->b_resid == bp->b_bcount &&
3366 			    offset >= rp->r_size) {
3367 				/*
3368 				 * We didn't read anything at all as we are
3369 				 * past EOF.  Return an error indicator back
3370 				 * but don't destroy the pages (yet).
3371 				 */
3372 				error = NFS_EOF;
3373 			}
3374 			mutex_exit(&rp->r_statelock);
3375 		} else if (error == EACCES) {
3376 			mutex_enter(&rp->r_statelock);
3377 			if (cred != cr) {
3378 				if (rp->r_cred != NULL)
3379 					crfree(rp->r_cred);
3380 				rp->r_cred = cr;
3381 				crhold(cr);
3382 				cred = cr;
3383 				crhold(cred);
3384 				mutex_exit(&rp->r_statelock);
3385 				goto read_again;
3386 			}
3387 			mutex_exit(&rp->r_statelock);
3388 		}
3389 	} else {
3390 		if (!(rp->r_flags & RSTALE)) {
3391 			mutex_enter(&rp->r_statelock);
3392 			if (rp->r_cred != NULL) {
3393 				cred = rp->r_cred;
3394 				crhold(cred);
3395 			} else {
3396 				rp->r_cred = cr;
3397 				crhold(cr);
3398 				cred = cr;
3399 				crhold(cred);
3400 			}
3401 			mutex_exit(&rp->r_statelock);
3402 		write_again:
3403 			mutex_enter(&rp->r_statelock);
3404 			count = MIN(bp->b_bcount, rp->r_size - offset);
3405 			mutex_exit(&rp->r_statelock);
3406 			if (count < 0)
3407 				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3408 #ifdef DEBUG
3409 			if (count == 0) {
3410 				zcmn_err(getzoneid(), CE_WARN,
3411 				    "nfs_bio: zero length write at %d",
3412 				    offset);
3413 				nfs_printfhandle(&rp->r_fh);
3414 				if (nfs_bio_do_stop)
3415 					debug_enter("nfs_bio");
3416 			}
3417 #endif
3418 			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3419 			    count, cred);
3420 			if (error == EACCES) {
3421 				mutex_enter(&rp->r_statelock);
3422 				if (cred != cr) {
3423 					if (rp->r_cred != NULL)
3424 						crfree(rp->r_cred);
3425 					rp->r_cred = cr;
3426 					crhold(cr);
3427 					crfree(cred);
3428 					cred = cr;
3429 					crhold(cred);
3430 					mutex_exit(&rp->r_statelock);
3431 					goto write_again;
3432 				}
3433 				mutex_exit(&rp->r_statelock);
3434 			}
3435 			bp->b_error = error;
3436 			if (error && error != EINTR) {
3437 				/*
3438 				 * Don't print EDQUOT errors on the console.
3439 				 * Don't print asynchronous EACCES errors.
3440 				 * Don't print EFBIG errors.
3441 				 * Print all other write errors.
3442 				 */
3443 				if (error != EDQUOT && error != EFBIG &&
3444 				    (error != EACCES ||
3445 				    !(bp->b_flags & B_ASYNC)))
3446 					nfs_write_error(bp->b_vp, error, cred);
3447 				/*
3448 				 * Update r_error and r_flags as appropriate.
3449 				 * If the error was ESTALE, then mark the
3450 				 * rnode as not being writeable and save
3451 				 * the error status.  Otherwise, save any
3452 				 * errors which occur from asynchronous
3453 				 * page invalidations.  Any errors occurring
3454 				 * from other operations should be saved
3455 				 * by the caller.
3456 				 */
3457 				mutex_enter(&rp->r_statelock);
3458 				if (error == ESTALE) {
3459 					rp->r_flags |= RSTALE;
3460 					if (!rp->r_error)
3461 						rp->r_error = error;
3462 				} else if (!rp->r_error &&
3463 				    (bp->b_flags &
3464 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
3465 				    (B_INVAL|B_FORCE|B_ASYNC)) {
3466 					rp->r_error = error;
3467 				}
3468 				mutex_exit(&rp->r_statelock);
3469 			}
3470 			crfree(cred);
3471 		} else
3472 			error = rp->r_error;
3473 	}
3474 
3475 	if (error != 0 && error != NFS_EOF)
3476 		bp->b_flags |= B_ERROR;
3477 
3478 	DTRACE_IO1(done, struct buf *, bp);
3479 
3480 	return (error);
3481 }
3482 
3483 /* ARGSUSED */
3484 static int
3485 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3486 {
3487 	struct nfs_fid *fp;
3488 	rnode_t *rp;
3489 
3490 	rp = VTOR(vp);
3491 
3492 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3493 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3494 		return (ENOSPC);
3495 	}
3496 	fp = (struct nfs_fid *)fidp;
3497 	fp->nf_pad = 0;
3498 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3499 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3500 	return (0);
3501 }
3502 
3503 /* ARGSUSED2 */
3504 static int
3505 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3506 {
3507 	rnode_t *rp = VTOR(vp);
3508 
3509 	if (!write_lock) {
3510 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3511 		return (V_WRITELOCK_FALSE);
3512 	}
3513 
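	/*
	 * For direct i/o, data does not live in the page cache, so
	 * writers need not be serialized against one another here; a
	 * shared lock suffices as long as the file has no mapped or
	 * cached pages that would have to be kept consistent.
	 */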
3514 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3515 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3516 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3517 			return (V_WRITELOCK_FALSE);
3518 		nfs_rw_exit(&rp->r_rwlock);
3519 	}
3520 
3521 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3522 	return (V_WRITELOCK_TRUE);
3523 }
3524 
3525 /* ARGSUSED */
3526 static void
3527 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3528 {
3529 	rnode_t *rp = VTOR(vp);
3530 
3531 	nfs_rw_exit(&rp->r_rwlock);
3532 }
3533 
3534 /* ARGSUSED */
3535 static int
3536 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3537 {
3538 
3539 	/*
3540 	 * Because we stuff the readdir cookie into the offset field,
3541 	 * someone may attempt to do an lseek with the cookie, which
3542 	 * we want to succeed.
3543 	 */
3544 	if (vp->v_type == VDIR)
3545 		return (0);
3546 	if (*noffp < 0 || *noffp > MAXOFF32_T)
3547 		return (EINVAL);
3548 	return (0);
3549 }
3550 
3551 /*
3552  * Number of NFS_MAXDATA blocks to read ahead,
3553  * optimized for 100 base-T.
3554  */
3555 static int nfs_nra = 4;
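/*
 * With the NFS Version 2 maximum transfer size (NFS_MAXDATA) of
 * 8 Kbytes, this default keeps up to 32 Kbytes of read-ahead in
 * flight for a sequential reader.
 */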
3556 
3557 #ifdef DEBUG
3558 static int nfs_lostpage = 0;	/* number of times we lost original page */
3559 #endif
3560 
3561 /*
3562  * Return all the pages from [off..off+len) in file
3563  */
3564 /* ARGSUSED */
3565 static int
3566 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3567 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3568 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3569 {
3570 	rnode_t *rp;
3571 	int error;
3572 	mntinfo_t *mi;
3573 
3574 	if (vp->v_flag & VNOMAP)
3575 		return (ENOSYS);
3576 
3577 	ASSERT(off <= MAXOFF32_T);
3578 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3579 		return (EIO);
3580 	if (protp != NULL)
3581 		*protp = PROT_ALL;
3582 
3583 	/*
3584 	 * Now validate that the caches are up to date.
3585 	 */
3586 	error = nfs_validate_caches(vp, cr);
3587 	if (error)
3588 		return (error);
3589 
3590 	rp = VTOR(vp);
3591 	mi = VTOMI(vp);
3592 retry:
3593 	mutex_enter(&rp->r_statelock);
3594 
3595 	/*
3596 	 * Don't create dirty pages faster than they
3597 	 * can be cleaned so that the system doesn't
3598 	 * get imbalanced.  If the async queue is
3599 	 * maxed out, then wait for it to drain before
3600 	 * creating more dirty pages.  Also, wait for
3601 	 * any threads doing pagewalks in the vop_getattr
3602 	 * entry points so that they don't block for
3603 	 * long periods.
3604 	 */
3605 	if (rw == S_CREATE) {
3606 		while ((mi->mi_max_threads != 0 &&
3607 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3608 		    rp->r_gcount > 0)
3609 			cv_wait(&rp->r_cv, &rp->r_statelock);
3610 	}
3611 
3612 	/*
3613 	 * If we are getting called as a side effect of an nfs_write()
3614 	 * operation, the local file size might not be extended yet.
3615 	 * In this case we want to be able to return pages of zeroes.
3616 	 */
3617 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3618 		mutex_exit(&rp->r_statelock);
3619 		return (EFAULT);		/* beyond EOF */
3620 	}
3621 
3622 	mutex_exit(&rp->r_statelock);
3623 
3624 	if (len <= PAGESIZE) {
3625 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3626 		    seg, addr, rw, cr);
3627 	} else {
3628 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3629 		    pl, plsz, seg, addr, rw, cr);
3630 	}
3631 
3632 	switch (error) {
3633 	case NFS_EOF:
3634 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3635 		goto retry;
3636 	case ESTALE:
3637 		PURGE_STALE_FH(error, vp, cr);
3638 	}
3639 
3640 	return (error);
3641 }
3642 
3643 /*
3644  * Called from pvn_getpages or nfs_getpage to get a particular page.
3645  */
3646 /* ARGSUSED */
3647 static int
3648 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3649 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3650 	enum seg_rw rw, cred_t *cr)
3651 {
3652 	rnode_t *rp;
3653 	uint_t bsize;
3654 	struct buf *bp;
3655 	page_t *pp;
3656 	u_offset_t lbn;
3657 	u_offset_t io_off;
3658 	u_offset_t blkoff;
3659 	u_offset_t rablkoff;
3660 	size_t io_len;
3661 	uint_t blksize;
3662 	int error;
3663 	int readahead;
3664 	int readahead_issued = 0;
3665 	int ra_window; /* readahead window */
3666 	page_t *pagefound;
3667 
3668 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3669 		return (EIO);
3670 	rp = VTOR(vp);
3671 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3672 
3673 reread:
3674 	bp = NULL;
3675 	pp = NULL;
3676 	pagefound = NULL;
3677 
3678 	if (pl != NULL)
3679 		pl[0] = NULL;
3680 
3681 	error = 0;
3682 	lbn = off / bsize;
3683 	blkoff = lbn * bsize;
3684 
3685 	/*
3686 	 * Queueing up the readahead before doing the synchronous read
3687 	 * results in a significant increase in read throughput because
3688 	 * of the increased parallelism between the async threads and
3689 	 * the process context.
3690 	 */
3691 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3692 	    rw != S_CREATE &&
3693 	    !(vp->v_flag & VNOCACHE)) {
3694 		mutex_enter(&rp->r_statelock);
3695 
3696 		/*
3697 		 * Calculate the number of readaheads to do.
3698 		 * a) No readaheads at offset = 0.
3699 		 * b) Do maximum(nfs_nra) readaheads when the readahead
3700 		 *    window is closed.
3701 		 * c) Do between 1 and (nfs_nra - 1) readaheads, depending
3702 		 *    upon how far open or closed the readahead window is.
3703 		 * d) No readaheads if rp->r_nextr is not within the scope
3704 		 *    of the readahead window (random i/o).
3705 		 */
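		/*
		 * For example, with the default nfs_nra of 4: a strictly
		 * sequential reader (blkoff == rp->r_nextr) gets 4
		 * readaheads; a reader one block short of the readahead
		 * offset (ra_window == 1) gets 3; and a reader outside
		 * the window gets none.
		 */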
3706 
3707 		if (off == 0)
3708 			readahead = 0;
3709 		else if (blkoff == rp->r_nextr)
3710 			readahead = nfs_nra;
3711 		else if (rp->r_nextr > blkoff &&
3712 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
3713 		    <= (nfs_nra - 1)))
3714 			readahead = nfs_nra - ra_window;
3715 		else
3716 			readahead = 0;
3717 
3718 		rablkoff = rp->r_nextr;
3719 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3720 			mutex_exit(&rp->r_statelock);
3721 			if (nfs_async_readahead(vp, rablkoff + bsize,
3722 			    addr + (rablkoff + bsize - off), seg, cr,
3723 			    nfs_readahead) < 0) {
3724 				mutex_enter(&rp->r_statelock);
3725 				break;
3726 			}
3727 			readahead--;
3728 			rablkoff += bsize;
3729 			/*
3730 			 * Indicate that we did a readahead so that the
3731 			 * readahead offset is not updated
3732 			 * by the synchronous read below.
3733 			 */
3734 			readahead_issued = 1;
3735 			mutex_enter(&rp->r_statelock);
3736 			/*
3737 			 * Set the readahead offset to the
3738 			 * offset of the last async readahead
3739 			 * request.
3740 			 */
3741 			rp->r_nextr = rablkoff;
3742 		}
3743 		mutex_exit(&rp->r_statelock);
3744 	}
3745 
3746 again:
3747 	if ((pagefound = page_exists(vp, off)) == NULL) {
3748 		if (pl == NULL) {
3749 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3750 			    nfs_readahead);
3751 		} else if (rw == S_CREATE) {
3752 			/*
3753 			 * Block for this page is not allocated, or the offset
3754 			 * is beyond the current allocation size, or we're
3755 			 * allocating a swap slot and the page was not found,
3756 			 * so allocate it and return a zero page.
3757 			 */
3758 			if ((pp = page_create_va(vp, off,
3759 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3760 				cmn_err(CE_PANIC, "nfs_getapage: page_create");
3761 			io_len = PAGESIZE;
3762 			mutex_enter(&rp->r_statelock);
3763 			rp->r_nextr = off + PAGESIZE;
3764 			mutex_exit(&rp->r_statelock);
3765 		} else {
3766 			/*
3767 			 * Need to go to the server to get a BLOCK; the
3768 			 * exceptions are reading at offset = 0 and random
3769 			 * i/o, in which case read only a PAGE.
3770 			 */
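			/*
			 * A worked example under assumed values: with
			 * bsize = 8192, PAGESIZE = 4096, r_size = 10000,
			 * and off = blkoff = 8192, the block straddles
			 * EOF (8192 < 10000 <= 16384), so blksize =
			 * 10000 - 8192 = 1808, and the kluster is later
			 * rounded up to one 4096-byte page.
			 */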
3771 			mutex_enter(&rp->r_statelock);
3772 			if (blkoff < rp->r_size &&
3773 			    blkoff + bsize >= rp->r_size) {
3774 				/*
3775 				 * If only a block or less is left in
3776 				 * the file, read all that is remaining.
3777 				 */
3778 				if (rp->r_size <= off) {
3779 					/*
3780 					 * Trying to access beyond EOF,
3781 					 * set up to get at least one page.
3782 					 */
3783 					blksize = off + PAGESIZE - blkoff;
3784 				} else
3785 					blksize = rp->r_size - blkoff;
3786 			} else if ((off == 0) ||
3787 			    (off != rp->r_nextr && !readahead_issued)) {
3788 				blksize = PAGESIZE;
3789 				blkoff = off; /* block = page here */
3790 			} else
3791 				blksize = bsize;
3792 			mutex_exit(&rp->r_statelock);
3793 
3794 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3795 			    &io_len, blkoff, blksize, 0);
3796 
3797 			/*
3798 			 * Some other thread has entered the page,
3799 			 * so just use it.
3800 			 */
3801 			if (pp == NULL)
3802 				goto again;
3803 
3804 			/*
3805 			 * Now round the request size up to page boundaries.
3806 			 * This ensures that the entire page will be
3807 			 * initialized to zeroes if EOF is encountered.
3808 			 */
3809 			io_len = ptob(btopr(io_len));
3810 
3811 			bp = pageio_setup(pp, io_len, vp, B_READ);
3812 			ASSERT(bp != NULL);
3813 
3814 			/*
3815 			 * pageio_setup should have set b_addr to 0.  This
3816 			 * is correct since we want to do I/O on a page
3817 			 * boundary.  bp_mapin will use this addr to calculate
3818 			 * an offset, and then set b_addr to the kernel virtual
3819 			 * address it allocated for us.
3820 			 */
3821 			ASSERT(bp->b_un.b_addr == 0);
3822 
3823 			bp->b_edev = 0;
3824 			bp->b_dev = 0;
3825 			bp->b_lblkno = lbtodb(io_off);
3826 			bp->b_file = vp;
3827 			bp->b_offset = (offset_t)off;
3828 			bp_mapin(bp);
3829 
3830 			/*
3831 			 * If doing a write beyond what we believe is EOF,
3832 			 * don't bother trying to read the pages from the
3833 			 * server; we'll just zero the pages here.  We
3834 			 * don't check that the rw flag is S_WRITE here
3835 			 * because some implementations may attempt a
3836 			 * read access to the buffer before copying data.
3837 			 */
3838 			mutex_enter(&rp->r_statelock);
3839 			if (io_off >= rp->r_size && seg == segkmap) {
3840 				mutex_exit(&rp->r_statelock);
3841 				bzero(bp->b_un.b_addr, io_len);
3842 			} else {
3843 				mutex_exit(&rp->r_statelock);
3844 				error = nfs_bio(bp, cr);
3845 			}
3846 
3847 			/*
3848 			 * Unmap the buffer before freeing it.
3849 			 */
3850 			bp_mapout(bp);
3851 			pageio_done(bp);
3852 
3853 			if (error == NFS_EOF) {
3854 				/*
3855 				 * If doing a write system call, just return
3856 				 * zeroed pages; else the user tried to get pages
3857 				 * beyond EOF, so return an error.  We don't check
3858 				 * that the rw flag is S_WRITE here because
3859 				 * some implementations may attempt a read
3860 				 * access to the buffer before copying data.
3861 				 */
3862 				if (seg == segkmap)
3863 					error = 0;
3864 				else
3865 					error = EFAULT;
3866 			}
3867 
3868 			if (!readahead_issued && !error) {
3869 				mutex_enter(&rp->r_statelock);
3870 				rp->r_nextr = io_off + io_len;
3871 				mutex_exit(&rp->r_statelock);
3872 			}
3873 		}
3874 	}
3875 
3877 	if (pl == NULL)
3878 		return (error);
3879 
3880 	if (error) {
3881 		if (pp != NULL)
3882 			pvn_read_done(pp, B_ERROR);
3883 		return (error);
3884 	}
3885 
3886 	if (pagefound) {
3887 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3888 
3889 		/*
3890 		 * Page exists in the cache, acquire the appropriate lock.
3891 		 * If this fails, start all over again.
3892 		 */
3893 		if ((pp = page_lookup(vp, off, se)) == NULL) {
3894 #ifdef DEBUG
3895 			nfs_lostpage++;
3896 #endif
3897 			goto reread;
3898 		}
3899 		pl[0] = pp;
3900 		pl[1] = NULL;
3901 		return (0);
3902 	}
3903 
3904 	if (pp != NULL)
3905 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3906 
3907 	return (error);
3908 }
3909 
3910 static void
3911 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3912 	cred_t *cr)
3913 {
3914 	int error;
3915 	page_t *pp;
3916 	u_offset_t io_off;
3917 	size_t io_len;
3918 	struct buf *bp;
3919 	uint_t bsize, blksize;
3920 	rnode_t *rp = VTOR(vp);
3921 
3922 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3923 
3924 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3925 
3926 	mutex_enter(&rp->r_statelock);
3927 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3928 		/*
3929 		 * If less than a block is left in the file,
3930 		 * read less than a block.
3931 		 */
3932 		blksize = rp->r_size - blkoff;
3933 	} else
3934 		blksize = bsize;
3935 	mutex_exit(&rp->r_statelock);
3936 
3937 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3938 	    &io_off, &io_len, blkoff, blksize, 1);
3939 	/*
3940 	 * Since the isra flag passed to the kluster function is 1, we may have
3941 	 * gotten a NULL return value for a variety of reasons (# of free
3942 	 * pages < minfree, someone entered the page on the vnode, etc.).  In all
3943 	 * cases, we want to punt on the readahead.
3944 	 */
3945 	if (pp == NULL)
3946 		return;
3947 
3948 	/*
3949 	 * Now round the request size up to page boundaries.
3950 	 * This ensures that the entire page will be
3951 	 * initialized to zeroes if EOF is encountered.
3952 	 */
3953 	io_len = ptob(btopr(io_len));
3954 
3955 	bp = pageio_setup(pp, io_len, vp, B_READ);
3956 	ASSERT(bp != NULL);
3957 
3958 	/*
3959 	 * pageio_setup should have set b_addr to 0.  This is correct since
3960 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
3961 	 * to calculate an offset, and then set b_addr to the kernel virtual
3962 	 * address it allocated for us.
3963 	 */
3964 	ASSERT(bp->b_un.b_addr == 0);
3965 
3966 	bp->b_edev = 0;
3967 	bp->b_dev = 0;
3968 	bp->b_lblkno = lbtodb(io_off);
3969 	bp->b_file = vp;
3970 	bp->b_offset = (offset_t)blkoff;
3971 	bp_mapin(bp);
3972 
3973 	/*
3974 	 * If doing a write beyond what we believe is EOF, don't bother trying
3975 	 * to read the pages from the server; we'll just zero the pages here.
3976 	 * We don't check that the rw flag is S_WRITE here because some
3977 	 * implementations may attempt a read access to the buffer before
3978 	 * copying data.
3979 	 */
3980 	mutex_enter(&rp->r_statelock);
3981 	if (io_off >= rp->r_size && seg == segkmap) {
3982 		mutex_exit(&rp->r_statelock);
3983 		bzero(bp->b_un.b_addr, io_len);
3984 		error = 0;
3985 	} else {
3986 		mutex_exit(&rp->r_statelock);
3987 		error = nfs_bio(bp, cr);
3988 		if (error == NFS_EOF)
3989 			error = 0;
3990 	}
3991 
3992 	/*
3993 	 * Unmap the buffer before freeing it.
3994 	 */
3995 	bp_mapout(bp);
3996 	pageio_done(bp);
3997 
3998 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
3999 
4000 	/*
4001 	 * In case of error, set the readahead offset back to the
4002 	 * lowest offset, since pvn_read_done() has called VN_DISPOSE
4003 	 * to destroy the pages.
4004 	 */
4005 	if (error && rp->r_nextr > io_off) {
4006 		mutex_enter(&rp->r_statelock);
4007 		if (rp->r_nextr > io_off)
4008 			rp->r_nextr = io_off;
4009 		mutex_exit(&rp->r_statelock);
4010 	}
4011 }
4012 
4013 /*
4014  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4015  * If len == 0, do from off to EOF.
4016  *
4017  * The normal cases should be len == 0 && off == 0 (entire vp list),
4018  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4019  * (from pageout).
4020  */
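/*
 * A hedged usage sketch (caller side, not part of this file): a caller
 * wanting to flush and invalidate every cached page of a file would issue
 * something like
 *
 *	error = VOP_PUTPAGE(vp, (offset_t)0, 0, B_INVAL, cr, NULL);
 *
 * which the len == 0 convention above maps to "from off to EOF".
 */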
4021 /* ARGSUSED */
4022 static int
4023 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4024 	caller_context_t *ct)
4025 {
4026 	int error;
4027 	rnode_t *rp;
4028 
4029 	ASSERT(cr != NULL);
4030 
4031 	/*
4032 	 * XXX - Why should this check be made here?
4033 	 */
4034 	if (vp->v_flag & VNOMAP)
4035 		return (ENOSYS);
4036 
4037 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4038 		return (0);
4039 
4040 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4041 		return (EIO);
4042 	ASSERT(off <= MAXOFF32_T);
4043 
4044 	rp = VTOR(vp);
4045 	mutex_enter(&rp->r_statelock);
4046 	rp->r_count++;
4047 	mutex_exit(&rp->r_statelock);
4048 	error = nfs_putpages(vp, off, len, flags, cr);
4049 	mutex_enter(&rp->r_statelock);
4050 	rp->r_count--;
4051 	cv_broadcast(&rp->r_cv);
4052 	mutex_exit(&rp->r_statelock);
4053 
4054 	return (error);
4055 }
4056 
4057 /*
4058  * Write out a single page, possibly klustering adjacent dirty pages.
4059  */
4060 int
4061 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4062 	int flags, cred_t *cr)
4063 {
4064 	u_offset_t io_off;
4065 	u_offset_t lbn_off;
4066 	u_offset_t lbn;
4067 	size_t io_len;
4068 	uint_t bsize;
4069 	int error;
4070 	rnode_t *rp;
4071 
4072 	ASSERT(!vn_is_readonly(vp));
4073 	ASSERT(pp != NULL);
4074 	ASSERT(cr != NULL);
4075 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4076 
4077 	rp = VTOR(vp);
4078 	ASSERT(rp->r_count > 0);
4079 
4080 	ASSERT(pp->p_offset <= MAXOFF32_T);
4081 
4082 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4083 	lbn = pp->p_offset / bsize;
4084 	lbn_off = lbn * bsize;
4085 
4086 	/*
4087 	 * Find a kluster that fits in one block, or in
4088 	 * one page if pages are bigger than blocks.  If
4089 	 * there is less file space allocated than a whole
4090 	 * page, we'll shorten the i/o request below.
4091 	 */
4092 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4093 	    roundup(bsize, PAGESIZE), flags);
4094 
4095 	/*
4096 	 * pvn_write_kluster shouldn't have returned a page with offset
4097 	 * behind the original page we were given.  Verify that.
4098 	 */
4099 	ASSERT((pp->p_offset / bsize) >= lbn);
4100 
4101 	/*
4102 	 * Now pp will have the list of kept dirty pages marked for
4103 	 * write back.  It will also handle invalidation and freeing
4104 	 * of pages that are not dirty.  Check for page length rounding
4105 	 * problems.
4106 	 */
4107 	if (io_off + io_len > lbn_off + bsize) {
4108 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4109 		io_len = lbn_off + bsize - io_off;
4110 	}
4111 	/*
4112 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4113 	 * consistent value of r_size.  RMODINPROGRESS is set in writerp()
4114 	 * while a uiomove() is in progress and r_size has not yet been
4115 	 * made consistent with the new size of the file.  When the
4116 	 * uiomove() completes, r_size is updated and the RMODINPROGRESS
4117 	 * flag is cleared.
4118 	 *
4119 	 * Without this handshaking, it is possible that nfs(3)_bio()
4120 	 * picks up the old value of r_size before the uiomove() in
4121 	 * writerp() completes.  This would result in the write through
4122 	 * nfs(3)_bio() being dropped.
4123 	 *
4124 	 * More precisely, there is a window between the time the uiomove()
4125 	 * completes and the time r_size is updated.  If a VOP_PUTPAGE()
4126 	 * operation intervenes in this window, the page will be picked up,
4127 	 * because it is dirty (it will be unlocked, unless it was
4128 	 * pagecreate'd).  When the page is picked up as dirty, the dirty
4129 	 * bit is reset (pvn_getdirty()).  In nfs(3)write(), r_size is
4130 	 * checked.  This will still be the old size.  Therefore the page
4131 	 * will not be written out.  When segmap_release() calls
4132 	 * VOP_PUTPAGE(), the page will be found to be clean and the
4133 	 * write will be dropped.
4134 	 */
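	/*
	 * A hedged sketch of the race this handshake prevents (thread A
	 * in writerp(), thread B doing VOP_PUTPAGE()):
	 *
	 *	A: uiomove() copies data into the page (page now dirty)
	 *	B: pvn_getdirty() picks up the page, clears the dirty bit
	 *	B: nfs(3)write() sees the old r_size and writes nothing
	 *	A: r_size = new size (too late, the dirty bit is gone)
	 *
	 * With RMODINPROGRESS detected below, the pages are re-dirtied
	 * and the rnode is marked RDIRTY so the write is retried later.
	 */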
4135 	if (rp->r_flags & RMODINPROGRESS) {
4136 		mutex_enter(&rp->r_statelock);
4137 		if ((rp->r_flags & RMODINPROGRESS) &&
4138 		    rp->r_modaddr + MAXBSIZE > io_off &&
4139 		    rp->r_modaddr < io_off + io_len) {
4140 			page_t *plist;
4141 			/*
4142 			 * A write is in progress for this region of the file.
4143 			 * If we did not detect RMODINPROGRESS here, this
4144 			 * path through nfs_putapage() would eventually reach
4145 			 * nfs(3)_bio() and might not write out all of the
4146 			 * data in the pages; we would lose data.  So we decide
4147 			 * to set the modified bit on each page in the page
4148 			 * list and mark the rnode with RDIRTY. This write
4149 			 * will be restarted at some later time.
4150 			 */
4151 			plist = pp;
4152 			while (plist != NULL) {
4153 				pp = plist;
4154 				page_sub(&plist, pp);
4155 				hat_setmod(pp);
4156 				page_io_unlock(pp);
4157 				page_unlock(pp);
4158 			}
4159 			rp->r_flags |= RDIRTY;
4160 			mutex_exit(&rp->r_statelock);
4161 			if (offp)
4162 				*offp = io_off;
4163 			if (lenp)
4164 				*lenp = io_len;
4165 			return (0);
4166 		}
4167 		mutex_exit(&rp->r_statelock);
4168 	}
4169 
4170 	if (flags & B_ASYNC) {
4171 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4172 		    nfs_sync_putapage);
4173 	} else
4174 		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4175 
4176 	if (offp)
4177 		*offp = io_off;
4178 	if (lenp)
4179 		*lenp = io_len;
4180 	return (error);
4181 }
4182 
4183 static int
4184 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4185 	int flags, cred_t *cr)
4186 {
4187 	int error;
4188 	rnode_t *rp;
4189 
4190 	flags |= B_WRITE;
4191 
4192 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4193 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4194 
4195 	rp = VTOR(vp);
4196 
4197 	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4198 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4199 		if (!(rp->r_flags & ROUTOFSPACE)) {
4200 			mutex_enter(&rp->r_statelock);
4201 			rp->r_flags |= ROUTOFSPACE;
4202 			mutex_exit(&rp->r_statelock);
4203 		}
4204 		flags |= B_ERROR;
4205 		pvn_write_done(pp, flags);
4206 		/*
4207 		 * If this was not an async thread, then try again to
4208 		 * write out the pages, but this time, also destroy
4209 		 * them whether or not the write is successful.  This
4210 		 * prevents memory from filling up with these pages;
4211 		 * destroying them is the only alternative if they
4212 		 * can't be written out.
4213 		 *
4214 		 * Don't do this if this is an async thread because
4215 		 * when the pages are unlocked in pvn_write_done,
4216 		 * some other thread could have come along, locked them,
4217 		 * and queued them for an async thread.  It would be
4218 		 * possible for all of the async threads to be tied
4219 		 * up waiting to lock the pages again and they would
4220 		 * all already be locked and waiting for an async
4221 		 * thread to handle them.  Deadlock.
4222 		 */
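		/*
		 * A sketch of the synchronous retry path described above:
		 *
		 *	nfs_rdwrlbn() fails with ENOSPC/EDQUOT/EACCES
		 *	pvn_write_done(pp, flags | B_ERROR)
		 *	nfs_putpage(vp, io_off, io_len,
		 *	    B_INVAL | B_FORCE, cr, NULL)
		 */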
4223 		if (!(flags & B_ASYNC)) {
4224 			error = nfs_putpage(vp, io_off, io_len,
4225 			    B_INVAL | B_FORCE, cr, NULL);
4226 		}
4227 	} else {
4228 		if (error)
4229 			flags |= B_ERROR;
4230 		else if (rp->r_flags & ROUTOFSPACE) {
4231 			mutex_enter(&rp->r_statelock);
4232 			rp->r_flags &= ~ROUTOFSPACE;
4233 			mutex_exit(&rp->r_statelock);
4234 		}
4235 		pvn_write_done(pp, flags);
4236 	}
4237 
4238 	return (error);
4239 }
4240 
4241 /* ARGSUSED */
4242 static int
4243 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4244 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4245 	caller_context_t *ct)
4246 {
4247 	struct segvn_crargs vn_a;
4248 	int error;
4249 	rnode_t *rp;
4250 	struct vattr va;
4251 
4252 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4253 		return (EIO);
4254 
4255 	if (vp->v_flag & VNOMAP)
4256 		return (ENOSYS);
4257 
4258 	if (off > MAXOFF32_T)
4259 		return (EFBIG);
4260 
4261 	if (off < 0 || off + len < 0)
4262 		return (ENXIO);
4263 
4264 	if (vp->v_type != VREG)
4265 		return (ENODEV);
4266 
4267 	/*
4268 	 * If there is cached data and if close-to-open consistency
4269 	 * checking is not turned off and if the file system is not
4270 	 * mounted readonly, then force an over the wire getattr.
4271 	 * Otherwise, just invoke nfsgetattr to get a copy of the
4272 	 * attributes.  The attribute cache will be used unless it
4273 	 * has timed out, in which case an over the wire getattr
4274 	 * will be issued.
4275 	 */
4276 	va.va_mask = AT_ALL;
4277 	if (vn_has_cached_data(vp) &&
4278 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4279 		error = nfs_getattr_otw(vp, &va, cr);
4280 	else
4281 		error = nfsgetattr(vp, &va, cr);
4282 	if (error)
4283 		return (error);
4284 
4285 	/*
4286 	 * Check to see if the vnode is currently marked as not cachable.
4287 	 * This means portions of the file are locked (through VOP_FRLOCK).
4288 	 * In this case the map request must be refused.  We use
4289 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
4290 	 */
4291 	rp = VTOR(vp);
4292 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
4293 		return (EINTR);
4294 
4295 	if (vp->v_flag & VNOCACHE) {
4296 		error = EAGAIN;
4297 		goto done;
4298 	}
4299 
4300 	/*
4301 	 * Don't allow concurrent locks and mapping if mandatory locking is
4302 	 * enabled.
4303 	 */
4304 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4305 	    MANDLOCK(vp, va.va_mode)) {
4306 		error = EAGAIN;
4307 		goto done;
4308 	}
4309 
4310 	as_rangelock(as);
4311 	if (!(flags & MAP_FIXED)) {
4312 		map_addr(addrp, len, off, 1, flags);
4313 		if (*addrp == NULL) {
4314 			as_rangeunlock(as);
4315 			error = ENOMEM;
4316 			goto done;
4317 		}
4318 	} else {
4319 		/*
4320 		 * User specified address - blow away any previous mappings
4321 		 */
4322 		(void) as_unmap(as, *addrp, len);
4323 	}
4324 
4325 	vn_a.vp = vp;
4326 	vn_a.offset = off;
4327 	vn_a.type = (flags & MAP_TYPE);
4328 	vn_a.prot = (uchar_t)prot;
4329 	vn_a.maxprot = (uchar_t)maxprot;
4330 	vn_a.flags = (flags & ~MAP_TYPE);
4331 	vn_a.cred = cr;
4332 	vn_a.amp = NULL;
4333 	vn_a.szc = 0;
4334 	vn_a.lgrp_mem_policy_flags = 0;
4335 
4336 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4337 	as_rangeunlock(as);
4338 
4339 done:
4340 	nfs_rw_exit(&rp->r_lkserlock);
4341 	return (error);
4342 }
4343 
4344 /* ARGSUSED */
4345 static int
4346 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4347 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4348 	caller_context_t *ct)
4349 {
4350 	rnode_t *rp;
4351 
4352 	if (vp->v_flag & VNOMAP)
4353 		return (ENOSYS);
4354 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4355 		return (EIO);
4356 
4357 	/*
4358 	 * Need to hold the rwlock while incrementing the mapcnt so
4359 	 * that mmap'ing is serialized with writes and the caching
4360 	 * can be handled correctly.
4361 	 */
4362 	rp = VTOR(vp);
4363 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4364 		return (EINTR);
4365 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4366 	nfs_rw_exit(&rp->r_rwlock);
4367 
4368 	return (0);
4369 }
4370 
4371 /* ARGSUSED */
4372 static int
4373 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
4374 	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
4375 {
4376 	netobj lm_fh;
4377 	int rc;
4378 	u_offset_t start, end;
4379 	rnode_t *rp;
4380 	int error = 0, intr = INTR(vp);
4381 
4382 	/* check for valid cmd parameter */
4383 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4384 		return (EINVAL);
4385 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4386 		return (EIO);
4387 
4388 	/* Verify l_type. */
4389 	switch (bfp->l_type) {
4390 	case F_RDLCK:
4391 		if (cmd != F_GETLK && !(flag & FREAD))
4392 			return (EBADF);
4393 		break;
4394 	case F_WRLCK:
4395 		if (cmd != F_GETLK && !(flag & FWRITE))
4396 			return (EBADF);
4397 		break;
4398 	case F_UNLCK:
4399 		intr = 0;
4400 		break;
4401 
4402 	default:
4403 		return (EINVAL);
4404 	}
4405 
4406 	/* check the validity of the lock range */
4407 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4408 		return (rc);
4409 	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4410 		return (rc);
4411 
4412 	/*
4413 	 * If the filesystem is mounted using local locking, pass the
4414 	 * request off to the local locking code.
4415 	 */
4416 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4417 		if (offset > MAXOFF32_T)
4418 			return (EFBIG);
4419 		if (cmd == F_SETLK || cmd == F_SETLKW) {
4420 			/*
4421 			 * For complete safety, we should be holding
4422 			 * r_lkserlock.  However, we can't call
4423 			 * lm_safelock and then fs_frlock while
4424 			 * holding r_lkserlock, so just invoke
4425 			 * lm_safelock and expect that this will
4426 			 * catch enough of the cases.
4427 			 */
4428 			if (!lm_safelock(vp, bfp, cr))
4429 				return (EAGAIN);
4430 		}
4431 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4432 	}
4433 
4434 	rp = VTOR(vp);
4435 
4436 	/*
4437 	 * Check whether the given lock request can proceed, given the
4438 	 * current file mappings.
4439 	 */
4440 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4441 		return (EINTR);
4442 	if (cmd == F_SETLK || cmd == F_SETLKW) {
4443 		if (!lm_safelock(vp, bfp, cr)) {
4444 			rc = EAGAIN;
4445 			goto done;
4446 		}
4447 	}
4448 
4449 	/*
4450 	 * Flush the cache after waiting for async I/O to finish.  For new
4451 	 * locks, this is so that the process gets the latest bits from the
4452 	 * server.  For unlocks, this is so that other clients see the
4453 	 * latest bits once the file has been unlocked.  If currently dirty
4454 	 * pages can't be flushed, then don't allow a lock to be set.  But
4455 	 * allow unlocks to succeed, to avoid having orphan locks on the
4456 	 * server.
4457 	 */
4458 	if (cmd != F_GETLK) {
4459 		mutex_enter(&rp->r_statelock);
4460 		while (rp->r_count > 0) {
4461 			if (intr) {
4462 				klwp_t *lwp = ttolwp(curthread);
4463 
4464 				if (lwp != NULL)
4465 					lwp->lwp_nostop++;
4466 				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
4467 				    == 0) {
4468 					if (lwp != NULL)
4469 						lwp->lwp_nostop--;
4470 					rc = EINTR;
4471 					break;
4472 				}
4473 				if (lwp != NULL)
4474 					lwp->lwp_nostop--;
4475 			} else
4476 				cv_wait(&rp->r_cv, &rp->r_statelock);
4477 		}
4478 		mutex_exit(&rp->r_statelock);
4479 		if (rc != 0)
4480 			goto done;
4481 		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
4482 		if (error) {
4483 			if (error == ENOSPC || error == EDQUOT) {
4484 				mutex_enter(&rp->r_statelock);
4485 				if (!rp->r_error)
4486 					rp->r_error = error;
4487 				mutex_exit(&rp->r_statelock);
4488 			}
4489 			if (bfp->l_type != F_UNLCK) {
4490 				rc = ENOLCK;
4491 				goto done;
4492 			}
4493 		}
4494 	}
4495 
4496 	lm_fh.n_len = sizeof (fhandle_t);
4497 	lm_fh.n_bytes = (char *)VTOFH(vp);
4498 
4499 	/*
4500 	 * Call the lock manager to do the real work of contacting
4501 	 * the server and obtaining the lock.
4502 	 */
4503 	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4504 
4505 	if (rc == 0)
4506 		nfs_lockcompletion(vp, cmd);
4507 
4508 done:
4509 	nfs_rw_exit(&rp->r_lkserlock);
4510 	return (rc);
4511 }
4512 
4513 /*
4514  * Free storage space associated with the specified vnode.  The portion
4515  * to be freed is specified by bfp->l_start and bfp->l_len (already
4516  * normalized to a "whence" of 0).
4517  *
4518  * This is an experimental facility whose continued existence is not
4519  * guaranteed.  Currently, we only support the special case
4520  * of l_len == 0, meaning free to end of file.
4521  */
4522 /* ARGSUSED */
4523 static int
4524 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4525 	offset_t offset, cred_t *cr, caller_context_t *ct)
4526 {
4527 	int error;
4528 
4529 	ASSERT(vp->v_type == VREG);
4530 	if (cmd != F_FREESP)
4531 		return (EINVAL);
4532 
4533 	if (offset > MAXOFF32_T)
4534 		return (EFBIG);
4535 
4536 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4537 	    (bfp->l_len > MAXOFF32_T))
4538 		return (EFBIG);
4539 
4540 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4541 		return (EIO);
4542 
4543 	error = convoff(vp, bfp, 0, offset);
4544 	if (!error) {
4545 		ASSERT(bfp->l_start >= 0);
4546 		if (bfp->l_len == 0) {
4547 			struct vattr va;
4548 
4549 			/*
4550 			 * ftruncate should not change the ctime and
4551 			 * mtime if we truncate the file to its
4552 			 * previous size.
4553 			 */
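			/*
			 * For example (a hedged sketch of the usual
			 * path): ftruncate(fd, 0) arrives here as
			 * F_FREESP with bfp->l_start == 0 and
			 * bfp->l_len == 0, and becomes a setattr of
			 * AT_SIZE to 0 below, unless the file is
			 * already empty.
			 */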
4554 			va.va_mask = AT_SIZE;
4555 			error = nfsgetattr(vp, &va, cr);
4556 			if (error || va.va_size == bfp->l_start)
4557 				return (error);
4558 			va.va_mask = AT_SIZE;
4559 			va.va_size = bfp->l_start;
4560 			error = nfssetattr(vp, &va, 0, cr);
4561 		} else
4562 			error = EINVAL;
4563 	}
4564 
4565 	return (error);
4566 }
4567 
4568 /* ARGSUSED */
4569 static int
4570 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4571 {
4572 
4573 	return (EINVAL);
4574 }
4575 
4576 /*
4577  * Set up and add an address space callback to do the work of the delmap
4578  * call.  The callback will be (and must be) deleted in the callback itself.
4579  *
4580  * This is done in order to take care of the problem that we have with holding
4581  * the address space's a_lock for a long period of time (e.g. if the NFS server
4582  * is down).  Callbacks will be executed in the address space code while the
4583  * a_lock is not held.  Holding the address space's a_lock causes things such
4584  * as ps and fork to hang because they are trying to acquire this lock as well.
4585  */
4586 /* ARGSUSED */
4587 static int
4588 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4589 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4590 	caller_context_t *ct)
4591 {
4592 	int			caller_found;
4593 	int			error;
4594 	rnode_t			*rp;
4595 	nfs_delmap_args_t	*dmapp;
4596 	nfs_delmapcall_t	*delmap_call;
4597 
4598 	if (vp->v_flag & VNOMAP)
4599 		return (ENOSYS);
4600 	/*
4601 	 * A process may not change zones if it has NFS pages mmap'ed
4602 	 * in, so we can't legitimately get here from the wrong zone.
4603 	 */
4604 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4605 
4606 	rp = VTOR(vp);
4607 
4608 	/*
4609 	 * The way that the address space of this process deletes its mapping
4610 	 * of this file is via the following call chains:
4611 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4612 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4613 	 *
4614 	 * With the use of address space callbacks we are allowed to drop the
4615 	 * address space lock, a_lock, while executing the NFS operations that
4616 	 * need to go over the wire.  Returning EAGAIN to the caller of this
4617 	 * function is what drives the execution of the callback that we add
4618 	 * below.  The callback will be executed by the address space code
4619 	 * after dropping the a_lock.  When the callback is finished, since
4620 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4621 	 * is called again on the same segment to finish the rest of the work
4622 	 * that needs to happen during unmapping.
4623 	 *
4624 	 * This action of calling back into the segment driver causes
4625 	 * nfs_delmap() to get called again, but since the callback was
4626 	 * already executed at this point, it already did the work and there
4627 	 * is nothing left for us to do.
4628 	 *
4629 	 * To Summarize:
4630 	 * - The first time nfs_delmap is called by the current thread is when
4631 	 * we add the caller associated with this delmap to the delmap caller
4632 	 * list, add the callback, and return EAGAIN.
4633 	 * - The second time in this call chain when nfs_delmap is called we
4634 	 * will find this caller in the delmap caller list and realize there
4635 	 * is no more work to do thus removing this caller from the list and
4636 	 * returning the error that was set in the callback execution.
4637 	 */
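	/*
	 * A hedged sketch of the two passes described above:
	 *
	 *	segvn_unmap() -> VOP_DELMAP() -> nfs_delmap()
	 *	    pass 1: add this caller to r_indelmap, register the
	 *	    callback, return EAGAIN
	 *	address space code drops a_lock, runs nfs_delmap_callback()
	 *	    (the pages are flushed over the wire), re-acquires
	 *	    a_lock, and calls segvn_unmap() again
	 *	segvn_unmap() -> VOP_DELMAP() -> nfs_delmap()
	 *	    pass 2: find this caller in r_indelmap, return the
	 *	    error recorded by the callback
	 */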
4638 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4639 	if (caller_found) {
4640 		/*
4641 		 * 'error' is from the actual delmap operations.  To avoid
4642 		 * hangs, we need to handle the return of EAGAIN differently
4643 		 * since this is what drives the callback execution.
4644 		 * In this case, we don't want to return EAGAIN, because
4645 		 * there are no callbacks left to execute.
4646 		 */
4647 		if (error == EAGAIN)
4648 			return (0);
4649 		else
4650 			return (error);
4651 	}
4652 
4653 	/* current caller was not in the list */
4654 	delmap_call = nfs_init_delmapcall();
4655 
4656 	mutex_enter(&rp->r_statelock);
4657 	list_insert_tail(&rp->r_indelmap, delmap_call);
4658 	mutex_exit(&rp->r_statelock);
4659 
4660 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4661 
4662 	dmapp->vp = vp;
4663 	dmapp->off = off;
4664 	dmapp->addr = addr;
4665 	dmapp->len = len;
4666 	dmapp->prot = prot;
4667 	dmapp->maxprot = maxprot;
4668 	dmapp->flags = flags;
4669 	dmapp->cr = cr;
4670 	dmapp->caller = delmap_call;
4671 
4672 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4673 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4674 
4675 	return (error ? error : EAGAIN);
4676 }
4677 
4678 /*
4679  * Remove some pages from an mmap'd vnode.  Just update the
4680  * count of pages.  If doing close-to-open, then flush all
4681  * of the pages associated with this file.  Otherwise, start
4682  * an asynchronous page flush to write out any dirty pages.
4683  * This will also associate a credential with the rnode which
4684  * can be used to write the pages.
4685  */
4686 /* ARGSUSED */
4687 static void
4688 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4689 {
4690 	int			error;
4691 	rnode_t			*rp;
4692 	mntinfo_t		*mi;
4693 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4694 
4695 	rp = VTOR(dmapp->vp);
4696 	mi = VTOMI(dmapp->vp);
4697 
4698 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4699 	ASSERT(rp->r_mapcnt >= 0);
4700 
4701 	/*
4702 	 * Initiate a page flush if there are pages, the file system
4703 	 * was not mounted readonly, the segment was mapped shared, and
4704 	 * the pages themselves were writeable.
4705 	 */
4706 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4707 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4708 		mutex_enter(&rp->r_statelock);
4709 		rp->r_flags |= RDIRTY;
4710 		mutex_exit(&rp->r_statelock);
4711 		/*
4712 		 * If this is a cross-zone access, a sync putpage won't work, so
4713 		 * the best we can do is try an async putpage.  That seems
4714 		 * better than something more draconian such as discarding the
4715 		 * dirty pages.
4716 		 */
4717 		if ((mi->mi_flags & MI_NOCTO) ||
4718 		    nfs_zone() != mi->mi_zone)
4719 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4720 			    B_ASYNC, dmapp->cr, NULL);
4721 		else
4722 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4723 			    0, dmapp->cr, NULL);
4724 		if (!error) {
4725 			mutex_enter(&rp->r_statelock);
4726 			error = rp->r_error;
4727 			rp->r_error = 0;
4728 			mutex_exit(&rp->r_statelock);
4729 		}
4730 	} else
4731 		error = 0;
4732 
4733 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4734 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4735 		    B_INVAL, dmapp->cr, NULL);
4736 
4737 	dmapp->caller->error = error;
4738 	(void) as_delete_callback(as, arg);
4739 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4740 }
4741 
4742 /* ARGSUSED */
4743 static int
4744 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4745 	caller_context_t *ct)
4746 {
4747 	int error = 0;
4748 
4749 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4750 		return (EIO);
4751 	/*
4752 	 * This looks a little weird because it's written in a general
4753 	 * manner but we make little use of cases.  If cntl() ever gets
4754 	 * widely used, the outer switch will make more sense.
4755 	 */
4756 
4757 	switch (cmd) {
4758 
4759 	/*
4760 	 * Large file spec - answer this new query with a hardcoded
4761 	 * constant based on the protocol (NFSv2 uses 32-bit offsets).
4762 	 */
4763 	case _PC_FILESIZEBITS:
4764 		*valp = 32;
4765 		return (0);
4766 
4767 	case _PC_LINK_MAX:
4768 	case _PC_NAME_MAX:
4769 	case _PC_PATH_MAX:
4770 	case _PC_SYMLINK_MAX:
4771 	case _PC_CHOWN_RESTRICTED:
4772 	case _PC_NO_TRUNC: {
4773 		mntinfo_t *mi;
4774 		struct pathcnf *pc;
4775 
4776 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4777 			return (EINVAL);
4778 		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4779 		switch (cmd) {
4780 		case _PC_LINK_MAX:
4781 			*valp = pc->pc_link_max;
4782 			break;
4783 		case _PC_NAME_MAX:
4784 			*valp = pc->pc_name_max;
4785 			break;
4786 		case _PC_PATH_MAX:
4787 		case _PC_SYMLINK_MAX:
4788 			*valp = pc->pc_path_max;
4789 			break;
4790 		case _PC_CHOWN_RESTRICTED:
4791 		case _PC_NO_TRUNC:
4792 			/*
4793 			 * If we got here, error is really a boolean
4794 			 * which indicates whether cmd is set or not.
4795 			 */
4796 			*valp = error ? 1 : 0;
4797 			error = 0;
4798 			break;
4806 		}
4807 		return (error ? EINVAL : 0);
4808 		}
4809 
4810 	case _PC_XATTR_EXISTS:
4811 		*valp = 0;
4812 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4813 			vnode_t *avp;
4814 			rnode_t *rp;
4815 			mntinfo_t *mi = VTOMI(vp);
4816 
4817 			if (!(mi->mi_flags & MI_EXTATTR))
4818 				return (0);
4819 
4820 			rp = VTOR(vp);
4821 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4822 			    INTR(vp)))
4823 				return (EINTR);
4824 
4825 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4826 			if (error || avp == NULL)
4827 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4828 
4829 			nfs_rw_exit(&rp->r_rwlock);
4830 
4831 			if (error == 0 && avp != NULL) {
4832 				VN_RELE(avp);
4833 				*valp = 1;
4834 			}
4835 		}
4836 		return (error ? EINVAL : 0);
4837 
4838 	case _PC_ACL_ENABLED:
4839 		*valp = _ACL_ACLENT_ENABLED;
4840 		return (0);
4841 
4842 	default:
4843 		return (EINVAL);
4844 	}
4845 }
4846 
4847 /*
4848  * Called by async thread to do synchronous pageio. Do the i/o, wait
4849  * for it to complete, and cleanup the page list when done.
4850  */
4851 static int
4852 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4853 	int flags, cred_t *cr)
4854 {
4855 	int error;
4856 
4857 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4858 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4859 	if (flags & B_READ)
4860 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4861 	else
4862 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4863 	return (error);
4864 }
4865 
4866 /* ARGSUSED */
4867 static int
4868 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4869 	int flags, cred_t *cr, caller_context_t *ct)
4870 {
4871 	int error;
4872 	rnode_t *rp;
4873 
4874 	if (pp == NULL)
4875 		return (EINVAL);
4876 
4877 	if (io_off > MAXOFF32_T)
4878 		return (EFBIG);
4879 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4880 		return (EIO);
4881 	rp = VTOR(vp);
4882 	mutex_enter(&rp->r_statelock);
4883 	rp->r_count++;
4884 	mutex_exit(&rp->r_statelock);
4885 
4886 	if (flags & B_ASYNC) {
4887 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4888 		    nfs_sync_pageio);
4889 	} else
4890 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4891 	mutex_enter(&rp->r_statelock);
4892 	rp->r_count--;
4893 	cv_broadcast(&rp->r_cv);
4894 	mutex_exit(&rp->r_statelock);
4895 	return (error);
4896 }
4897 
4898 /* ARGSUSED */
4899 static int
4900 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4901 	caller_context_t *ct)
4902 {
4903 	int error;
4904 	mntinfo_t *mi;
4905 
4906 	mi = VTOMI(vp);
4907 
4908 	if (nfs_zone() != mi->mi_zone)
4909 		return (EIO);
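	/*
	 * The MI_ACL check is repeated below because, presumably, the
	 * ACL call can clear MI_ACL as a side effect (e.g. if the
	 * server turns out not to support the ACL protocol); in that
	 * case we fall through and return ENOSYS.
	 */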
4910 	if (mi->mi_flags & MI_ACL) {
4911 		error = acl_setacl2(vp, vsecattr, flag, cr);
4912 		if (mi->mi_flags & MI_ACL)
4913 			return (error);
4914 	}
4915 
4916 	return (ENOSYS);
4917 }
4918 
4919 /* ARGSUSED */
4920 static int
4921 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4922 	caller_context_t *ct)
4923 {
4924 	int error;
4925 	mntinfo_t *mi;
4926 
4927 	mi = VTOMI(vp);
4928 
4929 	if (nfs_zone() != mi->mi_zone)
4930 		return (EIO);
4931 	if (mi->mi_flags & MI_ACL) {
4932 		error = acl_getacl2(vp, vsecattr, flag, cr);
4933 		if (mi->mi_flags & MI_ACL)
4934 			return (error);
4935 	}
4936 
4937 	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
4938 }
4939 
4940 /* ARGSUSED */
4941 static int
4942 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
4943 	caller_context_t *ct)
4944 {
4945 	int error;
4946 	struct shrlock nshr;
4947 	struct nfs_owner nfs_owner;
4948 	netobj lm_fh;
4949 
4950 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4951 		return (EIO);
4952 
4953 	/*
4954 	 * check for valid cmd parameter
4955 	 */
4956 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
4957 		return (EINVAL);
4958 
4959 	/*
4960 	 * Check access permissions
4961 	 */
4962 	if (cmd == F_SHARE &&
4963 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
4964 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
4965 		return (EBADF);
4966 
4967 	/*
4968 	 * If the filesystem is mounted using local locking, pass the
4969 	 * request off to the local share code.
4970 	 */
4971 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
4972 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
4973 
4974 	switch (cmd) {
4975 	case F_SHARE:
4976 	case F_UNSHARE:
4977 		lm_fh.n_len = sizeof (fhandle_t);
4978 		lm_fh.n_bytes = (char *)VTOFH(vp);
4979 
4980 		/*
4981 		 * If passed an owner that is too large to fit in an
4982 		 * nfs_owner, it is likely a recursive call from the
4983 		 * lock manager client, so pass it straight through.
4984 		 * If it is not an nfs_owner, simply return an error.
4985 		 */
4986 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
4987 			if (((struct nfs_owner *)shr->s_owner)->magic !=
4988 			    NFS_OWNER_MAGIC)
4989 				return (EINVAL);
4990 
4991 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
4992 				error = set_errno(error);
4993 			}
4994 			return (error);
4995 		}
4996 		/*
4997 		 * A remote share reservation's owner is a combination
4998 		 * of a magic number, the hostname, and the local owner.
4999 		 */
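		/*
		 * A sketch of the owner built below (field names from
		 * struct nfs_owner; sizes omitted):
		 *
		 *	magic	NFS_OWNER_MAGIC
		 *	hname	uts_nodename(), NUL-padded
		 *	lowner	caller's opaque owner bytes
		 */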
5000 		bzero(&nfs_owner, sizeof (nfs_owner));
5001 		nfs_owner.magic = NFS_OWNER_MAGIC;
5002 		(void) strncpy(nfs_owner.hname, uts_nodename(),
5003 		    sizeof (nfs_owner.hname));
5004 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
5005 		nshr.s_access = shr->s_access;
5006 		nshr.s_deny = shr->s_deny;
5007 		nshr.s_sysid = 0;
5008 		nshr.s_pid = ttoproc(curthread)->p_pid;
5009 		nshr.s_own_len = sizeof (nfs_owner);
5010 		nshr.s_owner = (caddr_t)&nfs_owner;
5011 
5012 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
5013 			error = set_errno(error);
5014 		}
5015 
5016 		break;
5017 
5018 	case F_HASREMOTELOCKS:
5019 		/*
5020 		 * NFS client can't store remote locks itself
5021 		 */
5022 		shr->s_access = 0;
5023 		error = 0;
5024 		break;
5025 
5026 	default:
5027 		error = EINVAL;
5028 		break;
5029 	}
5030 
5031 	return (error);
5032 }
5033