xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs3_vnops.c (revision f275d02f08c70e13825071e2577d1481e8bba78e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/time.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/file.h>
40 #include <sys/filio.h>
41 #include <sys/uio.h>
42 #include <sys/buf.h>
43 #include <sys/mman.h>
44 #include <sys/pathname.h>
45 #include <sys/dirent.h>
46 #include <sys/debug.h>
47 #include <sys/vmsystm.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/swap.h>
51 #include <sys/errno.h>
52 #include <sys/strsubr.h>
53 #include <sys/sysmacros.h>
54 #include <sys/kmem.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathconf.h>
57 #include <sys/utsname.h>
58 #include <sys/dnlc.h>
59 #include <sys/acl.h>
60 #include <sys/systeminfo.h>
61 #include <sys/atomic.h>
62 #include <sys/policy.h>
63 #include <sys/sdt.h>
64 #include <sys/zone.h>
65 
66 #include <rpc/types.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69 #include <rpc/rpc_rdma.h>
70 
71 #include <nfs/nfs.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75 #include <nfs/lm.h>
76 
77 #include <vm/hat.h>
78 #include <vm/as.h>
79 #include <vm/page.h>
80 #include <vm/pvn.h>
81 #include <vm/seg.h>
82 #include <vm/seg_map.h>
83 #include <vm/seg_kpm.h>
84 #include <vm/seg_vn.h>
85 
86 #include <fs/fs_subr.h>
87 
88 #include <sys/ddi.h>
89 
/*
 * Prototypes for the file-local (static) helper routines.
 */
90 static int	nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
91 			cred_t *);
92 static int	nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
93 			stable_how *);
94 static int	nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
95 static int	nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
96 static int	nfs3_accessx(void *, int, cred_t *);
97 static int	nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
98 static int	nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
99 static int	nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
100 			int, vnode_t **, cred_t *, int);
101 static int	nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
102 static int	nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
103 			int, vnode_t **, cred_t *);
104 static int	nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
105 			caller_context_t *);
106 static int	do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
107 static void	nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
108 static void	nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
109 static int	nfs3_bio(struct buf *, stable_how *, cred_t *);
110 static int	nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
111 			page_t *[], size_t, struct seg *, caddr_t,
112 			enum seg_rw, cred_t *);
113 static void	nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
114 			cred_t *);
115 static int	nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
116 			int, cred_t *);
117 static int	nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
118 			int, cred_t *);
119 static int	nfs3_commit(vnode_t *, offset3, count3, cred_t *);
120 static void	nfs3_set_mod(vnode_t *);
121 static void	nfs3_get_commit(vnode_t *);
122 static void	nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
/* The prototype below is compiled out; it is kept only for reference. */
123 #if 0 /* unused */
124 #ifdef DEBUG
125 static int	nfs3_no_uncommitted_pages(vnode_t *);
126 #endif
127 #endif /* unused */
128 static int	nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
129 static int	nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
130 static int	nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
131 			cred_t *);
132 static void	nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
133 			cred_t *);
134 static void	nfs3_delmap_callback(struct as *, void *, uint_t);
135 
136 /*
137  * Error flags used to pass information about certain special errors
138  * which need to be handled specially.
 *
 * NOTE(review): both values are negative, presumably so that they can
 * never collide with the positive errno values returned by the routines
 * in this file -- confirm against the code that produces and consumes
 * them.
139  */
140 #define	NFS_EOF			-98
141 #define	NFS_VERF_MISMATCH	-97
142 
143 /*
 * ALIGN64 advances ptr to the next 64-bit (sizeof (uint64_t)) boundary
 * and reduces sz by the number of bytes skipped.  x is scratch storage;
 * on completion it holds the number of bytes skipped (0 when ptr was
 * already aligned).
 *
 * The expansion is an assignment followed by a bare if statement rather
 * than a single statement, so it must not be used as the unbraced body
 * of an if/else or loop.  ptr and sz are modified in place, and x and
 * ptr are evaluated more than once, so pass plain lvalues without side
 * effects.
 */
144 #define	ALIGN64(x, ptr, sz)						\
145 	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
146 	if (x) {							\
147 		x = sizeof (uint64_t) - (x);				\
148 		sz -= (x);						\
149 		ptr += (x);						\
150 	}
151 
152 /*
153  * These are the vnode ops routines which implement the vnode interface to
154  * the networked file system.  These routines just take their parameters,
155  * make them look networkish by putting the right info into interface structs,
156  * and then calling the appropriate remote routine(s) to do the work.
157  *
158  * Note on directory name lookup caching:  If we detect a stale fhandle,
159  * we purge the directory cache relative to that vnode.  This way, the
160  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
161  * more details on rnode locking.
162  */
163 
/*
 * Prototypes for the vnode operation entry points.  These are the
 * routines installed in nfs3_vnodeops_template.
 */
164 static int	nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
165 static int	nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
166 			caller_context_t *);
167 static int	nfs3_read(vnode_t *, struct uio *, int, cred_t *,
168 			caller_context_t *);
169 static int	nfs3_write(vnode_t *, struct uio *, int, cred_t *,
170 			caller_context_t *);
171 static int	nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
172 			caller_context_t *);
173 static int	nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
174 			caller_context_t *);
175 static int	nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
176 			caller_context_t *);
177 static int	nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
178 static int	nfs3_readlink(vnode_t *, struct uio *, cred_t *,
179 			caller_context_t *);
180 static int	nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
181 static void	nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
182 static int	nfs3_lookup(vnode_t *, char *, vnode_t **,
183 			struct pathname *, int, vnode_t *, cred_t *,
184 			caller_context_t *, int *, pathname_t *);
185 static int	nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
186 			int, vnode_t **, cred_t *, int, caller_context_t *,
187 			vsecattr_t *);
188 static int	nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
189 			int);
190 static int	nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
191 			caller_context_t *, int);
192 static int	nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
193 			caller_context_t *, int);
194 static int	nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
195 			cred_t *, caller_context_t *, int, vsecattr_t *);
196 static int	nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
197 			caller_context_t *, int);
198 static int	nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
199 			cred_t *, caller_context_t *, int);
200 static int	nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
201 			caller_context_t *, int);
202 static int	nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
203 static int	nfs3_rwlock(vnode_t *, int, caller_context_t *);
204 static void	nfs3_rwunlock(vnode_t *, int, caller_context_t *);
205 static int	nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
206 static int	nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
207 			page_t *[], size_t, struct seg *, caddr_t,
208 			enum seg_rw, cred_t *, caller_context_t *);
209 static int	nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
210 			caller_context_t *);
211 static int	nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
212 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
213 static int	nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
214 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
215 static int	nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
216 			struct flk_callback *, cred_t *, caller_context_t *);
217 static int	nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
218 			cred_t *, caller_context_t *);
219 static int	nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
220 static int	nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
221 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
222 static int	nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
223 			caller_context_t *);
224 static int	nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
225 			cred_t *, caller_context_t *);
226 static void	nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
227 			caller_context_t *);
228 static int	nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
229 			caller_context_t *);
230 static int	nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
231 			caller_context_t *);
232 static int	nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
233 			caller_context_t *);
234 
/*
 * Pointer to the NFSv3 vnode operations vector.  In the code visible
 * here it is only declared and handed out by nfs3_getvnodeops();
 * presumably it is filled in from nfs3_vnodeops_template during
 * initialization -- TODO confirm.
 */
235 struct vnodeops *nfs3_vnodeops;
236 
/*
 * Name/handler pairs for the NFSv3 vnode operations.  Each entry pairs a
 * VOPNAME_* identifier with the routine that implements it, and the list
 * is terminated by a NULL, NULL entry.  VOPNAME_DUMP and VOPNAME_VNEVENT
 * are wired to nfs_dump and fs_vnevent_support, which are not among the
 * nfs3_* routines declared in this file.
 */
237 const fs_operation_def_t nfs3_vnodeops_template[] = {
238 	VOPNAME_OPEN,		{ .vop_open = nfs3_open },
239 	VOPNAME_CLOSE,		{ .vop_close = nfs3_close },
240 	VOPNAME_READ,		{ .vop_read = nfs3_read },
241 	VOPNAME_WRITE,		{ .vop_write = nfs3_write },
242 	VOPNAME_IOCTL,		{ .vop_ioctl = nfs3_ioctl },
243 	VOPNAME_GETATTR,	{ .vop_getattr = nfs3_getattr },
244 	VOPNAME_SETATTR,	{ .vop_setattr = nfs3_setattr },
245 	VOPNAME_ACCESS,		{ .vop_access = nfs3_access },
246 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs3_lookup },
247 	VOPNAME_CREATE,		{ .vop_create = nfs3_create },
248 	VOPNAME_REMOVE,		{ .vop_remove = nfs3_remove },
249 	VOPNAME_LINK,		{ .vop_link = nfs3_link },
250 	VOPNAME_RENAME,		{ .vop_rename = nfs3_rename },
251 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs3_mkdir },
252 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs3_rmdir },
253 	VOPNAME_READDIR,	{ .vop_readdir = nfs3_readdir },
254 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs3_symlink },
255 	VOPNAME_READLINK,	{ .vop_readlink = nfs3_readlink },
256 	VOPNAME_FSYNC,		{ .vop_fsync = nfs3_fsync },
257 	VOPNAME_INACTIVE,	{ .vop_inactive = nfs3_inactive },
258 	VOPNAME_FID,		{ .vop_fid = nfs3_fid },
259 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs3_rwlock },
260 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs3_rwunlock },
261 	VOPNAME_SEEK,		{ .vop_seek = nfs3_seek },
262 	VOPNAME_FRLOCK,		{ .vop_frlock = nfs3_frlock },
263 	VOPNAME_SPACE,		{ .vop_space = nfs3_space },
264 	VOPNAME_REALVP,		{ .vop_realvp = nfs3_realvp },
265 	VOPNAME_GETPAGE,	{ .vop_getpage = nfs3_getpage },
266 	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs3_putpage },
267 	VOPNAME_MAP,		{ .vop_map = nfs3_map },
268 	VOPNAME_ADDMAP,		{ .vop_addmap = nfs3_addmap },
269 	VOPNAME_DELMAP,		{ .vop_delmap = nfs3_delmap },
270 	/* no separate nfs3_dump */
271 	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
272 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs3_pathconf },
273 	VOPNAME_PAGEIO,		{ .vop_pageio = nfs3_pageio },
274 	VOPNAME_DISPOSE,	{ .vop_dispose = nfs3_dispose },
275 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs3_setsecattr },
276 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs3_getsecattr },
277 	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs3_shrlock },
278 	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
279 	NULL,			NULL
280 };
281 
282 /*
 * Return the current value of the global nfs3_vnodeops pointer.  No
 * locking or validation is done here; the pointer is returned as is.
 *
283  * XXX:  This is referenced in modstubs.s
284  */
285 struct vnodeops *
286 nfs3_getvnodeops(void)
287 {
288 	return (nfs3_vnodeops);
289 }
290 
/*
 * VOP_OPEN entry point.
 *
 * Fails with EIO when the caller's zone (nfs_zone()) is not the zone
 * recorded in the mount's mntinfo.  Otherwise it caches cr in the rnode
 * (taking a hold on it) if the rnode has no credential yet, and then
 * decides how much cache validation is required before the open may
 * proceed; the policy is described in the comment in the body.
 *
 * Returns 0, or the error from nfs3_validate_caches() or
 * nfs3_getattr_otw().  flag and ct are not used.
 */
291 /* ARGSUSED */
292 static int
293 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
294 {
295 	int error;
296 	struct vattr va;
297 	rnode_t *rp;
298 	vnode_t *vp;
299 
300 	vp = *vpp;
301 	if (nfs_zone() != VTOMI(vp)->mi_zone)
302 		return (EIO);
303 	rp = VTOR(vp);
	/* r_cred is examined and set under r_statelock. */
304 	mutex_enter(&rp->r_statelock);
305 	if (rp->r_cred == NULL) {
306 		crhold(cr);
307 		rp->r_cred = cr;
308 	}
309 	mutex_exit(&rp->r_statelock);
310 
311 	/*
312 	 * If there is no cached data or if close-to-open
313 	 * consistency checking is turned off, we can avoid
314 	 * the over the wire getattr.  Otherwise, if the
315 	 * file system is mounted readonly, then just verify
316 	 * the caches are up to date using the normal mechanism.
317 	 * Else, if the file is not mmap'd, then just mark
318 	 * the attributes as timed out.  They will be refreshed
319 	 * and the caches validated prior to being used.
320 	 * Else, the file system is mounted writeable so
321 	 * force an over the wire GETATTR in order to ensure
322 	 * that all cached data is valid.
	 *
	 * Note that a vnode with more than one hold (v_count > 1)
	 * always takes the validation path, whatever the state of
	 * the caches or of MI_NOCTO.  The cheap "mark attributes
	 * timed out" case additionally requires v_count == 1.
323 	 */
324 	if (vp->v_count > 1 ||
325 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
326 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
327 		if (vn_is_readonly(vp))
328 			error = nfs3_validate_caches(vp, cr);
329 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
330 			PURGE_ATTRCACHE(vp);
331 			error = 0;
332 		} else {
333 			va.va_mask = AT_ALL;
334 			error = nfs3_getattr_otw(vp, &va, cr);
335 		}
336 	} else
337 		error = 0;
338 
339 	return (error);
340 }
341 
/*
 * VOP_CLOSE entry point.
 *
 * Fails with EIO when called from a zone other than the one recorded in
 * the mount's mntinfo.  Otherwise the calling process's locks on the
 * file are always released (local record locks and shares when MI_LLOCK
 * is set, network locks otherwise).  When count > 1 nothing more is done
 * and 0 is returned.  Otherwise:
 *
 *   - the DNLC is purged for the vnode if r_unldvp is set;
 *   - if the file was open for write and has cached pages, the pages are
 *     flushed (asynchronously under MI_NOCTO, synchronously with a
 *     commit otherwise);
 *   - the rnode's saved r_error is collected and cleared, unless the
 *     flush itself already failed;
 *   - if RWRITEATTR is set, an over the wire GETATTR is issued and its
 *     result ignored.
 *
 * Returns the flush error if there was one, otherwise the saved r_error
 * value (0 if none).
 */
342 /* ARGSUSED */
343 static int
344 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
345 		caller_context_t *ct)
346 {
347 	rnode_t *rp;
348 	int error;
349 	struct vattr va;
350 
351 	/*
352 	 * zone_enter(2) prevents processes from changing zones with NFS files
353 	 * open; if we happen to get here from the wrong zone we can't do
354 	 * anything over the wire.
355 	 */
356 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
357 		/*
358 		 * We could attempt to clean up locks, except we're sure
359 		 * that the current process didn't acquire any locks on
360 		 * the file: any attempt to lock a file belonging to another
361 		 * zone will fail, and one can't lock an NFS file and then
362 		 * change zones, as that fails too.
363 		 *
364 		 * Returning an error here is the sane thing to do.  A
365 		 * subsequent call to VN_RELE() which translates to a
366 		 * nfs3_inactive() will clean up state: if the zone of the
367 		 * vnode's origin is still alive and kicking, an async worker
368 		 * thread will handle the request (from the correct zone), and
369 		 * everything (minus the commit and final nfs3_getattr_otw()
370 		 * call) should be OK. If the zone is going away
371 		 * nfs_async_inactive() will throw away cached pages inline.
372 		 */
373 		return (EIO);
374 	}
375 
376 	/*
377 	 * If we are using local locking for this filesystem, then
378 	 * release all of the SYSV style record locks.  Otherwise,
379 	 * we are doing network locking and we need to release all
380 	 * of the network locks.  All of the locks held by this
381 	 * process on this file are released no matter what the
382 	 * incoming reference count is.
383 	 */
384 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
385 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
386 		cleanshares(vp, ttoproc(curthread)->p_pid);
387 	} else
388 		nfs_lockrelease(vp, flag, offset, cr);
389 
390 	if (count > 1)
391 		return (0);
392 
393 	/*
394 	 * If the file has been `unlinked', then purge the
395 	 * DNLC so that this vnode will get recycled quicker
396 	 * and the .nfs* file on the server will get removed.
397 	 */
398 	rp = VTOR(vp);
399 	if (rp->r_unldvp != NULL)
400 		dnlc_purge_vp(vp);
401 
402 	/*
403 	 * If the file was open for write and there are pages,
404 	 * then if the file system was mounted using the "no-close-
405 	 *	to-open" semantics, then start an asynchronous flush
406 	 *	of the all of the pages in the file.
407 	 * else the file system was not mounted using the "no-close-
408 	 *	to-open" semantics, then do a synchronous flush and
409 	 *	commit of all of the dirty and uncommitted pages.
410 	 *
411 	 * The asynchronous flush of the pages in the "nocto" path
412 	 * mostly just associates a cred pointer with the rnode so
413 	 * writes which happen later will have a better chance of
414 	 * working.  It also starts the data being written to the
415 	 * server, but without unnecessarily delaying the application.
	 *
	 * In the "nocto" path an EAGAIN from the asynchronous flush
	 * is not treated as a failure.  In every case where the
	 * flush did not fail, the saved r_error is returned and
	 * reset to 0 under r_statelock.
416 	 */
417 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
418 		if (VTOMI(vp)->mi_flags & MI_NOCTO) {
419 			error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
420 			    cr, ct);
421 			if (error == EAGAIN)
422 				error = 0;
423 		} else
424 			error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
425 		if (!error) {
426 			mutex_enter(&rp->r_statelock);
427 			error = rp->r_error;
428 			rp->r_error = 0;
429 			mutex_exit(&rp->r_statelock);
430 		}
431 	} else {
432 		mutex_enter(&rp->r_statelock);
433 		error = rp->r_error;
434 		rp->r_error = 0;
435 		mutex_exit(&rp->r_statelock);
436 	}
437 
438 	/*
439 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
440 	 * refresh the attribute cache with a set of attributes which
441 	 * weren't returned from a WRITE.  This will enable the close-
442 	 * to-open processing to work.
	 *
	 * NOTE(review): va is passed without va_mask being set, whereas
	 * nfs3_open() sets va_mask = AT_ALL before the same call --
	 * confirm that nfs3_getattr_otw() never reads va_mask.
443 	 */
444 	if (rp->r_flags & RWRITEATTR)
445 		(void) nfs3_getattr_otw(vp, &va, cr);
446 
447 	return (error);
448 }
449 
/*
 * Read from the server straight into the caller's uio; nfs3_read() uses
 * this when the VM/page cache path is being bypassed.
 *
 * Starting at uiop->uio_loffset, NFSPROC3_READ requests of at most
 * mi_tsize bytes are issued until uiop->uio_resid bytes (as sampled on
 * entry) have been transferred or the server reports end of file.  A
 * request that fails with ENFS_TRYAGAIN is simply reissued.
 *
 * The progress of the loop is tracked in the local offset/count
 * variables.  NOTE(review): the uio itself is handed to the RPC layer
 * through res.uiop and args.res_uiop and is presumably advanced by the
 * XDR decode routine -- confirm.
 *
 * When mi_io_kstats is set, the run queue kstat is entered/exited around
 * each RPC and the read counters are updated under mi_lock.
 *
 * Returns 0 on success, the rfs3call() error, the errno translation of
 * a failing NFS status, or EIO (after logging a warning) when the
 * byte count in the reply does not match res.size.
 */
450 /* ARGSUSED */
451 static int
452 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
453 {
454 	mntinfo_t *mi;
455 	READ3args args;
456 	READ3uiores res;
457 	int tsize;
458 	offset_t offset;
459 	ssize_t count;
460 	int error;
461 	int douprintf;
462 	failinfo_t fi;
463 	char *sv_hostname;
464 
465 	mi = VTOMI(vp);
466 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
467 	sv_hostname = VTOR(vp)->r_server->sv_hostname;
468 
469 	douprintf = 1;
470 	args.file = *VTOFH3(vp);
471 	fi.vp = vp;
472 	fi.fhp = (caddr_t)&args.file;
473 	fi.copyproc = nfs3copyfh;
474 	fi.lookupproc = nfs3lookup;
475 	fi.xattrdirproc = acl_getxattrdir3;
476 
477 	res.uiop = uiop;
478 
479 	res.wlist = NULL;
480 
481 	offset = uiop->uio_loffset;
482 	count = uiop->uio_resid;
483 
484 	do {
485 		if (mi->mi_io_kstats) {
486 			mutex_enter(&mi->mi_lock);
487 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
488 			mutex_exit(&mi->mi_lock);
489 		}
490 
		/* Rebuild the arguments on every attempt, then (re)issue. */
491 		do {
492 			tsize = MIN(mi->mi_tsize, count);
493 			args.offset = (offset3)offset;
494 			args.count = (count3)tsize;
495 			res.size = (uint_t)tsize;
496 			args.res_uiop = uiop;
497 			args.res_data_val_alt = NULL;
498 
499 			error = rfs3call(mi, NFSPROC3_READ,
500 			    xdr_READ3args, (caddr_t)&args,
501 			    xdr_READ3uiores, (caddr_t)&res, cr,
502 			    &douprintf, &res.status, 0, &fi);
503 		} while (error == ENFS_TRYAGAIN);
504 
505 		if (mi->mi_io_kstats) {
506 			mutex_enter(&mi->mi_lock);
507 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
508 			mutex_exit(&mi->mi_lock);
509 		}
510 
511 		if (error)
512 			return (error);
513 
514 		error = geterrno3(res.status);
515 		if (error)
516 			return (error);
517 
518 		if (res.count != res.size) {
519 			zcmn_err(getzoneid(), CE_WARN,
520 "nfs3_directio_read: server %s returned incorrect amount",
521 			    sv_hostname);
522 			return (EIO);
523 		}
524 		count -= res.count;
525 		offset += res.count;
526 		if (mi->mi_io_kstats) {
527 			mutex_enter(&mi->mi_lock);
528 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
529 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
530 			mutex_exit(&mi->mi_lock);
531 		}
532 		lwp_stat_update(LWP_STAT_INBLK, 1);
533 	} while (count && !res.eof);
534 
535 	return (0);
536 }
537 
/*
 * VOP_READ entry point.
 *
 * The caller must hold rp->r_rwlock as a reader (asserted).
 *
 * Returns:
 *   EIO     when called from a zone other than the mount's zone;
 *   EISDIR  when vp is not a regular file (VREG);
 *   0       when uio_resid is 0 on entry;
 *   EINVAL  when uio_loffset, or uio_loffset + uio_resid, is negative;
 *   EINTR   when the wait for a cache purge is interrupted by a signal;
 *   otherwise the result of the direct I/O read, or of the cached read
 *   loop (0 on success).
 *
 * The cached path works on one MAXBSIZE-aligned window at a time: the
 * caches are validated, the request is clamped to the current r_size,
 * the data is copied out through vpm or segmap, and the window is
 * released.  The loop ends at end of file, on error, or when the uio is
 * exhausted.
 */
538 /* ARGSUSED */
539 static int
540 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
541 	caller_context_t *ct)
542 {
543 	rnode_t *rp;
544 	u_offset_t off;
545 	offset_t diff;
546 	int on;
547 	size_t n;
548 	caddr_t base;
549 	uint_t flags;
550 	int error = 0;
551 	mntinfo_t *mi;
552 
553 	rp = VTOR(vp);
554 	mi = VTOMI(vp);
555 
556 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
557 
558 	if (nfs_zone() != mi->mi_zone)
559 		return (EIO);
560 
561 	if (vp->v_type != VREG)
562 		return (EISDIR);
563 
564 	if (uiop->uio_resid == 0)
565 		return (0);
566 
567 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
568 		return (EINVAL);
569 
570 	/*
571 	 * Bypass VM if caching has been disabled (e.g., locking) or if
572 	 * using client-side direct I/O and the file is not mmap'd and
573 	 * there are no cached pages.
574 	 */
575 	if ((vp->v_flag & VNOCACHE) ||
576 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
577 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
578 	    !vn_has_cached_data(vp))) {
579 		return (nfs3_directio_read(vp, uiop, cr));
580 	}
581 
582 	do {
583 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
584 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
585 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
586 
587 		error = nfs3_validate_caches(vp, cr);
588 		if (error)
589 			break;
590 
		/*
		 * Wait out any cache purge in progress, then compute how
		 * much of the file lies at or beyond the current offset.
		 */
591 		mutex_enter(&rp->r_statelock);
592 		while (rp->r_flags & RINCACHEPURGE) {
593 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
594 				mutex_exit(&rp->r_statelock);
595 				return (EINTR);
596 			}
597 		}
598 		diff = rp->r_size - uiop->uio_loffset;
599 		mutex_exit(&rp->r_statelock);
		/* At or past end of file: nothing more to read. */
600 		if (diff <= 0)
601 			break;
602 		if (diff < n)
603 			n = (size_t)diff;
604 
605 		if (vpm_enable) {
606 			/*
607 			 * Copy data.
608 			 */
609 			error = vpm_data_copy(vp, off + on, n, uiop,
610 			    1, NULL, 0, S_READ);
611 		} else {
612 			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
613 			    S_READ);
614 
615 			error = uiomove(base + on, n, UIO_READ, uiop);
616 		}
617 
618 		if (!error) {
619 			/*
620 			 * If read a whole block or read to eof,
621 			 * won't need this buffer again soon.
622 			 */
623 			mutex_enter(&rp->r_statelock);
624 			if (n + on == MAXBSIZE ||
625 			    uiop->uio_loffset == rp->r_size)
626 				flags = SM_DONTNEED;
627 			else
628 				flags = 0;
629 			mutex_exit(&rp->r_statelock);
			/*
			 * NOTE(review): the vpm path copied from off + on
			 * but syncs from off with length n -- confirm that
			 * vpm_sync_pages() expects the window base here.
			 */
630 			if (vpm_enable) {
631 				error = vpm_sync_pages(vp, off, n, flags);
632 			} else {
633 				error = segmap_release(segkmap, base, flags);
634 			}
635 		} else {
			/* Keep the copy error; ignore the release result. */
636 			if (vpm_enable) {
637 				(void) vpm_sync_pages(vp, off, n, 0);
638 			} else {
639 				(void) segmap_release(segkmap, base, 0);
640 			}
641 		}
642 	} while (!error && uiop->uio_resid > 0);
643 
644 	return (error);
645 }
646 
/*
 * VOP_WRITE entry point.
 *
 * Returns:
 *   EISDIR  when vp is not a regular file (VREG);
 *   EIO     when called from a zone other than the mount's zone;
 *   0       when uio_resid is 0 on entry;
 *   EINTR   when taking r_rwlock (append case) or r_lkserlock is
 *           interrupted;
 *   EINVAL  when uio_loffset, or uio_loffset + uio_resid, is negative;
 *   EFBIG   when the write starts at or beyond the file size limit (the
 *           RLIMIT_FSIZE resource control action is also triggered);
 *   otherwise the error from the direct or cached write path, or 0.
 *
 * For FAPPEND the r_rwlock is upgraded from reader to writer if needed
 * and uio_loffset is set to the file size obtained from nfs3getattr().
 *
 * A write that crosses the file size limit is shortened to end at the
 * limit; the part that was cut off is kept in `remainder' and added back
 * to uio_resid before returning, so the caller sees a short write.
 *
 * The write itself is done with r_lkserlock held as a reader, either
 * directly to the server through a temporary kernel buffer (when VM is
 * being bypassed, or after an EACCES while faulting in a page) or
 * through the page cache one MAXBSIZE-aligned window at a time.
 *
 * On error uio_resid and uio_loffset are restored to the values saved
 * at the start of the chunk that failed.
 *
 * `resid' and `remainder' are ssize_t rather than int: they carry
 * uio_resid (an ssize_t) and a difference of 64-bit offsets, and an int
 * would silently truncate them for requests larger than INT_MAX bytes,
 * corrupting the residual count handed back to the caller.
 */
647 /* ARGSUSED */
648 static int
649 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
650 	caller_context_t *ct)
651 {
652 	rlim64_t limit = uiop->uio_llimit;
653 	rnode_t *rp;
654 	u_offset_t off;
655 	caddr_t base;
656 	uint_t flags;
657 	ssize_t remainder;
658 	size_t n;
659 	int on;
660 	int error;
661 	ssize_t resid;
662 	offset_t offset;
663 	mntinfo_t *mi;
664 	uint_t bsize;
665 
666 	rp = VTOR(vp);
667 
668 	if (vp->v_type != VREG)
669 		return (EISDIR);
670 
671 	mi = VTOMI(vp);
672 	if (nfs_zone() != mi->mi_zone)
673 		return (EIO);
674 	if (uiop->uio_resid == 0)
675 		return (0);
676 
677 	if (ioflag & FAPPEND) {
678 		struct vattr va;
679 
680 		/*
681 		 * Must serialize if appending.
682 		 */
683 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
684 			nfs_rw_exit(&rp->r_rwlock);
685 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
686 			    INTR(vp)))
687 				return (EINTR);
688 		}
689 
690 		va.va_mask = AT_SIZE;
691 		error = nfs3getattr(vp, &va, cr);
692 		if (error)
693 			return (error);
694 		uiop->uio_loffset = va.va_size;
695 	}
696 
	/* From here until the I/O loops, offset is the end of the request. */
697 	offset = uiop->uio_loffset + uiop->uio_resid;
698 
699 	if (uiop->uio_loffset < 0 || offset < 0)
700 		return (EINVAL);
701 
702 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
703 		limit = MAXOFFSET_T;
704 
705 	/*
706 	 * Check to make sure that the process will not exceed
707 	 * its limit on file size.  It is okay to write up to
708 	 * the limit, but not beyond.  Thus, the write which
709 	 * reaches the limit will be short and the next write
710 	 * will return an error.
711 	 */
712 	remainder = 0;
713 	if (offset > limit) {
714 		remainder = offset - limit;
715 		uiop->uio_resid = limit - uiop->uio_loffset;
716 		if (uiop->uio_resid <= 0) {
717 			proc_t *p = ttoproc(curthread);
718 
			/* Nothing fits: undo the trim and fail the write. */
719 			uiop->uio_resid += remainder;
720 			mutex_enter(&p->p_lock);
721 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
722 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
723 			mutex_exit(&p->p_lock);
724 			return (EFBIG);
725 		}
726 	}
727 
728 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
729 		return (EINTR);
730 
731 	/*
732 	 * Bypass VM if caching has been disabled (e.g., locking) or if
733 	 * using client-side direct I/O and the file is not mmap'd and
734 	 * there are no cached pages.
735 	 */
736 	if ((vp->v_flag & VNOCACHE) ||
737 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
738 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
739 	    !vn_has_cached_data(vp))) {
740 		size_t bufsize;
741 		int count;
742 		u_offset_t org_offset;
743 		stable_how stab_comm;
744 
		/*
		 * Also entered by goto from the cached path below when
		 * faulting in a page failed with EACCES.
		 */
745 nfs3_fwrite:
746 		if (rp->r_flags & RSTALE) {
747 			resid = uiop->uio_resid;
748 			offset = uiop->uio_loffset;
749 			error = rp->r_error;
750 			/*
751 			 * A close may have cleared r_error, if so,
752 			 * propagate ESTALE error return properly
753 			 */
754 			if (error == 0)
755 				error = ESTALE;
756 			goto bottom;
757 		}
		/*
		 * Stage the data through a kernel buffer of at most
		 * mi_stsize bytes; count is bounded by bufsize, so it
		 * fits in an int.
		 */
758 		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
759 		base = kmem_alloc(bufsize, KM_SLEEP);
760 		do {
761 			if (ioflag & FDSYNC)
762 				stab_comm = DATA_SYNC;
763 			else
764 				stab_comm = FILE_SYNC;
			/* Remember where this chunk began for error recovery. */
765 			resid = uiop->uio_resid;
766 			offset = uiop->uio_loffset;
767 			count = MIN(uiop->uio_resid, bufsize);
768 			org_offset = uiop->uio_loffset;
769 			error = uiomove(base, count, UIO_WRITE, uiop);
770 			if (!error) {
771 				error = nfs3write(vp, base, org_offset,
772 				    count, cr, &stab_comm);
773 			}
774 		} while (!error && uiop->uio_resid > 0);
775 		kmem_free(base, bufsize);
776 		goto bottom;
777 	}
778 
779 
780 	bsize = vp->v_vfsp->vfs_bsize;
781 
782 	do {
783 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
784 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
785 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
786 
		/* Remember where this chunk began for error recovery. */
787 		resid = uiop->uio_resid;
788 		offset = uiop->uio_loffset;
789 
790 		if (rp->r_flags & RSTALE) {
791 			error = rp->r_error;
792 			/*
793 			 * A close may have cleared r_error, if so,
794 			 * propagate ESTALE error return properly
795 			 */
796 			if (error == 0)
797 				error = ESTALE;
798 			break;
799 		}
800 
801 		/*
802 		 * Don't create dirty pages faster than they
803 		 * can be cleaned so that the system doesn't
804 		 * get imbalanced.  If the async queue is
805 		 * maxed out, then wait for it to drain before
806 		 * creating more dirty pages.  Also, wait for
807 		 * any threads doing pagewalks in the vop_getattr
808 		 * entry points so that they don't block for
809 		 * long periods.
810 		 */
811 		mutex_enter(&rp->r_statelock);
812 		while ((mi->mi_max_threads != 0 &&
813 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
814 		    rp->r_gcount > 0)
815 			cv_wait(&rp->r_cv, &rp->r_statelock);
816 		mutex_exit(&rp->r_statelock);
817 
818 		/*
819 		 * Touch the page and fault it in if it is not in core
820 		 * before segmap_getmapflt or vpm_data_copy can lock it.
821 		 * This is to avoid the deadlock if the buffer is mapped
822 		 * to the same file through mmap which we want to write.
823 		 */
824 		uio_prefaultpages((long)n, uiop);
825 
826 		if (vpm_enable) {
827 			/*
828 			 * It will use kpm mappings, so no need to
829 			 * pass an address.
830 			 */
831 			error = writerp(rp, NULL, n, uiop, 0);
832 		} else  {
833 			if (segmap_kpm) {
834 				int pon = uiop->uio_loffset & PAGEOFFSET;
835 				size_t pn = MIN(PAGESIZE - pon,
836 				    uiop->uio_resid);
837 				int pagecreate;
838 
				/*
				 * The first page can be created rather than
				 * faulted in when the write starts on a page
				 * boundary and either covers the whole page
				 * or reaches the current end of file.
				 */
839 				mutex_enter(&rp->r_statelock);
840 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
841 				    uiop->uio_loffset + pn >= rp->r_size);
842 				mutex_exit(&rp->r_statelock);
843 
844 				base = segmap_getmapflt(segkmap, vp, off + on,
845 				    pn, !pagecreate, S_WRITE);
846 
847 				error = writerp(rp, base + pon, n, uiop,
848 				    pagecreate);
849 
850 			} else {
851 				base = segmap_getmapflt(segkmap, vp, off + on,
852 				    n, 0, S_READ);
853 				error = writerp(rp, base + on, n, uiop, 0);
854 			}
855 		}
856 
857 		if (!error) {
858 			if (mi->mi_flags & MI_NOAC)
859 				flags = SM_WRITE;
860 			else if ((uiop->uio_loffset % bsize) == 0 ||
861 			    IS_SWAPVP(vp)) {
862 				/*
863 				 * Have written a whole block.
864 				 * Start an asynchronous write
865 				 * and mark the buffer to
866 				 * indicate that it won't be
867 				 * needed again soon.
868 				 */
869 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
870 			} else
871 				flags = 0;
			/*
			 * Synchronous requests, and files marked out of
			 * space, are written through without SM_ASYNC.
			 */
872 			if ((ioflag & (FSYNC|FDSYNC)) ||
873 			    (rp->r_flags & ROUTOFSPACE)) {
874 				flags &= ~SM_ASYNC;
875 				flags |= SM_WRITE;
876 			}
877 			if (vpm_enable) {
878 				error = vpm_sync_pages(vp, off, n, flags);
879 			} else {
880 				error = segmap_release(segkmap, base, flags);
881 			}
882 		} else {
883 			if (vpm_enable) {
884 				(void) vpm_sync_pages(vp, off, n, 0);
885 			} else {
886 				(void) segmap_release(segkmap, base, 0);
887 			}
888 			/*
889 			 * In the event that we got an access error while
890 			 * faulting in a page for a write-only file just
891 			 * force a write.
892 			 */
893 			if (error == EACCES)
894 				goto nfs3_fwrite;
895 		}
896 	} while (!error && uiop->uio_resid > 0);
897 
898 bottom:
	/*
	 * On error, rewind the uio to the start of the failed chunk.  In
	 * both cases give back the part trimmed off for the size limit.
	 */
899 	if (error) {
900 		uiop->uio_resid = resid + remainder;
901 		uiop->uio_loffset = offset;
902 	} else
903 		uiop->uio_resid += remainder;
904 
905 	nfs_rw_exit(&rp->r_lkserlock);
906 
907 	return (error);
908 }
909 
910 /*
 * Perform page I/O on the page list pp, covering len bytes of vp
 * starting at file offset off.
 *
 * A buf is set up over the pages with pageio_setup(), mapped with
 * bp_mapin(), handed to nfs3_bio(), and then unmapped and released.
 * The error from nfs3_bio() is returned.
 *
 * Before the I/O every page on the list is tagged through p_fsdata:
 * C_DELAYCOMMIT, with an UNSTABLE request, when the request is an
 * asynchronous write (B_WRITE and B_ASYNC both set) and freemem exceeds
 * desfree; C_NOCOMMIT, with a FILE_SYNC request, otherwise.
 *
911  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 * (the body also tests B_WRITE).
912  */
913 static int
914 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
915 	int flags, cred_t *cr)
916 {
917 	struct buf *bp;
918 	int error;
919 	page_t *savepp;
920 	uchar_t fsdata;
921 	stable_how stab_comm;
922 
923 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
924 	bp = pageio_setup(pp, len, vp, flags);
925 	ASSERT(bp != NULL);
926 
927 	/*
928 	 * pageio_setup should have set b_addr to 0.  This
929 	 * is correct since we want to do I/O on a page
930 	 * boundary.  bp_mapin will use this addr to calculate
931 	 * an offset, and then set b_addr to the kernel virtual
932 	 * address it allocated for us.
933 	 */
934 	ASSERT(bp->b_un.b_addr == 0);
935 
936 	bp->b_edev = 0;
937 	bp->b_dev = 0;
938 	bp->b_lblkno = lbtodb(off);
939 	bp->b_file = vp;
940 	bp->b_offset = (offset_t)off;
941 	bp_mapin(bp);
942 
943 	/*
944 	 * Calculate the desired level of stability to write data
945 	 * on the server and then mark all of the pages to reflect
946 	 * this.
947 	 */
948 	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
949 	    freemem > desfree) {
950 		stab_comm = UNSTABLE;
951 		fsdata = C_DELAYCOMMIT;
952 	} else {
953 		stab_comm = FILE_SYNC;
954 		fsdata = C_NOCOMMIT;
955 	}
956 
	/*
	 * Walk the circular p_next list once.  The loop ends with pp
	 * equal to savepp again, which the second walk below relies on.
	 */
957 	savepp = pp;
958 	do {
959 		pp->p_fsdata = fsdata;
960 	} while ((pp = pp->p_next) != savepp);
961 
	/* nfs3_bio() may change stab_comm; it is examined again below. */
962 	error = nfs3_bio(bp, &stab_comm, cr);
963 
964 	bp_mapout(bp);
965 	pageio_done(bp);
966 
967 	/*
968 	 * If the server wrote pages in a more stable fashion than
969 	 * was requested, then clear all of the marks in the pages
970 	 * indicating that COMMIT operations were required.
971 	 */
972 	if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
973 		do {
974 			pp->p_fsdata = C_NOCOMMIT;
975 		} while ((pp = pp->p_next) != savepp);
976 	}
977 
978 	return (error);
979 }
980 
981 /*
982  * Write to file.  Writes to remote server in largest size
983  * chunks that the server can handle.  Write is synchronous.
984  */
985 static int
986 nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
987 	stable_how *stab_comm)
988 {
989 	mntinfo_t *mi;
990 	WRITE3args args;
991 	WRITE3res res;
992 	int error;
993 	int tsize;
994 	rnode_t *rp;
995 	int douprintf;
996 
997 	rp = VTOR(vp);
998 	mi = VTOMI(vp);
999 
1000 	ASSERT(nfs_zone() == mi->mi_zone);
1001 
1002 	args.file = *VTOFH3(vp);
1003 	args.stable = *stab_comm;
1004 
1005 	*stab_comm = FILE_SYNC;
1006 
1007 	douprintf = 1;
1008 
1009 	do {
1010 		if ((vp->v_flag & VNOCACHE) ||
1011 		    (rp->r_flags & RDIRECTIO) ||
1012 		    (mi->mi_flags & MI_DIRECTIO))
1013 			tsize = MIN(mi->mi_stsize, count);
1014 		else
1015 			tsize = MIN(mi->mi_curwrite, count);
1016 		args.offset = (offset3)offset;
1017 		args.count = (count3)tsize;
1018 		args.data.data_len = (uint_t)tsize;
1019 		args.data.data_val = base;
1020 
1021 		if (mi->mi_io_kstats) {
1022 			mutex_enter(&mi->mi_lock);
1023 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1024 			mutex_exit(&mi->mi_lock);
1025 		}
1026 		args.mblk = NULL;
1027 		do {
1028 			error = rfs3call(mi, NFSPROC3_WRITE,
1029 			    xdr_WRITE3args, (caddr_t)&args,
1030 			    xdr_WRITE3res, (caddr_t)&res, cr,
1031 			    &douprintf, &res.status, 0, NULL);
1032 		} while (error == ENFS_TRYAGAIN);
1033 		if (mi->mi_io_kstats) {
1034 			mutex_enter(&mi->mi_lock);
1035 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1036 			mutex_exit(&mi->mi_lock);
1037 		}
1038 
1039 		if (error)
1040 			return (error);
1041 		error = geterrno3(res.status);
1042 		if (!error) {
1043 			if (res.resok.count > args.count) {
1044 				zcmn_err(getzoneid(), CE_WARN,
1045 				    "nfs3write: server %s wrote %u, "
1046 				    "requested was %u",
1047 				    rp->r_server->sv_hostname,
1048 				    res.resok.count, args.count);
1049 				return (EIO);
1050 			}
1051 			if (res.resok.committed == UNSTABLE) {
1052 				*stab_comm = UNSTABLE;
1053 				if (args.stable == DATA_SYNC ||
1054 				    args.stable == FILE_SYNC) {
1055 					zcmn_err(getzoneid(), CE_WARN,
1056 			"nfs3write: server %s did not commit to stable storage",
1057 					    rp->r_server->sv_hostname);
1058 					return (EIO);
1059 				}
1060 			}
1061 			tsize = (int)res.resok.count;
1062 			count -= tsize;
1063 			base += tsize;
1064 			offset += tsize;
1065 			if (mi->mi_io_kstats) {
1066 				mutex_enter(&mi->mi_lock);
1067 				KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1068 				KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1069 				    tsize;
1070 				mutex_exit(&mi->mi_lock);
1071 			}
1072 			lwp_stat_update(LWP_STAT_OUBLK, 1);
1073 			mutex_enter(&rp->r_statelock);
1074 			if (rp->r_flags & RHAVEVERF) {
1075 				if (rp->r_verf != res.resok.verf) {
1076 					nfs3_set_mod(vp);
1077 					rp->r_verf = res.resok.verf;
1078 					/*
1079 					 * If the data was written UNSTABLE,
1080 					 * then might as well stop because
1081 					 * the whole block will have to get
1082 					 * rewritten anyway.
1083 					 */
1084 					if (*stab_comm == UNSTABLE) {
1085 						mutex_exit(&rp->r_statelock);
1086 						break;
1087 					}
1088 				}
1089 			} else {
1090 				rp->r_verf = res.resok.verf;
1091 				rp->r_flags |= RHAVEVERF;
1092 			}
1093 			/*
1094 			 * Mark the attribute cache as timed out and
1095 			 * set RWRITEATTR to indicate that the file
1096 			 * was modified with a WRITE operation and
1097 			 * that the attributes can not be trusted.
1098 			 */
1099 			PURGE_ATTRCACHE_LOCKED(rp);
1100 			rp->r_flags |= RWRITEATTR;
1101 			mutex_exit(&rp->r_statelock);
1102 		}
1103 	} while (!error && count);
1104 
1105 	return (error);
1106 }
1107 
1108 /*
1109  * Read from a file.  Reads data in largest chunks our interface can handle.
1110  */
1111 static int
1112 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1113 	size_t *residp, cred_t *cr)
1114 {
1115 	mntinfo_t *mi;
1116 	READ3args args;
1117 	READ3vres res;
1118 	int tsize;
1119 	int error;
1120 	int douprintf;
1121 	failinfo_t fi;
1122 	rnode_t *rp;
1123 	struct vattr va;
1124 	hrtime_t t;
1125 
1126 	rp = VTOR(vp);
1127 	mi = VTOMI(vp);
1128 	ASSERT(nfs_zone() == mi->mi_zone);
1129 	douprintf = 1;
1130 
1131 	args.file = *VTOFH3(vp);
1132 	fi.vp = vp;
1133 	fi.fhp = (caddr_t)&args.file;
1134 	fi.copyproc = nfs3copyfh;
1135 	fi.lookupproc = nfs3lookup;
1136 	fi.xattrdirproc = acl_getxattrdir3;
1137 
1138 	res.pov.fres.vp = vp;
1139 	res.pov.fres.vap = &va;
1140 
1141 	res.wlist = NULL;
1142 	*residp = count;
1143 	do {
1144 		if (mi->mi_io_kstats) {
1145 			mutex_enter(&mi->mi_lock);
1146 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1147 			mutex_exit(&mi->mi_lock);
1148 		}
1149 
1150 		do {
1151 			if ((vp->v_flag & VNOCACHE) ||
1152 			    (rp->r_flags & RDIRECTIO) ||
1153 			    (mi->mi_flags & MI_DIRECTIO))
1154 				tsize = MIN(mi->mi_tsize, count);
1155 			else
1156 				tsize = MIN(mi->mi_curread, count);
1157 			res.data.data_val = base;
1158 			res.data.data_len = tsize;
1159 			args.offset = (offset3)offset;
1160 			args.count = (count3)tsize;
1161 			args.res_uiop = NULL;
1162 			args.res_data_val_alt = base;
1163 
1164 			t = gethrtime();
1165 			error = rfs3call(mi, NFSPROC3_READ,
1166 			    xdr_READ3args, (caddr_t)&args,
1167 			    xdr_READ3vres, (caddr_t)&res, cr,
1168 			    &douprintf, &res.status, 0, &fi);
1169 		} while (error == ENFS_TRYAGAIN);
1170 
1171 		if (mi->mi_io_kstats) {
1172 			mutex_enter(&mi->mi_lock);
1173 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1174 			mutex_exit(&mi->mi_lock);
1175 		}
1176 
1177 		if (error)
1178 			return (error);
1179 
1180 		error = geterrno3(res.status);
1181 		if (error)
1182 			return (error);
1183 
1184 		if (res.count != res.data.data_len) {
1185 			zcmn_err(getzoneid(), CE_WARN,
1186 			    "nfs3read: server %s returned incorrect amount",
1187 			    rp->r_server->sv_hostname);
1188 			return (EIO);
1189 		}
1190 
1191 		count -= res.count;
1192 		*residp = count;
1193 		base += res.count;
1194 		offset += res.count;
1195 		if (mi->mi_io_kstats) {
1196 			mutex_enter(&mi->mi_lock);
1197 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1198 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1199 			mutex_exit(&mi->mi_lock);
1200 		}
1201 		lwp_stat_update(LWP_STAT_INBLK, 1);
1202 	} while (count && !res.eof);
1203 
1204 	if (res.pov.attributes) {
1205 		mutex_enter(&rp->r_statelock);
1206 		if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1207 			mutex_exit(&rp->r_statelock);
1208 			PURGE_ATTRCACHE(vp);
1209 		} else {
1210 			if (rp->r_mtime <= t)
1211 				nfs_attrcache_va(vp, &va);
1212 			mutex_exit(&rp->r_statelock);
1213 		}
1214 	}
1215 
1216 	return (0);
1217 }
1218 
1219 /* ARGSUSED */
1220 static int
1221 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1222 	caller_context_t *ct)
1223 {
1224 
1225 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1226 		return (EIO);
1227 	switch (cmd) {
1228 		case _FIODIRECTIO:
1229 			return (nfs_directio(vp, (int)arg, cr));
1230 		default:
1231 			return (ENOTTY);
1232 	}
1233 }
1234 
1235 /* ARGSUSED */
1236 static int
1237 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1238 	caller_context_t *ct)
1239 {
1240 	int error;
1241 	rnode_t *rp;
1242 
1243 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1244 		return (EIO);
1245 	/*
1246 	 * If it has been specified that the return value will
1247 	 * just be used as a hint, and we are only being asked
1248 	 * for size, fsid or rdevid, then return the client's
1249 	 * notion of these values without checking to make sure
1250 	 * that the attribute cache is up to date.
1251 	 * The whole point is to avoid an over the wire GETATTR
1252 	 * call.
1253 	 */
1254 	rp = VTOR(vp);
1255 	if (flags & ATTR_HINT) {
1256 		if (vap->va_mask ==
1257 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1258 			mutex_enter(&rp->r_statelock);
1259 			if (vap->va_mask | AT_SIZE)
1260 				vap->va_size = rp->r_size;
1261 			if (vap->va_mask | AT_FSID)
1262 				vap->va_fsid = rp->r_attr.va_fsid;
1263 			if (vap->va_mask | AT_RDEV)
1264 				vap->va_rdev = rp->r_attr.va_rdev;
1265 			mutex_exit(&rp->r_statelock);
1266 			return (0);
1267 		}
1268 	}
1269 
1270 	/*
1271 	 * Only need to flush pages if asking for the mtime
1272 	 * and if there any dirty pages or any outstanding
1273 	 * asynchronous (write) requests for this file.
1274 	 */
1275 	if (vap->va_mask & AT_MTIME) {
1276 		if (vn_has_cached_data(vp) &&
1277 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1278 			mutex_enter(&rp->r_statelock);
1279 			rp->r_gcount++;
1280 			mutex_exit(&rp->r_statelock);
1281 			error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1282 			mutex_enter(&rp->r_statelock);
1283 			if (error && (error == ENOSPC || error == EDQUOT)) {
1284 				if (!rp->r_error)
1285 					rp->r_error = error;
1286 			}
1287 			if (--rp->r_gcount == 0)
1288 				cv_broadcast(&rp->r_cv);
1289 			mutex_exit(&rp->r_statelock);
1290 		}
1291 	}
1292 
1293 	return (nfs3getattr(vp, vap, cr));
1294 }
1295 
1296 /*ARGSUSED4*/
1297 static int
1298 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1299 		caller_context_t *ct)
1300 {
1301 	int error;
1302 	struct vattr va;
1303 
1304 	if (vap->va_mask & AT_NOSET)
1305 		return (EINVAL);
1306 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1307 		return (EIO);
1308 
1309 	va.va_mask = AT_UID | AT_MODE;
1310 	error = nfs3getattr(vp, &va, cr);
1311 	if (error)
1312 		return (error);
1313 
1314 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1315 	    vp);
1316 	if (error)
1317 		return (error);
1318 
1319 	return (nfs3setattr(vp, vap, flags, cr));
1320 }
1321 
1322 static int
1323 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1324 {
1325 	int error;
1326 	uint_t mask;
1327 	SETATTR3args args;
1328 	SETATTR3res res;
1329 	int douprintf;
1330 	rnode_t *rp;
1331 	struct vattr va;
1332 	mode_t omode;
1333 	vsecattr_t *vsp;
1334 	hrtime_t t;
1335 
1336 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1337 	mask = vap->va_mask;
1338 
1339 	rp = VTOR(vp);
1340 
1341 	/*
1342 	 * Only need to flush pages if there are any pages and
1343 	 * if the file is marked as dirty in some fashion.  The
1344 	 * file must be flushed so that we can accurately
1345 	 * determine the size of the file and the cached data
1346 	 * after the SETATTR returns.  A file is considered to
1347 	 * be dirty if it is either marked with RDIRTY, has
1348 	 * outstanding i/o's active, or is mmap'd.  In this
1349 	 * last case, we can't tell whether there are dirty
1350 	 * pages, so we flush just to be sure.
1351 	 */
1352 	if (vn_has_cached_data(vp) &&
1353 	    ((rp->r_flags & RDIRTY) ||
1354 	    rp->r_count > 0 ||
1355 	    rp->r_mapcnt > 0)) {
1356 		ASSERT(vp->v_type != VCHR);
1357 		error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1358 		if (error && (error == ENOSPC || error == EDQUOT)) {
1359 			mutex_enter(&rp->r_statelock);
1360 			if (!rp->r_error)
1361 				rp->r_error = error;
1362 			mutex_exit(&rp->r_statelock);
1363 		}
1364 	}
1365 
1366 	args.object = *RTOFH3(rp);
1367 	/*
1368 	 * If the intent is for the server to set the times,
1369 	 * there is no point in have the mask indicating set mtime or
1370 	 * atime, because the vap values may be junk, and so result
1371 	 * in an overflow error. Remove these flags from the vap mask
1372 	 * before calling in this case, and restore them afterwards.
1373 	 */
1374 	if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1375 		/* Use server times, so don't set the args time fields */
1376 		vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1377 		error = vattr_to_sattr3(vap, &args.new_attributes);
1378 		vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1379 		if (mask & AT_ATIME) {
1380 			args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1381 		}
1382 		if (mask & AT_MTIME) {
1383 			args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1384 		}
1385 	} else {
1386 		/* Either do not set times or use the client specified times */
1387 		error = vattr_to_sattr3(vap, &args.new_attributes);
1388 	}
1389 
1390 	if (error) {
1391 		/* req time field(s) overflow - return immediately */
1392 		return (error);
1393 	}
1394 
1395 	va.va_mask = AT_MODE | AT_CTIME;
1396 	error = nfs3getattr(vp, &va, cr);
1397 	if (error)
1398 		return (error);
1399 	omode = va.va_mode;
1400 
1401 tryagain:
1402 	if (mask & AT_SIZE) {
1403 		args.guard.check = TRUE;
1404 		args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1405 		args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1406 	} else
1407 		args.guard.check = FALSE;
1408 
1409 	douprintf = 1;
1410 
1411 	t = gethrtime();
1412 
1413 	error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1414 	    xdr_SETATTR3args, (caddr_t)&args,
1415 	    xdr_SETATTR3res, (caddr_t)&res, cr,
1416 	    &douprintf, &res.status, 0, NULL);
1417 
1418 	/*
1419 	 * Purge the access cache and ACL cache if changing either the
1420 	 * owner of the file, the group owner, or the mode.  These may
1421 	 * change the access permissions of the file, so purge old
1422 	 * information and start over again.
1423 	 */
1424 	if (mask & (AT_UID | AT_GID | AT_MODE)) {
1425 		(void) nfs_access_purge_rp(rp);
1426 		if (rp->r_secattr != NULL) {
1427 			mutex_enter(&rp->r_statelock);
1428 			vsp = rp->r_secattr;
1429 			rp->r_secattr = NULL;
1430 			mutex_exit(&rp->r_statelock);
1431 			if (vsp != NULL)
1432 				nfs_acl_free(vsp);
1433 		}
1434 	}
1435 
1436 	if (error) {
1437 		PURGE_ATTRCACHE(vp);
1438 		return (error);
1439 	}
1440 
1441 	error = geterrno3(res.status);
1442 	if (!error) {
1443 		/*
1444 		 * If changing the size of the file, invalidate
1445 		 * any local cached data which is no longer part
1446 		 * of the file.  We also possibly invalidate the
1447 		 * last page in the file.  We could use
1448 		 * pvn_vpzero(), but this would mark the page as
1449 		 * modified and require it to be written back to
1450 		 * the server for no particularly good reason.
1451 		 * This way, if we access it, then we bring it
1452 		 * back in.  A read should be cheaper than a
1453 		 * write.
1454 		 */
1455 		if (mask & AT_SIZE) {
1456 			nfs_invalidate_pages(vp,
1457 			    (vap->va_size & PAGEMASK), cr);
1458 		}
1459 		nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1460 		/*
1461 		 * Some servers will change the mode to clear the setuid
1462 		 * and setgid bits when changing the uid or gid.  The
1463 		 * client needs to compensate appropriately.
1464 		 */
1465 		if (mask & (AT_UID | AT_GID)) {
1466 			int terror;
1467 
1468 			va.va_mask = AT_MODE;
1469 			terror = nfs3getattr(vp, &va, cr);
1470 			if (!terror &&
1471 			    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1472 			    (!(mask & AT_MODE) && va.va_mode != omode))) {
1473 				va.va_mask = AT_MODE;
1474 				if (mask & AT_MODE)
1475 					va.va_mode = vap->va_mode;
1476 				else
1477 					va.va_mode = omode;
1478 				(void) nfs3setattr(vp, &va, 0, cr);
1479 			}
1480 		}
1481 	} else {
1482 		nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1483 		/*
1484 		 * If we got back a "not synchronized" error, then
1485 		 * we need to retry with a new guard value.  The
1486 		 * guard value used is the change time.  If the
1487 		 * server returned post_op_attr, then we can just
1488 		 * retry because we have the latest attributes.
1489 		 * Otherwise, we issue a GETATTR to get the latest
1490 		 * attributes and then retry.  If we couldn't get
1491 		 * the attributes this way either, then we give
1492 		 * up because we can't complete the operation as
1493 		 * required.
1494 		 */
1495 		if (res.status == NFS3ERR_NOT_SYNC) {
1496 			va.va_mask = AT_CTIME;
1497 			if (nfs3getattr(vp, &va, cr) == 0)
1498 				goto tryagain;
1499 		}
1500 		PURGE_STALE_FH(error, vp, cr);
1501 	}
1502 
1503 	return (error);
1504 }
1505 
1506 static int
1507 nfs3_accessx(void *vp, int mode, cred_t *cr)
1508 {
1509 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1510 	return (nfs3_access(vp, mode, 0, cr, NULL));
1511 }
1512 
1513 /* ARGSUSED */
1514 static int
1515 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1516 {
1517 	int error;
1518 	ACCESS3args args;
1519 	ACCESS3res res;
1520 	int douprintf;
1521 	uint32 acc;
1522 	rnode_t *rp;
1523 	cred_t *cred, *ncr, *ncrfree = NULL;
1524 	failinfo_t fi;
1525 	nfs_access_type_t cacc;
1526 	hrtime_t t;
1527 
1528 	acc = 0;
1529 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1530 		return (EIO);
1531 	if (mode & VREAD)
1532 		acc |= ACCESS3_READ;
1533 	if (mode & VWRITE) {
1534 		if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1535 			return (EROFS);
1536 		if (vp->v_type == VDIR)
1537 			acc |= ACCESS3_DELETE;
1538 		acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1539 	}
1540 	if (mode & VEXEC) {
1541 		if (vp->v_type == VDIR)
1542 			acc |= ACCESS3_LOOKUP;
1543 		else
1544 			acc |= ACCESS3_EXECUTE;
1545 	}
1546 
1547 	rp = VTOR(vp);
1548 	args.object = *VTOFH3(vp);
1549 	if (vp->v_type == VDIR) {
1550 		args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1551 		    ACCESS3_EXTEND | ACCESS3_LOOKUP;
1552 	} else {
1553 		args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1554 		    ACCESS3_EXECUTE;
1555 	}
1556 	fi.vp = vp;
1557 	fi.fhp = (caddr_t)&args.object;
1558 	fi.copyproc = nfs3copyfh;
1559 	fi.lookupproc = nfs3lookup;
1560 	fi.xattrdirproc = acl_getxattrdir3;
1561 
1562 	cred = cr;
1563 	/*
1564 	 * ncr and ncrfree both initially
1565 	 * point to the memory area returned
1566 	 * by crnetadjust();
1567 	 * ncrfree not NULL when exiting means
1568 	 * that we need to release it
1569 	 */
1570 	ncr = crnetadjust(cred);
1571 	ncrfree = ncr;
1572 tryagain:
1573 	if (rp->r_acache != NULL) {
1574 		cacc = nfs_access_check(rp, acc, cred);
1575 		if (cacc == NFS_ACCESS_ALLOWED) {
1576 			if (ncrfree != NULL)
1577 				crfree(ncrfree);
1578 			return (0);
1579 		}
1580 		if (cacc == NFS_ACCESS_DENIED) {
1581 			/*
1582 			 * If the cred can be adjusted, try again
1583 			 * with the new cred.
1584 			 */
1585 			if (ncr != NULL) {
1586 				cred = ncr;
1587 				ncr = NULL;
1588 				goto tryagain;
1589 			}
1590 			if (ncrfree != NULL)
1591 				crfree(ncrfree);
1592 			return (EACCES);
1593 		}
1594 	}
1595 
1596 	douprintf = 1;
1597 
1598 	t = gethrtime();
1599 
1600 	error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1601 	    xdr_ACCESS3args, (caddr_t)&args,
1602 	    xdr_ACCESS3res, (caddr_t)&res, cred,
1603 	    &douprintf, &res.status, 0, &fi);
1604 
1605 	if (error) {
1606 		if (ncrfree != NULL)
1607 			crfree(ncrfree);
1608 		return (error);
1609 	}
1610 
1611 	error = geterrno3(res.status);
1612 	if (!error) {
1613 		nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1614 		nfs_access_cache(rp, args.access, res.resok.access, cred);
1615 		/*
1616 		 * we just cached results with cred; if cred is the
1617 		 * adjusted credentials from crnetadjust, we do not want
1618 		 * to release them before exiting: hence setting ncrfree
1619 		 * to NULL
1620 		 */
1621 		if (cred != cr)
1622 			ncrfree = NULL;
1623 		if ((acc & res.resok.access) != acc) {
1624 			/*
1625 			 * If the cred can be adjusted, try again
1626 			 * with the new cred.
1627 			 */
1628 			if (ncr != NULL) {
1629 				cred = ncr;
1630 				ncr = NULL;
1631 				goto tryagain;
1632 			}
1633 			error = EACCES;
1634 		}
1635 	} else {
1636 		nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1637 		PURGE_STALE_FH(error, vp, cr);
1638 	}
1639 
1640 	if (ncrfree != NULL)
1641 		crfree(ncrfree);
1642 
1643 	return (error);
1644 }
1645 
1646 static int nfs3_do_symlink_cache = 1;
1647 
1648 /* ARGSUSED */
1649 static int
1650 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1651 {
1652 	int error;
1653 	READLINK3args args;
1654 	READLINK3res res;
1655 	nfspath3 resdata_backup;
1656 	rnode_t *rp;
1657 	int douprintf;
1658 	int len;
1659 	failinfo_t fi;
1660 	hrtime_t t;
1661 
1662 	/*
1663 	 * Can't readlink anything other than a symbolic link.
1664 	 */
1665 	if (vp->v_type != VLNK)
1666 		return (EINVAL);
1667 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1668 		return (EIO);
1669 
1670 	rp = VTOR(vp);
1671 	if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1672 		error = nfs3_validate_caches(vp, cr);
1673 		if (error)
1674 			return (error);
1675 		mutex_enter(&rp->r_statelock);
1676 		if (rp->r_symlink.contents != NULL) {
1677 			error = uiomove(rp->r_symlink.contents,
1678 			    rp->r_symlink.len, UIO_READ, uiop);
1679 			mutex_exit(&rp->r_statelock);
1680 			return (error);
1681 		}
1682 		mutex_exit(&rp->r_statelock);
1683 	}
1684 
1685 	args.symlink = *VTOFH3(vp);
1686 	fi.vp = vp;
1687 	fi.fhp = (caddr_t)&args.symlink;
1688 	fi.copyproc = nfs3copyfh;
1689 	fi.lookupproc = nfs3lookup;
1690 	fi.xattrdirproc = acl_getxattrdir3;
1691 
1692 	res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1693 
1694 	resdata_backup = res.resok.data;
1695 
1696 	douprintf = 1;
1697 
1698 	t = gethrtime();
1699 
1700 	error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1701 	    xdr_READLINK3args, (caddr_t)&args,
1702 	    xdr_READLINK3res, (caddr_t)&res, cr,
1703 	    &douprintf, &res.status, 0, &fi);
1704 
1705 	if (res.resok.data == nfs3nametoolong)
1706 		error = EINVAL;
1707 
1708 	if (error) {
1709 		kmem_free(resdata_backup, MAXPATHLEN);
1710 		return (error);
1711 	}
1712 
1713 	error = geterrno3(res.status);
1714 	if (!error) {
1715 		nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1716 		    cr);
1717 		len = strlen(res.resok.data);
1718 		error = uiomove(res.resok.data, len, UIO_READ, uiop);
1719 		if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1720 			mutex_enter(&rp->r_statelock);
1721 				if (rp->r_symlink.contents == NULL) {
1722 				rp->r_symlink.contents = res.resok.data;
1723 				rp->r_symlink.len = len;
1724 				rp->r_symlink.size = MAXPATHLEN;
1725 				mutex_exit(&rp->r_statelock);
1726 			} else {
1727 				mutex_exit(&rp->r_statelock);
1728 
1729 				kmem_free((void *)res.resok.data, MAXPATHLEN);
1730 			}
1731 		} else {
1732 			kmem_free((void *)res.resok.data, MAXPATHLEN);
1733 		}
1734 	} else {
1735 		nfs3_cache_post_op_attr(vp,
1736 		    &res.resfail.symlink_attributes, t, cr);
1737 		PURGE_STALE_FH(error, vp, cr);
1738 
1739 		kmem_free((void *)res.resok.data, MAXPATHLEN);
1740 
1741 	}
1742 
1743 	/*
1744 	 * The over the wire error for attempting to readlink something
1745 	 * other than a symbolic link is ENXIO.  However, we need to
1746 	 * return EINVAL instead of ENXIO, so we map it here.
1747 	 */
1748 	return (error == ENXIO ? EINVAL : error);
1749 }
1750 
1751 /*
1752  * Flush local dirty pages to stable storage on the server.
1753  *
1754  * If FNODSYNC is specified, then there is nothing to do because
1755  * metadata changes are not cached on the client before being
1756  * sent to the server.
1757  */
1758 /* ARGSUSED */
1759 static int
1760 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1761 {
1762 	int error;
1763 
1764 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1765 		return (0);
1766 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1767 		return (EIO);
1768 
1769 	error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1770 	if (!error)
1771 		error = VTOR(vp)->r_error;
1772 	return (error);
1773 }
1774 
1775 /*
1776  * Weirdness: if the file was removed or the target of a rename
1777  * operation while it was open, it got renamed instead.  Here we
1778  * remove the renamed file.
1779  */
1780 /* ARGSUSED */
1781 static void
1782 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1783 {
1784 	rnode_t *rp;
1785 
1786 	ASSERT(vp != DNLC_NO_VNODE);
1787 
1788 	/*
1789 	 * If this is coming from the wrong zone, we let someone in the right
1790 	 * zone take care of it asynchronously.  We can get here due to
1791 	 * VN_RELE() being called from pageout() or fsflush().  This call may
1792 	 * potentially turn into an expensive no-op if, for instance, v_count
1793 	 * gets incremented in the meantime, but it's still correct.
1794 	 */
1795 	if (nfs_zone() != VTOMI(vp)->mi_zone) {
1796 		nfs_async_inactive(vp, cr, nfs3_inactive);
1797 		return;
1798 	}
1799 
1800 	rp = VTOR(vp);
1801 redo:
1802 	if (rp->r_unldvp != NULL) {
1803 		/*
1804 		 * Save the vnode pointer for the directory where the
1805 		 * unlinked-open file got renamed, then set it to NULL
1806 		 * to prevent another thread from getting here before
1807 		 * we're done with the remove.  While we have the
1808 		 * statelock, make local copies of the pertinent rnode
1809 		 * fields.  If we weren't to do this in an atomic way, the
1810 		 * the unl* fields could become inconsistent with respect
1811 		 * to each other due to a race condition between this
1812 		 * code and nfs_remove().  See bug report 1034328.
1813 		 */
1814 		mutex_enter(&rp->r_statelock);
1815 		if (rp->r_unldvp != NULL) {
1816 			vnode_t *unldvp;
1817 			char *unlname;
1818 			cred_t *unlcred;
1819 			REMOVE3args args;
1820 			REMOVE3res res;
1821 			int douprintf;
1822 			int error;
1823 			hrtime_t t;
1824 
1825 			unldvp = rp->r_unldvp;
1826 			rp->r_unldvp = NULL;
1827 			unlname = rp->r_unlname;
1828 			rp->r_unlname = NULL;
1829 			unlcred = rp->r_unlcred;
1830 			rp->r_unlcred = NULL;
1831 			mutex_exit(&rp->r_statelock);
1832 
1833 			/*
1834 			 * If there are any dirty pages left, then flush
1835 			 * them.  This is unfortunate because they just
1836 			 * may get thrown away during the remove operation,
1837 			 * but we have to do this for correctness.
1838 			 */
1839 			if (vn_has_cached_data(vp) &&
1840 			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1841 				ASSERT(vp->v_type != VCHR);
1842 				error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1843 				    cr, ct);
1844 				if (error) {
1845 					mutex_enter(&rp->r_statelock);
1846 					if (!rp->r_error)
1847 						rp->r_error = error;
1848 					mutex_exit(&rp->r_statelock);
1849 				}
1850 			}
1851 
1852 			/*
1853 			 * Do the remove operation on the renamed file
1854 			 */
1855 			setdiropargs3(&args.object, unlname, unldvp);
1856 
1857 			douprintf = 1;
1858 
1859 			t = gethrtime();
1860 
1861 			error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1862 			    xdr_diropargs3, (caddr_t)&args,
1863 			    xdr_REMOVE3res, (caddr_t)&res, unlcred,
1864 			    &douprintf, &res.status, 0, NULL);
1865 
1866 			if (error) {
1867 				PURGE_ATTRCACHE(unldvp);
1868 			} else {
1869 				error = geterrno3(res.status);
1870 				if (!error) {
1871 					nfs3_cache_wcc_data(unldvp,
1872 					    &res.resok.dir_wcc, t, cr);
1873 					if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1874 						nfs_purge_rddir_cache(unldvp);
1875 				} else {
1876 					nfs3_cache_wcc_data(unldvp,
1877 					    &res.resfail.dir_wcc, t, cr);
1878 					PURGE_STALE_FH(error, unldvp, cr);
1879 				}
1880 			}
1881 
1882 			/*
1883 			 * Release stuff held for the remove
1884 			 */
1885 			VN_RELE(unldvp);
1886 			kmem_free(unlname, MAXNAMELEN);
1887 			crfree(unlcred);
1888 			goto redo;
1889 		}
1890 		mutex_exit(&rp->r_statelock);
1891 	}
1892 
1893 	rp_addfree(rp, cr);
1894 }
1895 
1896 /*
1897  * Remote file system operations having to do with directory manipulation.
1898  */
1899 
1900 /* ARGSUSED */
1901 static int
1902 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1903 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1904 	int *direntflags, pathname_t *realpnp)
1905 {
1906 	int error;
1907 	vnode_t *vp;
1908 	vnode_t *avp = NULL;
1909 	rnode_t *drp;
1910 
1911 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1912 		return (EPERM);
1913 
1914 	drp = VTOR(dvp);
1915 
1916 	/*
1917 	 * Are we looking up extended attributes?  If so, "dvp" is
1918 	 * the file or directory for which we want attributes, and
1919 	 * we need a lookup of the hidden attribute directory
1920 	 * before we lookup the rest of the path.
1921 	 */
1922 	if (flags & LOOKUP_XATTR) {
1923 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1924 		mntinfo_t *mi;
1925 
1926 		mi = VTOMI(dvp);
1927 		if (!(mi->mi_flags & MI_EXTATTR))
1928 			return (EINVAL);
1929 
1930 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1931 			return (EINTR);
1932 
1933 		(void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1934 		if (avp == NULL)
1935 			error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1936 		else
1937 			error = 0;
1938 
1939 		nfs_rw_exit(&drp->r_rwlock);
1940 
1941 		if (error) {
1942 			if (mi->mi_flags & MI_EXTATTR)
1943 				return (error);
1944 			return (EINVAL);
1945 		}
1946 		dvp = avp;
1947 		drp = VTOR(dvp);
1948 	}
1949 
1950 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1951 		error = EINTR;
1952 		goto out;
1953 	}
1954 
1955 	error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1956 
1957 	nfs_rw_exit(&drp->r_rwlock);
1958 
1959 	/*
1960 	 * If vnode is a device, create special vnode.
1961 	 */
1962 	if (!error && IS_DEVVP(*vpp)) {
1963 		vp = *vpp;
1964 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1965 		VN_RELE(vp);
1966 	}
1967 
1968 out:
1969 	if (avp != NULL)
1970 		VN_RELE(avp);
1971 
1972 	return (error);
1973 }
1974 
/*
 * Lookup tunable, on by default.  Its users are not in this part of
 * the file; presumably it controls caching of negative lookup
 * results (confirm against nfs3lookup_otw()).
 */
static int nfs3_lookup_neg_cache = 1;

#ifdef DEBUG
/*
 * Statistics maintained by nfs3lookup_dnlc(); all start at zero.
 */
static int nfs3_lookup_dnlc_hits;	/* entry found and returned */
static int nfs3_lookup_dnlc_misses;	/* nothing in the DNLC */
static int nfs3_lookup_dnlc_neg_hits;	/* negative entry, ENOENT */
static int nfs3_lookup_dnlc_disappears;	/* gone after revalidation */
static int nfs3_lookup_dnlc_lookups;	/* calls made */
#endif
1984 
1985 /* ARGSUSED */
1986 int
1987 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1988 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1989 {
1990 	int error;
1991 	rnode_t *drp;
1992 
1993 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1994 	/*
1995 	 * If lookup is for "", just return dvp.  Don't need
1996 	 * to send it over the wire, look it up in the dnlc,
1997 	 * or perform any access checks.
1998 	 */
1999 	if (*nm == '\0') {
2000 		VN_HOLD(dvp);
2001 		*vpp = dvp;
2002 		return (0);
2003 	}
2004 
2005 	/*
2006 	 * Can't do lookups in non-directories.
2007 	 */
2008 	if (dvp->v_type != VDIR)
2009 		return (ENOTDIR);
2010 
2011 	/*
2012 	 * If we're called with RFSCALL_SOFT, it's important that
2013 	 * the only rfscall is one we make directly; if we permit
2014 	 * an access call because we're looking up "." or validating
2015 	 * a dnlc hit, we'll deadlock because that rfscall will not
2016 	 * have the RFSCALL_SOFT set.
2017 	 */
2018 	if (rfscall_flags & RFSCALL_SOFT)
2019 		goto callit;
2020 
2021 	/*
2022 	 * If lookup is for ".", just return dvp.  Don't need
2023 	 * to send it over the wire or look it up in the dnlc,
2024 	 * just need to check access.
2025 	 */
2026 	if (strcmp(nm, ".") == 0) {
2027 		error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2028 		if (error)
2029 			return (error);
2030 		VN_HOLD(dvp);
2031 		*vpp = dvp;
2032 		return (0);
2033 	}
2034 
2035 	drp = VTOR(dvp);
2036 	if (!(drp->r_flags & RLOOKUP)) {
2037 		mutex_enter(&drp->r_statelock);
2038 		drp->r_flags |= RLOOKUP;
2039 		mutex_exit(&drp->r_statelock);
2040 	}
2041 
2042 	/*
2043 	 * Lookup this name in the DNLC.  If there was a valid entry,
2044 	 * then return the results of the lookup.
2045 	 */
2046 	error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2047 	if (error || *vpp != NULL)
2048 		return (error);
2049 
2050 callit:
2051 	error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2052 
2053 	return (error);
2054 }
2055 
2056 static int
2057 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2058 {
2059 	int error;
2060 	vnode_t *vp;
2061 
2062 	ASSERT(*nm != '\0');
2063 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2064 	/*
2065 	 * Lookup this name in the DNLC.  If successful, then validate
2066 	 * the caches and then recheck the DNLC.  The DNLC is rechecked
2067 	 * just in case this entry got invalidated during the call
2068 	 * to nfs3_validate_caches.
2069 	 *
2070 	 * An assumption is being made that it is safe to say that a
2071 	 * file exists which may not on the server.  Any operations to
2072 	 * the server will fail with ESTALE.
2073 	 */
2074 #ifdef DEBUG
2075 	nfs3_lookup_dnlc_lookups++;
2076 #endif
2077 	vp = dnlc_lookup(dvp, nm);
2078 	if (vp != NULL) {
2079 		VN_RELE(vp);
2080 		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2081 			PURGE_ATTRCACHE(dvp);
2082 		}
2083 		error = nfs3_validate_caches(dvp, cr);
2084 		if (error)
2085 			return (error);
2086 		vp = dnlc_lookup(dvp, nm);
2087 		if (vp != NULL) {
2088 			error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2089 			if (error) {
2090 				VN_RELE(vp);
2091 				return (error);
2092 			}
2093 			if (vp == DNLC_NO_VNODE) {
2094 				VN_RELE(vp);
2095 #ifdef DEBUG
2096 				nfs3_lookup_dnlc_neg_hits++;
2097 #endif
2098 				return (ENOENT);
2099 			}
2100 			*vpp = vp;
2101 #ifdef DEBUG
2102 			nfs3_lookup_dnlc_hits++;
2103 #endif
2104 			return (0);
2105 		}
2106 #ifdef DEBUG
2107 		nfs3_lookup_dnlc_disappears++;
2108 #endif
2109 	}
2110 #ifdef DEBUG
2111 	else
2112 		nfs3_lookup_dnlc_misses++;
2113 #endif
2114 
2115 	*vpp = NULL;
2116 
2117 	return (0);
2118 }
2119 
2120 static int
2121 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2122 	int rfscall_flags)
2123 {
2124 	int error;
2125 	LOOKUP3args args;
2126 	LOOKUP3vres res;
2127 	int douprintf;
2128 	struct vattr vattr;
2129 	struct vattr dvattr;
2130 	vnode_t *vp;
2131 	failinfo_t fi;
2132 	hrtime_t t;
2133 
2134 	ASSERT(*nm != '\0');
2135 	ASSERT(dvp->v_type == VDIR);
2136 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2137 
2138 	setdiropargs3(&args.what, nm, dvp);
2139 
2140 	fi.vp = dvp;
2141 	fi.fhp = (caddr_t)&args.what.dir;
2142 	fi.copyproc = nfs3copyfh;
2143 	fi.lookupproc = nfs3lookup;
2144 	fi.xattrdirproc = acl_getxattrdir3;
2145 	res.obj_attributes.fres.vp = dvp;
2146 	res.obj_attributes.fres.vap = &vattr;
2147 	res.dir_attributes.fres.vp = dvp;
2148 	res.dir_attributes.fres.vap = &dvattr;
2149 
2150 	douprintf = 1;
2151 
2152 	t = gethrtime();
2153 
2154 	error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2155 	    xdr_diropargs3, (caddr_t)&args,
2156 	    xdr_LOOKUP3vres, (caddr_t)&res, cr,
2157 	    &douprintf, &res.status, rfscall_flags, &fi);
2158 
2159 	if (error)
2160 		return (error);
2161 
2162 	nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2163 
2164 	error = geterrno3(res.status);
2165 	if (error) {
2166 		PURGE_STALE_FH(error, dvp, cr);
2167 		if (error == ENOENT && nfs3_lookup_neg_cache)
2168 			dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2169 		return (error);
2170 	}
2171 
2172 	if (res.obj_attributes.attributes) {
2173 		vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2174 		    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2175 	} else {
2176 		vp = makenfs3node_va(&res.object, NULL,
2177 		    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2178 		if (vp->v_type == VNON) {
2179 			vattr.va_mask = AT_TYPE;
2180 			error = nfs3getattr(vp, &vattr, cr);
2181 			if (error) {
2182 				VN_RELE(vp);
2183 				return (error);
2184 			}
2185 			vp->v_type = vattr.va_type;
2186 		}
2187 	}
2188 
2189 	if (!(rfscall_flags & RFSCALL_SOFT))
2190 		dnlc_update(dvp, nm, vp);
2191 
2192 	*vpp = vp;
2193 
2194 	return (error);
2195 }
2196 
2197 #ifdef DEBUG
2198 static int nfs3_create_misses = 0;
2199 #endif
2200 
2201 /* ARGSUSED */
2202 static int
2203 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2204 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2205 	vsecattr_t *vsecp)
2206 {
2207 	int error;
2208 	vnode_t *vp;
2209 	rnode_t *rp;
2210 	struct vattr vattr;
2211 	rnode_t *drp;
2212 	vnode_t *tempvp;
2213 
2214 	drp = VTOR(dvp);
2215 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2216 		return (EPERM);
2217 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2218 		return (EINTR);
2219 
2220 top:
2221 	/*
2222 	 * We make a copy of the attributes because the caller does not
2223 	 * expect us to change what va points to.
2224 	 */
2225 	vattr = *va;
2226 
2227 	/*
2228 	 * If the pathname is "", just use dvp.  Don't need
2229 	 * to send it over the wire, look it up in the dnlc,
2230 	 * or perform any access checks.
2231 	 */
2232 	if (*nm == '\0') {
2233 		error = 0;
2234 		VN_HOLD(dvp);
2235 		vp = dvp;
2236 	/*
2237 	 * If the pathname is ".", just use dvp.  Don't need
2238 	 * to send it over the wire or look it up in the dnlc,
2239 	 * just need to check access.
2240 	 */
2241 	} else if (strcmp(nm, ".") == 0) {
2242 		error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2243 		if (error) {
2244 			nfs_rw_exit(&drp->r_rwlock);
2245 			return (error);
2246 		}
2247 		VN_HOLD(dvp);
2248 		vp = dvp;
2249 	/*
2250 	 * We need to go over the wire, just to be sure whether the
2251 	 * file exists or not.  Using the DNLC can be dangerous in
2252 	 * this case when making a decision regarding existence.
2253 	 */
2254 	} else {
2255 		error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2256 	}
2257 	if (!error) {
2258 		if (exclusive == EXCL)
2259 			error = EEXIST;
2260 		else if (vp->v_type == VDIR && (mode & VWRITE))
2261 			error = EISDIR;
2262 		else {
2263 			/*
2264 			 * If vnode is a device, create special vnode.
2265 			 */
2266 			if (IS_DEVVP(vp)) {
2267 				tempvp = vp;
2268 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2269 				VN_RELE(tempvp);
2270 			}
2271 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2272 				if ((vattr.va_mask & AT_SIZE) &&
2273 				    vp->v_type == VREG) {
2274 					rp = VTOR(vp);
2275 					/*
2276 					 * Check here for large file handled
2277 					 * by LF-unaware process (as
2278 					 * ufs_create() does)
2279 					 */
2280 					if (!(lfaware & FOFFMAX)) {
2281 						mutex_enter(&rp->r_statelock);
2282 						if (rp->r_size > MAXOFF32_T)
2283 							error = EOVERFLOW;
2284 						mutex_exit(&rp->r_statelock);
2285 					}
2286 					if (!error) {
2287 						vattr.va_mask = AT_SIZE;
2288 						error = nfs3setattr(vp,
2289 						    &vattr, 0, cr);
2290 					}
2291 				}
2292 			}
2293 		}
2294 		nfs_rw_exit(&drp->r_rwlock);
2295 		if (error) {
2296 			VN_RELE(vp);
2297 		} else {
2298 			/*
2299 			 * existing file got truncated, notify.
2300 			 */
2301 			vnevent_create(vp, ct);
2302 			*vpp = vp;
2303 		}
2304 		return (error);
2305 	}
2306 
2307 	dnlc_remove(dvp, nm);
2308 
2309 	/*
2310 	 * Decide what the group-id of the created file should be.
2311 	 * Set it in attribute list as advisory...
2312 	 */
2313 	error = setdirgid(dvp, &vattr.va_gid, cr);
2314 	if (error) {
2315 		nfs_rw_exit(&drp->r_rwlock);
2316 		return (error);
2317 	}
2318 	vattr.va_mask |= AT_GID;
2319 
2320 	ASSERT(vattr.va_mask & AT_TYPE);
2321 	if (vattr.va_type == VREG) {
2322 		ASSERT(vattr.va_mask & AT_MODE);
2323 		if (MANDMODE(vattr.va_mode)) {
2324 			nfs_rw_exit(&drp->r_rwlock);
2325 			return (EACCES);
2326 		}
2327 		error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2328 		    lfaware);
2329 		/*
2330 		 * If this is not an exclusive create, then the CREATE
2331 		 * request will be made with the GUARDED mode set.  This
2332 		 * means that the server will return EEXIST if the file
2333 		 * exists.  The file could exist because of a retransmitted
2334 		 * request.  In this case, we recover by starting over and
2335 		 * checking to see whether the file exists.  This second
2336 		 * time through it should and a CREATE request will not be
2337 		 * sent.
2338 		 *
2339 		 * This handles the problem of a dangling CREATE request
2340 		 * which contains attributes which indicate that the file
2341 		 * should be truncated.  This retransmitted request could
2342 		 * possibly truncate valid data in the file if not caught
2343 		 * by the duplicate request mechanism on the server or if
2344 		 * not caught by other means.  The scenario is:
2345 		 *
2346 		 * Client transmits CREATE request with size = 0
2347 		 * Client times out, retransmits request.
2348 		 * Response to the first request arrives from the server
2349 		 *  and the client proceeds on.
2350 		 * Client writes data to the file.
2351 		 * The server now processes retransmitted CREATE request
2352 		 *  and truncates file.
2353 		 *
2354 		 * The use of the GUARDED CREATE request prevents this from
2355 		 * happening because the retransmitted CREATE would fail
2356 		 * with EEXIST and would not truncate the file.
2357 		 */
2358 		if (error == EEXIST && exclusive == NONEXCL) {
2359 #ifdef DEBUG
2360 			nfs3_create_misses++;
2361 #endif
2362 			goto top;
2363 		}
2364 		nfs_rw_exit(&drp->r_rwlock);
2365 		return (error);
2366 	}
2367 	error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2368 	nfs_rw_exit(&drp->r_rwlock);
2369 	return (error);
2370 }
2371 
2372 /* ARGSUSED */
2373 static int
2374 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2375 	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
2376 {
2377 	int error;
2378 	CREATE3args args;
2379 	CREATE3res res;
2380 	int douprintf;
2381 	vnode_t *vp;
2382 	struct vattr vattr;
2383 	nfstime3 *verfp;
2384 	rnode_t *rp;
2385 	timestruc_t now;
2386 	hrtime_t t;
2387 
2388 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2389 	setdiropargs3(&args.where, nm, dvp);
2390 	if (exclusive == EXCL) {
2391 		args.how.mode = EXCLUSIVE;
2392 		/*
2393 		 * Construct the create verifier.  This verifier needs
2394 		 * to be unique between different clients.  It also needs
2395 		 * to vary for each exclusive create request generated
2396 		 * from the client to the server.
2397 		 *
2398 		 * The first attempt is made to use the hostid and a
2399 		 * unique number on the client.  If the hostid has not
2400 		 * been set, the high resolution time that the exclusive
2401 		 * create request is being made is used.  This will work
2402 		 * unless two different clients, both with the hostid
2403 		 * not set, attempt an exclusive create request on the
2404 		 * same file, at exactly the same clock time.  The
2405 		 * chances of this happening seem small enough to be
2406 		 * reasonable.
2407 		 */
2408 		verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2409 		verfp->seconds = zone_get_hostid(NULL);
2410 		if (verfp->seconds != 0)
2411 			verfp->nseconds = newnum();
2412 		else {
2413 			gethrestime(&now);
2414 			verfp->seconds = now.tv_sec;
2415 			verfp->nseconds = now.tv_nsec;
2416 		}
2417 		/*
2418 		 * Since the server will use this value for the mtime,
2419 		 * make sure that it can't overflow. Zero out the MSB.
2420 		 * The actual value does not matter here, only its uniqeness.
2421 		 */
2422 		verfp->seconds %= INT32_MAX;
2423 	} else {
2424 		/*
2425 		 * Issue the non-exclusive create in guarded mode.  This
2426 		 * may result in some false EEXIST responses for
2427 		 * retransmitted requests, but these will be handled at
2428 		 * a higher level.  By using GUARDED, duplicate requests
2429 		 * to do file truncation and possible access problems
2430 		 * can be avoided.
2431 		 */
2432 		args.how.mode = GUARDED;
2433 		error = vattr_to_sattr3(va,
2434 		    &args.how.createhow3_u.obj_attributes);
2435 		if (error) {
2436 			/* req time field(s) overflow - return immediately */
2437 			return (error);
2438 		}
2439 	}
2440 
2441 	douprintf = 1;
2442 
2443 	t = gethrtime();
2444 
2445 	error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2446 	    xdr_CREATE3args, (caddr_t)&args,
2447 	    xdr_CREATE3res, (caddr_t)&res, cr,
2448 	    &douprintf, &res.status, 0, NULL);
2449 
2450 	if (error) {
2451 		PURGE_ATTRCACHE(dvp);
2452 		return (error);
2453 	}
2454 
2455 	error = geterrno3(res.status);
2456 	if (!error) {
2457 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2458 		if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2459 			nfs_purge_rddir_cache(dvp);
2460 
2461 		/*
2462 		 * On exclusive create the times need to be explicitly
2463 		 * set to clear any potential verifier that may be stored
2464 		 * in one of these fields (see comment below).  This
2465 		 * is done here to cover the case where no post op attrs
2466 		 * were returned or a 'invalid' time was returned in
2467 		 * the attributes.
2468 		 */
2469 		if (exclusive == EXCL)
2470 			va->va_mask |= (AT_MTIME | AT_ATIME);
2471 
2472 		if (!res.resok.obj.handle_follows) {
2473 			error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2474 			if (error)
2475 				return (error);
2476 		} else {
2477 			if (res.resok.obj_attributes.attributes) {
2478 				vp = makenfs3node(&res.resok.obj.handle,
2479 				    &res.resok.obj_attributes.attr,
2480 				    dvp->v_vfsp, t, cr, NULL, NULL);
2481 			} else {
2482 				vp = makenfs3node(&res.resok.obj.handle, NULL,
2483 				    dvp->v_vfsp, t, cr, NULL, NULL);
2484 
2485 				/*
2486 				 * On an exclusive create, it is possible
2487 				 * that attributes were returned but those
2488 				 * postop attributes failed to decode
2489 				 * properly.  If this is the case,
2490 				 * then most likely the atime or mtime
2491 				 * were invalid for our client; this
2492 				 * is caused by the server storing the
2493 				 * create verifier in one of the time
2494 				 * fields(most likely mtime).
2495 				 * So... we are going to setattr just the
2496 				 * atime/mtime to clear things up.
2497 				 */
2498 				if (exclusive == EXCL) {
2499 					if (error =
2500 					    nfs3excl_create_settimes(vp,
2501 					    va, cr)) {
2502 						/*
2503 						 * Setting the times failed.
2504 						 * Remove the file and return
2505 						 * the error.
2506 						 */
2507 						VN_RELE(vp);
2508 						(void) nfs3_remove(dvp,
2509 						    nm, cr, NULL, 0);
2510 						return (error);
2511 					}
2512 				}
2513 
2514 				/*
2515 				 * This handles the non-exclusive case
2516 				 * and the exclusive case where no post op
2517 				 * attrs were returned.
2518 				 */
2519 				if (vp->v_type == VNON) {
2520 					vattr.va_mask = AT_TYPE;
2521 					error = nfs3getattr(vp, &vattr, cr);
2522 					if (error) {
2523 						VN_RELE(vp);
2524 						return (error);
2525 					}
2526 					vp->v_type = vattr.va_type;
2527 				}
2528 			}
2529 			dnlc_update(dvp, nm, vp);
2530 		}
2531 
2532 		rp = VTOR(vp);
2533 
2534 		/*
2535 		 * Check here for large file handled by
2536 		 * LF-unaware process (as ufs_create() does)
2537 		 */
2538 		if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
2539 		    !(lfaware & FOFFMAX)) {
2540 			mutex_enter(&rp->r_statelock);
2541 			if (rp->r_size > MAXOFF32_T) {
2542 				mutex_exit(&rp->r_statelock);
2543 				VN_RELE(vp);
2544 				return (EOVERFLOW);
2545 			}
2546 			mutex_exit(&rp->r_statelock);
2547 		}
2548 
2549 		if (exclusive == EXCL &&
2550 		    (va->va_mask & ~(AT_GID | AT_SIZE))) {
2551 			/*
2552 			 * If doing an exclusive create, then generate
2553 			 * a SETATTR to set the initial attributes.
2554 			 * Try to set the mtime and the atime to the
2555 			 * server's current time.  It is somewhat
2556 			 * expected that these fields will be used to
2557 			 * store the exclusive create cookie.  If not,
2558 			 * server implementors will need to know that
2559 			 * a SETATTR will follow an exclusive create
2560 			 * and the cookie should be destroyed if
2561 			 * appropriate. This work may have been done
2562 			 * earlier in this function if post op attrs
2563 			 * were not available.
2564 			 *
2565 			 * The AT_GID and AT_SIZE bits are turned off
2566 			 * so that the SETATTR request will not attempt
2567 			 * to process these.  The gid will be set
2568 			 * separately if appropriate.  The size is turned
2569 			 * off because it is assumed that a new file will
2570 			 * be created empty and if the file wasn't empty,
2571 			 * then the exclusive create will have failed
2572 			 * because the file must have existed already.
2573 			 * Therefore, no truncate operation is needed.
2574 			 */
2575 			va->va_mask &= ~(AT_GID | AT_SIZE);
2576 			error = nfs3setattr(vp, va, 0, cr);
2577 			if (error) {
2578 				/*
2579 				 * Couldn't correct the attributes of
2580 				 * the newly created file and the
2581 				 * attributes are wrong.  Remove the
2582 				 * file and return an error to the
2583 				 * application.
2584 				 */
2585 				VN_RELE(vp);
2586 				(void) nfs3_remove(dvp, nm, cr, NULL, 0);
2587 				return (error);
2588 			}
2589 		}
2590 
2591 		if (va->va_gid != rp->r_attr.va_gid) {
2592 			/*
2593 			 * If the gid on the file isn't right, then
2594 			 * generate a SETATTR to attempt to change
2595 			 * it.  This may or may not work, depending
2596 			 * upon the server's semantics for allowing
2597 			 * file ownership changes.
2598 			 */
2599 			va->va_mask = AT_GID;
2600 			(void) nfs3setattr(vp, va, 0, cr);
2601 		}
2602 
2603 		/*
2604 		 * If vnode is a device create special vnode
2605 		 */
2606 		if (IS_DEVVP(vp)) {
2607 			*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2608 			VN_RELE(vp);
2609 		} else
2610 			*vpp = vp;
2611 	} else {
2612 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2613 		PURGE_STALE_FH(error, dvp, cr);
2614 	}
2615 
2616 	return (error);
2617 }
2618 
2619 /*
2620  * Special setattr function to take care of rest of atime/mtime
2621  * after successful exclusive create.  This function exists to avoid
2622  * handling attributes from the server; exclusive the atime/mtime fields
2623  * may be 'invalid' in client's view and therefore can not be trusted.
2624  */
2625 static int
2626 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2627 {
2628 	int error;
2629 	uint_t mask;
2630 	SETATTR3args args;
2631 	SETATTR3res res;
2632 	int douprintf;
2633 	rnode_t *rp;
2634 	hrtime_t t;
2635 
2636 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2637 	/* save the caller's mask so that it can be reset later */
2638 	mask = vap->va_mask;
2639 
2640 	rp = VTOR(vp);
2641 
2642 	args.object = *RTOFH3(rp);
2643 	args.guard.check = FALSE;
2644 
2645 	/* Use the mask to initialize the arguments */
2646 	vap->va_mask = 0;
2647 	error = vattr_to_sattr3(vap, &args.new_attributes);
2648 
2649 	/* We want to set just atime/mtime on this request */
2650 	args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2651 	args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2652 
2653 	douprintf = 1;
2654 
2655 	t = gethrtime();
2656 
2657 	error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2658 	    xdr_SETATTR3args, (caddr_t)&args,
2659 	    xdr_SETATTR3res, (caddr_t)&res, cr,
2660 	    &douprintf, &res.status, 0, NULL);
2661 
2662 	if (error) {
2663 		vap->va_mask = mask;
2664 		return (error);
2665 	}
2666 
2667 	error = geterrno3(res.status);
2668 	if (!error) {
2669 		/*
2670 		 * It is important to pick up the attributes.
2671 		 * Since this is the exclusive create path, the
2672 		 * attributes on the initial create were ignored
2673 		 * and we need these to have the correct info.
2674 		 */
2675 		nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2676 		/*
2677 		 * No need to do the atime/mtime work again so clear
2678 		 * the bits.
2679 		 */
2680 		mask &= ~(AT_ATIME | AT_MTIME);
2681 	} else {
2682 		nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2683 	}
2684 
2685 	vap->va_mask = mask;
2686 
2687 	return (error);
2688 }
2689 
2690 /* ARGSUSED */
2691 static int
2692 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2693 	int mode, vnode_t **vpp, cred_t *cr)
2694 {
2695 	int error;
2696 	MKNOD3args args;
2697 	MKNOD3res res;
2698 	int douprintf;
2699 	vnode_t *vp;
2700 	struct vattr vattr;
2701 	hrtime_t t;
2702 
2703 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2704 	switch (va->va_type) {
2705 	case VCHR:
2706 	case VBLK:
2707 		setdiropargs3(&args.where, nm, dvp);
2708 		args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2709 		error = vattr_to_sattr3(va,
2710 		    &args.what.mknoddata3_u.device.dev_attributes);
2711 		if (error) {
2712 			/* req time field(s) overflow - return immediately */
2713 			return (error);
2714 		}
2715 		args.what.mknoddata3_u.device.spec.specdata1 =
2716 		    getmajor(va->va_rdev);
2717 		args.what.mknoddata3_u.device.spec.specdata2 =
2718 		    getminor(va->va_rdev);
2719 		break;
2720 
2721 	case VFIFO:
2722 	case VSOCK:
2723 		setdiropargs3(&args.where, nm, dvp);
2724 		args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2725 		error = vattr_to_sattr3(va,
2726 		    &args.what.mknoddata3_u.pipe_attributes);
2727 		if (error) {
2728 			/* req time field(s) overflow - return immediately */
2729 			return (error);
2730 		}
2731 		break;
2732 
2733 	default:
2734 		return (EINVAL);
2735 	}
2736 
2737 	douprintf = 1;
2738 
2739 	t = gethrtime();
2740 
2741 	error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2742 	    xdr_MKNOD3args, (caddr_t)&args,
2743 	    xdr_MKNOD3res, (caddr_t)&res, cr,
2744 	    &douprintf, &res.status, 0, NULL);
2745 
2746 	if (error) {
2747 		PURGE_ATTRCACHE(dvp);
2748 		return (error);
2749 	}
2750 
2751 	error = geterrno3(res.status);
2752 	if (!error) {
2753 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2754 		if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2755 			nfs_purge_rddir_cache(dvp);
2756 
2757 		if (!res.resok.obj.handle_follows) {
2758 			error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2759 			if (error)
2760 				return (error);
2761 		} else {
2762 			if (res.resok.obj_attributes.attributes) {
2763 				vp = makenfs3node(&res.resok.obj.handle,
2764 				    &res.resok.obj_attributes.attr,
2765 				    dvp->v_vfsp, t, cr, NULL, NULL);
2766 			} else {
2767 				vp = makenfs3node(&res.resok.obj.handle, NULL,
2768 				    dvp->v_vfsp, t, cr, NULL, NULL);
2769 				if (vp->v_type == VNON) {
2770 					vattr.va_mask = AT_TYPE;
2771 					error = nfs3getattr(vp, &vattr, cr);
2772 					if (error) {
2773 						VN_RELE(vp);
2774 						return (error);
2775 					}
2776 					vp->v_type = vattr.va_type;
2777 				}
2778 
2779 			}
2780 			dnlc_update(dvp, nm, vp);
2781 		}
2782 
2783 		if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2784 			va->va_mask = AT_GID;
2785 			(void) nfs3setattr(vp, va, 0, cr);
2786 		}
2787 
2788 		/*
2789 		 * If vnode is a device create special vnode
2790 		 */
2791 		if (IS_DEVVP(vp)) {
2792 			*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2793 			VN_RELE(vp);
2794 		} else
2795 			*vpp = vp;
2796 	} else {
2797 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2798 		PURGE_STALE_FH(error, dvp, cr);
2799 	}
2800 	return (error);
2801 }
2802 
2803 /*
2804  * Weirdness: if the vnode to be removed is open
2805  * we rename it instead of removing it and nfs_inactive
2806  * will remove the new name.
2807  */
2808 /* ARGSUSED */
2809 static int
2810 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2811 {
2812 	int error;
2813 	REMOVE3args args;
2814 	REMOVE3res res;
2815 	vnode_t *vp;
2816 	char *tmpname;
2817 	int douprintf;
2818 	rnode_t *rp;
2819 	rnode_t *drp;
2820 	hrtime_t t;
2821 
2822 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2823 		return (EPERM);
2824 	drp = VTOR(dvp);
2825 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2826 		return (EINTR);
2827 
2828 	error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2829 	if (error) {
2830 		nfs_rw_exit(&drp->r_rwlock);
2831 		return (error);
2832 	}
2833 
2834 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2835 		VN_RELE(vp);
2836 		nfs_rw_exit(&drp->r_rwlock);
2837 		return (EPERM);
2838 	}
2839 
2840 	/*
2841 	 * First just remove the entry from the name cache, as it
2842 	 * is most likely the only entry for this vp.
2843 	 */
2844 	dnlc_remove(dvp, nm);
2845 
2846 	/*
2847 	 * If the file has a v_count > 1 then there may be more than one
2848 	 * entry in the name cache due multiple links or an open file,
2849 	 * but we don't have the real reference count so flush all
2850 	 * possible entries.
2851 	 */
2852 	if (vp->v_count > 1)
2853 		dnlc_purge_vp(vp);
2854 
2855 	/*
2856 	 * Now we have the real reference count on the vnode
2857 	 */
2858 	rp = VTOR(vp);
2859 	mutex_enter(&rp->r_statelock);
2860 	if (vp->v_count > 1 &&
2861 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2862 		mutex_exit(&rp->r_statelock);
2863 		tmpname = newname();
2864 		error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2865 		if (error)
2866 			kmem_free(tmpname, MAXNAMELEN);
2867 		else {
2868 			mutex_enter(&rp->r_statelock);
2869 			if (rp->r_unldvp == NULL) {
2870 				VN_HOLD(dvp);
2871 				rp->r_unldvp = dvp;
2872 				if (rp->r_unlcred != NULL)
2873 					crfree(rp->r_unlcred);
2874 				crhold(cr);
2875 				rp->r_unlcred = cr;
2876 				rp->r_unlname = tmpname;
2877 			} else {
2878 				kmem_free(rp->r_unlname, MAXNAMELEN);
2879 				rp->r_unlname = tmpname;
2880 			}
2881 			mutex_exit(&rp->r_statelock);
2882 		}
2883 	} else {
2884 		mutex_exit(&rp->r_statelock);
2885 		/*
2886 		 * We need to flush any dirty pages which happen to
2887 		 * be hanging around before removing the file.  This
2888 		 * shouldn't happen very often and mostly on file
2889 		 * systems mounted "nocto".
2890 		 */
2891 		if (vn_has_cached_data(vp) &&
2892 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2893 			error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2894 			if (error && (error == ENOSPC || error == EDQUOT)) {
2895 				mutex_enter(&rp->r_statelock);
2896 				if (!rp->r_error)
2897 					rp->r_error = error;
2898 				mutex_exit(&rp->r_statelock);
2899 			}
2900 		}
2901 
2902 		setdiropargs3(&args.object, nm, dvp);
2903 
2904 		douprintf = 1;
2905 
2906 		t = gethrtime();
2907 
2908 		error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2909 		    xdr_diropargs3, (caddr_t)&args,
2910 		    xdr_REMOVE3res, (caddr_t)&res, cr,
2911 		    &douprintf, &res.status, 0, NULL);
2912 
2913 		/*
2914 		 * The xattr dir may be gone after last attr is removed,
2915 		 * so flush it from dnlc.
2916 		 */
2917 		if (dvp->v_flag & V_XATTRDIR)
2918 			dnlc_purge_vp(dvp);
2919 
2920 		PURGE_ATTRCACHE(vp);
2921 
2922 		if (error) {
2923 			PURGE_ATTRCACHE(dvp);
2924 		} else {
2925 			error = geterrno3(res.status);
2926 			if (!error) {
2927 				nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2928 				    cr);
2929 				if (HAVE_RDDIR_CACHE(drp))
2930 					nfs_purge_rddir_cache(dvp);
2931 			} else {
2932 				nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2933 				    t, cr);
2934 				PURGE_STALE_FH(error, dvp, cr);
2935 			}
2936 		}
2937 	}
2938 
2939 	if (error == 0) {
2940 		vnevent_remove(vp, dvp, nm, ct);
2941 	}
2942 	VN_RELE(vp);
2943 
2944 	nfs_rw_exit(&drp->r_rwlock);
2945 
2946 	return (error);
2947 }
2948 
2949 /* ARGSUSED */
2950 static int
2951 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2952 	caller_context_t *ct, int flags)
2953 {
2954 	int error;
2955 	LINK3args args;
2956 	LINK3res res;
2957 	vnode_t *realvp;
2958 	int douprintf;
2959 	mntinfo_t *mi;
2960 	rnode_t *tdrp;
2961 	hrtime_t t;
2962 
2963 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2964 		return (EPERM);
2965 	if (VOP_REALVP(svp, &realvp, ct) == 0)
2966 		svp = realvp;
2967 
2968 	mi = VTOMI(svp);
2969 
2970 	if (!(mi->mi_flags & MI_LINK))
2971 		return (EOPNOTSUPP);
2972 
2973 	args.file = *VTOFH3(svp);
2974 	setdiropargs3(&args.link, tnm, tdvp);
2975 
2976 	tdrp = VTOR(tdvp);
2977 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2978 		return (EINTR);
2979 
2980 	dnlc_remove(tdvp, tnm);
2981 
2982 	douprintf = 1;
2983 
2984 	t = gethrtime();
2985 
2986 	error = rfs3call(mi, NFSPROC3_LINK,
2987 	    xdr_LINK3args, (caddr_t)&args,
2988 	    xdr_LINK3res, (caddr_t)&res, cr,
2989 	    &douprintf, &res.status, 0, NULL);
2990 
2991 	if (error) {
2992 		PURGE_ATTRCACHE(tdvp);
2993 		PURGE_ATTRCACHE(svp);
2994 		nfs_rw_exit(&tdrp->r_rwlock);
2995 		return (error);
2996 	}
2997 
2998 	error = geterrno3(res.status);
2999 
3000 	if (!error) {
3001 		nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
3002 		nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
3003 		if (HAVE_RDDIR_CACHE(tdrp))
3004 			nfs_purge_rddir_cache(tdvp);
3005 		dnlc_update(tdvp, tnm, svp);
3006 	} else {
3007 		nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
3008 		    cr);
3009 		nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
3010 		if (error == EOPNOTSUPP) {
3011 			mutex_enter(&mi->mi_lock);
3012 			mi->mi_flags &= ~MI_LINK;
3013 			mutex_exit(&mi->mi_lock);
3014 		}
3015 	}
3016 
3017 	nfs_rw_exit(&tdrp->r_rwlock);
3018 
3019 	if (!error) {
3020 		/*
3021 		 * Notify the source file of this link operation.
3022 		 */
3023 		vnevent_link(svp, ct);
3024 	}
3025 	return (error);
3026 }
3027 
3028 /* ARGSUSED */
3029 static int
3030 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3031 	caller_context_t *ct, int flags)
3032 {
3033 	vnode_t *realvp;
3034 
3035 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
3036 		return (EPERM);
3037 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
3038 		ndvp = realvp;
3039 
3040 	return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3041 }
3042 
3043 /*
3044  * nfs3rename does the real work of renaming in NFS Version 3.
3045  */
3046 static int
3047 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3048     caller_context_t *ct)
3049 {
3050 	int error;
3051 	RENAME3args args;
3052 	RENAME3res res;
3053 	int douprintf;
3054 	vnode_t *nvp = NULL;
3055 	vnode_t *ovp = NULL;
3056 	char *tmpname;
3057 	rnode_t *rp;
3058 	rnode_t *odrp;
3059 	rnode_t *ndrp;
3060 	hrtime_t t;
3061 
3062 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3063 
3064 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3065 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3066 		return (EINVAL);
3067 
3068 	odrp = VTOR(odvp);
3069 	ndrp = VTOR(ndvp);
3070 	if ((intptr_t)odrp < (intptr_t)ndrp) {
3071 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3072 			return (EINTR);
3073 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3074 			nfs_rw_exit(&odrp->r_rwlock);
3075 			return (EINTR);
3076 		}
3077 	} else {
3078 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3079 			return (EINTR);
3080 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3081 			nfs_rw_exit(&ndrp->r_rwlock);
3082 			return (EINTR);
3083 		}
3084 	}
3085 
3086 	/*
3087 	 * Lookup the target file.  If it exists, it needs to be
3088 	 * checked to see whether it is a mount point and whether
3089 	 * it is active (open).
3090 	 */
3091 	error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3092 	if (!error) {
3093 		/*
3094 		 * If this file has been mounted on, then just
3095 		 * return busy because renaming to it would remove
3096 		 * the mounted file system from the name space.
3097 		 */
3098 		if (vn_mountedvfs(nvp) != NULL) {
3099 			VN_RELE(nvp);
3100 			nfs_rw_exit(&odrp->r_rwlock);
3101 			nfs_rw_exit(&ndrp->r_rwlock);
3102 			return (EBUSY);
3103 		}
3104 
3105 		/*
3106 		 * Purge the name cache of all references to this vnode
3107 		 * so that we can check the reference count to infer
3108 		 * whether it is active or not.
3109 		 */
3110 		/*
3111 		 * First just remove the entry from the name cache, as it
3112 		 * is most likely the only entry for this vp.
3113 		 */
3114 		dnlc_remove(ndvp, nnm);
3115 		/*
3116 		 * If the file has a v_count > 1 then there may be more
3117 		 * than one entry in the name cache due multiple links
3118 		 * or an open file, but we don't have the real reference
3119 		 * count so flush all possible entries.
3120 		 */
3121 		if (nvp->v_count > 1)
3122 			dnlc_purge_vp(nvp);
3123 
3124 		/*
3125 		 * If the vnode is active and is not a directory,
3126 		 * arrange to rename it to a
3127 		 * temporary file so that it will continue to be
3128 		 * accessible.  This implements the "unlink-open-file"
3129 		 * semantics for the target of a rename operation.
3130 		 * Before doing this though, make sure that the
3131 		 * source and target files are not already the same.
3132 		 */
3133 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3134 			/*
3135 			 * Lookup the source name.
3136 			 */
3137 			error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3138 			    cr, 0);
3139 
3140 			/*
3141 			 * The source name *should* already exist.
3142 			 */
3143 			if (error) {
3144 				VN_RELE(nvp);
3145 				nfs_rw_exit(&odrp->r_rwlock);
3146 				nfs_rw_exit(&ndrp->r_rwlock);
3147 				return (error);
3148 			}
3149 
3150 			/*
3151 			 * Compare the two vnodes.  If they are the same,
3152 			 * just release all held vnodes and return success.
3153 			 */
3154 			if (ovp == nvp) {
3155 				VN_RELE(ovp);
3156 				VN_RELE(nvp);
3157 				nfs_rw_exit(&odrp->r_rwlock);
3158 				nfs_rw_exit(&ndrp->r_rwlock);
3159 				return (0);
3160 			}
3161 
3162 			/*
3163 			 * Can't mix and match directories and non-
3164 			 * directories in rename operations.  We already
3165 			 * know that the target is not a directory.  If
3166 			 * the source is a directory, return an error.
3167 			 */
3168 			if (ovp->v_type == VDIR) {
3169 				VN_RELE(ovp);
3170 				VN_RELE(nvp);
3171 				nfs_rw_exit(&odrp->r_rwlock);
3172 				nfs_rw_exit(&ndrp->r_rwlock);
3173 				return (ENOTDIR);
3174 			}
3175 
3176 			/*
3177 			 * The target file exists, is not the same as
3178 			 * the source file, and is active.  Link it
3179 			 * to a temporary filename to avoid having
3180 			 * the server removing the file completely.
3181 			 */
3182 			tmpname = newname();
3183 			error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3184 			if (error == EOPNOTSUPP) {
3185 				error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3186 				    cr, NULL, 0);
3187 			}
3188 			if (error) {
3189 				kmem_free(tmpname, MAXNAMELEN);
3190 				VN_RELE(ovp);
3191 				VN_RELE(nvp);
3192 				nfs_rw_exit(&odrp->r_rwlock);
3193 				nfs_rw_exit(&ndrp->r_rwlock);
3194 				return (error);
3195 			}
3196 			rp = VTOR(nvp);
3197 			mutex_enter(&rp->r_statelock);
3198 			if (rp->r_unldvp == NULL) {
3199 				VN_HOLD(ndvp);
3200 				rp->r_unldvp = ndvp;
3201 				if (rp->r_unlcred != NULL)
3202 					crfree(rp->r_unlcred);
3203 				crhold(cr);
3204 				rp->r_unlcred = cr;
3205 				rp->r_unlname = tmpname;
3206 			} else {
3207 				kmem_free(rp->r_unlname, MAXNAMELEN);
3208 				rp->r_unlname = tmpname;
3209 			}
3210 			mutex_exit(&rp->r_statelock);
3211 		}
3212 	}
3213 
3214 	if (ovp == NULL) {
3215 		/*
3216 		 * When renaming directories to be a subdirectory of a
3217 		 * different parent, the dnlc entry for ".." will no
3218 		 * longer be valid, so it must be removed.
3219 		 *
3220 		 * We do a lookup here to determine whether we are renaming
3221 		 * a directory and we need to check if we are renaming
3222 		 * an unlinked file.  This might have already been done
3223 		 * in previous code, so we check ovp == NULL to avoid
3224 		 * doing it twice.
3225 		 */
3226 
3227 		error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3228 		/*
3229 		 * The source name *should* already exist.
3230 		 */
3231 		if (error) {
3232 			nfs_rw_exit(&odrp->r_rwlock);
3233 			nfs_rw_exit(&ndrp->r_rwlock);
3234 			if (nvp) {
3235 				VN_RELE(nvp);
3236 			}
3237 			return (error);
3238 		}
3239 		ASSERT(ovp != NULL);
3240 	}
3241 
3242 	dnlc_remove(odvp, onm);
3243 	dnlc_remove(ndvp, nnm);
3244 
3245 	setdiropargs3(&args.from, onm, odvp);
3246 	setdiropargs3(&args.to, nnm, ndvp);
3247 
3248 	douprintf = 1;
3249 
3250 	t = gethrtime();
3251 
3252 	error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3253 	    xdr_RENAME3args, (caddr_t)&args,
3254 	    xdr_RENAME3res, (caddr_t)&res, cr,
3255 	    &douprintf, &res.status, 0, NULL);
3256 
3257 	if (error) {
3258 		PURGE_ATTRCACHE(odvp);
3259 		PURGE_ATTRCACHE(ndvp);
3260 		VN_RELE(ovp);
3261 		nfs_rw_exit(&odrp->r_rwlock);
3262 		nfs_rw_exit(&ndrp->r_rwlock);
3263 		if (nvp) {
3264 			VN_RELE(nvp);
3265 		}
3266 		return (error);
3267 	}
3268 
3269 	error = geterrno3(res.status);
3270 
3271 	if (!error) {
3272 		nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3273 		if (HAVE_RDDIR_CACHE(odrp))
3274 			nfs_purge_rddir_cache(odvp);
3275 		if (ndvp != odvp) {
3276 			nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3277 			if (HAVE_RDDIR_CACHE(ndrp))
3278 				nfs_purge_rddir_cache(ndvp);
3279 		}
3280 		/*
3281 		 * when renaming directories to be a subdirectory of a
3282 		 * different parent, the dnlc entry for ".." will no
3283 		 * longer be valid, so it must be removed
3284 		 */
3285 		rp = VTOR(ovp);
3286 		if (ndvp != odvp) {
3287 			if (ovp->v_type == VDIR) {
3288 				dnlc_remove(ovp, "..");
3289 				if (HAVE_RDDIR_CACHE(rp))
3290 					nfs_purge_rddir_cache(ovp);
3291 			}
3292 		}
3293 
3294 		/*
3295 		 * If we are renaming the unlinked file, update the
3296 		 * r_unldvp and r_unlname as needed.
3297 		 */
3298 		mutex_enter(&rp->r_statelock);
3299 		if (rp->r_unldvp != NULL) {
3300 			if (strcmp(rp->r_unlname, onm) == 0) {
3301 				(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3302 				rp->r_unlname[MAXNAMELEN - 1] = '\0';
3303 
3304 				if (ndvp != rp->r_unldvp) {
3305 					VN_RELE(rp->r_unldvp);
3306 					rp->r_unldvp = ndvp;
3307 					VN_HOLD(ndvp);
3308 				}
3309 			}
3310 		}
3311 		mutex_exit(&rp->r_statelock);
3312 	} else {
3313 		nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3314 		if (ndvp != odvp) {
3315 			nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3316 			    cr);
3317 		}
3318 		/*
3319 		 * System V defines rename to return EEXIST, not
3320 		 * ENOTEMPTY if the target directory is not empty.
3321 		 * Over the wire, the error is NFSERR_ENOTEMPTY
3322 		 * which geterrno maps to ENOTEMPTY.
3323 		 */
3324 		if (error == ENOTEMPTY)
3325 			error = EEXIST;
3326 	}
3327 
3328 	if (error == 0) {
3329 		if (nvp)
3330 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
3331 
3332 		if (odvp != ndvp)
3333 			vnevent_rename_dest_dir(ndvp, ct);
3334 		ASSERT(ovp != NULL);
3335 		vnevent_rename_src(ovp, odvp, onm, ct);
3336 	}
3337 
3338 	if (nvp) {
3339 		VN_RELE(nvp);
3340 	}
3341 	VN_RELE(ovp);
3342 
3343 	nfs_rw_exit(&odrp->r_rwlock);
3344 	nfs_rw_exit(&ndrp->r_rwlock);
3345 
3346 	return (error);
3347 }
3348 
3349 /* ARGSUSED */
3350 static int
3351 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3352 	caller_context_t *ct, int flags, vsecattr_t *vsecp)
3353 {
3354 	int error;
3355 	MKDIR3args args;
3356 	MKDIR3res res;
3357 	int douprintf;
3358 	struct vattr vattr;
3359 	vnode_t *vp;
3360 	rnode_t *drp;
3361 	hrtime_t t;
3362 
3363 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
3364 		return (EPERM);
3365 	setdiropargs3(&args.where, nm, dvp);
3366 
3367 	/*
3368 	 * Decide what the group-id and set-gid bit of the created directory
3369 	 * should be.  May have to do a setattr to get the gid right.
3370 	 */
3371 	error = setdirgid(dvp, &va->va_gid, cr);
3372 	if (error)
3373 		return (error);
3374 	error = setdirmode(dvp, &va->va_mode, cr);
3375 	if (error)
3376 		return (error);
3377 	va->va_mask |= AT_MODE|AT_GID;
3378 
3379 	error = vattr_to_sattr3(va, &args.attributes);
3380 	if (error) {
3381 		/* req time field(s) overflow - return immediately */
3382 		return (error);
3383 	}
3384 
3385 	drp = VTOR(dvp);
3386 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3387 		return (EINTR);
3388 
3389 	dnlc_remove(dvp, nm);
3390 
3391 	douprintf = 1;
3392 
3393 	t = gethrtime();
3394 
3395 	error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3396 	    xdr_MKDIR3args, (caddr_t)&args,
3397 	    xdr_MKDIR3res, (caddr_t)&res, cr,
3398 	    &douprintf, &res.status, 0, NULL);
3399 
3400 	if (error) {
3401 		PURGE_ATTRCACHE(dvp);
3402 		nfs_rw_exit(&drp->r_rwlock);
3403 		return (error);
3404 	}
3405 
3406 	error = geterrno3(res.status);
3407 	if (!error) {
3408 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3409 		if (HAVE_RDDIR_CACHE(drp))
3410 			nfs_purge_rddir_cache(dvp);
3411 
3412 		if (!res.resok.obj.handle_follows) {
3413 			error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3414 			if (error) {
3415 				nfs_rw_exit(&drp->r_rwlock);
3416 				return (error);
3417 			}
3418 		} else {
3419 			if (res.resok.obj_attributes.attributes) {
3420 				vp = makenfs3node(&res.resok.obj.handle,
3421 				    &res.resok.obj_attributes.attr,
3422 				    dvp->v_vfsp, t, cr, NULL, NULL);
3423 			} else {
3424 				vp = makenfs3node(&res.resok.obj.handle, NULL,
3425 				    dvp->v_vfsp, t, cr, NULL, NULL);
3426 				if (vp->v_type == VNON) {
3427 					vattr.va_mask = AT_TYPE;
3428 					error = nfs3getattr(vp, &vattr, cr);
3429 					if (error) {
3430 						VN_RELE(vp);
3431 						nfs_rw_exit(&drp->r_rwlock);
3432 						return (error);
3433 					}
3434 					vp->v_type = vattr.va_type;
3435 				}
3436 			}
3437 			dnlc_update(dvp, nm, vp);
3438 		}
3439 		if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3440 			va->va_mask = AT_GID;
3441 			(void) nfs3setattr(vp, va, 0, cr);
3442 		}
3443 		*vpp = vp;
3444 	} else {
3445 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3446 		PURGE_STALE_FH(error, dvp, cr);
3447 	}
3448 
3449 	nfs_rw_exit(&drp->r_rwlock);
3450 
3451 	return (error);
3452 }
3453 
3454 /* ARGSUSED */
3455 static int
3456 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3457 	caller_context_t *ct, int flags)
3458 {
3459 	int error;
3460 	RMDIR3args args;
3461 	RMDIR3res res;
3462 	vnode_t *vp;
3463 	int douprintf;
3464 	rnode_t *drp;
3465 	hrtime_t t;
3466 
3467 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
3468 		return (EPERM);
3469 	drp = VTOR(dvp);
3470 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3471 		return (EINTR);
3472 
3473 	/*
3474 	 * Attempt to prevent a rmdir(".") from succeeding.
3475 	 */
3476 	error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3477 	if (error) {
3478 		nfs_rw_exit(&drp->r_rwlock);
3479 		return (error);
3480 	}
3481 
3482 	if (vp == cdir) {
3483 		VN_RELE(vp);
3484 		nfs_rw_exit(&drp->r_rwlock);
3485 		return (EINVAL);
3486 	}
3487 
3488 	setdiropargs3(&args.object, nm, dvp);
3489 
3490 	/*
3491 	 * First just remove the entry from the name cache, as it
3492 	 * is most likely an entry for this vp.
3493 	 */
3494 	dnlc_remove(dvp, nm);
3495 
3496 	/*
3497 	 * If there vnode reference count is greater than one, then
3498 	 * there may be additional references in the DNLC which will
3499 	 * need to be purged.  First, trying removing the entry for
3500 	 * the parent directory and see if that removes the additional
3501 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3502 	 * to completely remove any references to the directory which
3503 	 * might still exist in the DNLC.
3504 	 */
3505 	if (vp->v_count > 1) {
3506 		dnlc_remove(vp, "..");
3507 		if (vp->v_count > 1)
3508 			dnlc_purge_vp(vp);
3509 	}
3510 
3511 	douprintf = 1;
3512 
3513 	t = gethrtime();
3514 
3515 	error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3516 	    xdr_diropargs3, (caddr_t)&args,
3517 	    xdr_RMDIR3res, (caddr_t)&res, cr,
3518 	    &douprintf, &res.status, 0, NULL);
3519 
3520 	PURGE_ATTRCACHE(vp);
3521 
3522 	if (error) {
3523 		PURGE_ATTRCACHE(dvp);
3524 		VN_RELE(vp);
3525 		nfs_rw_exit(&drp->r_rwlock);
3526 		return (error);
3527 	}
3528 
3529 	error = geterrno3(res.status);
3530 	if (!error) {
3531 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3532 		if (HAVE_RDDIR_CACHE(drp))
3533 			nfs_purge_rddir_cache(dvp);
3534 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
3535 			nfs_purge_rddir_cache(vp);
3536 	} else {
3537 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3538 		PURGE_STALE_FH(error, dvp, cr);
3539 		/*
3540 		 * System V defines rmdir to return EEXIST, not
3541 		 * ENOTEMPTY if the directory is not empty.  Over
3542 		 * the wire, the error is NFSERR_ENOTEMPTY which
3543 		 * geterrno maps to ENOTEMPTY.
3544 		 */
3545 		if (error == ENOTEMPTY)
3546 			error = EEXIST;
3547 	}
3548 
3549 	if (error == 0) {
3550 		vnevent_rmdir(vp, dvp, nm, ct);
3551 	}
3552 	VN_RELE(vp);
3553 
3554 	nfs_rw_exit(&drp->r_rwlock);
3555 
3556 	return (error);
3557 }
3558 
3559 /* ARGSUSED */
3560 static int
3561 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3562 	caller_context_t *ct, int flags)
3563 {
3564 	int error;
3565 	SYMLINK3args args;
3566 	SYMLINK3res res;
3567 	int douprintf;
3568 	mntinfo_t *mi;
3569 	vnode_t *vp;
3570 	rnode_t *rp;
3571 	char *contents;
3572 	rnode_t *drp;
3573 	hrtime_t t;
3574 
3575 	mi = VTOMI(dvp);
3576 
3577 	if (nfs_zone() != mi->mi_zone)
3578 		return (EPERM);
3579 	if (!(mi->mi_flags & MI_SYMLINK))
3580 		return (EOPNOTSUPP);
3581 
3582 	setdiropargs3(&args.where, lnm, dvp);
3583 	error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3584 	if (error) {
3585 		/* req time field(s) overflow - return immediately */
3586 		return (error);
3587 	}
3588 	args.symlink.symlink_data = tnm;
3589 
3590 	drp = VTOR(dvp);
3591 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3592 		return (EINTR);
3593 
3594 	dnlc_remove(dvp, lnm);
3595 
3596 	douprintf = 1;
3597 
3598 	t = gethrtime();
3599 
3600 	error = rfs3call(mi, NFSPROC3_SYMLINK,
3601 	    xdr_SYMLINK3args, (caddr_t)&args,
3602 	    xdr_SYMLINK3res, (caddr_t)&res, cr,
3603 	    &douprintf, &res.status, 0, NULL);
3604 
3605 	if (error) {
3606 		PURGE_ATTRCACHE(dvp);
3607 		nfs_rw_exit(&drp->r_rwlock);
3608 		return (error);
3609 	}
3610 
3611 	error = geterrno3(res.status);
3612 	if (!error) {
3613 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3614 		if (HAVE_RDDIR_CACHE(drp))
3615 			nfs_purge_rddir_cache(dvp);
3616 
3617 		if (res.resok.obj.handle_follows) {
3618 			if (res.resok.obj_attributes.attributes) {
3619 				vp = makenfs3node(&res.resok.obj.handle,
3620 				    &res.resok.obj_attributes.attr,
3621 				    dvp->v_vfsp, t, cr, NULL, NULL);
3622 			} else {
3623 				vp = makenfs3node(&res.resok.obj.handle, NULL,
3624 				    dvp->v_vfsp, t, cr, NULL, NULL);
3625 				vp->v_type = VLNK;
3626 				vp->v_rdev = 0;
3627 			}
3628 			dnlc_update(dvp, lnm, vp);
3629 			rp = VTOR(vp);
3630 			if (nfs3_do_symlink_cache &&
3631 			    rp->r_symlink.contents == NULL) {
3632 
3633 				contents = kmem_alloc(MAXPATHLEN,
3634 				    KM_NOSLEEP);
3635 
3636 				if (contents != NULL) {
3637 					mutex_enter(&rp->r_statelock);
3638 					if (rp->r_symlink.contents == NULL) {
3639 						rp->r_symlink.len = strlen(tnm);
3640 						bcopy(tnm, contents,
3641 						    rp->r_symlink.len);
3642 						rp->r_symlink.contents =
3643 						    contents;
3644 						rp->r_symlink.size = MAXPATHLEN;
3645 						mutex_exit(&rp->r_statelock);
3646 					} else {
3647 						mutex_exit(&rp->r_statelock);
3648 						kmem_free((void *)contents,
3649 						    MAXPATHLEN);
3650 					}
3651 				}
3652 			}
3653 			VN_RELE(vp);
3654 		}
3655 	} else {
3656 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3657 		PURGE_STALE_FH(error, dvp, cr);
3658 		if (error == EOPNOTSUPP) {
3659 			mutex_enter(&mi->mi_lock);
3660 			mi->mi_flags &= ~MI_SYMLINK;
3661 			mutex_exit(&mi->mi_lock);
3662 		}
3663 	}
3664 
3665 	nfs_rw_exit(&drp->r_rwlock);
3666 
3667 	return (error);
3668 }
3669 
/*
 * When nonzero, nfs3_readdir() caps the size of each directory read
 * request at 1024 bytes instead of MAXBSIZE.
 */
static int nfs3_shrinkreaddir = 0;

/*
 * DEBUG-only event counters maintained by nfs3_readdir().
 */
#ifdef DEBUG
static int nfs3_readdir_cache_shorts = 0;	/* end-of-dir short circuits */
static int nfs3_readdir_cache_hits = 0;		/* served without a miss */
static int nfs3_readdir_cache_waits = 0;	/* waited for a fill */
static int nfs3_readdir_cache_misses = 0;	/* synchronous fills */
static int nfs3_readdir_readahead = 0;		/* async readaheads queued */
#endif
3679 
3680 /*
3681  * Read directory entries.
3682  * There are some weird things to look out for here.  The uio_loffset
3683  * field is either 0 or it is the offset returned from a previous
3684  * readdir.  It is an opaque value used by the server to find the
3685  * correct directory block to read. The count field is the number
3686  * of blocks to read on the server.  This is advisory only, the server
3687  * may return only one block's worth of entries.  Entries may be compressed
3688  * on the server.
3689  */
3690 /* ARGSUSED */
3691 static int
3692 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3693 	caller_context_t *ct, int flags)
3694 {
3695 	int error;
3696 	size_t count;
3697 	rnode_t *rp;
3698 	rddir_cache *rdc;
3699 	rddir_cache *nrdc;
3700 	rddir_cache *rrdc;
3701 #ifdef DEBUG
3702 	int missed;
3703 #endif
3704 	int doreadahead;
3705 	rddir_cache srdc;
3706 	avl_index_t where;
3707 
3708 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3709 		return (EIO);
3710 	rp = VTOR(vp);
3711 
3712 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3713 
3714 	/*
3715 	 * Make sure that the directory cache is valid.
3716 	 */
3717 	if (HAVE_RDDIR_CACHE(rp)) {
3718 		if (nfs_disable_rddir_cache) {
3719 			/*
3720 			 * Setting nfs_disable_rddir_cache in /etc/system
3721 			 * allows interoperability with servers that do not
3722 			 * properly update the attributes of directories.
3723 			 * Any cached information gets purged before an
3724 			 * access is made to it.
3725 			 */
3726 			nfs_purge_rddir_cache(vp);
3727 		} else {
3728 			error = nfs3_validate_caches(vp, cr);
3729 			if (error)
3730 				return (error);
3731 		}
3732 	}
3733 
3734 	/*
3735 	 * It is possible that some servers may not be able to correctly
3736 	 * handle a large READDIR or READDIRPLUS request due to bugs in
3737 	 * their implementation.  In order to continue to interoperate
3738 	 * with them, this workaround is provided to limit the maximum
3739 	 * size of a READDIRPLUS request to 1024.  In any case, the request
3740 	 * size is limited to MAXBSIZE.
3741 	 */
3742 	count = MIN(uiop->uio_iov->iov_len,
3743 	    nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3744 
3745 	nrdc = NULL;
3746 #ifdef DEBUG
3747 	missed = 0;
3748 #endif
3749 top:
3750 	/*
3751 	 * Short circuit last readdir which always returns 0 bytes.
3752 	 * This can be done after the directory has been read through
3753 	 * completely at least once.  This will set r_direof which
3754 	 * can be used to find the value of the last cookie.
3755 	 */
3756 	mutex_enter(&rp->r_statelock);
3757 	if (rp->r_direof != NULL &&
3758 	    uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3759 		mutex_exit(&rp->r_statelock);
3760 #ifdef DEBUG
3761 		nfs3_readdir_cache_shorts++;
3762 #endif
3763 		if (eofp)
3764 			*eofp = 1;
3765 		if (nrdc != NULL)
3766 			rddir_cache_rele(nrdc);
3767 		return (0);
3768 	}
3769 	/*
3770 	 * Look for a cache entry.  Cache entries are identified
3771 	 * by the NFS cookie value and the byte count requested.
3772 	 */
3773 	srdc.nfs3_cookie = uiop->uio_loffset;
3774 	srdc.buflen = count;
3775 	rdc = avl_find(&rp->r_dir, &srdc, &where);
3776 	if (rdc != NULL) {
3777 		rddir_cache_hold(rdc);
3778 		/*
3779 		 * If the cache entry is in the process of being
3780 		 * filled in, wait until this completes.  The
3781 		 * RDDIRWAIT bit is set to indicate that someone
3782 		 * is waiting and then the thread currently
3783 		 * filling the entry is done, it should do a
3784 		 * cv_broadcast to wakeup all of the threads
3785 		 * waiting for it to finish.
3786 		 */
3787 		if (rdc->flags & RDDIR) {
3788 			nfs_rw_exit(&rp->r_rwlock);
3789 			rdc->flags |= RDDIRWAIT;
3790 #ifdef DEBUG
3791 			nfs3_readdir_cache_waits++;
3792 #endif
3793 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3794 				/*
3795 				 * We got interrupted, probably
3796 				 * the user typed ^C or an alarm
3797 				 * fired.  We free the new entry
3798 				 * if we allocated one.
3799 				 */
3800 				mutex_exit(&rp->r_statelock);
3801 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
3802 				    RW_READER, FALSE);
3803 				rddir_cache_rele(rdc);
3804 				if (nrdc != NULL)
3805 					rddir_cache_rele(nrdc);
3806 				return (EINTR);
3807 			}
3808 			mutex_exit(&rp->r_statelock);
3809 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
3810 			    RW_READER, FALSE);
3811 			rddir_cache_rele(rdc);
3812 			goto top;
3813 		}
3814 		/*
3815 		 * Check to see if a readdir is required to
3816 		 * fill the entry.  If so, mark this entry
3817 		 * as being filled, remove our reference,
3818 		 * and branch to the code to fill the entry.
3819 		 */
3820 		if (rdc->flags & RDDIRREQ) {
3821 			rdc->flags &= ~RDDIRREQ;
3822 			rdc->flags |= RDDIR;
3823 			if (nrdc != NULL)
3824 				rddir_cache_rele(nrdc);
3825 			nrdc = rdc;
3826 			mutex_exit(&rp->r_statelock);
3827 			goto bottom;
3828 		}
3829 #ifdef DEBUG
3830 		if (!missed)
3831 			nfs3_readdir_cache_hits++;
3832 #endif
3833 		/*
3834 		 * If an error occurred while attempting
3835 		 * to fill the cache entry, just return it.
3836 		 */
3837 		if (rdc->error) {
3838 			error = rdc->error;
3839 			mutex_exit(&rp->r_statelock);
3840 			rddir_cache_rele(rdc);
3841 			if (nrdc != NULL)
3842 				rddir_cache_rele(nrdc);
3843 			return (error);
3844 		}
3845 
3846 		/*
3847 		 * The cache entry is complete and good,
3848 		 * copyout the dirent structs to the calling
3849 		 * thread.
3850 		 */
3851 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3852 
3853 		/*
3854 		 * If no error occurred during the copyout,
3855 		 * update the offset in the uio struct to
3856 		 * contain the value of the next cookie
3857 		 * and set the eof value appropriately.
3858 		 */
3859 		if (!error) {
3860 			uiop->uio_loffset = rdc->nfs3_ncookie;
3861 			if (eofp)
3862 				*eofp = rdc->eof;
3863 		}
3864 
3865 		/*
3866 		 * Decide whether to do readahead.
3867 		 *
3868 		 * Don't if have already read to the end of
3869 		 * directory.  There is nothing more to read.
3870 		 *
3871 		 * Don't if the application is not doing
3872 		 * lookups in the directory.  The readahead
3873 		 * is only effective if the application can
3874 		 * be doing work while an async thread is
3875 		 * handling the over the wire request.
3876 		 */
3877 		if (rdc->eof) {
3878 			rp->r_direof = rdc;
3879 			doreadahead = FALSE;
3880 		} else if (!(rp->r_flags & RLOOKUP))
3881 			doreadahead = FALSE;
3882 		else
3883 			doreadahead = TRUE;
3884 
3885 		if (!doreadahead) {
3886 			mutex_exit(&rp->r_statelock);
3887 			rddir_cache_rele(rdc);
3888 			if (nrdc != NULL)
3889 				rddir_cache_rele(nrdc);
3890 			return (error);
3891 		}
3892 
3893 		/*
3894 		 * Check to see whether we found an entry
3895 		 * for the readahead.  If so, we don't need
3896 		 * to do anything further, so free the new
3897 		 * entry if one was allocated.  Otherwise,
3898 		 * allocate a new entry, add it to the cache,
3899 		 * and then initiate an asynchronous readdir
3900 		 * operation to fill it.
3901 		 */
3902 		srdc.nfs3_cookie = rdc->nfs3_ncookie;
3903 		srdc.buflen = count;
3904 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3905 		if (rrdc != NULL) {
3906 			if (nrdc != NULL)
3907 				rddir_cache_rele(nrdc);
3908 		} else {
3909 			if (nrdc != NULL)
3910 				rrdc = nrdc;
3911 			else {
3912 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3913 			}
3914 			if (rrdc != NULL) {
3915 				rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3916 				rrdc->buflen = count;
3917 				avl_insert(&rp->r_dir, rrdc, where);
3918 				rddir_cache_hold(rrdc);
3919 				mutex_exit(&rp->r_statelock);
3920 				rddir_cache_rele(rdc);
3921 #ifdef DEBUG
3922 				nfs3_readdir_readahead++;
3923 #endif
3924 				nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3925 				return (error);
3926 			}
3927 		}
3928 
3929 		mutex_exit(&rp->r_statelock);
3930 		rddir_cache_rele(rdc);
3931 		return (error);
3932 	}
3933 
3934 	/*
3935 	 * Didn't find an entry in the cache.  Construct a new empty
3936 	 * entry and link it into the cache.  Other processes attempting
3937 	 * to access this entry will need to wait until it is filled in.
3938 	 *
3939 	 * Since kmem_alloc may block, another pass through the cache
3940 	 * will need to be taken to make sure that another process
3941 	 * hasn't already added an entry to the cache for this request.
3942 	 */
3943 	if (nrdc == NULL) {
3944 		mutex_exit(&rp->r_statelock);
3945 		nrdc = rddir_cache_alloc(KM_SLEEP);
3946 		nrdc->nfs3_cookie = uiop->uio_loffset;
3947 		nrdc->buflen = count;
3948 		goto top;
3949 	}
3950 
3951 	/*
3952 	 * Add this entry to the cache.
3953 	 */
3954 	avl_insert(&rp->r_dir, nrdc, where);
3955 	rddir_cache_hold(nrdc);
3956 	mutex_exit(&rp->r_statelock);
3957 
3958 bottom:
3959 #ifdef DEBUG
3960 	missed = 1;
3961 	nfs3_readdir_cache_misses++;
3962 #endif
3963 	/*
3964 	 * Do the readdir.  This routine decides whether to use
3965 	 * READDIR or READDIRPLUS.
3966 	 */
3967 	error = do_nfs3readdir(vp, nrdc, cr);
3968 
3969 	/*
3970 	 * If this operation failed, just return the error which occurred.
3971 	 */
3972 	if (error != 0)
3973 		return (error);
3974 
3975 	/*
3976 	 * Since the RPC operation will have taken sometime and blocked
3977 	 * this process, another pass through the cache will need to be
3978 	 * taken to find the correct cache entry.  It is possible that
3979 	 * the correct cache entry will not be there (although one was
3980 	 * added) because the directory changed during the RPC operation
3981 	 * and the readdir cache was flushed.  In this case, just start
3982 	 * over.  It is hoped that this will not happen too often... :-)
3983 	 */
3984 	nrdc = NULL;
3985 	goto top;
3986 	/* NOTREACHED */
3987 }
3988 
3989 static int
3990 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3991 {
3992 	int error;
3993 	rnode_t *rp;
3994 	mntinfo_t *mi;
3995 
3996 	rp = VTOR(vp);
3997 	mi = VTOMI(vp);
3998 	ASSERT(nfs_zone() == mi->mi_zone);
3999 	/*
4000 	 * Issue the proper request.
4001 	 *
4002 	 * If the server does not support READDIRPLUS, then use READDIR.
4003 	 *
4004 	 * Otherwise --
4005 	 * Issue a READDIRPLUS if reading to fill an empty cache or if
4006 	 * an application has performed a lookup in the directory which
4007 	 * required an over the wire lookup.  The use of READDIRPLUS
4008 	 * will help to (re)populate the DNLC.
4009 	 */
4010 	if (!(mi->mi_flags & MI_READDIRONLY) &&
4011 	    (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
4012 		if (rp->r_flags & RREADDIRPLUS) {
4013 			mutex_enter(&rp->r_statelock);
4014 			rp->r_flags &= ~RREADDIRPLUS;
4015 			mutex_exit(&rp->r_statelock);
4016 		}
4017 		nfs3readdirplus(vp, rdc, cr);
4018 		if (rdc->error == EOPNOTSUPP)
4019 			nfs3readdir(vp, rdc, cr);
4020 	} else
4021 		nfs3readdir(vp, rdc, cr);
4022 
4023 	mutex_enter(&rp->r_statelock);
4024 	rdc->flags &= ~RDDIR;
4025 	if (rdc->flags & RDDIRWAIT) {
4026 		rdc->flags &= ~RDDIRWAIT;
4027 		cv_broadcast(&rdc->cv);
4028 	}
4029 	error = rdc->error;
4030 	if (error)
4031 		rdc->flags |= RDDIRREQ;
4032 	mutex_exit(&rp->r_statelock);
4033 
4034 	rddir_cache_rele(rdc);
4035 
4036 	return (error);
4037 }
4038 
4039 static void
4040 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4041 {
4042 	int error;
4043 	READDIR3args args;
4044 	READDIR3vres res;
4045 	vattr_t dva;
4046 	rnode_t *rp;
4047 	int douprintf;
4048 	failinfo_t fi, *fip = NULL;
4049 	mntinfo_t *mi;
4050 	hrtime_t t;
4051 
4052 	rp = VTOR(vp);
4053 	mi = VTOMI(vp);
4054 	ASSERT(nfs_zone() == mi->mi_zone);
4055 
4056 	args.dir = *RTOFH3(rp);
4057 	args.cookie = (cookie3)rdc->nfs3_cookie;
4058 	args.cookieverf = rp->r_cookieverf;
4059 	args.count = rdc->buflen;
4060 
4061 	/*
4062 	 * NFS client failover support
4063 	 * suppress failover unless we have a zero cookie
4064 	 */
4065 	if (args.cookie == (cookie3) 0) {
4066 		fi.vp = vp;
4067 		fi.fhp = (caddr_t)&args.dir;
4068 		fi.copyproc = nfs3copyfh;
4069 		fi.lookupproc = nfs3lookup;
4070 		fi.xattrdirproc = acl_getxattrdir3;
4071 		fip = &fi;
4072 	}
4073 
4074 #ifdef DEBUG
4075 	rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4076 #else
4077 	rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4078 #endif
4079 
4080 	res.entries = (dirent64_t *)rdc->entries;
4081 	res.entries_size = rdc->buflen;
4082 	res.dir_attributes.fres.vap = &dva;
4083 	res.dir_attributes.fres.vp = vp;
4084 	res.loff = rdc->nfs3_cookie;
4085 
4086 	douprintf = 1;
4087 
4088 	if (mi->mi_io_kstats) {
4089 		mutex_enter(&mi->mi_lock);
4090 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4091 		mutex_exit(&mi->mi_lock);
4092 	}
4093 
4094 	t = gethrtime();
4095 
4096 	error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4097 	    xdr_READDIR3args, (caddr_t)&args,
4098 	    xdr_READDIR3vres, (caddr_t)&res, cr,
4099 	    &douprintf, &res.status, 0, fip);
4100 
4101 	if (mi->mi_io_kstats) {
4102 		mutex_enter(&mi->mi_lock);
4103 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4104 		mutex_exit(&mi->mi_lock);
4105 	}
4106 
4107 	if (error)
4108 		goto err;
4109 
4110 	nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4111 
4112 	error = geterrno3(res.status);
4113 	if (error) {
4114 		PURGE_STALE_FH(error, vp, cr);
4115 		goto err;
4116 	}
4117 
4118 	if (mi->mi_io_kstats) {
4119 		mutex_enter(&mi->mi_lock);
4120 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4121 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4122 		mutex_exit(&mi->mi_lock);
4123 	}
4124 
4125 	rdc->nfs3_ncookie = res.loff;
4126 	rp->r_cookieverf = res.cookieverf;
4127 	rdc->eof = res.eof ? 1 : 0;
4128 	rdc->entlen = res.size;
4129 	ASSERT(rdc->entlen <= rdc->buflen);
4130 	rdc->error = 0;
4131 	return;
4132 
4133 err:
4134 	kmem_free(rdc->entries, rdc->buflen);
4135 	rdc->entries = NULL;
4136 	rdc->error = error;
4137 }
4138 
4139 /*
4140  * Read directory entries.
4141  * There are some weird things to look out for here.  The uio_loffset
4142  * field is either 0 or it is the offset returned from a previous
4143  * readdir.  It is an opaque value used by the server to find the
4144  * correct directory block to read. The count field is the number
4145  * of blocks to read on the server.  This is advisory only, the server
4146  * may return only one block's worth of entries.  Entries may be compressed
4147  * on the server.
4148  */
4149 static void
4150 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4151 {
	/*
	 * Flow of this routine: send one READDIRPLUS3 request for directory
	 * vp starting at rdc->nfs3_cookie and record the outcome in rdc.
	 * On success rdc->entries holds the reply buffer and entlen, eof and
	 * nfs3_ncookie are filled in; on failure the buffer is freed,
	 * rdc->entries is set to NULL and rdc->error carries the errno.
	 */
4152 	int error;
4153 	READDIRPLUS3args args;
4154 	READDIRPLUS3vres res;
4155 	vattr_t dva;
4156 	rnode_t *rp;
4157 	mntinfo_t *mi;
4158 	int douprintf;
4159 	failinfo_t fi, *fip = NULL;
4160 
4161 	rp = VTOR(vp);
4162 	mi = VTOMI(vp);
4163 	ASSERT(nfs_zone() == mi->mi_zone);
4164 
	/* Request: directory handle, resume cookie/verifier and size hints. */
4165 	args.dir = *RTOFH3(rp);
4166 	args.cookie = (cookie3)rdc->nfs3_cookie;
4167 	args.cookieverf = rp->r_cookieverf;
4168 	args.dircount = rdc->buflen;
4169 	args.maxcount = mi->mi_tsize;
4170 
4171 	/*
4172 	 * NFS client failover support
4173 	 * suppress failover unless we have a zero cookie
4174 	 */
4175 	if (args.cookie == (cookie3)0) {
4176 		fi.vp = vp;
4177 		fi.fhp = (caddr_t)&args.dir;
4178 		fi.copyproc = nfs3copyfh;
4179 		fi.lookupproc = nfs3lookup;
4180 		fi.xattrdirproc = acl_getxattrdir3;
4181 		fip = &fi;
4182 	}
4183 
	/* Reply buffer; it is released again on the err path below. */
4184 #ifdef DEBUG
4185 	rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4186 #else
4187 	rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4188 #endif
4189 
	/* Tell the result decoder where to place entries and attributes. */
4190 	res.entries = (dirent64_t *)rdc->entries;
4191 	res.entries_size = rdc->buflen;
4192 	res.dir_attributes.fres.vap = &dva;
4193 	res.dir_attributes.fres.vp = vp;
4194 	res.loff = rdc->nfs3_cookie;
4195 	res.credentials = cr;
4196 
4197 	douprintf = 1;
4198 
	/* Account the request in the mount's I/O kstats, if present. */
4199 	if (mi->mi_io_kstats) {
4200 		mutex_enter(&mi->mi_lock);
4201 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4202 		mutex_exit(&mi->mi_lock);
4203 	}
4204 
	/* Timestamp taken before the call; used when caching attributes. */
4205 	res.time = gethrtime();
4206 
4207 	error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4208 	    xdr_READDIRPLUS3args, (caddr_t)&args,
4209 	    xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4210 	    &douprintf, &res.status, 0, fip);
4211 
4212 	if (mi->mi_io_kstats) {
4213 		mutex_enter(&mi->mi_lock);
4214 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4215 		mutex_exit(&mi->mi_lock);
4216 	}
4217 
	/* A non-zero return from rfs3call() itself: no reply to examine. */
4218 	if (error) {
4219 		goto err;
4220 	}
4221 
4222 	nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4223 
	/* Translate the NFS status in the reply into an errno. */
4224 	error = geterrno3(res.status);
4225 	if (error) {
4226 		PURGE_STALE_FH(error, vp, cr);
		/* Remember on the mount that READDIRPLUS is unsupported. */
4227 		if (error == EOPNOTSUPP) {
4228 			mutex_enter(&mi->mi_lock);
4229 			mi->mi_flags |= MI_READDIRONLY;
4230 			mutex_exit(&mi->mi_lock);
4231 		}
4232 		goto err;
4233 	}
4234 
4235 	if (mi->mi_io_kstats) {
4236 		mutex_enter(&mi->mi_lock);
4237 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4238 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4239 		mutex_exit(&mi->mi_lock);
4240 	}
4241 
	/* Success: publish next cookie, verifier, EOF flag and data length. */
4242 	rdc->nfs3_ncookie = res.loff;
4243 	rp->r_cookieverf = res.cookieverf;
4244 	rdc->eof = res.eof ? 1 : 0;
4245 	rdc->entlen = res.size;
4246 	ASSERT(rdc->entlen <= rdc->buflen);
4247 	rdc->error = 0;
4248 
4249 	return;
4250 
4251 err:
	/*
	 * NOTE(review): under DEBUG the buffer was obtained through
	 * rddir_cache_buf_alloc() but is released here with kmem_free();
	 * confirm that no debug accounting has to be undone on this path.
	 */
4252 	kmem_free(rdc->entries, rdc->buflen);
4253 	rdc->entries = NULL;
4254 	rdc->error = error;
4255 }
4256 
4257 #ifdef DEBUG
/*
 * When non-zero, nfs3_bio() calls debug_enter() after warning about a
 * zero length write.
 */
4258 static int nfs3_bio_do_stop = 0;
4259 #endif
4260 
/*
 * Perform the over-the-wire I/O described by bp for the vnode bp->b_vp.
 *
 * B_READ set:   read b_bcount bytes at the file offset derived from
 *               b_lblkno, zero whatever part of the buffer was not filled,
 *               and return NFS_EOF when nothing was read and the offset is
 *               at or beyond r_size.
 * B_READ clear: write MIN(b_bcount, r_size - offset) bytes, unless the rnode
 *               is already marked RSTALE, in which case r_error (or ESTALE
 *               when r_error is 0) is returned without doing I/O.
 *
 * The credential cached in rp->r_cred is tried first; if the server answers
 * EACCES and that credential is not cr, r_cred is replaced by cr and the
 * operation is retried once with cr.
 *
 * stab_comm is handed to nfs3write() unchanged.  Returns 0, an errno value,
 * or NFS_EOF; B_ERROR is set in bp->b_flags for every result other than 0
 * and NFS_EOF.
 */
4261 static int
4262 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4263 {
4264 	rnode_t *rp = VTOR(bp->b_vp);
4265 	int count;
4266 	int error;
4267 	cred_t *cred;
4268 	offset_t offset;
4269 
4270 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4271 	offset = ldbtob(bp->b_lblkno);
4272 
4273 	DTRACE_IO1(start, struct buf *, bp);
4274 
4275 	if (bp->b_flags & B_READ) {
		/*
		 * Pick the credential: the cached one if present, otherwise
		 * cache cr.  One hold is taken for r_cred (when it is newly
		 * set) and one for the local use through cred.
		 */
4276 		mutex_enter(&rp->r_statelock);
4277 		if (rp->r_cred != NULL) {
4278 			cred = rp->r_cred;
4279 			crhold(cred);
4280 		} else {
4281 			rp->r_cred = cr;
4282 			crhold(cr);
4283 			cred = cr;
4284 			crhold(cred);
4285 		}
4286 		mutex_exit(&rp->r_statelock);
4287 	read_again:
4288 		error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
4289 		    offset, bp->b_bcount, &bp->b_resid, cred);
		/*
		 * The local hold is dropped here; below cred is only
		 * compared against cr or reassigned, never dereferenced.
		 */
4290 		crfree(cred);
4291 		if (!error) {
4292 			if (bp->b_resid) {
4293 				/*
4294 				 * Didn't get it all because we hit EOF,
4295 				 * zero all the memory beyond the EOF.
4296 				 */
4297 				/* the unread tail starts at b_bcount - b_resid */
4298 				bzero(bp->b_un.b_addr +
4299 				    bp->b_bcount - bp->b_resid, bp->b_resid);
4300 			}
4301 			mutex_enter(&rp->r_statelock);
4302 			if (bp->b_resid == bp->b_bcount &&
4303 			    offset >= rp->r_size) {
4304 				/*
4305 				 * We didn't read anything at all as we are
4306 				 * past EOF.  Return an error indicator back
4307 				 * but don't destroy the pages (yet).
4308 				 */
4309 				error = NFS_EOF;
4310 			}
4311 			mutex_exit(&rp->r_statelock);
4312 		} else if (error == EACCES) {
			/*
			 * Retry with the caller's credential if the one just
			 * used was a different, cached credential.
			 */
4313 			mutex_enter(&rp->r_statelock);
4314 			if (cred != cr) {
4315 				if (rp->r_cred != NULL)
4316 					crfree(rp->r_cred);
4317 				rp->r_cred = cr;
4318 				crhold(cr);
4319 				cred = cr;
4320 				crhold(cred);
4321 				mutex_exit(&rp->r_statelock);
4322 				goto read_again;
4323 			}
4324 			mutex_exit(&rp->r_statelock);
4325 		}
4326 	} else {
4327 		if (!(rp->r_flags & RSTALE)) {
			/* Same credential selection as in the read case. */
4328 			mutex_enter(&rp->r_statelock);
4329 			if (rp->r_cred != NULL) {
4330 				cred = rp->r_cred;
4331 				crhold(cred);
4332 			} else {
4333 				rp->r_cred = cr;
4334 				crhold(cr);
4335 				cred = cr;
4336 				crhold(cred);
4337 			}
4338 			mutex_exit(&rp->r_statelock);
4339 		write_again:
			/* Never write beyond the file size currently in r_size. */
4340 			mutex_enter(&rp->r_statelock);
4341 			count = MIN(bp->b_bcount, rp->r_size - offset);
4342 			mutex_exit(&rp->r_statelock);
4343 			if (count < 0)
4344 				cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4345 #ifdef DEBUG
4346 			if (count == 0) {
4347 				zcmn_err(getzoneid(), CE_WARN,
4348 				    "nfs3_bio: zero length write at %lld",
4349 				    offset);
4350 				nfs_printfhandle(&rp->r_fh);
4351 				if (nfs3_bio_do_stop)
4352 					debug_enter("nfs3_bio");
4353 			}
4354 #endif
4355 			error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
4356 			    count, cred, stab_comm);
4357 			if (error == EACCES) {
				/*
				 * Unlike the read case, the local hold on
				 * cred is still held here, so it is released
				 * before switching to cr.
				 */
4358 				mutex_enter(&rp->r_statelock);
4359 				if (cred != cr) {
4360 					if (rp->r_cred != NULL)
4361 						crfree(rp->r_cred);
4362 					rp->r_cred = cr;
4363 					crhold(cr);
4364 					crfree(cred);
4365 					cred = cr;
4366 					crhold(cred);
4367 					mutex_exit(&rp->r_statelock);
4368 					goto write_again;
4369 				}
4370 				mutex_exit(&rp->r_statelock);
4371 			}
4372 			bp->b_error = error;
4373 			if (error && error != EINTR) {
4374 				/*
4375 				 * Don't print EDQUOT errors on the console.
4376 				 * Don't print asynchronous EACCES errors.
4377 				 * Don't print EFBIG errors.
4378 				 * Print all other write errors.
4379 				 */
4380 				if (error != EDQUOT && error != EFBIG &&
4381 				    (error != EACCES ||
4382 				    !(bp->b_flags & B_ASYNC)))
4383 					nfs_write_error(bp->b_vp, error, cred);
4384 				/*
4385 				 * Update r_error and r_flags as appropriate.
4386 				 * If the error was ESTALE, then mark the
4387 				 * rnode as not being writeable and save
4388 				 * the error status.  Otherwise, save any
4389 				 * errors which occur from asynchronous
4390 				 * page invalidations.  Any errors occurring
4391 				 * from other operations should be saved
4392 				 * by the caller.
4393 				 */
4394 				mutex_enter(&rp->r_statelock);
4395 				if (error == ESTALE) {
4396 					rp->r_flags |= RSTALE;
4397 					if (!rp->r_error)
4398 						rp->r_error = error;
4399 				} else if (!rp->r_error &&
4400 				    (bp->b_flags &
4401 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
4402 				    (B_INVAL|B_FORCE|B_ASYNC)) {
4403 					rp->r_error = error;
4404 				}
4405 				mutex_exit(&rp->r_statelock);
4406 			}
			/* Drop the local hold taken before the write. */
4407 			crfree(cred);
4408 		} else {
4409 			error = rp->r_error;
4410 			/*
4411 			 * A close may have cleared r_error, if so,
4412 			 * propagate ESTALE error return properly
4413 			 */
4414 			if (error == 0)
4415 				error = ESTALE;
4416 		}
4417 	}
4418 
	/* NFS_EOF is an indicator for the caller, not an I/O failure. */
4419 	if (error != 0 && error != NFS_EOF)
4420 		bp->b_flags |= B_ERROR;
4421 
4422 	DTRACE_IO1(done, struct buf *, bp);
4423 
4424 	return (error);
4425 }
4426 
/*
 * Copy the file handle held in the rnode of vp into *fidp.
 *
 * fidp->fid_len is always set to the handle length.  Returns EIO when
 * called from a zone other than the mount's, ENOSPC when the length
 * supplied by the caller in fid_len was smaller than the handle (nothing
 * is copied in that case), and 0 once the handle bytes have been copied.
 */
4427 /* ARGSUSED */
4428 static int
4429 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4430 {
4431 	rnode_t *node;
4432 	int too_small;
4433 
4434 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4435 		return (EIO);
4436 
4437 	node = VTOR(vp);
4438 	too_small = (fidp->fid_len < (ushort_t)node->r_fh.fh_len);
4439 	fidp->fid_len = node->r_fh.fh_len;
4440 	if (too_small)
4441 		return (ENOSPC);
4442 	bcopy(node->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4443 	return (0);
4444 }
4445 
/*
 * Acquire the rnode's r_rwlock on behalf of a read or write.
 *
 * A read request takes the lock as reader.  A write request normally
 * takes it as writer; when direct I/O is enabled on the rnode or on the
 * mount, the lock is first taken as reader and kept that way if the file
 * has no mappings and no cached pages, otherwise it is dropped and
 * re-acquired as writer.
 *
 * Returns V_WRITELOCK_TRUE when the lock is held as writer and
 * V_WRITELOCK_FALSE when it is held as reader.
 */
4446 /* ARGSUSED2 */
4447 static int
4448 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4449 {
4450 	rnode_t *node = VTOR(vp);
4451 
4452 	if (write_lock) {
4453 		if ((node->r_flags & RDIRECTIO) ||
4454 		    (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4455 			(void) nfs_rw_enter_sig(&node->r_rwlock, RW_READER,
4456 			    FALSE);
4457 			if (node->r_mapcnt == 0 && !vn_has_cached_data(vp))
4458 				return (V_WRITELOCK_FALSE);
4459 			nfs_rw_exit(&node->r_rwlock);
4460 		}
4461 		(void) nfs_rw_enter_sig(&node->r_rwlock, RW_WRITER, FALSE);
4462 		return (V_WRITELOCK_TRUE);
4463 	}
4464 	(void) nfs_rw_enter_sig(&node->r_rwlock, RW_READER, FALSE);
4465 	return (V_WRITELOCK_FALSE);
4466 }
4467 
/*
 * Release the rnode's r_rwlock.  write_lock is not consulted.
 */
4468 /* ARGSUSED */
4469 static void
4470 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4471 {
4472 	/* reader and writer holds are released the same way */
4473 
4474 	nfs_rw_exit(&VTOR(vp)->r_rwlock);
4475 }
4476 
/*
 * Validate a seek to *noffp on vp.  Returns EINVAL for a negative target
 * offset on anything but a directory, 0 otherwise.  ooff is not used.
 */
4477 /* ARGSUSED */
4478 static int
4479 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4480 {
4481 	int rejected;
4482 
4483 	/*
4484 	 * Directory offsets carry readdir cookies, and an lseek using such
4485 	 * a cookie has to succeed, so directories accept any value.  For
4486 	 * every other vnode type a negative target offset is refused.
4487 	 */
4488 	rejected = (vp->v_type != VDIR && *noffp < 0);
4489 	if (rejected)
4490 		return (EINVAL);
4491 	return (0);
4492 }
4493 
4494 /*
4495  * Number of blocks to read ahead.  nfs3_getapage() queues at most this
      * many asynchronous readaheads per call, each one block of
      * MAX(vfs_bsize, PAGESIZE) bytes further into the file.
4496  */
4497 static int nfs3_nra = 4;
4498 
4499 #ifdef DEBUG
/* Incremented in nfs3_getapage() when page_lookup() fails for a page that
 * page_exists() had just reported. */
4500 static int nfs3_lostpage = 0;	/* number of times we lost original page */
4501 #endif
4502 
4503 /*
4504  * Return all the pages from [off..off+len) in file
      *
      * Early returns: ENOSYS for a VNOMAP vnode, EIO when called from a
      * zone other than the mount's, any error from nfs3_validate_caches(),
      * and EFAULT when the range ends more than a page beyond r_size and
      * seg is not segkmap.  *protp, when non-NULL, is set to PROT_ALL.
      *
      * Requests of at most one page go straight to nfs3_getapage(); larger
      * ones go through pvn_getpages() with nfs3_getapage() as the per-page
      * routine.  An NFS_EOF result purges the caches and retries.
4505  */
4506 /* ARGSUSED */
4507 static int
4508 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4509 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4510 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4511 {
4512 	rnode_t *rp;
4513 	int error;
4514 	mntinfo_t *mi;
4515 
4516 	if (vp->v_flag & VNOMAP)
4517 		return (ENOSYS);
4518 
4519 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4520 		return (EIO);
4521 	if (protp != NULL)
4522 		*protp = PROT_ALL;
4523 
4524 	/*
4525 	 * Now validate that the caches are up to date.
4526 	 */
4527 	error = nfs3_validate_caches(vp, cr);
4528 	if (error)
4529 		return (error);
4530 
4531 	rp = VTOR(vp);
4532 	mi = VTOMI(vp);
4533 retry:
4534 	mutex_enter(&rp->r_statelock);
4535 
4536 	/*
4537 	 * Don't create dirty pages faster than they
4538 	 * can be cleaned so that the system doesn't
4539 	 * get imbalanced.  If the async queue is
4540 	 * maxed out, then wait for it to drain before
4541 	 * creating more dirty pages.  Also, wait for
4542 	 * any threads doing pagewalks in the vop_getattr
4543 	 * entry points so that they don't block for
4544 	 * long periods.
4545 	 */
4546 	if (rw == S_CREATE) {
4547 		while ((mi->mi_max_threads != 0 &&
4548 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
4549 		    rp->r_gcount > 0)
4550 			cv_wait(&rp->r_cv, &rp->r_statelock);
4551 	}
4552 
4553 	/*
4554 	 * If we are getting called as a side effect of an nfs_write()
4555 	 * operation the local file size might not be extended yet.
4556 	 * In this case we want to be able to return pages of zeroes.
4557 	 */
4558 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4559 		mutex_exit(&rp->r_statelock);
4560 		return (EFAULT);		/* beyond EOF */
4561 	}
4562 
4563 	mutex_exit(&rp->r_statelock);
4564 
4565 	if (len <= PAGESIZE) {
4566 		error = nfs3_getapage(vp, off, len, protp, pl, plsz,
4567 		    seg, addr, rw, cr);
4568 	} else {
4569 		error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4570 		    pl, plsz, seg, addr, rw, cr);
4571 	}
4572 
	/*
	 * NFS_EOF: purge the caches (keeping the DNLC) and start over from
	 * the size check.  ESTALE: let PURGE_STALE_FH() act on the stale
	 * handle, then return the error like any other result.
	 */
4573 	switch (error) {
4574 	case NFS_EOF:
4575 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4576 		goto retry;
4577 	case ESTALE:
4578 		PURGE_STALE_FH(error, vp, cr);
4579 	}
4580 
4581 	return (error);
4582 }
4583 
4584 /*
4585  * Called from pvn_getpages or nfs3_getpage to get a particular page.
      *
      * When pl is NULL the caller does not want the page returned: if the
      * page is not cached an asynchronous readahead is queued for its block
      * and the function returns.  Otherwise the page at off is returned
      * through pl, either found in the page cache, created zero-filled
      * (rw == S_CREATE), or read from the server together with as much of
      * its block as pvn_read_kluster() gathers.
      *
      * Returns 0 or an errno; NFS_EOF from nfs3_bio() is mapped to 0 for
      * segkmap requests and to EFAULT for all others.
4586  */
4587 /* ARGSUSED */
4588 static int
4589 nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
4590 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4591 	enum seg_rw rw, cred_t *cr)
4592 {
4593 	rnode_t *rp;
4594 	uint_t bsize;
4595 	struct buf *bp;
4596 	page_t *pp;
4597 	u_offset_t lbn;
4598 	u_offset_t io_off;
4599 	u_offset_t blkoff;
4600 	u_offset_t rablkoff;
4601 	size_t io_len;
4602 	uint_t blksize;
4603 	int error;
4604 	int readahead;
4605 	int readahead_issued = 0;
4606 	int ra_window; /* readahead window */
4607 	page_t *pagefound;
4608 	page_t *savepp;
4609 
4610 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4611 		return (EIO);
4612 	rp = VTOR(vp);
4613 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4614 
	/* Restart point used when a page found in the cache disappears. */
4615 reread:
4616 	bp = NULL;
4617 	pp = NULL;
4618 	pagefound = NULL;
4619 
4620 	if (pl != NULL)
4621 		pl[0] = NULL;
4622 
4623 	error = 0;
	/* blkoff is off rounded down to a multiple of bsize. */
4624 	lbn = off / bsize;
4625 	blkoff = lbn * bsize;
4626 
4627 	/*
4628 	 * Queueing up the readahead before doing the synchronous read
4629 	 * results in a significant increase in read throughput because
4630 	 * of the increased parallelism between the async threads and
4631 	 * the process context.
4632 	 */
4633 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
4634 	    rw != S_CREATE &&
4635 	    !(vp->v_flag & VNOCACHE)) {
4636 		mutex_enter(&rp->r_statelock);
4637 
4638 		/*
4639 		 * Calculate the number of readaheads to do.
4640 		 * a) No readaheads at offset = 0.
4641 		 * b) Do maximum(nfs3_nra) readaheads when the readahead
4642 		 *    window is closed.
4643 		 * c) Do readaheads between 1 to (nfs3_nra - 1) depending
4644 		 *    upon how far the readahead window is open or close.
4645 		 * d) No readaheads if rp->r_nextr is not within the scope
4646 		 *    of the readahead window (random i/o).
4647 		 */
4648 
4649 		if (off == 0)
4650 			readahead = 0;
4651 		else if (blkoff == rp->r_nextr)
4652 			readahead = nfs3_nra;
4653 		else if (rp->r_nextr > blkoff &&
4654 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
4655 		    <= (nfs3_nra - 1)))
4656 			readahead = nfs3_nra - ra_window;
4657 		else
4658 			readahead = 0;
4659 
		/*
		 * Queue readaheads one block at a time beyond r_nextr while
		 * they stay inside r_size.  r_statelock is dropped around
		 * each nfs_async_readahead() call and re-taken afterwards,
		 * so it is held again whenever the loop exits.
		 */
4660 		rablkoff = rp->r_nextr;
4661 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
4662 			mutex_exit(&rp->r_statelock);
4663 			if (nfs_async_readahead(vp, rablkoff + bsize,
4664 			    addr + (rablkoff + bsize - off), seg, cr,
4665 			    nfs3_readahead) < 0) {
4666 				mutex_enter(&rp->r_statelock);
4667 				break;
4668 			}
4669 			readahead--;
4670 			rablkoff += bsize;
4671 			/*
4672 			 * Indicate that we did a readahead so
4673 			 * readahead offset is not updated
4674 			 * by the synchronous read below.
4675 			 */
4676 			readahead_issued = 1;
4677 			mutex_enter(&rp->r_statelock);
4678 			/*
4679 			 * set readahead offset to
4680 			 * offset of last async readahead
4681 			 * request.
4682 			 */
4683 			rp->r_nextr = rablkoff;
4684 		}
4685 		mutex_exit(&rp->r_statelock);
4686 	}
4687 
4688 again:
4689 	if ((pagefound = page_exists(vp, off)) == NULL) {
4690 		if (pl == NULL) {
			/* Caller wants no page back: just start a readahead. */
4691 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
4692 			    nfs3_readahead);
4693 		} else if (rw == S_CREATE) {
4694 			/*
4695 			 * Block for this page is not allocated, or the offset
4696 			 * is beyond the current allocation size, or we're
4697 			 * allocating a swap slot and the page was not found,
4698 			 * so allocate it and return a zero page.
4699 			 */
4700 			if ((pp = page_create_va(vp, off,
4701 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
4702 				cmn_err(CE_PANIC, "nfs3_getapage: page_create");
4703 			io_len = PAGESIZE;
4704 			mutex_enter(&rp->r_statelock);
4705 			rp->r_nextr = off + PAGESIZE;
4706 			mutex_exit(&rp->r_statelock);
4707 		} else {
4708 			/*
4709 			 * Need to go to server to get a BLOCK, exception to
4710 			 * that being while reading at offset = 0 or doing
4711 			 * random i/o, in that case read only a PAGE.
4712 			 */
4713 			mutex_enter(&rp->r_statelock);
4714 			if (blkoff < rp->r_size &&
4715 			    blkoff + bsize >= rp->r_size) {
4716 				/*
4717 				 * If only a block or less is left in
4718 				 * the file, read all that is remaining.
4719 				 */
4720 				if (rp->r_size <= off) {
4721 					/*
4722 					 * Trying to access beyond EOF,
4723 					 * set up to get at least one page.
4724 					 */
4725 					blksize = off + PAGESIZE - blkoff;
4726 				} else
4727 					blksize = rp->r_size - blkoff;
4728 			} else if ((off == 0) ||
4729 			    (off != rp->r_nextr && !readahead_issued)) {
4730 				blksize = PAGESIZE;
4731 				blkoff = off; /* block = page here */
4732 			} else
4733 				blksize = bsize;
4734 			mutex_exit(&rp->r_statelock);
4735 
4736 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4737 			    &io_len, blkoff, blksize, 0);
4738 
4739 			/*
4740 			 * Some other thread has entered the page,
4741 			 * so just use it.
4742 			 */
4743 			if (pp == NULL)
4744 				goto again;
4745 
4746 			/*
4747 			 * Now round the request size up to page boundaries.
4748 			 * This ensures that the entire page will be
4749 			 * initialized to zeroes if EOF is encountered.
4750 			 */
4751 			io_len = ptob(btopr(io_len));
4752 
4753 			bp = pageio_setup(pp, io_len, vp, B_READ);
4754 			ASSERT(bp != NULL);
4755 
4756 			/*
4757 			 * pageio_setup should have set b_addr to 0.  This
4758 			 * is correct since we want to do I/O on a page
4759 			 * boundary.  bp_mapin will use this addr to calculate
4760 			 * an offset, and then set b_addr to the kernel virtual
4761 			 * address it allocated for us.
4762 			 */
4763 			ASSERT(bp->b_un.b_addr == 0);
4764 
4765 			bp->b_edev = 0;
4766 			bp->b_dev = 0;
4767 			bp->b_lblkno = lbtodb(io_off);
4768 			bp->b_file = vp;
4769 			bp->b_offset = (offset_t)off;
4770 			bp_mapin(bp);
4771 
4772 			/*
4773 			 * If doing a write beyond what we believe is EOF,
4774 			 * don't bother trying to read the pages from the
4775 			 * server, we'll just zero the pages here.  We
4776 			 * don't check that the rw flag is S_WRITE here
4777 			 * because some implementations may attempt a
4778 			 * read access to the buffer before copying data.
4779 			 */
4780 			mutex_enter(&rp->r_statelock);
4781 			if (io_off >= rp->r_size && seg == segkmap) {
4782 				mutex_exit(&rp->r_statelock);
4783 				bzero(bp->b_un.b_addr, io_len);
4784 			} else {
4785 				mutex_exit(&rp->r_statelock);
4786 				error = nfs3_bio(bp, NULL, cr);
4787 			}
4788 
4789 			/*
4790 			 * Unmap the buffer before freeing it.
4791 			 */
4792 			bp_mapout(bp);
4793 			pageio_done(bp);
4794 
			/*
			 * Tag every page of the list with C_NOCOMMIT.  The
			 * list is circular through p_next, so pp is back at
			 * savepp when the loop ends.
			 */
4795 			savepp = pp;
4796 			do {
4797 				pp->p_fsdata = C_NOCOMMIT;
4798 			} while ((pp = pp->p_next) != savepp);
4799 
4800 			if (error == NFS_EOF) {
4801 				/*
4802 				 * If doing a write system call just return
4803 				 * zeroed pages, else user tried to get pages
4804 				 * beyond EOF, return error.  We don't check
4805 				 * that the rw flag is S_WRITE here because
4806 				 * some implementations may attempt a read
4807 				 * access to the buffer before copying data.
4808 				 */
4809 				if (seg == segkmap)
4810 					error = 0;
4811 				else
4812 					error = EFAULT;
4813 			}
4814 
			/*
			 * Advance the readahead pointer past this read unless
			 * the readahead loop above already moved it.
			 */
4815 			if (!readahead_issued && !error) {
4816 				mutex_enter(&rp->r_statelock);
4817 				rp->r_nextr = io_off + io_len;
4818 				mutex_exit(&rp->r_statelock);
4819 			}
4820 		}
4821 	}
4822 
	/* No goto in this function targets this label; it is reached by
	 * falling through. */
4823 out:
4824 	if (pl == NULL)
4825 		return (error);
4826 
4827 	if (error) {
4828 		if (pp != NULL)
4829 			pvn_read_done(pp, B_ERROR);
4830 		return (error);
4831 	}
4832 
4833 	if (pagefound) {
4834 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
4835 
4836 		/*
4837 		 * Page exists in the cache, acquire the appropriate lock.
4838 		 * If this fails, start all over again.
4839 		 */
4840 		if ((pp = page_lookup(vp, off, se)) == NULL) {
4841 #ifdef DEBUG
4842 			nfs3_lostpage++;
4843 #endif
4844 			goto reread;
4845 		}
4846 		pl[0] = pp;
4847 		pl[1] = NULL;
4848 		return (0);
4849 	}
4850 
	/* Hand the freshly created or read pages back through pl. */
4851 	if (pp != NULL)
4852 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4853 
4854 	return (error);
4855 }
4856 
/*
 * Readahead routine handed to nfs_async_readahead() by nfs3_getapage().
 *
 * Reads the block starting at blkoff (shortened to the end of file when
 * less than a block remains) into newly klustered pages and releases them
 * with pvn_read_done().  Nothing is returned; if pvn_read_kluster() yields
 * no pages the readahead is silently abandoned, and on a read error
 * rp->r_nextr is pulled back to the start of the failed I/O.
 */
4857 static void
4858 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
4859 	cred_t *cr)
4860 {
4861 	int error;
4862 	page_t *pp;
4863 	u_offset_t io_off;
4864 	size_t io_len;
4865 	struct buf *bp;
4866 	uint_t bsize, blksize;
4867 	rnode_t *rp = VTOR(vp);
4868 	page_t *savepp;
4869 
4870 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4871 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4872 
4873 	mutex_enter(&rp->r_statelock);
4874 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4875 		/*
4876 		 * If less than a block left in file read less
4877 		 * than a block.
4878 		 */
4879 		blksize = rp->r_size - blkoff;
4880 	} else
4881 		blksize = bsize;
4882 	mutex_exit(&rp->r_statelock);
4883 
	/*
	 * NOTE(review): segkmap is passed here rather than the seg argument,
	 * while the EOF test further down compares seg with segkmap;
	 * confirm this is intentional.
	 */
4884 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4885 	    &io_off, &io_len, blkoff, blksize, 1);
4886 	/*
4887 	 * The isra flag passed to the kluster function is 1, we may have
4888 	 * gotten a return value of NULL for a variety of reasons (# of free
4889 	 * pages < minfree, someone entered the page on the vnode etc). In all
4890 	 * cases, we want to punt on the readahead.
4891 	 */
4892 	if (pp == NULL)
4893 		return;
4894 
4895 	/*
4896 	 * Now round the request size up to page boundaries.
4897 	 * This ensures that the entire page will be
4898 	 * initialized to zeroes if EOF is encountered.
4899 	 */
4900 	io_len = ptob(btopr(io_len));
4901 
4902 	bp = pageio_setup(pp, io_len, vp, B_READ);
4903 	ASSERT(bp != NULL);
4904 
4905 	/*
4906 	 * pageio_setup should have set b_addr to 0.  This is correct since
4907 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
4908 	 * to calculate an offset, and then set b_addr to the kernel virtual
4909 	 * address it allocated for us.
4910 	 */
4911 	ASSERT(bp->b_un.b_addr == 0);
4912 
4913 	bp->b_edev = 0;
4914 	bp->b_dev = 0;
4915 	bp->b_lblkno = lbtodb(io_off);
4916 	bp->b_file = vp;
4917 	bp->b_offset = (offset_t)blkoff;
4918 	bp_mapin(bp);
4919 
4920 	/*
4921 	 * If doing a write beyond what we believe is EOF, don't bother trying
4922 	 * to read the pages from the server, we'll just zero the pages here.
4923 	 * We don't check that the rw flag is S_WRITE here because some
4924 	 * implementations may attempt a read access to the buffer before
4925 	 * copying data.
4926 	 */
4927 	mutex_enter(&rp->r_statelock);
4928 	if (io_off >= rp->r_size && seg == segkmap) {
4929 		mutex_exit(&rp->r_statelock);
4930 		bzero(bp->b_un.b_addr, io_len);
4931 		error = 0;
4932 	} else {
4933 		mutex_exit(&rp->r_statelock);
4934 		error = nfs3_bio(bp, NULL, cr);
		/* Reading past EOF is not a failure for a readahead. */
4935 		if (error == NFS_EOF)
4936 			error = 0;
4937 	}
4938 
4939 	/*
4940 	 * Unmap the buffer before freeing it.
4941 	 */
4942 	bp_mapout(bp);
4943 	pageio_done(bp);
4944 
	/*
	 * Tag every page with C_NOCOMMIT; the list is circular through
	 * p_next, so pp equals savepp again when the loop ends.
	 */
4945 	savepp = pp;
4946 	do {
4947 		pp->p_fsdata = C_NOCOMMIT;
4948 	} while ((pp = pp->p_next) != savepp);
4949 
4950 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4951 
4952 	/*
4953 	 * In case of error set readahead offset
4954 	 * to the lowest offset.
4955 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
4956 	 */
	/* Unlocked peek first; the test is repeated under r_statelock. */
4957 	if (error && rp->r_nextr > io_off) {
4958 		mutex_enter(&rp->r_statelock);
4959 		if (rp->r_nextr > io_off)
4960 			rp->r_nextr = io_off;
4961 		mutex_exit(&rp->r_statelock);
4962 	}
4963 }
4964 
4965 /*
4966  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4967  * If len == 0, do from off to EOF.
4968  *
4969  * The normal cases should be len == 0 && off == 0 (entire vp list),
4970  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4971  * (from pageout).
      *
      * Returns ENOSYS for a VNOMAP vnode, 0 without doing anything for a
      * whole-range (len == 0) request without B_INVAL on a read-only vnode,
      * EIO for a synchronous request made from a zone other than the
      * mount's, and otherwise the result of nfs_putpages().  r_count is
      * raised for the duration of the nfs_putpages() call.
4972  */
4973 /* ARGSUSED */
4974 static int
4975 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4976 	caller_context_t *ct)
4977 {
4978 	int error;
4979 	rnode_t *rp;
4980 
4981 	ASSERT(cr != NULL);
4982 
4983 	/*
4984 	 * XXX - Why should this check be made here?
4985 	 */
4986 	if (vp->v_flag & VNOMAP)
4987 		return (ENOSYS);
4988 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4989 		return (0);
	/* The zone check is skipped for B_ASYNC requests. */
4990 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4991 		return (EIO);
4992 
4993 	rp = VTOR(vp);
	/*
	 * Bracket the page walk with r_count (nfs3_putapage() asserts that
	 * it is non-zero) and wake waiters on r_cv once it is dropped.
	 */
4994 	mutex_enter(&rp->r_statelock);
4995 	rp->r_count++;
4996 	mutex_exit(&rp->r_statelock);
4997 	error = nfs_putpages(vp, off, len, flags, cr);
4998 	mutex_enter(&rp->r_statelock);
4999 	rp->r_count--;
5000 	cv_broadcast(&rp->r_cv);
5001 	mutex_exit(&rp->r_statelock);
5002 
5003 	return (error);
5004 }
5005 
5006 /*
5007  * Write out a single page, possibly klustering adjacent dirty pages.
      *
      * pp is the dirty page to start from.  The extent actually chosen for
      * the I/O is reported through *offp and *lenp when those pointers are
      * non-NULL.  With B_ASYNC in flags the write is queued through
      * nfs_async_putapage(), otherwise nfs3_sync_putapage() is called
      * directly.  If a write into the same region is in progress
      * (RMODINPROGRESS) the pages are re-marked modified and unlocked, the
      * rnode is flagged RDIRTY, and 0 is returned without any I/O.
5008  */
5009 int
5010 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5011 	int flags, cred_t *cr)
5012 {
5013 	u_offset_t io_off;
5014 	u_offset_t lbn_off;
5015 	u_offset_t lbn;
5016 	size_t io_len;
5017 	uint_t bsize;
5018 	int error;
5019 	rnode_t *rp;
5020 
5021 	ASSERT(!vn_is_readonly(vp));
5022 	ASSERT(pp != NULL);
5023 	ASSERT(cr != NULL);
5024 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
5025 
5026 	rp = VTOR(vp);
5027 	ASSERT(rp->r_count > 0);
5028 
	/* lbn_off is the page's offset rounded down to a multiple of bsize. */
5029 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
5030 	lbn = pp->p_offset / bsize;
5031 	lbn_off = lbn * bsize;
5032 
5033 	/*
5034 	 * Find a kluster that fits in one block, or in
5035 	 * one page if pages are bigger than blocks.  If
5036 	 * there is less file space allocated than a whole
5037 	 * page, we'll shorten the i/o request below.
5038 	 */
5039 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
5040 	    roundup(bsize, PAGESIZE), flags);
5041 
5042 	/*
5043 	 * pvn_write_kluster shouldn't have returned a page with offset
5044 	 * behind the original page we were given.  Verify that.
5045 	 */
5046 	ASSERT((pp->p_offset / bsize) >= lbn);
5047 
5048 	/*
5049 	 * Now pp will have the list of kept dirty pages marked for
5050 	 * write back.  It will also handle invalidation and freeing
5051 	 * of pages that are not dirty.  Check for page length rounding
5052 	 * problems.
5053 	 */
5054 	if (io_off + io_len > lbn_off + bsize) {
5055 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5056 		io_len = lbn_off + bsize - io_off;
5057 	}
5058 	/*
5059 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5060 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
5061 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
5062 	 * progress and the r_size has not been made consistent with the
5063 	 * new size of the file. When the uiomove() completes the r_size is
5064 	 * updated and the RMODINPROGRESS flag is cleared.
5065 	 *
5066 	 * Without this handshaking, it is possible that nfs(3)_bio()
5067 	 * picks up the old value of r_size before the uiomove() in
5068 	 * writerp() completes.  This will result in the write through
5069 	 * nfs(3)_bio() being dropped; the next paragraph walks through
5070 	 * the sequence of events.
5071 	 *
5072 	 * More precisely, there is a window between the time the uiomove()
5073 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
5074 	 * operation intervenes in this window, the page will be picked up,
5075 	 * because it is dirty (it will be unlocked, unless it was
5076 	 * pagecreate'd). When the page is picked up as dirty, the dirty
5077 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5078 	 * checked. This will still be the old size. Therefore the page will
5079 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
5080 	 * the page will be found to be clean and the write will be dropped.
5081 	 */
	/* Unlocked peek first; the flag is re-tested under r_statelock. */
5082 	if (rp->r_flags & RMODINPROGRESS) {
5083 		mutex_enter(&rp->r_statelock);
5084 		if ((rp->r_flags & RMODINPROGRESS) &&
5085 		    rp->r_modaddr + MAXBSIZE > io_off &&
5086 		    rp->r_modaddr < io_off + io_len) {
5087 			page_t *plist;
5088 			/*
5089 			 * A write is in progress for this region of the file.
5090 			 * If we did not detect RMODINPROGRESS here then this
5091 			 * path through nfs_putapage() would eventually go to
5092 			 * nfs(3)_bio() and may not write out all of the data
5093 			 * in the pages. We end up losing data. So we decide
5094 			 * to set the modified bit on each page in the page
5095 			 * list and mark the rnode with RDIRTY. This write
5096 			 * will be restarted at some later time.
5097 			 */
5098 			plist = pp;
5099 			while (plist != NULL) {
5100 				pp = plist;
5101 				page_sub(&plist, pp);
5102 				hat_setmod(pp);
5103 				page_io_unlock(pp);
5104 				page_unlock(pp);
5105 			}
5106 			rp->r_flags |= RDIRTY;
5107 			mutex_exit(&rp->r_statelock);
5108 			if (offp)
5109 				*offp = io_off;
5110 			if (lenp)
5111 				*lenp = io_len;
5112 			return (0);
5113 		}
5114 		mutex_exit(&rp->r_statelock);
5115 	}
5116 
5117 	if (flags & B_ASYNC) {
5118 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5119 		    nfs3_sync_putapage);
5120 	} else
5121 		error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5122 
5123 	if (offp)
5124 		*offp = io_off;
5125 	if (lenp)
5126 		*lenp = io_len;
5127 	return (error);
5128 }
5129 
/*
 * Synchronously write the page list pp, covering io_len bytes at io_off,
 * through nfs3_rdwrlbn() and complete the pages with pvn_write_done().
 *
 * For ENOSPC, EDQUOT, EFBIG and EACCES (unless flags already carries both
 * B_INVAL and B_FORCE) the rnode is marked ROUTOFSPACE, the pages are
 * completed with B_ERROR and, for non-B_ASYNC callers, the range is pushed
 * again through nfs3_putpage() with B_INVAL | B_FORCE; the result of that
 * second attempt becomes the return value.  In all other cases
 * ROUTOFSPACE is cleared after a successful write and a commit of the
 * whole file is requested when freemem is below desfree.
 *
 * Also used as the callback passed to nfs_async_putapage().
 */
5130 static int
5131 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5132 	int flags, cred_t *cr)
5133 {
5134 	int error;
5135 	rnode_t *rp;
5136 
5137 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5138 
5139 	flags |= B_WRITE;
5140 
5141 	error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5142 
5143 	rp = VTOR(vp);
5144 
5145 	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5146 	    error == EACCES) &&
5147 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		/* The flag is tested without the lock and set under it. */
5148 		if (!(rp->r_flags & ROUTOFSPACE)) {
5149 			mutex_enter(&rp->r_statelock);
5150 			rp->r_flags |= ROUTOFSPACE;
5151 			mutex_exit(&rp->r_statelock);
5152 		}
5153 		flags |= B_ERROR;
5154 		pvn_write_done(pp, flags);
5155 		/*
5156 		 * If this was not an async thread, then try again to
5157 		 * write out the pages, but this time, also destroy
5158 		 * them whether or not the write is successful.  This
5159 		 * will prevent memory from filling up with these
5160 		 * pages and destroying them is the only alternative
5161 		 * if they can't be written out.
5162 		 *
5163 		 * Don't do this if this is an async thread because
5164 		 * when the pages are unlocked in pvn_write_done,
5165 		 * some other thread could have come along, locked
5166 		 * them, and queued for an async thread.  It would be
5167 		 * possible for all of the async threads to be tied
5168 		 * up waiting to lock the pages again and they would
5169 		 * all already be locked and waiting for an async
5170 		 * thread to handle them.  Deadlock.
5171 		 */
5172 		if (!(flags & B_ASYNC)) {
5173 			error = nfs3_putpage(vp, io_off, io_len,
5174 			    B_INVAL | B_FORCE, cr, NULL);
5175 		}
5176 	} else {
5177 		if (error)
5178 			flags |= B_ERROR;
5179 		else if (rp->r_flags & ROUTOFSPACE) {
			/* A write succeeded again: leave out-of-space mode. */
5180 			mutex_enter(&rp->r_statelock);
5181 			rp->r_flags &= ~ROUTOFSPACE;
5182 			mutex_exit(&rp->r_statelock);
5183 		}
5184 		pvn_write_done(pp, flags);
		/* Under memory pressure, ask for a commit of the whole file. */
5185 		if (freemem < desfree)
5186 			(void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr);
5187 	}
5188 
5189 	return (error);
5190 }
5191 
/*
 * Map len bytes of the regular file vp, starting at file offset off, into
 * address space as.  On success *addrp holds the chosen address.
 *
 * Returns EIO when called from a zone other than the mount's, ENOSYS for a
 * VNOMAP vnode, ENXIO when off is negative or off + len overflows the
 * signed offset range, ENODEV for anything but a regular file, EINTR when
 * interrupted while waiting for r_rwlock or r_lkserlock, EAGAIN when the
 * vnode is marked VNOCACHE or has remote/sleeping locks with mandatory
 * locking enabled, or whatever the attribute fetch, choose_addr() or
 * as_map() return.
 */
5192 /* ARGSUSED */
5193 static int
5194 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5195 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5196 	cred_t *cr, caller_context_t *ct)
5197 {
5198 	struct segvn_crargs vn_a;
5199 	int error;
5200 	rnode_t *rp;
5201 	struct vattr va;
5202 
5203 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5204 		return (EIO);
5205 
5206 	if (vp->v_flag & VNOMAP)
5207 		return (ENOSYS);
5208 
	/*
	 * len is unsigned; where size_t is as wide as offset_t the sum
	 * off + len is computed as an unsigned value and could never
	 * compare below zero.  Convert it back to offset_t so that a range
	 * running past the largest offset is rejected.
	 */
5209 	if (off < 0 || (offset_t)(off + len) < 0)
5210 		return (ENXIO);
5211 
5212 	if (vp->v_type != VREG)
5213 		return (ENODEV);
5214 
5215 	/*
5216 	 * If there is cached data and if close-to-open consistency
5217 	 * checking is not turned off and if the file system is not
5218 	 * mounted readonly, then force an over the wire getattr.
5219 	 * Otherwise, just invoke nfs3getattr to get a copy of the
5220 	 * attributes.  The attribute cache will be used unless it
5221 	 * is timed out and if it is, then an over the wire getattr
5222 	 * will be issued.
5223 	 */
5224 	va.va_mask = AT_ALL;
5225 	if (vn_has_cached_data(vp) &&
5226 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5227 		error = nfs3_getattr_otw(vp, &va, cr);
5228 	else
5229 		error = nfs3getattr(vp, &va, cr);
5230 	if (error)
5231 		return (error);
5232 
5233 	/*
5234 	 * Check to see if the vnode is currently marked as not cachable.
5235 	 * This means portions of the file are locked (through VOP_FRLOCK).
5236 	 * In this case the map request must be refused.  We use
5237 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
5238 	 */
5239 	rp = VTOR(vp);
5240 
5241 	/*
5242 	 * Atomically increment r_inmap after acquiring r_rwlock. The
5243 	 * idea here is to acquire r_rwlock to block read/write and
5244 	 * not to protect r_inmap. r_inmap will inform nfs3_read/write()
5245 	 * that we are in nfs3_map(). Now, r_rwlock is acquired in order
5246 	 * and we can prevent the deadlock that would have occurred
5247 	 * when nfs3_addmap() would have acquired it out of order.
5248 	 *
5249 	 * Since we are not protecting r_inmap by any lock, we do not
5250 	 * hold any lock when we decrement it. We atomically decrement
5251 	 * r_inmap after we release r_lkserlock.
5252 	 */
5253 
5254 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5255 		return (EINTR);
5256 	atomic_add_int(&rp->r_inmap, 1);
5257 	nfs_rw_exit(&rp->r_rwlock);
5258 
	/* From here on every exit must undo the r_inmap increment. */
5259 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
5260 		atomic_add_int(&rp->r_inmap, -1);
5261 		return (EINTR);
5262 	}
5263 
5264 	if (vp->v_flag & VNOCACHE) {
5265 		error = EAGAIN;
5266 		goto done;
5267 	}
5268 
5269 	/*
5270 	 * Don't allow concurrent locks and mapping if mandatory locking is
5271 	 * enabled.
5272 	 */
5273 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5274 	    MANDLOCK(vp, va.va_mode)) {
5275 		error = EAGAIN;
5276 		goto done;
5277 	}
5278 
	/* Pick the address and create the segment under the range lock. */
5279 	as_rangelock(as);
5280 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5281 	if (error != 0) {
5282 		as_rangeunlock(as);
5283 		goto done;
5284 	}
5285 
5286 	vn_a.vp = vp;
5287 	vn_a.offset = off;
5288 	vn_a.type = (flags & MAP_TYPE);
5289 	vn_a.prot = (uchar_t)prot;
5290 	vn_a.maxprot = (uchar_t)maxprot;
5291 	vn_a.flags = (flags & ~MAP_TYPE);
5292 	vn_a.cred = cr;
5293 	vn_a.amp = NULL;
5294 	vn_a.szc = 0;
5295 	vn_a.lgrp_mem_policy_flags = 0;
5296 
5297 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5298 	as_rangeunlock(as);
5299 
5300 done:
5301 	nfs_rw_exit(&rp->r_lkserlock);
5302 	atomic_add_int(&rp->r_inmap, -1);
5303 	return (error);
5304 }
5305 
5306 /* ARGSUSED */
5307 static int
5308 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5309 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5310 	cred_t *cr, caller_context_t *ct)
5311 {
5312 	rnode_t *rp;
5313 
5314 	if (vp->v_flag & VNOMAP)
5315 		return (ENOSYS);
5316 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5317 		return (EIO);
5318 
5319 	rp = VTOR(vp);
5320 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5321 
5322 	return (0);
5323 }
5324 
5325 /* ARGSUSED */
5326 static int
5327 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5328 	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5329 	caller_context_t *ct)
5330 {
5331 	netobj lm_fh3;
5332 	int rc;
5333 	u_offset_t start, end;
5334 	rnode_t *rp;
5335 	int error = 0, intr = INTR(vp);
5336 
5337 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5338 		return (EIO);
5339 	/* check for valid cmd parameter */
5340 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5341 		return (EINVAL);
5342 
5343 	/* Verify l_type. */
5344 	switch (bfp->l_type) {
5345 	case F_RDLCK:
5346 		if (cmd != F_GETLK && !(flag & FREAD))
5347 			return (EBADF);
5348 		break;
5349 	case F_WRLCK:
5350 		if (cmd != F_GETLK && !(flag & FWRITE))
5351 			return (EBADF);
5352 		break;
5353 	case F_UNLCK:
5354 		intr = 0;
5355 		break;
5356 
5357 	default:
5358 		return (EINVAL);
5359 	}
5360 
5361 	/* check the validity of the lock range */
5362 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5363 		return (rc);
5364 	if (rc = flk_check_lock_data(start, end, MAXEND))
5365 		return (rc);
5366 
5367 	/*
5368 	 * If the filesystem is mounted using local locking, pass the
5369 	 * request off to the local locking code.
5370 	 */
5371 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5372 		if (cmd == F_SETLK || cmd == F_SETLKW) {
5373 			/*
5374 			 * For complete safety, we should be holding
5375 			 * r_lkserlock.  However, we can't call
5376 			 * lm_safelock and then fs_frlock while
5377 			 * holding r_lkserlock, so just invoke
5378 			 * lm_safelock and expect that this will
5379 			 * catch enough of the cases.
5380 			 */
5381 			if (!lm_safelock(vp, bfp, cr))
5382 				return (EAGAIN);
5383 		}
5384 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5385 	}
5386 
5387 	rp = VTOR(vp);
5388 
5389 	/*
5390 	 * Check whether the given lock request can proceed, given the
5391 	 * current file mappings.
5392 	 */
5393 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5394 		return (EINTR);
5395 	if (cmd == F_SETLK || cmd == F_SETLKW) {
5396 		if (!lm_safelock(vp, bfp, cr)) {
5397 			rc = EAGAIN;
5398 			goto done;
5399 		}
5400 	}
5401 
5402 	/*
5403 	 * Flush the cache after waiting for async I/O to finish.  For new
5404 	 * locks, this is so that the process gets the latest bits from the
5405 	 * server.  For unlocks, this is so that other clients see the
5406 	 * latest bits once the file has been unlocked.  If currently dirty
5407 	 * pages can't be flushed, then don't allow a lock to be set.  But
5408 	 * allow unlocks to succeed, to avoid having orphan locks on the
5409 	 * server.
5410 	 */
5411 	if (cmd != F_GETLK) {
5412 		mutex_enter(&rp->r_statelock);
5413 		while (rp->r_count > 0) {
5414 			if (intr) {
5415 				klwp_t *lwp = ttolwp(curthread);
5416 
5417 				if (lwp != NULL)
5418 					lwp->lwp_nostop++;
5419 				if (cv_wait_sig(&rp->r_cv,
5420 				    &rp->r_statelock) == 0) {
5421 					if (lwp != NULL)
5422 						lwp->lwp_nostop--;
5423 					rc = EINTR;
5424 					break;
5425 				}
5426 				if (lwp != NULL)
5427 					lwp->lwp_nostop--;
5428 			} else
5429 				cv_wait(&rp->r_cv, &rp->r_statelock);
5430 		}
5431 		mutex_exit(&rp->r_statelock);
5432 		if (rc != 0)
5433 			goto done;
5434 		error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5435 		if (error) {
5436 			if (error == ENOSPC || error == EDQUOT) {
5437 				mutex_enter(&rp->r_statelock);
5438 				if (!rp->r_error)
5439 					rp->r_error = error;
5440 				mutex_exit(&rp->r_statelock);
5441 			}
5442 			if (bfp->l_type != F_UNLCK) {
5443 				rc = ENOLCK;
5444 				goto done;
5445 			}
5446 		}
5447 	}
5448 
5449 	lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5450 	lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5451 
5452 	/*
5453 	 * Call the lock manager to do the real work of contacting
5454 	 * the server and obtaining the lock.
5455 	 */
5456 	rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5457 
5458 	if (rc == 0)
5459 		nfs_lockcompletion(vp, cmd);
5460 
5461 done:
5462 	nfs_rw_exit(&rp->r_lkserlock);
5463 	return (rc);
5464 }
5465 
5466 /*
5467  * Free storage space associated with the specified vnode.  The portion
5468  * to be freed is specified by bfp->l_start and bfp->l_len (already
5469  * normalized to a "whence" of 0).
5470  *
5471  * This is an experimental facility whose continued existence is not
5472  * guaranteed.  Currently, we only support the special case
5473  * of l_len == 0, meaning free to end of file.
5474  */
5475 /* ARGSUSED */
5476 static int
5477 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5478 	offset_t offset, cred_t *cr, caller_context_t *ct)
5479 {
5480 	int error;
5481 
5482 	ASSERT(vp->v_type == VREG);
5483 	if (cmd != F_FREESP)
5484 		return (EINVAL);
5485 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5486 		return (EIO);
5487 
5488 	error = convoff(vp, bfp, 0, offset);
5489 	if (!error) {
5490 		ASSERT(bfp->l_start >= 0);
5491 		if (bfp->l_len == 0) {
5492 			struct vattr va;
5493 
5494 			/*
5495 			 * ftruncate should not change the ctime and
5496 			 * mtime if we truncate the file to its
5497 			 * previous size.
5498 			 */
5499 			va.va_mask = AT_SIZE;
5500 			error = nfs3getattr(vp, &va, cr);
5501 			if (error || va.va_size == bfp->l_start)
5502 				return (error);
5503 			va.va_mask = AT_SIZE;
5504 			va.va_size = bfp->l_start;
5505 			error = nfs3setattr(vp, &va, 0, cr);
5506 		} else
5507 			error = EINVAL;
5508 	}
5509 
5510 	return (error);
5511 }
5512 
5513 /* ARGSUSED */
5514 static int
5515 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5516 {
5517 
5518 	return (EINVAL);
5519 }
5520 
5521 /*
5522  * Setup and add an address space callback to do the work of the delmap call.
5523  * The callback will (and must be) deleted in the actual callback function.
5524  *
5525  * This is done in order to take care of the problem that we have with holding
5526  * the address space's a_lock for a long period of time (e.g. if the NFS server
5527  * is down).  Callbacks will be executed in the address space code while the
5528  * a_lock is not held.	Holding the address space's a_lock causes things such
5529  * as ps and fork to hang because they are trying to acquire this lock as well.
5530  */
5531 /* ARGSUSED */
5532 static int
5533 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5534 	size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5535 	cred_t *cr, caller_context_t *ct)
5536 {
5537 	int			caller_found;
5538 	int			error;
5539 	rnode_t			*rp;
5540 	nfs_delmap_args_t	*dmapp;
5541 	nfs_delmapcall_t	*delmap_call;
5542 
5543 	if (vp->v_flag & VNOMAP)
5544 		return (ENOSYS);
5545 	/*
5546 	 * A process may not change zones if it has NFS pages mmap'ed
5547 	 * in, so we can't legitimately get here from the wrong zone.
5548 	 */
5549 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5550 
5551 	rp = VTOR(vp);
5552 
5553 	/*
5554 	 * The way that the address space of this process deletes its mapping
5555 	 * of this file is via the following call chains:
5556 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5557 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5558 	 *
5559 	 * With the use of address space callbacks we are allowed to drop the
5560 	 * address space lock, a_lock, while executing the NFS operations that
5561 	 * need to go over the wire.  Returning EAGAIN to the caller of this
5562 	 * function is what drives the execution of the callback that we add
5563 	 * below.  The callback will be executed by the address space code
5564 	 * after dropping the a_lock.  When the callback is finished, since
5565 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5566 	 * is called again on the same segment to finish the rest of the work
5567 	 * that needs to happen during unmapping.
5568 	 *
5569 	 * This action of calling back into the segment driver causes
5570 	 * nfs3_delmap() to get called again, but since the callback was
5571 	 * already executed at this point, it already did the work and there
5572 	 * is nothing left for us to do.
5573 	 *
5574 	 * To Summarize:
5575 	 * - The first time nfs3_delmap is called by the current thread is when
5576 	 * we add the caller associated with this delmap to the delmap caller
5577 	 * list, add the callback, and return EAGAIN.
5578 	 * - The second time in this call chain when nfs3_delmap is called we
5579 	 * will find this caller in the delmap caller list and realize there
5580 	 * is no more work to do thus removing this caller from the list and
5581 	 * returning the error that was set in the callback execution.
5582 	 */
5583 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5584 	if (caller_found) {
5585 		/*
5586 		 * 'error' is from the actual delmap operations.  To avoid
5587 		 * hangs, we need to handle the return of EAGAIN differently
5588 		 * since this is what drives the callback execution.
5589 		 * In this case, we don't want to return EAGAIN and do the
5590 		 * callback execution because there are none to execute.
5591 		 */
5592 		if (error == EAGAIN)
5593 			return (0);
5594 		else
5595 			return (error);
5596 	}
5597 
5598 	/* current caller was not in the list */
5599 	delmap_call = nfs_init_delmapcall();
5600 
5601 	mutex_enter(&rp->r_statelock);
5602 	list_insert_tail(&rp->r_indelmap, delmap_call);
5603 	mutex_exit(&rp->r_statelock);
5604 
5605 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5606 
5607 	dmapp->vp = vp;
5608 	dmapp->off = off;
5609 	dmapp->addr = addr;
5610 	dmapp->len = len;
5611 	dmapp->prot = prot;
5612 	dmapp->maxprot = maxprot;
5613 	dmapp->flags = flags;
5614 	dmapp->cr = cr;
5615 	dmapp->caller = delmap_call;
5616 
5617 	error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5618 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5619 
5620 	return (error ? error : EAGAIN);
5621 }
5622 
5623 /*
5624  * Remove some pages from an mmap'd vnode.  Just update the
5625  * count of pages.  If doing close-to-open, then flush and
5626  * commit all of the pages associated with this file.
5627  * Otherwise, start an asynchronous page flush to write out
5628  * any dirty pages.  This will also associate a credential
5629  * with the rnode which can be used to write the pages.
5630  */
5631 /* ARGSUSED */
5632 static void
5633 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5634 {
5635 	int			error;
5636 	rnode_t			*rp;
5637 	mntinfo_t		*mi;
5638 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
5639 
5640 	rp = VTOR(dmapp->vp);
5641 	mi = VTOMI(dmapp->vp);
5642 
5643 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5644 	ASSERT(rp->r_mapcnt >= 0);
5645 
5646 	/*
5647 	 * Initiate a page flush and potential commit if there are
5648 	 * pages, the file system was not mounted readonly, the segment
5649 	 * was mapped shared, and the pages themselves were writeable.
5650 	 */
5651 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5652 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5653 		mutex_enter(&rp->r_statelock);
5654 		rp->r_flags |= RDIRTY;
5655 		mutex_exit(&rp->r_statelock);
5656 		/*
5657 		 * If this is a cross-zone access a sync putpage won't work, so
5658 		 * the best we can do is try an async putpage.  That seems
5659 		 * better than something more draconian such as discarding the
5660 		 * dirty pages.
5661 		 */
5662 		if ((mi->mi_flags & MI_NOCTO) ||
5663 		    nfs_zone() != mi->mi_zone)
5664 			error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5665 			    B_ASYNC, dmapp->cr, NULL);
5666 		else
5667 			error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5668 			    dmapp->len, dmapp->cr);
5669 		if (!error) {
5670 			mutex_enter(&rp->r_statelock);
5671 			error = rp->r_error;
5672 			rp->r_error = 0;
5673 			mutex_exit(&rp->r_statelock);
5674 		}
5675 	} else
5676 		error = 0;
5677 
5678 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5679 		(void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5680 		    B_INVAL, dmapp->cr, NULL);
5681 
5682 	dmapp->caller->error = error;
5683 	(void) as_delete_callback(as, arg);
5684 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5685 }
5686 
/*
 * When nonzero, nfs3_pathconf() frees any PATHCONF reply cached on the
 * rnode and does not cache new replies.
 */
static int nfs3_pathconf_disable_cache = 0;

#ifdef DEBUG
/* DEBUG-only counters of nfs3_pathconf() cache hits and misses. */
static int nfs3_pathconf_cache_hits = 0;
static int nfs3_pathconf_cache_misses = 0;
#endif
5693 
5694 /* ARGSUSED */
5695 static int
5696 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5697 	caller_context_t *ct)
5698 {
5699 	int error;
5700 	PATHCONF3args args;
5701 	PATHCONF3res res;
5702 	int douprintf;
5703 	failinfo_t fi;
5704 	rnode_t *rp;
5705 	hrtime_t t;
5706 
5707 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5708 		return (EIO);
5709 	/*
5710 	 * Large file spec - need to base answer on info stored
5711 	 * on original FSINFO response.
5712 	 */
5713 	if (cmd == _PC_FILESIZEBITS) {
5714 		unsigned long long ll;
5715 		long l = 1;
5716 
5717 		ll = VTOMI(vp)->mi_maxfilesize;
5718 
5719 		if (ll == 0) {
5720 			*valp = 0;
5721 			return (0);
5722 		}
5723 
5724 		if (ll & 0xffffffff00000000) {
5725 			l += 32; ll >>= 32;
5726 		}
5727 		if (ll & 0xffff0000) {
5728 			l += 16; ll >>= 16;
5729 		}
5730 		if (ll & 0xff00) {
5731 			l += 8; ll >>= 8;
5732 		}
5733 		if (ll & 0xf0) {
5734 			l += 4; ll >>= 4;
5735 		}
5736 		if (ll & 0xc) {
5737 			l += 2; ll >>= 2;
5738 		}
5739 		if (ll & 0x2)
5740 			l += 2;
5741 		else if (ll & 0x1)
5742 			l += 1;
5743 		*valp = l;
5744 		return (0);
5745 	}
5746 
5747 	if (cmd == _PC_ACL_ENABLED) {
5748 		*valp = _ACL_ACLENT_ENABLED;
5749 		return (0);
5750 	}
5751 
5752 	if (cmd == _PC_XATTR_EXISTS) {
5753 		error = 0;
5754 		*valp = 0;
5755 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5756 			vnode_t *avp;
5757 			rnode_t *rp;
5758 			int error = 0;
5759 			mntinfo_t *mi = VTOMI(vp);
5760 
5761 			if (!(mi->mi_flags & MI_EXTATTR))
5762 				return (0);
5763 
5764 			rp = VTOR(vp);
5765 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5766 			    INTR(vp)))
5767 				return (EINTR);
5768 
5769 			error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5770 			if (error || avp == NULL)
5771 				error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5772 
5773 			nfs_rw_exit(&rp->r_rwlock);
5774 
5775 			if (error == 0 && avp != NULL) {
5776 				error = do_xattr_exists_check(avp, valp, cr);
5777 				VN_RELE(avp);
5778 			} else if (error == ENOENT) {
5779 				error = 0;
5780 				*valp = 0;
5781 			}
5782 		}
5783 		return (error);
5784 	}
5785 
5786 	rp = VTOR(vp);
5787 	if (rp->r_pathconf != NULL) {
5788 		mutex_enter(&rp->r_statelock);
5789 		if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5790 			kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5791 			rp->r_pathconf = NULL;
5792 		}
5793 		if (rp->r_pathconf != NULL) {
5794 			error = 0;
5795 			switch (cmd) {
5796 			case _PC_LINK_MAX:
5797 				*valp = rp->r_pathconf->link_max;
5798 				break;
5799 			case _PC_NAME_MAX:
5800 				*valp = rp->r_pathconf->name_max;
5801 				break;
5802 			case _PC_PATH_MAX:
5803 			case _PC_SYMLINK_MAX:
5804 				*valp = MAXPATHLEN;
5805 				break;
5806 			case _PC_CHOWN_RESTRICTED:
5807 				*valp = rp->r_pathconf->chown_restricted;
5808 				break;
5809 			case _PC_NO_TRUNC:
5810 				*valp = rp->r_pathconf->no_trunc;
5811 				break;
5812 			default:
5813 				error = EINVAL;
5814 				break;
5815 			}
5816 			mutex_exit(&rp->r_statelock);
5817 #ifdef DEBUG
5818 			nfs3_pathconf_cache_hits++;
5819 #endif
5820 			return (error);
5821 		}
5822 		mutex_exit(&rp->r_statelock);
5823 	}
5824 #ifdef DEBUG
5825 	nfs3_pathconf_cache_misses++;
5826 #endif
5827 
5828 	args.object = *VTOFH3(vp);
5829 	fi.vp = vp;
5830 	fi.fhp = (caddr_t)&args.object;
5831 	fi.copyproc = nfs3copyfh;
5832 	fi.lookupproc = nfs3lookup;
5833 	fi.xattrdirproc = acl_getxattrdir3;
5834 
5835 	douprintf = 1;
5836 
5837 	t = gethrtime();
5838 
5839 	error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5840 	    xdr_nfs_fh3, (caddr_t)&args,
5841 	    xdr_PATHCONF3res, (caddr_t)&res, cr,
5842 	    &douprintf, &res.status, 0, &fi);
5843 
5844 	if (error)
5845 		return (error);
5846 
5847 	error = geterrno3(res.status);
5848 
5849 	if (!error) {
5850 		nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5851 		if (!nfs3_pathconf_disable_cache) {
5852 			mutex_enter(&rp->r_statelock);
5853 			if (rp->r_pathconf == NULL) {
5854 				rp->r_pathconf = kmem_alloc(
5855 				    sizeof (*rp->r_pathconf), KM_NOSLEEP);
5856 				if (rp->r_pathconf != NULL)
5857 					*rp->r_pathconf = res.resok.info;
5858 			}
5859 			mutex_exit(&rp->r_statelock);
5860 		}
5861 		switch (cmd) {
5862 		case _PC_LINK_MAX:
5863 			*valp = res.resok.info.link_max;
5864 			break;
5865 		case _PC_NAME_MAX:
5866 			*valp = res.resok.info.name_max;
5867 			break;
5868 		case _PC_PATH_MAX:
5869 		case _PC_SYMLINK_MAX:
5870 			*valp = MAXPATHLEN;
5871 			break;
5872 		case _PC_CHOWN_RESTRICTED:
5873 			*valp = res.resok.info.chown_restricted;
5874 			break;
5875 		case _PC_NO_TRUNC:
5876 			*valp = res.resok.info.no_trunc;
5877 			break;
5878 		default:
5879 			return (EINVAL);
5880 		}
5881 	} else {
5882 		nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5883 		PURGE_STALE_FH(error, vp, cr);
5884 	}
5885 
5886 	return (error);
5887 }
5888 
5889 /*
5890  * Called by async thread to do synchronous pageio. Do the i/o, wait
5891  * for it to complete, and cleanup the page list when done.
5892  */
5893 static int
5894 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5895 	int flags, cred_t *cr)
5896 {
5897 	int error;
5898 
5899 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5900 	error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5901 	if (flags & B_READ)
5902 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5903 	else
5904 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5905 	return (error);
5906 }
5907 
5908 /* ARGSUSED */
5909 static int
5910 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5911 	int flags, cred_t *cr, caller_context_t *ct)
5912 {
5913 	int error;
5914 	rnode_t *rp;
5915 
5916 	if (pp == NULL)
5917 		return (EINVAL);
5918 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5919 		return (EIO);
5920 
5921 	rp = VTOR(vp);
5922 	mutex_enter(&rp->r_statelock);
5923 	rp->r_count++;
5924 	mutex_exit(&rp->r_statelock);
5925 
5926 	if (flags & B_ASYNC) {
5927 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5928 		    nfs3_sync_pageio);
5929 	} else
5930 		error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5931 	mutex_enter(&rp->r_statelock);
5932 	rp->r_count--;
5933 	cv_broadcast(&rp->r_cv);
5934 	mutex_exit(&rp->r_statelock);
5935 	return (error);
5936 }
5937 
5938 /* ARGSUSED */
5939 static void
5940 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5941 	caller_context_t *ct)
5942 {
5943 	int error;
5944 	rnode_t *rp;
5945 	page_t *plist;
5946 	page_t *pptr;
5947 	offset3 offset;
5948 	count3 len;
5949 	k_sigset_t smask;
5950 
5951 	/*
5952 	 * We should get called with fl equal to either B_FREE or
5953 	 * B_INVAL.  Any other value is illegal.
5954 	 *
5955 	 * The page that we are either supposed to free or destroy
5956 	 * should be exclusive locked and its io lock should not
5957 	 * be held.
5958 	 */
5959 	ASSERT(fl == B_FREE || fl == B_INVAL);
5960 	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5961 	rp = VTOR(vp);
5962 
5963 	/*
5964 	 * If the page doesn't need to be committed or we shouldn't
5965 	 * even bother attempting to commit it, then just make sure
5966 	 * that the p_fsdata byte is clear and then either free or
5967 	 * destroy the page as appropriate.
5968 	 */
5969 	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5970 		pp->p_fsdata = C_NOCOMMIT;
5971 		if (fl == B_FREE)
5972 			page_free(pp, dn);
5973 		else
5974 			page_destroy(pp, dn);
5975 		return;
5976 	}
5977 
5978 	/*
5979 	 * If there is a page invalidation operation going on, then
5980 	 * if this is one of the pages being destroyed, then just
5981 	 * clear the p_fsdata byte and then either free or destroy
5982 	 * the page as appropriate.
5983 	 */
5984 	mutex_enter(&rp->r_statelock);
5985 	if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
5986 		mutex_exit(&rp->r_statelock);
5987 		pp->p_fsdata = C_NOCOMMIT;
5988 		if (fl == B_FREE)
5989 			page_free(pp, dn);
5990 		else
5991 			page_destroy(pp, dn);
5992 		return;
5993 	}
5994 
5995 	/*
5996 	 * If we are freeing this page and someone else is already
5997 	 * waiting to do a commit, then just unlock the page and
5998 	 * return.  That other thread will take care of commiting
5999 	 * this page.  The page can be freed sometime after the
6000 	 * commit has finished.  Otherwise, if the page is marked
6001 	 * as delay commit, then we may be getting called from
6002 	 * pvn_write_done, one page at a time.   This could result
6003 	 * in one commit per page, so we end up doing lots of small
6004 	 * commits instead of fewer larger commits.  This is bad,
6005 	 * we want do as few commits as possible.
6006 	 */
6007 	if (fl == B_FREE) {
6008 		if (rp->r_flags & RCOMMITWAIT) {
6009 			page_unlock(pp);
6010 			mutex_exit(&rp->r_statelock);
6011 			return;
6012 		}
6013 		if (pp->p_fsdata == C_DELAYCOMMIT) {
6014 			pp->p_fsdata = C_COMMIT;
6015 			page_unlock(pp);
6016 			mutex_exit(&rp->r_statelock);
6017 			return;
6018 		}
6019 	}
6020 
6021 	/*
6022 	 * Check to see if there is a signal which would prevent an
6023 	 * attempt to commit the pages from being successful.  If so,
6024 	 * then don't bother with all of the work to gather pages and
6025 	 * generate the unsuccessful RPC.  Just return from here and
6026 	 * let the page be committed at some later time.
6027 	 */
6028 	sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
6029 	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
6030 		sigunintr(&smask);
6031 		page_unlock(pp);
6032 		mutex_exit(&rp->r_statelock);
6033 		return;
6034 	}
6035 	sigunintr(&smask);
6036 
6037 	/*
6038 	 * We are starting to need to commit pages, so let's try
6039 	 * to commit as many as possible at once to reduce the
6040 	 * overhead.
6041 	 *
6042 	 * Set the `commit inprogress' state bit.  We must
6043 	 * first wait until any current one finishes.  Then
6044 	 * we initialize the c_pages list with this page.
6045 	 */
6046 	while (rp->r_flags & RCOMMIT) {
6047 		rp->r_flags |= RCOMMITWAIT;
6048 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6049 		rp->r_flags &= ~RCOMMITWAIT;
6050 	}
6051 	rp->r_flags |= RCOMMIT;
6052 	mutex_exit(&rp->r_statelock);
6053 	ASSERT(rp->r_commit.c_pages == NULL);
6054 	rp->r_commit.c_pages = pp;
6055 	rp->r_commit.c_commbase = (offset3)pp->p_offset;
6056 	rp->r_commit.c_commlen = PAGESIZE;
6057 
6058 	/*
6059 	 * Gather together all other pages which can be committed.
6060 	 * They will all be chained off r_commit.c_pages.
6061 	 */
6062 	nfs3_get_commit(vp);
6063 
6064 	/*
6065 	 * Clear the `commit inprogress' status and disconnect
6066 	 * the list of pages to be committed from the rnode.
6067 	 * At this same time, we also save the starting offset
6068 	 * and length of data to be committed on the server.
6069 	 */
6070 	plist = rp->r_commit.c_pages;
6071 	rp->r_commit.c_pages = NULL;
6072 	offset = rp->r_commit.c_commbase;
6073 	len = rp->r_commit.c_commlen;
6074 	mutex_enter(&rp->r_statelock);
6075 	rp->r_flags &= ~RCOMMIT;
6076 	cv_broadcast(&rp->r_commit.c_cv);
6077 	mutex_exit(&rp->r_statelock);
6078 
6079 	if (curproc == proc_pageout || curproc == proc_fsflush ||
6080 	    nfs_zone() != VTOMI(vp)->mi_zone) {
6081 		nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6082 		return;
6083 	}
6084 
6085 	/*
6086 	 * Actually generate the COMMIT3 over the wire operation.
6087 	 */
6088 	error = nfs3_commit(vp, offset, len, cr);
6089 
6090 	/*
6091 	 * If we got an error during the commit, just unlock all
6092 	 * of the pages.  The pages will get retransmitted to the
6093 	 * server during a putpage operation.
6094 	 */
6095 	if (error) {
6096 		while (plist != NULL) {
6097 			pptr = plist;
6098 			page_sub(&plist, pptr);
6099 			page_unlock(pptr);
6100 		}
6101 		return;
6102 	}
6103 
6104 	/*
6105 	 * We've tried as hard as we can to commit the data to stable
6106 	 * storage on the server.  We release the rest of the pages
6107 	 * and clear the commit required state.  They will be put
6108 	 * onto the tail of the cachelist if they are nolonger
6109 	 * mapped.
6110 	 */
6111 	while (plist != pp) {
6112 		pptr = plist;
6113 		page_sub(&plist, pptr);
6114 		pptr->p_fsdata = C_NOCOMMIT;
6115 		(void) page_release(pptr, 1);
6116 	}
6117 
6118 	/*
6119 	 * It is possible that nfs3_commit didn't return error but
6120 	 * some other thread has modified the page we are going
6121 	 * to free/destroy.
6122 	 *    In this case we need to rewrite the page. Do an explicit check
6123 	 * before attempting to free/destroy the page. If modified, needs to
6124 	 * be rewritten so unlock the page and return.
6125 	 */
6126 	if (hat_ismod(pp)) {
6127 		pp->p_fsdata = C_NOCOMMIT;
6128 		page_unlock(pp);
6129 		return;
6130 	}
6131 
6132 	/*
6133 	 * Now, as appropriate, either free or destroy the page
6134 	 * that we were called with.
6135 	 */
6136 	pp->p_fsdata = C_NOCOMMIT;
6137 	if (fl == B_FREE)
6138 		page_free(pp, dn);
6139 	else
6140 		page_destroy(pp, dn);
6141 }
6142 
6143 static int
6144 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6145 {
6146 	int error;
6147 	rnode_t *rp;
6148 	COMMIT3args args;
6149 	COMMIT3res res;
6150 	int douprintf;
6151 	cred_t *cred;
6152 
6153 	rp = VTOR(vp);
6154 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6155 
6156 	mutex_enter(&rp->r_statelock);
6157 	if (rp->r_cred != NULL) {
6158 		cred = rp->r_cred;
6159 		crhold(cred);
6160 	} else {
6161 		rp->r_cred = cr;
6162 		crhold(cr);
6163 		cred = cr;
6164 		crhold(cred);
6165 	}
6166 	mutex_exit(&rp->r_statelock);
6167 
6168 	args.file = *VTOFH3(vp);
6169 	args.offset = offset;
6170 	args.count = count;
6171 
6172 doitagain:
6173 	douprintf = 1;
6174 	error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6175 	    xdr_COMMIT3args, (caddr_t)&args,
6176 	    xdr_COMMIT3res, (caddr_t)&res, cred,
6177 	    &douprintf, &res.status, 0, NULL);
6178 
6179 	crfree(cred);
6180 
6181 	if (error)
6182 		return (error);
6183 
6184 	error = geterrno3(res.status);
6185 	if (!error) {
6186 		ASSERT(rp->r_flags & RHAVEVERF);
6187 		mutex_enter(&rp->r_statelock);
6188 		if (rp->r_verf == res.resok.verf) {
6189 			mutex_exit(&rp->r_statelock);
6190 			return (0);
6191 		}
6192 		nfs3_set_mod(vp);
6193 		rp->r_verf = res.resok.verf;
6194 		mutex_exit(&rp->r_statelock);
6195 		error = NFS_VERF_MISMATCH;
6196 	} else {
6197 		if (error == EACCES) {
6198 			mutex_enter(&rp->r_statelock);
6199 			if (cred != cr) {
6200 				if (rp->r_cred != NULL)
6201 					crfree(rp->r_cred);
6202 				rp->r_cred = cr;
6203 				crhold(cr);
6204 				cred = cr;
6205 				crhold(cred);
6206 				mutex_exit(&rp->r_statelock);
6207 				goto doitagain;
6208 			}
6209 			mutex_exit(&rp->r_statelock);
6210 		}
6211 		/*
6212 		 * Can't do a PURGE_STALE_FH here because this
6213 		 * can cause a deadlock.  nfs3_commit can
6214 		 * be called from nfs3_dispose which can be called
6215 		 * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6216 		 * can call back to pvn_vplist_dirty.
6217 		 */
6218 		if (error == ESTALE) {
6219 			mutex_enter(&rp->r_statelock);
6220 			rp->r_flags |= RSTALE;
6221 			if (!rp->r_error)
6222 				rp->r_error = error;
6223 			mutex_exit(&rp->r_statelock);
6224 			PURGE_ATTRCACHE(vp);
6225 		} else {
6226 			mutex_enter(&rp->r_statelock);
6227 			if (!rp->r_error)
6228 				rp->r_error = error;
6229 			mutex_exit(&rp->r_statelock);
6230 		}
6231 	}
6232 
6233 	return (error);
6234 }
6235 
6236 static void
6237 nfs3_set_mod(vnode_t *vp)
6238 {
6239 	page_t *pp;
6240 	kmutex_t *vphm;
6241 
6242 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6243 	vphm = page_vnode_mutex(vp);
6244 	mutex_enter(vphm);
6245 	if ((pp = vp->v_pages) != NULL) {
6246 		do {
6247 			if (pp->p_fsdata != C_NOCOMMIT) {
6248 				hat_setmod(pp);
6249 				pp->p_fsdata = C_NOCOMMIT;
6250 			}
6251 		} while ((pp = pp->p_vpnext) != vp->v_pages);
6252 	}
6253 	mutex_exit(vphm);
6254 }
6255 
6256 
6257 /*
6258  * This routine is used to gather together a page list of the pages
6259  * which are to be committed on the server.  This routine must not
6260  * be called if the calling thread holds any locked pages.
6261  *
6262  * The calling thread must have set RCOMMIT.  This bit is used to
6263  * serialize access to the commit structure in the rnode.  As long
6264  * as the thread has set RCOMMIT, then it can manipulate the commit
6265  * structure without requiring any other locks.
6266  */
6267 static void
6268 nfs3_get_commit(vnode_t *vp)
6269 {
6270 	rnode_t *rp;
6271 	page_t *pp;
6272 	kmutex_t *vphm;
6273 
6274 	rp = VTOR(vp);
6275 
6276 	ASSERT(rp->r_flags & RCOMMIT);
6277 
6278 	vphm = page_vnode_mutex(vp);
6279 	mutex_enter(vphm);
6280 
6281 	/*
6282 	 * If there are no pages associated with this vnode, then
6283 	 * just return.
6284 	 */
6285 	if ((pp = vp->v_pages) == NULL) {
6286 		mutex_exit(vphm);
6287 		return;
6288 	}
6289 
6290 	/*
6291 	 * Step through all of the pages associated with this vnode
6292 	 * looking for pages which need to be committed.
6293 	 */
6294 	do {
6295 		/*
6296 		 * If this page does not need to be committed or is
6297 		 * modified, then just skip it.
6298 		 */
6299 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6300 			continue;
6301 
6302 		/*
6303 		 * Attempt to lock the page.  If we can't, then
6304 		 * someone else is messing with it and we will
6305 		 * just skip it.
6306 		 */
6307 		if (!page_trylock(pp, SE_EXCL))
6308 			continue;
6309 
6310 		/*
6311 		 * If this page does not need to be committed or is
6312 		 * modified, then just skip it.  Recheck now that
6313 		 * the page is locked.
6314 		 */
6315 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6316 			page_unlock(pp);
6317 			continue;
6318 		}
6319 
6320 		if (PP_ISFREE(pp)) {
6321 			cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6322 			    (void *)pp);
6323 		}
6324 
6325 		/*
6326 		 * The page needs to be committed and we locked it.
6327 		 * Update the base and length parameters and add it
6328 		 * to r_pages.
6329 		 */
6330 		if (rp->r_commit.c_pages == NULL) {
6331 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
6332 			rp->r_commit.c_commlen = PAGESIZE;
6333 		} else if (pp->p_offset < rp->r_commit.c_commbase) {
6334 			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6335 			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
6336 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
6337 		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6338 		    <= pp->p_offset) {
6339 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
6340 			    rp->r_commit.c_commbase + PAGESIZE;
6341 		}
6342 		page_add(&rp->r_commit.c_pages, pp);
6343 	} while ((pp = pp->p_vpnext) != vp->v_pages);
6344 
6345 	mutex_exit(vphm);
6346 }
6347 
6348 /*
6349  * This routine is used to gather together a page list of the pages
6350  * which are to be committed on the server.  This routine must not
6351  * be called if the calling thread holds any locked pages.
6352  *
6353  * The calling thread must have set RCOMMIT.  This bit is used to
6354  * serialize access to the commit structure in the rnode.  As long
6355  * as the thread has set RCOMMIT, then it can manipulate the commit
6356  * structure without requiring any other locks.
6357  */
6358 static void
6359 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
6360 {
6361 
6362 	rnode_t *rp;
6363 	page_t *pp;
6364 	u_offset_t end;
6365 	u_offset_t off;
6366 
6367 	ASSERT(len != 0);
6368 
6369 	rp = VTOR(vp);
6370 
6371 	ASSERT(rp->r_flags & RCOMMIT);
6372 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6373 
6374 	/*
6375 	 * If there are no pages associated with this vnode, then
6376 	 * just return.
6377 	 */
6378 	if ((pp = vp->v_pages) == NULL)
6379 		return;
6380 
6381 	/*
6382 	 * Calculate the ending offset.
6383 	 */
6384 	end = soff + len;
6385 
6386 	for (off = soff; off < end; off += PAGESIZE) {
6387 		/*
6388 		 * Lookup each page by vp, offset.
6389 		 */
6390 		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
6391 			continue;
6392 
6393 		/*
6394 		 * If this page does not need to be committed or is
6395 		 * modified, then just skip it.
6396 		 */
6397 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6398 			page_unlock(pp);
6399 			continue;
6400 		}
6401 
6402 		ASSERT(PP_ISFREE(pp) == 0);
6403 
6404 		/*
6405 		 * The page needs to be committed and we locked it.
6406 		 * Update the base and length parameters and add it
6407 		 * to r_pages.
6408 		 */
6409 		if (rp->r_commit.c_pages == NULL) {
6410 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
6411 			rp->r_commit.c_commlen = PAGESIZE;
6412 		} else {
6413 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
6414 			    rp->r_commit.c_commbase + PAGESIZE;
6415 		}
6416 		page_add(&rp->r_commit.c_pages, pp);
6417 	}
6418 }
6419 
#if 0	/* unused */
#ifdef DEBUG
/*
 * Debug-only check, currently compiled out: report whether every page
 * attached to the vnode is marked C_NOCOMMIT.
 *
 * Returns 1 when no page is awaiting a commit (including the case of a
 * vnode with no pages), and 0 as soon as one such page is found.  The
 * page list is examined under the vnode's page mutex.
 */
static int
nfs3_no_uncommitted_pages(vnode_t *vp)
{
	kmutex_t *vphm = page_vnode_mutex(vp);
	page_t *pp;
	int all_committed = 1;

	mutex_enter(vphm);
	pp = vp->v_pages;
	if (pp != NULL) {
		do {
			if (pp->p_fsdata != C_NOCOMMIT) {
				all_committed = 0;
				break;
			}
			pp = pp->p_vpnext;
		} while (pp != vp->v_pages);
	}
	mutex_exit(vphm);

	return (all_committed);
}
#endif
#endif
6444 
6445 static int
6446 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6447 {
6448 	int error;
6449 	writeverf3 write_verf;
6450 	rnode_t *rp = VTOR(vp);
6451 
6452 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6453 	/*
6454 	 * Flush the data portion of the file and then commit any
6455 	 * portions which need to be committed.  This may need to
6456 	 * be done twice if the server has changed state since
6457 	 * data was last written.  The data will need to be
6458 	 * rewritten to the server and then a new commit done.
6459 	 *
6460 	 * In fact, this may need to be done several times if the
6461 	 * server is having problems and crashing while we are
6462 	 * attempting to do this.
6463 	 */
6464 
6465 top:
6466 	/*
6467 	 * Do a flush based on the poff and plen arguments.  This
6468 	 * will asynchronously write out any modified pages in the
6469 	 * range specified by (poff, plen).  This starts all of the
6470 	 * i/o operations which will be waited for in the next
6471 	 * call to nfs3_putpage
6472 	 */
6473 
6474 	mutex_enter(&rp->r_statelock);
6475 	write_verf = rp->r_verf;
6476 	mutex_exit(&rp->r_statelock);
6477 
6478 	error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6479 	if (error == EAGAIN)
6480 		error = 0;
6481 
6482 	/*
6483 	 * Do a flush based on the poff and plen arguments.  This
6484 	 * will synchronously write out any modified pages in the
6485 	 * range specified by (poff, plen) and wait until all of
6486 	 * the asynchronous i/o's in that range are done as well.
6487 	 */
6488 	if (!error)
6489 		error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6490 
6491 	if (error)
6492 		return (error);
6493 
6494 	mutex_enter(&rp->r_statelock);
6495 	if (rp->r_verf != write_verf) {
6496 		mutex_exit(&rp->r_statelock);
6497 		goto top;
6498 	}
6499 	mutex_exit(&rp->r_statelock);
6500 
6501 	/*
6502 	 * Now commit any pages which might need to be committed.
6503 	 * If the error, NFS_VERF_MISMATCH, is returned, then
6504 	 * start over with the flush operation.
6505 	 */
6506 
6507 	error = nfs3_commit_vp(vp, poff, plen, cr);
6508 
6509 	if (error == NFS_VERF_MISMATCH)
6510 		goto top;
6511 
6512 	return (error);
6513 }
6514 
6515 static int
6516 nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
6517 {
6518 	rnode_t *rp;
6519 	page_t *plist;
6520 	offset3 offset;
6521 	count3 len;
6522 
6523 
6524 	rp = VTOR(vp);
6525 
6526 	if (nfs_zone() != VTOMI(vp)->mi_zone)
6527 		return (EIO);
6528 	/*
6529 	 * Set the `commit inprogress' state bit.  We must
6530 	 * first wait until any current one finishes.
6531 	 */
6532 	mutex_enter(&rp->r_statelock);
6533 	while (rp->r_flags & RCOMMIT) {
6534 		rp->r_flags |= RCOMMITWAIT;
6535 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6536 		rp->r_flags &= ~RCOMMITWAIT;
6537 	}
6538 	rp->r_flags |= RCOMMIT;
6539 	mutex_exit(&rp->r_statelock);
6540 
6541 	/*
6542 	 * Gather together all of the pages which need to be
6543 	 * committed.
6544 	 */
6545 	if (plen == 0)
6546 		nfs3_get_commit(vp);
6547 	else
6548 		nfs3_get_commit_range(vp, poff, plen);
6549 
6550 	/*
6551 	 * Clear the `commit inprogress' bit and disconnect the
6552 	 * page list which was gathered together in nfs3_get_commit.
6553 	 */
6554 	plist = rp->r_commit.c_pages;
6555 	rp->r_commit.c_pages = NULL;
6556 	offset = rp->r_commit.c_commbase;
6557 	len = rp->r_commit.c_commlen;
6558 	mutex_enter(&rp->r_statelock);
6559 	rp->r_flags &= ~RCOMMIT;
6560 	cv_broadcast(&rp->r_commit.c_cv);
6561 	mutex_exit(&rp->r_statelock);
6562 
6563 	/*
6564 	 * If any pages need to be committed, commit them and
6565 	 * then unlock them so that they can be freed some
6566 	 * time later.
6567 	 */
6568 	if (plist != NULL) {
6569 		/*
6570 		 * No error occurred during the flush portion
6571 		 * of this operation, so now attempt to commit
6572 		 * the data to stable storage on the server.
6573 		 *
6574 		 * This will unlock all of the pages on the list.
6575 		 */
6576 		return (nfs3_sync_commit(vp, plist, offset, len, cr));
6577 	}
6578 	return (0);
6579 }
6580 
6581 static int
6582 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6583 	cred_t *cr)
6584 {
6585 	int error;
6586 	page_t *pp;
6587 
6588 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6589 	error = nfs3_commit(vp, offset, count, cr);
6590 
6591 	/*
6592 	 * If we got an error, then just unlock all of the pages
6593 	 * on the list.
6594 	 */
6595 	if (error) {
6596 		while (plist != NULL) {
6597 			pp = plist;
6598 			page_sub(&plist, pp);
6599 			page_unlock(pp);
6600 		}
6601 		return (error);
6602 	}
6603 	/*
6604 	 * We've tried as hard as we can to commit the data to stable
6605 	 * storage on the server.  We just unlock the pages and clear
6606 	 * the commit required state.  They will get freed later.
6607 	 */
6608 	while (plist != NULL) {
6609 		pp = plist;
6610 		page_sub(&plist, pp);
6611 		pp->p_fsdata = C_NOCOMMIT;
6612 		page_unlock(pp);
6613 	}
6614 
6615 	return (error);
6616 }
6617 
6618 static void
6619 nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6620 	cred_t *cr)
6621 {
6622 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6623 	(void) nfs3_sync_commit(vp, plist, offset, count, cr);
6624 }
6625 
6626 /* ARGSUSED */
6627 static int
6628 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6629 	caller_context_t *ct)
6630 {
6631 	int error;
6632 	mntinfo_t *mi;
6633 
6634 	mi = VTOMI(vp);
6635 
6636 	if (nfs_zone() != mi->mi_zone)
6637 		return (EIO);
6638 
6639 	if (mi->mi_flags & MI_ACL) {
6640 		error = acl_setacl3(vp, vsecattr, flag, cr);
6641 		if (mi->mi_flags & MI_ACL)
6642 			return (error);
6643 	}
6644 
6645 	return (ENOSYS);
6646 }
6647 
6648 /* ARGSUSED */
6649 static int
6650 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6651 	caller_context_t *ct)
6652 {
6653 	int error;
6654 	mntinfo_t *mi;
6655 
6656 	mi = VTOMI(vp);
6657 
6658 	if (nfs_zone() != mi->mi_zone)
6659 		return (EIO);
6660 
6661 	if (mi->mi_flags & MI_ACL) {
6662 		error = acl_getacl3(vp, vsecattr, flag, cr);
6663 		if (mi->mi_flags & MI_ACL)
6664 			return (error);
6665 	}
6666 
6667 	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6668 }
6669 
6670 /* ARGSUSED */
6671 static int
6672 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6673 	caller_context_t *ct)
6674 {
6675 	int error;
6676 	struct shrlock nshr;
6677 	struct nfs_owner nfs_owner;
6678 	netobj lm_fh3;
6679 
6680 	if (nfs_zone() != VTOMI(vp)->mi_zone)
6681 		return (EIO);
6682 
6683 	/*
6684 	 * check for valid cmd parameter
6685 	 */
6686 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6687 		return (EINVAL);
6688 
6689 	/*
6690 	 * Check access permissions
6691 	 */
6692 	if (cmd == F_SHARE &&
6693 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6694 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6695 		return (EBADF);
6696 
6697 	/*
6698 	 * If the filesystem is mounted using local locking, pass the
6699 	 * request off to the local share code.
6700 	 */
6701 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
6702 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6703 
6704 	switch (cmd) {
6705 	case F_SHARE:
6706 	case F_UNSHARE:
6707 		lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6708 		lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6709 
6710 		/*
6711 		 * If passed an owner that is too large to fit in an
6712 		 * nfs_owner it is likely a recursive call from the
6713 		 * lock manager client and pass it straight through.  If
6714 		 * it is not a nfs_owner then simply return an error.
6715 		 */
6716 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6717 			if (((struct nfs_owner *)shr->s_owner)->magic !=
6718 			    NFS_OWNER_MAGIC)
6719 				return (EINVAL);
6720 
6721 			if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6722 				error = set_errno(error);
6723 			}
6724 			return (error);
6725 		}
6726 		/*
6727 		 * Remote share reservations owner is a combination of
6728 		 * a magic number, hostname, and the local owner
6729 		 */
6730 		bzero(&nfs_owner, sizeof (nfs_owner));
6731 		nfs_owner.magic = NFS_OWNER_MAGIC;
6732 		(void) strncpy(nfs_owner.hname, uts_nodename(),
6733 		    sizeof (nfs_owner.hname));
6734 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6735 		nshr.s_access = shr->s_access;
6736 		nshr.s_deny = shr->s_deny;
6737 		nshr.s_sysid = 0;
6738 		nshr.s_pid = ttoproc(curthread)->p_pid;
6739 		nshr.s_own_len = sizeof (nfs_owner);
6740 		nshr.s_owner = (caddr_t)&nfs_owner;
6741 
6742 		if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6743 			error = set_errno(error);
6744 		}
6745 
6746 		break;
6747 
6748 	case F_HASREMOTELOCKS:
6749 		/*
6750 		 * NFS client can't store remote locks itself
6751 		 */
6752 		shr->s_access = 0;
6753 		error = 0;
6754 		break;
6755 
6756 	default:
6757 		error = EINVAL;
6758 		break;
6759 	}
6760 
6761 	return (error);
6762 }
6763