xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs3_vnops.c (revision 380789fc80376bd1573770361cb177a08c7e3524)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/time.h>
38 #include <sys/vnode.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/file.h>
42 #include <sys/filio.h>
43 #include <sys/uio.h>
44 #include <sys/buf.h>
45 #include <sys/mman.h>
46 #include <sys/pathname.h>
47 #include <sys/dirent.h>
48 #include <sys/debug.h>
49 #include <sys/vmsystm.h>
50 #include <sys/fcntl.h>
51 #include <sys/flock.h>
52 #include <sys/swap.h>
53 #include <sys/errno.h>
54 #include <sys/strsubr.h>
55 #include <sys/sysmacros.h>
56 #include <sys/kmem.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathconf.h>
59 #include <sys/utsname.h>
60 #include <sys/dnlc.h>
61 #include <sys/acl.h>
62 #include <sys/systeminfo.h>
63 #include <sys/atomic.h>
64 #include <sys/policy.h>
65 #include <sys/sdt.h>
66 
67 #include <rpc/types.h>
68 #include <rpc/auth.h>
69 #include <rpc/clnt.h>
70 
71 #include <nfs/nfs.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75 #include <nfs/lm.h>
76 
77 #include <vm/hat.h>
78 #include <vm/as.h>
79 #include <vm/page.h>
80 #include <vm/pvn.h>
81 #include <vm/seg.h>
82 #include <vm/seg_map.h>
83 #include <vm/seg_kpm.h>
84 #include <vm/seg_vn.h>
85 
86 #include <fs/fs_subr.h>
87 
88 #include <sys/ddi.h>
89 
/*
 * Forward declarations of the file-private (static) helper routines.
 * Only the prototypes appear here; each routine's contract is described
 * at its definition.
 */
static int	nfs3_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs3write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
			stable_how *);
static int	nfs3read(vnode_t *, caddr_t, offset_t, int, size_t *, cred_t *);
static int	nfs3setattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs3_accessx(void *, int, cred_t *);
static int	nfs3lookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs3lookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfs3create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs3excl_create_settimes(vnode_t *, struct vattr *, cred_t *);
static int	nfs3mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *);
static int	nfs3rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *);
static int	do_nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void	nfs3readdir(vnode_t *, rddir_cache *, cred_t *);
static void	nfs3readdirplus(vnode_t *, rddir_cache *, cred_t *);
static int	nfs3_bio(struct buf *, stable_how *, cred_t *);
static int	nfs3_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs3_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs3_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs3_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs3_commit(vnode_t *, offset3, count3, cred_t *);
static void	nfs3_set_mod(vnode_t *);
static void	nfs3_get_commit(vnode_t *);
static void	nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
/*
 * The prototype below is compiled out by the "#if 0"; were it re-enabled
 * it would still only be declared in DEBUG builds.
 */
#if 0 /* unused */
#ifdef DEBUG
static int	nfs3_no_uncommitted_pages(vnode_t *);
#endif
#endif /* unused */
static int	nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int	nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
static int	nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	nfs3_async_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	nfs3_delmap_callback(struct as *, void *, uint_t);
135 
/*
 * Internal pseudo-error codes.  They are negative so that they cannot be
 * confused with a real (positive) errno value, and are used to report
 * special conditions that callers must handle explicitly.
 */
#define	NFS_EOF			(-98)
#define	NFS_VERF_MISMATCH	(-97)
142 
/*
 * ALIGN64 advances the buffer pointer `ptr' to the next 64-bit boundary
 * and reduces the buffer size `sz' by the number of bytes skipped.
 *
 *	x	scratch lvalue; on exit holds the number of bytes skipped
 *		(0 when `ptr' was already aligned)
 *	ptr	byte pointer lvalue to be aligned
 *	sz	size lvalue to be adjusted; the caller must make sure it
 *		is at least as large as the adjustment
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. as the body of an unbraced if/else); invoke it
 * with a trailing semicolon.
 */
#define	ALIGN64(x, ptr, sz)						\
	do {								\
		(x) = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
		if (x) {						\
			(x) = sizeof (uint64_t) - (x);			\
			(sz) -= (x);					\
			(ptr) += (x);					\
		}							\
	} while (0)
151 
152 /*
153  * These are the vnode ops routines which implement the vnode interface to
154  * the networked file system.  These routines just take their parameters,
155  * make them look networkish by putting the right info into interface structs,
156  * and then calling the appropriate remote routine(s) to do the work.
157  *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
162  */
163 
/*
 * Prototypes for the vnode operation entry points.  Each of these is
 * installed in nfs3_vnodeops_template[] under the matching VOPNAME_*
 * entry.
 */
static int	nfs3_open(vnode_t **, int, cred_t *, caller_context_t *);
static int	nfs3_close(vnode_t *, int, int, offset_t, cred_t *,
			caller_context_t *);
static int	nfs3_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs3_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs3_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
			caller_context_t *);
static int	nfs3_getattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs3_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs3_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int	nfs3_readlink(vnode_t *, struct uio *, cred_t *,
			caller_context_t *);
static int	nfs3_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static void	nfs3_inactive(vnode_t *, cred_t *, caller_context_t *);
static int	nfs3_lookup(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *,
			caller_context_t *, int *, pathname_t *);
static int	nfs3_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int, caller_context_t *,
			vsecattr_t *);
static int	nfs3_remove(vnode_t *, char *, cred_t *, caller_context_t *,
			int);
static int	nfs3_link(vnode_t *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs3_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs3_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, caller_context_t *, int, vsecattr_t *);
static int	nfs3_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
			caller_context_t *, int);
static int	nfs3_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *, caller_context_t *, int);
static int	nfs3_readdir(vnode_t *, struct uio *, cred_t *, int *,
			caller_context_t *, int);
static int	nfs3_fid(vnode_t *, fid_t *, caller_context_t *);
static int	nfs3_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs3_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs3_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int	nfs3_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *, caller_context_t *);
static int	nfs3_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
			caller_context_t *);
static int	nfs3_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs3_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs3_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *, caller_context_t *);
static int	nfs3_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs3_realvp(vnode_t *, vnode_t **, caller_context_t *);
static int	nfs3_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int	nfs3_pathconf(vnode_t *, int, ulong_t *, cred_t *,
			caller_context_t *);
static int	nfs3_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *, caller_context_t *);
static void	nfs3_dispose(vnode_t *, page_t *, int, int, cred_t *,
			caller_context_t *);
static int	nfs3_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
static int	nfs3_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
static int	nfs3_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
			caller_context_t *);
234 
/*
 * The NFS version 3 vnode operations vector, handed out by
 * nfs3_getvnodeops().  It is not assigned in this part of the file;
 * presumably it is built from nfs3_vnodeops_template[] when the file
 * system is initialized -- TODO confirm against the init code.
 */
struct vnodeops *nfs3_vnodeops;

/*
 * Template pairing each vnode operation name (VOPNAME_*) with the routine
 * that implements it for NFS version 3.  Most entries are the static
 * nfs3_* routines declared above; VOPNAME_DUMP and VOPNAME_VNEVENT use
 * routines that are not defined in this file (nfs_dump and
 * fs_vnevent_support).  The table is terminated by a NULL, NULL pair.
 */
const fs_operation_def_t nfs3_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs3_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs3_close },
	VOPNAME_READ,		{ .vop_read = nfs3_read },
	VOPNAME_WRITE,		{ .vop_write = nfs3_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs3_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs3_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs3_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs3_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs3_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs3_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs3_remove },
	VOPNAME_LINK,		{ .vop_link = nfs3_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs3_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs3_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs3_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs3_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs3_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs3_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs3_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs3_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs3_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs3_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs3_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs3_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs3_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs3_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs3_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs3_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs3_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs3_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs3_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs3_delmap },
	/* no separate nfs3_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs3_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs3_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs3_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs3_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs3_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs3_shrlock },
	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
	/* end-of-table marker */
	NULL,			NULL
};
281 
282 /*
283  * XXX:  This is referenced in modstubs.s
284  */
285 struct vnodeops *
286 nfs3_getvnodeops(void)
287 {
288 	return (nfs3_vnodeops);
289 }
290 
291 /* ARGSUSED */
292 static int
293 nfs3_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
294 {
295 	int error;
296 	struct vattr va;
297 	rnode_t *rp;
298 	vnode_t *vp;
299 
300 	vp = *vpp;
301 	if (nfs_zone() != VTOMI(vp)->mi_zone)
302 		return (EIO);
303 	rp = VTOR(vp);
304 	mutex_enter(&rp->r_statelock);
305 	if (rp->r_cred == NULL) {
306 		crhold(cr);
307 		rp->r_cred = cr;
308 	}
309 	mutex_exit(&rp->r_statelock);
310 
311 	/*
312 	 * If there is no cached data or if close-to-open
313 	 * consistency checking is turned off, we can avoid
314 	 * the over the wire getattr.  Otherwise, if the
315 	 * file system is mounted readonly, then just verify
316 	 * the caches are up to date using the normal mechanism.
317 	 * Else, if the file is not mmap'd, then just mark
318 	 * the attributes as timed out.  They will be refreshed
319 	 * and the caches validated prior to being used.
320 	 * Else, the file system is mounted writeable so
321 	 * force an over the wire GETATTR in order to ensure
322 	 * that all cached data is valid.
323 	 */
324 	if (vp->v_count > 1 ||
325 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
326 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
327 		if (vn_is_readonly(vp))
328 			error = nfs3_validate_caches(vp, cr);
329 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
330 			PURGE_ATTRCACHE(vp);
331 			error = 0;
332 		} else {
333 			va.va_mask = AT_ALL;
334 			error = nfs3_getattr_otw(vp, &va, cr);
335 		}
336 	} else
337 		error = 0;
338 
339 	return (error);
340 }
341 
342 /* ARGSUSED */
343 static int
344 nfs3_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
345 		caller_context_t *ct)
346 {
347 	rnode_t *rp;
348 	int error;
349 	struct vattr va;
350 
351 	/*
352 	 * zone_enter(2) prevents processes from changing zones with NFS files
353 	 * open; if we happen to get here from the wrong zone we can't do
354 	 * anything over the wire.
355 	 */
356 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
357 		/*
358 		 * We could attempt to clean up locks, except we're sure
359 		 * that the current process didn't acquire any locks on
360 		 * the file: any attempt to lock a file belong to another zone
361 		 * will fail, and one can't lock an NFS file and then change
362 		 * zones, as that fails too.
363 		 *
364 		 * Returning an error here is the sane thing to do.  A
365 		 * subsequent call to VN_RELE() which translates to a
366 		 * nfs3_inactive() will clean up state: if the zone of the
367 		 * vnode's origin is still alive and kicking, an async worker
368 		 * thread will handle the request (from the correct zone), and
369 		 * everything (minus the commit and final nfs3_getattr_otw()
370 		 * call) should be OK. If the zone is going away
371 		 * nfs_async_inactive() will throw away cached pages inline.
372 		 */
373 		return (EIO);
374 	}
375 
376 	/*
377 	 * If we are using local locking for this filesystem, then
378 	 * release all of the SYSV style record locks.  Otherwise,
379 	 * we are doing network locking and we need to release all
380 	 * of the network locks.  All of the locks held by this
381 	 * process on this file are released no matter what the
382 	 * incoming reference count is.
383 	 */
384 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
385 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
386 		cleanshares(vp, ttoproc(curthread)->p_pid);
387 	} else
388 		nfs_lockrelease(vp, flag, offset, cr);
389 
390 	if (count > 1)
391 		return (0);
392 
393 	/*
394 	 * If the file has been `unlinked', then purge the
395 	 * DNLC so that this vnode will get reycled quicker
396 	 * and the .nfs* file on the server will get removed.
397 	 */
398 	rp = VTOR(vp);
399 	if (rp->r_unldvp != NULL)
400 		dnlc_purge_vp(vp);
401 
402 	/*
403 	 * If the file was open for write and there are pages,
404 	 * then if the file system was mounted using the "no-close-
405 	 *	to-open" semantics, then start an asynchronous flush
406 	 *	of the all of the pages in the file.
407 	 * else the file system was not mounted using the "no-close-
408 	 *	to-open" semantics, then do a synchronous flush and
409 	 *	commit of all of the dirty and uncommitted pages.
410 	 *
411 	 * The asynchronous flush of the pages in the "nocto" path
412 	 * mostly just associates a cred pointer with the rnode so
413 	 * writes which happen later will have a better chance of
414 	 * working.  It also starts the data being written to the
415 	 * server, but without unnecessarily delaying the application.
416 	 */
417 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
418 		if (VTOMI(vp)->mi_flags & MI_NOCTO) {
419 			error = nfs3_putpage(vp, (offset_t)0, 0, B_ASYNC,
420 					cr, ct);
421 			if (error == EAGAIN)
422 				error = 0;
423 		} else
424 			error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
425 		if (!error) {
426 			mutex_enter(&rp->r_statelock);
427 			error = rp->r_error;
428 			rp->r_error = 0;
429 			mutex_exit(&rp->r_statelock);
430 		}
431 	} else {
432 		mutex_enter(&rp->r_statelock);
433 		error = rp->r_error;
434 		rp->r_error = 0;
435 		mutex_exit(&rp->r_statelock);
436 	}
437 
438 	/*
439 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
440 	 * refresh the attribute cache with a set of attributes which
441 	 * weren't returned from a WRITE.  This will enable the close-
442 	 * to-open processing to work.
443 	 */
444 	if (rp->r_flags & RWRITEATTR)
445 		(void) nfs3_getattr_otw(vp, &va, cr);
446 
447 	return (error);
448 }
449 
450 /* ARGSUSED */
451 static int
452 nfs3_directio_read(vnode_t *vp, struct uio *uiop, cred_t *cr)
453 {
454 	mntinfo_t *mi;
455 	READ3args args;
456 	READ3uiores res;
457 	int tsize;
458 	offset_t offset;
459 	ssize_t count;
460 	int error;
461 	int douprintf;
462 	failinfo_t fi;
463 	char *sv_hostname;
464 
465 	mi = VTOMI(vp);
466 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
467 	sv_hostname = VTOR(vp)->r_server->sv_hostname;
468 
469 	douprintf = 1;
470 	args.file = *VTOFH3(vp);
471 	fi.vp = vp;
472 	fi.fhp = (caddr_t)&args.file;
473 	fi.copyproc = nfs3copyfh;
474 	fi.lookupproc = nfs3lookup;
475 	fi.xattrdirproc = acl_getxattrdir3;
476 
477 	res.uiop = uiop;
478 
479 	offset = uiop->uio_loffset;
480 	count = uiop->uio_resid;
481 
482 	do {
483 		if (mi->mi_io_kstats) {
484 			mutex_enter(&mi->mi_lock);
485 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
486 			mutex_exit(&mi->mi_lock);
487 		}
488 
489 		do {
490 			tsize = MIN(mi->mi_tsize, count);
491 			args.offset = (offset3)offset;
492 			args.count = (count3)tsize;
493 			res.size = (uint_t)tsize;
494 			error = rfs3call(mi, NFSPROC3_READ,
495 				    xdr_READ3args, (caddr_t)&args,
496 				    xdr_READ3uiores, (caddr_t)&res, cr,
497 				    &douprintf, &res.status, 0, &fi);
498 		} while (error == ENFS_TRYAGAIN);
499 
500 		if (mi->mi_io_kstats) {
501 			mutex_enter(&mi->mi_lock);
502 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
503 			mutex_exit(&mi->mi_lock);
504 		}
505 
506 		if (error)
507 			return (error);
508 
509 		error = geterrno3(res.status);
510 		if (error)
511 			return (error);
512 
513 		if (res.count != res.size) {
514 			zcmn_err(getzoneid(), CE_WARN,
515 "nfs3_directio_read: server %s returned incorrect amount",
516 					sv_hostname);
517 			return (EIO);
518 		}
519 		count -= res.count;
520 		offset += res.count;
521 		if (mi->mi_io_kstats) {
522 			mutex_enter(&mi->mi_lock);
523 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
524 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
525 			mutex_exit(&mi->mi_lock);
526 		}
527 		lwp_stat_update(LWP_STAT_INBLK, 1);
528 	} while (count && !res.eof);
529 
530 	return (0);
531 }
532 
533 /* ARGSUSED */
534 static int
535 nfs3_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
536 	caller_context_t *ct)
537 {
538 	rnode_t *rp;
539 	u_offset_t off;
540 	offset_t diff;
541 	int on;
542 	size_t n;
543 	caddr_t base;
544 	uint_t flags;
545 	int error = 0;
546 	mntinfo_t *mi;
547 
548 	rp = VTOR(vp);
549 	mi = VTOMI(vp);
550 
551 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
552 
553 	if (nfs_zone() != mi->mi_zone)
554 		return (EIO);
555 
556 	if (vp->v_type != VREG)
557 		return (EISDIR);
558 
559 	if (uiop->uio_resid == 0)
560 		return (0);
561 
562 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
563 		return (EINVAL);
564 
565 	/*
566 	 * Bypass VM if caching has been disabled (e.g., locking) or if
567 	 * using client-side direct I/O and the file is not mmap'd and
568 	 * there are no cached pages.
569 	 */
570 	if ((vp->v_flag & VNOCACHE) ||
571 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
572 	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
573 		return (nfs3_directio_read(vp, uiop, cr));
574 	}
575 
576 	do {
577 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
578 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
579 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
580 
581 		error = nfs3_validate_caches(vp, cr);
582 		if (error)
583 			break;
584 
585 		mutex_enter(&rp->r_statelock);
586 		diff = rp->r_size - uiop->uio_loffset;
587 		mutex_exit(&rp->r_statelock);
588 		if (diff <= 0)
589 			break;
590 		if (diff < n)
591 			n = (size_t)diff;
592 
593 		if (vpm_enable) {
594 			/*
595 			 * Copy data.
596 			 */
597 			error = vpm_data_copy(vp, off + on, n, uiop,
598 						1, NULL, 0, S_READ);
599 		} else {
600 			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
601 							S_READ);
602 
603 			error = uiomove(base + on, n, UIO_READ, uiop);
604 		}
605 
606 		if (!error) {
607 			/*
608 			 * If read a whole block or read to eof,
609 			 * won't need this buffer again soon.
610 			 */
611 			mutex_enter(&rp->r_statelock);
612 			if (n + on == MAXBSIZE ||
613 			    uiop->uio_loffset == rp->r_size)
614 				flags = SM_DONTNEED;
615 			else
616 				flags = 0;
617 			mutex_exit(&rp->r_statelock);
618 			if (vpm_enable) {
619 				error = vpm_sync_pages(vp, off, n, flags);
620 			} else {
621 				error = segmap_release(segkmap, base, flags);
622 			}
623 		} else {
624 			if (vpm_enable) {
625 				(void) vpm_sync_pages(vp, off, n, 0);
626 			} else {
627 				(void) segmap_release(segkmap, base, 0);
628 			}
629 		}
630 	} while (!error && uiop->uio_resid > 0);
631 
632 	return (error);
633 }
634 
635 /* ARGSUSED */
636 static int
637 nfs3_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
638 	caller_context_t *ct)
639 {
640 	rlim64_t limit = uiop->uio_llimit;
641 	rnode_t *rp;
642 	u_offset_t off;
643 	caddr_t base;
644 	uint_t flags;
645 	int remainder;
646 	size_t n;
647 	int on;
648 	int error;
649 	int resid;
650 	offset_t offset;
651 	mntinfo_t *mi;
652 	uint_t bsize;
653 
654 	rp = VTOR(vp);
655 
656 	if (vp->v_type != VREG)
657 		return (EISDIR);
658 
659 	mi = VTOMI(vp);
660 	if (nfs_zone() != mi->mi_zone)
661 		return (EIO);
662 	if (uiop->uio_resid == 0)
663 		return (0);
664 
665 	if (ioflag & FAPPEND) {
666 		struct vattr va;
667 
668 		/*
669 		 * Must serialize if appending.
670 		 */
671 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
672 			nfs_rw_exit(&rp->r_rwlock);
673 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
674 			    INTR(vp)))
675 				return (EINTR);
676 		}
677 
678 		va.va_mask = AT_SIZE;
679 		error = nfs3getattr(vp, &va, cr);
680 		if (error)
681 			return (error);
682 		uiop->uio_loffset = va.va_size;
683 	}
684 
685 	offset = uiop->uio_loffset + uiop->uio_resid;
686 
687 	if (uiop->uio_loffset < 0 || offset < 0)
688 		return (EINVAL);
689 
690 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
691 		limit = MAXOFFSET_T;
692 
693 	/*
694 	 * Check to make sure that the process will not exceed
695 	 * its limit on file size.  It is okay to write up to
696 	 * the limit, but not beyond.  Thus, the write which
697 	 * reaches the limit will be short and the next write
698 	 * will return an error.
699 	 */
700 	remainder = 0;
701 	if (offset > limit) {
702 		remainder = offset - limit;
703 		uiop->uio_resid = limit - uiop->uio_loffset;
704 		if (uiop->uio_resid <= 0) {
705 			proc_t *p = ttoproc(curthread);
706 
707 			uiop->uio_resid += remainder;
708 			mutex_enter(&p->p_lock);
709 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
710 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
711 			mutex_exit(&p->p_lock);
712 			return (EFBIG);
713 		}
714 	}
715 
716 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
717 		return (EINTR);
718 
719 	/*
720 	 * Bypass VM if caching has been disabled (e.g., locking) or if
721 	 * using client-side direct I/O and the file is not mmap'd and
722 	 * there are no cached pages.
723 	 */
724 	if ((vp->v_flag & VNOCACHE) ||
725 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
726 	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
727 		size_t bufsize;
728 		int count;
729 		u_offset_t org_offset;
730 		stable_how stab_comm;
731 
732 nfs3_fwrite:
733 		if (rp->r_flags & RSTALE) {
734 			resid = uiop->uio_resid;
735 			offset = uiop->uio_loffset;
736 			error = rp->r_error;
737 			goto bottom;
738 		}
739 		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
740 		base = kmem_alloc(bufsize, KM_SLEEP);
741 		do {
742 			if (ioflag & FDSYNC)
743 				stab_comm = DATA_SYNC;
744 			else
745 				stab_comm = FILE_SYNC;
746 			resid = uiop->uio_resid;
747 			offset = uiop->uio_loffset;
748 			count = MIN(uiop->uio_resid, bufsize);
749 			org_offset = uiop->uio_loffset;
750 			error = uiomove(base, count, UIO_WRITE, uiop);
751 			if (!error) {
752 				error = nfs3write(vp, base, org_offset,
753 				    count, cr, &stab_comm);
754 			}
755 		} while (!error && uiop->uio_resid > 0);
756 		kmem_free(base, bufsize);
757 		goto bottom;
758 	}
759 
760 
761 	bsize = vp->v_vfsp->vfs_bsize;
762 
763 	do {
764 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
765 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
766 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
767 
768 		resid = uiop->uio_resid;
769 		offset = uiop->uio_loffset;
770 
771 		if (rp->r_flags & RSTALE) {
772 			error = rp->r_error;
773 			break;
774 		}
775 
776 		/*
777 		 * Don't create dirty pages faster than they
778 		 * can be cleaned so that the system doesn't
779 		 * get imbalanced.  If the async queue is
780 		 * maxed out, then wait for it to drain before
781 		 * creating more dirty pages.  Also, wait for
782 		 * any threads doing pagewalks in the vop_getattr
783 		 * entry points so that they don't block for
784 		 * long periods.
785 		 */
786 		mutex_enter(&rp->r_statelock);
787 		while ((mi->mi_max_threads != 0 &&
788 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
789 		    rp->r_gcount > 0)
790 			cv_wait(&rp->r_cv, &rp->r_statelock);
791 		mutex_exit(&rp->r_statelock);
792 
793 		if (vpm_enable) {
794 			/*
795 			 * It will use kpm mappings, so no need to
796 			 * pass an address.
797 			 */
798 			error = writerp(rp, NULL, n, uiop, 0);
799 		} else  {
800 			if (segmap_kpm) {
801 				int pon = uiop->uio_loffset & PAGEOFFSET;
802 				size_t pn = MIN(PAGESIZE - pon,
803 							uiop->uio_resid);
804 				int pagecreate;
805 
806 				mutex_enter(&rp->r_statelock);
807 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
808 					uiop->uio_loffset + pn >= rp->r_size);
809 				mutex_exit(&rp->r_statelock);
810 
811 				base = segmap_getmapflt(segkmap, vp, off + on,
812 						pn, !pagecreate, S_WRITE);
813 
814 				error = writerp(rp, base + pon, n, uiop,
815 								pagecreate);
816 
817 			} else {
818 				base = segmap_getmapflt(segkmap, vp, off + on,
819 							n, 0, S_READ);
820 				error = writerp(rp, base + on, n, uiop, 0);
821 			}
822 		}
823 
824 		if (!error) {
825 			if (mi->mi_flags & MI_NOAC)
826 				flags = SM_WRITE;
827 			else if ((uiop->uio_loffset % bsize) == 0 ||
828 			    IS_SWAPVP(vp)) {
829 				/*
830 				 * Have written a whole block.
831 				 * Start an asynchronous write
832 				 * and mark the buffer to
833 				 * indicate that it won't be
834 				 * needed again soon.
835 				 */
836 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
837 			} else
838 				flags = 0;
839 			if ((ioflag & (FSYNC|FDSYNC)) ||
840 			    (rp->r_flags & ROUTOFSPACE)) {
841 				flags &= ~SM_ASYNC;
842 				flags |= SM_WRITE;
843 			}
844 			if (vpm_enable) {
845 				error = vpm_sync_pages(vp, off, n, flags);
846 			} else {
847 				error = segmap_release(segkmap, base, flags);
848 			}
849 		} else {
850 			if (vpm_enable) {
851 				(void) vpm_sync_pages(vp, off, n, 0);
852 			} else {
853 				(void) segmap_release(segkmap, base, 0);
854 			}
855 			/*
856 			 * In the event that we got an access error while
857 			 * faulting in a page for a write-only file just
858 			 * force a write.
859 			 */
860 			if (error == EACCES)
861 				goto nfs3_fwrite;
862 		}
863 	} while (!error && uiop->uio_resid > 0);
864 
865 bottom:
866 	if (error) {
867 		uiop->uio_resid = resid + remainder;
868 		uiop->uio_loffset = offset;
869 	} else
870 		uiop->uio_resid += remainder;
871 
872 	nfs_rw_exit(&rp->r_lkserlock);
873 
874 	return (error);
875 }
876 
877 /*
878  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
879  */
880 static int
881 nfs3_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
882 	int flags, cred_t *cr)
883 {
884 	struct buf *bp;
885 	int error;
886 	page_t *savepp;
887 	uchar_t fsdata;
888 	stable_how stab_comm;
889 
890 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
891 	bp = pageio_setup(pp, len, vp, flags);
892 	ASSERT(bp != NULL);
893 
894 	/*
895 	 * pageio_setup should have set b_addr to 0.  This
896 	 * is correct since we want to do I/O on a page
897 	 * boundary.  bp_mapin will use this addr to calculate
898 	 * an offset, and then set b_addr to the kernel virtual
899 	 * address it allocated for us.
900 	 */
901 	ASSERT(bp->b_un.b_addr == 0);
902 
903 	bp->b_edev = 0;
904 	bp->b_dev = 0;
905 	bp->b_lblkno = lbtodb(off);
906 	bp->b_file = vp;
907 	bp->b_offset = (offset_t)off;
908 	bp_mapin(bp);
909 
910 	/*
911 	 * Calculate the desired level of stability to write data
912 	 * on the server and then mark all of the pages to reflect
913 	 * this.
914 	 */
915 	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
916 	    freemem > desfree) {
917 		stab_comm = UNSTABLE;
918 		fsdata = C_DELAYCOMMIT;
919 	} else {
920 		stab_comm = FILE_SYNC;
921 		fsdata = C_NOCOMMIT;
922 	}
923 
924 	savepp = pp;
925 	do {
926 		pp->p_fsdata = fsdata;
927 	} while ((pp = pp->p_next) != savepp);
928 
929 	error = nfs3_bio(bp, &stab_comm, cr);
930 
931 	bp_mapout(bp);
932 	pageio_done(bp);
933 
934 	/*
935 	 * If the server wrote pages in a more stable fashion than
936 	 * was requested, then clear all of the marks in the pages
937 	 * indicating that COMMIT operations were required.
938 	 */
939 	if (stab_comm != UNSTABLE && fsdata == C_DELAYCOMMIT) {
940 		do {
941 			pp->p_fsdata = C_NOCOMMIT;
942 		} while ((pp = pp->p_next) != savepp);
943 	}
944 
945 	return (error);
946 }
947 
948 /*
949  * Write to file.  Writes to remote server in largest size
950  * chunks that the server can handle.  Write is synchronous.
951  */
952 static int
953 nfs3write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
954 	stable_how *stab_comm)
955 {
956 	mntinfo_t *mi;
957 	WRITE3args args;
958 	WRITE3res res;
959 	int error;
960 	int tsize;
961 	rnode_t *rp;
962 	int douprintf;
963 
964 	rp = VTOR(vp);
965 	mi = VTOMI(vp);
966 
967 	ASSERT(nfs_zone() == mi->mi_zone);
968 
969 	args.file = *VTOFH3(vp);
970 	args.stable = *stab_comm;
971 
972 	*stab_comm = FILE_SYNC;
973 
974 	douprintf = 1;
975 
976 	do {
977 		if ((vp->v_flag & VNOCACHE) ||
978 		    (rp->r_flags & RDIRECTIO) ||
979 		    (mi->mi_flags & MI_DIRECTIO))
980 			tsize = MIN(mi->mi_stsize, count);
981 		else
982 			tsize = MIN(mi->mi_curwrite, count);
983 		args.offset = (offset3)offset;
984 		args.count = (count3)tsize;
985 		args.data.data_len = (uint_t)tsize;
986 		args.data.data_val = base;
987 
988 		if (mi->mi_io_kstats) {
989 			mutex_enter(&mi->mi_lock);
990 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
991 			mutex_exit(&mi->mi_lock);
992 		}
993 		args.mblk = NULL;
994 		do {
995 			error = rfs3call(mi, NFSPROC3_WRITE,
996 			    xdr_WRITE3args, (caddr_t)&args,
997 			    xdr_WRITE3res, (caddr_t)&res, cr,
998 			    &douprintf, &res.status, 0, NULL);
999 		} while (error == ENFS_TRYAGAIN);
1000 		if (mi->mi_io_kstats) {
1001 			mutex_enter(&mi->mi_lock);
1002 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1003 			mutex_exit(&mi->mi_lock);
1004 		}
1005 
1006 		if (error)
1007 			return (error);
1008 		error = geterrno3(res.status);
1009 		if (!error) {
1010 			if (res.resok.count > args.count) {
1011 				zcmn_err(getzoneid(), CE_WARN,
1012 				    "nfs3write: server %s wrote %u, "
1013 				    "requested was %u",
1014 				    rp->r_server->sv_hostname,
1015 				    res.resok.count, args.count);
1016 				return (EIO);
1017 			}
1018 			if (res.resok.committed == UNSTABLE) {
1019 				*stab_comm = UNSTABLE;
1020 				if (args.stable == DATA_SYNC ||
1021 				    args.stable == FILE_SYNC) {
1022 					zcmn_err(getzoneid(), CE_WARN,
1023 			"nfs3write: server %s did not commit to stable storage",
1024 					    rp->r_server->sv_hostname);
1025 					return (EIO);
1026 				}
1027 			}
1028 			tsize = (int)res.resok.count;
1029 			count -= tsize;
1030 			base += tsize;
1031 			offset += tsize;
1032 			if (mi->mi_io_kstats) {
1033 				mutex_enter(&mi->mi_lock);
1034 				KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
1035 				KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
1036 				    tsize;
1037 				mutex_exit(&mi->mi_lock);
1038 			}
1039 			lwp_stat_update(LWP_STAT_OUBLK, 1);
1040 			mutex_enter(&rp->r_statelock);
1041 			if (rp->r_flags & RHAVEVERF) {
1042 				if (rp->r_verf != res.resok.verf) {
1043 					nfs3_set_mod(vp);
1044 					rp->r_verf = res.resok.verf;
1045 					/*
1046 					 * If the data was written UNSTABLE,
1047 					 * then might as well stop because
1048 					 * the whole block will have to get
1049 					 * rewritten anyway.
1050 					 */
1051 					if (*stab_comm == UNSTABLE) {
1052 						mutex_exit(&rp->r_statelock);
1053 						break;
1054 					}
1055 				}
1056 			} else {
1057 				rp->r_verf = res.resok.verf;
1058 				rp->r_flags |= RHAVEVERF;
1059 			}
1060 			/*
1061 			 * Mark the attribute cache as timed out and
1062 			 * set RWRITEATTR to indicate that the file
1063 			 * was modified with a WRITE operation and
1064 			 * that the attributes can not be trusted.
1065 			 */
1066 			PURGE_ATTRCACHE_LOCKED(rp);
1067 			rp->r_flags |= RWRITEATTR;
1068 			mutex_exit(&rp->r_statelock);
1069 		}
1070 	} while (!error && count);
1071 
1072 	return (error);
1073 }
1074 
1075 /*
1076  * Read from a file.  Reads data in largest chunks our interface can handle.
1077  */
1078 static int
1079 nfs3read(vnode_t *vp, caddr_t base, offset_t offset, int count,
1080 	size_t *residp, cred_t *cr)
1081 {
1082 	mntinfo_t *mi;
1083 	READ3args args;
1084 	READ3vres res;
1085 	int tsize;
1086 	int error;
1087 	int douprintf;
1088 	failinfo_t fi;
1089 	rnode_t *rp;
1090 	struct vattr va;
1091 	hrtime_t t;
1092 
1093 	rp = VTOR(vp);
1094 	mi = VTOMI(vp);
1095 	ASSERT(nfs_zone() == mi->mi_zone);
1096 	douprintf = 1;
1097 
1098 	args.file = *VTOFH3(vp);
1099 	fi.vp = vp;
1100 	fi.fhp = (caddr_t)&args.file;
1101 	fi.copyproc = nfs3copyfh;
1102 	fi.lookupproc = nfs3lookup;
1103 	fi.xattrdirproc = acl_getxattrdir3;
1104 
1105 	res.pov.fres.vp = vp;
1106 	res.pov.fres.vap = &va;
1107 
1108 	*residp = count;
1109 	do {
1110 		if (mi->mi_io_kstats) {
1111 			mutex_enter(&mi->mi_lock);
1112 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1113 			mutex_exit(&mi->mi_lock);
1114 		}
1115 
1116 		do {
1117 			if ((vp->v_flag & VNOCACHE) ||
1118 			    (rp->r_flags & RDIRECTIO) ||
1119 			    (mi->mi_flags & MI_DIRECTIO))
1120 				tsize = MIN(mi->mi_tsize, count);
1121 			else
1122 				tsize = MIN(mi->mi_curread, count);
1123 			res.data.data_val = base;
1124 			res.data.data_len = tsize;
1125 			args.offset = (offset3)offset;
1126 			args.count = (count3)tsize;
1127 			t = gethrtime();
1128 			error = rfs3call(mi, NFSPROC3_READ,
1129 			    xdr_READ3args, (caddr_t)&args,
1130 			    xdr_READ3vres, (caddr_t)&res, cr,
1131 			    &douprintf, &res.status, 0, &fi);
1132 		} while (error == ENFS_TRYAGAIN);
1133 
1134 		if (mi->mi_io_kstats) {
1135 			mutex_enter(&mi->mi_lock);
1136 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1137 			mutex_exit(&mi->mi_lock);
1138 		}
1139 
1140 		if (error)
1141 			return (error);
1142 
1143 		error = geterrno3(res.status);
1144 		if (error)
1145 			return (error);
1146 
1147 		if (res.count != res.data.data_len) {
1148 			zcmn_err(getzoneid(), CE_WARN,
1149 				"nfs3read: server %s returned incorrect amount",
1150 				rp->r_server->sv_hostname);
1151 			return (EIO);
1152 		}
1153 
1154 		count -= res.count;
1155 		*residp = count;
1156 		base += res.count;
1157 		offset += res.count;
1158 		if (mi->mi_io_kstats) {
1159 			mutex_enter(&mi->mi_lock);
1160 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1161 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.count;
1162 			mutex_exit(&mi->mi_lock);
1163 		}
1164 		lwp_stat_update(LWP_STAT_INBLK, 1);
1165 	} while (count && !res.eof);
1166 
1167 	if (res.pov.attributes) {
1168 		mutex_enter(&rp->r_statelock);
1169 		if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
1170 			mutex_exit(&rp->r_statelock);
1171 			PURGE_ATTRCACHE(vp);
1172 		} else {
1173 			if (rp->r_mtime <= t)
1174 				nfs_attrcache_va(vp, &va);
1175 			mutex_exit(&rp->r_statelock);
1176 		}
1177 	}
1178 
1179 	return (0);
1180 }
1181 
1182 /* ARGSUSED */
1183 static int
1184 nfs3_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1185 	caller_context_t *ct)
1186 {
1187 
1188 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1189 		return (EIO);
1190 	switch (cmd) {
1191 		case _FIODIRECTIO:
1192 			return (nfs_directio(vp, (int)arg, cr));
1193 		default:
1194 			return (ENOTTY);
1195 	}
1196 }
1197 
1198 /* ARGSUSED */
1199 static int
1200 nfs3_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1201 	caller_context_t *ct)
1202 {
1203 	int error;
1204 	rnode_t *rp;
1205 
1206 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1207 		return (EIO);
1208 	/*
1209 	 * If it has been specified that the return value will
1210 	 * just be used as a hint, and we are only being asked
1211 	 * for size, fsid or rdevid, then return the client's
1212 	 * notion of these values without checking to make sure
1213 	 * that the attribute cache is up to date.
1214 	 * The whole point is to avoid an over the wire GETATTR
1215 	 * call.
1216 	 */
1217 	rp = VTOR(vp);
1218 	if (flags & ATTR_HINT) {
1219 		if (vap->va_mask ==
1220 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1221 			mutex_enter(&rp->r_statelock);
1222 			if (vap->va_mask | AT_SIZE)
1223 				vap->va_size = rp->r_size;
1224 			if (vap->va_mask | AT_FSID)
1225 				vap->va_fsid = rp->r_attr.va_fsid;
1226 			if (vap->va_mask | AT_RDEV)
1227 				vap->va_rdev = rp->r_attr.va_rdev;
1228 			mutex_exit(&rp->r_statelock);
1229 			return (0);
1230 		}
1231 	}
1232 
1233 	/*
1234 	 * Only need to flush pages if asking for the mtime
1235 	 * and if there any dirty pages or any outstanding
1236 	 * asynchronous (write) requests for this file.
1237 	 */
1238 	if (vap->va_mask & AT_MTIME) {
1239 		if (vn_has_cached_data(vp) &&
1240 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1241 			mutex_enter(&rp->r_statelock);
1242 			rp->r_gcount++;
1243 			mutex_exit(&rp->r_statelock);
1244 			error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1245 			mutex_enter(&rp->r_statelock);
1246 			if (error && (error == ENOSPC || error == EDQUOT)) {
1247 				if (!rp->r_error)
1248 					rp->r_error = error;
1249 			}
1250 			if (--rp->r_gcount == 0)
1251 				cv_broadcast(&rp->r_cv);
1252 			mutex_exit(&rp->r_statelock);
1253 		}
1254 	}
1255 
1256 	return (nfs3getattr(vp, vap, cr));
1257 }
1258 
1259 /*ARGSUSED4*/
1260 static int
1261 nfs3_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1262 		caller_context_t *ct)
1263 {
1264 	int error;
1265 	struct vattr va;
1266 
1267 	if (vap->va_mask & AT_NOSET)
1268 		return (EINVAL);
1269 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1270 		return (EIO);
1271 
1272 	va.va_mask = AT_UID | AT_MODE;
1273 	error = nfs3getattr(vp, &va, cr);
1274 	if (error)
1275 		return (error);
1276 
1277 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs3_accessx,
1278 		vp);
1279 	if (error)
1280 		return (error);
1281 
1282 	return (nfs3setattr(vp, vap, flags, cr));
1283 }
1284 
1285 static int
1286 nfs3setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1287 {
1288 	int error;
1289 	uint_t mask;
1290 	SETATTR3args args;
1291 	SETATTR3res res;
1292 	int douprintf;
1293 	rnode_t *rp;
1294 	struct vattr va;
1295 	mode_t omode;
1296 	vsecattr_t *vsp;
1297 	hrtime_t t;
1298 
1299 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1300 	mask = vap->va_mask;
1301 
1302 	rp = VTOR(vp);
1303 
1304 	/*
1305 	 * Only need to flush pages if there are any pages and
1306 	 * if the file is marked as dirty in some fashion.  The
1307 	 * file must be flushed so that we can accurately
1308 	 * determine the size of the file and the cached data
1309 	 * after the SETATTR returns.  A file is considered to
1310 	 * be dirty if it is either marked with RDIRTY, has
1311 	 * outstanding i/o's active, or is mmap'd.  In this
1312 	 * last case, we can't tell whether there are dirty
1313 	 * pages, so we flush just to be sure.
1314 	 */
1315 	if (vn_has_cached_data(vp) &&
1316 	    ((rp->r_flags & RDIRTY) ||
1317 	    rp->r_count > 0 ||
1318 	    rp->r_mapcnt > 0)) {
1319 		ASSERT(vp->v_type != VCHR);
1320 		error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
1321 		if (error && (error == ENOSPC || error == EDQUOT)) {
1322 			mutex_enter(&rp->r_statelock);
1323 			if (!rp->r_error)
1324 				rp->r_error = error;
1325 			mutex_exit(&rp->r_statelock);
1326 		}
1327 	}
1328 
1329 	args.object = *RTOFH3(rp);
1330 	/*
1331 	 * If the intent is for the server to set the times,
1332 	 * there is no point in have the mask indicating set mtime or
1333 	 * atime, because the vap values may be junk, and so result
1334 	 * in an overflow error. Remove these flags from the vap mask
1335 	 * before calling in this case, and restore them afterwards.
1336 	 */
1337 	if ((mask & (AT_ATIME | AT_MTIME)) && !(flags & ATTR_UTIME)) {
1338 		/* Use server times, so don't set the args time fields */
1339 		vap->va_mask &= ~(AT_ATIME | AT_MTIME);
1340 		error = vattr_to_sattr3(vap, &args.new_attributes);
1341 		vap->va_mask |= (mask & (AT_ATIME | AT_MTIME));
1342 		if (mask & AT_ATIME) {
1343 			args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
1344 		}
1345 		if (mask & AT_MTIME) {
1346 			args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
1347 		}
1348 	} else {
1349 		/* Either do not set times or use the client specified times */
1350 		error = vattr_to_sattr3(vap, &args.new_attributes);
1351 	}
1352 
1353 	if (error) {
1354 		/* req time field(s) overflow - return immediately */
1355 		return (error);
1356 	}
1357 
1358 	va.va_mask = AT_MODE | AT_CTIME;
1359 	error = nfs3getattr(vp, &va, cr);
1360 	if (error)
1361 		return (error);
1362 	omode = va.va_mode;
1363 
1364 tryagain:
1365 	if (mask & AT_SIZE) {
1366 		args.guard.check = TRUE;
1367 		args.guard.obj_ctime.seconds = va.va_ctime.tv_sec;
1368 		args.guard.obj_ctime.nseconds = va.va_ctime.tv_nsec;
1369 	} else
1370 		args.guard.check = FALSE;
1371 
1372 	douprintf = 1;
1373 
1374 	t = gethrtime();
1375 
1376 	error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
1377 	    xdr_SETATTR3args, (caddr_t)&args,
1378 	    xdr_SETATTR3res, (caddr_t)&res, cr,
1379 	    &douprintf, &res.status, 0, NULL);
1380 
1381 	/*
1382 	 * Purge the access cache and ACL cache if changing either the
1383 	 * owner of the file, the group owner, or the mode.  These may
1384 	 * change the access permissions of the file, so purge old
1385 	 * information and start over again.
1386 	 */
1387 	if (mask & (AT_UID | AT_GID | AT_MODE)) {
1388 		(void) nfs_access_purge_rp(rp);
1389 		if (rp->r_secattr != NULL) {
1390 			mutex_enter(&rp->r_statelock);
1391 			vsp = rp->r_secattr;
1392 			rp->r_secattr = NULL;
1393 			mutex_exit(&rp->r_statelock);
1394 			if (vsp != NULL)
1395 				nfs_acl_free(vsp);
1396 		}
1397 	}
1398 
1399 	if (error) {
1400 		PURGE_ATTRCACHE(vp);
1401 		return (error);
1402 	}
1403 
1404 	error = geterrno3(res.status);
1405 	if (!error) {
1406 		/*
1407 		 * If changing the size of the file, invalidate
1408 		 * any local cached data which is no longer part
1409 		 * of the file.  We also possibly invalidate the
1410 		 * last page in the file.  We could use
1411 		 * pvn_vpzero(), but this would mark the page as
1412 		 * modified and require it to be written back to
1413 		 * the server for no particularly good reason.
1414 		 * This way, if we access it, then we bring it
1415 		 * back in.  A read should be cheaper than a
1416 		 * write.
1417 		 */
1418 		if (mask & AT_SIZE) {
1419 			nfs_invalidate_pages(vp,
1420 			    (vap->va_size & PAGEMASK), cr);
1421 		}
1422 		nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
1423 		/*
1424 		 * Some servers will change the mode to clear the setuid
1425 		 * and setgid bits when changing the uid or gid.  The
1426 		 * client needs to compensate appropriately.
1427 		 */
1428 		if (mask & (AT_UID | AT_GID)) {
1429 			int terror;
1430 
1431 			va.va_mask = AT_MODE;
1432 			terror = nfs3getattr(vp, &va, cr);
1433 			if (!terror &&
1434 			    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
1435 			    (!(mask & AT_MODE) && va.va_mode != omode))) {
1436 				va.va_mask = AT_MODE;
1437 				if (mask & AT_MODE)
1438 					va.va_mode = vap->va_mode;
1439 				else
1440 					va.va_mode = omode;
1441 				(void) nfs3setattr(vp, &va, 0, cr);
1442 			}
1443 		}
1444 	} else {
1445 		nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
1446 		/*
1447 		 * If we got back a "not synchronized" error, then
1448 		 * we need to retry with a new guard value.  The
1449 		 * guard value used is the change time.  If the
1450 		 * server returned post_op_attr, then we can just
1451 		 * retry because we have the latest attributes.
1452 		 * Otherwise, we issue a GETATTR to get the latest
1453 		 * attributes and then retry.  If we couldn't get
1454 		 * the attributes this way either, then we give
1455 		 * up because we can't complete the operation as
1456 		 * required.
1457 		 */
1458 		if (res.status == NFS3ERR_NOT_SYNC) {
1459 			va.va_mask = AT_CTIME;
1460 			if (nfs3getattr(vp, &va, cr) == 0)
1461 				goto tryagain;
1462 		}
1463 		PURGE_STALE_FH(error, vp, cr);
1464 	}
1465 
1466 	return (error);
1467 }
1468 
1469 static int
1470 nfs3_accessx(void *vp, int mode, cred_t *cr)
1471 {
1472 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1473 	return (nfs3_access(vp, mode, 0, cr, NULL));
1474 }
1475 
1476 /* ARGSUSED */
1477 static int
1478 nfs3_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1479 {
1480 	int error;
1481 	ACCESS3args args;
1482 	ACCESS3res res;
1483 	int douprintf;
1484 	uint32 acc;
1485 	rnode_t *rp;
1486 	cred_t *cred, *ncr, *ncrfree = NULL;
1487 	failinfo_t fi;
1488 	nfs_access_type_t cacc;
1489 	hrtime_t t;
1490 
1491 	acc = 0;
1492 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1493 		return (EIO);
1494 	if (mode & VREAD)
1495 		acc |= ACCESS3_READ;
1496 	if (mode & VWRITE) {
1497 		if (vn_is_readonly(vp) && !IS_DEVVP(vp))
1498 			return (EROFS);
1499 		if (vp->v_type == VDIR)
1500 			acc |= ACCESS3_DELETE;
1501 		acc |= ACCESS3_MODIFY | ACCESS3_EXTEND;
1502 	}
1503 	if (mode & VEXEC) {
1504 		if (vp->v_type == VDIR)
1505 			acc |= ACCESS3_LOOKUP;
1506 		else
1507 			acc |= ACCESS3_EXECUTE;
1508 	}
1509 
1510 	rp = VTOR(vp);
1511 	args.object = *VTOFH3(vp);
1512 	if (vp->v_type == VDIR) {
1513 		args.access = ACCESS3_READ | ACCESS3_DELETE | ACCESS3_MODIFY |
1514 		    ACCESS3_EXTEND | ACCESS3_LOOKUP;
1515 	} else {
1516 		args.access = ACCESS3_READ | ACCESS3_MODIFY | ACCESS3_EXTEND |
1517 		    ACCESS3_EXECUTE;
1518 	}
1519 	fi.vp = vp;
1520 	fi.fhp = (caddr_t)&args.object;
1521 	fi.copyproc = nfs3copyfh;
1522 	fi.lookupproc = nfs3lookup;
1523 	fi.xattrdirproc = acl_getxattrdir3;
1524 
1525 	cred = cr;
1526 	/*
1527 	 * ncr and ncrfree both initially
1528 	 * point to the memory area returned
1529 	 * by crnetadjust();
1530 	 * ncrfree not NULL when exiting means
1531 	 * that we need to release it
1532 	 */
1533 	ncr = crnetadjust(cred);
1534 	ncrfree = ncr;
1535 tryagain:
1536 	if (rp->r_acache != NULL) {
1537 		cacc = nfs_access_check(rp, acc, cred);
1538 		if (cacc == NFS_ACCESS_ALLOWED) {
1539 			if (ncrfree != NULL)
1540 				crfree(ncrfree);
1541 			return (0);
1542 		}
1543 		if (cacc == NFS_ACCESS_DENIED) {
1544 			/*
1545 			 * If the cred can be adjusted, try again
1546 			 * with the new cred.
1547 			 */
1548 			if (ncr != NULL) {
1549 				cred = ncr;
1550 				ncr = NULL;
1551 				goto tryagain;
1552 			}
1553 			if (ncrfree != NULL)
1554 				crfree(ncrfree);
1555 			return (EACCES);
1556 		}
1557 	}
1558 
1559 	douprintf = 1;
1560 
1561 	t = gethrtime();
1562 
1563 	error = rfs3call(VTOMI(vp), NFSPROC3_ACCESS,
1564 	    xdr_ACCESS3args, (caddr_t)&args,
1565 	    xdr_ACCESS3res, (caddr_t)&res, cred,
1566 	    &douprintf, &res.status, 0, &fi);
1567 
1568 	if (error) {
1569 		if (ncrfree != NULL)
1570 			crfree(ncrfree);
1571 		return (error);
1572 	}
1573 
1574 	error = geterrno3(res.status);
1575 	if (!error) {
1576 		nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
1577 		nfs_access_cache(rp, args.access, res.resok.access, cred);
1578 		/*
1579 		 * we just cached results with cred; if cred is the
1580 		 * adjusted credentials from crnetadjust, we do not want
1581 		 * to release them before exiting: hence setting ncrfree
1582 		 * to NULL
1583 		 */
1584 		if (cred != cr)
1585 			ncrfree = NULL;
1586 		if ((acc & res.resok.access) != acc) {
1587 			/*
1588 			 * If the cred can be adjusted, try again
1589 			 * with the new cred.
1590 			 */
1591 			if (ncr != NULL) {
1592 				cred = ncr;
1593 				ncr = NULL;
1594 				goto tryagain;
1595 			}
1596 			error = EACCES;
1597 		}
1598 	} else {
1599 		nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
1600 		PURGE_STALE_FH(error, vp, cr);
1601 	}
1602 
1603 	if (ncrfree != NULL)
1604 		crfree(ncrfree);
1605 
1606 	return (error);
1607 }
1608 
1609 static int nfs3_do_symlink_cache = 1;
1610 
1611 /* ARGSUSED */
1612 static int
1613 nfs3_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1614 {
1615 	int error;
1616 	READLINK3args args;
1617 	READLINK3res res;
1618 	nfspath3 resdata_backup;
1619 	rnode_t *rp;
1620 	int douprintf;
1621 	int len;
1622 	failinfo_t fi;
1623 	hrtime_t t;
1624 
1625 	/*
1626 	 * Can't readlink anything other than a symbolic link.
1627 	 */
1628 	if (vp->v_type != VLNK)
1629 		return (EINVAL);
1630 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1631 		return (EIO);
1632 
1633 	rp = VTOR(vp);
1634 	if (nfs3_do_symlink_cache && rp->r_symlink.contents != NULL) {
1635 		error = nfs3_validate_caches(vp, cr);
1636 		if (error)
1637 			return (error);
1638 		mutex_enter(&rp->r_statelock);
1639 		if (rp->r_symlink.contents != NULL) {
1640 			error = uiomove(rp->r_symlink.contents,
1641 			    rp->r_symlink.len, UIO_READ, uiop);
1642 			mutex_exit(&rp->r_statelock);
1643 			return (error);
1644 		}
1645 		mutex_exit(&rp->r_statelock);
1646 	}
1647 
1648 	args.symlink = *VTOFH3(vp);
1649 	fi.vp = vp;
1650 	fi.fhp = (caddr_t)&args.symlink;
1651 	fi.copyproc = nfs3copyfh;
1652 	fi.lookupproc = nfs3lookup;
1653 	fi.xattrdirproc = acl_getxattrdir3;
1654 
1655 	res.resok.data = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1656 
1657 	resdata_backup = res.resok.data;
1658 
1659 	douprintf = 1;
1660 
1661 	t = gethrtime();
1662 
1663 	error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
1664 	    xdr_nfs_fh3, (caddr_t)&args,
1665 	    xdr_READLINK3res, (caddr_t)&res, cr,
1666 	    &douprintf, &res.status, 0, &fi);
1667 
1668 	if (res.resok.data == nfs3nametoolong)
1669 		error = EINVAL;
1670 
1671 	if (error) {
1672 		kmem_free(resdata_backup, MAXPATHLEN);
1673 		return (error);
1674 	}
1675 
1676 	error = geterrno3(res.status);
1677 	if (!error) {
1678 		nfs3_cache_post_op_attr(vp, &res.resok.symlink_attributes, t,
1679 		    cr);
1680 		len = strlen(res.resok.data);
1681 		error = uiomove(res.resok.data, len, UIO_READ, uiop);
1682 		if (nfs3_do_symlink_cache && rp->r_symlink.contents == NULL) {
1683 			mutex_enter(&rp->r_statelock);
1684 				if (rp->r_symlink.contents == NULL) {
1685 				rp->r_symlink.contents = res.resok.data;
1686 				rp->r_symlink.len = len;
1687 				rp->r_symlink.size = MAXPATHLEN;
1688 				mutex_exit(&rp->r_statelock);
1689 			} else {
1690 				mutex_exit(&rp->r_statelock);
1691 
1692 				kmem_free((void *)res.resok.data, MAXPATHLEN);
1693 			}
1694 		} else {
1695 			kmem_free((void *)res.resok.data, MAXPATHLEN);
1696 		}
1697 	} else {
1698 		nfs3_cache_post_op_attr(vp,
1699 		    &res.resfail.symlink_attributes, t, cr);
1700 		PURGE_STALE_FH(error, vp, cr);
1701 
1702 		kmem_free((void *)res.resok.data, MAXPATHLEN);
1703 
1704 	}
1705 
1706 	/*
1707 	 * The over the wire error for attempting to readlink something
1708 	 * other than a symbolic link is ENXIO.  However, we need to
1709 	 * return EINVAL instead of ENXIO, so we map it here.
1710 	 */
1711 	return (error == ENXIO ? EINVAL : error);
1712 }
1713 
1714 /*
1715  * Flush local dirty pages to stable storage on the server.
1716  *
1717  * If FNODSYNC is specified, then there is nothing to do because
1718  * metadata changes are not cached on the client before being
1719  * sent to the server.
1720  */
1721 /* ARGSUSED */
1722 static int
1723 nfs3_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1724 {
1725 	int error;
1726 
1727 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1728 		return (0);
1729 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1730 		return (EIO);
1731 
1732 	error = nfs3_putpage_commit(vp, (offset_t)0, 0, cr);
1733 	if (!error)
1734 		error = VTOR(vp)->r_error;
1735 	return (error);
1736 }
1737 
1738 /*
1739  * Weirdness: if the file was removed or the target of a rename
1740  * operation while it was open, it got renamed instead.  Here we
1741  * remove the renamed file.
1742  */
1743 /* ARGSUSED */
1744 static void
1745 nfs3_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1746 {
1747 	rnode_t *rp;
1748 
1749 	ASSERT(vp != DNLC_NO_VNODE);
1750 
1751 	/*
1752 	 * If this is coming from the wrong zone, we let someone in the right
1753 	 * zone take care of it asynchronously.  We can get here due to
1754 	 * VN_RELE() being called from pageout() or fsflush().  This call may
1755 	 * potentially turn into an expensive no-op if, for instance, v_count
1756 	 * gets incremented in the meantime, but it's still correct.
1757 	 */
1758 	if (nfs_zone() != VTOMI(vp)->mi_zone) {
1759 		nfs_async_inactive(vp, cr, nfs3_inactive);
1760 		return;
1761 	}
1762 
1763 	rp = VTOR(vp);
1764 redo:
1765 	if (rp->r_unldvp != NULL) {
1766 		/*
1767 		 * Save the vnode pointer for the directory where the
1768 		 * unlinked-open file got renamed, then set it to NULL
1769 		 * to prevent another thread from getting here before
1770 		 * we're done with the remove.  While we have the
1771 		 * statelock, make local copies of the pertinent rnode
1772 		 * fields.  If we weren't to do this in an atomic way, the
1773 		 * the unl* fields could become inconsistent with respect
1774 		 * to each other due to a race condition between this
1775 		 * code and nfs_remove().  See bug report 1034328.
1776 		 */
1777 		mutex_enter(&rp->r_statelock);
1778 		if (rp->r_unldvp != NULL) {
1779 			vnode_t *unldvp;
1780 			char *unlname;
1781 			cred_t *unlcred;
1782 			REMOVE3args args;
1783 			REMOVE3res res;
1784 			int douprintf;
1785 			int error;
1786 			hrtime_t t;
1787 
1788 			unldvp = rp->r_unldvp;
1789 			rp->r_unldvp = NULL;
1790 			unlname = rp->r_unlname;
1791 			rp->r_unlname = NULL;
1792 			unlcred = rp->r_unlcred;
1793 			rp->r_unlcred = NULL;
1794 			mutex_exit(&rp->r_statelock);
1795 
1796 			/*
1797 			 * If there are any dirty pages left, then flush
1798 			 * them.  This is unfortunate because they just
1799 			 * may get thrown away during the remove operation,
1800 			 * but we have to do this for correctness.
1801 			 */
1802 			if (vn_has_cached_data(vp) &&
1803 			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1804 				ASSERT(vp->v_type != VCHR);
1805 				error = nfs3_putpage(vp, (offset_t)0, 0, 0,
1806 						cr, ct);
1807 				if (error) {
1808 					mutex_enter(&rp->r_statelock);
1809 					if (!rp->r_error)
1810 						rp->r_error = error;
1811 					mutex_exit(&rp->r_statelock);
1812 				}
1813 			}
1814 
1815 			/*
1816 			 * Do the remove operation on the renamed file
1817 			 */
1818 			setdiropargs3(&args.object, unlname, unldvp);
1819 
1820 			douprintf = 1;
1821 
1822 			t = gethrtime();
1823 
1824 			error = rfs3call(VTOMI(unldvp), NFSPROC3_REMOVE,
1825 			    xdr_diropargs3, (caddr_t)&args,
1826 			    xdr_REMOVE3res, (caddr_t)&res, unlcred,
1827 			    &douprintf, &res.status, 0, NULL);
1828 
1829 			if (error) {
1830 				PURGE_ATTRCACHE(unldvp);
1831 			} else {
1832 				error = geterrno3(res.status);
1833 				if (!error) {
1834 					nfs3_cache_wcc_data(unldvp,
1835 					    &res.resok.dir_wcc, t, cr);
1836 					if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1837 						nfs_purge_rddir_cache(unldvp);
1838 				} else {
1839 					nfs3_cache_wcc_data(unldvp,
1840 					    &res.resfail.dir_wcc, t, cr);
1841 					PURGE_STALE_FH(error, unldvp, cr);
1842 				}
1843 			}
1844 
1845 			/*
1846 			 * Release stuff held for the remove
1847 			 */
1848 			VN_RELE(unldvp);
1849 			kmem_free(unlname, MAXNAMELEN);
1850 			crfree(unlcred);
1851 			goto redo;
1852 		}
1853 		mutex_exit(&rp->r_statelock);
1854 	}
1855 
1856 	rp_addfree(rp, cr);
1857 }
1858 
1859 /*
1860  * Remote file system operations having to do with directory manipulation.
1861  */
1862 
1863 /* ARGSUSED */
1864 static int
1865 nfs3_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1866 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1867 	int *direntflags, pathname_t *realpnp)
1868 {
1869 	int error;
1870 	vnode_t *vp;
1871 	vnode_t *avp = NULL;
1872 	rnode_t *drp;
1873 
1874 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1875 		return (EPERM);
1876 
1877 	drp = VTOR(dvp);
1878 
1879 	/*
1880 	 * Are we looking up extended attributes?  If so, "dvp" is
1881 	 * the file or directory for which we want attributes, and
1882 	 * we need a lookup of the hidden attribute directory
1883 	 * before we lookup the rest of the path.
1884 	 */
1885 	if (flags & LOOKUP_XATTR) {
1886 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1887 		mntinfo_t *mi;
1888 
1889 		mi = VTOMI(dvp);
1890 		if (!(mi->mi_flags & MI_EXTATTR))
1891 			return (EINVAL);
1892 
1893 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1894 			return (EINTR);
1895 
1896 		(void) nfs3lookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1897 		if (avp == NULL)
1898 			error = acl_getxattrdir3(dvp, &avp, cflag, cr, 0);
1899 		else
1900 			error = 0;
1901 
1902 		nfs_rw_exit(&drp->r_rwlock);
1903 
1904 		if (error) {
1905 			if (mi->mi_flags & MI_EXTATTR)
1906 				return (error);
1907 			return (EINVAL);
1908 		}
1909 		dvp = avp;
1910 		drp = VTOR(dvp);
1911 	}
1912 
1913 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1914 		error = EINTR;
1915 		goto out;
1916 	}
1917 
1918 	error = nfs3lookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1919 
1920 	nfs_rw_exit(&drp->r_rwlock);
1921 
1922 	/*
1923 	 * If vnode is a device, create special vnode.
1924 	 */
1925 	if (!error && IS_DEVVP(*vpp)) {
1926 		vp = *vpp;
1927 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1928 		VN_RELE(vp);
1929 	}
1930 
1931 out:
1932 	if (avp != NULL)
1933 		VN_RELE(avp);
1934 
1935 	return (error);
1936 }
1937 
1938 static int nfs3_lookup_neg_cache = 1;
1939 
1940 #ifdef DEBUG
1941 static int nfs3_lookup_dnlc_hits = 0;
1942 static int nfs3_lookup_dnlc_misses = 0;
1943 static int nfs3_lookup_dnlc_neg_hits = 0;
1944 static int nfs3_lookup_dnlc_disappears = 0;
1945 static int nfs3_lookup_dnlc_lookups = 0;
1946 #endif
1947 
1948 /* ARGSUSED */
1949 int
1950 nfs3lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1951 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1952 {
1953 	int error;
1954 	rnode_t *drp;
1955 
1956 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1957 	/*
1958 	 * If lookup is for "", just return dvp.  Don't need
1959 	 * to send it over the wire, look it up in the dnlc,
1960 	 * or perform any access checks.
1961 	 */
1962 	if (*nm == '\0') {
1963 		VN_HOLD(dvp);
1964 		*vpp = dvp;
1965 		return (0);
1966 	}
1967 
1968 	/*
1969 	 * Can't do lookups in non-directories.
1970 	 */
1971 	if (dvp->v_type != VDIR)
1972 		return (ENOTDIR);
1973 
1974 	/*
1975 	 * If we're called with RFSCALL_SOFT, it's important that
1976 	 * the only rfscall is one we make directly; if we permit
1977 	 * an access call because we're looking up "." or validating
1978 	 * a dnlc hit, we'll deadlock because that rfscall will not
1979 	 * have the RFSCALL_SOFT set.
1980 	 */
1981 	if (rfscall_flags & RFSCALL_SOFT)
1982 		goto callit;
1983 
1984 	/*
1985 	 * If lookup is for ".", just return dvp.  Don't need
1986 	 * to send it over the wire or look it up in the dnlc,
1987 	 * just need to check access.
1988 	 */
1989 	if (strcmp(nm, ".") == 0) {
1990 		error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
1991 		if (error)
1992 			return (error);
1993 		VN_HOLD(dvp);
1994 		*vpp = dvp;
1995 		return (0);
1996 	}
1997 
1998 	drp = VTOR(dvp);
1999 	if (!(drp->r_flags & RLOOKUP)) {
2000 		mutex_enter(&drp->r_statelock);
2001 		drp->r_flags |= RLOOKUP;
2002 		mutex_exit(&drp->r_statelock);
2003 	}
2004 
2005 	/*
2006 	 * Lookup this name in the DNLC.  If there was a valid entry,
2007 	 * then return the results of the lookup.
2008 	 */
2009 	error = nfs3lookup_dnlc(dvp, nm, vpp, cr);
2010 	if (error || *vpp != NULL)
2011 		return (error);
2012 
2013 callit:
2014 	error = nfs3lookup_otw(dvp, nm, vpp, cr, rfscall_flags);
2015 
2016 	return (error);
2017 }
2018 
2019 static int
2020 nfs3lookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
2021 {
2022 	int error;
2023 	vnode_t *vp;
2024 
2025 	ASSERT(*nm != '\0');
2026 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2027 	/*
2028 	 * Lookup this name in the DNLC.  If successful, then validate
2029 	 * the caches and then recheck the DNLC.  The DNLC is rechecked
2030 	 * just in case this entry got invalidated during the call
2031 	 * to nfs3_validate_caches.
2032 	 *
2033 	 * An assumption is being made that it is safe to say that a
2034 	 * file exists which may not on the server.  Any operations to
2035 	 * the server will fail with ESTALE.
2036 	 */
2037 #ifdef DEBUG
2038 	nfs3_lookup_dnlc_lookups++;
2039 #endif
2040 	vp = dnlc_lookup(dvp, nm);
2041 	if (vp != NULL) {
2042 		VN_RELE(vp);
2043 		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
2044 			PURGE_ATTRCACHE(dvp);
2045 		}
2046 		error = nfs3_validate_caches(dvp, cr);
2047 		if (error)
2048 			return (error);
2049 		vp = dnlc_lookup(dvp, nm);
2050 		if (vp != NULL) {
2051 			error = nfs3_access(dvp, VEXEC, 0, cr, NULL);
2052 			if (error) {
2053 				VN_RELE(vp);
2054 				return (error);
2055 			}
2056 			if (vp == DNLC_NO_VNODE) {
2057 				VN_RELE(vp);
2058 #ifdef DEBUG
2059 				nfs3_lookup_dnlc_neg_hits++;
2060 #endif
2061 				return (ENOENT);
2062 			}
2063 			*vpp = vp;
2064 #ifdef DEBUG
2065 			nfs3_lookup_dnlc_hits++;
2066 #endif
2067 			return (0);
2068 		}
2069 #ifdef DEBUG
2070 		nfs3_lookup_dnlc_disappears++;
2071 #endif
2072 	}
2073 #ifdef DEBUG
2074 	else
2075 		nfs3_lookup_dnlc_misses++;
2076 #endif
2077 
2078 	*vpp = NULL;
2079 
2080 	return (0);
2081 }
2082 
2083 static int
2084 nfs3lookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
2085 	int rfscall_flags)
2086 {
2087 	int error;
2088 	LOOKUP3args args;
2089 	LOOKUP3vres res;
2090 	int douprintf;
2091 	struct vattr vattr;
2092 	struct vattr dvattr;
2093 	vnode_t *vp;
2094 	failinfo_t fi;
2095 	hrtime_t t;
2096 
2097 	ASSERT(*nm != '\0');
2098 	ASSERT(dvp->v_type == VDIR);
2099 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2100 
2101 	setdiropargs3(&args.what, nm, dvp);
2102 
2103 	fi.vp = dvp;
2104 	fi.fhp = (caddr_t)&args.what.dir;
2105 	fi.copyproc = nfs3copyfh;
2106 	fi.lookupproc = nfs3lookup;
2107 	fi.xattrdirproc = acl_getxattrdir3;
2108 	res.obj_attributes.fres.vp = dvp;
2109 	res.obj_attributes.fres.vap = &vattr;
2110 	res.dir_attributes.fres.vp = dvp;
2111 	res.dir_attributes.fres.vap = &dvattr;
2112 
2113 	douprintf = 1;
2114 
2115 	t = gethrtime();
2116 
2117 	error = rfs3call(VTOMI(dvp), NFSPROC3_LOOKUP,
2118 	    xdr_diropargs3, (caddr_t)&args,
2119 	    xdr_LOOKUP3vres, (caddr_t)&res, cr,
2120 	    &douprintf, &res.status, rfscall_flags, &fi);
2121 
2122 	if (error)
2123 		return (error);
2124 
2125 	nfs3_cache_post_op_vattr(dvp, &res.dir_attributes, t, cr);
2126 
2127 	error = geterrno3(res.status);
2128 	if (error) {
2129 		PURGE_STALE_FH(error, dvp, cr);
2130 		if (error == ENOENT && nfs3_lookup_neg_cache)
2131 			dnlc_enter(dvp, nm, DNLC_NO_VNODE);
2132 		return (error);
2133 	}
2134 
2135 	if (res.obj_attributes.attributes) {
2136 		vp = makenfs3node_va(&res.object, res.obj_attributes.fres.vap,
2137 				dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2138 	} else {
2139 		vp = makenfs3node_va(&res.object, NULL,
2140 				dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
2141 		if (vp->v_type == VNON) {
2142 			vattr.va_mask = AT_TYPE;
2143 			error = nfs3getattr(vp, &vattr, cr);
2144 			if (error) {
2145 				VN_RELE(vp);
2146 				return (error);
2147 			}
2148 			vp->v_type = vattr.va_type;
2149 		}
2150 	}
2151 
2152 	if (!(rfscall_flags & RFSCALL_SOFT))
2153 		dnlc_update(dvp, nm, vp);
2154 
2155 	*vpp = vp;
2156 
2157 	return (error);
2158 }
2159 
2160 #ifdef DEBUG
2161 static int nfs3_create_misses = 0;
2162 #endif
2163 
2164 /* ARGSUSED */
2165 static int
2166 nfs3_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2167 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
2168 	vsecattr_t *vsecp)
2169 {
2170 	int error;
2171 	vnode_t *vp;
2172 	rnode_t *rp;
2173 	struct vattr vattr;
2174 	rnode_t *drp;
2175 	vnode_t *tempvp;
2176 
2177 	drp = VTOR(dvp);
2178 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2179 		return (EPERM);
2180 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2181 		return (EINTR);
2182 
2183 top:
2184 	/*
2185 	 * We make a copy of the attributes because the caller does not
2186 	 * expect us to change what va points to.
2187 	 */
2188 	vattr = *va;
2189 
2190 	/*
2191 	 * If the pathname is "", just use dvp.  Don't need
2192 	 * to send it over the wire, look it up in the dnlc,
2193 	 * or perform any access checks.
2194 	 */
2195 	if (*nm == '\0') {
2196 		error = 0;
2197 		VN_HOLD(dvp);
2198 		vp = dvp;
2199 	/*
2200 	 * If the pathname is ".", just use dvp.  Don't need
2201 	 * to send it over the wire or look it up in the dnlc,
2202 	 * just need to check access.
2203 	 */
2204 	} else if (strcmp(nm, ".") == 0) {
2205 		error = nfs3_access(dvp, VEXEC, 0, cr, ct);
2206 		if (error) {
2207 			nfs_rw_exit(&drp->r_rwlock);
2208 			return (error);
2209 		}
2210 		VN_HOLD(dvp);
2211 		vp = dvp;
2212 	/*
2213 	 * We need to go over the wire, just to be sure whether the
2214 	 * file exists or not.  Using the DNLC can be dangerous in
2215 	 * this case when making a decision regarding existence.
2216 	 */
2217 	} else {
2218 		error = nfs3lookup_otw(dvp, nm, &vp, cr, 0);
2219 	}
2220 	if (!error) {
2221 		if (exclusive == EXCL)
2222 			error = EEXIST;
2223 		else if (vp->v_type == VDIR && (mode & VWRITE))
2224 			error = EISDIR;
2225 		else {
2226 			/*
2227 			 * If vnode is a device, create special vnode.
2228 			 */
2229 			if (IS_DEVVP(vp)) {
2230 				tempvp = vp;
2231 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2232 				VN_RELE(tempvp);
2233 			}
2234 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2235 				if ((vattr.va_mask & AT_SIZE) &&
2236 				    vp->v_type == VREG) {
2237 					rp = VTOR(vp);
2238 					/*
2239 					 * Check here for large file handled
2240 					 * by LF-unaware process (as
2241 					 * ufs_create() does)
2242 					 */
2243 					if (!(lfaware & FOFFMAX)) {
2244 						mutex_enter(&rp->r_statelock);
2245 						if (rp->r_size > MAXOFF32_T)
2246 							error = EOVERFLOW;
2247 						mutex_exit(&rp->r_statelock);
2248 					}
2249 					if (!error) {
2250 						vattr.va_mask = AT_SIZE;
2251 						error = nfs3setattr(vp,
2252 						    &vattr, 0, cr);
2253 					}
2254 				}
2255 			}
2256 		}
2257 		nfs_rw_exit(&drp->r_rwlock);
2258 		if (error) {
2259 			VN_RELE(vp);
2260 		} else {
2261 			/*
2262 			 * existing file got truncated, notify.
2263 			 */
2264 			vnevent_create(vp, ct);
2265 			*vpp = vp;
2266 		}
2267 		return (error);
2268 	}
2269 
2270 	dnlc_remove(dvp, nm);
2271 
2272 	/*
2273 	 * Decide what the group-id of the created file should be.
2274 	 * Set it in attribute list as advisory...
2275 	 */
2276 	error = setdirgid(dvp, &vattr.va_gid, cr);
2277 	if (error) {
2278 		nfs_rw_exit(&drp->r_rwlock);
2279 		return (error);
2280 	}
2281 	vattr.va_mask |= AT_GID;
2282 
2283 	ASSERT(vattr.va_mask & AT_TYPE);
2284 	if (vattr.va_type == VREG) {
2285 		ASSERT(vattr.va_mask & AT_MODE);
2286 		if (MANDMODE(vattr.va_mode)) {
2287 			nfs_rw_exit(&drp->r_rwlock);
2288 			return (EACCES);
2289 		}
2290 		error = nfs3create(dvp, nm, &vattr, exclusive, mode, vpp, cr,
2291 		    lfaware);
2292 		/*
2293 		 * If this is not an exclusive create, then the CREATE
2294 		 * request will be made with the GUARDED mode set.  This
2295 		 * means that the server will return EEXIST if the file
2296 		 * exists.  The file could exist because of a retransmitted
2297 		 * request.  In this case, we recover by starting over and
2298 		 * checking to see whether the file exists.  This second
2299 		 * time through it should and a CREATE request will not be
2300 		 * sent.
2301 		 *
2302 		 * This handles the problem of a dangling CREATE request
2303 		 * which contains attributes which indicate that the file
2304 		 * should be truncated.  This retransmitted request could
2305 		 * possibly truncate valid data in the file if not caught
2306 		 * by the duplicate request mechanism on the server or if
2307 		 * not caught by other means.  The scenario is:
2308 		 *
2309 		 * Client transmits CREATE request with size = 0
2310 		 * Client times out, retransmits request.
2311 		 * Response to the first request arrives from the server
2312 		 *  and the client proceeds on.
2313 		 * Client writes data to the file.
2314 		 * The server now processes retransmitted CREATE request
2315 		 *  and truncates file.
2316 		 *
2317 		 * The use of the GUARDED CREATE request prevents this from
2318 		 * happening because the retransmitted CREATE would fail
2319 		 * with EEXIST and would not truncate the file.
2320 		 */
2321 		if (error == EEXIST && exclusive == NONEXCL) {
2322 #ifdef DEBUG
2323 			nfs3_create_misses++;
2324 #endif
2325 			goto top;
2326 		}
2327 		nfs_rw_exit(&drp->r_rwlock);
2328 		return (error);
2329 	}
2330 	error = nfs3mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
2331 	nfs_rw_exit(&drp->r_rwlock);
2332 	return (error);
2333 }
2334 
2335 /* ARGSUSED */
2336 static int
2337 nfs3create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2338 	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
2339 {
2340 	int error;
2341 	CREATE3args args;
2342 	CREATE3res res;
2343 	int douprintf;
2344 	vnode_t *vp;
2345 	struct vattr vattr;
2346 	nfstime3 *verfp;
2347 	rnode_t *rp;
2348 	timestruc_t now;
2349 	hrtime_t t;
2350 
2351 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2352 	setdiropargs3(&args.where, nm, dvp);
2353 	if (exclusive == EXCL) {
2354 		args.how.mode = EXCLUSIVE;
2355 		/*
2356 		 * Construct the create verifier.  This verifier needs
2357 		 * to be unique between different clients.  It also needs
2358 		 * to vary for each exclusive create request generated
2359 		 * from the client to the server.
2360 		 *
2361 		 * The first attempt is made to use the hostid and a
2362 		 * unique number on the client.  If the hostid has not
2363 		 * been set, the high resolution time that the exclusive
2364 		 * create request is being made is used.  This will work
2365 		 * unless two different clients, both with the hostid
2366 		 * not set, attempt an exclusive create request on the
2367 		 * same file, at exactly the same clock time.  The
2368 		 * chances of this happening seem small enough to be
2369 		 * reasonable.
2370 		 */
2371 		verfp = (nfstime3 *)&args.how.createhow3_u.verf;
2372 		verfp->seconds = nfs_atoi(hw_serial);
2373 		if (verfp->seconds != 0)
2374 			verfp->nseconds = newnum();
2375 		else {
2376 			gethrestime(&now);
2377 			verfp->seconds = now.tv_sec;
2378 			verfp->nseconds = now.tv_nsec;
2379 		}
2380 		/*
2381 		 * Since the server will use this value for the mtime,
2382 		 * make sure that it can't overflow. Zero out the MSB.
2383 		 * The actual value does not matter here, only its uniqeness.
2384 		 */
2385 		verfp->seconds %= INT32_MAX;
2386 	} else {
2387 		/*
2388 		 * Issue the non-exclusive create in guarded mode.  This
2389 		 * may result in some false EEXIST responses for
2390 		 * retransmitted requests, but these will be handled at
2391 		 * a higher level.  By using GUARDED, duplicate requests
2392 		 * to do file truncation and possible access problems
2393 		 * can be avoided.
2394 		 */
2395 		args.how.mode = GUARDED;
2396 		error = vattr_to_sattr3(va,
2397 				&args.how.createhow3_u.obj_attributes);
2398 		if (error) {
2399 			/* req time field(s) overflow - return immediately */
2400 			return (error);
2401 		}
2402 	}
2403 
2404 	douprintf = 1;
2405 
2406 	t = gethrtime();
2407 
2408 	error = rfs3call(VTOMI(dvp), NFSPROC3_CREATE,
2409 	    xdr_CREATE3args, (caddr_t)&args,
2410 	    xdr_CREATE3res, (caddr_t)&res, cr,
2411 	    &douprintf, &res.status, 0, NULL);
2412 
2413 	if (error) {
2414 		PURGE_ATTRCACHE(dvp);
2415 		return (error);
2416 	}
2417 
2418 	error = geterrno3(res.status);
2419 	if (!error) {
2420 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2421 		if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2422 			nfs_purge_rddir_cache(dvp);
2423 
2424 		/*
2425 		 * On exclusive create the times need to be explicitly
2426 		 * set to clear any potential verifier that may be stored
2427 		 * in one of these fields (see comment below).  This
2428 		 * is done here to cover the case where no post op attrs
2429 		 * were returned or a 'invalid' time was returned in
2430 		 * the attributes.
2431 		 */
2432 		if (exclusive == EXCL)
2433 			va->va_mask |= (AT_MTIME | AT_ATIME);
2434 
2435 		if (!res.resok.obj.handle_follows) {
2436 			error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2437 			if (error)
2438 				return (error);
2439 		} else {
2440 			if (res.resok.obj_attributes.attributes) {
2441 				vp = makenfs3node(&res.resok.obj.handle,
2442 				    &res.resok.obj_attributes.attr,
2443 				    dvp->v_vfsp, t, cr, NULL, NULL);
2444 			} else {
2445 				vp = makenfs3node(&res.resok.obj.handle, NULL,
2446 				    dvp->v_vfsp, t, cr, NULL, NULL);
2447 
2448 				/*
2449 				 * On an exclusive create, it is possible
2450 				 * that attributes were returned but those
2451 				 * postop attributes failed to decode
2452 				 * properly.  If this is the case,
2453 				 * then most likely the atime or mtime
2454 				 * were invalid for our client; this
2455 				 * is caused by the server storing the
2456 				 * create verifier in one of the time
2457 				 * fields(most likely mtime).
2458 				 * So... we are going to setattr just the
2459 				 * atime/mtime to clear things up.
2460 				 */
2461 				if (exclusive == EXCL) {
2462 					if (error =
2463 						nfs3excl_create_settimes(vp,
2464 							va, cr)) {
2465 						/*
2466 						 * Setting the times failed.
2467 						 * Remove the file and return
2468 						 * the error.
2469 						 */
2470 						VN_RELE(vp);
2471 						(void) nfs3_remove(dvp,
2472 							nm, cr, NULL, 0);
2473 						return (error);
2474 					}
2475 				}
2476 
2477 				/*
2478 				 * This handles the non-exclusive case
2479 				 * and the exclusive case where no post op
2480 				 * attrs were returned.
2481 				 */
2482 				if (vp->v_type == VNON) {
2483 					vattr.va_mask = AT_TYPE;
2484 					error = nfs3getattr(vp, &vattr, cr);
2485 					if (error) {
2486 						VN_RELE(vp);
2487 						return (error);
2488 					}
2489 					vp->v_type = vattr.va_type;
2490 				}
2491 			}
2492 			dnlc_update(dvp, nm, vp);
2493 		}
2494 
2495 		rp = VTOR(vp);
2496 
2497 		/*
2498 		 * Check here for large file handled by
2499 		 * LF-unaware process (as ufs_create() does)
2500 		 */
2501 		if ((va->va_mask & AT_SIZE) && vp->v_type == VREG &&
2502 		    !(lfaware & FOFFMAX)) {
2503 			mutex_enter(&rp->r_statelock);
2504 			if (rp->r_size > MAXOFF32_T) {
2505 				mutex_exit(&rp->r_statelock);
2506 				VN_RELE(vp);
2507 				return (EOVERFLOW);
2508 			}
2509 			mutex_exit(&rp->r_statelock);
2510 		}
2511 
2512 		if (exclusive == EXCL &&
2513 			(va->va_mask & ~(AT_GID | AT_SIZE))) {
2514 			/*
2515 			 * If doing an exclusive create, then generate
2516 			 * a SETATTR to set the initial attributes.
2517 			 * Try to set the mtime and the atime to the
2518 			 * server's current time.  It is somewhat
2519 			 * expected that these fields will be used to
2520 			 * store the exclusive create cookie.  If not,
2521 			 * server implementors will need to know that
2522 			 * a SETATTR will follow an exclusive create
2523 			 * and the cookie should be destroyed if
2524 			 * appropriate. This work may have been done
2525 			 * earlier in this function if post op attrs
2526 			 * were not available.
2527 			 *
2528 			 * The AT_GID and AT_SIZE bits are turned off
2529 			 * so that the SETATTR request will not attempt
2530 			 * to process these.  The gid will be set
2531 			 * separately if appropriate.  The size is turned
2532 			 * off because it is assumed that a new file will
2533 			 * be created empty and if the file wasn't empty,
2534 			 * then the exclusive create will have failed
2535 			 * because the file must have existed already.
2536 			 * Therefore, no truncate operation is needed.
2537 			 */
2538 			va->va_mask &= ~(AT_GID | AT_SIZE);
2539 			error = nfs3setattr(vp, va, 0, cr);
2540 			if (error) {
2541 				/*
2542 				 * Couldn't correct the attributes of
2543 				 * the newly created file and the
2544 				 * attributes are wrong.  Remove the
2545 				 * file and return an error to the
2546 				 * application.
2547 				 */
2548 				VN_RELE(vp);
2549 				(void) nfs3_remove(dvp, nm, cr, NULL, 0);
2550 				return (error);
2551 			}
2552 		}
2553 
2554 		if (va->va_gid != rp->r_attr.va_gid) {
2555 			/*
2556 			 * If the gid on the file isn't right, then
2557 			 * generate a SETATTR to attempt to change
2558 			 * it.  This may or may not work, depending
2559 			 * upon the server's semantics for allowing
2560 			 * file ownership changes.
2561 			 */
2562 			va->va_mask = AT_GID;
2563 			(void) nfs3setattr(vp, va, 0, cr);
2564 		}
2565 
2566 		/*
2567 		 * If vnode is a device create special vnode
2568 		 */
2569 		if (IS_DEVVP(vp)) {
2570 			*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2571 			VN_RELE(vp);
2572 		} else
2573 			*vpp = vp;
2574 	} else {
2575 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2576 		PURGE_STALE_FH(error, dvp, cr);
2577 	}
2578 
2579 	return (error);
2580 }
2581 
2582 /*
2583  * Special setattr function to take care of rest of atime/mtime
2584  * after successful exclusive create.  This function exists to avoid
2585  * handling attributes from the server; exclusive the atime/mtime fields
2586  * may be 'invalid' in client's view and therefore can not be trusted.
2587  */
2588 static int
2589 nfs3excl_create_settimes(vnode_t *vp, struct vattr *vap, cred_t *cr)
2590 {
2591 	int error;
2592 	uint_t mask;
2593 	SETATTR3args args;
2594 	SETATTR3res res;
2595 	int douprintf;
2596 	rnode_t *rp;
2597 	hrtime_t t;
2598 
2599 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
2600 	/* save the caller's mask so that it can be reset later */
2601 	mask = vap->va_mask;
2602 
2603 	rp = VTOR(vp);
2604 
2605 	args.object = *RTOFH3(rp);
2606 	args.guard.check = FALSE;
2607 
2608 	/* Use the mask to initialize the arguments */
2609 	vap->va_mask = 0;
2610 	error = vattr_to_sattr3(vap, &args.new_attributes);
2611 
2612 	/* We want to set just atime/mtime on this request */
2613 	args.new_attributes.atime.set_it = SET_TO_SERVER_TIME;
2614 	args.new_attributes.mtime.set_it = SET_TO_SERVER_TIME;
2615 
2616 	douprintf = 1;
2617 
2618 	t = gethrtime();
2619 
2620 	error = rfs3call(VTOMI(vp), NFSPROC3_SETATTR,
2621 	    xdr_SETATTR3args, (caddr_t)&args,
2622 	    xdr_SETATTR3res, (caddr_t)&res, cr,
2623 	    &douprintf, &res.status, 0, NULL);
2624 
2625 	if (error) {
2626 		vap->va_mask = mask;
2627 		return (error);
2628 	}
2629 
2630 	error = geterrno3(res.status);
2631 	if (!error) {
2632 		/*
2633 		 * It is important to pick up the attributes.
2634 		 * Since this is the exclusive create path, the
2635 		 * attributes on the initial create were ignored
2636 		 * and we need these to have the correct info.
2637 		 */
2638 		nfs3_cache_wcc_data(vp, &res.resok.obj_wcc, t, cr);
2639 		/*
2640 		 * No need to do the atime/mtime work again so clear
2641 		 * the bits.
2642 		 */
2643 		mask &= ~(AT_ATIME | AT_MTIME);
2644 	} else {
2645 		nfs3_cache_wcc_data(vp, &res.resfail.obj_wcc, t, cr);
2646 	}
2647 
2648 	vap->va_mask = mask;
2649 
2650 	return (error);
2651 }
2652 
2653 /* ARGSUSED */
2654 static int
2655 nfs3mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
2656 	int mode, vnode_t **vpp, cred_t *cr)
2657 {
2658 	int error;
2659 	MKNOD3args args;
2660 	MKNOD3res res;
2661 	int douprintf;
2662 	vnode_t *vp;
2663 	struct vattr vattr;
2664 	hrtime_t t;
2665 
2666 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
2667 	switch (va->va_type) {
2668 	case VCHR:
2669 	case VBLK:
2670 		setdiropargs3(&args.where, nm, dvp);
2671 		args.what.type = (va->va_type == VCHR) ? NF3CHR : NF3BLK;
2672 		error = vattr_to_sattr3(va,
2673 		    &args.what.mknoddata3_u.device.dev_attributes);
2674 		if (error) {
2675 			/* req time field(s) overflow - return immediately */
2676 			return (error);
2677 		}
2678 		args.what.mknoddata3_u.device.spec.specdata1 =
2679 		    getmajor(va->va_rdev);
2680 		args.what.mknoddata3_u.device.spec.specdata2 =
2681 		    getminor(va->va_rdev);
2682 		break;
2683 
2684 	case VFIFO:
2685 	case VSOCK:
2686 		setdiropargs3(&args.where, nm, dvp);
2687 		args.what.type = (va->va_type == VFIFO) ? NF3FIFO : NF3SOCK;
2688 		error = vattr_to_sattr3(va,
2689 				&args.what.mknoddata3_u.pipe_attributes);
2690 		if (error) {
2691 			/* req time field(s) overflow - return immediately */
2692 			return (error);
2693 		}
2694 		break;
2695 
2696 	default:
2697 		return (EINVAL);
2698 	}
2699 
2700 	douprintf = 1;
2701 
2702 	t = gethrtime();
2703 
2704 	error = rfs3call(VTOMI(dvp), NFSPROC3_MKNOD,
2705 	    xdr_MKNOD3args, (caddr_t)&args,
2706 	    xdr_MKNOD3res, (caddr_t)&res, cr,
2707 	    &douprintf, &res.status, 0, NULL);
2708 
2709 	if (error) {
2710 		PURGE_ATTRCACHE(dvp);
2711 		return (error);
2712 	}
2713 
2714 	error = geterrno3(res.status);
2715 	if (!error) {
2716 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
2717 		if (HAVE_RDDIR_CACHE(VTOR(dvp)))
2718 			nfs_purge_rddir_cache(dvp);
2719 
2720 		if (!res.resok.obj.handle_follows) {
2721 			error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2722 			if (error)
2723 				return (error);
2724 		} else {
2725 			if (res.resok.obj_attributes.attributes) {
2726 				vp = makenfs3node(&res.resok.obj.handle,
2727 				    &res.resok.obj_attributes.attr,
2728 				    dvp->v_vfsp, t, cr, NULL, NULL);
2729 			} else {
2730 				vp = makenfs3node(&res.resok.obj.handle, NULL,
2731 				    dvp->v_vfsp, t, cr, NULL, NULL);
2732 				if (vp->v_type == VNON) {
2733 					vattr.va_mask = AT_TYPE;
2734 					error = nfs3getattr(vp, &vattr, cr);
2735 					if (error) {
2736 						VN_RELE(vp);
2737 						return (error);
2738 					}
2739 					vp->v_type = vattr.va_type;
2740 				}
2741 
2742 			}
2743 			dnlc_update(dvp, nm, vp);
2744 		}
2745 
2746 		if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
2747 			va->va_mask = AT_GID;
2748 			(void) nfs3setattr(vp, va, 0, cr);
2749 		}
2750 
2751 		/*
2752 		 * If vnode is a device create special vnode
2753 		 */
2754 		if (IS_DEVVP(vp)) {
2755 			*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2756 			VN_RELE(vp);
2757 		} else
2758 			*vpp = vp;
2759 	} else {
2760 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
2761 		PURGE_STALE_FH(error, dvp, cr);
2762 	}
2763 	return (error);
2764 }
2765 
2766 /*
2767  * Weirdness: if the vnode to be removed is open
2768  * we rename it instead of removing it and nfs_inactive
2769  * will remove the new name.
2770  */
2771 /* ARGSUSED */
2772 static int
2773 nfs3_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2774 {
2775 	int error;
2776 	REMOVE3args args;
2777 	REMOVE3res res;
2778 	vnode_t *vp;
2779 	char *tmpname;
2780 	int douprintf;
2781 	rnode_t *rp;
2782 	rnode_t *drp;
2783 	hrtime_t t;
2784 
2785 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2786 		return (EPERM);
2787 	drp = VTOR(dvp);
2788 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2789 		return (EINTR);
2790 
2791 	error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2792 	if (error) {
2793 		nfs_rw_exit(&drp->r_rwlock);
2794 		return (error);
2795 	}
2796 
2797 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2798 		VN_RELE(vp);
2799 		nfs_rw_exit(&drp->r_rwlock);
2800 		return (EPERM);
2801 	}
2802 
2803 	/*
2804 	 * First just remove the entry from the name cache, as it
2805 	 * is most likely the only entry for this vp.
2806 	 */
2807 	dnlc_remove(dvp, nm);
2808 
2809 	/*
2810 	 * If the file has a v_count > 1 then there may be more than one
2811 	 * entry in the name cache due multiple links or an open file,
2812 	 * but we don't have the real reference count so flush all
2813 	 * possible entries.
2814 	 */
2815 	if (vp->v_count > 1)
2816 		dnlc_purge_vp(vp);
2817 
2818 	/*
2819 	 * Now we have the real reference count on the vnode
2820 	 */
2821 	rp = VTOR(vp);
2822 	mutex_enter(&rp->r_statelock);
2823 	if (vp->v_count > 1 &&
2824 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2825 		mutex_exit(&rp->r_statelock);
2826 		tmpname = newname();
2827 		error = nfs3rename(dvp, nm, dvp, tmpname, cr, ct);
2828 		if (error)
2829 			kmem_free(tmpname, MAXNAMELEN);
2830 		else {
2831 			mutex_enter(&rp->r_statelock);
2832 			if (rp->r_unldvp == NULL) {
2833 				VN_HOLD(dvp);
2834 				rp->r_unldvp = dvp;
2835 				if (rp->r_unlcred != NULL)
2836 					crfree(rp->r_unlcred);
2837 				crhold(cr);
2838 				rp->r_unlcred = cr;
2839 				rp->r_unlname = tmpname;
2840 			} else {
2841 				kmem_free(rp->r_unlname, MAXNAMELEN);
2842 				rp->r_unlname = tmpname;
2843 			}
2844 			mutex_exit(&rp->r_statelock);
2845 		}
2846 	} else {
2847 		mutex_exit(&rp->r_statelock);
2848 		/*
2849 		 * We need to flush any dirty pages which happen to
2850 		 * be hanging around before removing the file.  This
2851 		 * shouldn't happen very often and mostly on file
2852 		 * systems mounted "nocto".
2853 		 */
2854 		if (vn_has_cached_data(vp) &&
2855 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2856 			error = nfs3_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2857 			if (error && (error == ENOSPC || error == EDQUOT)) {
2858 				mutex_enter(&rp->r_statelock);
2859 				if (!rp->r_error)
2860 					rp->r_error = error;
2861 				mutex_exit(&rp->r_statelock);
2862 			}
2863 		}
2864 
2865 		setdiropargs3(&args.object, nm, dvp);
2866 
2867 		douprintf = 1;
2868 
2869 		t = gethrtime();
2870 
2871 		error = rfs3call(VTOMI(dvp), NFSPROC3_REMOVE,
2872 		    xdr_diropargs3, (caddr_t)&args,
2873 		    xdr_REMOVE3res, (caddr_t)&res, cr,
2874 		    &douprintf, &res.status, 0, NULL);
2875 
2876 		/*
2877 		 * The xattr dir may be gone after last attr is removed,
2878 		 * so flush it from dnlc.
2879 		 */
2880 		if (dvp->v_flag & V_XATTRDIR)
2881 			dnlc_purge_vp(dvp);
2882 
2883 		PURGE_ATTRCACHE(vp);
2884 
2885 		if (error) {
2886 			PURGE_ATTRCACHE(dvp);
2887 		} else {
2888 			error = geterrno3(res.status);
2889 			if (!error) {
2890 				nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t,
2891 				    cr);
2892 				if (HAVE_RDDIR_CACHE(drp))
2893 					nfs_purge_rddir_cache(dvp);
2894 			} else {
2895 				nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc,
2896 				    t, cr);
2897 				PURGE_STALE_FH(error, dvp, cr);
2898 			}
2899 		}
2900 	}
2901 
2902 	if (error == 0) {
2903 		vnevent_remove(vp, dvp, nm, ct);
2904 	}
2905 	VN_RELE(vp);
2906 
2907 	nfs_rw_exit(&drp->r_rwlock);
2908 
2909 	return (error);
2910 }
2911 
2912 /* ARGSUSED */
2913 static int
2914 nfs3_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2915 	caller_context_t *ct, int flags)
2916 {
2917 	int error;
2918 	LINK3args args;
2919 	LINK3res res;
2920 	vnode_t *realvp;
2921 	int douprintf;
2922 	mntinfo_t *mi;
2923 	rnode_t *tdrp;
2924 	hrtime_t t;
2925 
2926 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2927 		return (EPERM);
2928 	if (VOP_REALVP(svp, &realvp, ct) == 0)
2929 		svp = realvp;
2930 
2931 	mi = VTOMI(svp);
2932 
2933 	if (!(mi->mi_flags & MI_LINK))
2934 		return (EOPNOTSUPP);
2935 
2936 	args.file = *VTOFH3(svp);
2937 	setdiropargs3(&args.link, tnm, tdvp);
2938 
2939 	tdrp = VTOR(tdvp);
2940 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2941 		return (EINTR);
2942 
2943 	dnlc_remove(tdvp, tnm);
2944 
2945 	douprintf = 1;
2946 
2947 	t = gethrtime();
2948 
2949 	error = rfs3call(mi, NFSPROC3_LINK,
2950 	    xdr_LINK3args, (caddr_t)&args,
2951 	    xdr_LINK3res, (caddr_t)&res, cr,
2952 	    &douprintf, &res.status, 0, NULL);
2953 
2954 	if (error) {
2955 		PURGE_ATTRCACHE(tdvp);
2956 		PURGE_ATTRCACHE(svp);
2957 		nfs_rw_exit(&tdrp->r_rwlock);
2958 		return (error);
2959 	}
2960 
2961 	error = geterrno3(res.status);
2962 
2963 	if (!error) {
2964 		nfs3_cache_post_op_attr(svp, &res.resok.file_attributes, t, cr);
2965 		nfs3_cache_wcc_data(tdvp, &res.resok.linkdir_wcc, t, cr);
2966 		if (HAVE_RDDIR_CACHE(tdrp))
2967 			nfs_purge_rddir_cache(tdvp);
2968 		dnlc_update(tdvp, tnm, svp);
2969 	} else {
2970 		nfs3_cache_post_op_attr(svp, &res.resfail.file_attributes, t,
2971 		    cr);
2972 		nfs3_cache_wcc_data(tdvp, &res.resfail.linkdir_wcc, t, cr);
2973 		if (error == EOPNOTSUPP) {
2974 			mutex_enter(&mi->mi_lock);
2975 			mi->mi_flags &= ~MI_LINK;
2976 			mutex_exit(&mi->mi_lock);
2977 		}
2978 	}
2979 
2980 	nfs_rw_exit(&tdrp->r_rwlock);
2981 
2982 	if (!error) {
2983 		/*
2984 		 * Notify the source file of this link operation.
2985 		 */
2986 		vnevent_link(svp, ct);
2987 	}
2988 	return (error);
2989 }
2990 
2991 /* ARGSUSED */
2992 static int
2993 nfs3_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2994 	caller_context_t *ct, int flags)
2995 {
2996 	vnode_t *realvp;
2997 
2998 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2999 		return (EPERM);
3000 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
3001 		ndvp = realvp;
3002 
3003 	return (nfs3rename(odvp, onm, ndvp, nnm, cr, ct));
3004 }
3005 
3006 /*
3007  * nfs3rename does the real work of renaming in NFS Version 3.
3008  */
3009 static int
3010 nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
3011     caller_context_t *ct)
3012 {
3013 	int error;
3014 	RENAME3args args;
3015 	RENAME3res res;
3016 	int douprintf;
3017 	vnode_t *nvp = NULL;
3018 	vnode_t *ovp = NULL;
3019 	char *tmpname;
3020 	rnode_t *rp;
3021 	rnode_t *odrp;
3022 	rnode_t *ndrp;
3023 	hrtime_t t;
3024 
3025 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
3026 
3027 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
3028 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
3029 		return (EINVAL);
3030 
3031 	odrp = VTOR(odvp);
3032 	ndrp = VTOR(ndvp);
3033 	if ((intptr_t)odrp < (intptr_t)ndrp) {
3034 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
3035 			return (EINTR);
3036 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
3037 			nfs_rw_exit(&odrp->r_rwlock);
3038 			return (EINTR);
3039 		}
3040 	} else {
3041 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
3042 			return (EINTR);
3043 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
3044 			nfs_rw_exit(&ndrp->r_rwlock);
3045 			return (EINTR);
3046 		}
3047 	}
3048 
3049 	/*
3050 	 * Lookup the target file.  If it exists, it needs to be
3051 	 * checked to see whether it is a mount point and whether
3052 	 * it is active (open).
3053 	 */
3054 	error = nfs3lookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
3055 	if (!error) {
3056 		/*
3057 		 * If this file has been mounted on, then just
3058 		 * return busy because renaming to it would remove
3059 		 * the mounted file system from the name space.
3060 		 */
3061 		if (vn_mountedvfs(nvp) != NULL) {
3062 			VN_RELE(nvp);
3063 			nfs_rw_exit(&odrp->r_rwlock);
3064 			nfs_rw_exit(&ndrp->r_rwlock);
3065 			return (EBUSY);
3066 		}
3067 
3068 		/*
3069 		 * Purge the name cache of all references to this vnode
3070 		 * so that we can check the reference count to infer
3071 		 * whether it is active or not.
3072 		 */
3073 		/*
3074 		 * First just remove the entry from the name cache, as it
3075 		 * is most likely the only entry for this vp.
3076 		 */
3077 		dnlc_remove(ndvp, nnm);
3078 		/*
3079 		 * If the file has a v_count > 1 then there may be more
3080 		 * than one entry in the name cache due multiple links
3081 		 * or an open file, but we don't have the real reference
3082 		 * count so flush all possible entries.
3083 		 */
3084 		if (nvp->v_count > 1)
3085 			dnlc_purge_vp(nvp);
3086 
3087 		/*
3088 		 * If the vnode is active and is not a directory,
3089 		 * arrange to rename it to a
3090 		 * temporary file so that it will continue to be
3091 		 * accessible.  This implements the "unlink-open-file"
3092 		 * semantics for the target of a rename operation.
3093 		 * Before doing this though, make sure that the
3094 		 * source and target files are not already the same.
3095 		 */
3096 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
3097 			/*
3098 			 * Lookup the source name.
3099 			 */
3100 			error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL,
3101 			    cr, 0);
3102 
3103 			/*
3104 			 * The source name *should* already exist.
3105 			 */
3106 			if (error) {
3107 				VN_RELE(nvp);
3108 				nfs_rw_exit(&odrp->r_rwlock);
3109 				nfs_rw_exit(&ndrp->r_rwlock);
3110 				return (error);
3111 			}
3112 
3113 			/*
3114 			 * Compare the two vnodes.  If they are the same,
3115 			 * just release all held vnodes and return success.
3116 			 */
3117 			if (ovp == nvp) {
3118 				VN_RELE(ovp);
3119 				VN_RELE(nvp);
3120 				nfs_rw_exit(&odrp->r_rwlock);
3121 				nfs_rw_exit(&ndrp->r_rwlock);
3122 				return (0);
3123 			}
3124 
3125 			/*
3126 			 * Can't mix and match directories and non-
3127 			 * directories in rename operations.  We already
3128 			 * know that the target is not a directory.  If
3129 			 * the source is a directory, return an error.
3130 			 */
3131 			if (ovp->v_type == VDIR) {
3132 				VN_RELE(ovp);
3133 				VN_RELE(nvp);
3134 				nfs_rw_exit(&odrp->r_rwlock);
3135 				nfs_rw_exit(&ndrp->r_rwlock);
3136 				return (ENOTDIR);
3137 			}
3138 
3139 			/*
3140 			 * The target file exists, is not the same as
3141 			 * the source file, and is active.  Link it
3142 			 * to a temporary filename to avoid having
3143 			 * the server removing the file completely.
3144 			 */
3145 			tmpname = newname();
3146 			error = nfs3_link(ndvp, nvp, tmpname, cr, NULL, 0);
3147 			if (error == EOPNOTSUPP) {
3148 				error = nfs3_rename(ndvp, nnm, ndvp, tmpname,
3149 				    cr, NULL, 0);
3150 			}
3151 			if (error) {
3152 				kmem_free(tmpname, MAXNAMELEN);
3153 				VN_RELE(ovp);
3154 				VN_RELE(nvp);
3155 				nfs_rw_exit(&odrp->r_rwlock);
3156 				nfs_rw_exit(&ndrp->r_rwlock);
3157 				return (error);
3158 			}
3159 			rp = VTOR(nvp);
3160 			mutex_enter(&rp->r_statelock);
3161 			if (rp->r_unldvp == NULL) {
3162 				VN_HOLD(ndvp);
3163 				rp->r_unldvp = ndvp;
3164 				if (rp->r_unlcred != NULL)
3165 					crfree(rp->r_unlcred);
3166 				crhold(cr);
3167 				rp->r_unlcred = cr;
3168 				rp->r_unlname = tmpname;
3169 			} else {
3170 				kmem_free(rp->r_unlname, MAXNAMELEN);
3171 				rp->r_unlname = tmpname;
3172 			}
3173 			mutex_exit(&rp->r_statelock);
3174 		}
3175 	}
3176 
3177 	if (ovp == NULL) {
3178 		/*
3179 		 * When renaming directories to be a subdirectory of a
3180 		 * different parent, the dnlc entry for ".." will no
3181 		 * longer be valid, so it must be removed.
3182 		 *
3183 		 * We do a lookup here to determine whether we are renaming
3184 		 * a directory and we need to check if we are renaming
3185 		 * an unlinked file.  This might have already been done
3186 		 * in previous code, so we check ovp == NULL to avoid
3187 		 * doing it twice.
3188 		 */
3189 
3190 		error = nfs3lookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
3191 		/*
3192 		 * The source name *should* already exist.
3193 		 */
3194 		if (error) {
3195 			nfs_rw_exit(&odrp->r_rwlock);
3196 			nfs_rw_exit(&ndrp->r_rwlock);
3197 			if (nvp) {
3198 				VN_RELE(nvp);
3199 			}
3200 			return (error);
3201 		}
3202 		ASSERT(ovp != NULL);
3203 	}
3204 
3205 	dnlc_remove(odvp, onm);
3206 	dnlc_remove(ndvp, nnm);
3207 
3208 	setdiropargs3(&args.from, onm, odvp);
3209 	setdiropargs3(&args.to, nnm, ndvp);
3210 
3211 	douprintf = 1;
3212 
3213 	t = gethrtime();
3214 
3215 	error = rfs3call(VTOMI(odvp), NFSPROC3_RENAME,
3216 	    xdr_RENAME3args, (caddr_t)&args,
3217 	    xdr_RENAME3res, (caddr_t)&res, cr,
3218 	    &douprintf, &res.status, 0, NULL);
3219 
3220 	if (error) {
3221 		PURGE_ATTRCACHE(odvp);
3222 		PURGE_ATTRCACHE(ndvp);
3223 		VN_RELE(ovp);
3224 		nfs_rw_exit(&odrp->r_rwlock);
3225 		nfs_rw_exit(&ndrp->r_rwlock);
3226 		if (nvp) {
3227 			VN_RELE(nvp);
3228 		}
3229 		return (error);
3230 	}
3231 
3232 	error = geterrno3(res.status);
3233 
3234 	if (!error) {
3235 		nfs3_cache_wcc_data(odvp, &res.resok.fromdir_wcc, t, cr);
3236 		if (HAVE_RDDIR_CACHE(odrp))
3237 			nfs_purge_rddir_cache(odvp);
3238 		if (ndvp != odvp) {
3239 			nfs3_cache_wcc_data(ndvp, &res.resok.todir_wcc, t, cr);
3240 			if (HAVE_RDDIR_CACHE(ndrp))
3241 				nfs_purge_rddir_cache(ndvp);
3242 		}
3243 		/*
3244 		 * when renaming directories to be a subdirectory of a
3245 		 * different parent, the dnlc entry for ".." will no
3246 		 * longer be valid, so it must be removed
3247 		 */
3248 		rp = VTOR(ovp);
3249 		if (ndvp != odvp) {
3250 			if (ovp->v_type == VDIR) {
3251 				dnlc_remove(ovp, "..");
3252 				if (HAVE_RDDIR_CACHE(rp))
3253 					nfs_purge_rddir_cache(ovp);
3254 			}
3255 		}
3256 
3257 		/*
3258 		 * If we are renaming the unlinked file, update the
3259 		 * r_unldvp and r_unlname as needed.
3260 		 */
3261 		mutex_enter(&rp->r_statelock);
3262 		if (rp->r_unldvp != NULL) {
3263 			if (strcmp(rp->r_unlname, onm) == 0) {
3264 				(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
3265 				rp->r_unlname[MAXNAMELEN - 1] = '\0';
3266 
3267 				if (ndvp != rp->r_unldvp) {
3268 					VN_RELE(rp->r_unldvp);
3269 					rp->r_unldvp = ndvp;
3270 					VN_HOLD(ndvp);
3271 				}
3272 			}
3273 		}
3274 		mutex_exit(&rp->r_statelock);
3275 	} else {
3276 		nfs3_cache_wcc_data(odvp, &res.resfail.fromdir_wcc, t, cr);
3277 		if (ndvp != odvp) {
3278 			nfs3_cache_wcc_data(ndvp, &res.resfail.todir_wcc, t,
3279 			    cr);
3280 		}
3281 		/*
3282 		 * System V defines rename to return EEXIST, not
3283 		 * ENOTEMPTY if the target directory is not empty.
3284 		 * Over the wire, the error is NFSERR_ENOTEMPTY
3285 		 * which geterrno maps to ENOTEMPTY.
3286 		 */
3287 		if (error == ENOTEMPTY)
3288 			error = EEXIST;
3289 	}
3290 
3291 	if (error == 0) {
3292 		if (nvp)
3293 			vnevent_rename_dest(nvp, ndvp, nnm, ct);
3294 
3295 		if (odvp != ndvp)
3296 			vnevent_rename_dest_dir(ndvp, ct);
3297 		ASSERT(ovp != NULL);
3298 		vnevent_rename_src(ovp, odvp, onm, ct);
3299 	}
3300 
3301 	if (nvp) {
3302 		VN_RELE(nvp);
3303 	}
3304 	VN_RELE(ovp);
3305 
3306 	nfs_rw_exit(&odrp->r_rwlock);
3307 	nfs_rw_exit(&ndrp->r_rwlock);
3308 
3309 	return (error);
3310 }
3311 
3312 /* ARGSUSED */
3313 static int
3314 nfs3_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
3315 	caller_context_t *ct, int flags, vsecattr_t *vsecp)
3316 {
3317 	int error;
3318 	MKDIR3args args;
3319 	MKDIR3res res;
3320 	int douprintf;
3321 	struct vattr vattr;
3322 	vnode_t *vp;
3323 	rnode_t *drp;
3324 	hrtime_t t;
3325 
3326 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
3327 		return (EPERM);
3328 	setdiropargs3(&args.where, nm, dvp);
3329 
3330 	/*
3331 	 * Decide what the group-id and set-gid bit of the created directory
3332 	 * should be.  May have to do a setattr to get the gid right.
3333 	 */
3334 	error = setdirgid(dvp, &va->va_gid, cr);
3335 	if (error)
3336 		return (error);
3337 	error = setdirmode(dvp, &va->va_mode, cr);
3338 	if (error)
3339 		return (error);
3340 	va->va_mask |= AT_MODE|AT_GID;
3341 
3342 	error = vattr_to_sattr3(va, &args.attributes);
3343 	if (error) {
3344 		/* req time field(s) overflow - return immediately */
3345 		return (error);
3346 	}
3347 
3348 	drp = VTOR(dvp);
3349 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3350 		return (EINTR);
3351 
3352 	dnlc_remove(dvp, nm);
3353 
3354 	douprintf = 1;
3355 
3356 	t = gethrtime();
3357 
3358 	error = rfs3call(VTOMI(dvp), NFSPROC3_MKDIR,
3359 	    xdr_MKDIR3args, (caddr_t)&args,
3360 	    xdr_MKDIR3res, (caddr_t)&res, cr,
3361 	    &douprintf, &res.status, 0, NULL);
3362 
3363 	if (error) {
3364 		PURGE_ATTRCACHE(dvp);
3365 		nfs_rw_exit(&drp->r_rwlock);
3366 		return (error);
3367 	}
3368 
3369 	error = geterrno3(res.status);
3370 	if (!error) {
3371 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3372 		if (HAVE_RDDIR_CACHE(drp))
3373 			nfs_purge_rddir_cache(dvp);
3374 
3375 		if (!res.resok.obj.handle_follows) {
3376 			error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3377 			if (error) {
3378 				nfs_rw_exit(&drp->r_rwlock);
3379 				return (error);
3380 			}
3381 		} else {
3382 			if (res.resok.obj_attributes.attributes) {
3383 				vp = makenfs3node(&res.resok.obj.handle,
3384 				    &res.resok.obj_attributes.attr,
3385 				    dvp->v_vfsp, t, cr, NULL, NULL);
3386 			} else {
3387 				vp = makenfs3node(&res.resok.obj.handle, NULL,
3388 				    dvp->v_vfsp, t, cr, NULL, NULL);
3389 				if (vp->v_type == VNON) {
3390 					vattr.va_mask = AT_TYPE;
3391 					error = nfs3getattr(vp, &vattr, cr);
3392 					if (error) {
3393 						VN_RELE(vp);
3394 						nfs_rw_exit(&drp->r_rwlock);
3395 						return (error);
3396 					}
3397 					vp->v_type = vattr.va_type;
3398 				}
3399 			}
3400 			dnlc_update(dvp, nm, vp);
3401 		}
3402 		if (va->va_gid != VTOR(vp)->r_attr.va_gid) {
3403 			va->va_mask = AT_GID;
3404 			(void) nfs3setattr(vp, va, 0, cr);
3405 		}
3406 		*vpp = vp;
3407 	} else {
3408 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3409 		PURGE_STALE_FH(error, dvp, cr);
3410 	}
3411 
3412 	nfs_rw_exit(&drp->r_rwlock);
3413 
3414 	return (error);
3415 }
3416 
3417 /* ARGSUSED */
3418 static int
3419 nfs3_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
3420 	caller_context_t *ct, int flags)
3421 {
3422 	int error;
3423 	RMDIR3args args;
3424 	RMDIR3res res;
3425 	vnode_t *vp;
3426 	int douprintf;
3427 	rnode_t *drp;
3428 	hrtime_t t;
3429 
3430 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
3431 		return (EPERM);
3432 	drp = VTOR(dvp);
3433 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3434 		return (EINTR);
3435 
3436 	/*
3437 	 * Attempt to prevent a rmdir(".") from succeeding.
3438 	 */
3439 	error = nfs3lookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
3440 	if (error) {
3441 		nfs_rw_exit(&drp->r_rwlock);
3442 		return (error);
3443 	}
3444 
3445 	if (vp == cdir) {
3446 		VN_RELE(vp);
3447 		nfs_rw_exit(&drp->r_rwlock);
3448 		return (EINVAL);
3449 	}
3450 
3451 	setdiropargs3(&args.object, nm, dvp);
3452 
3453 	/*
3454 	 * First just remove the entry from the name cache, as it
3455 	 * is most likely an entry for this vp.
3456 	 */
3457 	dnlc_remove(dvp, nm);
3458 
3459 	/*
3460 	 * If there vnode reference count is greater than one, then
3461 	 * there may be additional references in the DNLC which will
3462 	 * need to be purged.  First, trying removing the entry for
3463 	 * the parent directory and see if that removes the additional
3464 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
3465 	 * to completely remove any references to the directory which
3466 	 * might still exist in the DNLC.
3467 	 */
3468 	if (vp->v_count > 1) {
3469 		dnlc_remove(vp, "..");
3470 		if (vp->v_count > 1)
3471 			dnlc_purge_vp(vp);
3472 	}
3473 
3474 	douprintf = 1;
3475 
3476 	t = gethrtime();
3477 
3478 	error = rfs3call(VTOMI(dvp), NFSPROC3_RMDIR,
3479 	    xdr_diropargs3, (caddr_t)&args,
3480 	    xdr_RMDIR3res, (caddr_t)&res, cr,
3481 	    &douprintf, &res.status, 0, NULL);
3482 
3483 	PURGE_ATTRCACHE(vp);
3484 
3485 	if (error) {
3486 		PURGE_ATTRCACHE(dvp);
3487 		VN_RELE(vp);
3488 		nfs_rw_exit(&drp->r_rwlock);
3489 		return (error);
3490 	}
3491 
3492 	error = geterrno3(res.status);
3493 	if (!error) {
3494 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3495 		if (HAVE_RDDIR_CACHE(drp))
3496 			nfs_purge_rddir_cache(dvp);
3497 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
3498 			nfs_purge_rddir_cache(vp);
3499 	} else {
3500 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3501 		PURGE_STALE_FH(error, dvp, cr);
3502 		/*
3503 		 * System V defines rmdir to return EEXIST, not
3504 		 * ENOTEMPTY if the directory is not empty.  Over
3505 		 * the wire, the error is NFSERR_ENOTEMPTY which
3506 		 * geterrno maps to ENOTEMPTY.
3507 		 */
3508 		if (error == ENOTEMPTY)
3509 			error = EEXIST;
3510 	}
3511 
3512 	if (error == 0) {
3513 		vnevent_rmdir(vp, dvp, nm, ct);
3514 	}
3515 	VN_RELE(vp);
3516 
3517 	nfs_rw_exit(&drp->r_rwlock);
3518 
3519 	return (error);
3520 }
3521 
3522 /* ARGSUSED */
3523 static int
3524 nfs3_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
3525 	caller_context_t *ct, int flags)
3526 {
3527 	int error;
3528 	SYMLINK3args args;
3529 	SYMLINK3res res;
3530 	int douprintf;
3531 	mntinfo_t *mi;
3532 	vnode_t *vp;
3533 	rnode_t *rp;
3534 	char *contents;
3535 	rnode_t *drp;
3536 	hrtime_t t;
3537 
3538 	mi = VTOMI(dvp);
3539 
3540 	if (nfs_zone() != mi->mi_zone)
3541 		return (EPERM);
3542 	if (!(mi->mi_flags & MI_SYMLINK))
3543 		return (EOPNOTSUPP);
3544 
3545 	setdiropargs3(&args.where, lnm, dvp);
3546 	error = vattr_to_sattr3(tva, &args.symlink.symlink_attributes);
3547 	if (error) {
3548 		/* req time field(s) overflow - return immediately */
3549 		return (error);
3550 	}
3551 	args.symlink.symlink_data = tnm;
3552 
3553 	drp = VTOR(dvp);
3554 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
3555 		return (EINTR);
3556 
3557 	dnlc_remove(dvp, lnm);
3558 
3559 	douprintf = 1;
3560 
3561 	t = gethrtime();
3562 
3563 	error = rfs3call(mi, NFSPROC3_SYMLINK,
3564 	    xdr_SYMLINK3args, (caddr_t)&args,
3565 	    xdr_SYMLINK3res, (caddr_t)&res, cr,
3566 	    &douprintf, &res.status, 0, NULL);
3567 
3568 	if (error) {
3569 		PURGE_ATTRCACHE(dvp);
3570 		nfs_rw_exit(&drp->r_rwlock);
3571 		return (error);
3572 	}
3573 
3574 	error = geterrno3(res.status);
3575 	if (!error) {
3576 		nfs3_cache_wcc_data(dvp, &res.resok.dir_wcc, t, cr);
3577 		if (HAVE_RDDIR_CACHE(drp))
3578 			nfs_purge_rddir_cache(dvp);
3579 
3580 		if (res.resok.obj.handle_follows) {
3581 			if (res.resok.obj_attributes.attributes) {
3582 				vp = makenfs3node(&res.resok.obj.handle,
3583 				    &res.resok.obj_attributes.attr,
3584 				    dvp->v_vfsp, t, cr, NULL, NULL);
3585 			} else {
3586 				vp = makenfs3node(&res.resok.obj.handle, NULL,
3587 				    dvp->v_vfsp, t, cr, NULL, NULL);
3588 				vp->v_type = VLNK;
3589 				vp->v_rdev = 0;
3590 			}
3591 			dnlc_update(dvp, lnm, vp);
3592 			rp = VTOR(vp);
3593 			if (nfs3_do_symlink_cache &&
3594 			    rp->r_symlink.contents == NULL) {
3595 
3596 				contents = kmem_alloc(MAXPATHLEN,
3597 				    KM_NOSLEEP);
3598 
3599 				if (contents != NULL) {
3600 					mutex_enter(&rp->r_statelock);
3601 					if (rp->r_symlink.contents == NULL) {
3602 						rp->r_symlink.len = strlen(tnm);
3603 						bcopy(tnm, contents,
3604 						    rp->r_symlink.len);
3605 						rp->r_symlink.contents =
3606 						    contents;
3607 						rp->r_symlink.size = MAXPATHLEN;
3608 						mutex_exit(&rp->r_statelock);
3609 					} else {
3610 						mutex_exit(&rp->r_statelock);
3611 						kmem_free((void *)contents,
3612 							    MAXPATHLEN);
3613 					}
3614 				}
3615 			}
3616 			VN_RELE(vp);
3617 		}
3618 	} else {
3619 		nfs3_cache_wcc_data(dvp, &res.resfail.dir_wcc, t, cr);
3620 		PURGE_STALE_FH(error, dvp, cr);
3621 		if (error == EOPNOTSUPP) {
3622 			mutex_enter(&mi->mi_lock);
3623 			mi->mi_flags &= ~MI_SYMLINK;
3624 			mutex_exit(&mi->mi_lock);
3625 		}
3626 	}
3627 
3628 	nfs_rw_exit(&drp->r_rwlock);
3629 
3630 	return (error);
3631 }
3632 
#ifdef DEBUG
/*
 * DEBUG-only event counters maintained by nfs3_readdir().  They have
 * static storage duration and therefore start out as zero.
 */
static int nfs3_readdir_cache_hits;	/* served from a filled cache entry */
static int nfs3_readdir_cache_shorts;	/* cookie matched the cached EOF */
static int nfs3_readdir_cache_waits;	/* waited for an entry being filled */
static int nfs3_readdir_cache_misses;	/* had to issue a readdir ourselves */
static int nfs3_readdir_readahead;	/* async readahead requests started */
#endif

/*
 * When nonzero, nfs3_readdir() caps the size of a directory read
 * request at 1024 bytes instead of MAXBSIZE.
 */
static int nfs3_shrinkreaddir = 0;
3642 
3643 /*
3644  * Read directory entries.
3645  * There are some weird things to look out for here.  The uio_loffset
3646  * field is either 0 or it is the offset returned from a previous
3647  * readdir.  It is an opaque value used by the server to find the
3648  * correct directory block to read. The count field is the number
3649  * of blocks to read on the server.  This is advisory only, the server
3650  * may return only one block's worth of entries.  Entries may be compressed
3651  * on the server.
3652  */
3653 /* ARGSUSED */
3654 static int
3655 nfs3_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
3656 	caller_context_t *ct, int flags)
3657 {
3658 	int error;
3659 	size_t count;
3660 	rnode_t *rp;
3661 	rddir_cache *rdc;
3662 	rddir_cache *nrdc;
3663 	rddir_cache *rrdc;
3664 #ifdef DEBUG
3665 	int missed;
3666 #endif
3667 	int doreadahead;
3668 	rddir_cache srdc;
3669 	avl_index_t where;
3670 
3671 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3672 		return (EIO);
3673 	rp = VTOR(vp);
3674 
3675 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
3676 
3677 	/*
3678 	 * Make sure that the directory cache is valid.
3679 	 */
3680 	if (HAVE_RDDIR_CACHE(rp)) {
3681 		if (nfs_disable_rddir_cache) {
3682 			/*
3683 			 * Setting nfs_disable_rddir_cache in /etc/system
3684 			 * allows interoperability with servers that do not
3685 			 * properly update the attributes of directories.
3686 			 * Any cached information gets purged before an
3687 			 * access is made to it.
3688 			 */
3689 			nfs_purge_rddir_cache(vp);
3690 		} else {
3691 			error = nfs3_validate_caches(vp, cr);
3692 			if (error)
3693 				return (error);
3694 		}
3695 	}
3696 
3697 	/*
3698 	 * It is possible that some servers may not be able to correctly
3699 	 * handle a large READDIR or READDIRPLUS request due to bugs in
3700 	 * their implementation.  In order to continue to interoperate
3701 	 * with them, this workaround is provided to limit the maximum
3702 	 * size of a READDIRPLUS request to 1024.  In any case, the request
3703 	 * size is limited to MAXBSIZE.
3704 	 */
3705 	count = MIN(uiop->uio_iov->iov_len,
3706 	    nfs3_shrinkreaddir ? 1024 : MAXBSIZE);
3707 
3708 	nrdc = NULL;
3709 #ifdef DEBUG
3710 	missed = 0;
3711 #endif
3712 top:
3713 	/*
3714 	 * Short circuit last readdir which always returns 0 bytes.
3715 	 * This can be done after the directory has been read through
3716 	 * completely at least once.  This will set r_direof which
3717 	 * can be used to find the value of the last cookie.
3718 	 */
3719 	mutex_enter(&rp->r_statelock);
3720 	if (rp->r_direof != NULL &&
3721 	    uiop->uio_loffset == rp->r_direof->nfs3_ncookie) {
3722 		mutex_exit(&rp->r_statelock);
3723 #ifdef DEBUG
3724 		nfs3_readdir_cache_shorts++;
3725 #endif
3726 		if (eofp)
3727 			*eofp = 1;
3728 		if (nrdc != NULL)
3729 			rddir_cache_rele(nrdc);
3730 		return (0);
3731 	}
3732 	/*
3733 	 * Look for a cache entry.  Cache entries are identified
3734 	 * by the NFS cookie value and the byte count requested.
3735 	 */
3736 	srdc.nfs3_cookie = uiop->uio_loffset;
3737 	srdc.buflen = count;
3738 	rdc = avl_find(&rp->r_dir, &srdc, &where);
3739 	if (rdc != NULL) {
3740 		rddir_cache_hold(rdc);
3741 		/*
3742 		 * If the cache entry is in the process of being
3743 		 * filled in, wait until this completes.  The
3744 		 * RDDIRWAIT bit is set to indicate that someone
3745 		 * is waiting and then the thread currently
3746 		 * filling the entry is done, it should do a
3747 		 * cv_broadcast to wakeup all of the threads
3748 		 * waiting for it to finish.
3749 		 */
3750 		if (rdc->flags & RDDIR) {
3751 			nfs_rw_exit(&rp->r_rwlock);
3752 			rdc->flags |= RDDIRWAIT;
3753 #ifdef DEBUG
3754 			nfs3_readdir_cache_waits++;
3755 #endif
3756 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3757 				/*
3758 				 * We got interrupted, probably
3759 				 * the user typed ^C or an alarm
3760 				 * fired.  We free the new entry
3761 				 * if we allocated one.
3762 				 */
3763 				mutex_exit(&rp->r_statelock);
3764 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
3765 					RW_READER, FALSE);
3766 				rddir_cache_rele(rdc);
3767 				if (nrdc != NULL)
3768 					rddir_cache_rele(nrdc);
3769 				return (EINTR);
3770 			}
3771 			mutex_exit(&rp->r_statelock);
3772 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
3773 				RW_READER, FALSE);
3774 			rddir_cache_rele(rdc);
3775 			goto top;
3776 		}
3777 		/*
3778 		 * Check to see if a readdir is required to
3779 		 * fill the entry.  If so, mark this entry
3780 		 * as being filled, remove our reference,
3781 		 * and branch to the code to fill the entry.
3782 		 */
3783 		if (rdc->flags & RDDIRREQ) {
3784 			rdc->flags &= ~RDDIRREQ;
3785 			rdc->flags |= RDDIR;
3786 			if (nrdc != NULL)
3787 				rddir_cache_rele(nrdc);
3788 			nrdc = rdc;
3789 			mutex_exit(&rp->r_statelock);
3790 			goto bottom;
3791 		}
3792 #ifdef DEBUG
3793 		if (!missed)
3794 			nfs3_readdir_cache_hits++;
3795 #endif
3796 		/*
3797 		 * If an error occurred while attempting
3798 		 * to fill the cache entry, just return it.
3799 		 */
3800 		if (rdc->error) {
3801 			error = rdc->error;
3802 			mutex_exit(&rp->r_statelock);
3803 			rddir_cache_rele(rdc);
3804 			if (nrdc != NULL)
3805 				rddir_cache_rele(nrdc);
3806 			return (error);
3807 		}
3808 
3809 		/*
3810 		 * The cache entry is complete and good,
3811 		 * copyout the dirent structs to the calling
3812 		 * thread.
3813 		 */
3814 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3815 
3816 		/*
3817 		 * If no error occurred during the copyout,
3818 		 * update the offset in the uio struct to
3819 		 * contain the value of the next cookie
3820 		 * and set the eof value appropriately.
3821 		 */
3822 		if (!error) {
3823 			uiop->uio_loffset = rdc->nfs3_ncookie;
3824 			if (eofp)
3825 				*eofp = rdc->eof;
3826 		}
3827 
3828 		/*
3829 		 * Decide whether to do readahead.
3830 		 *
3831 		 * Don't if have already read to the end of
3832 		 * directory.  There is nothing more to read.
3833 		 *
3834 		 * Don't if the application is not doing
3835 		 * lookups in the directory.  The readahead
3836 		 * is only effective if the application can
3837 		 * be doing work while an async thread is
3838 		 * handling the over the wire request.
3839 		 */
3840 		if (rdc->eof) {
3841 			rp->r_direof = rdc;
3842 			doreadahead = FALSE;
3843 		} else if (!(rp->r_flags & RLOOKUP))
3844 			doreadahead = FALSE;
3845 		else
3846 			doreadahead = TRUE;
3847 
3848 		if (!doreadahead) {
3849 			mutex_exit(&rp->r_statelock);
3850 			rddir_cache_rele(rdc);
3851 			if (nrdc != NULL)
3852 				rddir_cache_rele(nrdc);
3853 			return (error);
3854 		}
3855 
3856 		/*
3857 		 * Check to see whether we found an entry
3858 		 * for the readahead.  If so, we don't need
3859 		 * to do anything further, so free the new
3860 		 * entry if one was allocated.  Otherwise,
3861 		 * allocate a new entry, add it to the cache,
3862 		 * and then initiate an asynchronous readdir
3863 		 * operation to fill it.
3864 		 */
3865 		srdc.nfs3_cookie = rdc->nfs3_ncookie;
3866 		srdc.buflen = count;
3867 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3868 		if (rrdc != NULL) {
3869 			if (nrdc != NULL)
3870 				rddir_cache_rele(nrdc);
3871 		} else {
3872 			if (nrdc != NULL)
3873 				rrdc = nrdc;
3874 			else {
3875 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3876 			}
3877 			if (rrdc != NULL) {
3878 				rrdc->nfs3_cookie = rdc->nfs3_ncookie;
3879 				rrdc->buflen = count;
3880 				avl_insert(&rp->r_dir, rrdc, where);
3881 				rddir_cache_hold(rrdc);
3882 				mutex_exit(&rp->r_statelock);
3883 				rddir_cache_rele(rdc);
3884 #ifdef DEBUG
3885 				nfs3_readdir_readahead++;
3886 #endif
3887 				nfs_async_readdir(vp, rrdc, cr, do_nfs3readdir);
3888 				return (error);
3889 			}
3890 		}
3891 
3892 		mutex_exit(&rp->r_statelock);
3893 		rddir_cache_rele(rdc);
3894 		return (error);
3895 	}
3896 
3897 	/*
3898 	 * Didn't find an entry in the cache.  Construct a new empty
3899 	 * entry and link it into the cache.  Other processes attempting
3900 	 * to access this entry will need to wait until it is filled in.
3901 	 *
3902 	 * Since kmem_alloc may block, another pass through the cache
3903 	 * will need to be taken to make sure that another process
3904 	 * hasn't already added an entry to the cache for this request.
3905 	 */
3906 	if (nrdc == NULL) {
3907 		mutex_exit(&rp->r_statelock);
3908 		nrdc = rddir_cache_alloc(KM_SLEEP);
3909 		nrdc->nfs3_cookie = uiop->uio_loffset;
3910 		nrdc->buflen = count;
3911 		goto top;
3912 	}
3913 
3914 	/*
3915 	 * Add this entry to the cache.
3916 	 */
3917 	avl_insert(&rp->r_dir, nrdc, where);
3918 	rddir_cache_hold(nrdc);
3919 	mutex_exit(&rp->r_statelock);
3920 
3921 bottom:
3922 #ifdef DEBUG
3923 	missed = 1;
3924 	nfs3_readdir_cache_misses++;
3925 #endif
3926 	/*
3927 	 * Do the readdir.  This routine decides whether to use
3928 	 * READDIR or READDIRPLUS.
3929 	 */
3930 	error = do_nfs3readdir(vp, nrdc, cr);
3931 
3932 	/*
3933 	 * If this operation failed, just return the error which occurred.
3934 	 */
3935 	if (error != 0)
3936 		return (error);
3937 
3938 	/*
3939 	 * Since the RPC operation will have taken sometime and blocked
3940 	 * this process, another pass through the cache will need to be
3941 	 * taken to find the correct cache entry.  It is possible that
3942 	 * the correct cache entry will not be there (although one was
3943 	 * added) because the directory changed during the RPC operation
3944 	 * and the readdir cache was flushed.  In this case, just start
3945 	 * over.  It is hoped that this will not happen too often... :-)
3946 	 */
3947 	nrdc = NULL;
3948 	goto top;
3949 	/* NOTREACHED */
3950 }
3951 
3952 static int
3953 do_nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3954 {
3955 	int error;
3956 	rnode_t *rp;
3957 	mntinfo_t *mi;
3958 
3959 	rp = VTOR(vp);
3960 	mi = VTOMI(vp);
3961 	ASSERT(nfs_zone() == mi->mi_zone);
3962 	/*
3963 	 * Issue the proper request.
3964 	 *
3965 	 * If the server does not support READDIRPLUS, then use READDIR.
3966 	 *
3967 	 * Otherwise --
3968 	 * Issue a READDIRPLUS if reading to fill an empty cache or if
3969 	 * an application has performed a lookup in the directory which
3970 	 * required an over the wire lookup.  The use of READDIRPLUS
3971 	 * will help to (re)populate the DNLC.
3972 	 */
3973 	if (!(mi->mi_flags & MI_READDIRONLY) &&
3974 	    (rp->r_flags & (RLOOKUP | RREADDIRPLUS))) {
3975 		if (rp->r_flags & RREADDIRPLUS) {
3976 			mutex_enter(&rp->r_statelock);
3977 			rp->r_flags &= ~RREADDIRPLUS;
3978 			mutex_exit(&rp->r_statelock);
3979 		}
3980 		nfs3readdirplus(vp, rdc, cr);
3981 		if (rdc->error == EOPNOTSUPP)
3982 			nfs3readdir(vp, rdc, cr);
3983 	} else
3984 		nfs3readdir(vp, rdc, cr);
3985 
3986 	mutex_enter(&rp->r_statelock);
3987 	rdc->flags &= ~RDDIR;
3988 	if (rdc->flags & RDDIRWAIT) {
3989 		rdc->flags &= ~RDDIRWAIT;
3990 		cv_broadcast(&rdc->cv);
3991 	}
3992 	error = rdc->error;
3993 	if (error)
3994 		rdc->flags |= RDDIRREQ;
3995 	mutex_exit(&rp->r_statelock);
3996 
3997 	rddir_cache_rele(rdc);
3998 
3999 	return (error);
4000 }
4001 
4002 static void
4003 nfs3readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4004 {
4005 	int error;
4006 	READDIR3args args;
4007 	READDIR3vres res;
4008 	vattr_t dva;
4009 	rnode_t *rp;
4010 	int douprintf;
4011 	failinfo_t fi, *fip = NULL;
4012 	mntinfo_t *mi;
4013 	hrtime_t t;
4014 
4015 	rp = VTOR(vp);
4016 	mi = VTOMI(vp);
4017 	ASSERT(nfs_zone() == mi->mi_zone);
4018 
4019 	args.dir = *RTOFH3(rp);
4020 	args.cookie = (cookie3)rdc->nfs3_cookie;
4021 	args.cookieverf = rp->r_cookieverf;
4022 	args.count = rdc->buflen;
4023 
4024 	/*
4025 	 * NFS client failover support
4026 	 * suppress failover unless we have a zero cookie
4027 	 */
4028 	if (args.cookie == (cookie3) 0) {
4029 		fi.vp = vp;
4030 		fi.fhp = (caddr_t)&args.dir;
4031 		fi.copyproc = nfs3copyfh;
4032 		fi.lookupproc = nfs3lookup;
4033 		fi.xattrdirproc = acl_getxattrdir3;
4034 		fip = &fi;
4035 	}
4036 
4037 #ifdef DEBUG
4038 	rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4039 #else
4040 	rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4041 #endif
4042 
4043 	res.entries = (dirent64_t *)rdc->entries;
4044 	res.entries_size = rdc->buflen;
4045 	res.dir_attributes.fres.vap = &dva;
4046 	res.dir_attributes.fres.vp = vp;
4047 	res.loff = rdc->nfs3_cookie;
4048 
4049 	douprintf = 1;
4050 
4051 	if (mi->mi_io_kstats) {
4052 		mutex_enter(&mi->mi_lock);
4053 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4054 		mutex_exit(&mi->mi_lock);
4055 	}
4056 
4057 	t = gethrtime();
4058 
4059 	error = rfs3call(VTOMI(vp), NFSPROC3_READDIR,
4060 	    xdr_READDIR3args, (caddr_t)&args,
4061 	    xdr_READDIR3vres, (caddr_t)&res, cr,
4062 	    &douprintf, &res.status, 0, fip);
4063 
4064 	if (mi->mi_io_kstats) {
4065 		mutex_enter(&mi->mi_lock);
4066 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4067 		mutex_exit(&mi->mi_lock);
4068 	}
4069 
4070 	if (error)
4071 		goto err;
4072 
4073 	nfs3_cache_post_op_vattr(vp, &res.dir_attributes, t, cr);
4074 
4075 	error = geterrno3(res.status);
4076 	if (error) {
4077 		PURGE_STALE_FH(error, vp, cr);
4078 		goto err;
4079 	}
4080 
4081 	if (mi->mi_io_kstats) {
4082 		mutex_enter(&mi->mi_lock);
4083 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4084 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4085 		mutex_exit(&mi->mi_lock);
4086 	}
4087 
4088 	rdc->nfs3_ncookie = res.loff;
4089 	rp->r_cookieverf = res.cookieverf;
4090 	rdc->eof = res.eof ? 1 : 0;
4091 	rdc->entlen = res.size;
4092 	ASSERT(rdc->entlen <= rdc->buflen);
4093 	rdc->error = 0;
4094 	return;
4095 
4096 err:
4097 	kmem_free(rdc->entries, rdc->buflen);
4098 	rdc->entries = NULL;
4099 	rdc->error = error;
4100 }
4101 
4102 /*
4103  * Read directory entries.
4104  * There are some weird things to look out for here.  The uio_loffset
4105  * field is either 0 or it is the offset returned from a previous
4106  * readdir.  It is an opaque value used by the server to find the
4107  * correct directory block to read. The count field is the number
4108  * of blocks to read on the server.  This is advisory only, the server
4109  * may return only one block's worth of entries.  Entries may be compressed
4110  * on the server.
4111  */
4112 static void
4113 nfs3readdirplus(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
4114 {
4115 	int error;
4116 	READDIRPLUS3args args;
4117 	READDIRPLUS3vres res;
4118 	vattr_t dva;
4119 	rnode_t *rp;
4120 	mntinfo_t *mi;
4121 	int douprintf;
4122 	failinfo_t fi, *fip = NULL;
4123 
4124 	rp = VTOR(vp);
4125 	mi = VTOMI(vp);
4126 	ASSERT(nfs_zone() == mi->mi_zone);
4127 
4128 	args.dir = *RTOFH3(rp);
4129 	args.cookie = (cookie3)rdc->nfs3_cookie;
4130 	args.cookieverf = rp->r_cookieverf;
4131 	args.dircount = rdc->buflen;
4132 	args.maxcount = mi->mi_tsize;
4133 
4134 	/*
4135 	 * NFS client failover support
4136 	 * suppress failover unless we have a zero cookie
4137 	 */
4138 	if (args.cookie == (cookie3)0) {
4139 		fi.vp = vp;
4140 		fi.fhp = (caddr_t)&args.dir;
4141 		fi.copyproc = nfs3copyfh;
4142 		fi.lookupproc = nfs3lookup;
4143 		fi.xattrdirproc = acl_getxattrdir3;
4144 		fip = &fi;
4145 	}
4146 
4147 #ifdef DEBUG
4148 	rdc->entries = rddir_cache_buf_alloc(rdc->buflen, KM_SLEEP);
4149 #else
4150 	rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
4151 #endif
4152 
4153 	res.entries = (dirent64_t *)rdc->entries;
4154 	res.entries_size = rdc->buflen;
4155 	res.dir_attributes.fres.vap = &dva;
4156 	res.dir_attributes.fres.vp = vp;
4157 	res.loff = rdc->nfs3_cookie;
4158 	res.credentials = cr;
4159 
4160 	douprintf = 1;
4161 
4162 	if (mi->mi_io_kstats) {
4163 		mutex_enter(&mi->mi_lock);
4164 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
4165 		mutex_exit(&mi->mi_lock);
4166 	}
4167 
4168 	res.time = gethrtime();
4169 
4170 	error = rfs3call(mi, NFSPROC3_READDIRPLUS,
4171 	    xdr_READDIRPLUS3args, (caddr_t)&args,
4172 	    xdr_READDIRPLUS3vres, (caddr_t)&res, cr,
4173 	    &douprintf, &res.status, 0, fip);
4174 
4175 	if (mi->mi_io_kstats) {
4176 		mutex_enter(&mi->mi_lock);
4177 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
4178 		mutex_exit(&mi->mi_lock);
4179 	}
4180 
4181 	if (error) {
4182 		goto err;
4183 	}
4184 
4185 	nfs3_cache_post_op_vattr(vp, &res.dir_attributes, res.time, cr);
4186 
4187 	error = geterrno3(res.status);
4188 	if (error) {
4189 		PURGE_STALE_FH(error, vp, cr);
4190 		if (error == EOPNOTSUPP) {
4191 			mutex_enter(&mi->mi_lock);
4192 			mi->mi_flags |= MI_READDIRONLY;
4193 			mutex_exit(&mi->mi_lock);
4194 		}
4195 		goto err;
4196 	}
4197 
4198 	if (mi->mi_io_kstats) {
4199 		mutex_enter(&mi->mi_lock);
4200 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
4201 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += res.size;
4202 		mutex_exit(&mi->mi_lock);
4203 	}
4204 
4205 	rdc->nfs3_ncookie = res.loff;
4206 	rp->r_cookieverf = res.cookieverf;
4207 	rdc->eof = res.eof ? 1 : 0;
4208 	rdc->entlen = res.size;
4209 	ASSERT(rdc->entlen <= rdc->buflen);
4210 	rdc->error = 0;
4211 
4212 	return;
4213 
4214 err:
4215 	kmem_free(rdc->entries, rdc->buflen);
4216 	rdc->entries = NULL;
4217 	rdc->error = error;
4218 }
4219 
#ifdef DEBUG
/*
 * DEBUG-only switch, zero by default (static storage).
 * NOTE(review): presumably consulted by nfs3_bio(); its use is not
 * visible from this part of the file -- confirm before relying on it.
 */
static int nfs3_bio_do_stop;
#endif
4223 
4224 static int
4225 nfs3_bio(struct buf *bp, stable_how *stab_comm, cred_t *cr)
4226 {
4227 	rnode_t *rp = VTOR(bp->b_vp);
4228 	int count;
4229 	int error;
4230 	cred_t *cred;
4231 	offset_t offset;
4232 
4233 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
4234 	offset = ldbtob(bp->b_lblkno);
4235 
4236 	DTRACE_IO1(start, struct buf *, bp);
4237 
4238 	if (bp->b_flags & B_READ) {
4239 		mutex_enter(&rp->r_statelock);
4240 		if (rp->r_cred != NULL) {
4241 			cred = rp->r_cred;
4242 			crhold(cred);
4243 		} else {
4244 			rp->r_cred = cr;
4245 			crhold(cr);
4246 			cred = cr;
4247 			crhold(cred);
4248 		}
4249 		mutex_exit(&rp->r_statelock);
4250 	read_again:
4251 		error = bp->b_error = nfs3read(bp->b_vp, bp->b_un.b_addr,
4252 		    offset, bp->b_bcount, &bp->b_resid, cred);
4253 		crfree(cred);
4254 		if (!error) {
4255 			if (bp->b_resid) {
4256 				/*
4257 				 * Didn't get it all because we hit EOF,
4258 				 * zero all the memory beyond the EOF.
4259 				 */
4260 				/* bzero(rdaddr + */
4261 				bzero(bp->b_un.b_addr +
4262 				    bp->b_bcount - bp->b_resid, bp->b_resid);
4263 			}
4264 			mutex_enter(&rp->r_statelock);
4265 			if (bp->b_resid == bp->b_bcount &&
4266 			    offset >= rp->r_size) {
4267 				/*
4268 				 * We didn't read anything at all as we are
4269 				 * past EOF.  Return an error indicator back
4270 				 * but don't destroy the pages (yet).
4271 				 */
4272 				error = NFS_EOF;
4273 			}
4274 			mutex_exit(&rp->r_statelock);
4275 		} else if (error == EACCES) {
4276 			mutex_enter(&rp->r_statelock);
4277 			if (cred != cr) {
4278 				if (rp->r_cred != NULL)
4279 					crfree(rp->r_cred);
4280 				rp->r_cred = cr;
4281 				crhold(cr);
4282 				cred = cr;
4283 				crhold(cred);
4284 				mutex_exit(&rp->r_statelock);
4285 				goto read_again;
4286 			}
4287 			mutex_exit(&rp->r_statelock);
4288 		}
4289 	} else {
4290 		if (!(rp->r_flags & RSTALE)) {
4291 			mutex_enter(&rp->r_statelock);
4292 			if (rp->r_cred != NULL) {
4293 				cred = rp->r_cred;
4294 				crhold(cred);
4295 			} else {
4296 				rp->r_cred = cr;
4297 				crhold(cr);
4298 				cred = cr;
4299 				crhold(cred);
4300 			}
4301 			mutex_exit(&rp->r_statelock);
4302 		write_again:
4303 			mutex_enter(&rp->r_statelock);
4304 			count = MIN(bp->b_bcount, rp->r_size - offset);
4305 			mutex_exit(&rp->r_statelock);
4306 			if (count < 0)
4307 				cmn_err(CE_PANIC, "nfs3_bio: write count < 0");
4308 #ifdef DEBUG
4309 			if (count == 0) {
4310 				zcmn_err(getzoneid(), CE_WARN,
4311 				    "nfs3_bio: zero length write at %lld",
4312 				    offset);
4313 				nfs_printfhandle(&rp->r_fh);
4314 				if (nfs3_bio_do_stop)
4315 					debug_enter("nfs3_bio");
4316 			}
4317 #endif
4318 			error = nfs3write(bp->b_vp, bp->b_un.b_addr, offset,
4319 			    count, cred, stab_comm);
4320 			if (error == EACCES) {
4321 				mutex_enter(&rp->r_statelock);
4322 				if (cred != cr) {
4323 					if (rp->r_cred != NULL)
4324 						crfree(rp->r_cred);
4325 					rp->r_cred = cr;
4326 					crhold(cr);
4327 					crfree(cred);
4328 					cred = cr;
4329 					crhold(cred);
4330 					mutex_exit(&rp->r_statelock);
4331 					goto write_again;
4332 				}
4333 				mutex_exit(&rp->r_statelock);
4334 			}
4335 			bp->b_error = error;
4336 			if (error && error != EINTR) {
4337 				/*
4338 				 * Don't print EDQUOT errors on the console.
4339 				 * Don't print asynchronous EACCES errors.
4340 				 * Don't print EFBIG errors.
4341 				 * Print all other write errors.
4342 				 */
4343 				if (error != EDQUOT && error != EFBIG &&
4344 				    (error != EACCES ||
4345 				    !(bp->b_flags & B_ASYNC)))
4346 					nfs_write_error(bp->b_vp, error, cred);
4347 				/*
4348 				 * Update r_error and r_flags as appropriate.
4349 				 * If the error was ESTALE, then mark the
4350 				 * rnode as not being writeable and save
4351 				 * the error status.  Otherwise, save any
4352 				 * errors which occur from asynchronous
4353 				 * page invalidations.  Any errors occurring
4354 				 * from other operations should be saved
4355 				 * by the caller.
4356 				 */
4357 				mutex_enter(&rp->r_statelock);
4358 				if (error == ESTALE) {
4359 					rp->r_flags |= RSTALE;
4360 					if (!rp->r_error)
4361 						rp->r_error = error;
4362 				} else if (!rp->r_error &&
4363 				    (bp->b_flags &
4364 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
4365 				    (B_INVAL|B_FORCE|B_ASYNC)) {
4366 					rp->r_error = error;
4367 				}
4368 				mutex_exit(&rp->r_statelock);
4369 			}
4370 			crfree(cred);
4371 		} else
4372 			error = rp->r_error;
4373 	}
4374 
4375 	if (error != 0 && error != NFS_EOF)
4376 		bp->b_flags |= B_ERROR;
4377 
4378 	DTRACE_IO1(done, struct buf *, bp);
4379 
4380 	return (error);
4381 }
4382 
/*
 * Copy the NFS version 3 file handle of "vp" into the caller's fid.
 *
 * Returns EIO if the caller is not in the zone of the mount.  If the
 * space advertised by fidp->fid_len is smaller than the file handle,
 * fid_len is set to the length required and ENOSPC is returned.
 * Otherwise fid_len is set to the file handle length, the handle bytes
 * are copied into fid_data and 0 is returned.  "ct" is not used.
 */
4383 /* ARGSUSED */
4384 static int
4385 nfs3_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4386 {
4387 	rnode_t *rp;
4388 
4389 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4390 		return (EIO);
4391 	rp = VTOR(vp);
4392 
4393 	if (fidp->fid_len < (ushort_t)rp->r_fh.fh_len) {
		/* Tell the caller how much room is needed. */
4394 		fidp->fid_len = rp->r_fh.fh_len;
4395 		return (ENOSPC);
4396 	}
4397 	fidp->fid_len = rp->r_fh.fh_len;
4398 	bcopy(rp->r_fh.fh_buf, fidp->fid_data, fidp->fid_len);
4399 	return (0);
4400 }
4401 
/*
 * Acquire the rnode's r_rwlock on behalf of a read or write.
 *
 * A read request (write_lock == 0) takes the lock as a reader.  A write
 * request normally takes it as a writer.  The exception is when direct
 * I/O is enabled on the rnode (RDIRECTIO) or on the mount (MI_DIRECTIO):
 * then the lock is first taken as a reader, and kept that way if the
 * file has no mappings (r_mapcnt == 0) and no cached pages; otherwise
 * the reader hold is dropped and the writer lock is taken.
 *
 * Returns V_WRITELOCK_TRUE if the lock is held as a writer and
 * V_WRITELOCK_FALSE if it is held as a reader.  The return value of
 * nfs_rw_enter_sig() is deliberately discarded.  "ctp" is not used.
 */
4402 /* ARGSUSED2 */
4403 static int
4404 nfs3_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4405 {
4406 	rnode_t *rp = VTOR(vp);
4407 
4408 	if (!write_lock) {
4409 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
4410 		return (V_WRITELOCK_FALSE);
4411 	}
4412 
4413 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
4414 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		/* Checked with the reader lock held. */
4415 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
4416 			return (V_WRITELOCK_FALSE);
4417 		nfs_rw_exit(&rp->r_rwlock);
4418 	}
4419 
4420 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
4421 	return (V_WRITELOCK_TRUE);
4422 }
4423 
/*
 * Release the rnode's r_rwlock.  The same call is made whichever way
 * the lock was acquired, so "write_lock" and "ctp" are not used.
 */
4424 /* ARGSUSED */
4425 static void
4426 nfs3_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
4427 {
4428 	rnode_t *rp = VTOR(vp);
4429 
4430 	nfs_rw_exit(&rp->r_rwlock);
4431 }
4432 
/*
 * Validate a proposed new file offset.
 *
 * Any offset is accepted for a directory.  For every other vnode type a
 * negative *noffp is rejected with EINVAL.  Returns 0 when the offset is
 * acceptable.  *noffp is never modified; "ooff" and "ct" are not used.
 */
4433 /* ARGSUSED */
4434 static int
4435 nfs3_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4436 {
4437 
4438 	/*
4439 	 * Because we stuff the readdir cookie into the offset field
4440 	 * someone may attempt to do an lseek with the cookie which
4441 	 * we want to succeed.
4442 	 */
4443 	if (vp->v_type == VDIR)
4444 		return (0);
4445 	if (*noffp < 0)
4446 		return (EINVAL);
4447 	return (0);
4448 }
4449 
4450 /*
4451  * number of nfs3_bsize blocks to read ahead.
4452  * This is the upper bound on the readaheads nfs3_getapage() queues
 * for a single request.
 */
4453 static int nfs3_nra = 4;
4454 
/*
 * Debug-only counter, incremented by nfs3_getapage() each time a page
 * that page_exists() reported could not be obtained with page_lookup().
 */
4455 #ifdef DEBUG
4456 static int nfs3_lostpage = 0;	/* number of times we lost original page */
4457 #endif
4458 
4459 /*
4460  * Return all the pages from [off..off+len) in file
 *
 * Fails with ENOSYS if the vnode is marked VNOMAP and with EIO if the
 * caller is not in the zone of the mount.  If "protp" is not NULL it is
 * set to PROT_ALL.  The caches are validated first, and a validation
 * error is returned as is.
 *
 * A request that extends more than a page beyond r_size fails with
 * EFAULT unless it comes through segkmap.  A request of at most one
 * page is handed directly to nfs3_getapage(); larger ones go through
 * pvn_getpages() with nfs3_getapage() as the per-page routine.
 *
 * If the result is NFS_EOF the caches are purged and the whole request
 * is retried; if it is ESTALE, PURGE_STALE_FH() is applied before the
 * error is returned.  "plsz", "ct" and the remaining arguments are
 * passed through or unused as shown below.
4461  */
4462 /* ARGSUSED */
4463 static int
4464 nfs3_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4465 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4466 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4467 {
4468 	rnode_t *rp;
4469 	int error;
4470 	mntinfo_t *mi;
4471 
4472 	if (vp->v_flag & VNOMAP)
4473 		return (ENOSYS);
4474 
4475 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4476 		return (EIO);
4477 	if (protp != NULL)
4478 		*protp = PROT_ALL;
4479 
4480 	/*
4481 	 * Now validate that the caches are up to date.
4482 	 */
4483 	error = nfs3_validate_caches(vp, cr);
4484 	if (error)
4485 		return (error);
4486 
4487 	rp = VTOR(vp);
4488 	mi = VTOMI(vp);
4489 retry:
4490 	mutex_enter(&rp->r_statelock);
4491 
4492 	/*
4493 	 * Don't create dirty pages faster than they
4494 	 * can be cleaned so that the system doesn't
4495 	 * get imbalanced.  If the async queue is
4496 	 * maxed out, then wait for it to drain before
4497 	 * creating more dirty pages.  Also, wait for
4498 	 * any threads doing pagewalks in the vop_getattr
4499 	 * entry points so that they don't block for
4500 	 * long periods.
4501 	 */
4502 	if (rw == S_CREATE) {
4503 		while ((mi->mi_max_threads != 0 &&
4504 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
4505 		    rp->r_gcount > 0)
4506 			cv_wait(&rp->r_cv, &rp->r_statelock);
4507 	}
4508 
4509 	/*
4510 	 * If we are getting called as a side effect of an nfs_write()
4511 	 * operation the local file size might not be extended yet.
4512 	 * In this case we want to be able to return pages of zeroes.
4513 	 */
4514 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
4515 		mutex_exit(&rp->r_statelock);
4516 		return (EFAULT);		/* beyond EOF */
4517 	}
4518 
4519 	mutex_exit(&rp->r_statelock);
4520 
4521 	if (len <= PAGESIZE) {
4522 		error = nfs3_getapage(vp, off, len, protp, pl, plsz,
4523 		    seg, addr, rw, cr);
4524 	} else {
4525 		error = pvn_getpages(nfs3_getapage, vp, off, len, protp,
4526 		    pl, plsz, seg, addr, rw, cr);
4527 	}
4528 
	/*
	 * NFS_EOF: throw away the cached state (the DNLC is left alone)
	 * and start over.  ESTALE: purge as for a stale file handle and
	 * fall out with the error.  Everything else is returned as is.
	 */
4529 	switch (error) {
4530 	case NFS_EOF:
4531 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
4532 		goto retry;
4533 	case ESTALE:
4534 		PURGE_STALE_FH(error, vp, cr);
4535 	}
4536 
4537 	return (error);
4538 }
4539 
4540 /*
4541  * Called from pvn_getpages or nfs3_getpage to get a particular page.
 *
 * Before looking for the page, up to nfs3_nra asynchronous readaheads
 * may be queued, depending on how "off" relates to the rnode's
 * readahead offset (r_nextr).
 *
 * If the page at "off" is not already associated with the vnode:
 *   - with pl == NULL only an asynchronous readahead of the block is
 *     requested;
 *   - with rw == S_CREATE a page is created without reading anything;
 *   - otherwise a kluster of pages is set up and filled through
 *     nfs3_bio(), or simply zeroed when the I/O starts at or beyond
 *     r_size and the request comes through segkmap.
 * If the page is already there, it is looked up with an exclusive
 * (S_CREATE) or shared lock and returned as the only entry of "pl".
 *
 * Returns 0 or an errno value.  An NFS_EOF result from nfs3_bio() is
 * turned into 0 for segkmap requests and EFAULT for all others.
 * "len" and "protp" are not used.
4542  */
4543 /* ARGSUSED */
4544 static int
4545 nfs3_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
4546 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4547 	enum seg_rw rw, cred_t *cr)
4548 {
4549 	rnode_t *rp;
4550 	uint_t bsize;
4551 	struct buf *bp;
4552 	page_t *pp;
4553 	u_offset_t lbn;
4554 	u_offset_t io_off;
4555 	u_offset_t blkoff;
4556 	u_offset_t rablkoff;
4557 	size_t io_len;
4558 	uint_t blksize;
4559 	int error;
4560 	int readahead;
4561 	int readahead_issued = 0;
4562 	int ra_window; /* readahead window */
4563 	page_t *pagefound;
4564 	page_t *savepp;
4565 
4566 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4567 		return (EIO);
4568 	rp = VTOR(vp);
4569 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4570 
4571 reread:
4572 	bp = NULL;
4573 	pp = NULL;
4574 	pagefound = NULL;
4575 
4576 	if (pl != NULL)
4577 		pl[0] = NULL;
4578 
4579 	error = 0;
	/* blkoff is "off" rounded down to the start of its block. */
4580 	lbn = off / bsize;
4581 	blkoff = lbn * bsize;
4582 
4583 	/*
4584 	 * Queueing up the readahead before doing the synchronous read
4585 	 * results in a significant increase in read throughput because
4586 	 * of the increased parallelism between the async threads and
4587 	 * the process context.
4588 	 */
4589 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
4590 	    rw != S_CREATE &&
4591 	    !(vp->v_flag & VNOCACHE)) {
4592 		mutex_enter(&rp->r_statelock);
4593 
4594 		/*
4595 		 * Calculate the number of readaheads to do.
4596 		 * a) No readaheads at offset = 0.
4597 		 * b) Do maximum(nfs3_nra) readaheads when the readahead
4598 		 *    window is closed.
4599 		 * c) Do readaheads between 1 to (nfs3_nra - 1) depending
4600 		 *    upon how far the readahead window is open or closed.
4601 		 * d) No readaheads if rp->r_nextr is not within the scope
4602 		 *    of the readahead window (random i/o).
4603 		 */
4604 
4605 		if (off == 0)
4606 			readahead = 0;
4607 		else if (blkoff == rp->r_nextr)
4608 			readahead = nfs3_nra;
4609 		else if (rp->r_nextr > blkoff &&
4610 				((ra_window = (rp->r_nextr - blkoff) / bsize)
4611 					<= (nfs3_nra - 1)))
4612 			readahead = nfs3_nra - ra_window;
4613 		else
4614 			readahead = 0;
4615 
		/*
		 * r_statelock is dropped around each call that queues a
		 * readahead and retaken afterwards, so it is held at the
		 * top of every iteration and when the loop is left.
		 */
4616 		rablkoff = rp->r_nextr;
4617 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
4618 			mutex_exit(&rp->r_statelock);
4619 			if (nfs_async_readahead(vp, rablkoff + bsize,
4620 			    addr + (rablkoff + bsize - off), seg, cr,
4621 			    nfs3_readahead) < 0) {
4622 				mutex_enter(&rp->r_statelock);
4623 				break;
4624 			}
4625 			readahead--;
4626 			rablkoff += bsize;
4627 			/*
4628 			 * Indicate that we did a readahead so
4629 			 * readahead offset is not updated
4630 			 * by the synchronous read below.
4631 			 */
4632 			readahead_issued = 1;
4633 			mutex_enter(&rp->r_statelock);
4634 			/*
4635 			 * set readahead offset to
4636 			 * offset of last async readahead
4637 			 * request.
4638 			 */
4639 			rp->r_nextr = rablkoff;
4640 		}
4641 		mutex_exit(&rp->r_statelock);
4642 	}
4643 
4644 again:
4645 	if ((pagefound = page_exists(vp, off)) == NULL) {
4646 		if (pl == NULL) {
			/*
			 * The caller does not want the page back, so just
			 * ask for the block to be read asynchronously.
			 */
4647 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
4648 			    nfs3_readahead);
4649 		} else if (rw == S_CREATE) {
4650 			/*
4651 			 * Block for this page is not allocated, or the offset
4652 			 * is beyond the current allocation size, or we're
4653 			 * allocating a swap slot and the page was not found,
4654 			 * so allocate it and return a zero page.
4655 			 */
4656 			if ((pp = page_create_va(vp, off,
4657 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
4658 				cmn_err(CE_PANIC, "nfs3_getapage: page_create");
4659 			io_len = PAGESIZE;
4660 			mutex_enter(&rp->r_statelock);
4661 			rp->r_nextr = off + PAGESIZE;
4662 			mutex_exit(&rp->r_statelock);
4663 		} else {
4664 			/*
4665 			 * Need to go to server to get a BLOCK, exception to
4666 			 * that being while reading at offset = 0 or doing
4667 			 * random i/o, in that case read only a PAGE.
4668 			 */
4669 			mutex_enter(&rp->r_statelock);
4670 			if (blkoff < rp->r_size &&
4671 			    blkoff + bsize >= rp->r_size) {
4672 				/*
4673 				 * If only a block or less is left in
4674 				 * the file, read all that is remaining.
4675 				 */
4676 				if (rp->r_size <= off) {
4677 					/*
4678 					 * Trying to access beyond EOF,
4679 					 * set up to get at least one page.
4680 					 */
4681 					blksize = off + PAGESIZE - blkoff;
4682 				} else
4683 					blksize = rp->r_size - blkoff;
4684 			} else if ((off == 0) ||
4685 				(off != rp->r_nextr && !readahead_issued)) {
4686 				blksize = PAGESIZE;
4687 				blkoff = off; /* block = page here */
4688 			} else
4689 				blksize = bsize;
4690 			mutex_exit(&rp->r_statelock);
4691 
4692 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4693 			    &io_len, blkoff, blksize, 0);
4694 
4695 			/*
4696 			 * Some other thread has entered the page,
4697 			 * so just use it.
4698 			 */
4699 			if (pp == NULL)
4700 				goto again;
4701 
4702 			/*
4703 			 * Now round the request size up to page boundaries.
4704 			 * This ensures that the entire page will be
4705 			 * initialized to zeroes if EOF is encountered.
4706 			 */
4707 			io_len = ptob(btopr(io_len));
4708 
4709 			bp = pageio_setup(pp, io_len, vp, B_READ);
4710 			ASSERT(bp != NULL);
4711 
4712 			/*
4713 			 * pageio_setup should have set b_addr to 0.  This
4714 			 * is correct since we want to do I/O on a page
4715 			 * boundary.  bp_mapin will use this addr to calculate
4716 			 * an offset, and then set b_addr to the kernel virtual
4717 			 * address it allocated for us.
4718 			 */
4719 			ASSERT(bp->b_un.b_addr == 0);
4720 
4721 			bp->b_edev = 0;
4722 			bp->b_dev = 0;
4723 			bp->b_lblkno = lbtodb(io_off);
4724 			bp->b_file = vp;
4725 			bp->b_offset = (offset_t)off;
4726 			bp_mapin(bp);
4727 
4728 			/*
4729 			 * If doing a write beyond what we believe is EOF,
4730 			 * don't bother trying to read the pages from the
4731 			 * server, we'll just zero the pages here.  We
4732 			 * don't check that the rw flag is S_WRITE here
4733 			 * because some implementations may attempt a
4734 			 * read access to the buffer before copying data.
4735 			 */
4736 			mutex_enter(&rp->r_statelock);
4737 			if (io_off >= rp->r_size && seg == segkmap) {
4738 				mutex_exit(&rp->r_statelock);
4739 				bzero(bp->b_un.b_addr, io_len);
4740 			} else {
4741 				mutex_exit(&rp->r_statelock);
4742 				error = nfs3_bio(bp, NULL, cr);
4743 			}
4744 
4745 			/*
4746 			 * Unmap the buffer before freeing it.
4747 			 */
4748 			bp_mapout(bp);
4749 			pageio_done(bp);
4750 
			/*
			 * Walk the circular page list once, tagging every
			 * page C_NOCOMMIT; pp is back at the head of the
			 * list when the loop ends.
			 */
4751 			savepp = pp;
4752 			do {
4753 				pp->p_fsdata = C_NOCOMMIT;
4754 			} while ((pp = pp->p_next) != savepp);
4755 
4756 			if (error == NFS_EOF) {
4757 				/*
4758 				 * If doing a write system call just return
4759 				 * zeroed pages, else user tried to get pages
4760 				 * beyond EOF, return error.  We don't check
4761 				 * that the rw flag is S_WRITE here because
4762 				 * some implementations may attempt a read
4763 				 * access to the buffer before copying data.
4764 				 */
4765 				if (seg == segkmap)
4766 					error = 0;
4767 				else
4768 					error = EFAULT;
4769 			}
4770 
			/*
			 * Only a successful synchronous read that was not
			 * accompanied by readaheads moves the readahead
			 * offset.
			 */
4771 			if (!readahead_issued && !error) {
4772 				mutex_enter(&rp->r_statelock);
4773 				rp->r_nextr = io_off + io_len;
4774 				mutex_exit(&rp->r_statelock);
4775 			}
4776 		}
4777 	}
4778 
4779 out:
4780 	if (pl == NULL)
4781 		return (error);
4782 
4783 	if (error) {
4784 		if (pp != NULL)
4785 			pvn_read_done(pp, B_ERROR);
4786 		return (error);
4787 	}
4788 
4789 	if (pagefound) {
4790 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
4791 
4792 		/*
4793 		 * Page exists in the cache, acquire the appropriate lock.
4794 		 * If this fails, start all over again.
4795 		 */
4796 		if ((pp = page_lookup(vp, off, se)) == NULL) {
4797 #ifdef DEBUG
4798 			nfs3_lostpage++;
4799 #endif
4800 			goto reread;
4801 		}
4802 		pl[0] = pp;
4803 		pl[1] = NULL;
4804 		return (0);
4805 	}
4806 
	/* Hand the freshly created or read pages back through pl. */
4807 	if (pp != NULL)
4808 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4809 
4810 	return (error);
4811 }
4812 
/*
 * Read the block starting at "blkoff" into pages of "vp" as a
 * readahead.  This is the routine nfs3_getapage() hands to
 * nfs_async_readahead().
 *
 * At most one block is read, or less when the block straddles r_size.
 * If no pages can be set up for the kluster the readahead is silently
 * abandoned.  Nothing is returned to the caller: NFS_EOF from
 * nfs3_bio() is treated as success, and on any other error the pages
 * are released with B_ERROR and the rnode's readahead offset (r_nextr)
 * is pulled back to the start of the failed I/O if it lay beyond it.
 */
4813 static void
4814 nfs3_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
4815 	cred_t *cr)
4816 {
4817 	int error;
4818 	page_t *pp;
4819 	u_offset_t io_off;
4820 	size_t io_len;
4821 	struct buf *bp;
4822 	uint_t bsize, blksize;
4823 	rnode_t *rp = VTOR(vp);
4824 	page_t *savepp;
4825 
4826 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4827 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4828 
4829 	mutex_enter(&rp->r_statelock);
4830 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
4831 		/*
4832 		 * If less than a block left in file read less
4833 		 * than a block.
4834 		 */
4835 		blksize = rp->r_size - blkoff;
4836 	} else
4837 		blksize = bsize;
4838 	mutex_exit(&rp->r_statelock);
4839 
4840 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
4841 	    &io_off, &io_len, blkoff, blksize, 1);
4842 	/*
4843 	 * The isra flag passed to the kluster function is 1, we may have
4844 	 * gotten a return value of NULL for a variety of reasons (# of free
4845 	 * pages < minfree, someone entered the page on the vnode etc). In all
4846 	 * cases, we want to punt on the readahead.
4847 	 */
4848 	if (pp == NULL)
4849 		return;
4850 
4851 	/*
4852 	 * Now round the request size up to page boundaries.
4853 	 * This ensures that the entire page will be
4854 	 * initialized to zeroes if EOF is encountered.
4855 	 */
4856 	io_len = ptob(btopr(io_len));
4857 
4858 	bp = pageio_setup(pp, io_len, vp, B_READ);
4859 	ASSERT(bp != NULL);
4860 
4861 	/*
4862 	 * pageio_setup should have set b_addr to 0.  This is correct since
4863 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
4864 	 * to calculate an offset, and then set b_addr to the kernel virtual
4865 	 * address it allocated for us.
4866 	 */
4867 	ASSERT(bp->b_un.b_addr == 0);
4868 
4869 	bp->b_edev = 0;
4870 	bp->b_dev = 0;
4871 	bp->b_lblkno = lbtodb(io_off);
4872 	bp->b_file = vp;
4873 	bp->b_offset = (offset_t)blkoff;
4874 	bp_mapin(bp);
4875 
4876 	/*
4877 	 * If doing a write beyond what we believe is EOF, don't bother trying
4878 	 * to read the pages from the server, we'll just zero the pages here.
4879 	 * We don't check that the rw flag is S_WRITE here because some
4880 	 * implementations may attempt a read access to the buffer before
4881 	 * copying data.
4882 	 */
4883 	mutex_enter(&rp->r_statelock);
4884 	if (io_off >= rp->r_size && seg == segkmap) {
4885 		mutex_exit(&rp->r_statelock);
4886 		bzero(bp->b_un.b_addr, io_len);
4887 		error = 0;
4888 	} else {
4889 		mutex_exit(&rp->r_statelock);
4890 		error = nfs3_bio(bp, NULL, cr);
4891 		if (error == NFS_EOF)
4892 			error = 0;
4893 	}
4894 
4895 	/*
4896 	 * Unmap the buffer before freeing it.
4897 	 */
4898 	bp_mapout(bp);
4899 	pageio_done(bp);
4900 
	/*
	 * Tag every page on the circular list C_NOCOMMIT; pp is back at
	 * the head of the list when the loop ends.
	 */
4901 	savepp = pp;
4902 	do {
4903 		pp->p_fsdata = C_NOCOMMIT;
4904 	} while ((pp = pp->p_next) != savepp);
4905 
4906 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4907 
4908 	/*
4909 	 * In case of error set readahead offset
4910 	 * to the lowest offset.
4911 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
4912 	 */
	/*
	 * r_nextr is first tested without the lock and then again with
	 * r_statelock held before it is changed.
	 */
4913 	if (error && rp->r_nextr > io_off) {
4914 		mutex_enter(&rp->r_statelock);
4915 		if (rp->r_nextr > io_off)
4916 			rp->r_nextr = io_off;
4917 		mutex_exit(&rp->r_statelock);
4918 	}
4919 }
4920 
4921 /*
 * Push the pages of "vp" in the given range via nfs_putpages().
 *
4922  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4923  * If len == 0, do from off to EOF.
4924  *
4925  * The normal cases should be len == 0 && off == 0 (entire vp list),
4926  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4927  * (from pageout).
 *
 * Returns ENOSYS for a VNOMAP vnode, 0 without doing anything for a
 * whole-file, non-invalidating request on a read-only vnode, EIO for a
 * synchronous request from outside the zone of the mount, and otherwise
 * the result of nfs_putpages().  The rnode's r_count is raised for the
 * duration of the call and r_cv is broadcast once it has been lowered
 * again.  "ct" is not used.
4928  */
4929 /* ARGSUSED */
4930 static int
4931 nfs3_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4932 	caller_context_t *ct)
4933 {
4934 	int error;
4935 	rnode_t *rp;
4936 
4937 	ASSERT(cr != NULL);
4938 
4939 	/*
4940 	 * XXX - Why should this check be made here?
4941 	 */
4942 	if (vp->v_flag & VNOMAP)
4943 		return (ENOSYS);
4944 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4945 		return (0);
	/* The zone check is applied to synchronous requests only. */
4946 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4947 		return (EIO);
4948 
4949 	rp = VTOR(vp);
4950 	mutex_enter(&rp->r_statelock);
4951 	rp->r_count++;
4952 	mutex_exit(&rp->r_statelock);
4953 	error = nfs_putpages(vp, off, len, flags, cr);
4954 	mutex_enter(&rp->r_statelock);
4955 	rp->r_count--;
4956 	cv_broadcast(&rp->r_cv);
4957 	mutex_exit(&rp->r_statelock);
4958 
4959 	return (error);
4960 }
4961 
4962 /*
4963  * Write out a single page, possibly klustering adjacent dirty pages.
 *
 * The kluster is confined to the block containing "pp".  If a
 * modification of an overlapping region is in progress
 * (RMODINPROGRESS), nothing is written: the pages are marked modified
 * again and unlocked, the rnode is flagged RDIRTY and 0 is returned.
 * Otherwise the pages are written through nfs_async_putapage() when
 * B_ASYNC is set in "flags", or directly through nfs3_sync_putapage().
 *
 * On return *offp and *lenp, when the pointers are not NULL, hold the
 * offset and length of the range that was handled.  Returns 0 or the
 * error from the write routine.  The caller must already have raised
 * r_count on the rnode (asserted below).
4964  */
4965 int
4966 nfs3_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4967 	int flags, cred_t *cr)
4968 {
4969 	u_offset_t io_off;
4970 	u_offset_t lbn_off;
4971 	u_offset_t lbn;
4972 	size_t io_len;
4973 	uint_t bsize;
4974 	int error;
4975 	rnode_t *rp;
4976 
4977 	ASSERT(!vn_is_readonly(vp));
4978 	ASSERT(pp != NULL);
4979 	ASSERT(cr != NULL);
4980 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4981 
4982 	rp = VTOR(vp);
4983 	ASSERT(rp->r_count > 0);
4984 
	/* lbn_off is the page's offset rounded down to its block. */
4985 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4986 	lbn = pp->p_offset / bsize;
4987 	lbn_off = lbn * bsize;
4988 
4989 	/*
4990 	 * Find a kluster that fits in one block, or in
4991 	 * one page if pages are bigger than blocks.  If
4992 	 * there is less file space allocated than a whole
4993 	 * page, we'll shorten the i/o request below.
4994 	 */
4995 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4996 	    roundup(bsize, PAGESIZE), flags);
4997 
4998 	/*
4999 	 * pvn_write_kluster shouldn't have returned a page with offset
5000 	 * behind the original page we were given.  Verify that.
5001 	 */
5002 	ASSERT((pp->p_offset / bsize) >= lbn);
5003 
5004 	/*
5005 	 * Now pp will have the list of kept dirty pages marked for
5006 	 * write back.  It will also handle invalidation and freeing
5007 	 * of pages that are not dirty.  Check for page length rounding
5008 	 * problems.
5009 	 */
5010 	if (io_off + io_len > lbn_off + bsize) {
5011 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
5012 		io_len = lbn_off + bsize - io_off;
5013 	}
5014 	/*
5015 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5016 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
5017 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
5018 	 * progress and the r_size has not been made consistent with the
5019 	 * new size of the file. When the uiomove() completes the r_size is
5020 	 * updated and the RMODINPROGRESS flag is cleared.
5021 	 *
5022 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
5023 	 * consistent value of r_size. Without this handshaking, it is
5024 	 * possible that nfs(3)_bio() picks  up the old value of r_size
5025 	 * before the uiomove() in writerp() completes. This will result
5026 	 * in the write through nfs(3)_bio() being dropped.
5027 	 *
5028 	 * More precisely, there is a window between the time the uiomove()
5029 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
5030 	 * operation intervenes in this window, the page will be picked up,
5031 	 * because it is dirty (it will be unlocked, unless it was
5032 	 * pagecreate'd). When the page is picked up as dirty, the dirty
5033 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
5034 	 * checked. This will still be the old size. Therefore the page will
5035 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
5036 	 * the page will be found to be clean and the write will be dropped.
5037 	 */
	/*
	 * The flag is tested once without the lock and tested again,
	 * together with the range overlap, once r_statelock is held.
	 */
5038 	if (rp->r_flags & RMODINPROGRESS) {
5039 		mutex_enter(&rp->r_statelock);
5040 		if ((rp->r_flags & RMODINPROGRESS) &&
5041 		    rp->r_modaddr + MAXBSIZE > io_off &&
5042 		    rp->r_modaddr < io_off + io_len) {
5043 			page_t *plist;
5044 			/*
5045 			 * A write is in progress for this region of the file.
5046 			 * If we did not detect RMODINPROGRESS here then this
5047 			 * path through nfs_putapage() would eventually go to
5048 			 * nfs(3)_bio() and may not write out all of the data
5049 			 * in the pages. We end up losing data. So we decide
5050 			 * to set the modified bit on each page in the page
5051 			 * list and mark the rnode with RDIRTY. This write
5052 			 * will be restarted at some later time.
5053 			 */
5054 			plist = pp;
5055 			while (plist != NULL) {
5056 				pp = plist;
5057 				page_sub(&plist, pp);
5058 				hat_setmod(pp);
5059 				page_io_unlock(pp);
5060 				page_unlock(pp);
5061 			}
5062 			rp->r_flags |= RDIRTY;
5063 			mutex_exit(&rp->r_statelock);
5064 			if (offp)
5065 				*offp = io_off;
5066 			if (lenp)
5067 				*lenp = io_len;
5068 			return (0);
5069 		}
5070 		mutex_exit(&rp->r_statelock);
5071 	}
5072 
5073 	if (flags & B_ASYNC) {
5074 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
5075 		    nfs3_sync_putapage);
5076 	} else
5077 		error = nfs3_sync_putapage(vp, pp, io_off, io_len, flags, cr);
5078 
5079 	if (offp)
5080 		*offp = io_off;
5081 	if (lenp)
5082 		*lenp = io_len;
5083 	return (error);
5084 }
5085 
/*
 * Write the page list "pp", covering io_len bytes at io_off, through
 * nfs3_rdwrlbn() and complete the pages with pvn_write_done().
 *
 * ENOSPC, EDQUOT, EFBIG and EACCES get special treatment unless the
 * request was already a forced invalidation (B_INVAL|B_FORCE): the
 * rnode is flagged ROUTOFSPACE, the pages are completed with B_ERROR
 * and, if the request is not B_ASYNC, the range is pushed once more
 * through nfs3_putpage() with B_INVAL|B_FORCE; the result of that
 * second attempt becomes the return value.
 *
 * In every other case the pages are completed with B_ERROR added on
 * failure, ROUTOFSPACE is cleared on success, and a commit of the
 * vnode is attempted when freemem is below desfree.
 *
 * Returns 0 or an errno value.
 */
5086 static int
5087 nfs3_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5088 	int flags, cred_t *cr)
5089 {
5090 	int error;
5091 	rnode_t *rp;
5092 
5093 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5094 
5095 	flags |= B_WRITE;
5096 
5097 	error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5098 
5099 	rp = VTOR(vp);
5100 
5101 	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
5102 	    error == EACCES) &&
5103 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		/* r_statelock is taken only if the flag has to change. */
5104 		if (!(rp->r_flags & ROUTOFSPACE)) {
5105 			mutex_enter(&rp->r_statelock);
5106 			rp->r_flags |= ROUTOFSPACE;
5107 			mutex_exit(&rp->r_statelock);
5108 		}
5109 		flags |= B_ERROR;
5110 		pvn_write_done(pp, flags);
5111 		/*
5112 		 * If this was not an async thread, then try again to
5113 		 * write out the pages, but this time, also destroy
5114 		 * them whether or not the write is successful.  This
5115 		 * will prevent memory from filling up with these
5116 		 * pages and destroying them is the only alternative
5117 		 * if they can't be written out.
5118 		 *
5119 		 * Don't do this if this is an async thread because
5120 		 * when the pages are unlocked in pvn_write_done,
5121 		 * some other thread could have come along, locked
5122 		 * them, and queued for an async thread.  It would be
5123 		 * possible for all of the async threads to be tied
5124 		 * up waiting to lock the pages again and they would
5125 		 * all already be locked and waiting for an async
5126 		 * thread to handle them.  Deadlock.
5127 		 */
5128 		if (!(flags & B_ASYNC)) {
5129 			error = nfs3_putpage(vp, io_off, io_len,
5130 			    B_INVAL | B_FORCE, cr, NULL);
5131 		}
5132 	} else {
5133 		if (error)
5134 			flags |= B_ERROR;
5135 		else if (rp->r_flags & ROUTOFSPACE) {
5136 			mutex_enter(&rp->r_statelock);
5137 			rp->r_flags &= ~ROUTOFSPACE;
5138 			mutex_exit(&rp->r_statelock);
5139 		}
5140 		pvn_write_done(pp, flags);
		/* The result of the commit is deliberately ignored. */
5141 		if (freemem < desfree)
5142 			(void) nfs3_commit_vp(vp, (u_offset_t)0, 0, cr);
5143 	}
5144 
5145 	return (error);
5146 }
5147 
/*
 * Establish a mapping of "len" bytes of "vp", starting at file offset
 * "off", in the address space "as".
 *
 * Returns:
 *	EIO	the caller is not in the zone of the mount
 *	ENOSYS	the vnode is marked VNOMAP
 *	ENXIO	"off" is negative, or "off" + "len" cannot be represented
 *		as a file offset
 *	ENODEV	the vnode is not a regular file
 *	EINTR	the wait for r_lkserlock was interrupted
 *	EAGAIN	the vnode is marked VNOCACHE, or the file has
 *		locks/sleeping lock requests and mandatory locking applies
 *	ENOMEM	no address could be chosen for a non-MAP_FIXED request
 * or the error from the attribute fetch or from as_map(); 0 on success.
 *
 * On success *addrp holds the address of the mapping.  "ct" is not used.
 */
5148 /* ARGSUSED */
5149 static int
5150 nfs3_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5151 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5152 	cred_t *cr, caller_context_t *ct)
5153 {
5154 	struct segvn_crargs vn_a;
5155 	int error;
5156 	rnode_t *rp;
5157 	struct vattr va;
5158 
5159 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5160 		return (EIO);
5161 
5162 	if (vp->v_flag & VNOMAP)
5163 		return (ENOSYS);
5164 
	/*
	 * Reject a negative offset and a range whose end would not fit
	 * in an offset_t.  The end is not computed as off + len: len is
	 * unsigned, so that sum is either never negative (64-bit size_t)
	 * or relies on signed overflow (32-bit size_t).  off is known to
	 * be non-negative by the time the subtraction is evaluated.
	 */
5165 	if (off < 0 || len > MAXOFFSET_T - off)
5166 		return (ENXIO);
5167 
5168 	if (vp->v_type != VREG)
5169 		return (ENODEV);
5170 
5171 	/*
5172 	 * If there is cached data and if close-to-open consistency
5173 	 * checking is not turned off and if the file system is not
5174 	 * mounted readonly, then force an over the wire getattr.
5175 	 * Otherwise, just invoke nfs3getattr to get a copy of the
5176 	 * attributes.  The attribute cache will be used unless it
5177 	 * is timed out and if it is, then an over the wire getattr
5178 	 * will be issued.
5179 	 */
5180 	va.va_mask = AT_ALL;
5181 	if (vn_has_cached_data(vp) &&
5182 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
5183 		error = nfs3_getattr_otw(vp, &va, cr);
5184 	else
5185 		error = nfs3getattr(vp, &va, cr);
5186 	if (error)
5187 		return (error);
5188 
5189 	/*
5190 	 * Check to see if the vnode is currently marked as not cachable.
5191 	 * This means portions of the file are locked (through VOP_FRLOCK).
5192 	 * In this case the map request must be refused.  We use
5193 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
5194 	 */
5195 	rp = VTOR(vp);
5196 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
5197 		return (EINTR);
5198 
	/* From here on every exit goes through "done" to drop the lock. */
5199 	if (vp->v_flag & VNOCACHE) {
5200 		error = EAGAIN;
5201 		goto done;
5202 	}
5203 
5204 	/*
5205 	 * Don't allow concurrent locks and mapping if mandatory locking is
5206 	 * enabled.
5207 	 */
5208 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
5209 	    MANDLOCK(vp, va.va_mode)) {
5210 		error = EAGAIN;
5211 		goto done;
5212 	}
5213 
5214 	as_rangelock(as);
5215 	if (!(flags & MAP_FIXED)) {
		/* Let the system choose the address. */
5216 		map_addr(addrp, len, off, 1, flags);
5217 		if (*addrp == NULL) {
5218 			as_rangeunlock(as);
5219 			error = ENOMEM;
5220 			goto done;
5221 		}
5222 	} else {
5223 		/*
5224 		 * User specified address - blow away any previous mappings
5225 		 */
5226 		(void) as_unmap(as, *addrp, len);
5227 	}
5228 
5229 	vn_a.vp = vp;
5230 	vn_a.offset = off;
5231 	vn_a.type = (flags & MAP_TYPE);
5232 	vn_a.prot = (uchar_t)prot;
5233 	vn_a.maxprot = (uchar_t)maxprot;
5234 	vn_a.flags = (flags & ~MAP_TYPE);
5235 	vn_a.cred = cr;
5236 	vn_a.amp = NULL;
5237 	vn_a.szc = 0;
5238 	vn_a.lgrp_mem_policy_flags = 0;
5239 
5240 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5241 	as_rangeunlock(as);
5242 
5243 done:
5244 	nfs_rw_exit(&rp->r_lkserlock);
5245 	return (error);
5246 }
5247 
/*
 * Account for a new mapping of "vp": the number of pages spanned by
 * "len" is added to the rnode's r_mapcnt.
 *
 * Returns ENOSYS if the vnode is marked VNOMAP, EIO if the caller is
 * not in the zone of the mount, EINTR if the wait for r_rwlock was
 * interrupted, and 0 once the count has been updated.  Only "vp" and
 * "len" are used.
 */
5248 /* ARGSUSED */
5249 static int
5250 nfs3_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5251 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
5252 	cred_t *cr, caller_context_t *ct)
5253 {
5254 	rnode_t *rp;
5255 
5256 	if (vp->v_flag & VNOMAP)
5257 		return (ENOSYS);
5258 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5259 		return (EIO);
5260 
5261 	/*
5262 	 * Need to hold rwlock while incrementing the mapcnt so that
5263 	 * mmap'ing can be serialized with writes so that the caching
5264 	 * can be handled correctly.
5265 	 */
5266 	rp = VTOR(vp);
5267 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
5268 		return (EINTR);
5269 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
5270 	nfs_rw_exit(&rp->r_rwlock);
5271 
5272 	return (0);
5273 }
5274 
5275 /* ARGSUSED */
5276 static int
5277 nfs3_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5278 	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
5279 	caller_context_t *ct)
5280 {
5281 	netobj lm_fh3;
5282 	int rc;
5283 	u_offset_t start, end;
5284 	rnode_t *rp;
5285 	int error = 0, intr = INTR(vp);
5286 
5287 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5288 		return (EIO);
5289 	/* check for valid cmd parameter */
5290 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
5291 		return (EINVAL);
5292 
5293 	/* Verify l_type. */
5294 	switch (bfp->l_type) {
5295 	case F_RDLCK:
5296 		if (cmd != F_GETLK && !(flag & FREAD))
5297 			return (EBADF);
5298 		break;
5299 	case F_WRLCK:
5300 		if (cmd != F_GETLK && !(flag & FWRITE))
5301 			return (EBADF);
5302 		break;
5303 	case F_UNLCK:
5304 		intr = 0;
5305 		break;
5306 
5307 	default:
5308 		return (EINVAL);
5309 	}
5310 
5311 	/* check the validity of the lock range */
5312 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
5313 		return (rc);
5314 	if (rc = flk_check_lock_data(start, end, MAXEND))
5315 		return (rc);
5316 
5317 	/*
5318 	 * If the filesystem is mounted using local locking, pass the
5319 	 * request off to the local locking code.
5320 	 */
5321 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
5322 		if (cmd == F_SETLK || cmd == F_SETLKW) {
5323 			/*
5324 			 * For complete safety, we should be holding
5325 			 * r_lkserlock.  However, we can't call
5326 			 * lm_safelock and then fs_frlock while
5327 			 * holding r_lkserlock, so just invoke
5328 			 * lm_safelock and expect that this will
5329 			 * catch enough of the cases.
5330 			 */
5331 			if (!lm_safelock(vp, bfp, cr))
5332 				return (EAGAIN);
5333 		}
5334 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
5335 	}
5336 
5337 	rp = VTOR(vp);
5338 
5339 	/*
5340 	 * Check whether the given lock request can proceed, given the
5341 	 * current file mappings.
5342 	 */
5343 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
5344 		return (EINTR);
5345 	if (cmd == F_SETLK || cmd == F_SETLKW) {
5346 		if (!lm_safelock(vp, bfp, cr)) {
5347 			rc = EAGAIN;
5348 			goto done;
5349 		}
5350 	}
5351 
5352 	/*
5353 	 * Flush the cache after waiting for async I/O to finish.  For new
5354 	 * locks, this is so that the process gets the latest bits from the
5355 	 * server.  For unlocks, this is so that other clients see the
5356 	 * latest bits once the file has been unlocked.  If currently dirty
5357 	 * pages can't be flushed, then don't allow a lock to be set.  But
5358 	 * allow unlocks to succeed, to avoid having orphan locks on the
5359 	 * server.
5360 	 */
5361 	if (cmd != F_GETLK) {
5362 		mutex_enter(&rp->r_statelock);
5363 		while (rp->r_count > 0) {
5364 		    if (intr) {
5365 			klwp_t *lwp = ttolwp(curthread);
5366 
5367 			if (lwp != NULL)
5368 				lwp->lwp_nostop++;
5369 			if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) {
5370 				if (lwp != NULL)
5371 					lwp->lwp_nostop--;
5372 				rc = EINTR;
5373 				break;
5374 			}
5375 			if (lwp != NULL)
5376 				lwp->lwp_nostop--;
5377 		    } else
5378 			cv_wait(&rp->r_cv, &rp->r_statelock);
5379 		}
5380 		mutex_exit(&rp->r_statelock);
5381 		if (rc != 0)
5382 			goto done;
5383 		error = nfs3_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
5384 		if (error) {
5385 			if (error == ENOSPC || error == EDQUOT) {
5386 				mutex_enter(&rp->r_statelock);
5387 				if (!rp->r_error)
5388 					rp->r_error = error;
5389 				mutex_exit(&rp->r_statelock);
5390 			}
5391 			if (bfp->l_type != F_UNLCK) {
5392 				rc = ENOLCK;
5393 				goto done;
5394 			}
5395 		}
5396 	}
5397 
5398 	lm_fh3.n_len = VTOFH3(vp)->fh3_length;
5399 	lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
5400 
5401 	/*
5402 	 * Call the lock manager to do the real work of contacting
5403 	 * the server and obtaining the lock.
5404 	 */
5405 	rc = lm4_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh3, flk_cbp);
5406 
5407 	if (rc == 0)
5408 		nfs_lockcompletion(vp, cmd);
5409 
5410 done:
5411 	nfs_rw_exit(&rp->r_lkserlock);
5412 	return (rc);
5413 }
5414 
5415 /*
5416  * Free storage space associated with the specified vnode.  The portion
5417  * to be freed is specified by bfp->l_start and bfp->l_len (already
5418  * normalized to a "whence" of 0).
5419  *
5420  * This is an experimental facility whose continued existence is not
5421  * guaranteed.  Currently, we only support the special case
5422  * of l_len == 0, meaning free to end of file.
5423  */
5424 /* ARGSUSED */
5425 static int
5426 nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
5427 	offset_t offset, cred_t *cr, caller_context_t *ct)
5428 {
5429 	int error;
5430 
5431 	ASSERT(vp->v_type == VREG);
5432 	if (cmd != F_FREESP)
5433 		return (EINVAL);
5434 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5435 		return (EIO);
5436 
5437 	error = convoff(vp, bfp, 0, offset);
5438 	if (!error) {
5439 		ASSERT(bfp->l_start >= 0);
5440 		if (bfp->l_len == 0) {
5441 			struct vattr va;
5442 
5443 			/*
5444 			 * ftruncate should not change the ctime and
5445 			 * mtime if we truncate the file to its
5446 			 * previous size.
5447 			 */
5448 			va.va_mask = AT_SIZE;
5449 			error = nfs3getattr(vp, &va, cr);
5450 			if (error || va.va_size == bfp->l_start)
5451 				return (error);
5452 			va.va_mask = AT_SIZE;
5453 			va.va_size = bfp->l_start;
5454 			error = nfs3setattr(vp, &va, 0, cr);
5455 		} else
5456 			error = EINVAL;
5457 	}
5458 
5459 	return (error);
5460 }
5461 
5462 /* ARGSUSED */
5463 static int
5464 nfs3_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
5465 {
5466 
5467 	return (EINVAL);
5468 }
5469 
5470 /*
5471  * Setup and add an address space callback to do the work of the delmap call.
5472  * The callback will (and must be) deleted in the actual callback function.
5473  *
5474  * This is done in order to take care of the problem that we have with holding
5475  * the address space's a_lock for a long period of time (e.g. if the NFS server
5476  * is down).  Callbacks will be executed in the address space code while the
5477  * a_lock is not held.	Holding the address space's a_lock causes things such
5478  * as ps and fork to hang because they are trying to acquire this lock as well.
5479  */
5480 /* ARGSUSED */
5481 static int
5482 nfs3_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5483 	size_t len, uint_t prot, uint_t maxprot, uint_t flags,
5484 	cred_t *cr, caller_context_t *ct)
5485 {
5486 	int			caller_found;
5487 	int			error;
5488 	rnode_t			*rp;
5489 	nfs_delmap_args_t	*dmapp;
5490 	nfs_delmapcall_t	*delmap_call;
5491 
5492 	if (vp->v_flag & VNOMAP)
5493 		return (ENOSYS);
5494 	/*
5495 	 * A process may not change zones if it has NFS pages mmap'ed
5496 	 * in, so we can't legitimately get here from the wrong zone.
5497 	 */
5498 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5499 
5500 	rp = VTOR(vp);
5501 
5502 	/*
5503 	 * The way that the address space of this process deletes its mapping
5504 	 * of this file is via the following call chains:
5505 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5506 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs3_delmap()
5507 	 *
5508 	 * With the use of address space callbacks we are allowed to drop the
5509 	 * address space lock, a_lock, while executing the NFS operations that
5510 	 * need to go over the wire.  Returning EAGAIN to the caller of this
5511 	 * function is what drives the execution of the callback that we add
5512 	 * below.  The callback will be executed by the address space code
5513 	 * after dropping the a_lock.  When the callback is finished, since
5514 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
5515 	 * is called again on the same segment to finish the rest of the work
5516 	 * that needs to happen during unmapping.
5517 	 *
5518 	 * This action of calling back into the segment driver causes
5519 	 * nfs3_delmap() to get called again, but since the callback was
5520 	 * already executed at this point, it already did the work and there
5521 	 * is nothing left for us to do.
5522 	 *
5523 	 * To Summarize:
5524 	 * - The first time nfs3_delmap is called by the current thread is when
5525 	 * we add the caller associated with this delmap to the delmap caller
5526 	 * list, add the callback, and return EAGAIN.
5527 	 * - The second time in this call chain when nfs3_delmap is called we
5528 	 * will find this caller in the delmap caller list and realize there
5529 	 * is no more work to do thus removing this caller from the list and
5530 	 * returning the error that was set in the callback execution.
5531 	 */
5532 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
5533 	if (caller_found) {
5534 		/*
5535 		 * 'error' is from the actual delmap operations.  To avoid
5536 		 * hangs, we need to handle the return of EAGAIN differently
5537 		 * since this is what drives the callback execution.
5538 		 * In this case, we don't want to return EAGAIN and do the
5539 		 * callback execution because there are none to execute.
5540 		 */
5541 		if (error == EAGAIN)
5542 			return (0);
5543 		else
5544 			return (error);
5545 	}
5546 
5547 	/* current caller was not in the list */
5548 	delmap_call = nfs_init_delmapcall();
5549 
5550 	mutex_enter(&rp->r_statelock);
5551 	list_insert_tail(&rp->r_indelmap, delmap_call);
5552 	mutex_exit(&rp->r_statelock);
5553 
5554 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
5555 
5556 	dmapp->vp = vp;
5557 	dmapp->off = off;
5558 	dmapp->addr = addr;
5559 	dmapp->len = len;
5560 	dmapp->prot = prot;
5561 	dmapp->maxprot = maxprot;
5562 	dmapp->flags = flags;
5563 	dmapp->cr = cr;
5564 	dmapp->caller = delmap_call;
5565 
5566 	error = as_add_callback(as, nfs3_delmap_callback, dmapp,
5567 	AS_UNMAP_EVENT, addr, len, KM_SLEEP);
5568 
5569 	return (error ? error : EAGAIN);
5570 }
5571 
5572 /*
5573  * Remove some pages from an mmap'd vnode.  Just update the
5574  * count of pages.  If doing close-to-open, then flush and
5575  * commit all of the pages associated with this file.
5576  * Otherwise, start an asynchronous page flush to write out
5577  * any dirty pages.  This will also associate a credential
5578  * with the rnode which can be used to write the pages.
5579  */
5580 /* ARGSUSED */
5581 static void
5582 nfs3_delmap_callback(struct as *as, void *arg, uint_t event)
5583 {
5584 	int			error;
5585 	rnode_t			*rp;
5586 	mntinfo_t		*mi;
5587 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
5588 
5589 	rp = VTOR(dmapp->vp);
5590 	mi = VTOMI(dmapp->vp);
5591 
5592 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
5593 	ASSERT(rp->r_mapcnt >= 0);
5594 
5595 	/*
5596 	 * Initiate a page flush and potential commit if there are
5597 	 * pages, the file system was not mounted readonly, the segment
5598 	 * was mapped shared, and the pages themselves were writeable.
5599 	 */
5600 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
5601 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
5602 		mutex_enter(&rp->r_statelock);
5603 		rp->r_flags |= RDIRTY;
5604 		mutex_exit(&rp->r_statelock);
5605 		/*
5606 		 * If this is a cross-zone access a sync putpage won't work, so
5607 		 * the best we can do is try an async putpage.  That seems
5608 		 * better than something more draconian such as discarding the
5609 		 * dirty pages.
5610 		 */
5611 		if ((mi->mi_flags & MI_NOCTO) ||
5612 		    nfs_zone() != mi->mi_zone)
5613 			error = nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5614 			    B_ASYNC, dmapp->cr, NULL);
5615 		else
5616 			error = nfs3_putpage_commit(dmapp->vp, dmapp->off,
5617 			    dmapp->len, dmapp->cr);
5618 		if (!error) {
5619 			mutex_enter(&rp->r_statelock);
5620 			error = rp->r_error;
5621 			rp->r_error = 0;
5622 			mutex_exit(&rp->r_statelock);
5623 		}
5624 	} else
5625 		error = 0;
5626 
5627 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
5628 		(void) nfs3_putpage(dmapp->vp, dmapp->off, dmapp->len,
5629 		    B_INVAL, dmapp->cr, NULL);
5630 
5631 	dmapp->caller->error = error;
5632 	(void) as_delete_callback(as, arg);
5633 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
5634 }
5635 
5636 static int nfs3_pathconf_disable_cache = 0;
5637 
5638 #ifdef DEBUG
5639 static int nfs3_pathconf_cache_hits = 0;
5640 static int nfs3_pathconf_cache_misses = 0;
5641 #endif
5642 
5643 /* ARGSUSED */
5644 static int
5645 nfs3_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5646 	caller_context_t *ct)
5647 {
5648 	int error;
5649 	PATHCONF3args args;
5650 	PATHCONF3res res;
5651 	int douprintf;
5652 	failinfo_t fi;
5653 	rnode_t *rp;
5654 	hrtime_t t;
5655 
5656 	if (nfs_zone() != VTOMI(vp)->mi_zone)
5657 		return (EIO);
5658 	/*
5659 	 * Large file spec - need to base answer on info stored
5660 	 * on original FSINFO response.
5661 	 */
5662 	if (cmd == _PC_FILESIZEBITS) {
5663 		unsigned long long ll;
5664 		long l = 1;
5665 
5666 		ll = VTOMI(vp)->mi_maxfilesize;
5667 
5668 		if (ll == 0) {
5669 			*valp = 0;
5670 			return (0);
5671 		}
5672 
5673 		if (ll & 0xffffffff00000000) {
5674 			l += 32; ll >>= 32;
5675 		}
5676 		if (ll & 0xffff0000) {
5677 			l += 16; ll >>= 16;
5678 		}
5679 		if (ll & 0xff00) {
5680 			l += 8; ll >>= 8;
5681 		}
5682 		if (ll & 0xf0) {
5683 			l += 4; ll >>= 4;
5684 		}
5685 		if (ll & 0xc) {
5686 			l += 2; ll >>= 2;
5687 		}
5688 		if (ll & 0x2)
5689 			l += 2;
5690 		else if (ll & 0x1)
5691 			l += 1;
5692 		*valp = l;
5693 		return (0);
5694 	}
5695 
5696 	if (cmd == _PC_ACL_ENABLED) {
5697 		*valp = _ACL_ACLENT_ENABLED;
5698 		return (0);
5699 	}
5700 
5701 	if (cmd == _PC_XATTR_EXISTS) {
5702 		error = 0;
5703 		*valp = 0;
5704 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5705 			vnode_t *avp;
5706 			rnode_t *rp;
5707 			int error = 0;
5708 			mntinfo_t *mi = VTOMI(vp);
5709 
5710 			if (!(mi->mi_flags & MI_EXTATTR))
5711 				return (0);
5712 
5713 			rp = VTOR(vp);
5714 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
5715 			    INTR(vp)))
5716 				return (EINTR);
5717 
5718 			error = nfs3lookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
5719 			if (error || avp == NULL)
5720 				error = acl_getxattrdir3(vp, &avp, 0, cr, 0);
5721 
5722 			nfs_rw_exit(&rp->r_rwlock);
5723 
5724 			if (error == 0 && avp != NULL) {
5725 				VN_RELE(avp);
5726 				*valp = 1;
5727 			} else if (error == ENOENT)
5728 				error = 0;
5729 		}
5730 		return (error);
5731 	}
5732 
5733 	rp = VTOR(vp);
5734 	if (rp->r_pathconf != NULL) {
5735 		mutex_enter(&rp->r_statelock);
5736 		if (rp->r_pathconf != NULL && nfs3_pathconf_disable_cache) {
5737 			kmem_free(rp->r_pathconf, sizeof (*rp->r_pathconf));
5738 			rp->r_pathconf = NULL;
5739 		}
5740 		if (rp->r_pathconf != NULL) {
5741 			error = 0;
5742 			switch (cmd) {
5743 			case _PC_LINK_MAX:
5744 				*valp = rp->r_pathconf->link_max;
5745 				break;
5746 			case _PC_NAME_MAX:
5747 				*valp = rp->r_pathconf->name_max;
5748 				break;
5749 			case _PC_PATH_MAX:
5750 			case _PC_SYMLINK_MAX:
5751 				*valp = MAXPATHLEN;
5752 				break;
5753 			case _PC_CHOWN_RESTRICTED:
5754 				*valp = rp->r_pathconf->chown_restricted;
5755 				break;
5756 			case _PC_NO_TRUNC:
5757 				*valp = rp->r_pathconf->no_trunc;
5758 				break;
5759 			default:
5760 				error = EINVAL;
5761 				break;
5762 			}
5763 			mutex_exit(&rp->r_statelock);
5764 #ifdef DEBUG
5765 			nfs3_pathconf_cache_hits++;
5766 #endif
5767 			return (error);
5768 		}
5769 		mutex_exit(&rp->r_statelock);
5770 	}
5771 #ifdef DEBUG
5772 	nfs3_pathconf_cache_misses++;
5773 #endif
5774 
5775 	args.object = *VTOFH3(vp);
5776 	fi.vp = vp;
5777 	fi.fhp = (caddr_t)&args.object;
5778 	fi.copyproc = nfs3copyfh;
5779 	fi.lookupproc = nfs3lookup;
5780 	fi.xattrdirproc = acl_getxattrdir3;
5781 
5782 	douprintf = 1;
5783 
5784 	t = gethrtime();
5785 
5786 	error = rfs3call(VTOMI(vp), NFSPROC3_PATHCONF,
5787 	    xdr_nfs_fh3, (caddr_t)&args,
5788 	    xdr_PATHCONF3res, (caddr_t)&res, cr,
5789 	    &douprintf, &res.status, 0, &fi);
5790 
5791 	if (error)
5792 		return (error);
5793 
5794 	error = geterrno3(res.status);
5795 
5796 	if (!error) {
5797 		nfs3_cache_post_op_attr(vp, &res.resok.obj_attributes, t, cr);
5798 		if (!nfs3_pathconf_disable_cache) {
5799 			mutex_enter(&rp->r_statelock);
5800 			if (rp->r_pathconf == NULL) {
5801 				rp->r_pathconf = kmem_alloc(
5802 				    sizeof (*rp->r_pathconf), KM_NOSLEEP);
5803 				if (rp->r_pathconf != NULL)
5804 					*rp->r_pathconf = res.resok.info;
5805 			}
5806 			mutex_exit(&rp->r_statelock);
5807 		}
5808 		switch (cmd) {
5809 		case _PC_LINK_MAX:
5810 			*valp = res.resok.info.link_max;
5811 			break;
5812 		case _PC_NAME_MAX:
5813 			*valp = res.resok.info.name_max;
5814 			break;
5815 		case _PC_PATH_MAX:
5816 		case _PC_SYMLINK_MAX:
5817 			*valp = MAXPATHLEN;
5818 			break;
5819 		case _PC_CHOWN_RESTRICTED:
5820 			*valp = res.resok.info.chown_restricted;
5821 			break;
5822 		case _PC_NO_TRUNC:
5823 			*valp = res.resok.info.no_trunc;
5824 			break;
5825 		default:
5826 			return (EINVAL);
5827 		}
5828 	} else {
5829 		nfs3_cache_post_op_attr(vp, &res.resfail.obj_attributes, t, cr);
5830 		PURGE_STALE_FH(error, vp, cr);
5831 	}
5832 
5833 	return (error);
5834 }
5835 
5836 /*
5837  * Called by async thread to do synchronous pageio. Do the i/o, wait
5838  * for it to complete, and cleanup the page list when done.
5839  */
5840 static int
5841 nfs3_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5842 	int flags, cred_t *cr)
5843 {
5844 	int error;
5845 
5846 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
5847 	error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5848 	if (flags & B_READ)
5849 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
5850 	else
5851 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
5852 	return (error);
5853 }
5854 
5855 /* ARGSUSED */
5856 static int
5857 nfs3_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5858 	int flags, cred_t *cr, caller_context_t *ct)
5859 {
5860 	int error;
5861 	rnode_t *rp;
5862 
5863 	if (pp == NULL)
5864 		return (EINVAL);
5865 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
5866 		return (EIO);
5867 
5868 	rp = VTOR(vp);
5869 	mutex_enter(&rp->r_statelock);
5870 	rp->r_count++;
5871 	mutex_exit(&rp->r_statelock);
5872 
5873 	if (flags & B_ASYNC) {
5874 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
5875 		    nfs3_sync_pageio);
5876 	} else
5877 		error = nfs3_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
5878 	mutex_enter(&rp->r_statelock);
5879 	rp->r_count--;
5880 	cv_broadcast(&rp->r_cv);
5881 	mutex_exit(&rp->r_statelock);
5882 	return (error);
5883 }
5884 
5885 /* ARGSUSED */
5886 static void
5887 nfs3_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
5888 	caller_context_t *ct)
5889 {
5890 	int error;
5891 	rnode_t *rp;
5892 	page_t *plist;
5893 	page_t *pptr;
5894 	offset3 offset;
5895 	count3 len;
5896 	k_sigset_t smask;
5897 
5898 	/*
5899 	 * We should get called with fl equal to either B_FREE or
5900 	 * B_INVAL.  Any other value is illegal.
5901 	 *
5902 	 * The page that we are either supposed to free or destroy
5903 	 * should be exclusive locked and its io lock should not
5904 	 * be held.
5905 	 */
5906 	ASSERT(fl == B_FREE || fl == B_INVAL);
5907 	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
5908 	rp = VTOR(vp);
5909 
5910 	/*
5911 	 * If the page doesn't need to be committed or we shouldn't
5912 	 * even bother attempting to commit it, then just make sure
5913 	 * that the p_fsdata byte is clear and then either free or
5914 	 * destroy the page as appropriate.
5915 	 */
5916 	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & RSTALE)) {
5917 		pp->p_fsdata = C_NOCOMMIT;
5918 		if (fl == B_FREE)
5919 			page_free(pp, dn);
5920 		else
5921 			page_destroy(pp, dn);
5922 		return;
5923 	}
5924 
5925 	/*
5926 	 * If there is a page invalidation operation going on, then
5927 	 * if this is one of the pages being destroyed, then just
5928 	 * clear the p_fsdata byte and then either free or destroy
5929 	 * the page as appropriate.
5930 	 */
5931 	mutex_enter(&rp->r_statelock);
5932 	if ((rp->r_flags & RTRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
5933 		mutex_exit(&rp->r_statelock);
5934 		pp->p_fsdata = C_NOCOMMIT;
5935 		if (fl == B_FREE)
5936 			page_free(pp, dn);
5937 		else
5938 			page_destroy(pp, dn);
5939 		return;
5940 	}
5941 
5942 	/*
5943 	 * If we are freeing this page and someone else is already
5944 	 * waiting to do a commit, then just unlock the page and
5945 	 * return.  That other thread will take care of commiting
5946 	 * this page.  The page can be freed sometime after the
5947 	 * commit has finished.  Otherwise, if the page is marked
5948 	 * as delay commit, then we may be getting called from
5949 	 * pvn_write_done, one page at a time.   This could result
5950 	 * in one commit per page, so we end up doing lots of small
5951 	 * commits instead of fewer larger commits.  This is bad,
5952 	 * we want do as few commits as possible.
5953 	 */
5954 	if (fl == B_FREE) {
5955 		if (rp->r_flags & RCOMMITWAIT) {
5956 			page_unlock(pp);
5957 			mutex_exit(&rp->r_statelock);
5958 			return;
5959 		}
5960 		if (pp->p_fsdata == C_DELAYCOMMIT) {
5961 			pp->p_fsdata = C_COMMIT;
5962 			page_unlock(pp);
5963 			mutex_exit(&rp->r_statelock);
5964 			return;
5965 		}
5966 	}
5967 
5968 	/*
5969 	 * Check to see if there is a signal which would prevent an
5970 	 * attempt to commit the pages from being successful.  If so,
5971 	 * then don't bother with all of the work to gather pages and
5972 	 * generate the unsuccessful RPC.  Just return from here and
5973 	 * let the page be committed at some later time.
5974 	 */
5975 	sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
5976 	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
5977 		sigunintr(&smask);
5978 		page_unlock(pp);
5979 		mutex_exit(&rp->r_statelock);
5980 		return;
5981 	}
5982 	sigunintr(&smask);
5983 
5984 	/*
5985 	 * We are starting to need to commit pages, so let's try
5986 	 * to commit as many as possible at once to reduce the
5987 	 * overhead.
5988 	 *
5989 	 * Set the `commit inprogress' state bit.  We must
5990 	 * first wait until any current one finishes.  Then
5991 	 * we initialize the c_pages list with this page.
5992 	 */
5993 	while (rp->r_flags & RCOMMIT) {
5994 		rp->r_flags |= RCOMMITWAIT;
5995 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
5996 		rp->r_flags &= ~RCOMMITWAIT;
5997 	}
5998 	rp->r_flags |= RCOMMIT;
5999 	mutex_exit(&rp->r_statelock);
6000 	ASSERT(rp->r_commit.c_pages == NULL);
6001 	rp->r_commit.c_pages = pp;
6002 	rp->r_commit.c_commbase = (offset3)pp->p_offset;
6003 	rp->r_commit.c_commlen = PAGESIZE;
6004 
6005 	/*
6006 	 * Gather together all other pages which can be committed.
6007 	 * They will all be chained off r_commit.c_pages.
6008 	 */
6009 	nfs3_get_commit(vp);
6010 
6011 	/*
6012 	 * Clear the `commit inprogress' status and disconnect
6013 	 * the list of pages to be committed from the rnode.
6014 	 * At this same time, we also save the starting offset
6015 	 * and length of data to be committed on the server.
6016 	 */
6017 	plist = rp->r_commit.c_pages;
6018 	rp->r_commit.c_pages = NULL;
6019 	offset = rp->r_commit.c_commbase;
6020 	len = rp->r_commit.c_commlen;
6021 	mutex_enter(&rp->r_statelock);
6022 	rp->r_flags &= ~RCOMMIT;
6023 	cv_broadcast(&rp->r_commit.c_cv);
6024 	mutex_exit(&rp->r_statelock);
6025 
6026 	if (curproc == proc_pageout || curproc == proc_fsflush ||
6027 	    nfs_zone() != VTOMI(vp)->mi_zone) {
6028 		nfs_async_commit(vp, plist, offset, len, cr, nfs3_async_commit);
6029 		return;
6030 	}
6031 
6032 	/*
6033 	 * Actually generate the COMMIT3 over the wire operation.
6034 	 */
6035 	error = nfs3_commit(vp, offset, len, cr);
6036 
6037 	/*
6038 	 * If we got an error during the commit, just unlock all
6039 	 * of the pages.  The pages will get retransmitted to the
6040 	 * server during a putpage operation.
6041 	 */
6042 	if (error) {
6043 		while (plist != NULL) {
6044 			pptr = plist;
6045 			page_sub(&plist, pptr);
6046 			page_unlock(pptr);
6047 		}
6048 		return;
6049 	}
6050 
6051 	/*
6052 	 * We've tried as hard as we can to commit the data to stable
6053 	 * storage on the server.  We release the rest of the pages
6054 	 * and clear the commit required state.  They will be put
6055 	 * onto the tail of the cachelist if they are nolonger
6056 	 * mapped.
6057 	 */
6058 	while (plist != pp) {
6059 		pptr = plist;
6060 		page_sub(&plist, pptr);
6061 		pptr->p_fsdata = C_NOCOMMIT;
6062 		(void) page_release(pptr, 1);
6063 	}
6064 
6065 	/*
6066 	 * It is possible that nfs3_commit didn't return error but
6067 	 * some other thread has modified the page we are going
6068 	 * to free/destroy.
6069 	 *    In this case we need to rewrite the page. Do an explicit check
6070 	 * before attempting to free/destroy the page. If modified, needs to
6071 	 * be rewritten so unlock the page and return.
6072 	 */
6073 	if (hat_ismod(pp)) {
6074 		pp->p_fsdata = C_NOCOMMIT;
6075 		page_unlock(pp);
6076 		return;
6077 	}
6078 
6079 	/*
6080 	 * Now, as appropriate, either free or destroy the page
6081 	 * that we were called with.
6082 	 */
6083 	pp->p_fsdata = C_NOCOMMIT;
6084 	if (fl == B_FREE)
6085 		page_free(pp, dn);
6086 	else
6087 		page_destroy(pp, dn);
6088 }
6089 
6090 static int
6091 nfs3_commit(vnode_t *vp, offset3 offset, count3 count, cred_t *cr)
6092 {
6093 	int error;
6094 	rnode_t *rp;
6095 	COMMIT3args args;
6096 	COMMIT3res res;
6097 	int douprintf;
6098 	cred_t *cred;
6099 
6100 	rp = VTOR(vp);
6101 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6102 
6103 	mutex_enter(&rp->r_statelock);
6104 	if (rp->r_cred != NULL) {
6105 		cred = rp->r_cred;
6106 		crhold(cred);
6107 	} else {
6108 		rp->r_cred = cr;
6109 		crhold(cr);
6110 		cred = cr;
6111 		crhold(cred);
6112 	}
6113 	mutex_exit(&rp->r_statelock);
6114 
6115 	args.file = *VTOFH3(vp);
6116 	args.offset = offset;
6117 	args.count = count;
6118 
6119 doitagain:
6120 	douprintf = 1;
6121 	error = rfs3call(VTOMI(vp), NFSPROC3_COMMIT,
6122 	    xdr_COMMIT3args, (caddr_t)&args,
6123 	    xdr_COMMIT3res, (caddr_t)&res, cred,
6124 	    &douprintf, &res.status, 0, NULL);
6125 
6126 	crfree(cred);
6127 
6128 	if (error)
6129 		return (error);
6130 
6131 	error = geterrno3(res.status);
6132 	if (!error) {
6133 		ASSERT(rp->r_flags & RHAVEVERF);
6134 		mutex_enter(&rp->r_statelock);
6135 		if (rp->r_verf == res.resok.verf) {
6136 			mutex_exit(&rp->r_statelock);
6137 			return (0);
6138 		}
6139 		nfs3_set_mod(vp);
6140 		rp->r_verf = res.resok.verf;
6141 		mutex_exit(&rp->r_statelock);
6142 		error = NFS_VERF_MISMATCH;
6143 	} else {
6144 		if (error == EACCES) {
6145 			mutex_enter(&rp->r_statelock);
6146 			if (cred != cr) {
6147 				if (rp->r_cred != NULL)
6148 					crfree(rp->r_cred);
6149 				rp->r_cred = cr;
6150 				crhold(cr);
6151 				cred = cr;
6152 				crhold(cred);
6153 				mutex_exit(&rp->r_statelock);
6154 				goto doitagain;
6155 			}
6156 			mutex_exit(&rp->r_statelock);
6157 		}
6158 		/*
6159 		 * Can't do a PURGE_STALE_FH here because this
6160 		 * can cause a deadlock.  nfs3_commit can
6161 		 * be called from nfs3_dispose which can be called
6162 		 * indirectly via pvn_vplist_dirty.  PURGE_STALE_FH
6163 		 * can call back to pvn_vplist_dirty.
6164 		 */
6165 		if (error == ESTALE) {
6166 			mutex_enter(&rp->r_statelock);
6167 			rp->r_flags |= RSTALE;
6168 			if (!rp->r_error)
6169 				rp->r_error = error;
6170 			mutex_exit(&rp->r_statelock);
6171 			PURGE_ATTRCACHE(vp);
6172 		} else {
6173 			mutex_enter(&rp->r_statelock);
6174 			if (!rp->r_error)
6175 				rp->r_error = error;
6176 			mutex_exit(&rp->r_statelock);
6177 		}
6178 	}
6179 
6180 	return (error);
6181 }
6182 
6183 static void
6184 nfs3_set_mod(vnode_t *vp)
6185 {
6186 	page_t *pp;
6187 	kmutex_t *vphm;
6188 
6189 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6190 	vphm = page_vnode_mutex(vp);
6191 	mutex_enter(vphm);
6192 	if ((pp = vp->v_pages) != NULL) {
6193 		do {
6194 			if (pp->p_fsdata != C_NOCOMMIT) {
6195 				hat_setmod(pp);
6196 				pp->p_fsdata = C_NOCOMMIT;
6197 			}
6198 		} while ((pp = pp->p_vpnext) != vp->v_pages);
6199 	}
6200 	mutex_exit(vphm);
6201 }
6202 
6203 
6204 /*
6205  * This routine is used to gather together a page list of the pages
6206  * which are to be committed on the server.  This routine must not
6207  * be called if the calling thread holds any locked pages.
6208  *
6209  * The calling thread must have set RCOMMIT.  This bit is used to
6210  * serialize access to the commit structure in the rnode.  As long
6211  * as the thread has set RCOMMIT, then it can manipulate the commit
6212  * structure without requiring any other locks.
6213  */
6214 static void
6215 nfs3_get_commit(vnode_t *vp)
6216 {
6217 	rnode_t *rp;
6218 	page_t *pp;
6219 	kmutex_t *vphm;
6220 
6221 	rp = VTOR(vp);
6222 
6223 	ASSERT(rp->r_flags & RCOMMIT);
6224 
6225 	vphm = page_vnode_mutex(vp);
6226 	mutex_enter(vphm);
6227 
6228 	/*
6229 	 * If there are no pages associated with this vnode, then
6230 	 * just return.
6231 	 */
6232 	if ((pp = vp->v_pages) == NULL) {
6233 		mutex_exit(vphm);
6234 		return;
6235 	}
6236 
6237 	/*
6238 	 * Step through all of the pages associated with this vnode
6239 	 * looking for pages which need to be committed.
6240 	 */
6241 	do {
6242 		/*
6243 		 * If this page does not need to be committed or is
6244 		 * modified, then just skip it.
6245 		 */
6246 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
6247 			continue;
6248 
6249 		/*
6250 		 * Attempt to lock the page.  If we can't, then
6251 		 * someone else is messing with it and we will
6252 		 * just skip it.
6253 		 */
6254 		if (!page_trylock(pp, SE_EXCL))
6255 			continue;
6256 
6257 		/*
6258 		 * If this page does not need to be committed or is
6259 		 * modified, then just skip it.  Recheck now that
6260 		 * the page is locked.
6261 		 */
6262 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6263 			page_unlock(pp);
6264 			continue;
6265 		}
6266 
6267 		if (PP_ISFREE(pp)) {
6268 			cmn_err(CE_PANIC, "nfs3_get_commit: %p is free",
6269 			    (void *)pp);
6270 		}
6271 
6272 		/*
6273 		 * The page needs to be committed and we locked it.
6274 		 * Update the base and length parameters and add it
6275 		 * to r_pages.
6276 		 */
6277 		if (rp->r_commit.c_pages == NULL) {
6278 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
6279 			rp->r_commit.c_commlen = PAGESIZE;
6280 		} else if (pp->p_offset < rp->r_commit.c_commbase) {
6281 			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
6282 			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
6283 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
6284 		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
6285 			    <= pp->p_offset) {
6286 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
6287 			    rp->r_commit.c_commbase + PAGESIZE;
6288 		}
6289 		page_add(&rp->r_commit.c_pages, pp);
6290 	} while ((pp = pp->p_vpnext) != vp->v_pages);
6291 
6292 	mutex_exit(vphm);
6293 }
6294 
6295 /*
6296  * This routine is used to gather together a page list of the pages
6297  * which are to be committed on the server.  This routine must not
6298  * be called if the calling thread holds any locked pages.
6299  *
6300  * The calling thread must have set RCOMMIT.  This bit is used to
6301  * serialize access to the commit structure in the rnode.  As long
6302  * as the thread has set RCOMMIT, then it can manipulate the commit
6303  * structure without requiring any other locks.
6304  */
6305 static void
6306 nfs3_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
6307 {
6308 
6309 	rnode_t *rp;
6310 	page_t *pp;
6311 	u_offset_t end;
6312 	u_offset_t off;
6313 
6314 	ASSERT(len != 0);
6315 
6316 	rp = VTOR(vp);
6317 
6318 	ASSERT(rp->r_flags & RCOMMIT);
6319 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6320 
6321 	/*
6322 	 * If there are no pages associated with this vnode, then
6323 	 * just return.
6324 	 */
6325 	if ((pp = vp->v_pages) == NULL)
6326 		return;
6327 
6328 	/*
6329 	 * Calculate the ending offset.
6330 	 */
6331 	end = soff + len;
6332 
6333 	for (off = soff; off < end; off += PAGESIZE) {
6334 		/*
6335 		 * Lookup each page by vp, offset.
6336 		 */
6337 		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
6338 			continue;
6339 
6340 		/*
6341 		 * If this page does not need to be committed or is
6342 		 * modified, then just skip it.
6343 		 */
6344 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
6345 			page_unlock(pp);
6346 			continue;
6347 		}
6348 
6349 		ASSERT(PP_ISFREE(pp) == 0);
6350 
6351 		/*
6352 		 * The page needs to be committed and we locked it.
6353 		 * Update the base and length parameters and add it
6354 		 * to r_pages.
6355 		 */
6356 		if (rp->r_commit.c_pages == NULL) {
6357 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
6358 			rp->r_commit.c_commlen = PAGESIZE;
6359 		} else {
6360 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
6361 					rp->r_commit.c_commbase + PAGESIZE;
6362 		}
6363 		page_add(&rp->r_commit.c_pages, pp);
6364 	}
6365 }
6366 
#if 0	/* unused */
#ifdef DEBUG
/*
 * Debug-only check, currently compiled out.  Walks the vnode's page
 * ring while holding the vnode page mutex and reports whether every
 * page is marked C_NOCOMMIT.
 *
 * Returns 1 if no page has p_fsdata other than C_NOCOMMIT (including
 * the case of a vnode with no pages), 0 as soon as one is found.
 */
static int
nfs3_no_uncommitted_pages(vnode_t *vp)
{
	kmutex_t *vphm;
	page_t *pp;
	int all_clean = 1;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	pp = vp->v_pages;
	if (pp != NULL) {
		for (;;) {
			if (pp->p_fsdata != C_NOCOMMIT) {
				all_clean = 0;
				break;
			}

			/* The list is circular; stop once back at the head. */
			pp = pp->p_vpnext;
			if (pp == vp->v_pages)
				break;
		}
	}

	mutex_exit(vphm);

	return (all_clean);
}
#endif
#endif
6391 
6392 static int
6393 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
6394 {
6395 	int error;
6396 	writeverf3 write_verf;
6397 	rnode_t *rp = VTOR(vp);
6398 
6399 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6400 	/*
6401 	 * Flush the data portion of the file and then commit any
6402 	 * portions which need to be committed.  This may need to
6403 	 * be done twice if the server has changed state since
6404 	 * data was last written.  The data will need to be
6405 	 * rewritten to the server and then a new commit done.
6406 	 *
6407 	 * In fact, this may need to be done several times if the
6408 	 * server is having problems and crashing while we are
6409 	 * attempting to do this.
6410 	 */
6411 
6412 top:
6413 	/*
6414 	 * Do a flush based on the poff and plen arguments.  This
6415 	 * will asynchronously write out any modified pages in the
6416 	 * range specified by (poff, plen).  This starts all of the
6417 	 * i/o operations which will be waited for in the next
6418 	 * call to nfs3_putpage
6419 	 */
6420 
6421 	mutex_enter(&rp->r_statelock);
6422 	write_verf = rp->r_verf;
6423 	mutex_exit(&rp->r_statelock);
6424 
6425 	error = nfs3_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
6426 	if (error == EAGAIN)
6427 		error = 0;
6428 
6429 	/*
6430 	 * Do a flush based on the poff and plen arguments.  This
6431 	 * will synchronously write out any modified pages in the
6432 	 * range specified by (poff, plen) and wait until all of
6433 	 * the asynchronous i/o's in that range are done as well.
6434 	 */
6435 	if (!error)
6436 		error = nfs3_putpage(vp, poff, plen, 0, cr, NULL);
6437 
6438 	if (error)
6439 		return (error);
6440 
6441 	mutex_enter(&rp->r_statelock);
6442 	if (rp->r_verf != write_verf) {
6443 		mutex_exit(&rp->r_statelock);
6444 		goto top;
6445 	}
6446 	mutex_exit(&rp->r_statelock);
6447 
6448 	/*
6449 	 * Now commit any pages which might need to be committed.
6450 	 * If the error, NFS_VERF_MISMATCH, is returned, then
6451 	 * start over with the flush operation.
6452 	 */
6453 
6454 	error = nfs3_commit_vp(vp, poff, plen, cr);
6455 
6456 	if (error == NFS_VERF_MISMATCH)
6457 		goto top;
6458 
6459 	return (error);
6460 }
6461 
6462 static int
6463 nfs3_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, cred_t *cr)
6464 {
6465 	rnode_t *rp;
6466 	page_t *plist;
6467 	offset3 offset;
6468 	count3 len;
6469 
6470 
6471 	rp = VTOR(vp);
6472 
6473 	if (nfs_zone() != VTOMI(vp)->mi_zone)
6474 		return (EIO);
6475 	/*
6476 	 * Set the `commit inprogress' state bit.  We must
6477 	 * first wait until any current one finishes.
6478 	 */
6479 	mutex_enter(&rp->r_statelock);
6480 	while (rp->r_flags & RCOMMIT) {
6481 		rp->r_flags |= RCOMMITWAIT;
6482 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
6483 		rp->r_flags &= ~RCOMMITWAIT;
6484 	}
6485 	rp->r_flags |= RCOMMIT;
6486 	mutex_exit(&rp->r_statelock);
6487 
6488 	/*
6489 	 * Gather together all of the pages which need to be
6490 	 * committed.
6491 	 */
6492 	if (plen == 0)
6493 		nfs3_get_commit(vp);
6494 	else
6495 		nfs3_get_commit_range(vp, poff, plen);
6496 
6497 	/*
6498 	 * Clear the `commit inprogress' bit and disconnect the
6499 	 * page list which was gathered together in nfs3_get_commit.
6500 	 */
6501 	plist = rp->r_commit.c_pages;
6502 	rp->r_commit.c_pages = NULL;
6503 	offset = rp->r_commit.c_commbase;
6504 	len = rp->r_commit.c_commlen;
6505 	mutex_enter(&rp->r_statelock);
6506 	rp->r_flags &= ~RCOMMIT;
6507 	cv_broadcast(&rp->r_commit.c_cv);
6508 	mutex_exit(&rp->r_statelock);
6509 
6510 	/*
6511 	 * If any pages need to be committed, commit them and
6512 	 * then unlock them so that they can be freed some
6513 	 * time later.
6514 	 */
6515 	if (plist != NULL) {
6516 		/*
6517 		 * No error occurred during the flush portion
6518 		 * of this operation, so now attempt to commit
6519 		 * the data to stable storage on the server.
6520 		 *
6521 		 * This will unlock all of the pages on the list.
6522 		 */
6523 		return (nfs3_sync_commit(vp, plist, offset, len, cr));
6524 	}
6525 	return (0);
6526 }
6527 
6528 static int
6529 nfs3_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6530 	cred_t *cr)
6531 {
6532 	int error;
6533 	page_t *pp;
6534 
6535 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6536 	error = nfs3_commit(vp, offset, count, cr);
6537 
6538 	/*
6539 	 * If we got an error, then just unlock all of the pages
6540 	 * on the list.
6541 	 */
6542 	if (error) {
6543 		while (plist != NULL) {
6544 			pp = plist;
6545 			page_sub(&plist, pp);
6546 			page_unlock(pp);
6547 		}
6548 		return (error);
6549 	}
6550 	/*
6551 	 * We've tried as hard as we can to commit the data to stable
6552 	 * storage on the server.  We just unlock the pages and clear
6553 	 * the commit required state.  They will get freed later.
6554 	 */
6555 	while (plist != NULL) {
6556 		pp = plist;
6557 		page_sub(&plist, pp);
6558 		pp->p_fsdata = C_NOCOMMIT;
6559 		page_unlock(pp);
6560 	}
6561 
6562 	return (error);
6563 }
6564 
6565 static void
6566 nfs3_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
6567 	cred_t *cr)
6568 {
6569 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
6570 	(void) nfs3_sync_commit(vp, plist, offset, count, cr);
6571 }
6572 
6573 /* ARGSUSED */
6574 static int
6575 nfs3_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6576 	caller_context_t *ct)
6577 {
6578 	int error;
6579 	mntinfo_t *mi;
6580 
6581 	mi = VTOMI(vp);
6582 
6583 	if (nfs_zone() != mi->mi_zone)
6584 		return (EIO);
6585 
6586 	if (mi->mi_flags & MI_ACL) {
6587 		error = acl_setacl3(vp, vsecattr, flag, cr);
6588 		if (mi->mi_flags & MI_ACL)
6589 			return (error);
6590 	}
6591 
6592 	return (ENOSYS);
6593 }
6594 
6595 /* ARGSUSED */
6596 static int
6597 nfs3_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
6598 	caller_context_t *ct)
6599 {
6600 	int error;
6601 	mntinfo_t *mi;
6602 
6603 	mi = VTOMI(vp);
6604 
6605 	if (nfs_zone() != mi->mi_zone)
6606 		return (EIO);
6607 
6608 	if (mi->mi_flags & MI_ACL) {
6609 		error = acl_getacl3(vp, vsecattr, flag, cr);
6610 		if (mi->mi_flags & MI_ACL)
6611 			return (error);
6612 	}
6613 
6614 	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
6615 }
6616 
6617 /* ARGSUSED */
6618 static int
6619 nfs3_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
6620 	caller_context_t *ct)
6621 {
6622 	int error;
6623 	struct shrlock nshr;
6624 	struct nfs_owner nfs_owner;
6625 	netobj lm_fh3;
6626 
6627 	if (nfs_zone() != VTOMI(vp)->mi_zone)
6628 		return (EIO);
6629 
6630 	/*
6631 	 * check for valid cmd parameter
6632 	 */
6633 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
6634 		return (EINVAL);
6635 
6636 	/*
6637 	 * Check access permissions
6638 	 */
6639 	if (cmd == F_SHARE &&
6640 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
6641 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
6642 		return (EBADF);
6643 
6644 	/*
6645 	 * If the filesystem is mounted using local locking, pass the
6646 	 * request off to the local share code.
6647 	 */
6648 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
6649 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
6650 
6651 	switch (cmd) {
6652 	case F_SHARE:
6653 	case F_UNSHARE:
6654 		lm_fh3.n_len = VTOFH3(vp)->fh3_length;
6655 		lm_fh3.n_bytes = (char *)&(VTOFH3(vp)->fh3_u.data);
6656 
6657 		/*
6658 		 * If passed an owner that is too large to fit in an
6659 		 * nfs_owner it is likely a recursive call from the
6660 		 * lock manager client and pass it straight through.  If
6661 		 * it is not a nfs_owner then simply return an error.
6662 		 */
6663 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
6664 			if (((struct nfs_owner *)shr->s_owner)->magic !=
6665 			    NFS_OWNER_MAGIC)
6666 				return (EINVAL);
6667 
6668 			if (error = lm4_shrlock(vp, cmd, shr, flag, &lm_fh3)) {
6669 				error = set_errno(error);
6670 			}
6671 			return (error);
6672 		}
6673 		/*
6674 		 * Remote share reservations owner is a combination of
6675 		 * a magic number, hostname, and the local owner
6676 		 */
6677 		bzero(&nfs_owner, sizeof (nfs_owner));
6678 		nfs_owner.magic = NFS_OWNER_MAGIC;
6679 		(void) strncpy(nfs_owner.hname, uts_nodename(),
6680 		    sizeof (nfs_owner.hname));
6681 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
6682 		nshr.s_access = shr->s_access;
6683 		nshr.s_deny = shr->s_deny;
6684 		nshr.s_sysid = 0;
6685 		nshr.s_pid = ttoproc(curthread)->p_pid;
6686 		nshr.s_own_len = sizeof (nfs_owner);
6687 		nshr.s_owner = (caddr_t)&nfs_owner;
6688 
6689 		if (error = lm4_shrlock(vp, cmd, &nshr, flag, &lm_fh3)) {
6690 			error = set_errno(error);
6691 		}
6692 
6693 		break;
6694 
6695 	case F_HASREMOTELOCKS:
6696 		/*
6697 		 * NFS client can't store remote locks itself
6698 		 */
6699 		shr->s_access = 0;
6700 		error = 0;
6701 		break;
6702 
6703 	default:
6704 		error = EINVAL;
6705 		break;
6706 	}
6707 
6708 	return (error);
6709 }
6710