xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_vnops.c (revision 5dd2c7e9f9042148fc81d6aada5df28c2705977d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
26  *	All rights reserved.
27  */
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/time.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/file.h>
40 #include <sys/filio.h>
41 #include <sys/uio.h>
42 #include <sys/buf.h>
43 #include <sys/mman.h>
44 #include <sys/pathname.h>
45 #include <sys/dirent.h>
46 #include <sys/debug.h>
47 #include <sys/vmsystm.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/swap.h>
51 #include <sys/errno.h>
52 #include <sys/strsubr.h>
53 #include <sys/sysmacros.h>
54 #include <sys/kmem.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathconf.h>
57 #include <sys/utsname.h>
58 #include <sys/dnlc.h>
59 #include <sys/acl.h>
60 #include <sys/atomic.h>
61 #include <sys/policy.h>
62 #include <sys/sdt.h>
63 
64 #include <rpc/types.h>
65 #include <rpc/auth.h>
66 #include <rpc/clnt.h>
67 
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/rnode.h>
71 #include <nfs/nfs_acl.h>
72 #include <nfs/lm.h>
73 
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_kpm.h>
81 #include <vm/seg_vn.h>
82 
83 #include <fs/fs_subr.h>
84 
85 #include <sys/ddi.h>
86 
/*
 * Forward declarations of the file-local (static) helper routines.
 * They are declared up front so that the vnode operation entry points
 * can call them regardless of definition order.
 */
87 static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
88 			cred_t *);
/* Over-the-wire write and read of a caller-supplied kernel buffer. */
89 static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
90 static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
91 static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
92 static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
93 static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
94 static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *);
95 static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
96 static int	nfs_bio(struct buf *, cred_t *);
97 static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
98 			page_t *[], size_t, struct seg *, caddr_t,
99 			enum seg_rw, cred_t *);
100 static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
101 			cred_t *);
102 static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
103 			int, cred_t *);
104 static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
105 			int, cred_t *);
106 static void	nfs_delmap_callback(struct as *, void *, uint_t);
107 
108 /*
109  * Error flags used to pass information about certain special errors
110  * which need to be handled specially.
 *
 * NOTE(review): NFS_EOF is negative, presumably so that it can never be
 * mistaken for a regular (positive) errno value -- confirm against the
 * code that produces and consumes it.
111  */
112 #define	NFS_EOF			-98
113 
114 /*
115  * These are the vnode ops routines which implement the vnode interface to
116  * the networked file system.  These routines just take their parameters,
117  * make them look networkish by putting the right info into interface structs,
118  * and then calling the appropriate remote routine(s) to do the work.
119  *
120  * Note on directory name lookup cacheing:  If we detect a stale fhandle,
121  * we purge the directory cache relative to that vnode.  This way, the
122  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
123  * more details on rnode locking.
124  */
125 
/*
 * Prototypes for the vnode operation entry points.  With the exception
 * of nfs_accessx(), each of these is installed in nfs_vnodeops_template.
 * nfs_dump(), which the template also references, is not declared in
 * this list (presumably it is declared in an included header -- confirm).
 */
126 static int	nfs_open(vnode_t **, int, cred_t *);
127 static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *);
128 static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
129 			caller_context_t *);
130 static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
131 			caller_context_t *);
132 static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
133 static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *);
134 static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
135 			caller_context_t *);
136 static int	nfs_access(vnode_t *, int, int, cred_t *);
/* Declared here but not part of the operations template. */
137 static int	nfs_accessx(void *, int, cred_t *);
138 static int	nfs_readlink(vnode_t *, struct uio *, cred_t *);
139 static int	nfs_fsync(vnode_t *, int, cred_t *);
140 static void	nfs_inactive(vnode_t *, cred_t *);
141 static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
142 			int, vnode_t *, cred_t *);
143 static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
144 			int, vnode_t **, cred_t *, int);
145 static int	nfs_remove(vnode_t *, char *, cred_t *);
146 static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *);
147 static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
148 static int	nfs_mkdir(vnode_t *, char *, struct vattr *,
149 			vnode_t **, cred_t *);
150 static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
151 static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
152 			cred_t *);
153 static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *);
154 static int	nfs_fid(vnode_t *, fid_t *);
155 static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
156 static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
157 static int	nfs_seek(vnode_t *, offset_t, offset_t *);
158 static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
159 			page_t *[], size_t, struct seg *, caddr_t,
160 			enum seg_rw, cred_t *);
161 static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
162 static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *,
163 			size_t, uchar_t, uchar_t, uint_t, cred_t *);
164 static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t,
165 			size_t, uchar_t, uchar_t, uint_t, cred_t *);
166 static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
167 			struct flk_callback *, cred_t *);
168 static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
169 			cred_t *, caller_context_t *);
170 static int	nfs_realvp(vnode_t *, vnode_t **);
171 static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t,
172 			size_t, uint_t, uint_t, uint_t, cred_t *);
173 static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *);
174 static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
175 			cred_t *);
176 static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
177 static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
178 static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);
179 
/*
 * The operations vector handed out by nfs_getvnodeops().  It is not
 * assigned anywhere in this part of the file.
 * NOTE(review): presumably it is built from nfs_vnodeops_template when
 * the file system is initialized -- confirm.
 */
180 struct vnodeops *nfs_vnodeops;
181 
/*
 * Table pairing each vnode operation name (VOPNAME_*) with the routine
 * which implements it for this file system.  Every handler is one of
 * the nfs_* routines of this file system except for VOPNAME_VNEVENT,
 * which uses the generic fs_vnevent_support.  The table is terminated
 * by a NULL, NULL entry.
 */
182 const fs_operation_def_t nfs_vnodeops_template[] = {
183 	VOPNAME_OPEN,		{ .vop_open = nfs_open },
184 	VOPNAME_CLOSE,		{ .vop_close = nfs_close },
185 	VOPNAME_READ,		{ .vop_read = nfs_read },
186 	VOPNAME_WRITE,		{ .vop_write = nfs_write },
187 	VOPNAME_IOCTL,		{ .vop_ioctl = nfs_ioctl },
188 	VOPNAME_GETATTR,	{ .vop_getattr = nfs_getattr },
189 	VOPNAME_SETATTR,	{ .vop_setattr = nfs_setattr },
190 	VOPNAME_ACCESS,		{ .vop_access = nfs_access },
191 	VOPNAME_LOOKUP,		{ .vop_lookup = nfs_lookup },
192 	VOPNAME_CREATE,		{ .vop_create = nfs_create },
193 	VOPNAME_REMOVE,		{ .vop_remove = nfs_remove },
194 	VOPNAME_LINK,		{ .vop_link = nfs_link },
195 	VOPNAME_RENAME,		{ .vop_rename = nfs_rename },
196 	VOPNAME_MKDIR,		{ .vop_mkdir = nfs_mkdir },
197 	VOPNAME_RMDIR,		{ .vop_rmdir = nfs_rmdir },
198 	VOPNAME_READDIR,	{ .vop_readdir = nfs_readdir },
199 	VOPNAME_SYMLINK,	{ .vop_symlink = nfs_symlink },
200 	VOPNAME_READLINK,	{ .vop_readlink = nfs_readlink },
201 	VOPNAME_FSYNC,		{ .vop_fsync = nfs_fsync },
202 	VOPNAME_INACTIVE,	{ .vop_inactive = nfs_inactive },
203 	VOPNAME_FID,		{ .vop_fid = nfs_fid },
204 	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs_rwlock },
205 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs_rwunlock },
206 	VOPNAME_SEEK,		{ .vop_seek = nfs_seek },
207 	VOPNAME_FRLOCK,		{ .vop_frlock = nfs_frlock },
208 	VOPNAME_SPACE,		{ .vop_space = nfs_space },
209 	VOPNAME_REALVP,		{ .vop_realvp = nfs_realvp },
210 	VOPNAME_GETPAGE,	{ .vop_getpage = nfs_getpage },
211 	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs_putpage },
212 	VOPNAME_MAP,		{ .vop_map = nfs_map },
213 	VOPNAME_ADDMAP,		{ .vop_addmap = nfs_addmap },
214 	VOPNAME_DELMAP,		{ .vop_delmap = nfs_delmap },
215 	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
216 	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs_pathconf },
217 	VOPNAME_PAGEIO,		{ .vop_pageio = nfs_pageio },
218 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs_setsecattr },
219 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs_getsecattr },
220 	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs_shrlock },
221 	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
222 	NULL,			NULL
223 };
224 
225 /*
226  * XXX:  This is referenced in modstubs.s
227  */
228 struct vnodeops *
229 nfs_getvnodeops(void)
230 {
231 	return (nfs_vnodeops);
232 }
233 
234 /* ARGSUSED */
235 static int
236 nfs_open(vnode_t **vpp, int flag, cred_t *cr)
237 {
238 	int error;
239 	struct vattr va;
240 	rnode_t *rp;
241 	vnode_t *vp;
242 
243 	vp = *vpp;
244 	rp = VTOR(vp);
245 	if (nfs_zone() != VTOMI(vp)->mi_zone)
246 		return (EIO);
247 	mutex_enter(&rp->r_statelock);
248 	if (rp->r_cred == NULL) {
249 		crhold(cr);
250 		rp->r_cred = cr;
251 	}
252 	mutex_exit(&rp->r_statelock);
253 
254 	/*
255 	 * If there is no cached data or if close-to-open
256 	 * consistency checking is turned off, we can avoid
257 	 * the over the wire getattr.  Otherwise, if the
258 	 * file system is mounted readonly, then just verify
259 	 * the caches are up to date using the normal mechanism.
260 	 * Else, if the file is not mmap'd, then just mark
261 	 * the attributes as timed out.  They will be refreshed
262 	 * and the caches validated prior to being used.
263 	 * Else, the file system is mounted writeable so
264 	 * force an over the wire GETATTR in order to ensure
265 	 * that all cached data is valid.
266 	 */
267 	if (vp->v_count > 1 ||
268 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
269 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
270 		if (vn_is_readonly(vp))
271 			error = nfs_validate_caches(vp, cr);
272 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
273 			PURGE_ATTRCACHE(vp);
274 			error = 0;
275 		} else {
276 			va.va_mask = AT_ALL;
277 			error = nfs_getattr_otw(vp, &va, cr);
278 		}
279 	} else
280 		error = 0;
281 
282 	return (error);
283 }
284 
285 static int
286 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
287 {
288 	rnode_t *rp;
289 	int error;
290 	struct vattr va;
291 
292 	/*
293 	 * zone_enter(2) prevents processes from changing zones with NFS files
294 	 * open; if we happen to get here from the wrong zone we can't do
295 	 * anything over the wire.
296 	 */
297 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
298 		/*
299 		 * We could attempt to clean up locks, except we're sure
300 		 * that the current process didn't acquire any locks on
301 		 * the file: any attempt to lock a file belong to another zone
302 		 * will fail, and one can't lock an NFS file and then change
303 		 * zones, as that fails too.
304 		 *
305 		 * Returning an error here is the sane thing to do.  A
306 		 * subsequent call to VN_RELE() which translates to a
307 		 * nfs_inactive() will clean up state: if the zone of the
308 		 * vnode's origin is still alive and kicking, an async worker
309 		 * thread will handle the request (from the correct zone), and
310 		 * everything (minus the final nfs_getattr_otw() call) should
311 		 * be OK. If the zone is going away nfs_async_inactive() will
312 		 * throw away cached pages inline.
313 		 */
314 		return (EIO);
315 	}
316 
317 	/*
318 	 * If we are using local locking for this filesystem, then
319 	 * release all of the SYSV style record locks.  Otherwise,
320 	 * we are doing network locking and we need to release all
321 	 * of the network locks.  All of the locks held by this
322 	 * process on this file are released no matter what the
323 	 * incoming reference count is.
324 	 */
325 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
326 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
327 		cleanshares(vp, ttoproc(curthread)->p_pid);
328 	} else
329 		nfs_lockrelease(vp, flag, offset, cr);
330 
331 	if (count > 1)
332 		return (0);
333 
334 	/*
335 	 * If the file has been `unlinked', then purge the
336 	 * DNLC so that this vnode will get reycled quicker
337 	 * and the .nfs* file on the server will get removed.
338 	 */
339 	rp = VTOR(vp);
340 	if (rp->r_unldvp != NULL)
341 		dnlc_purge_vp(vp);
342 
343 	/*
344 	 * If the file was open for write and there are pages,
345 	 * then if the file system was mounted using the "no-close-
346 	 *	to-open" semantics, then start an asynchronous flush
347 	 *	of the all of the pages in the file.
348 	 * else the file system was not mounted using the "no-close-
349 	 *	to-open" semantics, then do a synchronous flush and
350 	 *	commit of all of the dirty and uncommitted pages.
351 	 *
352 	 * The asynchronous flush of the pages in the "nocto" path
353 	 * mostly just associates a cred pointer with the rnode so
354 	 * writes which happen later will have a better chance of
355 	 * working.  It also starts the data being written to the
356 	 * server, but without unnecessarily delaying the application.
357 	 */
358 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
359 		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
360 			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
361 			if (error == EAGAIN)
362 				error = 0;
363 		} else
364 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
365 		if (!error) {
366 			mutex_enter(&rp->r_statelock);
367 			error = rp->r_error;
368 			rp->r_error = 0;
369 			mutex_exit(&rp->r_statelock);
370 		}
371 	} else {
372 		mutex_enter(&rp->r_statelock);
373 		error = rp->r_error;
374 		rp->r_error = 0;
375 		mutex_exit(&rp->r_statelock);
376 	}
377 
378 	/*
379 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
380 	 * refresh the attribute cache with a set of attributes which
381 	 * weren't returned from a WRITE.  This will enable the close-
382 	 * to-open processing to work.
383 	 */
384 	if (rp->r_flags & RWRITEATTR)
385 		(void) nfs_getattr_otw(vp, &va, cr);
386 
387 	return (error);
388 }
389 
/*
 * Read from a regular file into the caller's uio.
 *
 * The caller must hold rp->r_rwlock as a reader (asserted below).
 *
 * Returns:
 *	EIO	called from a zone other than the one owning the mount
 *	EISDIR	vp is not a regular file
 *	EFBIG	the starting offset is beyond MAXOFF32_T
 *	EINVAL	the starting offset is negative or the request would
 *		extend beyond MAXOFF32_T
 *	0 or the error of the failing step otherwise; a zero length
 *	request returns 0 immediately.
 *
 * Two paths exist: an uncached one which reads over the wire into a
 * temporary kernel buffer, and the normal one which copies the data
 * out of the page cache in pieces of at most MAXBSIZE bytes.
 */
390 /* ARGSUSED */
391 static int
392 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
393 	caller_context_t *ct)
394 {
395 	rnode_t *rp;
396 	u_offset_t off;
397 	offset_t diff;
398 	int on;
399 	size_t n;
400 	caddr_t base;
401 	uint_t flags;
402 	int error;
403 	mntinfo_t *mi;
404 
405 	rp = VTOR(vp);
406 	mi = VTOMI(vp);
407 
408 	if (nfs_zone() != mi->mi_zone)
409 		return (EIO);
410 
411 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
412 
413 	if (vp->v_type != VREG)
414 		return (EISDIR);
415 
416 	if (uiop->uio_resid == 0)
417 		return (0);
418 
419 	if (uiop->uio_loffset > MAXOFF32_T)
420 		return (EFBIG);
421 
422 	if (uiop->uio_loffset < 0 ||
423 	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
424 		return (EINVAL);
425 
426 	/*
427 	 * Bypass VM if caching has been disabled (e.g., locking) or if
428 	 * using client-side direct I/O and the file is not mmap'd and
429 	 * there are no cached pages.
430 	 */
431 	if ((vp->v_flag & VNOCACHE) ||
432 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
433 	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
434 		size_t bufsize;
435 		size_t resid = 0;
436 
437 		/*
438 		 * Let's try to do read in as large a chunk as we can
439 		 * (Filesystem (NFS client) bsize if possible/needed).
440 		 * For V3, this is 32K and for V2, this is 8K.
441 		 */
442 		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
443 		base = kmem_alloc(bufsize, KM_SLEEP);
		/*
		 * Read a buffer full at a time and copy out what was
		 * actually read (requested minus the residual).  The
		 * loop ends on error, when the request is satisfied or
		 * when a read transferred no data at all (n == 0).
		 */
444 		do {
445 			n = MIN(uiop->uio_resid, bufsize);
446 			error = nfsread(vp, base, uiop->uio_offset, n,
447 			    &resid, cr);
448 			if (!error) {
449 				n -= resid;
450 				error = uiomove(base, n, UIO_READ, uiop);
451 			}
452 		} while (!error && uiop->uio_resid > 0 && n > 0);
453 		kmem_free(base, bufsize);
454 		return (error);
455 	}
456 
457 	error = 0;
458 
459 	do {
460 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
461 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
462 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
463 
464 		error = nfs_validate_caches(vp, cr);
465 		if (error)
466 			break;
467 
		/*
		 * Never read beyond the file size recorded in the rnode:
		 * stop at (or past) end of file and trim the last piece.
		 */
468 		mutex_enter(&rp->r_statelock);
469 		diff = rp->r_size - uiop->uio_loffset;
470 		mutex_exit(&rp->r_statelock);
471 		if (diff <= 0)
472 			break;
473 		if (diff < n)
474 			n = (size_t)diff;
475 
476 		if (vpm_enable) {
477 			/*
478 			 * Copy data.
479 			 */
480 			error = vpm_data_copy(vp, off + on, n, uiop,
481 						1, NULL, 0, S_READ);
482 		} else {
483 			base = segmap_getmapflt(segkmap, vp, off + on, n,
484 								1, S_READ);
485 			error = uiomove(base + on, n, UIO_READ, uiop);
486 		}
487 
488 		if (!error) {
489 			/*
490 			 * If read a whole block or read to eof,
491 			 * won't need this buffer again soon.
492 			 */
493 			mutex_enter(&rp->r_statelock);
494 			if (n + on == MAXBSIZE ||
495 			    uiop->uio_loffset == rp->r_size)
496 				flags = SM_DONTNEED;
497 			else
498 				flags = 0;
499 			mutex_exit(&rp->r_statelock);
500 			if (vpm_enable) {
501 				error = vpm_sync_pages(vp, off, n, flags);
502 			} else {
503 				error = segmap_release(segkmap, base, flags);
504 			}
505 		} else {
			/*
			 * The copy failed; still release the mapping,
			 * but keep the copy error as the result.
			 */
506 			if (vpm_enable) {
507 				(void) vpm_sync_pages(vp, off, n, 0);
508 			} else {
509 				(void) segmap_release(segkmap, base, 0);
510 			}
511 		}
512 	} while (!error && uiop->uio_resid > 0);
513 
514 	return (error);
515 }
516 
/*
 * Write the caller's uio to a regular file.
 *
 * Returns:
 *	EIO	called from a zone other than the one owning the mount
 *	EISDIR	vp is not a regular file
 *	EINTR	interrupted while waiting for r_rwlock (append case) or
 *		for r_lkserlock
 *	EFBIG	the starting offset is beyond MAXOFF32_T, or the write
 *		starts at or beyond the file size limit
 *	EINVAL	the starting offset is negative or the request would
 *		extend beyond MAXOFF32_T
 *	0 or the error of the failing step otherwise; a zero length
 *	request returns 0 immediately.
 *
 * A request which crosses the file size limit is shortened to end at
 * the limit; the part cut off ("remainder") is added back to uio_resid
 * before returning.  When an error is returned, uio_resid and
 * uio_loffset are reset to the values they had before the piece which
 * failed.
 *
 * rp->r_lkserlock is held as a reader from just before the data
 * transfer starts until the function returns through "bottom".
 */
517 /* ARGSUSED */
518 static int
519 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
520 	caller_context_t *ct)
521 {
522 	rnode_t *rp;
523 	u_offset_t off;
524 	caddr_t base;
525 	uint_t flags;
526 	int remainder;
527 	size_t n;
528 	int on;
529 	int error;
530 	int resid;
531 	offset_t offset;
532 	rlim_t limit;
533 	mntinfo_t *mi;
534 
535 	rp = VTOR(vp);
536 
537 	mi = VTOMI(vp);
538 	if (nfs_zone() != mi->mi_zone)
539 		return (EIO);
540 	if (vp->v_type != VREG)
541 		return (EISDIR);
542 
543 	if (uiop->uio_resid == 0)
544 		return (0);
545 
546 	if (ioflag & FAPPEND) {
547 		struct vattr va;
548 
549 		/*
550 		 * Must serialize if appending.
551 		 */
552 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
553 			nfs_rw_exit(&rp->r_rwlock);
554 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
555 			    INTR(vp)))
556 				return (EINTR);
557 		}
558 
		/* An append starts at the current size of the file. */
559 		va.va_mask = AT_SIZE;
560 		error = nfsgetattr(vp, &va, cr);
561 		if (error)
562 			return (error);
563 		uiop->uio_loffset = va.va_size;
564 	}
565 
566 	if (uiop->uio_loffset > MAXOFF32_T)
567 		return (EFBIG);
568 
	/* From here until the transfer loops, offset is the end of the write. */
569 	offset = uiop->uio_loffset + uiop->uio_resid;
570 
571 	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
572 		return (EINVAL);
573 
574 	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
575 		limit = MAXOFF32_T;
576 	} else {
577 		limit = (rlim_t)uiop->uio_llimit;
578 	}
579 
580 	/*
581 	 * Check to make sure that the process will not exceed
582 	 * its limit on file size.  It is okay to write up to
583 	 * the limit, but not beyond.  Thus, the write which
584 	 * reaches the limit will be short and the next write
585 	 * will return an error.
586 	 */
587 	remainder = 0;
588 	if (offset > limit) {
589 		remainder = offset - limit;
590 		uiop->uio_resid = limit - uiop->uio_offset;
591 		if (uiop->uio_resid <= 0) {
592 			proc_t *p = ttoproc(curthread);
593 
			/* Restore the original resid before failing. */
594 			uiop->uio_resid += remainder;
595 			mutex_enter(&p->p_lock);
596 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
597 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
598 			mutex_exit(&p->p_lock);
599 			return (EFBIG);
600 		}
601 	}
602 
603 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
604 		return (EINTR);
605 
606 	/*
607 	 * Bypass VM if caching has been disabled (e.g., locking) or if
608 	 * using client-side direct I/O and the file is not mmap'd and
609 	 * there are no cached pages.
610 	 */
611 	if ((vp->v_flag & VNOCACHE) ||
612 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
613 	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
614 		size_t bufsize;
615 		int count;
616 		uint_t org_offset;
617 
		/*
		 * Besides being reached when the condition above is true,
		 * this label is the target of a jump from the cached
		 * write loop further down when that loop fails with
		 * EACCES.
		 */
618 nfs_fwrite:
619 		if (rp->r_flags & RSTALE) {
620 			resid = uiop->uio_resid;
621 			offset = uiop->uio_loffset;
622 			error = rp->r_error;
623 			goto bottom;
624 		}
625 		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
626 		base = kmem_alloc(bufsize, KM_SLEEP);
		/*
		 * Copy a buffer full of data in from the caller and
		 * write it over the wire.  resid and offset remember the
		 * state before each piece so that "bottom" can undo a
		 * piece which failed.
		 */
627 		do {
628 			resid = uiop->uio_resid;
629 			offset = uiop->uio_loffset;
630 			count = MIN(uiop->uio_resid, bufsize);
631 			org_offset = uiop->uio_offset;
632 			error = uiomove(base, count, UIO_WRITE, uiop);
633 			if (!error) {
634 				error = nfswrite(vp, base, org_offset,
635 				    count, cr);
636 			}
637 		} while (!error && uiop->uio_resid > 0);
638 		kmem_free(base, bufsize);
639 		goto bottom;
640 	}
641 
642 	do {
643 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
644 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
645 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
646 
		/* Remember the state before this piece for "bottom". */
647 		resid = uiop->uio_resid;
648 		offset = uiop->uio_loffset;
649 
650 		if (rp->r_flags & RSTALE) {
651 			error = rp->r_error;
652 			break;
653 		}
654 
655 		/*
656 		 * Don't create dirty pages faster than they
657 		 * can be cleaned so that the system doesn't
658 		 * get imbalanced.  If the async queue is
659 		 * maxed out, then wait for it to drain before
660 		 * creating more dirty pages.  Also, wait for
661 		 * any threads doing pagewalks in the vop_getattr
662 		 * entry points so that they don't block for
663 		 * long periods.
664 		 */
665 		mutex_enter(&rp->r_statelock);
666 		while ((mi->mi_max_threads != 0 &&
667 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
668 		    rp->r_gcount > 0)
669 			cv_wait(&rp->r_cv, &rp->r_statelock);
670 		mutex_exit(&rp->r_statelock);
671 
672 		if (vpm_enable) {
673 			/*
674 			 * It will use kpm mappings, so no need to
675 			 * pass an address.
676 			 */
677 			error = writerp(rp, NULL, n, uiop, 0);
678 		} else  {
679 			if (segmap_kpm) {
680 				int pon = uiop->uio_loffset & PAGEOFFSET;
681 				size_t pn = MIN(PAGESIZE - pon,
682 							uiop->uio_resid);
683 				int pagecreate;
684 
				/*
				 * The page is treated as one to be created
				 * (and is then not faulted in by the mapping
				 * call below) when the write starts on a
				 * page boundary and either covers the whole
				 * page or reaches the recorded end of file.
				 */
685 				mutex_enter(&rp->r_statelock);
686 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
687 					uiop->uio_loffset + pn >= rp->r_size);
688 				mutex_exit(&rp->r_statelock);
689 
690 				base = segmap_getmapflt(segkmap, vp, off + on,
691 						pn, !pagecreate, S_WRITE);
692 
693 				error = writerp(rp, base + pon, n, uiop,
694 								pagecreate);
695 
696 			} else {
697 				base = segmap_getmapflt(segkmap, vp, off + on,
698 							n, 0, S_READ);
699 				error = writerp(rp, base + on, n, uiop, 0);
700 			}
701 		}
702 
703 		if (!error) {
704 			if (mi->mi_flags & MI_NOAC)
705 				flags = SM_WRITE;
706 			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
707 				/*
708 				 * Have written a whole block.
709 				 * Start an asynchronous write
710 				 * and mark the buffer to
711 				 * indicate that it won't be
712 				 * needed again soon.
713 				 */
714 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
715 			} else
716 				flags = 0;
			/*
			 * Synchronous semantics were requested or the
			 * rnode is flagged ROUTOFSPACE: write the data
			 * out now, and not asynchronously.
			 */
717 			if ((ioflag & (FSYNC|FDSYNC)) ||
718 			    (rp->r_flags & ROUTOFSPACE)) {
719 				flags &= ~SM_ASYNC;
720 				flags |= SM_WRITE;
721 			}
722 			if (vpm_enable) {
723 				error = vpm_sync_pages(vp, off, n, flags);
724 			} else {
725 				error = segmap_release(segkmap, base, flags);
726 			}
727 		} else {
728 			if (vpm_enable) {
729 				(void) vpm_sync_pages(vp, off, n, 0);
730 			} else {
731 				(void) segmap_release(segkmap, base, 0);
732 			}
733 			/*
734 			 * In the event that we got an access error while
735 			 * faulting in a page for a write-only file just
736 			 * force a write.
737 			 */
738 			if (error == EACCES)
739 				goto nfs_fwrite;
740 		}
741 	} while (!error && uiop->uio_resid > 0);
742 
743 bottom:
	/*
	 * On error, rewind the uio to where it was before the piece
	 * which failed.  In both cases give back the part of the request
	 * which was cut off because of the file size limit.
	 */
744 	if (error) {
745 		uiop->uio_resid = resid + remainder;
746 		uiop->uio_loffset = offset;
747 	} else
748 		uiop->uio_resid += remainder;
749 
750 	nfs_rw_exit(&rp->r_lkserlock);
751 
752 	return (error);
753 }
754 
755 /*
756  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
757  */
758 static int
759 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
760 	int flags, cred_t *cr)
761 {
762 	struct buf *bp;
763 	int error;
764 
765 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
766 	bp = pageio_setup(pp, len, vp, flags);
767 	ASSERT(bp != NULL);
768 
769 	/*
770 	 * pageio_setup should have set b_addr to 0.  This
771 	 * is correct since we want to do I/O on a page
772 	 * boundary.  bp_mapin will use this addr to calculate
773 	 * an offset, and then set b_addr to the kernel virtual
774 	 * address it allocated for us.
775 	 */
776 	ASSERT(bp->b_un.b_addr == 0);
777 
778 	bp->b_edev = 0;
779 	bp->b_dev = 0;
780 	bp->b_lblkno = lbtodb(off);
781 	bp->b_file = vp;
782 	bp->b_offset = (offset_t)off;
783 	bp_mapin(bp);
784 
785 	error = nfs_bio(bp, cr);
786 
787 	bp_mapout(bp);
788 	pageio_done(bp);
789 
790 	return (error);
791 }
792 
793 /*
794  * Write to file.  Writes to remote server in largest size
795  * chunks that the server can handle.  Write is synchronous.
 *
 *	vp	file to write to
 *	base	kernel address of the data
 *	offset	file offset at which the data is to be written
 *	count	number of bytes to write
 *	cr	credentials for the over the wire calls
 *
 * The data is sent in pieces of at most mi_curwrite bytes each.
 * Returns 0 once everything has been written, otherwise the error of
 * the RPC or the errno equivalent of the status the server returned;
 * the pieces written before the failure are not undone.
796  */
797 static int
798 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
799 {
800 	rnode_t *rp;
801 	mntinfo_t *mi;
802 	struct nfswriteargs wa;
803 	struct nfsattrstat ns;
804 	int error;
805 	int tsize;
806 	int douprintf;
807 
808 	douprintf = 1;
809 
810 	rp = VTOR(vp);
811 	mi = VTOMI(vp);
812 
813 	ASSERT(nfs_zone() == mi->mi_zone);
814 
815 	wa.wa_args = &wa.wa_args_buf;
816 	wa.wa_fhandle = *VTOFH(vp);
817 
818 	do {
		/* Size of this piece. */
819 		tsize = MIN(mi->mi_curwrite, count);
820 		wa.wa_data = base;
821 		wa.wa_begoff = offset;
822 		wa.wa_totcount = tsize;
823 		wa.wa_count = tsize;
824 		wa.wa_offset = offset;
825 
		/* I/O statistics, when enabled, are updated under mi_lock. */
826 		if (mi->mi_io_kstats) {
827 			mutex_enter(&mi->mi_lock);
828 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
829 			mutex_exit(&mi->mi_lock);
830 		}
831 		wa.wa_mblk = NULL;
		/* Reissue the call for as long as it asks to be retried. */
832 		do {
833 			error = rfs2call(mi, RFS_WRITE,
834 			    xdr_writeargs, (caddr_t)&wa,
835 			    xdr_attrstat, (caddr_t)&ns, cr,
836 			    &douprintf, &ns.ns_status, 0, NULL);
837 		} while (error == ENFS_TRYAGAIN);
838 		if (mi->mi_io_kstats) {
839 			mutex_enter(&mi->mi_lock);
840 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
841 			mutex_exit(&mi->mi_lock);
842 		}
843 
844 		if (!error) {
			/* The RPC went through; now look at the NFS status. */
845 			error = geterrno(ns.ns_status);
846 			/*
847 			 * Can't check for stale fhandle and purge caches
848 			 * here because pages are held by nfs_getpage.
849 			 * Just mark the attribute cache as timed out
850 			 * and set RWRITEATTR to indicate that the file
851 			 * was modified with a WRITE operation.
852 			 */
853 			if (!error) {
				/* Advance past the piece just written. */
854 				count -= tsize;
855 				base += tsize;
856 				offset += tsize;
857 				if (mi->mi_io_kstats) {
858 					mutex_enter(&mi->mi_lock);
859 				    KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
860 				    KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
861 					    tsize;
862 					mutex_exit(&mi->mi_lock);
863 				}
864 				lwp_stat_update(LWP_STAT_OUBLK, 1);
865 				mutex_enter(&rp->r_statelock);
866 				PURGE_ATTRCACHE_LOCKED(rp);
867 				rp->r_flags |= RWRITEATTR;
868 				mutex_exit(&rp->r_statelock);
869 			}
870 		}
871 	} while (!error && count);
872 
873 	return (error);
874 }
875 
876 /*
877  * Read from a file.  Reads data in largest chunks our interface can handle.
 *
 *	vp	file to read from
 *	base	kernel address of the buffer receiving the data
 *	offset	file offset at which to start reading
 *	count	number of bytes wanted
 *	residp	set to the number of wanted bytes which were NOT read;
 *		it is stored whether or not an error is returned
 *	cr	credentials for the over the wire calls
 *
 * The data is requested in pieces of at most mi_curread bytes.  Reading
 * stops on an error, when count bytes have been read, or as soon as the
 * server returns fewer bytes than were asked for.
 *
 * Returns 0 on success, otherwise the error of the RPC, the errno
 * equivalent of the status the server returned, or the error from
 * converting the returned attributes.
878  */
879 static int
880 nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
881 	cred_t *cr)
882 {
883 	mntinfo_t *mi;
884 	struct nfsreadargs ra;
885 	struct nfsrdresult rr;
886 	int tsize;
887 	int error;
888 	int douprintf;
889 	failinfo_t fi;
890 	rnode_t *rp;
891 	struct vattr va;
892 	hrtime_t t;
893 
894 	rp = VTOR(vp);
895 	mi = VTOMI(vp);
896 
897 	ASSERT(nfs_zone() == mi->mi_zone);
898 
899 	douprintf = 1;
900 
901 	ra.ra_fhandle = *VTOFH(vp);
902 
	/*
	 * Information handed to rfs2call() along with the request.
	 * NOTE(review): presumably this is what allows the call to be
	 * redirected to another server (failover) -- confirm.
	 */
903 	fi.vp = vp;
904 	fi.fhp = (caddr_t)&ra.ra_fhandle;
905 	fi.copyproc = nfscopyfh;
906 	fi.lookupproc = nfslookup;
907 	fi.xattrdirproc = acl_getxattrdir2;
908 
909 	do {
910 		if (mi->mi_io_kstats) {
911 			mutex_enter(&mi->mi_lock);
912 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
913 			mutex_exit(&mi->mi_lock);
914 		}
915 
		/*
		 * The piece size is recomputed and the time stamp is
		 * taken anew for every retry of the call.
		 */
916 		do {
917 			tsize = MIN(mi->mi_curread, count);
918 			rr.rr_data = base;
919 			ra.ra_offset = offset;
920 			ra.ra_totcount = tsize;
921 			ra.ra_count = tsize;
922 			t = gethrtime();
923 			error = rfs2call(mi, RFS_READ,
924 			    xdr_readargs, (caddr_t)&ra,
925 			    xdr_rdresult, (caddr_t)&rr, cr,
926 			    &douprintf, &rr.rr_status, 0, &fi);
927 		} while (error == ENFS_TRYAGAIN);
928 
929 		if (mi->mi_io_kstats) {
930 			mutex_enter(&mi->mi_lock);
931 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
932 			mutex_exit(&mi->mi_lock);
933 		}
934 
935 		if (!error) {
936 			error = geterrno(rr.rr_status);
937 			if (!error) {
				/* Advance by what the server actually returned. */
938 				count -= rr.rr_count;
939 				base += rr.rr_count;
940 				offset += rr.rr_count;
941 				if (mi->mi_io_kstats) {
942 					mutex_enter(&mi->mi_lock);
943 					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
944 					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
945 					    rr.rr_count;
946 					mutex_exit(&mi->mi_lock);
947 				}
948 				lwp_stat_update(LWP_STAT_INBLK, 1);
949 			}
950 		}
951 	} while (!error && count && rr.rr_count == tsize);
952 
953 	*residp = count;
954 
955 	if (!error) {
956 		/*
957 		 * Since no error occurred, we have the current
958 		 * attributes and we need to do a cache check and then
959 		 * potentially update the cached attributes.  We can't
960 		 * use the normal attribute check and cache mechanisms
961 		 * because they might cause a cache flush which would
962 		 * deadlock.  Instead, we just check the cache to see
963 		 * if the attributes have changed.  If it is, then we
964 		 * just mark the attributes as out of date.  The next
965 		 * time that the attributes are checked, they will be
966 		 * out of date, new attributes will be fetched, and
967 		 * the page cache will be flushed.  If the attributes
968 		 * weren't changed, then we just update the cached
969 		 * attributes with these attributes.
970 		 */
971 		/*
972 		 * If NFS_ACL is supported on the server, then the
973 		 * attributes returned by server may have minimal
974 		 * permissions sometimes denying access to users having
975 		 * proper access.  To get the proper attributes, mark
976 		 * the attributes as expired so that they will be
977 		 * regotten via the NFS_ACL GETATTR2 procedure.
978 		 */
979 		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
980 		mutex_enter(&rp->r_statelock);
981 		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
982 		    (mi->mi_flags & MI_ACL)) {
			/*
			 * r_statelock is dropped before PURGE_ATTRCACHE()
			 * is used, in contrast to the other branch where
			 * nfs_attrcache_va() is called with it held.
			 */
983 			mutex_exit(&rp->r_statelock);
984 			PURGE_ATTRCACHE(vp);
985 		} else {
			/*
			 * Cache the new attributes only if r_mtime is not
			 * later than the time stamp taken just before the
			 * last READ call was issued.
			 */
986 			if (rp->r_mtime <= t) {
987 				nfs_attrcache_va(vp, &va);
988 			}
989 			mutex_exit(&rp->r_statelock);
990 		}
991 	}
992 
993 	return (error);
994 }
995 
996 /* ARGSUSED */
997 static int
998 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
999 {
1000 
1001 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1002 		return (EIO);
1003 	switch (cmd) {
1004 		case _FIODIRECTIO:
1005 			return (nfs_directio(vp, (int)arg, cr));
1006 		default:
1007 			return (ENOTTY);
1008 	}
1009 }
1010 
1011 static int
1012 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1013 {
1014 	int error;
1015 	rnode_t *rp;
1016 
1017 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1018 		return (EIO);
1019 	/*
1020 	 * If it has been specified that the return value will
1021 	 * just be used as a hint, and we are only being asked
1022 	 * for size, fsid or rdevid, then return the client's
1023 	 * notion of these values without checking to make sure
1024 	 * that the attribute cache is up to date.
1025 	 * The whole point is to avoid an over the wire GETATTR
1026 	 * call.
1027 	 */
1028 	rp = VTOR(vp);
1029 	if (flags & ATTR_HINT) {
1030 		if (vap->va_mask ==
1031 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1032 			mutex_enter(&rp->r_statelock);
1033 			if (vap->va_mask | AT_SIZE)
1034 				vap->va_size = rp->r_size;
1035 			if (vap->va_mask | AT_FSID)
1036 				vap->va_fsid = rp->r_attr.va_fsid;
1037 			if (vap->va_mask | AT_RDEV)
1038 				vap->va_rdev = rp->r_attr.va_rdev;
1039 			mutex_exit(&rp->r_statelock);
1040 			return (0);
1041 		}
1042 	}
1043 
1044 	/*
1045 	 * Only need to flush pages if asking for the mtime
1046 	 * and if there any dirty pages or any outstanding
1047 	 * asynchronous (write) requests for this file.
1048 	 */
1049 	if (vap->va_mask & AT_MTIME) {
1050 		if (vn_has_cached_data(vp) &&
1051 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1052 			mutex_enter(&rp->r_statelock);
1053 			rp->r_gcount++;
1054 			mutex_exit(&rp->r_statelock);
1055 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1056 			mutex_enter(&rp->r_statelock);
1057 			if (error && (error == ENOSPC || error == EDQUOT)) {
1058 				if (!rp->r_error)
1059 					rp->r_error = error;
1060 			}
1061 			if (--rp->r_gcount == 0)
1062 				cv_broadcast(&rp->r_cv);
1063 			mutex_exit(&rp->r_statelock);
1064 		}
1065 	}
1066 
1067 	return (nfsgetattr(vp, vap, cr));
1068 }
1069 
1070 /*ARGSUSED4*/
1071 static int
1072 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1073 		caller_context_t *ct)
1074 {
1075 	int error;
1076 	uint_t mask;
1077 	struct vattr va;
1078 
1079 	mask = vap->va_mask;
1080 
1081 	if (mask & AT_NOSET)
1082 		return (EINVAL);
1083 
1084 	if ((mask & AT_SIZE) &&
1085 	    vap->va_type == VREG &&
1086 	    vap->va_size > MAXOFF32_T)
1087 		return (EFBIG);
1088 
1089 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1090 		return (EIO);
1091 
1092 	va.va_mask = AT_UID | AT_MODE;
1093 
1094 	error = nfsgetattr(vp, &va, cr);
1095 	if (error)
1096 		return (error);
1097 
1098 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1099 			vp);
1100 
1101 	if (error)
1102 		return (error);
1103 
1104 	return (nfssetattr(vp, vap, flags, cr));
1105 }
1106 
1107 static int
1108 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1109 {
1110 	int error;
1111 	uint_t mask;
1112 	struct nfssaargs args;
1113 	struct nfsattrstat ns;
1114 	int douprintf;
1115 	rnode_t *rp;
1116 	struct vattr va;
1117 	mode_t omode;
1118 	mntinfo_t *mi;
1119 	vsecattr_t *vsp;
1120 	hrtime_t t;
1121 
1122 	mask = vap->va_mask;
1123 
1124 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1125 
1126 	rp = VTOR(vp);
1127 
1128 	/*
1129 	 * Only need to flush pages if there are any pages and
1130 	 * if the file is marked as dirty in some fashion.  The
1131 	 * file must be flushed so that we can accurately
1132 	 * determine the size of the file and the cached data
1133 	 * after the SETATTR returns.  A file is considered to
1134 	 * be dirty if it is either marked with RDIRTY, has
1135 	 * outstanding i/o's active, or is mmap'd.  In this
1136 	 * last case, we can't tell whether there are dirty
1137 	 * pages, so we flush just to be sure.
1138 	 */
1139 	if (vn_has_cached_data(vp) &&
1140 	    ((rp->r_flags & RDIRTY) ||
1141 	    rp->r_count > 0 ||
1142 	    rp->r_mapcnt > 0)) {
1143 		ASSERT(vp->v_type != VCHR);
1144 		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1145 		if (error && (error == ENOSPC || error == EDQUOT)) {
1146 			mutex_enter(&rp->r_statelock);
1147 			if (!rp->r_error)
1148 				rp->r_error = error;
1149 			mutex_exit(&rp->r_statelock);
1150 		}
1151 	}
1152 
1153 	/*
1154 	 * If the system call was utime(2) or utimes(2) and the
1155 	 * application did not specify the times, then set the
1156 	 * mtime nanosecond field to 1 billion.  This will get
1157 	 * translated from 1 billion nanoseconds to 1 million
1158 	 * microseconds in the over the wire request.  The
1159 	 * server will use 1 million in the microsecond field
1160 	 * to tell whether both the mtime and atime should be
1161 	 * set to the server's current time.
1162 	 *
1163 	 * This is an overload of the protocol and should be
1164 	 * documented in the NFS Version 2 protocol specification.
1165 	 */
1166 	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1167 		vap->va_mtime.tv_nsec = 1000000000;
1168 		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1169 		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1170 			error = vattr_to_sattr(vap, &args.saa_sa);
1171 		} else {
1172 			/*
1173 			 * Use server times. vap time values will not be used.
1174 			 * To ensure no time overflow, make sure vap has
1175 			 * valid values, but retain the original values.
1176 			 */
1177 			timestruc_t	mtime = vap->va_mtime;
1178 			timestruc_t	atime = vap->va_atime;
1179 			time_t		now;
1180 
1181 			now = gethrestime_sec();
1182 			if (NFS_TIME_T_OK(now)) {
1183 				/* Just in case server does not know of this */
1184 				vap->va_mtime.tv_sec = now;
1185 				vap->va_atime.tv_sec = now;
1186 			} else {
1187 				vap->va_mtime.tv_sec = 0;
1188 				vap->va_atime.tv_sec = 0;
1189 			}
1190 			error = vattr_to_sattr(vap, &args.saa_sa);
1191 			/* set vap times back on */
1192 			vap->va_mtime = mtime;
1193 			vap->va_atime = atime;
1194 		}
1195 	} else {
1196 		/* Either do not set times or use the client specified times */
1197 		error = vattr_to_sattr(vap, &args.saa_sa);
1198 	}
1199 	if (error) {
1200 		/* req time field(s) overflow - return immediately */
1201 		return (error);
1202 	}
1203 	args.saa_fh = *VTOFH(vp);
1204 
1205 	va.va_mask = AT_MODE;
1206 	error = nfsgetattr(vp, &va, cr);
1207 	if (error)
1208 		return (error);
1209 	omode = va.va_mode;
1210 
1211 	mi = VTOMI(vp);
1212 
1213 	douprintf = 1;
1214 
1215 	t = gethrtime();
1216 
1217 	error = rfs2call(mi, RFS_SETATTR,
1218 	    xdr_saargs, (caddr_t)&args,
1219 	    xdr_attrstat, (caddr_t)&ns, cr,
1220 	    &douprintf, &ns.ns_status, 0, NULL);
1221 
1222 	/*
1223 	 * Purge the access cache and ACL cache if changing either the
1224 	 * owner of the file, the group owner, or the mode.  These may
1225 	 * change the access permissions of the file, so purge old
1226 	 * information and start over again.
1227 	 */
1228 	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1229 		(void) nfs_access_purge_rp(rp);
1230 		if (rp->r_secattr != NULL) {
1231 			mutex_enter(&rp->r_statelock);
1232 			vsp = rp->r_secattr;
1233 			rp->r_secattr = NULL;
1234 			mutex_exit(&rp->r_statelock);
1235 			if (vsp != NULL)
1236 				nfs_acl_free(vsp);
1237 		}
1238 	}
1239 
1240 	if (!error) {
1241 		error = geterrno(ns.ns_status);
1242 		if (!error) {
1243 			/*
1244 			 * If changing the size of the file, invalidate
1245 			 * any local cached data which is no longer part
1246 			 * of the file.  We also possibly invalidate the
1247 			 * last page in the file.  We could use
1248 			 * pvn_vpzero(), but this would mark the page as
1249 			 * modified and require it to be written back to
1250 			 * the server for no particularly good reason.
1251 			 * This way, if we access it, then we bring it
1252 			 * back in.  A read should be cheaper than a
1253 			 * write.
1254 			 */
1255 			if (mask & AT_SIZE) {
1256 				nfs_invalidate_pages(vp,
1257 				    (vap->va_size & PAGEMASK), cr);
1258 			}
1259 			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1260 			/*
1261 			 * If NFS_ACL is supported on the server, then the
1262 			 * attributes returned by server may have minimal
1263 			 * permissions sometimes denying access to users having
1264 			 * proper access.  To get the proper attributes, mark
1265 			 * the attributes as expired so that they will be
1266 			 * regotten via the NFS_ACL GETATTR2 procedure.
1267 			 */
1268 			if (mi->mi_flags & MI_ACL) {
1269 				PURGE_ATTRCACHE(vp);
1270 			}
1271 			/*
1272 			 * This next check attempts to deal with NFS
1273 			 * servers which can not handle increasing
1274 			 * the size of the file via setattr.  Most
1275 			 * of these servers do not return an error,
1276 			 * but do not change the size of the file.
1277 			 * Hence, this check and then attempt to set
1278 			 * the file size by writing 1 byte at the
1279 			 * offset of the end of the file that we need.
1280 			 */
1281 			if ((mask & AT_SIZE) &&
1282 			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1283 				char zb = '\0';
1284 
1285 				error = nfswrite(vp, &zb,
1286 				    vap->va_size - sizeof (zb),
1287 				    sizeof (zb), cr);
1288 			}
1289 			/*
1290 			 * Some servers will change the mode to clear the setuid
1291 			 * and setgid bits when changing the uid or gid.  The
1292 			 * client needs to compensate appropriately.
1293 			 */
1294 			if (mask & (AT_UID | AT_GID)) {
1295 				int terror;
1296 
1297 				va.va_mask = AT_MODE;
1298 				terror = nfsgetattr(vp, &va, cr);
1299 				if (!terror &&
1300 				    (((mask & AT_MODE) &&
1301 				    va.va_mode != vap->va_mode) ||
1302 				    (!(mask & AT_MODE) &&
1303 				    va.va_mode != omode))) {
1304 					va.va_mask = AT_MODE;
1305 					if (mask & AT_MODE)
1306 						va.va_mode = vap->va_mode;
1307 					else
1308 						va.va_mode = omode;
1309 					(void) nfssetattr(vp, &va, 0, cr);
1310 				}
1311 			}
1312 		} else {
1313 			PURGE_ATTRCACHE(vp);
1314 			PURGE_STALE_FH(error, vp, cr);
1315 		}
1316 	} else {
1317 		PURGE_ATTRCACHE(vp);
1318 	}
1319 
1320 	return (error);
1321 }
1322 
1323 static int
1324 nfs_accessx(void *vp, int mode, cred_t *cr)
1325 {
1326 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1327 	return (nfs_access(vp, mode, 0, cr));
1328 }
1329 
1330 static int
1331 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
1332 {
1333 	struct vattr va;
1334 	int error;
1335 	mntinfo_t *mi;
1336 	int shift = 0;
1337 
1338 	mi = VTOMI(vp);
1339 
1340 	if (nfs_zone() != mi->mi_zone)
1341 		return (EIO);
1342 	if (mi->mi_flags & MI_ACL) {
1343 		error = acl_access2(vp, mode, flags, cr);
1344 		if (mi->mi_flags & MI_ACL)
1345 			return (error);
1346 	}
1347 
1348 	va.va_mask = AT_MODE | AT_UID | AT_GID;
1349 	error = nfsgetattr(vp, &va, cr);
1350 	if (error)
1351 		return (error);
1352 
1353 	/*
1354 	 * Disallow write attempts on read-only
1355 	 * file systems, unless the file is a
1356 	 * device node.
1357 	 */
1358 	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1359 		return (EROFS);
1360 
1361 	/*
1362 	 * Disallow attempts to access mandatory lock files.
1363 	 */
1364 	if ((mode & (VWRITE | VREAD | VEXEC)) &&
1365 	    MANDLOCK(vp, va.va_mode))
1366 		return (EACCES);
1367 
1368 	/*
1369 	 * Access check is based on only
1370 	 * one of owner, group, public.
1371 	 * If not owner, then check group.
1372 	 * If not a member of the group,
1373 	 * then check public access.
1374 	 */
1375 	if (crgetuid(cr) != va.va_uid) {
1376 		shift += 3;
1377 		if (!groupmember(va.va_gid, cr))
1378 			shift += 3;
1379 	}
1380 found:
1381 	mode &= ~(va.va_mode << shift);
1382 	if (mode == 0)
1383 		return (0);
1384 
1385 	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
1386 }
1387 
1388 static int nfs_do_symlink_cache = 1;
1389 
1390 static int
1391 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
1392 {
1393 	int error;
1394 	struct nfsrdlnres rl;
1395 	rnode_t *rp;
1396 	int douprintf;
1397 	failinfo_t fi;
1398 
1399 	/*
1400 	 * We want to be consistent with UFS semantics so we will return
1401 	 * EINVAL instead of ENXIO. This violates the XNFS spec and
1402 	 * the RFC 1094, which are wrong any way. BUGID 1138002.
1403 	 */
1404 	if (vp->v_type != VLNK)
1405 		return (EINVAL);
1406 
1407 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1408 		return (EIO);
1409 
1410 	rp = VTOR(vp);
1411 	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1412 		error = nfs_validate_caches(vp, cr);
1413 		if (error)
1414 			return (error);
1415 		mutex_enter(&rp->r_statelock);
1416 		if (rp->r_symlink.contents != NULL) {
1417 			error = uiomove(rp->r_symlink.contents,
1418 			    rp->r_symlink.len, UIO_READ, uiop);
1419 			mutex_exit(&rp->r_statelock);
1420 			return (error);
1421 		}
1422 		mutex_exit(&rp->r_statelock);
1423 	}
1424 
1425 
1426 	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1427 
1428 	fi.vp = vp;
1429 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1430 	fi.copyproc = nfscopyfh;
1431 	fi.lookupproc = nfslookup;
1432 	fi.xattrdirproc = acl_getxattrdir2;
1433 
1434 	douprintf = 1;
1435 
1436 	error = rfs2call(VTOMI(vp), RFS_READLINK,
1437 	    xdr_fhandle, (caddr_t)VTOFH(vp),
1438 	    xdr_rdlnres, (caddr_t)&rl, cr,
1439 	    &douprintf, &rl.rl_status, 0, &fi);
1440 
1441 	if (error) {
1442 
1443 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1444 		return (error);
1445 	}
1446 
1447 	error = geterrno(rl.rl_status);
1448 	if (!error) {
1449 		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1450 		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1451 			mutex_enter(&rp->r_statelock);
1452 			if (rp->r_symlink.contents == NULL) {
1453 				rp->r_symlink.contents = rl.rl_data;
1454 				rp->r_symlink.len = (int)rl.rl_count;
1455 				rp->r_symlink.size = NFS_MAXPATHLEN;
1456 				mutex_exit(&rp->r_statelock);
1457 			} else {
1458 				mutex_exit(&rp->r_statelock);
1459 
1460 				kmem_free((void *)rl.rl_data,
1461 				    NFS_MAXPATHLEN);
1462 			}
1463 		} else {
1464 
1465 			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1466 		}
1467 	} else {
1468 		PURGE_STALE_FH(error, vp, cr);
1469 
1470 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1471 	}
1472 
1473 	/*
1474 	 * Conform to UFS semantics (see comment above)
1475 	 */
1476 	return (error == ENXIO ? EINVAL : error);
1477 }
1478 
1479 /*
1480  * Flush local dirty pages to stable storage on the server.
1481  *
1482  * If FNODSYNC is specified, then there is nothing to do because
1483  * metadata changes are not cached on the client before being
1484  * sent to the server.
1485  */
1486 static int
1487 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1488 {
1489 	int error;
1490 
1491 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1492 		return (0);
1493 
1494 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1495 		return (EIO);
1496 
1497 	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1498 	if (!error)
1499 		error = VTOR(vp)->r_error;
1500 	return (error);
1501 }
1502 
1503 
1504 /*
1505  * Weirdness: if the file was removed or the target of a rename
1506  * operation while it was open, it got renamed instead.  Here we
1507  * remove the renamed file.
1508  */
1509 static void
1510 nfs_inactive(vnode_t *vp, cred_t *cr)
1511 {
1512 	rnode_t *rp;
1513 
1514 	ASSERT(vp != DNLC_NO_VNODE);
1515 
1516 	/*
1517 	 * If this is coming from the wrong zone, we let someone in the right
1518 	 * zone take care of it asynchronously.  We can get here due to
1519 	 * VN_RELE() being called from pageout() or fsflush().  This call may
1520 	 * potentially turn into an expensive no-op if, for instance, v_count
1521 	 * gets incremented in the meantime, but it's still correct.
1522 	 */
1523 	if (nfs_zone() != VTOMI(vp)->mi_zone) {
1524 		nfs_async_inactive(vp, cr, nfs_inactive);
1525 		return;
1526 	}
1527 
1528 	rp = VTOR(vp);
1529 redo:
1530 	if (rp->r_unldvp != NULL) {
1531 		/*
1532 		 * Save the vnode pointer for the directory where the
1533 		 * unlinked-open file got renamed, then set it to NULL
1534 		 * to prevent another thread from getting here before
1535 		 * we're done with the remove.  While we have the
1536 		 * statelock, make local copies of the pertinent rnode
1537 		 * fields.  If we weren't to do this in an atomic way, the
1538 		 * the unl* fields could become inconsistent with respect
1539 		 * to each other due to a race condition between this
1540 		 * code and nfs_remove().  See bug report 1034328.
1541 		 */
1542 		mutex_enter(&rp->r_statelock);
1543 		if (rp->r_unldvp != NULL) {
1544 			vnode_t *unldvp;
1545 			char *unlname;
1546 			cred_t *unlcred;
1547 			struct nfsdiropargs da;
1548 			enum nfsstat status;
1549 			int douprintf;
1550 			int error;
1551 
1552 			unldvp = rp->r_unldvp;
1553 			rp->r_unldvp = NULL;
1554 			unlname = rp->r_unlname;
1555 			rp->r_unlname = NULL;
1556 			unlcred = rp->r_unlcred;
1557 			rp->r_unlcred = NULL;
1558 			mutex_exit(&rp->r_statelock);
1559 
1560 			/*
1561 			 * If there are any dirty pages left, then flush
1562 			 * them.  This is unfortunate because they just
1563 			 * may get thrown away during the remove operation,
1564 			 * but we have to do this for correctness.
1565 			 */
1566 			if (vn_has_cached_data(vp) &&
1567 			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1568 				ASSERT(vp->v_type != VCHR);
1569 				error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1570 				if (error) {
1571 					mutex_enter(&rp->r_statelock);
1572 					if (!rp->r_error)
1573 						rp->r_error = error;
1574 					mutex_exit(&rp->r_statelock);
1575 				}
1576 			}
1577 
1578 			/*
1579 			 * Do the remove operation on the renamed file
1580 			 */
1581 			setdiropargs(&da, unlname, unldvp);
1582 
1583 			douprintf = 1;
1584 
1585 			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1586 			    xdr_diropargs, (caddr_t)&da,
1587 			    xdr_enum, (caddr_t)&status, unlcred,
1588 			    &douprintf, &status, 0, NULL);
1589 
1590 			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1591 				nfs_purge_rddir_cache(unldvp);
1592 			PURGE_ATTRCACHE(unldvp);
1593 
1594 			/*
1595 			 * Release stuff held for the remove
1596 			 */
1597 			VN_RELE(unldvp);
1598 			kmem_free(unlname, MAXNAMELEN);
1599 			crfree(unlcred);
1600 			goto redo;
1601 		}
1602 		mutex_exit(&rp->r_statelock);
1603 	}
1604 
1605 	rp_addfree(rp, cr);
1606 }
1607 
1608 /*
1609  * Remote file system operations having to do with directory manipulation.
1610  */
1611 
1612 static int
1613 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1614 	int flags, vnode_t *rdir, cred_t *cr)
1615 {
1616 	int error;
1617 	vnode_t *vp;
1618 	vnode_t *avp = NULL;
1619 	rnode_t *drp;
1620 
1621 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1622 		return (EPERM);
1623 
1624 	drp = VTOR(dvp);
1625 
1626 	/*
1627 	 * Are we looking up extended attributes?  If so, "dvp" is
1628 	 * the file or directory for which we want attributes, and
1629 	 * we need a lookup of the hidden attribute directory
1630 	 * before we lookup the rest of the path.
1631 	 */
1632 	if (flags & LOOKUP_XATTR) {
1633 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1634 		mntinfo_t *mi;
1635 
1636 		mi = VTOMI(dvp);
1637 		if (!(mi->mi_flags & MI_EXTATTR))
1638 			return (EINVAL);
1639 
1640 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1641 			return (EINTR);
1642 
1643 		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1644 		if (avp == NULL)
1645 			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1646 		else
1647 			error = 0;
1648 
1649 		nfs_rw_exit(&drp->r_rwlock);
1650 
1651 		if (error) {
1652 			if (mi->mi_flags & MI_EXTATTR)
1653 				return (error);
1654 			return (EINVAL);
1655 		}
1656 		dvp = avp;
1657 		drp = VTOR(dvp);
1658 	}
1659 
1660 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1661 		error = EINTR;
1662 		goto out;
1663 	}
1664 
1665 	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1666 
1667 	nfs_rw_exit(&drp->r_rwlock);
1668 
1669 	/*
1670 	 * If vnode is a device, create special vnode.
1671 	 */
1672 	if (!error && IS_DEVVP(*vpp)) {
1673 		vp = *vpp;
1674 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1675 		VN_RELE(vp);
1676 	}
1677 
1678 out:
1679 	if (avp != NULL)
1680 		VN_RELE(avp);
1681 
1682 	return (error);
1683 }
1684 
1685 static int nfs_lookup_neg_cache = 1;
1686 
1687 #ifdef DEBUG
1688 static int nfs_lookup_dnlc_hits = 0;
1689 static int nfs_lookup_dnlc_misses = 0;
1690 static int nfs_lookup_dnlc_neg_hits = 0;
1691 static int nfs_lookup_dnlc_disappears = 0;
1692 static int nfs_lookup_dnlc_lookups = 0;
1693 #endif
1694 
1695 /* ARGSUSED */
1696 int
1697 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1698 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1699 {
1700 	int error;
1701 
1702 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1703 
1704 	/*
1705 	 * If lookup is for "", just return dvp.  Don't need
1706 	 * to send it over the wire, look it up in the dnlc,
1707 	 * or perform any access checks.
1708 	 */
1709 	if (*nm == '\0') {
1710 		VN_HOLD(dvp);
1711 		*vpp = dvp;
1712 		return (0);
1713 	}
1714 
1715 	/*
1716 	 * Can't do lookups in non-directories.
1717 	 */
1718 	if (dvp->v_type != VDIR)
1719 		return (ENOTDIR);
1720 
1721 	/*
1722 	 * If we're called with RFSCALL_SOFT, it's important that
1723 	 * the only rfscall is one we make directly; if we permit
1724 	 * an access call because we're looking up "." or validating
1725 	 * a dnlc hit, we'll deadlock because that rfscall will not
1726 	 * have the RFSCALL_SOFT set.
1727 	 */
1728 	if (rfscall_flags & RFSCALL_SOFT)
1729 		goto callit;
1730 
1731 	/*
1732 	 * If lookup is for ".", just return dvp.  Don't need
1733 	 * to send it over the wire or look it up in the dnlc,
1734 	 * just need to check access.
1735 	 */
1736 	if (strcmp(nm, ".") == 0) {
1737 		error = nfs_access(dvp, VEXEC, 0, cr);
1738 		if (error)
1739 			return (error);
1740 		VN_HOLD(dvp);
1741 		*vpp = dvp;
1742 		return (0);
1743 	}
1744 
1745 	/*
1746 	 * Lookup this name in the DNLC.  If there was a valid entry,
1747 	 * then return the results of the lookup.
1748 	 */
1749 	error = nfslookup_dnlc(dvp, nm, vpp, cr);
1750 	if (error || *vpp != NULL)
1751 		return (error);
1752 
1753 callit:
1754 	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1755 
1756 	return (error);
1757 }
1758 
1759 static int
1760 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1761 {
1762 	int error;
1763 	vnode_t *vp;
1764 
1765 	ASSERT(*nm != '\0');
1766 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1767 
1768 	/*
1769 	 * Lookup this name in the DNLC.  If successful, then validate
1770 	 * the caches and then recheck the DNLC.  The DNLC is rechecked
1771 	 * just in case this entry got invalidated during the call
1772 	 * to nfs_validate_caches.
1773 	 *
1774 	 * An assumption is being made that it is safe to say that a
1775 	 * file exists which may not on the server.  Any operations to
1776 	 * the server will fail with ESTALE.
1777 	 */
1778 #ifdef DEBUG
1779 	nfs_lookup_dnlc_lookups++;
1780 #endif
1781 	vp = dnlc_lookup(dvp, nm);
1782 	if (vp != NULL) {
1783 		VN_RELE(vp);
1784 		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1785 			PURGE_ATTRCACHE(dvp);
1786 		}
1787 		error = nfs_validate_caches(dvp, cr);
1788 		if (error)
1789 			return (error);
1790 		vp = dnlc_lookup(dvp, nm);
1791 		if (vp != NULL) {
1792 			error = nfs_access(dvp, VEXEC, 0, cr);
1793 			if (error) {
1794 				VN_RELE(vp);
1795 				return (error);
1796 			}
1797 			if (vp == DNLC_NO_VNODE) {
1798 				VN_RELE(vp);
1799 #ifdef DEBUG
1800 				nfs_lookup_dnlc_neg_hits++;
1801 #endif
1802 				return (ENOENT);
1803 			}
1804 			*vpp = vp;
1805 #ifdef DEBUG
1806 			nfs_lookup_dnlc_hits++;
1807 #endif
1808 			return (0);
1809 		}
1810 #ifdef DEBUG
1811 		nfs_lookup_dnlc_disappears++;
1812 #endif
1813 	}
1814 #ifdef DEBUG
1815 	else
1816 		nfs_lookup_dnlc_misses++;
1817 #endif
1818 
1819 	*vpp = NULL;
1820 
1821 	return (0);
1822 }
1823 
1824 static int
1825 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1826 	int rfscall_flags)
1827 {
1828 	int error;
1829 	struct nfsdiropargs da;
1830 	struct nfsdiropres dr;
1831 	int douprintf;
1832 	failinfo_t fi;
1833 	hrtime_t t;
1834 
1835 	ASSERT(*nm != '\0');
1836 	ASSERT(dvp->v_type == VDIR);
1837 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1838 
1839 	setdiropargs(&da, nm, dvp);
1840 
1841 	fi.vp = dvp;
1842 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1843 	fi.copyproc = nfscopyfh;
1844 	fi.lookupproc = nfslookup;
1845 	fi.xattrdirproc = acl_getxattrdir2;
1846 
1847 	douprintf = 1;
1848 
1849 	t = gethrtime();
1850 
1851 	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1852 	    xdr_diropargs, (caddr_t)&da,
1853 	    xdr_diropres, (caddr_t)&dr, cr,
1854 	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
1855 
1856 	if (!error) {
1857 		error = geterrno(dr.dr_status);
1858 		if (!error) {
1859 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1860 			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1861 			/*
1862 			 * If NFS_ACL is supported on the server, then the
1863 			 * attributes returned by server may have minimal
1864 			 * permissions sometimes denying access to users having
1865 			 * proper access.  To get the proper attributes, mark
1866 			 * the attributes as expired so that they will be
1867 			 * regotten via the NFS_ACL GETATTR2 procedure.
1868 			 */
1869 			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1870 				PURGE_ATTRCACHE(*vpp);
1871 			}
1872 			if (!(rfscall_flags & RFSCALL_SOFT))
1873 				dnlc_update(dvp, nm, *vpp);
1874 		} else {
1875 			PURGE_STALE_FH(error, dvp, cr);
1876 			if (error == ENOENT && nfs_lookup_neg_cache)
1877 				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1878 		}
1879 	}
1880 
1881 	return (error);
1882 }
1883 
1884 /* ARGSUSED */
1885 static int
1886 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1887 	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
1888 {
1889 	int error;
1890 	struct nfscreatargs args;
1891 	struct nfsdiropres dr;
1892 	int douprintf;
1893 	vnode_t *vp;
1894 	rnode_t *rp;
1895 	struct vattr vattr;
1896 	rnode_t *drp;
1897 	vnode_t *tempvp;
1898 	hrtime_t t;
1899 
1900 	drp = VTOR(dvp);
1901 
1902 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1903 		return (EPERM);
1904 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1905 		return (EINTR);
1906 
1907 	/*
1908 	 * We make a copy of the attributes because the caller does not
1909 	 * expect us to change what va points to.
1910 	 */
1911 	vattr = *va;
1912 
1913 	/*
1914 	 * If the pathname is "", just use dvp.  Don't need
1915 	 * to send it over the wire, look it up in the dnlc,
1916 	 * or perform any access checks.
1917 	 */
1918 	if (*nm == '\0') {
1919 		error = 0;
1920 		VN_HOLD(dvp);
1921 		vp = dvp;
1922 	/*
1923 	 * If the pathname is ".", just use dvp.  Don't need
1924 	 * to send it over the wire or look it up in the dnlc,
1925 	 * just need to check access.
1926 	 */
1927 	} else if (strcmp(nm, ".") == 0) {
1928 		error = nfs_access(dvp, VEXEC, 0, cr);
1929 		if (error) {
1930 			nfs_rw_exit(&drp->r_rwlock);
1931 			return (error);
1932 		}
1933 		VN_HOLD(dvp);
1934 		vp = dvp;
1935 	/*
1936 	 * We need to go over the wire, just to be sure whether the
1937 	 * file exists or not.  Using the DNLC can be dangerous in
1938 	 * this case when making a decision regarding existence.
1939 	 */
1940 	} else {
1941 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
1942 	}
1943 	if (!error) {
1944 		if (exclusive == EXCL)
1945 			error = EEXIST;
1946 		else if (vp->v_type == VDIR && (mode & VWRITE))
1947 			error = EISDIR;
1948 		else {
1949 			/*
1950 			 * If vnode is a device, create special vnode.
1951 			 */
1952 			if (IS_DEVVP(vp)) {
1953 				tempvp = vp;
1954 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1955 				VN_RELE(tempvp);
1956 			}
1957 			if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
1958 				if ((vattr.va_mask & AT_SIZE) &&
1959 				    vp->v_type == VREG) {
1960 					vattr.va_mask = AT_SIZE;
1961 					error = nfssetattr(vp, &vattr, 0, cr);
1962 				}
1963 			}
1964 		}
1965 		nfs_rw_exit(&drp->r_rwlock);
1966 		if (error) {
1967 			VN_RELE(vp);
1968 		} else {
1969 			/*
1970 			 * existing file got truncated, notify.
1971 			 */
1972 			vnevent_create(vp);
1973 			*vpp = vp;
1974 		}
1975 		return (error);
1976 	}
1977 
1978 	ASSERT(vattr.va_mask & AT_TYPE);
1979 	if (vattr.va_type == VREG) {
1980 		ASSERT(vattr.va_mask & AT_MODE);
1981 		if (MANDMODE(vattr.va_mode)) {
1982 			nfs_rw_exit(&drp->r_rwlock);
1983 			return (EACCES);
1984 		}
1985 	}
1986 
1987 	dnlc_remove(dvp, nm);
1988 
1989 	setdiropargs(&args.ca_da, nm, dvp);
1990 
1991 	/*
1992 	 * Decide what the group-id of the created file should be.
1993 	 * Set it in attribute list as advisory...then do a setattr
1994 	 * if the server didn't get it right the first time.
1995 	 */
1996 	error = setdirgid(dvp, &vattr.va_gid, cr);
1997 	if (error) {
1998 		nfs_rw_exit(&drp->r_rwlock);
1999 		return (error);
2000 	}
2001 	vattr.va_mask |= AT_GID;
2002 
2003 	/*
2004 	 * This is a completely gross hack to make mknod
2005 	 * work over the wire until we can wack the protocol
2006 	 */
2007 #define	IFCHR		0020000		/* character special */
2008 #define	IFBLK		0060000		/* block special */
2009 #define	IFSOCK		0140000		/* socket */
2010 
2011 	/*
2012 	 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
2013 	 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
2014 	 * bits in the minor number where 4.x supports 8 bits.  If the 5.x
2015 	 * minor/major numbers <= 8 bits long, compress the device
2016 	 * number before sending it. Otherwise, the 4.x server will not
2017 	 * create the device with the correct device number and nothing can be
2018 	 * done about this.
2019 	 */
2020 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2021 		dev_t d = vattr.va_rdev;
2022 		dev32_t dev32;
2023 
2024 		if (vattr.va_type == VCHR)
2025 			vattr.va_mode |= IFCHR;
2026 		else
2027 			vattr.va_mode |= IFBLK;
2028 
2029 		(void) cmpldev(&dev32, d);
2030 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2031 			vattr.va_size = (u_offset_t)dev32;
2032 		else
2033 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2034 
2035 		vattr.va_mask |= AT_MODE|AT_SIZE;
2036 	} else if (vattr.va_type == VFIFO) {
2037 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
2038 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
2039 		vattr.va_mask |= AT_MODE|AT_SIZE;
2040 	} else if (vattr.va_type == VSOCK) {
2041 		vattr.va_mode |= IFSOCK;
2042 		/*
2043 		 * To avoid triggering bugs in the servers set AT_SIZE
2044 		 * (all other RFS_CREATE calls set this).
2045 		 */
2046 		vattr.va_size = 0;
2047 		vattr.va_mask |= AT_MODE|AT_SIZE;
2048 	}
2049 
2050 	args.ca_sa = &args.ca_sa_buf;
2051 	error = vattr_to_sattr(&vattr, args.ca_sa);
2052 	if (error) {
2053 		/* req time field(s) overflow - return immediately */
2054 		nfs_rw_exit(&drp->r_rwlock);
2055 		return (error);
2056 	}
2057 
2058 	douprintf = 1;
2059 
2060 	t = gethrtime();
2061 
2062 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2063 	    xdr_creatargs, (caddr_t)&args,
2064 	    xdr_diropres, (caddr_t)&dr, cr,
2065 	    &douprintf, &dr.dr_status, 0, NULL);
2066 
2067 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2068 
2069 	if (!error) {
2070 		error = geterrno(dr.dr_status);
2071 		if (!error) {
2072 			if (HAVE_RDDIR_CACHE(drp))
2073 				nfs_purge_rddir_cache(dvp);
2074 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2075 			    dvp->v_vfsp, t, cr, NULL, NULL);
2076 			/*
2077 			 * If NFS_ACL is supported on the server, then the
2078 			 * attributes returned by server may have minimal
2079 			 * permissions sometimes denying access to users having
2080 			 * proper access.  To get the proper attributes, mark
2081 			 * the attributes as expired so that they will be
2082 			 * regotten via the NFS_ACL GETATTR2 procedure.
2083 			 */
2084 			if (VTOMI(vp)->mi_flags & MI_ACL) {
2085 				PURGE_ATTRCACHE(vp);
2086 			}
2087 			dnlc_update(dvp, nm, vp);
2088 			rp = VTOR(vp);
2089 			if (vattr.va_size == 0) {
2090 				mutex_enter(&rp->r_statelock);
2091 				rp->r_size = 0;
2092 				mutex_exit(&rp->r_statelock);
2093 				if (vn_has_cached_data(vp)) {
2094 					ASSERT(vp->v_type != VCHR);
2095 					nfs_invalidate_pages(vp,
2096 					    (u_offset_t)0, cr);
2097 				}
2098 			}
2099 
2100 			/*
2101 			 * Make sure the gid was set correctly.
2102 			 * If not, try to set it (but don't lose
2103 			 * any sleep over it).
2104 			 */
2105 			if (vattr.va_gid != rp->r_attr.va_gid) {
2106 				vattr.va_mask = AT_GID;
2107 				(void) nfssetattr(vp, &vattr, 0, cr);
2108 			}
2109 
2110 			/*
2111 			 * If vnode is a device create special vnode
2112 			 */
2113 			if (IS_DEVVP(vp)) {
2114 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2115 				VN_RELE(vp);
2116 			} else
2117 				*vpp = vp;
2118 		} else {
2119 			PURGE_STALE_FH(error, dvp, cr);
2120 		}
2121 	}
2122 
2123 	nfs_rw_exit(&drp->r_rwlock);
2124 
2125 	return (error);
2126 }
2127 
2128 /*
2129  * Weirdness: if the vnode to be removed is open
2130  * we rename it instead of removing it and nfs_inactive
2131  * will remove the new name.
2132  */
2133 static int
2134 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr)
2135 {
2136 	int error;
2137 	struct nfsdiropargs da;
2138 	enum nfsstat status;
2139 	vnode_t *vp;
2140 	char *tmpname;
2141 	int douprintf;
2142 	rnode_t *rp;
2143 	rnode_t *drp;
2144 
2145 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2146 		return (EPERM);
2147 	drp = VTOR(dvp);
2148 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2149 		return (EINTR);
2150 
2151 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2152 	if (error) {
2153 		nfs_rw_exit(&drp->r_rwlock);
2154 		return (error);
2155 	}
2156 
2157 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2158 		VN_RELE(vp);
2159 		nfs_rw_exit(&drp->r_rwlock);
2160 		return (EPERM);
2161 	}
2162 
2163 	/*
2164 	 * First just remove the entry from the name cache, as it
2165 	 * is most likely the only entry for this vp.
2166 	 */
2167 	dnlc_remove(dvp, nm);
2168 
2169 	/*
2170 	 * If the file has a v_count > 1 then there may be more than one
2171 	 * entry in the name cache due multiple links or an open file,
2172 	 * but we don't have the real reference count so flush all
2173 	 * possible entries.
2174 	 */
2175 	if (vp->v_count > 1)
2176 		dnlc_purge_vp(vp);
2177 
2178 	/*
2179 	 * Now we have the real reference count on the vnode
2180 	 */
2181 	rp = VTOR(vp);
2182 	mutex_enter(&rp->r_statelock);
2183 	if (vp->v_count > 1 &&
2184 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2185 		mutex_exit(&rp->r_statelock);
2186 		tmpname = newname();
2187 		error = nfsrename(dvp, nm, dvp, tmpname, cr);
2188 		if (error)
2189 			kmem_free(tmpname, MAXNAMELEN);
2190 		else {
2191 			mutex_enter(&rp->r_statelock);
2192 			if (rp->r_unldvp == NULL) {
2193 				VN_HOLD(dvp);
2194 				rp->r_unldvp = dvp;
2195 				if (rp->r_unlcred != NULL)
2196 					crfree(rp->r_unlcred);
2197 				crhold(cr);
2198 				rp->r_unlcred = cr;
2199 				rp->r_unlname = tmpname;
2200 			} else {
2201 				kmem_free(rp->r_unlname, MAXNAMELEN);
2202 				rp->r_unlname = tmpname;
2203 			}
2204 			mutex_exit(&rp->r_statelock);
2205 		}
2206 	} else {
2207 		mutex_exit(&rp->r_statelock);
2208 		/*
2209 		 * We need to flush any dirty pages which happen to
2210 		 * be hanging around before removing the file.  This
2211 		 * shouldn't happen very often and mostly on file
2212 		 * systems mounted "nocto".
2213 		 */
2214 		if (vn_has_cached_data(vp) &&
2215 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2216 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
2217 			if (error && (error == ENOSPC || error == EDQUOT)) {
2218 				mutex_enter(&rp->r_statelock);
2219 				if (!rp->r_error)
2220 					rp->r_error = error;
2221 				mutex_exit(&rp->r_statelock);
2222 			}
2223 		}
2224 
2225 		setdiropargs(&da, nm, dvp);
2226 
2227 		douprintf = 1;
2228 
2229 		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2230 		    xdr_diropargs, (caddr_t)&da,
2231 		    xdr_enum, (caddr_t)&status, cr,
2232 		    &douprintf, &status, 0, NULL);
2233 
2234 		/*
2235 		 * The xattr dir may be gone after last attr is removed,
2236 		 * so flush it from dnlc.
2237 		 */
2238 		if (dvp->v_flag & V_XATTRDIR)
2239 			dnlc_purge_vp(dvp);
2240 
2241 		PURGE_ATTRCACHE(dvp);	/* mod time changed */
2242 		PURGE_ATTRCACHE(vp);	/* link count changed */
2243 
2244 		if (!error) {
2245 			error = geterrno(status);
2246 			if (!error) {
2247 				if (HAVE_RDDIR_CACHE(drp))
2248 					nfs_purge_rddir_cache(dvp);
2249 			} else {
2250 				PURGE_STALE_FH(error, dvp, cr);
2251 			}
2252 		}
2253 	}
2254 
2255 	if (error == 0) {
2256 		vnevent_remove(vp, dvp, nm);
2257 	}
2258 	VN_RELE(vp);
2259 
2260 	nfs_rw_exit(&drp->r_rwlock);
2261 
2262 	return (error);
2263 }
2264 
2265 static int
2266 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
2267 {
2268 	int error;
2269 	struct nfslinkargs args;
2270 	enum nfsstat status;
2271 	vnode_t *realvp;
2272 	int douprintf;
2273 	rnode_t *tdrp;
2274 
2275 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2276 		return (EPERM);
2277 	if (VOP_REALVP(svp, &realvp) == 0)
2278 		svp = realvp;
2279 
2280 	args.la_from = VTOFH(svp);
2281 	setdiropargs(&args.la_to, tnm, tdvp);
2282 
2283 	tdrp = VTOR(tdvp);
2284 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2285 		return (EINTR);
2286 
2287 	dnlc_remove(tdvp, tnm);
2288 
2289 	douprintf = 1;
2290 
2291 	error = rfs2call(VTOMI(svp), RFS_LINK,
2292 	    xdr_linkargs, (caddr_t)&args,
2293 	    xdr_enum, (caddr_t)&status, cr,
2294 	    &douprintf, &status, 0, NULL);
2295 
2296 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2297 	PURGE_ATTRCACHE(svp);	/* link count changed */
2298 
2299 	if (!error) {
2300 		error = geterrno(status);
2301 		if (!error) {
2302 			if (HAVE_RDDIR_CACHE(tdrp))
2303 				nfs_purge_rddir_cache(tdvp);
2304 		}
2305 	}
2306 
2307 	nfs_rw_exit(&tdrp->r_rwlock);
2308 
2309 	if (!error) {
2310 		/*
2311 		 * Notify the source file of this link operation.
2312 		 */
2313 		vnevent_link(svp);
2314 	}
2315 	return (error);
2316 }
2317 
2318 static int
2319 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
2320 {
2321 	vnode_t *realvp;
2322 
2323 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2324 		return (EPERM);
2325 	if (VOP_REALVP(ndvp, &realvp) == 0)
2326 		ndvp = realvp;
2327 
2328 	return (nfsrename(odvp, onm, ndvp, nnm, cr));
2329 }
2330 
/*
 * nfsrename does the real work of renaming in NFS Version 2.
 *
 * Renames the entry `onm' in directory `odvp' to `nnm' in directory
 * `ndvp' with the RFS_RENAME procedure.  The caller is expected to be in
 * the zone of the mount already; this is only ASSERTed here.
 *
 * Both directory rnodes are write-locked for the whole operation.  The
 * rnode with the lower address is locked first (presumably so that two
 * renames touching the same pair of directories cannot deadlock on lock
 * order -- NOTE(review): confirm intent).
 *
 * If the target name exists, is not a directory and still has extra
 * holds after the DNLC has been purged, it is first given a temporary
 * name -- by hard link, or by rename when the link attempt returns
 * EOPNOTSUPP -- and that name is recorded in its rnode (r_unldvp,
 * r_unlcred, r_unlname), the same bookkeeping nfs_remove performs.
 *
 * Returns 0 on success; EINVAL if either name is "." or ".."; EINTR if
 * interrupted while acquiring a directory lock; EBUSY if the target is
 * mounted on; ENOTDIR if a directory is renamed over an active
 * non-directory; EEXIST in place of ENOTEMPTY; otherwise the error from
 * the source lookup, the temporary link/rename or the RPC.
 */
static int
nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
{
	int error;
	enum nfsstat status;
	struct nfsrnmargs args;
	int douprintf;
	vnode_t *nvp = NULL;	/* existing target, if any (held) */
	vnode_t *ovp = NULL;	/* source vnode (held once looked up) */
	char *tmpname;
	rnode_t *rp;
	rnode_t *odrp;
	rnode_t *ndrp;

	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
		return (EINVAL);

	/*
	 * Take both directory locks, lower rnode address first.  When
	 * odvp == ndvp the same lock is entered twice and exited twice.
	 */
	odrp = VTOR(odvp);
	ndrp = VTOR(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).  A failed lookup is not treated as an
	 * error here; the rename simply proceeds without a target vnode.
	 */
	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
	if (!error) {
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_mountedvfs(nvp) != NULL) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * Purge the name cache of all references to this vnode
		 * so that we can check the reference count to infer
		 * whether it is active or not.
		 */
		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);
		/*
		 * If the file has a v_count > 1 then there may be more
		 * than one entry in the name cache due to multiple links
		 * or an open file, but we don't have the real reference
		 * count so flush all possible entries.
		 */
		if (nvp->v_count > 1)
			dnlc_purge_vp(nvp);

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible.  This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
			    cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (ovp == nvp) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}

			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  Link it
			 * to a temporary filename to avoid having
			 * the server removing the file completely.
			 * If the link attempt returns EOPNOTSUPP, rename
			 * the target to the temporary name instead.
			 */
			tmpname = newname();
			error = nfs_link(ndvp, nvp, tmpname, cr);
			if (error == EOPNOTSUPP) {
				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
				    cr);
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}
			/*
			 * Record the temporary name in the target's rnode.
			 * The first time through, the directory hold and
			 * the credential are recorded too; otherwise only
			 * the name is replaced.
			 */
			rp = VTOR(nvp);
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */

		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);

		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
	}

	/* Neither name may stay in the name cache across the rename. */
	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	setdiropargs(&args.rna_from, onm, odvp);
	setdiropargs(&args.rna_to, nnm, ndvp);

	douprintf = 1;

	error = rfs2call(VTOMI(odvp), RFS_RENAME,
	    xdr_rnmargs, (caddr_t)&args,
	    xdr_enum, (caddr_t)&status, cr,
	    &douprintf, &status, 0, NULL);

	PURGE_ATTRCACHE(odvp);	/* mod time changed */
	PURGE_ATTRCACHE(ndvp);	/* mod time changed */

	if (!error) {
		error = geterrno(status);
		if (!error) {
			if (HAVE_RDDIR_CACHE(odrp))
				nfs_purge_rddir_cache(odvp);
			if (HAVE_RDDIR_CACHE(ndrp))
				nfs_purge_rddir_cache(ndvp);
			/*
			 * when renaming directories to be a subdirectory of a
			 * different parent, the dnlc entry for ".." will no
			 * longer be valid, so it must be removed
			 */
			rp = VTOR(ovp);
			if (ndvp != odvp) {
				if (ovp->v_type == VDIR) {
					dnlc_remove(ovp, "..");
					if (HAVE_RDDIR_CACHE(rp))
						nfs_purge_rddir_cache(ovp);
				}
			}

			/*
			 * If we are renaming the unlinked file, update the
			 * r_unldvp and r_unlname as needed.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp != NULL) {
				if (strcmp(rp->r_unlname, onm) == 0) {
					/*
					 * strncpy does not terminate on
					 * truncation, hence the explicit
					 * NUL store that follows.
					 */
					(void) strncpy(rp->r_unlname,
						    nnm, MAXNAMELEN);
					rp->r_unlname[MAXNAMELEN - 1] = '\0';

					if (ndvp != rp->r_unldvp) {
						VN_RELE(rp->r_unldvp);
						rp->r_unldvp = ndvp;
						VN_HOLD(ndvp);
					}
				}
			}
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * System V defines rename to return EEXIST, not
			 * ENOTEMPTY if the target directory is not empty.
			 * Over the wire, the error is NFSERR_ENOTEMPTY
			 * which geterrno maps to ENOTEMPTY.
			 */
			if (error == ENOTEMPTY)
				error = EEXIST;
		}
	}

	/* Post vnode events only for a rename that fully succeeded. */
	if (error == 0) {
		if (nvp)
			vnevent_rename_dest(nvp, ndvp, nnm);

		if (odvp != ndvp)
			vnevent_rename_dest_dir(ndvp);

		ASSERT(ovp != NULL);
		vnevent_rename_src(ovp, odvp, onm);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
2617 
2618 static int
2619 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr)
2620 {
2621 	int error;
2622 	struct nfscreatargs args;
2623 	struct nfsdiropres dr;
2624 	int douprintf;
2625 	rnode_t *drp;
2626 	hrtime_t t;
2627 
2628 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2629 		return (EPERM);
2630 
2631 	setdiropargs(&args.ca_da, nm, dvp);
2632 
2633 	/*
2634 	 * Decide what the group-id and set-gid bit of the created directory
2635 	 * should be.  May have to do a setattr to get the gid right.
2636 	 */
2637 	error = setdirgid(dvp, &va->va_gid, cr);
2638 	if (error)
2639 		return (error);
2640 	error = setdirmode(dvp, &va->va_mode, cr);
2641 	if (error)
2642 		return (error);
2643 	va->va_mask |= AT_MODE|AT_GID;
2644 
2645 	args.ca_sa = &args.ca_sa_buf;
2646 	error = vattr_to_sattr(va, args.ca_sa);
2647 	if (error) {
2648 		/* req time field(s) overflow - return immediately */
2649 		return (error);
2650 	}
2651 
2652 	drp = VTOR(dvp);
2653 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2654 		return (EINTR);
2655 
2656 	dnlc_remove(dvp, nm);
2657 
2658 	douprintf = 1;
2659 
2660 	t = gethrtime();
2661 
2662 	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2663 	    xdr_creatargs, (caddr_t)&args,
2664 	    xdr_diropres, (caddr_t)&dr, cr,
2665 	    &douprintf, &dr.dr_status, 0, NULL);
2666 
2667 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2668 
2669 	if (!error) {
2670 		error = geterrno(dr.dr_status);
2671 		if (!error) {
2672 			if (HAVE_RDDIR_CACHE(drp))
2673 				nfs_purge_rddir_cache(dvp);
2674 			/*
2675 			 * The attributes returned by RFS_MKDIR can not
2676 			 * be depended upon, so mark the attribute cache
2677 			 * as purged.  A subsequent GETATTR will get the
2678 			 * correct attributes from the server.
2679 			 */
2680 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2681 			    dvp->v_vfsp, t, cr, NULL, NULL);
2682 			PURGE_ATTRCACHE(*vpp);
2683 			dnlc_update(dvp, nm, *vpp);
2684 
2685 			/*
2686 			 * Make sure the gid was set correctly.
2687 			 * If not, try to set it (but don't lose
2688 			 * any sleep over it).
2689 			 */
2690 			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2691 				va->va_mask = AT_GID;
2692 				(void) nfssetattr(*vpp, va, 0, cr);
2693 			}
2694 		} else {
2695 			PURGE_STALE_FH(error, dvp, cr);
2696 		}
2697 	}
2698 
2699 	nfs_rw_exit(&drp->r_rwlock);
2700 
2701 	return (error);
2702 }
2703 
2704 static int
2705 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr)
2706 {
2707 	int error;
2708 	enum nfsstat status;
2709 	struct nfsdiropargs da;
2710 	vnode_t *vp;
2711 	int douprintf;
2712 	rnode_t *drp;
2713 
2714 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2715 		return (EPERM);
2716 	drp = VTOR(dvp);
2717 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2718 		return (EINTR);
2719 
2720 	/*
2721 	 * Attempt to prevent a rmdir(".") from succeeding.
2722 	 */
2723 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2724 	if (error) {
2725 		nfs_rw_exit(&drp->r_rwlock);
2726 		return (error);
2727 	}
2728 
2729 	if (vp == cdir) {
2730 		VN_RELE(vp);
2731 		nfs_rw_exit(&drp->r_rwlock);
2732 		return (EINVAL);
2733 	}
2734 
2735 	setdiropargs(&da, nm, dvp);
2736 
2737 	/*
2738 	 * First just remove the entry from the name cache, as it
2739 	 * is most likely an entry for this vp.
2740 	 */
2741 	dnlc_remove(dvp, nm);
2742 
2743 	/*
2744 	 * If there vnode reference count is greater than one, then
2745 	 * there may be additional references in the DNLC which will
2746 	 * need to be purged.  First, trying removing the entry for
2747 	 * the parent directory and see if that removes the additional
2748 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2749 	 * to completely remove any references to the directory which
2750 	 * might still exist in the DNLC.
2751 	 */
2752 	if (vp->v_count > 1) {
2753 		dnlc_remove(vp, "..");
2754 		if (vp->v_count > 1)
2755 			dnlc_purge_vp(vp);
2756 	}
2757 
2758 	douprintf = 1;
2759 
2760 	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2761 	    xdr_diropargs, (caddr_t)&da,
2762 	    xdr_enum, (caddr_t)&status, cr,
2763 	    &douprintf, &status, 0, NULL);
2764 
2765 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2766 
2767 	if (error) {
2768 		VN_RELE(vp);
2769 		nfs_rw_exit(&drp->r_rwlock);
2770 		return (error);
2771 	}
2772 
2773 	error = geterrno(status);
2774 	if (!error) {
2775 		if (HAVE_RDDIR_CACHE(drp))
2776 			nfs_purge_rddir_cache(dvp);
2777 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
2778 			nfs_purge_rddir_cache(vp);
2779 	} else {
2780 		PURGE_STALE_FH(error, dvp, cr);
2781 		/*
2782 		 * System V defines rmdir to return EEXIST, not
2783 		 * ENOTEMPTY if the directory is not empty.  Over
2784 		 * the wire, the error is NFSERR_ENOTEMPTY which
2785 		 * geterrno maps to ENOTEMPTY.
2786 		 */
2787 		if (error == ENOTEMPTY)
2788 			error = EEXIST;
2789 	}
2790 
2791 	if (error == 0) {
2792 		vnevent_rmdir(vp, dvp, nm);
2793 	}
2794 	VN_RELE(vp);
2795 
2796 	nfs_rw_exit(&drp->r_rwlock);
2797 
2798 	return (error);
2799 }
2800 
2801 static int
2802 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr)
2803 {
2804 	int error;
2805 	struct nfsslargs args;
2806 	enum nfsstat status;
2807 	int douprintf;
2808 	rnode_t *drp;
2809 
2810 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2811 		return (EPERM);
2812 	setdiropargs(&args.sla_from, lnm, dvp);
2813 	args.sla_sa = &args.sla_sa_buf;
2814 	error = vattr_to_sattr(tva, args.sla_sa);
2815 	if (error) {
2816 		/* req time field(s) overflow - return immediately */
2817 		return (error);
2818 	}
2819 	args.sla_tnm = tnm;
2820 
2821 	drp = VTOR(dvp);
2822 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2823 		return (EINTR);
2824 
2825 	dnlc_remove(dvp, lnm);
2826 
2827 	douprintf = 1;
2828 
2829 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2830 	    xdr_slargs, (caddr_t)&args,
2831 	    xdr_enum, (caddr_t)&status, cr,
2832 	    &douprintf, &status, 0, NULL);
2833 
2834 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2835 
2836 	if (!error) {
2837 		error = geterrno(status);
2838 		if (!error) {
2839 			if (HAVE_RDDIR_CACHE(drp))
2840 				nfs_purge_rddir_cache(dvp);
2841 		} else {
2842 			PURGE_STALE_FH(error, dvp, cr);
2843 		}
2844 	}
2845 
2846 	nfs_rw_exit(&drp->r_rwlock);
2847 
2848 	return (error);
2849 }
2850 
#ifdef DEBUG
/*
 * Readdir cache statistics, maintained by nfs_readdir on DEBUG kernels
 * only.  They are incremented without any dedicated lock.
 */
/* Requests satisfied from a cache entry without a preceding miss. */
static int nfs_readdir_cache_hits = 0;
/* Requests short-circuited because the offset equals the EOF cookie. */
static int nfs_readdir_cache_shorts = 0;
/* Times a thread waited for another thread to finish filling an entry. */
static int nfs_readdir_cache_waits = 0;
/* Times nfs_readdir had to issue the readdir itself to fill an entry. */
static int nfs_readdir_cache_misses = 0;
/* Asynchronous readahead requests started for the next cookie. */
static int nfs_readdir_readahead = 0;
#endif

/*
 * When nonzero, nfs_readdir limits each request to 0x400 bytes instead
 * of NFS_MAXDATA, for servers that cannot handle larger RFS_READDIR
 * requests.
 */
static int nfs_shrinkreaddir = 0;
2860 
/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_offset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read. The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 *
 * Arguments:
 *	vp	directory vnode; its r_rwlock is ASSERTed to be held as
 *		reader on entry
 *	uiop	destination for the entries; uio_offset carries the
 *		cookie in and the next cookie out
 *	cr	credentials used for cache validation and the RPC
 *	eofp	if non-NULL, set from the cache entry's eof flag (or to
 *		1 when the request is short-circuited at end of directory)
 *
 * Results are served from a per-rnode AVL tree of rddir_cache entries
 * looked up by (cookie, request size).  The function loops through
 * "top" until it finds a filled entry, filling one itself at "bottom"
 * when needed.  After a successful copyout of a non-EOF entry it tries
 * to start an asynchronous readahead for the next cookie.
 *
 * Returns 0 on success, EIO for a caller in the wrong zone, EINTR if
 * interrupted while waiting for another thread to fill an entry, or the
 * error from cache validation, the fill, or the copyout.
 */
static int
nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp)
{
	int error;
	size_t count;
	rnode_t *rp;
	rddir_cache *rdc;	/* entry found in the cache */
	rddir_cache *nrdc;	/* entry allocated (or claimed) to fill */
	rddir_cache *rrdc;	/* entry for the readahead request */
#ifdef DEBUG
	int missed;
#endif
	rddir_cache srdc;	/* search key for avl_find */
	avl_index_t where;

	rp = VTOR(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * Make sure that the directory cache is valid.
	 */
	if (HAVE_RDDIR_CACHE(rp)) {
		if (nfs_disable_rddir_cache) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs_purge_rddir_cache(vp);
		} else {
			error = nfs_validate_caches(vp, cr);
			if (error)
				return (error);
		}
	}

	/*
	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
	 * RFS_READDIR request with rda_count set to more than 0x400. So
	 * we reduce the request size here purely for compatibility.
	 *
	 * In general, this is no longer required.  However, if a server
	 * is discovered which can not handle requests larger than 1024,
	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
	 * compatibility.
	 *
	 * In any case, the request size is limited to NFS_MAXDATA bytes.
	 */
	count = MIN(uiop->uio_iov->iov_len,
	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);

	nrdc = NULL;
#ifdef DEBUG
	missed = 0;
#endif
top:
	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		if (nrdc != NULL)
			rddir_cache_rele(nrdc);
		return (0);
	}
	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 */
	srdc.nfs_cookie = uiop->uio_offset;
	srdc.buflen = count;
	rdc = avl_find(&rp->r_dir, &srdc, &where);
	if (rdc != NULL) {
		rddir_cache_hold(rdc);
		/*
		 * If the cache entry is in the process of being
		 * filled in, wait until this completes.  The
		 * RDDIRWAIT bit is set to indicate that someone
		 * is waiting and when the thread currently
		 * filling the entry is done, it should do a
		 * cv_broadcast to wakeup all of the threads
		 * waiting for it to finish.
		 *
		 * The reader lock on r_rwlock is dropped for the
		 * duration of the wait and re-entered afterwards on
		 * both the interrupted and the normal path.
		 */
		if (rdc->flags & RDDIR) {
			nfs_rw_exit(&rp->r_rwlock);
			rdc->flags |= RDDIRWAIT;
#ifdef DEBUG
			nfs_readdir_cache_waits++;
#endif
			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
				/*
				 * We got interrupted, probably
				 * the user typed ^C or an alarm
				 * fired.  We free the new entry
				 * if we allocated one.
				 */
				mutex_exit(&rp->r_statelock);
				(void) nfs_rw_enter_sig(&rp->r_rwlock,
					RW_READER, FALSE);
				rddir_cache_rele(rdc);
				if (nrdc != NULL)
					rddir_cache_rele(nrdc);
				return (EINTR);
			}
			mutex_exit(&rp->r_statelock);
			(void) nfs_rw_enter_sig(&rp->r_rwlock,
				RW_READER, FALSE);
			rddir_cache_rele(rdc);
			goto top;
		}
		/*
		 * Check to see if a readdir is required to
		 * fill the entry.  If so, mark this entry
		 * as being filled, drop any spare entry we
		 * allocated, and branch to the code to fill the
		 * entry.  The hold taken above goes along with it.
		 */
		if (rdc->flags & RDDIRREQ) {
			rdc->flags &= ~RDDIRREQ;
			rdc->flags |= RDDIR;
			if (nrdc != NULL)
				rddir_cache_rele(nrdc);
			nrdc = rdc;
			mutex_exit(&rp->r_statelock);
			goto bottom;
		}
#ifdef DEBUG
		if (!missed)
			nfs_readdir_cache_hits++;
#endif
		/*
		 * If an error occurred while attempting
		 * to fill the cache entry, just return it.
		 */
		if (rdc->error) {
			error = rdc->error;
			mutex_exit(&rp->r_statelock);
			rddir_cache_rele(rdc);
			if (nrdc != NULL)
				rddir_cache_rele(nrdc);
			return (error);
		}

		/*
		 * The cache entry is complete and good,
		 * copyout the dirent structs to the calling
		 * thread.
		 */
		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);

		/*
		 * If no error occurred during the copyout,
		 * update the offset in the uio struct to
		 * contain the value of the next cookie
		 * and set the eof value appropriately.
		 */
		if (!error) {
			uiop->uio_offset = rdc->nfs_ncookie;
			if (eofp)
				*eofp = rdc->eof;
		}

		/*
		 * Decide whether to do readahead.  Don't if
		 * have already read to the end of directory.
		 */
		if (rdc->eof) {
			rp->r_direof = rdc;
			mutex_exit(&rp->r_statelock);
			rddir_cache_rele(rdc);
			if (nrdc != NULL)
				rddir_cache_rele(nrdc);
			return (error);
		}

		/*
		 * Check to see whether we found an entry
		 * for the readahead.  If so, we don't need
		 * to do anything further, so free the new
		 * entry if one was allocated.  Otherwise,
		 * allocate a new entry, add it to the cache,
		 * and then initiate an asynchronous readdir
		 * operation to fill it.
		 *
		 * The allocation uses KM_NOSLEEP; if it fails the
		 * readahead is simply skipped.
		 */
		srdc.nfs_cookie = rdc->nfs_ncookie;
		srdc.buflen = count;
		rrdc = avl_find(&rp->r_dir, &srdc, &where);
		if (rrdc != NULL) {
			if (nrdc != NULL)
				rddir_cache_rele(nrdc);
		} else {
			if (nrdc != NULL)
				rrdc = nrdc;
			else {
				rrdc = rddir_cache_alloc(KM_NOSLEEP);
			}
			if (rrdc != NULL) {
				rrdc->nfs_cookie = rdc->nfs_ncookie;
				rrdc->buflen = count;
				avl_insert(&rp->r_dir, rrdc, where);
				rddir_cache_hold(rrdc);
				mutex_exit(&rp->r_statelock);
				rddir_cache_rele(rdc);
#ifdef DEBUG
				nfs_readdir_readahead++;
#endif
				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
				return (error);
			}
		}

		mutex_exit(&rp->r_statelock);
		rddir_cache_rele(rdc);
		return (error);
	}

	/*
	 * Didn't find an entry in the cache.  Construct a new empty
	 * entry and link it into the cache.  Other processes attempting
	 * to access this entry will need to wait until it is filled in.
	 *
	 * Since kmem_alloc may block, another pass through the cache
	 * will need to be taken to make sure that another process
	 * hasn't already added an entry to the cache for this request.
	 */
	if (nrdc == NULL) {
		mutex_exit(&rp->r_statelock);
		nrdc = rddir_cache_alloc(KM_SLEEP);
		nrdc->nfs_cookie = uiop->uio_offset;
		nrdc->buflen = count;
		goto top;
	}

	/*
	 * Add this entry to the cache.
	 */
	avl_insert(&rp->r_dir, nrdc, where);
	rddir_cache_hold(nrdc);
	mutex_exit(&rp->r_statelock);

bottom:
#ifdef DEBUG
	missed = 1;
	nfs_readdir_cache_misses++;
#endif
	/*
	 * Do the readdir.
	 */
	error = nfsreaddir(vp, nrdc, cr);

	/*
	 * If this operation failed, just return the error which occurred.
	 * NOTE(review): nrdc is not released on this path; presumably
	 * nfsreaddir disposes of the reference it was handed -- confirm.
	 */
	if (error != 0)
		return (error);

	/*
	 * Since the RPC operation will have taken sometime and blocked
	 * this process, another pass through the cache will need to be
	 * taken to find the correct cache entry.  It is possible that
	 * the correct cache entry will not be there (although one was
	 * added) because the directory changed during the RPC operation
	 * and the readdir cache was flushed.  In this case, just start
	 * over.  It is hoped that this will not happen too often... :-)
	 */
	nrdc = NULL;
	goto top;
	/* NOTREACHED */
}
3153 
3154 static int
3155 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3156 {
3157 	int error;
3158 	struct nfsrddirargs rda;
3159 	struct nfsrddirres rd;
3160 	rnode_t *rp;
3161 	mntinfo_t *mi;
3162 	uint_t count;
3163 	int douprintf;
3164 	failinfo_t fi, *fip;
3165 
3166 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3167 	count = rdc->buflen;
3168 
3169 	rp = VTOR(vp);
3170 	mi = VTOMI(vp);
3171 
3172 	rda.rda_fh = *VTOFH(vp);
3173 	rda.rda_offset = rdc->nfs_cookie;
3174 
3175 	/*
3176 	 * NFS client failover support
3177 	 * suppress failover unless we have a zero cookie
3178 	 */
3179 	if (rdc->nfs_cookie == (off_t)0) {
3180 		fi.vp = vp;
3181 		fi.fhp = (caddr_t)&rda.rda_fh;
3182 		fi.copyproc = nfscopyfh;
3183 		fi.lookupproc = nfslookup;
3184 		fi.xattrdirproc = acl_getxattrdir2;
3185 		fip = &fi;
3186 	} else {
3187 		fip = NULL;
3188 	}
3189 
3190 	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3191 	rd.rd_size = count;
3192 	rd.rd_offset = rda.rda_offset;
3193 
3194 	douprintf = 1;
3195 
3196 	if (mi->mi_io_kstats) {
3197 		mutex_enter(&mi->mi_lock);
3198 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3199 		mutex_exit(&mi->mi_lock);
3200 	}
3201 
3202 	do {
3203 		rda.rda_count = MIN(count, mi->mi_curread);
3204 		error = rfs2call(mi, RFS_READDIR,
3205 		    xdr_rddirargs, (caddr_t)&rda,
3206 		    xdr_getrddirres, (caddr_t)&rd, cr,
3207 		    &douprintf, &rd.rd_status, 0, fip);
3208 	} while (error == ENFS_TRYAGAIN);
3209 
3210 	if (mi->mi_io_kstats) {
3211 		mutex_enter(&mi->mi_lock);
3212 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3213 		mutex_exit(&mi->mi_lock);
3214 	}
3215 
3216 	/*
3217 	 * Since we are actually doing a READDIR RPC, we must have
3218 	 * exclusive access to the cache entry being filled.  Thus,
3219 	 * it is safe to update all fields except for the flags
3220 	 * field.  The r_statelock in the rnode must be held to
3221 	 * prevent two different threads from simultaneously
3222 	 * attempting to update the flags field.  This can happen
3223 	 * if we are turning off RDDIR and the other thread is
3224 	 * trying to set RDDIRWAIT.
3225 	 */
3226 	ASSERT(rdc->flags & RDDIR);
3227 	if (!error) {
3228 		error = geterrno(rd.rd_status);
3229 		if (!error) {
3230 			rdc->nfs_ncookie = rd.rd_offset;
3231 			rdc->eof = rd.rd_eof ? 1 : 0;
3232 			rdc->entlen = rd.rd_size;
3233 			ASSERT(rdc->entlen <= rdc->buflen);
3234 #ifdef DEBUG
3235 			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3236 			    KM_SLEEP);
3237 #else
3238 			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3239 #endif
3240 			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3241 			rdc->error = 0;
3242 			if (mi->mi_io_kstats) {
3243 				mutex_enter(&mi->mi_lock);
3244 				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3245 				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3246 				    rd.rd_size;
3247 				mutex_exit(&mi->mi_lock);
3248 			}
3249 		} else {
3250 			PURGE_STALE_FH(error, vp, cr);
3251 		}
3252 	}
3253 	if (error) {
3254 		rdc->entries = NULL;
3255 		rdc->error = error;
3256 	}
3257 	kmem_free(rd.rd_entries, rdc->buflen);
3258 
3259 	mutex_enter(&rp->r_statelock);
3260 	rdc->flags &= ~RDDIR;
3261 	if (rdc->flags & RDDIRWAIT) {
3262 		rdc->flags &= ~RDDIRWAIT;
3263 		cv_broadcast(&rdc->cv);
3264 	}
3265 	if (error)
3266 		rdc->flags |= RDDIRREQ;
3267 	mutex_exit(&rp->r_statelock);
3268 
3269 	rddir_cache_rele(rdc);
3270 
3271 	return (error);
3272 }
3273 
3274 #ifdef DEBUG
3275 static int nfs_bio_do_stop = 0;
3276 #endif
3277 
3278 static int
3279 nfs_bio(struct buf *bp, cred_t *cr)
3280 {
3281 	rnode_t *rp = VTOR(bp->b_vp);
3282 	int count;
3283 	int error;
3284 	cred_t *cred;
3285 	uint_t offset;
3286 
3287 	DTRACE_IO1(start, struct buf *, bp);
3288 
3289 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3290 	offset = dbtob(bp->b_blkno);
3291 
3292 	if (bp->b_flags & B_READ) {
3293 		mutex_enter(&rp->r_statelock);
3294 		if (rp->r_cred != NULL) {
3295 			cred = rp->r_cred;
3296 			crhold(cred);
3297 		} else {
3298 			rp->r_cred = cr;
3299 			crhold(cr);
3300 			cred = cr;
3301 			crhold(cred);
3302 		}
3303 		mutex_exit(&rp->r_statelock);
3304 	read_again:
3305 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3306 		    offset, bp->b_bcount, &bp->b_resid, cred);
3307 		crfree(cred);
3308 		if (!error) {
3309 			if (bp->b_resid) {
3310 				/*
3311 				 * Didn't get it all because we hit EOF,
3312 				 * zero all the memory beyond the EOF.
3313 				 */
3314 				/* bzero(rdaddr + */
3315 				bzero(bp->b_un.b_addr +
3316 				    bp->b_bcount - bp->b_resid, bp->b_resid);
3317 			}
3318 			mutex_enter(&rp->r_statelock);
3319 			if (bp->b_resid == bp->b_bcount &&
3320 			    offset >= rp->r_size) {
3321 				/*
3322 				 * We didn't read anything at all as we are
3323 				 * past EOF.  Return an error indicator back
3324 				 * but don't destroy the pages (yet).
3325 				 */
3326 				error = NFS_EOF;
3327 			}
3328 			mutex_exit(&rp->r_statelock);
3329 		} else if (error == EACCES) {
3330 			mutex_enter(&rp->r_statelock);
3331 			if (cred != cr) {
3332 				if (rp->r_cred != NULL)
3333 					crfree(rp->r_cred);
3334 				rp->r_cred = cr;
3335 				crhold(cr);
3336 				cred = cr;
3337 				crhold(cred);
3338 				mutex_exit(&rp->r_statelock);
3339 				goto read_again;
3340 			}
3341 			mutex_exit(&rp->r_statelock);
3342 		}
3343 	} else {
3344 		if (!(rp->r_flags & RSTALE)) {
3345 			mutex_enter(&rp->r_statelock);
3346 			if (rp->r_cred != NULL) {
3347 				cred = rp->r_cred;
3348 				crhold(cred);
3349 			} else {
3350 				rp->r_cred = cr;
3351 				crhold(cr);
3352 				cred = cr;
3353 				crhold(cred);
3354 			}
3355 			mutex_exit(&rp->r_statelock);
3356 		write_again:
3357 			mutex_enter(&rp->r_statelock);
3358 			count = MIN(bp->b_bcount, rp->r_size - offset);
3359 			mutex_exit(&rp->r_statelock);
3360 			if (count < 0)
3361 				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3362 #ifdef DEBUG
3363 			if (count == 0) {
3364 				zcmn_err(getzoneid(), CE_WARN,
3365 				    "nfs_bio: zero length write at %d",
3366 				    offset);
3367 				nfs_printfhandle(&rp->r_fh);
3368 				if (nfs_bio_do_stop)
3369 					debug_enter("nfs_bio");
3370 			}
3371 #endif
3372 			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3373 			    count, cred);
3374 			if (error == EACCES) {
3375 				mutex_enter(&rp->r_statelock);
3376 				if (cred != cr) {
3377 					if (rp->r_cred != NULL)
3378 						crfree(rp->r_cred);
3379 					rp->r_cred = cr;
3380 					crhold(cr);
3381 					crfree(cred);
3382 					cred = cr;
3383 					crhold(cred);
3384 					mutex_exit(&rp->r_statelock);
3385 					goto write_again;
3386 				}
3387 				mutex_exit(&rp->r_statelock);
3388 			}
3389 			bp->b_error = error;
3390 			if (error && error != EINTR) {
3391 				/*
3392 				 * Don't print EDQUOT errors on the console.
3393 				 * Don't print asynchronous EACCES errors.
3394 				 * Don't print EFBIG errors.
3395 				 * Print all other write errors.
3396 				 */
3397 				if (error != EDQUOT && error != EFBIG &&
3398 				    (error != EACCES ||
3399 				    !(bp->b_flags & B_ASYNC)))
3400 					nfs_write_error(bp->b_vp, error, cred);
3401 				/*
3402 				 * Update r_error and r_flags as appropriate.
3403 				 * If the error was ESTALE, then mark the
3404 				 * rnode as not being writeable and save
3405 				 * the error status.  Otherwise, save any
3406 				 * errors which occur from asynchronous
3407 				 * page invalidations.  Any errors occurring
3408 				 * from other operations should be saved
3409 				 * by the caller.
3410 				 */
3411 				mutex_enter(&rp->r_statelock);
3412 				if (error == ESTALE) {
3413 					rp->r_flags |= RSTALE;
3414 					if (!rp->r_error)
3415 						rp->r_error = error;
3416 				} else if (!rp->r_error &&
3417 				    (bp->b_flags &
3418 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
3419 				    (B_INVAL|B_FORCE|B_ASYNC)) {
3420 					rp->r_error = error;
3421 				}
3422 				mutex_exit(&rp->r_statelock);
3423 			}
3424 			crfree(cred);
3425 		} else
3426 			error = rp->r_error;
3427 	}
3428 
3429 	if (error != 0 && error != NFS_EOF)
3430 		bp->b_flags |= B_ERROR;
3431 
3432 	DTRACE_IO1(done, struct buf *, bp);
3433 
3434 	return (error);
3435 }
3436 
3437 static int
3438 nfs_fid(vnode_t *vp, fid_t *fidp)
3439 {
3440 	struct nfs_fid *fp;
3441 	rnode_t *rp;
3442 
3443 	rp = VTOR(vp);
3444 
3445 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3446 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3447 		return (ENOSPC);
3448 	}
3449 	fp = (struct nfs_fid *)fidp;
3450 	fp->nf_pad = 0;
3451 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3452 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3453 	return (0);
3454 }
3455 
3456 /* ARGSUSED2 */
3457 static int
3458 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3459 {
3460 	rnode_t *rp = VTOR(vp);
3461 
3462 	if (!write_lock) {
3463 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3464 		return (V_WRITELOCK_FALSE);
3465 	}
3466 
3467 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3468 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3469 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3470 			return (V_WRITELOCK_FALSE);
3471 		nfs_rw_exit(&rp->r_rwlock);
3472 	}
3473 
3474 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3475 	return (V_WRITELOCK_TRUE);
3476 }
3477 
3478 /* ARGSUSED */
3479 static void
3480 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3481 {
3482 	rnode_t *rp = VTOR(vp);
3483 
3484 	nfs_rw_exit(&rp->r_rwlock);
3485 }
3486 
3487 /* ARGSUSED */
3488 static int
3489 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
3490 {
3491 
3492 	/*
3493 	 * Because we stuff the readdir cookie into the offset field
3494 	 * someone may attempt to do an lseek with the cookie which
3495 	 * we want to succeed.
3496 	 */
3497 	if (vp->v_type == VDIR)
3498 		return (0);
3499 	if (*noffp < 0 || *noffp > MAXOFF32_T)
3500 		return (EINVAL);
3501 	return (0);
3502 }
3503 
3504 /*
3505  * number of NFS_MAXDATA blocks to read ahead
3506  * optimized for 100 base-T.
3507  */
3508 static int nfs_nra = 4;
3509 
3510 #ifdef DEBUG
3511 static int nfs_lostpage = 0;	/* number of times we lost original page */
3512 #endif
3513 
3514 /*
3515  * Return all the pages from [off..off+len) in file
3516  */
3517 static int
3518 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3519 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3520 	enum seg_rw rw, cred_t *cr)
3521 {
3522 	rnode_t *rp;
3523 	int error;
3524 	mntinfo_t *mi;
3525 
3526 	if (vp->v_flag & VNOMAP)
3527 		return (ENOSYS);
3528 
3529 	ASSERT(off <= MAXOFF32_T);
3530 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3531 		return (EIO);
3532 	if (protp != NULL)
3533 		*protp = PROT_ALL;
3534 
3535 	/*
3536 	 * Now valididate that the caches are up to date.
3537 	 */
3538 	error = nfs_validate_caches(vp, cr);
3539 	if (error)
3540 		return (error);
3541 
3542 	rp = VTOR(vp);
3543 	mi = VTOMI(vp);
3544 retry:
3545 	mutex_enter(&rp->r_statelock);
3546 
3547 	/*
3548 	 * Don't create dirty pages faster than they
3549 	 * can be cleaned so that the system doesn't
3550 	 * get imbalanced.  If the async queue is
3551 	 * maxed out, then wait for it to drain before
3552 	 * creating more dirty pages.  Also, wait for
3553 	 * any threads doing pagewalks in the vop_getattr
3554 	 * entry points so that they don't block for
3555 	 * long periods.
3556 	 */
3557 	if (rw == S_CREATE) {
3558 		while ((mi->mi_max_threads != 0 &&
3559 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3560 		    rp->r_gcount > 0)
3561 			cv_wait(&rp->r_cv, &rp->r_statelock);
3562 	}
3563 
3564 	/*
3565 	 * If we are getting called as a side effect of an nfs_write()
3566 	 * operation the local file size might not be extended yet.
3567 	 * In this case we want to be able to return pages of zeroes.
3568 	 */
3569 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3570 		mutex_exit(&rp->r_statelock);
3571 		return (EFAULT);		/* beyond EOF */
3572 	}
3573 
3574 	mutex_exit(&rp->r_statelock);
3575 
3576 	if (len <= PAGESIZE) {
3577 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3578 		    seg, addr, rw, cr);
3579 	} else {
3580 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3581 		    pl, plsz, seg, addr, rw, cr);
3582 	}
3583 
3584 	switch (error) {
3585 	case NFS_EOF:
3586 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3587 		goto retry;
3588 	case ESTALE:
3589 		PURGE_STALE_FH(error, vp, cr);
3590 	}
3591 
3592 	return (error);
3593 }
3594 
3595 /*
3596  * Called from pvn_getpages or nfs_getpage to get a particular page.
3597  */
3598 /* ARGSUSED */
3599 static int
3600 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3601 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3602 	enum seg_rw rw, cred_t *cr)
3603 {
3604 	rnode_t *rp;
3605 	uint_t bsize;
3606 	struct buf *bp;
3607 	page_t *pp;
3608 	u_offset_t lbn;
3609 	u_offset_t io_off;
3610 	u_offset_t blkoff;
3611 	u_offset_t rablkoff;
3612 	size_t io_len;
3613 	uint_t blksize;
3614 	int error;
3615 	int readahead;
3616 	int readahead_issued = 0;
3617 	int ra_window; /* readahead window */
3618 	page_t *pagefound;
3619 
3620 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3621 		return (EIO);
3622 	rp = VTOR(vp);
3623 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3624 
3625 reread:
3626 	bp = NULL;
3627 	pp = NULL;
3628 	pagefound = NULL;
3629 
3630 	if (pl != NULL)
3631 		pl[0] = NULL;
3632 
3633 	error = 0;
3634 	lbn = off / bsize;
3635 	blkoff = lbn * bsize;
3636 
3637 	/*
3638 	 * Queueing up the readahead before doing the synchronous read
3639 	 * results in a significant increase in read throughput because
3640 	 * of the increased parallelism between the async threads and
3641 	 * the process context.
3642 	 */
3643 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3644 	    rw != S_CREATE &&
3645 	    !(vp->v_flag & VNOCACHE)) {
3646 		mutex_enter(&rp->r_statelock);
3647 
3648 		/*
3649 		 * Calculate the number of readaheads to do.
3650 		 * a) No readaheads at offset = 0.
3651 		 * b) Do maximum(nfs_nra) readaheads when the readahead
3652 		 *    window is closed.
3653 		 * c) Do readaheads between 1 to (nfs_nra - 1) depending
3654 		 *    upon how far the readahead window is open or close.
3655 		 * d) No readaheads if rp->r_nextr is not within the scope
3656 		 *    of the readahead window (random i/o).
3657 		 */
3658 
3659 		if (off == 0)
3660 			readahead = 0;
3661 		else if (blkoff == rp->r_nextr)
3662 			readahead = nfs_nra;
3663 		else if (rp->r_nextr > blkoff &&
3664 				((ra_window = (rp->r_nextr - blkoff) / bsize)
3665 					<= (nfs_nra - 1)))
3666 			readahead = nfs_nra - ra_window;
3667 		else
3668 			readahead = 0;
3669 
3670 		rablkoff = rp->r_nextr;
3671 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3672 			mutex_exit(&rp->r_statelock);
3673 			if (nfs_async_readahead(vp, rablkoff + bsize,
3674 			    addr + (rablkoff + bsize - off), seg, cr,
3675 			    nfs_readahead) < 0) {
3676 				mutex_enter(&rp->r_statelock);
3677 				break;
3678 			}
3679 			readahead--;
3680 			rablkoff += bsize;
3681 			/*
3682 			 * Indicate that we did a readahead so
3683 			 * readahead offset is not updated
3684 			 * by the synchronous read below.
3685 			 */
3686 			readahead_issued = 1;
3687 			mutex_enter(&rp->r_statelock);
3688 			/*
3689 			 * set readahead offset to
3690 			 * offset of last async readahead
3691 			 * request.
3692 			 */
3693 			rp->r_nextr = rablkoff;
3694 		}
3695 		mutex_exit(&rp->r_statelock);
3696 	}
3697 
3698 again:
3699 	if ((pagefound = page_exists(vp, off)) == NULL) {
3700 		if (pl == NULL) {
3701 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3702 			    nfs_readahead);
3703 		} else if (rw == S_CREATE) {
3704 			/*
3705 			 * Block for this page is not allocated, or the offset
3706 			 * is beyond the current allocation size, or we're
3707 			 * allocating a swap slot and the page was not found,
3708 			 * so allocate it and return a zero page.
3709 			 */
3710 			if ((pp = page_create_va(vp, off,
3711 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3712 				cmn_err(CE_PANIC, "nfs_getapage: page_create");
3713 			io_len = PAGESIZE;
3714 			mutex_enter(&rp->r_statelock);
3715 			rp->r_nextr = off + PAGESIZE;
3716 			mutex_exit(&rp->r_statelock);
3717 		} else {
3718 			/*
3719 			 * Need to go to server to get a BLOCK, exception to
3720 			 * that being while reading at offset = 0 or doing
3721 			 * random i/o, in that case read only a PAGE.
3722 			 */
3723 			mutex_enter(&rp->r_statelock);
3724 			if (blkoff < rp->r_size &&
3725 			    blkoff + bsize >= rp->r_size) {
3726 				/*
3727 				 * If only a block or less is left in
3728 				 * the file, read all that is remaining.
3729 				 */
3730 				if (rp->r_size <= off) {
3731 					/*
3732 					 * Trying to access beyond EOF,
3733 					 * set up to get at least one page.
3734 					 */
3735 					blksize = off + PAGESIZE - blkoff;
3736 				} else
3737 					blksize = rp->r_size - blkoff;
3738 			} else if ((off == 0) ||
3739 				(off != rp->r_nextr && !readahead_issued)) {
3740 				blksize = PAGESIZE;
3741 				blkoff = off; /* block = page here */
3742 			} else
3743 				blksize = bsize;
3744 			mutex_exit(&rp->r_statelock);
3745 
3746 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3747 			    &io_len, blkoff, blksize, 0);
3748 
3749 			/*
3750 			 * Some other thread has entered the page,
3751 			 * so just use it.
3752 			 */
3753 			if (pp == NULL)
3754 				goto again;
3755 
3756 			/*
3757 			 * Now round the request size up to page boundaries.
3758 			 * This ensures that the entire page will be
3759 			 * initialized to zeroes if EOF is encountered.
3760 			 */
3761 			io_len = ptob(btopr(io_len));
3762 
3763 			bp = pageio_setup(pp, io_len, vp, B_READ);
3764 			ASSERT(bp != NULL);
3765 
3766 			/*
3767 			 * pageio_setup should have set b_addr to 0.  This
3768 			 * is correct since we want to do I/O on a page
3769 			 * boundary.  bp_mapin will use this addr to calculate
3770 			 * an offset, and then set b_addr to the kernel virtual
3771 			 * address it allocated for us.
3772 			 */
3773 			ASSERT(bp->b_un.b_addr == 0);
3774 
3775 			bp->b_edev = 0;
3776 			bp->b_dev = 0;
3777 			bp->b_lblkno = lbtodb(io_off);
3778 			bp->b_file = vp;
3779 			bp->b_offset = (offset_t)off;
3780 			bp_mapin(bp);
3781 
3782 			/*
3783 			 * If doing a write beyond what we believe is EOF,
3784 			 * don't bother trying to read the pages from the
3785 			 * server, we'll just zero the pages here.  We
3786 			 * don't check that the rw flag is S_WRITE here
3787 			 * because some implementations may attempt a
3788 			 * read access to the buffer before copying data.
3789 			 */
3790 			mutex_enter(&rp->r_statelock);
3791 			if (io_off >= rp->r_size && seg == segkmap) {
3792 				mutex_exit(&rp->r_statelock);
3793 				bzero(bp->b_un.b_addr, io_len);
3794 			} else {
3795 				mutex_exit(&rp->r_statelock);
3796 				error = nfs_bio(bp, cr);
3797 			}
3798 
3799 			/*
3800 			 * Unmap the buffer before freeing it.
3801 			 */
3802 			bp_mapout(bp);
3803 			pageio_done(bp);
3804 
3805 			if (error == NFS_EOF) {
3806 				/*
3807 				 * If doing a write system call just return
3808 				 * zeroed pages, else user tried to get pages
3809 				 * beyond EOF, return error.  We don't check
3810 				 * that the rw flag is S_WRITE here because
3811 				 * some implementations may attempt a read
3812 				 * access to the buffer before copying data.
3813 				 */
3814 				if (seg == segkmap)
3815 					error = 0;
3816 				else
3817 					error = EFAULT;
3818 			}
3819 
3820 			if (!readahead_issued && !error) {
3821 			    mutex_enter(&rp->r_statelock);
3822 			    rp->r_nextr = io_off + io_len;
3823 			    mutex_exit(&rp->r_statelock);
3824 			}
3825 		}
3826 	}
3827 
3828 out:
3829 	if (pl == NULL)
3830 		return (error);
3831 
3832 	if (error) {
3833 		if (pp != NULL)
3834 			pvn_read_done(pp, B_ERROR);
3835 		return (error);
3836 	}
3837 
3838 	if (pagefound) {
3839 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3840 
3841 		/*
3842 		 * Page exists in the cache, acquire the appropriate lock.
3843 		 * If this fails, start all over again.
3844 		 */
3845 		if ((pp = page_lookup(vp, off, se)) == NULL) {
3846 #ifdef DEBUG
3847 			nfs_lostpage++;
3848 #endif
3849 			goto reread;
3850 		}
3851 		pl[0] = pp;
3852 		pl[1] = NULL;
3853 		return (0);
3854 	}
3855 
3856 	if (pp != NULL)
3857 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3858 
3859 	return (error);
3860 }
3861 
3862 static void
3863 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3864 	cred_t *cr)
3865 {
3866 	int error;
3867 	page_t *pp;
3868 	u_offset_t io_off;
3869 	size_t io_len;
3870 	struct buf *bp;
3871 	uint_t bsize, blksize;
3872 	rnode_t *rp = VTOR(vp);
3873 
3874 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3875 
3876 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3877 
3878 	mutex_enter(&rp->r_statelock);
3879 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3880 		/*
3881 		 * If less than a block left in file read less
3882 		 * than a block.
3883 		 */
3884 		blksize = rp->r_size - blkoff;
3885 	} else
3886 		blksize = bsize;
3887 	mutex_exit(&rp->r_statelock);
3888 
3889 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3890 	    &io_off, &io_len, blkoff, blksize, 1);
3891 	/*
3892 	 * The isra flag passed to the kluster function is 1, we may have
3893 	 * gotten a return value of NULL for a variety of reasons (# of free
3894 	 * pages < minfree, someone entered the page on the vnode etc). In all
3895 	 * cases, we want to punt on the readahead.
3896 	 */
3897 	if (pp == NULL)
3898 		return;
3899 
3900 	/*
3901 	 * Now round the request size up to page boundaries.
3902 	 * This ensures that the entire page will be
3903 	 * initialized to zeroes if EOF is encountered.
3904 	 */
3905 	io_len = ptob(btopr(io_len));
3906 
3907 	bp = pageio_setup(pp, io_len, vp, B_READ);
3908 	ASSERT(bp != NULL);
3909 
3910 	/*
3911 	 * pageio_setup should have set b_addr to 0.  This is correct since
3912 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
3913 	 * to calculate an offset, and then set b_addr to the kernel virtual
3914 	 * address it allocated for us.
3915 	 */
3916 	ASSERT(bp->b_un.b_addr == 0);
3917 
3918 	bp->b_edev = 0;
3919 	bp->b_dev = 0;
3920 	bp->b_lblkno = lbtodb(io_off);
3921 	bp->b_file = vp;
3922 	bp->b_offset = (offset_t)blkoff;
3923 	bp_mapin(bp);
3924 
3925 	/*
3926 	 * If doing a write beyond what we believe is EOF, don't bother trying
3927 	 * to read the pages from the server, we'll just zero the pages here.
3928 	 * We don't check that the rw flag is S_WRITE here because some
3929 	 * implementations may attempt a read access to the buffer before
3930 	 * copying data.
3931 	 */
3932 	mutex_enter(&rp->r_statelock);
3933 	if (io_off >= rp->r_size && seg == segkmap) {
3934 		mutex_exit(&rp->r_statelock);
3935 		bzero(bp->b_un.b_addr, io_len);
3936 		error = 0;
3937 	} else {
3938 		mutex_exit(&rp->r_statelock);
3939 		error = nfs_bio(bp, cr);
3940 		if (error == NFS_EOF)
3941 			error = 0;
3942 	}
3943 
3944 	/*
3945 	 * Unmap the buffer before freeing it.
3946 	 */
3947 	bp_mapout(bp);
3948 	pageio_done(bp);
3949 
3950 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
3951 
3952 	/*
3953 	 * In case of error set readahead offset
3954 	 * to the lowest offset.
3955 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
3956 	 */
3957 	if (error && rp->r_nextr > io_off) {
3958 		mutex_enter(&rp->r_statelock);
3959 		if (rp->r_nextr > io_off)
3960 			rp->r_nextr = io_off;
3961 		mutex_exit(&rp->r_statelock);
3962 	}
3963 }
3964 
3965 /*
3966  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
3967  * If len == 0, do from off to EOF.
3968  *
3969  * The normal cases should be len == 0 && off == 0 (entire vp list),
3970  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
3971  * (from pageout).
3972  */
3973 static int
3974 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
3975 {
3976 	int error;
3977 	rnode_t *rp;
3978 
3979 	ASSERT(cr != NULL);
3980 
3981 	/*
3982 	 * XXX - Why should this check be made here?
3983 	 */
3984 	if (vp->v_flag & VNOMAP)
3985 		return (ENOSYS);
3986 
3987 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
3988 		return (0);
3989 
3990 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
3991 		return (EIO);
3992 	ASSERT(off <= MAXOFF32_T);
3993 
3994 	rp = VTOR(vp);
3995 	mutex_enter(&rp->r_statelock);
3996 	rp->r_count++;
3997 	mutex_exit(&rp->r_statelock);
3998 	error = nfs_putpages(vp, off, len, flags, cr);
3999 	mutex_enter(&rp->r_statelock);
4000 	rp->r_count--;
4001 	cv_broadcast(&rp->r_cv);
4002 	mutex_exit(&rp->r_statelock);
4003 
4004 	return (error);
4005 }
4006 
4007 /*
4008  * Write out a single page, possibly klustering adjacent dirty pages.
4009  */
4010 int
4011 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4012 	int flags, cred_t *cr)
4013 {
4014 	u_offset_t io_off;
4015 	u_offset_t lbn_off;
4016 	u_offset_t lbn;
4017 	size_t io_len;
4018 	uint_t bsize;
4019 	int error;
4020 	rnode_t *rp;
4021 
4022 	ASSERT(!vn_is_readonly(vp));
4023 	ASSERT(pp != NULL);
4024 	ASSERT(cr != NULL);
4025 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4026 
4027 	rp = VTOR(vp);
4028 	ASSERT(rp->r_count > 0);
4029 
4030 	ASSERT(pp->p_offset <= MAXOFF32_T);
4031 
4032 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4033 	lbn = pp->p_offset / bsize;
4034 	lbn_off = lbn * bsize;
4035 
4036 	/*
4037 	 * Find a kluster that fits in one block, or in
4038 	 * one page if pages are bigger than blocks.  If
4039 	 * there is less file space allocated than a whole
4040 	 * page, we'll shorten the i/o request below.
4041 	 */
4042 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4043 	    roundup(bsize, PAGESIZE), flags);
4044 
4045 	/*
4046 	 * pvn_write_kluster shouldn't have returned a page with offset
4047 	 * behind the original page we were given.  Verify that.
4048 	 */
4049 	ASSERT((pp->p_offset / bsize) >= lbn);
4050 
4051 	/*
4052 	 * Now pp will have the list of kept dirty pages marked for
4053 	 * write back.  It will also handle invalidation and freeing
4054 	 * of pages that are not dirty.  Check for page length rounding
4055 	 * problems.
4056 	 */
4057 	if (io_off + io_len > lbn_off + bsize) {
4058 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4059 		io_len = lbn_off + bsize - io_off;
4060 	}
4061 	/*
4062 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4063 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
4064 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
4065 	 * progress and the r_size has not been made consistent with the
4066 	 * new size of the file. When the uiomove() completes the r_size is
4067 	 * updated and the RMODINPROGRESS flag is cleared.
4068 	 *
4069 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4070 	 * consistent value of r_size. Without this handshaking, it is
4071 	 * possible that nfs(3)_bio() picks  up the old value of r_size
4072 	 * before the uiomove() in writerp() completes. This will result
4073 	 * in the write through nfs(3)_bio() being dropped.
4074 	 *
4075 	 * More precisely, there is a window between the time the uiomove()
4076 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
4077 	 * operation intervenes in this window, the page will be picked up,
4078 	 * because it is dirty (it will be unlocked, unless it was
4079 	 * pagecreate'd). When the page is picked up as dirty, the dirty
4080 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
4081 	 * checked. This will still be the old size. Therefore the page will
4082 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
4083 	 * the page will be found to be clean and the write will be dropped.
4084 	 */
4085 	if (rp->r_flags & RMODINPROGRESS) {
4086 		mutex_enter(&rp->r_statelock);
4087 		if ((rp->r_flags & RMODINPROGRESS) &&
4088 		    rp->r_modaddr + MAXBSIZE > io_off &&
4089 		    rp->r_modaddr < io_off + io_len) {
4090 			page_t *plist;
4091 			/*
4092 			 * A write is in progress for this region of the file.
4093 			 * If we did not detect RMODINPROGRESS here then this
4094 			 * path through nfs_putapage() would eventually go to
4095 			 * nfs(3)_bio() and may not write out all of the data
4096 			 * in the pages. We end up losing data. So we decide
4097 			 * to set the modified bit on each page in the page
4098 			 * list and mark the rnode with RDIRTY. This write
4099 			 * will be restarted at some later time.
4100 			 */
4101 			plist = pp;
4102 			while (plist != NULL) {
4103 				pp = plist;
4104 				page_sub(&plist, pp);
4105 				hat_setmod(pp);
4106 				page_io_unlock(pp);
4107 				page_unlock(pp);
4108 			}
4109 			rp->r_flags |= RDIRTY;
4110 			mutex_exit(&rp->r_statelock);
4111 			if (offp)
4112 				*offp = io_off;
4113 			if (lenp)
4114 				*lenp = io_len;
4115 			return (0);
4116 		}
4117 		mutex_exit(&rp->r_statelock);
4118 	}
4119 
4120 	if (flags & B_ASYNC) {
4121 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4122 		    nfs_sync_putapage);
4123 	} else
4124 		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4125 
4126 	if (offp)
4127 		*offp = io_off;
4128 	if (lenp)
4129 		*lenp = io_len;
4130 	return (error);
4131 }
4132 
4133 static int
4134 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4135 	int flags, cred_t *cr)
4136 {
4137 	int error;
4138 	rnode_t *rp;
4139 
4140 	flags |= B_WRITE;
4141 
4142 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4143 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4144 
4145 	rp = VTOR(vp);
4146 
4147 	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4148 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4149 		if (!(rp->r_flags & ROUTOFSPACE)) {
4150 			mutex_enter(&rp->r_statelock);
4151 			rp->r_flags |= ROUTOFSPACE;
4152 			mutex_exit(&rp->r_statelock);
4153 		}
4154 		flags |= B_ERROR;
4155 		pvn_write_done(pp, flags);
4156 		/*
4157 		 * If this was not an async thread, then try again to
4158 		 * write out the pages, but this time, also destroy
4159 		 * them whether or not the write is successful.  This
4160 		 * will prevent memory from filling up with these
4161 		 * pages and destroying them is the only alternative
4162 		 * if they can't be written out.
4163 		 *
4164 		 * Don't do this if this is an async thread because
4165 		 * when the pages are unlocked in pvn_write_done,
4166 		 * some other thread could have come along, locked
4167 		 * them, and queued for an async thread.  It would be
4168 		 * possible for all of the async threads to be tied
4169 		 * up waiting to lock the pages again and they would
4170 		 * all already be locked and waiting for an async
4171 		 * thread to handle them.  Deadlock.
4172 		 */
4173 		if (!(flags & B_ASYNC)) {
4174 			error = nfs_putpage(vp, io_off, io_len,
4175 			    B_INVAL | B_FORCE, cr);
4176 		}
4177 	} else {
4178 		if (error)
4179 			flags |= B_ERROR;
4180 		else if (rp->r_flags & ROUTOFSPACE) {
4181 			mutex_enter(&rp->r_statelock);
4182 			rp->r_flags &= ~ROUTOFSPACE;
4183 			mutex_exit(&rp->r_statelock);
4184 		}
4185 		pvn_write_done(pp, flags);
4186 	}
4187 
4188 	return (error);
4189 }
4190 
4191 static int
4192 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4193 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
4194 {
4195 	struct segvn_crargs vn_a;
4196 	int error;
4197 	rnode_t *rp;
4198 	struct vattr va;
4199 
4200 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4201 		return (EIO);
4202 
4203 	if (vp->v_flag & VNOMAP)
4204 		return (ENOSYS);
4205 
4206 	if (off > MAXOFF32_T)
4207 		return (EFBIG);
4208 
4209 	if (off < 0 || off + len < 0)
4210 		return (ENXIO);
4211 
4212 	if (vp->v_type != VREG)
4213 		return (ENODEV);
4214 
4215 	/*
4216 	 * If there is cached data and if close-to-open consistency
4217 	 * checking is not turned off and if the file system is not
4218 	 * mounted readonly, then force an over the wire getattr.
4219 	 * Otherwise, just invoke nfsgetattr to get a copy of the
4220 	 * attributes.  The attribute cache will be used unless it
4221 	 * is timed out and if it is, then an over the wire getattr
4222 	 * will be issued.
4223 	 */
4224 	va.va_mask = AT_ALL;
4225 	if (vn_has_cached_data(vp) &&
4226 	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4227 		error = nfs_getattr_otw(vp, &va, cr);
4228 	else
4229 		error = nfsgetattr(vp, &va, cr);
4230 	if (error)
4231 		return (error);
4232 
4233 	/*
4234 	 * Check to see if the vnode is currently marked as not cachable.
4235 	 * This means portions of the file are locked (through VOP_FRLOCK).
4236 	 * In this case the map request must be refused.  We use
4237 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
4238 	 */
4239 	rp = VTOR(vp);
4240 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
4241 		return (EINTR);
4242 
4243 	if (vp->v_flag & VNOCACHE) {
4244 		error = EAGAIN;
4245 		goto done;
4246 	}
4247 
4248 	/*
4249 	 * Don't allow concurrent locks and mapping if mandatory locking is
4250 	 * enabled.
4251 	 */
4252 	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4253 	    MANDLOCK(vp, va.va_mode)) {
4254 		error = EAGAIN;
4255 		goto done;
4256 	}
4257 
4258 	as_rangelock(as);
4259 	if (!(flags & MAP_FIXED)) {
4260 		map_addr(addrp, len, off, 1, flags);
4261 		if (*addrp == NULL) {
4262 			as_rangeunlock(as);
4263 			error = ENOMEM;
4264 			goto done;
4265 		}
4266 	} else {
4267 		/*
4268 		 * User specified address - blow away any previous mappings
4269 		 */
4270 		(void) as_unmap(as, *addrp, len);
4271 	}
4272 
4273 	vn_a.vp = vp;
4274 	vn_a.offset = off;
4275 	vn_a.type = (flags & MAP_TYPE);
4276 	vn_a.prot = (uchar_t)prot;
4277 	vn_a.maxprot = (uchar_t)maxprot;
4278 	vn_a.flags = (flags & ~MAP_TYPE);
4279 	vn_a.cred = cr;
4280 	vn_a.amp = NULL;
4281 	vn_a.szc = 0;
4282 	vn_a.lgrp_mem_policy_flags = 0;
4283 
4284 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4285 	as_rangeunlock(as);
4286 
4287 done:
4288 	nfs_rw_exit(&rp->r_lkserlock);
4289 	return (error);
4290 }
4291 
4292 /* ARGSUSED */
4293 static int
4294 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4295 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
4296 {
4297 	rnode_t *rp;
4298 
4299 	if (vp->v_flag & VNOMAP)
4300 		return (ENOSYS);
4301 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4302 		return (EIO);
4303 
4304 	/*
4305 	 * Need to hold rwlock while incrementing the mapcnt so that
4306 	 * mmap'ing can be serialized with writes so that the caching
4307 	 * can be handled correctly.
4308 	 */
4309 	rp = VTOR(vp);
4310 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4311 		return (EINTR);
4312 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4313 	nfs_rw_exit(&rp->r_rwlock);
4314 
4315 	return (0);
4316 }
4317 
4318 static int
4319 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4320 	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr)
4321 {
4322 	netobj lm_fh;
4323 	int rc;
4324 	u_offset_t start, end;
4325 	rnode_t *rp;
4326 	int error = 0, intr = INTR(vp);
4327 
4328 	/* check for valid cmd parameter */
4329 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4330 		return (EINVAL);
4331 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4332 		return (EIO);
4333 
4334 	/* Verify l_type. */
4335 	switch (bfp->l_type) {
4336 	case F_RDLCK:
4337 		if (cmd != F_GETLK && !(flag & FREAD))
4338 			return (EBADF);
4339 		break;
4340 	case F_WRLCK:
4341 		if (cmd != F_GETLK && !(flag & FWRITE))
4342 			return (EBADF);
4343 		break;
4344 	case F_UNLCK:
4345 		intr = 0;
4346 		break;
4347 
4348 	default:
4349 		return (EINVAL);
4350 	}
4351 
4352 	/* check the validity of the lock range */
4353 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4354 		return (rc);
4355 	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4356 		return (rc);
4357 
4358 	/*
4359 	 * If the filesystem is mounted using local locking, pass the
4360 	 * request off to the local locking code.
4361 	 */
4362 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4363 		if (offset > MAXOFF32_T)
4364 			return (EFBIG);
4365 		if (cmd == F_SETLK || cmd == F_SETLKW) {
4366 			/*
4367 			 * For complete safety, we should be holding
4368 			 * r_lkserlock.  However, we can't call
4369 			 * lm_safelock and then fs_frlock while
4370 			 * holding r_lkserlock, so just invoke
4371 			 * lm_safelock and expect that this will
4372 			 * catch enough of the cases.
4373 			 */
4374 			if (!lm_safelock(vp, bfp, cr))
4375 				return (EAGAIN);
4376 		}
4377 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
4378 	}
4379 
4380 	rp = VTOR(vp);
4381 
4382 	/*
4383 	 * Check whether the given lock request can proceed, given the
4384 	 * current file mappings.
4385 	 */
4386 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4387 		return (EINTR);
4388 	if (cmd == F_SETLK || cmd == F_SETLKW) {
4389 		if (!lm_safelock(vp, bfp, cr)) {
4390 			rc = EAGAIN;
4391 			goto done;
4392 		}
4393 	}
4394 
4395 	/*
4396 	 * Flush the cache after waiting for async I/O to finish.  For new
4397 	 * locks, this is so that the process gets the latest bits from the
4398 	 * server.  For unlocks, this is so that other clients see the
4399 	 * latest bits once the file has been unlocked.  If currently dirty
4400 	 * pages can't be flushed, then don't allow a lock to be set.  But
4401 	 * allow unlocks to succeed, to avoid having orphan locks on the
4402 	 * server.
4403 	 */
4404 	if (cmd != F_GETLK) {
4405 		mutex_enter(&rp->r_statelock);
4406 		while (rp->r_count > 0) {
4407 		    if (intr) {
4408 			klwp_t *lwp = ttolwp(curthread);
4409 
4410 			if (lwp != NULL)
4411 				lwp->lwp_nostop++;
4412 			if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) {
4413 				if (lwp != NULL)
4414 					lwp->lwp_nostop--;
4415 				rc = EINTR;
4416 				break;
4417 			}
4418 			if (lwp != NULL)
4419 				lwp->lwp_nostop--;
4420 		    } else
4421 			cv_wait(&rp->r_cv, &rp->r_statelock);
4422 		}
4423 		mutex_exit(&rp->r_statelock);
4424 		if (rc != 0)
4425 			goto done;
4426 		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr);
4427 		if (error) {
4428 			if (error == ENOSPC || error == EDQUOT) {
4429 				mutex_enter(&rp->r_statelock);
4430 				if (!rp->r_error)
4431 					rp->r_error = error;
4432 				mutex_exit(&rp->r_statelock);
4433 			}
4434 			if (bfp->l_type != F_UNLCK) {
4435 				rc = ENOLCK;
4436 				goto done;
4437 			}
4438 		}
4439 	}
4440 
4441 	lm_fh.n_len = sizeof (fhandle_t);
4442 	lm_fh.n_bytes = (char *)VTOFH(vp);
4443 
4444 	/*
4445 	 * Call the lock manager to do the real work of contacting
4446 	 * the server and obtaining the lock.
4447 	 */
4448 	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4449 
4450 	if (rc == 0)
4451 		nfs_lockcompletion(vp, cmd);
4452 
4453 done:
4454 	nfs_rw_exit(&rp->r_lkserlock);
4455 	return (rc);
4456 }
4457 
4458 /*
4459  * Free storage space associated with the specified vnode.  The portion
4460  * to be freed is specified by bfp->l_start and bfp->l_len (already
4461  * normalized to a "whence" of 0).
4462  *
4463  * This is an experimental facility whose continued existence is not
4464  * guaranteed.  Currently, we only support the special case
4465  * of l_len == 0, meaning free to end of file.
4466  */
4467 /* ARGSUSED */
4468 static int
4469 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4470 	offset_t offset, cred_t *cr, caller_context_t *ct)
4471 {
4472 	int error;
4473 
4474 	ASSERT(vp->v_type == VREG);
4475 	if (cmd != F_FREESP)
4476 		return (EINVAL);
4477 
4478 	if (offset > MAXOFF32_T)
4479 		return (EFBIG);
4480 
4481 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4482 	    (bfp->l_len > MAXOFF32_T))
4483 		return (EFBIG);
4484 
4485 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4486 		return (EIO);
4487 
4488 	error = convoff(vp, bfp, 0, offset);
4489 	if (!error) {
4490 		ASSERT(bfp->l_start >= 0);
4491 		if (bfp->l_len == 0) {
4492 			struct vattr va;
4493 
4494 			/*
4495 			 * ftruncate should not change the ctime and
4496 			 * mtime if we truncate the file to its
4497 			 * previous size.
4498 			 */
4499 			va.va_mask = AT_SIZE;
4500 			error = nfsgetattr(vp, &va, cr);
4501 			if (error || va.va_size == bfp->l_start)
4502 				return (error);
4503 			va.va_mask = AT_SIZE;
4504 			va.va_size = bfp->l_start;
4505 			error = nfssetattr(vp, &va, 0, cr);
4506 		} else
4507 			error = EINVAL;
4508 	}
4509 
4510 	return (error);
4511 }
4512 
4513 /* ARGSUSED */
4514 static int
4515 nfs_realvp(vnode_t *vp, vnode_t **vpp)
4516 {
4517 
4518 	return (EINVAL);
4519 }
4520 
4521 /*
4522  * Setup and add an address space callback to do the work of the delmap call.
4523  * The callback will (and must be) deleted in the actual callback function.
4524  *
4525  * This is done in order to take care of the problem that we have with holding
4526  * the address space's a_lock for a long period of time (e.g. if the NFS server
4527  * is down).  Callbacks will be executed in the address space code while the
4528  * a_lock is not held.	Holding the address space's a_lock causes things such
4529  * as ps and fork to hang because they are trying to acquire this lock as well.
4530  */
4531 /* ARGSUSED */
4532 static int
4533 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4534 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
4535 {
4536 	int			caller_found;
4537 	int			error;
4538 	rnode_t			*rp;
4539 	nfs_delmap_args_t	*dmapp;
4540 	nfs_delmapcall_t	*delmap_call;
4541 
4542 	if (vp->v_flag & VNOMAP)
4543 		return (ENOSYS);
4544 	/*
4545 	 * A process may not change zones if it has NFS pages mmap'ed
4546 	 * in, so we can't legitimately get here from the wrong zone.
4547 	 */
4548 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4549 
4550 	rp = VTOR(vp);
4551 
4552 	/*
4553 	 * The way that the address space of this process deletes its mapping
4554 	 * of this file is via the following call chains:
4555 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4556 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4557 	 *
4558 	 * With the use of address space callbacks we are allowed to drop the
4559 	 * address space lock, a_lock, while executing the NFS operations that
4560 	 * need to go over the wire.  Returning EAGAIN to the caller of this
4561 	 * function is what drives the execution of the callback that we add
4562 	 * below.  The callback will be executed by the address space code
4563 	 * after dropping the a_lock.  When the callback is finished, since
4564 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4565 	 * is called again on the same segment to finish the rest of the work
4566 	 * that needs to happen during unmapping.
4567 	 *
4568 	 * This action of calling back into the segment driver causes
4569 	 * nfs_delmap() to get called again, but since the callback was
4570 	 * already executed at this point, it already did the work and there
4571 	 * is nothing left for us to do.
4572 	 *
4573 	 * To Summarize:
4574 	 * - The first time nfs_delmap is called by the current thread is when
4575 	 * we add the caller associated with this delmap to the delmap caller
4576 	 * list, add the callback, and return EAGAIN.
4577 	 * - The second time in this call chain when nfs_delmap is called we
4578 	 * will find this caller in the delmap caller list and realize there
4579 	 * is no more work to do thus removing this caller from the list and
4580 	 * returning the error that was set in the callback execution.
4581 	 */
4582 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4583 	if (caller_found) {
4584 		/*
4585 		 * 'error' is from the actual delmap operations.  To avoid
4586 		 * hangs, we need to handle the return of EAGAIN differently
4587 		 * since this is what drives the callback execution.
4588 		 * In this case, we don't want to return EAGAIN and do the
4589 		 * callback execution because there are none to execute.
4590 		 */
4591 		if (error == EAGAIN)
4592 			return (0);
4593 		else
4594 			return (error);
4595 	}
4596 
4597 	/* current caller was not in the list */
4598 	delmap_call = nfs_init_delmapcall();
4599 
4600 	mutex_enter(&rp->r_statelock);
4601 	list_insert_tail(&rp->r_indelmap, delmap_call);
4602 	mutex_exit(&rp->r_statelock);
4603 
4604 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4605 
4606 	dmapp->vp = vp;
4607 	dmapp->off = off;
4608 	dmapp->addr = addr;
4609 	dmapp->len = len;
4610 	dmapp->prot = prot;
4611 	dmapp->maxprot = maxprot;
4612 	dmapp->flags = flags;
4613 	dmapp->cr = cr;
4614 	dmapp->caller = delmap_call;
4615 
4616 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4617 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4618 
4619 	return (error ? error : EAGAIN);
4620 }
4621 
4622 /*
4623  * Remove some pages from an mmap'd vnode.  Just update the
4624  * count of pages.  If doing close-to-open, then flush all
4625  * of the pages associated with this file.  Otherwise, start
4626  * an asynchronous page flush to write out any dirty pages.
4627  * This will also associate a credential with the rnode which
4628  * can be used to write the pages.
4629  */
4630 /* ARGSUSED */
4631 static void
4632 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4633 {
4634 	int			error;
4635 	rnode_t			*rp;
4636 	mntinfo_t		*mi;
4637 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4638 
4639 	rp = VTOR(dmapp->vp);
4640 	mi = VTOMI(dmapp->vp);
4641 
4642 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4643 	ASSERT(rp->r_mapcnt >= 0);
4644 
4645 	/*
4646 	 * Initiate a page flush if there are pages, the file system
4647 	 * was not mounted readonly, the segment was mapped shared, and
4648 	 * the pages themselves were writeable.
4649 	 */
4650 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4651 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4652 		mutex_enter(&rp->r_statelock);
4653 		rp->r_flags |= RDIRTY;
4654 		mutex_exit(&rp->r_statelock);
4655 		/*
4656 		 * If this is a cross-zone access a sync putpage won't work, so
4657 		 * the best we can do is try an async putpage.  That seems
4658 		 * better than something more draconian such as discarding the
4659 		 * dirty pages.
4660 		 */
4661 		if ((mi->mi_flags & MI_NOCTO) ||
4662 		    nfs_zone() != mi->mi_zone)
4663 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4664 			    B_ASYNC, dmapp->cr);
4665 		else
4666 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4667 			    0, dmapp->cr);
4668 		if (!error) {
4669 			mutex_enter(&rp->r_statelock);
4670 			error = rp->r_error;
4671 			rp->r_error = 0;
4672 			mutex_exit(&rp->r_statelock);
4673 		}
4674 	} else
4675 		error = 0;
4676 
4677 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4678 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4679 		    B_INVAL, dmapp->cr);
4680 
4681 	dmapp->caller->error = error;
4682 	(void) as_delete_callback(as, arg);
4683 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4684 }
4685 
4686 /* ARGSUSED */
4687 static int
4688 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
4689 {
4690 	int error = 0;
4691 
4692 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4693 		return (EIO);
4694 	/*
4695 	 * This looks a little weird because it's written in a general
4696 	 * manner but we make little use of cases.  If cntl() ever gets
4697 	 * widely used, the outer switch will make more sense.
4698 	 */
4699 
4700 	switch (cmd) {
4701 
4702 	/*
4703 	 * Large file spec - need to base answer new query with
4704 	 * hardcoded constant based on the protocol.
4705 	 */
4706 	case _PC_FILESIZEBITS:
4707 		*valp = 32;
4708 		return (0);
4709 
4710 	case _PC_LINK_MAX:
4711 	case _PC_NAME_MAX:
4712 	case _PC_PATH_MAX:
4713 	case _PC_SYMLINK_MAX:
4714 	case _PC_CHOWN_RESTRICTED:
4715 	case _PC_NO_TRUNC: {
4716 		mntinfo_t *mi;
4717 		struct pathcnf *pc;
4718 
4719 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4720 			return (EINVAL);
4721 		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4722 		switch (cmd) {
4723 		case _PC_LINK_MAX:
4724 			*valp = pc->pc_link_max;
4725 			break;
4726 		case _PC_NAME_MAX:
4727 			*valp = pc->pc_name_max;
4728 			break;
4729 		case _PC_PATH_MAX:
4730 		case _PC_SYMLINK_MAX:
4731 			*valp = pc->pc_path_max;
4732 			break;
4733 		case _PC_CHOWN_RESTRICTED:
4734 			/*
4735 			 * if we got here, error is really a boolean which
4736 			 * indicates whether cmd is set or not.
4737 			 */
4738 			*valp = error ? 1 : 0;	/* see above */
4739 			error = 0;
4740 			break;
4741 		case _PC_NO_TRUNC:
4742 			/*
4743 			 * if we got here, error is really a boolean which
4744 			 * indicates whether cmd is set or not.
4745 			 */
4746 			*valp = error ? 1 : 0;	/* see above */
4747 			error = 0;
4748 			break;
4749 		}
4750 		return (error ? EINVAL : 0);
4751 	    }
4752 
4753 	case _PC_XATTR_EXISTS:
4754 		*valp = 0;
4755 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4756 			vnode_t *avp;
4757 			rnode_t *rp;
4758 			mntinfo_t *mi = VTOMI(vp);
4759 
4760 			if (!(mi->mi_flags & MI_EXTATTR))
4761 				return (0);
4762 
4763 			rp = VTOR(vp);
4764 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4765 			    INTR(vp)))
4766 				return (EINTR);
4767 
4768 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4769 			if (error || avp == NULL)
4770 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4771 
4772 			nfs_rw_exit(&rp->r_rwlock);
4773 
4774 			if (error == 0 && avp != NULL) {
4775 				VN_RELE(avp);
4776 				*valp = 1;
4777 			}
4778 		}
4779 		return (error ? EINVAL : 0);
4780 
4781 	case _PC_ACL_ENABLED:
4782 		*valp = _ACL_ACLENT_ENABLED;
4783 		return (0);
4784 
4785 	default:
4786 		return (EINVAL);
4787 	}
4788 }
4789 
4790 /*
4791  * Called by async thread to do synchronous pageio. Do the i/o, wait
4792  * for it to complete, and cleanup the page list when done.
4793  */
4794 static int
4795 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4796 	int flags, cred_t *cr)
4797 {
4798 	int error;
4799 
4800 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4801 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4802 	if (flags & B_READ)
4803 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4804 	else
4805 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4806 	return (error);
4807 }
4808 
4809 static int
4810 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4811 	int flags, cred_t *cr)
4812 {
4813 	int error;
4814 	rnode_t *rp;
4815 
4816 	if (pp == NULL)
4817 		return (EINVAL);
4818 
4819 	if (io_off > MAXOFF32_T)
4820 		return (EFBIG);
4821 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4822 		return (EIO);
4823 	rp = VTOR(vp);
4824 	mutex_enter(&rp->r_statelock);
4825 	rp->r_count++;
4826 	mutex_exit(&rp->r_statelock);
4827 
4828 	if (flags & B_ASYNC) {
4829 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4830 		    nfs_sync_pageio);
4831 	} else
4832 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4833 	mutex_enter(&rp->r_statelock);
4834 	rp->r_count--;
4835 	cv_broadcast(&rp->r_cv);
4836 	mutex_exit(&rp->r_statelock);
4837 	return (error);
4838 }
4839 
4840 static int
4841 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4842 {
4843 	int error;
4844 	mntinfo_t *mi;
4845 
4846 	mi = VTOMI(vp);
4847 
4848 	if (nfs_zone() != mi->mi_zone)
4849 		return (EIO);
4850 	if (mi->mi_flags & MI_ACL) {
4851 		error = acl_setacl2(vp, vsecattr, flag, cr);
4852 		if (mi->mi_flags & MI_ACL)
4853 			return (error);
4854 	}
4855 
4856 	return (ENOSYS);
4857 }
4858 
4859 static int
4860 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4861 {
4862 	int error;
4863 	mntinfo_t *mi;
4864 
4865 	mi = VTOMI(vp);
4866 
4867 	if (nfs_zone() != mi->mi_zone)
4868 		return (EIO);
4869 	if (mi->mi_flags & MI_ACL) {
4870 		error = acl_getacl2(vp, vsecattr, flag, cr);
4871 		if (mi->mi_flags & MI_ACL)
4872 			return (error);
4873 	}
4874 
4875 	return (fs_fab_acl(vp, vsecattr, flag, cr));
4876 }
4877 
4878 static int
4879 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
4880 {
4881 	int error;
4882 	struct shrlock nshr;
4883 	struct nfs_owner nfs_owner;
4884 	netobj lm_fh;
4885 
4886 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4887 		return (EIO);
4888 
4889 	/*
4890 	 * check for valid cmd parameter
4891 	 */
4892 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
4893 		return (EINVAL);
4894 
4895 	/*
4896 	 * Check access permissions
4897 	 */
4898 	if (cmd == F_SHARE &&
4899 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
4900 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
4901 		return (EBADF);
4902 
4903 	/*
4904 	 * If the filesystem is mounted using local locking, pass the
4905 	 * request off to the local share code.
4906 	 */
4907 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
4908 		return (fs_shrlock(vp, cmd, shr, flag, cr));
4909 
4910 	switch (cmd) {
4911 	case F_SHARE:
4912 	case F_UNSHARE:
4913 		lm_fh.n_len = sizeof (fhandle_t);
4914 		lm_fh.n_bytes = (char *)VTOFH(vp);
4915 
4916 		/*
4917 		 * If passed an owner that is too large to fit in an
4918 		 * nfs_owner it is likely a recursive call from the
4919 		 * lock manager client and pass it straight through.  If
4920 		 * it is not a nfs_owner then simply return an error.
4921 		 */
4922 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
4923 			if (((struct nfs_owner *)shr->s_owner)->magic !=
4924 			    NFS_OWNER_MAGIC)
4925 				return (EINVAL);
4926 
4927 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
4928 				error = set_errno(error);
4929 			}
4930 			return (error);
4931 		}
4932 		/*
4933 		 * Remote share reservations owner is a combination of
4934 		 * a magic number, hostname, and the local owner
4935 		 */
4936 		bzero(&nfs_owner, sizeof (nfs_owner));
4937 		nfs_owner.magic = NFS_OWNER_MAGIC;
4938 		(void) strncpy(nfs_owner.hname, uts_nodename(),
4939 		    sizeof (nfs_owner.hname));
4940 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
4941 		nshr.s_access = shr->s_access;
4942 		nshr.s_deny = shr->s_deny;
4943 		nshr.s_sysid = 0;
4944 		nshr.s_pid = ttoproc(curthread)->p_pid;
4945 		nshr.s_own_len = sizeof (nfs_owner);
4946 		nshr.s_owner = (caddr_t)&nfs_owner;
4947 
4948 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
4949 			error = set_errno(error);
4950 		}
4951 
4952 		break;
4953 
4954 	case F_HASREMOTELOCKS:
4955 		/*
4956 		 * NFS client can't store remote locks itself
4957 		 */
4958 		shr->s_access = 0;
4959 		error = 0;
4960 		break;
4961 
4962 	default:
4963 		error = EINVAL;
4964 		break;
4965 	}
4966 
4967 	return (error);
4968 }
4969