xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs_vnops.c (revision ebd1706e95186ddae1d4c0d63c47544cf33832ee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  *
26  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27  *	All rights reserved.
28  */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/file.h>
40 #include <sys/filio.h>
41 #include <sys/uio.h>
42 #include <sys/buf.h>
43 #include <sys/mman.h>
44 #include <sys/pathname.h>
45 #include <sys/dirent.h>
46 #include <sys/debug.h>
47 #include <sys/vmsystm.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/swap.h>
51 #include <sys/errno.h>
52 #include <sys/strsubr.h>
53 #include <sys/sysmacros.h>
54 #include <sys/kmem.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathconf.h>
57 #include <sys/utsname.h>
58 #include <sys/dnlc.h>
59 #include <sys/acl.h>
60 #include <sys/atomic.h>
61 #include <sys/policy.h>
62 #include <sys/sdt.h>
63 
64 #include <rpc/types.h>
65 #include <rpc/auth.h>
66 #include <rpc/clnt.h>
67 
68 #include <nfs/nfs.h>
69 #include <nfs/nfs_clnt.h>
70 #include <nfs/rnode.h>
71 #include <nfs/nfs_acl.h>
72 #include <nfs/lm.h>
73 
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_kpm.h>
81 #include <vm/seg_vn.h>
82 
83 #include <fs/fs_subr.h>
84 
85 #include <sys/ddi.h>
86 
/*
 * Forward declarations of file-local (static) helper routines.  These
 * are not vnode operation entry points themselves; the entry points are
 * declared separately further down.
 */
static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
static int	nfs_bio(struct buf *, cred_t *);
static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static void	nfs_delmap_callback(struct as *, void *, uint_t);
107 
/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 *
 * The replacement list is parenthesized so that the negative constant
 * expands as a single operand inside any larger expression.
 */
#define	NFS_EOF			(-98)
113 
/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 * more details on rnode locking.
 */
125 
/*
 * Prototypes for the vnode operation entry points.  Most of these are
 * installed in nfs_vnodeops_template[] below; nfs_accessx() is not in
 * the template and is instead passed as a callback from nfs_setattr().
 */
static int	nfs_open(vnode_t **, int, cred_t *);
static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *);
static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs_access(vnode_t *, int, int, cred_t *);
static int	nfs_accessx(void *, int, cred_t *);
static int	nfs_readlink(vnode_t *, struct uio *, cred_t *);
static int	nfs_fsync(vnode_t *, int, cred_t *);
static void	nfs_inactive(vnode_t *, cred_t *);
static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
			int, vnode_t *, cred_t *);
static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs_remove(vnode_t *, char *, cred_t *);
static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *);
static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs_mkdir(vnode_t *, char *, struct vattr *,
			vnode_t **, cred_t *);
static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *);
static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *);
static int	nfs_fid(vnode_t *, fid_t *);
static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs_seek(vnode_t *, offset_t, offset_t *);
static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *);
static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs_realvp(vnode_t *, vnode_t **);
static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uint_t, uint_t, uint_t, cred_t *);
static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *);
static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);
179 
/*
 * Pointer to the vnode operations vector for this file system; handed
 * out by nfs_getvnodeops().  It is not assigned anywhere in this part
 * of the file -- presumably it is built from the template below at
 * initialization time (TODO confirm against the init code).
 */
struct vnodeops *nfs_vnodeops;

/*
 * Table pairing each VOPNAME_* operation name with the routine that
 * implements it.  The list is terminated by a NULL, NULL entry.
 * Entries whose routine does not return int, or whose signature
 * otherwise differs, are cast to fs_generic_func_p.
 */
const fs_operation_def_t nfs_vnodeops_template[] = {
	VOPNAME_OPEN, nfs_open,
	VOPNAME_CLOSE, nfs_close,
	VOPNAME_READ, nfs_read,
	VOPNAME_WRITE, nfs_write,
	VOPNAME_IOCTL, nfs_ioctl,
	VOPNAME_GETATTR, nfs_getattr,
	VOPNAME_SETATTR, nfs_setattr,
	VOPNAME_ACCESS, nfs_access,
	VOPNAME_LOOKUP, nfs_lookup,
	VOPNAME_CREATE, nfs_create,
	VOPNAME_REMOVE, nfs_remove,
	VOPNAME_LINK, nfs_link,
	VOPNAME_RENAME, nfs_rename,
	VOPNAME_MKDIR, nfs_mkdir,
	VOPNAME_RMDIR, nfs_rmdir,
	VOPNAME_READDIR, nfs_readdir,
	VOPNAME_SYMLINK, nfs_symlink,
	VOPNAME_READLINK, nfs_readlink,
	VOPNAME_FSYNC, nfs_fsync,
	VOPNAME_INACTIVE, (fs_generic_func_p) nfs_inactive,
	VOPNAME_FID, nfs_fid,
	VOPNAME_RWLOCK, nfs_rwlock,
	VOPNAME_RWUNLOCK, (fs_generic_func_p) nfs_rwunlock,
	VOPNAME_SEEK, nfs_seek,
	VOPNAME_FRLOCK, nfs_frlock,
	VOPNAME_SPACE, nfs_space,
	VOPNAME_REALVP, nfs_realvp,
	VOPNAME_GETPAGE, nfs_getpage,
	VOPNAME_PUTPAGE, nfs_putpage,
	VOPNAME_MAP, (fs_generic_func_p) nfs_map,
	VOPNAME_ADDMAP, (fs_generic_func_p) nfs_addmap,
	VOPNAME_DELMAP, nfs_delmap,
	VOPNAME_DUMP, nfs_dump,
	VOPNAME_PATHCONF, nfs_pathconf,
	VOPNAME_PAGEIO, nfs_pageio,
	VOPNAME_SETSECATTR, nfs_setsecattr,
	VOPNAME_GETSECATTR, nfs_getsecattr,
	VOPNAME_SHRLOCK, nfs_shrlock,
	NULL, NULL
};
223 
224 /*
225  * XXX:  This is referenced in modstubs.s
226  */
227 struct vnodeops *
228 nfs_getvnodeops(void)
229 {
230 	return (nfs_vnodeops);
231 }
232 
233 /* ARGSUSED */
234 static int
235 nfs_open(vnode_t **vpp, int flag, cred_t *cr)
236 {
237 	int error;
238 	struct vattr va;
239 	rnode_t *rp;
240 	vnode_t *vp;
241 
242 	vp = *vpp;
243 	rp = VTOR(vp);
244 	if (nfs_zone() != VTOMI(vp)->mi_zone)
245 		return (EIO);
246 	mutex_enter(&rp->r_statelock);
247 	if (rp->r_cred == NULL) {
248 		crhold(cr);
249 		rp->r_cred = cr;
250 	}
251 	mutex_exit(&rp->r_statelock);
252 
253 	/*
254 	 * If there is no cached data or if close-to-open
255 	 * consistency checking is turned off, we can avoid
256 	 * the over the wire getattr.  Otherwise, if the
257 	 * file system is mounted readonly, then just verify
258 	 * the caches are up to date using the normal mechanism.
259 	 * Else, if the file is not mmap'd, then just mark
260 	 * the attributes as timed out.  They will be refreshed
261 	 * and the caches validated prior to being used.
262 	 * Else, the file system is mounted writeable so
263 	 * force an over the wire GETATTR in order to ensure
264 	 * that all cached data is valid.
265 	 */
266 	if (vp->v_count > 1 ||
267 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
268 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
269 		if (vn_is_readonly(vp))
270 			error = nfs_validate_caches(vp, cr);
271 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
272 			PURGE_ATTRCACHE(vp);
273 			error = 0;
274 		} else {
275 			va.va_mask = AT_ALL;
276 			error = nfs_getattr_otw(vp, &va, cr);
277 		}
278 	} else
279 		error = 0;
280 
281 	return (error);
282 }
283 
284 static int
285 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
286 {
287 	rnode_t *rp;
288 	int error;
289 	struct vattr va;
290 
291 	/*
292 	 * zone_enter(2) prevents processes from changing zones with NFS files
293 	 * open; if we happen to get here from the wrong zone we can't do
294 	 * anything over the wire.
295 	 */
296 	if (VTOMI(vp)->mi_zone != nfs_zone()) {
297 		/*
298 		 * We could attempt to clean up locks, except we're sure
299 		 * that the current process didn't acquire any locks on
300 		 * the file: any attempt to lock a file belong to another zone
301 		 * will fail, and one can't lock an NFS file and then change
302 		 * zones, as that fails too.
303 		 *
304 		 * Returning an error here is the sane thing to do.  A
305 		 * subsequent call to VN_RELE() which translates to a
306 		 * nfs_inactive() will clean up state: if the zone of the
307 		 * vnode's origin is still alive and kicking, an async worker
308 		 * thread will handle the request (from the correct zone), and
309 		 * everything (minus the final nfs_getattr_otw() call) should
310 		 * be OK. If the zone is going away nfs_async_inactive() will
311 		 * throw away cached pages inline.
312 		 */
313 		return (EIO);
314 	}
315 
316 	/*
317 	 * If we are using local locking for this filesystem, then
318 	 * release all of the SYSV style record locks.  Otherwise,
319 	 * we are doing network locking and we need to release all
320 	 * of the network locks.  All of the locks held by this
321 	 * process on this file are released no matter what the
322 	 * incoming reference count is.
323 	 */
324 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
325 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
326 		cleanshares(vp, ttoproc(curthread)->p_pid);
327 	} else
328 		nfs_lockrelease(vp, flag, offset, cr);
329 
330 	if (count > 1)
331 		return (0);
332 
333 	/*
334 	 * If the file has been `unlinked', then purge the
335 	 * DNLC so that this vnode will get reycled quicker
336 	 * and the .nfs* file on the server will get removed.
337 	 */
338 	rp = VTOR(vp);
339 	if (rp->r_unldvp != NULL)
340 		dnlc_purge_vp(vp);
341 
342 	/*
343 	 * If the file was open for write and there are pages,
344 	 * then if the file system was mounted using the "no-close-
345 	 *	to-open" semantics, then start an asynchronous flush
346 	 *	of the all of the pages in the file.
347 	 * else the file system was not mounted using the "no-close-
348 	 *	to-open" semantics, then do a synchronous flush and
349 	 *	commit of all of the dirty and uncommitted pages.
350 	 *
351 	 * The asynchronous flush of the pages in the "nocto" path
352 	 * mostly just associates a cred pointer with the rnode so
353 	 * writes which happen later will have a better chance of
354 	 * working.  It also starts the data being written to the
355 	 * server, but without unnecessarily delaying the application.
356 	 */
357 	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
358 		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
359 			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC, cr);
360 			if (error == EAGAIN)
361 				error = 0;
362 		} else
363 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
364 		if (!error) {
365 			mutex_enter(&rp->r_statelock);
366 			error = rp->r_error;
367 			rp->r_error = 0;
368 			mutex_exit(&rp->r_statelock);
369 		}
370 	} else {
371 		mutex_enter(&rp->r_statelock);
372 		error = rp->r_error;
373 		rp->r_error = 0;
374 		mutex_exit(&rp->r_statelock);
375 	}
376 
377 	/*
378 	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
379 	 * refresh the attribute cache with a set of attributes which
380 	 * weren't returned from a WRITE.  This will enable the close-
381 	 * to-open processing to work.
382 	 */
383 	if (rp->r_flags & RWRITEATTR)
384 		(void) nfs_getattr_otw(vp, &va, cr);
385 
386 	return (error);
387 }
388 
389 /* ARGSUSED */
390 static int
391 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
392 	caller_context_t *ct)
393 {
394 	rnode_t *rp;
395 	u_offset_t off;
396 	offset_t diff;
397 	int on;
398 	size_t n;
399 	caddr_t base;
400 	uint_t flags;
401 	int error;
402 	mntinfo_t *mi;
403 
404 	rp = VTOR(vp);
405 	mi = VTOMI(vp);
406 
407 	if (nfs_zone() != mi->mi_zone)
408 		return (EIO);
409 
410 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
411 
412 	if (vp->v_type != VREG)
413 		return (EISDIR);
414 
415 	if (uiop->uio_resid == 0)
416 		return (0);
417 
418 	if (uiop->uio_loffset > MAXOFF32_T)
419 		return (EFBIG);
420 
421 	if (uiop->uio_loffset < 0 ||
422 	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
423 		return (EINVAL);
424 
425 	/*
426 	 * Bypass VM if caching has been disabled (e.g., locking) or if
427 	 * using client-side direct I/O and the file is not mmap'd and
428 	 * there are no cached pages.
429 	 */
430 	if ((vp->v_flag & VNOCACHE) ||
431 	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
432 	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
433 		size_t bufsize;
434 		size_t resid = 0;
435 
436 		/*
437 		 * Let's try to do read in as large a chunk as we can
438 		 * (Filesystem (NFS client) bsize if possible/needed).
439 		 * For V3, this is 32K and for V2, this is 8K.
440 		 */
441 		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
442 		base = kmem_alloc(bufsize, KM_SLEEP);
443 		do {
444 			n = MIN(uiop->uio_resid, bufsize);
445 			error = nfsread(vp, base, uiop->uio_offset, n,
446 			    &resid, cr);
447 			if (!error) {
448 				n -= resid;
449 				error = uiomove(base, n, UIO_READ, uiop);
450 			}
451 		} while (!error && uiop->uio_resid > 0 && n > 0);
452 		kmem_free(base, bufsize);
453 		return (error);
454 	}
455 
456 	error = 0;
457 
458 	do {
459 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
460 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
461 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
462 
463 		error = nfs_validate_caches(vp, cr);
464 		if (error)
465 			break;
466 
467 		mutex_enter(&rp->r_statelock);
468 		diff = rp->r_size - uiop->uio_loffset;
469 		mutex_exit(&rp->r_statelock);
470 		if (diff <= 0)
471 			break;
472 		if (diff < n)
473 			n = (size_t)diff;
474 
475 		base = segmap_getmapflt(segkmap, vp, off + on, n, 1, S_READ);
476 
477 		error = uiomove(base + on, n, UIO_READ, uiop);
478 
479 		if (!error) {
480 			/*
481 			 * If read a whole block or read to eof,
482 			 * won't need this buffer again soon.
483 			 */
484 			mutex_enter(&rp->r_statelock);
485 			if (n + on == MAXBSIZE ||
486 			    uiop->uio_loffset == rp->r_size)
487 				flags = SM_DONTNEED;
488 			else
489 				flags = 0;
490 			mutex_exit(&rp->r_statelock);
491 			error = segmap_release(segkmap, base, flags);
492 		} else
493 			(void) segmap_release(segkmap, base, 0);
494 	} while (!error && uiop->uio_resid > 0);
495 
496 	return (error);
497 }
498 
/*
 * Write entry point.
 *
 * Only regular files may be written (EISDIR otherwise).  For FAPPEND
 * the write offset is set to the file size obtained from nfsgetattr().
 * The request must stay within the 32-bit offset range and is clipped
 * to the file size limit taken from uio_llimit; the clipped-off part
 * is tracked in "remainder" and added back to uio_resid on return.
 *
 * Data is either pushed straight to the server through a temporary
 * kernel buffer (caching disabled or direct I/O), or copied into the
 * page cache through segkmap one MAXBSIZE window at a time.
 *
 * "resid" and "offset" are snapshots of uio_resid / uio_loffset taken
 * at the start of each iteration; on error the uio is rolled back to
 * them at the "bottom" label.
 */
/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	rlim_t limit;
	mntinfo_t *mi;

	rp = VTOR(vp);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending: trade a reader hold on
		 * r_rwlock for a writer hold.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		/* Append at the current end of file. */
		va.va_mask = AT_SIZE;
		error = nfsgetattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	/* From here until the I/O loops, "offset" is the end of the write. */
	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
		return (EINVAL);

	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
		limit = MAXOFF32_T;
	} else {
		limit = (rlim_t)uiop->uio_llimit;
	}

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_offset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			/*
			 * Nothing can be written at all: restore the
			 * original resid, take the RLIMIT_FSIZE
			 * resource-control action and fail.
			 */
			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	/* Held as reader for the rest of the write; dropped at "bottom". */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		uint_t org_offset;

		/*
		 * This label is also the target of a goto from the
		 * cached-write loop below when that path fails with
		 * EACCES.
		 */
nfs_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			/* Snapshot for rollback if this chunk fails. */
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			/* uiomove() advances the uio; remember the start. */
			org_offset = uiop->uio_offset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfswrite(vp, base, org_offset,
				    count, cr);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* Snapshot for rollback if this window fails. */
		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		if (segmap_kpm) {
			int pon = uiop->uio_loffset & PAGEOFFSET;
			size_t pn = MIN(PAGESIZE - pon, uiop->uio_resid);
			int pagecreate;

			/*
			 * pagecreate is set when the write starts on a
			 * page boundary and either covers the whole
			 * page or reaches at least to r_size; the
			 * mapping is then requested without forcing a
			 * fault (!pagecreate below).
			 */
			mutex_enter(&rp->r_statelock);
			pagecreate = (pon == 0) && (pn == PAGESIZE ||
				uiop->uio_loffset + pn >= rp->r_size);
			mutex_exit(&rp->r_statelock);

			base = segmap_getmapflt(segkmap, vp, off + on,
						pn, !pagecreate, S_WRITE);

			error = writerp(rp, base + pon, n, uiop, pagecreate);

		} else {
			base = segmap_getmapflt(segkmap, vp, off + on,
						n, 0, S_READ);
			error = writerp(rp, base + on, n, uiop, 0);
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			/*
			 * Synchronous write requested, or ROUTOFSPACE
			 * is set on the rnode: write now, not async.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			error = segmap_release(segkmap, base, flags);
		} else {
			(void) segmap_release(segkmap, base, 0);
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	/*
	 * On error, rewind the uio to the start of the failed iteration.
	 * Either way, give back the part that was clipped off by the
	 * file size limit.
	 */
	if (error) {
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}
718 
719 /*
720  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
721  */
722 static int
723 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
724 	int flags, cred_t *cr)
725 {
726 	struct buf *bp;
727 	int error;
728 
729 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
730 	bp = pageio_setup(pp, len, vp, flags);
731 	ASSERT(bp != NULL);
732 
733 	/*
734 	 * pageio_setup should have set b_addr to 0.  This
735 	 * is correct since we want to do I/O on a page
736 	 * boundary.  bp_mapin will use this addr to calculate
737 	 * an offset, and then set b_addr to the kernel virtual
738 	 * address it allocated for us.
739 	 */
740 	ASSERT(bp->b_un.b_addr == 0);
741 
742 	bp->b_edev = 0;
743 	bp->b_dev = 0;
744 	bp->b_lblkno = lbtodb(off);
745 	bp->b_file = vp;
746 	bp->b_offset = (offset_t)off;
747 	bp_mapin(bp);
748 
749 	error = nfs_bio(bp, cr);
750 
751 	bp_mapout(bp);
752 	pageio_done(bp);
753 
754 	return (error);
755 }
756 
757 /*
758  * Write to file.  Writes to remote server in largest size
759  * chunks that the server can handle.  Write is synchronous.
760  */
761 static int
762 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
763 {
764 	rnode_t *rp;
765 	mntinfo_t *mi;
766 	struct nfswriteargs wa;
767 	struct nfsattrstat ns;
768 	int error;
769 	int tsize;
770 	int douprintf;
771 
772 	douprintf = 1;
773 
774 	rp = VTOR(vp);
775 	mi = VTOMI(vp);
776 
777 	ASSERT(nfs_zone() == mi->mi_zone);
778 
779 	wa.wa_args = &wa.wa_args_buf;
780 	wa.wa_fhandle = *VTOFH(vp);
781 
782 	do {
783 		tsize = MIN(mi->mi_curwrite, count);
784 		wa.wa_data = base;
785 		wa.wa_begoff = offset;
786 		wa.wa_totcount = tsize;
787 		wa.wa_count = tsize;
788 		wa.wa_offset = offset;
789 
790 		if (mi->mi_io_kstats) {
791 			mutex_enter(&mi->mi_lock);
792 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
793 			mutex_exit(&mi->mi_lock);
794 		}
795 		wa.wa_mblk = NULL;
796 		do {
797 			error = rfs2call(mi, RFS_WRITE,
798 			    xdr_writeargs, (caddr_t)&wa,
799 			    xdr_attrstat, (caddr_t)&ns, cr,
800 			    &douprintf, &ns.ns_status, 0, NULL);
801 		} while (error == ENFS_TRYAGAIN);
802 		if (mi->mi_io_kstats) {
803 			mutex_enter(&mi->mi_lock);
804 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
805 			mutex_exit(&mi->mi_lock);
806 		}
807 
808 		if (!error) {
809 			error = geterrno(ns.ns_status);
810 			/*
811 			 * Can't check for stale fhandle and purge caches
812 			 * here because pages are held by nfs_getpage.
813 			 * Just mark the attribute cache as timed out
814 			 * and set RWRITEATTR to indicate that the file
815 			 * was modified with a WRITE operation.
816 			 */
817 			if (!error) {
818 				count -= tsize;
819 				base += tsize;
820 				offset += tsize;
821 				if (mi->mi_io_kstats) {
822 					mutex_enter(&mi->mi_lock);
823 				    KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
824 				    KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
825 					    tsize;
826 					mutex_exit(&mi->mi_lock);
827 				}
828 				lwp_stat_update(LWP_STAT_OUBLK, 1);
829 				mutex_enter(&rp->r_statelock);
830 				PURGE_ATTRCACHE_LOCKED(rp);
831 				rp->r_flags |= RWRITEATTR;
832 				mutex_exit(&rp->r_statelock);
833 			}
834 		}
835 	} while (!error && count);
836 
837 	return (error);
838 }
839 
840 /*
841  * Read from a file.  Reads data in largest chunks our interface can handle.
842  */
843 static int
844 nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
845 	cred_t *cr)
846 {
847 	mntinfo_t *mi;
848 	struct nfsreadargs ra;
849 	struct nfsrdresult rr;
850 	int tsize;
851 	int error;
852 	int douprintf;
853 	failinfo_t fi;
854 	rnode_t *rp;
855 	struct vattr va;
856 	hrtime_t t;
857 
858 	rp = VTOR(vp);
859 	mi = VTOMI(vp);
860 
861 	ASSERT(nfs_zone() == mi->mi_zone);
862 
863 	douprintf = 1;
864 
865 	ra.ra_fhandle = *VTOFH(vp);
866 
867 	fi.vp = vp;
868 	fi.fhp = (caddr_t)&ra.ra_fhandle;
869 	fi.copyproc = nfscopyfh;
870 	fi.lookupproc = nfslookup;
871 	fi.xattrdirproc = acl_getxattrdir2;
872 
873 	do {
874 		if (mi->mi_io_kstats) {
875 			mutex_enter(&mi->mi_lock);
876 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
877 			mutex_exit(&mi->mi_lock);
878 		}
879 
880 		do {
881 			tsize = MIN(mi->mi_curread, count);
882 			rr.rr_data = base;
883 			ra.ra_offset = offset;
884 			ra.ra_totcount = tsize;
885 			ra.ra_count = tsize;
886 			t = gethrtime();
887 			error = rfs2call(mi, RFS_READ,
888 			    xdr_readargs, (caddr_t)&ra,
889 			    xdr_rdresult, (caddr_t)&rr, cr,
890 			    &douprintf, &rr.rr_status, 0, &fi);
891 		} while (error == ENFS_TRYAGAIN);
892 
893 		if (mi->mi_io_kstats) {
894 			mutex_enter(&mi->mi_lock);
895 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
896 			mutex_exit(&mi->mi_lock);
897 		}
898 
899 		if (!error) {
900 			error = geterrno(rr.rr_status);
901 			if (!error) {
902 				count -= rr.rr_count;
903 				base += rr.rr_count;
904 				offset += rr.rr_count;
905 				if (mi->mi_io_kstats) {
906 					mutex_enter(&mi->mi_lock);
907 					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
908 					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
909 					    rr.rr_count;
910 					mutex_exit(&mi->mi_lock);
911 				}
912 				lwp_stat_update(LWP_STAT_INBLK, 1);
913 			}
914 		}
915 	} while (!error && count && rr.rr_count == tsize);
916 
917 	*residp = count;
918 
919 	if (!error) {
920 		/*
921 		 * Since no error occurred, we have the current
922 		 * attributes and we need to do a cache check and then
923 		 * potentially update the cached attributes.  We can't
924 		 * use the normal attribute check and cache mechanisms
925 		 * because they might cause a cache flush which would
926 		 * deadlock.  Instead, we just check the cache to see
927 		 * if the attributes have changed.  If it is, then we
928 		 * just mark the attributes as out of date.  The next
929 		 * time that the attributes are checked, they will be
930 		 * out of date, new attributes will be fetched, and
931 		 * the page cache will be flushed.  If the attributes
932 		 * weren't changed, then we just update the cached
933 		 * attributes with these attributes.
934 		 */
935 		/*
936 		 * If NFS_ACL is supported on the server, then the
937 		 * attributes returned by server may have minimal
938 		 * permissions sometimes denying access to users having
939 		 * proper access.  To get the proper attributes, mark
940 		 * the attributes as expired so that they will be
941 		 * regotten via the NFS_ACL GETATTR2 procedure.
942 		 */
943 		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
944 		mutex_enter(&rp->r_statelock);
945 		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
946 		    (mi->mi_flags & MI_ACL)) {
947 			mutex_exit(&rp->r_statelock);
948 			PURGE_ATTRCACHE(vp);
949 		} else {
950 			if (rp->r_mtime <= t) {
951 				nfs_attrcache_va(vp, &va);
952 			}
953 			mutex_exit(&rp->r_statelock);
954 		}
955 	}
956 
957 	return (error);
958 }
959 
960 /* ARGSUSED */
961 static int
962 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
963 {
964 
965 	if (nfs_zone() != VTOMI(vp)->mi_zone)
966 		return (EIO);
967 	switch (cmd) {
968 		case _FIODIRECTIO:
969 			return (nfs_directio(vp, (int)arg, cr));
970 		default:
971 			return (ENOTTY);
972 	}
973 }
974 
975 static int
976 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
977 {
978 	int error;
979 	rnode_t *rp;
980 
981 	if (nfs_zone() != VTOMI(vp)->mi_zone)
982 		return (EIO);
983 	/*
984 	 * If it has been specified that the return value will
985 	 * just be used as a hint, and we are only being asked
986 	 * for size, fsid or rdevid, then return the client's
987 	 * notion of these values without checking to make sure
988 	 * that the attribute cache is up to date.
989 	 * The whole point is to avoid an over the wire GETATTR
990 	 * call.
991 	 */
992 	rp = VTOR(vp);
993 	if (flags & ATTR_HINT) {
994 		if (vap->va_mask ==
995 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
996 			mutex_enter(&rp->r_statelock);
997 			if (vap->va_mask | AT_SIZE)
998 				vap->va_size = rp->r_size;
999 			if (vap->va_mask | AT_FSID)
1000 				vap->va_fsid = rp->r_attr.va_fsid;
1001 			if (vap->va_mask | AT_RDEV)
1002 				vap->va_rdev = rp->r_attr.va_rdev;
1003 			mutex_exit(&rp->r_statelock);
1004 			return (0);
1005 		}
1006 	}
1007 
1008 	/*
1009 	 * Only need to flush pages if asking for the mtime
1010 	 * and if there any dirty pages or any outstanding
1011 	 * asynchronous (write) requests for this file.
1012 	 */
1013 	if (vap->va_mask & AT_MTIME) {
1014 		if (vn_has_cached_data(vp) &&
1015 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1016 			mutex_enter(&rp->r_statelock);
1017 			rp->r_gcount++;
1018 			mutex_exit(&rp->r_statelock);
1019 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1020 			mutex_enter(&rp->r_statelock);
1021 			if (error && (error == ENOSPC || error == EDQUOT)) {
1022 				if (!rp->r_error)
1023 					rp->r_error = error;
1024 			}
1025 			if (--rp->r_gcount == 0)
1026 				cv_broadcast(&rp->r_cv);
1027 			mutex_exit(&rp->r_statelock);
1028 		}
1029 	}
1030 
1031 	return (nfsgetattr(vp, vap, cr));
1032 }
1033 
1034 /*ARGSUSED4*/
1035 static int
1036 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1037 		caller_context_t *ct)
1038 {
1039 	int error;
1040 	uint_t mask;
1041 	struct vattr va;
1042 
1043 	mask = vap->va_mask;
1044 
1045 	if (mask & AT_NOSET)
1046 		return (EINVAL);
1047 
1048 	if ((mask & AT_SIZE) &&
1049 	    vap->va_type == VREG &&
1050 	    vap->va_size > MAXOFF32_T)
1051 		return (EFBIG);
1052 
1053 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1054 		return (EIO);
1055 
1056 	va.va_mask = AT_UID | AT_MODE;
1057 
1058 	error = nfsgetattr(vp, &va, cr);
1059 	if (error)
1060 		return (error);
1061 
1062 	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1063 			vp);
1064 
1065 	if (error)
1066 		return (error);
1067 
1068 	return (nfssetattr(vp, vap, flags, cr));
1069 }
1070 
1071 static int
1072 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1073 {
1074 	int error;
1075 	uint_t mask;
1076 	struct nfssaargs args;
1077 	struct nfsattrstat ns;
1078 	int douprintf;
1079 	rnode_t *rp;
1080 	struct vattr va;
1081 	mode_t omode;
1082 	mntinfo_t *mi;
1083 	vsecattr_t *vsp;
1084 	hrtime_t t;
1085 
1086 	mask = vap->va_mask;
1087 
1088 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1089 
1090 	rp = VTOR(vp);
1091 
1092 	/*
1093 	 * Only need to flush pages if there are any pages and
1094 	 * if the file is marked as dirty in some fashion.  The
1095 	 * file must be flushed so that we can accurately
1096 	 * determine the size of the file and the cached data
1097 	 * after the SETATTR returns.  A file is considered to
1098 	 * be dirty if it is either marked with RDIRTY, has
1099 	 * outstanding i/o's active, or is mmap'd.  In this
1100 	 * last case, we can't tell whether there are dirty
1101 	 * pages, so we flush just to be sure.
1102 	 */
1103 	if (vn_has_cached_data(vp) &&
1104 	    ((rp->r_flags & RDIRTY) ||
1105 	    rp->r_count > 0 ||
1106 	    rp->r_mapcnt > 0)) {
1107 		ASSERT(vp->v_type != VCHR);
1108 		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1109 		if (error && (error == ENOSPC || error == EDQUOT)) {
1110 			mutex_enter(&rp->r_statelock);
1111 			if (!rp->r_error)
1112 				rp->r_error = error;
1113 			mutex_exit(&rp->r_statelock);
1114 		}
1115 	}
1116 
1117 	/*
1118 	 * If the system call was utime(2) or utimes(2) and the
1119 	 * application did not specify the times, then set the
1120 	 * mtime nanosecond field to 1 billion.  This will get
1121 	 * translated from 1 billion nanoseconds to 1 million
1122 	 * microseconds in the over the wire request.  The
1123 	 * server will use 1 million in the microsecond field
1124 	 * to tell whether both the mtime and atime should be
1125 	 * set to the server's current time.
1126 	 *
1127 	 * This is an overload of the protocol and should be
1128 	 * documented in the NFS Version 2 protocol specification.
1129 	 */
1130 	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1131 		vap->va_mtime.tv_nsec = 1000000000;
1132 		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1133 		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1134 			error = vattr_to_sattr(vap, &args.saa_sa);
1135 		} else {
1136 			/*
1137 			 * Use server times. vap time values will not be used.
1138 			 * To ensure no time overflow, make sure vap has
1139 			 * valid values, but retain the original values.
1140 			 */
1141 			timestruc_t	mtime = vap->va_mtime;
1142 			timestruc_t	atime = vap->va_atime;
1143 			time_t		now;
1144 
1145 			now = gethrestime_sec();
1146 			if (NFS_TIME_T_OK(now)) {
1147 				/* Just in case server does not know of this */
1148 				vap->va_mtime.tv_sec = now;
1149 				vap->va_atime.tv_sec = now;
1150 			} else {
1151 				vap->va_mtime.tv_sec = 0;
1152 				vap->va_atime.tv_sec = 0;
1153 			}
1154 			error = vattr_to_sattr(vap, &args.saa_sa);
1155 			/* set vap times back on */
1156 			vap->va_mtime = mtime;
1157 			vap->va_atime = atime;
1158 		}
1159 	} else {
1160 		/* Either do not set times or use the client specified times */
1161 		error = vattr_to_sattr(vap, &args.saa_sa);
1162 	}
1163 	if (error) {
1164 		/* req time field(s) overflow - return immediately */
1165 		return (error);
1166 	}
1167 	args.saa_fh = *VTOFH(vp);
1168 
1169 	va.va_mask = AT_MODE;
1170 	error = nfsgetattr(vp, &va, cr);
1171 	if (error)
1172 		return (error);
1173 	omode = va.va_mode;
1174 
1175 	mi = VTOMI(vp);
1176 
1177 	douprintf = 1;
1178 
1179 	t = gethrtime();
1180 
1181 	error = rfs2call(mi, RFS_SETATTR,
1182 	    xdr_saargs, (caddr_t)&args,
1183 	    xdr_attrstat, (caddr_t)&ns, cr,
1184 	    &douprintf, &ns.ns_status, 0, NULL);
1185 
1186 	/*
1187 	 * Purge the access cache and ACL cache if changing either the
1188 	 * owner of the file, the group owner, or the mode.  These may
1189 	 * change the access permissions of the file, so purge old
1190 	 * information and start over again.
1191 	 */
1192 	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1193 		(void) nfs_access_purge_rp(rp);
1194 		if (rp->r_secattr != NULL) {
1195 			mutex_enter(&rp->r_statelock);
1196 			vsp = rp->r_secattr;
1197 			rp->r_secattr = NULL;
1198 			mutex_exit(&rp->r_statelock);
1199 			if (vsp != NULL)
1200 				nfs_acl_free(vsp);
1201 		}
1202 	}
1203 
1204 	if (!error) {
1205 		error = geterrno(ns.ns_status);
1206 		if (!error) {
1207 			/*
1208 			 * If changing the size of the file, invalidate
1209 			 * any local cached data which is no longer part
1210 			 * of the file.  We also possibly invalidate the
1211 			 * last page in the file.  We could use
1212 			 * pvn_vpzero(), but this would mark the page as
1213 			 * modified and require it to be written back to
1214 			 * the server for no particularly good reason.
1215 			 * This way, if we access it, then we bring it
1216 			 * back in.  A read should be cheaper than a
1217 			 * write.
1218 			 */
1219 			if (mask & AT_SIZE) {
1220 				nfs_invalidate_pages(vp,
1221 				    (vap->va_size & PAGEMASK), cr);
1222 			}
1223 			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1224 			/*
1225 			 * If NFS_ACL is supported on the server, then the
1226 			 * attributes returned by server may have minimal
1227 			 * permissions sometimes denying access to users having
1228 			 * proper access.  To get the proper attributes, mark
1229 			 * the attributes as expired so that they will be
1230 			 * regotten via the NFS_ACL GETATTR2 procedure.
1231 			 */
1232 			if (mi->mi_flags & MI_ACL) {
1233 				PURGE_ATTRCACHE(vp);
1234 			}
1235 			/*
1236 			 * This next check attempts to deal with NFS
1237 			 * servers which can not handle increasing
1238 			 * the size of the file via setattr.  Most
1239 			 * of these servers do not return an error,
1240 			 * but do not change the size of the file.
1241 			 * Hence, this check and then attempt to set
1242 			 * the file size by writing 1 byte at the
1243 			 * offset of the end of the file that we need.
1244 			 */
1245 			if ((mask & AT_SIZE) &&
1246 			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1247 				char zb = '\0';
1248 
1249 				error = nfswrite(vp, &zb,
1250 				    vap->va_size - sizeof (zb),
1251 				    sizeof (zb), cr);
1252 			}
1253 			/*
1254 			 * Some servers will change the mode to clear the setuid
1255 			 * and setgid bits when changing the uid or gid.  The
1256 			 * client needs to compensate appropriately.
1257 			 */
1258 			if (mask & (AT_UID | AT_GID)) {
1259 				int terror;
1260 
1261 				va.va_mask = AT_MODE;
1262 				terror = nfsgetattr(vp, &va, cr);
1263 				if (!terror &&
1264 				    (((mask & AT_MODE) &&
1265 				    va.va_mode != vap->va_mode) ||
1266 				    (!(mask & AT_MODE) &&
1267 				    va.va_mode != omode))) {
1268 					va.va_mask = AT_MODE;
1269 					if (mask & AT_MODE)
1270 						va.va_mode = vap->va_mode;
1271 					else
1272 						va.va_mode = omode;
1273 					(void) nfssetattr(vp, &va, 0, cr);
1274 				}
1275 			}
1276 		} else {
1277 			PURGE_ATTRCACHE(vp);
1278 			PURGE_STALE_FH(error, vp, cr);
1279 		}
1280 	} else {
1281 		PURGE_ATTRCACHE(vp);
1282 	}
1283 
1284 	return (error);
1285 }
1286 
1287 static int
1288 nfs_accessx(void *vp, int mode, cred_t *cr)
1289 {
1290 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1291 	return (nfs_access(vp, mode, 0, cr));
1292 }
1293 
1294 static int
1295 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
1296 {
1297 	struct vattr va;
1298 	int error;
1299 	mntinfo_t *mi;
1300 	int shift = 0;
1301 
1302 	mi = VTOMI(vp);
1303 
1304 	if (nfs_zone() != mi->mi_zone)
1305 		return (EIO);
1306 	if (mi->mi_flags & MI_ACL) {
1307 		error = acl_access2(vp, mode, flags, cr);
1308 		if (mi->mi_flags & MI_ACL)
1309 			return (error);
1310 	}
1311 
1312 	va.va_mask = AT_MODE | AT_UID | AT_GID;
1313 	error = nfsgetattr(vp, &va, cr);
1314 	if (error)
1315 		return (error);
1316 
1317 	/*
1318 	 * Disallow write attempts on read-only
1319 	 * file systems, unless the file is a
1320 	 * device node.
1321 	 */
1322 	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1323 		return (EROFS);
1324 
1325 	/*
1326 	 * Disallow attempts to access mandatory lock files.
1327 	 */
1328 	if ((mode & (VWRITE | VREAD | VEXEC)) &&
1329 	    MANDLOCK(vp, va.va_mode))
1330 		return (EACCES);
1331 
1332 	/*
1333 	 * Access check is based on only
1334 	 * one of owner, group, public.
1335 	 * If not owner, then check group.
1336 	 * If not a member of the group,
1337 	 * then check public access.
1338 	 */
1339 	if (crgetuid(cr) != va.va_uid) {
1340 		shift += 3;
1341 		if (!groupmember(va.va_gid, cr))
1342 			shift += 3;
1343 	}
1344 found:
1345 	mode &= ~(va.va_mode << shift);
1346 	if (mode == 0)
1347 		return (0);
1348 
1349 	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
1350 }
1351 
1352 static int nfs_do_symlink_cache = 1;
1353 
1354 static int
1355 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
1356 {
1357 	int error;
1358 	struct nfsrdlnres rl;
1359 	rnode_t *rp;
1360 	int douprintf;
1361 	failinfo_t fi;
1362 
1363 	/*
1364 	 * We want to be consistent with UFS semantics so we will return
1365 	 * EINVAL instead of ENXIO. This violates the XNFS spec and
1366 	 * the RFC 1094, which are wrong any way. BUGID 1138002.
1367 	 */
1368 	if (vp->v_type != VLNK)
1369 		return (EINVAL);
1370 
1371 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1372 		return (EIO);
1373 
1374 	rp = VTOR(vp);
1375 	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1376 		error = nfs_validate_caches(vp, cr);
1377 		if (error)
1378 			return (error);
1379 		mutex_enter(&rp->r_statelock);
1380 		if (rp->r_symlink.contents != NULL) {
1381 			error = uiomove(rp->r_symlink.contents,
1382 			    rp->r_symlink.len, UIO_READ, uiop);
1383 			mutex_exit(&rp->r_statelock);
1384 			return (error);
1385 		}
1386 		mutex_exit(&rp->r_statelock);
1387 	}
1388 
1389 
1390 	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1391 
1392 	fi.vp = vp;
1393 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1394 	fi.copyproc = nfscopyfh;
1395 	fi.lookupproc = nfslookup;
1396 	fi.xattrdirproc = acl_getxattrdir2;
1397 
1398 	douprintf = 1;
1399 
1400 	error = rfs2call(VTOMI(vp), RFS_READLINK,
1401 	    xdr_fhandle, (caddr_t)VTOFH(vp),
1402 	    xdr_rdlnres, (caddr_t)&rl, cr,
1403 	    &douprintf, &rl.rl_status, 0, &fi);
1404 
1405 	if (error) {
1406 
1407 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1408 		return (error);
1409 	}
1410 
1411 	error = geterrno(rl.rl_status);
1412 	if (!error) {
1413 		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
1414 		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1415 			mutex_enter(&rp->r_statelock);
1416 			if (rp->r_symlink.contents == NULL) {
1417 				rp->r_symlink.contents = rl.rl_data;
1418 				rp->r_symlink.len = (int)rl.rl_count;
1419 				rp->r_symlink.size = NFS_MAXPATHLEN;
1420 				mutex_exit(&rp->r_statelock);
1421 			} else {
1422 				mutex_exit(&rp->r_statelock);
1423 
1424 				kmem_free((void *)rl.rl_data,
1425 				    NFS_MAXPATHLEN);
1426 			}
1427 		} else {
1428 
1429 			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1430 		}
1431 	} else {
1432 		PURGE_STALE_FH(error, vp, cr);
1433 
1434 		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1435 	}
1436 
1437 	/*
1438 	 * Conform to UFS semantics (see comment above)
1439 	 */
1440 	return (error == ENXIO ? EINVAL : error);
1441 }
1442 
1443 /*
1444  * Flush local dirty pages to stable storage on the server.
1445  *
1446  * If FNODSYNC is specified, then there is nothing to do because
1447  * metadata changes are not cached on the client before being
1448  * sent to the server.
1449  */
1450 static int
1451 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1452 {
1453 	int error;
1454 
1455 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1456 		return (0);
1457 
1458 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1459 		return (EIO);
1460 
1461 	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1462 	if (!error)
1463 		error = VTOR(vp)->r_error;
1464 	return (error);
1465 }
1466 
1467 
1468 /*
1469  * Weirdness: if the file was removed or the target of a rename
1470  * operation while it was open, it got renamed instead.  Here we
1471  * remove the renamed file.
1472  */
1473 static void
1474 nfs_inactive(vnode_t *vp, cred_t *cr)
1475 {
1476 	rnode_t *rp;
1477 
1478 	ASSERT(vp != DNLC_NO_VNODE);
1479 
1480 	/*
1481 	 * If this is coming from the wrong zone, we let someone in the right
1482 	 * zone take care of it asynchronously.  We can get here due to
1483 	 * VN_RELE() being called from pageout() or fsflush().  This call may
1484 	 * potentially turn into an expensive no-op if, for instance, v_count
1485 	 * gets incremented in the meantime, but it's still correct.
1486 	 */
1487 	if (nfs_zone() != VTOMI(vp)->mi_zone) {
1488 		nfs_async_inactive(vp, cr, nfs_inactive);
1489 		return;
1490 	}
1491 
1492 	rp = VTOR(vp);
1493 redo:
1494 	if (rp->r_unldvp != NULL) {
1495 		/*
1496 		 * Save the vnode pointer for the directory where the
1497 		 * unlinked-open file got renamed, then set it to NULL
1498 		 * to prevent another thread from getting here before
1499 		 * we're done with the remove.  While we have the
1500 		 * statelock, make local copies of the pertinent rnode
1501 		 * fields.  If we weren't to do this in an atomic way, the
1502 		 * the unl* fields could become inconsistent with respect
1503 		 * to each other due to a race condition between this
1504 		 * code and nfs_remove().  See bug report 1034328.
1505 		 */
1506 		mutex_enter(&rp->r_statelock);
1507 		if (rp->r_unldvp != NULL) {
1508 			vnode_t *unldvp;
1509 			char *unlname;
1510 			cred_t *unlcred;
1511 			struct nfsdiropargs da;
1512 			enum nfsstat status;
1513 			int douprintf;
1514 			int error;
1515 
1516 			unldvp = rp->r_unldvp;
1517 			rp->r_unldvp = NULL;
1518 			unlname = rp->r_unlname;
1519 			rp->r_unlname = NULL;
1520 			unlcred = rp->r_unlcred;
1521 			rp->r_unlcred = NULL;
1522 			mutex_exit(&rp->r_statelock);
1523 
1524 			/*
1525 			 * If there are any dirty pages left, then flush
1526 			 * them.  This is unfortunate because they just
1527 			 * may get thrown away during the remove operation,
1528 			 * but we have to do this for correctness.
1529 			 */
1530 			if (vn_has_cached_data(vp) &&
1531 			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1532 				ASSERT(vp->v_type != VCHR);
1533 				error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
1534 				if (error) {
1535 					mutex_enter(&rp->r_statelock);
1536 					if (!rp->r_error)
1537 						rp->r_error = error;
1538 					mutex_exit(&rp->r_statelock);
1539 				}
1540 			}
1541 
1542 			/*
1543 			 * Do the remove operation on the renamed file
1544 			 */
1545 			setdiropargs(&da, unlname, unldvp);
1546 
1547 			douprintf = 1;
1548 
1549 			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1550 			    xdr_diropargs, (caddr_t)&da,
1551 			    xdr_enum, (caddr_t)&status, unlcred,
1552 			    &douprintf, &status, 0, NULL);
1553 
1554 			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1555 				nfs_purge_rddir_cache(unldvp);
1556 			PURGE_ATTRCACHE(unldvp);
1557 
1558 			/*
1559 			 * Release stuff held for the remove
1560 			 */
1561 			VN_RELE(unldvp);
1562 			kmem_free(unlname, MAXNAMELEN);
1563 			crfree(unlcred);
1564 			goto redo;
1565 		}
1566 		mutex_exit(&rp->r_statelock);
1567 	}
1568 
1569 	rp_addfree(rp, cr);
1570 }
1571 
1572 /*
1573  * Remote file system operations having to do with directory manipulation.
1574  */
1575 
1576 static int
1577 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1578 	int flags, vnode_t *rdir, cred_t *cr)
1579 {
1580 	int error;
1581 	vnode_t *vp;
1582 	vnode_t *avp = NULL;
1583 	rnode_t *drp;
1584 
1585 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1586 		return (EPERM);
1587 
1588 	drp = VTOR(dvp);
1589 
1590 	/*
1591 	 * Are we looking up extended attributes?  If so, "dvp" is
1592 	 * the file or directory for which we want attributes, and
1593 	 * we need a lookup of the hidden attribute directory
1594 	 * before we lookup the rest of the path.
1595 	 */
1596 	if (flags & LOOKUP_XATTR) {
1597 		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1598 		mntinfo_t *mi;
1599 
1600 		mi = VTOMI(dvp);
1601 		if (!(mi->mi_flags & MI_EXTATTR))
1602 			return (EINVAL);
1603 
1604 		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1605 			return (EINTR);
1606 
1607 		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1608 		if (avp == NULL)
1609 			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1610 		else
1611 			error = 0;
1612 
1613 		nfs_rw_exit(&drp->r_rwlock);
1614 
1615 		if (error) {
1616 			if (mi->mi_flags & MI_EXTATTR)
1617 				return (error);
1618 			return (EINVAL);
1619 		}
1620 		dvp = avp;
1621 		drp = VTOR(dvp);
1622 	}
1623 
1624 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1625 		error = EINTR;
1626 		goto out;
1627 	}
1628 
1629 	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1630 
1631 	nfs_rw_exit(&drp->r_rwlock);
1632 
1633 	/*
1634 	 * If vnode is a device, create special vnode.
1635 	 */
1636 	if (!error && IS_DEVVP(*vpp)) {
1637 		vp = *vpp;
1638 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1639 		VN_RELE(vp);
1640 	}
1641 
1642 out:
1643 	if (avp != NULL)
1644 		VN_RELE(avp);
1645 
1646 	return (error);
1647 }
1648 
1649 static int nfs_lookup_neg_cache = 1;
1650 
1651 #ifdef DEBUG
1652 static int nfs_lookup_dnlc_hits = 0;
1653 static int nfs_lookup_dnlc_misses = 0;
1654 static int nfs_lookup_dnlc_neg_hits = 0;
1655 static int nfs_lookup_dnlc_disappears = 0;
1656 static int nfs_lookup_dnlc_lookups = 0;
1657 #endif
1658 
1659 /* ARGSUSED */
1660 int
1661 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1662 	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1663 {
1664 	int error;
1665 
1666 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1667 
1668 	/*
1669 	 * If lookup is for "", just return dvp.  Don't need
1670 	 * to send it over the wire, look it up in the dnlc,
1671 	 * or perform any access checks.
1672 	 */
1673 	if (*nm == '\0') {
1674 		VN_HOLD(dvp);
1675 		*vpp = dvp;
1676 		return (0);
1677 	}
1678 
1679 	/*
1680 	 * Can't do lookups in non-directories.
1681 	 */
1682 	if (dvp->v_type != VDIR)
1683 		return (ENOTDIR);
1684 
1685 	/*
1686 	 * If we're called with RFSCALL_SOFT, it's important that
1687 	 * the only rfscall is one we make directly; if we permit
1688 	 * an access call because we're looking up "." or validating
1689 	 * a dnlc hit, we'll deadlock because that rfscall will not
1690 	 * have the RFSCALL_SOFT set.
1691 	 */
1692 	if (rfscall_flags & RFSCALL_SOFT)
1693 		goto callit;
1694 
1695 	/*
1696 	 * If lookup is for ".", just return dvp.  Don't need
1697 	 * to send it over the wire or look it up in the dnlc,
1698 	 * just need to check access.
1699 	 */
1700 	if (strcmp(nm, ".") == 0) {
1701 		error = nfs_access(dvp, VEXEC, 0, cr);
1702 		if (error)
1703 			return (error);
1704 		VN_HOLD(dvp);
1705 		*vpp = dvp;
1706 		return (0);
1707 	}
1708 
1709 	/*
1710 	 * Lookup this name in the DNLC.  If there was a valid entry,
1711 	 * then return the results of the lookup.
1712 	 */
1713 	error = nfslookup_dnlc(dvp, nm, vpp, cr);
1714 	if (error || *vpp != NULL)
1715 		return (error);
1716 
1717 callit:
1718 	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1719 
1720 	return (error);
1721 }
1722 
1723 static int
1724 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1725 {
1726 	int error;
1727 	vnode_t *vp;
1728 
1729 	ASSERT(*nm != '\0');
1730 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1731 
1732 	/*
1733 	 * Lookup this name in the DNLC.  If successful, then validate
1734 	 * the caches and then recheck the DNLC.  The DNLC is rechecked
1735 	 * just in case this entry got invalidated during the call
1736 	 * to nfs_validate_caches.
1737 	 *
1738 	 * An assumption is being made that it is safe to say that a
1739 	 * file exists which may not on the server.  Any operations to
1740 	 * the server will fail with ESTALE.
1741 	 */
1742 #ifdef DEBUG
1743 	nfs_lookup_dnlc_lookups++;
1744 #endif
1745 	vp = dnlc_lookup(dvp, nm);
1746 	if (vp != NULL) {
1747 		VN_RELE(vp);
1748 		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1749 			PURGE_ATTRCACHE(dvp);
1750 		}
1751 		error = nfs_validate_caches(dvp, cr);
1752 		if (error)
1753 			return (error);
1754 		vp = dnlc_lookup(dvp, nm);
1755 		if (vp != NULL) {
1756 			error = nfs_access(dvp, VEXEC, 0, cr);
1757 			if (error) {
1758 				VN_RELE(vp);
1759 				return (error);
1760 			}
1761 			if (vp == DNLC_NO_VNODE) {
1762 				VN_RELE(vp);
1763 #ifdef DEBUG
1764 				nfs_lookup_dnlc_neg_hits++;
1765 #endif
1766 				return (ENOENT);
1767 			}
1768 			*vpp = vp;
1769 #ifdef DEBUG
1770 			nfs_lookup_dnlc_hits++;
1771 #endif
1772 			return (0);
1773 		}
1774 #ifdef DEBUG
1775 		nfs_lookup_dnlc_disappears++;
1776 #endif
1777 	}
1778 #ifdef DEBUG
1779 	else
1780 		nfs_lookup_dnlc_misses++;
1781 #endif
1782 
1783 	*vpp = NULL;
1784 
1785 	return (0);
1786 }
1787 
1788 static int
1789 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1790 	int rfscall_flags)
1791 {
1792 	int error;
1793 	struct nfsdiropargs da;
1794 	struct nfsdiropres dr;
1795 	int douprintf;
1796 	failinfo_t fi;
1797 	hrtime_t t;
1798 
1799 	ASSERT(*nm != '\0');
1800 	ASSERT(dvp->v_type == VDIR);
1801 	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1802 
1803 	setdiropargs(&da, nm, dvp);
1804 
1805 	fi.vp = dvp;
1806 	fi.fhp = NULL;		/* no need to update, filehandle not copied */
1807 	fi.copyproc = nfscopyfh;
1808 	fi.lookupproc = nfslookup;
1809 	fi.xattrdirproc = acl_getxattrdir2;
1810 
1811 	douprintf = 1;
1812 
1813 	t = gethrtime();
1814 
1815 	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1816 	    xdr_diropargs, (caddr_t)&da,
1817 	    xdr_diropres, (caddr_t)&dr, cr,
1818 	    &douprintf, &dr.dr_status, rfscall_flags, &fi);
1819 
1820 	if (!error) {
1821 		error = geterrno(dr.dr_status);
1822 		if (!error) {
1823 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1824 			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1825 			/*
1826 			 * If NFS_ACL is supported on the server, then the
1827 			 * attributes returned by server may have minimal
1828 			 * permissions sometimes denying access to users having
1829 			 * proper access.  To get the proper attributes, mark
1830 			 * the attributes as expired so that they will be
1831 			 * regotten via the NFS_ACL GETATTR2 procedure.
1832 			 */
1833 			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1834 				PURGE_ATTRCACHE(*vpp);
1835 			}
1836 			if (!(rfscall_flags & RFSCALL_SOFT))
1837 				dnlc_update(dvp, nm, *vpp);
1838 		} else {
1839 			PURGE_STALE_FH(error, dvp, cr);
1840 			if (error == ENOENT && nfs_lookup_neg_cache)
1841 				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1842 		}
1843 	}
1844 
1845 	return (error);
1846 }
1847 
1848 /* ARGSUSED */
1849 static int
1850 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1851 	int mode, vnode_t **vpp, cred_t *cr, int lfaware)
1852 {
1853 	int error;
1854 	struct nfscreatargs args;
1855 	struct nfsdiropres dr;
1856 	int douprintf;
1857 	vnode_t *vp;
1858 	rnode_t *rp;
1859 	struct vattr vattr;
1860 	rnode_t *drp;
1861 	vnode_t *tempvp;
1862 	hrtime_t t;
1863 
1864 	drp = VTOR(dvp);
1865 
1866 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1867 		return (EPERM);
1868 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1869 		return (EINTR);
1870 
1871 	/*
1872 	 * We make a copy of the attributes because the caller does not
1873 	 * expect us to change what va points to.
1874 	 */
1875 	vattr = *va;
1876 
1877 	/*
1878 	 * If the pathname is "", just use dvp.  Don't need
1879 	 * to send it over the wire, look it up in the dnlc,
1880 	 * or perform any access checks.
1881 	 */
1882 	if (*nm == '\0') {
1883 		error = 0;
1884 		VN_HOLD(dvp);
1885 		vp = dvp;
1886 	/*
1887 	 * If the pathname is ".", just use dvp.  Don't need
1888 	 * to send it over the wire or look it up in the dnlc,
1889 	 * just need to check access.
1890 	 */
1891 	} else if (strcmp(nm, ".") == 0) {
1892 		error = nfs_access(dvp, VEXEC, 0, cr);
1893 		if (error) {
1894 			nfs_rw_exit(&drp->r_rwlock);
1895 			return (error);
1896 		}
1897 		VN_HOLD(dvp);
1898 		vp = dvp;
1899 	/*
1900 	 * We need to go over the wire, just to be sure whether the
1901 	 * file exists or not.  Using the DNLC can be dangerous in
1902 	 * this case when making a decision regarding existence.
1903 	 */
1904 	} else {
1905 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
1906 	}
1907 	if (!error) {
1908 		if (exclusive == EXCL)
1909 			error = EEXIST;
1910 		else if (vp->v_type == VDIR && (mode & VWRITE))
1911 			error = EISDIR;
1912 		else {
1913 			/*
1914 			 * If vnode is a device, create special vnode.
1915 			 */
1916 			if (IS_DEVVP(vp)) {
1917 				tempvp = vp;
1918 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1919 				VN_RELE(tempvp);
1920 			}
1921 			if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
1922 				if ((vattr.va_mask & AT_SIZE) &&
1923 				    vp->v_type == VREG) {
1924 					vattr.va_mask = AT_SIZE;
1925 					error = nfssetattr(vp, &vattr, 0, cr);
1926 				}
1927 			}
1928 		}
1929 		nfs_rw_exit(&drp->r_rwlock);
1930 		if (error) {
1931 			VN_RELE(vp);
1932 		} else
1933 			*vpp = vp;
1934 		return (error);
1935 	}
1936 
1937 	ASSERT(vattr.va_mask & AT_TYPE);
1938 	if (vattr.va_type == VREG) {
1939 		ASSERT(vattr.va_mask & AT_MODE);
1940 		if (MANDMODE(vattr.va_mode)) {
1941 			nfs_rw_exit(&drp->r_rwlock);
1942 			return (EACCES);
1943 		}
1944 	}
1945 
1946 	dnlc_remove(dvp, nm);
1947 
1948 	setdiropargs(&args.ca_da, nm, dvp);
1949 
1950 	/*
1951 	 * Decide what the group-id of the created file should be.
1952 	 * Set it in attribute list as advisory...then do a setattr
1953 	 * if the server didn't get it right the first time.
1954 	 */
1955 	error = setdirgid(dvp, &vattr.va_gid, cr);
1956 	if (error) {
1957 		nfs_rw_exit(&drp->r_rwlock);
1958 		return (error);
1959 	}
1960 	vattr.va_mask |= AT_GID;
1961 
1962 	/*
1963 	 * This is a completely gross hack to make mknod
1964 	 * work over the wire until we can wack the protocol
1965 	 */
1966 #define	IFCHR		0020000		/* character special */
1967 #define	IFBLK		0060000		/* block special */
1968 #define	IFSOCK		0140000		/* socket */
1969 
1970 	/*
1971 	 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
1972 	 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
1973 	 * bits in the minor number where 4.x supports 8 bits.  If the 5.x
1974 	 * minor/major numbers <= 8 bits long, compress the device
1975 	 * number before sending it. Otherwise, the 4.x server will not
1976 	 * create the device with the correct device number and nothing can be
1977 	 * done about this.
1978 	 */
1979 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
1980 		dev_t d = vattr.va_rdev;
1981 		dev32_t dev32;
1982 
1983 		if (vattr.va_type == VCHR)
1984 			vattr.va_mode |= IFCHR;
1985 		else
1986 			vattr.va_mode |= IFBLK;
1987 
1988 		(void) cmpldev(&dev32, d);
1989 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
1990 			vattr.va_size = (u_offset_t)dev32;
1991 		else
1992 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
1993 
1994 		vattr.va_mask |= AT_MODE|AT_SIZE;
1995 	} else if (vattr.va_type == VFIFO) {
1996 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
1997 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
1998 		vattr.va_mask |= AT_MODE|AT_SIZE;
1999 	} else if (vattr.va_type == VSOCK) {
2000 		vattr.va_mode |= IFSOCK;
2001 		/*
2002 		 * To avoid triggering bugs in the servers set AT_SIZE
2003 		 * (all other RFS_CREATE calls set this).
2004 		 */
2005 		vattr.va_size = 0;
2006 		vattr.va_mask |= AT_MODE|AT_SIZE;
2007 	}
2008 
2009 	args.ca_sa = &args.ca_sa_buf;
2010 	error = vattr_to_sattr(&vattr, args.ca_sa);
2011 	if (error) {
2012 		/* req time field(s) overflow - return immediately */
2013 		nfs_rw_exit(&drp->r_rwlock);
2014 		return (error);
2015 	}
2016 
2017 	douprintf = 1;
2018 
2019 	t = gethrtime();
2020 
2021 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2022 	    xdr_creatargs, (caddr_t)&args,
2023 	    xdr_diropres, (caddr_t)&dr, cr,
2024 	    &douprintf, &dr.dr_status, 0, NULL);
2025 
2026 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2027 
2028 	if (!error) {
2029 		error = geterrno(dr.dr_status);
2030 		if (!error) {
2031 			if (HAVE_RDDIR_CACHE(drp))
2032 				nfs_purge_rddir_cache(dvp);
2033 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2034 			    dvp->v_vfsp, t, cr, NULL, NULL);
2035 			/*
2036 			 * If NFS_ACL is supported on the server, then the
2037 			 * attributes returned by server may have minimal
2038 			 * permissions sometimes denying access to users having
2039 			 * proper access.  To get the proper attributes, mark
2040 			 * the attributes as expired so that they will be
2041 			 * regotten via the NFS_ACL GETATTR2 procedure.
2042 			 */
2043 			if (VTOMI(vp)->mi_flags & MI_ACL) {
2044 				PURGE_ATTRCACHE(vp);
2045 			}
2046 			dnlc_update(dvp, nm, vp);
2047 			rp = VTOR(vp);
2048 			if (vattr.va_size == 0) {
2049 				mutex_enter(&rp->r_statelock);
2050 				rp->r_size = 0;
2051 				mutex_exit(&rp->r_statelock);
2052 				if (vn_has_cached_data(vp)) {
2053 					ASSERT(vp->v_type != VCHR);
2054 					nfs_invalidate_pages(vp,
2055 					    (u_offset_t)0, cr);
2056 				}
2057 			}
2058 
2059 			/*
2060 			 * Make sure the gid was set correctly.
2061 			 * If not, try to set it (but don't lose
2062 			 * any sleep over it).
2063 			 */
2064 			if (vattr.va_gid != rp->r_attr.va_gid) {
2065 				vattr.va_mask = AT_GID;
2066 				(void) nfssetattr(vp, &vattr, 0, cr);
2067 			}
2068 
2069 			/*
2070 			 * If vnode is a device create special vnode
2071 			 */
2072 			if (IS_DEVVP(vp)) {
2073 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2074 				VN_RELE(vp);
2075 			} else
2076 				*vpp = vp;
2077 		} else {
2078 			PURGE_STALE_FH(error, dvp, cr);
2079 		}
2080 	}
2081 
2082 	nfs_rw_exit(&drp->r_rwlock);
2083 
2084 	return (error);
2085 }
2086 
2087 /*
2088  * Weirdness: if the vnode to be removed is open
2089  * we rename it instead of removing it and nfs_inactive
2090  * will remove the new name.
2091  */
2092 static int
2093 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr)
2094 {
2095 	int error;
2096 	struct nfsdiropargs da;
2097 	enum nfsstat status;
2098 	vnode_t *vp;
2099 	char *tmpname;
2100 	int douprintf;
2101 	rnode_t *rp;
2102 	rnode_t *drp;
2103 
2104 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2105 		return (EPERM);
2106 	drp = VTOR(dvp);
2107 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2108 		return (EINTR);
2109 
2110 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2111 	if (error) {
2112 		nfs_rw_exit(&drp->r_rwlock);
2113 		return (error);
2114 	}
2115 
2116 	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2117 		VN_RELE(vp);
2118 		nfs_rw_exit(&drp->r_rwlock);
2119 		return (EPERM);
2120 	}
2121 
2122 	/*
2123 	 * First just remove the entry from the name cache, as it
2124 	 * is most likely the only entry for this vp.
2125 	 */
2126 	dnlc_remove(dvp, nm);
2127 
2128 	/*
2129 	 * If the file has a v_count > 1 then there may be more than one
2130 	 * entry in the name cache due multiple links or an open file,
2131 	 * but we don't have the real reference count so flush all
2132 	 * possible entries.
2133 	 */
2134 	if (vp->v_count > 1)
2135 		dnlc_purge_vp(vp);
2136 
2137 	/*
2138 	 * Now we have the real reference count on the vnode
2139 	 */
2140 	rp = VTOR(vp);
2141 	mutex_enter(&rp->r_statelock);
2142 	if (vp->v_count > 1 &&
2143 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2144 		mutex_exit(&rp->r_statelock);
2145 		tmpname = newname();
2146 		error = nfsrename(dvp, nm, dvp, tmpname, cr);
2147 		if (error)
2148 			kmem_free(tmpname, MAXNAMELEN);
2149 		else {
2150 			mutex_enter(&rp->r_statelock);
2151 			if (rp->r_unldvp == NULL) {
2152 				VN_HOLD(dvp);
2153 				rp->r_unldvp = dvp;
2154 				if (rp->r_unlcred != NULL)
2155 					crfree(rp->r_unlcred);
2156 				crhold(cr);
2157 				rp->r_unlcred = cr;
2158 				rp->r_unlname = tmpname;
2159 			} else {
2160 				kmem_free(rp->r_unlname, MAXNAMELEN);
2161 				rp->r_unlname = tmpname;
2162 			}
2163 			mutex_exit(&rp->r_statelock);
2164 		}
2165 	} else {
2166 		mutex_exit(&rp->r_statelock);
2167 		/*
2168 		 * We need to flush any dirty pages which happen to
2169 		 * be hanging around before removing the file.  This
2170 		 * shouldn't happen very often and mostly on file
2171 		 * systems mounted "nocto".
2172 		 */
2173 		if (vn_has_cached_data(vp) &&
2174 		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2175 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr);
2176 			if (error && (error == ENOSPC || error == EDQUOT)) {
2177 				mutex_enter(&rp->r_statelock);
2178 				if (!rp->r_error)
2179 					rp->r_error = error;
2180 				mutex_exit(&rp->r_statelock);
2181 			}
2182 		}
2183 
2184 		setdiropargs(&da, nm, dvp);
2185 
2186 		douprintf = 1;
2187 
2188 		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2189 		    xdr_diropargs, (caddr_t)&da,
2190 		    xdr_enum, (caddr_t)&status, cr,
2191 		    &douprintf, &status, 0, NULL);
2192 
2193 		/*
2194 		 * The xattr dir may be gone after last attr is removed,
2195 		 * so flush it from dnlc.
2196 		 */
2197 		if (dvp->v_flag & V_XATTRDIR)
2198 			dnlc_purge_vp(dvp);
2199 
2200 		PURGE_ATTRCACHE(dvp);	/* mod time changed */
2201 		PURGE_ATTRCACHE(vp);	/* link count changed */
2202 
2203 		if (!error) {
2204 			error = geterrno(status);
2205 			if (!error) {
2206 				if (HAVE_RDDIR_CACHE(drp))
2207 					nfs_purge_rddir_cache(dvp);
2208 			} else {
2209 				PURGE_STALE_FH(error, dvp, cr);
2210 			}
2211 		}
2212 	}
2213 
2214 	VN_RELE(vp);
2215 
2216 	nfs_rw_exit(&drp->r_rwlock);
2217 
2218 	return (error);
2219 }
2220 
2221 static int
2222 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
2223 {
2224 	int error;
2225 	struct nfslinkargs args;
2226 	enum nfsstat status;
2227 	vnode_t *realvp;
2228 	int douprintf;
2229 	rnode_t *tdrp;
2230 
2231 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2232 		return (EPERM);
2233 	if (VOP_REALVP(svp, &realvp) == 0)
2234 		svp = realvp;
2235 
2236 	args.la_from = VTOFH(svp);
2237 	setdiropargs(&args.la_to, tnm, tdvp);
2238 
2239 	tdrp = VTOR(tdvp);
2240 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2241 		return (EINTR);
2242 
2243 	dnlc_remove(tdvp, tnm);
2244 
2245 	douprintf = 1;
2246 
2247 	error = rfs2call(VTOMI(svp), RFS_LINK,
2248 	    xdr_linkargs, (caddr_t)&args,
2249 	    xdr_enum, (caddr_t)&status, cr,
2250 	    &douprintf, &status, 0, NULL);
2251 
2252 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2253 	PURGE_ATTRCACHE(svp);	/* link count changed */
2254 
2255 	if (!error) {
2256 		error = geterrno(status);
2257 		if (!error) {
2258 			if (HAVE_RDDIR_CACHE(tdrp))
2259 				nfs_purge_rddir_cache(tdvp);
2260 		}
2261 	}
2262 
2263 	nfs_rw_exit(&tdrp->r_rwlock);
2264 
2265 	return (error);
2266 }
2267 
2268 static int
2269 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
2270 {
2271 	vnode_t *realvp;
2272 
2273 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2274 		return (EPERM);
2275 	if (VOP_REALVP(ndvp, &realvp) == 0)
2276 		ndvp = realvp;
2277 
2278 	return (nfsrename(odvp, onm, ndvp, nnm, cr));
2279 }
2280 
2281 /*
2282  * nfsrename does the real work of renaming in NFS Version 2.
2283  */
2284 static int
2285 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
2286 {
2287 	int error;
2288 	enum nfsstat status;
2289 	struct nfsrnmargs args;
2290 	int douprintf;
2291 	vnode_t *nvp;
2292 	vnode_t *ovp = NULL;
2293 	char *tmpname;
2294 	rnode_t *rp;
2295 	rnode_t *odrp;
2296 	rnode_t *ndrp;
2297 
2298 	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2299 	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2300 	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2301 		return (EINVAL);
2302 
2303 	odrp = VTOR(odvp);
2304 	ndrp = VTOR(ndvp);
2305 	if ((intptr_t)odrp < (intptr_t)ndrp) {
2306 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2307 			return (EINTR);
2308 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2309 			nfs_rw_exit(&odrp->r_rwlock);
2310 			return (EINTR);
2311 		}
2312 	} else {
2313 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2314 			return (EINTR);
2315 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2316 			nfs_rw_exit(&ndrp->r_rwlock);
2317 			return (EINTR);
2318 		}
2319 	}
2320 
2321 	/*
2322 	 * Lookup the target file.  If it exists, it needs to be
2323 	 * checked to see whether it is a mount point and whether
2324 	 * it is active (open).
2325 	 */
2326 	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2327 	if (!error) {
2328 		/*
2329 		 * If this file has been mounted on, then just
2330 		 * return busy because renaming to it would remove
2331 		 * the mounted file system from the name space.
2332 		 */
2333 		if (vn_mountedvfs(nvp) != NULL) {
2334 			VN_RELE(nvp);
2335 			nfs_rw_exit(&odrp->r_rwlock);
2336 			nfs_rw_exit(&ndrp->r_rwlock);
2337 			return (EBUSY);
2338 		}
2339 
2340 		/*
2341 		 * Purge the name cache of all references to this vnode
2342 		 * so that we can check the reference count to infer
2343 		 * whether it is active or not.
2344 		 */
2345 		/*
2346 		 * First just remove the entry from the name cache, as it
2347 		 * is most likely the only entry for this vp.
2348 		 */
2349 		dnlc_remove(ndvp, nnm);
2350 		/*
2351 		 * If the file has a v_count > 1 then there may be more
2352 		 * than one entry in the name cache due multiple links
2353 		 * or an open file, but we don't have the real reference
2354 		 * count so flush all possible entries.
2355 		 */
2356 		if (nvp->v_count > 1)
2357 			dnlc_purge_vp(nvp);
2358 
2359 		/*
2360 		 * If the vnode is active and is not a directory,
2361 		 * arrange to rename it to a
2362 		 * temporary file so that it will continue to be
2363 		 * accessible.  This implements the "unlink-open-file"
2364 		 * semantics for the target of a rename operation.
2365 		 * Before doing this though, make sure that the
2366 		 * source and target files are not already the same.
2367 		 */
2368 		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2369 			/*
2370 			 * Lookup the source name.
2371 			 */
2372 			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2373 			    cr, 0);
2374 
2375 			/*
2376 			 * The source name *should* already exist.
2377 			 */
2378 			if (error) {
2379 				VN_RELE(nvp);
2380 				nfs_rw_exit(&odrp->r_rwlock);
2381 				nfs_rw_exit(&ndrp->r_rwlock);
2382 				return (error);
2383 			}
2384 
2385 			/*
2386 			 * Compare the two vnodes.  If they are the same,
2387 			 * just release all held vnodes and return success.
2388 			 */
2389 			if (ovp == nvp) {
2390 				VN_RELE(ovp);
2391 				VN_RELE(nvp);
2392 				nfs_rw_exit(&odrp->r_rwlock);
2393 				nfs_rw_exit(&ndrp->r_rwlock);
2394 				return (0);
2395 			}
2396 
2397 			/*
2398 			 * Can't mix and match directories and non-
2399 			 * directories in rename operations.  We already
2400 			 * know that the target is not a directory.  If
2401 			 * the source is a directory, return an error.
2402 			 */
2403 			if (ovp->v_type == VDIR) {
2404 				VN_RELE(ovp);
2405 				VN_RELE(nvp);
2406 				nfs_rw_exit(&odrp->r_rwlock);
2407 				nfs_rw_exit(&ndrp->r_rwlock);
2408 				return (ENOTDIR);
2409 			}
2410 
2411 			/*
2412 			 * The target file exists, is not the same as
2413 			 * the source file, and is active.  Link it
2414 			 * to a temporary filename to avoid having
2415 			 * the server removing the file completely.
2416 			 */
2417 			tmpname = newname();
2418 			error = nfs_link(ndvp, nvp, tmpname, cr);
2419 			if (error == EOPNOTSUPP) {
2420 				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2421 				    cr);
2422 			}
2423 			if (error) {
2424 				kmem_free(tmpname, MAXNAMELEN);
2425 				VN_RELE(ovp);
2426 				VN_RELE(nvp);
2427 				nfs_rw_exit(&odrp->r_rwlock);
2428 				nfs_rw_exit(&ndrp->r_rwlock);
2429 				return (error);
2430 			}
2431 			rp = VTOR(nvp);
2432 			mutex_enter(&rp->r_statelock);
2433 			if (rp->r_unldvp == NULL) {
2434 				VN_HOLD(ndvp);
2435 				rp->r_unldvp = ndvp;
2436 				if (rp->r_unlcred != NULL)
2437 					crfree(rp->r_unlcred);
2438 				crhold(cr);
2439 				rp->r_unlcred = cr;
2440 				rp->r_unlname = tmpname;
2441 			} else {
2442 				kmem_free(rp->r_unlname, MAXNAMELEN);
2443 				rp->r_unlname = tmpname;
2444 			}
2445 			mutex_exit(&rp->r_statelock);
2446 		}
2447 
2448 		VN_RELE(nvp);
2449 	}
2450 
2451 	if (ovp == NULL) {
2452 		/*
2453 		 * When renaming directories to be a subdirectory of a
2454 		 * different parent, the dnlc entry for ".." will no
2455 		 * longer be valid, so it must be removed.
2456 		 *
2457 		 * We do a lookup here to determine whether we are renaming
2458 		 * a directory and we need to check if we are renaming
2459 		 * an unlinked file.  This might have already been done
2460 		 * in previous code, so we check ovp == NULL to avoid
2461 		 * doing it twice.
2462 		 */
2463 
2464 		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2465 
2466 		/*
2467 		 * The source name *should* already exist.
2468 		 */
2469 		if (error) {
2470 			nfs_rw_exit(&odrp->r_rwlock);
2471 			nfs_rw_exit(&ndrp->r_rwlock);
2472 			return (error);
2473 		}
2474 		ASSERT(ovp != NULL);
2475 	}
2476 
2477 	dnlc_remove(odvp, onm);
2478 	dnlc_remove(ndvp, nnm);
2479 
2480 	setdiropargs(&args.rna_from, onm, odvp);
2481 	setdiropargs(&args.rna_to, nnm, ndvp);
2482 
2483 	douprintf = 1;
2484 
2485 	error = rfs2call(VTOMI(odvp), RFS_RENAME,
2486 	    xdr_rnmargs, (caddr_t)&args,
2487 	    xdr_enum, (caddr_t)&status, cr,
2488 	    &douprintf, &status, 0, NULL);
2489 
2490 	PURGE_ATTRCACHE(odvp);	/* mod time changed */
2491 	PURGE_ATTRCACHE(ndvp);	/* mod time changed */
2492 
2493 	if (!error) {
2494 		error = geterrno(status);
2495 		if (!error) {
2496 			if (HAVE_RDDIR_CACHE(odrp))
2497 				nfs_purge_rddir_cache(odvp);
2498 			if (HAVE_RDDIR_CACHE(ndrp))
2499 				nfs_purge_rddir_cache(ndvp);
2500 			/*
2501 			 * when renaming directories to be a subdirectory of a
2502 			 * different parent, the dnlc entry for ".." will no
2503 			 * longer be valid, so it must be removed
2504 			 */
2505 			rp = VTOR(ovp);
2506 			if (ndvp != odvp) {
2507 				if (ovp->v_type == VDIR) {
2508 					dnlc_remove(ovp, "..");
2509 					if (HAVE_RDDIR_CACHE(rp))
2510 						nfs_purge_rddir_cache(ovp);
2511 				}
2512 			}
2513 
2514 			/*
2515 			 * If we are renaming the unlinked file, update the
2516 			 * r_unldvp and r_unlname as needed.
2517 			 */
2518 			mutex_enter(&rp->r_statelock);
2519 			if (rp->r_unldvp != NULL) {
2520 				if (strcmp(rp->r_unlname, onm) == 0) {
2521 					(void) strncpy(rp->r_unlname,
2522 						    nnm, MAXNAMELEN);
2523 					rp->r_unlname[MAXNAMELEN - 1] = '\0';
2524 
2525 					if (ndvp != rp->r_unldvp) {
2526 						VN_RELE(rp->r_unldvp);
2527 						rp->r_unldvp = ndvp;
2528 						VN_HOLD(ndvp);
2529 					}
2530 				}
2531 			}
2532 			mutex_exit(&rp->r_statelock);
2533 		} else {
2534 			/*
2535 			 * System V defines rename to return EEXIST, not
2536 			 * ENOTEMPTY if the target directory is not empty.
2537 			 * Over the wire, the error is NFSERR_ENOTEMPTY
2538 			 * which geterrno maps to ENOTEMPTY.
2539 			 */
2540 			if (error == ENOTEMPTY)
2541 				error = EEXIST;
2542 		}
2543 	}
2544 
2545 	VN_RELE(ovp);
2546 
2547 	nfs_rw_exit(&odrp->r_rwlock);
2548 	nfs_rw_exit(&ndrp->r_rwlock);
2549 
2550 	return (error);
2551 }
2552 
2553 static int
2554 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr)
2555 {
2556 	int error;
2557 	struct nfscreatargs args;
2558 	struct nfsdiropres dr;
2559 	int douprintf;
2560 	rnode_t *drp;
2561 	hrtime_t t;
2562 
2563 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2564 		return (EPERM);
2565 
2566 	setdiropargs(&args.ca_da, nm, dvp);
2567 
2568 	/*
2569 	 * Decide what the group-id and set-gid bit of the created directory
2570 	 * should be.  May have to do a setattr to get the gid right.
2571 	 */
2572 	error = setdirgid(dvp, &va->va_gid, cr);
2573 	if (error)
2574 		return (error);
2575 	error = setdirmode(dvp, &va->va_mode, cr);
2576 	if (error)
2577 		return (error);
2578 	va->va_mask |= AT_MODE|AT_GID;
2579 
2580 	args.ca_sa = &args.ca_sa_buf;
2581 	error = vattr_to_sattr(va, args.ca_sa);
2582 	if (error) {
2583 		/* req time field(s) overflow - return immediately */
2584 		return (error);
2585 	}
2586 
2587 	drp = VTOR(dvp);
2588 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2589 		return (EINTR);
2590 
2591 	dnlc_remove(dvp, nm);
2592 
2593 	douprintf = 1;
2594 
2595 	t = gethrtime();
2596 
2597 	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2598 	    xdr_creatargs, (caddr_t)&args,
2599 	    xdr_diropres, (caddr_t)&dr, cr,
2600 	    &douprintf, &dr.dr_status, 0, NULL);
2601 
2602 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2603 
2604 	if (!error) {
2605 		error = geterrno(dr.dr_status);
2606 		if (!error) {
2607 			if (HAVE_RDDIR_CACHE(drp))
2608 				nfs_purge_rddir_cache(dvp);
2609 			/*
2610 			 * The attributes returned by RFS_MKDIR can not
2611 			 * be depended upon, so mark the attribute cache
2612 			 * as purged.  A subsequent GETATTR will get the
2613 			 * correct attributes from the server.
2614 			 */
2615 			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2616 			    dvp->v_vfsp, t, cr, NULL, NULL);
2617 			PURGE_ATTRCACHE(*vpp);
2618 			dnlc_update(dvp, nm, *vpp);
2619 
2620 			/*
2621 			 * Make sure the gid was set correctly.
2622 			 * If not, try to set it (but don't lose
2623 			 * any sleep over it).
2624 			 */
2625 			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2626 				va->va_mask = AT_GID;
2627 				(void) nfssetattr(*vpp, va, 0, cr);
2628 			}
2629 		} else {
2630 			PURGE_STALE_FH(error, dvp, cr);
2631 		}
2632 	}
2633 
2634 	nfs_rw_exit(&drp->r_rwlock);
2635 
2636 	return (error);
2637 }
2638 
2639 static int
2640 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr)
2641 {
2642 	int error;
2643 	enum nfsstat status;
2644 	struct nfsdiropargs da;
2645 	vnode_t *vp;
2646 	int douprintf;
2647 	rnode_t *drp;
2648 
2649 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2650 		return (EPERM);
2651 	drp = VTOR(dvp);
2652 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2653 		return (EINTR);
2654 
2655 	/*
2656 	 * Attempt to prevent a rmdir(".") from succeeding.
2657 	 */
2658 	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2659 	if (error) {
2660 		nfs_rw_exit(&drp->r_rwlock);
2661 		return (error);
2662 	}
2663 
2664 	if (vp == cdir) {
2665 		VN_RELE(vp);
2666 		nfs_rw_exit(&drp->r_rwlock);
2667 		return (EINVAL);
2668 	}
2669 
2670 	setdiropargs(&da, nm, dvp);
2671 
2672 	/*
2673 	 * First just remove the entry from the name cache, as it
2674 	 * is most likely an entry for this vp.
2675 	 */
2676 	dnlc_remove(dvp, nm);
2677 
2678 	/*
2679 	 * If there vnode reference count is greater than one, then
2680 	 * there may be additional references in the DNLC which will
2681 	 * need to be purged.  First, trying removing the entry for
2682 	 * the parent directory and see if that removes the additional
2683 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2684 	 * to completely remove any references to the directory which
2685 	 * might still exist in the DNLC.
2686 	 */
2687 	if (vp->v_count > 1) {
2688 		dnlc_remove(vp, "..");
2689 		if (vp->v_count > 1)
2690 			dnlc_purge_vp(vp);
2691 	}
2692 
2693 	douprintf = 1;
2694 
2695 	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2696 	    xdr_diropargs, (caddr_t)&da,
2697 	    xdr_enum, (caddr_t)&status, cr,
2698 	    &douprintf, &status, 0, NULL);
2699 
2700 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2701 
2702 	if (error) {
2703 		VN_RELE(vp);
2704 		nfs_rw_exit(&drp->r_rwlock);
2705 		return (error);
2706 	}
2707 
2708 	error = geterrno(status);
2709 	if (!error) {
2710 		if (HAVE_RDDIR_CACHE(drp))
2711 			nfs_purge_rddir_cache(dvp);
2712 		if (HAVE_RDDIR_CACHE(VTOR(vp)))
2713 			nfs_purge_rddir_cache(vp);
2714 	} else {
2715 		PURGE_STALE_FH(error, dvp, cr);
2716 		/*
2717 		 * System V defines rmdir to return EEXIST, not
2718 		 * ENOTEMPTY if the directory is not empty.  Over
2719 		 * the wire, the error is NFSERR_ENOTEMPTY which
2720 		 * geterrno maps to ENOTEMPTY.
2721 		 */
2722 		if (error == ENOTEMPTY)
2723 			error = EEXIST;
2724 	}
2725 
2726 	VN_RELE(vp);
2727 
2728 	nfs_rw_exit(&drp->r_rwlock);
2729 
2730 	return (error);
2731 }
2732 
2733 static int
2734 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr)
2735 {
2736 	int error;
2737 	struct nfsslargs args;
2738 	enum nfsstat status;
2739 	int douprintf;
2740 	rnode_t *drp;
2741 
2742 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2743 		return (EPERM);
2744 	setdiropargs(&args.sla_from, lnm, dvp);
2745 	args.sla_sa = &args.sla_sa_buf;
2746 	error = vattr_to_sattr(tva, args.sla_sa);
2747 	if (error) {
2748 		/* req time field(s) overflow - return immediately */
2749 		return (error);
2750 	}
2751 	args.sla_tnm = tnm;
2752 
2753 	drp = VTOR(dvp);
2754 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2755 		return (EINTR);
2756 
2757 	dnlc_remove(dvp, lnm);
2758 
2759 	douprintf = 1;
2760 
2761 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2762 	    xdr_slargs, (caddr_t)&args,
2763 	    xdr_enum, (caddr_t)&status, cr,
2764 	    &douprintf, &status, 0, NULL);
2765 
2766 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2767 
2768 	if (!error) {
2769 		error = geterrno(status);
2770 		if (!error) {
2771 			if (HAVE_RDDIR_CACHE(drp))
2772 				nfs_purge_rddir_cache(dvp);
2773 		} else {
2774 			PURGE_STALE_FH(error, dvp, cr);
2775 		}
2776 	}
2777 
2778 	nfs_rw_exit(&drp->r_rwlock);
2779 
2780 	return (error);
2781 }
2782 
/*
 * Readdir cache statistics, present only in DEBUG kernels.  All of them
 * are incremented in nfs_readdir():
 *
 *	nfs_readdir_cache_hits		an entry that is neither being
 *					filled nor in need of a refill was
 *					found, and this call had not gone
 *					to the server before finding it
 *	nfs_readdir_cache_shorts	the request started at the recorded
 *					end-of-directory cookie and was
 *					answered without looking any further
 *	nfs_readdir_cache_waits		the caller waited for an entry that
 *					another thread was filling
 *	nfs_readdir_cache_misses	nfsreaddir() was called to fill an
 *					entry
 *	nfs_readdir_readahead		an asynchronous readdir for the
 *					following chunk was started
 */
2783 #ifdef DEBUG
2784 static int nfs_readdir_cache_hits = 0;
2785 static int nfs_readdir_cache_shorts = 0;
2786 static int nfs_readdir_cache_waits = 0;
2787 static int nfs_readdir_cache_misses = 0;
2788 static int nfs_readdir_readahead = 0;
2789 #endif
2790 
/*
 * Compatibility switch: when nonzero, nfs_readdir() limits the size of a
 * readdir request to 0x400 bytes instead of NFS_MAXDATA.
 */
2791 static int nfs_shrinkreaddir = 0;
2792 
2793 /*
2794  * Read directory entries.
2795  * There are some weird things to look out for here.  The uio_offset
2796  * field is either 0 or it is the offset returned from a previous
2797  * readdir.  It is an opaque value used by the server to find the
2798  * correct directory block to read. The count field is the number
2799  * of blocks to read on the server.  This is advisory only, the server
2800  * may return only one block's worth of entries.  Entries may be compressed
2801  * on the server.
2802  */
2803 static int
2804 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp)
2805 {
2806 	int error;
2807 	size_t count;
2808 	rnode_t *rp;
2809 	rddir_cache *rdc;
2810 	rddir_cache *nrdc;
2811 	rddir_cache *rrdc;
2812 #ifdef DEBUG
2813 	int missed;
2814 #endif
2815 	rddir_cache srdc;
2816 	avl_index_t where;
2817 
2818 	rp = VTOR(vp);
2819 
2820 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2821 	if (nfs_zone() != VTOMI(vp)->mi_zone)
2822 		return (EIO);
2823 	/*
2824 	 * Make sure that the directory cache is valid.
2825 	 */
2826 	if (HAVE_RDDIR_CACHE(rp)) {
2827 		if (nfs_disable_rddir_cache) {
2828 			/*
2829 			 * Setting nfs_disable_rddir_cache in /etc/system
2830 			 * allows interoperability with servers that do not
2831 			 * properly update the attributes of directories.
2832 			 * Any cached information gets purged before an
2833 			 * access is made to it.
2834 			 */
2835 			nfs_purge_rddir_cache(vp);
2836 		} else {
2837 			error = nfs_validate_caches(vp, cr);
2838 			if (error)
2839 				return (error);
2840 		}
2841 	}
2842 
2843 	/*
2844 	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
2845 	 * RFS_READDIR request with rda_count set to more than 0x400. So
2846 	 * we reduce the request size here purely for compatibility.
2847 	 *
2848 	 * In general, this is no longer required.  However, if a server
2849 	 * is discovered which can not handle requests larger than 1024,
2850 	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
2851 	 * compatibility.
2852 	 *
2853 	 * In any case, the request size is limited to NFS_MAXDATA bytes.
2854 	 */
2855 	count = MIN(uiop->uio_iov->iov_len,
2856 	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
2857 
2858 	nrdc = NULL;
2859 #ifdef DEBUG
2860 	missed = 0;
2861 #endif
2862 top:
2863 	/*
2864 	 * Short circuit last readdir which always returns 0 bytes.
2865 	 * This can be done after the directory has been read through
2866 	 * completely at least once.  This will set r_direof which
2867 	 * can be used to find the value of the last cookie.
2868 	 */
2869 	mutex_enter(&rp->r_statelock);
2870 	if (rp->r_direof != NULL &&
2871 	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
2872 		mutex_exit(&rp->r_statelock);
2873 #ifdef DEBUG
2874 		nfs_readdir_cache_shorts++;
2875 #endif
2876 		if (eofp)
2877 			*eofp = 1;
2878 		if (nrdc != NULL)
2879 			rddir_cache_rele(nrdc);
2880 		return (0);
2881 	}
2882 	/*
2883 	 * Look for a cache entry.  Cache entries are identified
2884 	 * by the NFS cookie value and the byte count requested.
2885 	 */
2886 	srdc.nfs_cookie = uiop->uio_offset;
2887 	srdc.buflen = count;
2888 	rdc = avl_find(&rp->r_dir, &srdc, &where);
2889 	if (rdc != NULL) {
2890 		rddir_cache_hold(rdc);
2891 		/*
2892 		 * If the cache entry is in the process of being
2893 		 * filled in, wait until this completes.  The
2894 		 * RDDIRWAIT bit is set to indicate that someone
2895 		 * is waiting and then the thread currently
2896 		 * filling the entry is done, it should do a
2897 		 * cv_broadcast to wakeup all of the threads
2898 		 * waiting for it to finish.
2899 		 */
2900 		if (rdc->flags & RDDIR) {
2901 			nfs_rw_exit(&rp->r_rwlock);
2902 			rdc->flags |= RDDIRWAIT;
2903 #ifdef DEBUG
2904 			nfs_readdir_cache_waits++;
2905 #endif
2906 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2907 				/*
2908 				 * We got interrupted, probably
2909 				 * the user typed ^C or an alarm
2910 				 * fired.  We free the new entry
2911 				 * if we allocated one.
2912 				 */
2913 				mutex_exit(&rp->r_statelock);
2914 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
2915 					RW_READER, FALSE);
2916 				rddir_cache_rele(rdc);
2917 				if (nrdc != NULL)
2918 					rddir_cache_rele(nrdc);
2919 				return (EINTR);
2920 			}
2921 			mutex_exit(&rp->r_statelock);
2922 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
2923 				RW_READER, FALSE);
2924 			rddir_cache_rele(rdc);
2925 			goto top;
2926 		}
2927 		/*
2928 		 * Check to see if a readdir is required to
2929 		 * fill the entry.  If so, mark this entry
2930 		 * as being filled, remove our reference,
2931 		 * and branch to the code to fill the entry.
2932 		 */
2933 		if (rdc->flags & RDDIRREQ) {
2934 			rdc->flags &= ~RDDIRREQ;
2935 			rdc->flags |= RDDIR;
2936 			if (nrdc != NULL)
2937 				rddir_cache_rele(nrdc);
2938 			nrdc = rdc;
2939 			mutex_exit(&rp->r_statelock);
2940 			goto bottom;
2941 		}
2942 #ifdef DEBUG
2943 		if (!missed)
2944 			nfs_readdir_cache_hits++;
2945 #endif
2946 		/*
2947 		 * If an error occurred while attempting
2948 		 * to fill the cache entry, just return it.
2949 		 */
2950 		if (rdc->error) {
2951 			error = rdc->error;
2952 			mutex_exit(&rp->r_statelock);
2953 			rddir_cache_rele(rdc);
2954 			if (nrdc != NULL)
2955 				rddir_cache_rele(nrdc);
2956 			return (error);
2957 		}
2958 
2959 		/*
2960 		 * The cache entry is complete and good,
2961 		 * copyout the dirent structs to the calling
2962 		 * thread.
2963 		 */
2964 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
2965 
2966 		/*
2967 		 * If no error occurred during the copyout,
2968 		 * update the offset in the uio struct to
2969 		 * contain the value of the next cookie
2970 		 * and set the eof value appropriately.
2971 		 */
2972 		if (!error) {
2973 			uiop->uio_offset = rdc->nfs_ncookie;
2974 			if (eofp)
2975 				*eofp = rdc->eof;
2976 		}
2977 
2978 		/*
2979 		 * Decide whether to do readahead.  Don't if
2980 		 * have already read to the end of directory.
2981 		 */
2982 		if (rdc->eof) {
2983 			rp->r_direof = rdc;
2984 			mutex_exit(&rp->r_statelock);
2985 			rddir_cache_rele(rdc);
2986 			if (nrdc != NULL)
2987 				rddir_cache_rele(nrdc);
2988 			return (error);
2989 		}
2990 
2991 		/*
2992 		 * Check to see whether we found an entry
2993 		 * for the readahead.  If so, we don't need
2994 		 * to do anything further, so free the new
2995 		 * entry if one was allocated.  Otherwise,
2996 		 * allocate a new entry, add it to the cache,
2997 		 * and then initiate an asynchronous readdir
2998 		 * operation to fill it.
2999 		 */
3000 		srdc.nfs_cookie = rdc->nfs_ncookie;
3001 		srdc.buflen = count;
3002 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3003 		if (rrdc != NULL) {
3004 			if (nrdc != NULL)
3005 				rddir_cache_rele(nrdc);
3006 		} else {
3007 			if (nrdc != NULL)
3008 				rrdc = nrdc;
3009 			else {
3010 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3011 			}
3012 			if (rrdc != NULL) {
3013 				rrdc->nfs_cookie = rdc->nfs_ncookie;
3014 				rrdc->buflen = count;
3015 				avl_insert(&rp->r_dir, rrdc, where);
3016 				rddir_cache_hold(rrdc);
3017 				mutex_exit(&rp->r_statelock);
3018 				rddir_cache_rele(rdc);
3019 #ifdef DEBUG
3020 				nfs_readdir_readahead++;
3021 #endif
3022 				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3023 				return (error);
3024 			}
3025 		}
3026 
3027 		mutex_exit(&rp->r_statelock);
3028 		rddir_cache_rele(rdc);
3029 		return (error);
3030 	}
3031 
3032 	/*
3033 	 * Didn't find an entry in the cache.  Construct a new empty
3034 	 * entry and link it into the cache.  Other processes attempting
3035 	 * to access this entry will need to wait until it is filled in.
3036 	 *
3037 	 * Since kmem_alloc may block, another pass through the cache
3038 	 * will need to be taken to make sure that another process
3039 	 * hasn't already added an entry to the cache for this request.
3040 	 */
3041 	if (nrdc == NULL) {
3042 		mutex_exit(&rp->r_statelock);
3043 		nrdc = rddir_cache_alloc(KM_SLEEP);
3044 		nrdc->nfs_cookie = uiop->uio_offset;
3045 		nrdc->buflen = count;
3046 		goto top;
3047 	}
3048 
3049 	/*
3050 	 * Add this entry to the cache.
3051 	 */
3052 	avl_insert(&rp->r_dir, nrdc, where);
3053 	rddir_cache_hold(nrdc);
3054 	mutex_exit(&rp->r_statelock);
3055 
3056 bottom:
3057 #ifdef DEBUG
3058 	missed = 1;
3059 	nfs_readdir_cache_misses++;
3060 #endif
3061 	/*
3062 	 * Do the readdir.
3063 	 */
3064 	error = nfsreaddir(vp, nrdc, cr);
3065 
3066 	/*
3067 	 * If this operation failed, just return the error which occurred.
3068 	 */
3069 	if (error != 0)
3070 		return (error);
3071 
3072 	/*
3073 	 * Since the RPC operation will have taken sometime and blocked
3074 	 * this process, another pass through the cache will need to be
3075 	 * taken to find the correct cache entry.  It is possible that
3076 	 * the correct cache entry will not be there (although one was
3077 	 * added) because the directory changed during the RPC operation
3078 	 * and the readdir cache was flushed.  In this case, just start
3079 	 * over.  It is hoped that this will not happen too often... :-)
3080 	 */
3081 	nrdc = NULL;
3082 	goto top;
3083 	/* NOTREACHED */
3084 }
3085 
3086 static int
3087 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3088 {
3089 	int error;
3090 	struct nfsrddirargs rda;
3091 	struct nfsrddirres rd;
3092 	rnode_t *rp;
3093 	mntinfo_t *mi;
3094 	uint_t count;
3095 	int douprintf;
3096 	failinfo_t fi, *fip;
3097 
3098 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3099 	count = rdc->buflen;
3100 
3101 	rp = VTOR(vp);
3102 	mi = VTOMI(vp);
3103 
3104 	rda.rda_fh = *VTOFH(vp);
3105 	rda.rda_offset = rdc->nfs_cookie;
3106 
3107 	/*
3108 	 * NFS client failover support
3109 	 * suppress failover unless we have a zero cookie
3110 	 */
3111 	if (rdc->nfs_cookie == (off_t)0) {
3112 		fi.vp = vp;
3113 		fi.fhp = (caddr_t)&rda.rda_fh;
3114 		fi.copyproc = nfscopyfh;
3115 		fi.lookupproc = nfslookup;
3116 		fi.xattrdirproc = acl_getxattrdir2;
3117 		fip = &fi;
3118 	} else {
3119 		fip = NULL;
3120 	}
3121 
3122 	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3123 	rd.rd_size = count;
3124 	rd.rd_offset = rda.rda_offset;
3125 
3126 	douprintf = 1;
3127 
3128 	if (mi->mi_io_kstats) {
3129 		mutex_enter(&mi->mi_lock);
3130 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3131 		mutex_exit(&mi->mi_lock);
3132 	}
3133 
3134 	do {
3135 		rda.rda_count = MIN(count, mi->mi_curread);
3136 		error = rfs2call(mi, RFS_READDIR,
3137 		    xdr_rddirargs, (caddr_t)&rda,
3138 		    xdr_getrddirres, (caddr_t)&rd, cr,
3139 		    &douprintf, &rd.rd_status, 0, fip);
3140 	} while (error == ENFS_TRYAGAIN);
3141 
3142 	if (mi->mi_io_kstats) {
3143 		mutex_enter(&mi->mi_lock);
3144 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3145 		mutex_exit(&mi->mi_lock);
3146 	}
3147 
3148 	/*
3149 	 * Since we are actually doing a READDIR RPC, we must have
3150 	 * exclusive access to the cache entry being filled.  Thus,
3151 	 * it is safe to update all fields except for the flags
3152 	 * field.  The r_statelock in the rnode must be held to
3153 	 * prevent two different threads from simultaneously
3154 	 * attempting to update the flags field.  This can happen
3155 	 * if we are turning off RDDIR and the other thread is
3156 	 * trying to set RDDIRWAIT.
3157 	 */
3158 	ASSERT(rdc->flags & RDDIR);
3159 	if (!error) {
3160 		error = geterrno(rd.rd_status);
3161 		if (!error) {
3162 			rdc->nfs_ncookie = rd.rd_offset;
3163 			rdc->eof = rd.rd_eof ? 1 : 0;
3164 			rdc->entlen = rd.rd_size;
3165 			ASSERT(rdc->entlen <= rdc->buflen);
3166 #ifdef DEBUG
3167 			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3168 			    KM_SLEEP);
3169 #else
3170 			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3171 #endif
3172 			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
3173 			rdc->error = 0;
3174 			if (mi->mi_io_kstats) {
3175 				mutex_enter(&mi->mi_lock);
3176 				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3177 				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3178 				    rd.rd_size;
3179 				mutex_exit(&mi->mi_lock);
3180 			}
3181 		} else {
3182 			PURGE_STALE_FH(error, vp, cr);
3183 		}
3184 	}
3185 	if (error) {
3186 		rdc->entries = NULL;
3187 		rdc->error = error;
3188 	}
3189 	kmem_free(rd.rd_entries, rdc->buflen);
3190 
3191 	mutex_enter(&rp->r_statelock);
3192 	rdc->flags &= ~RDDIR;
3193 	if (rdc->flags & RDDIRWAIT) {
3194 		rdc->flags &= ~RDDIRWAIT;
3195 		cv_broadcast(&rdc->cv);
3196 	}
3197 	if (error)
3198 		rdc->flags |= RDDIRREQ;
3199 	mutex_exit(&rp->r_statelock);
3200 
3201 	rddir_cache_rele(rdc);
3202 
3203 	return (error);
3204 }
3205 
#ifdef DEBUG
/*
 * Debugging knob (DEBUG kernels only).  When set to a nonzero value,
 * nfs_bio() calls debug_enter() after warning about a zero length write.
 */
static int nfs_bio_do_stop = 0;
#endif
3209 
3210 static int
3211 nfs_bio(struct buf *bp, cred_t *cr)
3212 {
3213 	rnode_t *rp = VTOR(bp->b_vp);
3214 	int count;
3215 	int error;
3216 	cred_t *cred;
3217 	uint_t offset;
3218 
3219 	DTRACE_IO1(start, struct buf *, bp);
3220 
3221 	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
3222 	offset = dbtob(bp->b_blkno);
3223 
3224 	if (bp->b_flags & B_READ) {
3225 		mutex_enter(&rp->r_statelock);
3226 		if (rp->r_cred != NULL) {
3227 			cred = rp->r_cred;
3228 			crhold(cred);
3229 		} else {
3230 			rp->r_cred = cr;
3231 			crhold(cr);
3232 			cred = cr;
3233 			crhold(cred);
3234 		}
3235 		mutex_exit(&rp->r_statelock);
3236 	read_again:
3237 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3238 		    offset, bp->b_bcount, &bp->b_resid, cred);
3239 		crfree(cred);
3240 		if (!error) {
3241 			if (bp->b_resid) {
3242 				/*
3243 				 * Didn't get it all because we hit EOF,
3244 				 * zero all the memory beyond the EOF.
3245 				 */
3246 				/* bzero(rdaddr + */
3247 				bzero(bp->b_un.b_addr +
3248 				    bp->b_bcount - bp->b_resid, bp->b_resid);
3249 			}
3250 			mutex_enter(&rp->r_statelock);
3251 			if (bp->b_resid == bp->b_bcount &&
3252 			    offset >= rp->r_size) {
3253 				/*
3254 				 * We didn't read anything at all as we are
3255 				 * past EOF.  Return an error indicator back
3256 				 * but don't destroy the pages (yet).
3257 				 */
3258 				error = NFS_EOF;
3259 			}
3260 			mutex_exit(&rp->r_statelock);
3261 		} else if (error == EACCES) {
3262 			mutex_enter(&rp->r_statelock);
3263 			if (cred != cr) {
3264 				if (rp->r_cred != NULL)
3265 					crfree(rp->r_cred);
3266 				rp->r_cred = cr;
3267 				crhold(cr);
3268 				cred = cr;
3269 				crhold(cred);
3270 				mutex_exit(&rp->r_statelock);
3271 				goto read_again;
3272 			}
3273 			mutex_exit(&rp->r_statelock);
3274 		}
3275 	} else {
3276 		if (!(rp->r_flags & RSTALE)) {
3277 			mutex_enter(&rp->r_statelock);
3278 			if (rp->r_cred != NULL) {
3279 				cred = rp->r_cred;
3280 				crhold(cred);
3281 			} else {
3282 				rp->r_cred = cr;
3283 				crhold(cr);
3284 				cred = cr;
3285 				crhold(cred);
3286 			}
3287 			mutex_exit(&rp->r_statelock);
3288 		write_again:
3289 			mutex_enter(&rp->r_statelock);
3290 			count = MIN(bp->b_bcount, rp->r_size - offset);
3291 			mutex_exit(&rp->r_statelock);
3292 			if (count < 0)
3293 				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3294 #ifdef DEBUG
3295 			if (count == 0) {
3296 				zcmn_err(getzoneid(), CE_WARN,
3297 				    "nfs_bio: zero length write at %d",
3298 				    offset);
3299 				nfs_printfhandle(&rp->r_fh);
3300 				if (nfs_bio_do_stop)
3301 					debug_enter("nfs_bio");
3302 			}
3303 #endif
3304 			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3305 			    count, cred);
3306 			if (error == EACCES) {
3307 				mutex_enter(&rp->r_statelock);
3308 				if (cred != cr) {
3309 					if (rp->r_cred != NULL)
3310 						crfree(rp->r_cred);
3311 					rp->r_cred = cr;
3312 					crhold(cr);
3313 					crfree(cred);
3314 					cred = cr;
3315 					crhold(cred);
3316 					mutex_exit(&rp->r_statelock);
3317 					goto write_again;
3318 				}
3319 				mutex_exit(&rp->r_statelock);
3320 			}
3321 			bp->b_error = error;
3322 			if (error && error != EINTR) {
3323 				/*
3324 				 * Don't print EDQUOT errors on the console.
3325 				 * Don't print asynchronous EACCES errors.
3326 				 * Don't print EFBIG errors.
3327 				 * Print all other write errors.
3328 				 */
3329 				if (error != EDQUOT && error != EFBIG &&
3330 				    (error != EACCES ||
3331 				    !(bp->b_flags & B_ASYNC)))
3332 					nfs_write_error(bp->b_vp, error, cred);
3333 				/*
3334 				 * Update r_error and r_flags as appropriate.
3335 				 * If the error was ESTALE, then mark the
3336 				 * rnode as not being writeable and save
3337 				 * the error status.  Otherwise, save any
3338 				 * errors which occur from asynchronous
3339 				 * page invalidations.  Any errors occurring
3340 				 * from other operations should be saved
3341 				 * by the caller.
3342 				 */
3343 				mutex_enter(&rp->r_statelock);
3344 				if (error == ESTALE) {
3345 					rp->r_flags |= RSTALE;
3346 					if (!rp->r_error)
3347 						rp->r_error = error;
3348 				} else if (!rp->r_error &&
3349 				    (bp->b_flags &
3350 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
3351 				    (B_INVAL|B_FORCE|B_ASYNC)) {
3352 					rp->r_error = error;
3353 				}
3354 				mutex_exit(&rp->r_statelock);
3355 			}
3356 			crfree(cred);
3357 		} else
3358 			error = rp->r_error;
3359 	}
3360 
3361 	if (error != 0 && error != NFS_EOF)
3362 		bp->b_flags |= B_ERROR;
3363 
3364 	DTRACE_IO1(done, struct buf *, bp);
3365 
3366 	return (error);
3367 }
3368 
3369 static int
3370 nfs_fid(vnode_t *vp, fid_t *fidp)
3371 {
3372 	struct nfs_fid *fp;
3373 	rnode_t *rp;
3374 
3375 	rp = VTOR(vp);
3376 
3377 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3378 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3379 		return (ENOSPC);
3380 	}
3381 	fp = (struct nfs_fid *)fidp;
3382 	fp->nf_pad = 0;
3383 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3384 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3385 	return (0);
3386 }
3387 
3388 /* ARGSUSED2 */
3389 static int
3390 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3391 {
3392 	rnode_t *rp = VTOR(vp);
3393 
3394 	if (!write_lock) {
3395 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3396 		return (V_WRITELOCK_FALSE);
3397 	}
3398 
3399 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3400 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3401 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3402 			return (V_WRITELOCK_FALSE);
3403 		nfs_rw_exit(&rp->r_rwlock);
3404 	}
3405 
3406 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3407 	return (V_WRITELOCK_TRUE);
3408 }
3409 
3410 /* ARGSUSED */
3411 static void
3412 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3413 {
3414 	rnode_t *rp = VTOR(vp);
3415 
3416 	nfs_rw_exit(&rp->r_rwlock);
3417 }
3418 
3419 /* ARGSUSED */
3420 static int
3421 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
3422 {
3423 
3424 	/*
3425 	 * Because we stuff the readdir cookie into the offset field
3426 	 * someone may attempt to do an lseek with the cookie which
3427 	 * we want to succeed.
3428 	 */
3429 	if (vp->v_type == VDIR)
3430 		return (0);
3431 	if (*noffp < 0 || *noffp > MAXOFF32_T)
3432 		return (EINVAL);
3433 	return (0);
3434 }
3435 
/*
 * number of NFS_MAXDATA blocks to read ahead
 * optimized for 100 base-T.
 *
 * nfs_getapage() uses this as the upper bound on the number of
 * asynchronous readahead requests it queues for one page fault.
 */
static int nfs_nra = 4;

#ifdef DEBUG
/*
 * Incremented by nfs_getapage() each time a page reported by
 * page_exists() could no longer be obtained with page_lookup().
 */
static int nfs_lostpage = 0;	/* number of times we lost original page */
#endif
3445 
3446 /*
3447  * Return all the pages from [off..off+len) in file
3448  */
3449 static int
3450 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3451 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3452 	enum seg_rw rw, cred_t *cr)
3453 {
3454 	rnode_t *rp;
3455 	int error;
3456 	mntinfo_t *mi;
3457 
3458 	if (vp->v_flag & VNOMAP)
3459 		return (ENOSYS);
3460 
3461 	ASSERT(off <= MAXOFF32_T);
3462 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3463 		return (EIO);
3464 	if (protp != NULL)
3465 		*protp = PROT_ALL;
3466 
3467 	/*
3468 	 * Now valididate that the caches are up to date.
3469 	 */
3470 	error = nfs_validate_caches(vp, cr);
3471 	if (error)
3472 		return (error);
3473 
3474 	rp = VTOR(vp);
3475 	mi = VTOMI(vp);
3476 retry:
3477 	mutex_enter(&rp->r_statelock);
3478 
3479 	/*
3480 	 * Don't create dirty pages faster than they
3481 	 * can be cleaned so that the system doesn't
3482 	 * get imbalanced.  If the async queue is
3483 	 * maxed out, then wait for it to drain before
3484 	 * creating more dirty pages.  Also, wait for
3485 	 * any threads doing pagewalks in the vop_getattr
3486 	 * entry points so that they don't block for
3487 	 * long periods.
3488 	 */
3489 	if (rw == S_CREATE) {
3490 		while ((mi->mi_max_threads != 0 &&
3491 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3492 		    rp->r_gcount > 0)
3493 			cv_wait(&rp->r_cv, &rp->r_statelock);
3494 	}
3495 
3496 	/*
3497 	 * If we are getting called as a side effect of an nfs_write()
3498 	 * operation the local file size might not be extended yet.
3499 	 * In this case we want to be able to return pages of zeroes.
3500 	 */
3501 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3502 		mutex_exit(&rp->r_statelock);
3503 		return (EFAULT);		/* beyond EOF */
3504 	}
3505 
3506 	mutex_exit(&rp->r_statelock);
3507 
3508 	if (len <= PAGESIZE) {
3509 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3510 		    seg, addr, rw, cr);
3511 	} else {
3512 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3513 		    pl, plsz, seg, addr, rw, cr);
3514 	}
3515 
3516 	switch (error) {
3517 	case NFS_EOF:
3518 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3519 		goto retry;
3520 	case ESTALE:
3521 		PURGE_STALE_FH(error, vp, cr);
3522 	}
3523 
3524 	return (error);
3525 }
3526 
/*
 * Called from pvn_getpages or nfs_getpage to get a particular page.
 *
 * For sequential access, up to nfs_nra asynchronous readaheads are
 * queued first.  Then, if the page at `off' is not already in the page
 * cache, it is obtained in one of three ways:
 *
 *   pl == NULL		an asynchronous readahead of the containing block
 *			is queued; no page is returned.
 *   rw == S_CREATE	a new page is created with page_create_va(); no
 *			read is issued for it.
 *   otherwise		a kluster of pages is assembled with
 *			pvn_read_kluster() and either filled by nfs_bio()
 *			or, for segkmap accesses at or beyond r_size,
 *			simply zeroed.
 *
 * On success with pl != NULL, the page list is returned through pl[].
 *
 * Returns 0 on success, EIO when called from the wrong zone, EFAULT when
 * a non-segkmap access runs into NFS_EOF, or the error from nfs_bio().
 */
/* ARGSUSED */
static int
nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, cred_t *cr)
{
	rnode_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window; /* readahead window */
	page_t *pagefound;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	rp = VTOR(vp);
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	/*
	 * Restart point used when a page that was seen in the cache has
	 * disappeared by the time we try to lock it.
	 */
reread:
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	/* Start of the bsize-sized block containing `off'. */
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs_nra - 1) depending
		 *    upon how far the readahead window is open or close.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs_nra;
		else if (rp->r_nextr > blkoff &&
				((ra_window = (rp->r_nextr - blkoff) / bsize)
					<= (nfs_nra - 1)))
			readahead = nfs_nra - ra_window;
		else
			readahead = 0;

		/*
		 * Queue readaheads for the blocks following r_nextr, but
		 * never for a block starting at or beyond r_size.  The
		 * rnode lock is dropped around each queueing call.
		 */
		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			mutex_exit(&rp->r_statelock);
			if (nfs_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off), seg, cr,
			    nfs_readahead) < 0) {
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * page_exists() only tells us whether the page is present; it is
	 * actually locked with page_lookup() further down.
	 */
again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			/* Caller wants no pages back: just start a read. */
			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a BLOCK, exception to
			 * that being while reading at offset = 0 or doing
			 * random i/o, in that case read only a PAGE.
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize >= rp->r_size) {
				/*
				 * If only a block or less is left in
				 * the file, read all that is remaining.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
				(off != rp->r_nextr && !readahead_issued)) {
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0.  This
			 * is correct since we want to do I/O on a page
			 * boundary.  bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here.  We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs_bio(bp, cr);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error.  We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			/*
			 * Advance the readahead offset past what was just
			 * read, unless readaheads queued above already
			 * moved it.
			 */
			if (!readahead_issued && !error) {
			    mutex_enter(&rp->r_statelock);
			    rp->r_nextr = io_off + io_len;
			    mutex_exit(&rp->r_statelock);
			}
		}
	}

out:
	/* Nothing to hand back when the caller supplied no page list. */
	if (pl == NULL)
		return (error);

	if (error) {
		/* Dispose of any pages set up for the failed read. */
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	/* Hand the newly created or newly read pages back through pl[]. */
	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}
3793 
/*
 * Read the block starting at `blkoff' into the page cache ahead of need.
 *
 * nfs_getapage() passes this function to nfs_async_readahead() as the
 * routine that performs the read.  No status is returned; if the read
 * fails, the rnode's readahead offset (r_nextr) is moved back to the
 * start of the failed I/O when it lies beyond it.
 */
static void
nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
	cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode_t *rp = VTOR(vp);

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc). In all
	 * cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This is correct since
	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
	 * to calculate an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(io_off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)blkoff;
	bp_mapin(bp);

	/*
	 * If doing a write beyond what we believe is EOF, don't bother trying
	 * to read the pages from the server, we'll just zero the pages here.
	 * We don't check that the rw flag is S_WRITE here because some
	 * implementations may attempt a read access to the buffer before
	 * copying data.
	 */
	mutex_enter(&rp->r_statelock);
	if (io_off >= rp->r_size && seg == segkmap) {
		mutex_exit(&rp->r_statelock);
		bzero(bp->b_un.b_addr, io_len);
		error = 0;
	} else {
		mutex_exit(&rp->r_statelock);
		error = nfs_bio(bp, cr);
		/* Hitting EOF is not a failure for a readahead. */
		if (error == NFS_EOF)
			error = 0;
	}

	/*
	 * Unmap the buffer before freeing it.
	 */
	bp_mapout(bp);
	pageio_done(bp);

	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);

	/*
	 * In case of error set readahead offset
	 * to the lowest offset.
	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
	 *
	 * r_nextr is tested once without the lock and tested again under
	 * r_statelock before it is modified.
	 */
	if (error && rp->r_nextr > io_off) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}
3896 
3897 /*
3898  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
3899  * If len == 0, do from off to EOF.
3900  *
3901  * The normal cases should be len == 0 && off == 0 (entire vp list),
3902  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
3903  * (from pageout).
3904  */
3905 static int
3906 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr)
3907 {
3908 	int error;
3909 	rnode_t *rp;
3910 
3911 	ASSERT(cr != NULL);
3912 
3913 	/*
3914 	 * XXX - Why should this check be made here?
3915 	 */
3916 	if (vp->v_flag & VNOMAP)
3917 		return (ENOSYS);
3918 
3919 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
3920 		return (0);
3921 
3922 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
3923 		return (EIO);
3924 	ASSERT(off <= MAXOFF32_T);
3925 
3926 	rp = VTOR(vp);
3927 	mutex_enter(&rp->r_statelock);
3928 	rp->r_count++;
3929 	mutex_exit(&rp->r_statelock);
3930 	error = nfs_putpages(vp, off, len, flags, cr);
3931 	mutex_enter(&rp->r_statelock);
3932 	rp->r_count--;
3933 	cv_broadcast(&rp->r_cv);
3934 	mutex_exit(&rp->r_statelock);
3935 
3936 	return (error);
3937 }
3938 
/*
 * Write out a single page, possibly klustering adjacent dirty pages.
 *
 * vp    - vnode the page belongs to; must not be read-only.
 * pp    - the dirty page to write.
 * offp  - if not NULL, receives the starting offset of the range handled.
 * lenp  - if not NULL, receives the length of the range handled.
 * flags - B_ASYNC selects queueing via nfs_async_putapage(); otherwise
 *	   the write is done synchronously by nfs_sync_putapage().
 * cr    - credentials for the write.
 *
 * The caller must already hold a reference in rp->r_count.
 *
 * Returns 0 without writing anything if a write to the same region is
 * in progress (RMODINPROGRESS); in that case the pages are re-marked
 * modified and the rnode is flagged RDIRTY.  Otherwise returns the
 * result of the synchronous or asynchronous put.
 */
int
nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
	int flags, cred_t *cr)
{
	u_offset_t io_off;
	u_offset_t lbn_off;
	u_offset_t lbn;
	size_t io_len;
	uint_t bsize;
	int error;
	rnode_t *rp;

	ASSERT(!vn_is_readonly(vp));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);
	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	ASSERT(pp->p_offset <= MAXOFF32_T);

	/* Start of the bsize-sized block containing the page. */
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks.  If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */
	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * pvn_write_kluster shouldn't have returned a page with offset
	 * behind the original page we were given.  Verify that.
	 */
	ASSERT((pp->p_offset / bsize) >= lbn);

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back.  It will also handle invalidation and freeing
	 * of pages that are not dirty.  Check for page length rounding
	 * problems.
	 */
	if (io_off + io_len > lbn_off + bsize) {
		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
		io_len = lbn_off + bsize - io_off;
	}
	/*
	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
	 * progress and the r_size has not been made consistent with the
	 * new size of the file. When the uiomove() completes the r_size is
	 * updated and the RMODINPROGRESS flag is cleared.
	 *
	 * Without this handshaking, it is possible that nfs(3)_bio() picks
	 * up the old value of r_size before the uiomove() in writerp()
	 * completes. This will result in the write through nfs(3)_bio()
	 * being dropped.
	 *
	 * More precisely, there is a window between the time the uiomove()
	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
	 * operation intervenes in this window, the page will be picked up,
	 * because it is dirty (it will be unlocked, unless it was
	 * pagecreate'd). When the page is picked up as dirty, the dirty
	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
	 * checked. This will still be the old size. Therefore the page will
	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
	 * the page will be found to be clean and the write will be dropped.
	 */
	if (rp->r_flags & RMODINPROGRESS) {
		/* Test again now that r_statelock is held. */
		mutex_enter(&rp->r_statelock);
		if ((rp->r_flags & RMODINPROGRESS) &&
		    rp->r_modaddr + MAXBSIZE > io_off &&
		    rp->r_modaddr < io_off + io_len) {
			page_t *plist;
			/*
			 * A write is in progress for this region of the file.
			 * If we did not detect RMODINPROGRESS here then this
			 * path through nfs_putapage() would eventually go to
			 * nfs(3)_bio() and may not write out all of the data
			 * in the pages. We end up losing data. So we decide
			 * to set the modified bit on each page in the page
			 * list and mark the rnode with RDIRTY. This write
			 * will be restarted at some later time.
			 */
			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			rp->r_flags |= RDIRTY;
			mutex_exit(&rp->r_statelock);
			if (offp)
				*offp = io_off;
			if (lenp)
				*lenp = io_len;
			return (0);
		}
		mutex_exit(&rp->r_statelock);
	}

	if (flags & B_ASYNC) {
		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
		    nfs_sync_putapage);
	} else
		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);

	/* Report the range that was handled back to the caller. */
	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	return (error);
}
4064 
4065 static int
4066 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4067 	int flags, cred_t *cr)
4068 {
4069 	int error;
4070 	rnode_t *rp;
4071 
4072 	flags |= B_WRITE;
4073 
4074 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4075 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4076 
4077 	rp = VTOR(vp);
4078 
4079 	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4080 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4081 		if (!(rp->r_flags & ROUTOFSPACE)) {
4082 			mutex_enter(&rp->r_statelock);
4083 			rp->r_flags |= ROUTOFSPACE;
4084 			mutex_exit(&rp->r_statelock);
4085 		}
4086 		flags |= B_ERROR;
4087 		pvn_write_done(pp, flags);
4088 		/*
4089 		 * If this was not an async thread, then try again to
4090 		 * write out the pages, but this time, also destroy
4091 		 * them whether or not the write is successful.  This
4092 		 * will prevent memory from filling up with these
4093 		 * pages and destroying them is the only alternative
4094 		 * if they can't be written out.
4095 		 *
4096 		 * Don't do this if this is an async thread because
4097 		 * when the pages are unlocked in pvn_write_done,
4098 		 * some other thread could have come along, locked
4099 		 * them, and queued for an async thread.  It would be
4100 		 * possible for all of the async threads to be tied
4101 		 * up waiting to lock the pages again and they would
4102 		 * all already be locked and waiting for an async
4103 		 * thread to handle them.  Deadlock.
4104 		 */
4105 		if (!(flags & B_ASYNC)) {
4106 			error = nfs_putpage(vp, io_off, io_len,
4107 			    B_INVAL | B_FORCE, cr);
4108 		}
4109 	} else {
4110 		if (error)
4111 			flags |= B_ERROR;
4112 		else if (rp->r_flags & ROUTOFSPACE) {
4113 			mutex_enter(&rp->r_statelock);
4114 			rp->r_flags &= ~ROUTOFSPACE;
4115 			mutex_exit(&rp->r_statelock);
4116 		}
4117 		pvn_write_done(pp, flags);
4118 	}
4119 
4120 	return (error);
4121 }
4122 
/*
 * Map the regular file `vp' into the address space `as'.
 *
 * off, len	- range of the file to map.
 * addrp	- in: requested address (used as is with MAP_FIXED);
 *		  out: address chosen by map_addr() otherwise.
 * prot/maxprot	- protections handed to the segvn segment.
 * flags	- MAP_* flags; MAP_TYPE bits select the mapping type.
 *
 * Returns 0 on success, or:
 *   EIO	called from the wrong zone
 *   ENOSYS	the vnode is marked VNOMAP
 *   EFBIG	off is greater than MAXOFF32_T
 *   ENXIO	negative offset
 *   ENODEV	not a regular file
 *   EINTR	interrupted while waiting for r_lkserlock
 *   EAGAIN	the vnode is marked VNOCACHE, or locks exist and
 *		mandatory locking is enabled
 *   ENOMEM	no address range could be found
 * or the error from fetching attributes or from as_map().
 */
static int
nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
{
	struct segvn_crargs vn_a;
	int error;
	rnode_t *rp;
	struct vattr va;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (off > MAXOFF32_T)
		return (EFBIG);

	/*
	 * NOTE(review): len is a size_t, so `off + len' may be evaluated
	 * in unsigned arithmetic, in which case the second comparison can
	 * never be true -- confirm against the offset_t/size_t typedefs.
	 */
	if (off < 0 || off + len < 0)
		return (ENXIO);

	if (vp->v_type != VREG)
		return (ENODEV);

	/*
	 * If there is cached data and if close-to-open consistency
	 * checking is not turned off and if the file system is not
	 * mounted readonly, then force an over the wire getattr.
	 * Otherwise, just invoke nfsgetattr to get a copy of the
	 * attributes.  The attribute cache will be used unless it
	 * is timed out and if it is, then an over the wire getattr
	 * will be issued.
	 */
	va.va_mask = AT_ALL;
	if (vn_has_cached_data(vp) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
		error = nfs_getattr_otw(vp, &va, cr);
	else
		error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Check to see if the vnode is currently marked as not cachable.
	 * This means portions of the file are locked (through VOP_FRLOCK).
	 * In this case the map request must be refused.  We use
	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
	 */
	rp = VTOR(vp);
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	if (vp->v_flag & VNOCACHE) {
		error = EAGAIN;
		goto done;
	}

	/*
	 * Don't allow concurrent locks and mapping if mandatory locking is
	 * enabled.
	 */
	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
	    MANDLOCK(vp, va.va_mode)) {
		error = EAGAIN;
		goto done;
	}

	/*
	 * The address space range lock is held from here until as_map()
	 * returns.
	 */
	as_rangelock(as);
	if (!(flags & MAP_FIXED)) {
		map_addr(addrp, len, off, 1, flags);
		if (*addrp == NULL) {
			as_rangeunlock(as);
			error = ENOMEM;
			goto done;
		}
	} else {
		/*
		 * User specified address - blow away any previous mappings
		 */
		(void) as_unmap(as, *addrp, len);
	}

	/* Describe the new segment for segvn_create(). */
	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = (flags & MAP_TYPE);
	vn_a.prot = (uchar_t)prot;
	vn_a.maxprot = (uchar_t)maxprot;
	vn_a.flags = (flags & ~MAP_TYPE);
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);

	/* Common exit: r_lkserlock is held on every path reaching here. */
done:
	nfs_rw_exit(&rp->r_lkserlock);
	return (error);
}
4223 
4224 /* ARGSUSED */
4225 static int
4226 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4227 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
4228 {
4229 	rnode_t *rp;
4230 
4231 	if (vp->v_flag & VNOMAP)
4232 		return (ENOSYS);
4233 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4234 		return (EIO);
4235 
4236 	/*
4237 	 * Need to hold rwlock while incrementing the mapcnt so that
4238 	 * mmap'ing can be serialized with writes so that the caching
4239 	 * can be handled correctly.
4240 	 */
4241 	rp = VTOR(vp);
4242 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4243 		return (EINTR);
4244 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4245 	nfs_rw_exit(&rp->r_rwlock);
4246 
4247 	return (0);
4248 }
4249 
4250 static int
4251 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4252 	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr)
4253 {
4254 	netobj lm_fh;
4255 	int rc;
4256 	u_offset_t start, end;
4257 	rnode_t *rp;
4258 	int error = 0, intr = INTR(vp);
4259 
4260 	/* check for valid cmd parameter */
4261 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4262 		return (EINVAL);
4263 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4264 		return (EIO);
4265 
4266 	/* Verify l_type. */
4267 	switch (bfp->l_type) {
4268 	case F_RDLCK:
4269 		if (cmd != F_GETLK && !(flag & FREAD))
4270 			return (EBADF);
4271 		break;
4272 	case F_WRLCK:
4273 		if (cmd != F_GETLK && !(flag & FWRITE))
4274 			return (EBADF);
4275 		break;
4276 	case F_UNLCK:
4277 		intr = 0;
4278 		break;
4279 
4280 	default:
4281 		return (EINVAL);
4282 	}
4283 
4284 	/* check the validity of the lock range */
4285 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4286 		return (rc);
4287 	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4288 		return (rc);
4289 
4290 	/*
4291 	 * If the filesystem is mounted using local locking, pass the
4292 	 * request off to the local locking code.
4293 	 */
4294 	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4295 		if (offset > MAXOFF32_T)
4296 			return (EFBIG);
4297 		if (cmd == F_SETLK || cmd == F_SETLKW) {
4298 			/*
4299 			 * For complete safety, we should be holding
4300 			 * r_lkserlock.  However, we can't call
4301 			 * lm_safelock and then fs_frlock while
4302 			 * holding r_lkserlock, so just invoke
4303 			 * lm_safelock and expect that this will
4304 			 * catch enough of the cases.
4305 			 */
4306 			if (!lm_safelock(vp, bfp, cr))
4307 				return (EAGAIN);
4308 		}
4309 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
4310 	}
4311 
4312 	rp = VTOR(vp);
4313 
4314 	/*
4315 	 * Check whether the given lock request can proceed, given the
4316 	 * current file mappings.
4317 	 */
4318 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4319 		return (EINTR);
4320 	if (cmd == F_SETLK || cmd == F_SETLKW) {
4321 		if (!lm_safelock(vp, bfp, cr)) {
4322 			rc = EAGAIN;
4323 			goto done;
4324 		}
4325 	}
4326 
4327 	/*
4328 	 * Flush the cache after waiting for async I/O to finish.  For new
4329 	 * locks, this is so that the process gets the latest bits from the
4330 	 * server.  For unlocks, this is so that other clients see the
4331 	 * latest bits once the file has been unlocked.  If currently dirty
4332 	 * pages can't be flushed, then don't allow a lock to be set.  But
4333 	 * allow unlocks to succeed, to avoid having orphan locks on the
4334 	 * server.
4335 	 */
4336 	if (cmd != F_GETLK) {
4337 		mutex_enter(&rp->r_statelock);
4338 		while (rp->r_count > 0) {
4339 		    if (intr) {
4340 			klwp_t *lwp = ttolwp(curthread);
4341 
4342 			if (lwp != NULL)
4343 				lwp->lwp_nostop++;
4344 			if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) {
4345 				if (lwp != NULL)
4346 					lwp->lwp_nostop--;
4347 				rc = EINTR;
4348 				break;
4349 			}
4350 			if (lwp != NULL)
4351 				lwp->lwp_nostop--;
4352 		    } else
4353 			cv_wait(&rp->r_cv, &rp->r_statelock);
4354 		}
4355 		mutex_exit(&rp->r_statelock);
4356 		if (rc != 0)
4357 			goto done;
4358 		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr);
4359 		if (error) {
4360 			if (error == ENOSPC || error == EDQUOT) {
4361 				mutex_enter(&rp->r_statelock);
4362 				if (!rp->r_error)
4363 					rp->r_error = error;
4364 				mutex_exit(&rp->r_statelock);
4365 			}
4366 			if (bfp->l_type != F_UNLCK) {
4367 				rc = ENOLCK;
4368 				goto done;
4369 			}
4370 		}
4371 	}
4372 
4373 	lm_fh.n_len = sizeof (fhandle_t);
4374 	lm_fh.n_bytes = (char *)VTOFH(vp);
4375 
4376 	/*
4377 	 * Call the lock manager to do the real work of contacting
4378 	 * the server and obtaining the lock.
4379 	 */
4380 	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4381 
4382 	if (rc == 0)
4383 		nfs_lockcompletion(vp, cmd);
4384 
4385 done:
4386 	nfs_rw_exit(&rp->r_lkserlock);
4387 	return (rc);
4388 }
4389 
4390 /*
4391  * Free storage space associated with the specified vnode.  The portion
4392  * to be freed is specified by bfp->l_start and bfp->l_len (already
4393  * normalized to a "whence" of 0).
4394  *
4395  * This is an experimental facility whose continued existence is not
4396  * guaranteed.  Currently, we only support the special case
4397  * of l_len == 0, meaning free to end of file.
4398  */
4399 /* ARGSUSED */
4400 static int
4401 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4402 	offset_t offset, cred_t *cr, caller_context_t *ct)
4403 {
4404 	int error;
4405 
4406 	ASSERT(vp->v_type == VREG);
4407 	if (cmd != F_FREESP)
4408 		return (EINVAL);
4409 
4410 	if (offset > MAXOFF32_T)
4411 		return (EFBIG);
4412 
4413 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4414 	    (bfp->l_len > MAXOFF32_T))
4415 		return (EFBIG);
4416 
4417 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4418 		return (EIO);
4419 
4420 	error = convoff(vp, bfp, 0, offset);
4421 	if (!error) {
4422 		ASSERT(bfp->l_start >= 0);
4423 		if (bfp->l_len == 0) {
4424 			struct vattr va;
4425 
4426 			/*
4427 			 * ftruncate should not change the ctime and
4428 			 * mtime if we truncate the file to its
4429 			 * previous size.
4430 			 */
4431 			va.va_mask = AT_SIZE;
4432 			error = nfsgetattr(vp, &va, cr);
4433 			if (error || va.va_size == bfp->l_start)
4434 				return (error);
4435 			va.va_mask = AT_SIZE;
4436 			va.va_size = bfp->l_start;
4437 			error = nfssetattr(vp, &va, 0, cr);
4438 		} else
4439 			error = EINVAL;
4440 	}
4441 
4442 	return (error);
4443 }
4444 
4445 /* ARGSUSED */
4446 static int
4447 nfs_realvp(vnode_t *vp, vnode_t **vpp)
4448 {
4449 
4450 	return (EINVAL);
4451 }
4452 
4453 /*
4454  * Setup and add an address space callback to do the work of the delmap call.
4455  * The callback will (and must be) deleted in the actual callback function.
4456  *
4457  * This is done in order to take care of the problem that we have with holding
4458  * the address space's a_lock for a long period of time (e.g. if the NFS server
4459  * is down).  Callbacks will be executed in the address space code while the
4460  * a_lock is not held.	Holding the address space's a_lock causes things such
4461  * as ps and fork to hang because they are trying to acquire this lock as well.
4462  */
4463 /* ARGSUSED */
4464 static int
4465 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4466 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
4467 {
4468 	int			caller_found;
4469 	int			error;
4470 	rnode_t			*rp;
4471 	nfs_delmap_args_t	*dmapp;
4472 	nfs_delmapcall_t	*delmap_call;
4473 
4474 	if (vp->v_flag & VNOMAP)
4475 		return (ENOSYS);
4476 	/*
4477 	 * A process may not change zones if it has NFS pages mmap'ed
4478 	 * in, so we can't legitimately get here from the wrong zone.
4479 	 */
4480 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4481 
4482 	rp = VTOR(vp);
4483 
4484 	/*
4485 	 * The way that the address space of this process deletes its mapping
4486 	 * of this file is via the following call chains:
4487 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4488 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4489 	 *
4490 	 * With the use of address space callbacks we are allowed to drop the
4491 	 * address space lock, a_lock, while executing the NFS operations that
4492 	 * need to go over the wire.  Returning EAGAIN to the caller of this
4493 	 * function is what drives the execution of the callback that we add
4494 	 * below.  The callback will be executed by the address space code
4495 	 * after dropping the a_lock.  When the callback is finished, since
4496 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4497 	 * is called again on the same segment to finish the rest of the work
4498 	 * that needs to happen during unmapping.
4499 	 *
4500 	 * This action of calling back into the segment driver causes
4501 	 * nfs_delmap() to get called again, but since the callback was
4502 	 * already executed at this point, it already did the work and there
4503 	 * is nothing left for us to do.
4504 	 *
4505 	 * To Summarize:
4506 	 * - The first time nfs_delmap is called by the current thread is when
4507 	 * we add the caller associated with this delmap to the delmap caller
4508 	 * list, add the callback, and return EAGAIN.
4509 	 * - The second time in this call chain when nfs_delmap is called we
4510 	 * will find this caller in the delmap caller list and realize there
4511 	 * is no more work to do thus removing this caller from the list and
4512 	 * returning the error that was set in the callback execution.
4513 	 */
4514 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4515 	if (caller_found) {
4516 		/*
4517 		 * 'error' is from the actual delmap operations.  To avoid
4518 		 * hangs, we need to handle the return of EAGAIN differently
4519 		 * since this is what drives the callback execution.
4520 		 * In this case, we don't want to return EAGAIN and do the
4521 		 * callback execution because there are none to execute.
4522 		 */
4523 		if (error == EAGAIN)
4524 			return (0);
4525 		else
4526 			return (error);
4527 	}
4528 
4529 	/* current caller was not in the list */
4530 	delmap_call = nfs_init_delmapcall();
4531 
4532 	mutex_enter(&rp->r_statelock);
4533 	list_insert_tail(&rp->r_indelmap, delmap_call);
4534 	mutex_exit(&rp->r_statelock);
4535 
4536 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4537 
4538 	dmapp->vp = vp;
4539 	dmapp->off = off;
4540 	dmapp->addr = addr;
4541 	dmapp->len = len;
4542 	dmapp->prot = prot;
4543 	dmapp->maxprot = maxprot;
4544 	dmapp->flags = flags;
4545 	dmapp->cr = cr;
4546 	dmapp->caller = delmap_call;
4547 
4548 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4549 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4550 
4551 	return (error ? error : EAGAIN);
4552 }
4553 
4554 /*
4555  * Remove some pages from an mmap'd vnode.  Just update the
4556  * count of pages.  If doing close-to-open, then flush all
4557  * of the pages associated with this file.  Otherwise, start
4558  * an asynchronous page flush to write out any dirty pages.
4559  * This will also associate a credential with the rnode which
4560  * can be used to write the pages.
4561  */
4562 /* ARGSUSED */
4563 static void
4564 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4565 {
4566 	int			error;
4567 	rnode_t			*rp;
4568 	mntinfo_t		*mi;
4569 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4570 
4571 	rp = VTOR(dmapp->vp);
4572 	mi = VTOMI(dmapp->vp);
4573 
4574 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4575 	ASSERT(rp->r_mapcnt >= 0);
4576 
4577 	/*
4578 	 * Initiate a page flush if there are pages, the file system
4579 	 * was not mounted readonly, the segment was mapped shared, and
4580 	 * the pages themselves were writeable.
4581 	 */
4582 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4583 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4584 		mutex_enter(&rp->r_statelock);
4585 		rp->r_flags |= RDIRTY;
4586 		mutex_exit(&rp->r_statelock);
4587 		/*
4588 		 * If this is a cross-zone access a sync putpage won't work, so
4589 		 * the best we can do is try an async putpage.  That seems
4590 		 * better than something more draconian such as discarding the
4591 		 * dirty pages.
4592 		 */
4593 		if ((mi->mi_flags & MI_NOCTO) ||
4594 		    nfs_zone() != mi->mi_zone)
4595 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4596 			    B_ASYNC, dmapp->cr);
4597 		else
4598 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4599 			    0, dmapp->cr);
4600 		if (!error) {
4601 			mutex_enter(&rp->r_statelock);
4602 			error = rp->r_error;
4603 			rp->r_error = 0;
4604 			mutex_exit(&rp->r_statelock);
4605 		}
4606 	} else
4607 		error = 0;
4608 
4609 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4610 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4611 		    B_INVAL, dmapp->cr);
4612 
4613 	dmapp->caller->error = error;
4614 	(void) as_delete_callback(as, arg);
4615 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4616 }
4617 
4618 /* ARGSUSED */
4619 static int
4620 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
4621 {
4622 	int error = 0;
4623 
4624 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4625 		return (EIO);
4626 	/*
4627 	 * This looks a little weird because it's written in a general
4628 	 * manner but we make little use of cases.  If cntl() ever gets
4629 	 * widely used, the outer switch will make more sense.
4630 	 */
4631 
4632 	switch (cmd) {
4633 
4634 	/*
4635 	 * Large file spec - need to base answer new query with
4636 	 * hardcoded constant based on the protocol.
4637 	 */
4638 	case _PC_FILESIZEBITS:
4639 		*valp = 32;
4640 		return (0);
4641 
4642 	case _PC_LINK_MAX:
4643 	case _PC_NAME_MAX:
4644 	case _PC_PATH_MAX:
4645 	case _PC_SYMLINK_MAX:
4646 	case _PC_CHOWN_RESTRICTED:
4647 	case _PC_NO_TRUNC: {
4648 		mntinfo_t *mi;
4649 		struct pathcnf *pc;
4650 
4651 		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4652 			return (EINVAL);
4653 		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4654 		switch (cmd) {
4655 		case _PC_LINK_MAX:
4656 			*valp = pc->pc_link_max;
4657 			break;
4658 		case _PC_NAME_MAX:
4659 			*valp = pc->pc_name_max;
4660 			break;
4661 		case _PC_PATH_MAX:
4662 		case _PC_SYMLINK_MAX:
4663 			*valp = pc->pc_path_max;
4664 			break;
4665 		case _PC_CHOWN_RESTRICTED:
4666 			/*
4667 			 * if we got here, error is really a boolean which
4668 			 * indicates whether cmd is set or not.
4669 			 */
4670 			*valp = error ? 1 : 0;	/* see above */
4671 			error = 0;
4672 			break;
4673 		case _PC_NO_TRUNC:
4674 			/*
4675 			 * if we got here, error is really a boolean which
4676 			 * indicates whether cmd is set or not.
4677 			 */
4678 			*valp = error ? 1 : 0;	/* see above */
4679 			error = 0;
4680 			break;
4681 		}
4682 		return (error ? EINVAL : 0);
4683 	    }
4684 
4685 	case _PC_XATTR_EXISTS:
4686 		*valp = 0;
4687 		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4688 			vnode_t *avp;
4689 			rnode_t *rp;
4690 			mntinfo_t *mi = VTOMI(vp);
4691 
4692 			if (!(mi->mi_flags & MI_EXTATTR))
4693 				return (0);
4694 
4695 			rp = VTOR(vp);
4696 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4697 			    INTR(vp)))
4698 				return (EINTR);
4699 
4700 			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4701 			if (error || avp == NULL)
4702 				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4703 
4704 			nfs_rw_exit(&rp->r_rwlock);
4705 
4706 			if (error == 0 && avp != NULL) {
4707 				VN_RELE(avp);
4708 				*valp = 1;
4709 			}
4710 		}
4711 		return (error ? EINVAL : 0);
4712 
4713 	case _PC_ACL_ENABLED:
4714 		*valp = _ACL_ACLENT_ENABLED;
4715 		return (0);
4716 
4717 	default:
4718 		return (EINVAL);
4719 	}
4720 }
4721 
4722 /*
4723  * Called by async thread to do synchronous pageio. Do the i/o, wait
4724  * for it to complete, and cleanup the page list when done.
4725  */
4726 static int
4727 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4728 	int flags, cred_t *cr)
4729 {
4730 	int error;
4731 
4732 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4733 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4734 	if (flags & B_READ)
4735 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4736 	else
4737 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4738 	return (error);
4739 }
4740 
4741 static int
4742 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4743 	int flags, cred_t *cr)
4744 {
4745 	int error;
4746 	rnode_t *rp;
4747 
4748 	if (pp == NULL)
4749 		return (EINVAL);
4750 
4751 	if (io_off > MAXOFF32_T)
4752 		return (EFBIG);
4753 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4754 		return (EIO);
4755 	rp = VTOR(vp);
4756 	mutex_enter(&rp->r_statelock);
4757 	rp->r_count++;
4758 	mutex_exit(&rp->r_statelock);
4759 
4760 	if (flags & B_ASYNC) {
4761 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4762 		    nfs_sync_pageio);
4763 	} else
4764 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4765 	mutex_enter(&rp->r_statelock);
4766 	rp->r_count--;
4767 	cv_broadcast(&rp->r_cv);
4768 	mutex_exit(&rp->r_statelock);
4769 	return (error);
4770 }
4771 
4772 static int
4773 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4774 {
4775 	int error;
4776 	mntinfo_t *mi;
4777 
4778 	mi = VTOMI(vp);
4779 
4780 	if (nfs_zone() != mi->mi_zone)
4781 		return (EIO);
4782 	if (mi->mi_flags & MI_ACL) {
4783 		error = acl_setacl2(vp, vsecattr, flag, cr);
4784 		if (mi->mi_flags & MI_ACL)
4785 			return (error);
4786 	}
4787 
4788 	return (ENOSYS);
4789 }
4790 
4791 static int
4792 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr)
4793 {
4794 	int error;
4795 	mntinfo_t *mi;
4796 
4797 	mi = VTOMI(vp);
4798 
4799 	if (nfs_zone() != mi->mi_zone)
4800 		return (EIO);
4801 	if (mi->mi_flags & MI_ACL) {
4802 		error = acl_getacl2(vp, vsecattr, flag, cr);
4803 		if (mi->mi_flags & MI_ACL)
4804 			return (error);
4805 	}
4806 
4807 	return (fs_fab_acl(vp, vsecattr, flag, cr));
4808 }
4809 
4810 static int
4811 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
4812 {
4813 	int error;
4814 	struct shrlock nshr;
4815 	struct nfs_owner nfs_owner;
4816 	netobj lm_fh;
4817 
4818 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4819 		return (EIO);
4820 
4821 	/*
4822 	 * check for valid cmd parameter
4823 	 */
4824 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
4825 		return (EINVAL);
4826 
4827 	/*
4828 	 * Check access permissions
4829 	 */
4830 	if (cmd == F_SHARE &&
4831 	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
4832 	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
4833 		return (EBADF);
4834 
4835 	/*
4836 	 * If the filesystem is mounted using local locking, pass the
4837 	 * request off to the local share code.
4838 	 */
4839 	if (VTOMI(vp)->mi_flags & MI_LLOCK)
4840 		return (fs_shrlock(vp, cmd, shr, flag, cr));
4841 
4842 	switch (cmd) {
4843 	case F_SHARE:
4844 	case F_UNSHARE:
4845 		lm_fh.n_len = sizeof (fhandle_t);
4846 		lm_fh.n_bytes = (char *)VTOFH(vp);
4847 
4848 		/*
4849 		 * If passed an owner that is too large to fit in an
4850 		 * nfs_owner it is likely a recursive call from the
4851 		 * lock manager client and pass it straight through.  If
4852 		 * it is not a nfs_owner then simply return an error.
4853 		 */
4854 		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
4855 			if (((struct nfs_owner *)shr->s_owner)->magic !=
4856 			    NFS_OWNER_MAGIC)
4857 				return (EINVAL);
4858 
4859 			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
4860 				error = set_errno(error);
4861 			}
4862 			return (error);
4863 		}
4864 		/*
4865 		 * Remote share reservations owner is a combination of
4866 		 * a magic number, hostname, and the local owner
4867 		 */
4868 		bzero(&nfs_owner, sizeof (nfs_owner));
4869 		nfs_owner.magic = NFS_OWNER_MAGIC;
4870 		(void) strncpy(nfs_owner.hname, uts_nodename(),
4871 		    sizeof (nfs_owner.hname));
4872 		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
4873 		nshr.s_access = shr->s_access;
4874 		nshr.s_deny = shr->s_deny;
4875 		nshr.s_sysid = 0;
4876 		nshr.s_pid = ttoproc(curthread)->p_pid;
4877 		nshr.s_own_len = sizeof (nfs_owner);
4878 		nshr.s_owner = (caddr_t)&nfs_owner;
4879 
4880 		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
4881 			error = set_errno(error);
4882 		}
4883 
4884 		break;
4885 
4886 	case F_HASREMOTELOCKS:
4887 		/*
4888 		 * NFS client can't store remote locks itself
4889 		 */
4890 		shr->s_access = 0;
4891 		error = 0;
4892 		break;
4893 
4894 	default:
4895 		error = EINVAL;
4896 		break;
4897 	}
4898 
4899 	return (error);
4900 }
4901