xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs_srv.c (revision d89fccd8788afe1e920f842edd883fe192a1b8fe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 *
 * The values are octal bit patterns: IFMT is the mask that selects
 * the file type bits of the mode, and the remaining constants are
 * the individual type codes found under that mask.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */
89 
90 /*
91  * Get file attributes.
92  * Returns the current attributes of the file with the given fhandle.
93  */
94 /* ARGSUSED */
95 void
96 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
97 	struct svc_req *req, cred_t *cr)
98 {
99 	int error;
100 	vnode_t *vp;
101 	struct vattr va;
102 
103 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
104 		"rfs_getattr_start:");
105 
106 	vp = nfs_fhtovp(fhp, exi);
107 	if (vp == NULL) {
108 		ns->ns_status = NFSERR_STALE;
109 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
110 			"rfs_getattr_end:(%S)", "stale");
111 		return;
112 	}
113 
114 	/*
115 	 * Do the getattr.
116 	 */
117 	va.va_mask = AT_ALL;	/* we want all the attributes */
118 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
119 		"vop_getattr_start:");
120 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
121 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
122 		"vop_getattr_end:");
123 
124 	/* check for overflows */
125 	if (!error) {
126 		acl_perm(vp, exi, &va, cr);
127 		error = vattr_to_nattr(&va, &ns->ns_attr);
128 	}
129 
130 	VN_RELE(vp);
131 
132 	ns->ns_status = puterrno(error);
133 
134 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
135 		"rfs_getattr_end:(%S)", "done");
136 }
137 void *
138 rfs_getattr_getfh(fhandle_t *fhp)
139 {
140 	return (fhp);
141 }
142 
143 /*
144  * Set file attributes.
145  * Sets the attributes of the file with the given fhandle.  Returns
146  * the new attributes.
147  */
148 void
149 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
150 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
151 {
152 	int error;
153 	int flag;
154 	int in_crit = 0;
155 	vnode_t *vp;
156 	struct vattr va;
157 	struct vattr bva;
158 	struct flock64 bf;
159 
160 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
161 		"rfs_setattr_start:");
162 
163 	vp = nfs_fhtovp(&args->saa_fh, exi);
164 	if (vp == NULL) {
165 		ns->ns_status = NFSERR_STALE;
166 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
167 			"rfs_setattr_end:(%S)", "stale");
168 		return;
169 	}
170 
171 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
172 		VN_RELE(vp);
173 		ns->ns_status = NFSERR_ROFS;
174 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
175 			"rfs_setattr_end:(%S)", "rofs");
176 		return;
177 	}
178 
179 	error = sattr_to_vattr(&args->saa_sa, &va);
180 	if (error) {
181 		VN_RELE(vp);
182 		ns->ns_status = puterrno(error);
183 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
184 			"rfs_setattr_end:(%S)", "sattr");
185 		return;
186 	}
187 
188 	/*
189 	 * If the client is requesting a change to the mtime,
190 	 * but the nanosecond field is set to 1 billion, then
191 	 * this is a flag to the server that it should set the
192 	 * atime and mtime fields to the server's current time.
193 	 * The 1 billion number actually came from the client
194 	 * as 1 million, but the units in the over the wire
195 	 * request are microseconds instead of nanoseconds.
196 	 *
197 	 * This is an overload of the protocol and should be
198 	 * documented in the NFS Version 2 protocol specification.
199 	 */
200 	if (va.va_mask & AT_MTIME) {
201 		if (va.va_mtime.tv_nsec == 1000000000) {
202 			gethrestime(&va.va_mtime);
203 			va.va_atime = va.va_mtime;
204 			va.va_mask |= AT_ATIME;
205 			flag = 0;
206 		} else
207 			flag = ATTR_UTIME;
208 	} else
209 		flag = 0;
210 
211 	/*
212 	 * If the filesystem is exported with nosuid, then mask off
213 	 * the setuid and setgid bits.
214 	 */
215 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
216 	    (exi->exi_export.ex_flags & EX_NOSUID))
217 		va.va_mode &= ~(VSUID | VSGID);
218 
219 	/*
220 	 * We need to specially handle size changes because it is
221 	 * possible for the client to create a file with modes
222 	 * which indicate read-only, but with the file opened for
223 	 * writing.  If the client then tries to set the size of
224 	 * the file, then the normal access checking done in
225 	 * VOP_SETATTR would prevent the client from doing so,
226 	 * although it should be legal for it to do so.  To get
227 	 * around this, we do the access checking for ourselves
228 	 * and then use VOP_SPACE which doesn't do the access
229 	 * checking which VOP_SETATTR does. VOP_SPACE can only
230 	 * operate on VREG files, let VOP_SETATTR handle the other
231 	 * extremely rare cases.
232 	 * Also the client should not be allowed to change the
233 	 * size of the file if there is a conflicting non-blocking
234 	 * mandatory lock in the region of change.
235 	 *
236 	 * Also(2), check to see if the v4 side of the server has
237 	 * delegated this file.  If so, then we set T_WOULDBLOCK
238 	 * so that the dispatch function dosn't send a reply, forcing
239 	 * the client to retrasmit its request.
240 	 */
241 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
242 		/* If delegated, mark as wouldblock so response is dropped */
243 		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
244 			VN_RELE(vp);
245 			curthread->t_flag |= T_WOULDBLOCK;
246 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
247 				"rfs_setattr_end:(%S)", "delegated");
248 			return;
249 		}
250 		if (nbl_need_check(vp)) {
251 			nbl_start_crit(vp, RW_READER);
252 			in_crit = 1;
253 		}
254 
255 		bva.va_mask = AT_UID | AT_SIZE;
256 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
257 			"vop_getattr_start:");
258 		error = VOP_GETATTR(vp, &bva, 0, cr);
259 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
260 			"vop_getattr_end:");
261 		if (error) {
262 			if (in_crit)
263 				nbl_end_crit(vp);
264 			VN_RELE(vp);
265 			ns->ns_status = puterrno(error);
266 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
267 				"rfs_setattr_end:(%S)", "getattr");
268 			return;
269 		}
270 
271 		if (in_crit) {
272 			u_offset_t offset;
273 			ssize_t length;
274 
275 			if (va.va_size < bva.va_size) {
276 				offset = va.va_size;
277 				length = bva.va_size - va.va_size;
278 			} else {
279 				offset = bva.va_size;
280 				length = va.va_size - bva.va_size;
281 			}
282 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
283 				error = EACCES;
284 			}
285 		}
286 
287 		if (crgetuid(cr) == bva.va_uid && !error &&
288 			va.va_size != bva.va_size) {
289 			va.va_mask &= ~AT_SIZE;
290 			bf.l_type = F_WRLCK;
291 			bf.l_whence = 0;
292 			bf.l_start = (off64_t)va.va_size;
293 			bf.l_len = 0;
294 			bf.l_sysid = 0;
295 			bf.l_pid = 0;
296 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
297 				"vop_space_start:");
298 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
299 					(offset_t)va.va_size, cr, NULL);
300 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
301 				"vop_space_end:");
302 		}
303 		if (in_crit)
304 			nbl_end_crit(vp);
305 	} else
306 		error = 0;
307 
308 	/*
309 	 * Do the setattr.
310 	 */
311 	if (!error && va.va_mask) {
312 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
313 			"vop_setattr_start:");
314 		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
315 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
316 			"vop_setattr_end:");
317 	}
318 
319 	if (!error) {
320 		va.va_mask = AT_ALL;	/* get everything */
321 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
322 			"vop_getattr_start:");
323 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
324 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
325 			"vop_getattr_end:");
326 
327 		/* check for overflows */
328 		if (!error) {
329 			acl_perm(vp, exi, &va, cr);
330 			error = vattr_to_nattr(&va, &ns->ns_attr);
331 		}
332 	}
333 
334 	/*
335 	 * Force modified metadata out to stable storage.
336 	 */
337 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
338 
339 	VN_RELE(vp);
340 
341 	ns->ns_status = puterrno(error);
342 
343 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
344 		"rfs_setattr_end:(%S)", "done");
345 }
346 void *
347 rfs_setattr_getfh(struct nfssaargs *args)
348 {
349 	return (&args->saa_fh);
350 }
351 
352 /*
353  * Directory lookup.
354  * Returns an fhandle and file attributes for file name in a directory.
355  */
356 /* ARGSUSED */
357 void
358 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
359 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
360 {
361 	int error;
362 	vnode_t *dvp;
363 	vnode_t *vp;
364 	struct vattr va;
365 	fhandle_t *fhp = da->da_fhandle;
366 	struct sec_ol sec = {0, 0};
367 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
368 
369 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
370 		"rfs_lookup_start:");
371 
372 	/*
373 	 * Disallow NULL paths
374 	 */
375 	if (da->da_name == NULL || *da->da_name == '\0') {
376 		dr->dr_status = NFSERR_ACCES;
377 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
378 			"rfs_lookup_end:(%S)", "access");
379 		return;
380 	}
381 
382 	/*
383 	 * Allow lookups from the root - the default
384 	 * location of the public filehandle.
385 	 */
386 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
387 		dvp = rootdir;
388 		VN_HOLD(dvp);
389 	} else {
390 		dvp = nfs_fhtovp(fhp, exi);
391 		if (dvp == NULL) {
392 			dr->dr_status = NFSERR_STALE;
393 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
394 				"rfs_lookup_end:(%S)", "stale");
395 			return;
396 		}
397 	}
398 
399 	/*
400 	 * Not allow lookup beyond root.
401 	 * If the filehandle matches a filehandle of the exi,
402 	 * then the ".." refers beyond the root of an exported filesystem.
403 	 */
404 	if (strcmp(da->da_name, "..") == 0 &&
405 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
406 		VN_RELE(dvp);
407 		dr->dr_status = NFSERR_NOENT;
408 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
409 			"rfs_lookup_end:(%S)", "noent");
410 		return;
411 	}
412 
413 	/*
414 	 * If the public filehandle is used then allow
415 	 * a multi-component lookup, i.e. evaluate
416 	 * a pathname and follow symbolic links if
417 	 * necessary.
418 	 *
419 	 * This may result in a vnode in another filesystem
420 	 * which is OK as long as the filesystem is exported.
421 	 */
422 	if (PUBLIC_FH2(fhp)) {
423 		publicfh_flag = TRUE;
424 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
425 					&sec);
426 	} else {
427 		/*
428 		 * Do a normal single component lookup.
429 		 */
430 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
431 			"vop_lookup_start:");
432 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
433 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
434 			"vop_lookup_end:");
435 	}
436 
437 	if (!error) {
438 		va.va_mask = AT_ALL;	/* we want everything */
439 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
440 			"vop_getattr_start:");
441 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
442 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
443 			"vop_getattr_end:");
444 		/* check for overflows */
445 		if (!error) {
446 			acl_perm(vp, exi, &va, cr);
447 			error = vattr_to_nattr(&va, &dr->dr_attr);
448 			if (!error) {
449 				if (sec.sec_flags & SEC_QUERY)
450 					error = makefh_ol(&dr->dr_fhandle, exi,
451 							sec.sec_index);
452 				else {
453 					error = makefh(&dr->dr_fhandle, vp,
454 								exi);
455 					if (!error && publicfh_flag &&
456 						!chk_clnt_sec(exi, req))
457 						auth_weak = TRUE;
458 				}
459 			}
460 		}
461 		VN_RELE(vp);
462 	}
463 
464 	VN_RELE(dvp);
465 
466 	/*
467 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
468 	 * and have obtained a new exportinfo in exi which needs to be
469 	 * released. Note the the original exportinfo pointed to by exi
470 	 * will be released by the caller, comon_dispatch.
471 	 */
472 	if (publicfh_flag && exi != NULL)
473 		exi_rele(exi);
474 
475 	/*
476 	 * If it's public fh, no 0x81, and client's flavor is
477 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
478 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
479 	 */
480 	if (auth_weak)
481 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
482 	else
483 		dr->dr_status = puterrno(error);
484 
485 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
486 		"rfs_lookup_end:(%S)", "done");
487 }
488 void *
489 rfs_lookup_getfh(struct nfsdiropargs *da)
490 {
491 	return (da->da_fhandle);
492 }
493 
494 /*
495  * Read symbolic link.
496  * Returns the string in the symbolic link at the given fhandle.
497  */
498 /* ARGSUSED */
499 void
500 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
501 	struct svc_req *req, cred_t *cr)
502 {
503 	int error;
504 	struct iovec iov;
505 	struct uio uio;
506 	vnode_t *vp;
507 	struct vattr va;
508 
509 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
510 		"rfs_readlink_start:");
511 
512 	vp = nfs_fhtovp(fhp, exi);
513 	if (vp == NULL) {
514 		rl->rl_data = NULL;
515 		rl->rl_status = NFSERR_STALE;
516 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
517 			"rfs_readlink_end:(%S)", "stale");
518 		return;
519 	}
520 
521 	va.va_mask = AT_MODE;
522 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
523 		"vop_getattr_start:");
524 	error = VOP_GETATTR(vp, &va, 0, cr);
525 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
526 		"vop_getattr_end:");
527 
528 	if (error) {
529 		VN_RELE(vp);
530 		rl->rl_data = NULL;
531 		rl->rl_status = puterrno(error);
532 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
533 			"rfs_readlink_end:(%S)", "getattr error");
534 		return;
535 	}
536 
537 	if (MANDLOCK(vp, va.va_mode)) {
538 		VN_RELE(vp);
539 		rl->rl_data = NULL;
540 		rl->rl_status = NFSERR_ACCES;
541 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
542 			"rfs_readlink_end:(%S)", "access");
543 		return;
544 	}
545 
546 	/*
547 	 * XNFS and RFC1094 require us to return ENXIO if argument
548 	 * is not a link. BUGID 1138002.
549 	 */
550 	if (vp->v_type != VLNK) {
551 		VN_RELE(vp);
552 		rl->rl_data = NULL;
553 		rl->rl_status = NFSERR_NXIO;
554 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
555 			"rfs_readlink_end:(%S)", "nxio");
556 		return;
557 	}
558 
559 	/*
560 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
561 	 */
562 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
563 
564 	/*
565 	 * Set up io vector to read sym link data
566 	 */
567 	iov.iov_base = rl->rl_data;
568 	iov.iov_len = NFS_MAXPATHLEN;
569 	uio.uio_iov = &iov;
570 	uio.uio_iovcnt = 1;
571 	uio.uio_segflg = UIO_SYSSPACE;
572 	uio.uio_extflg = UIO_COPY_CACHED;
573 	uio.uio_loffset = (offset_t)0;
574 	uio.uio_resid = NFS_MAXPATHLEN;
575 
576 	/*
577 	 * Do the readlink.
578 	 */
579 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
580 		"vop_readlink_start:");
581 	error = VOP_READLINK(vp, &uio, cr);
582 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
583 		"vop_readlink_end:");
584 
585 #if 0 /* notyet */
586 	/*
587 	 * Don't do this.  It causes local disk writes when just
588 	 * reading the file and the overhead is deemed larger
589 	 * than the benefit.
590 	 */
591 	/*
592 	 * Force modified metadata out to stable storage.
593 	 */
594 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
595 #endif
596 
597 	VN_RELE(vp);
598 
599 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
600 
601 	/*
602 	 * XNFS and RFC1094 require us to return ENXIO if argument
603 	 * is not a link. UFS returns EINVAL if this is the case,
604 	 * so we do the mapping here. BUGID 1138002.
605 	 */
606 	if (error == EINVAL)
607 		rl->rl_status = NFSERR_NXIO;
608 	else
609 		rl->rl_status = puterrno(error);
610 
611 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
612 		"rfs_readlink_end:(%S)", "done");
613 }
614 void *
615 rfs_readlink_getfh(fhandle_t *fhp)
616 {
617 	return (fhp);
618 }
619 /*
620  * Free data allocated by rfs_readlink
621  */
622 void
623 rfs_rlfree(struct nfsrdlnres *rl)
624 {
625 	if (rl->rl_data != NULL)
626 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
627 }
628 
629 /*
630  * Read data.
631  * Returns some data read from the file at the given fhandle.
632  */
633 /* ARGSUSED */
634 void
635 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
636 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
637 {
638 	vnode_t *vp;
639 	int error;
640 	struct vattr va;
641 	struct iovec iov;
642 	struct uio uio;
643 	mblk_t *mp;
644 	int alloc_err = 0;
645 	int in_crit = 0;
646 
647 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
648 		"rfs_read_start:");
649 
650 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
651 	if (vp == NULL) {
652 		rr->rr_data = NULL;
653 		rr->rr_status = NFSERR_STALE;
654 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
655 			"rfs_read_end:(%S)", "stale");
656 		return;
657 	}
658 
659 	if (vp->v_type != VREG) {
660 		VN_RELE(vp);
661 		rr->rr_data = NULL;
662 		rr->rr_status = NFSERR_ISDIR;
663 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
664 			"rfs_read_end:(%S)", "isdir");
665 		return;
666 	}
667 
668 	/*
669 	 * Check to see if the v4 side of the server has delegated
670 	 * this file.  If so, then we mark thread as wouldblock so
671 	 * the response is dropped.
672 	 */
673 	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
674 		VN_RELE(vp);
675 		curthread->t_flag |= T_WOULDBLOCK;
676 		rr->rr_data = NULL;
677 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
678 			"rfs_read_end:(%S)", "delegated");
679 		return;
680 	}
681 
682 	/*
683 	 * Enter the critical region before calling VOP_RWLOCK
684 	 * to avoid a deadlock with write requests.
685 	 */
686 	if (nbl_need_check(vp)) {
687 		nbl_start_crit(vp, RW_READER);
688 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
689 		    0)) {
690 			nbl_end_crit(vp);
691 			VN_RELE(vp);
692 			rr->rr_data = NULL;
693 			rr->rr_status = NFSERR_ACCES;
694 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
695 				"rfs_read_end:(%S)", " csf access error");
696 			return;
697 		}
698 		in_crit = 1;
699 	}
700 
701 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
702 		"vop_rwlock_start:");
703 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
704 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
705 		"vop_rwlock_end:");
706 
707 	va.va_mask = AT_ALL;
708 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
709 		"vop_getattr_start:");
710 	error = VOP_GETATTR(vp, &va, 0, cr);
711 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
712 		"vop_getattr_end:");
713 
714 	if (error) {
715 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
716 			"vop_rwunlock_start:");
717 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
718 		if (in_crit)
719 			nbl_end_crit(vp);
720 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
721 			"vop_rwunlock_end:");
722 		VN_RELE(vp);
723 		rr->rr_data = NULL;
724 		rr->rr_status = puterrno(error);
725 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
726 			"rfs_read_end:(%S)", "getattr error");
727 		return;
728 	}
729 
730 	/*
731 	 * This is a kludge to allow reading of files created
732 	 * with no read permission.  The owner of the file
733 	 * is always allowed to read it.
734 	 */
735 	if (crgetuid(cr) != va.va_uid) {
736 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
737 			"vop_access_start:");
738 		error = VOP_ACCESS(vp, VREAD, 0, cr);
739 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
740 			"vop_access_end:");
741 		if (error) {
742 			/*
743 			 * Exec is the same as read over the net because
744 			 * of demand loading.
745 			 */
746 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
747 				"vop_access_start:");
748 			error = VOP_ACCESS(vp, VEXEC, 0, cr);
749 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
750 				"vop_access_end:");
751 		}
752 		if (error) {
753 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
754 				"vop_rwunlock_start:");
755 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
756 			if (in_crit)
757 				nbl_end_crit(vp);
758 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
759 				"vop_rwunlock_end:");
760 			VN_RELE(vp);
761 			rr->rr_data = NULL;
762 			rr->rr_status = puterrno(error);
763 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
764 				"rfs_read_end:(%S)", "access error");
765 			return;
766 		}
767 	}
768 
769 	if (MANDLOCK(vp, va.va_mode)) {
770 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
771 			"vop_rwunlock_start:");
772 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
773 		if (in_crit)
774 			nbl_end_crit(vp);
775 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
776 			"vop_rwunlock_end:");
777 		VN_RELE(vp);
778 		rr->rr_data = NULL;
779 		rr->rr_status = NFSERR_ACCES;
780 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
781 			"rfs_read_end:(%S)", "mand lock");
782 		return;
783 	}
784 
785 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
786 		rr->rr_count = 0;
787 		rr->rr_data = NULL;
788 		/*
789 		 * In this case, status is NFS_OK, but there is no data
790 		 * to encode. So set rr_mp to NULL.
791 		 */
792 		rr->rr_mp = NULL;
793 		goto done;
794 	}
795 
796 	/*
797 	 * mp will contain the data to be sent out in the read reply.
798 	 * This will be freed after the reply has been sent out (by the
799 	 * driver).
800 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
801 	 * that the call to xdrmblk_putmblk() never fails.
802 	 */
803 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
804 	    &alloc_err);
805 	ASSERT(mp != NULL);
806 	ASSERT(alloc_err == 0);
807 
808 	rr->rr_mp = mp;
809 
810 	/*
811 	 * Set up io vector
812 	 */
813 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
814 	iov.iov_len = ra->ra_count;
815 	uio.uio_iov = &iov;
816 	uio.uio_iovcnt = 1;
817 	uio.uio_segflg = UIO_SYSSPACE;
818 	uio.uio_extflg = UIO_COPY_CACHED;
819 	uio.uio_loffset = (offset_t)ra->ra_offset;
820 	uio.uio_resid = ra->ra_count;
821 
822 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
823 		"vop_read_start:");
824 	error = VOP_READ(vp, &uio, 0, cr, NULL);
825 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
826 		"vop_read_end:");
827 
828 	if (error) {
829 		freeb(mp);
830 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
831 			"vop_rwunlock_start:");
832 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
833 		if (in_crit)
834 			nbl_end_crit(vp);
835 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
836 			"vop_rwunlock_end:");
837 		VN_RELE(vp);
838 		rr->rr_data = NULL;
839 		rr->rr_status = puterrno(error);
840 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
841 			"rfs_read_end:(%S)", "read error");
842 		return;
843 	}
844 
845 	/*
846 	 * Get attributes again so we can send the latest access
847 	 * time to the client side for his cache.
848 	 */
849 	va.va_mask = AT_ALL;
850 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
851 		"vop_getattr_start:");
852 	error = VOP_GETATTR(vp, &va, 0, cr);
853 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
854 		"vop_getattr_end:");
855 	if (error) {
856 		freeb(mp);
857 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
858 			"vop_rwunlock_start:");
859 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
860 		if (in_crit)
861 			nbl_end_crit(vp);
862 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
863 			"vop_rwunlock_end:");
864 		VN_RELE(vp);
865 		rr->rr_data = NULL;
866 		rr->rr_status = puterrno(error);
867 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
868 			"rfs_read_end:(%S)", "read error");
869 		return;
870 	}
871 
872 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
873 
874 	rr->rr_data = (char *)mp->b_datap->db_base;
875 
876 done:
877 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
878 		"vop_rwunlock_start:");
879 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
880 	if (in_crit)
881 		nbl_end_crit(vp);
882 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
883 		"vop_rwunlock_end:");
884 
885 	acl_perm(vp, exi, &va, cr);
886 
887 	/* check for overflows */
888 	error = vattr_to_nattr(&va, &rr->rr_attr);
889 
890 #if 0 /* notyet */
891 	/*
892 	 * Don't do this.  It causes local disk writes when just
893 	 * reading the file and the overhead is deemed larger
894 	 * than the benefit.
895 	 */
896 	/*
897 	 * Force modified metadata out to stable storage.
898 	 */
899 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
900 #endif
901 
902 	VN_RELE(vp);
903 
904 	rr->rr_status = puterrno(error);
905 
906 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
907 		"rfs_read_end:(%S)", "done");
908 }
909 
910 /*
911  * Free data allocated by rfs_read
912  */
913 void
914 rfs_rdfree(struct nfsrdresult *rr)
915 {
916 	mblk_t *mp;
917 
918 	if (rr->rr_status == NFS_OK) {
919 		mp = rr->rr_mp;
920 		if (mp != NULL)
921 			freeb(mp);
922 	}
923 }
924 
925 void *
926 rfs_read_getfh(struct nfsreadargs *ra)
927 {
928 	return (&ra->ra_fhandle);
929 }
930 
/*
 * Number of iovec entries rfs_write_sync keeps on its stack.  A write
 * request whose mblk chain has more links than this gets a temporary
 * array from kmem_alloc instead.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * DEBUG-only counters: a "hit" is a write whose mblk chain fit in the
 * on-stack iovec array, a "miss" is one that needed the allocation.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
937 
938 /*
939  * Write data to file.
940  * Returns attributes of a file after writing some data to it.
941  *
942  * Any changes made here, especially in error handling might have
943  * to also be done in rfs_write (which clusters write requests).
944  */
945 void
946 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
947 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
948 {
949 	int error;
950 	vnode_t *vp;
951 	rlim64_t rlimit;
952 	struct vattr va;
953 	struct uio uio;
954 	struct iovec iov[MAX_IOVECS];
955 	mblk_t *m;
956 	struct iovec *iovp;
957 	int iovcnt;
958 	cred_t *savecred;
959 	int in_crit = 0;
960 
961 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
962 		"rfs_write_start:(%S)", "sync");
963 
964 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
965 	if (vp == NULL) {
966 		ns->ns_status = NFSERR_STALE;
967 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
968 			"rfs_write_end:(%S)", "stale");
969 		return;
970 	}
971 
972 	if (rdonly(exi, req)) {
973 		VN_RELE(vp);
974 		ns->ns_status = NFSERR_ROFS;
975 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
976 			"rfs_write_end:(%S)", "rofs");
977 		return;
978 	}
979 
980 	if (vp->v_type != VREG) {
981 		VN_RELE(vp);
982 		ns->ns_status = NFSERR_ISDIR;
983 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
984 			"rfs_write_end:(%S)", "isdir");
985 		return;
986 	}
987 
988 	/*
989 	 * Check to see if the v4 side of the server has delegated
990 	 * this file.  If so, then we mark thread as wouldblock so
991 	 * the response is dropped.
992 	 */
993 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
994 		VN_RELE(vp);
995 		curthread->t_flag |= T_WOULDBLOCK;
996 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
997 			"rfs_write_end:(%S)", "delegated");
998 		return;
999 	}
1000 
1001 	va.va_mask = AT_UID|AT_MODE;
1002 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1003 		"vop_getattr_start:");
1004 	error = VOP_GETATTR(vp, &va, 0, cr);
1005 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1006 		"vop_getattr_end:");
1007 
1008 	if (error) {
1009 		VN_RELE(vp);
1010 		ns->ns_status = puterrno(error);
1011 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1012 			"rfs_write_end:(%S)", "getattr error");
1013 		return;
1014 	}
1015 
1016 	if (crgetuid(cr) != va.va_uid) {
1017 		/*
1018 		 * This is a kludge to allow writes of files created
1019 		 * with read only permission.  The owner of the file
1020 		 * is always allowed to write it.
1021 		 */
1022 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1023 			"vop_access_start:");
1024 		error = VOP_ACCESS(vp, VWRITE, 0, cr);
1025 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1026 			"vop_access_end:");
1027 		if (error) {
1028 			VN_RELE(vp);
1029 			ns->ns_status = puterrno(error);
1030 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1031 				"rfs_write_end:(%S)", "access error");
1032 			return;
1033 		}
1034 	}
1035 
1036 	/*
1037 	 * Can't access a mandatory lock file.  This might cause
1038 	 * the NFS service thread to block forever waiting for a
1039 	 * lock to be released that will never be released.
1040 	 */
1041 	if (MANDLOCK(vp, va.va_mode)) {
1042 		VN_RELE(vp);
1043 		ns->ns_status = NFSERR_ACCES;
1044 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1045 			"rfs_write_end:(%S)", "mand lock");
1046 		return;
1047 	}
1048 
1049 	/*
1050 	 * We have to enter the critical region before calling VOP_RWLOCK
1051 	 * to avoid a deadlock with ufs.
1052 	 */
1053 	if (nbl_need_check(vp)) {
1054 		nbl_start_crit(vp, RW_READER);
1055 		in_crit = 1;
1056 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1057 		    wa->wa_count, 0)) {
1058 			error = EACCES;
1059 			goto out;
1060 		}
1061 	}
1062 
1063 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1064 		"vop_rwlock_start:");
1065 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1066 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1067 		"vop_rwlock_end:");
1068 
1069 	if (wa->wa_data) {
1070 		iov[0].iov_base = wa->wa_data;
1071 		iov[0].iov_len = wa->wa_count;
1072 		uio.uio_iov = iov;
1073 		uio.uio_iovcnt = 1;
1074 		uio.uio_segflg = UIO_SYSSPACE;
1075 		uio.uio_extflg = UIO_COPY_DEFAULT;
1076 		uio.uio_loffset = (offset_t)wa->wa_offset;
1077 		uio.uio_resid = wa->wa_count;
1078 		/*
1079 		 * The limit is checked on the client. We
1080 		 * should allow any size writes here.
1081 		 */
1082 		uio.uio_llimit = curproc->p_fsz_ctl;
1083 		rlimit = uio.uio_llimit - wa->wa_offset;
1084 		if (rlimit < (rlim64_t)uio.uio_resid)
1085 			uio.uio_resid = (uint_t)rlimit;
1086 
1087 		/*
1088 		 * for now we assume no append mode
1089 		 */
1090 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1091 			"vop_write_start:(%S)", "sync");
1092 		/*
1093 		 * We're changing creds because VM may fault and we need
1094 		 * the cred of the current thread to be used if quota
1095 		 * checking is enabled.
1096 		 */
1097 		savecred = curthread->t_cred;
1098 		curthread->t_cred = cr;
1099 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1100 		curthread->t_cred = savecred;
1101 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1102 			"vop_write_end:");
1103 	} else {
1104 		iovcnt = 0;
1105 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1106 			iovcnt++;
1107 		if (iovcnt <= MAX_IOVECS) {
1108 #ifdef DEBUG
1109 			rfs_write_sync_hits++;
1110 #endif
1111 			iovp = iov;
1112 		} else {
1113 #ifdef DEBUG
1114 			rfs_write_sync_misses++;
1115 #endif
1116 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1117 		}
1118 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1119 		uio.uio_iov = iovp;
1120 		uio.uio_iovcnt = iovcnt;
1121 		uio.uio_segflg = UIO_SYSSPACE;
1122 		uio.uio_extflg = UIO_COPY_DEFAULT;
1123 		uio.uio_loffset = (offset_t)wa->wa_offset;
1124 		uio.uio_resid = wa->wa_count;
1125 		/*
1126 		 * The limit is checked on the client. We
1127 		 * should allow any size writes here.
1128 		 */
1129 		uio.uio_llimit = curproc->p_fsz_ctl;
1130 		rlimit = uio.uio_llimit - wa->wa_offset;
1131 		if (rlimit < (rlim64_t)uio.uio_resid)
1132 			uio.uio_resid = (uint_t)rlimit;
1133 
1134 		/*
1135 		 * For now we assume no append mode.
1136 		 */
1137 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1138 			"vop_write_start:(%S)", "iov sync");
1139 		/*
1140 		 * We're changing creds because VM may fault and we need
1141 		 * the cred of the current thread to be used if quota
1142 		 * checking is enabled.
1143 		 */
1144 		savecred = curthread->t_cred;
1145 		curthread->t_cred = cr;
1146 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1147 		curthread->t_cred = savecred;
1148 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1149 			"vop_write_end:");
1150 
1151 		if (iovp != iov)
1152 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1153 	}
1154 
1155 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1156 		"vop_rwunlock_start:");
1157 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1158 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1159 		"vop_rwunlock_end:");
1160 
1161 	if (!error) {
1162 		/*
1163 		 * Get attributes again so we send the latest mod
1164 		 * time to the client side for his cache.
1165 		 */
1166 		va.va_mask = AT_ALL;	/* now we want everything */
1167 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1168 			"vop_getattr_start:");
1169 		error = VOP_GETATTR(vp, &va, 0, cr);
1170 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1171 			"vop_getattr_end:");
1172 		/* check for overflows */
1173 		if (!error) {
1174 			acl_perm(vp, exi, &va, cr);
1175 			error = vattr_to_nattr(&va, &ns->ns_attr);
1176 		}
1177 	}
1178 
1179 out:
1180 	if (in_crit)
1181 		nbl_end_crit(vp);
1182 	VN_RELE(vp);
1183 
1184 	ns->ns_status = puterrno(error);
1185 
1186 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1187 		"rfs_write_end:(%S)", "sync");
1188 }
1189 
1190 struct rfs_async_write {
1191 	struct nfswriteargs *wa;
1192 	struct nfsattrstat *ns;
1193 	struct svc_req *req;
1194 	cred_t *cr;
1195 	kthread_t *thread;
1196 	struct rfs_async_write *list;
1197 };
1198 
1199 struct rfs_async_write_list {
1200 	fhandle_t *fhp;
1201 	kcondvar_t cv;
1202 	struct rfs_async_write *list;
1203 	struct rfs_async_write_list *next;
1204 };
1205 
1206 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1207 static kmutex_t rfs_async_write_lock;
1208 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1209 
1210 #define	MAXCLIOVECS	42
1211 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1212 
1213 #ifdef DEBUG
1214 static int rfs_write_hits = 0;
1215 static int rfs_write_misses = 0;
1216 #endif
1217 
1218 /*
1219  * Write data to file.
1220  * Returns attributes of a file after writing some data to it.
1221  */
1222 void
1223 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1224 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1225 {
1226 	int error;
1227 	vnode_t *vp;
1228 	rlim64_t rlimit;
1229 	struct vattr va;
1230 	struct uio uio;
1231 	struct rfs_async_write_list *lp;
1232 	struct rfs_async_write_list *nlp;
1233 	struct rfs_async_write *rp;
1234 	struct rfs_async_write *nrp;
1235 	struct rfs_async_write *trp;
1236 	struct rfs_async_write *lrp;
1237 	int data_written;
1238 	int iovcnt;
1239 	mblk_t *m;
1240 	struct iovec *iovp;
1241 	struct iovec *niovp;
1242 	struct iovec iov[MAXCLIOVECS];
1243 	int count;
1244 	int rcount;
1245 	uint_t off;
1246 	uint_t len;
1247 	struct rfs_async_write nrpsp;
1248 	struct rfs_async_write_list nlpsp;
1249 	ushort_t t_flag;
1250 	cred_t *savecred;
1251 	int in_crit = 0;
1252 
1253 	if (!rfs_write_async) {
1254 		rfs_write_sync(wa, ns, exi, req, cr);
1255 		return;
1256 	}
1257 
1258 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1259 		"rfs_write_start:(%S)", "async");
1260 
1261 	/*
1262 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1263 	 * is considered an OK.
1264 	 */
1265 	ns->ns_status = RFSWRITE_INITVAL;
1266 
1267 	nrp = &nrpsp;
1268 	nrp->wa = wa;
1269 	nrp->ns = ns;
1270 	nrp->req = req;
1271 	nrp->cr = cr;
1272 	nrp->thread = curthread;
1273 
1274 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1275 
1276 	/*
1277 	 * Look to see if there is already a cluster started
1278 	 * for this file.
1279 	 */
1280 	mutex_enter(&rfs_async_write_lock);
1281 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1282 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1283 		    sizeof (fhandle_t)) == 0)
1284 			break;
1285 	}
1286 
1287 	/*
1288 	 * If lp is non-NULL, then there is already a cluster
1289 	 * started.  We need to place ourselves in the cluster
1290 	 * list in the right place as determined by starting
1291 	 * offset.  Conflicts with non-blocking mandatory locked
1292 	 * regions will be checked when the cluster is processed.
1293 	 */
1294 	if (lp != NULL) {
1295 		rp = lp->list;
1296 		trp = NULL;
1297 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1298 			trp = rp;
1299 			rp = rp->list;
1300 		}
1301 		nrp->list = rp;
1302 		if (trp == NULL)
1303 			lp->list = nrp;
1304 		else
1305 			trp->list = nrp;
1306 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1307 			cv_wait(&lp->cv, &rfs_async_write_lock);
1308 		mutex_exit(&rfs_async_write_lock);
1309 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1310 			"rfs_write_end:(%S)", "cluster child");
1311 		return;
1312 	}
1313 
1314 	/*
1315 	 * No cluster started yet, start one and add ourselves
1316 	 * to the list of clusters.
1317 	 */
1318 	nrp->list = NULL;
1319 
1320 	nlp = &nlpsp;
1321 	nlp->fhp = &wa->wa_fhandle;
1322 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1323 	nlp->list = nrp;
1324 	nlp->next = NULL;
1325 
1326 	if (rfs_async_write_head == NULL) {
1327 		rfs_async_write_head = nlp;
1328 	} else {
1329 		lp = rfs_async_write_head;
1330 		while (lp->next != NULL)
1331 			lp = lp->next;
1332 		lp->next = nlp;
1333 	}
1334 	mutex_exit(&rfs_async_write_lock);
1335 
1336 	/*
1337 	 * Convert the file handle common to all of the requests
1338 	 * in this cluster to a vnode.
1339 	 */
1340 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1341 	if (vp == NULL) {
1342 		mutex_enter(&rfs_async_write_lock);
1343 		if (rfs_async_write_head == nlp)
1344 			rfs_async_write_head = nlp->next;
1345 		else {
1346 			lp = rfs_async_write_head;
1347 			while (lp->next != nlp)
1348 				lp = lp->next;
1349 			lp->next = nlp->next;
1350 		}
1351 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1352 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1353 			rp->ns->ns_status = NFSERR_STALE;
1354 			rp->thread->t_flag |= t_flag;
1355 		}
1356 		cv_broadcast(&nlp->cv);
1357 		mutex_exit(&rfs_async_write_lock);
1358 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1359 			"rfs_write_end:(%S)", "stale");
1360 		return;
1361 	}
1362 
1363 	/*
1364 	 * Can only write regular files.  Attempts to write any
1365 	 * other file types fail with EISDIR.
1366 	 */
1367 	if (vp->v_type != VREG) {
1368 		VN_RELE(vp);
1369 		mutex_enter(&rfs_async_write_lock);
1370 		if (rfs_async_write_head == nlp)
1371 			rfs_async_write_head = nlp->next;
1372 		else {
1373 			lp = rfs_async_write_head;
1374 			while (lp->next != nlp)
1375 				lp = lp->next;
1376 			lp->next = nlp->next;
1377 		}
1378 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1379 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1380 			rp->ns->ns_status = NFSERR_ISDIR;
1381 			rp->thread->t_flag |= t_flag;
1382 		}
1383 		cv_broadcast(&nlp->cv);
1384 		mutex_exit(&rfs_async_write_lock);
1385 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1386 			"rfs_write_end:(%S)", "isdir");
1387 		return;
1388 	}
1389 
1390 	/*
1391 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1392 	 * deadlock with ufs.
1393 	 */
1394 	if (nbl_need_check(vp)) {
1395 		nbl_start_crit(vp, RW_READER);
1396 		in_crit = 1;
1397 	}
1398 
1399 	/*
1400 	 * Lock the file for writing.  This operation provides
1401 	 * the delay which allows clusters to grow.
1402 	 */
1403 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1404 		"vop_wrlock_start:");
1405 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1406 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1407 		"vop_wrlock_end");
1408 
1409 	/*
1410 	 * Disconnect this cluster from the list of clusters.
1411 	 * The cluster that is being dealt with must be fixed
1412 	 * in size after this point, so there is no reason
1413 	 * to leave it on the list so that new requests can
1414 	 * find it.
1415 	 *
1416 	 * The algorithm is that the first write request will
1417 	 * create a cluster, convert the file handle to a
1418 	 * vnode pointer, and then lock the file for writing.
1419 	 * This request is not likely to be clustered with
1420 	 * any others.  However, the next request will create
1421 	 * a new cluster and be blocked in VOP_RWLOCK while
1422 	 * the first request is being processed.  This delay
1423 	 * will allow more requests to be clustered in this
1424 	 * second cluster.
1425 	 */
1426 	mutex_enter(&rfs_async_write_lock);
1427 	if (rfs_async_write_head == nlp)
1428 		rfs_async_write_head = nlp->next;
1429 	else {
1430 		lp = rfs_async_write_head;
1431 		while (lp->next != nlp)
1432 			lp = lp->next;
1433 		lp->next = nlp->next;
1434 	}
1435 	mutex_exit(&rfs_async_write_lock);
1436 
1437 	/*
1438 	 * Step through the list of requests in this cluster.
1439 	 * We need to check permissions to make sure that all
1440 	 * of the requests have sufficient permission to write
1441 	 * the file.  A cluster can be composed of requests
1442 	 * from different clients and different users on each
1443 	 * client.
1444 	 *
1445 	 * As a side effect, we also calculate the size of the
1446 	 * byte range that this cluster encompasses.
1447 	 */
1448 	rp = nlp->list;
1449 	off = rp->wa->wa_offset;
1450 	len = (uint_t)0;
1451 	do {
1452 		if (rdonly(exi, rp->req)) {
1453 			rp->ns->ns_status = NFSERR_ROFS;
1454 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1455 			rp->thread->t_flag |= t_flag;
1456 			continue;
1457 		}
1458 
1459 		va.va_mask = AT_UID|AT_MODE;
1460 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1461 			"vop_getattr_start:");
1462 		error = VOP_GETATTR(vp, &va, 0, rp->cr);
1463 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1464 			"vop_getattr_end:");
1465 		if (!error) {
1466 			if (crgetuid(rp->cr) != va.va_uid) {
1467 				/*
1468 				 * This is a kludge to allow writes of files
1469 				 * created with read only permission.  The
1470 				 * owner of the file is always allowed to
1471 				 * write it.
1472 				 */
1473 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1474 					"vop_access_start:");
1475 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
1476 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1477 					"vop_access_end:");
1478 			}
1479 			if (!error && MANDLOCK(vp, va.va_mode))
1480 				error = EACCES;
1481 		}
1482 
1483 		/*
1484 		 * Check for a conflict with a nbmand-locked region.
1485 		 */
1486 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1487 		    rp->wa->wa_count, 0)) {
1488 			error = EACCES;
1489 		}
1490 
1491 		if (error) {
1492 			rp->ns->ns_status = puterrno(error);
1493 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1494 			rp->thread->t_flag |= t_flag;
1495 			continue;
1496 		}
1497 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1498 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1499 	} while ((rp = rp->list) != NULL);
1500 
1501 	/*
1502 	 * Step through the cluster attempting to gather as many
1503 	 * requests which are contiguous as possible.  These
1504 	 * contiguous requests are handled via one call to VOP_WRITE
1505 	 * instead of different calls to VOP_WRITE.  We also keep
1506 	 * track of the fact that any data was written.
1507 	 */
1508 	rp = nlp->list;
1509 	data_written = 0;
1510 	do {
1511 		/*
1512 		 * Skip any requests which are already marked as having an
1513 		 * error.
1514 		 */
1515 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1516 			rp = rp->list;
1517 			continue;
1518 		}
1519 
1520 		/*
1521 		 * Count the number of iovec's which are required
1522 		 * to handle this set of requests.  One iovec is
1523 		 * needed for each data buffer, whether addressed
1524 		 * by wa_data or by the b_rptr pointers in the
1525 		 * mblk chains.
1526 		 */
1527 		iovcnt = 0;
1528 		lrp = rp;
1529 		for (;;) {
1530 			if (lrp->wa->wa_data)
1531 				iovcnt++;
1532 			else {
1533 				m = lrp->wa->wa_mblk;
1534 				while (m != NULL) {
1535 					iovcnt++;
1536 					m = m->b_cont;
1537 				}
1538 			}
1539 			if (lrp->list == NULL ||
1540 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1541 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1542 			    lrp->list->wa->wa_offset) {
1543 				lrp = lrp->list;
1544 				break;
1545 			}
1546 			lrp = lrp->list;
1547 		}
1548 
1549 		if (iovcnt <= MAXCLIOVECS) {
1550 #ifdef DEBUG
1551 			rfs_write_hits++;
1552 #endif
1553 			niovp = iov;
1554 		} else {
1555 #ifdef DEBUG
1556 			rfs_write_misses++;
1557 #endif
1558 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1559 		}
1560 		/*
1561 		 * Put together the scatter/gather iovecs.
1562 		 */
1563 		iovp = niovp;
1564 		trp = rp;
1565 		count = 0;
1566 		do {
1567 			if (trp->wa->wa_data) {
1568 				iovp->iov_base = trp->wa->wa_data;
1569 				iovp->iov_len = trp->wa->wa_count;
1570 				iovp++;
1571 			} else {
1572 				m = trp->wa->wa_mblk;
1573 				rcount = trp->wa->wa_count;
1574 				while (m != NULL) {
1575 					iovp->iov_base = (caddr_t)m->b_rptr;
1576 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1577 					rcount -= iovp->iov_len;
1578 					if (rcount < 0)
1579 						iovp->iov_len += rcount;
1580 					iovp++;
1581 					if (rcount <= 0)
1582 						break;
1583 					m = m->b_cont;
1584 				}
1585 			}
1586 			count += trp->wa->wa_count;
1587 			trp = trp->list;
1588 		} while (trp != lrp);
1589 
1590 		uio.uio_iov = niovp;
1591 		uio.uio_iovcnt = iovcnt;
1592 		uio.uio_segflg = UIO_SYSSPACE;
1593 		uio.uio_extflg = UIO_COPY_DEFAULT;
1594 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1595 		uio.uio_resid = count;
1596 		/*
1597 		 * The limit is checked on the client. We
1598 		 * should allow any size writes here.
1599 		 */
1600 		uio.uio_llimit = curproc->p_fsz_ctl;
1601 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1602 		if (rlimit < (rlim64_t)uio.uio_resid)
1603 			uio.uio_resid = (uint_t)rlimit;
1604 
1605 		/*
1606 		 * For now we assume no append mode.
1607 		 */
1608 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1609 			"vop_write_start:(%S)", "async");
1610 
1611 		/*
1612 		 * Check to see if the v4 side of the server has
1613 		 * delegated this file.  If so, then we mark thread
1614 		 * as wouldblock so the response is dropped.
1615 		 */
1616 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1617 			curthread->t_flag |= T_WOULDBLOCK;
1618 			error = EACCES; /* just to have an error */
1619 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1620 				"rfs_write_end:(%S)", "delegated");
1621 		} else {
1622 			/*
1623 			 * We're changing creds because VM may fault
1624 			 * and we need the cred of the current
1625 			 * thread to be used if quota * checking is
1626 			 * enabled.
1627 			 */
1628 			savecred = curthread->t_cred;
1629 			curthread->t_cred = cr;
1630 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1631 			curthread->t_cred = savecred;
1632 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1633 				"vop_write_end:");
1634 		}
1635 
1636 		if (niovp != iov)
1637 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1638 
1639 		if (!error) {
1640 			data_written = 1;
1641 			/*
1642 			 * Get attributes again so we send the latest mod
1643 			 * time to the client side for his cache.
1644 			 */
1645 			va.va_mask = AT_ALL;	/* now we want everything */
1646 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1647 				"vop_getattr_start:");
1648 			error = VOP_GETATTR(vp, &va, 0, rp->cr);
1649 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1650 				"vop_getattr_end:");
1651 			if (!error)
1652 				acl_perm(vp, exi, &va, rp->cr);
1653 		}
1654 
1655 		/*
1656 		 * Fill in the status responses for each request
1657 		 * which was just handled.  Also, copy the latest
1658 		 * attributes in to the attribute responses if
1659 		 * appropriate.
1660 		 */
1661 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1662 		do {
1663 			rp->thread->t_flag |= t_flag;
1664 			/* check for overflows */
1665 			if (!error) {
1666 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1667 			}
1668 			rp->ns->ns_status = puterrno(error);
1669 			rp = rp->list;
1670 		} while (rp != lrp);
1671 	} while (rp != NULL);
1672 
1673 	/*
1674 	 * If any data was written at all, then we need to flush
1675 	 * the data and metadata to stable storage.
1676 	 */
1677 	if (data_written) {
1678 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
1679 			"vop_putpage_start:");
1680 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
1681 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
1682 			"vop_putpage_end:");
1683 		if (!error) {
1684 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1685 				"vop_fsync_start:");
1686 			error = VOP_FSYNC(vp, FNODSYNC, cr);
1687 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
1688 				"vop_fsync_end:");
1689 		}
1690 	}
1691 
1692 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1693 		"vop_rwunlock_start:");
1694 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1695 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1696 		"vop_rwunlock_end:");
1697 
1698 	if (in_crit)
1699 		nbl_end_crit(vp);
1700 	VN_RELE(vp);
1701 
1702 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1703 	mutex_enter(&rfs_async_write_lock);
1704 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1705 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1706 			rp->ns->ns_status = puterrno(error);
1707 			rp->thread->t_flag |= t_flag;
1708 		}
1709 	}
1710 	cv_broadcast(&nlp->cv);
1711 	mutex_exit(&rfs_async_write_lock);
1712 
1713 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1714 		"rfs_write_end:(%S)", "async");
1715 }
1716 
1717 void *
1718 rfs_write_getfh(struct nfswriteargs *wa)
1719 {
1720 	return (&wa->wa_fhandle);
1721 }
1722 
1723 /*
1724  * Create a file.
1725  * Creates a file with given attributes and returns those attributes
1726  * and an fhandle for the new file.
1727  */
1728 void
1729 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1730 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1731 {
1732 	int error;
1733 	int lookuperr;
1734 	int in_crit = 0;
1735 	struct vattr va;
1736 	vnode_t *vp;
1737 	vnode_t *dvp;
1738 	char *name = args->ca_da.da_name;
1739 	vnode_t *tvp = NULL;
1740 	int mode;
1741 	int lookup_ok;
1742 	bool_t trunc;
1743 
1744 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
1745 		"rfs_create_start:");
1746 
1747 	/*
1748 	 * Disallow NULL paths
1749 	 */
1750 	if (name == NULL || *name == '\0') {
1751 		dr->dr_status = NFSERR_ACCES;
1752 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1753 			"rfs_create_end:(%S)", "access");
1754 		return;
1755 	}
1756 
1757 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1758 	if (dvp == NULL) {
1759 		dr->dr_status = NFSERR_STALE;
1760 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1761 			"rfs_create_end:(%S)", "stale");
1762 		return;
1763 	}
1764 
1765 	error = sattr_to_vattr(args->ca_sa, &va);
1766 	if (error) {
1767 		dr->dr_status = puterrno(error);
1768 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1769 			"rfs_create_end:(%S)", "sattr");
1770 		return;
1771 	}
1772 
1773 	/*
1774 	 * Must specify the mode.
1775 	 */
1776 	if (!(va.va_mask & AT_MODE)) {
1777 		VN_RELE(dvp);
1778 		dr->dr_status = NFSERR_INVAL;
1779 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1780 			"rfs_create_end:(%S)", "no mode");
1781 		return;
1782 	}
1783 
1784 	/*
1785 	 * This is a completely gross hack to make mknod
1786 	 * work over the wire until we can wack the protocol
1787 	 */
1788 	if ((va.va_mode & IFMT) == IFCHR) {
1789 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1790 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1791 		else {
1792 			va.va_type = VCHR;
1793 			/*
1794 			 * uncompress the received dev_t
1795 			 * if the top half is zero indicating a request
1796 			 * from an `older style' OS.
1797 			 */
1798 			if ((va.va_size & 0xffff0000) == 0)
1799 				va.va_rdev = nfsv2_expdev(va.va_size);
1800 			else
1801 				va.va_rdev = (dev_t)va.va_size;
1802 		}
1803 		va.va_mask &= ~AT_SIZE;
1804 	} else if ((va.va_mode & IFMT) == IFBLK) {
1805 		va.va_type = VBLK;
1806 		/*
1807 		 * uncompress the received dev_t
1808 		 * if the top half is zero indicating a request
1809 		 * from an `older style' OS.
1810 		 */
1811 		if ((va.va_size & 0xffff0000) == 0)
1812 			va.va_rdev = nfsv2_expdev(va.va_size);
1813 		else
1814 			va.va_rdev = (dev_t)va.va_size;
1815 		va.va_mask &= ~AT_SIZE;
1816 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1817 		va.va_type = VSOCK;
1818 	} else
1819 		va.va_type = VREG;
1820 	va.va_mode &= ~IFMT;
1821 	va.va_mask |= AT_TYPE;
1822 
1823 	/*
1824 	 * Why was the choice made to use VWRITE as the mode to the
1825 	 * call to VOP_CREATE ? This results in a bug.  When a client
1826 	 * opens a file that already exists and is RDONLY, the second
1827 	 * open fails with an EACESS because of the mode.
1828 	 * bug ID 1054648.
1829 	 */
1830 	lookup_ok = 0;
1831 	mode = VWRITE;
1832 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1833 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
1834 			"vop_lookup_start:");
1835 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1836 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
1837 			"vop_lookup_end:");
1838 		if (!error) {
1839 			struct vattr at;
1840 
1841 			lookup_ok = 1;
1842 			at.va_mask = AT_MODE;
1843 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1844 				"vop_getattr_start:");
1845 			error = VOP_GETATTR(tvp, &at, 0, cr);
1846 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1847 				"vop_getattr_end:");
1848 			if (!error)
1849 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1850 			VN_RELE(tvp);
1851 			tvp = NULL;
1852 		}
1853 	}
1854 
1855 	if (!lookup_ok) {
1856 		if (rdonly(exi, req)) {
1857 			error = EROFS;
1858 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1859 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1860 			error = EPERM;
1861 		} else {
1862 			error = 0;
1863 		}
1864 	}
1865 
1866 	/*
1867 	 * If file size is being modified on an already existing file
1868 	 * make sure that there are no conflicting non-blocking mandatory
1869 	 * locks in the region being manipulated. Return EACCES if there
1870 	 * are conflicting locks.
1871 	 */
1872 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1873 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1874 
1875 		if (!lookuperr &&
1876 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1877 			VN_RELE(tvp);
1878 			curthread->t_flag |= T_WOULDBLOCK;
1879 			goto out;
1880 		}
1881 
1882 		if (!lookuperr && nbl_need_check(tvp)) {
1883 			/*
1884 			 * The file exists. Now check if it has any
1885 			 * conflicting non-blocking mandatory locks
1886 			 * in the region being changed.
1887 			 */
1888 			struct vattr bva;
1889 			u_offset_t offset;
1890 			ssize_t length;
1891 
1892 			nbl_start_crit(tvp, RW_READER);
1893 			in_crit = 1;
1894 
1895 			bva.va_mask = AT_SIZE;
1896 			error = VOP_GETATTR(tvp, &bva, 0, cr);
1897 			if (!error) {
1898 				if (va.va_size < bva.va_size) {
1899 					offset = va.va_size;
1900 					length = bva.va_size - va.va_size;
1901 				} else {
1902 					offset = bva.va_size;
1903 					length = va.va_size - bva.va_size;
1904 				}
1905 				if (length) {
1906 					if (nbl_conflict(tvp, NBL_WRITE,
1907 						offset, length, 0)) {
1908 						error = EACCES;
1909 					}
1910 				}
1911 			}
1912 			if (error) {
1913 				nbl_end_crit(tvp);
1914 				VN_RELE(tvp);
1915 				in_crit = 0;
1916 			}
1917 		} else if (tvp != NULL) {
1918 			VN_RELE(tvp);
1919 		}
1920 	}
1921 
1922 	if (!error) {
1923 		/*
1924 		 * If filesystem is shared with nosuid the remove any
1925 		 * setuid/setgid bits on create.
1926 		 */
1927 		if (va.va_type == VREG &&
1928 		    exi->exi_export.ex_flags & EX_NOSUID)
1929 			va.va_mode &= ~(VSUID | VSGID);
1930 
1931 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
1932 			"vop_create_start:");
1933 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
1934 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
1935 			"vop_create_end:");
1936 
1937 		if (!error) {
1938 
1939 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1940 				trunc = TRUE;
1941 			else
1942 				trunc = FALSE;
1943 
1944 			if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
1945 				VN_RELE(tvp);
1946 				curthread->t_flag |= T_WOULDBLOCK;
1947 				goto out;
1948 			}
1949 			va.va_mask = AT_ALL;
1950 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1951 				"vop_getattr_start:");
1952 			error = VOP_GETATTR(vp, &va, 0, cr);
1953 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1954 				"vop_getattr_end:");
1955 			/* check for overflows */
1956 			if (!error) {
1957 				acl_perm(vp, exi, &va, cr);
1958 				error = vattr_to_nattr(&va, &dr->dr_attr);
1959 				if (!error) {
1960 					error = makefh(&dr->dr_fhandle, vp,
1961 							exi);
1962 				}
1963 			}
1964 			/*
1965 			 * Force modified metadata out to stable storage.
1966 			 */
1967 			(void) VOP_FSYNC(vp, FNODSYNC, cr);
1968 			VN_RELE(vp);
1969 		}
1970 
1971 		if (in_crit) {
1972 			nbl_end_crit(tvp);
1973 			VN_RELE(tvp);
1974 		}
1975 	}
1976 
1977 	/*
1978 	 * Force modified data and metadata out to stable storage.
1979 	 */
1980 	(void) VOP_FSYNC(dvp, 0, cr);
1981 
1982 out:
1983 
1984 	VN_RELE(dvp);
1985 
1986 	dr->dr_status = puterrno(error);
1987 
1988 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1989 		"rfs_create_end:(%S)", "done");
1990 }
1991 void *
1992 rfs_create_getfh(struct nfscreatargs *args)
1993 {
1994 	return (args->ca_da.da_fhandle);
1995 }
1996 
1997 /*
1998  * Remove a file.
1999  * Remove named file from parent directory.
2000  */
2001 void
2002 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2003 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2004 {
2005 	int error = 0;
2006 	vnode_t *vp;
2007 	vnode_t *targvp;
2008 	int in_crit = 0;
2009 
2010 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
2011 		"rfs_remove_start:");
2012 
2013 	/*
2014 	 * Disallow NULL paths
2015 	 */
2016 	if (da->da_name == NULL || *da->da_name == '\0') {
2017 		*status = NFSERR_ACCES;
2018 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2019 			"rfs_remove_end:(%S)", "access");
2020 		return;
2021 	}
2022 
2023 	vp = nfs_fhtovp(da->da_fhandle, exi);
2024 	if (vp == NULL) {
2025 		*status = NFSERR_STALE;
2026 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2027 			"rfs_remove_end:(%S)", "stale");
2028 		return;
2029 	}
2030 
2031 	if (rdonly(exi, req)) {
2032 		VN_RELE(vp);
2033 		*status = NFSERR_ROFS;
2034 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2035 			"rfs_remove_end:(%S)", "rofs");
2036 		return;
2037 	}
2038 
2039 	/*
2040 	 * Check for a conflict with a non-blocking mandatory share reservation.
2041 	 */
2042 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2043 			NULL, cr);
2044 	if (error != 0) {
2045 		VN_RELE(vp);
2046 		*status = puterrno(error);
2047 		return;
2048 	}
2049 
2050 	/*
2051 	 * If the file is delegated to an v4 client, then initiate
2052 	 * recall and drop this request (by setting T_WOULDBLOCK).
2053 	 * The client will eventually re-transmit the request and
2054 	 * (hopefully), by then, the v4 client will have returned
2055 	 * the delegation.
2056 	 */
2057 
2058 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2059 		VN_RELE(vp);
2060 		VN_RELE(targvp);
2061 		curthread->t_flag |= T_WOULDBLOCK;
2062 		return;
2063 	}
2064 
2065 	if (nbl_need_check(targvp)) {
2066 		nbl_start_crit(targvp, RW_READER);
2067 		in_crit = 1;
2068 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
2069 			error = EACCES;
2070 			goto out;
2071 		}
2072 	}
2073 
2074 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
2075 		"vop_remove_start:");
2076 	error = VOP_REMOVE(vp, da->da_name, cr);
2077 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
2078 		"vop_remove_end:");
2079 
2080 	/*
2081 	 * Force modified data and metadata out to stable storage.
2082 	 */
2083 	(void) VOP_FSYNC(vp, 0, cr);
2084 
2085 out:
2086 	if (in_crit)
2087 		nbl_end_crit(targvp);
2088 	VN_RELE(targvp);
2089 	VN_RELE(vp);
2090 
2091 	*status = puterrno(error);
2092 
2093 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2094 		"rfs_remove_end:(%S)", "done");
2095 }
2096 
2097 void *
2098 rfs_remove_getfh(struct nfsdiropargs *da)
2099 {
2100 	return (da->da_fhandle);
2101 }
2102 
2103 /*
2104  * rename a file
2105  * Give a file (from) a new name (to).
2106  */
2107 void
2108 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2109 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2110 {
2111 	int error = 0;
2112 	vnode_t *fromvp;
2113 	vnode_t *tovp;
2114 	struct exportinfo *to_exi;
2115 	fhandle_t *fh;
2116 	vnode_t *srcvp;
2117 	vnode_t *targvp;
2118 	int in_crit = 0;
2119 
2120 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
2121 		"rfs_rename_start:");
2122 
2123 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2124 	if (fromvp == NULL) {
2125 		*status = NFSERR_STALE;
2126 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2127 			"rfs_rename_end:(%S)", "from stale");
2128 		return;
2129 	}
2130 
2131 	fh = args->rna_to.da_fhandle;
2132 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2133 	if (to_exi == NULL) {
2134 		VN_RELE(fromvp);
2135 		*status = NFSERR_ACCES;
2136 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2137 			"rfs_rename_end:(%S)", "cross device");
2138 		return;
2139 	}
2140 	exi_rele(to_exi);
2141 
2142 	if (to_exi != exi) {
2143 		VN_RELE(fromvp);
2144 		*status = NFSERR_XDEV;
2145 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2146 			"rfs_rename_end:(%S)", "from stale");
2147 		return;
2148 	}
2149 
2150 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2151 	if (tovp == NULL) {
2152 		VN_RELE(fromvp);
2153 		*status = NFSERR_STALE;
2154 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2155 			"rfs_rename_end:(%S)", "to stale");
2156 		return;
2157 	}
2158 
2159 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2160 		VN_RELE(tovp);
2161 		VN_RELE(fromvp);
2162 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2163 			"rfs_rename_end:(%S)", "not dir");
2164 		*status = NFSERR_NOTDIR;
2165 		return;
2166 	}
2167 
2168 	/*
2169 	 * Disallow NULL paths
2170 	 */
2171 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2172 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2173 		VN_RELE(tovp);
2174 		VN_RELE(fromvp);
2175 		*status = NFSERR_ACCES;
2176 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2177 			"rfs_rename_end:(%S)", "access");
2178 		return;
2179 	}
2180 
2181 	if (rdonly(exi, req)) {
2182 		VN_RELE(tovp);
2183 		VN_RELE(fromvp);
2184 		*status = NFSERR_ROFS;
2185 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2186 			"rfs_rename_end:(%S)", "rofs");
2187 		return;
2188 	}
2189 
2190 	/*
2191 	 * Check for a conflict with a non-blocking mandatory share reservation.
2192 	 */
2193 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2194 			NULL, cr);
2195 	if (error != 0) {
2196 		VN_RELE(tovp);
2197 		VN_RELE(fromvp);
2198 		*status = puterrno(error);
2199 		return;
2200 	}
2201 
2202 	/* Check for delegations on the source file */
2203 
2204 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2205 		VN_RELE(tovp);
2206 		VN_RELE(fromvp);
2207 		VN_RELE(srcvp);
2208 		curthread->t_flag |= T_WOULDBLOCK;
2209 		return;
2210 	}
2211 
2212 	/* Check for delegation on the file being renamed over, if it exists */
2213 
2214 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2215 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
2216 	    == 0) {
2217 
2218 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2219 			VN_RELE(tovp);
2220 			VN_RELE(fromvp);
2221 			VN_RELE(srcvp);
2222 			VN_RELE(targvp);
2223 			curthread->t_flag |= T_WOULDBLOCK;
2224 			return;
2225 		}
2226 		VN_RELE(targvp);
2227 	}
2228 
2229 
2230 	if (nbl_need_check(srcvp)) {
2231 		nbl_start_crit(srcvp, RW_READER);
2232 		in_crit = 1;
2233 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
2234 			error = EACCES;
2235 			goto out;
2236 		}
2237 	}
2238 
2239 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
2240 		"vop_rename_start:");
2241 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2242 	    tovp, args->rna_to.da_name, cr);
2243 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
2244 		"vop_rename_end:");
2245 
2246 	if (error == 0) {
2247 		char *tmp;
2248 
2249 		/* fix the path name for the renamed file */
2250 		mutex_enter(&srcvp->v_lock);
2251 		tmp = srcvp->v_path;
2252 		srcvp->v_path = NULL;
2253 		mutex_exit(&srcvp->v_lock);
2254 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2255 				strlen(args->rna_to.da_name));
2256 		if (tmp != NULL)
2257 			kmem_free(tmp, strlen(tmp) + 1);
2258 	}
2259 
2260 	/*
2261 	 * Force modified data and metadata out to stable storage.
2262 	 */
2263 	(void) VOP_FSYNC(tovp, 0, cr);
2264 	(void) VOP_FSYNC(fromvp, 0, cr);
2265 
2266 out:
2267 	if (in_crit)
2268 		nbl_end_crit(srcvp);
2269 	VN_RELE(srcvp);
2270 	VN_RELE(tovp);
2271 	VN_RELE(fromvp);
2272 
2273 	*status = puterrno(error);
2274 
2275 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2276 		"rfs_rename_end:(%S)", "done");
2277 }
2278 void *
2279 rfs_rename_getfh(struct nfsrnmargs *args)
2280 {
2281 	return (args->rna_from.da_fhandle);
2282 }
2283 
2284 /*
2285  * Link to a file.
2286  * Create a file (to) which is a hard link to the given file (from).
2287  */
2288 void
2289 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2290 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2291 {
2292 	int error;
2293 	vnode_t *fromvp;
2294 	vnode_t *tovp;
2295 	struct exportinfo *to_exi;
2296 	fhandle_t *fh;
2297 
2298 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
2299 		"rfs_link_start:");
2300 
2301 	fromvp = nfs_fhtovp(args->la_from, exi);
2302 	if (fromvp == NULL) {
2303 		*status = NFSERR_STALE;
2304 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2305 			"rfs_link_end:(%S)", "from stale");
2306 		return;
2307 	}
2308 
2309 	fh = args->la_to.da_fhandle;
2310 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2311 	if (to_exi == NULL) {
2312 		VN_RELE(fromvp);
2313 		*status = NFSERR_ACCES;
2314 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2315 			"rfs_link_end:(%S)", "cross device");
2316 		return;
2317 	}
2318 	exi_rele(to_exi);
2319 
2320 	if (to_exi != exi) {
2321 		VN_RELE(fromvp);
2322 		*status = NFSERR_XDEV;
2323 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2324 			"rfs_link_end:(%S)", "cross device");
2325 		return;
2326 	}
2327 
2328 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2329 	if (tovp == NULL) {
2330 		VN_RELE(fromvp);
2331 		*status = NFSERR_STALE;
2332 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2333 			"rfs_link_end:(%S)", "to stale");
2334 		return;
2335 	}
2336 
2337 	if (tovp->v_type != VDIR) {
2338 		VN_RELE(tovp);
2339 		VN_RELE(fromvp);
2340 		*status = NFSERR_NOTDIR;
2341 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2342 			"rfs_link_end:(%S)", "not dir");
2343 		return;
2344 	}
2345 	/*
2346 	 * Disallow NULL paths
2347 	 */
2348 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2349 		VN_RELE(tovp);
2350 		VN_RELE(fromvp);
2351 		*status = NFSERR_ACCES;
2352 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2353 			"rfs_link_end:(%S)", "access");
2354 		return;
2355 	}
2356 
2357 	if (rdonly(exi, req)) {
2358 		VN_RELE(tovp);
2359 		VN_RELE(fromvp);
2360 		*status = NFSERR_ROFS;
2361 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2362 			"rfs_link_end:(%S)", "rofs");
2363 		return;
2364 	}
2365 
2366 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
2367 		"vop_link_start:");
2368 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2369 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
2370 		"vop_link_end:");
2371 
2372 	/*
2373 	 * Force modified data and metadata out to stable storage.
2374 	 */
2375 	(void) VOP_FSYNC(tovp, 0, cr);
2376 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2377 
2378 	VN_RELE(tovp);
2379 	VN_RELE(fromvp);
2380 
2381 	*status = puterrno(error);
2382 
2383 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2384 		"rfs_link_end:(%S)", "done");
2385 }
2386 void *
2387 rfs_link_getfh(struct nfslinkargs *args)
2388 {
2389 	return (args->la_from);
2390 }
2391 
2392 /*
2393  * Symbolicly link to a file.
2394  * Create a file (to) with the given attributes which is a symbolic link
2395  * to the given path name (to).
2396  */
2397 void
2398 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2399 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2400 {
2401 	int error;
2402 	struct vattr va;
2403 	vnode_t *vp;
2404 	vnode_t *svp;
2405 	int lerror;
2406 
2407 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
2408 		"rfs_symlink_start:");
2409 
2410 	/*
2411 	 * Disallow NULL paths
2412 	 */
2413 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2414 		*status = NFSERR_ACCES;
2415 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2416 			"rfs_symlink_end:(%S)", "access");
2417 		return;
2418 	}
2419 
2420 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2421 	if (vp == NULL) {
2422 		*status = NFSERR_STALE;
2423 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2424 			"rfs_symlink_end:(%S)", "stale");
2425 		return;
2426 	}
2427 
2428 	if (rdonly(exi, req)) {
2429 		VN_RELE(vp);
2430 		*status = NFSERR_ROFS;
2431 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2432 			"rfs_symlink_end:(%S)", "rofs");
2433 		return;
2434 	}
2435 
2436 	error = sattr_to_vattr(args->sla_sa, &va);
2437 	if (error) {
2438 		VN_RELE(vp);
2439 		*status = puterrno(error);
2440 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2441 			"rfs_symlink_end:(%S)", "sattr");
2442 		return;
2443 	}
2444 
2445 	if (!(va.va_mask & AT_MODE)) {
2446 		VN_RELE(vp);
2447 		*status = NFSERR_INVAL;
2448 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2449 			"rfs_symlink_end:(%S)", "no mode");
2450 		return;
2451 	}
2452 
2453 	va.va_type = VLNK;
2454 	va.va_mask |= AT_TYPE;
2455 
2456 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
2457 		"vop_symlink_start:");
2458 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
2459 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
2460 		"vop_symlink_end:");
2461 
2462 	/*
2463 	 * Force new data and metadata out to stable storage.
2464 	 */
2465 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
2466 		"vop_lookup_start:");
2467 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2468 	    0, NULL, cr);
2469 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
2470 		"vop_lookup_end:");
2471 	if (!lerror) {
2472 		(void) VOP_FSYNC(svp, 0, cr);
2473 		VN_RELE(svp);
2474 	}
2475 
2476 	/*
2477 	 * Force modified data and metadata out to stable storage.
2478 	 */
2479 	(void) VOP_FSYNC(vp, 0, cr);
2480 
2481 	VN_RELE(vp);
2482 
2483 	*status = puterrno(error);
2484 
2485 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2486 		"rfs_symlink_end:(%S)", "done");
2487 }
2488 void *
2489 rfs_symlink_getfh(struct nfsslargs *args)
2490 {
2491 	return (args->sla_from.da_fhandle);
2492 }
2493 
2494 /*
2495  * Make a directory.
2496  * Create a directory with the given name, parent directory, and attributes.
2497  * Returns a file handle and attributes for the new directory.
2498  */
2499 void
2500 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2501 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2502 {
2503 	int error;
2504 	struct vattr va;
2505 	vnode_t *dvp = NULL;
2506 	vnode_t *vp;
2507 	char *name = args->ca_da.da_name;
2508 
2509 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
2510 		"rfs_mkdir_start:");
2511 
2512 	/*
2513 	 * Disallow NULL paths
2514 	 */
2515 	if (name == NULL || *name == '\0') {
2516 		dr->dr_status = NFSERR_ACCES;
2517 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2518 			"rfs_mkdir_end:(%S)", "access");
2519 		return;
2520 	}
2521 
2522 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2523 	if (vp == NULL) {
2524 		dr->dr_status = NFSERR_STALE;
2525 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2526 			"rfs_mkdir_end:(%S)", "stale");
2527 		return;
2528 	}
2529 
2530 	if (rdonly(exi, req)) {
2531 		VN_RELE(vp);
2532 		dr->dr_status = NFSERR_ROFS;
2533 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2534 			"rfs_mkdir_end:(%S)", "rofs");
2535 		return;
2536 	}
2537 
2538 	error = sattr_to_vattr(args->ca_sa, &va);
2539 	if (error) {
2540 		VN_RELE(vp);
2541 		dr->dr_status = puterrno(error);
2542 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2543 			"rfs_mkdir_end:(%S)", "sattr");
2544 		return;
2545 	}
2546 
2547 	if (!(va.va_mask & AT_MODE)) {
2548 		VN_RELE(vp);
2549 		dr->dr_status = NFSERR_INVAL;
2550 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2551 			"rfs_mkdir_end:(%S)", "no mode");
2552 		return;
2553 	}
2554 
2555 	va.va_type = VDIR;
2556 	va.va_mask |= AT_TYPE;
2557 
2558 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
2559 		"vop_mkdir_start:");
2560 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2561 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
2562 		"vop_mkdir_end:");
2563 
2564 	if (!error) {
2565 		/*
2566 		 * Attribtutes of the newly created directory should
2567 		 * be returned to the client.
2568 		 */
2569 		va.va_mask = AT_ALL; /* We want everything */
2570 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
2571 			"vop_getattr_start:");
2572 		error = VOP_GETATTR(dvp, &va, 0, cr);
2573 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
2574 			"vop_getattr_end:");
2575 		/* check for overflows */
2576 		if (!error) {
2577 			acl_perm(vp, exi, &va, cr);
2578 			error = vattr_to_nattr(&va, &dr->dr_attr);
2579 			if (!error) {
2580 				error = makefh(&dr->dr_fhandle, dvp, exi);
2581 			}
2582 		}
2583 		/*
2584 		 * Force new data and metadata out to stable storage.
2585 		 */
2586 		(void) VOP_FSYNC(dvp, 0, cr);
2587 		VN_RELE(dvp);
2588 	}
2589 
2590 	/*
2591 	 * Force modified data and metadata out to stable storage.
2592 	 */
2593 	(void) VOP_FSYNC(vp, 0, cr);
2594 
2595 	VN_RELE(vp);
2596 
2597 	dr->dr_status = puterrno(error);
2598 
2599 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2600 		"rfs_mkdir_end:(%S)", "done");
2601 }
2602 void *
2603 rfs_mkdir_getfh(struct nfscreatargs *args)
2604 {
2605 	return (args->ca_da.da_fhandle);
2606 }
2607 
2608 /*
2609  * Remove a directory.
2610  * Remove the given directory name from the given parent directory.
2611  */
2612 void
2613 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2614 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2615 {
2616 	int error;
2617 	vnode_t *vp;
2618 
2619 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
2620 		"rfs_rmdir_start:");
2621 
2622 	/*
2623 	 * Disallow NULL paths
2624 	 */
2625 	if (da->da_name == NULL || *da->da_name == '\0') {
2626 		*status = NFSERR_ACCES;
2627 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2628 			"rfs_rmdir_end:(%S)", "access");
2629 		return;
2630 	}
2631 
2632 	vp = nfs_fhtovp(da->da_fhandle, exi);
2633 	if (vp == NULL) {
2634 		*status = NFSERR_STALE;
2635 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2636 			"rfs_rmdir_end:(%S)", "stale");
2637 		return;
2638 	}
2639 
2640 	if (rdonly(exi, req)) {
2641 		VN_RELE(vp);
2642 		*status = NFSERR_ROFS;
2643 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2644 			"rfs_rmdir_end:(%S)", "rofs");
2645 		return;
2646 	}
2647 
2648 	/*
2649 	 * VOP_RMDIR now takes a new third argument (the current
2650 	 * directory of the process).  That's because someone
2651 	 * wants to return EINVAL if one tries to remove ".".
2652 	 * Of course, NFS servers have no idea what their
2653 	 * clients' current directories are.  We fake it by
2654 	 * supplying a vnode known to exist and illegal to
2655 	 * remove.
2656 	 */
2657 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
2658 		"vop_rmdir_start:");
2659 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
2660 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
2661 		"vop_rmdir_end:");
2662 
2663 	/*
2664 	 * Force modified data and metadata out to stable storage.
2665 	 */
2666 	(void) VOP_FSYNC(vp, 0, cr);
2667 
2668 	VN_RELE(vp);
2669 
2670 	/*
2671 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2672 	 * if the directory is not empty.  A System V NFS server
2673 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2674 	 * over the wire.
2675 	 */
2676 	if (error == EEXIST)
2677 		*status = NFSERR_NOTEMPTY;
2678 	else
2679 		*status = puterrno(error);
2680 
2681 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2682 		"rfs_rmdir_end:(%S)", "done");
2683 }
2684 void *
2685 rfs_rmdir_getfh(struct nfsdiropargs *da)
2686 {
2687 	return (da->da_fhandle);
2688 }
2689 
2690 /* ARGSUSED */
2691 void
2692 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2693 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2694 {
2695 	int error;
2696 	int iseof;
2697 	struct iovec iov;
2698 	struct uio uio;
2699 	vnode_t *vp;
2700 
2701 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
2702 		"rfs_readdir_start:");
2703 
2704 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2705 	if (vp == NULL) {
2706 		rd->rd_entries = NULL;
2707 		rd->rd_status = NFSERR_STALE;
2708 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2709 			"rfs_readdir_end:(%S)", "stale");
2710 		return;
2711 	}
2712 
2713 	if (vp->v_type != VDIR) {
2714 		VN_RELE(vp);
2715 		rd->rd_entries = NULL;
2716 		rd->rd_status = NFSERR_NOTDIR;
2717 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2718 			"rfs_readdir_end:(%S)", "notdir");
2719 		return;
2720 	}
2721 
2722 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
2723 		"vop_rwlock_start:");
2724 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2725 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
2726 		"vop_rwlock_end:");
2727 
2728 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
2729 		"vop_access_start:");
2730 	error = VOP_ACCESS(vp, VREAD, 0, cr);
2731 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
2732 		"vop_access_end:");
2733 	if (error) {
2734 		rd->rd_entries = NULL;
2735 		goto bad;
2736 	}
2737 
2738 	if (rda->rda_count == 0) {
2739 		rd->rd_entries = NULL;
2740 		rd->rd_size = 0;
2741 		rd->rd_eof = FALSE;
2742 		goto bad;
2743 	}
2744 
2745 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2746 
2747 	/*
2748 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2749 	 */
2750 	rd->rd_bufsize = (uint_t)rda->rda_count;
2751 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2752 
2753 	/*
2754 	 * Set up io vector to read directory data
2755 	 */
2756 	iov.iov_base = (caddr_t)rd->rd_entries;
2757 	iov.iov_len = rda->rda_count;
2758 	uio.uio_iov = &iov;
2759 	uio.uio_iovcnt = 1;
2760 	uio.uio_segflg = UIO_SYSSPACE;
2761 	uio.uio_extflg = UIO_COPY_CACHED;
2762 	uio.uio_loffset = (offset_t)rda->rda_offset;
2763 	uio.uio_resid = rda->rda_count;
2764 
2765 	/*
2766 	 * read directory
2767 	 */
2768 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
2769 		"vop_readdir_start:");
2770 	error = VOP_READDIR(vp, &uio, cr, &iseof);
2771 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
2772 		"vop_readdir_end:");
2773 
2774 	/*
2775 	 * Clean up
2776 	 */
2777 	if (!error) {
2778 		/*
2779 		 * set size and eof
2780 		 */
2781 		if (uio.uio_resid == rda->rda_count) {
2782 			rd->rd_size = 0;
2783 			rd->rd_eof = TRUE;
2784 		} else {
2785 			rd->rd_size = (uint32_t)(rda->rda_count -
2786 			    uio.uio_resid);
2787 			rd->rd_eof = iseof ? TRUE : FALSE;
2788 		}
2789 	}
2790 
2791 bad:
2792 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
2793 		"vop_rwunlock_start:");
2794 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2795 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
2796 		"vop_rwunlock_end:");
2797 
2798 #if 0 /* notyet */
2799 	/*
2800 	 * Don't do this.  It causes local disk writes when just
2801 	 * reading the file and the overhead is deemed larger
2802 	 * than the benefit.
2803 	 */
2804 	/*
2805 	 * Force modified metadata out to stable storage.
2806 	 */
2807 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
2808 #endif
2809 
2810 	VN_RELE(vp);
2811 
2812 	rd->rd_status = puterrno(error);
2813 
2814 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2815 		"rfs_readdir_end:(%S)", "done");
2816 }
2817 void *
2818 rfs_readdir_getfh(struct nfsrddirargs *rda)
2819 {
2820 	return (&rda->rda_fh);
2821 }
2822 void
2823 rfs_rddirfree(struct nfsrddirres *rd)
2824 {
2825 	if (rd->rd_entries != NULL)
2826 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2827 }
2828 
2829 /* ARGSUSED */
2830 void
2831 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2832 	struct svc_req *req, cred_t *cr)
2833 {
2834 	int error;
2835 	struct statvfs64 sb;
2836 	vnode_t *vp;
2837 
2838 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
2839 		"rfs_statfs_start:");
2840 
2841 	vp = nfs_fhtovp(fh, exi);
2842 	if (vp == NULL) {
2843 		fs->fs_status = NFSERR_STALE;
2844 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2845 			"rfs_statfs_end:(%S)", "stale");
2846 		return;
2847 	}
2848 
2849 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2850 
2851 	if (!error) {
2852 		fs->fs_tsize = nfstsize();
2853 		fs->fs_bsize = sb.f_frsize;
2854 		fs->fs_blocks = sb.f_blocks;
2855 		fs->fs_bfree = sb.f_bfree;
2856 		fs->fs_bavail = sb.f_bavail;
2857 	}
2858 
2859 	VN_RELE(vp);
2860 
2861 	fs->fs_status = puterrno(error);
2862 
2863 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2864 		"rfs_statfs_end:(%S)", "done");
2865 }
2866 void *
2867 rfs_statfs_getfh(fhandle_t *fh)
2868 {
2869 	return (fh);
2870 }
2871 
2872 static int
2873 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2874 {
2875 	vap->va_mask = 0;
2876 
2877 	/*
2878 	 * There was a sign extension bug in some VFS based systems
2879 	 * which stored the mode as a short.  When it would get
2880 	 * assigned to a u_long, no sign extension would occur.
2881 	 * It needed to, but this wasn't noticed because sa_mode
2882 	 * would then get assigned back to the short, thus ignoring
2883 	 * the upper 16 bits of sa_mode.
2884 	 *
2885 	 * To make this implementation work for both broken
2886 	 * clients and good clients, we check for both versions
2887 	 * of the mode.
2888 	 */
2889 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2890 	    sa->sa_mode != (uint32_t)-1) {
2891 		vap->va_mask |= AT_MODE;
2892 		vap->va_mode = sa->sa_mode;
2893 	}
2894 	if (sa->sa_uid != (uint32_t)-1) {
2895 		vap->va_mask |= AT_UID;
2896 		vap->va_uid = sa->sa_uid;
2897 	}
2898 	if (sa->sa_gid != (uint32_t)-1) {
2899 		vap->va_mask |= AT_GID;
2900 		vap->va_gid = sa->sa_gid;
2901 	}
2902 	if (sa->sa_size != (uint32_t)-1) {
2903 		vap->va_mask |= AT_SIZE;
2904 		vap->va_size = sa->sa_size;
2905 	}
2906 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2907 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2908 #ifndef _LP64
2909 		/* return error if time overflow */
2910 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2911 			return (EOVERFLOW);
2912 #endif
2913 		vap->va_mask |= AT_ATIME;
2914 		/*
2915 		 * nfs protocol defines times as unsigned so don't extend sign,
2916 		 * unless sysadmin set nfs_allow_preepoch_time.
2917 		 */
2918 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2919 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2920 	}
2921 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2922 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2923 #ifndef _LP64
2924 		/* return error if time overflow */
2925 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2926 			return (EOVERFLOW);
2927 #endif
2928 		vap->va_mask |= AT_MTIME;
2929 		/*
2930 		 * nfs protocol defines times as unsigned so don't extend sign,
2931 		 * unless sysadmin set nfs_allow_preepoch_time.
2932 		 */
2933 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2934 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2935 	}
2936 	return (0);
2937 }
2938 
/*
 * Table translating a vnode type into the over-the-wire NFS file type;
 * vattr_to_nattr() indexes it directly with va_type.  Slots that do not
 * name an NF* type hold 0.
 *
 * NOTE(review): the slots presumably follow the declaration order of
 * the vnode type enumeration, starting at VNON -- confirm against its
 * definition.  Also confirm that these 11 entries cover every value up
 * to VBAD, which the ASSERT in vattr_to_nattr() permits as an index.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2942 
2943 /*
2944  * check the following fields for overflow: nodeid, size, and time.
2945  * There could be a problem when converting 64-bit LP64 fields
2946  * into 32-bit ones.  Return an error if there is an overflow.
2947  */
2948 int
2949 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2950 {
2951 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2952 	na->na_type = vt_to_nf[vap->va_type];
2953 
2954 	if (vap->va_mode == (unsigned short) -1)
2955 		na->na_mode = (uint32_t)-1;
2956 	else
2957 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2958 
2959 	if (vap->va_uid == (unsigned short)(-1))
2960 		na->na_uid = (uint32_t)(-1);
2961 	else if (vap->va_uid == UID_NOBODY)
2962 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2963 	else
2964 		na->na_uid = vap->va_uid;
2965 
2966 	if (vap->va_gid == (unsigned short)(-1))
2967 		na->na_gid = (uint32_t)-1;
2968 	else if (vap->va_gid == GID_NOBODY)
2969 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2970 	else
2971 		na->na_gid = vap->va_gid;
2972 
2973 	/*
2974 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2975 	 * vattr, but are bigger than 32 bit values supported?
2976 	 */
2977 	na->na_fsid = vap->va_fsid;
2978 
2979 	na->na_nodeid = vap->va_nodeid;
2980 
2981 	/*
2982 	 * Check to make sure that the nodeid is representable over the
2983 	 * wire without losing bits.
2984 	 */
2985 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2986 		return (EFBIG);
2987 	na->na_nlink = vap->va_nlink;
2988 
2989 	/*
2990 	 * Check for big files here, instead of at the caller.  See
2991 	 * comments in cstat for large special file explanation.
2992 	 */
2993 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2994 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2995 			return (EFBIG);
2996 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2997 			/* UNKNOWN_SIZE | OVERFLOW */
2998 			na->na_size = MAXOFF32_T;
2999 		} else
3000 			na->na_size = vap->va_size;
3001 	} else
3002 		na->na_size = vap->va_size;
3003 
3004 	/*
3005 	 * If the vnode times overflow the 32-bit times that NFS2
3006 	 * uses on the wire then return an error.
3007 	 */
3008 	if (!NFS_VAP_TIME_OK(vap)) {
3009 		return (EOVERFLOW);
3010 	}
3011 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
3012 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3013 
3014 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3015 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3016 
3017 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3018 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3019 
3020 	/*
3021 	 * If the dev_t will fit into 16 bits then compress
3022 	 * it, otherwise leave it alone. See comments in
3023 	 * nfs_client.c.
3024 	 */
3025 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3026 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3027 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3028 	else
3029 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
3030 
3031 	na->na_blocks = vap->va_nblocks;
3032 	na->na_blocksize = vap->va_blksize;
3033 
3034 	/*
3035 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3036 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
3037 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3038 	 *
3039 	 * BUYER BEWARE:
3040 	 *  If you are porting the NFS to a non-Sun server, you probably
3041 	 *  don't want to include the following block of code.  The
3042 	 *  over-the-wire special file types will be changing with the
3043 	 *  NFS Protocol Revision.
3044 	 */
3045 	if (vap->va_type == VFIFO)
3046 		NA_SETFIFO(na);
3047 	return (0);
3048 }
3049 
3050 /*
3051  * acl v2 support: returns approximate permission.
3052  *	default: returns minimal permission (more restrictive)
3053  *	aclok: returns maximal permission (less restrictive)
3054  *	This routine changes the permissions that are alaredy in *va.
3055  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3056  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3057  */
3058 static void
3059 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3060 {
3061 	vsecattr_t	vsa;
3062 	int		aclcnt;
3063 	aclent_t	*aclentp;
3064 	mode_t		mask_perm;
3065 	mode_t		grp_perm;
3066 	mode_t		other_perm;
3067 	mode_t		other_orig;
3068 	int		error;
3069 
3070 	/* dont care default acl */
3071 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3072 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
3073 
3074 	if (!error) {
3075 		aclcnt = vsa.vsa_aclcnt;
3076 		if (aclcnt > MIN_ACL_ENTRIES) {
3077 			/* non-trivial ACL */
3078 			aclentp = vsa.vsa_aclentp;
3079 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3080 				/* maximal permissions */
3081 				grp_perm = 0;
3082 				other_perm = 0;
3083 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3084 					switch (aclentp->a_type) {
3085 					case USER_OBJ:
3086 						break;
3087 					case USER:
3088 						grp_perm |=
3089 						    aclentp->a_perm << 3;
3090 						other_perm |= aclentp->a_perm;
3091 						break;
3092 					case GROUP_OBJ:
3093 						grp_perm |=
3094 						    aclentp->a_perm << 3;
3095 						break;
3096 					case GROUP:
3097 						other_perm |= aclentp->a_perm;
3098 						break;
3099 					case OTHER_OBJ:
3100 						other_orig = aclentp->a_perm;
3101 						break;
3102 					case CLASS_OBJ:
3103 						mask_perm = aclentp->a_perm;
3104 						break;
3105 					default:
3106 						break;
3107 					}
3108 				}
3109 				grp_perm &= mask_perm << 3;
3110 				other_perm &= mask_perm;
3111 				other_perm |= other_orig;
3112 
3113 			} else {
3114 				/* minimal permissions */
3115 				grp_perm = 070;
3116 				other_perm = 07;
3117 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3118 					switch (aclentp->a_type) {
3119 					case USER_OBJ:
3120 						break;
3121 					case USER:
3122 					case CLASS_OBJ:
3123 						grp_perm &=
3124 						    aclentp->a_perm << 3;
3125 						other_perm &=
3126 						    aclentp->a_perm;
3127 						break;
3128 					case GROUP_OBJ:
3129 						grp_perm &=
3130 						    aclentp->a_perm << 3;
3131 						break;
3132 					case GROUP:
3133 						other_perm &=
3134 						    aclentp->a_perm;
3135 						break;
3136 					case OTHER_OBJ:
3137 						other_perm &=
3138 						    aclentp->a_perm;
3139 						break;
3140 					default:
3141 						break;
3142 					}
3143 				}
3144 			}
3145 			/* copy to va */
3146 			va->va_mode &= ~077;
3147 			va->va_mode |= grp_perm | other_perm;
3148 		}
3149 		if (vsa.vsa_aclcnt)
3150 			kmem_free(vsa.vsa_aclentp,
3151 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3152 	}
3153 }
3154 
/*
 * Server initialization: set up rfs_async_write_lock as a
 * MUTEX_DEFAULT mutex.
 */
void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
}
3160 
/*
 * Server teardown: destroy rfs_async_write_lock, the mutex set up by
 * rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
3166