xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 8e22821528b08c6dba4e8176351560f316f6d0de)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
81 /*
82  * Some "over the wire" UNIX file types.  These are encoded
83  * into the mode.  This needs to be fixed in the next rev.
84  */
85 #define	IFMT		0170000		/* type of file */
86 #define	IFCHR		0020000		/* character special */
87 #define	IFBLK		0060000		/* block special */
88 #define	IFSOCK		0140000		/* socket */
89 
90 /*
91  * Get file attributes.
92  * Returns the current attributes of the file with the given fhandle.
93  */
94 /* ARGSUSED */
95 void
96 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
97 	struct svc_req *req, cred_t *cr)
98 {
99 	int error;
100 	vnode_t *vp;
101 	struct vattr va;
102 
103 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
109 		    "rfs_getattr_end:(%S)", "stale");
110 		return;
111 	}
112 
113 	/*
114 	 * Do the getattr.
115 	 */
116 	va.va_mask = AT_ALL;	/* we want all the attributes */
117 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
118 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
119 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
120 
121 	/* check for overflows */
122 	if (!error) {
123 		acl_perm(vp, exi, &va, cr);
124 		error = vattr_to_nattr(&va, &ns->ns_attr);
125 	}
126 
127 	VN_RELE(vp);
128 
129 	ns->ns_status = puterrno(error);
130 
131 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
132 }
133 void *
134 rfs_getattr_getfh(fhandle_t *fhp)
135 {
136 	return (fhp);
137 }
138 
139 /*
140  * Set file attributes.
141  * Sets the attributes of the file with the given fhandle.  Returns
142  * the new attributes.
143  */
144 void
145 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
146 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
147 {
148 	int error;
149 	int flag;
150 	int in_crit = 0;
151 	vnode_t *vp;
152 	struct vattr va;
153 	struct vattr bva;
154 	struct flock64 bf;
155 
156 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");
157 
158 	vp = nfs_fhtovp(&args->saa_fh, exi);
159 	if (vp == NULL) {
160 		ns->ns_status = NFSERR_STALE;
161 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
162 		    "rfs_setattr_end:(%S)", "stale");
163 		return;
164 	}
165 
166 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
167 		VN_RELE(vp);
168 		ns->ns_status = NFSERR_ROFS;
169 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
170 		    "rfs_setattr_end:(%S)", "rofs");
171 		return;
172 	}
173 
174 	error = sattr_to_vattr(&args->saa_sa, &va);
175 	if (error) {
176 		VN_RELE(vp);
177 		ns->ns_status = puterrno(error);
178 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
179 		    "rfs_setattr_end:(%S)", "sattr");
180 		return;
181 	}
182 
183 	/*
184 	 * If the client is requesting a change to the mtime,
185 	 * but the nanosecond field is set to 1 billion, then
186 	 * this is a flag to the server that it should set the
187 	 * atime and mtime fields to the server's current time.
188 	 * The 1 billion number actually came from the client
189 	 * as 1 million, but the units in the over the wire
190 	 * request are microseconds instead of nanoseconds.
191 	 *
192 	 * This is an overload of the protocol and should be
193 	 * documented in the NFS Version 2 protocol specification.
194 	 */
195 	if (va.va_mask & AT_MTIME) {
196 		if (va.va_mtime.tv_nsec == 1000000000) {
197 			gethrestime(&va.va_mtime);
198 			va.va_atime = va.va_mtime;
199 			va.va_mask |= AT_ATIME;
200 			flag = 0;
201 		} else
202 			flag = ATTR_UTIME;
203 	} else
204 		flag = 0;
205 
206 	/*
207 	 * If the filesystem is exported with nosuid, then mask off
208 	 * the setuid and setgid bits.
209 	 */
210 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
211 	    (exi->exi_export.ex_flags & EX_NOSUID))
212 		va.va_mode &= ~(VSUID | VSGID);
213 
214 	/*
215 	 * We need to specially handle size changes because it is
216 	 * possible for the client to create a file with modes
217 	 * which indicate read-only, but with the file opened for
218 	 * writing.  If the client then tries to set the size of
219 	 * the file, then the normal access checking done in
220 	 * VOP_SETATTR would prevent the client from doing so,
221 	 * although it should be legal for it to do so.  To get
222 	 * around this, we do the access checking for ourselves
223 	 * and then use VOP_SPACE which doesn't do the access
224 	 * checking which VOP_SETATTR does. VOP_SPACE can only
225 	 * operate on VREG files, let VOP_SETATTR handle the other
226 	 * extremely rare cases.
227 	 * Also the client should not be allowed to change the
228 	 * size of the file if there is a conflicting non-blocking
229 	 * mandatory lock in the region of change.
230 	 *
231 	 * Also(2), check to see if the v4 side of the server has
232 	 * delegated this file.  If so, then we set T_WOULDBLOCK
233 	 * so that the dispatch function dosn't send a reply, forcing
234 	 * the client to retrasmit its request.
235 	 */
236 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
237 		/* If delegated, mark as wouldblock so response is dropped */
238 		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
239 			VN_RELE(vp);
240 			curthread->t_flag |= T_WOULDBLOCK;
241 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
242 			    "rfs_setattr_end:(%S)", "delegated");
243 			return;
244 		}
245 		if (nbl_need_check(vp)) {
246 			nbl_start_crit(vp, RW_READER);
247 			in_crit = 1;
248 		}
249 
250 		bva.va_mask = AT_UID | AT_SIZE;
251 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
252 		error = VOP_GETATTR(vp, &bva, 0, cr);
253 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
254 		if (error) {
255 			if (in_crit)
256 				nbl_end_crit(vp);
257 			VN_RELE(vp);
258 			ns->ns_status = puterrno(error);
259 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
260 			    "rfs_setattr_end:(%S)", "getattr");
261 			return;
262 		}
263 
264 		if (in_crit) {
265 			u_offset_t offset;
266 			ssize_t length;
267 
268 			if (va.va_size < bva.va_size) {
269 				offset = va.va_size;
270 				length = bva.va_size - va.va_size;
271 			} else {
272 				offset = bva.va_size;
273 				length = va.va_size - bva.va_size;
274 			}
275 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
276 				error = EACCES;
277 			}
278 		}
279 
280 		if (crgetuid(cr) == bva.va_uid && !error &&
281 		    va.va_size != bva.va_size) {
282 			va.va_mask &= ~AT_SIZE;
283 			bf.l_type = F_WRLCK;
284 			bf.l_whence = 0;
285 			bf.l_start = (off64_t)va.va_size;
286 			bf.l_len = 0;
287 			bf.l_sysid = 0;
288 			bf.l_pid = 0;
289 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
290 			    "vop_space_start:");
291 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
292 			    (offset_t)va.va_size, cr, NULL);
293 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
294 		}
295 		if (in_crit)
296 			nbl_end_crit(vp);
297 	} else
298 		error = 0;
299 
300 	/*
301 	 * Do the setattr.
302 	 */
303 	if (!error && va.va_mask) {
304 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
305 		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
306 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
307 	}
308 
309 	if (!error) {
310 		va.va_mask = AT_ALL;	/* get everything */
311 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
312 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
313 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
314 
315 		/* check for overflows */
316 		if (!error) {
317 			acl_perm(vp, exi, &va, cr);
318 			error = vattr_to_nattr(&va, &ns->ns_attr);
319 		}
320 	}
321 
322 	/*
323 	 * Force modified metadata out to stable storage.
324 	 */
325 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
326 
327 	VN_RELE(vp);
328 
329 	ns->ns_status = puterrno(error);
330 
331 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
332 }
333 void *
334 rfs_setattr_getfh(struct nfssaargs *args)
335 {
336 	return (&args->saa_fh);
337 }
338 
339 /*
340  * Directory lookup.
341  * Returns an fhandle and file attributes for file name in a directory.
342  */
343 /* ARGSUSED */
344 void
345 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
346 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
347 {
348 	int error;
349 	vnode_t *dvp;
350 	vnode_t *vp;
351 	struct vattr va;
352 	fhandle_t *fhp = da->da_fhandle;
353 	struct sec_ol sec = {0, 0};
354 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
355 
356 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");
357 
358 	/*
359 	 * Trusted Extension doesn't support NFSv2. MOUNT
360 	 * will reject v2 clients. Need to prevent v2 client
361 	 * access via WebNFS here.
362 	 */
363 	if (is_system_labeled() && req->rq_vers == 2) {
364 		dr->dr_status = NFSERR_ACCES;
365 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
366 		    "rfs_lookup_end:(%S)", "access");
367 		return;
368 	}
369 
370 	/*
371 	 * Disallow NULL paths
372 	 */
373 	if (da->da_name == NULL || *da->da_name == '\0') {
374 		dr->dr_status = NFSERR_ACCES;
375 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
376 		    "rfs_lookup_end:(%S)", "access");
377 		return;
378 	}
379 
380 	/*
381 	 * Allow lookups from the root - the default
382 	 * location of the public filehandle.
383 	 */
384 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
385 		dvp = rootdir;
386 		VN_HOLD(dvp);
387 	} else {
388 		dvp = nfs_fhtovp(fhp, exi);
389 		if (dvp == NULL) {
390 			dr->dr_status = NFSERR_STALE;
391 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
392 			    "rfs_lookup_end:(%S)", "stale");
393 			return;
394 		}
395 	}
396 
397 	/*
398 	 * Not allow lookup beyond root.
399 	 * If the filehandle matches a filehandle of the exi,
400 	 * then the ".." refers beyond the root of an exported filesystem.
401 	 */
402 	if (strcmp(da->da_name, "..") == 0 &&
403 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
404 		VN_RELE(dvp);
405 		dr->dr_status = NFSERR_NOENT;
406 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
407 		    "rfs_lookup_end:(%S)", "noent");
408 		return;
409 	}
410 
411 	/*
412 	 * If the public filehandle is used then allow
413 	 * a multi-component lookup, i.e. evaluate
414 	 * a pathname and follow symbolic links if
415 	 * necessary.
416 	 *
417 	 * This may result in a vnode in another filesystem
418 	 * which is OK as long as the filesystem is exported.
419 	 */
420 	if (PUBLIC_FH2(fhp)) {
421 		publicfh_flag = TRUE;
422 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
423 		    &sec);
424 	} else {
425 		/*
426 		 * Do a normal single component lookup.
427 		 */
428 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
429 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
430 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
431 	}
432 
433 	if (!error) {
434 		va.va_mask = AT_ALL;	/* we want everything */
435 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
436 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
437 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
438 		/* check for overflows */
439 		if (!error) {
440 			acl_perm(vp, exi, &va, cr);
441 			error = vattr_to_nattr(&va, &dr->dr_attr);
442 			if (!error) {
443 				if (sec.sec_flags & SEC_QUERY)
444 					error = makefh_ol(&dr->dr_fhandle, exi,
445 					    sec.sec_index);
446 				else {
447 					error = makefh(&dr->dr_fhandle, vp,
448 					    exi);
449 					if (!error && publicfh_flag &&
450 					    !chk_clnt_sec(exi, req))
451 						auth_weak = TRUE;
452 				}
453 			}
454 		}
455 		VN_RELE(vp);
456 	}
457 
458 	VN_RELE(dvp);
459 
460 	/*
461 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
462 	 * and have obtained a new exportinfo in exi which needs to be
463 	 * released. Note the the original exportinfo pointed to by exi
464 	 * will be released by the caller, comon_dispatch.
465 	 */
466 	if (publicfh_flag && exi != NULL)
467 		exi_rele(exi);
468 
469 	/*
470 	 * If it's public fh, no 0x81, and client's flavor is
471 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
472 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
473 	 */
474 	if (auth_weak)
475 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
476 	else
477 		dr->dr_status = puterrno(error);
478 
479 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
480 }
481 void *
482 rfs_lookup_getfh(struct nfsdiropargs *da)
483 {
484 	return (da->da_fhandle);
485 }
486 
487 /*
488  * Read symbolic link.
489  * Returns the string in the symbolic link at the given fhandle.
490  */
491 /* ARGSUSED */
492 void
493 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
494 	struct svc_req *req, cred_t *cr)
495 {
496 	int error;
497 	struct iovec iov;
498 	struct uio uio;
499 	vnode_t *vp;
500 	struct vattr va;
501 
502 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");
503 
504 	vp = nfs_fhtovp(fhp, exi);
505 	if (vp == NULL) {
506 		rl->rl_data = NULL;
507 		rl->rl_status = NFSERR_STALE;
508 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
509 		    "rfs_readlink_end:(%S)", "stale");
510 		return;
511 	}
512 
513 	va.va_mask = AT_MODE;
514 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
515 	error = VOP_GETATTR(vp, &va, 0, cr);
516 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
517 
518 	if (error) {
519 		VN_RELE(vp);
520 		rl->rl_data = NULL;
521 		rl->rl_status = puterrno(error);
522 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
523 		    "rfs_readlink_end:(%S)", "getattr error");
524 		return;
525 	}
526 
527 	if (MANDLOCK(vp, va.va_mode)) {
528 		VN_RELE(vp);
529 		rl->rl_data = NULL;
530 		rl->rl_status = NFSERR_ACCES;
531 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
532 		    "rfs_readlink_end:(%S)", "access");
533 		return;
534 	}
535 
536 	/*
537 	 * XNFS and RFC1094 require us to return ENXIO if argument
538 	 * is not a link. BUGID 1138002.
539 	 */
540 	if (vp->v_type != VLNK) {
541 		VN_RELE(vp);
542 		rl->rl_data = NULL;
543 		rl->rl_status = NFSERR_NXIO;
544 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
545 		    "rfs_readlink_end:(%S)", "nxio");
546 		return;
547 	}
548 
549 	/*
550 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
551 	 */
552 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
553 
554 	/*
555 	 * Set up io vector to read sym link data
556 	 */
557 	iov.iov_base = rl->rl_data;
558 	iov.iov_len = NFS_MAXPATHLEN;
559 	uio.uio_iov = &iov;
560 	uio.uio_iovcnt = 1;
561 	uio.uio_segflg = UIO_SYSSPACE;
562 	uio.uio_extflg = UIO_COPY_CACHED;
563 	uio.uio_loffset = (offset_t)0;
564 	uio.uio_resid = NFS_MAXPATHLEN;
565 
566 	/*
567 	 * Do the readlink.
568 	 */
569 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
570 	error = VOP_READLINK(vp, &uio, cr);
571 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");
572 
573 #if 0 /* notyet */
574 	/*
575 	 * Don't do this.  It causes local disk writes when just
576 	 * reading the file and the overhead is deemed larger
577 	 * than the benefit.
578 	 */
579 	/*
580 	 * Force modified metadata out to stable storage.
581 	 */
582 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
583 #endif
584 
585 	VN_RELE(vp);
586 
587 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
588 
589 	/*
590 	 * XNFS and RFC1094 require us to return ENXIO if argument
591 	 * is not a link. UFS returns EINVAL if this is the case,
592 	 * so we do the mapping here. BUGID 1138002.
593 	 */
594 	if (error == EINVAL)
595 		rl->rl_status = NFSERR_NXIO;
596 	else
597 		rl->rl_status = puterrno(error);
598 
599 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
600 	    "rfs_readlink_end:(%S)", "done");
601 }
602 void *
603 rfs_readlink_getfh(fhandle_t *fhp)
604 {
605 	return (fhp);
606 }
607 /*
608  * Free data allocated by rfs_readlink
609  */
610 void
611 rfs_rlfree(struct nfsrdlnres *rl)
612 {
613 	if (rl->rl_data != NULL)
614 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
615 }
616 
617 /*
618  * Read data.
619  * Returns some data read from the file at the given fhandle.
620  */
621 /* ARGSUSED */
622 void
623 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
624 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
625 {
626 	vnode_t *vp;
627 	int error;
628 	struct vattr va;
629 	struct iovec iov;
630 	struct uio uio;
631 	mblk_t *mp;
632 	int alloc_err = 0;
633 	int in_crit = 0;
634 
635 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");
636 
637 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
638 	if (vp == NULL) {
639 		rr->rr_data = NULL;
640 		rr->rr_status = NFSERR_STALE;
641 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
642 		    "rfs_read_end:(%S)", "stale");
643 		return;
644 	}
645 
646 	if (vp->v_type != VREG) {
647 		VN_RELE(vp);
648 		rr->rr_data = NULL;
649 		rr->rr_status = NFSERR_ISDIR;
650 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
651 		    "rfs_read_end:(%S)", "isdir");
652 		return;
653 	}
654 
655 	/*
656 	 * Check to see if the v4 side of the server has delegated
657 	 * this file.  If so, then we mark thread as wouldblock so
658 	 * the response is dropped.
659 	 */
660 	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
661 		VN_RELE(vp);
662 		curthread->t_flag |= T_WOULDBLOCK;
663 		rr->rr_data = NULL;
664 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
665 		    "rfs_read_end:(%S)", "delegated");
666 		return;
667 	}
668 
669 	/*
670 	 * Enter the critical region before calling VOP_RWLOCK
671 	 * to avoid a deadlock with write requests.
672 	 */
673 	if (nbl_need_check(vp)) {
674 		nbl_start_crit(vp, RW_READER);
675 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
676 		    0)) {
677 			nbl_end_crit(vp);
678 			VN_RELE(vp);
679 			rr->rr_data = NULL;
680 			rr->rr_status = NFSERR_ACCES;
681 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
682 			    "rfs_read_end:(%S)", " csf access error");
683 			return;
684 		}
685 		in_crit = 1;
686 	}
687 
688 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
689 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
690 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
691 
692 	va.va_mask = AT_ALL;
693 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
694 	error = VOP_GETATTR(vp, &va, 0, cr);
695 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
696 
697 	if (error) {
698 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
699 		    "vop_rwunlock_start:");
700 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
701 		if (in_crit)
702 			nbl_end_crit(vp);
703 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
704 		VN_RELE(vp);
705 		rr->rr_data = NULL;
706 		rr->rr_status = puterrno(error);
707 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
708 		    "rfs_read_end:(%S)", "getattr error");
709 		return;
710 	}
711 
712 	/*
713 	 * This is a kludge to allow reading of files created
714 	 * with no read permission.  The owner of the file
715 	 * is always allowed to read it.
716 	 */
717 	if (crgetuid(cr) != va.va_uid) {
718 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
719 		error = VOP_ACCESS(vp, VREAD, 0, cr);
720 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
721 		if (error) {
722 			/*
723 			 * Exec is the same as read over the net because
724 			 * of demand loading.
725 			 */
726 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
727 			    "vop_access_start:");
728 			error = VOP_ACCESS(vp, VEXEC, 0, cr);
729 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
730 			    "vop_access_end:");
731 		}
732 		if (error) {
733 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
734 			    "vop_rwunlock_start:");
735 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
736 			if (in_crit)
737 				nbl_end_crit(vp);
738 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
739 			    "vop_rwunlock_end:");
740 			VN_RELE(vp);
741 			rr->rr_data = NULL;
742 			rr->rr_status = puterrno(error);
743 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
744 			    "rfs_read_end:(%S)", "access error");
745 			return;
746 		}
747 	}
748 
749 	if (MANDLOCK(vp, va.va_mode)) {
750 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
751 		    "vop_rwunlock_start:");
752 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
753 		if (in_crit)
754 			nbl_end_crit(vp);
755 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
756 		VN_RELE(vp);
757 		rr->rr_data = NULL;
758 		rr->rr_status = NFSERR_ACCES;
759 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
760 		    "rfs_read_end:(%S)", "mand lock");
761 		return;
762 	}
763 
764 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
765 		rr->rr_count = 0;
766 		rr->rr_data = NULL;
767 		/*
768 		 * In this case, status is NFS_OK, but there is no data
769 		 * to encode. So set rr_mp to NULL.
770 		 */
771 		rr->rr_mp = NULL;
772 		goto done;
773 	}
774 
775 	/*
776 	 * mp will contain the data to be sent out in the read reply.
777 	 * This will be freed after the reply has been sent out (by the
778 	 * driver).
779 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
780 	 * that the call to xdrmblk_putmblk() never fails.
781 	 */
782 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
783 	    &alloc_err);
784 	ASSERT(mp != NULL);
785 	ASSERT(alloc_err == 0);
786 
787 	rr->rr_mp = mp;
788 
789 	/*
790 	 * Set up io vector
791 	 */
792 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
793 	iov.iov_len = ra->ra_count;
794 	uio.uio_iov = &iov;
795 	uio.uio_iovcnt = 1;
796 	uio.uio_segflg = UIO_SYSSPACE;
797 	uio.uio_extflg = UIO_COPY_CACHED;
798 	uio.uio_loffset = (offset_t)ra->ra_offset;
799 	uio.uio_resid = ra->ra_count;
800 
801 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
802 	error = VOP_READ(vp, &uio, 0, cr, NULL);
803 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");
804 
805 	if (error) {
806 		freeb(mp);
807 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
808 		    "vop_rwunlock_start:");
809 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
810 		if (in_crit)
811 			nbl_end_crit(vp);
812 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
813 		VN_RELE(vp);
814 		rr->rr_data = NULL;
815 		rr->rr_status = puterrno(error);
816 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
817 		    "rfs_read_end:(%S)", "read error");
818 		return;
819 	}
820 
821 	/*
822 	 * Get attributes again so we can send the latest access
823 	 * time to the client side for his cache.
824 	 */
825 	va.va_mask = AT_ALL;
826 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
827 	error = VOP_GETATTR(vp, &va, 0, cr);
828 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
829 	if (error) {
830 		freeb(mp);
831 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
832 		    "vop_rwunlock_start:");
833 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
834 		if (in_crit)
835 			nbl_end_crit(vp);
836 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
837 		    "vop_rwunlock_end:");
838 		VN_RELE(vp);
839 		rr->rr_data = NULL;
840 		rr->rr_status = puterrno(error);
841 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
842 		    "rfs_read_end:(%S)", "read error");
843 		return;
844 	}
845 
846 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
847 
848 	rr->rr_data = (char *)mp->b_datap->db_base;
849 
850 done:
851 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
852 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
853 	if (in_crit)
854 		nbl_end_crit(vp);
855 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
856 
857 	acl_perm(vp, exi, &va, cr);
858 
859 	/* check for overflows */
860 	error = vattr_to_nattr(&va, &rr->rr_attr);
861 
862 #if 0 /* notyet */
863 	/*
864 	 * Don't do this.  It causes local disk writes when just
865 	 * reading the file and the overhead is deemed larger
866 	 * than the benefit.
867 	 */
868 	/*
869 	 * Force modified metadata out to stable storage.
870 	 */
871 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
872 #endif
873 
874 	VN_RELE(vp);
875 
876 	rr->rr_status = puterrno(error);
877 
878 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
879 }
880 
881 /*
882  * Free data allocated by rfs_read
883  */
884 void
885 rfs_rdfree(struct nfsrdresult *rr)
886 {
887 	mblk_t *mp;
888 
889 	if (rr->rr_status == NFS_OK) {
890 		mp = rr->rr_mp;
891 		if (mp != NULL)
892 			freeb(mp);
893 	}
894 }
895 
896 void *
897 rfs_read_getfh(struct nfsreadargs *ra)
898 {
899 	return (&ra->ra_fhandle);
900 }
901 
902 #define	MAX_IOVECS	12
903 
904 #ifdef DEBUG
905 static int rfs_write_sync_hits = 0;
906 static int rfs_write_sync_misses = 0;
907 #endif
908 
909 /*
910  * Write data to file.
911  * Returns attributes of a file after writing some data to it.
912  *
913  * Any changes made here, especially in error handling might have
914  * to also be done in rfs_write (which clusters write requests).
915  */
916 void
917 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
918 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
919 {
920 	int error;
921 	vnode_t *vp;
922 	rlim64_t rlimit;
923 	struct vattr va;
924 	struct uio uio;
925 	struct iovec iov[MAX_IOVECS];
926 	mblk_t *m;
927 	struct iovec *iovp;
928 	int iovcnt;
929 	cred_t *savecred;
930 	int in_crit = 0;
931 
932 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");
933 
934 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
935 	if (vp == NULL) {
936 		ns->ns_status = NFSERR_STALE;
937 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
938 		    "rfs_write_end:(%S)", "stale");
939 		return;
940 	}
941 
942 	if (rdonly(exi, req)) {
943 		VN_RELE(vp);
944 		ns->ns_status = NFSERR_ROFS;
945 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
946 		    "rfs_write_end:(%S)", "rofs");
947 		return;
948 	}
949 
950 	if (vp->v_type != VREG) {
951 		VN_RELE(vp);
952 		ns->ns_status = NFSERR_ISDIR;
953 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
954 		    "rfs_write_end:(%S)", "isdir");
955 		return;
956 	}
957 
958 	/*
959 	 * Check to see if the v4 side of the server has delegated
960 	 * this file.  If so, then we mark thread as wouldblock so
961 	 * the response is dropped.
962 	 */
963 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
964 		VN_RELE(vp);
965 		curthread->t_flag |= T_WOULDBLOCK;
966 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
967 		    "rfs_write_end:(%S)", "delegated");
968 		return;
969 	}
970 
971 	va.va_mask = AT_UID|AT_MODE;
972 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
973 	error = VOP_GETATTR(vp, &va, 0, cr);
974 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
975 
976 	if (error) {
977 		VN_RELE(vp);
978 		ns->ns_status = puterrno(error);
979 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
980 		    "rfs_write_end:(%S)", "getattr error");
981 		return;
982 	}
983 
984 	if (crgetuid(cr) != va.va_uid) {
985 		/*
986 		 * This is a kludge to allow writes of files created
987 		 * with read only permission.  The owner of the file
988 		 * is always allowed to write it.
989 		 */
990 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
991 		error = VOP_ACCESS(vp, VWRITE, 0, cr);
992 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
993 		if (error) {
994 			VN_RELE(vp);
995 			ns->ns_status = puterrno(error);
996 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
997 			    "rfs_write_end:(%S)", "access error");
998 			return;
999 		}
1000 	}
1001 
1002 	/*
1003 	 * Can't access a mandatory lock file.  This might cause
1004 	 * the NFS service thread to block forever waiting for a
1005 	 * lock to be released that will never be released.
1006 	 */
1007 	if (MANDLOCK(vp, va.va_mode)) {
1008 		VN_RELE(vp);
1009 		ns->ns_status = NFSERR_ACCES;
1010 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1011 		    "rfs_write_end:(%S)", "mand lock");
1012 		return;
1013 	}
1014 
1015 	/*
1016 	 * We have to enter the critical region before calling VOP_RWLOCK
1017 	 * to avoid a deadlock with ufs.
1018 	 */
1019 	if (nbl_need_check(vp)) {
1020 		nbl_start_crit(vp, RW_READER);
1021 		in_crit = 1;
1022 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1023 		    wa->wa_count, 0)) {
1024 			error = EACCES;
1025 			goto out;
1026 		}
1027 	}
1028 
1029 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
1030 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1031 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
1032 
1033 	if (wa->wa_data) {
1034 		iov[0].iov_base = wa->wa_data;
1035 		iov[0].iov_len = wa->wa_count;
1036 		uio.uio_iov = iov;
1037 		uio.uio_iovcnt = 1;
1038 		uio.uio_segflg = UIO_SYSSPACE;
1039 		uio.uio_extflg = UIO_COPY_DEFAULT;
1040 		uio.uio_loffset = (offset_t)wa->wa_offset;
1041 		uio.uio_resid = wa->wa_count;
1042 		/*
1043 		 * The limit is checked on the client. We
1044 		 * should allow any size writes here.
1045 		 */
1046 		uio.uio_llimit = curproc->p_fsz_ctl;
1047 		rlimit = uio.uio_llimit - wa->wa_offset;
1048 		if (rlimit < (rlim64_t)uio.uio_resid)
1049 			uio.uio_resid = (uint_t)rlimit;
1050 
1051 		/*
1052 		 * for now we assume no append mode
1053 		 */
1054 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1055 		    "vop_write_start:(%S)", "sync");
1056 		/*
1057 		 * We're changing creds because VM may fault and we need
1058 		 * the cred of the current thread to be used if quota
1059 		 * checking is enabled.
1060 		 */
1061 		savecred = curthread->t_cred;
1062 		curthread->t_cred = cr;
1063 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1064 		curthread->t_cred = savecred;
1065 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1066 	} else {
1067 		iovcnt = 0;
1068 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1069 			iovcnt++;
1070 		if (iovcnt <= MAX_IOVECS) {
1071 #ifdef DEBUG
1072 			rfs_write_sync_hits++;
1073 #endif
1074 			iovp = iov;
1075 		} else {
1076 #ifdef DEBUG
1077 			rfs_write_sync_misses++;
1078 #endif
1079 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1080 		}
1081 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1082 		uio.uio_iov = iovp;
1083 		uio.uio_iovcnt = iovcnt;
1084 		uio.uio_segflg = UIO_SYSSPACE;
1085 		uio.uio_extflg = UIO_COPY_DEFAULT;
1086 		uio.uio_loffset = (offset_t)wa->wa_offset;
1087 		uio.uio_resid = wa->wa_count;
1088 		/*
1089 		 * The limit is checked on the client. We
1090 		 * should allow any size writes here.
1091 		 */
1092 		uio.uio_llimit = curproc->p_fsz_ctl;
1093 		rlimit = uio.uio_llimit - wa->wa_offset;
1094 		if (rlimit < (rlim64_t)uio.uio_resid)
1095 			uio.uio_resid = (uint_t)rlimit;
1096 
1097 		/*
1098 		 * For now we assume no append mode.
1099 		 */
1100 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1101 		    "vop_write_start:(%S)", "iov sync");
1102 		/*
1103 		 * We're changing creds because VM may fault and we need
1104 		 * the cred of the current thread to be used if quota
1105 		 * checking is enabled.
1106 		 */
1107 		savecred = curthread->t_cred;
1108 		curthread->t_cred = cr;
1109 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1110 		curthread->t_cred = savecred;
1111 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1112 
1113 		if (iovp != iov)
1114 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1115 	}
1116 
1117 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1118 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1119 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1120 
1121 	if (!error) {
1122 		/*
1123 		 * Get attributes again so we send the latest mod
1124 		 * time to the client side for his cache.
1125 		 */
1126 		va.va_mask = AT_ALL;	/* now we want everything */
1127 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1128 		error = VOP_GETATTR(vp, &va, 0, cr);
1129 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1130 		/* check for overflows */
1131 		if (!error) {
1132 			acl_perm(vp, exi, &va, cr);
1133 			error = vattr_to_nattr(&va, &ns->ns_attr);
1134 		}
1135 	}
1136 
1137 out:
1138 	if (in_crit)
1139 		nbl_end_crit(vp);
1140 	VN_RELE(vp);
1141 
1142 	ns->ns_status = puterrno(error);
1143 
1144 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
1145 }
1146 
/*
 * One pending WRITE request taking part in a write cluster.  rfs_write()
 * fills one of these in (on its own stack) for every request it handles.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of this WRITE request */
	struct nfsattrstat *ns;		/* where the status/attrs are returned */
	struct svc_req *req;		/* RPC request, used for rdonly() check */
	cred_t *cr;			/* credentials of the requester */
	kthread_t *thread;		/* thread that submitted the request */
	struct rfs_async_write *list;	/* next request, kept in offset order */
};
1155 
/*
 * A write cluster: the set of pending WRITE requests that share one file
 * handle.  Clusters are chained off rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* members wait here for their status */
	struct rfs_async_write *list;	/* requests in this cluster */
	struct rfs_async_write_list *next;	/* next cluster on the list */
};
1162 
1163 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1164 static kmutex_t rfs_async_write_lock;
1165 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1166 
1167 #define	MAXCLIOVECS	42
1168 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1169 
1170 #ifdef DEBUG
1171 static int rfs_write_hits = 0;
1172 static int rfs_write_misses = 0;
1173 #endif
1174 
1175 /*
1176  * Write data to file.
1177  * Returns attributes of a file after writing some data to it.
1178  */
1179 void
1180 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1181 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1182 {
1183 	int error;
1184 	vnode_t *vp;
1185 	rlim64_t rlimit;
1186 	struct vattr va;
1187 	struct uio uio;
1188 	struct rfs_async_write_list *lp;
1189 	struct rfs_async_write_list *nlp;
1190 	struct rfs_async_write *rp;
1191 	struct rfs_async_write *nrp;
1192 	struct rfs_async_write *trp;
1193 	struct rfs_async_write *lrp;
1194 	int data_written;
1195 	int iovcnt;
1196 	mblk_t *m;
1197 	struct iovec *iovp;
1198 	struct iovec *niovp;
1199 	struct iovec iov[MAXCLIOVECS];
1200 	int count;
1201 	int rcount;
1202 	uint_t off;
1203 	uint_t len;
1204 	struct rfs_async_write nrpsp;
1205 	struct rfs_async_write_list nlpsp;
1206 	ushort_t t_flag;
1207 	cred_t *savecred;
1208 	int in_crit = 0;
1209 
1210 	if (!rfs_write_async) {
1211 		rfs_write_sync(wa, ns, exi, req, cr);
1212 		return;
1213 	}
1214 
1215 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1216 	    "rfs_write_start:(%S)", "async");
1217 
1218 	/*
1219 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1220 	 * is considered an OK.
1221 	 */
1222 	ns->ns_status = RFSWRITE_INITVAL;
1223 
1224 	nrp = &nrpsp;
1225 	nrp->wa = wa;
1226 	nrp->ns = ns;
1227 	nrp->req = req;
1228 	nrp->cr = cr;
1229 	nrp->thread = curthread;
1230 
1231 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1232 
1233 	/*
1234 	 * Look to see if there is already a cluster started
1235 	 * for this file.
1236 	 */
1237 	mutex_enter(&rfs_async_write_lock);
1238 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1239 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1240 		    sizeof (fhandle_t)) == 0)
1241 			break;
1242 	}
1243 
1244 	/*
1245 	 * If lp is non-NULL, then there is already a cluster
1246 	 * started.  We need to place ourselves in the cluster
1247 	 * list in the right place as determined by starting
1248 	 * offset.  Conflicts with non-blocking mandatory locked
1249 	 * regions will be checked when the cluster is processed.
1250 	 */
1251 	if (lp != NULL) {
1252 		rp = lp->list;
1253 		trp = NULL;
1254 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1255 			trp = rp;
1256 			rp = rp->list;
1257 		}
1258 		nrp->list = rp;
1259 		if (trp == NULL)
1260 			lp->list = nrp;
1261 		else
1262 			trp->list = nrp;
1263 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1264 			cv_wait(&lp->cv, &rfs_async_write_lock);
1265 		mutex_exit(&rfs_async_write_lock);
1266 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1267 		    "rfs_write_end:(%S)", "cluster child");
1268 		return;
1269 	}
1270 
1271 	/*
1272 	 * No cluster started yet, start one and add ourselves
1273 	 * to the list of clusters.
1274 	 */
1275 	nrp->list = NULL;
1276 
1277 	nlp = &nlpsp;
1278 	nlp->fhp = &wa->wa_fhandle;
1279 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1280 	nlp->list = nrp;
1281 	nlp->next = NULL;
1282 
1283 	if (rfs_async_write_head == NULL) {
1284 		rfs_async_write_head = nlp;
1285 	} else {
1286 		lp = rfs_async_write_head;
1287 		while (lp->next != NULL)
1288 			lp = lp->next;
1289 		lp->next = nlp;
1290 	}
1291 	mutex_exit(&rfs_async_write_lock);
1292 
1293 	/*
1294 	 * Convert the file handle common to all of the requests
1295 	 * in this cluster to a vnode.
1296 	 */
1297 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1298 	if (vp == NULL) {
1299 		mutex_enter(&rfs_async_write_lock);
1300 		if (rfs_async_write_head == nlp)
1301 			rfs_async_write_head = nlp->next;
1302 		else {
1303 			lp = rfs_async_write_head;
1304 			while (lp->next != nlp)
1305 				lp = lp->next;
1306 			lp->next = nlp->next;
1307 		}
1308 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1309 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1310 			rp->ns->ns_status = NFSERR_STALE;
1311 			rp->thread->t_flag |= t_flag;
1312 		}
1313 		cv_broadcast(&nlp->cv);
1314 		mutex_exit(&rfs_async_write_lock);
1315 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1316 		    "rfs_write_end:(%S)", "stale");
1317 		return;
1318 	}
1319 
1320 	/*
1321 	 * Can only write regular files.  Attempts to write any
1322 	 * other file types fail with EISDIR.
1323 	 */
1324 	if (vp->v_type != VREG) {
1325 		VN_RELE(vp);
1326 		mutex_enter(&rfs_async_write_lock);
1327 		if (rfs_async_write_head == nlp)
1328 			rfs_async_write_head = nlp->next;
1329 		else {
1330 			lp = rfs_async_write_head;
1331 			while (lp->next != nlp)
1332 				lp = lp->next;
1333 			lp->next = nlp->next;
1334 		}
1335 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1336 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1337 			rp->ns->ns_status = NFSERR_ISDIR;
1338 			rp->thread->t_flag |= t_flag;
1339 		}
1340 		cv_broadcast(&nlp->cv);
1341 		mutex_exit(&rfs_async_write_lock);
1342 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1343 		    "rfs_write_end:(%S)", "isdir");
1344 		return;
1345 	}
1346 
1347 	/*
1348 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1349 	 * deadlock with ufs.
1350 	 */
1351 	if (nbl_need_check(vp)) {
1352 		nbl_start_crit(vp, RW_READER);
1353 		in_crit = 1;
1354 	}
1355 
1356 	/*
1357 	 * Lock the file for writing.  This operation provides
1358 	 * the delay which allows clusters to grow.
1359 	 */
1360 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
1361 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1362 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");
1363 
1364 	/*
1365 	 * Disconnect this cluster from the list of clusters.
1366 	 * The cluster that is being dealt with must be fixed
1367 	 * in size after this point, so there is no reason
1368 	 * to leave it on the list so that new requests can
1369 	 * find it.
1370 	 *
1371 	 * The algorithm is that the first write request will
1372 	 * create a cluster, convert the file handle to a
1373 	 * vnode pointer, and then lock the file for writing.
1374 	 * This request is not likely to be clustered with
1375 	 * any others.  However, the next request will create
1376 	 * a new cluster and be blocked in VOP_RWLOCK while
1377 	 * the first request is being processed.  This delay
1378 	 * will allow more requests to be clustered in this
1379 	 * second cluster.
1380 	 */
1381 	mutex_enter(&rfs_async_write_lock);
1382 	if (rfs_async_write_head == nlp)
1383 		rfs_async_write_head = nlp->next;
1384 	else {
1385 		lp = rfs_async_write_head;
1386 		while (lp->next != nlp)
1387 			lp = lp->next;
1388 		lp->next = nlp->next;
1389 	}
1390 	mutex_exit(&rfs_async_write_lock);
1391 
1392 	/*
1393 	 * Step through the list of requests in this cluster.
1394 	 * We need to check permissions to make sure that all
1395 	 * of the requests have sufficient permission to write
1396 	 * the file.  A cluster can be composed of requests
1397 	 * from different clients and different users on each
1398 	 * client.
1399 	 *
1400 	 * As a side effect, we also calculate the size of the
1401 	 * byte range that this cluster encompasses.
1402 	 */
1403 	rp = nlp->list;
1404 	off = rp->wa->wa_offset;
1405 	len = (uint_t)0;
1406 	do {
1407 		if (rdonly(exi, rp->req)) {
1408 			rp->ns->ns_status = NFSERR_ROFS;
1409 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1410 			rp->thread->t_flag |= t_flag;
1411 			continue;
1412 		}
1413 
1414 		va.va_mask = AT_UID|AT_MODE;
1415 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1416 		error = VOP_GETATTR(vp, &va, 0, rp->cr);
1417 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1418 		if (!error) {
1419 			if (crgetuid(rp->cr) != va.va_uid) {
1420 				/*
1421 				 * This is a kludge to allow writes of files
1422 				 * created with read only permission.  The
1423 				 * owner of the file is always allowed to
1424 				 * write it.
1425 				 */
1426 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1427 				    "vop_access_start:");
1428 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
1429 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1430 				    "vop_access_end:");
1431 			}
1432 			if (!error && MANDLOCK(vp, va.va_mode))
1433 				error = EACCES;
1434 		}
1435 
1436 		/*
1437 		 * Check for a conflict with a nbmand-locked region.
1438 		 */
1439 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1440 		    rp->wa->wa_count, 0)) {
1441 			error = EACCES;
1442 		}
1443 
1444 		if (error) {
1445 			rp->ns->ns_status = puterrno(error);
1446 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1447 			rp->thread->t_flag |= t_flag;
1448 			continue;
1449 		}
1450 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1451 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1452 	} while ((rp = rp->list) != NULL);
1453 
1454 	/*
1455 	 * Step through the cluster attempting to gather as many
1456 	 * requests which are contiguous as possible.  These
1457 	 * contiguous requests are handled via one call to VOP_WRITE
1458 	 * instead of different calls to VOP_WRITE.  We also keep
1459 	 * track of the fact that any data was written.
1460 	 */
1461 	rp = nlp->list;
1462 	data_written = 0;
1463 	do {
1464 		/*
1465 		 * Skip any requests which are already marked as having an
1466 		 * error.
1467 		 */
1468 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1469 			rp = rp->list;
1470 			continue;
1471 		}
1472 
1473 		/*
1474 		 * Count the number of iovec's which are required
1475 		 * to handle this set of requests.  One iovec is
1476 		 * needed for each data buffer, whether addressed
1477 		 * by wa_data or by the b_rptr pointers in the
1478 		 * mblk chains.
1479 		 */
1480 		iovcnt = 0;
1481 		lrp = rp;
1482 		for (;;) {
1483 			if (lrp->wa->wa_data)
1484 				iovcnt++;
1485 			else {
1486 				m = lrp->wa->wa_mblk;
1487 				while (m != NULL) {
1488 					iovcnt++;
1489 					m = m->b_cont;
1490 				}
1491 			}
1492 			if (lrp->list == NULL ||
1493 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1494 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1495 			    lrp->list->wa->wa_offset) {
1496 				lrp = lrp->list;
1497 				break;
1498 			}
1499 			lrp = lrp->list;
1500 		}
1501 
1502 		if (iovcnt <= MAXCLIOVECS) {
1503 #ifdef DEBUG
1504 			rfs_write_hits++;
1505 #endif
1506 			niovp = iov;
1507 		} else {
1508 #ifdef DEBUG
1509 			rfs_write_misses++;
1510 #endif
1511 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1512 		}
1513 		/*
1514 		 * Put together the scatter/gather iovecs.
1515 		 */
1516 		iovp = niovp;
1517 		trp = rp;
1518 		count = 0;
1519 		do {
1520 			if (trp->wa->wa_data) {
1521 				iovp->iov_base = trp->wa->wa_data;
1522 				iovp->iov_len = trp->wa->wa_count;
1523 				iovp++;
1524 			} else {
1525 				m = trp->wa->wa_mblk;
1526 				rcount = trp->wa->wa_count;
1527 				while (m != NULL) {
1528 					iovp->iov_base = (caddr_t)m->b_rptr;
1529 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1530 					rcount -= iovp->iov_len;
1531 					if (rcount < 0)
1532 						iovp->iov_len += rcount;
1533 					iovp++;
1534 					if (rcount <= 0)
1535 						break;
1536 					m = m->b_cont;
1537 				}
1538 			}
1539 			count += trp->wa->wa_count;
1540 			trp = trp->list;
1541 		} while (trp != lrp);
1542 
1543 		uio.uio_iov = niovp;
1544 		uio.uio_iovcnt = iovcnt;
1545 		uio.uio_segflg = UIO_SYSSPACE;
1546 		uio.uio_extflg = UIO_COPY_DEFAULT;
1547 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1548 		uio.uio_resid = count;
1549 		/*
1550 		 * The limit is checked on the client. We
1551 		 * should allow any size writes here.
1552 		 */
1553 		uio.uio_llimit = curproc->p_fsz_ctl;
1554 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1555 		if (rlimit < (rlim64_t)uio.uio_resid)
1556 			uio.uio_resid = (uint_t)rlimit;
1557 
1558 		/*
1559 		 * For now we assume no append mode.
1560 		 */
1561 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1562 		    "vop_write_start:(%S)", "async");
1563 
1564 		/*
1565 		 * Check to see if the v4 side of the server has
1566 		 * delegated this file.  If so, then we mark thread
1567 		 * as wouldblock so the response is dropped.
1568 		 */
1569 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1570 			curthread->t_flag |= T_WOULDBLOCK;
1571 			error = EACCES; /* just to have an error */
1572 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1573 			    "rfs_write_end:(%S)", "delegated");
1574 		} else {
1575 			/*
1576 			 * We're changing creds because VM may fault
1577 			 * and we need the cred of the current
1578 			 * thread to be used if quota * checking is
1579 			 * enabled.
1580 			 */
1581 			savecred = curthread->t_cred;
1582 			curthread->t_cred = cr;
1583 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1584 			curthread->t_cred = savecred;
1585 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1586 			    "vop_write_end:");
1587 		}
1588 
1589 		if (niovp != iov)
1590 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1591 
1592 		if (!error) {
1593 			data_written = 1;
1594 			/*
1595 			 * Get attributes again so we send the latest mod
1596 			 * time to the client side for his cache.
1597 			 */
1598 			va.va_mask = AT_ALL;	/* now we want everything */
1599 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1600 			    "vop_getattr_start:");
1601 			error = VOP_GETATTR(vp, &va, 0, rp->cr);
1602 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1603 			    "vop_getattr_end:");
1604 			if (!error)
1605 				acl_perm(vp, exi, &va, rp->cr);
1606 		}
1607 
1608 		/*
1609 		 * Fill in the status responses for each request
1610 		 * which was just handled.  Also, copy the latest
1611 		 * attributes in to the attribute responses if
1612 		 * appropriate.
1613 		 */
1614 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1615 		do {
1616 			rp->thread->t_flag |= t_flag;
1617 			/* check for overflows */
1618 			if (!error) {
1619 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1620 			}
1621 			rp->ns->ns_status = puterrno(error);
1622 			rp = rp->list;
1623 		} while (rp != lrp);
1624 	} while (rp != NULL);
1625 
1626 	/*
1627 	 * If any data was written at all, then we need to flush
1628 	 * the data and metadata to stable storage.
1629 	 */
1630 	if (data_written) {
1631 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
1632 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
1633 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
1634 		if (!error) {
1635 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1636 			    "vop_fsync_start:");
1637 			error = VOP_FSYNC(vp, FNODSYNC, cr);
1638 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
1639 		}
1640 	}
1641 
1642 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1643 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1644 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1645 
1646 	if (in_crit)
1647 		nbl_end_crit(vp);
1648 	VN_RELE(vp);
1649 
1650 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1651 	mutex_enter(&rfs_async_write_lock);
1652 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1653 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1654 			rp->ns->ns_status = puterrno(error);
1655 			rp->thread->t_flag |= t_flag;
1656 		}
1657 	}
1658 	cv_broadcast(&nlp->cv);
1659 	mutex_exit(&rfs_async_write_lock);
1660 
1661 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
1662 }
1663 
1664 void *
1665 rfs_write_getfh(struct nfswriteargs *wa)
1666 {
1667 	return (&wa->wa_fhandle);
1668 }
1669 
1670 /*
1671  * Create a file.
1672  * Creates a file with given attributes and returns those attributes
1673  * and an fhandle for the new file.
1674  */
1675 void
1676 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1677 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1678 {
1679 	int error;
1680 	int lookuperr;
1681 	int in_crit = 0;
1682 	struct vattr va;
1683 	vnode_t *vp;
1684 	vnode_t *dvp;
1685 	char *name = args->ca_da.da_name;
1686 	vnode_t *tvp = NULL;
1687 	int mode;
1688 	int lookup_ok;
1689 	bool_t trunc;
1690 
1691 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");
1692 
1693 	/*
1694 	 * Disallow NULL paths
1695 	 */
1696 	if (name == NULL || *name == '\0') {
1697 		dr->dr_status = NFSERR_ACCES;
1698 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1699 		    "rfs_create_end:(%S)", "access");
1700 		return;
1701 	}
1702 
1703 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1704 	if (dvp == NULL) {
1705 		dr->dr_status = NFSERR_STALE;
1706 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1707 		    "rfs_create_end:(%S)", "stale");
1708 		return;
1709 	}
1710 
1711 	error = sattr_to_vattr(args->ca_sa, &va);
1712 	if (error) {
1713 		dr->dr_status = puterrno(error);
1714 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1715 		    "rfs_create_end:(%S)", "sattr");
1716 		return;
1717 	}
1718 
1719 	/*
1720 	 * Must specify the mode.
1721 	 */
1722 	if (!(va.va_mask & AT_MODE)) {
1723 		VN_RELE(dvp);
1724 		dr->dr_status = NFSERR_INVAL;
1725 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1726 		    "rfs_create_end:(%S)", "no mode");
1727 		return;
1728 	}
1729 
1730 	/*
1731 	 * This is a completely gross hack to make mknod
1732 	 * work over the wire until we can wack the protocol
1733 	 */
1734 	if ((va.va_mode & IFMT) == IFCHR) {
1735 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1736 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1737 		else {
1738 			va.va_type = VCHR;
1739 			/*
1740 			 * uncompress the received dev_t
1741 			 * if the top half is zero indicating a request
1742 			 * from an `older style' OS.
1743 			 */
1744 			if ((va.va_size & 0xffff0000) == 0)
1745 				va.va_rdev = nfsv2_expdev(va.va_size);
1746 			else
1747 				va.va_rdev = (dev_t)va.va_size;
1748 		}
1749 		va.va_mask &= ~AT_SIZE;
1750 	} else if ((va.va_mode & IFMT) == IFBLK) {
1751 		va.va_type = VBLK;
1752 		/*
1753 		 * uncompress the received dev_t
1754 		 * if the top half is zero indicating a request
1755 		 * from an `older style' OS.
1756 		 */
1757 		if ((va.va_size & 0xffff0000) == 0)
1758 			va.va_rdev = nfsv2_expdev(va.va_size);
1759 		else
1760 			va.va_rdev = (dev_t)va.va_size;
1761 		va.va_mask &= ~AT_SIZE;
1762 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1763 		va.va_type = VSOCK;
1764 	} else
1765 		va.va_type = VREG;
1766 	va.va_mode &= ~IFMT;
1767 	va.va_mask |= AT_TYPE;
1768 
1769 	/*
1770 	 * Why was the choice made to use VWRITE as the mode to the
1771 	 * call to VOP_CREATE ? This results in a bug.  When a client
1772 	 * opens a file that already exists and is RDONLY, the second
1773 	 * open fails with an EACESS because of the mode.
1774 	 * bug ID 1054648.
1775 	 */
1776 	lookup_ok = 0;
1777 	mode = VWRITE;
1778 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1779 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
1780 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1781 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
1782 		if (!error) {
1783 			struct vattr at;
1784 
1785 			lookup_ok = 1;
1786 			at.va_mask = AT_MODE;
1787 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1788 			    "vop_getattr_start:");
1789 			error = VOP_GETATTR(tvp, &at, 0, cr);
1790 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1791 			    "vop_getattr_end:");
1792 			if (!error)
1793 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1794 			VN_RELE(tvp);
1795 			tvp = NULL;
1796 		}
1797 	}
1798 
1799 	if (!lookup_ok) {
1800 		if (rdonly(exi, req)) {
1801 			error = EROFS;
1802 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1803 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1804 			error = EPERM;
1805 		} else {
1806 			error = 0;
1807 		}
1808 	}
1809 
1810 	/*
1811 	 * If file size is being modified on an already existing file
1812 	 * make sure that there are no conflicting non-blocking mandatory
1813 	 * locks in the region being manipulated. Return EACCES if there
1814 	 * are conflicting locks.
1815 	 */
1816 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1817 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1818 
1819 		if (!lookuperr &&
1820 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1821 			VN_RELE(tvp);
1822 			curthread->t_flag |= T_WOULDBLOCK;
1823 			goto out;
1824 		}
1825 
1826 		if (!lookuperr && nbl_need_check(tvp)) {
1827 			/*
1828 			 * The file exists. Now check if it has any
1829 			 * conflicting non-blocking mandatory locks
1830 			 * in the region being changed.
1831 			 */
1832 			struct vattr bva;
1833 			u_offset_t offset;
1834 			ssize_t length;
1835 
1836 			nbl_start_crit(tvp, RW_READER);
1837 			in_crit = 1;
1838 
1839 			bva.va_mask = AT_SIZE;
1840 			error = VOP_GETATTR(tvp, &bva, 0, cr);
1841 			if (!error) {
1842 				if (va.va_size < bva.va_size) {
1843 					offset = va.va_size;
1844 					length = bva.va_size - va.va_size;
1845 				} else {
1846 					offset = bva.va_size;
1847 					length = va.va_size - bva.va_size;
1848 				}
1849 				if (length) {
1850 					if (nbl_conflict(tvp, NBL_WRITE,
1851 					    offset, length, 0)) {
1852 						error = EACCES;
1853 					}
1854 				}
1855 			}
1856 			if (error) {
1857 				nbl_end_crit(tvp);
1858 				VN_RELE(tvp);
1859 				in_crit = 0;
1860 			}
1861 		} else if (tvp != NULL) {
1862 			VN_RELE(tvp);
1863 		}
1864 	}
1865 
1866 	if (!error) {
1867 		/*
1868 		 * If filesystem is shared with nosuid the remove any
1869 		 * setuid/setgid bits on create.
1870 		 */
1871 		if (va.va_type == VREG &&
1872 		    exi->exi_export.ex_flags & EX_NOSUID)
1873 			va.va_mode &= ~(VSUID | VSGID);
1874 
1875 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
1876 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
1877 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");
1878 
1879 		if (!error) {
1880 
1881 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1882 				trunc = TRUE;
1883 			else
1884 				trunc = FALSE;
1885 
1886 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1887 				VN_RELE(vp);
1888 				curthread->t_flag |= T_WOULDBLOCK;
1889 				goto out;
1890 			}
1891 			va.va_mask = AT_ALL;
1892 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1893 			    "vop_getattr_start:");
1894 			error = VOP_GETATTR(vp, &va, 0, cr);
1895 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1896 			    "vop_getattr_end:");
1897 			/* check for overflows */
1898 			if (!error) {
1899 				acl_perm(vp, exi, &va, cr);
1900 				error = vattr_to_nattr(&va, &dr->dr_attr);
1901 				if (!error) {
1902 					error = makefh(&dr->dr_fhandle, vp,
1903 					    exi);
1904 				}
1905 			}
1906 			/*
1907 			 * Force modified metadata out to stable storage.
1908 			 */
1909 			(void) VOP_FSYNC(vp, FNODSYNC, cr);
1910 			VN_RELE(vp);
1911 		}
1912 
1913 		if (in_crit) {
1914 			nbl_end_crit(tvp);
1915 			VN_RELE(tvp);
1916 		}
1917 	}
1918 
1919 	/*
1920 	 * Force modified data and metadata out to stable storage.
1921 	 */
1922 	(void) VOP_FSYNC(dvp, 0, cr);
1923 
1924 out:
1925 
1926 	VN_RELE(dvp);
1927 
1928 	dr->dr_status = puterrno(error);
1929 
1930 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
1931 }
1932 void *
1933 rfs_create_getfh(struct nfscreatargs *args)
1934 {
1935 	return (args->ca_da.da_fhandle);
1936 }
1937 
1938 /*
1939  * Remove a file.
1940  * Remove named file from parent directory.
1941  */
1942 void
1943 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1944 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1945 {
1946 	int error = 0;
1947 	vnode_t *vp;
1948 	vnode_t *targvp;
1949 	int in_crit = 0;
1950 
1951 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");
1952 
1953 	/*
1954 	 * Disallow NULL paths
1955 	 */
1956 	if (da->da_name == NULL || *da->da_name == '\0') {
1957 		*status = NFSERR_ACCES;
1958 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
1959 		    "rfs_remove_end:(%S)", "access");
1960 		return;
1961 	}
1962 
1963 	vp = nfs_fhtovp(da->da_fhandle, exi);
1964 	if (vp == NULL) {
1965 		*status = NFSERR_STALE;
1966 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
1967 		    "rfs_remove_end:(%S)", "stale");
1968 		return;
1969 	}
1970 
1971 	if (rdonly(exi, req)) {
1972 		VN_RELE(vp);
1973 		*status = NFSERR_ROFS;
1974 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
1975 		    "rfs_remove_end:(%S)", "rofs");
1976 		return;
1977 	}
1978 
1979 	/*
1980 	 * Check for a conflict with a non-blocking mandatory share reservation.
1981 	 */
1982 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0, NULL, cr);
1983 	if (error != 0) {
1984 		VN_RELE(vp);
1985 		*status = puterrno(error);
1986 		return;
1987 	}
1988 
1989 	/*
1990 	 * If the file is delegated to an v4 client, then initiate
1991 	 * recall and drop this request (by setting T_WOULDBLOCK).
1992 	 * The client will eventually re-transmit the request and
1993 	 * (hopefully), by then, the v4 client will have returned
1994 	 * the delegation.
1995 	 */
1996 
1997 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1998 		VN_RELE(vp);
1999 		VN_RELE(targvp);
2000 		curthread->t_flag |= T_WOULDBLOCK;
2001 		return;
2002 	}
2003 
2004 	if (nbl_need_check(targvp)) {
2005 		nbl_start_crit(targvp, RW_READER);
2006 		in_crit = 1;
2007 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
2008 			error = EACCES;
2009 			goto out;
2010 		}
2011 	}
2012 
2013 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
2014 	error = VOP_REMOVE(vp, da->da_name, cr);
2015 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");
2016 
2017 	/*
2018 	 * Force modified data and metadata out to stable storage.
2019 	 */
2020 	(void) VOP_FSYNC(vp, 0, cr);
2021 
2022 out:
2023 	if (in_crit)
2024 		nbl_end_crit(targvp);
2025 	VN_RELE(targvp);
2026 	VN_RELE(vp);
2027 
2028 	*status = puterrno(error);
2029 
2030 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
2031 }
2032 
2033 void *
2034 rfs_remove_getfh(struct nfsdiropargs *da)
2035 {
2036 	return (da->da_fhandle);
2037 }
2038 
2039 /*
2040  * rename a file
2041  * Give a file (from) a new name (to).
2042  */
2043 void
2044 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2045 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2046 {
2047 	int error = 0;
2048 	vnode_t *fromvp;
2049 	vnode_t *tovp;
2050 	struct exportinfo *to_exi;
2051 	fhandle_t *fh;
2052 	vnode_t *srcvp;
2053 	vnode_t *targvp;
2054 	int in_crit = 0;
2055 
2056 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
2057 
2058 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2059 	if (fromvp == NULL) {
2060 		*status = NFSERR_STALE;
2061 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2062 		    "rfs_rename_end:(%S)", "from stale");
2063 		return;
2064 	}
2065 
2066 	fh = args->rna_to.da_fhandle;
2067 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2068 	if (to_exi == NULL) {
2069 		VN_RELE(fromvp);
2070 		*status = NFSERR_ACCES;
2071 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2072 		    "rfs_rename_end:(%S)", "cross device");
2073 		return;
2074 	}
2075 	exi_rele(to_exi);
2076 
2077 	if (to_exi != exi) {
2078 		VN_RELE(fromvp);
2079 		*status = NFSERR_XDEV;
2080 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2081 		    "rfs_rename_end:(%S)", "from stale");
2082 		return;
2083 	}
2084 
2085 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2086 	if (tovp == NULL) {
2087 		VN_RELE(fromvp);
2088 		*status = NFSERR_STALE;
2089 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2090 		    "rfs_rename_end:(%S)", "to stale");
2091 		return;
2092 	}
2093 
2094 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2095 		VN_RELE(tovp);
2096 		VN_RELE(fromvp);
2097 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2098 		    "rfs_rename_end:(%S)", "not dir");
2099 		*status = NFSERR_NOTDIR;
2100 		return;
2101 	}
2102 
2103 	/*
2104 	 * Disallow NULL paths
2105 	 */
2106 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2107 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2108 		VN_RELE(tovp);
2109 		VN_RELE(fromvp);
2110 		*status = NFSERR_ACCES;
2111 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2112 		    "rfs_rename_end:(%S)", "access");
2113 		return;
2114 	}
2115 
2116 	if (rdonly(exi, req)) {
2117 		VN_RELE(tovp);
2118 		VN_RELE(fromvp);
2119 		*status = NFSERR_ROFS;
2120 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2121 		    "rfs_rename_end:(%S)", "rofs");
2122 		return;
2123 	}
2124 
2125 	/*
2126 	 * Check for a conflict with a non-blocking mandatory share reservation.
2127 	 */
2128 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2129 	    NULL, cr);
2130 	if (error != 0) {
2131 		VN_RELE(tovp);
2132 		VN_RELE(fromvp);
2133 		*status = puterrno(error);
2134 		return;
2135 	}
2136 
2137 	/* Check for delegations on the source file */
2138 
2139 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2140 		VN_RELE(tovp);
2141 		VN_RELE(fromvp);
2142 		VN_RELE(srcvp);
2143 		curthread->t_flag |= T_WOULDBLOCK;
2144 		return;
2145 	}
2146 
2147 	/* Check for delegation on the file being renamed over, if it exists */
2148 
2149 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2150 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
2151 	    == 0) {
2152 
2153 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2154 			VN_RELE(tovp);
2155 			VN_RELE(fromvp);
2156 			VN_RELE(srcvp);
2157 			VN_RELE(targvp);
2158 			curthread->t_flag |= T_WOULDBLOCK;
2159 			return;
2160 		}
2161 		VN_RELE(targvp);
2162 	}
2163 
2164 
2165 	if (nbl_need_check(srcvp)) {
2166 		nbl_start_crit(srcvp, RW_READER);
2167 		in_crit = 1;
2168 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
2169 			error = EACCES;
2170 			goto out;
2171 		}
2172 	}
2173 
2174 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
2175 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2176 	    tovp, args->rna_to.da_name, cr);
2177 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");
2178 
2179 	if (error == 0) {
2180 		char *tmp;
2181 
2182 		/* fix the path name for the renamed file */
2183 		mutex_enter(&srcvp->v_lock);
2184 		tmp = srcvp->v_path;
2185 		srcvp->v_path = NULL;
2186 		mutex_exit(&srcvp->v_lock);
2187 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2188 		    strlen(args->rna_to.da_name));
2189 		if (tmp != NULL)
2190 			kmem_free(tmp, strlen(tmp) + 1);
2191 	}
2192 
2193 	/*
2194 	 * Force modified data and metadata out to stable storage.
2195 	 */
2196 	(void) VOP_FSYNC(tovp, 0, cr);
2197 	(void) VOP_FSYNC(fromvp, 0, cr);
2198 
2199 out:
2200 	if (in_crit)
2201 		nbl_end_crit(srcvp);
2202 	VN_RELE(srcvp);
2203 	VN_RELE(tovp);
2204 	VN_RELE(fromvp);
2205 
2206 	*status = puterrno(error);
2207 
2208 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
2209 }
2210 void *
2211 rfs_rename_getfh(struct nfsrnmargs *args)
2212 {
2213 	return (args->rna_from.da_fhandle);
2214 }
2215 
2216 /*
2217  * Link to a file.
2218  * Create a file (to) which is a hard link to the given file (from).
2219  */
2220 void
2221 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2222 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2223 {
2224 	int error;
2225 	vnode_t *fromvp;
2226 	vnode_t *tovp;
2227 	struct exportinfo *to_exi;
2228 	fhandle_t *fh;
2229 
2230 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");
2231 
2232 	fromvp = nfs_fhtovp(args->la_from, exi);
2233 	if (fromvp == NULL) {
2234 		*status = NFSERR_STALE;
2235 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2236 		    "rfs_link_end:(%S)", "from stale");
2237 		return;
2238 	}
2239 
2240 	fh = args->la_to.da_fhandle;
2241 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2242 	if (to_exi == NULL) {
2243 		VN_RELE(fromvp);
2244 		*status = NFSERR_ACCES;
2245 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2246 		    "rfs_link_end:(%S)", "cross device");
2247 		return;
2248 	}
2249 	exi_rele(to_exi);
2250 
2251 	if (to_exi != exi) {
2252 		VN_RELE(fromvp);
2253 		*status = NFSERR_XDEV;
2254 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2255 		    "rfs_link_end:(%S)", "cross device");
2256 		return;
2257 	}
2258 
2259 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2260 	if (tovp == NULL) {
2261 		VN_RELE(fromvp);
2262 		*status = NFSERR_STALE;
2263 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2264 		    "rfs_link_end:(%S)", "to stale");
2265 		return;
2266 	}
2267 
2268 	if (tovp->v_type != VDIR) {
2269 		VN_RELE(tovp);
2270 		VN_RELE(fromvp);
2271 		*status = NFSERR_NOTDIR;
2272 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2273 		    "rfs_link_end:(%S)", "not dir");
2274 		return;
2275 	}
2276 	/*
2277 	 * Disallow NULL paths
2278 	 */
2279 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2280 		VN_RELE(tovp);
2281 		VN_RELE(fromvp);
2282 		*status = NFSERR_ACCES;
2283 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2284 		    "rfs_link_end:(%S)", "access");
2285 		return;
2286 	}
2287 
2288 	if (rdonly(exi, req)) {
2289 		VN_RELE(tovp);
2290 		VN_RELE(fromvp);
2291 		*status = NFSERR_ROFS;
2292 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2293 		    "rfs_link_end:(%S)", "rofs");
2294 		return;
2295 	}
2296 
2297 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
2298 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2299 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");
2300 
2301 	/*
2302 	 * Force modified data and metadata out to stable storage.
2303 	 */
2304 	(void) VOP_FSYNC(tovp, 0, cr);
2305 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2306 
2307 	VN_RELE(tovp);
2308 	VN_RELE(fromvp);
2309 
2310 	*status = puterrno(error);
2311 
2312 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
2313 }
2314 void *
2315 rfs_link_getfh(struct nfslinkargs *args)
2316 {
2317 	return (args->la_from);
2318 }
2319 
2320 /*
2321  * Symbolicly link to a file.
2322  * Create a file (to) with the given attributes which is a symbolic link
2323  * to the given path name (to).
2324  */
2325 void
2326 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2327 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2328 {
2329 	int error;
2330 	struct vattr va;
2331 	vnode_t *vp;
2332 	vnode_t *svp;
2333 	int lerror;
2334 
2335 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");
2336 
2337 	/*
2338 	 * Disallow NULL paths
2339 	 */
2340 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2341 		*status = NFSERR_ACCES;
2342 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2343 		    "rfs_symlink_end:(%S)", "access");
2344 		return;
2345 	}
2346 
2347 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2348 	if (vp == NULL) {
2349 		*status = NFSERR_STALE;
2350 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2351 		    "rfs_symlink_end:(%S)", "stale");
2352 		return;
2353 	}
2354 
2355 	if (rdonly(exi, req)) {
2356 		VN_RELE(vp);
2357 		*status = NFSERR_ROFS;
2358 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2359 		    "rfs_symlink_end:(%S)", "rofs");
2360 		return;
2361 	}
2362 
2363 	error = sattr_to_vattr(args->sla_sa, &va);
2364 	if (error) {
2365 		VN_RELE(vp);
2366 		*status = puterrno(error);
2367 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2368 		    "rfs_symlink_end:(%S)", "sattr");
2369 		return;
2370 	}
2371 
2372 	if (!(va.va_mask & AT_MODE)) {
2373 		VN_RELE(vp);
2374 		*status = NFSERR_INVAL;
2375 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2376 		    "rfs_symlink_end:(%S)", "no mode");
2377 		return;
2378 	}
2379 
2380 	va.va_type = VLNK;
2381 	va.va_mask |= AT_TYPE;
2382 
2383 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
2384 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
2385 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");
2386 
2387 	/*
2388 	 * Force new data and metadata out to stable storage.
2389 	 */
2390 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
2391 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2392 	    0, NULL, cr);
2393 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
2394 	if (!lerror) {
2395 		(void) VOP_FSYNC(svp, 0, cr);
2396 		VN_RELE(svp);
2397 	}
2398 
2399 	/*
2400 	 * Force modified data and metadata out to stable storage.
2401 	 */
2402 	(void) VOP_FSYNC(vp, 0, cr);
2403 
2404 	VN_RELE(vp);
2405 
2406 	*status = puterrno(error);
2407 
2408 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
2409 }
2410 void *
2411 rfs_symlink_getfh(struct nfsslargs *args)
2412 {
2413 	return (args->sla_from.da_fhandle);
2414 }
2415 
2416 /*
2417  * Make a directory.
2418  * Create a directory with the given name, parent directory, and attributes.
2419  * Returns a file handle and attributes for the new directory.
2420  */
2421 void
2422 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2423 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2424 {
2425 	int error;
2426 	struct vattr va;
2427 	vnode_t *dvp = NULL;
2428 	vnode_t *vp;
2429 	char *name = args->ca_da.da_name;
2430 
2431 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");
2432 
2433 	/*
2434 	 * Disallow NULL paths
2435 	 */
2436 	if (name == NULL || *name == '\0') {
2437 		dr->dr_status = NFSERR_ACCES;
2438 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2439 		    "rfs_mkdir_end:(%S)", "access");
2440 		return;
2441 	}
2442 
2443 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2444 	if (vp == NULL) {
2445 		dr->dr_status = NFSERR_STALE;
2446 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2447 		    "rfs_mkdir_end:(%S)", "stale");
2448 		return;
2449 	}
2450 
2451 	if (rdonly(exi, req)) {
2452 		VN_RELE(vp);
2453 		dr->dr_status = NFSERR_ROFS;
2454 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2455 		    "rfs_mkdir_end:(%S)", "rofs");
2456 		return;
2457 	}
2458 
2459 	error = sattr_to_vattr(args->ca_sa, &va);
2460 	if (error) {
2461 		VN_RELE(vp);
2462 		dr->dr_status = puterrno(error);
2463 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2464 		    "rfs_mkdir_end:(%S)", "sattr");
2465 		return;
2466 	}
2467 
2468 	if (!(va.va_mask & AT_MODE)) {
2469 		VN_RELE(vp);
2470 		dr->dr_status = NFSERR_INVAL;
2471 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2472 		    "rfs_mkdir_end:(%S)", "no mode");
2473 		return;
2474 	}
2475 
2476 	va.va_type = VDIR;
2477 	va.va_mask |= AT_TYPE;
2478 
2479 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
2480 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2481 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");
2482 
2483 	if (!error) {
2484 		/*
2485 		 * Attribtutes of the newly created directory should
2486 		 * be returned to the client.
2487 		 */
2488 		va.va_mask = AT_ALL; /* We want everything */
2489 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
2490 		error = VOP_GETATTR(dvp, &va, 0, cr);
2491 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
2492 		/* check for overflows */
2493 		if (!error) {
2494 			acl_perm(vp, exi, &va, cr);
2495 			error = vattr_to_nattr(&va, &dr->dr_attr);
2496 			if (!error) {
2497 				error = makefh(&dr->dr_fhandle, dvp, exi);
2498 			}
2499 		}
2500 		/*
2501 		 * Force new data and metadata out to stable storage.
2502 		 */
2503 		(void) VOP_FSYNC(dvp, 0, cr);
2504 		VN_RELE(dvp);
2505 	}
2506 
2507 	/*
2508 	 * Force modified data and metadata out to stable storage.
2509 	 */
2510 	(void) VOP_FSYNC(vp, 0, cr);
2511 
2512 	VN_RELE(vp);
2513 
2514 	dr->dr_status = puterrno(error);
2515 
2516 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
2517 }
2518 void *
2519 rfs_mkdir_getfh(struct nfscreatargs *args)
2520 {
2521 	return (args->ca_da.da_fhandle);
2522 }
2523 
2524 /*
2525  * Remove a directory.
2526  * Remove the given directory name from the given parent directory.
2527  */
2528 void
2529 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2530 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2531 {
2532 	int error;
2533 	vnode_t *vp;
2534 
2535 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");
2536 
2537 	/*
2538 	 * Disallow NULL paths
2539 	 */
2540 	if (da->da_name == NULL || *da->da_name == '\0') {
2541 		*status = NFSERR_ACCES;
2542 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2543 		    "rfs_rmdir_end:(%S)", "access");
2544 		return;
2545 	}
2546 
2547 	vp = nfs_fhtovp(da->da_fhandle, exi);
2548 	if (vp == NULL) {
2549 		*status = NFSERR_STALE;
2550 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2551 		    "rfs_rmdir_end:(%S)", "stale");
2552 		return;
2553 	}
2554 
2555 	if (rdonly(exi, req)) {
2556 		VN_RELE(vp);
2557 		*status = NFSERR_ROFS;
2558 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2559 		    "rfs_rmdir_end:(%S)", "rofs");
2560 		return;
2561 	}
2562 
2563 	/*
2564 	 * VOP_RMDIR now takes a new third argument (the current
2565 	 * directory of the process).  That's because someone
2566 	 * wants to return EINVAL if one tries to remove ".".
2567 	 * Of course, NFS servers have no idea what their
2568 	 * clients' current directories are.  We fake it by
2569 	 * supplying a vnode known to exist and illegal to
2570 	 * remove.
2571 	 */
2572 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
2573 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
2574 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");
2575 
2576 	/*
2577 	 * Force modified data and metadata out to stable storage.
2578 	 */
2579 	(void) VOP_FSYNC(vp, 0, cr);
2580 
2581 	VN_RELE(vp);
2582 
2583 	/*
2584 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2585 	 * if the directory is not empty.  A System V NFS server
2586 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2587 	 * over the wire.
2588 	 */
2589 	if (error == EEXIST)
2590 		*status = NFSERR_NOTEMPTY;
2591 	else
2592 		*status = puterrno(error);
2593 
2594 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
2595 }
2596 void *
2597 rfs_rmdir_getfh(struct nfsdiropargs *da)
2598 {
2599 	return (da->da_fhandle);
2600 }
2601 
2602 /* ARGSUSED */
2603 void
2604 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2605 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2606 {
2607 	int error;
2608 	int iseof;
2609 	struct iovec iov;
2610 	struct uio uio;
2611 	vnode_t *vp;
2612 
2613 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");
2614 
2615 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2616 	if (vp == NULL) {
2617 		rd->rd_entries = NULL;
2618 		rd->rd_status = NFSERR_STALE;
2619 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2620 		    "rfs_readdir_end:(%S)", "stale");
2621 		return;
2622 	}
2623 
2624 	if (vp->v_type != VDIR) {
2625 		VN_RELE(vp);
2626 		rd->rd_entries = NULL;
2627 		rd->rd_status = NFSERR_NOTDIR;
2628 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2629 		    "rfs_readdir_end:(%S)", "notdir");
2630 		return;
2631 	}
2632 
2633 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
2634 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2635 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
2636 
2637 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
2638 	error = VOP_ACCESS(vp, VREAD, 0, cr);
2639 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
2640 	if (error) {
2641 		rd->rd_entries = NULL;
2642 		goto bad;
2643 	}
2644 
2645 	if (rda->rda_count == 0) {
2646 		rd->rd_entries = NULL;
2647 		rd->rd_size = 0;
2648 		rd->rd_eof = FALSE;
2649 		goto bad;
2650 	}
2651 
2652 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2653 
2654 	/*
2655 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2656 	 */
2657 	rd->rd_bufsize = (uint_t)rda->rda_count;
2658 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2659 
2660 	/*
2661 	 * Set up io vector to read directory data
2662 	 */
2663 	iov.iov_base = (caddr_t)rd->rd_entries;
2664 	iov.iov_len = rda->rda_count;
2665 	uio.uio_iov = &iov;
2666 	uio.uio_iovcnt = 1;
2667 	uio.uio_segflg = UIO_SYSSPACE;
2668 	uio.uio_extflg = UIO_COPY_CACHED;
2669 	uio.uio_loffset = (offset_t)rda->rda_offset;
2670 	uio.uio_resid = rda->rda_count;
2671 
2672 	/*
2673 	 * read directory
2674 	 */
2675 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
2676 	error = VOP_READDIR(vp, &uio, cr, &iseof);
2677 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");
2678 
2679 	/*
2680 	 * Clean up
2681 	 */
2682 	if (!error) {
2683 		/*
2684 		 * set size and eof
2685 		 */
2686 		if (uio.uio_resid == rda->rda_count) {
2687 			rd->rd_size = 0;
2688 			rd->rd_eof = TRUE;
2689 		} else {
2690 			rd->rd_size = (uint32_t)(rda->rda_count -
2691 			    uio.uio_resid);
2692 			rd->rd_eof = iseof ? TRUE : FALSE;
2693 		}
2694 	}
2695 
2696 bad:
2697 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
2698 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2699 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
2700 
2701 #if 0 /* notyet */
2702 	/*
2703 	 * Don't do this.  It causes local disk writes when just
2704 	 * reading the file and the overhead is deemed larger
2705 	 * than the benefit.
2706 	 */
2707 	/*
2708 	 * Force modified metadata out to stable storage.
2709 	 */
2710 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
2711 #endif
2712 
2713 	VN_RELE(vp);
2714 
2715 	rd->rd_status = puterrno(error);
2716 
2717 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
2718 }
2719 void *
2720 rfs_readdir_getfh(struct nfsrddirargs *rda)
2721 {
2722 	return (&rda->rda_fh);
2723 }
2724 void
2725 rfs_rddirfree(struct nfsrddirres *rd)
2726 {
2727 	if (rd->rd_entries != NULL)
2728 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2729 }
2730 
2731 /* ARGSUSED */
2732 void
2733 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2734 	struct svc_req *req, cred_t *cr)
2735 {
2736 	int error;
2737 	struct statvfs64 sb;
2738 	vnode_t *vp;
2739 
2740 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");
2741 
2742 	vp = nfs_fhtovp(fh, exi);
2743 	if (vp == NULL) {
2744 		fs->fs_status = NFSERR_STALE;
2745 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2746 		    "rfs_statfs_end:(%S)", "stale");
2747 		return;
2748 	}
2749 
2750 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2751 
2752 	if (!error) {
2753 		fs->fs_tsize = nfstsize();
2754 		fs->fs_bsize = sb.f_frsize;
2755 		fs->fs_blocks = sb.f_blocks;
2756 		fs->fs_bfree = sb.f_bfree;
2757 		fs->fs_bavail = sb.f_bavail;
2758 	}
2759 
2760 	VN_RELE(vp);
2761 
2762 	fs->fs_status = puterrno(error);
2763 
2764 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
2765 }
2766 void *
2767 rfs_statfs_getfh(fhandle_t *fh)
2768 {
2769 	return (fh);
2770 }
2771 
2772 static int
2773 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2774 {
2775 	vap->va_mask = 0;
2776 
2777 	/*
2778 	 * There was a sign extension bug in some VFS based systems
2779 	 * which stored the mode as a short.  When it would get
2780 	 * assigned to a u_long, no sign extension would occur.
2781 	 * It needed to, but this wasn't noticed because sa_mode
2782 	 * would then get assigned back to the short, thus ignoring
2783 	 * the upper 16 bits of sa_mode.
2784 	 *
2785 	 * To make this implementation work for both broken
2786 	 * clients and good clients, we check for both versions
2787 	 * of the mode.
2788 	 */
2789 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2790 	    sa->sa_mode != (uint32_t)-1) {
2791 		vap->va_mask |= AT_MODE;
2792 		vap->va_mode = sa->sa_mode;
2793 	}
2794 	if (sa->sa_uid != (uint32_t)-1) {
2795 		vap->va_mask |= AT_UID;
2796 		vap->va_uid = sa->sa_uid;
2797 	}
2798 	if (sa->sa_gid != (uint32_t)-1) {
2799 		vap->va_mask |= AT_GID;
2800 		vap->va_gid = sa->sa_gid;
2801 	}
2802 	if (sa->sa_size != (uint32_t)-1) {
2803 		vap->va_mask |= AT_SIZE;
2804 		vap->va_size = sa->sa_size;
2805 	}
2806 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2807 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2808 #ifndef _LP64
2809 		/* return error if time overflow */
2810 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2811 			return (EOVERFLOW);
2812 #endif
2813 		vap->va_mask |= AT_ATIME;
2814 		/*
2815 		 * nfs protocol defines times as unsigned so don't extend sign,
2816 		 * unless sysadmin set nfs_allow_preepoch_time.
2817 		 */
2818 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2819 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2820 	}
2821 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2822 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2823 #ifndef _LP64
2824 		/* return error if time overflow */
2825 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2826 			return (EOVERFLOW);
2827 #endif
2828 		vap->va_mask |= AT_MTIME;
2829 		/*
2830 		 * nfs protocol defines times as unsigned so don't extend sign,
2831 		 * unless sysadmin set nfs_allow_preepoch_time.
2832 		 */
2833 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2834 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2835 	}
2836 	return (0);
2837 }
2838 
2839 static enum nfsftype vt_to_nf[] = {
2840 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2841 };
2842 
2843 /*
2844  * check the following fields for overflow: nodeid, size, and time.
2845  * There could be a problem when converting 64-bit LP64 fields
2846  * into 32-bit ones.  Return an error if there is an overflow.
2847  */
2848 int
2849 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2850 {
2851 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2852 	na->na_type = vt_to_nf[vap->va_type];
2853 
2854 	if (vap->va_mode == (unsigned short) -1)
2855 		na->na_mode = (uint32_t)-1;
2856 	else
2857 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2858 
2859 	if (vap->va_uid == (unsigned short)(-1))
2860 		na->na_uid = (uint32_t)(-1);
2861 	else if (vap->va_uid == UID_NOBODY)
2862 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2863 	else
2864 		na->na_uid = vap->va_uid;
2865 
2866 	if (vap->va_gid == (unsigned short)(-1))
2867 		na->na_gid = (uint32_t)-1;
2868 	else if (vap->va_gid == GID_NOBODY)
2869 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2870 	else
2871 		na->na_gid = vap->va_gid;
2872 
2873 	/*
2874 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2875 	 * vattr, but are bigger than 32 bit values supported?
2876 	 */
2877 	na->na_fsid = vap->va_fsid;
2878 
2879 	na->na_nodeid = vap->va_nodeid;
2880 
2881 	/*
2882 	 * Check to make sure that the nodeid is representable over the
2883 	 * wire without losing bits.
2884 	 */
2885 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2886 		return (EFBIG);
2887 	na->na_nlink = vap->va_nlink;
2888 
2889 	/*
2890 	 * Check for big files here, instead of at the caller.  See
2891 	 * comments in cstat for large special file explanation.
2892 	 */
2893 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2894 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2895 			return (EFBIG);
2896 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2897 			/* UNKNOWN_SIZE | OVERFLOW */
2898 			na->na_size = MAXOFF32_T;
2899 		} else
2900 			na->na_size = vap->va_size;
2901 	} else
2902 		na->na_size = vap->va_size;
2903 
2904 	/*
2905 	 * If the vnode times overflow the 32-bit times that NFS2
2906 	 * uses on the wire then return an error.
2907 	 */
2908 	if (!NFS_VAP_TIME_OK(vap)) {
2909 		return (EOVERFLOW);
2910 	}
2911 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2912 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2913 
2914 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2915 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2916 
2917 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2918 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2919 
2920 	/*
2921 	 * If the dev_t will fit into 16 bits then compress
2922 	 * it, otherwise leave it alone. See comments in
2923 	 * nfs_client.c.
2924 	 */
2925 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2926 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2927 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2928 	else
2929 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2930 
2931 	na->na_blocks = vap->va_nblocks;
2932 	na->na_blocksize = vap->va_blksize;
2933 
2934 	/*
2935 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2936 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2937 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2938 	 *
2939 	 * BUYER BEWARE:
2940 	 *  If you are porting the NFS to a non-Sun server, you probably
2941 	 *  don't want to include the following block of code.  The
2942 	 *  over-the-wire special file types will be changing with the
2943 	 *  NFS Protocol Revision.
2944 	 */
2945 	if (vap->va_type == VFIFO)
2946 		NA_SETFIFO(na);
2947 	return (0);
2948 }
2949 
2950 /*
2951  * acl v2 support: returns approximate permission.
2952  *	default: returns minimal permission (more restrictive)
2953  *	aclok: returns maximal permission (less restrictive)
2954  *	This routine changes the permissions that are alaredy in *va.
2955  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2956  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2957  */
2958 static void
2959 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2960 {
2961 	vsecattr_t	vsa;
2962 	int		aclcnt;
2963 	aclent_t	*aclentp;
2964 	mode_t		mask_perm;
2965 	mode_t		grp_perm;
2966 	mode_t		other_perm;
2967 	mode_t		other_orig;
2968 	int		error;
2969 
2970 	/* dont care default acl */
2971 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2972 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
2973 
2974 	if (!error) {
2975 		aclcnt = vsa.vsa_aclcnt;
2976 		if (aclcnt > MIN_ACL_ENTRIES) {
2977 			/* non-trivial ACL */
2978 			aclentp = vsa.vsa_aclentp;
2979 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2980 				/* maximal permissions */
2981 				grp_perm = 0;
2982 				other_perm = 0;
2983 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2984 					switch (aclentp->a_type) {
2985 					case USER_OBJ:
2986 						break;
2987 					case USER:
2988 						grp_perm |=
2989 						    aclentp->a_perm << 3;
2990 						other_perm |= aclentp->a_perm;
2991 						break;
2992 					case GROUP_OBJ:
2993 						grp_perm |=
2994 						    aclentp->a_perm << 3;
2995 						break;
2996 					case GROUP:
2997 						other_perm |= aclentp->a_perm;
2998 						break;
2999 					case OTHER_OBJ:
3000 						other_orig = aclentp->a_perm;
3001 						break;
3002 					case CLASS_OBJ:
3003 						mask_perm = aclentp->a_perm;
3004 						break;
3005 					default:
3006 						break;
3007 					}
3008 				}
3009 				grp_perm &= mask_perm << 3;
3010 				other_perm &= mask_perm;
3011 				other_perm |= other_orig;
3012 
3013 			} else {
3014 				/* minimal permissions */
3015 				grp_perm = 070;
3016 				other_perm = 07;
3017 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3018 					switch (aclentp->a_type) {
3019 					case USER_OBJ:
3020 						break;
3021 					case USER:
3022 					case CLASS_OBJ:
3023 						grp_perm &=
3024 						    aclentp->a_perm << 3;
3025 						other_perm &=
3026 						    aclentp->a_perm;
3027 						break;
3028 					case GROUP_OBJ:
3029 						grp_perm &=
3030 						    aclentp->a_perm << 3;
3031 						break;
3032 					case GROUP:
3033 						other_perm &=
3034 						    aclentp->a_perm;
3035 						break;
3036 					case OTHER_OBJ:
3037 						other_perm &=
3038 						    aclentp->a_perm;
3039 						break;
3040 					default:
3041 						break;
3042 					}
3043 				}
3044 			}
3045 			/* copy to va */
3046 			va->va_mode &= ~077;
3047 			va->va_mode |= grp_perm | other_perm;
3048 		}
3049 		if (vsa.vsa_aclcnt)
3050 			kmem_free(vsa.vsa_aclentp,
3051 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3052 	}
3053 }
3054 
/*
 * Initialization for this file: sets up rfs_async_write_lock as a
 * MUTEX_DEFAULT mutex.  rfs_srvrfini() destroys it again.
 */
void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
}
3060 
/*
 * Teardown for this file: destroys rfs_async_write_lock, the mutex
 * set up by rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
3066