xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 65a89a64c60f3061bbe2381edaacc81660af9a95)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All rights reserved.
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/uio.h>
42 #include <sys/stat.h>
43 #include <sys/errno.h>
44 #include <sys/sysmacros.h>
45 #include <sys/statvfs.h>
46 #include <sys/kmem.h>
47 #include <sys/kstat.h>
48 #include <sys/dirent.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/vtrace.h>
52 #include <sys/mode.h>
53 #include <sys/acl.h>
54 #include <sys/nbmlock.h>
55 #include <sys/policy.h>
56 
57 #include <rpc/types.h>
58 #include <rpc/auth.h>
59 #include <rpc/svc.h>
60 
61 #include <nfs/nfs.h>
62 #include <nfs/export.h>
63 
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_kmem.h>
69 
70 #include <sys/strsubr.h>
71 
72 /*
73  * These are the interface routines for the server side of the
74  * Network File System.  See the NFS version 2 protocol specification
75  * for a description of this interface.
76  */
77 
78 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
79 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
80 			cred_t *);
81 
82 /*
83  * Some "over the wire" UNIX file types.  These are encoded
84  * into the mode.  This needs to be fixed in the next rev.
85  */
86 #define	IFMT		0170000		/* type of file */
87 #define	IFCHR		0020000		/* character special */
88 #define	IFBLK		0060000		/* block special */
89 #define	IFSOCK		0140000		/* socket */
90 
91 /*
92  * Get file attributes.
93  * Returns the current attributes of the file with the given fhandle.
94  */
95 /* ARGSUSED */
96 void
97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
98 	struct svc_req *req, cred_t *cr)
99 {
100 	int error;
101 	vnode_t *vp;
102 	struct vattr va;
103 
104 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
105 		"rfs_getattr_start:");
106 
107 	vp = nfs_fhtovp(fhp, exi);
108 	if (vp == NULL) {
109 		ns->ns_status = NFSERR_STALE;
110 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
111 			"rfs_getattr_end:(%S)", "stale");
112 		return;
113 	}
114 
115 	/*
116 	 * Do the getattr.
117 	 */
118 	va.va_mask = AT_ALL;	/* we want all the attributes */
119 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
120 		"vop_getattr_start:");
121 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
122 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
123 		"vop_getattr_end:");
124 
125 	/* check for overflows */
126 	if (!error) {
127 		acl_perm(vp, exi, &va, cr);
128 		error = vattr_to_nattr(&va, &ns->ns_attr);
129 	}
130 
131 	VN_RELE(vp);
132 
133 	ns->ns_status = puterrno(error);
134 
135 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
136 		"rfs_getattr_end:(%S)", "done");
137 }
138 fhandle_t *
139 rfs_getattr_getfh(fhandle_t *fhp)
140 {
141 	return (fhp);
142 }
143 
144 /*
145  * Set file attributes.
146  * Sets the attributes of the file with the given fhandle.  Returns
147  * the new attributes.
148  */
149 void
150 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
151 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
152 {
153 	int error;
154 	int flag;
155 	int in_crit = 0;
156 	vnode_t *vp;
157 	struct vattr va;
158 	struct vattr bva;
159 	struct flock64 bf;
160 
161 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
162 		"rfs_setattr_start:");
163 
164 	vp = nfs_fhtovp(&args->saa_fh, exi);
165 	if (vp == NULL) {
166 		ns->ns_status = NFSERR_STALE;
167 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
168 			"rfs_setattr_end:(%S)", "stale");
169 		return;
170 	}
171 
172 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
173 		VN_RELE(vp);
174 		ns->ns_status = NFSERR_ROFS;
175 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
176 			"rfs_setattr_end:(%S)", "rofs");
177 		return;
178 	}
179 
180 	error = sattr_to_vattr(&args->saa_sa, &va);
181 	if (error) {
182 		VN_RELE(vp);
183 		ns->ns_status = puterrno(error);
184 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
185 			"rfs_setattr_end:(%S)", "sattr");
186 		return;
187 	}
188 
189 	/*
190 	 * If the client is requesting a change to the mtime,
191 	 * but the nanosecond field is set to 1 billion, then
192 	 * this is a flag to the server that it should set the
193 	 * atime and mtime fields to the server's current time.
194 	 * The 1 billion number actually came from the client
195 	 * as 1 million, but the units in the over the wire
196 	 * request are microseconds instead of nanoseconds.
197 	 *
198 	 * This is an overload of the protocol and should be
199 	 * documented in the NFS Version 2 protocol specification.
200 	 */
201 	if (va.va_mask & AT_MTIME) {
202 		if (va.va_mtime.tv_nsec == 1000000000) {
203 			gethrestime(&va.va_mtime);
204 			va.va_atime = va.va_mtime;
205 			va.va_mask |= AT_ATIME;
206 			flag = 0;
207 		} else
208 			flag = ATTR_UTIME;
209 	} else
210 		flag = 0;
211 
212 	/*
213 	 * If the filesystem is exported with nosuid, then mask off
214 	 * the setuid and setgid bits.
215 	 */
216 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
217 	    (exi->exi_export.ex_flags & EX_NOSUID))
218 		va.va_mode &= ~(VSUID | VSGID);
219 
220 	/*
221 	 * We need to specially handle size changes because it is
222 	 * possible for the client to create a file with modes
223 	 * which indicate read-only, but with the file opened for
224 	 * writing.  If the client then tries to set the size of
225 	 * the file, then the normal access checking done in
226 	 * VOP_SETATTR would prevent the client from doing so,
227 	 * although it should be legal for it to do so.  To get
228 	 * around this, we do the access checking for ourselves
229 	 * and then use VOP_SPACE which doesn't do the access
230 	 * checking which VOP_SETATTR does. VOP_SPACE can only
231 	 * operate on VREG files, let VOP_SETATTR handle the other
232 	 * extremely rare cases.
233 	 * Also the client should not be allowed to change the
234 	 * size of the file if there is a conflicting non-blocking
235 	 * mandatory lock in the region of change.
236 	 *
237 	 * Also(2), check to see if the v4 side of the server has
238 	 * delegated this file.  If so, then we set T_WOULDBLOCK
239 	 * so that the dispatch function dosn't send a reply, forcing
240 	 * the client to retrasmit its request.
241 	 */
242 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
243 		/* If delegated, mark as wouldblock so response is dropped */
244 		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
245 			VN_RELE(vp);
246 			curthread->t_flag |= T_WOULDBLOCK;
247 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
248 				"rfs_setattr_end:(%S)", "delegated");
249 			return;
250 		}
251 		if (nbl_need_check(vp)) {
252 			nbl_start_crit(vp, RW_READER);
253 			in_crit = 1;
254 		}
255 
256 		bva.va_mask = AT_UID | AT_SIZE;
257 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
258 			"vop_getattr_start:");
259 		error = VOP_GETATTR(vp, &bva, 0, cr);
260 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
261 			"vop_getattr_end:");
262 		if (error) {
263 			if (in_crit)
264 				nbl_end_crit(vp);
265 			VN_RELE(vp);
266 			ns->ns_status = puterrno(error);
267 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
268 				"rfs_setattr_end:(%S)", "getattr");
269 			return;
270 		}
271 
272 		if (in_crit) {
273 			u_offset_t offset;
274 			ssize_t length;
275 
276 			if (va.va_size < bva.va_size) {
277 				offset = va.va_size;
278 				length = bva.va_size - va.va_size;
279 			} else {
280 				offset = bva.va_size;
281 				length = va.va_size - bva.va_size;
282 			}
283 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
284 				error = EACCES;
285 			}
286 		}
287 
288 		if (crgetuid(cr) == bva.va_uid && !error &&
289 			va.va_size != bva.va_size) {
290 			va.va_mask &= ~AT_SIZE;
291 			bf.l_type = F_WRLCK;
292 			bf.l_whence = 0;
293 			bf.l_start = (off64_t)va.va_size;
294 			bf.l_len = 0;
295 			bf.l_sysid = 0;
296 			bf.l_pid = 0;
297 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
298 				"vop_space_start:");
299 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
300 					(offset_t)va.va_size, cr, NULL);
301 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
302 				"vop_space_end:");
303 		}
304 		if (in_crit)
305 			nbl_end_crit(vp);
306 	} else
307 		error = 0;
308 
309 	/*
310 	 * Do the setattr.
311 	 */
312 	if (!error && va.va_mask) {
313 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
314 			"vop_setattr_start:");
315 		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
316 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
317 			"vop_setattr_end:");
318 	}
319 
320 	if (!error) {
321 		va.va_mask = AT_ALL;	/* get everything */
322 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
323 			"vop_getattr_start:");
324 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
325 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
326 			"vop_getattr_end:");
327 
328 		/* check for overflows */
329 		if (!error) {
330 			acl_perm(vp, exi, &va, cr);
331 			error = vattr_to_nattr(&va, &ns->ns_attr);
332 		}
333 	}
334 
335 	/*
336 	 * Force modified metadata out to stable storage.
337 	 */
338 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
339 
340 	VN_RELE(vp);
341 
342 	ns->ns_status = puterrno(error);
343 
344 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
345 		"rfs_setattr_end:(%S)", "done");
346 }
347 fhandle_t *
348 rfs_setattr_getfh(struct nfssaargs *args)
349 {
350 	return (&args->saa_fh);
351 }
352 
353 /*
354  * Directory lookup.
355  * Returns an fhandle and file attributes for file name in a directory.
356  */
357 /* ARGSUSED */
358 void
359 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
360 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
361 {
362 	int error;
363 	vnode_t *dvp;
364 	vnode_t *vp;
365 	struct vattr va;
366 	fhandle_t *fhp = da->da_fhandle;
367 	struct sec_ol sec = {0, 0};
368 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
369 
370 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
371 		"rfs_lookup_start:");
372 
373 	/*
374 	 * Disallow NULL paths
375 	 */
376 	if (da->da_name == NULL || *da->da_name == '\0') {
377 		dr->dr_status = NFSERR_ACCES;
378 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
379 			"rfs_lookup_end:(%S)", "access");
380 		return;
381 	}
382 
383 	/*
384 	 * Allow lookups from the root - the default
385 	 * location of the public filehandle.
386 	 */
387 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
388 		dvp = rootdir;
389 		VN_HOLD(dvp);
390 	} else {
391 		dvp = nfs_fhtovp(fhp, exi);
392 		if (dvp == NULL) {
393 			dr->dr_status = NFSERR_STALE;
394 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
395 				"rfs_lookup_end:(%S)", "stale");
396 			return;
397 		}
398 	}
399 
400 	/*
401 	 * Not allow lookup beyond root.
402 	 * If the filehandle matches a filehandle of the exi,
403 	 * then the ".." refers beyond the root of an exported filesystem.
404 	 */
405 	if (strcmp(da->da_name, "..") == 0 &&
406 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
407 		VN_RELE(dvp);
408 		dr->dr_status = NFSERR_NOENT;
409 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
410 			"rfs_lookup_end:(%S)", "noent");
411 		return;
412 	}
413 
414 	/*
415 	 * If the public filehandle is used then allow
416 	 * a multi-component lookup, i.e. evaluate
417 	 * a pathname and follow symbolic links if
418 	 * necessary.
419 	 *
420 	 * This may result in a vnode in another filesystem
421 	 * which is OK as long as the filesystem is exported.
422 	 */
423 	if (PUBLIC_FH2(fhp)) {
424 		publicfh_flag = TRUE;
425 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
426 					&sec);
427 	} else {
428 		/*
429 		 * Do a normal single component lookup.
430 		 */
431 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
432 			"vop_lookup_start:");
433 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
434 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
435 			"vop_lookup_end:");
436 	}
437 
438 	if (!error) {
439 		va.va_mask = AT_ALL;	/* we want everything */
440 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
441 			"vop_getattr_start:");
442 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
443 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
444 			"vop_getattr_end:");
445 		/* check for overflows */
446 		if (!error) {
447 			acl_perm(vp, exi, &va, cr);
448 			error = vattr_to_nattr(&va, &dr->dr_attr);
449 			if (!error) {
450 				if (sec.sec_flags & SEC_QUERY)
451 					error = makefh_ol(&dr->dr_fhandle, exi,
452 							sec.sec_index);
453 				else {
454 					error = makefh(&dr->dr_fhandle, vp,
455 								exi);
456 					if (!error && publicfh_flag &&
457 						!chk_clnt_sec(exi, req))
458 						auth_weak = TRUE;
459 				}
460 			}
461 		}
462 		VN_RELE(vp);
463 	}
464 
465 	VN_RELE(dvp);
466 
467 	/*
468 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
469 	 * and have obtained a new exportinfo in exi which needs to be
470 	 * released. Note the the original exportinfo pointed to by exi
471 	 * will be released by the caller, comon_dispatch.
472 	 */
473 	if (publicfh_flag && exi != NULL)
474 		exi_rele(exi);
475 
476 	/*
477 	 * If it's public fh, no 0x81, and client's flavor is
478 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
479 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
480 	 */
481 	if (auth_weak)
482 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
483 	else
484 		dr->dr_status = puterrno(error);
485 
486 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
487 		"rfs_lookup_end:(%S)", "done");
488 }
489 fhandle_t *
490 rfs_lookup_getfh(struct nfsdiropargs *da)
491 {
492 	return (da->da_fhandle);
493 }
494 
495 /*
496  * Read symbolic link.
497  * Returns the string in the symbolic link at the given fhandle.
498  */
499 /* ARGSUSED */
500 void
501 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
502 	struct svc_req *req, cred_t *cr)
503 {
504 	int error;
505 	struct iovec iov;
506 	struct uio uio;
507 	vnode_t *vp;
508 	struct vattr va;
509 
510 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
511 		"rfs_readlink_start:");
512 
513 	vp = nfs_fhtovp(fhp, exi);
514 	if (vp == NULL) {
515 		rl->rl_data = NULL;
516 		rl->rl_status = NFSERR_STALE;
517 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
518 			"rfs_readlink_end:(%S)", "stale");
519 		return;
520 	}
521 
522 	va.va_mask = AT_MODE;
523 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
524 		"vop_getattr_start:");
525 	error = VOP_GETATTR(vp, &va, 0, cr);
526 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
527 		"vop_getattr_end:");
528 
529 	if (error) {
530 		VN_RELE(vp);
531 		rl->rl_data = NULL;
532 		rl->rl_status = puterrno(error);
533 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
534 			"rfs_readlink_end:(%S)", "getattr error");
535 		return;
536 	}
537 
538 	if (MANDLOCK(vp, va.va_mode)) {
539 		VN_RELE(vp);
540 		rl->rl_data = NULL;
541 		rl->rl_status = NFSERR_ACCES;
542 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
543 			"rfs_readlink_end:(%S)", "access");
544 		return;
545 	}
546 
547 	/*
548 	 * XNFS and RFC1094 require us to return ENXIO if argument
549 	 * is not a link. BUGID 1138002.
550 	 */
551 	if (vp->v_type != VLNK) {
552 		VN_RELE(vp);
553 		rl->rl_data = NULL;
554 		rl->rl_status = NFSERR_NXIO;
555 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
556 			"rfs_readlink_end:(%S)", "nxio");
557 		return;
558 	}
559 
560 	/*
561 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
562 	 */
563 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
564 
565 	/*
566 	 * Set up io vector to read sym link data
567 	 */
568 	iov.iov_base = rl->rl_data;
569 	iov.iov_len = NFS_MAXPATHLEN;
570 	uio.uio_iov = &iov;
571 	uio.uio_iovcnt = 1;
572 	uio.uio_segflg = UIO_SYSSPACE;
573 	uio.uio_extflg = UIO_COPY_CACHED;
574 	uio.uio_loffset = (offset_t)0;
575 	uio.uio_resid = NFS_MAXPATHLEN;
576 
577 	/*
578 	 * Do the readlink.
579 	 */
580 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
581 		"vop_readlink_start:");
582 	error = VOP_READLINK(vp, &uio, cr);
583 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
584 		"vop_readlink_end:");
585 
586 #if 0 /* notyet */
587 	/*
588 	 * Don't do this.  It causes local disk writes when just
589 	 * reading the file and the overhead is deemed larger
590 	 * than the benefit.
591 	 */
592 	/*
593 	 * Force modified metadata out to stable storage.
594 	 */
595 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
596 #endif
597 
598 	VN_RELE(vp);
599 
600 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
601 
602 	/*
603 	 * XNFS and RFC1094 require us to return ENXIO if argument
604 	 * is not a link. UFS returns EINVAL if this is the case,
605 	 * so we do the mapping here. BUGID 1138002.
606 	 */
607 	if (error == EINVAL)
608 		rl->rl_status = NFSERR_NXIO;
609 	else
610 		rl->rl_status = puterrno(error);
611 
612 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
613 		"rfs_readlink_end:(%S)", "done");
614 }
615 fhandle_t *
616 rfs_readlink_getfh(fhandle_t *fhp)
617 {
618 	return (fhp);
619 }
620 /*
621  * Free data allocated by rfs_readlink
622  */
623 void
624 rfs_rlfree(struct nfsrdlnres *rl)
625 {
626 	if (rl->rl_data != NULL)
627 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
628 }
629 
630 /*
631  * Read data.
632  * Returns some data read from the file at the given fhandle.
633  */
634 /* ARGSUSED */
635 void
636 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
637 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
638 {
639 	vnode_t *vp;
640 	int error;
641 	struct vattr va;
642 	struct iovec iov;
643 	struct uio uio;
644 	mblk_t *mp;
645 	int alloc_err = 0;
646 	int in_crit = 0;
647 
648 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
649 		"rfs_read_start:");
650 
651 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
652 	if (vp == NULL) {
653 		rr->rr_data = NULL;
654 		rr->rr_status = NFSERR_STALE;
655 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
656 			"rfs_read_end:(%S)", "stale");
657 		return;
658 	}
659 
660 	if (vp->v_type != VREG) {
661 		VN_RELE(vp);
662 		rr->rr_data = NULL;
663 		rr->rr_status = NFSERR_ISDIR;
664 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
665 			"rfs_read_end:(%S)", "isdir");
666 		return;
667 	}
668 
669 	/*
670 	 * Check to see if the v4 side of the server has delegated
671 	 * this file.  If so, then we mark thread as wouldblock so
672 	 * the response is dropped.
673 	 */
674 	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
675 		VN_RELE(vp);
676 		curthread->t_flag |= T_WOULDBLOCK;
677 		rr->rr_data = NULL;
678 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
679 			"rfs_read_end:(%S)", "delegated");
680 		return;
681 	}
682 
683 	/*
684 	 * Enter the critical region before calling VOP_RWLOCK
685 	 * to avoid a deadlock with write requests.
686 	 */
687 	if (nbl_need_check(vp)) {
688 		nbl_start_crit(vp, RW_READER);
689 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
690 		    0)) {
691 			nbl_end_crit(vp);
692 			VN_RELE(vp);
693 			rr->rr_data = NULL;
694 			rr->rr_status = NFSERR_ACCES;
695 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
696 				"rfs_read_end:(%S)", " csf access error");
697 			return;
698 		}
699 		in_crit = 1;
700 	}
701 
702 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
703 		"vop_rwlock_start:");
704 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
705 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
706 		"vop_rwlock_end:");
707 
708 	va.va_mask = AT_ALL;
709 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
710 		"vop_getattr_start:");
711 	error = VOP_GETATTR(vp, &va, 0, cr);
712 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
713 		"vop_getattr_end:");
714 
715 	if (error) {
716 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
717 			"vop_rwunlock_start:");
718 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
719 		if (in_crit)
720 			nbl_end_crit(vp);
721 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
722 			"vop_rwunlock_end:");
723 		VN_RELE(vp);
724 		rr->rr_data = NULL;
725 		rr->rr_status = puterrno(error);
726 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
727 			"rfs_read_end:(%S)", "getattr error");
728 		return;
729 	}
730 
731 	/*
732 	 * This is a kludge to allow reading of files created
733 	 * with no read permission.  The owner of the file
734 	 * is always allowed to read it.
735 	 */
736 	if (crgetuid(cr) != va.va_uid) {
737 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
738 			"vop_access_start:");
739 		error = VOP_ACCESS(vp, VREAD, 0, cr);
740 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
741 			"vop_access_end:");
742 		if (error) {
743 			/*
744 			 * Exec is the same as read over the net because
745 			 * of demand loading.
746 			 */
747 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
748 				"vop_access_start:");
749 			error = VOP_ACCESS(vp, VEXEC, 0, cr);
750 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
751 				"vop_access_end:");
752 		}
753 		if (error) {
754 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
755 				"vop_rwunlock_start:");
756 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
757 			if (in_crit)
758 				nbl_end_crit(vp);
759 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
760 				"vop_rwunlock_end:");
761 			VN_RELE(vp);
762 			rr->rr_data = NULL;
763 			rr->rr_status = puterrno(error);
764 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
765 				"rfs_read_end:(%S)", "access error");
766 			return;
767 		}
768 	}
769 
770 	if (MANDLOCK(vp, va.va_mode)) {
771 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
772 			"vop_rwunlock_start:");
773 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
774 		if (in_crit)
775 			nbl_end_crit(vp);
776 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
777 			"vop_rwunlock_end:");
778 		VN_RELE(vp);
779 		rr->rr_data = NULL;
780 		rr->rr_status = NFSERR_ACCES;
781 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
782 			"rfs_read_end:(%S)", "mand lock");
783 		return;
784 	}
785 
786 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
787 		rr->rr_count = 0;
788 		rr->rr_data = NULL;
789 		/*
790 		 * In this case, status is NFS_OK, but there is no data
791 		 * to encode. So set rr_mp to NULL.
792 		 */
793 		rr->rr_mp = NULL;
794 		goto done;
795 	}
796 
797 	/*
798 	 * mp will contain the data to be sent out in the read reply.
799 	 * This will be freed after the reply has been sent out (by the
800 	 * driver).
801 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
802 	 * that the call to xdrmblk_putmblk() never fails.
803 	 */
804 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
805 	    &alloc_err);
806 	ASSERT(mp != NULL);
807 	ASSERT(alloc_err == 0);
808 
809 	rr->rr_mp = mp;
810 
811 	/*
812 	 * Set up io vector
813 	 */
814 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
815 	iov.iov_len = ra->ra_count;
816 	uio.uio_iov = &iov;
817 	uio.uio_iovcnt = 1;
818 	uio.uio_segflg = UIO_SYSSPACE;
819 	uio.uio_extflg = UIO_COPY_CACHED;
820 	uio.uio_loffset = (offset_t)ra->ra_offset;
821 	uio.uio_resid = ra->ra_count;
822 
823 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
824 		"vop_read_start:");
825 	error = VOP_READ(vp, &uio, 0, cr, NULL);
826 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
827 		"vop_read_end:");
828 
829 	if (error) {
830 		freeb(mp);
831 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
832 			"vop_rwunlock_start:");
833 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
834 		if (in_crit)
835 			nbl_end_crit(vp);
836 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
837 			"vop_rwunlock_end:");
838 		VN_RELE(vp);
839 		rr->rr_data = NULL;
840 		rr->rr_status = puterrno(error);
841 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
842 			"rfs_read_end:(%S)", "read error");
843 		return;
844 	}
845 
846 	/*
847 	 * Get attributes again so we can send the latest access
848 	 * time to the client side for his cache.
849 	 */
850 	va.va_mask = AT_ALL;
851 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
852 		"vop_getattr_start:");
853 	error = VOP_GETATTR(vp, &va, 0, cr);
854 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
855 		"vop_getattr_end:");
856 	if (error) {
857 		freeb(mp);
858 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
859 			"vop_rwunlock_start:");
860 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
861 		if (in_crit)
862 			nbl_end_crit(vp);
863 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
864 			"vop_rwunlock_end:");
865 		VN_RELE(vp);
866 		rr->rr_data = NULL;
867 		rr->rr_status = puterrno(error);
868 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
869 			"rfs_read_end:(%S)", "read error");
870 		return;
871 	}
872 
873 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
874 
875 	rr->rr_data = (char *)mp->b_datap->db_base;
876 
877 done:
878 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
879 		"vop_rwunlock_start:");
880 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
881 	if (in_crit)
882 		nbl_end_crit(vp);
883 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
884 		"vop_rwunlock_end:");
885 
886 	acl_perm(vp, exi, &va, cr);
887 
888 	/* check for overflows */
889 	error = vattr_to_nattr(&va, &rr->rr_attr);
890 
891 #if 0 /* notyet */
892 	/*
893 	 * Don't do this.  It causes local disk writes when just
894 	 * reading the file and the overhead is deemed larger
895 	 * than the benefit.
896 	 */
897 	/*
898 	 * Force modified metadata out to stable storage.
899 	 */
900 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
901 #endif
902 
903 	VN_RELE(vp);
904 
905 	rr->rr_status = puterrno(error);
906 
907 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
908 		"rfs_read_end:(%S)", "done");
909 }
910 
911 /*
912  * Free data allocated by rfs_read
913  */
914 void
915 rfs_rdfree(struct nfsrdresult *rr)
916 {
917 	mblk_t *mp;
918 
919 	if (rr->rr_status == NFS_OK) {
920 		mp = rr->rr_mp;
921 		if (mp != NULL)
922 			freeb(mp);
923 	}
924 }
925 
926 fhandle_t *
927 rfs_read_getfh(struct nfsreadargs *ra)
928 {
929 	return (&ra->ra_fhandle);
930 }
931 
932 #define	MAX_IOVECS	12
933 
934 #ifdef DEBUG
935 static int rfs_write_sync_hits = 0;
936 static int rfs_write_sync_misses = 0;
937 #endif
938 
939 /*
940  * Write data to file.
941  * Returns attributes of a file after writing some data to it.
942  *
943  * Any changes made here, especially in error handling might have
944  * to also be done in rfs_write (which clusters write requests).
945  */
946 void
947 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
948 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
949 {
950 	int error;
951 	vnode_t *vp;
952 	rlim64_t rlimit;
953 	struct vattr va;
954 	struct uio uio;
955 	struct iovec iov[MAX_IOVECS];
956 	mblk_t *m;
957 	struct iovec *iovp;
958 	int iovcnt;
959 	cred_t *savecred;
960 	int in_crit = 0;
961 
962 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
963 		"rfs_write_start:(%S)", "sync");
964 
965 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
966 	if (vp == NULL) {
967 		ns->ns_status = NFSERR_STALE;
968 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
969 			"rfs_write_end:(%S)", "stale");
970 		return;
971 	}
972 
973 	if (rdonly(exi, req)) {
974 		VN_RELE(vp);
975 		ns->ns_status = NFSERR_ROFS;
976 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
977 			"rfs_write_end:(%S)", "rofs");
978 		return;
979 	}
980 
981 	if (vp->v_type != VREG) {
982 		VN_RELE(vp);
983 		ns->ns_status = NFSERR_ISDIR;
984 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
985 			"rfs_write_end:(%S)", "isdir");
986 		return;
987 	}
988 
989 	/*
990 	 * Check to see if the v4 side of the server has delegated
991 	 * this file.  If so, then we mark thread as wouldblock so
992 	 * the response is dropped.
993 	 */
994 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
995 		VN_RELE(vp);
996 		curthread->t_flag |= T_WOULDBLOCK;
997 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
998 			"rfs_write_end:(%S)", "delegated");
999 		return;
1000 	}
1001 
1002 	va.va_mask = AT_UID|AT_MODE;
1003 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1004 		"vop_getattr_start:");
1005 	error = VOP_GETATTR(vp, &va, 0, cr);
1006 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1007 		"vop_getattr_end:");
1008 
1009 	if (error) {
1010 		VN_RELE(vp);
1011 		ns->ns_status = puterrno(error);
1012 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1013 			"rfs_write_end:(%S)", "getattr error");
1014 		return;
1015 	}
1016 
1017 	if (crgetuid(cr) != va.va_uid) {
1018 		/*
1019 		 * This is a kludge to allow writes of files created
1020 		 * with read only permission.  The owner of the file
1021 		 * is always allowed to write it.
1022 		 */
1023 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1024 			"vop_access_start:");
1025 		error = VOP_ACCESS(vp, VWRITE, 0, cr);
1026 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1027 			"vop_access_end:");
1028 		if (error) {
1029 			VN_RELE(vp);
1030 			ns->ns_status = puterrno(error);
1031 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1032 				"rfs_write_end:(%S)", "access error");
1033 			return;
1034 		}
1035 	}
1036 
1037 	/*
1038 	 * Can't access a mandatory lock file.  This might cause
1039 	 * the NFS service thread to block forever waiting for a
1040 	 * lock to be released that will never be released.
1041 	 */
1042 	if (MANDLOCK(vp, va.va_mode)) {
1043 		VN_RELE(vp);
1044 		ns->ns_status = NFSERR_ACCES;
1045 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1046 			"rfs_write_end:(%S)", "mand lock");
1047 		return;
1048 	}
1049 
1050 	/*
1051 	 * We have to enter the critical region before calling VOP_RWLOCK
1052 	 * to avoid a deadlock with ufs.
1053 	 */
1054 	if (nbl_need_check(vp)) {
1055 		nbl_start_crit(vp, RW_READER);
1056 		in_crit = 1;
1057 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1058 		    wa->wa_count, 0)) {
1059 			error = EACCES;
1060 			goto out;
1061 		}
1062 	}
1063 
1064 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1065 		"vop_rwlock_start:");
1066 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1067 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1068 		"vop_rwlock_end:");
1069 
1070 	if (wa->wa_data) {
1071 		iov[0].iov_base = wa->wa_data;
1072 		iov[0].iov_len = wa->wa_count;
1073 		uio.uio_iov = iov;
1074 		uio.uio_iovcnt = 1;
1075 		uio.uio_segflg = UIO_SYSSPACE;
1076 		uio.uio_extflg = UIO_COPY_DEFAULT;
1077 		uio.uio_loffset = (offset_t)wa->wa_offset;
1078 		uio.uio_resid = wa->wa_count;
1079 		/*
1080 		 * The limit is checked on the client. We
1081 		 * should allow any size writes here.
1082 		 */
1083 		uio.uio_llimit = curproc->p_fsz_ctl;
1084 		rlimit = uio.uio_llimit - wa->wa_offset;
1085 		if (rlimit < (rlim64_t)uio.uio_resid)
1086 			uio.uio_resid = (uint_t)rlimit;
1087 
1088 		/*
1089 		 * for now we assume no append mode
1090 		 */
1091 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1092 			"vop_write_start:(%S)", "sync");
1093 		/*
1094 		 * We're changing creds because VM may fault and we need
1095 		 * the cred of the current thread to be used if quota
1096 		 * checking is enabled.
1097 		 */
1098 		savecred = curthread->t_cred;
1099 		curthread->t_cred = cr;
1100 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1101 		curthread->t_cred = savecred;
1102 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1103 			"vop_write_end:");
1104 	} else {
1105 		iovcnt = 0;
1106 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1107 			iovcnt++;
1108 		if (iovcnt <= MAX_IOVECS) {
1109 #ifdef DEBUG
1110 			rfs_write_sync_hits++;
1111 #endif
1112 			iovp = iov;
1113 		} else {
1114 #ifdef DEBUG
1115 			rfs_write_sync_misses++;
1116 #endif
1117 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1118 		}
1119 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1120 		uio.uio_iov = iovp;
1121 		uio.uio_iovcnt = iovcnt;
1122 		uio.uio_segflg = UIO_SYSSPACE;
1123 		uio.uio_extflg = UIO_COPY_DEFAULT;
1124 		uio.uio_loffset = (offset_t)wa->wa_offset;
1125 		uio.uio_resid = wa->wa_count;
1126 		/*
1127 		 * The limit is checked on the client. We
1128 		 * should allow any size writes here.
1129 		 */
1130 		uio.uio_llimit = curproc->p_fsz_ctl;
1131 		rlimit = uio.uio_llimit - wa->wa_offset;
1132 		if (rlimit < (rlim64_t)uio.uio_resid)
1133 			uio.uio_resid = (uint_t)rlimit;
1134 
1135 		/*
1136 		 * For now we assume no append mode.
1137 		 */
1138 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1139 			"vop_write_start:(%S)", "iov sync");
1140 		/*
1141 		 * We're changing creds because VM may fault and we need
1142 		 * the cred of the current thread to be used if quota
1143 		 * checking is enabled.
1144 		 */
1145 		savecred = curthread->t_cred;
1146 		curthread->t_cred = cr;
1147 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1148 		curthread->t_cred = savecred;
1149 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1150 			"vop_write_end:");
1151 
1152 		if (iovp != iov)
1153 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1154 	}
1155 
1156 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1157 		"vop_rwunlock_start:");
1158 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1159 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1160 		"vop_rwunlock_end:");
1161 
1162 	if (!error) {
1163 		/*
1164 		 * Get attributes again so we send the latest mod
1165 		 * time to the client side for his cache.
1166 		 */
1167 		va.va_mask = AT_ALL;	/* now we want everything */
1168 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1169 			"vop_getattr_start:");
1170 		error = VOP_GETATTR(vp, &va, 0, cr);
1171 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1172 			"vop_getattr_end:");
1173 		/* check for overflows */
1174 		if (!error) {
1175 			acl_perm(vp, exi, &va, cr);
1176 			error = vattr_to_nattr(&va, &ns->ns_attr);
1177 		}
1178 	}
1179 
1180 out:
1181 	if (in_crit)
1182 		nbl_end_crit(vp);
1183 	VN_RELE(vp);
1184 
1185 	ns->ns_status = puterrno(error);
1186 
1187 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1188 		"rfs_write_end:(%S)", "sync");
1189 }
1190 
1191 struct rfs_async_write {
1192 	struct nfswriteargs *wa;
1193 	struct nfsattrstat *ns;
1194 	struct svc_req *req;
1195 	cred_t *cr;
1196 	kthread_t *thread;
1197 	struct rfs_async_write *list;
1198 };
1199 
1200 struct rfs_async_write_list {
1201 	fhandle_t *fhp;
1202 	kcondvar_t cv;
1203 	struct rfs_async_write *list;
1204 	struct rfs_async_write_list *next;
1205 };
1206 
1207 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1208 static kmutex_t rfs_async_write_lock;
1209 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1210 
1211 #define	MAXCLIOVECS	42
1212 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1213 
1214 #ifdef DEBUG
1215 static int rfs_write_hits = 0;
1216 static int rfs_write_misses = 0;
1217 #endif
1218 
1219 /*
1220  * Write data to file.
1221  * Returns attributes of a file after writing some data to it.
1222  */
1223 void
1224 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1225 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1226 {
1227 	int error;
1228 	vnode_t *vp;
1229 	rlim64_t rlimit;
1230 	struct vattr va;
1231 	struct uio uio;
1232 	struct rfs_async_write_list *lp;
1233 	struct rfs_async_write_list *nlp;
1234 	struct rfs_async_write *rp;
1235 	struct rfs_async_write *nrp;
1236 	struct rfs_async_write *trp;
1237 	struct rfs_async_write *lrp;
1238 	int data_written;
1239 	int iovcnt;
1240 	mblk_t *m;
1241 	struct iovec *iovp;
1242 	struct iovec *niovp;
1243 	struct iovec iov[MAXCLIOVECS];
1244 	int count;
1245 	int rcount;
1246 	uint_t off;
1247 	uint_t len;
1248 	struct rfs_async_write nrpsp;
1249 	struct rfs_async_write_list nlpsp;
1250 	ushort_t t_flag;
1251 	cred_t *savecred;
1252 	int in_crit = 0;
1253 
1254 	if (!rfs_write_async) {
1255 		rfs_write_sync(wa, ns, exi, req, cr);
1256 		return;
1257 	}
1258 
1259 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1260 		"rfs_write_start:(%S)", "async");
1261 
1262 	/*
1263 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1264 	 * is considered an OK.
1265 	 */
1266 	ns->ns_status = RFSWRITE_INITVAL;
1267 
1268 	nrp = &nrpsp;
1269 	nrp->wa = wa;
1270 	nrp->ns = ns;
1271 	nrp->req = req;
1272 	nrp->cr = cr;
1273 	nrp->thread = curthread;
1274 
1275 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1276 
1277 	/*
1278 	 * Look to see if there is already a cluster started
1279 	 * for this file.
1280 	 */
1281 	mutex_enter(&rfs_async_write_lock);
1282 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1283 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1284 		    sizeof (fhandle_t)) == 0)
1285 			break;
1286 	}
1287 
1288 	/*
1289 	 * If lp is non-NULL, then there is already a cluster
1290 	 * started.  We need to place ourselves in the cluster
1291 	 * list in the right place as determined by starting
1292 	 * offset.  Conflicts with non-blocking mandatory locked
1293 	 * regions will be checked when the cluster is processed.
1294 	 */
1295 	if (lp != NULL) {
1296 		rp = lp->list;
1297 		trp = NULL;
1298 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1299 			trp = rp;
1300 			rp = rp->list;
1301 		}
1302 		nrp->list = rp;
1303 		if (trp == NULL)
1304 			lp->list = nrp;
1305 		else
1306 			trp->list = nrp;
1307 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1308 			cv_wait(&lp->cv, &rfs_async_write_lock);
1309 		mutex_exit(&rfs_async_write_lock);
1310 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1311 			"rfs_write_end:(%S)", "cluster child");
1312 		return;
1313 	}
1314 
1315 	/*
1316 	 * No cluster started yet, start one and add ourselves
1317 	 * to the list of clusters.
1318 	 */
1319 	nrp->list = NULL;
1320 
1321 	nlp = &nlpsp;
1322 	nlp->fhp = &wa->wa_fhandle;
1323 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1324 	nlp->list = nrp;
1325 	nlp->next = NULL;
1326 
1327 	if (rfs_async_write_head == NULL) {
1328 		rfs_async_write_head = nlp;
1329 	} else {
1330 		lp = rfs_async_write_head;
1331 		while (lp->next != NULL)
1332 			lp = lp->next;
1333 		lp->next = nlp;
1334 	}
1335 	mutex_exit(&rfs_async_write_lock);
1336 
1337 	/*
1338 	 * Convert the file handle common to all of the requests
1339 	 * in this cluster to a vnode.
1340 	 */
1341 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1342 	if (vp == NULL) {
1343 		mutex_enter(&rfs_async_write_lock);
1344 		if (rfs_async_write_head == nlp)
1345 			rfs_async_write_head = nlp->next;
1346 		else {
1347 			lp = rfs_async_write_head;
1348 			while (lp->next != nlp)
1349 				lp = lp->next;
1350 			lp->next = nlp->next;
1351 		}
1352 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1353 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1354 			rp->ns->ns_status = NFSERR_STALE;
1355 			rp->thread->t_flag |= t_flag;
1356 		}
1357 		cv_broadcast(&nlp->cv);
1358 		mutex_exit(&rfs_async_write_lock);
1359 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1360 			"rfs_write_end:(%S)", "stale");
1361 		return;
1362 	}
1363 
1364 	/*
1365 	 * Can only write regular files.  Attempts to write any
1366 	 * other file types fail with EISDIR.
1367 	 */
1368 	if (vp->v_type != VREG) {
1369 		VN_RELE(vp);
1370 		mutex_enter(&rfs_async_write_lock);
1371 		if (rfs_async_write_head == nlp)
1372 			rfs_async_write_head = nlp->next;
1373 		else {
1374 			lp = rfs_async_write_head;
1375 			while (lp->next != nlp)
1376 				lp = lp->next;
1377 			lp->next = nlp->next;
1378 		}
1379 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1380 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1381 			rp->ns->ns_status = NFSERR_ISDIR;
1382 			rp->thread->t_flag |= t_flag;
1383 		}
1384 		cv_broadcast(&nlp->cv);
1385 		mutex_exit(&rfs_async_write_lock);
1386 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1387 			"rfs_write_end:(%S)", "isdir");
1388 		return;
1389 	}
1390 
1391 	/*
1392 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1393 	 * deadlock with ufs.
1394 	 */
1395 	if (nbl_need_check(vp)) {
1396 		nbl_start_crit(vp, RW_READER);
1397 		in_crit = 1;
1398 	}
1399 
1400 	/*
1401 	 * Lock the file for writing.  This operation provides
1402 	 * the delay which allows clusters to grow.
1403 	 */
1404 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1405 		"vop_wrlock_start:");
1406 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1407 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1408 		"vop_wrlock_end");
1409 
1410 	/*
1411 	 * Disconnect this cluster from the list of clusters.
1412 	 * The cluster that is being dealt with must be fixed
1413 	 * in size after this point, so there is no reason
1414 	 * to leave it on the list so that new requests can
1415 	 * find it.
1416 	 *
1417 	 * The algorithm is that the first write request will
1418 	 * create a cluster, convert the file handle to a
1419 	 * vnode pointer, and then lock the file for writing.
1420 	 * This request is not likely to be clustered with
1421 	 * any others.  However, the next request will create
1422 	 * a new cluster and be blocked in VOP_RWLOCK while
1423 	 * the first request is being processed.  This delay
1424 	 * will allow more requests to be clustered in this
1425 	 * second cluster.
1426 	 */
1427 	mutex_enter(&rfs_async_write_lock);
1428 	if (rfs_async_write_head == nlp)
1429 		rfs_async_write_head = nlp->next;
1430 	else {
1431 		lp = rfs_async_write_head;
1432 		while (lp->next != nlp)
1433 			lp = lp->next;
1434 		lp->next = nlp->next;
1435 	}
1436 	mutex_exit(&rfs_async_write_lock);
1437 
1438 	/*
1439 	 * Step through the list of requests in this cluster.
1440 	 * We need to check permissions to make sure that all
1441 	 * of the requests have sufficient permission to write
1442 	 * the file.  A cluster can be composed of requests
1443 	 * from different clients and different users on each
1444 	 * client.
1445 	 *
1446 	 * As a side effect, we also calculate the size of the
1447 	 * byte range that this cluster encompasses.
1448 	 */
1449 	rp = nlp->list;
1450 	off = rp->wa->wa_offset;
1451 	len = (uint_t)0;
1452 	do {
1453 		if (rdonly(exi, rp->req)) {
1454 			rp->ns->ns_status = NFSERR_ROFS;
1455 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1456 			rp->thread->t_flag |= t_flag;
1457 			continue;
1458 		}
1459 
1460 		va.va_mask = AT_UID|AT_MODE;
1461 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1462 			"vop_getattr_start:");
1463 		error = VOP_GETATTR(vp, &va, 0, rp->cr);
1464 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1465 			"vop_getattr_end:");
1466 		if (!error) {
1467 			if (crgetuid(rp->cr) != va.va_uid) {
1468 				/*
1469 				 * This is a kludge to allow writes of files
1470 				 * created with read only permission.  The
1471 				 * owner of the file is always allowed to
1472 				 * write it.
1473 				 */
1474 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1475 					"vop_access_start:");
1476 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
1477 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1478 					"vop_access_end:");
1479 			}
1480 			if (!error && MANDLOCK(vp, va.va_mode))
1481 				error = EACCES;
1482 		}
1483 
1484 		/*
1485 		 * Check for a conflict with a nbmand-locked region.
1486 		 */
1487 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1488 		    rp->wa->wa_count, 0)) {
1489 			error = EACCES;
1490 		}
1491 
1492 		if (error) {
1493 			rp->ns->ns_status = puterrno(error);
1494 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1495 			rp->thread->t_flag |= t_flag;
1496 			continue;
1497 		}
1498 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1499 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1500 	} while ((rp = rp->list) != NULL);
1501 
1502 	/*
1503 	 * Step through the cluster attempting to gather as many
1504 	 * requests which are contiguous as possible.  These
1505 	 * contiguous requests are handled via one call to VOP_WRITE
1506 	 * instead of different calls to VOP_WRITE.  We also keep
1507 	 * track of the fact that any data was written.
1508 	 */
1509 	rp = nlp->list;
1510 	data_written = 0;
1511 	do {
1512 		/*
1513 		 * Skip any requests which are already marked as having an
1514 		 * error.
1515 		 */
1516 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1517 			rp = rp->list;
1518 			continue;
1519 		}
1520 
1521 		/*
1522 		 * Count the number of iovec's which are required
1523 		 * to handle this set of requests.  One iovec is
1524 		 * needed for each data buffer, whether addressed
1525 		 * by wa_data or by the b_rptr pointers in the
1526 		 * mblk chains.
1527 		 */
1528 		iovcnt = 0;
1529 		lrp = rp;
1530 		for (;;) {
1531 			if (lrp->wa->wa_data)
1532 				iovcnt++;
1533 			else {
1534 				m = lrp->wa->wa_mblk;
1535 				while (m != NULL) {
1536 					iovcnt++;
1537 					m = m->b_cont;
1538 				}
1539 			}
1540 			if (lrp->list == NULL ||
1541 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1542 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1543 			    lrp->list->wa->wa_offset) {
1544 				lrp = lrp->list;
1545 				break;
1546 			}
1547 			lrp = lrp->list;
1548 		}
1549 
1550 		if (iovcnt <= MAXCLIOVECS) {
1551 #ifdef DEBUG
1552 			rfs_write_hits++;
1553 #endif
1554 			niovp = iov;
1555 		} else {
1556 #ifdef DEBUG
1557 			rfs_write_misses++;
1558 #endif
1559 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1560 		}
1561 		/*
1562 		 * Put together the scatter/gather iovecs.
1563 		 */
1564 		iovp = niovp;
1565 		trp = rp;
1566 		count = 0;
1567 		do {
1568 			if (trp->wa->wa_data) {
1569 				iovp->iov_base = trp->wa->wa_data;
1570 				iovp->iov_len = trp->wa->wa_count;
1571 				iovp++;
1572 			} else {
1573 				m = trp->wa->wa_mblk;
1574 				rcount = trp->wa->wa_count;
1575 				while (m != NULL) {
1576 					iovp->iov_base = (caddr_t)m->b_rptr;
1577 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1578 					rcount -= iovp->iov_len;
1579 					if (rcount < 0)
1580 						iovp->iov_len += rcount;
1581 					iovp++;
1582 					if (rcount <= 0)
1583 						break;
1584 					m = m->b_cont;
1585 				}
1586 			}
1587 			count += trp->wa->wa_count;
1588 			trp = trp->list;
1589 		} while (trp != lrp);
1590 
1591 		uio.uio_iov = niovp;
1592 		uio.uio_iovcnt = iovcnt;
1593 		uio.uio_segflg = UIO_SYSSPACE;
1594 		uio.uio_extflg = UIO_COPY_DEFAULT;
1595 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1596 		uio.uio_resid = count;
1597 		/*
1598 		 * The limit is checked on the client. We
1599 		 * should allow any size writes here.
1600 		 */
1601 		uio.uio_llimit = curproc->p_fsz_ctl;
1602 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1603 		if (rlimit < (rlim64_t)uio.uio_resid)
1604 			uio.uio_resid = (uint_t)rlimit;
1605 
1606 		/*
1607 		 * For now we assume no append mode.
1608 		 */
1609 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1610 			"vop_write_start:(%S)", "async");
1611 
1612 		/*
1613 		 * Check to see if the v4 side of the server has
1614 		 * delegated this file.  If so, then we mark thread
1615 		 * as wouldblock so the response is dropped.
1616 		 */
1617 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1618 			curthread->t_flag |= T_WOULDBLOCK;
1619 			error = EACCES; /* just to have an error */
1620 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1621 				"rfs_write_end:(%S)", "delegated");
1622 		} else {
1623 			/*
1624 			 * We're changing creds because VM may fault
1625 			 * and we need the cred of the current
1626 			 * thread to be used if quota * checking is
1627 			 * enabled.
1628 			 */
1629 			savecred = curthread->t_cred;
1630 			curthread->t_cred = cr;
1631 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1632 			curthread->t_cred = savecred;
1633 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1634 				"vop_write_end:");
1635 		}
1636 
1637 		if (niovp != iov)
1638 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1639 
1640 		if (!error) {
1641 			data_written = 1;
1642 			/*
1643 			 * Get attributes again so we send the latest mod
1644 			 * time to the client side for his cache.
1645 			 */
1646 			va.va_mask = AT_ALL;	/* now we want everything */
1647 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1648 				"vop_getattr_start:");
1649 			error = VOP_GETATTR(vp, &va, 0, rp->cr);
1650 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1651 				"vop_getattr_end:");
1652 			if (!error)
1653 				acl_perm(vp, exi, &va, rp->cr);
1654 		}
1655 
1656 		/*
1657 		 * Fill in the status responses for each request
1658 		 * which was just handled.  Also, copy the latest
1659 		 * attributes in to the attribute responses if
1660 		 * appropriate.
1661 		 */
1662 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1663 		do {
1664 			rp->thread->t_flag |= t_flag;
1665 			/* check for overflows */
1666 			if (!error) {
1667 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1668 			}
1669 			rp->ns->ns_status = puterrno(error);
1670 			rp = rp->list;
1671 		} while (rp != lrp);
1672 	} while (rp != NULL);
1673 
1674 	/*
1675 	 * If any data was written at all, then we need to flush
1676 	 * the data and metadata to stable storage.
1677 	 */
1678 	if (data_written) {
1679 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
1680 			"vop_putpage_start:");
1681 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
1682 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
1683 			"vop_putpage_end:");
1684 		if (!error) {
1685 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1686 				"vop_fsync_start:");
1687 			error = VOP_FSYNC(vp, FNODSYNC, cr);
1688 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
1689 				"vop_fsync_end:");
1690 		}
1691 	}
1692 
1693 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1694 		"vop_rwunlock_start:");
1695 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1696 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1697 		"vop_rwunlock_end:");
1698 
1699 	if (in_crit)
1700 		nbl_end_crit(vp);
1701 	VN_RELE(vp);
1702 
1703 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1704 	mutex_enter(&rfs_async_write_lock);
1705 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1706 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1707 			rp->ns->ns_status = puterrno(error);
1708 			rp->thread->t_flag |= t_flag;
1709 		}
1710 	}
1711 	cv_broadcast(&nlp->cv);
1712 	mutex_exit(&rfs_async_write_lock);
1713 
1714 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1715 		"rfs_write_end:(%S)", "async");
1716 }
1717 
1718 fhandle_t *
1719 rfs_write_getfh(struct nfswriteargs *wa)
1720 {
1721 	return (&wa->wa_fhandle);
1722 }
1723 
1724 /*
1725  * Create a file.
1726  * Creates a file with given attributes and returns those attributes
1727  * and an fhandle for the new file.
1728  */
1729 void
1730 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1731 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1732 {
1733 	int error;
1734 	int lookuperr;
1735 	int in_crit = 0;
1736 	struct vattr va;
1737 	vnode_t *vp;
1738 	vnode_t *dvp;
1739 	char *name = args->ca_da.da_name;
1740 	vnode_t *tvp = NULL;
1741 	int mode;
1742 	int lookup_ok;
1743 	bool_t trunc;
1744 
1745 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
1746 		"rfs_create_start:");
1747 
1748 	/*
1749 	 * Disallow NULL paths
1750 	 */
1751 	if (name == NULL || *name == '\0') {
1752 		dr->dr_status = NFSERR_ACCES;
1753 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1754 			"rfs_create_end:(%S)", "access");
1755 		return;
1756 	}
1757 
1758 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1759 	if (dvp == NULL) {
1760 		dr->dr_status = NFSERR_STALE;
1761 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1762 			"rfs_create_end:(%S)", "stale");
1763 		return;
1764 	}
1765 
1766 	error = sattr_to_vattr(args->ca_sa, &va);
1767 	if (error) {
1768 		dr->dr_status = puterrno(error);
1769 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1770 			"rfs_create_end:(%S)", "sattr");
1771 		return;
1772 	}
1773 
1774 	/*
1775 	 * Must specify the mode.
1776 	 */
1777 	if (!(va.va_mask & AT_MODE)) {
1778 		VN_RELE(dvp);
1779 		dr->dr_status = NFSERR_INVAL;
1780 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1781 			"rfs_create_end:(%S)", "no mode");
1782 		return;
1783 	}
1784 
1785 	/*
1786 	 * This is a completely gross hack to make mknod
1787 	 * work over the wire until we can wack the protocol
1788 	 */
1789 	if ((va.va_mode & IFMT) == IFCHR) {
1790 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1791 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1792 		else {
1793 			va.va_type = VCHR;
1794 			/*
1795 			 * uncompress the received dev_t
1796 			 * if the top half is zero indicating a request
1797 			 * from an `older style' OS.
1798 			 */
1799 			if ((va.va_size & 0xffff0000) == 0)
1800 				va.va_rdev = nfsv2_expdev(va.va_size);
1801 			else
1802 				va.va_rdev = (dev_t)va.va_size;
1803 		}
1804 		va.va_mask &= ~AT_SIZE;
1805 	} else if ((va.va_mode & IFMT) == IFBLK) {
1806 		va.va_type = VBLK;
1807 		/*
1808 		 * uncompress the received dev_t
1809 		 * if the top half is zero indicating a request
1810 		 * from an `older style' OS.
1811 		 */
1812 		if ((va.va_size & 0xffff0000) == 0)
1813 			va.va_rdev = nfsv2_expdev(va.va_size);
1814 		else
1815 			va.va_rdev = (dev_t)va.va_size;
1816 		va.va_mask &= ~AT_SIZE;
1817 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1818 		va.va_type = VSOCK;
1819 	} else
1820 		va.va_type = VREG;
1821 	va.va_mode &= ~IFMT;
1822 	va.va_mask |= AT_TYPE;
1823 
1824 	/*
1825 	 * Why was the choice made to use VWRITE as the mode to the
1826 	 * call to VOP_CREATE ? This results in a bug.  When a client
1827 	 * opens a file that already exists and is RDONLY, the second
1828 	 * open fails with an EACESS because of the mode.
1829 	 * bug ID 1054648.
1830 	 */
1831 	lookup_ok = 0;
1832 	mode = VWRITE;
1833 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1834 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
1835 			"vop_lookup_start:");
1836 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1837 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
1838 			"vop_lookup_end:");
1839 		if (!error) {
1840 			struct vattr at;
1841 
1842 			lookup_ok = 1;
1843 			at.va_mask = AT_MODE;
1844 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1845 				"vop_getattr_start:");
1846 			error = VOP_GETATTR(tvp, &at, 0, cr);
1847 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1848 				"vop_getattr_end:");
1849 			if (!error)
1850 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1851 			VN_RELE(tvp);
1852 			tvp = NULL;
1853 		}
1854 	}
1855 
1856 	if (!lookup_ok) {
1857 		if (rdonly(exi, req)) {
1858 			error = EROFS;
1859 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1860 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1861 			error = EPERM;
1862 		} else {
1863 			error = 0;
1864 		}
1865 	}
1866 
1867 	/*
1868 	 * If file size is being modified on an already existing file
1869 	 * make sure that there are no conflicting non-blocking mandatory
1870 	 * locks in the region being manipulated. Return EACCES if there
1871 	 * are conflicting locks.
1872 	 */
1873 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1874 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1875 
1876 		if (!lookuperr &&
1877 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1878 			VN_RELE(tvp);
1879 			curthread->t_flag |= T_WOULDBLOCK;
1880 			goto out;
1881 		}
1882 
1883 		if (!lookuperr && nbl_need_check(tvp)) {
1884 			/*
1885 			 * The file exists. Now check if it has any
1886 			 * conflicting non-blocking mandatory locks
1887 			 * in the region being changed.
1888 			 */
1889 			struct vattr bva;
1890 			u_offset_t offset;
1891 			ssize_t length;
1892 
1893 			nbl_start_crit(tvp, RW_READER);
1894 			in_crit = 1;
1895 
1896 			bva.va_mask = AT_SIZE;
1897 			error = VOP_GETATTR(tvp, &bva, 0, cr);
1898 			if (!error) {
1899 				if (va.va_size < bva.va_size) {
1900 					offset = va.va_size;
1901 					length = bva.va_size - va.va_size;
1902 				} else {
1903 					offset = bva.va_size;
1904 					length = va.va_size - bva.va_size;
1905 				}
1906 				if (length) {
1907 					if (nbl_conflict(tvp, NBL_WRITE,
1908 						offset, length, 0)) {
1909 						error = EACCES;
1910 					}
1911 				}
1912 			}
1913 			if (error) {
1914 				nbl_end_crit(tvp);
1915 				VN_RELE(tvp);
1916 				in_crit = 0;
1917 			}
1918 		} else if (tvp != NULL) {
1919 			VN_RELE(tvp);
1920 		}
1921 	}
1922 
1923 	if (!error) {
1924 		/*
1925 		 * If filesystem is shared with nosuid the remove any
1926 		 * setuid/setgid bits on create.
1927 		 */
1928 		if (va.va_type == VREG &&
1929 		    exi->exi_export.ex_flags & EX_NOSUID)
1930 			va.va_mode &= ~(VSUID | VSGID);
1931 
1932 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
1933 			"vop_create_start:");
1934 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
1935 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
1936 			"vop_create_end:");
1937 
1938 		if (!error) {
1939 
1940 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1941 				trunc = TRUE;
1942 			else
1943 				trunc = FALSE;
1944 
1945 			if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
1946 				VN_RELE(tvp);
1947 				curthread->t_flag |= T_WOULDBLOCK;
1948 				goto out;
1949 			}
1950 			va.va_mask = AT_ALL;
1951 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1952 				"vop_getattr_start:");
1953 			error = VOP_GETATTR(vp, &va, 0, cr);
1954 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1955 				"vop_getattr_end:");
1956 			/* check for overflows */
1957 			if (!error) {
1958 				acl_perm(vp, exi, &va, cr);
1959 				error = vattr_to_nattr(&va, &dr->dr_attr);
1960 				if (!error) {
1961 					error = makefh(&dr->dr_fhandle, vp,
1962 							exi);
1963 				}
1964 			}
1965 			/*
1966 			 * Force modified metadata out to stable storage.
1967 			 */
1968 			(void) VOP_FSYNC(vp, FNODSYNC, cr);
1969 			VN_RELE(vp);
1970 		}
1971 
1972 		if (in_crit) {
1973 			nbl_end_crit(tvp);
1974 			VN_RELE(tvp);
1975 		}
1976 	}
1977 
1978 	/*
1979 	 * Force modified data and metadata out to stable storage.
1980 	 */
1981 	(void) VOP_FSYNC(dvp, 0, cr);
1982 
1983 out:
1984 
1985 	VN_RELE(dvp);
1986 
1987 	dr->dr_status = puterrno(error);
1988 
1989 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1990 		"rfs_create_end:(%S)", "done");
1991 }
1992 fhandle_t *
1993 rfs_create_getfh(struct nfscreatargs *args)
1994 {
1995 	return (args->ca_da.da_fhandle);
1996 }
1997 
1998 /*
1999  * Remove a file.
2000  * Remove named file from parent directory.
2001  */
2002 void
2003 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2004 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2005 {
2006 	int error = 0;
2007 	vnode_t *vp;
2008 	vnode_t *targvp;
2009 	int in_crit = 0;
2010 
2011 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
2012 		"rfs_remove_start:");
2013 
2014 	/*
2015 	 * Disallow NULL paths
2016 	 */
2017 	if (da->da_name == NULL || *da->da_name == '\0') {
2018 		*status = NFSERR_ACCES;
2019 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2020 			"rfs_remove_end:(%S)", "access");
2021 		return;
2022 	}
2023 
2024 	vp = nfs_fhtovp(da->da_fhandle, exi);
2025 	if (vp == NULL) {
2026 		*status = NFSERR_STALE;
2027 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2028 			"rfs_remove_end:(%S)", "stale");
2029 		return;
2030 	}
2031 
2032 	if (rdonly(exi, req)) {
2033 		VN_RELE(vp);
2034 		*status = NFSERR_ROFS;
2035 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2036 			"rfs_remove_end:(%S)", "rofs");
2037 		return;
2038 	}
2039 
2040 	/*
2041 	 * Check for a conflict with a non-blocking mandatory share reservation.
2042 	 */
2043 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2044 			NULL, cr);
2045 	if (error != 0) {
2046 		VN_RELE(vp);
2047 		*status = puterrno(error);
2048 		return;
2049 	}
2050 
2051 	/*
2052 	 * If the file is delegated to an v4 client, then initiate
2053 	 * recall and drop this request (by setting T_WOULDBLOCK).
2054 	 * The client will eventually re-transmit the request and
2055 	 * (hopefully), by then, the v4 client will have returned
2056 	 * the delegation.
2057 	 */
2058 
2059 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2060 		VN_RELE(vp);
2061 		VN_RELE(targvp);
2062 		curthread->t_flag |= T_WOULDBLOCK;
2063 		return;
2064 	}
2065 
2066 	if (nbl_need_check(targvp)) {
2067 		nbl_start_crit(targvp, RW_READER);
2068 		in_crit = 1;
2069 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
2070 			error = EACCES;
2071 			goto out;
2072 		}
2073 	}
2074 
2075 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
2076 		"vop_remove_start:");
2077 	error = VOP_REMOVE(vp, da->da_name, cr);
2078 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
2079 		"vop_remove_end:");
2080 
2081 	/*
2082 	 * Force modified data and metadata out to stable storage.
2083 	 */
2084 	(void) VOP_FSYNC(vp, 0, cr);
2085 
2086 out:
2087 	if (in_crit)
2088 		nbl_end_crit(targvp);
2089 	VN_RELE(targvp);
2090 	VN_RELE(vp);
2091 
2092 	*status = puterrno(error);
2093 
2094 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2095 		"rfs_remove_end:(%S)", "done");
2096 }
2097 
2098 fhandle_t *
2099 rfs_remove_getfh(struct nfsdiropargs *da)
2100 {
2101 	return (da->da_fhandle);
2102 }
2103 
2104 /*
2105  * rename a file
2106  * Give a file (from) a new name (to).
2107  */
2108 void
2109 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2110 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2111 {
2112 	int error = 0;
2113 	vnode_t *fromvp;
2114 	vnode_t *tovp;
2115 	struct exportinfo *to_exi;
2116 	fhandle_t *fh;
2117 	vnode_t *srcvp;
2118 	vnode_t *targvp;
2119 	int in_crit = 0;
2120 
2121 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
2122 		"rfs_rename_start:");
2123 
2124 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2125 	if (fromvp == NULL) {
2126 		*status = NFSERR_STALE;
2127 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2128 			"rfs_rename_end:(%S)", "from stale");
2129 		return;
2130 	}
2131 
2132 	fh = args->rna_to.da_fhandle;
2133 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2134 	if (to_exi == NULL) {
2135 		VN_RELE(fromvp);
2136 		*status = NFSERR_ACCES;
2137 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2138 			"rfs_rename_end:(%S)", "cross device");
2139 		return;
2140 	}
2141 	exi_rele(to_exi);
2142 
2143 	if (to_exi != exi) {
2144 		VN_RELE(fromvp);
2145 		*status = NFSERR_XDEV;
2146 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2147 			"rfs_rename_end:(%S)", "from stale");
2148 		return;
2149 	}
2150 
2151 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2152 	if (tovp == NULL) {
2153 		VN_RELE(fromvp);
2154 		*status = NFSERR_STALE;
2155 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2156 			"rfs_rename_end:(%S)", "to stale");
2157 		return;
2158 	}
2159 
2160 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2161 		VN_RELE(tovp);
2162 		VN_RELE(fromvp);
2163 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2164 			"rfs_rename_end:(%S)", "not dir");
2165 		*status = NFSERR_NOTDIR;
2166 		return;
2167 	}
2168 
2169 	/*
2170 	 * Disallow NULL paths
2171 	 */
2172 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2173 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2174 		VN_RELE(tovp);
2175 		VN_RELE(fromvp);
2176 		*status = NFSERR_ACCES;
2177 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2178 			"rfs_rename_end:(%S)", "access");
2179 		return;
2180 	}
2181 
2182 	if (rdonly(exi, req)) {
2183 		VN_RELE(tovp);
2184 		VN_RELE(fromvp);
2185 		*status = NFSERR_ROFS;
2186 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2187 			"rfs_rename_end:(%S)", "rofs");
2188 		return;
2189 	}
2190 
2191 	/*
2192 	 * Check for a conflict with a non-blocking mandatory share reservation.
2193 	 */
2194 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2195 			NULL, cr);
2196 	if (error != 0) {
2197 		VN_RELE(tovp);
2198 		VN_RELE(fromvp);
2199 		*status = puterrno(error);
2200 		return;
2201 	}
2202 
2203 	/* Check for delegations on the source file */
2204 
2205 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2206 		VN_RELE(tovp);
2207 		VN_RELE(fromvp);
2208 		VN_RELE(srcvp);
2209 		curthread->t_flag |= T_WOULDBLOCK;
2210 		return;
2211 	}
2212 
2213 	/* Check for delegation on the file being renamed over, if it exists */
2214 
2215 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2216 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
2217 	    == 0) {
2218 
2219 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2220 			VN_RELE(tovp);
2221 			VN_RELE(fromvp);
2222 			VN_RELE(srcvp);
2223 			VN_RELE(targvp);
2224 			curthread->t_flag |= T_WOULDBLOCK;
2225 			return;
2226 		}
2227 		VN_RELE(targvp);
2228 	}
2229 
2230 
2231 	if (nbl_need_check(srcvp)) {
2232 		nbl_start_crit(srcvp, RW_READER);
2233 		in_crit = 1;
2234 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
2235 			error = EACCES;
2236 			goto out;
2237 		}
2238 	}
2239 
2240 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
2241 		"vop_rename_start:");
2242 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2243 	    tovp, args->rna_to.da_name, cr);
2244 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
2245 		"vop_rename_end:");
2246 
2247 	if (error == 0) {
2248 		char *tmp;
2249 
2250 		/* fix the path name for the renamed file */
2251 		mutex_enter(&srcvp->v_lock);
2252 		tmp = srcvp->v_path;
2253 		srcvp->v_path = NULL;
2254 		mutex_exit(&srcvp->v_lock);
2255 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2256 				strlen(args->rna_to.da_name));
2257 		if (tmp != NULL)
2258 			kmem_free(tmp, strlen(tmp) + 1);
2259 	}
2260 
2261 	/*
2262 	 * Force modified data and metadata out to stable storage.
2263 	 */
2264 	(void) VOP_FSYNC(tovp, 0, cr);
2265 	(void) VOP_FSYNC(fromvp, 0, cr);
2266 
2267 out:
2268 	if (in_crit)
2269 		nbl_end_crit(srcvp);
2270 	VN_RELE(srcvp);
2271 	VN_RELE(tovp);
2272 	VN_RELE(fromvp);
2273 
2274 	*status = puterrno(error);
2275 
2276 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2277 		"rfs_rename_end:(%S)", "done");
2278 }
2279 fhandle_t *
2280 rfs_rename_getfh(struct nfsrnmargs *args)
2281 {
2282 	return (args->rna_from.da_fhandle);
2283 }
2284 
2285 /*
2286  * Link to a file.
2287  * Create a file (to) which is a hard link to the given file (from).
2288  */
2289 void
2290 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2291 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2292 {
2293 	int error;
2294 	vnode_t *fromvp;
2295 	vnode_t *tovp;
2296 	struct exportinfo *to_exi;
2297 	fhandle_t *fh;
2298 
2299 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
2300 		"rfs_link_start:");
2301 
2302 	fromvp = nfs_fhtovp(args->la_from, exi);
2303 	if (fromvp == NULL) {
2304 		*status = NFSERR_STALE;
2305 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2306 			"rfs_link_end:(%S)", "from stale");
2307 		return;
2308 	}
2309 
2310 	fh = args->la_to.da_fhandle;
2311 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2312 	if (to_exi == NULL) {
2313 		VN_RELE(fromvp);
2314 		*status = NFSERR_ACCES;
2315 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2316 			"rfs_link_end:(%S)", "cross device");
2317 		return;
2318 	}
2319 	exi_rele(to_exi);
2320 
2321 	if (to_exi != exi) {
2322 		VN_RELE(fromvp);
2323 		*status = NFSERR_XDEV;
2324 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2325 			"rfs_link_end:(%S)", "cross device");
2326 		return;
2327 	}
2328 
2329 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2330 	if (tovp == NULL) {
2331 		VN_RELE(fromvp);
2332 		*status = NFSERR_STALE;
2333 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2334 			"rfs_link_end:(%S)", "to stale");
2335 		return;
2336 	}
2337 
2338 	if (tovp->v_type != VDIR) {
2339 		VN_RELE(tovp);
2340 		VN_RELE(fromvp);
2341 		*status = NFSERR_NOTDIR;
2342 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2343 			"rfs_link_end:(%S)", "not dir");
2344 		return;
2345 	}
2346 	/*
2347 	 * Disallow NULL paths
2348 	 */
2349 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2350 		VN_RELE(tovp);
2351 		VN_RELE(fromvp);
2352 		*status = NFSERR_ACCES;
2353 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2354 			"rfs_link_end:(%S)", "access");
2355 		return;
2356 	}
2357 
2358 	if (rdonly(exi, req)) {
2359 		VN_RELE(tovp);
2360 		VN_RELE(fromvp);
2361 		*status = NFSERR_ROFS;
2362 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2363 			"rfs_link_end:(%S)", "rofs");
2364 		return;
2365 	}
2366 
2367 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
2368 		"vop_link_start:");
2369 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2370 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
2371 		"vop_link_end:");
2372 
2373 	/*
2374 	 * Force modified data and metadata out to stable storage.
2375 	 */
2376 	(void) VOP_FSYNC(tovp, 0, cr);
2377 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2378 
2379 	VN_RELE(tovp);
2380 	VN_RELE(fromvp);
2381 
2382 	*status = puterrno(error);
2383 
2384 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2385 		"rfs_link_end:(%S)", "done");
2386 }
2387 fhandle_t *
2388 rfs_link_getfh(struct nfslinkargs *args)
2389 {
2390 	return (args->la_from);
2391 }
2392 
2393 /*
2394  * Symbolicly link to a file.
2395  * Create a file (to) with the given attributes which is a symbolic link
2396  * to the given path name (to).
2397  */
2398 void
2399 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2400 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2401 {
2402 	int error;
2403 	struct vattr va;
2404 	vnode_t *vp;
2405 	vnode_t *svp;
2406 	int lerror;
2407 
2408 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
2409 		"rfs_symlink_start:");
2410 
2411 	/*
2412 	 * Disallow NULL paths
2413 	 */
2414 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2415 		*status = NFSERR_ACCES;
2416 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2417 			"rfs_symlink_end:(%S)", "access");
2418 		return;
2419 	}
2420 
2421 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2422 	if (vp == NULL) {
2423 		*status = NFSERR_STALE;
2424 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2425 			"rfs_symlink_end:(%S)", "stale");
2426 		return;
2427 	}
2428 
2429 	if (rdonly(exi, req)) {
2430 		VN_RELE(vp);
2431 		*status = NFSERR_ROFS;
2432 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2433 			"rfs_symlink_end:(%S)", "rofs");
2434 		return;
2435 	}
2436 
2437 	error = sattr_to_vattr(args->sla_sa, &va);
2438 	if (error) {
2439 		VN_RELE(vp);
2440 		*status = puterrno(error);
2441 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2442 			"rfs_symlink_end:(%S)", "sattr");
2443 		return;
2444 	}
2445 
2446 	if (!(va.va_mask & AT_MODE)) {
2447 		VN_RELE(vp);
2448 		*status = NFSERR_INVAL;
2449 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2450 			"rfs_symlink_end:(%S)", "no mode");
2451 		return;
2452 	}
2453 
2454 	va.va_type = VLNK;
2455 	va.va_mask |= AT_TYPE;
2456 
2457 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
2458 		"vop_symlink_start:");
2459 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
2460 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
2461 		"vop_symlink_end:");
2462 
2463 	/*
2464 	 * Force new data and metadata out to stable storage.
2465 	 */
2466 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
2467 		"vop_lookup_start:");
2468 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2469 	    0, NULL, cr);
2470 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
2471 		"vop_lookup_end:");
2472 	if (!lerror) {
2473 		(void) VOP_FSYNC(svp, 0, cr);
2474 		VN_RELE(svp);
2475 	}
2476 
2477 	/*
2478 	 * Force modified data and metadata out to stable storage.
2479 	 */
2480 	(void) VOP_FSYNC(vp, 0, cr);
2481 
2482 	VN_RELE(vp);
2483 
2484 	*status = puterrno(error);
2485 
2486 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2487 		"rfs_symlink_end:(%S)", "done");
2488 }
2489 fhandle_t *
2490 rfs_symlink_getfh(struct nfsslargs *args)
2491 {
2492 	return (args->sla_from.da_fhandle);
2493 }
2494 
2495 /*
2496  * Make a directory.
2497  * Create a directory with the given name, parent directory, and attributes.
2498  * Returns a file handle and attributes for the new directory.
2499  */
2500 void
2501 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2502 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2503 {
2504 	int error;
2505 	struct vattr va;
2506 	vnode_t *dvp = NULL;
2507 	vnode_t *vp;
2508 	char *name = args->ca_da.da_name;
2509 
2510 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
2511 		"rfs_mkdir_start:");
2512 
2513 	/*
2514 	 * Disallow NULL paths
2515 	 */
2516 	if (name == NULL || *name == '\0') {
2517 		dr->dr_status = NFSERR_ACCES;
2518 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2519 			"rfs_mkdir_end:(%S)", "access");
2520 		return;
2521 	}
2522 
2523 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2524 	if (vp == NULL) {
2525 		dr->dr_status = NFSERR_STALE;
2526 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2527 			"rfs_mkdir_end:(%S)", "stale");
2528 		return;
2529 	}
2530 
2531 	if (rdonly(exi, req)) {
2532 		VN_RELE(vp);
2533 		dr->dr_status = NFSERR_ROFS;
2534 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2535 			"rfs_mkdir_end:(%S)", "rofs");
2536 		return;
2537 	}
2538 
2539 	error = sattr_to_vattr(args->ca_sa, &va);
2540 	if (error) {
2541 		VN_RELE(vp);
2542 		dr->dr_status = puterrno(error);
2543 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2544 			"rfs_mkdir_end:(%S)", "sattr");
2545 		return;
2546 	}
2547 
2548 	if (!(va.va_mask & AT_MODE)) {
2549 		VN_RELE(vp);
2550 		dr->dr_status = NFSERR_INVAL;
2551 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2552 			"rfs_mkdir_end:(%S)", "no mode");
2553 		return;
2554 	}
2555 
2556 	va.va_type = VDIR;
2557 	va.va_mask |= AT_TYPE;
2558 
2559 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
2560 		"vop_mkdir_start:");
2561 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2562 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
2563 		"vop_mkdir_end:");
2564 
2565 	if (!error) {
2566 		/*
2567 		 * Attribtutes of the newly created directory should
2568 		 * be returned to the client.
2569 		 */
2570 		va.va_mask = AT_ALL; /* We want everything */
2571 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
2572 			"vop_getattr_start:");
2573 		error = VOP_GETATTR(dvp, &va, 0, cr);
2574 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
2575 			"vop_getattr_end:");
2576 		/* check for overflows */
2577 		if (!error) {
2578 			acl_perm(vp, exi, &va, cr);
2579 			error = vattr_to_nattr(&va, &dr->dr_attr);
2580 			if (!error) {
2581 				error = makefh(&dr->dr_fhandle, dvp, exi);
2582 			}
2583 		}
2584 		/*
2585 		 * Force new data and metadata out to stable storage.
2586 		 */
2587 		(void) VOP_FSYNC(dvp, 0, cr);
2588 		VN_RELE(dvp);
2589 	}
2590 
2591 	/*
2592 	 * Force modified data and metadata out to stable storage.
2593 	 */
2594 	(void) VOP_FSYNC(vp, 0, cr);
2595 
2596 	VN_RELE(vp);
2597 
2598 	dr->dr_status = puterrno(error);
2599 
2600 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2601 		"rfs_mkdir_end:(%S)", "done");
2602 }
2603 fhandle_t *
2604 rfs_mkdir_getfh(struct nfscreatargs *args)
2605 {
2606 	return (args->ca_da.da_fhandle);
2607 }
2608 
2609 /*
2610  * Remove a directory.
2611  * Remove the given directory name from the given parent directory.
2612  */
2613 void
2614 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2615 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2616 {
2617 	int error;
2618 	vnode_t *vp;
2619 
2620 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
2621 		"rfs_rmdir_start:");
2622 
2623 	/*
2624 	 * Disallow NULL paths
2625 	 */
2626 	if (da->da_name == NULL || *da->da_name == '\0') {
2627 		*status = NFSERR_ACCES;
2628 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2629 			"rfs_rmdir_end:(%S)", "access");
2630 		return;
2631 	}
2632 
2633 	vp = nfs_fhtovp(da->da_fhandle, exi);
2634 	if (vp == NULL) {
2635 		*status = NFSERR_STALE;
2636 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2637 			"rfs_rmdir_end:(%S)", "stale");
2638 		return;
2639 	}
2640 
2641 	if (rdonly(exi, req)) {
2642 		VN_RELE(vp);
2643 		*status = NFSERR_ROFS;
2644 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2645 			"rfs_rmdir_end:(%S)", "rofs");
2646 		return;
2647 	}
2648 
2649 	/*
2650 	 * VOP_RMDIR now takes a new third argument (the current
2651 	 * directory of the process).  That's because someone
2652 	 * wants to return EINVAL if one tries to remove ".".
2653 	 * Of course, NFS servers have no idea what their
2654 	 * clients' current directories are.  We fake it by
2655 	 * supplying a vnode known to exist and illegal to
2656 	 * remove.
2657 	 */
2658 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
2659 		"vop_rmdir_start:");
2660 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
2661 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
2662 		"vop_rmdir_end:");
2663 
2664 	/*
2665 	 * Force modified data and metadata out to stable storage.
2666 	 */
2667 	(void) VOP_FSYNC(vp, 0, cr);
2668 
2669 	VN_RELE(vp);
2670 
2671 	/*
2672 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2673 	 * if the directory is not empty.  A System V NFS server
2674 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2675 	 * over the wire.
2676 	 */
2677 	if (error == EEXIST)
2678 		*status = NFSERR_NOTEMPTY;
2679 	else
2680 		*status = puterrno(error);
2681 
2682 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2683 		"rfs_rmdir_end:(%S)", "done");
2684 }
2685 fhandle_t *
2686 rfs_rmdir_getfh(struct nfsdiropargs *da)
2687 {
2688 	return (da->da_fhandle);
2689 }
2690 
2691 /* ARGSUSED */
2692 void
2693 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2694 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2695 {
2696 	int error;
2697 	int iseof;
2698 	struct iovec iov;
2699 	struct uio uio;
2700 	vnode_t *vp;
2701 
2702 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
2703 		"rfs_readdir_start:");
2704 
2705 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2706 	if (vp == NULL) {
2707 		rd->rd_entries = NULL;
2708 		rd->rd_status = NFSERR_STALE;
2709 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2710 			"rfs_readdir_end:(%S)", "stale");
2711 		return;
2712 	}
2713 
2714 	if (vp->v_type != VDIR) {
2715 		VN_RELE(vp);
2716 		rd->rd_entries = NULL;
2717 		rd->rd_status = NFSERR_NOTDIR;
2718 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2719 			"rfs_readdir_end:(%S)", "notdir");
2720 		return;
2721 	}
2722 
2723 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
2724 		"vop_rwlock_start:");
2725 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2726 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
2727 		"vop_rwlock_end:");
2728 
2729 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
2730 		"vop_access_start:");
2731 	error = VOP_ACCESS(vp, VREAD, 0, cr);
2732 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
2733 		"vop_access_end:");
2734 	if (error) {
2735 		rd->rd_entries = NULL;
2736 		goto bad;
2737 	}
2738 
2739 	if (rda->rda_count == 0) {
2740 		rd->rd_entries = NULL;
2741 		rd->rd_size = 0;
2742 		rd->rd_eof = FALSE;
2743 		goto bad;
2744 	}
2745 
2746 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2747 
2748 	/*
2749 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2750 	 */
2751 	rd->rd_bufsize = (uint_t)rda->rda_count;
2752 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2753 
2754 	/*
2755 	 * Set up io vector to read directory data
2756 	 */
2757 	iov.iov_base = (caddr_t)rd->rd_entries;
2758 	iov.iov_len = rda->rda_count;
2759 	uio.uio_iov = &iov;
2760 	uio.uio_iovcnt = 1;
2761 	uio.uio_segflg = UIO_SYSSPACE;
2762 	uio.uio_extflg = UIO_COPY_CACHED;
2763 	uio.uio_loffset = (offset_t)rda->rda_offset;
2764 	uio.uio_resid = rda->rda_count;
2765 
2766 	/*
2767 	 * read directory
2768 	 */
2769 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
2770 		"vop_readdir_start:");
2771 	error = VOP_READDIR(vp, &uio, cr, &iseof);
2772 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
2773 		"vop_readdir_end:");
2774 
2775 	/*
2776 	 * Clean up
2777 	 */
2778 	if (!error) {
2779 		/*
2780 		 * set size and eof
2781 		 */
2782 		if (uio.uio_resid == rda->rda_count) {
2783 			rd->rd_size = 0;
2784 			rd->rd_eof = TRUE;
2785 		} else {
2786 			rd->rd_size = (uint32_t)(rda->rda_count -
2787 			    uio.uio_resid);
2788 			rd->rd_eof = iseof ? TRUE : FALSE;
2789 		}
2790 	}
2791 
2792 bad:
2793 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
2794 		"vop_rwunlock_start:");
2795 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2796 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
2797 		"vop_rwunlock_end:");
2798 
2799 #if 0 /* notyet */
2800 	/*
2801 	 * Don't do this.  It causes local disk writes when just
2802 	 * reading the file and the overhead is deemed larger
2803 	 * than the benefit.
2804 	 */
2805 	/*
2806 	 * Force modified metadata out to stable storage.
2807 	 */
2808 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
2809 #endif
2810 
2811 	VN_RELE(vp);
2812 
2813 	rd->rd_status = puterrno(error);
2814 
2815 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2816 		"rfs_readdir_end:(%S)", "done");
2817 }
2818 fhandle_t *
2819 rfs_readdir_getfh(struct nfsrddirargs *rda)
2820 {
2821 	return (&rda->rda_fh);
2822 }
2823 void
2824 rfs_rddirfree(struct nfsrddirres *rd)
2825 {
2826 	if (rd->rd_entries != NULL)
2827 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2828 }
2829 
2830 /* ARGSUSED */
2831 void
2832 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2833 	struct svc_req *req, cred_t *cr)
2834 {
2835 	int error;
2836 	struct statvfs64 sb;
2837 	vnode_t *vp;
2838 
2839 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
2840 		"rfs_statfs_start:");
2841 
2842 	vp = nfs_fhtovp(fh, exi);
2843 	if (vp == NULL) {
2844 		fs->fs_status = NFSERR_STALE;
2845 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2846 			"rfs_statfs_end:(%S)", "stale");
2847 		return;
2848 	}
2849 
2850 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2851 
2852 	if (!error) {
2853 		fs->fs_tsize = nfstsize();
2854 		fs->fs_bsize = sb.f_frsize;
2855 		fs->fs_blocks = sb.f_blocks;
2856 		fs->fs_bfree = sb.f_bfree;
2857 		fs->fs_bavail = sb.f_bavail;
2858 	}
2859 
2860 	VN_RELE(vp);
2861 
2862 	fs->fs_status = puterrno(error);
2863 
2864 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2865 		"rfs_statfs_end:(%S)", "done");
2866 }
2867 fhandle_t *
2868 rfs_statfs_getfh(fhandle_t *fh)
2869 {
2870 	return (fh);
2871 }
2872 
2873 static int
2874 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2875 {
2876 	vap->va_mask = 0;
2877 
2878 	/*
2879 	 * There was a sign extension bug in some VFS based systems
2880 	 * which stored the mode as a short.  When it would get
2881 	 * assigned to a u_long, no sign extension would occur.
2882 	 * It needed to, but this wasn't noticed because sa_mode
2883 	 * would then get assigned back to the short, thus ignoring
2884 	 * the upper 16 bits of sa_mode.
2885 	 *
2886 	 * To make this implementation work for both broken
2887 	 * clients and good clients, we check for both versions
2888 	 * of the mode.
2889 	 */
2890 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2891 	    sa->sa_mode != (uint32_t)-1) {
2892 		vap->va_mask |= AT_MODE;
2893 		vap->va_mode = sa->sa_mode;
2894 	}
2895 	if (sa->sa_uid != (uint32_t)-1) {
2896 		vap->va_mask |= AT_UID;
2897 		vap->va_uid = sa->sa_uid;
2898 	}
2899 	if (sa->sa_gid != (uint32_t)-1) {
2900 		vap->va_mask |= AT_GID;
2901 		vap->va_gid = sa->sa_gid;
2902 	}
2903 	if (sa->sa_size != (uint32_t)-1) {
2904 		vap->va_mask |= AT_SIZE;
2905 		vap->va_size = sa->sa_size;
2906 	}
2907 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2908 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2909 #ifndef _LP64
2910 		/* return error if time overflow */
2911 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2912 			return (EOVERFLOW);
2913 #endif
2914 		vap->va_mask |= AT_ATIME;
2915 		/*
2916 		 * nfs protocol defines times as unsigned so don't extend sign,
2917 		 * unless sysadmin set nfs_allow_preepoch_time.
2918 		 */
2919 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2920 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2921 	}
2922 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2923 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2924 #ifndef _LP64
2925 		/* return error if time overflow */
2926 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2927 			return (EOVERFLOW);
2928 #endif
2929 		vap->va_mask |= AT_MTIME;
2930 		/*
2931 		 * nfs protocol defines times as unsigned so don't extend sign,
2932 		 * unless sysadmin set nfs_allow_preepoch_time.
2933 		 */
2934 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2935 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2936 	}
2937 	return (0);
2938 }
2939 
2940 static enum nfsftype vt_to_nf[] = {
2941 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2942 };
2943 
2944 /*
2945  * check the following fields for overflow: nodeid, size, and time.
2946  * There could be a problem when converting 64-bit LP64 fields
2947  * into 32-bit ones.  Return an error if there is an overflow.
2948  */
2949 int
2950 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2951 {
2952 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2953 	na->na_type = vt_to_nf[vap->va_type];
2954 
2955 	if (vap->va_mode == (unsigned short) -1)
2956 		na->na_mode = (uint32_t)-1;
2957 	else
2958 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2959 
2960 	if (vap->va_uid == (unsigned short)(-1))
2961 		na->na_uid = (uint32_t)(-1);
2962 	else if (vap->va_uid == UID_NOBODY)
2963 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2964 	else
2965 		na->na_uid = vap->va_uid;
2966 
2967 	if (vap->va_gid == (unsigned short)(-1))
2968 		na->na_gid = (uint32_t)-1;
2969 	else if (vap->va_gid == GID_NOBODY)
2970 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2971 	else
2972 		na->na_gid = vap->va_gid;
2973 
2974 	/*
2975 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2976 	 * vattr, but are bigger than 32 bit values supported?
2977 	 */
2978 	na->na_fsid = vap->va_fsid;
2979 
2980 	na->na_nodeid = vap->va_nodeid;
2981 
2982 	/*
2983 	 * Check to make sure that the nodeid is representable over the
2984 	 * wire without losing bits.
2985 	 */
2986 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2987 		return (EFBIG);
2988 	na->na_nlink = vap->va_nlink;
2989 
2990 	/*
2991 	 * Check for big files here, instead of at the caller.  See
2992 	 * comments in cstat for large special file explanation.
2993 	 */
2994 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2995 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2996 			return (EFBIG);
2997 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2998 			/* UNKNOWN_SIZE | OVERFLOW */
2999 			na->na_size = MAXOFF32_T;
3000 		} else
3001 			na->na_size = vap->va_size;
3002 	} else
3003 		na->na_size = vap->va_size;
3004 
3005 	/*
3006 	 * If the vnode times overflow the 32-bit times that NFS2
3007 	 * uses on the wire then return an error.
3008 	 */
3009 	if (!NFS_VAP_TIME_OK(vap)) {
3010 		return (EOVERFLOW);
3011 	}
3012 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
3013 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3014 
3015 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3016 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3017 
3018 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3019 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3020 
3021 	/*
3022 	 * If the dev_t will fit into 16 bits then compress
3023 	 * it, otherwise leave it alone. See comments in
3024 	 * nfs_client.c.
3025 	 */
3026 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3027 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3028 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3029 	else
3030 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
3031 
3032 	na->na_blocks = vap->va_nblocks;
3033 	na->na_blocksize = vap->va_blksize;
3034 
3035 	/*
3036 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3037 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
3038 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3039 	 *
3040 	 * BUYER BEWARE:
3041 	 *  If you are porting the NFS to a non-Sun server, you probably
3042 	 *  don't want to include the following block of code.  The
3043 	 *  over-the-wire special file types will be changing with the
3044 	 *  NFS Protocol Revision.
3045 	 */
3046 	if (vap->va_type == VFIFO)
3047 		NA_SETFIFO(na);
3048 	return (0);
3049 }
3050 
3051 /*
3052  * acl v2 support: returns approximate permission.
3053  *	default: returns minimal permission (more restrictive)
3054  *	aclok: returns maximal permission (less restrictive)
3055  *	This routine changes the permissions that are alaredy in *va.
3056  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3057  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3058  */
3059 static void
3060 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3061 {
3062 	vsecattr_t	vsa;
3063 	int		aclcnt;
3064 	aclent_t	*aclentp;
3065 	mode_t		mask_perm;
3066 	mode_t		grp_perm;
3067 	mode_t		other_perm;
3068 	mode_t		other_orig;
3069 	int		error;
3070 
3071 	/* dont care default acl */
3072 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3073 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
3074 
3075 	if (!error) {
3076 		aclcnt = vsa.vsa_aclcnt;
3077 		if (aclcnt > MIN_ACL_ENTRIES) {
3078 			/* non-trivial ACL */
3079 			aclentp = vsa.vsa_aclentp;
3080 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3081 				/* maximal permissions */
3082 				grp_perm = 0;
3083 				other_perm = 0;
3084 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3085 					switch (aclentp->a_type) {
3086 					case USER_OBJ:
3087 						break;
3088 					case USER:
3089 						grp_perm |=
3090 						    aclentp->a_perm << 3;
3091 						other_perm |= aclentp->a_perm;
3092 						break;
3093 					case GROUP_OBJ:
3094 						grp_perm |=
3095 						    aclentp->a_perm << 3;
3096 						break;
3097 					case GROUP:
3098 						other_perm |= aclentp->a_perm;
3099 						break;
3100 					case OTHER_OBJ:
3101 						other_orig = aclentp->a_perm;
3102 						break;
3103 					case CLASS_OBJ:
3104 						mask_perm = aclentp->a_perm;
3105 						break;
3106 					default:
3107 						break;
3108 					}
3109 				}
3110 				grp_perm &= mask_perm << 3;
3111 				other_perm &= mask_perm;
3112 				other_perm |= other_orig;
3113 
3114 			} else {
3115 				/* minimal permissions */
3116 				grp_perm = 070;
3117 				other_perm = 07;
3118 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3119 					switch (aclentp->a_type) {
3120 					case USER_OBJ:
3121 						break;
3122 					case USER:
3123 					case CLASS_OBJ:
3124 						grp_perm &=
3125 						    aclentp->a_perm << 3;
3126 						other_perm &=
3127 						    aclentp->a_perm;
3128 						break;
3129 					case GROUP_OBJ:
3130 						grp_perm &=
3131 						    aclentp->a_perm << 3;
3132 						break;
3133 					case GROUP:
3134 						other_perm &=
3135 						    aclentp->a_perm;
3136 						break;
3137 					case OTHER_OBJ:
3138 						other_perm &=
3139 						    aclentp->a_perm;
3140 						break;
3141 					default:
3142 						break;
3143 					}
3144 				}
3145 			}
3146 			/* copy to va */
3147 			va->va_mode &= ~077;
3148 			va->va_mode |= grp_perm | other_perm;
3149 		}
3150 		if (vsa.vsa_aclcnt)
3151 			kmem_free(vsa.vsa_aclentp,
3152 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3153 	}
3154 }
3155 
3156 void
3157 rfs_srvrinit(void)
3158 {
3159 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3160 }
3161 
3162 void
3163 rfs_srvrfini(void)
3164 {
3165 	mutex_destroy(&rfs_async_write_lock);
3166 }
3167