xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 8eea8e29cc4374d1ee24c25a07f45af132db3499)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All rights reserved.
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/uio.h>
42 #include <sys/stat.h>
43 #include <sys/errno.h>
44 #include <sys/sysmacros.h>
45 #include <sys/statvfs.h>
46 #include <sys/kmem.h>
47 #include <sys/kstat.h>
48 #include <sys/dirent.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/vtrace.h>
52 #include <sys/mode.h>
53 #include <sys/acl.h>
54 #include <sys/nbmlock.h>
55 #include <sys/policy.h>
56 
57 #include <rpc/types.h>
58 #include <rpc/auth.h>
59 #include <rpc/svc.h>
60 
61 #include <nfs/nfs.h>
62 #include <nfs/export.h>
63 
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_kmem.h>
69 
70 #include <sys/strsubr.h>
71 
72 /*
73  * These are the interface routines for the server side of the
74  * Network File System.  See the NFS version 2 protocol specification
75  * for a description of this interface.
76  */
77 
78 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
79 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
80 			cred_t *);
81 
82 /*
83  * Some "over the wire" UNIX file types.  These are encoded
84  * into the mode.  This needs to be fixed in the next rev.
85  */
86 #define	IFMT		0170000		/* type of file */
87 #define	IFCHR		0020000		/* character special */
88 #define	IFBLK		0060000		/* block special */
89 #define	IFSOCK		0140000		/* socket */
90 
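#if 0	/* illustrative sketch only -- not compiled */
/*
 * A rough sketch of how rfs_create() below maps these over-the-wire
 * type bits back to a vnode type; the helper name is hypothetical.
 */
static vtype_t
otw_mode_to_vtype(uint_t otw_mode)
{
	switch (otw_mode & IFMT) {
	case IFCHR:	return (VCHR);	/* or VFIFO, see rfs_create() */
	case IFBLK:	return (VBLK);
	case IFSOCK:	return (VSOCK);
	default:	return (VREG);
	}
}
#endif
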
91 /*
92  * Get file attributes.
93  * Returns the current attributes of the file with the given fhandle.
94  */
95 /* ARGSUSED */
96 void
97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
98 	struct svc_req *req, cred_t *cr)
99 {
100 	int error;
101 	vnode_t *vp;
102 	struct vattr va;
103 
104 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
105 		"rfs_getattr_start:");
106 
107 	vp = nfs_fhtovp(fhp, exi);
108 	if (vp == NULL) {
109 		ns->ns_status = NFSERR_STALE;
110 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
111 			"rfs_getattr_end:(%S)", "stale");
112 		return;
113 	}
114 
115 	/*
116 	 * Do the getattr.
117 	 */
118 	va.va_mask = AT_ALL;	/* we want all the attributes */
119 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
120 		"vop_getattr_start:");
121 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
122 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
123 		"vop_getattr_end:");
124 
125 	/* check for overflows */
126 	if (!error) {
127 		acl_perm(vp, exi, &va, cr);
128 		error = vattr_to_nattr(&va, &ns->ns_attr);
129 	}
130 
131 	VN_RELE(vp);
132 
133 	ns->ns_status = puterrno(error);
134 
135 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
136 		"rfs_getattr_end:(%S)", "done");
137 }
138 fhandle_t *
139 rfs_getattr_getfh(fhandle_t *fhp)
140 {
141 	return (fhp);
142 }
143 
144 /*
145  * Set file attributes.
146  * Sets the attributes of the file with the given fhandle.  Returns
147  * the new attributes.
148  */
149 void
150 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
151 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
152 {
153 	int error;
154 	int flag;
155 	int in_crit = 0;
156 	vnode_t *vp;
157 	struct vattr va;
158 	struct vattr bva;
159 	struct flock64 bf;
160 
161 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
162 		"rfs_setattr_start:");
163 
164 	vp = nfs_fhtovp(&args->saa_fh, exi);
165 	if (vp == NULL) {
166 		ns->ns_status = NFSERR_STALE;
167 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
168 			"rfs_setattr_end:(%S)", "stale");
169 		return;
170 	}
171 
172 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
173 		VN_RELE(vp);
174 		ns->ns_status = NFSERR_ROFS;
175 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
176 			"rfs_setattr_end:(%S)", "rofs");
177 		return;
178 	}
179 
180 	error = sattr_to_vattr(&args->saa_sa, &va);
181 	if (error) {
182 		VN_RELE(vp);
183 		ns->ns_status = puterrno(error);
184 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
185 			"rfs_setattr_end:(%S)", "sattr");
186 		return;
187 	}
188 
189 	/*
190 	 * If the client is requesting a change to the mtime,
191 	 * but the nanosecond field is set to 1 billion, then
192 	 * this is a flag to the server that it should set the
193 	 * atime and mtime fields to the server's current time.
194 	 * The 1 billion number actually came from the client
195 	 * as 1 million, but the units in the over the wire
196 	 * request are microseconds instead of nanoseconds.
197 	 *
198 	 * This is an overload of the protocol and should be
199 	 * documented in the NFS Version 2 protocol specification.
200 	 */
201 	if (va.va_mask & AT_MTIME) {
202 		if (va.va_mtime.tv_nsec == 1000000000) {
203 			gethrestime(&va.va_mtime);
204 			va.va_atime = va.va_mtime;
205 			va.va_mask |= AT_ATIME;
206 			flag = 0;
207 		} else
208 			flag = ATTR_UTIME;
209 	} else
210 		flag = 0;
211 
212 	/*
213 	 * If the filesystem is exported with nosuid, then mask off
214 	 * the setuid and setgid bits.
215 	 */
216 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
217 	    (exi->exi_export.ex_flags & EX_NOSUID))
218 		va.va_mode &= ~(VSUID | VSGID);
219 
220 	/*
221 	 * We need to specially handle size changes because it is
222 	 * possible for the client to create a file with modes
223 	 * which indicate read-only, but with the file opened for
224 	 * writing.  If the client then tries to set the size of
225 	 * the file, then the normal access checking done in
226 	 * VOP_SETATTR would prevent the client from doing so,
227 	 * although it should be legal for it to do so.  To get
228 	 * around this, we do the access checking for ourselves
229 	 * and then use VOP_SPACE which doesn't do the access
230 	 * checking which VOP_SETATTR does. VOP_SPACE can only
231 	 * operate on VREG files, let VOP_SETATTR handle the other
232 	 * extremely rare cases.
233 	 * Also the client should not be allowed to change the
234 	 * size of the file if there is a conflicting non-blocking
235 	 * mandatory lock in the region of change.
236 	 *
237 	 * Also(2), check to see if the v4 side of the server has
238 	 * delegated this file.  If so, then we set T_WOULDBLOCK
239 	 * so that the dispatch function dosn't send a reply, forcing
240 	 * the client to retrasmit its request.
241 	 */
242 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
243 		/* If delegated, mark as wouldblock so response is dropped */
244 		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
245 			VN_RELE(vp);
246 			curthread->t_flag |= T_WOULDBLOCK;
247 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
248 				"rfs_setattr_end:(%S)", "delegated");
249 			return;
250 		}
251 		if (nbl_need_check(vp)) {
252 			nbl_start_crit(vp, RW_READER);
253 			in_crit = 1;
254 		}
255 
256 		bva.va_mask = AT_UID | AT_SIZE;
257 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
258 			"vop_getattr_start:");
259 		error = VOP_GETATTR(vp, &bva, 0, cr);
260 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
261 			"vop_getattr_end:");
262 		if (error) {
263 			if (in_crit)
264 				nbl_end_crit(vp);
265 			VN_RELE(vp);
266 			ns->ns_status = puterrno(error);
267 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
268 				"rfs_setattr_end:(%S)", "getattr");
269 			return;
270 		}
271 
272 		if (in_crit) {
273 			u_offset_t offset;
274 			ssize_t length;
275 
276 			if (va.va_size < bva.va_size) {
277 				offset = va.va_size;
278 				length = bva.va_size - va.va_size;
279 			} else {
280 				offset = bva.va_size;
281 				length = va.va_size - bva.va_size;
282 			}
283 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
284 				error = EACCES;
285 			}
286 		}
287 
288 		if (crgetuid(cr) == bva.va_uid && !error &&
289 			va.va_size != bva.va_size) {
290 			va.va_mask &= ~AT_SIZE;
291 			bf.l_type = F_WRLCK;
292 			bf.l_whence = 0;
293 			bf.l_start = (off64_t)va.va_size;
294 			bf.l_len = 0;
295 			bf.l_sysid = 0;
296 			bf.l_pid = 0;
297 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
298 				"vop_space_start:");
299 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
300 					(offset_t)va.va_size, cr, NULL);
301 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
302 				"vop_space_end:");
303 		}
304 		if (in_crit)
305 			nbl_end_crit(vp);
306 	} else
307 		error = 0;
308 
309 	/*
310 	 * Do the setattr.
311 	 */
312 	if (!error && va.va_mask) {
313 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
314 			"vop_setattr_start:");
315 		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
316 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
317 			"vop_setattr_end:");
318 	}
319 
320 	if (!error) {
321 		va.va_mask = AT_ALL;	/* get everything */
322 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
323 			"vop_getattr_start:");
324 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
325 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
326 			"vop_getattr_end:");
327 
328 		/* check for overflows */
329 		if (!error) {
330 			acl_perm(vp, exi, &va, cr);
331 			error = vattr_to_nattr(&va, &ns->ns_attr);
332 		}
333 	}
334 
335 	/*
336 	 * Force modified metadata out to stable storage.
337 	 */
338 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
339 
340 	VN_RELE(vp);
341 
342 	ns->ns_status = puterrno(error);
343 
344 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
345 		"rfs_setattr_end:(%S)", "done");
346 }
347 fhandle_t *
348 rfs_setattr_getfh(struct nfssaargs *args)
349 {
350 	return (&args->saa_fh);
351 }
352 
353 /*
354  * Directory lookup.
355  * Returns an fhandle and file attributes for file name in a directory.
356  */
357 /* ARGSUSED */
358 void
359 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
360 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
361 {
362 	int error;
363 	vnode_t *dvp;
364 	vnode_t *vp;
365 	struct vattr va;
366 	fhandle_t *fhp = da->da_fhandle;
367 	struct sec_ol sec = {0, 0};
368 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
369 
370 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
371 		"rfs_lookup_start:");
372 
373 	/*
374 	 * Disallow NULL paths
375 	 */
376 	if (da->da_name == NULL || *da->da_name == '\0') {
377 		dr->dr_status = NFSERR_ACCES;
378 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
379 			"rfs_lookup_end:(%S)", "access");
380 		return;
381 	}
382 
383 	/*
384 	 * Allow lookups from the root - the default
385 	 * location of the public filehandle.
386 	 */
387 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
388 		dvp = rootdir;
389 		VN_HOLD(dvp);
390 	} else {
391 		dvp = nfs_fhtovp(fhp, exi);
392 		if (dvp == NULL) {
393 			dr->dr_status = NFSERR_STALE;
394 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
395 				"rfs_lookup_end:(%S)", "stale");
396 			return;
397 		}
398 	}
399 
400 	/*
401 	 * Not allow lookup beyond root.
402 	 * If the filehandle matches a filehandle of the exi,
403 	 * then the ".." refers beyond the root of an exported filesystem.
404 	 */
405 	if (strcmp(da->da_name, "..") == 0 &&
406 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
407 		VN_RELE(dvp);
408 		dr->dr_status = NFSERR_NOENT;
409 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
410 			"rfs_lookup_end:(%S)", "noent");
411 		return;
412 	}
413 
414 	/*
415 	 * If the public filehandle is used then allow
416 	 * a multi-component lookup, i.e. evaluate
417 	 * a pathname and follow symbolic links if
418 	 * necessary.
419 	 *
420 	 * This may result in a vnode in another filesystem
421 	 * which is OK as long as the filesystem is exported.
422 	 */
423 	if (PUBLIC_FH2(fhp)) {
424 		publicfh_flag = TRUE;
425 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
426 					&sec);
427 	} else {
428 		/*
429 		 * Do a normal single component lookup.
430 		 */
431 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
432 			"vop_lookup_start:");
433 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
434 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
435 			"vop_lookup_end:");
436 	}
437 
438 	if (!error) {
439 		VN_SETPATH(rootdir, dvp, vp, da->da_name,
440 			strlen(da->da_name));
441 		va.va_mask = AT_ALL;	/* we want everything */
442 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
443 			"vop_getattr_start:");
444 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
445 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
446 			"vop_getattr_end:");
447 		/* check for overflows */
448 		if (!error) {
449 			acl_perm(vp, exi, &va, cr);
450 			error = vattr_to_nattr(&va, &dr->dr_attr);
451 			if (!error) {
452 				if (sec.sec_flags & SEC_QUERY)
453 					error = makefh_ol(&dr->dr_fhandle, exi,
454 							sec.sec_index);
455 				else {
456 					error = makefh(&dr->dr_fhandle, vp,
457 								exi);
458 					if (!error && publicfh_flag &&
459 						!chk_clnt_sec(exi, req))
460 						auth_weak = TRUE;
461 				}
462 			}
463 		}
464 		VN_RELE(vp);
465 	}
466 
467 	VN_RELE(dvp);
468 
469 	/*
470 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
471 	 * and have obtained a new exportinfo in exi which needs to be
472 	 * released. Note the the original exportinfo pointed to by exi
473 	 * will be released by the caller, comon_dispatch.
474 	 */
475 	if (publicfh_flag && exi != NULL)
476 		exi_rele(exi);
477 
478 	/*
479 	 * If it's public fh, no 0x81, and client's flavor is
480 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
481 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
482 	 */
483 	if (auth_weak)
484 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
485 	else
486 		dr->dr_status = puterrno(error);
487 
488 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
489 		"rfs_lookup_end:(%S)", "done");
490 }
491 fhandle_t *
492 rfs_lookup_getfh(struct nfsdiropargs *da)
493 {
494 	return (da->da_fhandle);
495 }
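
/*
 * WebNFS usage sketch: with the public filehandle a client may send an
 * entire (possibly multi-component) path in one LOOKUP, for example
 *
 *	LOOKUP(public_fh, "export/home/data/file.txt")
 *
 * which rfs_publicfh_mclookup() evaluates component by component,
 * following symbolic links and possibly crossing into another exported
 * filesystem.  The path shown is illustrative only.
 */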
496 
497 /*
498  * Read symbolic link.
499  * Returns the string in the symbolic link at the given fhandle.
500  */
501 /* ARGSUSED */
502 void
503 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
504 	struct svc_req *req, cred_t *cr)
505 {
506 	int error;
507 	struct iovec iov;
508 	struct uio uio;
509 	vnode_t *vp;
510 	struct vattr va;
511 
512 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
513 		"rfs_readlink_start:");
514 
515 	vp = nfs_fhtovp(fhp, exi);
516 	if (vp == NULL) {
517 		rl->rl_data = NULL;
518 		rl->rl_status = NFSERR_STALE;
519 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
520 			"rfs_readlink_end:(%S)", "stale");
521 		return;
522 	}
523 
524 	va.va_mask = AT_MODE;
525 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
526 		"vop_getattr_start:");
527 	error = VOP_GETATTR(vp, &va, 0, cr);
528 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
529 		"vop_getattr_end:");
530 
531 	if (error) {
532 		VN_RELE(vp);
533 		rl->rl_data = NULL;
534 		rl->rl_status = puterrno(error);
535 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
536 			"rfs_readlink_end:(%S)", "getattr error");
537 		return;
538 	}
539 
540 	if (MANDLOCK(vp, va.va_mode)) {
541 		VN_RELE(vp);
542 		rl->rl_data = NULL;
543 		rl->rl_status = NFSERR_ACCES;
544 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
545 			"rfs_readlink_end:(%S)", "access");
546 		return;
547 	}
548 
549 	/*
550 	 * XNFS and RFC1094 require us to return ENXIO if argument
551 	 * is not a link. BUGID 1138002.
552 	 */
553 	if (vp->v_type != VLNK) {
554 		VN_RELE(vp);
555 		rl->rl_data = NULL;
556 		rl->rl_status = NFSERR_NXIO;
557 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
558 			"rfs_readlink_end:(%S)", "nxio");
559 		return;
560 	}
561 
562 	/*
563 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
564 	 */
565 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
566 
567 	/*
568 	 * Set up io vector to read sym link data
569 	 */
570 	iov.iov_base = rl->rl_data;
571 	iov.iov_len = NFS_MAXPATHLEN;
572 	uio.uio_iov = &iov;
573 	uio.uio_iovcnt = 1;
574 	uio.uio_segflg = UIO_SYSSPACE;
575 	uio.uio_extflg = UIO_COPY_CACHED;
576 	uio.uio_loffset = (offset_t)0;
577 	uio.uio_resid = NFS_MAXPATHLEN;
578 
579 	/*
580 	 * Do the readlink.
581 	 */
582 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
583 		"vop_readlink_start:");
584 	error = VOP_READLINK(vp, &uio, cr);
585 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
586 		"vop_readlink_end:");
587 
588 #if 0 /* notyet */
589 	/*
590 	 * Don't do this.  It causes local disk writes when just
591 	 * reading the file and the overhead is deemed larger
592 	 * than the benefit.
593 	 */
594 	/*
595 	 * Force modified metadata out to stable storage.
596 	 */
597 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
598 #endif
599 
600 	VN_RELE(vp);
601 
602 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
603 
604 	/*
605 	 * XNFS and RFC1094 require us to return ENXIO if argument
606 	 * is not a link. UFS returns EINVAL if this is the case,
607 	 * so we do the mapping here. BUGID 1138002.
608 	 */
609 	if (error == EINVAL)
610 		rl->rl_status = NFSERR_NXIO;
611 	else
612 		rl->rl_status = puterrno(error);
613 
614 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
615 		"rfs_readlink_end:(%S)", "done");
616 }
617 fhandle_t *
618 rfs_readlink_getfh(fhandle_t *fhp)
619 {
620 	return (fhp);
621 }
622 /*
623  * Free data allocated by rfs_readlink
624  */
625 void
626 rfs_rlfree(struct nfsrdlnres *rl)
627 {
628 	if (rl->rl_data != NULL)
629 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
630 }
631 
632 /*
633  * Read data.
634  * Returns some data read from the file at the given fhandle.
635  */
636 /* ARGSUSED */
637 void
638 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
639 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
640 {
641 	vnode_t *vp;
642 	int error;
643 	struct vattr va;
644 	struct iovec iov;
645 	struct uio uio;
646 	mblk_t *mp;
647 	int alloc_err = 0;
648 	int in_crit = 0;
649 
650 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
651 		"rfs_read_start:");
652 
653 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
654 	if (vp == NULL) {
655 		rr->rr_data = NULL;
656 		rr->rr_status = NFSERR_STALE;
657 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
658 			"rfs_read_end:(%S)", "stale");
659 		return;
660 	}
661 
662 	if (vp->v_type != VREG) {
663 		VN_RELE(vp);
664 		rr->rr_data = NULL;
665 		rr->rr_status = NFSERR_ISDIR;
666 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
667 			"rfs_read_end:(%S)", "isdir");
668 		return;
669 	}
670 
671 	/*
672 	 * Check to see if the v4 side of the server has delegated
673 	 * this file.  If so, then we mark thread as wouldblock so
674 	 * the response is dropped.
675 	 */
676 	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
677 		VN_RELE(vp);
678 		curthread->t_flag |= T_WOULDBLOCK;
679 		rr->rr_data = NULL;
680 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
681 			"rfs_read_end:(%S)", "delegated");
682 		return;
683 	}
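
	/*
	 * Note: T_WOULDBLOCK is consumed by the common dispatch code,
	 * which drops the reply rather than sending one; the client's
	 * normal RPC retransmission retries the request, by which time
	 * the delegation has (hopefully) been recalled.
	 */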
684 
685 	/*
686 	 * Enter the critical region before calling VOP_RWLOCK
687 	 * to avoid a deadlock with write requests.
688 	 */
689 	if (nbl_need_check(vp)) {
690 		nbl_start_crit(vp, RW_READER);
691 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
692 		    0)) {
693 			nbl_end_crit(vp);
694 			VN_RELE(vp);
695 			rr->rr_data = NULL;
696 			rr->rr_status = NFSERR_ACCES;
697 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
698 				"rfs_read_end:(%S)", "nbl access error");
699 			return;
700 		}
701 		in_crit = 1;
702 	}
703 
704 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
705 		"vop_rwlock_start:");
706 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
707 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
708 		"vop_rwlock_end:");
709 
710 	va.va_mask = AT_ALL;
711 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
712 		"vop_getattr_start:");
713 	error = VOP_GETATTR(vp, &va, 0, cr);
714 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
715 		"vop_getattr_end:");
716 
717 	if (error) {
718 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
719 			"vop_rwunlock_start:");
720 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
721 		if (in_crit)
722 			nbl_end_crit(vp);
723 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
724 			"vop_rwunlock_end:");
725 		VN_RELE(vp);
726 		rr->rr_data = NULL;
727 		rr->rr_status = puterrno(error);
728 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
729 			"rfs_read_end:(%S)", "getattr error");
730 		return;
731 	}
732 
733 	/*
734 	 * This is a kludge to allow reading of files created
735 	 * with no read permission.  The owner of the file
736 	 * is always allowed to read it.
737 	 */
738 	if (crgetuid(cr) != va.va_uid) {
739 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
740 			"vop_access_start:");
741 		error = VOP_ACCESS(vp, VREAD, 0, cr);
742 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
743 			"vop_access_end:");
744 		if (error) {
745 			/*
746 			 * Exec is the same as read over the net because
747 			 * of demand loading.
748 			 */
749 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
750 				"vop_access_start:");
751 			error = VOP_ACCESS(vp, VEXEC, 0, cr);
752 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
753 				"vop_access_end:");
754 		}
755 		if (error) {
756 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
757 				"vop_rwunlock_start:");
758 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
759 			if (in_crit)
760 				nbl_end_crit(vp);
761 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
762 				"vop_rwunlock_end:");
763 			VN_RELE(vp);
764 			rr->rr_data = NULL;
765 			rr->rr_status = puterrno(error);
766 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
767 				"rfs_read_end:(%S)", "access error");
768 			return;
769 		}
770 	}
771 
772 	if (MANDLOCK(vp, va.va_mode)) {
773 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
774 			"vop_rwunlock_start:");
775 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
776 		if (in_crit)
777 			nbl_end_crit(vp);
778 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
779 			"vop_rwunlock_end:");
780 		VN_RELE(vp);
781 		rr->rr_data = NULL;
782 		rr->rr_status = NFSERR_ACCES;
783 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
784 			"rfs_read_end:(%S)", "mand lock");
785 		return;
786 	}
787 
788 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
789 		rr->rr_count = 0;
790 		rr->rr_data = NULL;
791 		/*
792 		 * In this case, status is NFS_OK, but there is no data
793 		 * to encode. So set rr_mp to NULL.
794 		 */
795 		rr->rr_mp = NULL;
796 		goto done;
797 	}
798 
799 	/*
800 	 * mp will contain the data to be sent out in the read reply.
801 	 * This will be freed after the reply has been sent out (by the
802 	 * driver).
803 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
804 	 * that the call to xdrmblk_putmblk() never fails.
805 	 */
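	/*
	 * For example, assuming BYTES_PER_XDR_UNIT is 4, a 5 byte
	 * read request rounds up to an 8 byte allocation, since
	 * RNDUP(5) == 8.
	 */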
806 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
807 	    &alloc_err);
808 	ASSERT(mp != NULL);
809 	ASSERT(alloc_err == 0);
810 
811 	rr->rr_mp = mp;
812 
813 	/*
814 	 * Set up io vector
815 	 */
816 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
817 	iov.iov_len = ra->ra_count;
818 	uio.uio_iov = &iov;
819 	uio.uio_iovcnt = 1;
820 	uio.uio_segflg = UIO_SYSSPACE;
821 	uio.uio_extflg = UIO_COPY_CACHED;
822 	uio.uio_loffset = (offset_t)ra->ra_offset;
823 	uio.uio_resid = ra->ra_count;
824 
825 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
826 		"vop_read_start:");
827 	error = VOP_READ(vp, &uio, 0, cr, NULL);
828 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
829 		"vop_read_end:");
830 
831 	if (error) {
832 		freeb(mp);
833 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
834 			"vop_rwunlock_start:");
835 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
836 		if (in_crit)
837 			nbl_end_crit(vp);
838 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
839 			"vop_rwunlock_end:");
840 		VN_RELE(vp);
841 		rr->rr_data = NULL;
842 		rr->rr_status = puterrno(error);
843 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
844 			"rfs_read_end:(%S)", "read error");
845 		return;
846 	}
847 
848 	/*
849 	 * Get attributes again so we can send the latest access
850 	 * time to the client side for his cache.
851 	 */
852 	va.va_mask = AT_ALL;
853 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
854 		"vop_getattr_start:");
855 	error = VOP_GETATTR(vp, &va, 0, cr);
856 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
857 		"vop_getattr_end:");
858 	if (error) {
859 		freeb(mp);
860 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
861 			"vop_rwunlock_start:");
862 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
863 		if (in_crit)
864 			nbl_end_crit(vp);
865 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
866 			"vop_rwunlock_end:");
867 		VN_RELE(vp);
868 		rr->rr_data = NULL;
869 		rr->rr_status = puterrno(error);
870 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
871 			"rfs_read_end:(%S)", "getattr error");
872 		return;
873 	}
874 
875 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
876 
877 	rr->rr_data = (char *)mp->b_datap->db_base;
878 
879 done:
880 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
881 		"vop_rwunlock_start:");
882 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
883 	if (in_crit)
884 		nbl_end_crit(vp);
885 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
886 		"vop_rwunlock_end:");
887 
888 	acl_perm(vp, exi, &va, cr);
889 
890 	/* check for overflows */
891 	error = vattr_to_nattr(&va, &rr->rr_attr);
892 
893 #if 0 /* notyet */
894 	/*
895 	 * Don't do this.  It causes local disk writes when just
896 	 * reading the file and the overhead is deemed larger
897 	 * than the benefit.
898 	 */
899 	/*
900 	 * Force modified metadata out to stable storage.
901 	 */
902 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
903 #endif
904 
905 	VN_RELE(vp);
906 
907 	rr->rr_status = puterrno(error);
908 
909 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
910 		"rfs_read_end:(%S)", "done");
911 }
912 
913 /*
914  * Free data allocated by rfs_read
915  */
916 void
917 rfs_rdfree(struct nfsrdresult *rr)
918 {
919 	mblk_t *mp;
920 
921 	if (rr->rr_status == NFS_OK) {
922 		mp = rr->rr_mp;
923 		if (mp != NULL)
924 			freeb(mp);
925 	}
926 }
927 
928 fhandle_t *
929 rfs_read_getfh(struct nfsreadargs *ra)
930 {
931 	return (&ra->ra_fhandle);
932 }
933 
934 #define	MAX_IOVECS	12
935 
936 #ifdef DEBUG
937 static int rfs_write_sync_hits = 0;
938 static int rfs_write_sync_misses = 0;
939 #endif
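
/*
 * A request's data arrives either in a single wa_data buffer or as a
 * chain of mblks.  A rough sketch (not compiled) of the iovec counting
 * done in rfs_write_sync() below; the helper name is hypothetical:
 */
#if 0
static int
count_mblk_iovecs(mblk_t *chain)
{
	int n = 0;
	mblk_t *m;

	/* one iovec is needed for each mblk in the chain */
	for (m = chain; m != NULL; m = m->b_cont)
		n++;
	return (n);
}
#endif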
940 
941 /*
942  * Write data to file.
943  * Returns attributes of a file after writing some data to it.
944  *
945  * Any changes made here, especially in error handling might have
946  * to also be done in rfs_write (which clusters write requests).
947  */
948 void
949 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
950 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
951 {
952 	int error;
953 	vnode_t *vp;
954 	rlim64_t rlimit;
955 	struct vattr va;
956 	struct uio uio;
957 	struct iovec iov[MAX_IOVECS];
958 	mblk_t *m;
959 	struct iovec *iovp;
960 	int iovcnt;
961 	cred_t *savecred;
962 	int in_crit = 0;
963 
964 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
965 		"rfs_write_start:(%S)", "sync");
966 
967 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
968 	if (vp == NULL) {
969 		ns->ns_status = NFSERR_STALE;
970 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
971 			"rfs_write_end:(%S)", "stale");
972 		return;
973 	}
974 
975 	if (rdonly(exi, req)) {
976 		VN_RELE(vp);
977 		ns->ns_status = NFSERR_ROFS;
978 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
979 			"rfs_write_end:(%S)", "rofs");
980 		return;
981 	}
982 
983 	if (vp->v_type != VREG) {
984 		VN_RELE(vp);
985 		ns->ns_status = NFSERR_ISDIR;
986 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
987 			"rfs_write_end:(%S)", "isdir");
988 		return;
989 	}
990 
991 	/*
992 	 * Check to see if the v4 side of the server has delegated
993 	 * this file.  If so, then we mark thread as wouldblock so
994 	 * the response is dropped.
995 	 */
996 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
997 		VN_RELE(vp);
998 		curthread->t_flag |= T_WOULDBLOCK;
999 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1000 			"rfs_write_end:(%S)", "delegated");
1001 		return;
1002 	}
1003 
1004 	va.va_mask = AT_UID|AT_MODE;
1005 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1006 		"vop_getattr_start:");
1007 	error = VOP_GETATTR(vp, &va, 0, cr);
1008 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1009 		"vop_getattr_end:");
1010 
1011 	if (error) {
1012 		VN_RELE(vp);
1013 		ns->ns_status = puterrno(error);
1014 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1015 			"rfs_write_end:(%S)", "getattr error");
1016 		return;
1017 	}
1018 
1019 	if (crgetuid(cr) != va.va_uid) {
1020 		/*
1021 		 * This is a kludge to allow writes of files created
1022 		 * with read only permission.  The owner of the file
1023 		 * is always allowed to write it.
1024 		 */
1025 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1026 			"vop_access_start:");
1027 		error = VOP_ACCESS(vp, VWRITE, 0, cr);
1028 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1029 			"vop_access_end:");
1030 		if (error) {
1031 			VN_RELE(vp);
1032 			ns->ns_status = puterrno(error);
1033 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1034 				"rfs_write_end:(%S)", "access error");
1035 			return;
1036 		}
1037 	}
1038 
1039 	/*
1040 	 * Can't access a mandatory lock file.  This might cause
1041 	 * the NFS service thread to block forever waiting for a
1042 	 * lock to be released that will never be released.
1043 	 */
1044 	if (MANDLOCK(vp, va.va_mode)) {
1045 		VN_RELE(vp);
1046 		ns->ns_status = NFSERR_ACCES;
1047 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1048 			"rfs_write_end:(%S)", "mand lock");
1049 		return;
1050 	}
1051 
1052 	/*
1053 	 * We have to enter the critical region before calling VOP_RWLOCK
1054 	 * to avoid a deadlock with ufs.
1055 	 */
1056 	if (nbl_need_check(vp)) {
1057 		nbl_start_crit(vp, RW_READER);
1058 		in_crit = 1;
1059 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1060 		    wa->wa_count, 0)) {
1061 			error = EACCES;
1062 			goto out;
1063 		}
1064 	}
1065 
1066 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1067 		"vop_rwlock_start:");
1068 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1069 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1070 		"vop_rwlock_end:");
1071 
1072 	if (wa->wa_data) {
1073 		iov[0].iov_base = wa->wa_data;
1074 		iov[0].iov_len = wa->wa_count;
1075 		uio.uio_iov = iov;
1076 		uio.uio_iovcnt = 1;
1077 		uio.uio_segflg = UIO_SYSSPACE;
1078 		uio.uio_extflg = UIO_COPY_DEFAULT;
1079 		uio.uio_loffset = (offset_t)wa->wa_offset;
1080 		uio.uio_resid = wa->wa_count;
1081 		/*
1082 		 * The limit is checked on the client. We
1083 		 * should allow any size writes here.
1084 		 */
1085 		uio.uio_llimit = curproc->p_fsz_ctl;
1086 		rlimit = uio.uio_llimit - wa->wa_offset;
1087 		if (rlimit < (rlim64_t)uio.uio_resid)
1088 			uio.uio_resid = (uint_t)rlimit;
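		/*
		 * E.g. with p_fsz_ctl at 1 GB and an 8K write starting
		 * 4K below that limit, rlimit is 4K, so uio_resid is
		 * clamped and only the first 4K is written (sizes are
		 * illustrative).
		 */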
1089 
1090 		/*
1091 		 * for now we assume no append mode
1092 		 * For now we assume no append mode.
1093 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1094 			"vop_write_start:(%S)", "sync");
1095 		/*
1096 		 * We're changing creds because VM may fault and we need
1097 		 * the cred of the current thread to be used if quota
1098 		 * checking is enabled.
1099 		 */
1100 		savecred = curthread->t_cred;
1101 		curthread->t_cred = cr;
1102 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1103 		curthread->t_cred = savecred;
1104 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1105 			"vop_write_end:");
1106 	} else {
1107 		iovcnt = 0;
1108 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1109 			iovcnt++;
1110 		if (iovcnt <= MAX_IOVECS) {
1111 #ifdef DEBUG
1112 			rfs_write_sync_hits++;
1113 #endif
1114 			iovp = iov;
1115 		} else {
1116 #ifdef DEBUG
1117 			rfs_write_sync_misses++;
1118 #endif
1119 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1120 		}
1121 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1122 		uio.uio_iov = iovp;
1123 		uio.uio_iovcnt = iovcnt;
1124 		uio.uio_segflg = UIO_SYSSPACE;
1125 		uio.uio_extflg = UIO_COPY_DEFAULT;
1126 		uio.uio_loffset = (offset_t)wa->wa_offset;
1127 		uio.uio_resid = wa->wa_count;
1128 		/*
1129 		 * The limit is checked on the client. We
1130 		 * should allow any size writes here.
1131 		 */
1132 		uio.uio_llimit = curproc->p_fsz_ctl;
1133 		rlimit = uio.uio_llimit - wa->wa_offset;
1134 		if (rlimit < (rlim64_t)uio.uio_resid)
1135 			uio.uio_resid = (uint_t)rlimit;
1136 
1137 		/*
1138 		 * For now we assume no append mode.
1139 		 */
1140 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1141 			"vop_write_start:(%S)", "iov sync");
1142 		/*
1143 		 * We're changing creds because VM may fault and we need
1144 		 * the cred of the current thread to be used if quota
1145 		 * checking is enabled.
1146 		 */
1147 		savecred = curthread->t_cred;
1148 		curthread->t_cred = cr;
1149 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1150 		curthread->t_cred = savecred;
1151 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1152 			"vop_write_end:");
1153 
1154 		if (iovp != iov)
1155 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1156 	}
1157 
1158 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1159 		"vop_rwunlock_start:");
1160 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1161 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1162 		"vop_rwunlock_end:");
1163 
1164 	if (!error) {
1165 		/*
1166 		 * Get attributes again so we send the latest mod
1167 		 * time to the client side for his cache.
1168 		 */
1169 		va.va_mask = AT_ALL;	/* now we want everything */
1170 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1171 			"vop_getattr_start:");
1172 		error = VOP_GETATTR(vp, &va, 0, cr);
1173 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1174 			"vop_getattr_end:");
1175 		/* check for overflows */
1176 		if (!error) {
1177 			acl_perm(vp, exi, &va, cr);
1178 			error = vattr_to_nattr(&va, &ns->ns_attr);
1179 		}
1180 	}
1181 
1182 out:
1183 	if (in_crit)
1184 		nbl_end_crit(vp);
1185 	VN_RELE(vp);
1186 
1187 	ns->ns_status = puterrno(error);
1188 
1189 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1190 		"rfs_write_end:(%S)", "sync");
1191 }
1192 
1193 struct rfs_async_write {
1194 	struct nfswriteargs *wa;
1195 	struct nfsattrstat *ns;
1196 	struct svc_req *req;
1197 	cred_t *cr;
1198 	kthread_t *thread;
1199 	struct rfs_async_write *list;
1200 };
1201 
1202 struct rfs_async_write_list {
1203 	fhandle_t *fhp;
1204 	kcondvar_t cv;
1205 	struct rfs_async_write *list;
1206 	struct rfs_async_write_list *next;
1207 };
1208 
1209 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1210 static kmutex_t rfs_async_write_lock;
1211 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1212 
1213 #define	MAXCLIOVECS	42
1214 #define	RFSWRITE_INITVAL (enum nfsstat) -1
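
/*
 * Illustrative cluster layout: three concurrent writes to one file all
 * hang off a single rfs_async_write_list entry, sorted by offset --
 *
 *	rfs_async_write_head --> { fhp, cv, list, next }
 *	                                     |
 *	                                     v
 *	         wa_offset=0 --> wa_offset=8192 --> wa_offset=16384
 *
 * The thread that created the cluster does the work; the others
 * cv_wait() on the cluster's cv until their ns_status is filled in.
 * Offsets shown are examples only.
 */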
1215 
1216 #ifdef DEBUG
1217 static int rfs_write_hits = 0;
1218 static int rfs_write_misses = 0;
1219 #endif
1220 
1221 /*
1222  * Write data to file.
1223  * Returns attributes of a file after writing some data to it.
1224  */
1225 void
1226 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1227 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1228 {
1229 	int error;
1230 	vnode_t *vp;
1231 	rlim64_t rlimit;
1232 	struct vattr va;
1233 	struct uio uio;
1234 	struct rfs_async_write_list *lp;
1235 	struct rfs_async_write_list *nlp;
1236 	struct rfs_async_write *rp;
1237 	struct rfs_async_write *nrp;
1238 	struct rfs_async_write *trp;
1239 	struct rfs_async_write *lrp;
1240 	int data_written;
1241 	int iovcnt;
1242 	mblk_t *m;
1243 	struct iovec *iovp;
1244 	struct iovec *niovp;
1245 	struct iovec iov[MAXCLIOVECS];
1246 	int count;
1247 	int rcount;
1248 	uint_t off;
1249 	uint_t len;
1250 	struct rfs_async_write nrpsp;
1251 	struct rfs_async_write_list nlpsp;
1252 	ushort_t t_flag;
1253 	cred_t *savecred;
1254 	int in_crit = 0;
1255 
1256 	if (!rfs_write_async) {
1257 		rfs_write_sync(wa, ns, exi, req, cr);
1258 		return;
1259 	}
1260 
1261 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1262 		"rfs_write_start:(%S)", "async");
1263 
1264 	/*
1265 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a value
1266 	 * of 0 is considered OK.
1267 	 */
1268 	ns->ns_status = RFSWRITE_INITVAL;
1269 
1270 	nrp = &nrpsp;
1271 	nrp->wa = wa;
1272 	nrp->ns = ns;
1273 	nrp->req = req;
1274 	nrp->cr = cr;
1275 	nrp->thread = curthread;
1276 
1277 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1278 
1279 	/*
1280 	 * Look to see if there is already a cluster started
1281 	 * for this file.
1282 	 */
1283 	mutex_enter(&rfs_async_write_lock);
1284 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1285 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1286 		    sizeof (fhandle_t)) == 0)
1287 			break;
1288 	}
1289 
1290 	/*
1291 	 * If lp is non-NULL, then there is already a cluster
1292 	 * started.  We need to place ourselves in the cluster
1293 	 * list in the right place as determined by starting
1294 	 * offset.  Conflicts with non-blocking mandatory locked
1295 	 * regions will be checked when the cluster is processed.
1296 	 */
1297 	if (lp != NULL) {
1298 		rp = lp->list;
1299 		trp = NULL;
1300 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1301 			trp = rp;
1302 			rp = rp->list;
1303 		}
1304 		nrp->list = rp;
1305 		if (trp == NULL)
1306 			lp->list = nrp;
1307 		else
1308 			trp->list = nrp;
1309 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1310 			cv_wait(&lp->cv, &rfs_async_write_lock);
1311 		mutex_exit(&rfs_async_write_lock);
1312 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1313 			"rfs_write_end:(%S)", "cluster child");
1314 		return;
1315 	}
1316 
1317 	/*
1318 	 * No cluster started yet, start one and add ourselves
1319 	 * to the list of clusters.
1320 	 */
1321 	nrp->list = NULL;
1322 
1323 	nlp = &nlpsp;
1324 	nlp->fhp = &wa->wa_fhandle;
1325 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1326 	nlp->list = nrp;
1327 	nlp->next = NULL;
1328 
1329 	if (rfs_async_write_head == NULL) {
1330 		rfs_async_write_head = nlp;
1331 	} else {
1332 		lp = rfs_async_write_head;
1333 		while (lp->next != NULL)
1334 			lp = lp->next;
1335 		lp->next = nlp;
1336 	}
1337 	mutex_exit(&rfs_async_write_lock);
1338 
1339 	/*
1340 	 * Convert the file handle common to all of the requests
1341 	 * in this cluster to a vnode.
1342 	 */
1343 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1344 	if (vp == NULL) {
1345 		mutex_enter(&rfs_async_write_lock);
1346 		if (rfs_async_write_head == nlp)
1347 			rfs_async_write_head = nlp->next;
1348 		else {
1349 			lp = rfs_async_write_head;
1350 			while (lp->next != nlp)
1351 				lp = lp->next;
1352 			lp->next = nlp->next;
1353 		}
1354 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1355 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1356 			rp->ns->ns_status = NFSERR_STALE;
1357 			rp->thread->t_flag |= t_flag;
1358 		}
1359 		cv_broadcast(&nlp->cv);
1360 		mutex_exit(&rfs_async_write_lock);
1361 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1362 			"rfs_write_end:(%S)", "stale");
1363 		return;
1364 	}
1365 
1366 	/*
1367 	 * Can only write regular files.  Attempts to write any
1368 	 * other file types fail with EISDIR.
1369 	 */
1370 	if (vp->v_type != VREG) {
1371 		VN_RELE(vp);
1372 		mutex_enter(&rfs_async_write_lock);
1373 		if (rfs_async_write_head == nlp)
1374 			rfs_async_write_head = nlp->next;
1375 		else {
1376 			lp = rfs_async_write_head;
1377 			while (lp->next != nlp)
1378 				lp = lp->next;
1379 			lp->next = nlp->next;
1380 		}
1381 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1382 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1383 			rp->ns->ns_status = NFSERR_ISDIR;
1384 			rp->thread->t_flag |= t_flag;
1385 		}
1386 		cv_broadcast(&nlp->cv);
1387 		mutex_exit(&rfs_async_write_lock);
1388 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1389 			"rfs_write_end:(%S)", "isdir");
1390 		return;
1391 	}
1392 
1393 	/*
1394 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1395 	 * deadlock with ufs.
1396 	 */
1397 	if (nbl_need_check(vp)) {
1398 		nbl_start_crit(vp, RW_READER);
1399 		in_crit = 1;
1400 	}
1401 
1402 	/*
1403 	 * Lock the file for writing.  This operation provides
1404 	 * the delay which allows clusters to grow.
1405 	 */
1406 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1407 		"vop_wrlock_start:");
1408 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1409 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1410 		"vop_wrlock_end:");
1411 
1412 	/*
1413 	 * Disconnect this cluster from the list of clusters.
1414 	 * The cluster that is being dealt with must be fixed
1415 	 * in size after this point, so there is no reason
1416 	 * to leave it on the list so that new requests can
1417 	 * find it.
1418 	 *
1419 	 * The algorithm is that the first write request will
1420 	 * create a cluster, convert the file handle to a
1421 	 * vnode pointer, and then lock the file for writing.
1422 	 * This request is not likely to be clustered with
1423 	 * any others.  However, the next request will create
1424 	 * a new cluster and be blocked in VOP_RWLOCK while
1425 	 * the first request is being processed.  This delay
1426 	 * will allow more requests to be clustered in this
1427 	 * second cluster.
1428 	 */
1429 	mutex_enter(&rfs_async_write_lock);
1430 	if (rfs_async_write_head == nlp)
1431 		rfs_async_write_head = nlp->next;
1432 	else {
1433 		lp = rfs_async_write_head;
1434 		while (lp->next != nlp)
1435 			lp = lp->next;
1436 		lp->next = nlp->next;
1437 	}
1438 	mutex_exit(&rfs_async_write_lock);
1439 
1440 	/*
1441 	 * Step through the list of requests in this cluster.
1442 	 * We need to check permissions to make sure that all
1443 	 * of the requests have sufficient permission to write
1444 	 * the file.  A cluster can be composed of requests
1445 	 * from different clients and different users on each
1446 	 * client.
1447 	 *
1448 	 * As a side effect, we also calculate the size of the
1449 	 * byte range that this cluster encompasses.
1450 	 */
1451 	rp = nlp->list;
1452 	off = rp->wa->wa_offset;
1453 	len = (uint_t)0;
1454 	do {
1455 		if (rdonly(exi, rp->req)) {
1456 			rp->ns->ns_status = NFSERR_ROFS;
1457 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1458 			rp->thread->t_flag |= t_flag;
1459 			continue;
1460 		}
1461 
1462 		va.va_mask = AT_UID|AT_MODE;
1463 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1464 			"vop_getattr_start:");
1465 		error = VOP_GETATTR(vp, &va, 0, rp->cr);
1466 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1467 			"vop_getattr_end:");
1468 		if (!error) {
1469 			if (crgetuid(rp->cr) != va.va_uid) {
1470 				/*
1471 				 * This is a kludge to allow writes of files
1472 				 * created with read only permission.  The
1473 				 * owner of the file is always allowed to
1474 				 * write it.
1475 				 */
1476 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1477 					"vop_access_start:");
1478 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
1479 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1480 					"vop_access_end:");
1481 			}
1482 			if (!error && MANDLOCK(vp, va.va_mode))
1483 				error = EACCES;
1484 		}
1485 
1486 		/*
1487 		 * Check for a conflict with a nbmand-locked region.
1488 		 */
1489 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1490 		    rp->wa->wa_count, 0)) {
1491 			error = EACCES;
1492 		}
1493 
1494 		if (error) {
1495 			rp->ns->ns_status = puterrno(error);
1496 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1497 			rp->thread->t_flag |= t_flag;
1498 			continue;
1499 		}
1500 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1501 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1502 	} while ((rp = rp->list) != NULL);
1503 
1504 	/*
1505 	 * Step through the cluster attempting to gather as many
1506 	 * requests which are contiguous as possible.  These
1507 	 * contiguous requests are handled via one call to VOP_WRITE
1508 	 * instead of a separate call for each request.  We also keep
1509 	 * track of the fact that any data was written.
1510 	 */
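	/*
	 * E.g. pending requests at offsets 0, 8192 and 16384, each of
	 * 8192 bytes, satisfy wa_offset + wa_count == next wa_offset
	 * and gather into a single 24576 byte VOP_WRITE; a fourth
	 * request at offset 40960 would start a new gather.  Sizes are
	 * illustrative.
	 */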
1511 	rp = nlp->list;
1512 	data_written = 0;
1513 	do {
1514 		/*
1515 		 * Skip any requests which are already marked as having an
1516 		 * error.
1517 		 */
1518 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1519 			rp = rp->list;
1520 			continue;
1521 		}
1522 
1523 		/*
1524 		 * Count the number of iovec's which are required
1525 		 * to handle this set of requests.  One iovec is
1526 		 * needed for each data buffer, whether addressed
1527 		 * by wa_data or by the b_rptr pointers in the
1528 		 * mblk chains.
1529 		 */
1530 		iovcnt = 0;
1531 		lrp = rp;
1532 		for (;;) {
1533 			if (lrp->wa->wa_data)
1534 				iovcnt++;
1535 			else {
1536 				m = lrp->wa->wa_mblk;
1537 				while (m != NULL) {
1538 					iovcnt++;
1539 					m = m->b_cont;
1540 				}
1541 			}
1542 			if (lrp->list == NULL ||
1543 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1544 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1545 			    lrp->list->wa->wa_offset) {
1546 				lrp = lrp->list;
1547 				break;
1548 			}
1549 			lrp = lrp->list;
1550 		}
1551 
1552 		if (iovcnt <= MAXCLIOVECS) {
1553 #ifdef DEBUG
1554 			rfs_write_hits++;
1555 #endif
1556 			niovp = iov;
1557 		} else {
1558 #ifdef DEBUG
1559 			rfs_write_misses++;
1560 #endif
1561 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1562 		}
1563 		/*
1564 		 * Put together the scatter/gather iovecs.
1565 		 */
1566 		iovp = niovp;
1567 		trp = rp;
1568 		count = 0;
1569 		do {
1570 			if (trp->wa->wa_data) {
1571 				iovp->iov_base = trp->wa->wa_data;
1572 				iovp->iov_len = trp->wa->wa_count;
1573 				iovp++;
1574 			} else {
1575 				m = trp->wa->wa_mblk;
1576 				rcount = trp->wa->wa_count;
1577 				while (m != NULL) {
1578 					iovp->iov_base = (caddr_t)m->b_rptr;
1579 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1580 					rcount -= iovp->iov_len;
1581 					if (rcount < 0)
1582 						iovp->iov_len += rcount;
1583 					iovp++;
1584 					if (rcount <= 0)
1585 						break;
1586 					m = m->b_cont;
1587 				}
1588 			}
1589 			count += trp->wa->wa_count;
1590 			trp = trp->list;
1591 		} while (trp != lrp);
1592 
1593 		uio.uio_iov = niovp;
1594 		uio.uio_iovcnt = iovcnt;
1595 		uio.uio_segflg = UIO_SYSSPACE;
1596 		uio.uio_extflg = UIO_COPY_DEFAULT;
1597 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1598 		uio.uio_resid = count;
1599 		/*
1600 		 * The limit is checked on the client. We
1601 		 * should allow any size writes here.
1602 		 */
1603 		uio.uio_llimit = curproc->p_fsz_ctl;
1604 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1605 		if (rlimit < (rlim64_t)uio.uio_resid)
1606 			uio.uio_resid = (uint_t)rlimit;
1607 
1608 		/*
1609 		 * For now we assume no append mode.
1610 		 */
1611 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1612 			"vop_write_start:(%S)", "async");
1613 
1614 		/*
1615 		 * Check to see if the v4 side of the server has
1616 		 * delegated this file.  If so, then we mark thread
1617 		 * as wouldblock so the response is dropped.
1618 		 */
1619 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1620 			curthread->t_flag |= T_WOULDBLOCK;
1621 			error = EACCES; /* just to have an error */
1622 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1623 				"rfs_write_end:(%S)", "delegated");
1624 		} else {
1625 			/*
1626 			 * We're changing creds because VM may fault
1627 			 * and we need the cred of the current
1628 			 * thread to be used if quota checking is
1629 			 * enabled.
1630 			 */
1631 			savecred = curthread->t_cred;
1632 			curthread->t_cred = cr;
1633 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1634 			curthread->t_cred = savecred;
1635 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1636 				"vop_write_end:");
1637 		}
1638 
1639 		if (niovp != iov)
1640 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1641 
1642 		if (!error) {
1643 			data_written = 1;
1644 			/*
1645 			 * Get attributes again so we send the latest mod
1646 			 * time to the client side for his cache.
1647 			 */
1648 			va.va_mask = AT_ALL;	/* now we want everything */
1649 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1650 				"vop_getattr_start:");
1651 			error = VOP_GETATTR(vp, &va, 0, rp->cr);
1652 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1653 				"vop_getattr_end:");
1654 			if (!error)
1655 				acl_perm(vp, exi, &va, rp->cr);
1656 		}
1657 
1658 		/*
1659 		 * Fill in the status responses for each request
1660 		 * which was just handled.  Also, copy the latest
1661 		 * attributes in to the attribute responses if
1662 		 * appropriate.
1663 		 */
1664 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1665 		do {
1666 			rp->thread->t_flag |= t_flag;
1667 			/* check for overflows */
1668 			if (!error) {
1669 				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1670 			}
1671 			rp->ns->ns_status = puterrno(error);
1672 			rp = rp->list;
1673 		} while (rp != lrp);
1674 	} while (rp != NULL);
1675 
1676 	/*
1677 	 * If any data was written at all, then we need to flush
1678 	 * the data and metadata to stable storage.
1679 	 */
1680 	if (data_written) {
1681 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
1682 			"vop_putpage_start:");
1683 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
1684 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
1685 			"vop_putpage_end:");
1686 		if (!error) {
1687 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1688 				"vop_fsync_start:");
1689 			error = VOP_FSYNC(vp, FNODSYNC, cr);
1690 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
1691 				"vop_fsync_end:");
1692 		}
1693 	}
1694 
1695 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1696 		"vop_rwunlock_start:");
1697 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1698 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1699 		"vop_rwunlock_end:");
1700 
1701 	if (in_crit)
1702 		nbl_end_crit(vp);
1703 	VN_RELE(vp);
1704 
1705 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1706 	mutex_enter(&rfs_async_write_lock);
1707 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1708 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1709 			rp->ns->ns_status = puterrno(error);
1710 			rp->thread->t_flag |= t_flag;
1711 		}
1712 	}
1713 	cv_broadcast(&nlp->cv);
1714 	mutex_exit(&rfs_async_write_lock);
1715 
1716 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1717 		"rfs_write_end:(%S)", "async");
1718 }
1719 
1720 fhandle_t *
1721 rfs_write_getfh(struct nfswriteargs *wa)
1722 {
1723 	return (&wa->wa_fhandle);
1724 }
1725 
1726 /*
1727  * Create a file.
1728  * Creates a file with given attributes and returns those attributes
1729  * and an fhandle for the new file.
1730  */
1731 void
1732 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1733 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1734 {
1735 	int error;
1736 	int lookuperr;
1737 	int in_crit = 0;
1738 	struct vattr va;
1739 	vnode_t *vp;
1740 	vnode_t *dvp;
1741 	char *name = args->ca_da.da_name;
1742 	vnode_t *tvp = NULL;
1743 	int mode;
1744 	int lookup_ok;
1745 	bool_t trunc;
1746 
1747 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
1748 		"rfs_create_start:");
1749 
1750 	/*
1751 	 * Disallow NULL paths
1752 	 */
1753 	if (name == NULL || *name == '\0') {
1754 		dr->dr_status = NFSERR_ACCES;
1755 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1756 			"rfs_create_end:(%S)", "access");
1757 		return;
1758 	}
1759 
1760 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1761 	if (dvp == NULL) {
1762 		dr->dr_status = NFSERR_STALE;
1763 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1764 			"rfs_create_end:(%S)", "stale");
1765 		return;
1766 	}
1767 
1768 	error = sattr_to_vattr(args->ca_sa, &va);
1769 	if (error) {
1770 		dr->dr_status = puterrno(error);
1771 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1772 			"rfs_create_end:(%S)", "sattr");
1773 		return;
1774 	}
1775 
1776 	/*
1777 	 * Must specify the mode.
1778 	 */
1779 	if (!(va.va_mask & AT_MODE)) {
1780 		VN_RELE(dvp);
1781 		dr->dr_status = NFSERR_INVAL;
1782 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1783 			"rfs_create_end:(%S)", "no mode");
1784 		return;
1785 	}
1786 
1787 	/*
1788 	 * This is a completely gross hack to make mknod
1789 	 * work over the wire until we can whack the protocol.
1790 	 */
1791 	if ((va.va_mode & IFMT) == IFCHR) {
1792 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1793 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1794 		else {
1795 			va.va_type = VCHR;
1796 			/*
1797 			 * uncompress the received dev_t
1798 			 * if the top half is zero indicating a request
1799 			 * from an `older style' OS.
1800 			 */
1801 			if ((va.va_size & 0xffff0000) == 0)
1802 				va.va_rdev = nfsv2_expdev(va.va_size);
1803 			else
1804 				va.va_rdev = (dev_t)va.va_size;
1805 		}
1806 		va.va_mask &= ~AT_SIZE;
1807 	} else if ((va.va_mode & IFMT) == IFBLK) {
1808 		va.va_type = VBLK;
1809 		/*
1810 		 * uncompress the received dev_t
1811 		 * if the top half is zero indicating a request
1812 		 * from an `older style' OS.
1813 		 */
1814 		if ((va.va_size & 0xffff0000) == 0)
1815 			va.va_rdev = nfsv2_expdev(va.va_size);
1816 		else
1817 			va.va_rdev = (dev_t)va.va_size;
1818 		va.va_mask &= ~AT_SIZE;
1819 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1820 		va.va_type = VSOCK;
1821 	} else
1822 		va.va_type = VREG;
1823 	va.va_mode &= ~IFMT;
1824 	va.va_mask |= AT_TYPE;
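	/*
	 * E.g. an `older style' client packs a device number into the
	 * low 16 bits of sa_size (say 0x0901 for major 9, minor 1,
	 * assuming the historical 8-bit major/minor packing); the top
	 * half is zero, so nfsv2_expdev() expands it to a native dev_t.
	 * A newer client sends the full 32-bit dev_t instead.
	 */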
1825 
1826 	/*
1827 	 * Why was the choice made to use VWRITE as the mode to the
1828 	 * call to VOP_CREATE?  This results in a bug.  When a client
1829 	 * opens a file that already exists and is RDONLY, the second
1830 	 * open fails with EACCES because of the mode.
1831 	 * Bug ID 1054648.
1832 	 */
1833 	lookup_ok = 0;
1834 	mode = VWRITE;
1835 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1836 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
1837 			"vop_lookup_start:");
1838 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1839 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
1840 			"vop_lookup_end:");
1841 		if (!error) {
1842 			struct vattr at;
1843 
1844 			VN_SETPATH(rootdir, dvp, tvp, name, strlen(name));
1845 			lookup_ok = 1;
1846 			at.va_mask = AT_MODE;
1847 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1848 				"vop_getattr_start:");
1849 			error = VOP_GETATTR(tvp, &at, 0, cr);
1850 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1851 				"vop_getattr_end:");
1852 			if (!error)
1853 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1854 			VN_RELE(tvp);
1855 			tvp = NULL;
1856 		}
1857 	}
1858 
1859 	if (!lookup_ok) {
1860 		if (rdonly(exi, req)) {
1861 			error = EROFS;
1862 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1863 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1864 			error = EPERM;
1865 		} else {
1866 			error = 0;
1867 		}
1868 	}
1869 
1870 	/*
1871 	 * If file size is being modified on an already existing file
1872 	 * make sure that there are no conflicting non-blocking mandatory
1873 	 * locks in the region being manipulated. Return EACCES if there
1874 	 * are conflicting locks.
1875 	 */
1876 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1877 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1878 
1879 		if (!lookuperr &&
1880 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1881 			VN_RELE(tvp);
1882 			curthread->t_flag |= T_WOULDBLOCK;
1883 			goto out;
1884 		}
1885 
1886 		if (!lookuperr && nbl_need_check(tvp)) {
1887 			/*
1888 			 * The file exists. Now check if it has any
1889 			 * conflicting non-blocking mandatory locks
1890 			 * in the region being changed.
1891 			 */
1892 			struct vattr bva;
1893 			u_offset_t offset;
1894 			ssize_t length;
1895 
1896 			nbl_start_crit(tvp, RW_READER);
1897 			in_crit = 1;
1898 
1899 			bva.va_mask = AT_SIZE;
1900 			error = VOP_GETATTR(tvp, &bva, 0, cr);
1901 			if (!error) {
1902 				if (va.va_size < bva.va_size) {
1903 					offset = va.va_size;
1904 					length = bva.va_size - va.va_size;
1905 				} else {
1906 					offset = bva.va_size;
1907 					length = va.va_size - bva.va_size;
1908 				}
1909 				if (length) {
1910 					if (nbl_conflict(tvp, NBL_WRITE,
1911 						offset, length, 0)) {
1912 						error = EACCES;
1913 					}
1914 				}
1915 			}
1916 			if (error) {
1917 				nbl_end_crit(tvp);
1918 				VN_RELE(tvp);
1919 				in_crit = 0;
1920 			}
1921 		} else if (tvp != NULL) {
1922 			VN_RELE(tvp);
1923 		}
1924 	}
1925 
1926 	if (!error) {
1927 		/*
1928 		 * If the filesystem is shared with nosuid, then remove any
1929 		 * setuid/setgid bits on create.
1930 		 */
1931 		if (va.va_type == VREG &&
1932 		    exi->exi_export.ex_flags & EX_NOSUID)
1933 			va.va_mode &= ~(VSUID | VSGID);
1934 
1935 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
1936 			"vop_create_start:");
1937 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
1938 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
1939 			"vop_create_end:");
1940 
1941 		if (!error) {
1942 
1943 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1944 				trunc = TRUE;
1945 			else
1946 				trunc = FALSE;
1947 
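			/*
			 * If the file is delegated to a v4 client, initiate
			 * recall and drop this request by setting
			 * T_WOULDBLOCK; the client will retransmit it later.
			 */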
1948 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1949 				VN_RELE(vp);
				if (in_crit) {
					nbl_end_crit(tvp);
					VN_RELE(tvp);
					in_crit = 0;
				}
1950 				curthread->t_flag |= T_WOULDBLOCK;
1951 				goto out;
1952 			}
1953 			va.va_mask = AT_ALL;
1954 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1955 				"vop_getattr_start:");
1956 			error = VOP_GETATTR(vp, &va, 0, cr);
1957 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1958 				"vop_getattr_end:");
1959 			/* check for overflows */
1960 			if (!error) {
1961 				acl_perm(vp, exi, &va, cr);
1962 				error = vattr_to_nattr(&va, &dr->dr_attr);
1963 				if (!error) {
1964 					error = makefh(&dr->dr_fhandle, vp,
1965 							exi);
1966 				}
1967 			}
1968 			/*
1969 			 * Force modified metadata out to stable storage.
1970 			 */
1971 			(void) VOP_FSYNC(vp, FNODSYNC, cr);
1972 			VN_RELE(vp);
1973 		}
1974 
1975 		if (in_crit) {
1976 			nbl_end_crit(tvp);
1977 			VN_RELE(tvp);
1978 		}
1979 	}
1980 
1981 	/*
1982 	 * Force modified data and metadata out to stable storage.
1983 	 */
1984 	(void) VOP_FSYNC(dvp, 0, cr);
1985 
1986 out:
1987 
1988 	VN_RELE(dvp);
1989 
1990 	dr->dr_status = puterrno(error);
1991 
1992 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1993 		"rfs_create_end:(%S)", "done");
1994 }
1995 fhandle_t *
1996 rfs_create_getfh(struct nfscreatargs *args)
1997 {
1998 	return (args->ca_da.da_fhandle);
1999 }
2000 
2001 /*
2002  * Remove a file.
2003  * Remove named file from parent directory.
2004  */
2005 void
2006 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2007 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2008 {
2009 	int error = 0;
2010 	vnode_t *vp;
2011 	vnode_t *targvp;
2012 	int in_crit = 0;
2013 
2014 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
2015 		"rfs_remove_start:");
2016 
2017 	/*
2018 	 * Disallow NULL paths
2019 	 */
2020 	if (da->da_name == NULL || *da->da_name == '\0') {
2021 		*status = NFSERR_ACCES;
2022 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2023 			"rfs_remove_end:(%S)", "access");
2024 		return;
2025 	}
2026 
2027 	vp = nfs_fhtovp(da->da_fhandle, exi);
2028 	if (vp == NULL) {
2029 		*status = NFSERR_STALE;
2030 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2031 			"rfs_remove_end:(%S)", "stale");
2032 		return;
2033 	}
2034 
2035 	if (rdonly(exi, req)) {
2036 		VN_RELE(vp);
2037 		*status = NFSERR_ROFS;
2038 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2039 			"rfs_remove_end:(%S)", "rofs");
2040 		return;
2041 	}
2042 
2043 	/*
2044 	 * Check for a conflict with a non-blocking mandatory share reservation.
2045 	 */
2046 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2047 			NULL, cr);
2048 	if (error != 0) {
2049 		VN_RELE(vp);
2050 		*status = puterrno(error);
2051 		return;
2052 	}
2053 
2054 	/*
2055 	 * If the file is delegated to a v4 client, then initiate
2056 	 * recall and drop this request (by setting T_WOULDBLOCK).
2057 	 * The client will eventually re-transmit the request and
2058 	 * (hopefully), by then, the v4 client will have returned
2059 	 * the delegation.
2060 	 */
2061 
2062 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2063 		VN_RELE(vp);
2064 		VN_RELE(targvp);
2065 		curthread->t_flag |= T_WOULDBLOCK;
2066 		return;
2067 	}
2068 
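	/*
	 * If the target may have mandatory locks, enter a critical region
	 * and fail the remove with EACCES if there is any conflicting lock
	 * or share reservation.
	 */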
2069 	if (nbl_need_check(targvp)) {
2070 		nbl_start_crit(targvp, RW_READER);
2071 		in_crit = 1;
2072 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
2073 			error = EACCES;
2074 			goto out;
2075 		}
2076 	}
2077 
2078 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
2079 		"vop_remove_start:");
2080 	error = VOP_REMOVE(vp, da->da_name, cr);
2081 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
2082 		"vop_remove_end:");
2083 
2084 	/*
2085 	 * Force modified data and metadata out to stable storage.
2086 	 */
2087 	(void) VOP_FSYNC(vp, 0, cr);
2088 
2089 out:
2090 	if (in_crit)
2091 		nbl_end_crit(targvp);
2092 	VN_RELE(targvp);
2093 	VN_RELE(vp);
2094 
2095 	*status = puterrno(error);
2096 
2097 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2098 		"rfs_remove_end:(%S)", "done");
2099 }
2100 
2101 fhandle_t *
2102 rfs_remove_getfh(struct nfsdiropargs *da)
2103 {
2104 	return (da->da_fhandle);
2105 }
2106 
2107 /*
2108  * rename a file
2109  * Give a file (from) a new name (to).
2110  */
2111 void
2112 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2113 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2114 {
2115 	int error = 0;
2116 	vnode_t *fromvp;
2117 	vnode_t *tovp;
2118 	struct exportinfo *to_exi;
2119 	fhandle_t *fh;
2120 	vnode_t *srcvp;
2121 	vnode_t *targvp;
2122 	int in_crit = 0;
2123 
2124 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
2125 		"rfs_rename_start:");
2126 
2127 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2128 	if (fromvp == NULL) {
2129 		*status = NFSERR_STALE;
2130 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2131 			"rfs_rename_end:(%S)", "from stale");
2132 		return;
2133 	}
2134 
2135 	fh = args->rna_to.da_fhandle;
2136 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2137 	if (to_exi == NULL) {
2138 		VN_RELE(fromvp);
2139 		*status = NFSERR_ACCES;
2140 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2141 			"rfs_rename_end:(%S)", "cross device");
2142 		return;
2143 	}
2144 	exi_rele(to_exi);
2145 
2146 	if (to_exi != exi) {
2147 		VN_RELE(fromvp);
2148 		*status = NFSERR_XDEV;
2149 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2150 			"rfs_rename_end:(%S)", "cross device");
2151 		return;
2152 	}
2153 
2154 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2155 	if (tovp == NULL) {
2156 		VN_RELE(fromvp);
2157 		*status = NFSERR_STALE;
2158 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2159 			"rfs_rename_end:(%S)", "to stale");
2160 		return;
2161 	}
2162 
2163 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2164 		VN_RELE(tovp);
2165 		VN_RELE(fromvp);
2166 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2167 			"rfs_rename_end:(%S)", "not dir");
2168 		*status = NFSERR_NOTDIR;
2169 		return;
2170 	}
2171 
2172 	/*
2173 	 * Disallow NULL paths
2174 	 */
2175 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2176 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2177 		VN_RELE(tovp);
2178 		VN_RELE(fromvp);
2179 		*status = NFSERR_ACCES;
2180 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2181 			"rfs_rename_end:(%S)", "access");
2182 		return;
2183 	}
2184 
2185 	if (rdonly(exi, req)) {
2186 		VN_RELE(tovp);
2187 		VN_RELE(fromvp);
2188 		*status = NFSERR_ROFS;
2189 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2190 			"rfs_rename_end:(%S)", "rofs");
2191 		return;
2192 	}
2193 
2194 	/*
2195 	 * Check for a conflict with a non-blocking mandatory share reservation.
2196 	 */
2197 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2198 			NULL, cr);
2199 	if (error != 0) {
2200 		VN_RELE(tovp);
2201 		VN_RELE(fromvp);
2202 		*status = puterrno(error);
2203 		return;
2204 	}
2205 
2206 	/* Check for delegations on the source file */
2207 
2208 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2209 		VN_RELE(tovp);
2210 		VN_RELE(fromvp);
2211 		VN_RELE(srcvp);
2212 		curthread->t_flag |= T_WOULDBLOCK;
2213 		return;
2214 	}
2215 
2216 	/* Check for delegation on the file being renamed over, if it exists */
2217 
2218 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2219 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
2220 	    == 0) {
2221 
2222 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2223 			VN_RELE(tovp);
2224 			VN_RELE(fromvp);
2225 			VN_RELE(srcvp);
2226 			VN_RELE(targvp);
2227 			curthread->t_flag |= T_WOULDBLOCK;
2228 			return;
2229 		}
2230 		VN_RELE(targvp);
2231 	}
2232 
2234 	if (nbl_need_check(srcvp)) {
2235 		nbl_start_crit(srcvp, RW_READER);
2236 		in_crit = 1;
2237 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
2238 			error = EACCES;
2239 			goto out;
2240 		}
2241 	}
2242 
2243 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
2244 		"vop_rename_start:");
2245 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2246 	    tovp, args->rna_to.da_name, cr);
2247 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
2248 		"vop_rename_end:");
2249 
2250 	/*
2251 	 * Force modified data and metadata out to stable storage.
2252 	 */
2253 	(void) VOP_FSYNC(tovp, 0, cr);
2254 	(void) VOP_FSYNC(fromvp, 0, cr);
2255 
2256 out:
2257 	if (in_crit)
2258 		nbl_end_crit(srcvp);
2259 	VN_RELE(srcvp);
2260 	VN_RELE(tovp);
2261 	VN_RELE(fromvp);
2262 
2263 	*status = puterrno(error);
2264 
2265 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2266 		"rfs_rename_end:(%S)", "done");
2267 }
2268 fhandle_t *
2269 rfs_rename_getfh(struct nfsrnmargs *args)
2270 {
2271 	return (args->rna_from.da_fhandle);
2272 }
2273 
2274 /*
2275  * Link to a file.
2276  * Create a file (to) which is a hard link to the given file (from).
2277  */
2278 void
2279 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2280 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2281 {
2282 	int error;
2283 	vnode_t *fromvp;
2284 	vnode_t *tovp;
2285 	struct exportinfo *to_exi;
2286 	fhandle_t *fh;
2287 
2288 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
2289 		"rfs_link_start:");
2290 
2291 	fromvp = nfs_fhtovp(args->la_from, exi);
2292 	if (fromvp == NULL) {
2293 		*status = NFSERR_STALE;
2294 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2295 			"rfs_link_end:(%S)", "from stale");
2296 		return;
2297 	}
2298 
2299 	fh = args->la_to.da_fhandle;
2300 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2301 	if (to_exi == NULL) {
2302 		VN_RELE(fromvp);
2303 		*status = NFSERR_ACCES;
2304 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2305 			"rfs_link_end:(%S)", "cross device");
2306 		return;
2307 	}
2308 	exi_rele(to_exi);
2309 
2310 	if (to_exi != exi) {
2311 		VN_RELE(fromvp);
2312 		*status = NFSERR_XDEV;
2313 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2314 			"rfs_link_end:(%S)", "cross device");
2315 		return;
2316 	}
2317 
2318 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2319 	if (tovp == NULL) {
2320 		VN_RELE(fromvp);
2321 		*status = NFSERR_STALE;
2322 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2323 			"rfs_link_end:(%S)", "to stale");
2324 		return;
2325 	}
2326 
2327 	if (tovp->v_type != VDIR) {
2328 		VN_RELE(tovp);
2329 		VN_RELE(fromvp);
2330 		*status = NFSERR_NOTDIR;
2331 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2332 			"rfs_link_end:(%S)", "not dir");
2333 		return;
2334 	}
2335 	/*
2336 	 * Disallow NULL paths
2337 	 */
2338 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2339 		VN_RELE(tovp);
2340 		VN_RELE(fromvp);
2341 		*status = NFSERR_ACCES;
2342 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2343 			"rfs_link_end:(%S)", "access");
2344 		return;
2345 	}
2346 
2347 	if (rdonly(exi, req)) {
2348 		VN_RELE(tovp);
2349 		VN_RELE(fromvp);
2350 		*status = NFSERR_ROFS;
2351 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2352 			"rfs_link_end:(%S)", "rofs");
2353 		return;
2354 	}
2355 
2356 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
2357 		"vop_link_start:");
2358 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2359 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
2360 		"vop_link_end:");
2361 
2362 	/*
2363 	 * Force modified data and metadata out to stable storage.
2364 	 */
2365 	(void) VOP_FSYNC(tovp, 0, cr);
2366 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2367 
2368 	VN_RELE(tovp);
2369 	VN_RELE(fromvp);
2370 
2371 	*status = puterrno(error);
2372 
2373 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2374 		"rfs_link_end:(%S)", "done");
2375 }
2376 fhandle_t *
2377 rfs_link_getfh(struct nfslinkargs *args)
2378 {
2379 	return (args->la_from);
2380 }
2381 
2382 /*
2383  * Symbolically link to a file.
2384  * Create a file (from) with the given attributes which is a symbolic
2385  * link to the given path name (to).
2386  */
2387 void
2388 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2389 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2390 {
2391 	int error;
2392 	struct vattr va;
2393 	vnode_t *vp;
2394 	vnode_t *svp;
2395 	int lerror;
2396 
2397 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
2398 		"rfs_symlink_start:");
2399 
2400 	/*
2401 	 * Disallow NULL paths
2402 	 */
2403 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2404 		*status = NFSERR_ACCES;
2405 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2406 			"rfs_symlink_end:(%S)", "access");
2407 		return;
2408 	}
2409 
2410 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2411 	if (vp == NULL) {
2412 		*status = NFSERR_STALE;
2413 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2414 			"rfs_symlink_end:(%S)", "stale");
2415 		return;
2416 	}
2417 
2418 	if (rdonly(exi, req)) {
2419 		VN_RELE(vp);
2420 		*status = NFSERR_ROFS;
2421 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2422 			"rfs_symlink_end:(%S)", "rofs");
2423 		return;
2424 	}
2425 
2426 	error = sattr_to_vattr(args->sla_sa, &va);
2427 	if (error) {
2428 		VN_RELE(vp);
2429 		*status = puterrno(error);
2430 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2431 			"rfs_symlink_end:(%S)", "sattr");
2432 		return;
2433 	}
2434 
2435 	if (!(va.va_mask & AT_MODE)) {
2436 		VN_RELE(vp);
2437 		*status = NFSERR_INVAL;
2438 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2439 			"rfs_symlink_end:(%S)", "no mode");
2440 		return;
2441 	}
2442 
2443 	va.va_type = VLNK;
2444 	va.va_mask |= AT_TYPE;
2445 
2446 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
2447 		"vop_symlink_start:");
2448 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
2449 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
2450 		"vop_symlink_end:");
2451 
2452 	/*
2453 	 * Look up the new symlink so its data and metadata can be
	 * forced out to stable storage.
2454 	 */
2455 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
2456 		"vop_lookup_start:");
2457 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2458 	    0, NULL, cr);
2459 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
2460 		"vop_lookup_end:");
2461 	if (!lerror) {
2462 		VN_SETPATH(rootdir, vp, svp, args->sla_from.da_name,
2463 			strlen(args->sla_from.da_name));
2464 		(void) VOP_FSYNC(svp, 0, cr);
2465 		VN_RELE(svp);
2466 	}
2467 
2468 	/*
2469 	 * Force modified data and metadata out to stable storage.
2470 	 */
2471 	(void) VOP_FSYNC(vp, 0, cr);
2472 
2473 	VN_RELE(vp);
2474 
2475 	*status = puterrno(error);
2476 
2477 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2478 		"rfs_symlink_end:(%S)", "done");
2479 }
2480 fhandle_t *
2481 rfs_symlink_getfh(struct nfsslargs *args)
2482 {
2483 	return (args->sla_from.da_fhandle);
2484 }
2485 
2486 /*
2487  * Make a directory.
2488  * Create a directory with the given name, parent directory, and attributes.
2489  * Returns a file handle and attributes for the new directory.
2490  */
2491 void
2492 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2493 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2494 {
2495 	int error;
2496 	struct vattr va;
2497 	vnode_t *dvp = NULL;
2498 	vnode_t *vp;
2499 	char *name = args->ca_da.da_name;
2500 
2501 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
2502 		"rfs_mkdir_start:");
2503 
2504 	/*
2505 	 * Disallow NULL paths
2506 	 */
2507 	if (name == NULL || *name == '\0') {
2508 		dr->dr_status = NFSERR_ACCES;
2509 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2510 			"rfs_mkdir_end:(%S)", "access");
2511 		return;
2512 	}
2513 
2514 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2515 	if (vp == NULL) {
2516 		dr->dr_status = NFSERR_STALE;
2517 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2518 			"rfs_mkdir_end:(%S)", "stale");
2519 		return;
2520 	}
2521 
2522 	if (rdonly(exi, req)) {
2523 		VN_RELE(vp);
2524 		dr->dr_status = NFSERR_ROFS;
2525 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2526 			"rfs_mkdir_end:(%S)", "rofs");
2527 		return;
2528 	}
2529 
2530 	error = sattr_to_vattr(args->ca_sa, &va);
2531 	if (error) {
2532 		VN_RELE(vp);
2533 		dr->dr_status = puterrno(error);
2534 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2535 			"rfs_mkdir_end:(%S)", "sattr");
2536 		return;
2537 	}
2538 
2539 	if (!(va.va_mask & AT_MODE)) {
2540 		VN_RELE(vp);
2541 		dr->dr_status = NFSERR_INVAL;
2542 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2543 			"rfs_mkdir_end:(%S)", "no mode");
2544 		return;
2545 	}
2546 
2547 	va.va_type = VDIR;
2548 	va.va_mask |= AT_TYPE;
2549 
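	/*
	 * Note the naming here: vp is the parent directory and VOP_MKDIR
	 * returns the newly created directory's vnode in dvp.
	 */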
2550 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
2551 		"vop_mkdir_start:");
2552 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2553 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
2554 		"vop_mkdir_end:");
2555 
2556 	if (!error) {
2557 		/*
2558 		 * Attributes of the newly created directory should
2559 		 * be returned to the client.
2560 		 */
2561 		va.va_mask = AT_ALL; /* We want everything */
2562 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
2563 			"vop_getattr_start:");
2564 		error = VOP_GETATTR(dvp, &va, 0, cr);
2565 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
2566 			"vop_getattr_end:");
2567 		/* check for overflows */
2568 		if (!error) {
2569 			acl_perm(vp, exi, &va, cr);
2570 			error = vattr_to_nattr(&va, &dr->dr_attr);
2571 			if (!error) {
2572 				error = makefh(&dr->dr_fhandle, dvp, exi);
2573 			}
2574 		}
2575 		/*
2576 		 * Force new data and metadata out to stable storage.
2577 		 */
2578 		(void) VOP_FSYNC(dvp, 0, cr);
2579 		VN_RELE(dvp);
2580 	}
2581 
2582 	/*
2583 	 * Force modified data and metadata out to stable storage.
2584 	 */
2585 	(void) VOP_FSYNC(vp, 0, cr);
2586 
2587 	VN_RELE(vp);
2588 
2589 	dr->dr_status = puterrno(error);
2590 
2591 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2592 		"rfs_mkdir_end:(%S)", "done");
2593 }
2594 fhandle_t *
2595 rfs_mkdir_getfh(struct nfscreatargs *args)
2596 {
2597 	return (args->ca_da.da_fhandle);
2598 }
2599 
2600 /*
2601  * Remove a directory.
2602  * Remove the given directory name from the given parent directory.
2603  */
2604 void
2605 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2606 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2607 {
2608 	int error;
2609 	vnode_t *vp;
2610 
2611 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
2612 		"rfs_rmdir_start:");
2613 
2614 	/*
2615 	 * Disallow NULL paths
2616 	 */
2617 	if (da->da_name == NULL || *da->da_name == '\0') {
2618 		*status = NFSERR_ACCES;
2619 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2620 			"rfs_rmdir_end:(%S)", "access");
2621 		return;
2622 	}
2623 
2624 	vp = nfs_fhtovp(da->da_fhandle, exi);
2625 	if (vp == NULL) {
2626 		*status = NFSERR_STALE;
2627 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2628 			"rfs_rmdir_end:(%S)", "stale");
2629 		return;
2630 	}
2631 
2632 	if (rdonly(exi, req)) {
2633 		VN_RELE(vp);
2634 		*status = NFSERR_ROFS;
2635 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2636 			"rfs_rmdir_end:(%S)", "rofs");
2637 		return;
2638 	}
2639 
2640 	/*
2641 	 * VOP_RMDIR now takes a new third argument (the current
2642 	 * directory of the process).  That's because someone
2643 	 * wants to return EINVAL if one tries to remove ".".
2644 	 * Of course, NFS servers have no idea what their
2645 	 * clients' current directories are.  We fake it by
2646 	 * supplying a vnode known to exist and illegal to
2647 	 * remove.
2648 	 */
2649 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
2650 		"vop_rmdir_start:");
2651 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
2652 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
2653 		"vop_rmdir_end:");
2654 
2655 	/*
2656 	 * Force modified data and metadata out to stable storage.
2657 	 */
2658 	(void) VOP_FSYNC(vp, 0, cr);
2659 
2660 	VN_RELE(vp);
2661 
2662 	/*
2663 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2664 	 * if the directory is not empty.  A System V NFS server
2665 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2666 	 * over the wire.
2667 	 */
2668 	if (error == EEXIST)
2669 		*status = NFSERR_NOTEMPTY;
2670 	else
2671 		*status = puterrno(error);
2672 
2673 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2674 		"rfs_rmdir_end:(%S)", "done");
2675 }
2676 fhandle_t *
2677 rfs_rmdir_getfh(struct nfsdiropargs *da)
2678 {
2679 	return (da->da_fhandle);
2680 }
2681 
2682 /* ARGSUSED */
2683 void
2684 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2685 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2686 {
2687 	int error;
2688 	int iseof;
2689 	struct iovec iov;
2690 	struct uio uio;
2691 	vnode_t *vp;
2692 
2693 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
2694 		"rfs_readdir_start:");
2695 
2696 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2697 	if (vp == NULL) {
2698 		rd->rd_entries = NULL;
2699 		rd->rd_status = NFSERR_STALE;
2700 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2701 			"rfs_readdir_end:(%S)", "stale");
2702 		return;
2703 	}
2704 
2705 	if (vp->v_type != VDIR) {
2706 		VN_RELE(vp);
2707 		rd->rd_entries = NULL;
2708 		rd->rd_status = NFSERR_NOTDIR;
2709 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2710 			"rfs_readdir_end:(%S)", "notdir");
2711 		return;
2712 	}
2713 
2714 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
2715 		"vop_rwlock_start:");
2716 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2717 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
2718 		"vop_rwlock_end:");
2719 
2720 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
2721 		"vop_access_start:");
2722 	error = VOP_ACCESS(vp, VREAD, 0, cr);
2723 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
2724 		"vop_access_end:");
2725 	if (error) {
2726 		rd->rd_entries = NULL;
2727 		goto bad;
2728 	}
2729 
2730 	if (rda->rda_count == 0) {
2731 		rd->rd_entries = NULL;
2732 		rd->rd_size = 0;
2733 		rd->rd_eof = FALSE;
2734 		goto bad;
2735 	}
2736 
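	/*
	 * Don't read more than the maximum NFSv2 transfer size.
	 */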
2737 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2738 
2739 	/*
2740 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2741 	 */
2742 	rd->rd_bufsize = (uint_t)rda->rda_count;
2743 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2744 
2745 	/*
2746 	 * Set up io vector to read directory data
2747 	 */
2748 	iov.iov_base = (caddr_t)rd->rd_entries;
2749 	iov.iov_len = rda->rda_count;
2750 	uio.uio_iov = &iov;
2751 	uio.uio_iovcnt = 1;
2752 	uio.uio_segflg = UIO_SYSSPACE;
2753 	uio.uio_extflg = UIO_COPY_CACHED;
2754 	uio.uio_loffset = (offset_t)rda->rda_offset;
2755 	uio.uio_resid = rda->rda_count;
2756 
2757 	/*
2758 	 * read directory
2759 	 */
2760 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
2761 		"vop_readdir_start:");
2762 	error = VOP_READDIR(vp, &uio, cr, &iseof);
2763 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
2764 		"vop_readdir_end:");
2765 
2766 	/*
2767 	 * Clean up
2768 	 */
2769 	if (!error) {
2770 		/*
2771 		 * Set size and eof.  If the read consumed no data, the
		 * offset was already at end-of-directory.
2772 		 */
2773 		if (uio.uio_resid == rda->rda_count) {
2774 			rd->rd_size = 0;
2775 			rd->rd_eof = TRUE;
2776 		} else {
2777 			rd->rd_size = (uint32_t)(rda->rda_count -
2778 			    uio.uio_resid);
2779 			rd->rd_eof = iseof ? TRUE : FALSE;
2780 		}
2781 	}
2782 
2783 bad:
2784 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
2785 		"vop_rwunlock_start:");
2786 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2787 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
2788 		"vop_rwunlock_end:");
2789 
2790 #if 0 /* notyet */
2791 	/*
2792 	 * Don't do this.  It causes local disk writes when just
2793 	 * reading the file and the overhead is deemed larger
2794 	 * than the benefit.
2795 	 */
2796 	/*
2797 	 * Force modified metadata out to stable storage.
2798 	 */
2799 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
2800 #endif
2801 
2802 	VN_RELE(vp);
2803 
2804 	rd->rd_status = puterrno(error);
2805 
2806 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2807 		"rfs_readdir_end:(%S)", "done");
2808 }
2809 fhandle_t *
2810 rfs_readdir_getfh(struct nfsrddirargs *rda)
2811 {
2812 	return (&rda->rda_fh);
2813 }
2814 void
2815 rfs_rddirfree(struct nfsrddirres *rd)
2816 {
2817 	if (rd->rd_entries != NULL)
2818 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2819 }
2820 
2821 /* ARGSUSED */
2822 void
2823 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2824 	struct svc_req *req, cred_t *cr)
2825 {
2826 	int error;
2827 	struct statvfs64 sb;
2828 	vnode_t *vp;
2829 
2830 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
2831 		"rfs_statfs_start:");
2832 
2833 	vp = nfs_fhtovp(fh, exi);
2834 	if (vp == NULL) {
2835 		fs->fs_status = NFSERR_STALE;
2836 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2837 			"rfs_statfs_end:(%S)", "stale");
2838 		return;
2839 	}
2840 
2841 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2842 
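	/*
	 * Translate the statvfs64 into the NFSv2 reply.  The preferred
	 * transfer size comes from nfstsize(), and the block counts are
	 * reported in units of f_frsize, which is returned as fs_bsize.
	 */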
2843 	if (!error) {
2844 		fs->fs_tsize = nfstsize();
2845 		fs->fs_bsize = sb.f_frsize;
2846 		fs->fs_blocks = sb.f_blocks;
2847 		fs->fs_bfree = sb.f_bfree;
2848 		fs->fs_bavail = sb.f_bavail;
2849 	}
2850 
2851 	VN_RELE(vp);
2852 
2853 	fs->fs_status = puterrno(error);
2854 
2855 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2856 		"rfs_statfs_end:(%S)", "done");
2857 }
2858 fhandle_t *
2859 rfs_statfs_getfh(fhandle_t *fh)
2860 {
2861 	return (fh);
2862 }
2863 
2864 static int
2865 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2866 {
2867 	vap->va_mask = 0;
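
	/*
	 * NFSv2 marks an attribute "do not set" by sending all ones in
	 * the field; copy only the fields the client actually supplied
	 * and set the corresponding bits in va_mask.
	 */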
2868 
2869 	/*
2870 	 * There was a sign extension bug in some VFS based systems
2871 	 * which stored the mode as a short.  When it would get
2872 	 * assigned to a u_long, no sign extension would occur.
2873 	 * It needed to, but this wasn't noticed because sa_mode
2874 	 * would then get assigned back to the short, thus ignoring
2875 	 * the upper 16 bits of sa_mode.
2876 	 *
2877 	 * To make this implementation work for both broken
2878 	 * clients and good clients, we check for both versions
2879 	 * of the mode.
2880 	 */
2881 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2882 	    sa->sa_mode != (uint32_t)-1) {
2883 		vap->va_mask |= AT_MODE;
2884 		vap->va_mode = sa->sa_mode;
2885 	}
2886 	if (sa->sa_uid != (uint32_t)-1) {
2887 		vap->va_mask |= AT_UID;
2888 		vap->va_uid = sa->sa_uid;
2889 	}
2890 	if (sa->sa_gid != (uint32_t)-1) {
2891 		vap->va_mask |= AT_GID;
2892 		vap->va_gid = sa->sa_gid;
2893 	}
2894 	if (sa->sa_size != (uint32_t)-1) {
2895 		vap->va_mask |= AT_SIZE;
2896 		vap->va_size = sa->sa_size;
2897 	}
2898 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2899 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2900 #ifndef _LP64
2901 		/* return error if time overflow */
2902 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2903 			return (EOVERFLOW);
2904 #endif
2905 		vap->va_mask |= AT_ATIME;
2906 		/*
2907 		 * nfs protocol defines times as unsigned so don't extend sign,
2908 		 * unless sysadmin set nfs_allow_preepoch_time.
2909 		 */
2910 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2911 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2912 	}
2913 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2914 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2915 #ifndef _LP64
2916 		/* return error if time overflow */
2917 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2918 			return (EOVERFLOW);
2919 #endif
2920 		vap->va_mask |= AT_MTIME;
2921 		/*
2922 		 * nfs protocol defines times as unsigned so don't extend sign,
2923 		 * unless sysadmin set nfs_allow_preepoch_time.
2924 		 */
2925 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2926 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2927 	}
2928 	return (0);
2929 }
2930 
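/*
 * Map vnode types (VNON through VBAD) to the over-the-wire NFSv2 file
 * types; vnode types with no NFSv2 equivalent map to 0.
 */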
2931 static enum nfsftype vt_to_nf[] = {
2932 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2933 };
2934 
2935 /*
2936  * check the following fields for overflow: nodeid, size, and time.
2937  * There could be a problem when converting 64-bit LP64 fields
2938  * into 32-bit ones.  Return an error if there is an overflow.
2939  */
2940 int
2941 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2942 {
2943 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2944 	na->na_type = vt_to_nf[vap->va_type];
2945 
2946 	if (vap->va_mode == (unsigned short) -1)
2947 		na->na_mode = (uint32_t)-1;
2948 	else
2949 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2950 
2951 	if (vap->va_uid == (unsigned short)(-1))
2952 		na->na_uid = (uint32_t)(-1);
2953 	else if (vap->va_uid == UID_NOBODY)
2954 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2955 	else
2956 		na->na_uid = vap->va_uid;
2957 
2958 	if (vap->va_gid == (unsigned short)(-1))
2959 		na->na_gid = (uint32_t)-1;
2960 	else if (vap->va_gid == GID_NOBODY)
2961 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2962 	else
2963 		na->na_gid = vap->va_gid;
2964 
2965 	/*
2966 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2967 	 * vattr, but are values bigger than 32 bits supported?
2968 	 */
2969 	na->na_fsid = vap->va_fsid;
2970 
2971 	na->na_nodeid = vap->va_nodeid;
2972 
2973 	/*
2974 	 * Check to make sure that the nodeid is representable over the
2975 	 * wire without losing bits.
2976 	 */
2977 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2978 		return (EFBIG);
2979 	na->na_nlink = vap->va_nlink;
2980 
2981 	/*
2982 	 * Check for big files here, instead of at the caller.  See
2983 	 * comments in cstat for large special file explanation.
2984 	 */
2985 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2986 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2987 			return (EFBIG);
2988 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2989 			/* UNKNOWN_SIZE | OVERFLOW */
2990 			na->na_size = MAXOFF32_T;
2991 		} else
2992 			na->na_size = vap->va_size;
2993 	} else
2994 		na->na_size = vap->va_size;
2995 
2996 	/*
2997 	 * If the vnode times overflow the 32-bit times that NFS2
2998 	 * uses on the wire then return an error.
2999 	 */
3000 	if (!NFS_VAP_TIME_OK(vap)) {
3001 		return (EOVERFLOW);
3002 	}
3003 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
3004 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3005 
3006 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3007 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3008 
3009 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3010 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3011 
3012 	/*
3013 	 * If the dev_t will fit into 16 bits then compress
3014 	 * it, otherwise leave it alone. See comments in
3015 	 * nfs_client.c.
3016 	 */
3017 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3018 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3019 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3020 	else
3021 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
3022 
3023 	na->na_blocks = vap->va_nblocks;
3024 	na->na_blocksize = vap->va_blksize;
3025 
3026 	/*
3027 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3028 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
3029 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3030 	 *
3031 	 * BUYER BEWARE:
3032 	 *  If you are porting the NFS to a non-Sun server, you probably
3033 	 *  don't want to include the following block of code.  The
3034 	 *  over-the-wire special file types will be changing with the
3035 	 *  NFS Protocol Revision.
3036 	 */
3037 	if (vap->va_type == VFIFO)
3038 		NA_SETFIFO(na);
3039 	return (0);
3040 }
3041 
3042 /*
3043  * acl v2 support: returns approximate permission.
3044  *	default: returns minimal permission (more restrictive)
3045  *	aclok: returns maximal permission (less restrictive)
3046  *	This routine changes the permissions that are already in *va.
3047  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3048  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3049  */
3050 static void
3051 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3052 {
3053 	vsecattr_t	vsa;
3054 	int		aclcnt;
3055 	aclent_t	*aclentp;
3056 	mode_t		mask_perm;
3057 	mode_t		grp_perm;
3058 	mode_t		other_perm;
3059 	mode_t		other_orig;
3060 	int		error;
3061 
3062 	/* we don't care about the default ACL */
3063 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3064 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
3065 
3066 	if (!error) {
3067 		aclcnt = vsa.vsa_aclcnt;
3068 		if (aclcnt > MIN_ACL_ENTRIES) {
3069 			/* non-trivial ACL */
3070 			aclentp = vsa.vsa_aclentp;
3071 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3072 				/* maximal permissions */
3073 				grp_perm = 0;
3074 				other_perm = 0;
3075 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3076 					switch (aclentp->a_type) {
3077 					case USER_OBJ:
3078 						break;
3079 					case USER:
3080 						grp_perm |=
3081 						    aclentp->a_perm << 3;
3082 						other_perm |= aclentp->a_perm;
3083 						break;
3084 					case GROUP_OBJ:
3085 						grp_perm |=
3086 						    aclentp->a_perm << 3;
3087 						break;
3088 					case GROUP:
3089 						other_perm |= aclentp->a_perm;
3090 						break;
3091 					case OTHER_OBJ:
3092 						other_orig = aclentp->a_perm;
3093 						break;
3094 					case CLASS_OBJ:
3095 						mask_perm = aclentp->a_perm;
3096 						break;
3097 					default:
3098 						break;
3099 					}
3100 				}
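				/*
				 * Apply the CLASS_OBJ mask to the group
				 * bits and to the USER/GROUP bits gathered
				 * in other_perm, then add back the unmasked
				 * OTHER_OBJ bits.
				 */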
3101 				grp_perm &= mask_perm << 3;
3102 				other_perm &= mask_perm;
3103 				other_perm |= other_orig;
3104 
3105 			} else {
3106 				/* minimal permissions */
3107 				grp_perm = 070;
3108 				other_perm = 07;
3109 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3110 					switch (aclentp->a_type) {
3111 					case USER_OBJ:
3112 						break;
3113 					case USER:
3114 					case CLASS_OBJ:
3115 						grp_perm &=
3116 						    aclentp->a_perm << 3;
3117 						other_perm &=
3118 						    aclentp->a_perm;
3119 						break;
3120 					case GROUP_OBJ:
3121 						grp_perm &=
3122 						    aclentp->a_perm << 3;
3123 						break;
3124 					case GROUP:
3125 						other_perm &=
3126 						    aclentp->a_perm;
3127 						break;
3128 					case OTHER_OBJ:
3129 						other_perm &=
3130 						    aclentp->a_perm;
3131 						break;
3132 					default:
3133 						break;
3134 					}
3135 				}
3136 			}
3137 			/* copy to va */
3138 			va->va_mode &= ~077;
3139 			va->va_mode |= grp_perm | other_perm;
3140 		}
3141 		if (vsa.vsa_aclcnt)
3142 			kmem_free(vsa.vsa_aclentp,
3143 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3144 	}
3145 }
3146 
3147 void
3148 rfs_srvrinit(void)
3149 {
3150 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3151 }
3152 
3153 void
3154 rfs_srvrfini(void)
3155 {
3156 	mutex_destroy(&rfs_async_write_lock);
3157 }
3158