xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs_srv.c (revision dc5a8425272d2602e4c21b95b9eeac2b897f45a1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All rights reserved.
30  */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/buf.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/uio.h>
42 #include <sys/stat.h>
43 #include <sys/errno.h>
44 #include <sys/sysmacros.h>
45 #include <sys/statvfs.h>
46 #include <sys/kmem.h>
47 #include <sys/kstat.h>
48 #include <sys/dirent.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/vtrace.h>
52 #include <sys/mode.h>
53 #include <sys/acl.h>
54 #include <sys/nbmlock.h>
55 #include <sys/policy.h>
56 
57 #include <rpc/types.h>
58 #include <rpc/auth.h>
59 #include <rpc/svc.h>
60 
61 #include <nfs/nfs.h>
62 #include <nfs/export.h>
63 
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_kmem.h>
69 
70 #include <sys/strsubr.h>
71 
72 /*
73  * These are the interface routines for the server side of the
74  * Network File System.  See the NFS version 2 protocol specification
75  * for a description of this interface.
76  */
77 
78 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
79 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
80 			cred_t *);
81 
82 /*
83  * Some "over the wire" UNIX file types.  These are encoded
84  * into the mode.  This needs to be fixed in the next rev.
85  */
86 #define	IFMT		0170000		/* type of file */
87 #define	IFCHR		0020000		/* character special */
88 #define	IFBLK		0060000		/* block special */
89 #define	IFSOCK		0140000		/* socket */
90 
91 /*
92  * Get file attributes.
93  * Returns the current attributes of the file with the given fhandle.
94  */
95 /* ARGSUSED */
96 void
97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
98 	struct svc_req *req, cred_t *cr)
99 {
100 	int error;
101 	vnode_t *vp;
102 	struct vattr va;
103 
104 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
105 		"rfs_getattr_start:");
106 
107 	vp = nfs_fhtovp(fhp, exi);
108 	if (vp == NULL) {
109 		ns->ns_status = NFSERR_STALE;
110 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
111 			"rfs_getattr_end:(%S)", "stale");
112 		return;
113 	}
114 
115 	/*
116 	 * Do the getattr.
117 	 */
118 	va.va_mask = AT_ALL;	/* we want all the attributes */
119 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
120 		"vop_getattr_start:");
121 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
122 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
123 		"vop_getattr_end:");
124 
125 	/* check for overflows */
126 	if (!error) {
127 		acl_perm(vp, exi, &va, cr);
128 		error = vattr_to_nattr(&va, &ns->ns_attr);
129 	}
130 
131 	VN_RELE(vp);
132 
133 	ns->ns_status = puterrno(error);
134 
135 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
136 		"rfs_getattr_end:(%S)", "done");
137 }
138 fhandle_t *
139 rfs_getattr_getfh(fhandle_t *fhp)
140 {
141 	return (fhp);
142 }
143 
144 /*
145  * Set file attributes.
146  * Sets the attributes of the file with the given fhandle.  Returns
147  * the new attributes.
148  */
149 void
150 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
151 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
152 {
153 	int error;
154 	int flag;
155 	int in_crit = 0;
156 	vnode_t *vp;
157 	struct vattr va;
158 	struct vattr bva;
159 	struct flock64 bf;
160 
161 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
162 		"rfs_setattr_start:");
163 
164 	vp = nfs_fhtovp(&args->saa_fh, exi);
165 	if (vp == NULL) {
166 		ns->ns_status = NFSERR_STALE;
167 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
168 			"rfs_setattr_end:(%S)", "stale");
169 		return;
170 	}
171 
172 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
173 		VN_RELE(vp);
174 		ns->ns_status = NFSERR_ROFS;
175 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
176 			"rfs_setattr_end:(%S)", "rofs");
177 		return;
178 	}
179 
180 	error = sattr_to_vattr(&args->saa_sa, &va);
181 	if (error) {
182 		VN_RELE(vp);
183 		ns->ns_status = puterrno(error);
184 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
185 			"rfs_setattr_end:(%S)", "sattr");
186 		return;
187 	}
188 
189 	/*
190 	 * If the client is requesting a change to the mtime,
191 	 * but the nanosecond field is set to 1 billion, then
192 	 * this is a flag to the server that it should set the
193 	 * atime and mtime fields to the server's current time.
194 	 * The 1 billion number actually came from the client
195 	 * as 1 million, but the units in the over the wire
196 	 * request are microseconds instead of nanoseconds.
197 	 *
198 	 * This is an overload of the protocol and should be
199 	 * documented in the NFS Version 2 protocol specification.
200 	 */
201 	if (va.va_mask & AT_MTIME) {
202 		if (va.va_mtime.tv_nsec == 1000000000) {
203 			gethrestime(&va.va_mtime);
204 			va.va_atime = va.va_mtime;
205 			va.va_mask |= AT_ATIME;
206 			flag = 0;
207 		} else
208 			flag = ATTR_UTIME;
209 	} else
210 		flag = 0;
211 
212 	/*
213 	 * If the filesystem is exported with nosuid, then mask off
214 	 * the setuid and setgid bits.
215 	 */
216 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
217 	    (exi->exi_export.ex_flags & EX_NOSUID))
218 		va.va_mode &= ~(VSUID | VSGID);
219 
220 	/*
221 	 * We need to specially handle size changes because it is
222 	 * possible for the client to create a file with modes
223 	 * which indicate read-only, but with the file opened for
224 	 * writing.  If the client then tries to set the size of
225 	 * the file, then the normal access checking done in
226 	 * VOP_SETATTR would prevent the client from doing so,
227 	 * although it should be legal for it to do so.  To get
228 	 * around this, we do the access checking for ourselves
229 	 * and then use VOP_SPACE which doesn't do the access
230 	 * checking which VOP_SETATTR does. VOP_SPACE can only
231 	 * operate on VREG files, let VOP_SETATTR handle the other
232 	 * extremely rare cases.
233 	 * Also the client should not be allowed to change the
234 	 * size of the file if there is a conflicting non-blocking
235 	 * mandatory lock in the region of change.
236 	 *
237 	 * Also(2), check to see if the v4 side of the server has
238 	 * delegated this file.  If so, then we set T_WOULDBLOCK
239 	 * so that the dispatch function dosn't send a reply, forcing
240 	 * the client to retrasmit its request.
241 	 */
242 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
243 		/* If delegated, mark as wouldblock so response is dropped */
244 		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
245 			VN_RELE(vp);
246 			curthread->t_flag |= T_WOULDBLOCK;
247 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
248 				"rfs_setattr_end:(%S)", "delegated");
249 			return;
250 		}
251 		if (nbl_need_check(vp)) {
252 			nbl_start_crit(vp, RW_READER);
253 			in_crit = 1;
254 		}
255 
256 		bva.va_mask = AT_UID | AT_SIZE;
257 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
258 			"vop_getattr_start:");
259 		error = VOP_GETATTR(vp, &bva, 0, cr);
260 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
261 			"vop_getattr_end:");
262 		if (error) {
263 			if (in_crit)
264 				nbl_end_crit(vp);
265 			VN_RELE(vp);
266 			ns->ns_status = puterrno(error);
267 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
268 				"rfs_setattr_end:(%S)", "getattr");
269 			return;
270 		}
271 
272 		if (in_crit) {
273 			u_offset_t offset;
274 			ssize_t length;
275 
276 			if (va.va_size < bva.va_size) {
277 				offset = va.va_size;
278 				length = bva.va_size - va.va_size;
279 			} else {
280 				offset = bva.va_size;
281 				length = va.va_size - bva.va_size;
282 			}
283 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
284 				error = EACCES;
285 			}
286 		}
287 
288 		if (crgetuid(cr) == bva.va_uid && !error &&
289 			va.va_size != bva.va_size) {
290 			va.va_mask &= ~AT_SIZE;
291 			bf.l_type = F_WRLCK;
292 			bf.l_whence = 0;
293 			bf.l_start = (off64_t)va.va_size;
294 			bf.l_len = 0;
295 			bf.l_sysid = 0;
296 			bf.l_pid = 0;
297 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
298 				"vop_space_start:");
299 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
300 					(offset_t)va.va_size, cr, NULL);
301 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
302 				"vop_space_end:");
303 		}
304 		if (in_crit)
305 			nbl_end_crit(vp);
306 	} else
307 		error = 0;
308 
309 	/*
310 	 * Do the setattr.
311 	 */
312 	if (!error && va.va_mask) {
313 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
314 			"vop_setattr_start:");
315 		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
316 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
317 			"vop_setattr_end:");
318 	}
319 
320 	if (!error) {
321 		va.va_mask = AT_ALL;	/* get everything */
322 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
323 			"vop_getattr_start:");
324 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
325 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
326 			"vop_getattr_end:");
327 
328 		/* check for overflows */
329 		if (!error) {
330 			acl_perm(vp, exi, &va, cr);
331 			error = vattr_to_nattr(&va, &ns->ns_attr);
332 		}
333 	}
334 
335 	/*
336 	 * Force modified metadata out to stable storage.
337 	 */
338 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
339 
340 	VN_RELE(vp);
341 
342 	ns->ns_status = puterrno(error);
343 
344 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
345 		"rfs_setattr_end:(%S)", "done");
346 }
347 fhandle_t *
348 rfs_setattr_getfh(struct nfssaargs *args)
349 {
350 	return (&args->saa_fh);
351 }
352 
353 /*
354  * Directory lookup.
355  * Returns an fhandle and file attributes for file name in a directory.
356  */
357 /* ARGSUSED */
358 void
359 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
360 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
361 {
362 	int error;
363 	vnode_t *dvp;
364 	vnode_t *vp;
365 	struct vattr va;
366 	fhandle_t *fhp = da->da_fhandle;
367 	struct sec_ol sec = {0, 0};
368 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
369 
370 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
371 		"rfs_lookup_start:");
372 
373 	/*
374 	 * Disallow NULL paths
375 	 */
376 	if (da->da_name == NULL || *da->da_name == '\0') {
377 		dr->dr_status = NFSERR_ACCES;
378 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
379 			"rfs_lookup_end:(%S)", "access");
380 		return;
381 	}
382 
383 	/*
384 	 * Allow lookups from the root - the default
385 	 * location of the public filehandle.
386 	 */
387 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
388 		dvp = rootdir;
389 		VN_HOLD(dvp);
390 	} else {
391 		dvp = nfs_fhtovp(fhp, exi);
392 		if (dvp == NULL) {
393 			dr->dr_status = NFSERR_STALE;
394 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
395 				"rfs_lookup_end:(%S)", "stale");
396 			return;
397 		}
398 	}
399 
400 	/*
401 	 * Not allow lookup beyond root.
402 	 * If the filehandle matches a filehandle of the exi,
403 	 * then the ".." refers beyond the root of an exported filesystem.
404 	 */
405 	if (strcmp(da->da_name, "..") == 0 &&
406 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
407 		VN_RELE(dvp);
408 		dr->dr_status = NFSERR_NOENT;
409 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
410 			"rfs_lookup_end:(%S)", "noent");
411 		return;
412 	}
413 
414 	/*
415 	 * If the public filehandle is used then allow
416 	 * a multi-component lookup, i.e. evaluate
417 	 * a pathname and follow symbolic links if
418 	 * necessary.
419 	 *
420 	 * This may result in a vnode in another filesystem
421 	 * which is OK as long as the filesystem is exported.
422 	 */
423 	if (PUBLIC_FH2(fhp)) {
424 		publicfh_flag = TRUE;
425 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
426 					&sec);
427 	} else {
428 		/*
429 		 * Do a normal single component lookup.
430 		 */
431 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
432 			"vop_lookup_start:");
433 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
434 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
435 			"vop_lookup_end:");
436 	}
437 
438 	if (!error) {
439 		va.va_mask = AT_ALL;	/* we want everything */
440 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
441 			"vop_getattr_start:");
442 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
443 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
444 			"vop_getattr_end:");
445 		/* check for overflows */
446 		if (!error) {
447 			acl_perm(vp, exi, &va, cr);
448 			error = vattr_to_nattr(&va, &dr->dr_attr);
449 			if (!error) {
450 				if (sec.sec_flags & SEC_QUERY)
451 					error = makefh_ol(&dr->dr_fhandle, exi,
452 							sec.sec_index);
453 				else {
454 					error = makefh(&dr->dr_fhandle, vp,
455 								exi);
456 					if (!error && publicfh_flag &&
457 						!chk_clnt_sec(exi, req))
458 						auth_weak = TRUE;
459 				}
460 			}
461 		}
462 		VN_RELE(vp);
463 	}
464 
465 	VN_RELE(dvp);
466 
467 	/*
468 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
469 	 * and have obtained a new exportinfo in exi which needs to be
470 	 * released. Note the the original exportinfo pointed to by exi
471 	 * will be released by the caller, comon_dispatch.
472 	 */
473 	if (publicfh_flag && exi != NULL)
474 		exi_rele(exi);
475 
476 	/*
477 	 * If it's public fh, no 0x81, and client's flavor is
478 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
479 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
480 	 */
481 	if (auth_weak)
482 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
483 	else
484 		dr->dr_status = puterrno(error);
485 
486 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
487 		"rfs_lookup_end:(%S)", "done");
488 }
489 fhandle_t *
490 rfs_lookup_getfh(struct nfsdiropargs *da)
491 {
492 	return (da->da_fhandle);
493 }
494 
495 /*
496  * Read symbolic link.
497  * Returns the string in the symbolic link at the given fhandle.
498  */
499 /* ARGSUSED */
500 void
501 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
502 	struct svc_req *req, cred_t *cr)
503 {
504 	int error;
505 	struct iovec iov;
506 	struct uio uio;
507 	vnode_t *vp;
508 	struct vattr va;
509 
510 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
511 		"rfs_readlink_start:");
512 
513 	vp = nfs_fhtovp(fhp, exi);
514 	if (vp == NULL) {
515 		rl->rl_data = NULL;
516 		rl->rl_status = NFSERR_STALE;
517 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
518 			"rfs_readlink_end:(%S)", "stale");
519 		return;
520 	}
521 
522 	va.va_mask = AT_MODE;
523 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
524 		"vop_getattr_start:");
525 	error = VOP_GETATTR(vp, &va, 0, cr);
526 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
527 		"vop_getattr_end:");
528 
529 	if (error) {
530 		VN_RELE(vp);
531 		rl->rl_data = NULL;
532 		rl->rl_status = puterrno(error);
533 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
534 			"rfs_readlink_end:(%S)", "getattr error");
535 		return;
536 	}
537 
538 	if (MANDLOCK(vp, va.va_mode)) {
539 		VN_RELE(vp);
540 		rl->rl_data = NULL;
541 		rl->rl_status = NFSERR_ACCES;
542 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
543 			"rfs_readlink_end:(%S)", "access");
544 		return;
545 	}
546 
547 	/*
548 	 * XNFS and RFC1094 require us to return ENXIO if argument
549 	 * is not a link. BUGID 1138002.
550 	 */
551 	if (vp->v_type != VLNK) {
552 		VN_RELE(vp);
553 		rl->rl_data = NULL;
554 		rl->rl_status = NFSERR_NXIO;
555 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
556 			"rfs_readlink_end:(%S)", "nxio");
557 		return;
558 	}
559 
560 	/*
561 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
562 	 */
563 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
564 
565 	/*
566 	 * Set up io vector to read sym link data
567 	 */
568 	iov.iov_base = rl->rl_data;
569 	iov.iov_len = NFS_MAXPATHLEN;
570 	uio.uio_iov = &iov;
571 	uio.uio_iovcnt = 1;
572 	uio.uio_segflg = UIO_SYSSPACE;
573 	uio.uio_extflg = UIO_COPY_CACHED;
574 	uio.uio_loffset = (offset_t)0;
575 	uio.uio_resid = NFS_MAXPATHLEN;
576 
577 	/*
578 	 * Do the readlink.
579 	 */
580 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
581 		"vop_readlink_start:");
582 	error = VOP_READLINK(vp, &uio, cr);
583 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
584 		"vop_readlink_end:");
585 
586 #if 0 /* notyet */
587 	/*
588 	 * Don't do this.  It causes local disk writes when just
589 	 * reading the file and the overhead is deemed larger
590 	 * than the benefit.
591 	 */
592 	/*
593 	 * Force modified metadata out to stable storage.
594 	 */
595 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
596 #endif
597 
598 	VN_RELE(vp);
599 
600 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
601 
602 	/*
603 	 * XNFS and RFC1094 require us to return ENXIO if argument
604 	 * is not a link. UFS returns EINVAL if this is the case,
605 	 * so we do the mapping here. BUGID 1138002.
606 	 */
607 	if (error == EINVAL)
608 		rl->rl_status = NFSERR_NXIO;
609 	else
610 		rl->rl_status = puterrno(error);
611 
612 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
613 		"rfs_readlink_end:(%S)", "done");
614 }
615 fhandle_t *
616 rfs_readlink_getfh(fhandle_t *fhp)
617 {
618 	return (fhp);
619 }
620 /*
621  * Free data allocated by rfs_readlink
622  */
623 void
624 rfs_rlfree(struct nfsrdlnres *rl)
625 {
626 	if (rl->rl_data != NULL)
627 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
628 }
629 
630 /*
631  * Read data.
632  * Returns some data read from the file at the given fhandle.
633  */
634 /* ARGSUSED */
635 void
636 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
637 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
638 {
639 	vnode_t *vp;
640 	int error;
641 	struct vattr va;
642 	struct iovec iov;
643 	struct uio uio;
644 	mblk_t *mp;
645 	int alloc_err = 0;
646 	int in_crit = 0;
647 
648 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
649 		"rfs_read_start:");
650 
651 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
652 	if (vp == NULL) {
653 		rr->rr_data = NULL;
654 		rr->rr_status = NFSERR_STALE;
655 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
656 			"rfs_read_end:(%S)", "stale");
657 		return;
658 	}
659 
660 	if (vp->v_type != VREG) {
661 		VN_RELE(vp);
662 		rr->rr_data = NULL;
663 		rr->rr_status = NFSERR_ISDIR;
664 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
665 			"rfs_read_end:(%S)", "isdir");
666 		return;
667 	}
668 
669 	/*
670 	 * Check to see if the v4 side of the server has delegated
671 	 * this file.  If so, then we mark thread as wouldblock so
672 	 * the response is dropped.
673 	 */
674 	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
675 		VN_RELE(vp);
676 		curthread->t_flag |= T_WOULDBLOCK;
677 		rr->rr_data = NULL;
678 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
679 			"rfs_read_end:(%S)", "delegated");
680 		return;
681 	}
682 
683 	/*
684 	 * Enter the critical region before calling VOP_RWLOCK
685 	 * to avoid a deadlock with write requests.
686 	 */
687 	if (nbl_need_check(vp)) {
688 		nbl_start_crit(vp, RW_READER);
689 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
690 		    0)) {
691 			nbl_end_crit(vp);
692 			VN_RELE(vp);
693 			rr->rr_data = NULL;
694 			rr->rr_status = NFSERR_ACCES;
695 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
696 				"rfs_read_end:(%S)", " csf access error");
697 			return;
698 		}
699 		in_crit = 1;
700 	}
701 
702 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
703 		"vop_rwlock_start:");
704 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
705 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
706 		"vop_rwlock_end:");
707 
708 	va.va_mask = AT_ALL;
709 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
710 		"vop_getattr_start:");
711 	error = VOP_GETATTR(vp, &va, 0, cr);
712 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
713 		"vop_getattr_end:");
714 
715 	if (error) {
716 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
717 			"vop_rwunlock_start:");
718 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
719 		if (in_crit)
720 			nbl_end_crit(vp);
721 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
722 			"vop_rwunlock_end:");
723 		VN_RELE(vp);
724 		rr->rr_data = NULL;
725 		rr->rr_status = puterrno(error);
726 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
727 			"rfs_read_end:(%S)", "getattr error");
728 		return;
729 	}
730 
731 	/*
732 	 * This is a kludge to allow reading of files created
733 	 * with no read permission.  The owner of the file
734 	 * is always allowed to read it.
735 	 */
736 	if (crgetuid(cr) != va.va_uid) {
737 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
738 			"vop_access_start:");
739 		error = VOP_ACCESS(vp, VREAD, 0, cr);
740 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
741 			"vop_access_end:");
742 		if (error) {
743 			/*
744 			 * Exec is the same as read over the net because
745 			 * of demand loading.
746 			 */
747 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
748 				"vop_access_start:");
749 			error = VOP_ACCESS(vp, VEXEC, 0, cr);
750 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
751 				"vop_access_end:");
752 		}
753 		if (error) {
754 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
755 				"vop_rwunlock_start:");
756 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
757 			if (in_crit)
758 				nbl_end_crit(vp);
759 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
760 				"vop_rwunlock_end:");
761 			VN_RELE(vp);
762 			rr->rr_data = NULL;
763 			rr->rr_status = puterrno(error);
764 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
765 				"rfs_read_end:(%S)", "access error");
766 			return;
767 		}
768 	}
769 
770 	if (MANDLOCK(vp, va.va_mode)) {
771 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
772 			"vop_rwunlock_start:");
773 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
774 		if (in_crit)
775 			nbl_end_crit(vp);
776 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
777 			"vop_rwunlock_end:");
778 		VN_RELE(vp);
779 		rr->rr_data = NULL;
780 		rr->rr_status = NFSERR_ACCES;
781 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
782 			"rfs_read_end:(%S)", "mand lock");
783 		return;
784 	}
785 
786 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
787 		rr->rr_count = 0;
788 		rr->rr_data = NULL;
789 		/*
790 		 * In this case, status is NFS_OK, but there is no data
791 		 * to encode. So set rr_mp to NULL.
792 		 */
793 		rr->rr_mp = NULL;
794 		goto done;
795 	}
796 
797 	/*
798 	 * mp will contain the data to be sent out in the read reply.
799 	 * This will be freed after the reply has been sent out (by the
800 	 * driver).
801 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
802 	 * that the call to xdrmblk_putmblk() never fails.
803 	 */
804 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
805 	    &alloc_err);
806 	ASSERT(mp != NULL);
807 	ASSERT(alloc_err == 0);
808 
809 	rr->rr_mp = mp;
810 
811 	/*
812 	 * Set up io vector
813 	 */
814 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
815 	iov.iov_len = ra->ra_count;
816 	uio.uio_iov = &iov;
817 	uio.uio_iovcnt = 1;
818 	uio.uio_segflg = UIO_SYSSPACE;
819 	uio.uio_extflg = UIO_COPY_CACHED;
820 	uio.uio_loffset = (offset_t)ra->ra_offset;
821 	uio.uio_resid = ra->ra_count;
822 
823 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
824 		"vop_read_start:");
825 	error = VOP_READ(vp, &uio, 0, cr, NULL);
826 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
827 		"vop_read_end:");
828 
829 	if (error) {
830 		freeb(mp);
831 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
832 			"vop_rwunlock_start:");
833 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
834 		if (in_crit)
835 			nbl_end_crit(vp);
836 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
837 			"vop_rwunlock_end:");
838 		VN_RELE(vp);
839 		rr->rr_data = NULL;
840 		rr->rr_status = puterrno(error);
841 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
842 			"rfs_read_end:(%S)", "read error");
843 		return;
844 	}
845 
846 	/*
847 	 * Get attributes again so we can send the latest access
848 	 * time to the client side for his cache.
849 	 */
850 	va.va_mask = AT_ALL;
851 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
852 		"vop_getattr_start:");
853 	error = VOP_GETATTR(vp, &va, 0, cr);
854 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
855 		"vop_getattr_end:");
856 	if (error) {
857 		freeb(mp);
858 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
859 			"vop_rwunlock_start:");
860 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
861 		if (in_crit)
862 			nbl_end_crit(vp);
863 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
864 			"vop_rwunlock_end:");
865 		VN_RELE(vp);
866 		rr->rr_data = NULL;
867 		rr->rr_status = puterrno(error);
868 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
869 			"rfs_read_end:(%S)", "read error");
870 		return;
871 	}
872 
873 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
874 
875 	rr->rr_data = (char *)mp->b_datap->db_base;
876 
877 done:
878 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
879 		"vop_rwunlock_start:");
880 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
881 	if (in_crit)
882 		nbl_end_crit(vp);
883 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
884 		"vop_rwunlock_end:");
885 
886 	acl_perm(vp, exi, &va, cr);
887 
888 	/* check for overflows */
889 	error = vattr_to_nattr(&va, &rr->rr_attr);
890 
891 #if 0 /* notyet */
892 	/*
893 	 * Don't do this.  It causes local disk writes when just
894 	 * reading the file and the overhead is deemed larger
895 	 * than the benefit.
896 	 */
897 	/*
898 	 * Force modified metadata out to stable storage.
899 	 */
900 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
901 #endif
902 
903 	VN_RELE(vp);
904 
905 	rr->rr_status = puterrno(error);
906 
907 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
908 		"rfs_read_end:(%S)", "done");
909 }
910 
911 /*
912  * Free data allocated by rfs_read
913  */
914 void
915 rfs_rdfree(struct nfsrdresult *rr)
916 {
917 	mblk_t *mp;
918 
919 	if (rr->rr_status == NFS_OK) {
920 		mp = rr->rr_mp;
921 		if (mp != NULL)
922 			freeb(mp);
923 	}
924 }
925 
926 fhandle_t *
927 rfs_read_getfh(struct nfsreadargs *ra)
928 {
929 	return (&ra->ra_fhandle);
930 }
931 
932 #define	MAX_IOVECS	12
933 
934 #ifdef DEBUG
935 static int rfs_write_sync_hits = 0;
936 static int rfs_write_sync_misses = 0;
937 #endif
938 
939 /*
940  * Write data to file.
941  * Returns attributes of a file after writing some data to it.
942  *
943  * Any changes made here, especially in error handling might have
944  * to also be done in rfs_write (which clusters write requests).
945  */
946 void
947 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
948 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
949 {
950 	int error;
951 	vnode_t *vp;
952 	rlim64_t rlimit;
953 	struct vattr va;
954 	struct uio uio;
955 	struct iovec iov[MAX_IOVECS];
956 	mblk_t *m;
957 	struct iovec *iovp;
958 	int iovcnt;
959 	cred_t *savecred;
960 	int in_crit = 0;
961 
962 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
963 		"rfs_write_start:(%S)", "sync");
964 
965 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
966 	if (vp == NULL) {
967 		ns->ns_status = NFSERR_STALE;
968 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
969 			"rfs_write_end:(%S)", "stale");
970 		return;
971 	}
972 
973 	if (rdonly(exi, req)) {
974 		VN_RELE(vp);
975 		ns->ns_status = NFSERR_ROFS;
976 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
977 			"rfs_write_end:(%S)", "rofs");
978 		return;
979 	}
980 
981 	if (vp->v_type != VREG) {
982 		VN_RELE(vp);
983 		ns->ns_status = NFSERR_ISDIR;
984 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
985 			"rfs_write_end:(%S)", "isdir");
986 		return;
987 	}
988 
989 	/*
990 	 * Check to see if the v4 side of the server has delegated
991 	 * this file.  If so, then we mark thread as wouldblock so
992 	 * the response is dropped.
993 	 */
994 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
995 		VN_RELE(vp);
996 		curthread->t_flag |= T_WOULDBLOCK;
997 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
998 			"rfs_write_end:(%S)", "delegated");
999 		return;
1000 	}
1001 
1002 	va.va_mask = AT_UID|AT_MODE;
1003 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1004 		"vop_getattr_start:");
1005 	error = VOP_GETATTR(vp, &va, 0, cr);
1006 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1007 		"vop_getattr_end:");
1008 
1009 	if (error) {
1010 		VN_RELE(vp);
1011 		ns->ns_status = puterrno(error);
1012 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1013 			"rfs_write_end:(%S)", "getattr error");
1014 		return;
1015 	}
1016 
1017 	if (crgetuid(cr) != va.va_uid) {
1018 		/*
1019 		 * This is a kludge to allow writes of files created
1020 		 * with read only permission.  The owner of the file
1021 		 * is always allowed to write it.
1022 		 */
1023 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1024 			"vop_access_start:");
1025 		error = VOP_ACCESS(vp, VWRITE, 0, cr);
1026 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1027 			"vop_access_end:");
1028 		if (error) {
1029 			VN_RELE(vp);
1030 			ns->ns_status = puterrno(error);
1031 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1032 				"rfs_write_end:(%S)", "access error");
1033 			return;
1034 		}
1035 	}
1036 
1037 	/*
1038 	 * Can't access a mandatory lock file.  This might cause
1039 	 * the NFS service thread to block forever waiting for a
1040 	 * lock to be released that will never be released.
1041 	 */
1042 	if (MANDLOCK(vp, va.va_mode)) {
1043 		VN_RELE(vp);
1044 		ns->ns_status = NFSERR_ACCES;
1045 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1046 			"rfs_write_end:(%S)", "mand lock");
1047 		return;
1048 	}
1049 
1050 	/*
1051 	 * We have to enter the critical region before calling VOP_RWLOCK
1052 	 * to avoid a deadlock with ufs.
1053 	 */
1054 	if (nbl_need_check(vp)) {
1055 		nbl_start_crit(vp, RW_READER);
1056 		in_crit = 1;
1057 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1058 		    wa->wa_count, 0)) {
1059 			error = EACCES;
1060 			goto out;
1061 		}
1062 	}
1063 
1064 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1065 		"vop_rwlock_start:");
1066 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1067 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1068 		"vop_rwlock_end:");
1069 
1070 	if (wa->wa_data) {
1071 		iov[0].iov_base = wa->wa_data;
1072 		iov[0].iov_len = wa->wa_count;
1073 		uio.uio_iov = iov;
1074 		uio.uio_iovcnt = 1;
1075 		uio.uio_segflg = UIO_SYSSPACE;
1076 		uio.uio_extflg = UIO_COPY_DEFAULT;
1077 		uio.uio_loffset = (offset_t)wa->wa_offset;
1078 		uio.uio_resid = wa->wa_count;
1079 		/*
1080 		 * The limit is checked on the client. We
1081 		 * should allow any size writes here.
1082 		 */
1083 		uio.uio_llimit = curproc->p_fsz_ctl;
1084 		rlimit = uio.uio_llimit - wa->wa_offset;
1085 		if (rlimit < (rlim64_t)uio.uio_resid)
1086 			uio.uio_resid = (uint_t)rlimit;
1087 
1088 		/*
1089 		 * for now we assume no append mode
1090 		 */
1091 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1092 			"vop_write_start:(%S)", "sync");
1093 		/*
1094 		 * We're changing creds because VM may fault and we need
1095 		 * the cred of the current thread to be used if quota
1096 		 * checking is enabled.
1097 		 */
1098 		savecred = curthread->t_cred;
1099 		curthread->t_cred = cr;
1100 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1101 		curthread->t_cred = savecred;
1102 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1103 			"vop_write_end:");
1104 	} else {
1105 		iovcnt = 0;
1106 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1107 			iovcnt++;
1108 		if (iovcnt <= MAX_IOVECS) {
1109 #ifdef DEBUG
1110 			rfs_write_sync_hits++;
1111 #endif
1112 			iovp = iov;
1113 		} else {
1114 #ifdef DEBUG
1115 			rfs_write_sync_misses++;
1116 #endif
1117 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1118 		}
1119 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1120 		uio.uio_iov = iovp;
1121 		uio.uio_iovcnt = iovcnt;
1122 		uio.uio_segflg = UIO_SYSSPACE;
1123 		uio.uio_extflg = UIO_COPY_DEFAULT;
1124 		uio.uio_loffset = (offset_t)wa->wa_offset;
1125 		uio.uio_resid = wa->wa_count;
1126 		/*
1127 		 * The limit is checked on the client. We
1128 		 * should allow any size writes here.
1129 		 */
1130 		uio.uio_llimit = curproc->p_fsz_ctl;
1131 		rlimit = uio.uio_llimit - wa->wa_offset;
1132 		if (rlimit < (rlim64_t)uio.uio_resid)
1133 			uio.uio_resid = (uint_t)rlimit;
1134 
1135 		/*
1136 		 * For now we assume no append mode.
1137 		 */
1138 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1139 			"vop_write_start:(%S)", "iov sync");
1140 		/*
1141 		 * We're changing creds because VM may fault and we need
1142 		 * the cred of the current thread to be used if quota
1143 		 * checking is enabled.
1144 		 */
1145 		savecred = curthread->t_cred;
1146 		curthread->t_cred = cr;
1147 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1148 		curthread->t_cred = savecred;
1149 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1150 			"vop_write_end:");
1151 
1152 		if (iovp != iov)
1153 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1154 	}
1155 
1156 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1157 		"vop_rwunlock_start:");
1158 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1159 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1160 		"vop_rwunlock_end:");
1161 
1162 	if (!error) {
1163 		/*
1164 		 * Get attributes again so we send the latest mod
1165 		 * time to the client side for his cache.
1166 		 */
1167 		va.va_mask = AT_ALL;	/* now we want everything */
1168 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1169 			"vop_getattr_start:");
1170 		error = VOP_GETATTR(vp, &va, 0, cr);
1171 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1172 			"vop_getattr_end:");
1173 		/* check for overflows */
1174 		if (!error) {
1175 			acl_perm(vp, exi, &va, cr);
1176 			error = vattr_to_nattr(&va, &ns->ns_attr);
1177 		}
1178 	}
1179 
1180 out:
1181 	if (in_crit)
1182 		nbl_end_crit(vp);
1183 	VN_RELE(vp);
1184 
1185 	ns->ns_status = puterrno(error);
1186 
1187 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1188 		"rfs_write_end:(%S)", "sync");
1189 }
1190 
1191 struct rfs_async_write {
1192 	struct nfswriteargs *wa;
1193 	struct nfsattrstat *ns;
1194 	struct svc_req *req;
1195 	cred_t *cr;
1196 	kthread_t *thread;
1197 	struct rfs_async_write *list;
1198 };
1199 
1200 struct rfs_async_write_list {
1201 	fhandle_t *fhp;
1202 	kcondvar_t cv;
1203 	struct rfs_async_write *list;
1204 	struct rfs_async_write_list *next;
1205 };
1206 
1207 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1208 static kmutex_t rfs_async_write_lock;
1209 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1210 
1211 #define	MAXCLIOVECS	42
1212 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1213 
1214 #ifdef DEBUG
1215 static int rfs_write_hits = 0;
1216 static int rfs_write_misses = 0;
1217 #endif
1218 
1219 /*
1220  * Write data to file.
1221  * Returns attributes of a file after writing some data to it.
1222  */
1223 void
1224 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1225 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1226 {
1227 	int error;
1228 	vnode_t *vp;
1229 	rlim64_t rlimit;
1230 	struct vattr va;
1231 	struct uio uio;
1232 	struct rfs_async_write_list *lp;
1233 	struct rfs_async_write_list *nlp;
1234 	struct rfs_async_write *rp;
1235 	struct rfs_async_write *nrp;
1236 	struct rfs_async_write *trp;
1237 	struct rfs_async_write *lrp;
1238 	int data_written;
1239 	int iovcnt;
1240 	mblk_t *m;
1241 	struct iovec *iovp;
1242 	struct iovec *niovp;
1243 	struct iovec iov[MAXCLIOVECS];
1244 	int count;
1245 	int rcount;
1246 	uint_t off;
1247 	uint_t len;
1248 	struct rfs_async_write nrpsp;
1249 	struct rfs_async_write_list nlpsp;
1250 	ushort_t t_flag;
1251 	cred_t *savecred;
1252 	int in_crit = 0;
1253 
1254 	if (!rfs_write_async) {
1255 		rfs_write_sync(wa, ns, exi, req, cr);
1256 		return;
1257 	}
1258 
1259 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1260 		"rfs_write_start:(%S)", "async");
1261 
1262 	/*
1263 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1264 	 * is considered an OK.
1265 	 */
1266 	ns->ns_status = RFSWRITE_INITVAL;
1267 
1268 	nrp = &nrpsp;
1269 	nrp->wa = wa;
1270 	nrp->ns = ns;
1271 	nrp->req = req;
1272 	nrp->cr = cr;
1273 	nrp->thread = curthread;
1274 
1275 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1276 
1277 	/*
1278 	 * Look to see if there is already a cluster started
1279 	 * for this file.
1280 	 */
1281 	mutex_enter(&rfs_async_write_lock);
1282 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1283 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1284 		    sizeof (fhandle_t)) == 0)
1285 			break;
1286 	}
1287 
1288 	/*
1289 	 * If lp is non-NULL, then there is already a cluster
1290 	 * started.  We need to place ourselves in the cluster
1291 	 * list in the right place as determined by starting
1292 	 * offset.  Conflicts with non-blocking mandatory locked
1293 	 * regions will be checked when the cluster is processed.
1294 	 */
1295 	if (lp != NULL) {
1296 		rp = lp->list;
1297 		trp = NULL;
1298 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1299 			trp = rp;
1300 			rp = rp->list;
1301 		}
1302 		nrp->list = rp;
1303 		if (trp == NULL)
1304 			lp->list = nrp;
1305 		else
1306 			trp->list = nrp;
1307 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1308 			cv_wait(&lp->cv, &rfs_async_write_lock);
1309 		mutex_exit(&rfs_async_write_lock);
1310 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1311 			"rfs_write_end:(%S)", "cluster child");
1312 		return;
1313 	}
1314 
1315 	/*
1316 	 * No cluster started yet, start one and add ourselves
1317 	 * to the list of clusters.
1318 	 */
1319 	nrp->list = NULL;
1320 
1321 	nlp = &nlpsp;
1322 	nlp->fhp = &wa->wa_fhandle;
1323 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1324 	nlp->list = nrp;
1325 	nlp->next = NULL;
1326 
1327 	if (rfs_async_write_head == NULL) {
1328 		rfs_async_write_head = nlp;
1329 	} else {
1330 		lp = rfs_async_write_head;
1331 		while (lp->next != NULL)
1332 			lp = lp->next;
1333 		lp->next = nlp;
1334 	}
1335 	mutex_exit(&rfs_async_write_lock);
1336 
1337 	/*
1338 	 * Convert the file handle common to all of the requests
1339 	 * in this cluster to a vnode.
1340 	 */
1341 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1342 	if (vp == NULL) {
1343 		mutex_enter(&rfs_async_write_lock);
1344 		if (rfs_async_write_head == nlp)
1345 			rfs_async_write_head = nlp->next;
1346 		else {
1347 			lp = rfs_async_write_head;
1348 			while (lp->next != nlp)
1349 				lp = lp->next;
1350 			lp->next = nlp->next;
1351 		}
1352 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1353 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1354 			rp->ns->ns_status = NFSERR_STALE;
1355 			rp->thread->t_flag |= t_flag;
1356 		}
1357 		cv_broadcast(&nlp->cv);
1358 		mutex_exit(&rfs_async_write_lock);
1359 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1360 			"rfs_write_end:(%S)", "stale");
1361 		return;
1362 	}
1363 
1364 	/*
1365 	 * Can only write regular files.  Attempts to write any
1366 	 * other file types fail with EISDIR.
1367 	 */
1368 	if (vp->v_type != VREG) {
1369 		VN_RELE(vp);
1370 		mutex_enter(&rfs_async_write_lock);
1371 		if (rfs_async_write_head == nlp)
1372 			rfs_async_write_head = nlp->next;
1373 		else {
1374 			lp = rfs_async_write_head;
1375 			while (lp->next != nlp)
1376 				lp = lp->next;
1377 			lp->next = nlp->next;
1378 		}
1379 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1380 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1381 			rp->ns->ns_status = NFSERR_ISDIR;
1382 			rp->thread->t_flag |= t_flag;
1383 		}
1384 		cv_broadcast(&nlp->cv);
1385 		mutex_exit(&rfs_async_write_lock);
1386 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1387 			"rfs_write_end:(%S)", "isdir");
1388 		return;
1389 	}
1390 
1391 	/*
1392 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1393 	 * deadlock with ufs.
1394 	 */
1395 	if (nbl_need_check(vp)) {
1396 		nbl_start_crit(vp, RW_READER);
1397 		in_crit = 1;
1398 	}
1399 
1400 	/*
1401 	 * Lock the file for writing.  This operation provides
1402 	 * the delay which allows clusters to grow.
1403 	 */
1404 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1405 		"vop_wrlock_start:");
1406 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1407 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1408 		"vop_wrlock_end");
1409 
1410 	/*
1411 	 * Disconnect this cluster from the list of clusters.
1412 	 * The cluster that is being dealt with must be fixed
1413 	 * in size after this point, so there is no reason
1414 	 * to leave it on the list so that new requests can
1415 	 * find it.
1416 	 *
1417 	 * The algorithm is that the first write request will
1418 	 * create a cluster, convert the file handle to a
1419 	 * vnode pointer, and then lock the file for writing.
1420 	 * This request is not likely to be clustered with
1421 	 * any others.  However, the next request will create
1422 	 * a new cluster and be blocked in VOP_RWLOCK while
1423 	 * the first request is being processed.  This delay
1424 	 * will allow more requests to be clustered in this
1425 	 * second cluster.
1426 	 */
1427 	mutex_enter(&rfs_async_write_lock);
1428 	if (rfs_async_write_head == nlp)
1429 		rfs_async_write_head = nlp->next;
1430 	else {
1431 		lp = rfs_async_write_head;
1432 		while (lp->next != nlp)
1433 			lp = lp->next;
1434 		lp->next = nlp->next;
1435 	}
1436 	mutex_exit(&rfs_async_write_lock);
1437 
1438 	/*
1439 	 * Step through the list of requests in this cluster.
1440 	 * We need to check permissions to make sure that all
1441 	 * of the requests have sufficient permission to write
1442 	 * the file.  A cluster can be composed of requests
1443 	 * from different clients and different users on each
1444 	 * client.
1445 	 *
1446 	 * As a side effect, we also calculate the size of the
1447 	 * byte range that this cluster encompasses.
1448 	 */
1449 	rp = nlp->list;
1450 	off = rp->wa->wa_offset;
1451 	len = (uint_t)0;
1452 	do {
1453 		if (rdonly(exi, rp->req)) {
1454 			rp->ns->ns_status = NFSERR_ROFS;
1455 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1456 			rp->thread->t_flag |= t_flag;
1457 			continue;
1458 		}
1459 
1460 		va.va_mask = AT_UID|AT_MODE;
1461 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1462 			"vop_getattr_start:");
1463 		error = VOP_GETATTR(vp, &va, 0, rp->cr);
1464 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1465 			"vop_getattr_end:");
1466 		if (!error) {
1467 			if (crgetuid(rp->cr) != va.va_uid) {
1468 				/*
1469 				 * This is a kludge to allow writes of files
1470 				 * created with read only permission.  The
1471 				 * owner of the file is always allowed to
1472 				 * write it.
1473 				 */
1474 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1475 					"vop_access_start:");
1476 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
1477 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1478 					"vop_access_end:");
1479 			}
1480 			if (!error && MANDLOCK(vp, va.va_mode))
1481 				error = EACCES;
1482 		}
1483 
1484 		/*
1485 		 * Check for a conflict with a nbmand-locked region.
1486 		 */
1487 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1488 		    rp->wa->wa_count, 0)) {
1489 			error = EACCES;
1490 		}
1491 
1492 		if (error) {
1493 			rp->ns->ns_status = puterrno(error);
1494 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1495 			rp->thread->t_flag |= t_flag;
1496 			continue;
1497 		}
1498 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1499 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1500 	} while ((rp = rp->list) != NULL);
1501 
1502 	/*
1503 	 * Step through the cluster attempting to gather as many
1504 	 * requests which are contiguous as possible.  These
1505 	 * contiguous requests are handled via one call to VOP_WRITE
1506 	 * instead of different calls to VOP_WRITE.  We also keep
1507 	 * track of the fact that any data was written.
1508 	 */
1509 	rp = nlp->list;
1510 	data_written = 0;
1511 	do {
1512 		/*
1513 		 * Skip any requests which are already marked as having an
1514 		 * error.
1515 		 */
1516 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1517 			rp = rp->list;
1518 			continue;
1519 		}
1520 
1521 		/*
1522 		 * Count the number of iovec's which are required
1523 		 * to handle this set of requests.  One iovec is
1524 		 * needed for each data buffer, whether addressed
1525 		 * by wa_data or by the b_rptr pointers in the
1526 		 * mblk chains.
1527 		 */
1528 		iovcnt = 0;
1529 		lrp = rp;
1530 		for (;;) {
1531 			if (lrp->wa->wa_data)
1532 				iovcnt++;
1533 			else {
1534 				m = lrp->wa->wa_mblk;
1535 				while (m != NULL) {
1536 					iovcnt++;
1537 					m = m->b_cont;
1538 				}
1539 			}
1540 			if (lrp->list == NULL ||
1541 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1542 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1543 			    lrp->list->wa->wa_offset) {
1544 				lrp = lrp->list;
1545 				break;
1546 			}
1547 			lrp = lrp->list;
1548 		}
1549 
1550 		if (iovcnt <= MAXCLIOVECS) {
1551 #ifdef DEBUG
1552 			rfs_write_hits++;
1553 #endif
1554 			niovp = iov;
1555 		} else {
1556 #ifdef DEBUG
1557 			rfs_write_misses++;
1558 #endif
1559 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1560 		}
1561 		/*
1562 		 * Put together the scatter/gather iovecs.
1563 		 */
1564 		iovp = niovp;
1565 		trp = rp;
1566 		count = 0;
1567 		do {
1568 			if (trp->wa->wa_data) {
1569 				iovp->iov_base = trp->wa->wa_data;
1570 				iovp->iov_len = trp->wa->wa_count;
1571 				iovp++;
1572 			} else {
1573 				m = trp->wa->wa_mblk;
1574 				rcount = trp->wa->wa_count;
1575 				while (m != NULL) {
1576 					iovp->iov_base = (caddr_t)m->b_rptr;
1577 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1578 					rcount -= iovp->iov_len;
1579 					if (rcount < 0)
1580 						iovp->iov_len += rcount;
1581 					iovp++;
1582 					if (rcount <= 0)
1583 						break;
1584 					m = m->b_cont;
1585 				}
1586 			}
1587 			count += trp->wa->wa_count;
1588 			trp = trp->list;
1589 		} while (trp != lrp);
1590 
1591 		uio.uio_iov = niovp;
1592 		uio.uio_iovcnt = iovcnt;
1593 		uio.uio_segflg = UIO_SYSSPACE;
1594 		uio.uio_extflg = UIO_COPY_DEFAULT;
1595 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1596 		uio.uio_resid = count;
1597 		/*
1598 		 * The limit is checked on the client. We
1599 		 * should allow any size writes here.
1600 		 */
1601 		uio.uio_llimit = curproc->p_fsz_ctl;
1602 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1603 		if (rlimit < (rlim64_t)uio.uio_resid)
1604 			uio.uio_resid = (uint_t)rlimit;
1605 
1606 		/*
1607 		 * For now we assume no append mode.
1608 		 */
1609 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1610 			"vop_write_start:(%S)", "async");
1611 
1612 		/*
1613 		 * Check to see if the v4 side of the server has
1614 		 * delegated this file.  If so, then we mark thread
1615 		 * as wouldblock so the response is dropped.
1616 		 */
1617 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1618 			curthread->t_flag |= T_WOULDBLOCK;
1619 			error = EACCES; /* just to have an error */
1620 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1621 				"rfs_write_end:(%S)", "delegated");
1622 		} else {
1623 			/*
1624 			 * We're changing creds because VM may fault
1625 			 * and we need the cred of the current
1626 			 * thread to be used if quota * checking is
1627 			 * enabled.
1628 			 */
1629 			savecred = curthread->t_cred;
1630 			curthread->t_cred = cr;
1631 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1632 			curthread->t_cred = savecred;
1633 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1634 				"vop_write_end:");
1635 		}
1636 
1637 		if (niovp != iov)
1638 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1639 
1640 		if (!error) {
1641 			data_written = 1;
1642 			/*
1643 			 * Get attributes again so we send the latest mod
1644 			 * time to the client side for his cache.
1645 			 */
1646 			va.va_mask = AT_ALL;	/* now we want everything */
1647 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1648 				"vop_getattr_start:");
1649 			error = VOP_GETATTR(vp, &va, 0, rp->cr);
1650 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1651 				"vop_getattr_end:");
1652 			if (!error)
1653 				acl_perm(vp, exi, &va, rp->cr);
1654 		}
1655 
1656 		/*
1657 		 * Fill in the status responses for each request
1658 		 * which was just handled.  Also, copy the latest
1659 		 * attributes in to the attribute responses if
1660 		 * appropriate.
1661 		 */
1662 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1663 		do {
1664 			rp->thread->t_flag |= t_flag;
1665 			/* check for overflows */
1666 			if (!error) {
1667 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1668 			}
1669 			rp->ns->ns_status = puterrno(error);
1670 			rp = rp->list;
1671 		} while (rp != lrp);
1672 	} while (rp != NULL);
1673 
1674 	/*
1675 	 * If any data was written at all, then we need to flush
1676 	 * the data and metadata to stable storage.
1677 	 */
1678 	if (data_written) {
1679 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
1680 			"vop_putpage_start:");
1681 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
1682 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
1683 			"vop_putpage_end:");
1684 		if (!error) {
1685 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1686 				"vop_fsync_start:");
1687 			error = VOP_FSYNC(vp, FNODSYNC, cr);
1688 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
1689 				"vop_fsync_end:");
1690 		}
1691 	}
1692 
1693 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1694 		"vop_rwunlock_start:");
1695 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1696 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1697 		"vop_rwunlock_end:");
1698 
1699 	if (in_crit)
1700 		nbl_end_crit(vp);
1701 	VN_RELE(vp);
1702 
1703 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1704 	mutex_enter(&rfs_async_write_lock);
1705 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1706 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1707 			rp->ns->ns_status = puterrno(error);
1708 			rp->thread->t_flag |= t_flag;
1709 		}
1710 	}
1711 	cv_broadcast(&nlp->cv);
1712 	mutex_exit(&rfs_async_write_lock);
1713 
1714 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1715 		"rfs_write_end:(%S)", "async");
1716 }
1717 
1718 fhandle_t *
1719 rfs_write_getfh(struct nfswriteargs *wa)
1720 {
1721 	return (&wa->wa_fhandle);
1722 }
1723 
1724 /*
1725  * Create a file.
1726  * Creates a file with given attributes and returns those attributes
1727  * and an fhandle for the new file.
1728  */
1729 void
1730 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1731 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1732 {
1733 	int error;
1734 	int lookuperr;
1735 	int in_crit = 0;
1736 	struct vattr va;
1737 	vnode_t *vp;
1738 	vnode_t *dvp;
1739 	char *name = args->ca_da.da_name;
1740 	vnode_t *tvp = NULL;
1741 	int mode;
1742 	int lookup_ok;
1743 	bool_t trunc;
1744 
1745 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
1746 		"rfs_create_start:");
1747 
1748 	/*
1749 	 * Disallow NULL paths
1750 	 */
1751 	if (name == NULL || *name == '\0') {
1752 		dr->dr_status = NFSERR_ACCES;
1753 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1754 			"rfs_create_end:(%S)", "access");
1755 		return;
1756 	}
1757 
1758 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1759 	if (dvp == NULL) {
1760 		dr->dr_status = NFSERR_STALE;
1761 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1762 			"rfs_create_end:(%S)", "stale");
1763 		return;
1764 	}
1765 
1766 	error = sattr_to_vattr(args->ca_sa, &va);
1767 	if (error) {
1768 		dr->dr_status = puterrno(error);
1769 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1770 			"rfs_create_end:(%S)", "sattr");
1771 		return;
1772 	}
1773 
1774 	/*
1775 	 * Must specify the mode.
1776 	 */
1777 	if (!(va.va_mask & AT_MODE)) {
1778 		VN_RELE(dvp);
1779 		dr->dr_status = NFSERR_INVAL;
1780 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1781 			"rfs_create_end:(%S)", "no mode");
1782 		return;
1783 	}
1784 
1785 	/*
1786 	 * This is a completely gross hack to make mknod
1787 	 * work over the wire until we can wack the protocol
1788 	 */
1789 	if ((va.va_mode & IFMT) == IFCHR) {
1790 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1791 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1792 		else {
1793 			va.va_type = VCHR;
1794 			/*
1795 			 * uncompress the received dev_t
1796 			 * if the top half is zero indicating a request
1797 			 * from an `older style' OS.
1798 			 */
1799 			if ((va.va_size & 0xffff0000) == 0)
1800 				va.va_rdev = nfsv2_expdev(va.va_size);
1801 			else
1802 				va.va_rdev = (dev_t)va.va_size;
1803 		}
1804 		va.va_mask &= ~AT_SIZE;
1805 	} else if ((va.va_mode & IFMT) == IFBLK) {
1806 		va.va_type = VBLK;
1807 		/*
1808 		 * uncompress the received dev_t
1809 		 * if the top half is zero indicating a request
1810 		 * from an `older style' OS.
1811 		 */
1812 		if ((va.va_size & 0xffff0000) == 0)
1813 			va.va_rdev = nfsv2_expdev(va.va_size);
1814 		else
1815 			va.va_rdev = (dev_t)va.va_size;
1816 		va.va_mask &= ~AT_SIZE;
1817 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1818 		va.va_type = VSOCK;
1819 	} else
1820 		va.va_type = VREG;
1821 	va.va_mode &= ~IFMT;
1822 	va.va_mask |= AT_TYPE;
1823 
1824 	/*
1825 	 * Why was the choice made to use VWRITE as the mode to the
1826 	 * call to VOP_CREATE ? This results in a bug.  When a client
1827 	 * opens a file that already exists and is RDONLY, the second
1828 	 * open fails with an EACESS because of the mode.
1829 	 * bug ID 1054648.
1830 	 */
1831 	lookup_ok = 0;
1832 	mode = VWRITE;
1833 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1834 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
1835 			"vop_lookup_start:");
1836 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1837 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
1838 			"vop_lookup_end:");
1839 		if (!error) {
1840 			struct vattr at;
1841 
1842 			lookup_ok = 1;
1843 			at.va_mask = AT_MODE;
1844 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1845 				"vop_getattr_start:");
1846 			error = VOP_GETATTR(tvp, &at, 0, cr);
1847 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1848 				"vop_getattr_end:");
1849 			if (!error)
1850 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1851 			VN_RELE(tvp);
1852 			tvp = NULL;
1853 		}
1854 	}
1855 
1856 	if (!lookup_ok) {
1857 		if (rdonly(exi, req)) {
1858 			error = EROFS;
1859 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1860 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1861 			error = EPERM;
1862 		} else {
1863 			error = 0;
1864 		}
1865 	}
1866 
1867 	/*
1868 	 * If file size is being modified on an already existing file
1869 	 * make sure that there are no conflicting non-blocking mandatory
1870 	 * locks in the region being manipulated. Return EACCES if there
1871 	 * are conflicting locks.
1872 	 */
1873 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1874 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1875 
1876 		if (!lookuperr &&
1877 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1878 			VN_RELE(tvp);
1879 			curthread->t_flag |= T_WOULDBLOCK;
1880 			goto out;
1881 		}
1882 
1883 		if (!lookuperr && nbl_need_check(tvp)) {
1884 			/*
1885 			 * The file exists. Now check if it has any
1886 			 * conflicting non-blocking mandatory locks
1887 			 * in the region being changed.
1888 			 */
1889 			struct vattr bva;
1890 			u_offset_t offset;
1891 			ssize_t length;
1892 
1893 			nbl_start_crit(tvp, RW_READER);
1894 			in_crit = 1;
1895 
1896 			bva.va_mask = AT_SIZE;
1897 			error = VOP_GETATTR(tvp, &bva, 0, cr);
1898 			if (!error) {
1899 				if (va.va_size < bva.va_size) {
1900 					offset = va.va_size;
1901 					length = bva.va_size - va.va_size;
1902 				} else {
1903 					offset = bva.va_size;
1904 					length = va.va_size - bva.va_size;
1905 				}
1906 				if (length) {
1907 					if (nbl_conflict(tvp, NBL_WRITE,
1908 						offset, length, 0)) {
1909 						error = EACCES;
1910 					}
1911 				}
1912 			}
1913 			if (error) {
1914 				nbl_end_crit(tvp);
1915 				VN_RELE(tvp);
1916 				in_crit = 0;
1917 			}
1918 		} else if (tvp != NULL) {
1919 			VN_RELE(tvp);
1920 		}
1921 	}
1922 
1923 	if (!error) {
1924 		/*
1925 		 * If filesystem is shared with nosuid the remove any
1926 		 * setuid/setgid bits on create.
1927 		 */
1928 		if (va.va_type == VREG &&
1929 		    exi->exi_export.ex_flags & EX_NOSUID)
1930 			va.va_mode &= ~(VSUID | VSGID);
1931 
1932 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
1933 			"vop_create_start:");
1934 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
1935 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
1936 			"vop_create_end:");
1937 
1938 		if (!error) {
1939 
1940 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1941 				trunc = TRUE;
1942 			else
1943 				trunc = FALSE;
1944 
1945 			if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
1946 				VN_RELE(tvp);
1947 				curthread->t_flag |= T_WOULDBLOCK;
1948 				goto out;
1949 			}
1950 			va.va_mask = AT_ALL;
1951 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1952 				"vop_getattr_start:");
1953 			error = VOP_GETATTR(vp, &va, 0, cr);
1954 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1955 				"vop_getattr_end:");
1956 			/* check for overflows */
1957 			if (!error) {
1958 				acl_perm(vp, exi, &va, cr);
1959 				error = vattr_to_nattr(&va, &dr->dr_attr);
1960 				if (!error) {
1961 					error = makefh(&dr->dr_fhandle, vp,
1962 							exi);
1963 				}
1964 			}
1965 			/*
1966 			 * Force modified metadata out to stable storage.
1967 			 */
1968 			(void) VOP_FSYNC(vp, FNODSYNC, cr);
1969 			VN_RELE(vp);
1970 		}
1971 
1972 		if (in_crit) {
1973 			nbl_end_crit(tvp);
1974 			VN_RELE(tvp);
1975 		}
1976 	}
1977 
1978 	/*
1979 	 * Force modified data and metadata out to stable storage.
1980 	 */
1981 	(void) VOP_FSYNC(dvp, 0, cr);
1982 
1983 out:
1984 
1985 	VN_RELE(dvp);
1986 
1987 	dr->dr_status = puterrno(error);
1988 
1989 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1990 		"rfs_create_end:(%S)", "done");
1991 }
1992 fhandle_t *
1993 rfs_create_getfh(struct nfscreatargs *args)
1994 {
1995 	return (args->ca_da.da_fhandle);
1996 }
1997 
1998 /*
1999  * Remove a file.
2000  * Remove named file from parent directory.
2001  */
2002 void
2003 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2004 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2005 {
2006 	int error = 0;
2007 	vnode_t *vp;
2008 	vnode_t *targvp;
2009 	int in_crit = 0;
2010 
2011 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
2012 		"rfs_remove_start:");
2013 
2014 	/*
2015 	 * Disallow NULL paths
2016 	 */
2017 	if (da->da_name == NULL || *da->da_name == '\0') {
2018 		*status = NFSERR_ACCES;
2019 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2020 			"rfs_remove_end:(%S)", "access");
2021 		return;
2022 	}
2023 
2024 	vp = nfs_fhtovp(da->da_fhandle, exi);
2025 	if (vp == NULL) {
2026 		*status = NFSERR_STALE;
2027 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2028 			"rfs_remove_end:(%S)", "stale");
2029 		return;
2030 	}
2031 
2032 	if (rdonly(exi, req)) {
2033 		VN_RELE(vp);
2034 		*status = NFSERR_ROFS;
2035 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2036 			"rfs_remove_end:(%S)", "rofs");
2037 		return;
2038 	}
2039 
2040 	/*
2041 	 * Check for a conflict with a non-blocking mandatory share reservation.
2042 	 */
2043 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2044 			NULL, cr);
2045 	if (error != 0) {
2046 		VN_RELE(vp);
2047 		*status = puterrno(error);
2048 		return;
2049 	}
2050 
2051 	/*
2052 	 * If the file is delegated to an v4 client, then initiate
2053 	 * recall and drop this request (by setting T_WOULDBLOCK).
2054 	 * The client will eventually re-transmit the request and
2055 	 * (hopefully), by then, the v4 client will have returned
2056 	 * the delegation.
2057 	 */
2058 
2059 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2060 		VN_RELE(vp);
2061 		VN_RELE(targvp);
2062 		curthread->t_flag |= T_WOULDBLOCK;
2063 		return;
2064 	}
2065 
2066 	if (nbl_need_check(targvp)) {
2067 		nbl_start_crit(targvp, RW_READER);
2068 		in_crit = 1;
2069 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
2070 			error = EACCES;
2071 			goto out;
2072 		}
2073 	}
2074 
2075 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
2076 		"vop_remove_start:");
2077 	error = VOP_REMOVE(vp, da->da_name, cr);
2078 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
2079 		"vop_remove_end:");
2080 
2081 	/*
2082 	 * Force modified data and metadata out to stable storage.
2083 	 */
2084 	(void) VOP_FSYNC(vp, 0, cr);
2085 
2086 out:
2087 	if (in_crit)
2088 		nbl_end_crit(targvp);
2089 	VN_RELE(targvp);
2090 	VN_RELE(vp);
2091 
2092 	*status = puterrno(error);
2093 
2094 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2095 		"rfs_remove_end:(%S)", "done");
2096 }
2097 
2098 fhandle_t *
2099 rfs_remove_getfh(struct nfsdiropargs *da)
2100 {
2101 	return (da->da_fhandle);
2102 }
2103 
2104 /*
2105  * rename a file
2106  * Give a file (from) a new name (to).
2107  */
2108 void
2109 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2110 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2111 {
2112 	int error = 0;
2113 	vnode_t *fromvp;
2114 	vnode_t *tovp;
2115 	struct exportinfo *to_exi;
2116 	fhandle_t *fh;
2117 	vnode_t *srcvp;
2118 	vnode_t *targvp;
2119 	int in_crit = 0;
2120 
2121 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
2122 		"rfs_rename_start:");
2123 
2124 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2125 	if (fromvp == NULL) {
2126 		*status = NFSERR_STALE;
2127 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2128 			"rfs_rename_end:(%S)", "from stale");
2129 		return;
2130 	}
2131 
2132 	fh = args->rna_to.da_fhandle;
2133 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2134 	if (to_exi == NULL) {
2135 		VN_RELE(fromvp);
2136 		*status = NFSERR_ACCES;
2137 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2138 			"rfs_rename_end:(%S)", "cross device");
2139 		return;
2140 	}
2141 	exi_rele(to_exi);
2142 
2143 	if (to_exi != exi) {
2144 		VN_RELE(fromvp);
2145 		*status = NFSERR_XDEV;
2146 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2147 			"rfs_rename_end:(%S)", "from stale");
2148 		return;
2149 	}
2150 
2151 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2152 	if (tovp == NULL) {
2153 		VN_RELE(fromvp);
2154 		*status = NFSERR_STALE;
2155 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2156 			"rfs_rename_end:(%S)", "to stale");
2157 		return;
2158 	}
2159 
2160 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2161 		VN_RELE(tovp);
2162 		VN_RELE(fromvp);
2163 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2164 			"rfs_rename_end:(%S)", "not dir");
2165 		*status = NFSERR_NOTDIR;
2166 		return;
2167 	}
2168 
2169 	/*
2170 	 * Disallow NULL paths
2171 	 */
2172 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2173 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2174 		VN_RELE(tovp);
2175 		VN_RELE(fromvp);
2176 		*status = NFSERR_ACCES;
2177 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2178 			"rfs_rename_end:(%S)", "access");
2179 		return;
2180 	}
2181 
2182 	if (rdonly(exi, req)) {
2183 		VN_RELE(tovp);
2184 		VN_RELE(fromvp);
2185 		*status = NFSERR_ROFS;
2186 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2187 			"rfs_rename_end:(%S)", "rofs");
2188 		return;
2189 	}
2190 
2191 	/*
2192 	 * Check for a conflict with a non-blocking mandatory share reservation.
2193 	 */
2194 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2195 			NULL, cr);
2196 	if (error != 0) {
2197 		VN_RELE(tovp);
2198 		VN_RELE(fromvp);
2199 		*status = puterrno(error);
2200 		return;
2201 	}
2202 
2203 	/* Check for delegations on the source file */
2204 
2205 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2206 		VN_RELE(tovp);
2207 		VN_RELE(fromvp);
2208 		VN_RELE(srcvp);
2209 		curthread->t_flag |= T_WOULDBLOCK;
2210 		return;
2211 	}
2212 
2213 	/* Check for delegation on the file being renamed over, if it exists */
2214 
2215 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2216 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
2217 	    == 0) {
2218 
2219 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2220 			VN_RELE(tovp);
2221 			VN_RELE(fromvp);
2222 			VN_RELE(srcvp);
2223 			VN_RELE(targvp);
2224 			curthread->t_flag |= T_WOULDBLOCK;
2225 			return;
2226 		}
2227 		VN_RELE(targvp);
2228 	}
2229 
2230 
2231 	if (nbl_need_check(srcvp)) {
2232 		nbl_start_crit(srcvp, RW_READER);
2233 		in_crit = 1;
2234 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
2235 			error = EACCES;
2236 			goto out;
2237 		}
2238 	}
2239 
2240 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
2241 		"vop_rename_start:");
2242 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2243 	    tovp, args->rna_to.da_name, cr);
2244 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
2245 		"vop_rename_end:");
2246 
2247 	/*
2248 	 * Force modified data and metadata out to stable storage.
2249 	 */
2250 	(void) VOP_FSYNC(tovp, 0, cr);
2251 	(void) VOP_FSYNC(fromvp, 0, cr);
2252 
2253 out:
2254 	if (in_crit)
2255 		nbl_end_crit(srcvp);
2256 	VN_RELE(srcvp);
2257 	VN_RELE(tovp);
2258 	VN_RELE(fromvp);
2259 
2260 	*status = puterrno(error);
2261 
2262 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2263 		"rfs_rename_end:(%S)", "done");
2264 }
2265 fhandle_t *
2266 rfs_rename_getfh(struct nfsrnmargs *args)
2267 {
2268 	return (args->rna_from.da_fhandle);
2269 }
2270 
2271 /*
2272  * Link to a file.
2273  * Create a file (to) which is a hard link to the given file (from).
2274  */
2275 void
2276 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2277 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2278 {
2279 	int error;
2280 	vnode_t *fromvp;
2281 	vnode_t *tovp;
2282 	struct exportinfo *to_exi;
2283 	fhandle_t *fh;
2284 
2285 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
2286 		"rfs_link_start:");
2287 
2288 	fromvp = nfs_fhtovp(args->la_from, exi);
2289 	if (fromvp == NULL) {
2290 		*status = NFSERR_STALE;
2291 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2292 			"rfs_link_end:(%S)", "from stale");
2293 		return;
2294 	}
2295 
2296 	fh = args->la_to.da_fhandle;
2297 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2298 	if (to_exi == NULL) {
2299 		VN_RELE(fromvp);
2300 		*status = NFSERR_ACCES;
2301 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2302 			"rfs_link_end:(%S)", "cross device");
2303 		return;
2304 	}
2305 	exi_rele(to_exi);
2306 
2307 	if (to_exi != exi) {
2308 		VN_RELE(fromvp);
2309 		*status = NFSERR_XDEV;
2310 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2311 			"rfs_link_end:(%S)", "cross device");
2312 		return;
2313 	}
2314 
2315 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2316 	if (tovp == NULL) {
2317 		VN_RELE(fromvp);
2318 		*status = NFSERR_STALE;
2319 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2320 			"rfs_link_end:(%S)", "to stale");
2321 		return;
2322 	}
2323 
2324 	if (tovp->v_type != VDIR) {
2325 		VN_RELE(tovp);
2326 		VN_RELE(fromvp);
2327 		*status = NFSERR_NOTDIR;
2328 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2329 			"rfs_link_end:(%S)", "not dir");
2330 		return;
2331 	}
2332 	/*
2333 	 * Disallow NULL paths
2334 	 */
2335 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2336 		VN_RELE(tovp);
2337 		VN_RELE(fromvp);
2338 		*status = NFSERR_ACCES;
2339 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2340 			"rfs_link_end:(%S)", "access");
2341 		return;
2342 	}
2343 
2344 	if (rdonly(exi, req)) {
2345 		VN_RELE(tovp);
2346 		VN_RELE(fromvp);
2347 		*status = NFSERR_ROFS;
2348 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2349 			"rfs_link_end:(%S)", "rofs");
2350 		return;
2351 	}
2352 
2353 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
2354 		"vop_link_start:");
2355 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2356 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
2357 		"vop_link_end:");
2358 
2359 	/*
2360 	 * Force modified data and metadata out to stable storage.
2361 	 */
2362 	(void) VOP_FSYNC(tovp, 0, cr);
2363 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2364 
2365 	VN_RELE(tovp);
2366 	VN_RELE(fromvp);
2367 
2368 	*status = puterrno(error);
2369 
2370 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2371 		"rfs_link_end:(%S)", "done");
2372 }
2373 fhandle_t *
2374 rfs_link_getfh(struct nfslinkargs *args)
2375 {
2376 	return (args->la_from);
2377 }
2378 
2379 /*
2380  * Symbolicly link to a file.
2381  * Create a file (to) with the given attributes which is a symbolic link
2382  * to the given path name (to).
2383  */
2384 void
2385 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2386 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2387 {
2388 	int error;
2389 	struct vattr va;
2390 	vnode_t *vp;
2391 	vnode_t *svp;
2392 	int lerror;
2393 
2394 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
2395 		"rfs_symlink_start:");
2396 
2397 	/*
2398 	 * Disallow NULL paths
2399 	 */
2400 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2401 		*status = NFSERR_ACCES;
2402 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2403 			"rfs_symlink_end:(%S)", "access");
2404 		return;
2405 	}
2406 
2407 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2408 	if (vp == NULL) {
2409 		*status = NFSERR_STALE;
2410 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2411 			"rfs_symlink_end:(%S)", "stale");
2412 		return;
2413 	}
2414 
2415 	if (rdonly(exi, req)) {
2416 		VN_RELE(vp);
2417 		*status = NFSERR_ROFS;
2418 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2419 			"rfs_symlink_end:(%S)", "rofs");
2420 		return;
2421 	}
2422 
2423 	error = sattr_to_vattr(args->sla_sa, &va);
2424 	if (error) {
2425 		VN_RELE(vp);
2426 		*status = puterrno(error);
2427 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2428 			"rfs_symlink_end:(%S)", "sattr");
2429 		return;
2430 	}
2431 
2432 	if (!(va.va_mask & AT_MODE)) {
2433 		VN_RELE(vp);
2434 		*status = NFSERR_INVAL;
2435 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2436 			"rfs_symlink_end:(%S)", "no mode");
2437 		return;
2438 	}
2439 
2440 	va.va_type = VLNK;
2441 	va.va_mask |= AT_TYPE;
2442 
2443 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
2444 		"vop_symlink_start:");
2445 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
2446 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
2447 		"vop_symlink_end:");
2448 
2449 	/*
2450 	 * Force new data and metadata out to stable storage.
2451 	 */
2452 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
2453 		"vop_lookup_start:");
2454 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2455 	    0, NULL, cr);
2456 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
2457 		"vop_lookup_end:");
2458 	if (!lerror) {
2459 		(void) VOP_FSYNC(svp, 0, cr);
2460 		VN_RELE(svp);
2461 	}
2462 
2463 	/*
2464 	 * Force modified data and metadata out to stable storage.
2465 	 */
2466 	(void) VOP_FSYNC(vp, 0, cr);
2467 
2468 	VN_RELE(vp);
2469 
2470 	*status = puterrno(error);
2471 
2472 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2473 		"rfs_symlink_end:(%S)", "done");
2474 }
2475 fhandle_t *
2476 rfs_symlink_getfh(struct nfsslargs *args)
2477 {
2478 	return (args->sla_from.da_fhandle);
2479 }
2480 
2481 /*
2482  * Make a directory.
2483  * Create a directory with the given name, parent directory, and attributes.
2484  * Returns a file handle and attributes for the new directory.
2485  */
2486 void
2487 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2488 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2489 {
2490 	int error;
2491 	struct vattr va;
2492 	vnode_t *dvp = NULL;
2493 	vnode_t *vp;
2494 	char *name = args->ca_da.da_name;
2495 
2496 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
2497 		"rfs_mkdir_start:");
2498 
2499 	/*
2500 	 * Disallow NULL paths
2501 	 */
2502 	if (name == NULL || *name == '\0') {
2503 		dr->dr_status = NFSERR_ACCES;
2504 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2505 			"rfs_mkdir_end:(%S)", "access");
2506 		return;
2507 	}
2508 
2509 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2510 	if (vp == NULL) {
2511 		dr->dr_status = NFSERR_STALE;
2512 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2513 			"rfs_mkdir_end:(%S)", "stale");
2514 		return;
2515 	}
2516 
2517 	if (rdonly(exi, req)) {
2518 		VN_RELE(vp);
2519 		dr->dr_status = NFSERR_ROFS;
2520 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2521 			"rfs_mkdir_end:(%S)", "rofs");
2522 		return;
2523 	}
2524 
2525 	error = sattr_to_vattr(args->ca_sa, &va);
2526 	if (error) {
2527 		VN_RELE(vp);
2528 		dr->dr_status = puterrno(error);
2529 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2530 			"rfs_mkdir_end:(%S)", "sattr");
2531 		return;
2532 	}
2533 
2534 	if (!(va.va_mask & AT_MODE)) {
2535 		VN_RELE(vp);
2536 		dr->dr_status = NFSERR_INVAL;
2537 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2538 			"rfs_mkdir_end:(%S)", "no mode");
2539 		return;
2540 	}
2541 
2542 	va.va_type = VDIR;
2543 	va.va_mask |= AT_TYPE;
2544 
2545 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
2546 		"vop_mkdir_start:");
2547 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2548 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
2549 		"vop_mkdir_end:");
2550 
2551 	if (!error) {
2552 		/*
2553 		 * Attribtutes of the newly created directory should
2554 		 * be returned to the client.
2555 		 */
2556 		va.va_mask = AT_ALL; /* We want everything */
2557 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
2558 			"vop_getattr_start:");
2559 		error = VOP_GETATTR(dvp, &va, 0, cr);
2560 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
2561 			"vop_getattr_end:");
2562 		/* check for overflows */
2563 		if (!error) {
2564 			acl_perm(vp, exi, &va, cr);
2565 			error = vattr_to_nattr(&va, &dr->dr_attr);
2566 			if (!error) {
2567 				error = makefh(&dr->dr_fhandle, dvp, exi);
2568 			}
2569 		}
2570 		/*
2571 		 * Force new data and metadata out to stable storage.
2572 		 */
2573 		(void) VOP_FSYNC(dvp, 0, cr);
2574 		VN_RELE(dvp);
2575 	}
2576 
2577 	/*
2578 	 * Force modified data and metadata out to stable storage.
2579 	 */
2580 	(void) VOP_FSYNC(vp, 0, cr);
2581 
2582 	VN_RELE(vp);
2583 
2584 	dr->dr_status = puterrno(error);
2585 
2586 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2587 		"rfs_mkdir_end:(%S)", "done");
2588 }
2589 fhandle_t *
2590 rfs_mkdir_getfh(struct nfscreatargs *args)
2591 {
2592 	return (args->ca_da.da_fhandle);
2593 }
2594 
2595 /*
2596  * Remove a directory.
2597  * Remove the given directory name from the given parent directory.
2598  */
2599 void
2600 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2601 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2602 {
2603 	int error;
2604 	vnode_t *vp;
2605 
2606 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
2607 		"rfs_rmdir_start:");
2608 
2609 	/*
2610 	 * Disallow NULL paths
2611 	 */
2612 	if (da->da_name == NULL || *da->da_name == '\0') {
2613 		*status = NFSERR_ACCES;
2614 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2615 			"rfs_rmdir_end:(%S)", "access");
2616 		return;
2617 	}
2618 
2619 	vp = nfs_fhtovp(da->da_fhandle, exi);
2620 	if (vp == NULL) {
2621 		*status = NFSERR_STALE;
2622 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2623 			"rfs_rmdir_end:(%S)", "stale");
2624 		return;
2625 	}
2626 
2627 	if (rdonly(exi, req)) {
2628 		VN_RELE(vp);
2629 		*status = NFSERR_ROFS;
2630 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2631 			"rfs_rmdir_end:(%S)", "rofs");
2632 		return;
2633 	}
2634 
2635 	/*
2636 	 * VOP_RMDIR now takes a new third argument (the current
2637 	 * directory of the process).  That's because someone
2638 	 * wants to return EINVAL if one tries to remove ".".
2639 	 * Of course, NFS servers have no idea what their
2640 	 * clients' current directories are.  We fake it by
2641 	 * supplying a vnode known to exist and illegal to
2642 	 * remove.
2643 	 */
2644 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
2645 		"vop_rmdir_start:");
2646 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
2647 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
2648 		"vop_rmdir_end:");
2649 
2650 	/*
2651 	 * Force modified data and metadata out to stable storage.
2652 	 */
2653 	(void) VOP_FSYNC(vp, 0, cr);
2654 
2655 	VN_RELE(vp);
2656 
2657 	/*
2658 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2659 	 * if the directory is not empty.  A System V NFS server
2660 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2661 	 * over the wire.
2662 	 */
2663 	if (error == EEXIST)
2664 		*status = NFSERR_NOTEMPTY;
2665 	else
2666 		*status = puterrno(error);
2667 
2668 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2669 		"rfs_rmdir_end:(%S)", "done");
2670 }
2671 fhandle_t *
2672 rfs_rmdir_getfh(struct nfsdiropargs *da)
2673 {
2674 	return (da->da_fhandle);
2675 }
2676 
2677 /* ARGSUSED */
2678 void
2679 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2680 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2681 {
2682 	int error;
2683 	int iseof;
2684 	struct iovec iov;
2685 	struct uio uio;
2686 	vnode_t *vp;
2687 
2688 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
2689 		"rfs_readdir_start:");
2690 
2691 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2692 	if (vp == NULL) {
2693 		rd->rd_entries = NULL;
2694 		rd->rd_status = NFSERR_STALE;
2695 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2696 			"rfs_readdir_end:(%S)", "stale");
2697 		return;
2698 	}
2699 
2700 	if (vp->v_type != VDIR) {
2701 		VN_RELE(vp);
2702 		rd->rd_entries = NULL;
2703 		rd->rd_status = NFSERR_NOTDIR;
2704 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2705 			"rfs_readdir_end:(%S)", "notdir");
2706 		return;
2707 	}
2708 
2709 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
2710 		"vop_rwlock_start:");
2711 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2712 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
2713 		"vop_rwlock_end:");
2714 
2715 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
2716 		"vop_access_start:");
2717 	error = VOP_ACCESS(vp, VREAD, 0, cr);
2718 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
2719 		"vop_access_end:");
2720 	if (error) {
2721 		rd->rd_entries = NULL;
2722 		goto bad;
2723 	}
2724 
2725 	if (rda->rda_count == 0) {
2726 		rd->rd_entries = NULL;
2727 		rd->rd_size = 0;
2728 		rd->rd_eof = FALSE;
2729 		goto bad;
2730 	}
2731 
2732 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2733 
2734 	/*
2735 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2736 	 */
2737 	rd->rd_bufsize = (uint_t)rda->rda_count;
2738 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2739 
2740 	/*
2741 	 * Set up io vector to read directory data
2742 	 */
2743 	iov.iov_base = (caddr_t)rd->rd_entries;
2744 	iov.iov_len = rda->rda_count;
2745 	uio.uio_iov = &iov;
2746 	uio.uio_iovcnt = 1;
2747 	uio.uio_segflg = UIO_SYSSPACE;
2748 	uio.uio_extflg = UIO_COPY_CACHED;
2749 	uio.uio_loffset = (offset_t)rda->rda_offset;
2750 	uio.uio_resid = rda->rda_count;
2751 
2752 	/*
2753 	 * read directory
2754 	 */
2755 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
2756 		"vop_readdir_start:");
2757 	error = VOP_READDIR(vp, &uio, cr, &iseof);
2758 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
2759 		"vop_readdir_end:");
2760 
2761 	/*
2762 	 * Clean up
2763 	 */
2764 	if (!error) {
2765 		/*
2766 		 * set size and eof
2767 		 */
2768 		if (uio.uio_resid == rda->rda_count) {
2769 			rd->rd_size = 0;
2770 			rd->rd_eof = TRUE;
2771 		} else {
2772 			rd->rd_size = (uint32_t)(rda->rda_count -
2773 			    uio.uio_resid);
2774 			rd->rd_eof = iseof ? TRUE : FALSE;
2775 		}
2776 	}
2777 
2778 bad:
2779 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
2780 		"vop_rwunlock_start:");
2781 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2782 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
2783 		"vop_rwunlock_end:");
2784 
2785 #if 0 /* notyet */
2786 	/*
2787 	 * Don't do this.  It causes local disk writes when just
2788 	 * reading the file and the overhead is deemed larger
2789 	 * than the benefit.
2790 	 */
2791 	/*
2792 	 * Force modified metadata out to stable storage.
2793 	 */
2794 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
2795 #endif
2796 
2797 	VN_RELE(vp);
2798 
2799 	rd->rd_status = puterrno(error);
2800 
2801 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2802 		"rfs_readdir_end:(%S)", "done");
2803 }
2804 fhandle_t *
2805 rfs_readdir_getfh(struct nfsrddirargs *rda)
2806 {
2807 	return (&rda->rda_fh);
2808 }
2809 void
2810 rfs_rddirfree(struct nfsrddirres *rd)
2811 {
2812 	if (rd->rd_entries != NULL)
2813 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2814 }
2815 
2816 /* ARGSUSED */
2817 void
2818 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2819 	struct svc_req *req, cred_t *cr)
2820 {
2821 	int error;
2822 	struct statvfs64 sb;
2823 	vnode_t *vp;
2824 
2825 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
2826 		"rfs_statfs_start:");
2827 
2828 	vp = nfs_fhtovp(fh, exi);
2829 	if (vp == NULL) {
2830 		fs->fs_status = NFSERR_STALE;
2831 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2832 			"rfs_statfs_end:(%S)", "stale");
2833 		return;
2834 	}
2835 
2836 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2837 
2838 	if (!error) {
2839 		fs->fs_tsize = nfstsize();
2840 		fs->fs_bsize = sb.f_frsize;
2841 		fs->fs_blocks = sb.f_blocks;
2842 		fs->fs_bfree = sb.f_bfree;
2843 		fs->fs_bavail = sb.f_bavail;
2844 	}
2845 
2846 	VN_RELE(vp);
2847 
2848 	fs->fs_status = puterrno(error);
2849 
2850 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2851 		"rfs_statfs_end:(%S)", "done");
2852 }
2853 fhandle_t *
2854 rfs_statfs_getfh(fhandle_t *fh)
2855 {
2856 	return (fh);
2857 }
2858 
2859 static int
2860 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2861 {
2862 	vap->va_mask = 0;
2863 
2864 	/*
2865 	 * There was a sign extension bug in some VFS based systems
2866 	 * which stored the mode as a short.  When it would get
2867 	 * assigned to a u_long, no sign extension would occur.
2868 	 * It needed to, but this wasn't noticed because sa_mode
2869 	 * would then get assigned back to the short, thus ignoring
2870 	 * the upper 16 bits of sa_mode.
2871 	 *
2872 	 * To make this implementation work for both broken
2873 	 * clients and good clients, we check for both versions
2874 	 * of the mode.
2875 	 */
2876 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2877 	    sa->sa_mode != (uint32_t)-1) {
2878 		vap->va_mask |= AT_MODE;
2879 		vap->va_mode = sa->sa_mode;
2880 	}
2881 	if (sa->sa_uid != (uint32_t)-1) {
2882 		vap->va_mask |= AT_UID;
2883 		vap->va_uid = sa->sa_uid;
2884 	}
2885 	if (sa->sa_gid != (uint32_t)-1) {
2886 		vap->va_mask |= AT_GID;
2887 		vap->va_gid = sa->sa_gid;
2888 	}
2889 	if (sa->sa_size != (uint32_t)-1) {
2890 		vap->va_mask |= AT_SIZE;
2891 		vap->va_size = sa->sa_size;
2892 	}
2893 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2894 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2895 #ifndef _LP64
2896 		/* return error if time overflow */
2897 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2898 			return (EOVERFLOW);
2899 #endif
2900 		vap->va_mask |= AT_ATIME;
2901 		/*
2902 		 * nfs protocol defines times as unsigned so don't extend sign,
2903 		 * unless sysadmin set nfs_allow_preepoch_time.
2904 		 */
2905 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2906 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2907 	}
2908 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2909 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2910 #ifndef _LP64
2911 		/* return error if time overflow */
2912 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2913 			return (EOVERFLOW);
2914 #endif
2915 		vap->va_mask |= AT_MTIME;
2916 		/*
2917 		 * nfs protocol defines times as unsigned so don't extend sign,
2918 		 * unless sysadmin set nfs_allow_preepoch_time.
2919 		 */
2920 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2921 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2922 	}
2923 	return (0);
2924 }
2925 
2926 static enum nfsftype vt_to_nf[] = {
2927 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2928 };
2929 
2930 /*
2931  * check the following fields for overflow: nodeid, size, and time.
2932  * There could be a problem when converting 64-bit LP64 fields
2933  * into 32-bit ones.  Return an error if there is an overflow.
2934  */
2935 int
2936 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2937 {
2938 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2939 	na->na_type = vt_to_nf[vap->va_type];
2940 
2941 	if (vap->va_mode == (unsigned short) -1)
2942 		na->na_mode = (uint32_t)-1;
2943 	else
2944 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2945 
2946 	if (vap->va_uid == (unsigned short)(-1))
2947 		na->na_uid = (uint32_t)(-1);
2948 	else if (vap->va_uid == UID_NOBODY)
2949 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2950 	else
2951 		na->na_uid = vap->va_uid;
2952 
2953 	if (vap->va_gid == (unsigned short)(-1))
2954 		na->na_gid = (uint32_t)-1;
2955 	else if (vap->va_gid == GID_NOBODY)
2956 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2957 	else
2958 		na->na_gid = vap->va_gid;
2959 
2960 	/*
2961 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2962 	 * vattr, but are bigger than 32 bit values supported?
2963 	 */
2964 	na->na_fsid = vap->va_fsid;
2965 
2966 	na->na_nodeid = vap->va_nodeid;
2967 
2968 	/*
2969 	 * Check to make sure that the nodeid is representable over the
2970 	 * wire without losing bits.
2971 	 */
2972 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2973 		return (EFBIG);
2974 	na->na_nlink = vap->va_nlink;
2975 
2976 	/*
2977 	 * Check for big files here, instead of at the caller.  See
2978 	 * comments in cstat for large special file explanation.
2979 	 */
2980 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2981 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2982 			return (EFBIG);
2983 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2984 			/* UNKNOWN_SIZE | OVERFLOW */
2985 			na->na_size = MAXOFF32_T;
2986 		} else
2987 			na->na_size = vap->va_size;
2988 	} else
2989 		na->na_size = vap->va_size;
2990 
2991 	/*
2992 	 * If the vnode times overflow the 32-bit times that NFS2
2993 	 * uses on the wire then return an error.
2994 	 */
2995 	if (!NFS_VAP_TIME_OK(vap)) {
2996 		return (EOVERFLOW);
2997 	}
2998 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2999 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3000 
3001 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3002 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3003 
3004 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3005 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3006 
3007 	/*
3008 	 * If the dev_t will fit into 16 bits then compress
3009 	 * it, otherwise leave it alone. See comments in
3010 	 * nfs_client.c.
3011 	 */
3012 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3013 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3014 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3015 	else
3016 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
3017 
3018 	na->na_blocks = vap->va_nblocks;
3019 	na->na_blocksize = vap->va_blksize;
3020 
3021 	/*
3022 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3023 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
3024 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3025 	 *
3026 	 * BUYER BEWARE:
3027 	 *  If you are porting the NFS to a non-Sun server, you probably
3028 	 *  don't want to include the following block of code.  The
3029 	 *  over-the-wire special file types will be changing with the
3030 	 *  NFS Protocol Revision.
3031 	 */
3032 	if (vap->va_type == VFIFO)
3033 		NA_SETFIFO(na);
3034 	return (0);
3035 }
3036 
3037 /*
3038  * acl v2 support: returns approximate permission.
3039  *	default: returns minimal permission (more restrictive)
3040  *	aclok: returns maximal permission (less restrictive)
3041  *	This routine changes the permissions that are alaredy in *va.
3042  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3043  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3044  */
3045 static void
3046 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3047 {
3048 	vsecattr_t	vsa;
3049 	int		aclcnt;
3050 	aclent_t	*aclentp;
3051 	mode_t		mask_perm;
3052 	mode_t		grp_perm;
3053 	mode_t		other_perm;
3054 	mode_t		other_orig;
3055 	int		error;
3056 
3057 	/* dont care default acl */
3058 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3059 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
3060 
3061 	if (!error) {
3062 		aclcnt = vsa.vsa_aclcnt;
3063 		if (aclcnt > MIN_ACL_ENTRIES) {
3064 			/* non-trivial ACL */
3065 			aclentp = vsa.vsa_aclentp;
3066 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3067 				/* maximal permissions */
3068 				grp_perm = 0;
3069 				other_perm = 0;
3070 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3071 					switch (aclentp->a_type) {
3072 					case USER_OBJ:
3073 						break;
3074 					case USER:
3075 						grp_perm |=
3076 						    aclentp->a_perm << 3;
3077 						other_perm |= aclentp->a_perm;
3078 						break;
3079 					case GROUP_OBJ:
3080 						grp_perm |=
3081 						    aclentp->a_perm << 3;
3082 						break;
3083 					case GROUP:
3084 						other_perm |= aclentp->a_perm;
3085 						break;
3086 					case OTHER_OBJ:
3087 						other_orig = aclentp->a_perm;
3088 						break;
3089 					case CLASS_OBJ:
3090 						mask_perm = aclentp->a_perm;
3091 						break;
3092 					default:
3093 						break;
3094 					}
3095 				}
3096 				grp_perm &= mask_perm << 3;
3097 				other_perm &= mask_perm;
3098 				other_perm |= other_orig;
3099 
3100 			} else {
3101 				/* minimal permissions */
3102 				grp_perm = 070;
3103 				other_perm = 07;
3104 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3105 					switch (aclentp->a_type) {
3106 					case USER_OBJ:
3107 						break;
3108 					case USER:
3109 					case CLASS_OBJ:
3110 						grp_perm &=
3111 						    aclentp->a_perm << 3;
3112 						other_perm &=
3113 						    aclentp->a_perm;
3114 						break;
3115 					case GROUP_OBJ:
3116 						grp_perm &=
3117 						    aclentp->a_perm << 3;
3118 						break;
3119 					case GROUP:
3120 						other_perm &=
3121 						    aclentp->a_perm;
3122 						break;
3123 					case OTHER_OBJ:
3124 						other_perm &=
3125 						    aclentp->a_perm;
3126 						break;
3127 					default:
3128 						break;
3129 					}
3130 				}
3131 			}
3132 			/* copy to va */
3133 			va->va_mode &= ~077;
3134 			va->va_mode |= grp_perm | other_perm;
3135 		}
3136 		if (vsa.vsa_aclcnt)
3137 			kmem_free(vsa.vsa_aclentp,
3138 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3139 	}
3140 }
3141 
3142 void
3143 rfs_srvrinit(void)
3144 {
3145 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3146 }
3147 
3148 void
3149 rfs_srvrfini(void)
3150 {
3151 	mutex_destroy(&rfs_async_write_lock);
3152 }
3153