xref: /titanic_52/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 5763ba1e357fad1d57b5875c499307b7ea6e2cd4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
81 /*
82  * Some "over the wire" UNIX file types.  These are encoded
83  * into the mode.  This needs to be fixed in the next rev.
84  */
85 #define	IFMT		0170000		/* type of file */
86 #define	IFCHR		0020000		/* character special */
87 #define	IFBLK		0060000		/* block special */
88 #define	IFSOCK		0140000		/* socket */
89 
90 /*
91  * Get file attributes.
92  * Returns the current attributes of the file with the given fhandle.
93  */
94 /* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");

	/* Translate the filehandle into a held vnode; NULL means stale fh. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
		    "rfs_getattr_end:(%S)", "stale");
		return;
	}

	/*
	 * Do the getattr.  rfs4_delegated_getattr() is used rather than
	 * a bare VOP_GETATTR() so the result accounts for any NFSv4
	 * delegation held on this file.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = rfs4_delegated_getattr(vp, &va, 0, cr);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	/* check for overflows */
	if (!error) {
		/*
		 * acl_perm() may adjust the reported attributes for
		 * over-the-wire ACL semantics; see its definition.
		 */
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
}
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	/* The GETATTR argument is itself the filehandle. */
	return (fhp);
}
138 
139 /*
140  * Set file attributes.
141  * Sets the attributes of the file with the given fhandle.  Returns
142  * the new attributes.
143  */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;		/* flags passed to VOP_SETATTR (0 or ATTR_UTIME) */
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;	/* region description for VOP_SPACE */

	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");

	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "stale");
		return;
	}

	/* No attribute changes on a read-only export or filesystem. */
	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "rofs");
		return;
	}

	/* Convert the over-the-wire sattr into a vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
		    "rfs_setattr_end:(%S)", "sattr");
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 *
	 * Also(2), check to see if the v4 side of the server has
	 * delegated this file.  If so, then we set T_WOULDBLOCK
	 * so that the dispatch function doesn't send a reply, forcing
	 * the client to retransmit its request.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		/* If delegated, mark as wouldblock so response is dropped */
		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
			VN_RELE(vp);
			curthread->t_flag |= T_WOULDBLOCK;
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			    "rfs_setattr_end:(%S)", "delegated");
			return;
		}
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Fetch the current owner and size for the checks below. */
		bva.va_mask = AT_UID | AT_SIZE;
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(vp, &bva, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
			    "rfs_setattr_end:(%S)", "getattr");
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the size change runs
			 * from the smaller of the old and new sizes to
			 * the larger; check it for lock conflicts.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE path (clearing
		 * AT_SIZE first); non-owners fall through with AT_SIZE
		 * still set so VOP_SETATTR enforces access normally.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
			    "vop_space_start:");
			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
	}

	/* On success, return the resulting attributes to the client. */
	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
}
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	/* Extract the filehandle from the SETATTR arguments. */
	return (&args->saa_fh);
}
339 
340 /*
341  * Directory lookup.
342  * Returns an fhandle and file attributes for file name in a directory.
343  */
344 /* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *dvp;		/* directory being searched */
	vnode_t *vp;		/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;

	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "access");
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
			    "rfs_lookup_end:(%S)", "stale");
			return;
		}
	}

	/*
	 * Do not allow lookup beyond the root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		VN_RELE(dvp);
		dr->dr_status = NFSERR_NOENT;
		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
		    "rfs_lookup_end:(%S)", "noent");
		return;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 * Note: rfs_publicfh_mclookup() may replace exi with a
	 * new, held exportinfo for that other filesystem.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;
		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = rfs4_delegated_getattr(vp, &va, 0, cr);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				/* Build the reply filehandle for the result. */
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

	VN_RELE(dvp);

	/*
	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
	 * and have obtained a new exportinfo in exi which needs to be
	 * released. Note that the original exportinfo pointed to by exi
	 * will be released by the caller, common_dispatch.
	 */
	if (publicfh_flag && exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
}
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	/* Extract the directory filehandle from the LOOKUP arguments. */
	return (da->da_fhandle);
}
488 
489 /*
490  * Read symbolic link.
491  * Returns the string in the symbolic link at the given fhandle.
492  */
493 /* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
	struct svc_req *req, cred_t *cr)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;

	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "stale");
		return;
	}

	/* We only need the mode, for the MANDLOCK check below. */
	va.va_mask = AT_MODE;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "getattr error");
		return;
	}

	/*
	 * Refuse files with mandatory locking enabled; serving them
	 * could block an NFS service thread on a lock.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "access");
		return;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
		    "rfs_readlink_end:(%S)", "nxio");
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = rl->rl_data;
	iov.iov_len = NFS_MAXPATHLEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = NFS_MAXPATHLEN;

	/*
	 * Do the readlink.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
	error = VOP_READLINK(vp, &uio, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	/* Bytes placed in the buffer = requested length minus leftover. */
	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
	    "rfs_readlink_end:(%S)", "done");
}
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	/* The READLINK argument is itself the filehandle. */
	return (fhp);
}
609 /*
610  * Free data allocated by rfs_readlink
611  */
612 void
613 rfs_rlfree(struct nfsrdlnres *rl)
614 {
615 	if (rl->rl_data != NULL)
616 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
617 }
618 
619 /*
620  * Read data.
621  * Returns some data read from the file at the given fhandle.
622  */
623 /* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;		/* reply data; freed later by rfs_rdfree */
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */

	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "stale");
		return;
	}

	/* READ is only valid on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "isdir");
		return;
	}

	/*
	 * Check to see if the v4 side of the server has delegated
	 * this file.  If so, then we mark thread as wouldblock so
	 * the response is dropped.
	 */
	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		rr->rr_data = NULL;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "delegated");
		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", " csf access error");
			return;
		}
		in_crit = 1;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");

	if (error) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "getattr error");
		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
		error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
			    "vop_access_start:");
			error = VOP_ACCESS(vp, VEXEC, 0, cr, NULL);
			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
			    "vop_access_end:");
		}
		if (error) {
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
			    "vop_rwunlock_start:");
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
			if (in_crit)
				nbl_end_crit(vp);
			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
			    "vop_rwunlock_end:");
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);
			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
			    "rfs_read_end:(%S)", "access error");
			return;
		}
	}

	/*
	 * Refuse files with mandatory locking enabled; serving them
	 * could block an NFS service thread on a lock.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "mand lock");
		return;
	}

	/* Reads starting at or beyond EOF succeed with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		goto done;
	}

	/*
	 * mp will contain the data to be sent out in the read reply.
	 * This will be freed after the reply has been sent out (by the
	 * driver).
	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
	 * that the call to xdrmblk_putmblk() never fails.
	 */
	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
	    &alloc_err);
	ASSERT(mp != NULL);
	ASSERT(alloc_err == 0);

	rr->rr_mp = mp;

	/*
	 * Set up io vector
	 */
	iov.iov_base = (caddr_t)mp->b_datap->db_base;
	iov.iov_len = ra->ra_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
	error = VOP_READ(vp, &uio, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");

	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for his cache.
	 */
	va.va_mask = AT_ALL;
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
	if (error) {
		freeb(mp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
		    "vop_rwunlock_start:");
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
		if (in_crit)
			nbl_end_crit(vp);
		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
		    "vop_rwunlock_end:");
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
		    "rfs_read_end:(%S)", "read error");
		return;
	}

	/* Bytes actually read = requested count minus what was left over. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	rr->rr_data = (char *)mp->b_datap->db_base;

done:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	if (in_crit)
		nbl_end_crit(vp);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rr->rr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
}
882 
883 /*
884  * Free data allocated by rfs_read
885  */
886 void
887 rfs_rdfree(struct nfsrdresult *rr)
888 {
889 	mblk_t *mp;
890 
891 	if (rr->rr_status == NFS_OK) {
892 		mp = rr->rr_mp;
893 		if (mp != NULL)
894 			freeb(mp);
895 	}
896 }
897 
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	/* Extract the filehandle from the READ arguments. */
	return (&ra->ra_fhandle);
}
903 
904 #define	MAX_IOVECS	12
905 
906 #ifdef DEBUG
907 static int rfs_write_sync_hits = 0;
908 static int rfs_write_sync_misses = 0;
909 #endif
910 
911 /*
912  * Write data to file.
913  * Returns attributes of a file after writing some data to it.
914  *
915  * Any changes made here, especially in error handling might have
916  * to also be done in rfs_write (which clusters write requests).
917  */
918 void
919 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
920 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
921 {
922 	int error;
923 	vnode_t *vp;
924 	rlim64_t rlimit;
925 	struct vattr va;
926 	struct uio uio;
927 	struct iovec iov[MAX_IOVECS];
928 	mblk_t *m;
929 	struct iovec *iovp;
930 	int iovcnt;
931 	cred_t *savecred;
932 	int in_crit = 0;
933 
934 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");
935 
936 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
937 	if (vp == NULL) {
938 		ns->ns_status = NFSERR_STALE;
939 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
940 		    "rfs_write_end:(%S)", "stale");
941 		return;
942 	}
943 
944 	if (rdonly(exi, req)) {
945 		VN_RELE(vp);
946 		ns->ns_status = NFSERR_ROFS;
947 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
948 		    "rfs_write_end:(%S)", "rofs");
949 		return;
950 	}
951 
952 	if (vp->v_type != VREG) {
953 		VN_RELE(vp);
954 		ns->ns_status = NFSERR_ISDIR;
955 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
956 		    "rfs_write_end:(%S)", "isdir");
957 		return;
958 	}
959 
960 	/*
961 	 * Check to see if the v4 side of the server has delegated
962 	 * this file.  If so, then we mark thread as wouldblock so
963 	 * the response is dropped.
964 	 */
965 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
966 		VN_RELE(vp);
967 		curthread->t_flag |= T_WOULDBLOCK;
968 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
969 		    "rfs_write_end:(%S)", "delegated");
970 		return;
971 	}
972 
973 	va.va_mask = AT_UID|AT_MODE;
974 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
975 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
976 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
977 
978 	if (error) {
979 		VN_RELE(vp);
980 		ns->ns_status = puterrno(error);
981 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
982 		    "rfs_write_end:(%S)", "getattr error");
983 		return;
984 	}
985 
986 	if (crgetuid(cr) != va.va_uid) {
987 		/*
988 		 * This is a kludge to allow writes of files created
989 		 * with read only permission.  The owner of the file
990 		 * is always allowed to write it.
991 		 */
992 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
993 		error = VOP_ACCESS(vp, VWRITE, 0, cr, NULL);
994 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
995 		if (error) {
996 			VN_RELE(vp);
997 			ns->ns_status = puterrno(error);
998 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
999 			    "rfs_write_end:(%S)", "access error");
1000 			return;
1001 		}
1002 	}
1003 
1004 	/*
1005 	 * Can't access a mandatory lock file.  This might cause
1006 	 * the NFS service thread to block forever waiting for a
1007 	 * lock to be released that will never be released.
1008 	 */
1009 	if (MANDLOCK(vp, va.va_mode)) {
1010 		VN_RELE(vp);
1011 		ns->ns_status = NFSERR_ACCES;
1012 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1013 		    "rfs_write_end:(%S)", "mand lock");
1014 		return;
1015 	}
1016 
1017 	/*
1018 	 * We have to enter the critical region before calling VOP_RWLOCK
1019 	 * to avoid a deadlock with ufs.
1020 	 */
1021 	if (nbl_need_check(vp)) {
1022 		nbl_start_crit(vp, RW_READER);
1023 		in_crit = 1;
1024 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1025 		    wa->wa_count, 0, NULL)) {
1026 			error = EACCES;
1027 			goto out;
1028 		}
1029 	}
1030 
1031 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
1032 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1033 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
1034 
1035 	if (wa->wa_data) {
1036 		iov[0].iov_base = wa->wa_data;
1037 		iov[0].iov_len = wa->wa_count;
1038 		uio.uio_iov = iov;
1039 		uio.uio_iovcnt = 1;
1040 		uio.uio_segflg = UIO_SYSSPACE;
1041 		uio.uio_extflg = UIO_COPY_DEFAULT;
1042 		uio.uio_loffset = (offset_t)wa->wa_offset;
1043 		uio.uio_resid = wa->wa_count;
1044 		/*
1045 		 * The limit is checked on the client. We
1046 		 * should allow any size writes here.
1047 		 */
1048 		uio.uio_llimit = curproc->p_fsz_ctl;
1049 		rlimit = uio.uio_llimit - wa->wa_offset;
1050 		if (rlimit < (rlim64_t)uio.uio_resid)
1051 			uio.uio_resid = (uint_t)rlimit;
1052 
1053 		/*
1054 		 * for now we assume no append mode
1055 		 */
1056 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1057 		    "vop_write_start:(%S)", "sync");
1058 		/*
1059 		 * We're changing creds because VM may fault and we need
1060 		 * the cred of the current thread to be used if quota
1061 		 * checking is enabled.
1062 		 */
1063 		savecred = curthread->t_cred;
1064 		curthread->t_cred = cr;
1065 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1066 		curthread->t_cred = savecred;
1067 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1068 	} else {
1069 		iovcnt = 0;
1070 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1071 			iovcnt++;
1072 		if (iovcnt <= MAX_IOVECS) {
1073 #ifdef DEBUG
1074 			rfs_write_sync_hits++;
1075 #endif
1076 			iovp = iov;
1077 		} else {
1078 #ifdef DEBUG
1079 			rfs_write_sync_misses++;
1080 #endif
1081 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1082 		}
1083 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1084 		uio.uio_iov = iovp;
1085 		uio.uio_iovcnt = iovcnt;
1086 		uio.uio_segflg = UIO_SYSSPACE;
1087 		uio.uio_extflg = UIO_COPY_DEFAULT;
1088 		uio.uio_loffset = (offset_t)wa->wa_offset;
1089 		uio.uio_resid = wa->wa_count;
1090 		/*
1091 		 * The limit is checked on the client. We
1092 		 * should allow any size writes here.
1093 		 */
1094 		uio.uio_llimit = curproc->p_fsz_ctl;
1095 		rlimit = uio.uio_llimit - wa->wa_offset;
1096 		if (rlimit < (rlim64_t)uio.uio_resid)
1097 			uio.uio_resid = (uint_t)rlimit;
1098 
1099 		/*
1100 		 * For now we assume no append mode.
1101 		 */
1102 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1103 		    "vop_write_start:(%S)", "iov sync");
1104 		/*
1105 		 * We're changing creds because VM may fault and we need
1106 		 * the cred of the current thread to be used if quota
1107 		 * checking is enabled.
1108 		 */
1109 		savecred = curthread->t_cred;
1110 		curthread->t_cred = cr;
1111 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1112 		curthread->t_cred = savecred;
1113 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1114 
1115 		if (iovp != iov)
1116 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1117 	}
1118 
1119 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1120 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1121 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1122 
1123 	if (!error) {
1124 		/*
1125 		 * Get attributes again so we send the latest mod
1126 		 * time to the client side for his cache.
1127 		 */
1128 		va.va_mask = AT_ALL;	/* now we want everything */
1129 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1130 		error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1131 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1132 		/* check for overflows */
1133 		if (!error) {
1134 			acl_perm(vp, exi, &va, cr);
1135 			error = vattr_to_nattr(&va, &ns->ns_attr);
1136 		}
1137 	}
1138 
1139 out:
1140 	if (in_crit)
1141 		nbl_end_crit(vp);
1142 	VN_RELE(vp);
1143 
1144 	ns->ns_status = puterrno(error);
1145 
1146 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
1147 }
1148 
/*
 * One pending NFSv2 WRITE request that is a member of a write
 * cluster.  Each entry records the client's arguments, the response
 * to be filled in, and the service thread blocked waiting for the
 * cluster owner to process it (see rfs_write()).
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's write arguments */
	struct nfsattrstat *ns;		/* response; ns_status doubles as "done" flag */
	struct svc_req *req;		/* RPC request, used for rdonly() check */
	cred_t *cr;			/* credentials of this request */
	kthread_t *thread;		/* thread waiting on this request */
	struct rfs_async_write *list;	/* next request in cluster, offset order */
};
1157 
/*
 * One write cluster: all currently-queued WRITE requests for a single
 * file handle.  Clusters are kept on the singly-linked global list
 * headed by rfs_async_write_head, protected by rfs_async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by all requests */
	kcondvar_t cv;			/* broadcast when cluster is processed */
	struct rfs_async_write *list;	/* requests in this cluster */
	struct rfs_async_write_list *next;	/* next cluster on global list */
};
1164 
/* Head of the global cluster list; protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs kept on the stack before falling back to kmem_alloc(). */
#define	MAXCLIOVECS	42
/* Sentinel meaning "status not yet filled in" (0 would read as NFS_OK). */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters for how often the on-stack iovec array was big enough. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1176 
1177 /*
1178  * Write data to file.
1179  * Returns attributes of a file after writing some data to it.
1180  */
1181 void
1182 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1183 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1184 {
1185 	int error;
1186 	vnode_t *vp;
1187 	rlim64_t rlimit;
1188 	struct vattr va;
1189 	struct uio uio;
1190 	struct rfs_async_write_list *lp;
1191 	struct rfs_async_write_list *nlp;
1192 	struct rfs_async_write *rp;
1193 	struct rfs_async_write *nrp;
1194 	struct rfs_async_write *trp;
1195 	struct rfs_async_write *lrp;
1196 	int data_written;
1197 	int iovcnt;
1198 	mblk_t *m;
1199 	struct iovec *iovp;
1200 	struct iovec *niovp;
1201 	struct iovec iov[MAXCLIOVECS];
1202 	int count;
1203 	int rcount;
1204 	uint_t off;
1205 	uint_t len;
1206 	struct rfs_async_write nrpsp;
1207 	struct rfs_async_write_list nlpsp;
1208 	ushort_t t_flag;
1209 	cred_t *savecred;
1210 	int in_crit = 0;
1211 
1212 	if (!rfs_write_async) {
1213 		rfs_write_sync(wa, ns, exi, req, cr);
1214 		return;
1215 	}
1216 
1217 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1218 	    "rfs_write_start:(%S)", "async");
1219 
1220 	/*
1221 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1222 	 * is considered an OK.
1223 	 */
1224 	ns->ns_status = RFSWRITE_INITVAL;
1225 
1226 	nrp = &nrpsp;
1227 	nrp->wa = wa;
1228 	nrp->ns = ns;
1229 	nrp->req = req;
1230 	nrp->cr = cr;
1231 	nrp->thread = curthread;
1232 
1233 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1234 
1235 	/*
1236 	 * Look to see if there is already a cluster started
1237 	 * for this file.
1238 	 */
1239 	mutex_enter(&rfs_async_write_lock);
1240 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1241 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1242 		    sizeof (fhandle_t)) == 0)
1243 			break;
1244 	}
1245 
1246 	/*
1247 	 * If lp is non-NULL, then there is already a cluster
1248 	 * started.  We need to place ourselves in the cluster
1249 	 * list in the right place as determined by starting
1250 	 * offset.  Conflicts with non-blocking mandatory locked
1251 	 * regions will be checked when the cluster is processed.
1252 	 */
1253 	if (lp != NULL) {
1254 		rp = lp->list;
1255 		trp = NULL;
1256 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1257 			trp = rp;
1258 			rp = rp->list;
1259 		}
1260 		nrp->list = rp;
1261 		if (trp == NULL)
1262 			lp->list = nrp;
1263 		else
1264 			trp->list = nrp;
1265 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1266 			cv_wait(&lp->cv, &rfs_async_write_lock);
1267 		mutex_exit(&rfs_async_write_lock);
1268 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1269 		    "rfs_write_end:(%S)", "cluster child");
1270 		return;
1271 	}
1272 
1273 	/*
1274 	 * No cluster started yet, start one and add ourselves
1275 	 * to the list of clusters.
1276 	 */
1277 	nrp->list = NULL;
1278 
1279 	nlp = &nlpsp;
1280 	nlp->fhp = &wa->wa_fhandle;
1281 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1282 	nlp->list = nrp;
1283 	nlp->next = NULL;
1284 
1285 	if (rfs_async_write_head == NULL) {
1286 		rfs_async_write_head = nlp;
1287 	} else {
1288 		lp = rfs_async_write_head;
1289 		while (lp->next != NULL)
1290 			lp = lp->next;
1291 		lp->next = nlp;
1292 	}
1293 	mutex_exit(&rfs_async_write_lock);
1294 
1295 	/*
1296 	 * Convert the file handle common to all of the requests
1297 	 * in this cluster to a vnode.
1298 	 */
1299 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1300 	if (vp == NULL) {
1301 		mutex_enter(&rfs_async_write_lock);
1302 		if (rfs_async_write_head == nlp)
1303 			rfs_async_write_head = nlp->next;
1304 		else {
1305 			lp = rfs_async_write_head;
1306 			while (lp->next != nlp)
1307 				lp = lp->next;
1308 			lp->next = nlp->next;
1309 		}
1310 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1311 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1312 			rp->ns->ns_status = NFSERR_STALE;
1313 			rp->thread->t_flag |= t_flag;
1314 		}
1315 		cv_broadcast(&nlp->cv);
1316 		mutex_exit(&rfs_async_write_lock);
1317 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1318 		    "rfs_write_end:(%S)", "stale");
1319 		return;
1320 	}
1321 
1322 	/*
1323 	 * Can only write regular files.  Attempts to write any
1324 	 * other file types fail with EISDIR.
1325 	 */
1326 	if (vp->v_type != VREG) {
1327 		VN_RELE(vp);
1328 		mutex_enter(&rfs_async_write_lock);
1329 		if (rfs_async_write_head == nlp)
1330 			rfs_async_write_head = nlp->next;
1331 		else {
1332 			lp = rfs_async_write_head;
1333 			while (lp->next != nlp)
1334 				lp = lp->next;
1335 			lp->next = nlp->next;
1336 		}
1337 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1338 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1339 			rp->ns->ns_status = NFSERR_ISDIR;
1340 			rp->thread->t_flag |= t_flag;
1341 		}
1342 		cv_broadcast(&nlp->cv);
1343 		mutex_exit(&rfs_async_write_lock);
1344 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1345 		    "rfs_write_end:(%S)", "isdir");
1346 		return;
1347 	}
1348 
1349 	/*
1350 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1351 	 * deadlock with ufs.
1352 	 */
1353 	if (nbl_need_check(vp)) {
1354 		nbl_start_crit(vp, RW_READER);
1355 		in_crit = 1;
1356 	}
1357 
1358 	/*
1359 	 * Lock the file for writing.  This operation provides
1360 	 * the delay which allows clusters to grow.
1361 	 */
1362 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
1363 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1364 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");
1365 
1366 	/*
1367 	 * Disconnect this cluster from the list of clusters.
1368 	 * The cluster that is being dealt with must be fixed
1369 	 * in size after this point, so there is no reason
1370 	 * to leave it on the list so that new requests can
1371 	 * find it.
1372 	 *
1373 	 * The algorithm is that the first write request will
1374 	 * create a cluster, convert the file handle to a
1375 	 * vnode pointer, and then lock the file for writing.
1376 	 * This request is not likely to be clustered with
1377 	 * any others.  However, the next request will create
1378 	 * a new cluster and be blocked in VOP_RWLOCK while
1379 	 * the first request is being processed.  This delay
1380 	 * will allow more requests to be clustered in this
1381 	 * second cluster.
1382 	 */
1383 	mutex_enter(&rfs_async_write_lock);
1384 	if (rfs_async_write_head == nlp)
1385 		rfs_async_write_head = nlp->next;
1386 	else {
1387 		lp = rfs_async_write_head;
1388 		while (lp->next != nlp)
1389 			lp = lp->next;
1390 		lp->next = nlp->next;
1391 	}
1392 	mutex_exit(&rfs_async_write_lock);
1393 
1394 	/*
1395 	 * Step through the list of requests in this cluster.
1396 	 * We need to check permissions to make sure that all
1397 	 * of the requests have sufficient permission to write
1398 	 * the file.  A cluster can be composed of requests
1399 	 * from different clients and different users on each
1400 	 * client.
1401 	 *
1402 	 * As a side effect, we also calculate the size of the
1403 	 * byte range that this cluster encompasses.
1404 	 */
1405 	rp = nlp->list;
1406 	off = rp->wa->wa_offset;
1407 	len = (uint_t)0;
1408 	do {
1409 		if (rdonly(exi, rp->req)) {
1410 			rp->ns->ns_status = NFSERR_ROFS;
1411 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1412 			rp->thread->t_flag |= t_flag;
1413 			continue;
1414 		}
1415 
1416 		va.va_mask = AT_UID|AT_MODE;
1417 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1418 		error = VOP_GETATTR(vp, &va, 0, rp->cr, NULL);
1419 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1420 		if (!error) {
1421 			if (crgetuid(rp->cr) != va.va_uid) {
1422 				/*
1423 				 * This is a kludge to allow writes of files
1424 				 * created with read only permission.  The
1425 				 * owner of the file is always allowed to
1426 				 * write it.
1427 				 */
1428 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1429 				    "vop_access_start:");
1430 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, NULL);
1431 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1432 				    "vop_access_end:");
1433 			}
1434 			if (!error && MANDLOCK(vp, va.va_mode))
1435 				error = EACCES;
1436 		}
1437 
1438 		/*
1439 		 * Check for a conflict with a nbmand-locked region.
1440 		 */
1441 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1442 		    rp->wa->wa_count, 0, NULL)) {
1443 			error = EACCES;
1444 		}
1445 
1446 		if (error) {
1447 			rp->ns->ns_status = puterrno(error);
1448 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1449 			rp->thread->t_flag |= t_flag;
1450 			continue;
1451 		}
1452 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1453 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1454 	} while ((rp = rp->list) != NULL);
1455 
1456 	/*
1457 	 * Step through the cluster attempting to gather as many
1458 	 * requests which are contiguous as possible.  These
1459 	 * contiguous requests are handled via one call to VOP_WRITE
1460 	 * instead of different calls to VOP_WRITE.  We also keep
1461 	 * track of the fact that any data was written.
1462 	 */
1463 	rp = nlp->list;
1464 	data_written = 0;
1465 	do {
1466 		/*
1467 		 * Skip any requests which are already marked as having an
1468 		 * error.
1469 		 */
1470 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1471 			rp = rp->list;
1472 			continue;
1473 		}
1474 
1475 		/*
1476 		 * Count the number of iovec's which are required
1477 		 * to handle this set of requests.  One iovec is
1478 		 * needed for each data buffer, whether addressed
1479 		 * by wa_data or by the b_rptr pointers in the
1480 		 * mblk chains.
1481 		 */
1482 		iovcnt = 0;
1483 		lrp = rp;
1484 		for (;;) {
1485 			if (lrp->wa->wa_data)
1486 				iovcnt++;
1487 			else {
1488 				m = lrp->wa->wa_mblk;
1489 				while (m != NULL) {
1490 					iovcnt++;
1491 					m = m->b_cont;
1492 				}
1493 			}
1494 			if (lrp->list == NULL ||
1495 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1496 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1497 			    lrp->list->wa->wa_offset) {
1498 				lrp = lrp->list;
1499 				break;
1500 			}
1501 			lrp = lrp->list;
1502 		}
1503 
1504 		if (iovcnt <= MAXCLIOVECS) {
1505 #ifdef DEBUG
1506 			rfs_write_hits++;
1507 #endif
1508 			niovp = iov;
1509 		} else {
1510 #ifdef DEBUG
1511 			rfs_write_misses++;
1512 #endif
1513 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1514 		}
1515 		/*
1516 		 * Put together the scatter/gather iovecs.
1517 		 */
1518 		iovp = niovp;
1519 		trp = rp;
1520 		count = 0;
1521 		do {
1522 			if (trp->wa->wa_data) {
1523 				iovp->iov_base = trp->wa->wa_data;
1524 				iovp->iov_len = trp->wa->wa_count;
1525 				iovp++;
1526 			} else {
1527 				m = trp->wa->wa_mblk;
1528 				rcount = trp->wa->wa_count;
1529 				while (m != NULL) {
1530 					iovp->iov_base = (caddr_t)m->b_rptr;
1531 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1532 					rcount -= iovp->iov_len;
1533 					if (rcount < 0)
1534 						iovp->iov_len += rcount;
1535 					iovp++;
1536 					if (rcount <= 0)
1537 						break;
1538 					m = m->b_cont;
1539 				}
1540 			}
1541 			count += trp->wa->wa_count;
1542 			trp = trp->list;
1543 		} while (trp != lrp);
1544 
1545 		uio.uio_iov = niovp;
1546 		uio.uio_iovcnt = iovcnt;
1547 		uio.uio_segflg = UIO_SYSSPACE;
1548 		uio.uio_extflg = UIO_COPY_DEFAULT;
1549 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1550 		uio.uio_resid = count;
1551 		/*
1552 		 * The limit is checked on the client. We
1553 		 * should allow any size writes here.
1554 		 */
1555 		uio.uio_llimit = curproc->p_fsz_ctl;
1556 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1557 		if (rlimit < (rlim64_t)uio.uio_resid)
1558 			uio.uio_resid = (uint_t)rlimit;
1559 
1560 		/*
1561 		 * For now we assume no append mode.
1562 		 */
1563 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1564 		    "vop_write_start:(%S)", "async");
1565 
1566 		/*
1567 		 * Check to see if the v4 side of the server has
1568 		 * delegated this file.  If so, then we mark thread
1569 		 * as wouldblock so the response is dropped.
1570 		 */
1571 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1572 			curthread->t_flag |= T_WOULDBLOCK;
1573 			error = EACCES; /* just to have an error */
1574 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1575 			    "rfs_write_end:(%S)", "delegated");
1576 		} else {
1577 			/*
1578 			 * We're changing creds because VM may fault
1579 			 * and we need the cred of the current
1580 			 * thread to be used if quota * checking is
1581 			 * enabled.
1582 			 */
1583 			savecred = curthread->t_cred;
1584 			curthread->t_cred = cr;
1585 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1586 			curthread->t_cred = savecred;
1587 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1588 			    "vop_write_end:");
1589 		}
1590 
1591 		if (niovp != iov)
1592 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1593 
1594 		if (!error) {
1595 			data_written = 1;
1596 			/*
1597 			 * Get attributes again so we send the latest mod
1598 			 * time to the client side for his cache.
1599 			 */
1600 			va.va_mask = AT_ALL;	/* now we want everything */
1601 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1602 			    "vop_getattr_start:");
1603 			error = VOP_GETATTR(vp, &va, 0, rp->cr, NULL);
1604 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1605 			    "vop_getattr_end:");
1606 			if (!error)
1607 				acl_perm(vp, exi, &va, rp->cr);
1608 		}
1609 
1610 		/*
1611 		 * Fill in the status responses for each request
1612 		 * which was just handled.  Also, copy the latest
1613 		 * attributes in to the attribute responses if
1614 		 * appropriate.
1615 		 */
1616 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1617 		do {
1618 			rp->thread->t_flag |= t_flag;
1619 			/* check for overflows */
1620 			if (!error) {
1621 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1622 			}
1623 			rp->ns->ns_status = puterrno(error);
1624 			rp = rp->list;
1625 		} while (rp != lrp);
1626 	} while (rp != NULL);
1627 
1628 	/*
1629 	 * If any data was written at all, then we need to flush
1630 	 * the data and metadata to stable storage.
1631 	 */
1632 	if (data_written) {
1633 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
1634 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, NULL);
1635 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
1636 		if (!error) {
1637 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1638 			    "vop_fsync_start:");
1639 			error = VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1640 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
1641 		}
1642 	}
1643 
1644 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1645 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1646 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1647 
1648 	if (in_crit)
1649 		nbl_end_crit(vp);
1650 	VN_RELE(vp);
1651 
1652 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1653 	mutex_enter(&rfs_async_write_lock);
1654 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1655 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1656 			rp->ns->ns_status = puterrno(error);
1657 			rp->thread->t_flag |= t_flag;
1658 		}
1659 	}
1660 	cv_broadcast(&nlp->cv);
1661 	mutex_exit(&rfs_async_write_lock);
1662 
1663 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
1664 }
1665 
1666 void *
1667 rfs_write_getfh(struct nfswriteargs *wa)
1668 {
1669 	return (&wa->wa_fhandle);
1670 }
1671 
1672 /*
1673  * Create a file.
1674  * Creates a file with given attributes and returns those attributes
1675  * and an fhandle for the new file.
1676  */
1677 void
1678 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1679 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1680 {
1681 	int error;
1682 	int lookuperr;
1683 	int in_crit = 0;
1684 	struct vattr va;
1685 	vnode_t *vp;
1686 	vnode_t *dvp;
1687 	char *name = args->ca_da.da_name;
1688 	vnode_t *tvp = NULL;
1689 	int mode;
1690 	int lookup_ok;
1691 	bool_t trunc;
1692 
1693 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");
1694 
1695 	/*
1696 	 * Disallow NULL paths
1697 	 */
1698 	if (name == NULL || *name == '\0') {
1699 		dr->dr_status = NFSERR_ACCES;
1700 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1701 		    "rfs_create_end:(%S)", "access");
1702 		return;
1703 	}
1704 
1705 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1706 	if (dvp == NULL) {
1707 		dr->dr_status = NFSERR_STALE;
1708 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1709 		    "rfs_create_end:(%S)", "stale");
1710 		return;
1711 	}
1712 
1713 	error = sattr_to_vattr(args->ca_sa, &va);
1714 	if (error) {
1715 		dr->dr_status = puterrno(error);
1716 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1717 		    "rfs_create_end:(%S)", "sattr");
1718 		return;
1719 	}
1720 
1721 	/*
1722 	 * Must specify the mode.
1723 	 */
1724 	if (!(va.va_mask & AT_MODE)) {
1725 		VN_RELE(dvp);
1726 		dr->dr_status = NFSERR_INVAL;
1727 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1728 		    "rfs_create_end:(%S)", "no mode");
1729 		return;
1730 	}
1731 
1732 	/*
1733 	 * This is a completely gross hack to make mknod
1734 	 * work over the wire until we can wack the protocol
1735 	 */
1736 	if ((va.va_mode & IFMT) == IFCHR) {
1737 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1738 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1739 		else {
1740 			va.va_type = VCHR;
1741 			/*
1742 			 * uncompress the received dev_t
1743 			 * if the top half is zero indicating a request
1744 			 * from an `older style' OS.
1745 			 */
1746 			if ((va.va_size & 0xffff0000) == 0)
1747 				va.va_rdev = nfsv2_expdev(va.va_size);
1748 			else
1749 				va.va_rdev = (dev_t)va.va_size;
1750 		}
1751 		va.va_mask &= ~AT_SIZE;
1752 	} else if ((va.va_mode & IFMT) == IFBLK) {
1753 		va.va_type = VBLK;
1754 		/*
1755 		 * uncompress the received dev_t
1756 		 * if the top half is zero indicating a request
1757 		 * from an `older style' OS.
1758 		 */
1759 		if ((va.va_size & 0xffff0000) == 0)
1760 			va.va_rdev = nfsv2_expdev(va.va_size);
1761 		else
1762 			va.va_rdev = (dev_t)va.va_size;
1763 		va.va_mask &= ~AT_SIZE;
1764 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1765 		va.va_type = VSOCK;
1766 	} else
1767 		va.va_type = VREG;
1768 	va.va_mode &= ~IFMT;
1769 	va.va_mask |= AT_TYPE;
1770 
1771 	/*
1772 	 * Why was the choice made to use VWRITE as the mode to the
1773 	 * call to VOP_CREATE ? This results in a bug.  When a client
1774 	 * opens a file that already exists and is RDONLY, the second
1775 	 * open fails with an EACESS because of the mode.
1776 	 * bug ID 1054648.
1777 	 */
1778 	lookup_ok = 0;
1779 	mode = VWRITE;
1780 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1781 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
1782 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1783 		    NULL, NULL, NULL);
1784 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
1785 		if (!error) {
1786 			struct vattr at;
1787 
1788 			lookup_ok = 1;
1789 			at.va_mask = AT_MODE;
1790 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1791 			    "vop_getattr_start:");
1792 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1793 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1794 			    "vop_getattr_end:");
1795 			if (!error)
1796 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1797 			VN_RELE(tvp);
1798 			tvp = NULL;
1799 		}
1800 	}
1801 
1802 	if (!lookup_ok) {
1803 		if (rdonly(exi, req)) {
1804 			error = EROFS;
1805 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1806 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1807 			error = EPERM;
1808 		} else {
1809 			error = 0;
1810 		}
1811 	}
1812 
1813 	/*
1814 	 * If file size is being modified on an already existing file
1815 	 * make sure that there are no conflicting non-blocking mandatory
1816 	 * locks in the region being manipulated. Return EACCES if there
1817 	 * are conflicting locks.
1818 	 */
1819 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1820 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1821 		    NULL, NULL, NULL);
1822 
1823 		if (!lookuperr &&
1824 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1825 			VN_RELE(tvp);
1826 			curthread->t_flag |= T_WOULDBLOCK;
1827 			goto out;
1828 		}
1829 
1830 		if (!lookuperr && nbl_need_check(tvp)) {
1831 			/*
1832 			 * The file exists. Now check if it has any
1833 			 * conflicting non-blocking mandatory locks
1834 			 * in the region being changed.
1835 			 */
1836 			struct vattr bva;
1837 			u_offset_t offset;
1838 			ssize_t length;
1839 
1840 			nbl_start_crit(tvp, RW_READER);
1841 			in_crit = 1;
1842 
1843 			bva.va_mask = AT_SIZE;
1844 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1845 			if (!error) {
1846 				if (va.va_size < bva.va_size) {
1847 					offset = va.va_size;
1848 					length = bva.va_size - va.va_size;
1849 				} else {
1850 					offset = bva.va_size;
1851 					length = va.va_size - bva.va_size;
1852 				}
1853 				if (length) {
1854 					if (nbl_conflict(tvp, NBL_WRITE,
1855 					    offset, length, 0, NULL)) {
1856 						error = EACCES;
1857 					}
1858 				}
1859 			}
1860 			if (error) {
1861 				nbl_end_crit(tvp);
1862 				VN_RELE(tvp);
1863 				in_crit = 0;
1864 			}
1865 		} else if (tvp != NULL) {
1866 			VN_RELE(tvp);
1867 		}
1868 	}
1869 
1870 	if (!error) {
1871 		/*
1872 		 * If filesystem is shared with nosuid the remove any
1873 		 * setuid/setgid bits on create.
1874 		 */
1875 		if (va.va_type == VREG &&
1876 		    exi->exi_export.ex_flags & EX_NOSUID)
1877 			va.va_mode &= ~(VSUID | VSGID);
1878 
1879 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
1880 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1881 		    NULL, NULL);
1882 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");
1883 
1884 		if (!error) {
1885 
1886 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1887 				trunc = TRUE;
1888 			else
1889 				trunc = FALSE;
1890 
1891 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1892 				VN_RELE(vp);
1893 				curthread->t_flag |= T_WOULDBLOCK;
1894 				goto out;
1895 			}
1896 			va.va_mask = AT_ALL;
1897 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1898 			    "vop_getattr_start:");
1899 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1900 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1901 			    "vop_getattr_end:");
1902 			/* check for overflows */
1903 			if (!error) {
1904 				acl_perm(vp, exi, &va, cr);
1905 				error = vattr_to_nattr(&va, &dr->dr_attr);
1906 				if (!error) {
1907 					error = makefh(&dr->dr_fhandle, vp,
1908 					    exi);
1909 				}
1910 			}
1911 			/*
1912 			 * Force modified metadata out to stable storage.
1913 			 */
1914 			(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1915 			VN_RELE(vp);
1916 		}
1917 
1918 		if (in_crit) {
1919 			nbl_end_crit(tvp);
1920 			VN_RELE(tvp);
1921 		}
1922 	}
1923 
1924 	/*
1925 	 * Force modified data and metadata out to stable storage.
1926 	 */
1927 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1928 
1929 out:
1930 
1931 	VN_RELE(dvp);
1932 
1933 	dr->dr_status = puterrno(error);
1934 
1935 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
1936 }
1937 void *
1938 rfs_create_getfh(struct nfscreatargs *args)
1939 {
1940 	return (args->ca_da.da_fhandle);
1941 }
1942 
1943 /*
1944  * Remove a file.
1945  * Remove named file from parent directory.
1946  */
1947 void
1948 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1949 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1950 {
1951 	int error = 0;
1952 	vnode_t *vp;
1953 	vnode_t *targvp;
1954 	int in_crit = 0;
1955 
1956 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");
1957 
1958 	/*
1959 	 * Disallow NULL paths
1960 	 */
1961 	if (da->da_name == NULL || *da->da_name == '\0') {
1962 		*status = NFSERR_ACCES;
1963 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
1964 		    "rfs_remove_end:(%S)", "access");
1965 		return;
1966 	}
1967 
1968 	vp = nfs_fhtovp(da->da_fhandle, exi);
1969 	if (vp == NULL) {
1970 		*status = NFSERR_STALE;
1971 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
1972 		    "rfs_remove_end:(%S)", "stale");
1973 		return;
1974 	}
1975 
1976 	if (rdonly(exi, req)) {
1977 		VN_RELE(vp);
1978 		*status = NFSERR_ROFS;
1979 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
1980 		    "rfs_remove_end:(%S)", "rofs");
1981 		return;
1982 	}
1983 
1984 	/*
1985 	 * Check for a conflict with a non-blocking mandatory share reservation.
1986 	 */
1987 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1988 	    NULL, cr, NULL, NULL, NULL);
1989 	if (error != 0) {
1990 		VN_RELE(vp);
1991 		*status = puterrno(error);
1992 		return;
1993 	}
1994 
1995 	/*
1996 	 * If the file is delegated to an v4 client, then initiate
1997 	 * recall and drop this request (by setting T_WOULDBLOCK).
1998 	 * The client will eventually re-transmit the request and
1999 	 * (hopefully), by then, the v4 client will have returned
2000 	 * the delegation.
2001 	 */
2002 
2003 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2004 		VN_RELE(vp);
2005 		VN_RELE(targvp);
2006 		curthread->t_flag |= T_WOULDBLOCK;
2007 		return;
2008 	}
2009 
2010 	if (nbl_need_check(targvp)) {
2011 		nbl_start_crit(targvp, RW_READER);
2012 		in_crit = 1;
2013 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2014 			error = EACCES;
2015 			goto out;
2016 		}
2017 	}
2018 
2019 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
2020 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2021 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");
2022 
2023 	/*
2024 	 * Force modified data and metadata out to stable storage.
2025 	 */
2026 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2027 
2028 out:
2029 	if (in_crit)
2030 		nbl_end_crit(targvp);
2031 	VN_RELE(targvp);
2032 	VN_RELE(vp);
2033 
2034 	*status = puterrno(error);
2035 
2036 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
2037 }
2038 
2039 void *
2040 rfs_remove_getfh(struct nfsdiropargs *da)
2041 {
2042 	return (da->da_fhandle);
2043 }
2044 
2045 /*
2046  * rename a file
2047  * Give a file (from) a new name (to).
2048  */
2049 void
2050 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2051 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2052 {
2053 	int error = 0;
2054 	vnode_t *fromvp;
2055 	vnode_t *tovp;
2056 	struct exportinfo *to_exi;
2057 	fhandle_t *fh;
2058 	vnode_t *srcvp;
2059 	vnode_t *targvp;
2060 	int in_crit = 0;
2061 
2062 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
2063 
2064 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2065 	if (fromvp == NULL) {
2066 		*status = NFSERR_STALE;
2067 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2068 		    "rfs_rename_end:(%S)", "from stale");
2069 		return;
2070 	}
2071 
2072 	fh = args->rna_to.da_fhandle;
2073 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2074 	if (to_exi == NULL) {
2075 		VN_RELE(fromvp);
2076 		*status = NFSERR_ACCES;
2077 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2078 		    "rfs_rename_end:(%S)", "cross device");
2079 		return;
2080 	}
2081 	exi_rele(to_exi);
2082 
2083 	if (to_exi != exi) {
2084 		VN_RELE(fromvp);
2085 		*status = NFSERR_XDEV;
2086 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2087 		    "rfs_rename_end:(%S)", "from stale");
2088 		return;
2089 	}
2090 
2091 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2092 	if (tovp == NULL) {
2093 		VN_RELE(fromvp);
2094 		*status = NFSERR_STALE;
2095 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2096 		    "rfs_rename_end:(%S)", "to stale");
2097 		return;
2098 	}
2099 
2100 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2101 		VN_RELE(tovp);
2102 		VN_RELE(fromvp);
2103 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2104 		    "rfs_rename_end:(%S)", "not dir");
2105 		*status = NFSERR_NOTDIR;
2106 		return;
2107 	}
2108 
2109 	/*
2110 	 * Disallow NULL paths
2111 	 */
2112 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2113 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2114 		VN_RELE(tovp);
2115 		VN_RELE(fromvp);
2116 		*status = NFSERR_ACCES;
2117 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2118 		    "rfs_rename_end:(%S)", "access");
2119 		return;
2120 	}
2121 
2122 	if (rdonly(exi, req)) {
2123 		VN_RELE(tovp);
2124 		VN_RELE(fromvp);
2125 		*status = NFSERR_ROFS;
2126 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2127 		    "rfs_rename_end:(%S)", "rofs");
2128 		return;
2129 	}
2130 
2131 	/*
2132 	 * Check for a conflict with a non-blocking mandatory share reservation.
2133 	 */
2134 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2135 	    NULL, cr, NULL, NULL, NULL);
2136 	if (error != 0) {
2137 		VN_RELE(tovp);
2138 		VN_RELE(fromvp);
2139 		*status = puterrno(error);
2140 		return;
2141 	}
2142 
2143 	/* Check for delegations on the source file */
2144 
2145 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2146 		VN_RELE(tovp);
2147 		VN_RELE(fromvp);
2148 		VN_RELE(srcvp);
2149 		curthread->t_flag |= T_WOULDBLOCK;
2150 		return;
2151 	}
2152 
2153 	/* Check for delegation on the file being renamed over, if it exists */
2154 
2155 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2156 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2157 	    NULL, NULL, NULL) == 0) {
2158 
2159 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2160 			VN_RELE(tovp);
2161 			VN_RELE(fromvp);
2162 			VN_RELE(srcvp);
2163 			VN_RELE(targvp);
2164 			curthread->t_flag |= T_WOULDBLOCK;
2165 			return;
2166 		}
2167 		VN_RELE(targvp);
2168 	}
2169 
2170 
2171 	if (nbl_need_check(srcvp)) {
2172 		nbl_start_crit(srcvp, RW_READER);
2173 		in_crit = 1;
2174 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2175 			error = EACCES;
2176 			goto out;
2177 		}
2178 	}
2179 
2180 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
2181 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2182 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2183 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");
2184 
2185 	if (error == 0) {
2186 		char *tmp;
2187 
2188 		/* fix the path name for the renamed file */
2189 		mutex_enter(&srcvp->v_lock);
2190 		tmp = srcvp->v_path;
2191 		srcvp->v_path = NULL;
2192 		mutex_exit(&srcvp->v_lock);
2193 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2194 		    strlen(args->rna_to.da_name));
2195 		if (tmp != NULL)
2196 			kmem_free(tmp, strlen(tmp) + 1);
2197 	}
2198 
2199 	/*
2200 	 * Force modified data and metadata out to stable storage.
2201 	 */
2202 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2203 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2204 
2205 out:
2206 	if (in_crit)
2207 		nbl_end_crit(srcvp);
2208 	VN_RELE(srcvp);
2209 	VN_RELE(tovp);
2210 	VN_RELE(fromvp);
2211 
2212 	*status = puterrno(error);
2213 
2214 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
2215 }
2216 void *
2217 rfs_rename_getfh(struct nfsrnmargs *args)
2218 {
2219 	return (args->rna_from.da_fhandle);
2220 }
2221 
2222 /*
2223  * Link to a file.
2224  * Create a file (to) which is a hard link to the given file (from).
2225  */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *fromvp;	/* existing file that gets the new link */
	vnode_t *tovp;		/* directory that receives the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");

	/* Translate the source file handle into a vnode. */
	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "from stale");
		return;
	}

	/*
	 * Find the export the target handle belongs to (the fid is
	 * taken from the fh_xlen offset of the handle, per the cast).
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}
	/* Only the pointer identity is needed below, so drop the hold now. */
	exi_rele(to_exi);

	/* Hard links may not cross exported file systems. */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "cross device");
		return;
	}

	/* Translate the target directory handle into a vnode. */
	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "to stale");
		return;
	}

	/* The target handle must name a directory. */
	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "not dir");
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "access");
		return;
	}

	/* Refuse modification of read-only exports. */
	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
		    "rfs_link_end:(%S)", "rofs");
		return;
	}

	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
}
2320 void *
2321 rfs_link_getfh(struct nfslinkargs *args)
2322 {
2323 	return (args->la_from);
2324 }
2325 
/*
 * Symbolically link to a file.
 * Create a symbolic link with the given name and attributes in the
 * given directory, whose contents are the given (target) path name.
 */
void
rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *vp;	/* directory in which the symlink is created */
	vnode_t *svp;	/* the symlink itself, looked up after creation */
	int lerror;	/* lookup error; deliberately not reflected in *status */

	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");

	/*
	 * Disallow NULL paths
	 */
	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "access");
		return;
	}

	/* Translate the directory file handle into a vnode. */
	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "stale");
		return;
	}

	/* Refuse modification of read-only exports. */
	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "rofs");
		return;
	}

	/* Convert over-the-wire attributes; fails (EOVERFLOW) on bad times. */
	error = sattr_to_vattr(args->sla_sa, &va);
	if (error) {
		VN_RELE(vp);
		*status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "sattr");
		return;
	}

	/* A mode is mandatory for symlink creation. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		*status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
		    "rfs_symlink_end:(%S)", "no mode");
		return;
	}

	va.va_type = VLNK;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
	    NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");

	/*
	 * Force new data and metadata out to stable storage.
	 * The new link is looked up by name since VOP_SYMLINK does not
	 * return the created vnode; a lookup failure is ignored here.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
	    0, NULL, cr, NULL, NULL, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
	if (!lerror) {
		(void) VOP_FSYNC(svp, 0, cr, NULL);
		VN_RELE(svp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/* Report only the VOP_SYMLINK result; the fsync pass is best-effort. */
	*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
}
2417 void *
2418 rfs_symlink_getfh(struct nfsslargs *args)
2419 {
2420 	return (args->sla_from.da_fhandle);
2421 }
2422 
2423 /*
2424  * Make a directory.
2425  * Create a directory with the given name, parent directory, and attributes.
2426  * Returns a file handle and attributes for the new directory.
2427  */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "access");
		return;
	}

	/* Translate the parent directory file handle into a vnode. */
	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "stale");
		return;
	}

	/* Refuse modification of read-only exports. */
	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "rofs");
		return;
	}

	/* Convert over-the-wire attributes; fails (EOVERFLOW) on bad times. */
	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "sattr");
		return;
	}

	/* A mode is mandatory for directory creation. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
		    "rfs_mkdir_end:(%S)", "no mode");
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is handed vp (the
			 * parent directory) rather than dvp (the new
			 * directory), so the ACL consulted to adjust
			 * the reported mode is the parent's -- confirm
			 * whether that is intentional.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
}
2525 void *
2526 rfs_mkdir_getfh(struct nfscreatargs *args)
2527 {
2528 	return (args->ca_da.da_fhandle);
2529 }
2530 
2531 /*
2532  * Remove a directory.
2533  * Remove the given directory name from the given parent directory.
2534  */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	vnode_t *vp;	/* the parent directory */

	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "access");
		return;
	}

	/* Translate the parent directory file handle into a vnode. */
	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "stale");
		return;
	}

	/* Refuse modification of read-only exports. */
	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
		    "rfs_rmdir_end:(%S)", "rofs");
		return;
	}

	/*
	 * VOP_RMDIR now takes a new third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
}
2603 void *
2604 rfs_rmdir_getfh(struct nfsdiropargs *da)
2605 {
2606 	return (da->da_fhandle);
2607 }
2608 
2609 /* ARGSUSED */
/*
 * Read directory entries for the NFS v2 READDIR call.  Fills in
 * rd->rd_entries (freed later by rfs_rddirfree), rd->rd_size,
 * rd->rd_eof and rd->rd_status.
 */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;

	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");

	/* Translate the file handle into a vnode. */
	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "stale");
		return;
	}

	/* READDIR is only valid on directories. */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
		    "rfs_readdir_end:(%S)", "notdir");
		return;
	}

	/* Hold the directory's rwlock (reader side) across the read. */
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");

	/* The caller must have read permission on the directory. */
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-length request yields an empty reply (error stays 0). */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the reply size to the NFS v2 maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: report EOF with no entries. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

bad:
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
}
2726 void *
2727 rfs_readdir_getfh(struct nfsrddirargs *rda)
2728 {
2729 	return (&rda->rda_fh);
2730 }
2731 void
2732 rfs_rddirfree(struct nfsrddirres *rd)
2733 {
2734 	if (rd->rd_entries != NULL)
2735 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2736 }
2737 
2738 /* ARGSUSED */
2739 void
2740 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2741 	struct svc_req *req, cred_t *cr)
2742 {
2743 	int error;
2744 	struct statvfs64 sb;
2745 	vnode_t *vp;
2746 
2747 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");
2748 
2749 	vp = nfs_fhtovp(fh, exi);
2750 	if (vp == NULL) {
2751 		fs->fs_status = NFSERR_STALE;
2752 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2753 		    "rfs_statfs_end:(%S)", "stale");
2754 		return;
2755 	}
2756 
2757 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2758 
2759 	if (!error) {
2760 		fs->fs_tsize = nfstsize();
2761 		fs->fs_bsize = sb.f_frsize;
2762 		fs->fs_blocks = sb.f_blocks;
2763 		fs->fs_bfree = sb.f_bfree;
2764 		fs->fs_bavail = sb.f_bavail;
2765 	}
2766 
2767 	VN_RELE(vp);
2768 
2769 	fs->fs_status = puterrno(error);
2770 
2771 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
2772 }
2773 void *
2774 rfs_statfs_getfh(fhandle_t *fh)
2775 {
2776 	return (fh);
2777 }
2778 
/*
 * Convert an NFS v2 sattr (settable attributes from the wire) into a
 * vattr, setting va_mask bits only for fields the client actually
 * supplied -- an all-ones value on the wire means "not set".
 * Returns 0, or EOVERFLOW (32-bit kernels only) when a supplied time
 * does not fit in a 32-bit time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2845 
/*
 * Translation table from vnode type (vtype_t, used as the index,
 * VNON through VBAD -- see the ASSERT in vattr_to_nattr) to the
 * NFS v2 over-the-wire file type.  Vnode types with no NFS v2
 * representation map to 0.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2849 
2850 /*
2851  * check the following fields for overflow: nodeid, size, and time.
2852  * There could be a problem when converting 64-bit LP64 fields
2853  * into 32-bit ones.  Return an error if there is an overflow.
2854  */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	/* va_type indexes vt_to_nf; it must be within the table's range. */
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* Propagate the "no mode" sentinel; else merge in the type bits. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map the local "nobody" ids to the over-the-wire nobody values. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* vattr times are nanoseconds; the v2 wire format is microseconds */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2956 
/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 *	This routine changes the permissions that are already in *va.
 *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 *	CLASS_OBJ is always the same as GROUP_OBJ entry.
 */
2965 static void
2966 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2967 {
2968 	vsecattr_t	vsa;
2969 	int		aclcnt;
2970 	aclent_t	*aclentp;
2971 	mode_t		mask_perm;
2972 	mode_t		grp_perm;
2973 	mode_t		other_perm;
2974 	mode_t		other_orig;
2975 	int		error;
2976 
2977 	/* dont care default acl */
2978 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2979 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2980 
2981 	if (!error) {
2982 		aclcnt = vsa.vsa_aclcnt;
2983 		if (aclcnt > MIN_ACL_ENTRIES) {
2984 			/* non-trivial ACL */
2985 			aclentp = vsa.vsa_aclentp;
2986 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2987 				/* maximal permissions */
2988 				grp_perm = 0;
2989 				other_perm = 0;
2990 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2991 					switch (aclentp->a_type) {
2992 					case USER_OBJ:
2993 						break;
2994 					case USER:
2995 						grp_perm |=
2996 						    aclentp->a_perm << 3;
2997 						other_perm |= aclentp->a_perm;
2998 						break;
2999 					case GROUP_OBJ:
3000 						grp_perm |=
3001 						    aclentp->a_perm << 3;
3002 						break;
3003 					case GROUP:
3004 						other_perm |= aclentp->a_perm;
3005 						break;
3006 					case OTHER_OBJ:
3007 						other_orig = aclentp->a_perm;
3008 						break;
3009 					case CLASS_OBJ:
3010 						mask_perm = aclentp->a_perm;
3011 						break;
3012 					default:
3013 						break;
3014 					}
3015 				}
3016 				grp_perm &= mask_perm << 3;
3017 				other_perm &= mask_perm;
3018 				other_perm |= other_orig;
3019 
3020 			} else {
3021 				/* minimal permissions */
3022 				grp_perm = 070;
3023 				other_perm = 07;
3024 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3025 					switch (aclentp->a_type) {
3026 					case USER_OBJ:
3027 						break;
3028 					case USER:
3029 					case CLASS_OBJ:
3030 						grp_perm &=
3031 						    aclentp->a_perm << 3;
3032 						other_perm &=
3033 						    aclentp->a_perm;
3034 						break;
3035 					case GROUP_OBJ:
3036 						grp_perm &=
3037 						    aclentp->a_perm << 3;
3038 						break;
3039 					case GROUP:
3040 						other_perm &=
3041 						    aclentp->a_perm;
3042 						break;
3043 					case OTHER_OBJ:
3044 						other_perm &=
3045 						    aclentp->a_perm;
3046 						break;
3047 					default:
3048 						break;
3049 					}
3050 				}
3051 			}
3052 			/* copy to va */
3053 			va->va_mode &= ~077;
3054 			va->va_mode |= grp_perm | other_perm;
3055 		}
3056 		if (vsa.vsa_aclcnt)
3057 			kmem_free(vsa.vsa_aclentp,
3058 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3059 	}
3060 }
3061 
/*
 * NFS v2 server module initialization: create rfs_async_write_lock.
 * Torn down by rfs_srvrfini().
 */
void
rfs_srvrinit(void)
{
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
}
3067 
/*
 * NFS v2 server module teardown: destroy the mutex created by
 * rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
3073