xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision cbcdac8f95bd87f5b3243184b35ed7bd79fc1533)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
81 /*
82  * Some "over the wire" UNIX file types.  These are encoded
83  * into the mode.  This needs to be fixed in the next rev.
84  */
85 #define	IFMT		0170000		/* type of file */
86 #define	IFCHR		0020000		/* character special */
87 #define	IFBLK		0060000		/* block special */
88 #define	IFSOCK		0140000		/* socket */
89 
90 /*
91  * Get file attributes.
92  * Returns the current attributes of the file with the given fhandle.
93  */
94 /* ARGSUSED */
95 void
96 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
97 	struct svc_req *req, cred_t *cr)
98 {
99 	int error;
100 	vnode_t *vp;
101 	struct vattr va;
102 
103 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START,
104 		"rfs_getattr_start:");
105 
106 	vp = nfs_fhtovp(fhp, exi);
107 	if (vp == NULL) {
108 		ns->ns_status = NFSERR_STALE;
109 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
110 			"rfs_getattr_end:(%S)", "stale");
111 		return;
112 	}
113 
114 	/*
115 	 * Do the getattr.
116 	 */
117 	va.va_mask = AT_ALL;	/* we want all the attributes */
118 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
119 		"vop_getattr_start:");
120 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
121 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
122 		"vop_getattr_end:");
123 
124 	/* check for overflows */
125 	if (!error) {
126 		acl_perm(vp, exi, &va, cr);
127 		error = vattr_to_nattr(&va, &ns->ns_attr);
128 	}
129 
130 	VN_RELE(vp);
131 
132 	ns->ns_status = puterrno(error);
133 
134 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
135 		"rfs_getattr_end:(%S)", "done");
136 }
137 void *
138 rfs_getattr_getfh(fhandle_t *fhp)
139 {
140 	return (fhp);
141 }
142 
143 /*
144  * Set file attributes.
145  * Sets the attributes of the file with the given fhandle.  Returns
146  * the new attributes.
147  */
148 void
149 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
150 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
151 {
152 	int error;
153 	int flag;
154 	int in_crit = 0;
155 	vnode_t *vp;
156 	struct vattr va;
157 	struct vattr bva;
158 	struct flock64 bf;
159 
160 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START,
161 		"rfs_setattr_start:");
162 
163 	vp = nfs_fhtovp(&args->saa_fh, exi);
164 	if (vp == NULL) {
165 		ns->ns_status = NFSERR_STALE;
166 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
167 			"rfs_setattr_end:(%S)", "stale");
168 		return;
169 	}
170 
171 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
172 		VN_RELE(vp);
173 		ns->ns_status = NFSERR_ROFS;
174 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
175 			"rfs_setattr_end:(%S)", "rofs");
176 		return;
177 	}
178 
179 	error = sattr_to_vattr(&args->saa_sa, &va);
180 	if (error) {
181 		VN_RELE(vp);
182 		ns->ns_status = puterrno(error);
183 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
184 			"rfs_setattr_end:(%S)", "sattr");
185 		return;
186 	}
187 
188 	/*
189 	 * If the client is requesting a change to the mtime,
190 	 * but the nanosecond field is set to 1 billion, then
191 	 * this is a flag to the server that it should set the
192 	 * atime and mtime fields to the server's current time.
193 	 * The 1 billion number actually came from the client
194 	 * as 1 million, but the units in the over the wire
195 	 * request are microseconds instead of nanoseconds.
196 	 *
197 	 * This is an overload of the protocol and should be
198 	 * documented in the NFS Version 2 protocol specification.
199 	 */
200 	if (va.va_mask & AT_MTIME) {
201 		if (va.va_mtime.tv_nsec == 1000000000) {
202 			gethrestime(&va.va_mtime);
203 			va.va_atime = va.va_mtime;
204 			va.va_mask |= AT_ATIME;
205 			flag = 0;
206 		} else
207 			flag = ATTR_UTIME;
208 	} else
209 		flag = 0;
210 
211 	/*
212 	 * If the filesystem is exported with nosuid, then mask off
213 	 * the setuid and setgid bits.
214 	 */
215 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
216 	    (exi->exi_export.ex_flags & EX_NOSUID))
217 		va.va_mode &= ~(VSUID | VSGID);
218 
219 	/*
220 	 * We need to specially handle size changes because it is
221 	 * possible for the client to create a file with modes
222 	 * which indicate read-only, but with the file opened for
223 	 * writing.  If the client then tries to set the size of
224 	 * the file, then the normal access checking done in
225 	 * VOP_SETATTR would prevent the client from doing so,
226 	 * although it should be legal for it to do so.  To get
227 	 * around this, we do the access checking for ourselves
228 	 * and then use VOP_SPACE which doesn't do the access
229 	 * checking which VOP_SETATTR does. VOP_SPACE can only
230 	 * operate on VREG files, let VOP_SETATTR handle the other
231 	 * extremely rare cases.
232 	 * Also the client should not be allowed to change the
233 	 * size of the file if there is a conflicting non-blocking
234 	 * mandatory lock in the region of change.
235 	 *
236 	 * Also(2), check to see if the v4 side of the server has
237 	 * delegated this file.  If so, then we set T_WOULDBLOCK
238 	 * so that the dispatch function dosn't send a reply, forcing
239 	 * the client to retrasmit its request.
240 	 */
241 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
242 		/* If delegated, mark as wouldblock so response is dropped */
243 		if (rfs4_check_delegated(FWRITE, vp, TRUE)) {
244 			VN_RELE(vp);
245 			curthread->t_flag |= T_WOULDBLOCK;
246 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
247 				"rfs_setattr_end:(%S)", "delegated");
248 			return;
249 		}
250 		if (nbl_need_check(vp)) {
251 			nbl_start_crit(vp, RW_READER);
252 			in_crit = 1;
253 		}
254 
255 		bva.va_mask = AT_UID | AT_SIZE;
256 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
257 			"vop_getattr_start:");
258 		error = VOP_GETATTR(vp, &bva, 0, cr);
259 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
260 			"vop_getattr_end:");
261 		if (error) {
262 			if (in_crit)
263 				nbl_end_crit(vp);
264 			VN_RELE(vp);
265 			ns->ns_status = puterrno(error);
266 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
267 				"rfs_setattr_end:(%S)", "getattr");
268 			return;
269 		}
270 
271 		if (in_crit) {
272 			u_offset_t offset;
273 			ssize_t length;
274 
275 			if (va.va_size < bva.va_size) {
276 				offset = va.va_size;
277 				length = bva.va_size - va.va_size;
278 			} else {
279 				offset = bva.va_size;
280 				length = va.va_size - bva.va_size;
281 			}
282 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0)) {
283 				error = EACCES;
284 			}
285 		}
286 
287 		if (crgetuid(cr) == bva.va_uid && !error &&
288 			va.va_size != bva.va_size) {
289 			va.va_mask &= ~AT_SIZE;
290 			bf.l_type = F_WRLCK;
291 			bf.l_whence = 0;
292 			bf.l_start = (off64_t)va.va_size;
293 			bf.l_len = 0;
294 			bf.l_sysid = 0;
295 			bf.l_pid = 0;
296 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
297 				"vop_space_start:");
298 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
299 					(offset_t)va.va_size, cr, NULL);
300 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END,
301 				"vop_space_end:");
302 		}
303 		if (in_crit)
304 			nbl_end_crit(vp);
305 	} else
306 		error = 0;
307 
308 	/*
309 	 * Do the setattr.
310 	 */
311 	if (!error && va.va_mask) {
312 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START,
313 			"vop_setattr_start:");
314 		error = VOP_SETATTR(vp, &va, flag, cr, NULL);
315 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END,
316 			"vop_setattr_end:");
317 	}
318 
319 	if (!error) {
320 		va.va_mask = AT_ALL;	/* get everything */
321 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
322 			"vop_getattr_start:");
323 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
324 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
325 			"vop_getattr_end:");
326 
327 		/* check for overflows */
328 		if (!error) {
329 			acl_perm(vp, exi, &va, cr);
330 			error = vattr_to_nattr(&va, &ns->ns_attr);
331 		}
332 	}
333 
334 	/*
335 	 * Force modified metadata out to stable storage.
336 	 */
337 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
338 
339 	VN_RELE(vp);
340 
341 	ns->ns_status = puterrno(error);
342 
343 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
344 		"rfs_setattr_end:(%S)", "done");
345 }
346 void *
347 rfs_setattr_getfh(struct nfssaargs *args)
348 {
349 	return (&args->saa_fh);
350 }
351 
352 /*
353  * Directory lookup.
354  * Returns an fhandle and file attributes for file name in a directory.
355  */
356 /* ARGSUSED */
357 void
358 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
359 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
360 {
361 	int error;
362 	vnode_t *dvp;
363 	vnode_t *vp;
364 	struct vattr va;
365 	fhandle_t *fhp = da->da_fhandle;
366 	struct sec_ol sec = {0, 0};
367 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
368 
369 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START,
370 		"rfs_lookup_start:");
371 
372 	/*
373 	 * Trusted Extension doesn't support NFSv2. MOUNT
374 	 * will reject v2 clients. Need to prevent v2 client
375 	 * access via WebNFS here.
376 	 */
377 	if (is_system_labeled() && req->rq_vers == 2) {
378 		dr->dr_status = NFSERR_ACCES;
379 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
380 			"rfs_lookup_end:(%S)", "access");
381 		return;
382 	}
383 
384 	/*
385 	 * Disallow NULL paths
386 	 */
387 	if (da->da_name == NULL || *da->da_name == '\0') {
388 		dr->dr_status = NFSERR_ACCES;
389 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
390 			"rfs_lookup_end:(%S)", "access");
391 		return;
392 	}
393 
394 	/*
395 	 * Allow lookups from the root - the default
396 	 * location of the public filehandle.
397 	 */
398 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
399 		dvp = rootdir;
400 		VN_HOLD(dvp);
401 	} else {
402 		dvp = nfs_fhtovp(fhp, exi);
403 		if (dvp == NULL) {
404 			dr->dr_status = NFSERR_STALE;
405 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
406 				"rfs_lookup_end:(%S)", "stale");
407 			return;
408 		}
409 	}
410 
411 	/*
412 	 * Not allow lookup beyond root.
413 	 * If the filehandle matches a filehandle of the exi,
414 	 * then the ".." refers beyond the root of an exported filesystem.
415 	 */
416 	if (strcmp(da->da_name, "..") == 0 &&
417 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
418 		VN_RELE(dvp);
419 		dr->dr_status = NFSERR_NOENT;
420 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
421 			"rfs_lookup_end:(%S)", "noent");
422 		return;
423 	}
424 
425 	/*
426 	 * If the public filehandle is used then allow
427 	 * a multi-component lookup, i.e. evaluate
428 	 * a pathname and follow symbolic links if
429 	 * necessary.
430 	 *
431 	 * This may result in a vnode in another filesystem
432 	 * which is OK as long as the filesystem is exported.
433 	 */
434 	if (PUBLIC_FH2(fhp)) {
435 		publicfh_flag = TRUE;
436 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
437 					&sec);
438 	} else {
439 		/*
440 		 * Do a normal single component lookup.
441 		 */
442 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
443 			"vop_lookup_start:");
444 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr);
445 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
446 			"vop_lookup_end:");
447 	}
448 
449 	if (!error) {
450 		va.va_mask = AT_ALL;	/* we want everything */
451 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
452 			"vop_getattr_start:");
453 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
454 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
455 			"vop_getattr_end:");
456 		/* check for overflows */
457 		if (!error) {
458 			acl_perm(vp, exi, &va, cr);
459 			error = vattr_to_nattr(&va, &dr->dr_attr);
460 			if (!error) {
461 				if (sec.sec_flags & SEC_QUERY)
462 					error = makefh_ol(&dr->dr_fhandle, exi,
463 							sec.sec_index);
464 				else {
465 					error = makefh(&dr->dr_fhandle, vp,
466 								exi);
467 					if (!error && publicfh_flag &&
468 						!chk_clnt_sec(exi, req))
469 						auth_weak = TRUE;
470 				}
471 			}
472 		}
473 		VN_RELE(vp);
474 	}
475 
476 	VN_RELE(dvp);
477 
478 	/*
479 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
480 	 * and have obtained a new exportinfo in exi which needs to be
481 	 * released. Note the the original exportinfo pointed to by exi
482 	 * will be released by the caller, comon_dispatch.
483 	 */
484 	if (publicfh_flag && exi != NULL)
485 		exi_rele(exi);
486 
487 	/*
488 	 * If it's public fh, no 0x81, and client's flavor is
489 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
490 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
491 	 */
492 	if (auth_weak)
493 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
494 	else
495 		dr->dr_status = puterrno(error);
496 
497 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
498 		"rfs_lookup_end:(%S)", "done");
499 }
500 void *
501 rfs_lookup_getfh(struct nfsdiropargs *da)
502 {
503 	return (da->da_fhandle);
504 }
505 
506 /*
507  * Read symbolic link.
508  * Returns the string in the symbolic link at the given fhandle.
509  */
510 /* ARGSUSED */
511 void
512 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
513 	struct svc_req *req, cred_t *cr)
514 {
515 	int error;
516 	struct iovec iov;
517 	struct uio uio;
518 	vnode_t *vp;
519 	struct vattr va;
520 
521 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START,
522 		"rfs_readlink_start:");
523 
524 	vp = nfs_fhtovp(fhp, exi);
525 	if (vp == NULL) {
526 		rl->rl_data = NULL;
527 		rl->rl_status = NFSERR_STALE;
528 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
529 			"rfs_readlink_end:(%S)", "stale");
530 		return;
531 	}
532 
533 	va.va_mask = AT_MODE;
534 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
535 		"vop_getattr_start:");
536 	error = VOP_GETATTR(vp, &va, 0, cr);
537 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
538 		"vop_getattr_end:");
539 
540 	if (error) {
541 		VN_RELE(vp);
542 		rl->rl_data = NULL;
543 		rl->rl_status = puterrno(error);
544 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
545 			"rfs_readlink_end:(%S)", "getattr error");
546 		return;
547 	}
548 
549 	if (MANDLOCK(vp, va.va_mode)) {
550 		VN_RELE(vp);
551 		rl->rl_data = NULL;
552 		rl->rl_status = NFSERR_ACCES;
553 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
554 			"rfs_readlink_end:(%S)", "access");
555 		return;
556 	}
557 
558 	/*
559 	 * XNFS and RFC1094 require us to return ENXIO if argument
560 	 * is not a link. BUGID 1138002.
561 	 */
562 	if (vp->v_type != VLNK) {
563 		VN_RELE(vp);
564 		rl->rl_data = NULL;
565 		rl->rl_status = NFSERR_NXIO;
566 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
567 			"rfs_readlink_end:(%S)", "nxio");
568 		return;
569 	}
570 
571 	/*
572 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
573 	 */
574 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
575 
576 	/*
577 	 * Set up io vector to read sym link data
578 	 */
579 	iov.iov_base = rl->rl_data;
580 	iov.iov_len = NFS_MAXPATHLEN;
581 	uio.uio_iov = &iov;
582 	uio.uio_iovcnt = 1;
583 	uio.uio_segflg = UIO_SYSSPACE;
584 	uio.uio_extflg = UIO_COPY_CACHED;
585 	uio.uio_loffset = (offset_t)0;
586 	uio.uio_resid = NFS_MAXPATHLEN;
587 
588 	/*
589 	 * Do the readlink.
590 	 */
591 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START,
592 		"vop_readlink_start:");
593 	error = VOP_READLINK(vp, &uio, cr);
594 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END,
595 		"vop_readlink_end:");
596 
597 #if 0 /* notyet */
598 	/*
599 	 * Don't do this.  It causes local disk writes when just
600 	 * reading the file and the overhead is deemed larger
601 	 * than the benefit.
602 	 */
603 	/*
604 	 * Force modified metadata out to stable storage.
605 	 */
606 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
607 #endif
608 
609 	VN_RELE(vp);
610 
611 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
612 
613 	/*
614 	 * XNFS and RFC1094 require us to return ENXIO if argument
615 	 * is not a link. UFS returns EINVAL if this is the case,
616 	 * so we do the mapping here. BUGID 1138002.
617 	 */
618 	if (error == EINVAL)
619 		rl->rl_status = NFSERR_NXIO;
620 	else
621 		rl->rl_status = puterrno(error);
622 
623 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
624 		"rfs_readlink_end:(%S)", "done");
625 }
626 void *
627 rfs_readlink_getfh(fhandle_t *fhp)
628 {
629 	return (fhp);
630 }
631 /*
632  * Free data allocated by rfs_readlink
633  */
634 void
635 rfs_rlfree(struct nfsrdlnres *rl)
636 {
637 	if (rl->rl_data != NULL)
638 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
639 }
640 
641 /*
642  * Read data.
643  * Returns some data read from the file at the given fhandle.
644  */
645 /* ARGSUSED */
646 void
647 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
648 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
649 {
650 	vnode_t *vp;
651 	int error;
652 	struct vattr va;
653 	struct iovec iov;
654 	struct uio uio;
655 	mblk_t *mp;
656 	int alloc_err = 0;
657 	int in_crit = 0;
658 
659 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START,
660 		"rfs_read_start:");
661 
662 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
663 	if (vp == NULL) {
664 		rr->rr_data = NULL;
665 		rr->rr_status = NFSERR_STALE;
666 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
667 			"rfs_read_end:(%S)", "stale");
668 		return;
669 	}
670 
671 	if (vp->v_type != VREG) {
672 		VN_RELE(vp);
673 		rr->rr_data = NULL;
674 		rr->rr_status = NFSERR_ISDIR;
675 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
676 			"rfs_read_end:(%S)", "isdir");
677 		return;
678 	}
679 
680 	/*
681 	 * Check to see if the v4 side of the server has delegated
682 	 * this file.  If so, then we mark thread as wouldblock so
683 	 * the response is dropped.
684 	 */
685 	if (rfs4_check_delegated(FREAD, vp, FALSE)) {
686 		VN_RELE(vp);
687 		curthread->t_flag |= T_WOULDBLOCK;
688 		rr->rr_data = NULL;
689 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
690 			"rfs_read_end:(%S)", "delegated");
691 		return;
692 	}
693 
694 	/*
695 	 * Enter the critical region before calling VOP_RWLOCK
696 	 * to avoid a deadlock with write requests.
697 	 */
698 	if (nbl_need_check(vp)) {
699 		nbl_start_crit(vp, RW_READER);
700 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
701 		    0)) {
702 			nbl_end_crit(vp);
703 			VN_RELE(vp);
704 			rr->rr_data = NULL;
705 			rr->rr_status = NFSERR_ACCES;
706 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
707 				"rfs_read_end:(%S)", " csf access error");
708 			return;
709 		}
710 		in_crit = 1;
711 	}
712 
713 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
714 		"vop_rwlock_start:");
715 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
716 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
717 		"vop_rwlock_end:");
718 
719 	va.va_mask = AT_ALL;
720 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
721 		"vop_getattr_start:");
722 	error = VOP_GETATTR(vp, &va, 0, cr);
723 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
724 		"vop_getattr_end:");
725 
726 	if (error) {
727 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
728 			"vop_rwunlock_start:");
729 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
730 		if (in_crit)
731 			nbl_end_crit(vp);
732 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
733 			"vop_rwunlock_end:");
734 		VN_RELE(vp);
735 		rr->rr_data = NULL;
736 		rr->rr_status = puterrno(error);
737 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
738 			"rfs_read_end:(%S)", "getattr error");
739 		return;
740 	}
741 
742 	/*
743 	 * This is a kludge to allow reading of files created
744 	 * with no read permission.  The owner of the file
745 	 * is always allowed to read it.
746 	 */
747 	if (crgetuid(cr) != va.va_uid) {
748 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
749 			"vop_access_start:");
750 		error = VOP_ACCESS(vp, VREAD, 0, cr);
751 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
752 			"vop_access_end:");
753 		if (error) {
754 			/*
755 			 * Exec is the same as read over the net because
756 			 * of demand loading.
757 			 */
758 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
759 				"vop_access_start:");
760 			error = VOP_ACCESS(vp, VEXEC, 0, cr);
761 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
762 				"vop_access_end:");
763 		}
764 		if (error) {
765 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
766 				"vop_rwunlock_start:");
767 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
768 			if (in_crit)
769 				nbl_end_crit(vp);
770 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
771 				"vop_rwunlock_end:");
772 			VN_RELE(vp);
773 			rr->rr_data = NULL;
774 			rr->rr_status = puterrno(error);
775 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
776 				"rfs_read_end:(%S)", "access error");
777 			return;
778 		}
779 	}
780 
781 	if (MANDLOCK(vp, va.va_mode)) {
782 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
783 			"vop_rwunlock_start:");
784 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
785 		if (in_crit)
786 			nbl_end_crit(vp);
787 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
788 			"vop_rwunlock_end:");
789 		VN_RELE(vp);
790 		rr->rr_data = NULL;
791 		rr->rr_status = NFSERR_ACCES;
792 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
793 			"rfs_read_end:(%S)", "mand lock");
794 		return;
795 	}
796 
797 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
798 		rr->rr_count = 0;
799 		rr->rr_data = NULL;
800 		/*
801 		 * In this case, status is NFS_OK, but there is no data
802 		 * to encode. So set rr_mp to NULL.
803 		 */
804 		rr->rr_mp = NULL;
805 		goto done;
806 	}
807 
808 	/*
809 	 * mp will contain the data to be sent out in the read reply.
810 	 * This will be freed after the reply has been sent out (by the
811 	 * driver).
812 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
813 	 * that the call to xdrmblk_putmblk() never fails.
814 	 */
815 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
816 	    &alloc_err);
817 	ASSERT(mp != NULL);
818 	ASSERT(alloc_err == 0);
819 
820 	rr->rr_mp = mp;
821 
822 	/*
823 	 * Set up io vector
824 	 */
825 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
826 	iov.iov_len = ra->ra_count;
827 	uio.uio_iov = &iov;
828 	uio.uio_iovcnt = 1;
829 	uio.uio_segflg = UIO_SYSSPACE;
830 	uio.uio_extflg = UIO_COPY_CACHED;
831 	uio.uio_loffset = (offset_t)ra->ra_offset;
832 	uio.uio_resid = ra->ra_count;
833 
834 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START,
835 		"vop_read_start:");
836 	error = VOP_READ(vp, &uio, 0, cr, NULL);
837 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END,
838 		"vop_read_end:");
839 
840 	if (error) {
841 		freeb(mp);
842 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
843 			"vop_rwunlock_start:");
844 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
845 		if (in_crit)
846 			nbl_end_crit(vp);
847 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
848 			"vop_rwunlock_end:");
849 		VN_RELE(vp);
850 		rr->rr_data = NULL;
851 		rr->rr_status = puterrno(error);
852 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
853 			"rfs_read_end:(%S)", "read error");
854 		return;
855 	}
856 
857 	/*
858 	 * Get attributes again so we can send the latest access
859 	 * time to the client side for his cache.
860 	 */
861 	va.va_mask = AT_ALL;
862 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
863 		"vop_getattr_start:");
864 	error = VOP_GETATTR(vp, &va, 0, cr);
865 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
866 		"vop_getattr_end:");
867 	if (error) {
868 		freeb(mp);
869 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
870 			"vop_rwunlock_start:");
871 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
872 		if (in_crit)
873 			nbl_end_crit(vp);
874 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
875 			"vop_rwunlock_end:");
876 		VN_RELE(vp);
877 		rr->rr_data = NULL;
878 		rr->rr_status = puterrno(error);
879 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
880 			"rfs_read_end:(%S)", "read error");
881 		return;
882 	}
883 
884 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
885 
886 	rr->rr_data = (char *)mp->b_datap->db_base;
887 
888 done:
889 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
890 		"vop_rwunlock_start:");
891 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
892 	if (in_crit)
893 		nbl_end_crit(vp);
894 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
895 		"vop_rwunlock_end:");
896 
897 	acl_perm(vp, exi, &va, cr);
898 
899 	/* check for overflows */
900 	error = vattr_to_nattr(&va, &rr->rr_attr);
901 
902 #if 0 /* notyet */
903 	/*
904 	 * Don't do this.  It causes local disk writes when just
905 	 * reading the file and the overhead is deemed larger
906 	 * than the benefit.
907 	 */
908 	/*
909 	 * Force modified metadata out to stable storage.
910 	 */
911 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
912 #endif
913 
914 	VN_RELE(vp);
915 
916 	rr->rr_status = puterrno(error);
917 
918 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
919 		"rfs_read_end:(%S)", "done");
920 }
921 
922 /*
923  * Free data allocated by rfs_read
924  */
925 void
926 rfs_rdfree(struct nfsrdresult *rr)
927 {
928 	mblk_t *mp;
929 
930 	if (rr->rr_status == NFS_OK) {
931 		mp = rr->rr_mp;
932 		if (mp != NULL)
933 			freeb(mp);
934 	}
935 }
936 
937 void *
938 rfs_read_getfh(struct nfsreadargs *ra)
939 {
940 	return (&ra->ra_fhandle);
941 }
942 
/*
 * Size of the on-stack iovec array used by rfs_write_sync.  A request
 * whose mblk chain needs more entries falls back to kmem_alloc.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * Counts of rfs_write_sync requests that fit the on-stack array
 * (hits) and of those that had to allocate (misses).  File-scope
 * statics start out as zero.
 */
static int rfs_write_sync_hits;
static int rfs_write_sync_misses;
#endif
949 
950 /*
951  * Write data to file.
952  * Returns attributes of a file after writing some data to it.
953  *
954  * Any changes made here, especially in error handling might have
955  * to also be done in rfs_write (which clusters write requests).
956  */
957 void
958 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
959 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
960 {
961 	int error;
962 	vnode_t *vp;
963 	rlim64_t rlimit;
964 	struct vattr va;
965 	struct uio uio;
966 	struct iovec iov[MAX_IOVECS];
967 	mblk_t *m;
968 	struct iovec *iovp;
969 	int iovcnt;
970 	cred_t *savecred;
971 	int in_crit = 0;
972 
973 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
974 		"rfs_write_start:(%S)", "sync");
975 
976 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
977 	if (vp == NULL) {
978 		ns->ns_status = NFSERR_STALE;
979 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
980 			"rfs_write_end:(%S)", "stale");
981 		return;
982 	}
983 
984 	if (rdonly(exi, req)) {
985 		VN_RELE(vp);
986 		ns->ns_status = NFSERR_ROFS;
987 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
988 			"rfs_write_end:(%S)", "rofs");
989 		return;
990 	}
991 
992 	if (vp->v_type != VREG) {
993 		VN_RELE(vp);
994 		ns->ns_status = NFSERR_ISDIR;
995 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
996 			"rfs_write_end:(%S)", "isdir");
997 		return;
998 	}
999 
1000 	/*
1001 	 * Check to see if the v4 side of the server has delegated
1002 	 * this file.  If so, then we mark thread as wouldblock so
1003 	 * the response is dropped.
1004 	 */
1005 	if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1006 		VN_RELE(vp);
1007 		curthread->t_flag |= T_WOULDBLOCK;
1008 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1009 			"rfs_write_end:(%S)", "delegated");
1010 		return;
1011 	}
1012 
1013 	va.va_mask = AT_UID|AT_MODE;
1014 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1015 		"vop_getattr_start:");
1016 	error = VOP_GETATTR(vp, &va, 0, cr);
1017 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1018 		"vop_getattr_end:");
1019 
1020 	if (error) {
1021 		VN_RELE(vp);
1022 		ns->ns_status = puterrno(error);
1023 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1024 			"rfs_write_end:(%S)", "getattr error");
1025 		return;
1026 	}
1027 
1028 	if (crgetuid(cr) != va.va_uid) {
1029 		/*
1030 		 * This is a kludge to allow writes of files created
1031 		 * with read only permission.  The owner of the file
1032 		 * is always allowed to write it.
1033 		 */
1034 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1035 			"vop_access_start:");
1036 		error = VOP_ACCESS(vp, VWRITE, 0, cr);
1037 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1038 			"vop_access_end:");
1039 		if (error) {
1040 			VN_RELE(vp);
1041 			ns->ns_status = puterrno(error);
1042 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1043 				"rfs_write_end:(%S)", "access error");
1044 			return;
1045 		}
1046 	}
1047 
1048 	/*
1049 	 * Can't access a mandatory lock file.  This might cause
1050 	 * the NFS service thread to block forever waiting for a
1051 	 * lock to be released that will never be released.
1052 	 */
1053 	if (MANDLOCK(vp, va.va_mode)) {
1054 		VN_RELE(vp);
1055 		ns->ns_status = NFSERR_ACCES;
1056 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1057 			"rfs_write_end:(%S)", "mand lock");
1058 		return;
1059 	}
1060 
1061 	/*
1062 	 * We have to enter the critical region before calling VOP_RWLOCK
1063 	 * to avoid a deadlock with ufs.
1064 	 */
1065 	if (nbl_need_check(vp)) {
1066 		nbl_start_crit(vp, RW_READER);
1067 		in_crit = 1;
1068 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1069 		    wa->wa_count, 0)) {
1070 			error = EACCES;
1071 			goto out;
1072 		}
1073 	}
1074 
1075 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1076 		"vop_rwlock_start:");
1077 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1078 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1079 		"vop_rwlock_end:");
1080 
1081 	if (wa->wa_data) {
1082 		iov[0].iov_base = wa->wa_data;
1083 		iov[0].iov_len = wa->wa_count;
1084 		uio.uio_iov = iov;
1085 		uio.uio_iovcnt = 1;
1086 		uio.uio_segflg = UIO_SYSSPACE;
1087 		uio.uio_extflg = UIO_COPY_DEFAULT;
1088 		uio.uio_loffset = (offset_t)wa->wa_offset;
1089 		uio.uio_resid = wa->wa_count;
1090 		/*
1091 		 * The limit is checked on the client. We
1092 		 * should allow any size writes here.
1093 		 */
1094 		uio.uio_llimit = curproc->p_fsz_ctl;
1095 		rlimit = uio.uio_llimit - wa->wa_offset;
1096 		if (rlimit < (rlim64_t)uio.uio_resid)
1097 			uio.uio_resid = (uint_t)rlimit;
1098 
1099 		/*
1100 		 * for now we assume no append mode
1101 		 */
1102 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1103 			"vop_write_start:(%S)", "sync");
1104 		/*
1105 		 * We're changing creds because VM may fault and we need
1106 		 * the cred of the current thread to be used if quota
1107 		 * checking is enabled.
1108 		 */
1109 		savecred = curthread->t_cred;
1110 		curthread->t_cred = cr;
1111 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1112 		curthread->t_cred = savecred;
1113 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1114 			"vop_write_end:");
1115 	} else {
1116 		iovcnt = 0;
1117 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1118 			iovcnt++;
1119 		if (iovcnt <= MAX_IOVECS) {
1120 #ifdef DEBUG
1121 			rfs_write_sync_hits++;
1122 #endif
1123 			iovp = iov;
1124 		} else {
1125 #ifdef DEBUG
1126 			rfs_write_sync_misses++;
1127 #endif
1128 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1129 		}
1130 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1131 		uio.uio_iov = iovp;
1132 		uio.uio_iovcnt = iovcnt;
1133 		uio.uio_segflg = UIO_SYSSPACE;
1134 		uio.uio_extflg = UIO_COPY_DEFAULT;
1135 		uio.uio_loffset = (offset_t)wa->wa_offset;
1136 		uio.uio_resid = wa->wa_count;
1137 		/*
1138 		 * The limit is checked on the client. We
1139 		 * should allow any size writes here.
1140 		 */
1141 		uio.uio_llimit = curproc->p_fsz_ctl;
1142 		rlimit = uio.uio_llimit - wa->wa_offset;
1143 		if (rlimit < (rlim64_t)uio.uio_resid)
1144 			uio.uio_resid = (uint_t)rlimit;
1145 
1146 		/*
1147 		 * For now we assume no append mode.
1148 		 */
1149 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1150 			"vop_write_start:(%S)", "iov sync");
1151 		/*
1152 		 * We're changing creds because VM may fault and we need
1153 		 * the cred of the current thread to be used if quota
1154 		 * checking is enabled.
1155 		 */
1156 		savecred = curthread->t_cred;
1157 		curthread->t_cred = cr;
1158 		error = VOP_WRITE(vp, &uio, FSYNC, cr, NULL);
1159 		curthread->t_cred = savecred;
1160 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1161 			"vop_write_end:");
1162 
1163 		if (iovp != iov)
1164 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1165 	}
1166 
1167 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1168 		"vop_rwunlock_start:");
1169 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1170 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1171 		"vop_rwunlock_end:");
1172 
1173 	if (!error) {
1174 		/*
1175 		 * Get attributes again so we send the latest mod
1176 		 * time to the client side for his cache.
1177 		 */
1178 		va.va_mask = AT_ALL;	/* now we want everything */
1179 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1180 			"vop_getattr_start:");
1181 		error = VOP_GETATTR(vp, &va, 0, cr);
1182 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1183 			"vop_getattr_end:");
1184 		/* check for overflows */
1185 		if (!error) {
1186 			acl_perm(vp, exi, &va, cr);
1187 			error = vattr_to_nattr(&va, &ns->ns_attr);
1188 		}
1189 	}
1190 
1191 out:
1192 	if (in_crit)
1193 		nbl_end_crit(vp);
1194 	VN_RELE(vp);
1195 
1196 	ns->ns_status = puterrno(error);
1197 
1198 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1199 		"rfs_write_end:(%S)", "sync");
1200 }
1201 
/*
 * One pending NFSv2 WRITE request that is part of a write cluster.
 * rfs_write() keeps this on the requesting thread's stack and links it
 * into a cluster's request list in ascending wa_offset order.
 */
1202 struct rfs_async_write {
1203 	struct nfswriteargs *wa;	/* arguments of this WRITE request */
1204 	struct nfsattrstat *ns;		/* reply (status + attrs) to fill in */
1205 	struct svc_req *req;		/* RPC request, for the rdonly() check */
1206 	cred_t *cr;			/* credentials of the requester */
1207 	kthread_t *thread;		/* thread that issued the request */
1208 	struct rfs_async_write *list;	/* next request in the same cluster */
1209 };
1210 
/*
 * A write cluster: every queued WRITE request for one file handle.
 * Clusters are chained from rfs_async_write_head; rfs_write() takes
 * rfs_async_write_lock around its accesses to that chain.
 */
1211 struct rfs_async_write_list {
1212 	fhandle_t *fhp;			/* file handle shared by the cluster */
1213 	kcondvar_t cv;			/* queued requests sleep here */
1214 	struct rfs_async_write *list;	/* requests, sorted by wa_offset */
1215 	struct rfs_async_write_list *next;	/* next cluster on the chain */
1216 };
1217 
1218 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1219 static kmutex_t rfs_async_write_lock;
1220 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1221 
1222 #define	MAXCLIOVECS	42
1223 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1224 
1225 #ifdef DEBUG
1226 static int rfs_write_hits = 0;
1227 static int rfs_write_misses = 0;
1228 #endif
1229 
1230 /*
1231  * Write data to file.
1232  * Returns attributes of a file after writing some data to it.
1233  */
/*
 * When rfs_write_async is zero the request is handed straight to
 * rfs_write_sync().  Otherwise requests are clustered per file handle:
 *
 *   - If a cluster for wa->wa_fhandle is already on the
 *     rfs_async_write_head chain, this request is linked into it in
 *     ascending wa_offset order and the thread sleeps on the cluster's
 *     condition variable until ns->ns_status is no longer
 *     RFSWRITE_INITVAL.
 *   - Otherwise this thread creates a new cluster, takes the file's
 *     write lock, and then does the permission checks and the writes
 *     for every request that joined the cluster in the meantime.  It
 *     fills in each request's reply and finally cv_broadcast()s to
 *     release the sleeping threads.
 *
 * wa  - write arguments (file handle, offset, count, data)
 * ns  - reply: status and, on success, the post-write attributes
 * exi - export that the file handle is resolved against
 * req - RPC request, used for the read-only export check
 * cr  - caller's credentials
 */
1234 void
1235 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1236 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1237 {
1238 	int error;
1239 	vnode_t *vp;
1240 	rlim64_t rlimit;
1241 	struct vattr va;
1242 	struct uio uio;
1243 	struct rfs_async_write_list *lp;
1244 	struct rfs_async_write_list *nlp;
1245 	struct rfs_async_write *rp;
1246 	struct rfs_async_write *nrp;
1247 	struct rfs_async_write *trp;
1248 	struct rfs_async_write *lrp;
1249 	int data_written;
1250 	int iovcnt;
1251 	mblk_t *m;
1252 	struct iovec *iovp;
1253 	struct iovec *niovp;
1254 	struct iovec iov[MAXCLIOVECS];
1255 	int count;
1256 	int rcount;
1257 	uint_t off;
1258 	uint_t len;
1259 	struct rfs_async_write nrpsp;
1260 	struct rfs_async_write_list nlpsp;
1261 	ushort_t t_flag;
1262 	cred_t *savecred;
1263 	int in_crit = 0;
1264 
1265 	if (!rfs_write_async) {
1266 		rfs_write_sync(wa, ns, exi, req, cr);
1267 		return;
1268 	}
1269 
1270 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1271 		"rfs_write_start:(%S)", "async");
1272 
1273 	/*
1274 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1275 	 * is considered an OK.
1276 	 */
1277 	ns->ns_status = RFSWRITE_INITVAL;
1278 
	/*
	 * The request record lives on this thread's stack (nrpsp).  This
	 * thread either processes the cluster itself or sleeps below until
	 * its status has been filled in.
	 */
1279 	nrp = &nrpsp;
1280 	nrp->wa = wa;
1281 	nrp->ns = ns;
1282 	nrp->req = req;
1283 	nrp->cr = cr;
1284 	nrp->thread = curthread;
1285 
1286 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1287 
1288 	/*
1289 	 * Look to see if there is already a cluster started
1290 	 * for this file.
1291 	 */
1292 	mutex_enter(&rfs_async_write_lock);
1293 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1294 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1295 		    sizeof (fhandle_t)) == 0)
1296 			break;
1297 	}
1298 
1299 	/*
1300 	 * If lp is non-NULL, then there is already a cluster
1301 	 * started.  We need to place ourselves in the cluster
1302 	 * list in the right place as determined by starting
1303 	 * offset.  Conflicts with non-blocking mandatory locked
1304 	 * regions will be checked when the cluster is processed.
1305 	 */
1306 	if (lp != NULL) {
1307 		rp = lp->list;
1308 		trp = NULL;
		/* trp trails rp; the new request goes between trp and rp. */
1309 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1310 			trp = rp;
1311 			rp = rp->list;
1312 		}
1313 		nrp->list = rp;
1314 		if (trp == NULL)
1315 			lp->list = nrp;
1316 		else
1317 			trp->list = nrp;
		/*
		 * Sleep until the thread that owns the cluster has stored
		 * a real status in our reply and broadcast on the cv.
		 */
1318 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1319 			cv_wait(&lp->cv, &rfs_async_write_lock);
1320 		mutex_exit(&rfs_async_write_lock);
1321 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1322 			"rfs_write_end:(%S)", "cluster child");
1323 		return;
1324 	}
1325 
1326 	/*
1327 	 * No cluster started yet, start one and add ourselves
1328 	 * to the list of clusters.
1329 	 */
1330 	nrp->list = NULL;
1331 
	/* The cluster header is on this thread's stack as well (nlpsp). */
1332 	nlp = &nlpsp;
1333 	nlp->fhp = &wa->wa_fhandle;
1334 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1335 	nlp->list = nrp;
1336 	nlp->next = NULL;
1337 
	/* Append the new cluster to the tail of the chain. */
1338 	if (rfs_async_write_head == NULL) {
1339 		rfs_async_write_head = nlp;
1340 	} else {
1341 		lp = rfs_async_write_head;
1342 		while (lp->next != NULL)
1343 			lp = lp->next;
1344 		lp->next = nlp;
1345 	}
1346 	mutex_exit(&rfs_async_write_lock);
1347 
1348 	/*
1349 	 * Convert the file handle common to all of the requests
1350 	 * in this cluster to a vnode.
1351 	 */
1352 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1353 	if (vp == NULL) {
		/*
		 * Unlink the cluster, then fail every request that has
		 * joined it so far with NFSERR_STALE and wake the sleepers.
		 */
1354 		mutex_enter(&rfs_async_write_lock);
1355 		if (rfs_async_write_head == nlp)
1356 			rfs_async_write_head = nlp->next;
1357 		else {
1358 			lp = rfs_async_write_head;
1359 			while (lp->next != nlp)
1360 				lp = lp->next;
1361 			lp->next = nlp->next;
1362 		}
		/* Copy this thread's T_WOULDBLOCK bit to every requester. */
1363 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1364 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1365 			rp->ns->ns_status = NFSERR_STALE;
1366 			rp->thread->t_flag |= t_flag;
1367 		}
1368 		cv_broadcast(&nlp->cv);
1369 		mutex_exit(&rfs_async_write_lock);
1370 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1371 			"rfs_write_end:(%S)", "stale");
1372 		return;
1373 	}
1374 
1375 	/*
1376 	 * Can only write regular files.  Attempts to write any
1377 	 * other file types fail with EISDIR.
1378 	 */
1379 	if (vp->v_type != VREG) {
1380 		VN_RELE(vp);
1381 		mutex_enter(&rfs_async_write_lock);
1382 		if (rfs_async_write_head == nlp)
1383 			rfs_async_write_head = nlp->next;
1384 		else {
1385 			lp = rfs_async_write_head;
1386 			while (lp->next != nlp)
1387 				lp = lp->next;
1388 			lp->next = nlp->next;
1389 		}
1390 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1391 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1392 			rp->ns->ns_status = NFSERR_ISDIR;
1393 			rp->thread->t_flag |= t_flag;
1394 		}
1395 		cv_broadcast(&nlp->cv);
1396 		mutex_exit(&rfs_async_write_lock);
1397 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1398 			"rfs_write_end:(%S)", "isdir");
1399 		return;
1400 	}
1401 
1402 	/*
1403 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1404 	 * deadlock with ufs.
1405 	 */
1406 	if (nbl_need_check(vp)) {
1407 		nbl_start_crit(vp, RW_READER);
1408 		in_crit = 1;
1409 	}
1410 
1411 	/*
1412 	 * Lock the file for writing.  This operation provides
1413 	 * the delay which allows clusters to grow.
1414 	 */
1415 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
1416 		"vop_wrlock_start:");
1417 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1418 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
1419 		"vop_wrlock_end");
1420 
1421 	/*
1422 	 * Disconnect this cluster from the list of clusters.
1423 	 * The cluster that is being dealt with must be fixed
1424 	 * in size after this point, so there is no reason
1425 	 * to leave it on the list so that new requests can
1426 	 * find it.
1427 	 *
1428 	 * The algorithm is that the first write request will
1429 	 * create a cluster, convert the file handle to a
1430 	 * vnode pointer, and then lock the file for writing.
1431 	 * This request is not likely to be clustered with
1432 	 * any others.  However, the next request will create
1433 	 * a new cluster and be blocked in VOP_RWLOCK while
1434 	 * the first request is being processed.  This delay
1435 	 * will allow more requests to be clustered in this
1436 	 * second cluster.
1437 	 */
1438 	mutex_enter(&rfs_async_write_lock);
1439 	if (rfs_async_write_head == nlp)
1440 		rfs_async_write_head = nlp->next;
1441 	else {
1442 		lp = rfs_async_write_head;
1443 		while (lp->next != nlp)
1444 			lp = lp->next;
1445 		lp->next = nlp->next;
1446 	}
1447 	mutex_exit(&rfs_async_write_lock);
1448 
1449 	/*
1450 	 * Step through the list of requests in this cluster.
1451 	 * We need to check permissions to make sure that all
1452 	 * of the requests have sufficient permission to write
1453 	 * the file.  A cluster can be composed of requests
1454 	 * from different clients and different users on each
1455 	 * client.
1456 	 *
1457 	 * As a side effect, we also calculate the size of the
1458 	 * byte range that this cluster encompasses.
1459 	 */
	/*
	 * Each "continue" below jumps to the do-while condition, so it
	 * still advances rp to the next request.  A request that fails a
	 * check gets its final status here and is skipped by the write
	 * pass further down.
	 */
1460 	rp = nlp->list;
1461 	off = rp->wa->wa_offset;
1462 	len = (uint_t)0;
1463 	do {
1464 		if (rdonly(exi, rp->req)) {
1465 			rp->ns->ns_status = NFSERR_ROFS;
1466 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1467 			rp->thread->t_flag |= t_flag;
1468 			continue;
1469 		}
1470 
1471 		va.va_mask = AT_UID|AT_MODE;
1472 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1473 			"vop_getattr_start:");
1474 		error = VOP_GETATTR(vp, &va, 0, rp->cr);
1475 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1476 			"vop_getattr_end:");
1477 		if (!error) {
1478 			if (crgetuid(rp->cr) != va.va_uid) {
1479 				/*
1480 				 * This is a kludge to allow writes of files
1481 				 * created with read only permission.  The
1482 				 * owner of the file is always allowed to
1483 				 * write it.
1484 				 */
1485 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1486 					"vop_access_start:");
1487 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr);
1488 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1489 					"vop_access_end:");
1490 			}
1491 			if (!error && MANDLOCK(vp, va.va_mode))
1492 				error = EACCES;
1493 		}
1494 
1495 		/*
1496 		 * Check for a conflict with a nbmand-locked region.
1497 		 */
1498 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1499 		    rp->wa->wa_count, 0)) {
1500 			error = EACCES;
1501 		}
1502 
1503 		if (error) {
1504 			rp->ns->ns_status = puterrno(error);
1505 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1506 			rp->thread->t_flag |= t_flag;
1507 			continue;
1508 		}
		/* Grow len so [off, off + len) covers this request too. */
1509 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1510 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1511 	} while ((rp = rp->list) != NULL);
1512 
1513 	/*
1514 	 * Step through the cluster attempting to gather as many
1515 	 * requests which are contiguous as possible.  These
1516 	 * contiguous requests are handled via one call to VOP_WRITE
1517 	 * instead of different calls to VOP_WRITE.  We also keep
1518 	 * track of the fact that any data was written.
1519 	 */
1520 	rp = nlp->list;
1521 	data_written = 0;
1522 	do {
1523 		/*
1524 		 * Skip any requests which are already marked as having an
1525 		 * error.
1526 		 */
1527 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1528 			rp = rp->list;
1529 			continue;
1530 		}
1531 
1532 		/*
1533 		 * Count the number of iovec's which are required
1534 		 * to handle this set of requests.  One iovec is
1535 		 * needed for each data buffer, whether addressed
1536 		 * by wa_data or by the b_rptr pointers in the
1537 		 * mblk chains.
1538 		 */
		/*
		 * On exit from this loop lrp is the first request that is
		 * NOT part of the contiguous run starting at rp (or NULL).
		 */
1539 		iovcnt = 0;
1540 		lrp = rp;
1541 		for (;;) {
1542 			if (lrp->wa->wa_data)
1543 				iovcnt++;
1544 			else {
1545 				m = lrp->wa->wa_mblk;
1546 				while (m != NULL) {
1547 					iovcnt++;
1548 					m = m->b_cont;
1549 				}
1550 			}
1551 			if (lrp->list == NULL ||
1552 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1553 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1554 			    lrp->list->wa->wa_offset) {
1555 				lrp = lrp->list;
1556 				break;
1557 			}
1558 			lrp = lrp->list;
1559 		}
1560 
		/* Use the on-stack iovec array when the run fits in it. */
1561 		if (iovcnt <= MAXCLIOVECS) {
1562 #ifdef DEBUG
1563 			rfs_write_hits++;
1564 #endif
1565 			niovp = iov;
1566 		} else {
1567 #ifdef DEBUG
1568 			rfs_write_misses++;
1569 #endif
1570 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1571 		}
1572 		/*
1573 		 * Put together the scatter/gather iovecs.
1574 		 */
		/*
		 * NOTE(review): for mblk-backed requests the fill loop stops
		 * once wa_count bytes are covered, so fewer than iovcnt
		 * entries may be initialized while uio_iovcnt is still set
		 * to iovcnt below.  Presumably uio_resid keeps the consumer
		 * from reaching the unused entries - confirm.
		 */
1575 		iovp = niovp;
1576 		trp = rp;
1577 		count = 0;
1578 		do {
1579 			if (trp->wa->wa_data) {
1580 				iovp->iov_base = trp->wa->wa_data;
1581 				iovp->iov_len = trp->wa->wa_count;
1582 				iovp++;
1583 			} else {
1584 				m = trp->wa->wa_mblk;
1585 				rcount = trp->wa->wa_count;
1586 				while (m != NULL) {
1587 					iovp->iov_base = (caddr_t)m->b_rptr;
1588 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1589 					rcount -= iovp->iov_len;
					/* Trim the last mblk to wa_count. */
1590 					if (rcount < 0)
1591 						iovp->iov_len += rcount;
1592 					iovp++;
1593 					if (rcount <= 0)
1594 						break;
1595 					m = m->b_cont;
1596 				}
1597 			}
1598 			count += trp->wa->wa_count;
1599 			trp = trp->list;
1600 		} while (trp != lrp);
1601 
1602 		uio.uio_iov = niovp;
1603 		uio.uio_iovcnt = iovcnt;
1604 		uio.uio_segflg = UIO_SYSSPACE;
1605 		uio.uio_extflg = UIO_COPY_DEFAULT;
1606 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1607 		uio.uio_resid = count;
1608 		/*
1609 		 * The limit is checked on the client. We
1610 		 * should allow any size writes here.
1611 		 */
1612 		uio.uio_llimit = curproc->p_fsz_ctl;
1613 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1614 		if (rlimit < (rlim64_t)uio.uio_resid)
1615 			uio.uio_resid = (uint_t)rlimit;
1616 
1617 		/*
1618 		 * For now we assume no append mode.
1619 		 */
1620 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1621 			"vop_write_start:(%S)", "async");
1622 
1623 		/*
1624 		 * Check to see if the v4 side of the server has
1625 		 * delegated this file.  If so, then we mark thread
1626 		 * as wouldblock so the response is dropped.
1627 		 */
		/*
		 * NOTE(review): the trace below uses TR_RFS_READ_END although
		 * its message says "rfs_write_end"; this looks like a
		 * copy/paste from the read path - confirm.
		 */
1628 		if (rfs4_check_delegated(FWRITE, vp, FALSE)) {
1629 			curthread->t_flag |= T_WOULDBLOCK;
1630 			error = EACCES; /* just to have an error */
1631 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1632 				"rfs_write_end:(%S)", "delegated");
1633 		} else {
1634 			/*
1635 			 * We're changing creds because VM may fault
1636 			 * and we need the cred of the current
1637 			 * thread to be used if quota * checking is
1638 			 * enabled.
1639 			 */
			/*
			 * NOTE(review): t_cred is set to cr (the credential
			 * of the thread running the cluster) while VOP_WRITE
			 * is passed rp->cr (the credential of the first
			 * request in this run) - confirm this is intended.
			 */
1640 			savecred = curthread->t_cred;
1641 			curthread->t_cred = cr;
1642 			error = VOP_WRITE(vp, &uio, 0, rp->cr, NULL);
1643 			curthread->t_cred = savecred;
1644 			TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END,
1645 				"vop_write_end:");
1646 		}
1647 
1648 		if (niovp != iov)
1649 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1650 
1651 		if (!error) {
1652 			data_written = 1;
1653 			/*
1654 			 * Get attributes again so we send the latest mod
1655 			 * time to the client side for his cache.
1656 			 */
1657 			va.va_mask = AT_ALL;	/* now we want everything */
1658 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1659 				"vop_getattr_start:");
1660 			error = VOP_GETATTR(vp, &va, 0, rp->cr);
1661 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1662 				"vop_getattr_end:");
1663 			if (!error)
1664 				acl_perm(vp, exi, &va, rp->cr);
1665 		}
1666 
1667 		/*
1668 		 * Fill in the status responses for each request
1669 		 * which was just handled.  Also, copy the latest
1670 		 * attributes in to the attribute responses if
1671 		 * appropriate.
1672 		 */
		/*
		 * This loop advances rp to lrp, i.e. to the start of the
		 * next run.  Once vattr_to_nattr() fails for one request,
		 * error stays set, so the remaining requests of the run get
		 * that error as well.
		 */
1673 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1674 		do {
1675 			rp->thread->t_flag |= t_flag;
1676 			/* check for overflows */
1677 			if (!error) {
1678 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1679 			}
1680 			rp->ns->ns_status = puterrno(error);
1681 			rp = rp->list;
1682 		} while (rp != lrp);
1683 	} while (rp != NULL);
1684 
1685 	/*
1686 	 * If any data was written at all, then we need to flush
1687 	 * the data and metadata to stable storage.
1688 	 */
	/*
	 * NOTE(review): an error returned by VOP_PUTPAGE/VOP_FSYNC here is
	 * only stored (by the final loop below) in requests whose status is
	 * still RFSWRITE_INITVAL; requests that went through the write loop
	 * above already carry a status and keep it - confirm.
	 */
1689 	if (data_written) {
1690 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START,
1691 			"vop_putpage_start:");
1692 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr);
1693 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END,
1694 			"vop_putpage_end:");
1695 		if (!error) {
1696 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1697 				"vop_fsync_start:");
1698 			error = VOP_FSYNC(vp, FNODSYNC, cr);
1699 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END,
1700 				"vop_fsync_end:");
1701 		}
1702 	}
1703 
1704 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
1705 		"vop_rwunlock_start:");
1706 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1707 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
1708 		"vop_rwunlock_end:");
1709 
1710 	if (in_crit)
1711 		nbl_end_crit(vp);
1712 	VN_RELE(vp);
1713 
	/*
	 * Give any request that still has no status the last error value,
	 * then wake every thread sleeping on this cluster.
	 */
1714 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1715 	mutex_enter(&rfs_async_write_lock);
1716 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1717 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1718 			rp->ns->ns_status = puterrno(error);
1719 			rp->thread->t_flag |= t_flag;
1720 		}
1721 	}
1722 	cv_broadcast(&nlp->cv);
1723 	mutex_exit(&rfs_async_write_lock);
1724 
1725 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1726 		"rfs_write_end:(%S)", "async");
1727 }
1728 
1729 void *
1730 rfs_write_getfh(struct nfswriteargs *wa)
1731 {
1732 	return (&wa->wa_fhandle);
1733 }
1734 
1735 /*
1736  * Create a file.
1737  * Creates a file with given attributes and returns those attributes
1738  * and an fhandle for the new file.
1739  */
1740 void
1741 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1742 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1743 {
1744 	int error;
1745 	int lookuperr;
1746 	int in_crit = 0;
1747 	struct vattr va;
1748 	vnode_t *vp;
1749 	vnode_t *dvp;
1750 	char *name = args->ca_da.da_name;
1751 	vnode_t *tvp = NULL;
1752 	int mode;
1753 	int lookup_ok;
1754 	bool_t trunc;
1755 
1756 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START,
1757 		"rfs_create_start:");
1758 
1759 	/*
1760 	 * Disallow NULL paths
1761 	 */
1762 	if (name == NULL || *name == '\0') {
1763 		dr->dr_status = NFSERR_ACCES;
1764 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1765 			"rfs_create_end:(%S)", "access");
1766 		return;
1767 	}
1768 
1769 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1770 	if (dvp == NULL) {
1771 		dr->dr_status = NFSERR_STALE;
1772 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1773 			"rfs_create_end:(%S)", "stale");
1774 		return;
1775 	}
1776 
1777 	error = sattr_to_vattr(args->ca_sa, &va);
1778 	if (error) {
1779 		dr->dr_status = puterrno(error);
1780 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1781 			"rfs_create_end:(%S)", "sattr");
1782 		return;
1783 	}
1784 
1785 	/*
1786 	 * Must specify the mode.
1787 	 */
1788 	if (!(va.va_mask & AT_MODE)) {
1789 		VN_RELE(dvp);
1790 		dr->dr_status = NFSERR_INVAL;
1791 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1792 			"rfs_create_end:(%S)", "no mode");
1793 		return;
1794 	}
1795 
1796 	/*
1797 	 * This is a completely gross hack to make mknod
1798 	 * work over the wire until we can wack the protocol
1799 	 */
1800 	if ((va.va_mode & IFMT) == IFCHR) {
1801 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1802 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1803 		else {
1804 			va.va_type = VCHR;
1805 			/*
1806 			 * uncompress the received dev_t
1807 			 * if the top half is zero indicating a request
1808 			 * from an `older style' OS.
1809 			 */
1810 			if ((va.va_size & 0xffff0000) == 0)
1811 				va.va_rdev = nfsv2_expdev(va.va_size);
1812 			else
1813 				va.va_rdev = (dev_t)va.va_size;
1814 		}
1815 		va.va_mask &= ~AT_SIZE;
1816 	} else if ((va.va_mode & IFMT) == IFBLK) {
1817 		va.va_type = VBLK;
1818 		/*
1819 		 * uncompress the received dev_t
1820 		 * if the top half is zero indicating a request
1821 		 * from an `older style' OS.
1822 		 */
1823 		if ((va.va_size & 0xffff0000) == 0)
1824 			va.va_rdev = nfsv2_expdev(va.va_size);
1825 		else
1826 			va.va_rdev = (dev_t)va.va_size;
1827 		va.va_mask &= ~AT_SIZE;
1828 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1829 		va.va_type = VSOCK;
1830 	} else
1831 		va.va_type = VREG;
1832 	va.va_mode &= ~IFMT;
1833 	va.va_mask |= AT_TYPE;
1834 
1835 	/*
1836 	 * Why was the choice made to use VWRITE as the mode to the
1837 	 * call to VOP_CREATE ? This results in a bug.  When a client
1838 	 * opens a file that already exists and is RDONLY, the second
1839 	 * open fails with an EACESS because of the mode.
1840 	 * bug ID 1054648.
1841 	 */
1842 	lookup_ok = 0;
1843 	mode = VWRITE;
1844 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1845 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
1846 			"vop_lookup_start:");
1847 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1848 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
1849 			"vop_lookup_end:");
1850 		if (!error) {
1851 			struct vattr at;
1852 
1853 			lookup_ok = 1;
1854 			at.va_mask = AT_MODE;
1855 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1856 				"vop_getattr_start:");
1857 			error = VOP_GETATTR(tvp, &at, 0, cr);
1858 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1859 				"vop_getattr_end:");
1860 			if (!error)
1861 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1862 			VN_RELE(tvp);
1863 			tvp = NULL;
1864 		}
1865 	}
1866 
1867 	if (!lookup_ok) {
1868 		if (rdonly(exi, req)) {
1869 			error = EROFS;
1870 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1871 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1872 			error = EPERM;
1873 		} else {
1874 			error = 0;
1875 		}
1876 	}
1877 
1878 	/*
1879 	 * If file size is being modified on an already existing file
1880 	 * make sure that there are no conflicting non-blocking mandatory
1881 	 * locks in the region being manipulated. Return EACCES if there
1882 	 * are conflicting locks.
1883 	 */
1884 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1885 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr);
1886 
1887 		if (!lookuperr &&
1888 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1889 			VN_RELE(tvp);
1890 			curthread->t_flag |= T_WOULDBLOCK;
1891 			goto out;
1892 		}
1893 
1894 		if (!lookuperr && nbl_need_check(tvp)) {
1895 			/*
1896 			 * The file exists. Now check if it has any
1897 			 * conflicting non-blocking mandatory locks
1898 			 * in the region being changed.
1899 			 */
1900 			struct vattr bva;
1901 			u_offset_t offset;
1902 			ssize_t length;
1903 
1904 			nbl_start_crit(tvp, RW_READER);
1905 			in_crit = 1;
1906 
1907 			bva.va_mask = AT_SIZE;
1908 			error = VOP_GETATTR(tvp, &bva, 0, cr);
1909 			if (!error) {
1910 				if (va.va_size < bva.va_size) {
1911 					offset = va.va_size;
1912 					length = bva.va_size - va.va_size;
1913 				} else {
1914 					offset = bva.va_size;
1915 					length = va.va_size - bva.va_size;
1916 				}
1917 				if (length) {
1918 					if (nbl_conflict(tvp, NBL_WRITE,
1919 						offset, length, 0)) {
1920 						error = EACCES;
1921 					}
1922 				}
1923 			}
1924 			if (error) {
1925 				nbl_end_crit(tvp);
1926 				VN_RELE(tvp);
1927 				in_crit = 0;
1928 			}
1929 		} else if (tvp != NULL) {
1930 			VN_RELE(tvp);
1931 		}
1932 	}
1933 
1934 	if (!error) {
1935 		/*
1936 		 * If filesystem is shared with nosuid the remove any
1937 		 * setuid/setgid bits on create.
1938 		 */
1939 		if (va.va_type == VREG &&
1940 		    exi->exi_export.ex_flags & EX_NOSUID)
1941 			va.va_mode &= ~(VSUID | VSGID);
1942 
1943 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START,
1944 			"vop_create_start:");
1945 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0);
1946 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END,
1947 			"vop_create_end:");
1948 
1949 		if (!error) {
1950 
1951 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1952 				trunc = TRUE;
1953 			else
1954 				trunc = FALSE;
1955 
1956 			if (rfs4_check_delegated(FWRITE, tvp, trunc)) {
1957 				VN_RELE(tvp);
1958 				curthread->t_flag |= T_WOULDBLOCK;
1959 				goto out;
1960 			}
1961 			va.va_mask = AT_ALL;
1962 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1963 				"vop_getattr_start:");
1964 			error = VOP_GETATTR(vp, &va, 0, cr);
1965 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1966 				"vop_getattr_end:");
1967 			/* check for overflows */
1968 			if (!error) {
1969 				acl_perm(vp, exi, &va, cr);
1970 				error = vattr_to_nattr(&va, &dr->dr_attr);
1971 				if (!error) {
1972 					error = makefh(&dr->dr_fhandle, vp,
1973 							exi);
1974 				}
1975 			}
1976 			/*
1977 			 * Force modified metadata out to stable storage.
1978 			 */
1979 			(void) VOP_FSYNC(vp, FNODSYNC, cr);
1980 			VN_RELE(vp);
1981 		}
1982 
1983 		if (in_crit) {
1984 			nbl_end_crit(tvp);
1985 			VN_RELE(tvp);
1986 		}
1987 	}
1988 
1989 	/*
1990 	 * Force modified data and metadata out to stable storage.
1991 	 */
1992 	(void) VOP_FSYNC(dvp, 0, cr);
1993 
1994 out:
1995 
1996 	VN_RELE(dvp);
1997 
1998 	dr->dr_status = puterrno(error);
1999 
2000 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
2001 		"rfs_create_end:(%S)", "done");
2002 }
2003 void *
2004 rfs_create_getfh(struct nfscreatargs *args)
2005 {
2006 	return (args->ca_da.da_fhandle);
2007 }
2008 
2009 /*
2010  * Remove a file.
2011  * Remove named file from parent directory.
2012  */
2013 void
2014 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2015 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2016 {
2017 	int error = 0;
2018 	vnode_t *vp;
2019 	vnode_t *targvp;
2020 	int in_crit = 0;
2021 
2022 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START,
2023 		"rfs_remove_start:");
2024 
2025 	/*
2026 	 * Disallow NULL paths
2027 	 */
2028 	if (da->da_name == NULL || *da->da_name == '\0') {
2029 		*status = NFSERR_ACCES;
2030 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2031 			"rfs_remove_end:(%S)", "access");
2032 		return;
2033 	}
2034 
2035 	vp = nfs_fhtovp(da->da_fhandle, exi);
2036 	if (vp == NULL) {
2037 		*status = NFSERR_STALE;
2038 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2039 			"rfs_remove_end:(%S)", "stale");
2040 		return;
2041 	}
2042 
2043 	if (rdonly(exi, req)) {
2044 		VN_RELE(vp);
2045 		*status = NFSERR_ROFS;
2046 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2047 			"rfs_remove_end:(%S)", "rofs");
2048 		return;
2049 	}
2050 
2051 	/*
2052 	 * Check for a conflict with a non-blocking mandatory share reservation.
2053 	 */
2054 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2055 			NULL, cr);
2056 	if (error != 0) {
2057 		VN_RELE(vp);
2058 		*status = puterrno(error);
2059 		return;
2060 	}
2061 
2062 	/*
2063 	 * If the file is delegated to an v4 client, then initiate
2064 	 * recall and drop this request (by setting T_WOULDBLOCK).
2065 	 * The client will eventually re-transmit the request and
2066 	 * (hopefully), by then, the v4 client will have returned
2067 	 * the delegation.
2068 	 */
2069 
2070 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2071 		VN_RELE(vp);
2072 		VN_RELE(targvp);
2073 		curthread->t_flag |= T_WOULDBLOCK;
2074 		return;
2075 	}
2076 
2077 	if (nbl_need_check(targvp)) {
2078 		nbl_start_crit(targvp, RW_READER);
2079 		in_crit = 1;
2080 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0)) {
2081 			error = EACCES;
2082 			goto out;
2083 		}
2084 	}
2085 
2086 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START,
2087 		"vop_remove_start:");
2088 	error = VOP_REMOVE(vp, da->da_name, cr);
2089 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END,
2090 		"vop_remove_end:");
2091 
2092 	/*
2093 	 * Force modified data and metadata out to stable storage.
2094 	 */
2095 	(void) VOP_FSYNC(vp, 0, cr);
2096 
2097 out:
2098 	if (in_crit)
2099 		nbl_end_crit(targvp);
2100 	VN_RELE(targvp);
2101 	VN_RELE(vp);
2102 
2103 	*status = puterrno(error);
2104 
2105 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2106 		"rfs_remove_end:(%S)", "done");
2107 }
2108 
2109 void *
2110 rfs_remove_getfh(struct nfsdiropargs *da)
2111 {
2112 	return (da->da_fhandle);
2113 }
2114 
2115 /*
2116  * rename a file
2117  * Give a file (from) a new name (to).
2118  */
2119 void
2120 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2121 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2122 {
2123 	int error = 0;
2124 	vnode_t *fromvp;
2125 	vnode_t *tovp;
2126 	struct exportinfo *to_exi;
2127 	fhandle_t *fh;
2128 	vnode_t *srcvp;
2129 	vnode_t *targvp;
2130 	int in_crit = 0;
2131 
2132 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START,
2133 		"rfs_rename_start:");
2134 
2135 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2136 	if (fromvp == NULL) {
2137 		*status = NFSERR_STALE;
2138 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2139 			"rfs_rename_end:(%S)", "from stale");
2140 		return;
2141 	}
2142 
2143 	fh = args->rna_to.da_fhandle;
2144 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2145 	if (to_exi == NULL) {
2146 		VN_RELE(fromvp);
2147 		*status = NFSERR_ACCES;
2148 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2149 			"rfs_rename_end:(%S)", "cross device");
2150 		return;
2151 	}
2152 	exi_rele(to_exi);
2153 
2154 	if (to_exi != exi) {
2155 		VN_RELE(fromvp);
2156 		*status = NFSERR_XDEV;
2157 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2158 			"rfs_rename_end:(%S)", "from stale");
2159 		return;
2160 	}
2161 
2162 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2163 	if (tovp == NULL) {
2164 		VN_RELE(fromvp);
2165 		*status = NFSERR_STALE;
2166 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2167 			"rfs_rename_end:(%S)", "to stale");
2168 		return;
2169 	}
2170 
2171 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2172 		VN_RELE(tovp);
2173 		VN_RELE(fromvp);
2174 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2175 			"rfs_rename_end:(%S)", "not dir");
2176 		*status = NFSERR_NOTDIR;
2177 		return;
2178 	}
2179 
2180 	/*
2181 	 * Disallow NULL paths
2182 	 */
2183 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2184 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2185 		VN_RELE(tovp);
2186 		VN_RELE(fromvp);
2187 		*status = NFSERR_ACCES;
2188 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2189 			"rfs_rename_end:(%S)", "access");
2190 		return;
2191 	}
2192 
2193 	if (rdonly(exi, req)) {
2194 		VN_RELE(tovp);
2195 		VN_RELE(fromvp);
2196 		*status = NFSERR_ROFS;
2197 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2198 			"rfs_rename_end:(%S)", "rofs");
2199 		return;
2200 	}
2201 
2202 	/*
2203 	 * Check for a conflict with a non-blocking mandatory share reservation.
2204 	 */
2205 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2206 			NULL, cr);
2207 	if (error != 0) {
2208 		VN_RELE(tovp);
2209 		VN_RELE(fromvp);
2210 		*status = puterrno(error);
2211 		return;
2212 	}
2213 
2214 	/* Check for delegations on the source file */
2215 
2216 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2217 		VN_RELE(tovp);
2218 		VN_RELE(fromvp);
2219 		VN_RELE(srcvp);
2220 		curthread->t_flag |= T_WOULDBLOCK;
2221 		return;
2222 	}
2223 
2224 	/* Check for delegation on the file being renamed over, if it exists */
2225 
2226 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2227 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr)
2228 	    == 0) {
2229 
2230 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2231 			VN_RELE(tovp);
2232 			VN_RELE(fromvp);
2233 			VN_RELE(srcvp);
2234 			VN_RELE(targvp);
2235 			curthread->t_flag |= T_WOULDBLOCK;
2236 			return;
2237 		}
2238 		VN_RELE(targvp);
2239 	}
2240 
2241 
2242 	if (nbl_need_check(srcvp)) {
2243 		nbl_start_crit(srcvp, RW_READER);
2244 		in_crit = 1;
2245 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0)) {
2246 			error = EACCES;
2247 			goto out;
2248 		}
2249 	}
2250 
2251 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START,
2252 		"vop_rename_start:");
2253 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2254 	    tovp, args->rna_to.da_name, cr);
2255 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END,
2256 		"vop_rename_end:");
2257 
2258 	if (error == 0) {
2259 		char *tmp;
2260 
2261 		/* fix the path name for the renamed file */
2262 		mutex_enter(&srcvp->v_lock);
2263 		tmp = srcvp->v_path;
2264 		srcvp->v_path = NULL;
2265 		mutex_exit(&srcvp->v_lock);
2266 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2267 				strlen(args->rna_to.da_name));
2268 		if (tmp != NULL)
2269 			kmem_free(tmp, strlen(tmp) + 1);
2270 	}
2271 
2272 	/*
2273 	 * Force modified data and metadata out to stable storage.
2274 	 */
2275 	(void) VOP_FSYNC(tovp, 0, cr);
2276 	(void) VOP_FSYNC(fromvp, 0, cr);
2277 
2278 out:
2279 	if (in_crit)
2280 		nbl_end_crit(srcvp);
2281 	VN_RELE(srcvp);
2282 	VN_RELE(tovp);
2283 	VN_RELE(fromvp);
2284 
2285 	*status = puterrno(error);
2286 
2287 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2288 		"rfs_rename_end:(%S)", "done");
2289 }
2290 void *
2291 rfs_rename_getfh(struct nfsrnmargs *args)
2292 {
2293 	return (args->rna_from.da_fhandle);
2294 }
2295 
2296 /*
2297  * Link to a file.
2298  * Create a file (to) which is a hard link to the given file (from).
2299  */
2300 void
2301 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2302 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2303 {
2304 	int error;
2305 	vnode_t *fromvp;
2306 	vnode_t *tovp;
2307 	struct exportinfo *to_exi;
2308 	fhandle_t *fh;
2309 
2310 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START,
2311 		"rfs_link_start:");
2312 
2313 	fromvp = nfs_fhtovp(args->la_from, exi);
2314 	if (fromvp == NULL) {
2315 		*status = NFSERR_STALE;
2316 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2317 			"rfs_link_end:(%S)", "from stale");
2318 		return;
2319 	}
2320 
2321 	fh = args->la_to.da_fhandle;
2322 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2323 	if (to_exi == NULL) {
2324 		VN_RELE(fromvp);
2325 		*status = NFSERR_ACCES;
2326 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2327 			"rfs_link_end:(%S)", "cross device");
2328 		return;
2329 	}
2330 	exi_rele(to_exi);
2331 
2332 	if (to_exi != exi) {
2333 		VN_RELE(fromvp);
2334 		*status = NFSERR_XDEV;
2335 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2336 			"rfs_link_end:(%S)", "cross device");
2337 		return;
2338 	}
2339 
2340 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2341 	if (tovp == NULL) {
2342 		VN_RELE(fromvp);
2343 		*status = NFSERR_STALE;
2344 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2345 			"rfs_link_end:(%S)", "to stale");
2346 		return;
2347 	}
2348 
2349 	if (tovp->v_type != VDIR) {
2350 		VN_RELE(tovp);
2351 		VN_RELE(fromvp);
2352 		*status = NFSERR_NOTDIR;
2353 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2354 			"rfs_link_end:(%S)", "not dir");
2355 		return;
2356 	}
2357 	/*
2358 	 * Disallow NULL paths
2359 	 */
2360 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2361 		VN_RELE(tovp);
2362 		VN_RELE(fromvp);
2363 		*status = NFSERR_ACCES;
2364 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2365 			"rfs_link_end:(%S)", "access");
2366 		return;
2367 	}
2368 
2369 	if (rdonly(exi, req)) {
2370 		VN_RELE(tovp);
2371 		VN_RELE(fromvp);
2372 		*status = NFSERR_ROFS;
2373 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2374 			"rfs_link_end:(%S)", "rofs");
2375 		return;
2376 	}
2377 
2378 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START,
2379 		"vop_link_start:");
2380 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr);
2381 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END,
2382 		"vop_link_end:");
2383 
2384 	/*
2385 	 * Force modified data and metadata out to stable storage.
2386 	 */
2387 	(void) VOP_FSYNC(tovp, 0, cr);
2388 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr);
2389 
2390 	VN_RELE(tovp);
2391 	VN_RELE(fromvp);
2392 
2393 	*status = puterrno(error);
2394 
2395 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2396 		"rfs_link_end:(%S)", "done");
2397 }
2398 void *
2399 rfs_link_getfh(struct nfslinkargs *args)
2400 {
2401 	return (args->la_from);
2402 }
2403 
2404 /*
2405  * Symbolicly link to a file.
2406  * Create a file (to) with the given attributes which is a symbolic link
2407  * to the given path name (to).
2408  */
2409 void
2410 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2411 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2412 {
2413 	int error;
2414 	struct vattr va;
2415 	vnode_t *vp;
2416 	vnode_t *svp;
2417 	int lerror;
2418 
2419 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START,
2420 		"rfs_symlink_start:");
2421 
2422 	/*
2423 	 * Disallow NULL paths
2424 	 */
2425 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2426 		*status = NFSERR_ACCES;
2427 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2428 			"rfs_symlink_end:(%S)", "access");
2429 		return;
2430 	}
2431 
2432 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2433 	if (vp == NULL) {
2434 		*status = NFSERR_STALE;
2435 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2436 			"rfs_symlink_end:(%S)", "stale");
2437 		return;
2438 	}
2439 
2440 	if (rdonly(exi, req)) {
2441 		VN_RELE(vp);
2442 		*status = NFSERR_ROFS;
2443 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2444 			"rfs_symlink_end:(%S)", "rofs");
2445 		return;
2446 	}
2447 
2448 	error = sattr_to_vattr(args->sla_sa, &va);
2449 	if (error) {
2450 		VN_RELE(vp);
2451 		*status = puterrno(error);
2452 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2453 			"rfs_symlink_end:(%S)", "sattr");
2454 		return;
2455 	}
2456 
2457 	if (!(va.va_mask & AT_MODE)) {
2458 		VN_RELE(vp);
2459 		*status = NFSERR_INVAL;
2460 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2461 			"rfs_symlink_end:(%S)", "no mode");
2462 		return;
2463 	}
2464 
2465 	va.va_type = VLNK;
2466 	va.va_mask |= AT_TYPE;
2467 
2468 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START,
2469 		"vop_symlink_start:");
2470 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr);
2471 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END,
2472 		"vop_symlink_end:");
2473 
2474 	/*
2475 	 * Force new data and metadata out to stable storage.
2476 	 */
2477 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START,
2478 		"vop_lookup_start:");
2479 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2480 	    0, NULL, cr);
2481 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END,
2482 		"vop_lookup_end:");
2483 	if (!lerror) {
2484 		(void) VOP_FSYNC(svp, 0, cr);
2485 		VN_RELE(svp);
2486 	}
2487 
2488 	/*
2489 	 * Force modified data and metadata out to stable storage.
2490 	 */
2491 	(void) VOP_FSYNC(vp, 0, cr);
2492 
2493 	VN_RELE(vp);
2494 
2495 	*status = puterrno(error);
2496 
2497 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2498 		"rfs_symlink_end:(%S)", "done");
2499 }
2500 void *
2501 rfs_symlink_getfh(struct nfsslargs *args)
2502 {
2503 	return (args->sla_from.da_fhandle);
2504 }
2505 
2506 /*
2507  * Make a directory.
2508  * Create a directory with the given name, parent directory, and attributes.
2509  * Returns a file handle and attributes for the new directory.
2510  */
2511 void
2512 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2513 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2514 {
2515 	int error;
2516 	struct vattr va;
2517 	vnode_t *dvp = NULL;
2518 	vnode_t *vp;
2519 	char *name = args->ca_da.da_name;
2520 
2521 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START,
2522 		"rfs_mkdir_start:");
2523 
2524 	/*
2525 	 * Disallow NULL paths
2526 	 */
2527 	if (name == NULL || *name == '\0') {
2528 		dr->dr_status = NFSERR_ACCES;
2529 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2530 			"rfs_mkdir_end:(%S)", "access");
2531 		return;
2532 	}
2533 
2534 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2535 	if (vp == NULL) {
2536 		dr->dr_status = NFSERR_STALE;
2537 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2538 			"rfs_mkdir_end:(%S)", "stale");
2539 		return;
2540 	}
2541 
2542 	if (rdonly(exi, req)) {
2543 		VN_RELE(vp);
2544 		dr->dr_status = NFSERR_ROFS;
2545 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2546 			"rfs_mkdir_end:(%S)", "rofs");
2547 		return;
2548 	}
2549 
2550 	error = sattr_to_vattr(args->ca_sa, &va);
2551 	if (error) {
2552 		VN_RELE(vp);
2553 		dr->dr_status = puterrno(error);
2554 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2555 			"rfs_mkdir_end:(%S)", "sattr");
2556 		return;
2557 	}
2558 
2559 	if (!(va.va_mask & AT_MODE)) {
2560 		VN_RELE(vp);
2561 		dr->dr_status = NFSERR_INVAL;
2562 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2563 			"rfs_mkdir_end:(%S)", "no mode");
2564 		return;
2565 	}
2566 
2567 	va.va_type = VDIR;
2568 	va.va_mask |= AT_TYPE;
2569 
2570 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START,
2571 		"vop_mkdir_start:");
2572 	error = VOP_MKDIR(vp, name, &va, &dvp, cr);
2573 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END,
2574 		"vop_mkdir_end:");
2575 
2576 	if (!error) {
2577 		/*
2578 		 * Attribtutes of the newly created directory should
2579 		 * be returned to the client.
2580 		 */
2581 		va.va_mask = AT_ALL; /* We want everything */
2582 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
2583 			"vop_getattr_start:");
2584 		error = VOP_GETATTR(dvp, &va, 0, cr);
2585 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
2586 			"vop_getattr_end:");
2587 		/* check for overflows */
2588 		if (!error) {
2589 			acl_perm(vp, exi, &va, cr);
2590 			error = vattr_to_nattr(&va, &dr->dr_attr);
2591 			if (!error) {
2592 				error = makefh(&dr->dr_fhandle, dvp, exi);
2593 			}
2594 		}
2595 		/*
2596 		 * Force new data and metadata out to stable storage.
2597 		 */
2598 		(void) VOP_FSYNC(dvp, 0, cr);
2599 		VN_RELE(dvp);
2600 	}
2601 
2602 	/*
2603 	 * Force modified data and metadata out to stable storage.
2604 	 */
2605 	(void) VOP_FSYNC(vp, 0, cr);
2606 
2607 	VN_RELE(vp);
2608 
2609 	dr->dr_status = puterrno(error);
2610 
2611 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2612 		"rfs_mkdir_end:(%S)", "done");
2613 }
2614 void *
2615 rfs_mkdir_getfh(struct nfscreatargs *args)
2616 {
2617 	return (args->ca_da.da_fhandle);
2618 }
2619 
2620 /*
2621  * Remove a directory.
2622  * Remove the given directory name from the given parent directory.
2623  */
2624 void
2625 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2626 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2627 {
2628 	int error;
2629 	vnode_t *vp;
2630 
2631 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START,
2632 		"rfs_rmdir_start:");
2633 
2634 	/*
2635 	 * Disallow NULL paths
2636 	 */
2637 	if (da->da_name == NULL || *da->da_name == '\0') {
2638 		*status = NFSERR_ACCES;
2639 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2640 			"rfs_rmdir_end:(%S)", "access");
2641 		return;
2642 	}
2643 
2644 	vp = nfs_fhtovp(da->da_fhandle, exi);
2645 	if (vp == NULL) {
2646 		*status = NFSERR_STALE;
2647 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2648 			"rfs_rmdir_end:(%S)", "stale");
2649 		return;
2650 	}
2651 
2652 	if (rdonly(exi, req)) {
2653 		VN_RELE(vp);
2654 		*status = NFSERR_ROFS;
2655 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2656 			"rfs_rmdir_end:(%S)", "rofs");
2657 		return;
2658 	}
2659 
2660 	/*
2661 	 * VOP_RMDIR now takes a new third argument (the current
2662 	 * directory of the process).  That's because someone
2663 	 * wants to return EINVAL if one tries to remove ".".
2664 	 * Of course, NFS servers have no idea what their
2665 	 * clients' current directories are.  We fake it by
2666 	 * supplying a vnode known to exist and illegal to
2667 	 * remove.
2668 	 */
2669 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START,
2670 		"vop_rmdir_start:");
2671 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr);
2672 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END,
2673 		"vop_rmdir_end:");
2674 
2675 	/*
2676 	 * Force modified data and metadata out to stable storage.
2677 	 */
2678 	(void) VOP_FSYNC(vp, 0, cr);
2679 
2680 	VN_RELE(vp);
2681 
2682 	/*
2683 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2684 	 * if the directory is not empty.  A System V NFS server
2685 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2686 	 * over the wire.
2687 	 */
2688 	if (error == EEXIST)
2689 		*status = NFSERR_NOTEMPTY;
2690 	else
2691 		*status = puterrno(error);
2692 
2693 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2694 		"rfs_rmdir_end:(%S)", "done");
2695 }
2696 void *
2697 rfs_rmdir_getfh(struct nfsdiropargs *da)
2698 {
2699 	return (da->da_fhandle);
2700 }
2701 
2702 /* ARGSUSED */
2703 void
2704 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2705 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2706 {
2707 	int error;
2708 	int iseof;
2709 	struct iovec iov;
2710 	struct uio uio;
2711 	vnode_t *vp;
2712 
2713 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START,
2714 		"rfs_readdir_start:");
2715 
2716 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2717 	if (vp == NULL) {
2718 		rd->rd_entries = NULL;
2719 		rd->rd_status = NFSERR_STALE;
2720 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2721 			"rfs_readdir_end:(%S)", "stale");
2722 		return;
2723 	}
2724 
2725 	if (vp->v_type != VDIR) {
2726 		VN_RELE(vp);
2727 		rd->rd_entries = NULL;
2728 		rd->rd_status = NFSERR_NOTDIR;
2729 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2730 			"rfs_readdir_end:(%S)", "notdir");
2731 		return;
2732 	}
2733 
2734 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START,
2735 		"vop_rwlock_start:");
2736 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2737 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END,
2738 		"vop_rwlock_end:");
2739 
2740 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
2741 		"vop_access_start:");
2742 	error = VOP_ACCESS(vp, VREAD, 0, cr);
2743 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
2744 		"vop_access_end:");
2745 	if (error) {
2746 		rd->rd_entries = NULL;
2747 		goto bad;
2748 	}
2749 
2750 	if (rda->rda_count == 0) {
2751 		rd->rd_entries = NULL;
2752 		rd->rd_size = 0;
2753 		rd->rd_eof = FALSE;
2754 		goto bad;
2755 	}
2756 
2757 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2758 
2759 	/*
2760 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2761 	 */
2762 	rd->rd_bufsize = (uint_t)rda->rda_count;
2763 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2764 
2765 	/*
2766 	 * Set up io vector to read directory data
2767 	 */
2768 	iov.iov_base = (caddr_t)rd->rd_entries;
2769 	iov.iov_len = rda->rda_count;
2770 	uio.uio_iov = &iov;
2771 	uio.uio_iovcnt = 1;
2772 	uio.uio_segflg = UIO_SYSSPACE;
2773 	uio.uio_extflg = UIO_COPY_CACHED;
2774 	uio.uio_loffset = (offset_t)rda->rda_offset;
2775 	uio.uio_resid = rda->rda_count;
2776 
2777 	/*
2778 	 * read directory
2779 	 */
2780 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START,
2781 		"vop_readdir_start:");
2782 	error = VOP_READDIR(vp, &uio, cr, &iseof);
2783 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END,
2784 		"vop_readdir_end:");
2785 
2786 	/*
2787 	 * Clean up
2788 	 */
2789 	if (!error) {
2790 		/*
2791 		 * set size and eof
2792 		 */
2793 		if (uio.uio_resid == rda->rda_count) {
2794 			rd->rd_size = 0;
2795 			rd->rd_eof = TRUE;
2796 		} else {
2797 			rd->rd_size = (uint32_t)(rda->rda_count -
2798 			    uio.uio_resid);
2799 			rd->rd_eof = iseof ? TRUE : FALSE;
2800 		}
2801 	}
2802 
2803 bad:
2804 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
2805 		"vop_rwunlock_start:");
2806 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2807 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
2808 		"vop_rwunlock_end:");
2809 
2810 #if 0 /* notyet */
2811 	/*
2812 	 * Don't do this.  It causes local disk writes when just
2813 	 * reading the file and the overhead is deemed larger
2814 	 * than the benefit.
2815 	 */
2816 	/*
2817 	 * Force modified metadata out to stable storage.
2818 	 */
2819 	(void) VOP_FSYNC(vp, FNODSYNC, cr);
2820 #endif
2821 
2822 	VN_RELE(vp);
2823 
2824 	rd->rd_status = puterrno(error);
2825 
2826 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2827 		"rfs_readdir_end:(%S)", "done");
2828 }
2829 void *
2830 rfs_readdir_getfh(struct nfsrddirargs *rda)
2831 {
2832 	return (&rda->rda_fh);
2833 }
2834 void
2835 rfs_rddirfree(struct nfsrddirres *rd)
2836 {
2837 	if (rd->rd_entries != NULL)
2838 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2839 }
2840 
2841 /* ARGSUSED */
2842 void
2843 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2844 	struct svc_req *req, cred_t *cr)
2845 {
2846 	int error;
2847 	struct statvfs64 sb;
2848 	vnode_t *vp;
2849 
2850 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START,
2851 		"rfs_statfs_start:");
2852 
2853 	vp = nfs_fhtovp(fh, exi);
2854 	if (vp == NULL) {
2855 		fs->fs_status = NFSERR_STALE;
2856 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2857 			"rfs_statfs_end:(%S)", "stale");
2858 		return;
2859 	}
2860 
2861 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2862 
2863 	if (!error) {
2864 		fs->fs_tsize = nfstsize();
2865 		fs->fs_bsize = sb.f_frsize;
2866 		fs->fs_blocks = sb.f_blocks;
2867 		fs->fs_bfree = sb.f_bfree;
2868 		fs->fs_bavail = sb.f_bavail;
2869 	}
2870 
2871 	VN_RELE(vp);
2872 
2873 	fs->fs_status = puterrno(error);
2874 
2875 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2876 		"rfs_statfs_end:(%S)", "done");
2877 }
2878 void *
2879 rfs_statfs_getfh(fhandle_t *fh)
2880 {
2881 	return (fh);
2882 }
2883 
2884 static int
2885 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2886 {
2887 	vap->va_mask = 0;
2888 
2889 	/*
2890 	 * There was a sign extension bug in some VFS based systems
2891 	 * which stored the mode as a short.  When it would get
2892 	 * assigned to a u_long, no sign extension would occur.
2893 	 * It needed to, but this wasn't noticed because sa_mode
2894 	 * would then get assigned back to the short, thus ignoring
2895 	 * the upper 16 bits of sa_mode.
2896 	 *
2897 	 * To make this implementation work for both broken
2898 	 * clients and good clients, we check for both versions
2899 	 * of the mode.
2900 	 */
2901 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2902 	    sa->sa_mode != (uint32_t)-1) {
2903 		vap->va_mask |= AT_MODE;
2904 		vap->va_mode = sa->sa_mode;
2905 	}
2906 	if (sa->sa_uid != (uint32_t)-1) {
2907 		vap->va_mask |= AT_UID;
2908 		vap->va_uid = sa->sa_uid;
2909 	}
2910 	if (sa->sa_gid != (uint32_t)-1) {
2911 		vap->va_mask |= AT_GID;
2912 		vap->va_gid = sa->sa_gid;
2913 	}
2914 	if (sa->sa_size != (uint32_t)-1) {
2915 		vap->va_mask |= AT_SIZE;
2916 		vap->va_size = sa->sa_size;
2917 	}
2918 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2919 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2920 #ifndef _LP64
2921 		/* return error if time overflow */
2922 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2923 			return (EOVERFLOW);
2924 #endif
2925 		vap->va_mask |= AT_ATIME;
2926 		/*
2927 		 * nfs protocol defines times as unsigned so don't extend sign,
2928 		 * unless sysadmin set nfs_allow_preepoch_time.
2929 		 */
2930 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2931 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2932 	}
2933 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2934 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2935 #ifndef _LP64
2936 		/* return error if time overflow */
2937 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2938 			return (EOVERFLOW);
2939 #endif
2940 		vap->va_mask |= AT_MTIME;
2941 		/*
2942 		 * nfs protocol defines times as unsigned so don't extend sign,
2943 		 * unless sysadmin set nfs_allow_preepoch_time.
2944 		 */
2945 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2946 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2947 	}
2948 	return (0);
2949 }
2950 
2951 static enum nfsftype vt_to_nf[] = {
2952 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2953 };
2954 
2955 /*
2956  * check the following fields for overflow: nodeid, size, and time.
2957  * There could be a problem when converting 64-bit LP64 fields
2958  * into 32-bit ones.  Return an error if there is an overflow.
2959  */
2960 int
2961 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2962 {
2963 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2964 	na->na_type = vt_to_nf[vap->va_type];
2965 
2966 	if (vap->va_mode == (unsigned short) -1)
2967 		na->na_mode = (uint32_t)-1;
2968 	else
2969 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2970 
2971 	if (vap->va_uid == (unsigned short)(-1))
2972 		na->na_uid = (uint32_t)(-1);
2973 	else if (vap->va_uid == UID_NOBODY)
2974 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2975 	else
2976 		na->na_uid = vap->va_uid;
2977 
2978 	if (vap->va_gid == (unsigned short)(-1))
2979 		na->na_gid = (uint32_t)-1;
2980 	else if (vap->va_gid == GID_NOBODY)
2981 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2982 	else
2983 		na->na_gid = vap->va_gid;
2984 
2985 	/*
2986 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2987 	 * vattr, but are bigger than 32 bit values supported?
2988 	 */
2989 	na->na_fsid = vap->va_fsid;
2990 
2991 	na->na_nodeid = vap->va_nodeid;
2992 
2993 	/*
2994 	 * Check to make sure that the nodeid is representable over the
2995 	 * wire without losing bits.
2996 	 */
2997 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2998 		return (EFBIG);
2999 	na->na_nlink = vap->va_nlink;
3000 
3001 	/*
3002 	 * Check for big files here, instead of at the caller.  See
3003 	 * comments in cstat for large special file explanation.
3004 	 */
3005 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
3006 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
3007 			return (EFBIG);
3008 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
3009 			/* UNKNOWN_SIZE | OVERFLOW */
3010 			na->na_size = MAXOFF32_T;
3011 		} else
3012 			na->na_size = vap->va_size;
3013 	} else
3014 		na->na_size = vap->va_size;
3015 
3016 	/*
3017 	 * If the vnode times overflow the 32-bit times that NFS2
3018 	 * uses on the wire then return an error.
3019 	 */
3020 	if (!NFS_VAP_TIME_OK(vap)) {
3021 		return (EOVERFLOW);
3022 	}
3023 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
3024 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
3025 
3026 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
3027 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
3028 
3029 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
3030 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
3031 
3032 	/*
3033 	 * If the dev_t will fit into 16 bits then compress
3034 	 * it, otherwise leave it alone. See comments in
3035 	 * nfs_client.c.
3036 	 */
3037 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
3038 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
3039 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
3040 	else
3041 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
3042 
3043 	na->na_blocks = vap->va_nblocks;
3044 	na->na_blocksize = vap->va_blksize;
3045 
3046 	/*
3047 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
3048 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
3049 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
3050 	 *
3051 	 * BUYER BEWARE:
3052 	 *  If you are porting the NFS to a non-Sun server, you probably
3053 	 *  don't want to include the following block of code.  The
3054 	 *  over-the-wire special file types will be changing with the
3055 	 *  NFS Protocol Revision.
3056 	 */
3057 	if (vap->va_type == VFIFO)
3058 		NA_SETFIFO(na);
3059 	return (0);
3060 }
3061 
3062 /*
3063  * acl v2 support: returns approximate permission.
3064  *	default: returns minimal permission (more restrictive)
3065  *	aclok: returns maximal permission (less restrictive)
3066  *	This routine changes the permissions that are alaredy in *va.
3067  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3068  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3069  */
3070 static void
3071 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3072 {
3073 	vsecattr_t	vsa;
3074 	int		aclcnt;
3075 	aclent_t	*aclentp;
3076 	mode_t		mask_perm;
3077 	mode_t		grp_perm;
3078 	mode_t		other_perm;
3079 	mode_t		other_orig;
3080 	int		error;
3081 
3082 	/* dont care default acl */
3083 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3084 	error = VOP_GETSECATTR(vp, &vsa, 0, cr);
3085 
3086 	if (!error) {
3087 		aclcnt = vsa.vsa_aclcnt;
3088 		if (aclcnt > MIN_ACL_ENTRIES) {
3089 			/* non-trivial ACL */
3090 			aclentp = vsa.vsa_aclentp;
3091 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3092 				/* maximal permissions */
3093 				grp_perm = 0;
3094 				other_perm = 0;
3095 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3096 					switch (aclentp->a_type) {
3097 					case USER_OBJ:
3098 						break;
3099 					case USER:
3100 						grp_perm |=
3101 						    aclentp->a_perm << 3;
3102 						other_perm |= aclentp->a_perm;
3103 						break;
3104 					case GROUP_OBJ:
3105 						grp_perm |=
3106 						    aclentp->a_perm << 3;
3107 						break;
3108 					case GROUP:
3109 						other_perm |= aclentp->a_perm;
3110 						break;
3111 					case OTHER_OBJ:
3112 						other_orig = aclentp->a_perm;
3113 						break;
3114 					case CLASS_OBJ:
3115 						mask_perm = aclentp->a_perm;
3116 						break;
3117 					default:
3118 						break;
3119 					}
3120 				}
3121 				grp_perm &= mask_perm << 3;
3122 				other_perm &= mask_perm;
3123 				other_perm |= other_orig;
3124 
3125 			} else {
3126 				/* minimal permissions */
3127 				grp_perm = 070;
3128 				other_perm = 07;
3129 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3130 					switch (aclentp->a_type) {
3131 					case USER_OBJ:
3132 						break;
3133 					case USER:
3134 					case CLASS_OBJ:
3135 						grp_perm &=
3136 						    aclentp->a_perm << 3;
3137 						other_perm &=
3138 						    aclentp->a_perm;
3139 						break;
3140 					case GROUP_OBJ:
3141 						grp_perm &=
3142 						    aclentp->a_perm << 3;
3143 						break;
3144 					case GROUP:
3145 						other_perm &=
3146 						    aclentp->a_perm;
3147 						break;
3148 					case OTHER_OBJ:
3149 						other_perm &=
3150 						    aclentp->a_perm;
3151 						break;
3152 					default:
3153 						break;
3154 					}
3155 				}
3156 			}
3157 			/* copy to va */
3158 			va->va_mode &= ~077;
3159 			va->va_mode |= grp_perm | other_perm;
3160 		}
3161 		if (vsa.vsa_aclcnt)
3162 			kmem_free(vsa.vsa_aclentp,
3163 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3164 	}
3165 }
3166 
3167 void
3168 rfs_srvrinit(void)
3169 {
3170 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3171 }
3172 
3173 void
3174 rfs_srvrfini(void)
3175 {
3176 	mutex_destroy(&rfs_async_write_lock);
3177 }
3178