xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 99d3b4e271d47a93935645d0c2d348d161f90c80)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
30  *	All rights reserved.
31  */
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 #include <sys/sdt.h>
56 
57 #include <rpc/types.h>
58 #include <rpc/auth.h>
59 #include <rpc/svc.h>
60 
61 #include <nfs/nfs.h>
62 #include <nfs/export.h>
63 #include <nfs/nfs_cmd.h>
64 
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_kmem.h>
70 
71 #include <sys/strsubr.h>
72 
73 /*
74  * These are the interface routines for the server side of the
75  * Network File System.  See the NFS version 2 protocol specification
76  * for a description of this interface.
77  */
78 
79 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
80 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
81 			cred_t *);
82 
83 /*
84  * Some "over the wire" UNIX file types.  These are encoded
85  * into the mode.  This needs to be fixed in the next rev.
86  */
87 #define	IFMT		0170000		/* type of file */
88 #define	IFCHR		0020000		/* character special */
89 #define	IFBLK		0060000		/* block special */
90 #define	IFSOCK		0140000		/* socket */
91 
92 u_longlong_t nfs2_srv_caller_id;
93 
94 /*
95  * Get file attributes.
96  * Returns the current attributes of the file with the given fhandle.
97  */
98 /* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the filehandle; a NULL vnode means a stale handle. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	/*
	 * Delegation-aware getattr: attributes account for any NFSv4
	 * delegation outstanding on this file.
	 */
	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		/*
		 * acl_perm() presumably adjusts the reported mode to
		 * reflect ACL-granted access for this client — see its
		 * definition elsewhere in this file.
		 */
		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	/* Map the kernel errno (or 0) to the NFSv2 wire status. */
	ns->ns_status = puterrno(error);
}
134 void *
135 rfs_getattr_getfh(fhandle_t *fhp)
136 {
137 	return (fhp);
138 }
139 
140 /*
141  * Set file attributes.
142  * Sets the attributes of the file with the given fhandle.  Returns
143  * the new attributes.
144  */
145 /* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* flag passed through to VOP_SETATTR */
	int in_crit = 0;	/* inside an nbmand critical region? */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file "before" */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on a read-only export. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the over-the-wire sattr into a kernel vattr. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * Identify ourselves to the VOP layer; CC_DONTBLOCK asks the
	 * delegation monitor to return EAGAIN rather than block.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Fetch the current owner and size for the checks below. */
		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the resize is
			 * [min(old, new), |old - new|).
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE shortcut; everyone
		 * else leaves AT_SIZE in the mask so VOP_SETATTR below
		 * performs its normal access checking.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		/* Re-fetch so the reply carries the post-setattr state. */
		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
327 void *
328 rfs_setattr_getfh(struct nfssaargs *args)
329 {
330 	return (&args->saa_fh);
331 }
332 
333 /* Change and release @exip and @vpp only in success */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Work on a private hold; *vpp is untouched unless we succeed. */
	VN_HOLD(vp);

	/* Cross to the root vnode of the filesystem mounted here. */
	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* Is the submount itself exported (and with "nohide")? */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	/*
	 * Returns 0 both when the crossing happened and when the
	 * submount was simply not eligible; callers only see an
	 * updated *vpp/*exip in the former case.
	 */
	return (0);
}
378 
379 /*
380  * Given mounted "dvp" and "exi", go upper mountpoint
381  * with dvp/exi correction
382  * Return 0 in success
383  */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* Only meaningful when starting at a filesystem root. */
	ASSERT(dvp->v_flag & VROOT);

	VN_HOLD(dvp);
	/* Step up to the covered vnode in the parent filesystem. */
	dvp = untraverse(dvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		/* Parent is not exported; -1 (not an errno) signals failure. */
		VN_RELE(dvp);
		return (-1);
	}

	/* Swap the caller's references over to the parent fs/export. */
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
407 /*
408  * Directory lookup.
409  * Returns an fhandle and file attributes for file name in a directory.
410  */
411 /* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;		/* directory being searched */
	vnode_t *vp;		/* result of the lookup */
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = rootdir;
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/*
	 * Take our own reference on the export; it may be swapped for a
	 * different one below (public-fh lookup, mountpoint crossing)
	 * and is dropped on the way out.
	 */
	exi_hold(exi);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    (dvp->v_flag & VROOT)) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else  {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Translate the name per the client's charset mapping, if any. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/*
		 * rfs_publicfh_mclookup() returns its own held export in
		 * 'exi'; drop ours first.  NOTE(review): presumably the
		 * callee leaves 'exi' in a state safe for the exi_rele()
		 * at 'out' on failure — confirm against its definition.
		 */
		exi_rele(exi);

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* nfscmd_convname() allocates only when it actually converted. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/*
	 * If we landed on a mountpoint, try to descend into the
	 * mounted filesystem (only succeeds for nohide exports).
	 */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					/* WebNFS 0x81 security negotiation */
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
566 void *
567 rfs_lookup_getfh(struct nfsdiropargs *da)
568 {
569 	return (da->da_fhandle);
570 }
571 
572 /*
573  * Read symbolic link.
574  * Returns the string in the symbolic link at the given fhandle.
575  */
576 /* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;	/* NFSv4 referral masquerading as a symlink */

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects under mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if VOP_READLINK consumed the entire
		 * buffer (uio_resid == 0), rl_count == NFS_MAXPATHLEN
		 * and this stores one byte past the NFS_MAXPATHLEN
		 * allocation — confirm link targets are bounded below
		 * NFS_MAXPATHLEN by the filesystems involved.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text to the client's charset, if configured. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
702 void *
703 rfs_readlink_getfh(fhandle_t *fhp)
704 {
705 	return (fhp);
706 }
707 /*
708  * Free data allocated by rfs_readlink
709  */
710 void
711 rfs_rlfree(struct nfsrdlnres *rl)
712 {
713 	if (rl->rl_data != NULL)
714 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
715 }
716 
717 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
718 
719 /*
720  * Read data.
721  * Returns some data read from the file at the given fhandle.
722  */
723 /* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;		/* reply data buffer (non-RDMA path) */
	int alloc_err = 0;
	int in_crit = 0;	/* inside an nbmand critical region? */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	/* NFSv2 READ is only defined on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	/*
	 * Identify ourselves to the VOP layer; CC_DONTBLOCK asks the
	 * delegation monitor to return EAGAIN rather than block.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse files under mandatory locking (could block forever). */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* Reading at or past EOF: success with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA path: read directly into the client's write chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Bytes actually read = requested minus what was left over. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
971 
972 /*
973  * Free data allocated by rfs_read
974  */
975 void
976 rfs_rdfree(struct nfsrdresult *rr)
977 {
978 	mblk_t *mp;
979 
980 	if (rr->rr_status == NFS_OK) {
981 		mp = rr->rr_mp;
982 		if (mp != NULL)
983 			freeb(mp);
984 	}
985 }
986 
987 void *
988 rfs_read_getfh(struct nfsreadargs *ra)
989 {
990 	return (&ra->ra_fhandle);
991 }
992 
993 #define	MAX_IOVECS	12
994 
995 #ifdef DEBUG
996 static int rfs_write_sync_hits = 0;
997 static int rfs_write_sync_misses = 0;
998 #endif
999 
1000 /*
1001  * Write data to file.
1002  * Returns attributes of a file after writing some data to it.
1003  *
1004  * Any changes made here, especially in error handling might have
1005  * to also be done in rfs_write (which clusters write requests).
1006  */
1007 /* ARGSUSED */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct iovec iov[MAX_IOVECS];	/* inline iovecs; spill to kmem */
	mblk_t *m;
	struct iovec *iovp;
	int iovcnt;
	cred_t *savecred;
	int in_crit = 0;	/* inside an nbmand critical region? */
	caller_context_t ct;

	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* NFSv2 WRITE is only defined on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ISDIR;
		return;
	}

	/*
	 * Identify ourselves to the VOP layer; CC_DONTBLOCK asks the
	 * delegation monitor to return EAGAIN rather than block.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	va.va_mask = AT_UID|AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);

		return;
	}

	if (crgetuid(cr) != va.va_uid) {
		/*
		 * This is a kludge to allow writes of files created
		 * with read only permission.  The owner of the file
		 * is always allowed to write it.
		 */
		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);

		if (error) {
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}
	}

	/*
	 * Can't access a mandatory lock file.  This might cause
	 * the NFS service thread to block forever waiting for a
	 * lock to be released that will never be released.
	 */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ACCES;
		return;
	}

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
		    wa->wa_count, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		/*
		 * NOTE(review): this return path skips the nbl_end_crit()
		 * done at 'out' when in_crit is set — confirm whether a
		 * critical region can still be held here.
		 */
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (wa->wa_data || wa->wa_rlist) {
		/* Do the RDMA thing if necessary */
		if (wa->wa_rlist) {
			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
			iov[0].iov_len = wa->wa_count;
		} else  {
			iov[0].iov_base = wa->wa_data;
			iov[0].iov_len = wa->wa_count;
		}
		uio.uio_iov = iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * for now we assume no append mode
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;
	} else {
		/*
		 * Data arrived as an mblk chain; build an iovec per
		 * mblk, spilling to a heap array if there are more
		 * than MAX_IOVECS of them.
		 */
		iovcnt = 0;
		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
			iovcnt++;
		if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
			rfs_write_sync_hits++;
#endif
			iovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_sync_misses++;
#endif
			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
		}
		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
		uio.uio_iov = iovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)wa->wa_offset;
		uio.uio_resid = wa->wa_count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */
		/*
		 * We're changing creds because VM may fault and we need
		 * the cred of the current thread to be used if quota
		 * checking is enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
		curthread->t_cred = savecred;

		if (iovp != iov)
			kmem_free(iovp, sizeof (*iovp) * iovcnt);
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (!error) {
		/*
		 * Get attributes again so we send the latest mod
		 * time to the client side for its cache.
		 */
		va.va_mask = AT_ALL;	/* now we want everything */

		error = VOP_GETATTR(vp, &va, 0, cr, &ct);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

out:
	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
	else
		ns->ns_status = puterrno(error);

}
1224 
/*
 * One queued WRITE request participating in the clustered (async)
 * write path of rfs_write().  Requests are chained through 'list';
 * usage details live in rfs_write(), beyond this view — field roles
 * below follow from the names and rfs_write_sync()'s parameters.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* reply structure to fill in */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* caller's credentials */
	bool_t ro;			/* export mounted read-only */
	kthread_t *thread;		/* service thread for this request */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1234 
/*
 * Head of one write cluster: the set of pending WRITE requests for a
 * single file handle.  Clusters are linked on the global
 * rfs_async_write_head list, protected by rfs_async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to all requests */
	kcondvar_t cv;			/* broadcast when the cluster completes */
	struct rfs_async_write *list;	/* requests, sorted by starting offset */
	struct rfs_async_write_list *next;	/* next cluster on the global list */
};
1241 
/* Global list of in-progress write clusters; protected by the lock below. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs handled with the on-stack array in rfs_write() */
#define	MAXCLIOVECS	42
/* "Response not yet filled in" sentinel for ns_status (0 would mean NFS_OK) */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* Counters: clusters served by the stack iovec array vs. kmem_alloc'd ones */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1252 #endif
1253 
1254 /*
1255  * Write data to file.
1256  * Returns attributes of a file after writing some data to it.
1257  */
1258 void
1259 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1260     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1261 {
1262 	int error;
1263 	vnode_t *vp;
1264 	rlim64_t rlimit;
1265 	struct vattr va;
1266 	struct uio uio;
1267 	struct rfs_async_write_list *lp;
1268 	struct rfs_async_write_list *nlp;
1269 	struct rfs_async_write *rp;
1270 	struct rfs_async_write *nrp;
1271 	struct rfs_async_write *trp;
1272 	struct rfs_async_write *lrp;
1273 	int data_written;
1274 	int iovcnt;
1275 	mblk_t *m;
1276 	struct iovec *iovp;
1277 	struct iovec *niovp;
1278 	struct iovec iov[MAXCLIOVECS];
1279 	int count;
1280 	int rcount;
1281 	uint_t off;
1282 	uint_t len;
1283 	struct rfs_async_write nrpsp;
1284 	struct rfs_async_write_list nlpsp;
1285 	ushort_t t_flag;
1286 	cred_t *savecred;
1287 	int in_crit = 0;
1288 	caller_context_t ct;
1289 
1290 	if (!rfs_write_async) {
1291 		rfs_write_sync(wa, ns, exi, req, cr, ro);
1292 		return;
1293 	}
1294 
1295 	/*
1296 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1297 	 * is considered an OK.
1298 	 */
1299 	ns->ns_status = RFSWRITE_INITVAL;
1300 
1301 	nrp = &nrpsp;
1302 	nrp->wa = wa;
1303 	nrp->ns = ns;
1304 	nrp->req = req;
1305 	nrp->cr = cr;
1306 	nrp->ro = ro;
1307 	nrp->thread = curthread;
1308 
1309 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1310 
1311 	/*
1312 	 * Look to see if there is already a cluster started
1313 	 * for this file.
1314 	 */
1315 	mutex_enter(&rfs_async_write_lock);
1316 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1317 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1318 		    sizeof (fhandle_t)) == 0)
1319 			break;
1320 	}
1321 
1322 	/*
1323 	 * If lp is non-NULL, then there is already a cluster
1324 	 * started.  We need to place ourselves in the cluster
1325 	 * list in the right place as determined by starting
1326 	 * offset.  Conflicts with non-blocking mandatory locked
1327 	 * regions will be checked when the cluster is processed.
1328 	 */
1329 	if (lp != NULL) {
1330 		rp = lp->list;
1331 		trp = NULL;
1332 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1333 			trp = rp;
1334 			rp = rp->list;
1335 		}
1336 		nrp->list = rp;
1337 		if (trp == NULL)
1338 			lp->list = nrp;
1339 		else
1340 			trp->list = nrp;
1341 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1342 			cv_wait(&lp->cv, &rfs_async_write_lock);
1343 		mutex_exit(&rfs_async_write_lock);
1344 
1345 		return;
1346 	}
1347 
1348 	/*
1349 	 * No cluster started yet, start one and add ourselves
1350 	 * to the list of clusters.
1351 	 */
1352 	nrp->list = NULL;
1353 
1354 	nlp = &nlpsp;
1355 	nlp->fhp = &wa->wa_fhandle;
1356 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1357 	nlp->list = nrp;
1358 	nlp->next = NULL;
1359 
1360 	if (rfs_async_write_head == NULL) {
1361 		rfs_async_write_head = nlp;
1362 	} else {
1363 		lp = rfs_async_write_head;
1364 		while (lp->next != NULL)
1365 			lp = lp->next;
1366 		lp->next = nlp;
1367 	}
1368 	mutex_exit(&rfs_async_write_lock);
1369 
1370 	/*
1371 	 * Convert the file handle common to all of the requests
1372 	 * in this cluster to a vnode.
1373 	 */
1374 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1375 	if (vp == NULL) {
1376 		mutex_enter(&rfs_async_write_lock);
1377 		if (rfs_async_write_head == nlp)
1378 			rfs_async_write_head = nlp->next;
1379 		else {
1380 			lp = rfs_async_write_head;
1381 			while (lp->next != nlp)
1382 				lp = lp->next;
1383 			lp->next = nlp->next;
1384 		}
1385 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1386 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1387 			rp->ns->ns_status = NFSERR_STALE;
1388 			rp->thread->t_flag |= t_flag;
1389 		}
1390 		cv_broadcast(&nlp->cv);
1391 		mutex_exit(&rfs_async_write_lock);
1392 
1393 		return;
1394 	}
1395 
1396 	/*
1397 	 * Can only write regular files.  Attempts to write any
1398 	 * other file types fail with EISDIR.
1399 	 */
1400 	if (vp->v_type != VREG) {
1401 		VN_RELE(vp);
1402 		mutex_enter(&rfs_async_write_lock);
1403 		if (rfs_async_write_head == nlp)
1404 			rfs_async_write_head = nlp->next;
1405 		else {
1406 			lp = rfs_async_write_head;
1407 			while (lp->next != nlp)
1408 				lp = lp->next;
1409 			lp->next = nlp->next;
1410 		}
1411 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1412 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1413 			rp->ns->ns_status = NFSERR_ISDIR;
1414 			rp->thread->t_flag |= t_flag;
1415 		}
1416 		cv_broadcast(&nlp->cv);
1417 		mutex_exit(&rfs_async_write_lock);
1418 
1419 		return;
1420 	}
1421 
1422 	/*
1423 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1424 	 * deadlock with ufs.
1425 	 */
1426 	if (nbl_need_check(vp)) {
1427 		nbl_start_crit(vp, RW_READER);
1428 		in_crit = 1;
1429 	}
1430 
1431 	ct.cc_sysid = 0;
1432 	ct.cc_pid = 0;
1433 	ct.cc_caller_id = nfs2_srv_caller_id;
1434 	ct.cc_flags = CC_DONTBLOCK;
1435 
1436 	/*
1437 	 * Lock the file for writing.  This operation provides
1438 	 * the delay which allows clusters to grow.
1439 	 */
1440 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1441 
1442 	/* check if a monitor detected a delegation conflict */
1443 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1444 		if (in_crit)
1445 			nbl_end_crit(vp);
1446 		VN_RELE(vp);
1447 		/* mark as wouldblock so response is dropped */
1448 		curthread->t_flag |= T_WOULDBLOCK;
1449 		mutex_enter(&rfs_async_write_lock);
1450 		if (rfs_async_write_head == nlp)
1451 			rfs_async_write_head = nlp->next;
1452 		else {
1453 			lp = rfs_async_write_head;
1454 			while (lp->next != nlp)
1455 				lp = lp->next;
1456 			lp->next = nlp->next;
1457 		}
1458 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1459 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1460 				rp->ns->ns_status = puterrno(error);
1461 				rp->thread->t_flag |= T_WOULDBLOCK;
1462 			}
1463 		}
1464 		cv_broadcast(&nlp->cv);
1465 		mutex_exit(&rfs_async_write_lock);
1466 
1467 		return;
1468 	}
1469 
1470 	/*
1471 	 * Disconnect this cluster from the list of clusters.
1472 	 * The cluster that is being dealt with must be fixed
1473 	 * in size after this point, so there is no reason
1474 	 * to leave it on the list so that new requests can
1475 	 * find it.
1476 	 *
1477 	 * The algorithm is that the first write request will
1478 	 * create a cluster, convert the file handle to a
1479 	 * vnode pointer, and then lock the file for writing.
1480 	 * This request is not likely to be clustered with
1481 	 * any others.  However, the next request will create
1482 	 * a new cluster and be blocked in VOP_RWLOCK while
1483 	 * the first request is being processed.  This delay
1484 	 * will allow more requests to be clustered in this
1485 	 * second cluster.
1486 	 */
1487 	mutex_enter(&rfs_async_write_lock);
1488 	if (rfs_async_write_head == nlp)
1489 		rfs_async_write_head = nlp->next;
1490 	else {
1491 		lp = rfs_async_write_head;
1492 		while (lp->next != nlp)
1493 			lp = lp->next;
1494 		lp->next = nlp->next;
1495 	}
1496 	mutex_exit(&rfs_async_write_lock);
1497 
1498 	/*
1499 	 * Step through the list of requests in this cluster.
1500 	 * We need to check permissions to make sure that all
1501 	 * of the requests have sufficient permission to write
1502 	 * the file.  A cluster can be composed of requests
1503 	 * from different clients and different users on each
1504 	 * client.
1505 	 *
1506 	 * As a side effect, we also calculate the size of the
1507 	 * byte range that this cluster encompasses.
1508 	 */
1509 	rp = nlp->list;
1510 	off = rp->wa->wa_offset;
1511 	len = (uint_t)0;
1512 	do {
1513 		if (rdonly(rp->ro, vp)) {
1514 			rp->ns->ns_status = NFSERR_ROFS;
1515 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1516 			rp->thread->t_flag |= t_flag;
1517 			continue;
1518 		}
1519 
1520 		va.va_mask = AT_UID|AT_MODE;
1521 
1522 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1523 
1524 		if (!error) {
1525 			if (crgetuid(rp->cr) != va.va_uid) {
1526 				/*
1527 				 * This is a kludge to allow writes of files
1528 				 * created with read only permission.  The
1529 				 * owner of the file is always allowed to
1530 				 * write it.
1531 				 */
1532 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1533 			}
1534 			if (!error && MANDLOCK(vp, va.va_mode))
1535 				error = EACCES;
1536 		}
1537 
1538 		/*
1539 		 * Check for a conflict with a nbmand-locked region.
1540 		 */
1541 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1542 		    rp->wa->wa_count, 0, NULL)) {
1543 			error = EACCES;
1544 		}
1545 
1546 		if (error) {
1547 			rp->ns->ns_status = puterrno(error);
1548 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1549 			rp->thread->t_flag |= t_flag;
1550 			continue;
1551 		}
1552 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1553 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1554 	} while ((rp = rp->list) != NULL);
1555 
1556 	/*
1557 	 * Step through the cluster attempting to gather as many
1558 	 * requests which are contiguous as possible.  These
1559 	 * contiguous requests are handled via one call to VOP_WRITE
1560 	 * instead of different calls to VOP_WRITE.  We also keep
1561 	 * track of the fact that any data was written.
1562 	 */
1563 	rp = nlp->list;
1564 	data_written = 0;
1565 	do {
1566 		/*
1567 		 * Skip any requests which are already marked as having an
1568 		 * error.
1569 		 */
1570 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1571 			rp = rp->list;
1572 			continue;
1573 		}
1574 
1575 		/*
1576 		 * Count the number of iovec's which are required
1577 		 * to handle this set of requests.  One iovec is
1578 		 * needed for each data buffer, whether addressed
1579 		 * by wa_data or by the b_rptr pointers in the
1580 		 * mblk chains.
1581 		 */
1582 		iovcnt = 0;
1583 		lrp = rp;
1584 		for (;;) {
1585 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1586 				iovcnt++;
1587 			else {
1588 				m = lrp->wa->wa_mblk;
1589 				while (m != NULL) {
1590 					iovcnt++;
1591 					m = m->b_cont;
1592 				}
1593 			}
1594 			if (lrp->list == NULL ||
1595 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1596 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1597 			    lrp->list->wa->wa_offset) {
1598 				lrp = lrp->list;
1599 				break;
1600 			}
1601 			lrp = lrp->list;
1602 		}
1603 
1604 		if (iovcnt <= MAXCLIOVECS) {
1605 #ifdef DEBUG
1606 			rfs_write_hits++;
1607 #endif
1608 			niovp = iov;
1609 		} else {
1610 #ifdef DEBUG
1611 			rfs_write_misses++;
1612 #endif
1613 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1614 		}
1615 		/*
1616 		 * Put together the scatter/gather iovecs.
1617 		 */
1618 		iovp = niovp;
1619 		trp = rp;
1620 		count = 0;
1621 		do {
1622 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1623 				if (trp->wa->wa_rlist) {
1624 					iovp->iov_base =
1625 					    (char *)((trp->wa->wa_rlist)->
1626 					    u.c_daddr3);
1627 					iovp->iov_len = trp->wa->wa_count;
1628 				} else  {
1629 					iovp->iov_base = trp->wa->wa_data;
1630 					iovp->iov_len = trp->wa->wa_count;
1631 				}
1632 				iovp++;
1633 			} else {
1634 				m = trp->wa->wa_mblk;
1635 				rcount = trp->wa->wa_count;
1636 				while (m != NULL) {
1637 					iovp->iov_base = (caddr_t)m->b_rptr;
1638 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1639 					rcount -= iovp->iov_len;
1640 					if (rcount < 0)
1641 						iovp->iov_len += rcount;
1642 					iovp++;
1643 					if (rcount <= 0)
1644 						break;
1645 					m = m->b_cont;
1646 				}
1647 			}
1648 			count += trp->wa->wa_count;
1649 			trp = trp->list;
1650 		} while (trp != lrp);
1651 
1652 		uio.uio_iov = niovp;
1653 		uio.uio_iovcnt = iovcnt;
1654 		uio.uio_segflg = UIO_SYSSPACE;
1655 		uio.uio_extflg = UIO_COPY_DEFAULT;
1656 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1657 		uio.uio_resid = count;
1658 		/*
1659 		 * The limit is checked on the client. We
1660 		 * should allow any size writes here.
1661 		 */
1662 		uio.uio_llimit = curproc->p_fsz_ctl;
1663 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1664 		if (rlimit < (rlim64_t)uio.uio_resid)
1665 			uio.uio_resid = (uint_t)rlimit;
1666 
1667 		/*
1668 		 * For now we assume no append mode.
1669 		 */
1670 
1671 		/*
1672 		 * We're changing creds because VM may fault
1673 		 * and we need the cred of the current
1674 		 * thread to be used if quota * checking is
1675 		 * enabled.
1676 		 */
1677 		savecred = curthread->t_cred;
1678 		curthread->t_cred = cr;
1679 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1680 		curthread->t_cred = savecred;
1681 
1682 		/* check if a monitor detected a delegation conflict */
1683 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1684 			/* mark as wouldblock so response is dropped */
1685 			curthread->t_flag |= T_WOULDBLOCK;
1686 
1687 		if (niovp != iov)
1688 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1689 
1690 		if (!error) {
1691 			data_written = 1;
1692 			/*
1693 			 * Get attributes again so we send the latest mod
1694 			 * time to the client side for its cache.
1695 			 */
1696 			va.va_mask = AT_ALL;	/* now we want everything */
1697 
1698 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1699 
1700 			if (!error)
1701 				acl_perm(vp, exi, &va, rp->cr);
1702 		}
1703 
1704 		/*
1705 		 * Fill in the status responses for each request
1706 		 * which was just handled.  Also, copy the latest
1707 		 * attributes in to the attribute responses if
1708 		 * appropriate.
1709 		 */
1710 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1711 		do {
1712 			rp->thread->t_flag |= t_flag;
1713 			/* check for overflows */
1714 			if (!error) {
1715 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1716 			}
1717 			rp->ns->ns_status = puterrno(error);
1718 			rp = rp->list;
1719 		} while (rp != lrp);
1720 	} while (rp != NULL);
1721 
1722 	/*
1723 	 * If any data was written at all, then we need to flush
1724 	 * the data and metadata to stable storage.
1725 	 */
1726 	if (data_written) {
1727 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1728 
1729 		if (!error) {
1730 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1731 		}
1732 	}
1733 
1734 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1735 
1736 	if (in_crit)
1737 		nbl_end_crit(vp);
1738 	VN_RELE(vp);
1739 
1740 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1741 	mutex_enter(&rfs_async_write_lock);
1742 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1743 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1744 			rp->ns->ns_status = puterrno(error);
1745 			rp->thread->t_flag |= t_flag;
1746 		}
1747 	}
1748 	cv_broadcast(&nlp->cv);
1749 	mutex_exit(&rfs_async_write_lock);
1750 
1751 }
1752 
1753 void *
1754 rfs_write_getfh(struct nfswriteargs *wa)
1755 {
1756 	return (&wa->wa_fhandle);
1757 }
1758 
1759 /*
1760  * Create a file.
1761  * Creates a file with given attributes and returns those attributes
1762  * and an fhandle for the new file.
1763  */
1764 void
1765 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1766     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1767 {
1768 	int error;
1769 	int lookuperr;
1770 	int in_crit = 0;
1771 	struct vattr va;
1772 	vnode_t *vp;
1773 	vnode_t *realvp;
1774 	vnode_t *dvp;
1775 	char *name = args->ca_da.da_name;
1776 	vnode_t *tvp = NULL;
1777 	int mode;
1778 	int lookup_ok;
1779 	bool_t trunc;
1780 	struct sockaddr *ca;
1781 
1782 	/*
1783 	 * Disallow NULL paths
1784 	 */
1785 	if (name == NULL || *name == '\0') {
1786 		dr->dr_status = NFSERR_ACCES;
1787 		return;
1788 	}
1789 
1790 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1791 	if (dvp == NULL) {
1792 		dr->dr_status = NFSERR_STALE;
1793 		return;
1794 	}
1795 
1796 	error = sattr_to_vattr(args->ca_sa, &va);
1797 	if (error) {
1798 		dr->dr_status = puterrno(error);
1799 		return;
1800 	}
1801 
1802 	/*
1803 	 * Must specify the mode.
1804 	 */
1805 	if (!(va.va_mask & AT_MODE)) {
1806 		VN_RELE(dvp);
1807 		dr->dr_status = NFSERR_INVAL;
1808 		return;
1809 	}
1810 
1811 	/*
1812 	 * This is a completely gross hack to make mknod
1813 	 * work over the wire until we can wack the protocol
1814 	 */
1815 	if ((va.va_mode & IFMT) == IFCHR) {
1816 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1817 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1818 		else {
1819 			va.va_type = VCHR;
1820 			/*
1821 			 * uncompress the received dev_t
1822 			 * if the top half is zero indicating a request
1823 			 * from an `older style' OS.
1824 			 */
1825 			if ((va.va_size & 0xffff0000) == 0)
1826 				va.va_rdev = nfsv2_expdev(va.va_size);
1827 			else
1828 				va.va_rdev = (dev_t)va.va_size;
1829 		}
1830 		va.va_mask &= ~AT_SIZE;
1831 	} else if ((va.va_mode & IFMT) == IFBLK) {
1832 		va.va_type = VBLK;
1833 		/*
1834 		 * uncompress the received dev_t
1835 		 * if the top half is zero indicating a request
1836 		 * from an `older style' OS.
1837 		 */
1838 		if ((va.va_size & 0xffff0000) == 0)
1839 			va.va_rdev = nfsv2_expdev(va.va_size);
1840 		else
1841 			va.va_rdev = (dev_t)va.va_size;
1842 		va.va_mask &= ~AT_SIZE;
1843 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1844 		va.va_type = VSOCK;
1845 	} else {
1846 		va.va_type = VREG;
1847 	}
1848 	va.va_mode &= ~IFMT;
1849 	va.va_mask |= AT_TYPE;
1850 
1851 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1852 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1853 	    MAXPATHLEN);
1854 	if (name == NULL) {
1855 		dr->dr_status = puterrno(EINVAL);
1856 		return;
1857 	}
1858 
1859 	/*
1860 	 * Why was the choice made to use VWRITE as the mode to the
1861 	 * call to VOP_CREATE ? This results in a bug.  When a client
1862 	 * opens a file that already exists and is RDONLY, the second
1863 	 * open fails with an EACESS because of the mode.
1864 	 * bug ID 1054648.
1865 	 */
1866 	lookup_ok = 0;
1867 	mode = VWRITE;
1868 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1869 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1870 		    NULL, NULL, NULL);
1871 		if (!error) {
1872 			struct vattr at;
1873 
1874 			lookup_ok = 1;
1875 			at.va_mask = AT_MODE;
1876 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1877 			if (!error)
1878 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1879 			VN_RELE(tvp);
1880 			tvp = NULL;
1881 		}
1882 	}
1883 
1884 	if (!lookup_ok) {
1885 		if (rdonly(ro, dvp)) {
1886 			error = EROFS;
1887 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1888 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1889 			error = EPERM;
1890 		} else {
1891 			error = 0;
1892 		}
1893 	}
1894 
1895 	/*
1896 	 * If file size is being modified on an already existing file
1897 	 * make sure that there are no conflicting non-blocking mandatory
1898 	 * locks in the region being manipulated. Return EACCES if there
1899 	 * are conflicting locks.
1900 	 */
1901 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1902 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1903 		    NULL, NULL, NULL);
1904 
1905 		if (!lookuperr &&
1906 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1907 			VN_RELE(tvp);
1908 			curthread->t_flag |= T_WOULDBLOCK;
1909 			goto out;
1910 		}
1911 
1912 		if (!lookuperr && nbl_need_check(tvp)) {
1913 			/*
1914 			 * The file exists. Now check if it has any
1915 			 * conflicting non-blocking mandatory locks
1916 			 * in the region being changed.
1917 			 */
1918 			struct vattr bva;
1919 			u_offset_t offset;
1920 			ssize_t length;
1921 
1922 			nbl_start_crit(tvp, RW_READER);
1923 			in_crit = 1;
1924 
1925 			bva.va_mask = AT_SIZE;
1926 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1927 			if (!error) {
1928 				if (va.va_size < bva.va_size) {
1929 					offset = va.va_size;
1930 					length = bva.va_size - va.va_size;
1931 				} else {
1932 					offset = bva.va_size;
1933 					length = va.va_size - bva.va_size;
1934 				}
1935 				if (length) {
1936 					if (nbl_conflict(tvp, NBL_WRITE,
1937 					    offset, length, 0, NULL)) {
1938 						error = EACCES;
1939 					}
1940 				}
1941 			}
1942 			if (error) {
1943 				nbl_end_crit(tvp);
1944 				VN_RELE(tvp);
1945 				in_crit = 0;
1946 			}
1947 		} else if (tvp != NULL) {
1948 			VN_RELE(tvp);
1949 		}
1950 	}
1951 
1952 	if (!error) {
1953 		/*
1954 		 * If filesystem is shared with nosuid the remove any
1955 		 * setuid/setgid bits on create.
1956 		 */
1957 		if (va.va_type == VREG &&
1958 		    exi->exi_export.ex_flags & EX_NOSUID)
1959 			va.va_mode &= ~(VSUID | VSGID);
1960 
1961 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1962 		    NULL, NULL);
1963 
1964 		if (!error) {
1965 
1966 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1967 				trunc = TRUE;
1968 			else
1969 				trunc = FALSE;
1970 
1971 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1972 				VN_RELE(vp);
1973 				curthread->t_flag |= T_WOULDBLOCK;
1974 				goto out;
1975 			}
1976 			va.va_mask = AT_ALL;
1977 
1978 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1979 
1980 			/* check for overflows */
1981 			if (!error) {
1982 				acl_perm(vp, exi, &va, cr);
1983 				error = vattr_to_nattr(&va, &dr->dr_attr);
1984 				if (!error) {
1985 					error = makefh(&dr->dr_fhandle, vp,
1986 					    exi);
1987 				}
1988 			}
1989 			/*
1990 			 * Force modified metadata out to stable storage.
1991 			 *
1992 			 * if a underlying vp exists, pass it to VOP_FSYNC
1993 			 */
1994 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1995 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1996 			else
1997 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1998 			VN_RELE(vp);
1999 		}
2000 
2001 		if (in_crit) {
2002 			nbl_end_crit(tvp);
2003 			VN_RELE(tvp);
2004 		}
2005 	}
2006 
2007 	/*
2008 	 * Force modified data and metadata out to stable storage.
2009 	 */
2010 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
2011 
2012 out:
2013 
2014 	VN_RELE(dvp);
2015 
2016 	dr->dr_status = puterrno(error);
2017 
2018 	if (name != args->ca_da.da_name)
2019 		kmem_free(name, MAXPATHLEN);
2020 }
2021 void *
2022 rfs_create_getfh(struct nfscreatargs *args)
2023 {
2024 	return (args->ca_da.da_fhandle);
2025 }
2026 
2027 /*
2028  * Remove a file.
2029  * Remove named file from parent directory.
2030  */
2031 /* ARGSUSED */
2032 void
2033 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2034     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2035 {
2036 	int error = 0;
2037 	vnode_t *vp;
2038 	vnode_t *targvp;
2039 	int in_crit = 0;
2040 
2041 	/*
2042 	 * Disallow NULL paths
2043 	 */
2044 	if (da->da_name == NULL || *da->da_name == '\0') {
2045 		*status = NFSERR_ACCES;
2046 		return;
2047 	}
2048 
2049 	vp = nfs_fhtovp(da->da_fhandle, exi);
2050 	if (vp == NULL) {
2051 		*status = NFSERR_STALE;
2052 		return;
2053 	}
2054 
2055 	if (rdonly(ro, vp)) {
2056 		VN_RELE(vp);
2057 		*status = NFSERR_ROFS;
2058 		return;
2059 	}
2060 
2061 	/*
2062 	 * Check for a conflict with a non-blocking mandatory share reservation.
2063 	 */
2064 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2065 	    NULL, cr, NULL, NULL, NULL);
2066 	if (error != 0) {
2067 		VN_RELE(vp);
2068 		*status = puterrno(error);
2069 		return;
2070 	}
2071 
2072 	/*
2073 	 * If the file is delegated to an v4 client, then initiate
2074 	 * recall and drop this request (by setting T_WOULDBLOCK).
2075 	 * The client will eventually re-transmit the request and
2076 	 * (hopefully), by then, the v4 client will have returned
2077 	 * the delegation.
2078 	 */
2079 
2080 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2081 		VN_RELE(vp);
2082 		VN_RELE(targvp);
2083 		curthread->t_flag |= T_WOULDBLOCK;
2084 		return;
2085 	}
2086 
2087 	if (nbl_need_check(targvp)) {
2088 		nbl_start_crit(targvp, RW_READER);
2089 		in_crit = 1;
2090 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2091 			error = EACCES;
2092 			goto out;
2093 		}
2094 	}
2095 
2096 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2097 
2098 	/*
2099 	 * Force modified data and metadata out to stable storage.
2100 	 */
2101 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2102 
2103 out:
2104 	if (in_crit)
2105 		nbl_end_crit(targvp);
2106 	VN_RELE(targvp);
2107 	VN_RELE(vp);
2108 
2109 	*status = puterrno(error);
2110 
2111 }
2112 
2113 void *
2114 rfs_remove_getfh(struct nfsdiropargs *da)
2115 {
2116 	return (da->da_fhandle);
2117 }
2118 
2119 /*
2120  * rename a file
2121  * Give a file (from) a new name (to).
2122  */
2123 /* ARGSUSED */
2124 void
2125 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2126     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2127 {
2128 	int error = 0;
2129 	vnode_t *fromvp;
2130 	vnode_t *tovp;
2131 	struct exportinfo *to_exi;
2132 	fhandle_t *fh;
2133 	vnode_t *srcvp;
2134 	vnode_t *targvp;
2135 	int in_crit = 0;
2136 
2137 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2138 	if (fromvp == NULL) {
2139 		*status = NFSERR_STALE;
2140 		return;
2141 	}
2142 
2143 	fh = args->rna_to.da_fhandle;
2144 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2145 	if (to_exi == NULL) {
2146 		VN_RELE(fromvp);
2147 		*status = NFSERR_ACCES;
2148 		return;
2149 	}
2150 	exi_rele(to_exi);
2151 
2152 	if (to_exi != exi) {
2153 		VN_RELE(fromvp);
2154 		*status = NFSERR_XDEV;
2155 		return;
2156 	}
2157 
2158 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2159 	if (tovp == NULL) {
2160 		VN_RELE(fromvp);
2161 		*status = NFSERR_STALE;
2162 		return;
2163 	}
2164 
2165 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2166 		VN_RELE(tovp);
2167 		VN_RELE(fromvp);
2168 		*status = NFSERR_NOTDIR;
2169 		return;
2170 	}
2171 
2172 	/*
2173 	 * Disallow NULL paths
2174 	 */
2175 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2176 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2177 		VN_RELE(tovp);
2178 		VN_RELE(fromvp);
2179 		*status = NFSERR_ACCES;
2180 		return;
2181 	}
2182 
2183 	if (rdonly(ro, tovp)) {
2184 		VN_RELE(tovp);
2185 		VN_RELE(fromvp);
2186 		*status = NFSERR_ROFS;
2187 		return;
2188 	}
2189 
2190 	/*
2191 	 * Check for a conflict with a non-blocking mandatory share reservation.
2192 	 */
2193 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2194 	    NULL, cr, NULL, NULL, NULL);
2195 	if (error != 0) {
2196 		VN_RELE(tovp);
2197 		VN_RELE(fromvp);
2198 		*status = puterrno(error);
2199 		return;
2200 	}
2201 
2202 	/* Check for delegations on the source file */
2203 
2204 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2205 		VN_RELE(tovp);
2206 		VN_RELE(fromvp);
2207 		VN_RELE(srcvp);
2208 		curthread->t_flag |= T_WOULDBLOCK;
2209 		return;
2210 	}
2211 
2212 	/* Check for delegation on the file being renamed over, if it exists */
2213 
2214 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2215 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2216 	    NULL, NULL, NULL) == 0) {
2217 
2218 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2219 			VN_RELE(tovp);
2220 			VN_RELE(fromvp);
2221 			VN_RELE(srcvp);
2222 			VN_RELE(targvp);
2223 			curthread->t_flag |= T_WOULDBLOCK;
2224 			return;
2225 		}
2226 		VN_RELE(targvp);
2227 	}
2228 
2229 
2230 	if (nbl_need_check(srcvp)) {
2231 		nbl_start_crit(srcvp, RW_READER);
2232 		in_crit = 1;
2233 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2234 			error = EACCES;
2235 			goto out;
2236 		}
2237 	}
2238 
2239 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2240 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2241 
2242 	if (error == 0)
2243 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2244 		    strlen(args->rna_to.da_name));
2245 
2246 	/*
2247 	 * Force modified data and metadata out to stable storage.
2248 	 */
2249 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2250 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2251 
2252 out:
2253 	if (in_crit)
2254 		nbl_end_crit(srcvp);
2255 	VN_RELE(srcvp);
2256 	VN_RELE(tovp);
2257 	VN_RELE(fromvp);
2258 
2259 	*status = puterrno(error);
2260 
2261 }
2262 void *
2263 rfs_rename_getfh(struct nfsrnmargs *args)
2264 {
2265 	return (args->rna_from.da_fhandle);
2266 }
2267 
2268 /*
2269  * Link to a file.
2270  * Create a file (to) which is a hard link to the given file (from).
2271  */
2272 /* ARGSUSED */
2273 void
2274 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2275     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2276 {
2277 	int error;
2278 	vnode_t *fromvp;
2279 	vnode_t *tovp;
2280 	struct exportinfo *to_exi;
2281 	fhandle_t *fh;
2282 
2283 	fromvp = nfs_fhtovp(args->la_from, exi);
2284 	if (fromvp == NULL) {
2285 		*status = NFSERR_STALE;
2286 		return;
2287 	}
2288 
2289 	fh = args->la_to.da_fhandle;
2290 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2291 	if (to_exi == NULL) {
2292 		VN_RELE(fromvp);
2293 		*status = NFSERR_ACCES;
2294 		return;
2295 	}
2296 	exi_rele(to_exi);
2297 
2298 	if (to_exi != exi) {
2299 		VN_RELE(fromvp);
2300 		*status = NFSERR_XDEV;
2301 		return;
2302 	}
2303 
2304 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2305 	if (tovp == NULL) {
2306 		VN_RELE(fromvp);
2307 		*status = NFSERR_STALE;
2308 		return;
2309 	}
2310 
2311 	if (tovp->v_type != VDIR) {
2312 		VN_RELE(tovp);
2313 		VN_RELE(fromvp);
2314 		*status = NFSERR_NOTDIR;
2315 		return;
2316 	}
2317 	/*
2318 	 * Disallow NULL paths
2319 	 */
2320 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2321 		VN_RELE(tovp);
2322 		VN_RELE(fromvp);
2323 		*status = NFSERR_ACCES;
2324 		return;
2325 	}
2326 
2327 	if (rdonly(ro, tovp)) {
2328 		VN_RELE(tovp);
2329 		VN_RELE(fromvp);
2330 		*status = NFSERR_ROFS;
2331 		return;
2332 	}
2333 
2334 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2335 
2336 	/*
2337 	 * Force modified data and metadata out to stable storage.
2338 	 */
2339 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2340 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2341 
2342 	VN_RELE(tovp);
2343 	VN_RELE(fromvp);
2344 
2345 	*status = puterrno(error);
2346 
2347 }
2348 void *
2349 rfs_link_getfh(struct nfslinkargs *args)
2350 {
2351 	return (args->la_from);
2352 }
2353 
2354 /*
2355  * Symbolicly link to a file.
2356  * Create a file (to) with the given attributes which is a symbolic link
2357  * to the given path name (to).
2358  */
2359 void
2360 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2361     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2362 {
2363 	int error;
2364 	struct vattr va;
2365 	vnode_t *vp;
2366 	vnode_t *svp;
2367 	int lerror;
2368 	struct sockaddr *ca;
2369 	char *name = NULL;
2370 
2371 	/*
2372 	 * Disallow NULL paths
2373 	 */
2374 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2375 		*status = NFSERR_ACCES;
2376 		return;
2377 	}
2378 
2379 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2380 	if (vp == NULL) {
2381 		*status = NFSERR_STALE;
2382 		return;
2383 	}
2384 
2385 	if (rdonly(ro, vp)) {
2386 		VN_RELE(vp);
2387 		*status = NFSERR_ROFS;
2388 		return;
2389 	}
2390 
2391 	error = sattr_to_vattr(args->sla_sa, &va);
2392 	if (error) {
2393 		VN_RELE(vp);
2394 		*status = puterrno(error);
2395 		return;
2396 	}
2397 
2398 	if (!(va.va_mask & AT_MODE)) {
2399 		VN_RELE(vp);
2400 		*status = NFSERR_INVAL;
2401 		return;
2402 	}
2403 
2404 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2405 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2406 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2407 
2408 	if (name == NULL) {
2409 		*status = NFSERR_ACCES;
2410 		return;
2411 	}
2412 
2413 	va.va_type = VLNK;
2414 	va.va_mask |= AT_TYPE;
2415 
2416 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2417 
2418 	/*
2419 	 * Force new data and metadata out to stable storage.
2420 	 */
2421 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2422 	    NULL, cr, NULL, NULL, NULL);
2423 
2424 	if (!lerror) {
2425 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2426 		VN_RELE(svp);
2427 	}
2428 
2429 	/*
2430 	 * Force modified data and metadata out to stable storage.
2431 	 */
2432 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2433 
2434 	VN_RELE(vp);
2435 
2436 	*status = puterrno(error);
2437 	if (name != args->sla_tnm)
2438 		kmem_free(name, MAXPATHLEN);
2439 
2440 }
2441 void *
2442 rfs_symlink_getfh(struct nfsslargs *args)
2443 {
2444 	return (args->sla_from.da_fhandle);
2445 }
2446 
2447 /*
2448  * Make a directory.
2449  * Create a directory with the given name, parent directory, and attributes.
2450  * Returns a file handle and attributes for the new directory.
2451  */
2452 /* ARGSUSED */
2453 void
2454 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2455     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2456 {
2457 	int error;
2458 	struct vattr va;
2459 	vnode_t *dvp = NULL;
2460 	vnode_t *vp;
2461 	char *name = args->ca_da.da_name;
2462 
2463 	/*
2464 	 * Disallow NULL paths
2465 	 */
2466 	if (name == NULL || *name == '\0') {
2467 		dr->dr_status = NFSERR_ACCES;
2468 		return;
2469 	}
2470 
2471 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2472 	if (vp == NULL) {
2473 		dr->dr_status = NFSERR_STALE;
2474 		return;
2475 	}
2476 
2477 	if (rdonly(ro, vp)) {
2478 		VN_RELE(vp);
2479 		dr->dr_status = NFSERR_ROFS;
2480 		return;
2481 	}
2482 
2483 	error = sattr_to_vattr(args->ca_sa, &va);
2484 	if (error) {
2485 		VN_RELE(vp);
2486 		dr->dr_status = puterrno(error);
2487 		return;
2488 	}
2489 
2490 	if (!(va.va_mask & AT_MODE)) {
2491 		VN_RELE(vp);
2492 		dr->dr_status = NFSERR_INVAL;
2493 		return;
2494 	}
2495 
2496 	va.va_type = VDIR;
2497 	va.va_mask |= AT_TYPE;
2498 
2499 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2500 
2501 	if (!error) {
2502 		/*
2503 		 * Attribtutes of the newly created directory should
2504 		 * be returned to the client.
2505 		 */
2506 		va.va_mask = AT_ALL; /* We want everything */
2507 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2508 
2509 		/* check for overflows */
2510 		if (!error) {
2511 			acl_perm(vp, exi, &va, cr);
2512 			error = vattr_to_nattr(&va, &dr->dr_attr);
2513 			if (!error) {
2514 				error = makefh(&dr->dr_fhandle, dvp, exi);
2515 			}
2516 		}
2517 		/*
2518 		 * Force new data and metadata out to stable storage.
2519 		 */
2520 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2521 		VN_RELE(dvp);
2522 	}
2523 
2524 	/*
2525 	 * Force modified data and metadata out to stable storage.
2526 	 */
2527 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2528 
2529 	VN_RELE(vp);
2530 
2531 	dr->dr_status = puterrno(error);
2532 
2533 }
2534 void *
2535 rfs_mkdir_getfh(struct nfscreatargs *args)
2536 {
2537 	return (args->ca_da.da_fhandle);
2538 }
2539 
2540 /*
2541  * Remove a directory.
2542  * Remove the given directory name from the given parent directory.
2543  */
2544 /* ARGSUSED */
2545 void
2546 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2547     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2548 {
2549 	int error;
2550 	vnode_t *vp;
2551 
2552 	/*
2553 	 * Disallow NULL paths
2554 	 */
2555 	if (da->da_name == NULL || *da->da_name == '\0') {
2556 		*status = NFSERR_ACCES;
2557 		return;
2558 	}
2559 
2560 	vp = nfs_fhtovp(da->da_fhandle, exi);
2561 	if (vp == NULL) {
2562 		*status = NFSERR_STALE;
2563 		return;
2564 	}
2565 
2566 	if (rdonly(ro, vp)) {
2567 		VN_RELE(vp);
2568 		*status = NFSERR_ROFS;
2569 		return;
2570 	}
2571 
2572 	/*
2573 	 * VOP_RMDIR takes a third argument (the current
2574 	 * directory of the process).  That's because someone
2575 	 * wants to return EINVAL if one tries to remove ".".
2576 	 * Of course, NFS servers have no idea what their
2577 	 * clients' current directories are.  We fake it by
2578 	 * supplying a vnode known to exist and illegal to
2579 	 * remove.
2580 	 */
2581 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2582 
2583 	/*
2584 	 * Force modified data and metadata out to stable storage.
2585 	 */
2586 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2587 
2588 	VN_RELE(vp);
2589 
2590 	/*
2591 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2592 	 * if the directory is not empty.  A System V NFS server
2593 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2594 	 * over the wire.
2595 	 */
2596 	if (error == EEXIST)
2597 		*status = NFSERR_NOTEMPTY;
2598 	else
2599 		*status = puterrno(error);
2600 
2601 }
2602 void *
2603 rfs_rmdir_getfh(struct nfsdiropargs *da)
2604 {
2605 	return (da->da_fhandle);
2606 }
2607 
2608 /* ARGSUSED */
2609 void
2610 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2611     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2612 {
2613 	int error;
2614 	int iseof;
2615 	struct iovec iov;
2616 	struct uio uio;
2617 	vnode_t *vp;
2618 	char *ndata = NULL;
2619 	struct sockaddr *ca;
2620 	size_t nents;
2621 	int ret;
2622 
2623 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2624 	if (vp == NULL) {
2625 		rd->rd_entries = NULL;
2626 		rd->rd_status = NFSERR_STALE;
2627 		return;
2628 	}
2629 
2630 	if (vp->v_type != VDIR) {
2631 		VN_RELE(vp);
2632 		rd->rd_entries = NULL;
2633 		rd->rd_status = NFSERR_NOTDIR;
2634 		return;
2635 	}
2636 
2637 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2638 
2639 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2640 
2641 	if (error) {
2642 		rd->rd_entries = NULL;
2643 		goto bad;
2644 	}
2645 
2646 	if (rda->rda_count == 0) {
2647 		rd->rd_entries = NULL;
2648 		rd->rd_size = 0;
2649 		rd->rd_eof = FALSE;
2650 		goto bad;
2651 	}
2652 
2653 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2654 
2655 	/*
2656 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2657 	 */
2658 	rd->rd_bufsize = (uint_t)rda->rda_count;
2659 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2660 
2661 	/*
2662 	 * Set up io vector to read directory data
2663 	 */
2664 	iov.iov_base = (caddr_t)rd->rd_entries;
2665 	iov.iov_len = rda->rda_count;
2666 	uio.uio_iov = &iov;
2667 	uio.uio_iovcnt = 1;
2668 	uio.uio_segflg = UIO_SYSSPACE;
2669 	uio.uio_extflg = UIO_COPY_CACHED;
2670 	uio.uio_loffset = (offset_t)rda->rda_offset;
2671 	uio.uio_resid = rda->rda_count;
2672 
2673 	/*
2674 	 * read directory
2675 	 */
2676 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2677 
2678 	/*
2679 	 * Clean up
2680 	 */
2681 	if (!error) {
2682 		/*
2683 		 * set size and eof
2684 		 */
2685 		if (uio.uio_resid == rda->rda_count) {
2686 			rd->rd_size = 0;
2687 			rd->rd_eof = TRUE;
2688 		} else {
2689 			rd->rd_size = (uint32_t)(rda->rda_count -
2690 			    uio.uio_resid);
2691 			rd->rd_eof = iseof ? TRUE : FALSE;
2692 		}
2693 	}
2694 
2695 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2696 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2697 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2698 	    rda->rda_count, &ndata);
2699 
2700 	if (ret != 0) {
2701 		size_t dropbytes;
2702 		/*
2703 		 * We had to drop one or more entries in order to fit
2704 		 * during the character conversion.  We need to patch
2705 		 * up the size and eof info.
2706 		 */
2707 		if (rd->rd_eof)
2708 			rd->rd_eof = FALSE;
2709 		dropbytes = nfscmd_dropped_entrysize(
2710 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2711 		rd->rd_size -= dropbytes;
2712 	}
2713 	if (ndata == NULL) {
2714 		ndata = (char *)rd->rd_entries;
2715 	} else if (ndata != (char *)rd->rd_entries) {
2716 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2717 		rd->rd_entries = (void *)ndata;
2718 		rd->rd_bufsize = rda->rda_count;
2719 	}
2720 
2721 bad:
2722 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2723 
2724 #if 0 /* notyet */
2725 	/*
2726 	 * Don't do this.  It causes local disk writes when just
2727 	 * reading the file and the overhead is deemed larger
2728 	 * than the benefit.
2729 	 */
2730 	/*
2731 	 * Force modified metadata out to stable storage.
2732 	 */
2733 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2734 #endif
2735 
2736 	VN_RELE(vp);
2737 
2738 	rd->rd_status = puterrno(error);
2739 
2740 }
2741 void *
2742 rfs_readdir_getfh(struct nfsrddirargs *rda)
2743 {
2744 	return (&rda->rda_fh);
2745 }
2746 void
2747 rfs_rddirfree(struct nfsrddirres *rd)
2748 {
2749 	if (rd->rd_entries != NULL)
2750 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2751 }
2752 
2753 /* ARGSUSED */
2754 void
2755 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2756     struct svc_req *req, cred_t *cr, bool_t ro)
2757 {
2758 	int error;
2759 	struct statvfs64 sb;
2760 	vnode_t *vp;
2761 
2762 	vp = nfs_fhtovp(fh, exi);
2763 	if (vp == NULL) {
2764 		fs->fs_status = NFSERR_STALE;
2765 		return;
2766 	}
2767 
2768 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2769 
2770 	if (!error) {
2771 		fs->fs_tsize = nfstsize();
2772 		fs->fs_bsize = sb.f_frsize;
2773 		fs->fs_blocks = sb.f_blocks;
2774 		fs->fs_bfree = sb.f_bfree;
2775 		fs->fs_bavail = sb.f_bavail;
2776 	}
2777 
2778 	VN_RELE(vp);
2779 
2780 	fs->fs_status = puterrno(error);
2781 
2782 }
2783 void *
2784 rfs_statfs_getfh(fhandle_t *fh)
2785 {
2786 	return (fh);
2787 }
2788 
/*
 * Convert NFS v2 settable attributes (nfssattr) into a vattr.
 *
 * Each field of *sa that holds the all-ones "not set" sentinel (-1) is
 * skipped; for every field actually supplied, the corresponding value
 * is copied into *vap and its AT_* bit is set in va_mask (which starts
 * out cleared).
 *
 * Returns 0 on success, or EOVERFLOW on a 32-bit kernel when a
 * supplied time does not fit in the kernel's 32-bit time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* A time is "set" only if both seconds and microseconds are set. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2855 
/*
 * Map vnode types (vtype_t, used as the index) to NFS v2 over-the-wire
 * file types.  Types with no v2 equivalent map to 0 (NFNON); VFIFO is
 * handled separately by NA_SETFIFO in vattr_to_nattr().
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2859 
/*
 * Convert a vattr into the NFS v2 over-the-wire attribute structure.
 *
 * Check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Returns 0 on success, EFBIG when the nodeid or a
 * regular-file/directory size cannot be represented in 32 bits, and
 * EOVERFLOW when a timestamp does not fit the 32-bit wire format.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* -1 means "unknown"; pass it through unchanged. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Remap the local "nobody" ids to the wire values. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2966 
2967 /*
2968  * acl v2 support: returns approximate permission.
2969  *	default: returns minimal permission (more restrictive)
2970  *	aclok: returns maximal permission (less restrictive)
2971  *	This routine changes the permissions that are alaredy in *va.
2972  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2973  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2974  */
2975 static void
2976 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2977 {
2978 	vsecattr_t	vsa;
2979 	int		aclcnt;
2980 	aclent_t	*aclentp;
2981 	mode_t		mask_perm;
2982 	mode_t		grp_perm;
2983 	mode_t		other_perm;
2984 	mode_t		other_orig;
2985 	int		error;
2986 
2987 	/* dont care default acl */
2988 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2989 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2990 
2991 	if (!error) {
2992 		aclcnt = vsa.vsa_aclcnt;
2993 		if (aclcnt > MIN_ACL_ENTRIES) {
2994 			/* non-trivial ACL */
2995 			aclentp = vsa.vsa_aclentp;
2996 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2997 				/* maximal permissions */
2998 				grp_perm = 0;
2999 				other_perm = 0;
3000 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3001 					switch (aclentp->a_type) {
3002 					case USER_OBJ:
3003 						break;
3004 					case USER:
3005 						grp_perm |=
3006 						    aclentp->a_perm << 3;
3007 						other_perm |= aclentp->a_perm;
3008 						break;
3009 					case GROUP_OBJ:
3010 						grp_perm |=
3011 						    aclentp->a_perm << 3;
3012 						break;
3013 					case GROUP:
3014 						other_perm |= aclentp->a_perm;
3015 						break;
3016 					case OTHER_OBJ:
3017 						other_orig = aclentp->a_perm;
3018 						break;
3019 					case CLASS_OBJ:
3020 						mask_perm = aclentp->a_perm;
3021 						break;
3022 					default:
3023 						break;
3024 					}
3025 				}
3026 				grp_perm &= mask_perm << 3;
3027 				other_perm &= mask_perm;
3028 				other_perm |= other_orig;
3029 
3030 			} else {
3031 				/* minimal permissions */
3032 				grp_perm = 070;
3033 				other_perm = 07;
3034 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3035 					switch (aclentp->a_type) {
3036 					case USER_OBJ:
3037 						break;
3038 					case USER:
3039 					case CLASS_OBJ:
3040 						grp_perm &=
3041 						    aclentp->a_perm << 3;
3042 						other_perm &=
3043 						    aclentp->a_perm;
3044 						break;
3045 					case GROUP_OBJ:
3046 						grp_perm &=
3047 						    aclentp->a_perm << 3;
3048 						break;
3049 					case GROUP:
3050 						other_perm &=
3051 						    aclentp->a_perm;
3052 						break;
3053 					case OTHER_OBJ:
3054 						other_perm &=
3055 						    aclentp->a_perm;
3056 						break;
3057 					default:
3058 						break;
3059 					}
3060 				}
3061 			}
3062 			/* copy to va */
3063 			va->va_mode &= ~077;
3064 			va->va_mode |= grp_perm | other_perm;
3065 		}
3066 		if (vsa.vsa_aclcnt)
3067 			kmem_free(vsa.vsa_aclentp,
3068 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3069 	}
3070 }
3071 
3072 void
3073 rfs_srvrinit(void)
3074 {
3075 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3076 	nfs2_srv_caller_id = fs_new_caller_id();
3077 }
3078 
3079 void
3080 rfs_srvrfini(void)
3081 {
3082 	mutex_destroy(&rfs_async_write_lock);
3083 }
3084 
3085 static int
3086 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3087 {
3088 	struct clist	*wcl;
3089 	int		wlist_len;
3090 	uint32_t	count = rr->rr_count;
3091 
3092 	wcl = ra->ra_wlist;
3093 
3094 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3095 		return (FALSE);
3096 	}
3097 
3098 	wcl = ra->ra_wlist;
3099 	rr->rr_ok.rrok_wlist_len = wlist_len;
3100 	rr->rr_ok.rrok_wlist = wcl;
3101 
3102 	return (TRUE);
3103 }
3104