xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 445784c00080f22524a1a4659cf8b7b2e1ad84d1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
30  *	All rights reserved.
31  */
32 
33 /*
34  * Copyright 2018 Nexenta Systems, Inc.
35  * Copyright (c) 2016 by Delphix. All rights reserved.
36  */
37 
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61 
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69 
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75 
76 #include <sys/strsubr.h>
77 
78 struct rfs_async_write_list;
79 
/*
 * Zone globals of NFSv2 server
 *
 * The instance for the current zone hangs off the zone's nfs_globals_t
 * and is fetched with nfs_get_srv().
 */
typedef struct nfs_srv {
	/* NOTE(review): presumably protects async_write_head -- confirm */
	kmutex_t			async_write_lock;
	/* first struct rfs_async_write_list entry, chained via ->next */
	struct rfs_async_write_list	*async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int		write_async;
} nfs_srv_t;
92 
93 /*
94  * These are the interface routines for the server side of the
95  * Network File System.  See the NFS version 2 protocol specification
96  * for a description of this interface.
97  */
98 
/*
 * File-local helpers whose bodies are not visible from this part of the
 * file.  sattr_to_vattr() fills a vattr from the over-the-wire sattr and
 * returns a value that callers hand to puterrno(); acl_perm() is applied
 * to a vattr just before it is converted with vattr_to_nattr().
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);
102 
103 
/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 *
 * All values are octal; every type value below is a subset of the
 * bits in IFMT.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */
112 
/*
 * Caller id copied into caller_context_t.cc_caller_id for the VOP calls
 * made by the NFSv2 service routines in this file.
 */
u_longlong_t nfs2_srv_caller_id;
114 
115 static nfs_srv_t *
116 nfs_get_srv(void)
117 {
118 	nfs_globals_t *ng = nfs_srv_getzg();
119 	nfs_srv_t *srv = ng->nfs_srv;
120 	ASSERT(srv != NULL);
121 	return (srv);
122 }
123 
124 /*
125  * Get file attributes.
126  * Returns the current attributes of the file with the given fhandle.
127  */
128 /* ARGSUSED */
129 void
130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
131     struct svc_req *req, cred_t *cr, bool_t ro)
132 {
133 	int error;
134 	vnode_t *vp;
135 	struct vattr va;
136 
137 	vp = nfs_fhtovp(fhp, exi);
138 	if (vp == NULL) {
139 		ns->ns_status = NFSERR_STALE;
140 		return;
141 	}
142 
143 	/*
144 	 * Do the getattr.
145 	 */
146 	va.va_mask = AT_ALL;	/* we want all the attributes */
147 
148 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
149 
150 	/* check for overflows */
151 	if (!error) {
152 		/* Lie about the object type for a referral */
153 		if (vn_is_nfs_reparse(vp, cr))
154 			va.va_type = VLNK;
155 
156 		acl_perm(vp, exi, &va, cr);
157 		error = vattr_to_nattr(&va, &ns->ns_attr);
158 	}
159 
160 	VN_RELE(vp);
161 
162 	ns->ns_status = puterrno(error);
163 }
/*
 * Return the file handle of a GETATTR request; the argument itself is
 * the handle, so it is returned unchanged.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
169 
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * args - file handle (saa_fh) and the requested attributes (saa_sa)
 * ns   - result: ns_status, and ns_attr on success
 * exi  - export the handle belongs to; its flags are consulted for nosuid
 * cr   - credential used for all VOP calls
 * ro   - passed to rdonly() to reject changes on a read-only export
 *
 * When a delegation conflict is reported (EAGAIN with CC_WOULDBLOCK) the
 * function returns without touching ns_status and marks the thread with
 * T_WOULDBLOCK instead.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;
	int in_crit = 0;	/* set once nbl_start_crit() has been called */
	vnode_t *vp;
	struct vattr va;	/* requested, later resulting, attributes */
	struct vattr bva;	/* owner and size before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* Caller context shared by the VOP calls below. */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region checked for a lock conflict is the
			 * byte range between the old and the new size.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner gets the VOP_SPACE path; AT_SIZE is then
		 * removed from the mask so VOP_SETATTR below does not
		 * process the size a second time.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/* The fsync below runs with the CC_DONTBLOCK request cleared. */
	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Return a pointer to the file handle embedded in the SETATTR arguments.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
362 
363 /* Change and release @exip and @vpp only in success */
364 int
365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
366 {
367 	struct exportinfo *exi;
368 	vnode_t *vp = *vpp;
369 	fid_t fid;
370 	int error;
371 
372 	VN_HOLD(vp);
373 
374 	if ((error = traverse(&vp)) != 0) {
375 		VN_RELE(vp);
376 		return (error);
377 	}
378 
379 	bzero(&fid, sizeof (fid));
380 	fid.fid_len = MAXFIDSZ;
381 	error = VOP_FID(vp, &fid, NULL);
382 	if (error) {
383 		VN_RELE(vp);
384 		return (error);
385 	}
386 
387 	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
388 	if (exi == NULL ||
389 	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
390 		/*
391 		 * It is not error, just subdir is not exported
392 		 * or "nohide" is not set
393 		 */
394 		if (exi != NULL)
395 			exi_rele(exi);
396 		VN_RELE(vp);
397 	} else {
398 		/* go to submount */
399 		exi_rele(*exip);
400 		*exip = exi;
401 
402 		VN_RELE(*vpp);
403 		*vpp = vp;
404 	}
405 
406 	return (0);
407 }
408 
409 /*
410  * Given mounted "dvp" and "exi", go upper mountpoint
411  * with dvp/exi correction
412  * Return 0 in success
413  */
414 int
415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
416 {
417 	struct exportinfo *exi;
418 	vnode_t *dvp = *dvpp;
419 	vnode_t *zone_rootvp;
420 
421 	zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
422 	ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
423 
424 	VN_HOLD(dvp);
425 	dvp = untraverse(dvp, zone_rootvp);
426 	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
427 	if (exi == NULL) {
428 		VN_RELE(dvp);
429 		return (-1);
430 	}
431 
432 	ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
433 	exi_rele(*exip);
434 	*exip = exi;
435 	VN_RELE(*dvpp);
436 	*dvpp = dvp;
437 
438 	return (0);
439 }
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 *
 * da  - directory file handle and the name to look up
 * dr  - result: dr_status, plus dr_fhandle and dr_attr on success
 * exi - export the directory handle belongs to
 * req - RPC request; supplies the protocol version and the caller address
 * cr  - credential used for all VOP calls
 *
 * A hold is taken on exi for the duration of the call because the export
 * may be replaced while crossing mount points or doing a public file
 * handle lookup; whichever export is current at "out" is released there.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 *
	 * NOTE(review): the branches below store NFSERR_* constants in
	 * "error", which is later passed through puterrno() like an errno
	 * value -- confirm the two numbering schemes agree for these codes.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else  {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/*
	 * Convert the client supplied name.  The result is either
	 * da->da_name itself or a separate MAXPATHLEN buffer that is
	 * freed after the lookup below.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* The multi-component lookup hands back its own export. */
		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	/*
	 * rfs_cross_mnt() leaves vp alone on failure, so the reference
	 * obtained by the lookup still has to be dropped here.
	 */
	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
/*
 * Return the directory file handle carried in the LOOKUP arguments.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
606 
607 /*
608  * Read symbolic link.
609  * Returns the string in the symbolic link at the given fhandle.
610  */
611 /* ARGSUSED */
612 void
613 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
614     struct svc_req *req, cred_t *cr, bool_t ro)
615 {
616 	int error;
617 	struct iovec iov;
618 	struct uio uio;
619 	vnode_t *vp;
620 	struct vattr va;
621 	struct sockaddr *ca;
622 	char *name = NULL;
623 	int is_referral = 0;
624 
625 	vp = nfs_fhtovp(fhp, exi);
626 	if (vp == NULL) {
627 		rl->rl_data = NULL;
628 		rl->rl_status = NFSERR_STALE;
629 		return;
630 	}
631 
632 	va.va_mask = AT_MODE;
633 
634 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
635 
636 	if (error) {
637 		VN_RELE(vp);
638 		rl->rl_data = NULL;
639 		rl->rl_status = puterrno(error);
640 		return;
641 	}
642 
643 	if (MANDLOCK(vp, va.va_mode)) {
644 		VN_RELE(vp);
645 		rl->rl_data = NULL;
646 		rl->rl_status = NFSERR_ACCES;
647 		return;
648 	}
649 
650 	/* We lied about the object type for a referral */
651 	if (vn_is_nfs_reparse(vp, cr))
652 		is_referral = 1;
653 
654 	/*
655 	 * XNFS and RFC1094 require us to return ENXIO if argument
656 	 * is not a link. BUGID 1138002.
657 	 */
658 	if (vp->v_type != VLNK && !is_referral) {
659 		VN_RELE(vp);
660 		rl->rl_data = NULL;
661 		rl->rl_status = NFSERR_NXIO;
662 		return;
663 	}
664 
665 	/*
666 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
667 	 */
668 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
669 
670 	if (is_referral) {
671 		char *s;
672 		size_t strsz;
673 		kstat_named_t *stat =
674 		    exi->exi_ne->ne_globals->svstat[NFS_VERSION];
675 
676 		/* Get an artificial symlink based on a referral */
677 		s = build_symlink(vp, cr, &strsz);
678 		stat[NFS_REFERLINKS].value.ui64++;
679 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
680 		    vnode_t *, vp, char *, s);
681 		if (s == NULL)
682 			error = EINVAL;
683 		else {
684 			error = 0;
685 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
686 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
687 			kmem_free(s, strsz);
688 		}
689 
690 	} else {
691 
692 		/*
693 		 * Set up io vector to read sym link data
694 		 */
695 		iov.iov_base = rl->rl_data;
696 		iov.iov_len = NFS_MAXPATHLEN;
697 		uio.uio_iov = &iov;
698 		uio.uio_iovcnt = 1;
699 		uio.uio_segflg = UIO_SYSSPACE;
700 		uio.uio_extflg = UIO_COPY_CACHED;
701 		uio.uio_loffset = (offset_t)0;
702 		uio.uio_resid = NFS_MAXPATHLEN;
703 
704 		/*
705 		 * Do the readlink.
706 		 */
707 		error = VOP_READLINK(vp, &uio, cr, NULL);
708 
709 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
710 
711 		if (!error)
712 			rl->rl_data[rl->rl_count] = '\0';
713 
714 	}
715 
716 
717 	VN_RELE(vp);
718 
719 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
720 	name = nfscmd_convname(ca, exi, rl->rl_data,
721 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
722 
723 	if (name != NULL && name != rl->rl_data) {
724 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
725 		rl->rl_data = name;
726 	}
727 
728 	/*
729 	 * XNFS and RFC1094 require us to return ENXIO if argument
730 	 * is not a link. UFS returns EINVAL if this is the case,
731 	 * so we do the mapping here. BUGID 1138002.
732 	 */
733 	if (error == EINVAL)
734 		rl->rl_status = NFSERR_NXIO;
735 	else
736 		rl->rl_status = puterrno(error);
737 
738 }
/*
 * Return the file handle of a READLINK request; the argument itself is
 * the handle, so it is returned unchanged.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
744 /*
745  * Free data allocated by rfs_readlink
746  */
747 void
748 rfs_rlfree(struct nfsrdlnres *rl)
749 {
750 	if (rl->rl_data != NULL)
751 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
752 }
753 
/*
 * Defined elsewhere in this file (body not visible from here).
 * rfs_read() treats a zero return as failure.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
755 
756 /*
757  * Read data.
758  * Returns some data read from the file at the given fhandle.
759  */
760 /* ARGSUSED */
761 void
762 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
763     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
764 {
765 	vnode_t *vp;
766 	int error;
767 	struct vattr va;
768 	struct iovec iov;
769 	struct uio uio;
770 	mblk_t *mp;
771 	int alloc_err = 0;
772 	int in_crit = 0;
773 	caller_context_t ct;
774 
775 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
776 	if (vp == NULL) {
777 		rr->rr_data = NULL;
778 		rr->rr_status = NFSERR_STALE;
779 		return;
780 	}
781 
782 	if (vp->v_type != VREG) {
783 		VN_RELE(vp);
784 		rr->rr_data = NULL;
785 		rr->rr_status = NFSERR_ISDIR;
786 		return;
787 	}
788 
789 	ct.cc_sysid = 0;
790 	ct.cc_pid = 0;
791 	ct.cc_caller_id = nfs2_srv_caller_id;
792 	ct.cc_flags = CC_DONTBLOCK;
793 
794 	/*
795 	 * Enter the critical region before calling VOP_RWLOCK
796 	 * to avoid a deadlock with write requests.
797 	 */
798 	if (nbl_need_check(vp)) {
799 		nbl_start_crit(vp, RW_READER);
800 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
801 		    0, NULL)) {
802 			nbl_end_crit(vp);
803 			VN_RELE(vp);
804 			rr->rr_data = NULL;
805 			rr->rr_status = NFSERR_ACCES;
806 			return;
807 		}
808 		in_crit = 1;
809 	}
810 
811 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
812 
813 	/* check if a monitor detected a delegation conflict */
814 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
815 		if (in_crit)
816 			nbl_end_crit(vp);
817 		VN_RELE(vp);
818 		/* mark as wouldblock so response is dropped */
819 		curthread->t_flag |= T_WOULDBLOCK;
820 
821 		rr->rr_data = NULL;
822 		return;
823 	}
824 
825 	va.va_mask = AT_ALL;
826 
827 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
828 
829 	if (error) {
830 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
831 		if (in_crit)
832 			nbl_end_crit(vp);
833 
834 		VN_RELE(vp);
835 		rr->rr_data = NULL;
836 		rr->rr_status = puterrno(error);
837 
838 		return;
839 	}
840 
841 	/*
842 	 * This is a kludge to allow reading of files created
843 	 * with no read permission.  The owner of the file
844 	 * is always allowed to read it.
845 	 */
846 	if (crgetuid(cr) != va.va_uid) {
847 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
848 
849 		if (error) {
850 			/*
851 			 * Exec is the same as read over the net because
852 			 * of demand loading.
853 			 */
854 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
855 		}
856 		if (error) {
857 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
858 			if (in_crit)
859 				nbl_end_crit(vp);
860 			VN_RELE(vp);
861 			rr->rr_data = NULL;
862 			rr->rr_status = puterrno(error);
863 
864 			return;
865 		}
866 	}
867 
868 	if (MANDLOCK(vp, va.va_mode)) {
869 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
870 		if (in_crit)
871 			nbl_end_crit(vp);
872 
873 		VN_RELE(vp);
874 		rr->rr_data = NULL;
875 		rr->rr_status = NFSERR_ACCES;
876 
877 		return;
878 	}
879 
880 	rr->rr_ok.rrok_wlist_len = 0;
881 	rr->rr_ok.rrok_wlist = NULL;
882 
883 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
884 		rr->rr_count = 0;
885 		rr->rr_data = NULL;
886 		/*
887 		 * In this case, status is NFS_OK, but there is no data
888 		 * to encode. So set rr_mp to NULL.
889 		 */
890 		rr->rr_mp = NULL;
891 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
892 		if (rr->rr_ok.rrok_wlist)
893 			clist_zero_len(rr->rr_ok.rrok_wlist);
894 		goto done;
895 	}
896 
897 	if (ra->ra_wlist) {
898 		mp = NULL;
899 		rr->rr_mp = NULL;
900 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
901 		if (ra->ra_count > iov.iov_len) {
902 			rr->rr_data = NULL;
903 			rr->rr_status = NFSERR_INVAL;
904 			goto done;
905 		}
906 	} else {
907 		/*
908 		 * mp will contain the data to be sent out in the read reply.
909 		 * This will be freed after the reply has been sent out (by the
910 		 * driver).
911 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
912 		 * that the call to xdrmblk_putmblk() never fails.
913 		 */
914 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
915 		    &alloc_err);
916 		ASSERT(mp != NULL);
917 		ASSERT(alloc_err == 0);
918 
919 		rr->rr_mp = mp;
920 
921 		/*
922 		 * Set up io vector
923 		 */
924 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
925 		iov.iov_len = ra->ra_count;
926 	}
927 
928 	uio.uio_iov = &iov;
929 	uio.uio_iovcnt = 1;
930 	uio.uio_segflg = UIO_SYSSPACE;
931 	uio.uio_extflg = UIO_COPY_CACHED;
932 	uio.uio_loffset = (offset_t)ra->ra_offset;
933 	uio.uio_resid = ra->ra_count;
934 
935 	error = VOP_READ(vp, &uio, 0, cr, &ct);
936 
937 	if (error) {
938 		if (mp)
939 			freeb(mp);
940 
941 		/*
942 		 * check if a monitor detected a delegation conflict and
943 		 * mark as wouldblock so response is dropped
944 		 */
945 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
946 			curthread->t_flag |= T_WOULDBLOCK;
947 		else
948 			rr->rr_status = puterrno(error);
949 
950 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
951 		if (in_crit)
952 			nbl_end_crit(vp);
953 
954 		VN_RELE(vp);
955 		rr->rr_data = NULL;
956 
957 		return;
958 	}
959 
960 	/*
961 	 * Get attributes again so we can send the latest access
962 	 * time to the client side for its cache.
963 	 */
964 	va.va_mask = AT_ALL;
965 
966 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
967 
968 	if (error) {
969 		if (mp)
970 			freeb(mp);
971 
972 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
973 		if (in_crit)
974 			nbl_end_crit(vp);
975 
976 		VN_RELE(vp);
977 		rr->rr_data = NULL;
978 		rr->rr_status = puterrno(error);
979 
980 		return;
981 	}
982 
983 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
984 
985 	if (mp) {
986 		rr->rr_data = (char *)mp->b_datap->db_base;
987 	} else {
988 		if (ra->ra_wlist) {
989 			rr->rr_data = (caddr_t)iov.iov_base;
990 			if (!rdma_setup_read_data2(ra, rr)) {
991 				rr->rr_data = NULL;
992 				rr->rr_status = puterrno(NFSERR_INVAL);
993 			}
994 		}
995 	}
996 done:
997 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
998 	if (in_crit)
999 		nbl_end_crit(vp);
1000 
1001 	acl_perm(vp, exi, &va, cr);
1002 
1003 	/* check for overflows */
1004 	error = vattr_to_nattr(&va, &rr->rr_attr);
1005 
1006 	VN_RELE(vp);
1007 
1008 	rr->rr_status = puterrno(error);
1009 }
1010 
1011 /*
1012  * Free data allocated by rfs_read
1013  */
1014 void
1015 rfs_rdfree(struct nfsrdresult *rr)
1016 {
1017 	mblk_t *mp;
1018 
1019 	if (rr->rr_status == NFS_OK) {
1020 		mp = rr->rr_mp;
1021 		if (mp != NULL)
1022 			freeb(mp);
1023 	}
1024 }
1025 
/*
 * Return a pointer to the file handle embedded in the READ arguments.
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1031 
/*
 * Size of the iovec array rfs_write_sync() keeps on its stack; longer
 * mblk chains get a temporary array from kmem_alloc().
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* mblk chains that fit in the stack array vs. those that did not */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1038 
1039 /*
1040  * Write data to file.
1041  * Returns attributes of a file after writing some data to it.
1042  *
1043  * Any changes made here, especially in error handling might have
1044  * to also be done in rfs_write (which clusters write requests).
1045  */
1046 /* ARGSUSED */
1047 void
1048 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1049     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1050 {
1051 	int error;
1052 	vnode_t *vp;
1053 	rlim64_t rlimit;
1054 	struct vattr va;
1055 	struct uio uio;
1056 	struct iovec iov[MAX_IOVECS];
1057 	mblk_t *m;
1058 	struct iovec *iovp;
1059 	int iovcnt;
1060 	cred_t *savecred;
1061 	int in_crit = 0;
1062 	caller_context_t ct;
1063 
1064 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1065 	if (vp == NULL) {
1066 		ns->ns_status = NFSERR_STALE;
1067 		return;
1068 	}
1069 
1070 	if (rdonly(ro, vp)) {
1071 		VN_RELE(vp);
1072 		ns->ns_status = NFSERR_ROFS;
1073 		return;
1074 	}
1075 
1076 	if (vp->v_type != VREG) {
1077 		VN_RELE(vp);
1078 		ns->ns_status = NFSERR_ISDIR;
1079 		return;
1080 	}
1081 
1082 	ct.cc_sysid = 0;
1083 	ct.cc_pid = 0;
1084 	ct.cc_caller_id = nfs2_srv_caller_id;
1085 	ct.cc_flags = CC_DONTBLOCK;
1086 
1087 	va.va_mask = AT_UID|AT_MODE;
1088 
1089 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1090 
1091 	if (error) {
1092 		VN_RELE(vp);
1093 		ns->ns_status = puterrno(error);
1094 
1095 		return;
1096 	}
1097 
1098 	if (crgetuid(cr) != va.va_uid) {
1099 		/*
1100 		 * This is a kludge to allow writes of files created
1101 		 * with read only permission.  The owner of the file
1102 		 * is always allowed to write it.
1103 		 */
1104 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1105 
1106 		if (error) {
1107 			VN_RELE(vp);
1108 			ns->ns_status = puterrno(error);
1109 			return;
1110 		}
1111 	}
1112 
1113 	/*
1114 	 * Can't access a mandatory lock file.  This might cause
1115 	 * the NFS service thread to block forever waiting for a
1116 	 * lock to be released that will never be released.
1117 	 */
1118 	if (MANDLOCK(vp, va.va_mode)) {
1119 		VN_RELE(vp);
1120 		ns->ns_status = NFSERR_ACCES;
1121 		return;
1122 	}
1123 
1124 	/*
1125 	 * We have to enter the critical region before calling VOP_RWLOCK
1126 	 * to avoid a deadlock with ufs.
1127 	 */
1128 	if (nbl_need_check(vp)) {
1129 		nbl_start_crit(vp, RW_READER);
1130 		in_crit = 1;
1131 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1132 		    wa->wa_count, 0, NULL)) {
1133 			error = EACCES;
1134 			goto out;
1135 		}
1136 	}
1137 
1138 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1139 
1140 	/* check if a monitor detected a delegation conflict */
1141 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1142 		goto out;
1143 	}
1144 
1145 	if (wa->wa_data || wa->wa_rlist) {
1146 		/* Do the RDMA thing if necessary */
1147 		if (wa->wa_rlist) {
1148 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1149 			iov[0].iov_len = wa->wa_count;
1150 		} else  {
1151 			iov[0].iov_base = wa->wa_data;
1152 			iov[0].iov_len = wa->wa_count;
1153 		}
1154 		uio.uio_iov = iov;
1155 		uio.uio_iovcnt = 1;
1156 		uio.uio_segflg = UIO_SYSSPACE;
1157 		uio.uio_extflg = UIO_COPY_DEFAULT;
1158 		uio.uio_loffset = (offset_t)wa->wa_offset;
1159 		uio.uio_resid = wa->wa_count;
1160 		/*
1161 		 * The limit is checked on the client. We
1162 		 * should allow any size writes here.
1163 		 */
1164 		uio.uio_llimit = curproc->p_fsz_ctl;
1165 		rlimit = uio.uio_llimit - wa->wa_offset;
1166 		if (rlimit < (rlim64_t)uio.uio_resid)
1167 			uio.uio_resid = (uint_t)rlimit;
1168 
1169 		/*
1170 		 * for now we assume no append mode
1171 		 */
1172 		/*
1173 		 * We're changing creds because VM may fault and we need
1174 		 * the cred of the current thread to be used if quota
1175 		 * checking is enabled.
1176 		 */
1177 		savecred = curthread->t_cred;
1178 		curthread->t_cred = cr;
1179 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1180 		curthread->t_cred = savecred;
1181 	} else {
1182 
1183 		iovcnt = 0;
1184 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1185 			iovcnt++;
1186 		if (iovcnt <= MAX_IOVECS) {
1187 #ifdef DEBUG
1188 			rfs_write_sync_hits++;
1189 #endif
1190 			iovp = iov;
1191 		} else {
1192 #ifdef DEBUG
1193 			rfs_write_sync_misses++;
1194 #endif
1195 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1196 		}
1197 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1198 		uio.uio_iov = iovp;
1199 		uio.uio_iovcnt = iovcnt;
1200 		uio.uio_segflg = UIO_SYSSPACE;
1201 		uio.uio_extflg = UIO_COPY_DEFAULT;
1202 		uio.uio_loffset = (offset_t)wa->wa_offset;
1203 		uio.uio_resid = wa->wa_count;
1204 		/*
1205 		 * The limit is checked on the client. We
1206 		 * should allow any size writes here.
1207 		 */
1208 		uio.uio_llimit = curproc->p_fsz_ctl;
1209 		rlimit = uio.uio_llimit - wa->wa_offset;
1210 		if (rlimit < (rlim64_t)uio.uio_resid)
1211 			uio.uio_resid = (uint_t)rlimit;
1212 
1213 		/*
1214 		 * For now we assume no append mode.
1215 		 */
1216 		/*
1217 		 * We're changing creds because VM may fault and we need
1218 		 * the cred of the current thread to be used if quota
1219 		 * checking is enabled.
1220 		 */
1221 		savecred = curthread->t_cred;
1222 		curthread->t_cred = cr;
1223 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1224 		curthread->t_cred = savecred;
1225 
1226 		if (iovp != iov)
1227 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1228 	}
1229 
1230 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1231 
1232 	if (!error) {
1233 		/*
1234 		 * Get attributes again so we send the latest mod
1235 		 * time to the client side for its cache.
1236 		 */
1237 		va.va_mask = AT_ALL;	/* now we want everything */
1238 
1239 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1240 
1241 		/* check for overflows */
1242 		if (!error) {
1243 			acl_perm(vp, exi, &va, cr);
1244 			error = vattr_to_nattr(&va, &ns->ns_attr);
1245 		}
1246 	}
1247 
1248 out:
1249 	if (in_crit)
1250 		nbl_end_crit(vp);
1251 	VN_RELE(vp);
1252 
1253 	/* check if a monitor detected a delegation conflict */
1254 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1255 		/* mark as wouldblock so response is dropped */
1256 		curthread->t_flag |= T_WOULDBLOCK;
1257 	else
1258 		ns->ns_status = puterrno(error);
1259 
1260 }
1261 
/*
 * One NFSv2 WRITE request that is taking part in a write cluster.
 * rfs_write() builds one of these on its own stack for every request
 * and links it into the cluster for the file being written.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of this WRITE request */
	struct nfsattrstat *ns;		/* reply; ns_status doubles as "done" flag */
	struct svc_req *req;		/* RPC request this write arrived in */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* read-only flag handed to rdonly() */
	kthread_t *thread;		/* thread that submitted the request */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1271 
/*
 * A write cluster: the set of pending WRITE requests that share one
 * file handle.  Clusters are chained through 'next'; rfs_write() walks
 * the chain starting at nsrv->async_write_head while holding
 * nsrv->async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* waiters sleep here until answered */
	struct rfs_async_write *list;	/* requests, ordered by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster in the chain */
};
1278 
/* Number of iovecs rfs_write() keeps on its stack before falling back to kmem. */
#define	MAXCLIOVECS	42
/*
 * Sentinel stored in ns_status while a clustered write has not been
 * answered yet.  The expansion is fully parenthesized so that it stays a
 * single operand in every expression context (e.g. sizeof, arithmetic).
 */
#define	RFSWRITE_INITVAL ((enum nfsstat)-1)

#ifdef DEBUG
/* Counters: cluster fit in the on-stack iovec array (hits) or not (misses). */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1286 
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * When nsrv->write_async is clear the request is simply handed to
 * rfs_write_sync().  Otherwise WRITE requests that name the same file
 * handle are gathered into a "cluster":
 *
 *  - The thread that finds no cluster for the file handle on
 *    nsrv->async_write_head starts one and does all of the work for
 *    every request that joins it.
 *  - A thread that finds an existing cluster inserts its request into
 *    the cluster's list (kept in ascending wa_offset order) and sleeps
 *    on the cluster's cv until its ns_status is no longer
 *    RFSWRITE_INITVAL.
 *
 * The request record (nrpsp) and the cluster record (nlpsp) are locals
 * of this function, i.e. they live on the stack of the thread that
 * created them.
 *
 * Results are reported through each request's ns (ns_status and, on
 * success, ns_attr).  T_WOULDBLOCK is propagated to the t_flag of every
 * thread in the cluster when a delegation conflict is detected.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
	nsrv = nfs_get_srv();
	/* Clustering disabled: take the synchronous path instead. */
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	/* Describe this request in the on-stack request record. */
	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Find the first entry whose offset is not below ours. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/*
		 * Sleep until the thread that owns the cluster has filled
		 * in our status.
		 */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	/* Append the new cluster to the tail of the cluster chain. */
	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/*
		 * Unlink the cluster and fail every request that has
		 * joined it so far with NFSERR_STALE.
		 */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 *
	 * Note that "continue" inside this do/while still evaluates the
	 * loop condition, so it advances rp to the next request.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		/* Grow the cluster's byte range to cover this request. */
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 *
		 * On exit from the loop lrp points at the first request
		 * that is NOT part of this contiguous run (or is NULL).
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		/* Use the on-stack iovec array when the run fits in it. */
		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else  {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				/*
				 * Data is in an mblk chain: one iovec per
				 * mblk, trimming the last one so that no
				 * more than wa_count bytes are described.
				 */
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 *
		 * NOTE(review): t_cred is set to this thread's own cr
		 * while VOP_WRITE is passed rp->cr; confirm that is the
		 * intent when a cluster mixes requests from several users.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 *
		 * Once vattr_to_nattr() fails for one request, error stays
		 * set and the remaining requests of the run get that error.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Give any request that is still unanswered the final error value
	 * and wake up every thread waiting on this cluster.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1788 
1789 void *
1790 rfs_write_getfh(struct nfswriteargs *wa)
1791 {
1792 	return (&wa->wa_fhandle);
1793 }
1794 
1795 /*
1796  * Create a file.
1797  * Creates a file with given attributes and returns those attributes
1798  * and an fhandle for the new file.
1799  */
1800 void
1801 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1802     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1803 {
1804 	int error;
1805 	int lookuperr;
1806 	int in_crit = 0;
1807 	struct vattr va;
1808 	vnode_t *vp;
1809 	vnode_t *realvp;
1810 	vnode_t *dvp;
1811 	char *name = args->ca_da.da_name;
1812 	vnode_t *tvp = NULL;
1813 	int mode;
1814 	int lookup_ok;
1815 	bool_t trunc;
1816 	struct sockaddr *ca;
1817 
1818 	/*
1819 	 * Disallow NULL paths
1820 	 */
1821 	if (name == NULL || *name == '\0') {
1822 		dr->dr_status = NFSERR_ACCES;
1823 		return;
1824 	}
1825 
1826 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1827 	if (dvp == NULL) {
1828 		dr->dr_status = NFSERR_STALE;
1829 		return;
1830 	}
1831 
1832 	error = sattr_to_vattr(args->ca_sa, &va);
1833 	if (error) {
1834 		dr->dr_status = puterrno(error);
1835 		return;
1836 	}
1837 
1838 	/*
1839 	 * Must specify the mode.
1840 	 */
1841 	if (!(va.va_mask & AT_MODE)) {
1842 		VN_RELE(dvp);
1843 		dr->dr_status = NFSERR_INVAL;
1844 		return;
1845 	}
1846 
1847 	/*
1848 	 * This is a completely gross hack to make mknod
1849 	 * work over the wire until we can wack the protocol
1850 	 */
1851 	if ((va.va_mode & IFMT) == IFCHR) {
1852 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1853 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1854 		else {
1855 			va.va_type = VCHR;
1856 			/*
1857 			 * uncompress the received dev_t
1858 			 * if the top half is zero indicating a request
1859 			 * from an `older style' OS.
1860 			 */
1861 			if ((va.va_size & 0xffff0000) == 0)
1862 				va.va_rdev = nfsv2_expdev(va.va_size);
1863 			else
1864 				va.va_rdev = (dev_t)va.va_size;
1865 		}
1866 		va.va_mask &= ~AT_SIZE;
1867 	} else if ((va.va_mode & IFMT) == IFBLK) {
1868 		va.va_type = VBLK;
1869 		/*
1870 		 * uncompress the received dev_t
1871 		 * if the top half is zero indicating a request
1872 		 * from an `older style' OS.
1873 		 */
1874 		if ((va.va_size & 0xffff0000) == 0)
1875 			va.va_rdev = nfsv2_expdev(va.va_size);
1876 		else
1877 			va.va_rdev = (dev_t)va.va_size;
1878 		va.va_mask &= ~AT_SIZE;
1879 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1880 		va.va_type = VSOCK;
1881 	} else {
1882 		va.va_type = VREG;
1883 	}
1884 	va.va_mode &= ~IFMT;
1885 	va.va_mask |= AT_TYPE;
1886 
1887 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1888 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1889 	    MAXPATHLEN);
1890 	if (name == NULL) {
1891 		dr->dr_status = puterrno(EINVAL);
1892 		return;
1893 	}
1894 
1895 	/*
1896 	 * Why was the choice made to use VWRITE as the mode to the
1897 	 * call to VOP_CREATE ? This results in a bug.  When a client
1898 	 * opens a file that already exists and is RDONLY, the second
1899 	 * open fails with an EACESS because of the mode.
1900 	 * bug ID 1054648.
1901 	 */
1902 	lookup_ok = 0;
1903 	mode = VWRITE;
1904 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1905 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1906 		    NULL, NULL, NULL);
1907 		if (!error) {
1908 			struct vattr at;
1909 
1910 			lookup_ok = 1;
1911 			at.va_mask = AT_MODE;
1912 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1913 			if (!error)
1914 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1915 			VN_RELE(tvp);
1916 			tvp = NULL;
1917 		}
1918 	}
1919 
1920 	if (!lookup_ok) {
1921 		if (rdonly(ro, dvp)) {
1922 			error = EROFS;
1923 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1924 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1925 			error = EPERM;
1926 		} else {
1927 			error = 0;
1928 		}
1929 	}
1930 
1931 	/*
1932 	 * If file size is being modified on an already existing file
1933 	 * make sure that there are no conflicting non-blocking mandatory
1934 	 * locks in the region being manipulated. Return EACCES if there
1935 	 * are conflicting locks.
1936 	 */
1937 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1938 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1939 		    NULL, NULL, NULL);
1940 
1941 		if (!lookuperr &&
1942 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1943 			VN_RELE(tvp);
1944 			curthread->t_flag |= T_WOULDBLOCK;
1945 			goto out;
1946 		}
1947 
1948 		if (!lookuperr && nbl_need_check(tvp)) {
1949 			/*
1950 			 * The file exists. Now check if it has any
1951 			 * conflicting non-blocking mandatory locks
1952 			 * in the region being changed.
1953 			 */
1954 			struct vattr bva;
1955 			u_offset_t offset;
1956 			ssize_t length;
1957 
1958 			nbl_start_crit(tvp, RW_READER);
1959 			in_crit = 1;
1960 
1961 			bva.va_mask = AT_SIZE;
1962 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1963 			if (!error) {
1964 				if (va.va_size < bva.va_size) {
1965 					offset = va.va_size;
1966 					length = bva.va_size - va.va_size;
1967 				} else {
1968 					offset = bva.va_size;
1969 					length = va.va_size - bva.va_size;
1970 				}
1971 				if (length) {
1972 					if (nbl_conflict(tvp, NBL_WRITE,
1973 					    offset, length, 0, NULL)) {
1974 						error = EACCES;
1975 					}
1976 				}
1977 			}
1978 			if (error) {
1979 				nbl_end_crit(tvp);
1980 				VN_RELE(tvp);
1981 				in_crit = 0;
1982 			}
1983 		} else if (tvp != NULL) {
1984 			VN_RELE(tvp);
1985 		}
1986 	}
1987 
1988 	if (!error) {
1989 		/*
1990 		 * If filesystem is shared with nosuid the remove any
1991 		 * setuid/setgid bits on create.
1992 		 */
1993 		if (va.va_type == VREG &&
1994 		    exi->exi_export.ex_flags & EX_NOSUID)
1995 			va.va_mode &= ~(VSUID | VSGID);
1996 
1997 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1998 		    NULL, NULL);
1999 
2000 		if (!error) {
2001 
2002 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2003 				trunc = TRUE;
2004 			else
2005 				trunc = FALSE;
2006 
2007 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2008 				VN_RELE(vp);
2009 				curthread->t_flag |= T_WOULDBLOCK;
2010 				goto out;
2011 			}
2012 			va.va_mask = AT_ALL;
2013 
2014 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2015 
2016 			/* check for overflows */
2017 			if (!error) {
2018 				acl_perm(vp, exi, &va, cr);
2019 				error = vattr_to_nattr(&va, &dr->dr_attr);
2020 				if (!error) {
2021 					error = makefh(&dr->dr_fhandle, vp,
2022 					    exi);
2023 				}
2024 			}
2025 			/*
2026 			 * Force modified metadata out to stable storage.
2027 			 *
2028 			 * if a underlying vp exists, pass it to VOP_FSYNC
2029 			 */
2030 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
2031 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2032 			else
2033 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2034 			VN_RELE(vp);
2035 		}
2036 
2037 		if (in_crit) {
2038 			nbl_end_crit(tvp);
2039 			VN_RELE(tvp);
2040 		}
2041 	}
2042 
2043 	/*
2044 	 * Force modified data and metadata out to stable storage.
2045 	 */
2046 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
2047 
2048 out:
2049 
2050 	VN_RELE(dvp);
2051 
2052 	dr->dr_status = puterrno(error);
2053 
2054 	if (name != args->ca_da.da_name)
2055 		kmem_free(name, MAXPATHLEN);
2056 }
2057 void *
2058 rfs_create_getfh(struct nfscreatargs *args)
2059 {
2060 	return (args->ca_da.da_fhandle);
2061 }
2062 
2063 /*
2064  * Remove a file.
2065  * Remove named file from parent directory.
2066  */
2067 /* ARGSUSED */
2068 void
2069 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2070     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2071 {
2072 	int error = 0;
2073 	vnode_t *vp;
2074 	vnode_t *targvp;
2075 	int in_crit = 0;
2076 
2077 	/*
2078 	 * Disallow NULL paths
2079 	 */
2080 	if (da->da_name == NULL || *da->da_name == '\0') {
2081 		*status = NFSERR_ACCES;
2082 		return;
2083 	}
2084 
2085 	vp = nfs_fhtovp(da->da_fhandle, exi);
2086 	if (vp == NULL) {
2087 		*status = NFSERR_STALE;
2088 		return;
2089 	}
2090 
2091 	if (rdonly(ro, vp)) {
2092 		VN_RELE(vp);
2093 		*status = NFSERR_ROFS;
2094 		return;
2095 	}
2096 
2097 	/*
2098 	 * Check for a conflict with a non-blocking mandatory share reservation.
2099 	 */
2100 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2101 	    NULL, cr, NULL, NULL, NULL);
2102 	if (error != 0) {
2103 		VN_RELE(vp);
2104 		*status = puterrno(error);
2105 		return;
2106 	}
2107 
2108 	/*
2109 	 * If the file is delegated to an v4 client, then initiate
2110 	 * recall and drop this request (by setting T_WOULDBLOCK).
2111 	 * The client will eventually re-transmit the request and
2112 	 * (hopefully), by then, the v4 client will have returned
2113 	 * the delegation.
2114 	 */
2115 
2116 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117 		VN_RELE(vp);
2118 		VN_RELE(targvp);
2119 		curthread->t_flag |= T_WOULDBLOCK;
2120 		return;
2121 	}
2122 
2123 	if (nbl_need_check(targvp)) {
2124 		nbl_start_crit(targvp, RW_READER);
2125 		in_crit = 1;
2126 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2127 			error = EACCES;
2128 			goto out;
2129 		}
2130 	}
2131 
2132 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2133 
2134 	/*
2135 	 * Force modified data and metadata out to stable storage.
2136 	 */
2137 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2138 
2139 out:
2140 	if (in_crit)
2141 		nbl_end_crit(targvp);
2142 	VN_RELE(targvp);
2143 	VN_RELE(vp);
2144 
2145 	*status = puterrno(error);
2146 
2147 }
2148 
2149 void *
2150 rfs_remove_getfh(struct nfsdiropargs *da)
2151 {
2152 	return (da->da_fhandle);
2153 }
2154 
2155 /*
2156  * rename a file
2157  * Give a file (from) a new name (to).
2158  */
2159 /* ARGSUSED */
2160 void
2161 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2162     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2163 {
2164 	int error = 0;
2165 	vnode_t *fromvp;
2166 	vnode_t *tovp;
2167 	struct exportinfo *to_exi;
2168 	fhandle_t *fh;
2169 	vnode_t *srcvp;
2170 	vnode_t *targvp;
2171 	int in_crit = 0;
2172 
2173 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2174 	if (fromvp == NULL) {
2175 		*status = NFSERR_STALE;
2176 		return;
2177 	}
2178 
2179 	fh = args->rna_to.da_fhandle;
2180 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2181 	if (to_exi == NULL) {
2182 		VN_RELE(fromvp);
2183 		*status = NFSERR_ACCES;
2184 		return;
2185 	}
2186 	exi_rele(to_exi);
2187 
2188 	if (to_exi != exi) {
2189 		VN_RELE(fromvp);
2190 		*status = NFSERR_XDEV;
2191 		return;
2192 	}
2193 
2194 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2195 	if (tovp == NULL) {
2196 		VN_RELE(fromvp);
2197 		*status = NFSERR_STALE;
2198 		return;
2199 	}
2200 
2201 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2202 		VN_RELE(tovp);
2203 		VN_RELE(fromvp);
2204 		*status = NFSERR_NOTDIR;
2205 		return;
2206 	}
2207 
2208 	/*
2209 	 * Disallow NULL paths
2210 	 */
2211 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2212 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2213 		VN_RELE(tovp);
2214 		VN_RELE(fromvp);
2215 		*status = NFSERR_ACCES;
2216 		return;
2217 	}
2218 
2219 	if (rdonly(ro, tovp)) {
2220 		VN_RELE(tovp);
2221 		VN_RELE(fromvp);
2222 		*status = NFSERR_ROFS;
2223 		return;
2224 	}
2225 
2226 	/*
2227 	 * Check for a conflict with a non-blocking mandatory share reservation.
2228 	 */
2229 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2230 	    NULL, cr, NULL, NULL, NULL);
2231 	if (error != 0) {
2232 		VN_RELE(tovp);
2233 		VN_RELE(fromvp);
2234 		*status = puterrno(error);
2235 		return;
2236 	}
2237 
2238 	/* Check for delegations on the source file */
2239 
2240 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2241 		VN_RELE(tovp);
2242 		VN_RELE(fromvp);
2243 		VN_RELE(srcvp);
2244 		curthread->t_flag |= T_WOULDBLOCK;
2245 		return;
2246 	}
2247 
2248 	/* Check for delegation on the file being renamed over, if it exists */
2249 
2250 	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2251 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2252 	    NULL, NULL, NULL) == 0) {
2253 
2254 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2255 			VN_RELE(tovp);
2256 			VN_RELE(fromvp);
2257 			VN_RELE(srcvp);
2258 			VN_RELE(targvp);
2259 			curthread->t_flag |= T_WOULDBLOCK;
2260 			return;
2261 		}
2262 		VN_RELE(targvp);
2263 	}
2264 
2265 
2266 	if (nbl_need_check(srcvp)) {
2267 		nbl_start_crit(srcvp, RW_READER);
2268 		in_crit = 1;
2269 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2270 			error = EACCES;
2271 			goto out;
2272 		}
2273 	}
2274 
2275 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2276 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2277 
2278 	if (error == 0)
2279 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2280 		    strlen(args->rna_to.da_name));
2281 
2282 	/*
2283 	 * Force modified data and metadata out to stable storage.
2284 	 */
2285 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2286 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2287 
2288 out:
2289 	if (in_crit)
2290 		nbl_end_crit(srcvp);
2291 	VN_RELE(srcvp);
2292 	VN_RELE(tovp);
2293 	VN_RELE(fromvp);
2294 
2295 	*status = puterrno(error);
2296 
2297 }
2298 void *
2299 rfs_rename_getfh(struct nfsrnmargs *args)
2300 {
2301 	return (args->rna_from.da_fhandle);
2302 }
2303 
2304 /*
2305  * Link to a file.
2306  * Create a file (to) which is a hard link to the given file (from).
2307  */
2308 /* ARGSUSED */
2309 void
2310 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2311     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2312 {
2313 	int error;
2314 	vnode_t *fromvp;
2315 	vnode_t *tovp;
2316 	struct exportinfo *to_exi;
2317 	fhandle_t *fh;
2318 
2319 	fromvp = nfs_fhtovp(args->la_from, exi);
2320 	if (fromvp == NULL) {
2321 		*status = NFSERR_STALE;
2322 		return;
2323 	}
2324 
2325 	fh = args->la_to.da_fhandle;
2326 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2327 	if (to_exi == NULL) {
2328 		VN_RELE(fromvp);
2329 		*status = NFSERR_ACCES;
2330 		return;
2331 	}
2332 	exi_rele(to_exi);
2333 
2334 	if (to_exi != exi) {
2335 		VN_RELE(fromvp);
2336 		*status = NFSERR_XDEV;
2337 		return;
2338 	}
2339 
2340 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2341 	if (tovp == NULL) {
2342 		VN_RELE(fromvp);
2343 		*status = NFSERR_STALE;
2344 		return;
2345 	}
2346 
2347 	if (tovp->v_type != VDIR) {
2348 		VN_RELE(tovp);
2349 		VN_RELE(fromvp);
2350 		*status = NFSERR_NOTDIR;
2351 		return;
2352 	}
2353 	/*
2354 	 * Disallow NULL paths
2355 	 */
2356 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2357 		VN_RELE(tovp);
2358 		VN_RELE(fromvp);
2359 		*status = NFSERR_ACCES;
2360 		return;
2361 	}
2362 
2363 	if (rdonly(ro, tovp)) {
2364 		VN_RELE(tovp);
2365 		VN_RELE(fromvp);
2366 		*status = NFSERR_ROFS;
2367 		return;
2368 	}
2369 
2370 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2371 
2372 	/*
2373 	 * Force modified data and metadata out to stable storage.
2374 	 */
2375 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2376 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2377 
2378 	VN_RELE(tovp);
2379 	VN_RELE(fromvp);
2380 
2381 	*status = puterrno(error);
2382 
2383 }
2384 void *
2385 rfs_link_getfh(struct nfslinkargs *args)
2386 {
2387 	return (args->la_from);
2388 }
2389 
/*
 * Symbolically link to a file.
 * Create a file (sla_from) with the given attributes which is a symbolic
 * link to the given path name (sla_tnm).
 */
2395 void
2396 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2397     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2398 {
2399 	int error;
2400 	struct vattr va;
2401 	vnode_t *vp;
2402 	vnode_t *svp;
2403 	int lerror;
2404 	struct sockaddr *ca;
2405 	char *name = NULL;
2406 
2407 	/*
2408 	 * Disallow NULL paths
2409 	 */
2410 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2411 		*status = NFSERR_ACCES;
2412 		return;
2413 	}
2414 
2415 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2416 	if (vp == NULL) {
2417 		*status = NFSERR_STALE;
2418 		return;
2419 	}
2420 
2421 	if (rdonly(ro, vp)) {
2422 		VN_RELE(vp);
2423 		*status = NFSERR_ROFS;
2424 		return;
2425 	}
2426 
2427 	error = sattr_to_vattr(args->sla_sa, &va);
2428 	if (error) {
2429 		VN_RELE(vp);
2430 		*status = puterrno(error);
2431 		return;
2432 	}
2433 
2434 	if (!(va.va_mask & AT_MODE)) {
2435 		VN_RELE(vp);
2436 		*status = NFSERR_INVAL;
2437 		return;
2438 	}
2439 
2440 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2441 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2442 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2443 
2444 	if (name == NULL) {
2445 		*status = NFSERR_ACCES;
2446 		return;
2447 	}
2448 
2449 	va.va_type = VLNK;
2450 	va.va_mask |= AT_TYPE;
2451 
2452 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2453 
2454 	/*
2455 	 * Force new data and metadata out to stable storage.
2456 	 */
2457 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2458 	    NULL, cr, NULL, NULL, NULL);
2459 
2460 	if (!lerror) {
2461 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2462 		VN_RELE(svp);
2463 	}
2464 
2465 	/*
2466 	 * Force modified data and metadata out to stable storage.
2467 	 */
2468 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2469 
2470 	VN_RELE(vp);
2471 
2472 	*status = puterrno(error);
2473 	if (name != args->sla_tnm)
2474 		kmem_free(name, MAXPATHLEN);
2475 
2476 }
2477 void *
2478 rfs_symlink_getfh(struct nfsslargs *args)
2479 {
2480 	return (args->sla_from.da_fhandle);
2481 }
2482 
2483 /*
2484  * Make a directory.
2485  * Create a directory with the given name, parent directory, and attributes.
2486  * Returns a file handle and attributes for the new directory.
2487  */
2488 /* ARGSUSED */
2489 void
2490 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2491     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2492 {
2493 	int error;
2494 	struct vattr va;
2495 	vnode_t *dvp = NULL;
2496 	vnode_t *vp;
2497 	char *name = args->ca_da.da_name;
2498 
2499 	/*
2500 	 * Disallow NULL paths
2501 	 */
2502 	if (name == NULL || *name == '\0') {
2503 		dr->dr_status = NFSERR_ACCES;
2504 		return;
2505 	}
2506 
2507 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2508 	if (vp == NULL) {
2509 		dr->dr_status = NFSERR_STALE;
2510 		return;
2511 	}
2512 
2513 	if (rdonly(ro, vp)) {
2514 		VN_RELE(vp);
2515 		dr->dr_status = NFSERR_ROFS;
2516 		return;
2517 	}
2518 
2519 	error = sattr_to_vattr(args->ca_sa, &va);
2520 	if (error) {
2521 		VN_RELE(vp);
2522 		dr->dr_status = puterrno(error);
2523 		return;
2524 	}
2525 
2526 	if (!(va.va_mask & AT_MODE)) {
2527 		VN_RELE(vp);
2528 		dr->dr_status = NFSERR_INVAL;
2529 		return;
2530 	}
2531 
2532 	va.va_type = VDIR;
2533 	va.va_mask |= AT_TYPE;
2534 
2535 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2536 
2537 	if (!error) {
2538 		/*
2539 		 * Attribtutes of the newly created directory should
2540 		 * be returned to the client.
2541 		 */
2542 		va.va_mask = AT_ALL; /* We want everything */
2543 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2544 
2545 		/* check for overflows */
2546 		if (!error) {
2547 			acl_perm(vp, exi, &va, cr);
2548 			error = vattr_to_nattr(&va, &dr->dr_attr);
2549 			if (!error) {
2550 				error = makefh(&dr->dr_fhandle, dvp, exi);
2551 			}
2552 		}
2553 		/*
2554 		 * Force new data and metadata out to stable storage.
2555 		 */
2556 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2557 		VN_RELE(dvp);
2558 	}
2559 
2560 	/*
2561 	 * Force modified data and metadata out to stable storage.
2562 	 */
2563 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2564 
2565 	VN_RELE(vp);
2566 
2567 	dr->dr_status = puterrno(error);
2568 
2569 }
2570 void *
2571 rfs_mkdir_getfh(struct nfscreatargs *args)
2572 {
2573 	return (args->ca_da.da_fhandle);
2574 }
2575 
2576 /*
2577  * Remove a directory.
2578  * Remove the given directory name from the given parent directory.
2579  */
2580 /* ARGSUSED */
2581 void
2582 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2583     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2584 {
2585 	int error;
2586 	vnode_t *vp;
2587 
2588 	/*
2589 	 * Disallow NULL paths
2590 	 */
2591 	if (da->da_name == NULL || *da->da_name == '\0') {
2592 		*status = NFSERR_ACCES;
2593 		return;
2594 	}
2595 
2596 	vp = nfs_fhtovp(da->da_fhandle, exi);
2597 	if (vp == NULL) {
2598 		*status = NFSERR_STALE;
2599 		return;
2600 	}
2601 
2602 	if (rdonly(ro, vp)) {
2603 		VN_RELE(vp);
2604 		*status = NFSERR_ROFS;
2605 		return;
2606 	}
2607 
2608 	/*
2609 	 * VOP_RMDIR takes a third argument (the current
2610 	 * directory of the process).  That's because someone
2611 	 * wants to return EINVAL if one tries to remove ".".
2612 	 * Of course, NFS servers have no idea what their
2613 	 * clients' current directories are.  We fake it by
2614 	 * supplying a vnode known to exist and illegal to
2615 	 * remove.
2616 	 */
2617 	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2618 
2619 	/*
2620 	 * Force modified data and metadata out to stable storage.
2621 	 */
2622 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2623 
2624 	VN_RELE(vp);
2625 
2626 	/*
2627 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2628 	 * if the directory is not empty.  A System V NFS server
2629 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2630 	 * over the wire.
2631 	 */
2632 	if (error == EEXIST)
2633 		*status = NFSERR_NOTEMPTY;
2634 	else
2635 		*status = puterrno(error);
2636 
2637 }
2638 void *
2639 rfs_rmdir_getfh(struct nfsdiropargs *da)
2640 {
2641 	return (da->da_fhandle);
2642 }
2643 
2644 /* ARGSUSED */
2645 void
2646 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2647     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2648 {
2649 	int error;
2650 	int iseof;
2651 	struct iovec iov;
2652 	struct uio uio;
2653 	vnode_t *vp;
2654 	char *ndata = NULL;
2655 	struct sockaddr *ca;
2656 	size_t nents;
2657 	int ret;
2658 
2659 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2660 	if (vp == NULL) {
2661 		rd->rd_entries = NULL;
2662 		rd->rd_status = NFSERR_STALE;
2663 		return;
2664 	}
2665 
2666 	if (vp->v_type != VDIR) {
2667 		VN_RELE(vp);
2668 		rd->rd_entries = NULL;
2669 		rd->rd_status = NFSERR_NOTDIR;
2670 		return;
2671 	}
2672 
2673 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2674 
2675 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2676 
2677 	if (error) {
2678 		rd->rd_entries = NULL;
2679 		goto bad;
2680 	}
2681 
2682 	if (rda->rda_count == 0) {
2683 		rd->rd_entries = NULL;
2684 		rd->rd_size = 0;
2685 		rd->rd_eof = FALSE;
2686 		goto bad;
2687 	}
2688 
2689 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2690 
2691 	/*
2692 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2693 	 */
2694 	rd->rd_bufsize = (uint_t)rda->rda_count;
2695 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2696 
2697 	/*
2698 	 * Set up io vector to read directory data
2699 	 */
2700 	iov.iov_base = (caddr_t)rd->rd_entries;
2701 	iov.iov_len = rda->rda_count;
2702 	uio.uio_iov = &iov;
2703 	uio.uio_iovcnt = 1;
2704 	uio.uio_segflg = UIO_SYSSPACE;
2705 	uio.uio_extflg = UIO_COPY_CACHED;
2706 	uio.uio_loffset = (offset_t)rda->rda_offset;
2707 	uio.uio_resid = rda->rda_count;
2708 
2709 	/*
2710 	 * read directory
2711 	 */
2712 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2713 
2714 	/*
2715 	 * Clean up
2716 	 */
2717 	if (!error) {
2718 		/*
2719 		 * set size and eof
2720 		 */
2721 		if (uio.uio_resid == rda->rda_count) {
2722 			rd->rd_size = 0;
2723 			rd->rd_eof = TRUE;
2724 		} else {
2725 			rd->rd_size = (uint32_t)(rda->rda_count -
2726 			    uio.uio_resid);
2727 			rd->rd_eof = iseof ? TRUE : FALSE;
2728 		}
2729 	}
2730 
2731 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2732 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2733 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2734 	    rda->rda_count, &ndata);
2735 
2736 	if (ret != 0) {
2737 		size_t dropbytes;
2738 		/*
2739 		 * We had to drop one or more entries in order to fit
2740 		 * during the character conversion.  We need to patch
2741 		 * up the size and eof info.
2742 		 */
2743 		if (rd->rd_eof)
2744 			rd->rd_eof = FALSE;
2745 		dropbytes = nfscmd_dropped_entrysize(
2746 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2747 		rd->rd_size -= dropbytes;
2748 	}
2749 	if (ndata == NULL) {
2750 		ndata = (char *)rd->rd_entries;
2751 	} else if (ndata != (char *)rd->rd_entries) {
2752 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2753 		rd->rd_entries = (void *)ndata;
2754 		rd->rd_bufsize = rda->rda_count;
2755 	}
2756 
2757 bad:
2758 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2759 
2760 #if 0 /* notyet */
2761 	/*
2762 	 * Don't do this.  It causes local disk writes when just
2763 	 * reading the file and the overhead is deemed larger
2764 	 * than the benefit.
2765 	 */
2766 	/*
2767 	 * Force modified metadata out to stable storage.
2768 	 */
2769 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2770 #endif
2771 
2772 	VN_RELE(vp);
2773 
2774 	rd->rd_status = puterrno(error);
2775 
2776 }
2777 void *
2778 rfs_readdir_getfh(struct nfsrddirargs *rda)
2779 {
2780 	return (&rda->rda_fh);
2781 }
2782 void
2783 rfs_rddirfree(struct nfsrddirres *rd)
2784 {
2785 	if (rd->rd_entries != NULL)
2786 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2787 }
2788 
2789 /* ARGSUSED */
2790 void
2791 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2792     struct svc_req *req, cred_t *cr, bool_t ro)
2793 {
2794 	int error;
2795 	struct statvfs64 sb;
2796 	vnode_t *vp;
2797 
2798 	vp = nfs_fhtovp(fh, exi);
2799 	if (vp == NULL) {
2800 		fs->fs_status = NFSERR_STALE;
2801 		return;
2802 	}
2803 
2804 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2805 
2806 	if (!error) {
2807 		fs->fs_tsize = nfstsize();
2808 		fs->fs_bsize = sb.f_frsize;
2809 		fs->fs_blocks = sb.f_blocks;
2810 		fs->fs_bfree = sb.f_bfree;
2811 		fs->fs_bavail = sb.f_bavail;
2812 	}
2813 
2814 	VN_RELE(vp);
2815 
2816 	fs->fs_status = puterrno(error);
2817 
2818 }
2819 void *
2820 rfs_statfs_getfh(fhandle_t *fh)
2821 {
2822 	return (fh);
2823 }
2824 
2825 static int
2826 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2827 {
2828 	vap->va_mask = 0;
2829 
2830 	/*
2831 	 * There was a sign extension bug in some VFS based systems
2832 	 * which stored the mode as a short.  When it would get
2833 	 * assigned to a u_long, no sign extension would occur.
2834 	 * It needed to, but this wasn't noticed because sa_mode
2835 	 * would then get assigned back to the short, thus ignoring
2836 	 * the upper 16 bits of sa_mode.
2837 	 *
2838 	 * To make this implementation work for both broken
2839 	 * clients and good clients, we check for both versions
2840 	 * of the mode.
2841 	 */
2842 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2843 	    sa->sa_mode != (uint32_t)-1) {
2844 		vap->va_mask |= AT_MODE;
2845 		vap->va_mode = sa->sa_mode;
2846 	}
2847 	if (sa->sa_uid != (uint32_t)-1) {
2848 		vap->va_mask |= AT_UID;
2849 		vap->va_uid = sa->sa_uid;
2850 	}
2851 	if (sa->sa_gid != (uint32_t)-1) {
2852 		vap->va_mask |= AT_GID;
2853 		vap->va_gid = sa->sa_gid;
2854 	}
2855 	if (sa->sa_size != (uint32_t)-1) {
2856 		vap->va_mask |= AT_SIZE;
2857 		vap->va_size = sa->sa_size;
2858 	}
2859 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2860 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2861 #ifndef _LP64
2862 		/* return error if time overflow */
2863 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2864 			return (EOVERFLOW);
2865 #endif
2866 		vap->va_mask |= AT_ATIME;
2867 		/*
2868 		 * nfs protocol defines times as unsigned so don't extend sign,
2869 		 * unless sysadmin set nfs_allow_preepoch_time.
2870 		 */
2871 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2872 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2873 	}
2874 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2875 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2876 #ifndef _LP64
2877 		/* return error if time overflow */
2878 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2879 			return (EOVERFLOW);
2880 #endif
2881 		vap->va_mask |= AT_MTIME;
2882 		/*
2883 		 * nfs protocol defines times as unsigned so don't extend sign,
2884 		 * unless sysadmin set nfs_allow_preepoch_time.
2885 		 */
2886 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2887 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2888 	}
2889 	return (0);
2890 }
2891 
2892 static const enum nfsftype vt_to_nf[] = {
2893 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2894 };
2895 
2896 /*
2897  * check the following fields for overflow: nodeid, size, and time.
2898  * There could be a problem when converting 64-bit LP64 fields
2899  * into 32-bit ones.  Return an error if there is an overflow.
2900  */
2901 int
2902 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2903 {
2904 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2905 	na->na_type = vt_to_nf[vap->va_type];
2906 
2907 	if (vap->va_mode == (unsigned short) -1)
2908 		na->na_mode = (uint32_t)-1;
2909 	else
2910 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2911 
2912 	if (vap->va_uid == (unsigned short)(-1))
2913 		na->na_uid = (uint32_t)(-1);
2914 	else if (vap->va_uid == UID_NOBODY)
2915 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2916 	else
2917 		na->na_uid = vap->va_uid;
2918 
2919 	if (vap->va_gid == (unsigned short)(-1))
2920 		na->na_gid = (uint32_t)-1;
2921 	else if (vap->va_gid == GID_NOBODY)
2922 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2923 	else
2924 		na->na_gid = vap->va_gid;
2925 
2926 	/*
2927 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2928 	 * vattr, but are bigger than 32 bit values supported?
2929 	 */
2930 	na->na_fsid = vap->va_fsid;
2931 
2932 	na->na_nodeid = vap->va_nodeid;
2933 
2934 	/*
2935 	 * Check to make sure that the nodeid is representable over the
2936 	 * wire without losing bits.
2937 	 */
2938 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2939 		return (EFBIG);
2940 	na->na_nlink = vap->va_nlink;
2941 
2942 	/*
2943 	 * Check for big files here, instead of at the caller.  See
2944 	 * comments in cstat for large special file explanation.
2945 	 */
2946 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2947 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2948 			return (EFBIG);
2949 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2950 			/* UNKNOWN_SIZE | OVERFLOW */
2951 			na->na_size = MAXOFF32_T;
2952 		} else
2953 			na->na_size = vap->va_size;
2954 	} else
2955 		na->na_size = vap->va_size;
2956 
2957 	/*
2958 	 * If the vnode times overflow the 32-bit times that NFS2
2959 	 * uses on the wire then return an error.
2960 	 */
2961 	if (!NFS_VAP_TIME_OK(vap)) {
2962 		return (EOVERFLOW);
2963 	}
2964 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2965 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2966 
2967 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2968 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2969 
2970 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2971 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2972 
2973 	/*
2974 	 * If the dev_t will fit into 16 bits then compress
2975 	 * it, otherwise leave it alone. See comments in
2976 	 * nfs_client.c.
2977 	 */
2978 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2979 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2980 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2981 	else
2982 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2983 
2984 	na->na_blocks = vap->va_nblocks;
2985 	na->na_blocksize = vap->va_blksize;
2986 
2987 	/*
2988 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2989 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2990 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2991 	 *
2992 	 * BUYER BEWARE:
2993 	 *  If you are porting the NFS to a non-Sun server, you probably
2994 	 *  don't want to include the following block of code.  The
2995 	 *  over-the-wire special file types will be changing with the
2996 	 *  NFS Protocol Revision.
2997 	 */
2998 	if (vap->va_type == VFIFO)
2999 		NA_SETFIFO(na);
3000 	return (0);
3001 }
3002 
3003 /*
3004  * acl v2 support: returns approximate permission.
3005  *	default: returns minimal permission (more restrictive)
3006  *	aclok: returns maximal permission (less restrictive)
3007  *	This routine changes the permissions that are alaredy in *va.
3008  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3009  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3010  */
3011 static void
3012 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3013 {
3014 	vsecattr_t	vsa;
3015 	int		aclcnt;
3016 	aclent_t	*aclentp;
3017 	mode_t		mask_perm;
3018 	mode_t		grp_perm;
3019 	mode_t		other_perm;
3020 	mode_t		other_orig;
3021 	int		error;
3022 
3023 	/* dont care default acl */
3024 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3025 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3026 
3027 	if (!error) {
3028 		aclcnt = vsa.vsa_aclcnt;
3029 		if (aclcnt > MIN_ACL_ENTRIES) {
3030 			/* non-trivial ACL */
3031 			aclentp = vsa.vsa_aclentp;
3032 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3033 				/* maximal permissions */
3034 				grp_perm = 0;
3035 				other_perm = 0;
3036 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3037 					switch (aclentp->a_type) {
3038 					case USER_OBJ:
3039 						break;
3040 					case USER:
3041 						grp_perm |=
3042 						    aclentp->a_perm << 3;
3043 						other_perm |= aclentp->a_perm;
3044 						break;
3045 					case GROUP_OBJ:
3046 						grp_perm |=
3047 						    aclentp->a_perm << 3;
3048 						break;
3049 					case GROUP:
3050 						other_perm |= aclentp->a_perm;
3051 						break;
3052 					case OTHER_OBJ:
3053 						other_orig = aclentp->a_perm;
3054 						break;
3055 					case CLASS_OBJ:
3056 						mask_perm = aclentp->a_perm;
3057 						break;
3058 					default:
3059 						break;
3060 					}
3061 				}
3062 				grp_perm &= mask_perm << 3;
3063 				other_perm &= mask_perm;
3064 				other_perm |= other_orig;
3065 
3066 			} else {
3067 				/* minimal permissions */
3068 				grp_perm = 070;
3069 				other_perm = 07;
3070 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3071 					switch (aclentp->a_type) {
3072 					case USER_OBJ:
3073 						break;
3074 					case USER:
3075 					case CLASS_OBJ:
3076 						grp_perm &=
3077 						    aclentp->a_perm << 3;
3078 						other_perm &=
3079 						    aclentp->a_perm;
3080 						break;
3081 					case GROUP_OBJ:
3082 						grp_perm &=
3083 						    aclentp->a_perm << 3;
3084 						break;
3085 					case GROUP:
3086 						other_perm &=
3087 						    aclentp->a_perm;
3088 						break;
3089 					case OTHER_OBJ:
3090 						other_perm &=
3091 						    aclentp->a_perm;
3092 						break;
3093 					default:
3094 						break;
3095 					}
3096 				}
3097 			}
3098 			/* copy to va */
3099 			va->va_mode &= ~077;
3100 			va->va_mode |= grp_perm | other_perm;
3101 		}
3102 		if (vsa.vsa_aclcnt)
3103 			kmem_free(vsa.vsa_aclentp,
3104 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3105 	}
3106 }
3107 
3108 void
3109 rfs_srvrinit(void)
3110 {
3111 	nfs2_srv_caller_id = fs_new_caller_id();
3112 }
3113 
/*
 * Counterpart of rfs_srvrinit.  There is nothing to undo, so this is
 * intentionally empty.
 */
void
rfs_srvrfini(void)
{
}
3118 
3119 /* ARGSUSED */
3120 void
3121 rfs_srv_zone_init(nfs_globals_t *ng)
3122 {
3123 	nfs_srv_t *ns;
3124 
3125 	ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3126 
3127 	mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3128 	ns->write_async = 1;
3129 
3130 	ng->nfs_srv = ns;
3131 }
3132 
3133 /* ARGSUSED */
3134 void
3135 rfs_srv_zone_fini(nfs_globals_t *ng)
3136 {
3137 	nfs_srv_t *ns = ng->nfs_srv;
3138 
3139 	ng->nfs_srv = NULL;
3140 
3141 	mutex_destroy(&ns->async_write_lock);
3142 	kmem_free(ns, sizeof (*ns));
3143 }
3144 
3145 static int
3146 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3147 {
3148 	struct clist	*wcl;
3149 	int		wlist_len;
3150 	uint32_t	count = rr->rr_count;
3151 
3152 	wcl = ra->ra_wlist;
3153 
3154 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3155 		return (FALSE);
3156 	}
3157 
3158 	wcl = ra->ra_wlist;
3159 	rr->rr_ok.rrok_wlist_len = wlist_len;
3160 	rr->rr_ok.rrok_wlist = wcl;
3161 
3162 	return (TRUE);
3163 }
3164