xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 528737823843346cf95a4a701612f82089135554)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
30  *	All rights reserved.
31  */
32 
33 /*
34  * Copyright 2018 Nexenta Systems, Inc.
35  * Copyright (c) 2016 by Delphix. All rights reserved.
36  */
37 
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/systm.h>
41 #include <sys/cred.h>
42 #include <sys/buf.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/uio.h>
46 #include <sys/stat.h>
47 #include <sys/errno.h>
48 #include <sys/sysmacros.h>
49 #include <sys/statvfs.h>
50 #include <sys/kmem.h>
51 #include <sys/kstat.h>
52 #include <sys/dirent.h>
53 #include <sys/cmn_err.h>
54 #include <sys/debug.h>
55 #include <sys/vtrace.h>
56 #include <sys/mode.h>
57 #include <sys/acl.h>
58 #include <sys/nbmlock.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61 
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/svc.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/export.h>
68 #include <nfs/nfs_cmd.h>
69 
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/seg.h>
73 #include <vm/seg_map.h>
74 #include <vm/seg_kmem.h>
75 
76 #include <sys/strsubr.h>
77 
78 struct rfs_async_write_list;
79 
80 /*
81  * Zone globals of NFSv2 server
82  */
83 typedef struct nfs_srv {
	/*
	 * NOTE(review): presumably serializes access to async_write_head;
	 * the user of these fields is outside this view - confirm there.
	 */
84 	kmutex_t			async_write_lock;
	/* First entry of a chain of struct rfs_async_write_list. */
85 	struct rfs_async_write_list	*async_write_head;
86 
87 	/*
88 	 * enables write clustering if == 1
89 	 */
90 	int		write_async;
91 } nfs_srv_t;
92 
93 /*
94  * These are the interface routines for the server side of the
95  * Network File System.  See the NFS version 2 protocol specification
96  * for a description of this interface.
97  */
98 
99 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
100 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 			cred_t *);
102 
103 
104 /*
105  * Some "over the wire" UNIX file types.  These are encoded
106  * into the mode.  This needs to be fixed in the next rev.
107  */
108 #define	IFMT		0170000		/* type of file */
109 #define	IFCHR		0020000		/* character special */
110 #define	IFBLK		0060000		/* block special */
111 #define	IFSOCK		0140000		/* socket */
112 
113 u_longlong_t nfs2_srv_caller_id;
114 
115 static nfs_srv_t *
116 nfs_get_srv(void)
117 {
118 	nfs_globals_t *ng = nfs_srv_getzg();
119 	nfs_srv_t *srv = ng->nfs_srv;
120 	ASSERT(srv != NULL);
121 	return (srv);
122 }
123 
124 /*
125  * Get file attributes.
126  * Returns the current attributes of the file with the given fhandle.
127  */
128 /* ARGSUSED */
129 void
130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
131     struct svc_req *req, cred_t *cr, bool_t ro)
132 {
133 	int error;
134 	vnode_t *vp;
135 	struct vattr va;
136 
137 	vp = nfs_fhtovp(fhp, exi);
138 	if (vp == NULL) {
139 		ns->ns_status = NFSERR_STALE;
140 		return;
141 	}
142 
143 	/*
144 	 * Do the getattr.
145 	 */
146 	va.va_mask = AT_ALL;	/* we want all the attributes */
147 
148 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
149 
150 	/* check for overflows */
151 	if (!error) {
152 		/* Lie about the object type for a referral */
153 		if (vn_is_nfs_reparse(vp, cr))
154 			va.va_type = VLNK;
155 
156 		acl_perm(vp, exi, &va, cr);
157 		error = vattr_to_nattr(&va, &ns->ns_attr);
158 	}
159 
160 	VN_RELE(vp);
161 
162 	ns->ns_status = puterrno(error);
163 }
164 void *
165 rfs_getattr_getfh(fhandle_t *fhp)
166 {
167 	return (fhp);
168 }
169 
170 /*
171  * Set file attributes.
172  * Sets the attributes of the file with the given fhandle.  Returns
173  * the new attributes.
174  */
175 /* ARGSUSED */
176 void
177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
178     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
179 {
180 	int error;
181 	int flag;
182 	int in_crit = 0;
183 	vnode_t *vp;
184 	struct vattr va;
185 	struct vattr bva;
186 	struct flock64 bf;
187 	caller_context_t ct;
188 
189 
190 	vp = nfs_fhtovp(&args->saa_fh, exi);
191 	if (vp == NULL) {
192 		ns->ns_status = NFSERR_STALE;
193 		return;
194 	}
195 
196 	if (rdonly(ro, vp)) {
197 		VN_RELE(vp);
198 		ns->ns_status = NFSERR_ROFS;
199 		return;
200 	}
201 
202 	error = sattr_to_vattr(&args->saa_sa, &va);
203 	if (error) {
204 		VN_RELE(vp);
205 		ns->ns_status = puterrno(error);
206 		return;
207 	}
208 
209 	/*
210 	 * If the client is requesting a change to the mtime,
211 	 * but the nanosecond field is set to 1 billion, then
212 	 * this is a flag to the server that it should set the
213 	 * atime and mtime fields to the server's current time.
214 	 * The 1 billion number actually came from the client
215 	 * as 1 million, but the units in the over the wire
216 	 * request are microseconds instead of nanoseconds.
217 	 *
218 	 * This is an overload of the protocol and should be
219 	 * documented in the NFS Version 2 protocol specification.
220 	 */
221 	if (va.va_mask & AT_MTIME) {
222 		if (va.va_mtime.tv_nsec == 1000000000) {
223 			gethrestime(&va.va_mtime);
224 			va.va_atime = va.va_mtime;
225 			va.va_mask |= AT_ATIME;
226 			flag = 0;
227 		} else
228 			flag = ATTR_UTIME;
229 	} else
230 		flag = 0;
231 
232 	/*
233 	 * If the filesystem is exported with nosuid, then mask off
234 	 * the setuid and setgid bits.
235 	 */
236 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
237 	    (exi->exi_export.ex_flags & EX_NOSUID))
238 		va.va_mode &= ~(VSUID | VSGID);
239 
240 	ct.cc_sysid = 0;
241 	ct.cc_pid = 0;
242 	ct.cc_caller_id = nfs2_srv_caller_id;
243 	ct.cc_flags = CC_DONTBLOCK;
244 
245 	/*
246 	 * We need to specially handle size changes because it is
247 	 * possible for the client to create a file with modes
248 	 * which indicate read-only, but with the file opened for
249 	 * writing.  If the client then tries to set the size of
250 	 * the file, then the normal access checking done in
251 	 * VOP_SETATTR would prevent the client from doing so,
252 	 * although it should be legal for it to do so.  To get
253 	 * around this, we do the access checking for ourselves
254 	 * and then use VOP_SPACE which doesn't do the access
255 	 * checking which VOP_SETATTR does. VOP_SPACE can only
256 	 * operate on VREG files, let VOP_SETATTR handle the other
257 	 * extremely rare cases.
258 	 * Also the client should not be allowed to change the
259 	 * size of the file if there is a conflicting non-blocking
260 	 * mandatory lock in the region of change.
261 	 */
262 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
263 		if (nbl_need_check(vp)) {
264 			nbl_start_crit(vp, RW_READER);
265 			in_crit = 1;
266 		}
267 
268 		bva.va_mask = AT_UID | AT_SIZE;
269 
270 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
271 
272 		if (error) {
273 			if (in_crit)
274 				nbl_end_crit(vp);
275 			VN_RELE(vp);
276 			ns->ns_status = puterrno(error);
277 			return;
278 		}
279 
280 		if (in_crit) {
281 			u_offset_t offset;
282 			ssize_t length;
283 
284 			if (va.va_size < bva.va_size) {
285 				offset = va.va_size;
286 				length = bva.va_size - va.va_size;
287 			} else {
288 				offset = bva.va_size;
289 				length = va.va_size - bva.va_size;
290 			}
291 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
292 			    NULL)) {
293 				error = EACCES;
294 			}
295 		}
296 
297 		if (crgetuid(cr) == bva.va_uid && !error &&
298 		    va.va_size != bva.va_size) {
299 			va.va_mask &= ~AT_SIZE;
300 			bf.l_type = F_WRLCK;
301 			bf.l_whence = 0;
302 			bf.l_start = (off64_t)va.va_size;
303 			bf.l_len = 0;
304 			bf.l_sysid = 0;
305 			bf.l_pid = 0;
306 
307 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
308 			    (offset_t)va.va_size, cr, &ct);
309 		}
310 		if (in_crit)
311 			nbl_end_crit(vp);
312 	} else
313 		error = 0;
314 
315 	/*
316 	 * Do the setattr.
317 	 */
318 	if (!error && va.va_mask) {
319 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
320 	}
321 
322 	/*
323 	 * check if the monitor on either vop_space or vop_setattr detected
324 	 * a delegation conflict and if so, mark the thread flag as
325 	 * wouldblock so that the response is dropped and the client will
326 	 * try again.
327 	 */
328 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
329 		VN_RELE(vp);
330 		curthread->t_flag |= T_WOULDBLOCK;
331 		return;
332 	}
333 
334 	if (!error) {
335 		va.va_mask = AT_ALL;	/* get everything */
336 
337 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
338 
339 		/* check for overflows */
340 		if (!error) {
341 			acl_perm(vp, exi, &va, cr);
342 			error = vattr_to_nattr(&va, &ns->ns_attr);
343 		}
344 	}
345 
346 	ct.cc_flags = 0;
347 
348 	/*
349 	 * Force modified metadata out to stable storage.
350 	 */
351 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
352 
353 	VN_RELE(vp);
354 
355 	ns->ns_status = puterrno(error);
356 }
357 void *
358 rfs_setattr_getfh(struct nfssaargs *args)
359 {
360 	return (&args->saa_fh);
361 }
362 
363 /* Change and release @exip and @vpp only in success */
364 int
365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
366 {
367 	struct exportinfo *exi;
368 	vnode_t *vp = *vpp;
369 	fid_t fid;
370 	int error;
371 
372 	VN_HOLD(vp);
373 
374 	if ((error = traverse(&vp)) != 0) {
375 		VN_RELE(vp);
376 		return (error);
377 	}
378 
379 	bzero(&fid, sizeof (fid));
380 	fid.fid_len = MAXFIDSZ;
381 	error = VOP_FID(vp, &fid, NULL);
382 	if (error) {
383 		VN_RELE(vp);
384 		return (error);
385 	}
386 
387 	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
388 	if (exi == NULL ||
389 	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
390 		/*
391 		 * It is not error, just subdir is not exported
392 		 * or "nohide" is not set
393 		 */
394 		if (exi != NULL)
395 			exi_rele(exi);
396 		VN_RELE(vp);
397 	} else {
398 		/* go to submount */
399 		exi_rele(*exip);
400 		*exip = exi;
401 
402 		VN_RELE(*vpp);
403 		*vpp = vp;
404 	}
405 
406 	return (0);
407 }
408 
409 /*
410  * Given mounted "dvp" and "exi", go upper mountpoint
411  * with dvp/exi correction
412  * Return 0 in success
413  */
414 int
415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
416 {
417 	struct exportinfo *exi;
418 	vnode_t *dvp = *dvpp;
419 	vnode_t *zone_rootvp;
420 
421 	zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
422 	ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
423 
424 	VN_HOLD(dvp);
425 	dvp = untraverse(dvp, zone_rootvp);
426 	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
427 	if (exi == NULL) {
428 		VN_RELE(dvp);
429 		return (-1);
430 	}
431 
432 	ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
433 	exi_rele(*exip);
434 	*exip = exi;
435 	VN_RELE(*dvpp);
436 	*dvpp = dvp;
437 
438 	return (0);
439 }
440 /*
441  * Directory lookup.
442  * Returns an fhandle and file attributes for file name in a directory.
443  */
444 /* ARGSUSED */
445 void
446 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
447     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
448 {
449 	int error;
450 	vnode_t *dvp;
451 	vnode_t *vp;
452 	struct vattr va;
453 	fhandle_t *fhp = da->da_fhandle;
454 	struct sec_ol sec = {0, 0};
455 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
456 	char *name;
457 	struct sockaddr *ca;
458 
459 	/*
460 	 * Trusted Extension doesn't support NFSv2. MOUNT
461 	 * will reject v2 clients. Need to prevent v2 client
462 	 * access via WebNFS here.
463 	 */
464 	if (is_system_labeled() && req->rq_vers == 2) {
465 		dr->dr_status = NFSERR_ACCES;
466 		return;
467 	}
468 
469 	/*
470 	 * Disallow NULL paths
471 	 */
472 	if (da->da_name == NULL || *da->da_name == '\0') {
473 		dr->dr_status = NFSERR_ACCES;
474 		return;
475 	}
476 
477 	/*
478 	 * Allow lookups from the root - the default
479 	 * location of the public filehandle.
480 	 */
481 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
482 		dvp = ZONE_ROOTVP();
483 		VN_HOLD(dvp);
484 	} else {
485 		dvp = nfs_fhtovp(fhp, exi);
486 		if (dvp == NULL) {
487 			dr->dr_status = NFSERR_STALE;
488 			return;
489 		}
490 	}
491 
492 	exi_hold(exi);
493 	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
494 
495 	/*
496 	 * Not allow lookup beyond root.
497 	 * If the filehandle matches a filehandle of the exi,
498 	 * then the ".." refers beyond the root of an exported filesystem.
499 	 */
500 	if (strcmp(da->da_name, "..") == 0 &&
501 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
502 		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
503 		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
504 			/*
505 			 * special case for ".." and 'nohide'exported root
506 			 */
507 			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
508 				error = NFSERR_ACCES;
509 				goto out;
510 			}
511 		} else  {
512 			error = NFSERR_NOENT;
513 			goto out;
514 		}
515 	}
516 
517 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
518 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
519 	    MAXPATHLEN);
520 
521 	if (name == NULL) {
522 		error = NFSERR_ACCES;
523 		goto out;
524 	}
525 
526 	/*
527 	 * If the public filehandle is used then allow
528 	 * a multi-component lookup, i.e. evaluate
529 	 * a pathname and follow symbolic links if
530 	 * necessary.
531 	 *
532 	 * This may result in a vnode in another filesystem
533 	 * which is OK as long as the filesystem is exported.
534 	 */
535 	if (PUBLIC_FH2(fhp)) {
536 		publicfh_flag = TRUE;
537 
538 		exi_rele(exi);
539 		exi = NULL;
540 
541 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
542 		    &sec);
543 	} else {
544 		/*
545 		 * Do a normal single component lookup.
546 		 */
547 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
548 		    NULL, NULL, NULL);
549 	}
550 
551 	if (name != da->da_name)
552 		kmem_free(name, MAXPATHLEN);
553 
554 	if (error == 0 && vn_ismntpt(vp)) {
555 		error = rfs_cross_mnt(&vp, &exi);
556 		if (error)
557 			VN_RELE(vp);
558 	}
559 
560 	if (!error) {
561 		va.va_mask = AT_ALL;	/* we want everything */
562 
563 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
564 
565 		/* check for overflows */
566 		if (!error) {
567 			acl_perm(vp, exi, &va, cr);
568 			error = vattr_to_nattr(&va, &dr->dr_attr);
569 			if (!error) {
570 				if (sec.sec_flags & SEC_QUERY)
571 					error = makefh_ol(&dr->dr_fhandle, exi,
572 					    sec.sec_index);
573 				else {
574 					error = makefh(&dr->dr_fhandle, vp,
575 					    exi);
576 					if (!error && publicfh_flag &&
577 					    !chk_clnt_sec(exi, req))
578 						auth_weak = TRUE;
579 				}
580 			}
581 		}
582 		VN_RELE(vp);
583 	}
584 
585 out:
586 	VN_RELE(dvp);
587 
588 	if (exi != NULL)
589 		exi_rele(exi);
590 
591 	/*
592 	 * If it's public fh, no 0x81, and client's flavor is
593 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
594 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
595 	 */
596 	if (auth_weak)
597 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
598 	else
599 		dr->dr_status = puterrno(error);
600 }
601 void *
602 rfs_lookup_getfh(struct nfsdiropargs *da)
603 {
604 	return (da->da_fhandle);
605 }
606 
607 /*
608  * Read symbolic link.
609  * Returns the string in the symbolic link at the given fhandle.
610  */
611 /* ARGSUSED */
612 void
613 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
614     struct svc_req *req, cred_t *cr, bool_t ro)
615 {
616 	int error;
617 	struct iovec iov;
618 	struct uio uio;
619 	vnode_t *vp;
620 	struct vattr va;
621 	struct sockaddr *ca;
622 	char *name = NULL;
623 	int is_referral = 0;
624 
625 	vp = nfs_fhtovp(fhp, exi);
626 	if (vp == NULL) {
627 		rl->rl_data = NULL;
628 		rl->rl_status = NFSERR_STALE;
629 		return;
630 	}
631 
632 	va.va_mask = AT_MODE;
633 
634 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
635 
636 	if (error) {
637 		VN_RELE(vp);
638 		rl->rl_data = NULL;
639 		rl->rl_status = puterrno(error);
640 		return;
641 	}
642 
643 	if (MANDLOCK(vp, va.va_mode)) {
644 		VN_RELE(vp);
645 		rl->rl_data = NULL;
646 		rl->rl_status = NFSERR_ACCES;
647 		return;
648 	}
649 
650 	/* We lied about the object type for a referral */
651 	if (vn_is_nfs_reparse(vp, cr))
652 		is_referral = 1;
653 
654 	/*
655 	 * XNFS and RFC1094 require us to return ENXIO if argument
656 	 * is not a link. BUGID 1138002.
657 	 */
658 	if (vp->v_type != VLNK && !is_referral) {
659 		VN_RELE(vp);
660 		rl->rl_data = NULL;
661 		rl->rl_status = NFSERR_NXIO;
662 		return;
663 	}
664 
665 	/*
666 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
667 	 */
668 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
669 
670 	if (is_referral) {
671 		char *s;
672 		size_t strsz;
673 		kstat_named_t *stat =
674 		    exi->exi_ne->ne_globals->svstat[NFS_VERSION];
675 
676 		/* Get an artificial symlink based on a referral */
677 		s = build_symlink(vp, cr, &strsz);
678 		stat[NFS_REFERLINKS].value.ui64++;
679 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
680 		    vnode_t *, vp, char *, s);
681 		if (s == NULL)
682 			error = EINVAL;
683 		else {
684 			error = 0;
685 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
686 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
687 			kmem_free(s, strsz);
688 		}
689 
690 	} else {
691 
692 		/*
693 		 * Set up io vector to read sym link data
694 		 */
695 		iov.iov_base = rl->rl_data;
696 		iov.iov_len = NFS_MAXPATHLEN;
697 		uio.uio_iov = &iov;
698 		uio.uio_iovcnt = 1;
699 		uio.uio_segflg = UIO_SYSSPACE;
700 		uio.uio_extflg = UIO_COPY_CACHED;
701 		uio.uio_loffset = (offset_t)0;
702 		uio.uio_resid = NFS_MAXPATHLEN;
703 
704 		/*
705 		 * Do the readlink.
706 		 */
707 		error = VOP_READLINK(vp, &uio, cr, NULL);
708 
709 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
710 
711 		if (!error)
712 			rl->rl_data[rl->rl_count] = '\0';
713 
714 	}
715 
716 
717 	VN_RELE(vp);
718 
719 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
720 	name = nfscmd_convname(ca, exi, rl->rl_data,
721 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
722 
723 	if (name != NULL && name != rl->rl_data) {
724 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
725 		rl->rl_data = name;
726 	}
727 
728 	/*
729 	 * XNFS and RFC1094 require us to return ENXIO if argument
730 	 * is not a link. UFS returns EINVAL if this is the case,
731 	 * so we do the mapping here. BUGID 1138002.
732 	 */
733 	if (error == EINVAL)
734 		rl->rl_status = NFSERR_NXIO;
735 	else
736 		rl->rl_status = puterrno(error);
737 
738 }
739 void *
740 rfs_readlink_getfh(fhandle_t *fhp)
741 {
742 	return (fhp);
743 }
744 /*
745  * Free data allocated by rfs_readlink
746  */
747 void
748 rfs_rlfree(struct nfsrdlnres *rl)
749 {
750 	if (rl->rl_data != NULL)
751 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
752 }
753 
754 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
755 
756 /*
757  * Read data.
758  * Returns some data read from the file at the given fhandle.
759  */
760 /* ARGSUSED */
761 void
762 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
763     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
764 {
765 	vnode_t *vp;
766 	int error;
767 	struct vattr va;
768 	struct iovec iov;
769 	struct uio uio;
770 	mblk_t *mp;
771 	int alloc_err = 0;
772 	int in_crit = 0;
773 	caller_context_t ct;
774 
775 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
776 	if (vp == NULL) {
777 		rr->rr_data = NULL;
778 		rr->rr_status = NFSERR_STALE;
779 		return;
780 	}
781 
782 	if (vp->v_type != VREG) {
783 		VN_RELE(vp);
784 		rr->rr_data = NULL;
785 		rr->rr_status = NFSERR_ISDIR;
786 		return;
787 	}
788 
789 	ct.cc_sysid = 0;
790 	ct.cc_pid = 0;
791 	ct.cc_caller_id = nfs2_srv_caller_id;
792 	ct.cc_flags = CC_DONTBLOCK;
793 
794 	/*
795 	 * Enter the critical region before calling VOP_RWLOCK
796 	 * to avoid a deadlock with write requests.
797 	 */
798 	if (nbl_need_check(vp)) {
799 		nbl_start_crit(vp, RW_READER);
800 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
801 		    0, NULL)) {
802 			nbl_end_crit(vp);
803 			VN_RELE(vp);
804 			rr->rr_data = NULL;
805 			rr->rr_status = NFSERR_ACCES;
806 			return;
807 		}
808 		in_crit = 1;
809 	}
810 
811 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
812 
813 	/* check if a monitor detected a delegation conflict */
814 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
815 		if (in_crit)
816 			nbl_end_crit(vp);
817 		VN_RELE(vp);
818 		/* mark as wouldblock so response is dropped */
819 		curthread->t_flag |= T_WOULDBLOCK;
820 
821 		rr->rr_data = NULL;
822 		return;
823 	}
824 
825 	va.va_mask = AT_ALL;
826 
827 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
828 
829 	if (error) {
830 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
831 		if (in_crit)
832 			nbl_end_crit(vp);
833 
834 		VN_RELE(vp);
835 		rr->rr_data = NULL;
836 		rr->rr_status = puterrno(error);
837 
838 		return;
839 	}
840 
841 	/*
842 	 * This is a kludge to allow reading of files created
843 	 * with no read permission.  The owner of the file
844 	 * is always allowed to read it.
845 	 */
846 	if (crgetuid(cr) != va.va_uid) {
847 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
848 
849 		if (error) {
850 			/*
851 			 * Exec is the same as read over the net because
852 			 * of demand loading.
853 			 */
854 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
855 		}
856 		if (error) {
857 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
858 			if (in_crit)
859 				nbl_end_crit(vp);
860 			VN_RELE(vp);
861 			rr->rr_data = NULL;
862 			rr->rr_status = puterrno(error);
863 
864 			return;
865 		}
866 	}
867 
868 	if (MANDLOCK(vp, va.va_mode)) {
869 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
870 		if (in_crit)
871 			nbl_end_crit(vp);
872 
873 		VN_RELE(vp);
874 		rr->rr_data = NULL;
875 		rr->rr_status = NFSERR_ACCES;
876 
877 		return;
878 	}
879 
880 	rr->rr_ok.rrok_wlist_len = 0;
881 	rr->rr_ok.rrok_wlist = NULL;
882 
883 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
884 		rr->rr_count = 0;
885 		rr->rr_data = NULL;
886 		/*
887 		 * In this case, status is NFS_OK, but there is no data
888 		 * to encode. So set rr_mp to NULL.
889 		 */
890 		rr->rr_mp = NULL;
891 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
892 		if (rr->rr_ok.rrok_wlist)
893 			clist_zero_len(rr->rr_ok.rrok_wlist);
894 		goto done;
895 	}
896 
897 	if (ra->ra_wlist) {
898 		mp = NULL;
899 		rr->rr_mp = NULL;
900 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
901 		if (ra->ra_count > iov.iov_len) {
902 			rr->rr_data = NULL;
903 			rr->rr_status = NFSERR_INVAL;
904 			goto done;
905 		}
906 	} else {
907 		/*
908 		 * mp will contain the data to be sent out in the read reply.
909 		 * This will be freed after the reply has been sent out (by the
910 		 * driver).
911 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
912 		 * that the call to xdrmblk_putmblk() never fails.
913 		 */
914 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
915 		    &alloc_err);
916 		ASSERT(mp != NULL);
917 		ASSERT(alloc_err == 0);
918 
919 		rr->rr_mp = mp;
920 
921 		/*
922 		 * Set up io vector
923 		 */
924 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
925 		iov.iov_len = ra->ra_count;
926 	}
927 
928 	uio.uio_iov = &iov;
929 	uio.uio_iovcnt = 1;
930 	uio.uio_segflg = UIO_SYSSPACE;
931 	uio.uio_extflg = UIO_COPY_CACHED;
932 	uio.uio_loffset = (offset_t)ra->ra_offset;
933 	uio.uio_resid = ra->ra_count;
934 
935 	error = VOP_READ(vp, &uio, 0, cr, &ct);
936 
937 	if (error) {
938 		if (mp)
939 			freeb(mp);
940 
941 		/*
942 		 * check if a monitor detected a delegation conflict and
943 		 * mark as wouldblock so response is dropped
944 		 */
945 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
946 			curthread->t_flag |= T_WOULDBLOCK;
947 		else
948 			rr->rr_status = puterrno(error);
949 
950 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
951 		if (in_crit)
952 			nbl_end_crit(vp);
953 
954 		VN_RELE(vp);
955 		rr->rr_data = NULL;
956 
957 		return;
958 	}
959 
960 	/*
961 	 * Get attributes again so we can send the latest access
962 	 * time to the client side for its cache.
963 	 */
964 	va.va_mask = AT_ALL;
965 
966 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
967 
968 	if (error) {
969 		if (mp)
970 			freeb(mp);
971 
972 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
973 		if (in_crit)
974 			nbl_end_crit(vp);
975 
976 		VN_RELE(vp);
977 		rr->rr_data = NULL;
978 		rr->rr_status = puterrno(error);
979 
980 		return;
981 	}
982 
983 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
984 
985 	if (mp) {
986 		rr->rr_data = (char *)mp->b_datap->db_base;
987 	} else {
988 		if (ra->ra_wlist) {
989 			rr->rr_data = (caddr_t)iov.iov_base;
990 			if (!rdma_setup_read_data2(ra, rr)) {
991 				rr->rr_data = NULL;
992 				rr->rr_status = puterrno(NFSERR_INVAL);
993 			}
994 		}
995 	}
996 done:
997 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
998 	if (in_crit)
999 		nbl_end_crit(vp);
1000 
1001 	acl_perm(vp, exi, &va, cr);
1002 
1003 	/* check for overflows */
1004 	error = vattr_to_nattr(&va, &rr->rr_attr);
1005 
1006 	VN_RELE(vp);
1007 
1008 	rr->rr_status = puterrno(error);
1009 }
1010 
1011 /*
1012  * Free data allocated by rfs_read
1013  */
1014 void
1015 rfs_rdfree(struct nfsrdresult *rr)
1016 {
1017 	mblk_t *mp;
1018 
1019 	if (rr->rr_status == NFS_OK) {
1020 		mp = rr->rr_mp;
1021 		if (mp != NULL)
1022 			freeb(mp);
1023 	}
1024 }
1025 
1026 void *
1027 rfs_read_getfh(struct nfsreadargs *ra)
1028 {
1029 	return (&ra->ra_fhandle);
1030 }
1031 
1032 #define	MAX_IOVECS	12
1033 
1034 #ifdef DEBUG
1035 static int rfs_write_sync_hits = 0;
1036 static int rfs_write_sync_misses = 0;
1037 #endif
1038 
1039 /*
1040  * Write data to file.
1041  * Returns attributes of a file after writing some data to it.
1042  *
1043  * Any changes made here, especially in error handling might have
1044  * to also be done in rfs_write (which clusters write requests).
1045  */
1046 /* ARGSUSED */
1047 void
1048 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1049     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1050 {
1051 	int error;
1052 	vnode_t *vp;
1053 	rlim64_t rlimit;
1054 	struct vattr va;
1055 	struct uio uio;
1056 	struct iovec iov[MAX_IOVECS];
1057 	mblk_t *m;
1058 	struct iovec *iovp;
1059 	int iovcnt;
1060 	cred_t *savecred;
1061 	int in_crit = 0;
1062 	caller_context_t ct;
1063 
1064 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1065 	if (vp == NULL) {
1066 		ns->ns_status = NFSERR_STALE;
1067 		return;
1068 	}
1069 
1070 	if (rdonly(ro, vp)) {
1071 		VN_RELE(vp);
1072 		ns->ns_status = NFSERR_ROFS;
1073 		return;
1074 	}
1075 
1076 	if (vp->v_type != VREG) {
1077 		VN_RELE(vp);
1078 		ns->ns_status = NFSERR_ISDIR;
1079 		return;
1080 	}
1081 
1082 	ct.cc_sysid = 0;
1083 	ct.cc_pid = 0;
1084 	ct.cc_caller_id = nfs2_srv_caller_id;
1085 	ct.cc_flags = CC_DONTBLOCK;
1086 
1087 	va.va_mask = AT_UID|AT_MODE;
1088 
1089 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1090 
1091 	if (error) {
1092 		VN_RELE(vp);
1093 		ns->ns_status = puterrno(error);
1094 
1095 		return;
1096 	}
1097 
1098 	if (crgetuid(cr) != va.va_uid) {
1099 		/*
1100 		 * This is a kludge to allow writes of files created
1101 		 * with read only permission.  The owner of the file
1102 		 * is always allowed to write it.
1103 		 */
1104 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1105 
1106 		if (error) {
1107 			VN_RELE(vp);
1108 			ns->ns_status = puterrno(error);
1109 			return;
1110 		}
1111 	}
1112 
1113 	/*
1114 	 * Can't access a mandatory lock file.  This might cause
1115 	 * the NFS service thread to block forever waiting for a
1116 	 * lock to be released that will never be released.
1117 	 */
1118 	if (MANDLOCK(vp, va.va_mode)) {
1119 		VN_RELE(vp);
1120 		ns->ns_status = NFSERR_ACCES;
1121 		return;
1122 	}
1123 
1124 	/*
1125 	 * We have to enter the critical region before calling VOP_RWLOCK
1126 	 * to avoid a deadlock with ufs.
1127 	 */
1128 	if (nbl_need_check(vp)) {
1129 		nbl_start_crit(vp, RW_READER);
1130 		in_crit = 1;
1131 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1132 		    wa->wa_count, 0, NULL)) {
1133 			error = EACCES;
1134 			goto out;
1135 		}
1136 	}
1137 
1138 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1139 
1140 	/* check if a monitor detected a delegation conflict */
1141 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1142 		goto out;
1143 	}
1144 
1145 	if (wa->wa_data || wa->wa_rlist) {
1146 		/* Do the RDMA thing if necessary */
1147 		if (wa->wa_rlist) {
1148 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1149 			iov[0].iov_len = wa->wa_count;
1150 		} else  {
1151 			iov[0].iov_base = wa->wa_data;
1152 			iov[0].iov_len = wa->wa_count;
1153 		}
1154 		uio.uio_iov = iov;
1155 		uio.uio_iovcnt = 1;
1156 		uio.uio_segflg = UIO_SYSSPACE;
1157 		uio.uio_extflg = UIO_COPY_DEFAULT;
1158 		uio.uio_loffset = (offset_t)wa->wa_offset;
1159 		uio.uio_resid = wa->wa_count;
1160 		/*
1161 		 * The limit is checked on the client. We
1162 		 * should allow any size writes here.
1163 		 */
1164 		uio.uio_llimit = curproc->p_fsz_ctl;
1165 		rlimit = uio.uio_llimit - wa->wa_offset;
1166 		if (rlimit < (rlim64_t)uio.uio_resid)
1167 			uio.uio_resid = (uint_t)rlimit;
1168 
1169 		/*
1170 		 * for now we assume no append mode
1171 		 */
1172 		/*
1173 		 * We're changing creds because VM may fault and we need
1174 		 * the cred of the current thread to be used if quota
1175 		 * checking is enabled.
1176 		 */
1177 		savecred = curthread->t_cred;
1178 		curthread->t_cred = cr;
1179 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1180 		curthread->t_cred = savecred;
1181 	} else {
1182 
1183 		iovcnt = 0;
1184 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1185 			iovcnt++;
1186 		if (iovcnt <= MAX_IOVECS) {
1187 #ifdef DEBUG
1188 			rfs_write_sync_hits++;
1189 #endif
1190 			iovp = iov;
1191 		} else {
1192 #ifdef DEBUG
1193 			rfs_write_sync_misses++;
1194 #endif
1195 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1196 		}
1197 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1198 		uio.uio_iov = iovp;
1199 		uio.uio_iovcnt = iovcnt;
1200 		uio.uio_segflg = UIO_SYSSPACE;
1201 		uio.uio_extflg = UIO_COPY_DEFAULT;
1202 		uio.uio_loffset = (offset_t)wa->wa_offset;
1203 		uio.uio_resid = wa->wa_count;
1204 		/*
1205 		 * The limit is checked on the client. We
1206 		 * should allow any size writes here.
1207 		 */
1208 		uio.uio_llimit = curproc->p_fsz_ctl;
1209 		rlimit = uio.uio_llimit - wa->wa_offset;
1210 		if (rlimit < (rlim64_t)uio.uio_resid)
1211 			uio.uio_resid = (uint_t)rlimit;
1212 
1213 		/*
1214 		 * For now we assume no append mode.
1215 		 */
1216 		/*
1217 		 * We're changing creds because VM may fault and we need
1218 		 * the cred of the current thread to be used if quota
1219 		 * checking is enabled.
1220 		 */
1221 		savecred = curthread->t_cred;
1222 		curthread->t_cred = cr;
1223 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1224 		curthread->t_cred = savecred;
1225 
1226 		if (iovp != iov)
1227 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1228 	}
1229 
1230 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1231 
1232 	if (!error) {
1233 		/*
1234 		 * Get attributes again so we send the latest mod
1235 		 * time to the client side for its cache.
1236 		 */
1237 		va.va_mask = AT_ALL;	/* now we want everything */
1238 
1239 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1240 
1241 		/* check for overflows */
1242 		if (!error) {
1243 			acl_perm(vp, exi, &va, cr);
1244 			error = vattr_to_nattr(&va, &ns->ns_attr);
1245 		}
1246 	}
1247 
1248 out:
1249 	if (in_crit)
1250 		nbl_end_crit(vp);
1251 	VN_RELE(vp);
1252 
1253 	/* check if a monitor detected a delegation conflict */
1254 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1255 		/* mark as wouldblock so response is dropped */
1256 		curthread->t_flag |= T_WOULDBLOCK;
1257 	else
1258 		ns->ns_status = puterrno(error);
1259 
1260 }
1261 
1262 struct rfs_async_write {
1263 	struct nfswriteargs *wa;
1264 	struct nfsattrstat *ns;
1265 	struct svc_req *req;
1266 	cred_t *cr;
1267 	bool_t ro;
1268 	kthread_t *thread;
1269 	struct rfs_async_write *list;
1270 };
1271 
1272 struct rfs_async_write_list {
1273 	fhandle_t *fhp;
1274 	kcondvar_t cv;
1275 	struct rfs_async_write *list;
1276 	struct rfs_async_write_list *next;
1277 };
1278 
1279 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1280 static kmutex_t rfs_async_write_lock;
1281 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1282 
/* Number of iovecs kept on the stack by rfs_write() before allocating. */
#define	MAXCLIOVECS	42

/*
 * Sentinel stored in ns_status while a clustered WRITE is still pending.
 * A value of 0 cannot be used because it is considered an OK status.
 * The expansion is fully parenthesized so that it behaves as a single
 * operand in every expression context (e.g. as an operand of sizeof).
 */
#define	RFSWRITE_INITVAL	((enum nfsstat)-1)
1285 
#if defined(DEBUG)
/*
 * Debug-only counters: how often the on-stack iovec array in rfs_write()
 * was large enough (hits) versus how often one had to be allocated
 * (misses).
 */
static int rfs_write_hits;
static int rfs_write_misses;
#endif
1290 
1291 /*
1292  * Write data to file.
1293  * Returns attributes of a file after writing some data to it.
1294  */
1295 void
1296 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1297     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1298 {
1299 	int error;
1300 	vnode_t *vp;
1301 	rlim64_t rlimit;
1302 	struct vattr va;
1303 	struct uio uio;
1304 	struct rfs_async_write_list *lp;
1305 	struct rfs_async_write_list *nlp;
1306 	struct rfs_async_write *rp;
1307 	struct rfs_async_write *nrp;
1308 	struct rfs_async_write *trp;
1309 	struct rfs_async_write *lrp;
1310 	int data_written;
1311 	int iovcnt;
1312 	mblk_t *m;
1313 	struct iovec *iovp;
1314 	struct iovec *niovp;
1315 	struct iovec iov[MAXCLIOVECS];
1316 	int count;
1317 	int rcount;
1318 	uint_t off;
1319 	uint_t len;
1320 	struct rfs_async_write nrpsp;
1321 	struct rfs_async_write_list nlpsp;
1322 	ushort_t t_flag;
1323 	cred_t *savecred;
1324 	int in_crit = 0;
1325 	caller_context_t ct;
1326 	nfs_srv_t *nsrv;
1327 
1328 	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1329 	nsrv = nfs_get_srv();
1330 	if (!nsrv->write_async) {
1331 		rfs_write_sync(wa, ns, exi, req, cr, ro);
1332 		return;
1333 	}
1334 
1335 	/*
1336 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1337 	 * is considered an OK.
1338 	 */
1339 	ns->ns_status = RFSWRITE_INITVAL;
1340 
1341 	nrp = &nrpsp;
1342 	nrp->wa = wa;
1343 	nrp->ns = ns;
1344 	nrp->req = req;
1345 	nrp->cr = cr;
1346 	nrp->ro = ro;
1347 	nrp->thread = curthread;
1348 
1349 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1350 
1351 	/*
1352 	 * Look to see if there is already a cluster started
1353 	 * for this file.
1354 	 */
1355 	mutex_enter(&nsrv->async_write_lock);
1356 	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1357 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1358 		    sizeof (fhandle_t)) == 0)
1359 			break;
1360 	}
1361 
1362 	/*
1363 	 * If lp is non-NULL, then there is already a cluster
1364 	 * started.  We need to place ourselves in the cluster
1365 	 * list in the right place as determined by starting
1366 	 * offset.  Conflicts with non-blocking mandatory locked
1367 	 * regions will be checked when the cluster is processed.
1368 	 */
1369 	if (lp != NULL) {
1370 		rp = lp->list;
1371 		trp = NULL;
1372 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1373 			trp = rp;
1374 			rp = rp->list;
1375 		}
1376 		nrp->list = rp;
1377 		if (trp == NULL)
1378 			lp->list = nrp;
1379 		else
1380 			trp->list = nrp;
1381 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1382 			cv_wait(&lp->cv, &nsrv->async_write_lock);
1383 		mutex_exit(&nsrv->async_write_lock);
1384 
1385 		return;
1386 	}
1387 
1388 	/*
1389 	 * No cluster started yet, start one and add ourselves
1390 	 * to the list of clusters.
1391 	 */
1392 	nrp->list = NULL;
1393 
1394 	nlp = &nlpsp;
1395 	nlp->fhp = &wa->wa_fhandle;
1396 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1397 	nlp->list = nrp;
1398 	nlp->next = NULL;
1399 
1400 	if (nsrv->async_write_head == NULL) {
1401 		nsrv->async_write_head = nlp;
1402 	} else {
1403 		lp = nsrv->async_write_head;
1404 		while (lp->next != NULL)
1405 			lp = lp->next;
1406 		lp->next = nlp;
1407 	}
1408 	mutex_exit(&nsrv->async_write_lock);
1409 
1410 	/*
1411 	 * Convert the file handle common to all of the requests
1412 	 * in this cluster to a vnode.
1413 	 */
1414 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1415 	if (vp == NULL) {
1416 		mutex_enter(&nsrv->async_write_lock);
1417 		if (nsrv->async_write_head == nlp)
1418 			nsrv->async_write_head = nlp->next;
1419 		else {
1420 			lp = nsrv->async_write_head;
1421 			while (lp->next != nlp)
1422 				lp = lp->next;
1423 			lp->next = nlp->next;
1424 		}
1425 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1426 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1427 			rp->ns->ns_status = NFSERR_STALE;
1428 			rp->thread->t_flag |= t_flag;
1429 		}
1430 		cv_broadcast(&nlp->cv);
1431 		mutex_exit(&nsrv->async_write_lock);
1432 
1433 		return;
1434 	}
1435 
1436 	/*
1437 	 * Can only write regular files.  Attempts to write any
1438 	 * other file types fail with EISDIR.
1439 	 */
1440 	if (vp->v_type != VREG) {
1441 		VN_RELE(vp);
1442 		mutex_enter(&nsrv->async_write_lock);
1443 		if (nsrv->async_write_head == nlp)
1444 			nsrv->async_write_head = nlp->next;
1445 		else {
1446 			lp = nsrv->async_write_head;
1447 			while (lp->next != nlp)
1448 				lp = lp->next;
1449 			lp->next = nlp->next;
1450 		}
1451 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1452 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1453 			rp->ns->ns_status = NFSERR_ISDIR;
1454 			rp->thread->t_flag |= t_flag;
1455 		}
1456 		cv_broadcast(&nlp->cv);
1457 		mutex_exit(&nsrv->async_write_lock);
1458 
1459 		return;
1460 	}
1461 
1462 	/*
1463 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1464 	 * deadlock with ufs.
1465 	 */
1466 	if (nbl_need_check(vp)) {
1467 		nbl_start_crit(vp, RW_READER);
1468 		in_crit = 1;
1469 	}
1470 
1471 	ct.cc_sysid = 0;
1472 	ct.cc_pid = 0;
1473 	ct.cc_caller_id = nfs2_srv_caller_id;
1474 	ct.cc_flags = CC_DONTBLOCK;
1475 
1476 	/*
1477 	 * Lock the file for writing.  This operation provides
1478 	 * the delay which allows clusters to grow.
1479 	 */
1480 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1481 
1482 	/* check if a monitor detected a delegation conflict */
1483 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1484 		if (in_crit)
1485 			nbl_end_crit(vp);
1486 		VN_RELE(vp);
1487 		/* mark as wouldblock so response is dropped */
1488 		curthread->t_flag |= T_WOULDBLOCK;
1489 		mutex_enter(&nsrv->async_write_lock);
1490 		if (nsrv->async_write_head == nlp)
1491 			nsrv->async_write_head = nlp->next;
1492 		else {
1493 			lp = nsrv->async_write_head;
1494 			while (lp->next != nlp)
1495 				lp = lp->next;
1496 			lp->next = nlp->next;
1497 		}
1498 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1499 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1500 				rp->ns->ns_status = puterrno(error);
1501 				rp->thread->t_flag |= T_WOULDBLOCK;
1502 			}
1503 		}
1504 		cv_broadcast(&nlp->cv);
1505 		mutex_exit(&nsrv->async_write_lock);
1506 
1507 		return;
1508 	}
1509 
1510 	/*
1511 	 * Disconnect this cluster from the list of clusters.
1512 	 * The cluster that is being dealt with must be fixed
1513 	 * in size after this point, so there is no reason
1514 	 * to leave it on the list so that new requests can
1515 	 * find it.
1516 	 *
1517 	 * The algorithm is that the first write request will
1518 	 * create a cluster, convert the file handle to a
1519 	 * vnode pointer, and then lock the file for writing.
1520 	 * This request is not likely to be clustered with
1521 	 * any others.  However, the next request will create
1522 	 * a new cluster and be blocked in VOP_RWLOCK while
1523 	 * the first request is being processed.  This delay
1524 	 * will allow more requests to be clustered in this
1525 	 * second cluster.
1526 	 */
1527 	mutex_enter(&nsrv->async_write_lock);
1528 	if (nsrv->async_write_head == nlp)
1529 		nsrv->async_write_head = nlp->next;
1530 	else {
1531 		lp = nsrv->async_write_head;
1532 		while (lp->next != nlp)
1533 			lp = lp->next;
1534 		lp->next = nlp->next;
1535 	}
1536 	mutex_exit(&nsrv->async_write_lock);
1537 
1538 	/*
1539 	 * Step through the list of requests in this cluster.
1540 	 * We need to check permissions to make sure that all
1541 	 * of the requests have sufficient permission to write
1542 	 * the file.  A cluster can be composed of requests
1543 	 * from different clients and different users on each
1544 	 * client.
1545 	 *
1546 	 * As a side effect, we also calculate the size of the
1547 	 * byte range that this cluster encompasses.
1548 	 */
1549 	rp = nlp->list;
1550 	off = rp->wa->wa_offset;
1551 	len = (uint_t)0;
1552 	do {
1553 		if (rdonly(rp->ro, vp)) {
1554 			rp->ns->ns_status = NFSERR_ROFS;
1555 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1556 			rp->thread->t_flag |= t_flag;
1557 			continue;
1558 		}
1559 
1560 		va.va_mask = AT_UID|AT_MODE;
1561 
1562 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1563 
1564 		if (!error) {
1565 			if (crgetuid(rp->cr) != va.va_uid) {
1566 				/*
1567 				 * This is a kludge to allow writes of files
1568 				 * created with read only permission.  The
1569 				 * owner of the file is always allowed to
1570 				 * write it.
1571 				 */
1572 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1573 			}
1574 			if (!error && MANDLOCK(vp, va.va_mode))
1575 				error = EACCES;
1576 		}
1577 
1578 		/*
1579 		 * Check for a conflict with a nbmand-locked region.
1580 		 */
1581 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1582 		    rp->wa->wa_count, 0, NULL)) {
1583 			error = EACCES;
1584 		}
1585 
1586 		if (error) {
1587 			rp->ns->ns_status = puterrno(error);
1588 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1589 			rp->thread->t_flag |= t_flag;
1590 			continue;
1591 		}
1592 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1593 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1594 	} while ((rp = rp->list) != NULL);
1595 
1596 	/*
1597 	 * Step through the cluster attempting to gather as many
1598 	 * requests which are contiguous as possible.  These
1599 	 * contiguous requests are handled via one call to VOP_WRITE
1600 	 * instead of different calls to VOP_WRITE.  We also keep
1601 	 * track of the fact that any data was written.
1602 	 */
1603 	rp = nlp->list;
1604 	data_written = 0;
1605 	do {
1606 		/*
1607 		 * Skip any requests which are already marked as having an
1608 		 * error.
1609 		 */
1610 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1611 			rp = rp->list;
1612 			continue;
1613 		}
1614 
1615 		/*
1616 		 * Count the number of iovec's which are required
1617 		 * to handle this set of requests.  One iovec is
1618 		 * needed for each data buffer, whether addressed
1619 		 * by wa_data or by the b_rptr pointers in the
1620 		 * mblk chains.
1621 		 */
1622 		iovcnt = 0;
1623 		lrp = rp;
1624 		for (;;) {
1625 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1626 				iovcnt++;
1627 			else {
1628 				m = lrp->wa->wa_mblk;
1629 				while (m != NULL) {
1630 					iovcnt++;
1631 					m = m->b_cont;
1632 				}
1633 			}
1634 			if (lrp->list == NULL ||
1635 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1636 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1637 			    lrp->list->wa->wa_offset) {
1638 				lrp = lrp->list;
1639 				break;
1640 			}
1641 			lrp = lrp->list;
1642 		}
1643 
1644 		if (iovcnt <= MAXCLIOVECS) {
1645 #ifdef DEBUG
1646 			rfs_write_hits++;
1647 #endif
1648 			niovp = iov;
1649 		} else {
1650 #ifdef DEBUG
1651 			rfs_write_misses++;
1652 #endif
1653 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1654 		}
1655 		/*
1656 		 * Put together the scatter/gather iovecs.
1657 		 */
1658 		iovp = niovp;
1659 		trp = rp;
1660 		count = 0;
1661 		do {
1662 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1663 				if (trp->wa->wa_rlist) {
1664 					iovp->iov_base =
1665 					    (char *)((trp->wa->wa_rlist)->
1666 					    u.c_daddr3);
1667 					iovp->iov_len = trp->wa->wa_count;
1668 				} else  {
1669 					iovp->iov_base = trp->wa->wa_data;
1670 					iovp->iov_len = trp->wa->wa_count;
1671 				}
1672 				iovp++;
1673 			} else {
1674 				m = trp->wa->wa_mblk;
1675 				rcount = trp->wa->wa_count;
1676 				while (m != NULL) {
1677 					iovp->iov_base = (caddr_t)m->b_rptr;
1678 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1679 					rcount -= iovp->iov_len;
1680 					if (rcount < 0)
1681 						iovp->iov_len += rcount;
1682 					iovp++;
1683 					if (rcount <= 0)
1684 						break;
1685 					m = m->b_cont;
1686 				}
1687 			}
1688 			count += trp->wa->wa_count;
1689 			trp = trp->list;
1690 		} while (trp != lrp);
1691 
1692 		uio.uio_iov = niovp;
1693 		uio.uio_iovcnt = iovcnt;
1694 		uio.uio_segflg = UIO_SYSSPACE;
1695 		uio.uio_extflg = UIO_COPY_DEFAULT;
1696 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1697 		uio.uio_resid = count;
1698 		/*
1699 		 * The limit is checked on the client. We
1700 		 * should allow any size writes here.
1701 		 */
1702 		uio.uio_llimit = curproc->p_fsz_ctl;
1703 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1704 		if (rlimit < (rlim64_t)uio.uio_resid)
1705 			uio.uio_resid = (uint_t)rlimit;
1706 
1707 		/*
1708 		 * For now we assume no append mode.
1709 		 */
1710 
1711 		/*
1712 		 * We're changing creds because VM may fault
1713 		 * and we need the cred of the current
1714 		 * thread to be used if quota * checking is
1715 		 * enabled.
1716 		 */
1717 		savecred = curthread->t_cred;
1718 		curthread->t_cred = cr;
1719 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1720 		curthread->t_cred = savecred;
1721 
1722 		/* check if a monitor detected a delegation conflict */
1723 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1724 			/* mark as wouldblock so response is dropped */
1725 			curthread->t_flag |= T_WOULDBLOCK;
1726 
1727 		if (niovp != iov)
1728 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1729 
1730 		if (!error) {
1731 			data_written = 1;
1732 			/*
1733 			 * Get attributes again so we send the latest mod
1734 			 * time to the client side for its cache.
1735 			 */
1736 			va.va_mask = AT_ALL;	/* now we want everything */
1737 
1738 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1739 
1740 			if (!error)
1741 				acl_perm(vp, exi, &va, rp->cr);
1742 		}
1743 
1744 		/*
1745 		 * Fill in the status responses for each request
1746 		 * which was just handled.  Also, copy the latest
1747 		 * attributes in to the attribute responses if
1748 		 * appropriate.
1749 		 */
1750 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1751 		do {
1752 			rp->thread->t_flag |= t_flag;
1753 			/* check for overflows */
1754 			if (!error) {
1755 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1756 			}
1757 			rp->ns->ns_status = puterrno(error);
1758 			rp = rp->list;
1759 		} while (rp != lrp);
1760 	} while (rp != NULL);
1761 
1762 	/*
1763 	 * If any data was written at all, then we need to flush
1764 	 * the data and metadata to stable storage.
1765 	 */
1766 	if (data_written) {
1767 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1768 
1769 		if (!error) {
1770 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1771 		}
1772 	}
1773 
1774 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1775 
1776 	if (in_crit)
1777 		nbl_end_crit(vp);
1778 	VN_RELE(vp);
1779 
1780 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1781 	mutex_enter(&nsrv->async_write_lock);
1782 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1783 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1784 			rp->ns->ns_status = puterrno(error);
1785 			rp->thread->t_flag |= t_flag;
1786 		}
1787 	}
1788 	cv_broadcast(&nlp->cv);
1789 	mutex_exit(&nsrv->async_write_lock);
1790 
1791 }
1792 
1793 void *
1794 rfs_write_getfh(struct nfswriteargs *wa)
1795 {
1796 	return (&wa->wa_fhandle);
1797 }
1798 
1799 /*
1800  * Create a file.
1801  * Creates a file with given attributes and returns those attributes
1802  * and an fhandle for the new file.
1803  */
1804 void
1805 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1806     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1807 {
1808 	int error;
1809 	int lookuperr;
1810 	int in_crit = 0;
1811 	struct vattr va;
1812 	vnode_t *vp;
1813 	vnode_t *realvp;
1814 	vnode_t *dvp;
1815 	char *name = args->ca_da.da_name;
1816 	vnode_t *tvp = NULL;
1817 	int mode;
1818 	int lookup_ok;
1819 	bool_t trunc;
1820 	struct sockaddr *ca;
1821 
1822 	/*
1823 	 * Disallow NULL paths
1824 	 */
1825 	if (name == NULL || *name == '\0') {
1826 		dr->dr_status = NFSERR_ACCES;
1827 		return;
1828 	}
1829 
1830 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1831 	if (dvp == NULL) {
1832 		dr->dr_status = NFSERR_STALE;
1833 		return;
1834 	}
1835 
1836 	error = sattr_to_vattr(args->ca_sa, &va);
1837 	if (error) {
1838 		dr->dr_status = puterrno(error);
1839 		return;
1840 	}
1841 
1842 	/*
1843 	 * Must specify the mode.
1844 	 */
1845 	if (!(va.va_mask & AT_MODE)) {
1846 		VN_RELE(dvp);
1847 		dr->dr_status = NFSERR_INVAL;
1848 		return;
1849 	}
1850 
1851 	/*
1852 	 * This is a completely gross hack to make mknod
1853 	 * work over the wire until we can wack the protocol
1854 	 */
1855 	if ((va.va_mode & IFMT) == IFCHR) {
1856 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1857 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1858 		else {
1859 			va.va_type = VCHR;
1860 			/*
1861 			 * uncompress the received dev_t
1862 			 * if the top half is zero indicating a request
1863 			 * from an `older style' OS.
1864 			 */
1865 			if ((va.va_size & 0xffff0000) == 0)
1866 				va.va_rdev = nfsv2_expdev(va.va_size);
1867 			else
1868 				va.va_rdev = (dev_t)va.va_size;
1869 		}
1870 		va.va_mask &= ~AT_SIZE;
1871 	} else if ((va.va_mode & IFMT) == IFBLK) {
1872 		va.va_type = VBLK;
1873 		/*
1874 		 * uncompress the received dev_t
1875 		 * if the top half is zero indicating a request
1876 		 * from an `older style' OS.
1877 		 */
1878 		if ((va.va_size & 0xffff0000) == 0)
1879 			va.va_rdev = nfsv2_expdev(va.va_size);
1880 		else
1881 			va.va_rdev = (dev_t)va.va_size;
1882 		va.va_mask &= ~AT_SIZE;
1883 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1884 		va.va_type = VSOCK;
1885 	} else {
1886 		va.va_type = VREG;
1887 	}
1888 	va.va_mode &= ~IFMT;
1889 	va.va_mask |= AT_TYPE;
1890 
1891 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1892 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1893 	    MAXPATHLEN);
1894 	if (name == NULL) {
1895 		dr->dr_status = puterrno(EINVAL);
1896 		return;
1897 	}
1898 
1899 	/*
1900 	 * Why was the choice made to use VWRITE as the mode to the
1901 	 * call to VOP_CREATE ? This results in a bug.  When a client
1902 	 * opens a file that already exists and is RDONLY, the second
1903 	 * open fails with an EACESS because of the mode.
1904 	 * bug ID 1054648.
1905 	 */
1906 	lookup_ok = 0;
1907 	mode = VWRITE;
1908 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1909 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1910 		    NULL, NULL, NULL);
1911 		if (!error) {
1912 			struct vattr at;
1913 
1914 			lookup_ok = 1;
1915 			at.va_mask = AT_MODE;
1916 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1917 			if (!error)
1918 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1919 			VN_RELE(tvp);
1920 			tvp = NULL;
1921 		}
1922 	}
1923 
1924 	if (!lookup_ok) {
1925 		if (rdonly(ro, dvp)) {
1926 			error = EROFS;
1927 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1928 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1929 			error = EPERM;
1930 		} else {
1931 			error = 0;
1932 		}
1933 	}
1934 
1935 	/*
1936 	 * If file size is being modified on an already existing file
1937 	 * make sure that there are no conflicting non-blocking mandatory
1938 	 * locks in the region being manipulated. Return EACCES if there
1939 	 * are conflicting locks.
1940 	 */
1941 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1942 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1943 		    NULL, NULL, NULL);
1944 
1945 		if (!lookuperr &&
1946 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1947 			VN_RELE(tvp);
1948 			curthread->t_flag |= T_WOULDBLOCK;
1949 			goto out;
1950 		}
1951 
1952 		if (!lookuperr && nbl_need_check(tvp)) {
1953 			/*
1954 			 * The file exists. Now check if it has any
1955 			 * conflicting non-blocking mandatory locks
1956 			 * in the region being changed.
1957 			 */
1958 			struct vattr bva;
1959 			u_offset_t offset;
1960 			ssize_t length;
1961 
1962 			nbl_start_crit(tvp, RW_READER);
1963 			in_crit = 1;
1964 
1965 			bva.va_mask = AT_SIZE;
1966 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1967 			if (!error) {
1968 				if (va.va_size < bva.va_size) {
1969 					offset = va.va_size;
1970 					length = bva.va_size - va.va_size;
1971 				} else {
1972 					offset = bva.va_size;
1973 					length = va.va_size - bva.va_size;
1974 				}
1975 				if (length) {
1976 					if (nbl_conflict(tvp, NBL_WRITE,
1977 					    offset, length, 0, NULL)) {
1978 						error = EACCES;
1979 					}
1980 				}
1981 			}
1982 			if (error) {
1983 				nbl_end_crit(tvp);
1984 				VN_RELE(tvp);
1985 				in_crit = 0;
1986 			}
1987 		} else if (tvp != NULL) {
1988 			VN_RELE(tvp);
1989 		}
1990 	}
1991 
1992 	if (!error) {
1993 		/*
1994 		 * If filesystem is shared with nosuid the remove any
1995 		 * setuid/setgid bits on create.
1996 		 */
1997 		if (va.va_type == VREG &&
1998 		    exi->exi_export.ex_flags & EX_NOSUID)
1999 			va.va_mode &= ~(VSUID | VSGID);
2000 
2001 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
2002 		    NULL, NULL);
2003 
2004 		if (!error) {
2005 
2006 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2007 				trunc = TRUE;
2008 			else
2009 				trunc = FALSE;
2010 
2011 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2012 				VN_RELE(vp);
2013 				curthread->t_flag |= T_WOULDBLOCK;
2014 				goto out;
2015 			}
2016 			va.va_mask = AT_ALL;
2017 
2018 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2019 
2020 			/* check for overflows */
2021 			if (!error) {
2022 				acl_perm(vp, exi, &va, cr);
2023 				error = vattr_to_nattr(&va, &dr->dr_attr);
2024 				if (!error) {
2025 					error = makefh(&dr->dr_fhandle, vp,
2026 					    exi);
2027 				}
2028 			}
2029 			/*
2030 			 * Force modified metadata out to stable storage.
2031 			 *
2032 			 * if a underlying vp exists, pass it to VOP_FSYNC
2033 			 */
2034 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
2035 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2036 			else
2037 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2038 			VN_RELE(vp);
2039 		}
2040 
2041 		if (in_crit) {
2042 			nbl_end_crit(tvp);
2043 			VN_RELE(tvp);
2044 		}
2045 	}
2046 
2047 	/*
2048 	 * Force modified data and metadata out to stable storage.
2049 	 */
2050 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
2051 
2052 out:
2053 
2054 	VN_RELE(dvp);
2055 
2056 	dr->dr_status = puterrno(error);
2057 
2058 	if (name != args->ca_da.da_name)
2059 		kmem_free(name, MAXPATHLEN);
2060 }
2061 void *
2062 rfs_create_getfh(struct nfscreatargs *args)
2063 {
2064 	return (args->ca_da.da_fhandle);
2065 }
2066 
2067 /*
2068  * Remove a file.
2069  * Remove named file from parent directory.
2070  */
2071 /* ARGSUSED */
2072 void
2073 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2074     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2075 {
2076 	int error = 0;
2077 	vnode_t *vp;
2078 	vnode_t *targvp;
2079 	int in_crit = 0;
2080 
2081 	/*
2082 	 * Disallow NULL paths
2083 	 */
2084 	if (da->da_name == NULL || *da->da_name == '\0') {
2085 		*status = NFSERR_ACCES;
2086 		return;
2087 	}
2088 
2089 	vp = nfs_fhtovp(da->da_fhandle, exi);
2090 	if (vp == NULL) {
2091 		*status = NFSERR_STALE;
2092 		return;
2093 	}
2094 
2095 	if (rdonly(ro, vp)) {
2096 		VN_RELE(vp);
2097 		*status = NFSERR_ROFS;
2098 		return;
2099 	}
2100 
2101 	/*
2102 	 * Check for a conflict with a non-blocking mandatory share reservation.
2103 	 */
2104 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2105 	    NULL, cr, NULL, NULL, NULL);
2106 	if (error != 0) {
2107 		VN_RELE(vp);
2108 		*status = puterrno(error);
2109 		return;
2110 	}
2111 
2112 	/*
2113 	 * If the file is delegated to an v4 client, then initiate
2114 	 * recall and drop this request (by setting T_WOULDBLOCK).
2115 	 * The client will eventually re-transmit the request and
2116 	 * (hopefully), by then, the v4 client will have returned
2117 	 * the delegation.
2118 	 */
2119 
2120 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2121 		VN_RELE(vp);
2122 		VN_RELE(targvp);
2123 		curthread->t_flag |= T_WOULDBLOCK;
2124 		return;
2125 	}
2126 
2127 	if (nbl_need_check(targvp)) {
2128 		nbl_start_crit(targvp, RW_READER);
2129 		in_crit = 1;
2130 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2131 			error = EACCES;
2132 			goto out;
2133 		}
2134 	}
2135 
2136 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2137 
2138 	/*
2139 	 * Force modified data and metadata out to stable storage.
2140 	 */
2141 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2142 
2143 out:
2144 	if (in_crit)
2145 		nbl_end_crit(targvp);
2146 	VN_RELE(targvp);
2147 	VN_RELE(vp);
2148 
2149 	*status = puterrno(error);
2150 
2151 }
2152 
2153 void *
2154 rfs_remove_getfh(struct nfsdiropargs *da)
2155 {
2156 	return (da->da_fhandle);
2157 }
2158 
2159 /*
2160  * rename a file
2161  * Give a file (from) a new name (to).
2162  */
2163 /* ARGSUSED */
2164 void
2165 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2166     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2167 {
2168 	int error = 0;
2169 	vnode_t *fromvp;
2170 	vnode_t *tovp;
2171 	struct exportinfo *to_exi;
2172 	fhandle_t *fh;
2173 	vnode_t *srcvp;
2174 	vnode_t *targvp;
2175 	int in_crit = 0;
2176 
2177 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2178 	if (fromvp == NULL) {
2179 		*status = NFSERR_STALE;
2180 		return;
2181 	}
2182 
2183 	fh = args->rna_to.da_fhandle;
2184 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2185 	if (to_exi == NULL) {
2186 		VN_RELE(fromvp);
2187 		*status = NFSERR_ACCES;
2188 		return;
2189 	}
2190 	exi_rele(to_exi);
2191 
2192 	if (to_exi != exi) {
2193 		VN_RELE(fromvp);
2194 		*status = NFSERR_XDEV;
2195 		return;
2196 	}
2197 
2198 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2199 	if (tovp == NULL) {
2200 		VN_RELE(fromvp);
2201 		*status = NFSERR_STALE;
2202 		return;
2203 	}
2204 
2205 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2206 		VN_RELE(tovp);
2207 		VN_RELE(fromvp);
2208 		*status = NFSERR_NOTDIR;
2209 		return;
2210 	}
2211 
2212 	/*
2213 	 * Disallow NULL paths
2214 	 */
2215 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2216 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2217 		VN_RELE(tovp);
2218 		VN_RELE(fromvp);
2219 		*status = NFSERR_ACCES;
2220 		return;
2221 	}
2222 
2223 	if (rdonly(ro, tovp)) {
2224 		VN_RELE(tovp);
2225 		VN_RELE(fromvp);
2226 		*status = NFSERR_ROFS;
2227 		return;
2228 	}
2229 
2230 	/*
2231 	 * Check for a conflict with a non-blocking mandatory share reservation.
2232 	 */
2233 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2234 	    NULL, cr, NULL, NULL, NULL);
2235 	if (error != 0) {
2236 		VN_RELE(tovp);
2237 		VN_RELE(fromvp);
2238 		*status = puterrno(error);
2239 		return;
2240 	}
2241 
2242 	/* Check for delegations on the source file */
2243 
2244 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2245 		VN_RELE(tovp);
2246 		VN_RELE(fromvp);
2247 		VN_RELE(srcvp);
2248 		curthread->t_flag |= T_WOULDBLOCK;
2249 		return;
2250 	}
2251 
2252 	/* Check for delegation on the file being renamed over, if it exists */
2253 
2254 	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2255 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2256 	    NULL, NULL, NULL) == 0) {
2257 
2258 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2259 			VN_RELE(tovp);
2260 			VN_RELE(fromvp);
2261 			VN_RELE(srcvp);
2262 			VN_RELE(targvp);
2263 			curthread->t_flag |= T_WOULDBLOCK;
2264 			return;
2265 		}
2266 		VN_RELE(targvp);
2267 	}
2268 
2269 
2270 	if (nbl_need_check(srcvp)) {
2271 		nbl_start_crit(srcvp, RW_READER);
2272 		in_crit = 1;
2273 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2274 			error = EACCES;
2275 			goto out;
2276 		}
2277 	}
2278 
2279 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2280 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2281 
2282 	if (error == 0)
2283 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2284 		    strlen(args->rna_to.da_name));
2285 
2286 	/*
2287 	 * Force modified data and metadata out to stable storage.
2288 	 */
2289 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2290 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2291 
2292 out:
2293 	if (in_crit)
2294 		nbl_end_crit(srcvp);
2295 	VN_RELE(srcvp);
2296 	VN_RELE(tovp);
2297 	VN_RELE(fromvp);
2298 
2299 	*status = puterrno(error);
2300 
2301 }
2302 void *
2303 rfs_rename_getfh(struct nfsrnmargs *args)
2304 {
2305 	return (args->rna_from.da_fhandle);
2306 }
2307 
2308 /*
2309  * Link to a file.
2310  * Create a file (to) which is a hard link to the given file (from).
2311  */
2312 /* ARGSUSED */
2313 void
2314 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2315     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2316 {
2317 	int error;
2318 	vnode_t *fromvp;
2319 	vnode_t *tovp;
2320 	struct exportinfo *to_exi;
2321 	fhandle_t *fh;
2322 
2323 	fromvp = nfs_fhtovp(args->la_from, exi);
2324 	if (fromvp == NULL) {
2325 		*status = NFSERR_STALE;
2326 		return;
2327 	}
2328 
2329 	fh = args->la_to.da_fhandle;
2330 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2331 	if (to_exi == NULL) {
2332 		VN_RELE(fromvp);
2333 		*status = NFSERR_ACCES;
2334 		return;
2335 	}
2336 	exi_rele(to_exi);
2337 
2338 	if (to_exi != exi) {
2339 		VN_RELE(fromvp);
2340 		*status = NFSERR_XDEV;
2341 		return;
2342 	}
2343 
2344 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2345 	if (tovp == NULL) {
2346 		VN_RELE(fromvp);
2347 		*status = NFSERR_STALE;
2348 		return;
2349 	}
2350 
2351 	if (tovp->v_type != VDIR) {
2352 		VN_RELE(tovp);
2353 		VN_RELE(fromvp);
2354 		*status = NFSERR_NOTDIR;
2355 		return;
2356 	}
2357 	/*
2358 	 * Disallow NULL paths
2359 	 */
2360 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2361 		VN_RELE(tovp);
2362 		VN_RELE(fromvp);
2363 		*status = NFSERR_ACCES;
2364 		return;
2365 	}
2366 
2367 	if (rdonly(ro, tovp)) {
2368 		VN_RELE(tovp);
2369 		VN_RELE(fromvp);
2370 		*status = NFSERR_ROFS;
2371 		return;
2372 	}
2373 
2374 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2375 
2376 	/*
2377 	 * Force modified data and metadata out to stable storage.
2378 	 */
2379 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2380 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2381 
2382 	VN_RELE(tovp);
2383 	VN_RELE(fromvp);
2384 
2385 	*status = puterrno(error);
2386 
2387 }
2388 void *
2389 rfs_link_getfh(struct nfslinkargs *args)
2390 {
2391 	return (args->la_from);
2392 }
2393 
2394 /*
2395  * Symbolicly link to a file.
2396  * Create a file (to) with the given attributes which is a symbolic link
2397  * to the given path name (to).
2398  */
2399 void
2400 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2401     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2402 {
2403 	int error;
2404 	struct vattr va;
2405 	vnode_t *vp;
2406 	vnode_t *svp;
2407 	int lerror;
2408 	struct sockaddr *ca;
2409 	char *name = NULL;
2410 
2411 	/*
2412 	 * Disallow NULL paths
2413 	 */
2414 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2415 		*status = NFSERR_ACCES;
2416 		return;
2417 	}
2418 
2419 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2420 	if (vp == NULL) {
2421 		*status = NFSERR_STALE;
2422 		return;
2423 	}
2424 
2425 	if (rdonly(ro, vp)) {
2426 		VN_RELE(vp);
2427 		*status = NFSERR_ROFS;
2428 		return;
2429 	}
2430 
2431 	error = sattr_to_vattr(args->sla_sa, &va);
2432 	if (error) {
2433 		VN_RELE(vp);
2434 		*status = puterrno(error);
2435 		return;
2436 	}
2437 
2438 	if (!(va.va_mask & AT_MODE)) {
2439 		VN_RELE(vp);
2440 		*status = NFSERR_INVAL;
2441 		return;
2442 	}
2443 
2444 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2445 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2446 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2447 
2448 	if (name == NULL) {
2449 		*status = NFSERR_ACCES;
2450 		return;
2451 	}
2452 
2453 	va.va_type = VLNK;
2454 	va.va_mask |= AT_TYPE;
2455 
2456 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2457 
2458 	/*
2459 	 * Force new data and metadata out to stable storage.
2460 	 */
2461 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2462 	    NULL, cr, NULL, NULL, NULL);
2463 
2464 	if (!lerror) {
2465 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2466 		VN_RELE(svp);
2467 	}
2468 
2469 	/*
2470 	 * Force modified data and metadata out to stable storage.
2471 	 */
2472 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2473 
2474 	VN_RELE(vp);
2475 
2476 	*status = puterrno(error);
2477 	if (name != args->sla_tnm)
2478 		kmem_free(name, MAXPATHLEN);
2479 
2480 }
2481 void *
2482 rfs_symlink_getfh(struct nfsslargs *args)
2483 {
2484 	return (args->sla_from.da_fhandle);
2485 }
2486 
2487 /*
2488  * Make a directory.
2489  * Create a directory with the given name, parent directory, and attributes.
2490  * Returns a file handle and attributes for the new directory.
2491  */
2492 /* ARGSUSED */
2493 void
2494 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2495     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2496 {
2497 	int error;
2498 	struct vattr va;
2499 	vnode_t *dvp = NULL;
2500 	vnode_t *vp;
2501 	char *name = args->ca_da.da_name;
2502 
2503 	/*
2504 	 * Disallow NULL paths
2505 	 */
2506 	if (name == NULL || *name == '\0') {
2507 		dr->dr_status = NFSERR_ACCES;
2508 		return;
2509 	}
2510 
2511 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2512 	if (vp == NULL) {
2513 		dr->dr_status = NFSERR_STALE;
2514 		return;
2515 	}
2516 
2517 	if (rdonly(ro, vp)) {
2518 		VN_RELE(vp);
2519 		dr->dr_status = NFSERR_ROFS;
2520 		return;
2521 	}
2522 
2523 	error = sattr_to_vattr(args->ca_sa, &va);
2524 	if (error) {
2525 		VN_RELE(vp);
2526 		dr->dr_status = puterrno(error);
2527 		return;
2528 	}
2529 
2530 	if (!(va.va_mask & AT_MODE)) {
2531 		VN_RELE(vp);
2532 		dr->dr_status = NFSERR_INVAL;
2533 		return;
2534 	}
2535 
2536 	va.va_type = VDIR;
2537 	va.va_mask |= AT_TYPE;
2538 
2539 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2540 
2541 	if (!error) {
2542 		/*
2543 		 * Attribtutes of the newly created directory should
2544 		 * be returned to the client.
2545 		 */
2546 		va.va_mask = AT_ALL; /* We want everything */
2547 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2548 
2549 		/* check for overflows */
2550 		if (!error) {
2551 			acl_perm(vp, exi, &va, cr);
2552 			error = vattr_to_nattr(&va, &dr->dr_attr);
2553 			if (!error) {
2554 				error = makefh(&dr->dr_fhandle, dvp, exi);
2555 			}
2556 		}
2557 		/*
2558 		 * Force new data and metadata out to stable storage.
2559 		 */
2560 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2561 		VN_RELE(dvp);
2562 	}
2563 
2564 	/*
2565 	 * Force modified data and metadata out to stable storage.
2566 	 */
2567 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2568 
2569 	VN_RELE(vp);
2570 
2571 	dr->dr_status = puterrno(error);
2572 
2573 }
2574 void *
2575 rfs_mkdir_getfh(struct nfscreatargs *args)
2576 {
2577 	return (args->ca_da.da_fhandle);
2578 }
2579 
2580 /*
2581  * Remove a directory.
2582  * Remove the given directory name from the given parent directory.
2583  */
2584 /* ARGSUSED */
2585 void
2586 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2587     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2588 {
2589 	int error;
2590 	vnode_t *vp;
2591 
2592 	/*
2593 	 * Disallow NULL paths
2594 	 */
2595 	if (da->da_name == NULL || *da->da_name == '\0') {
2596 		*status = NFSERR_ACCES;
2597 		return;
2598 	}
2599 
2600 	vp = nfs_fhtovp(da->da_fhandle, exi);
2601 	if (vp == NULL) {
2602 		*status = NFSERR_STALE;
2603 		return;
2604 	}
2605 
2606 	if (rdonly(ro, vp)) {
2607 		VN_RELE(vp);
2608 		*status = NFSERR_ROFS;
2609 		return;
2610 	}
2611 
2612 	/*
2613 	 * VOP_RMDIR takes a third argument (the current
2614 	 * directory of the process).  That's because someone
2615 	 * wants to return EINVAL if one tries to remove ".".
2616 	 * Of course, NFS servers have no idea what their
2617 	 * clients' current directories are.  We fake it by
2618 	 * supplying a vnode known to exist and illegal to
2619 	 * remove.
2620 	 */
2621 	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2622 
2623 	/*
2624 	 * Force modified data and metadata out to stable storage.
2625 	 */
2626 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2627 
2628 	VN_RELE(vp);
2629 
2630 	/*
2631 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2632 	 * if the directory is not empty.  A System V NFS server
2633 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2634 	 * over the wire.
2635 	 */
2636 	if (error == EEXIST)
2637 		*status = NFSERR_NOTEMPTY;
2638 	else
2639 		*status = puterrno(error);
2640 
2641 }
2642 void *
2643 rfs_rmdir_getfh(struct nfsdiropargs *da)
2644 {
2645 	return (da->da_fhandle);
2646 }
2647 
2648 /* ARGSUSED */
2649 void
2650 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2651     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2652 {
2653 	int error;
2654 	int iseof;
2655 	struct iovec iov;
2656 	struct uio uio;
2657 	vnode_t *vp;
2658 	char *ndata = NULL;
2659 	struct sockaddr *ca;
2660 	size_t nents;
2661 	int ret;
2662 
2663 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2664 	if (vp == NULL) {
2665 		rd->rd_entries = NULL;
2666 		rd->rd_status = NFSERR_STALE;
2667 		return;
2668 	}
2669 
2670 	if (vp->v_type != VDIR) {
2671 		VN_RELE(vp);
2672 		rd->rd_entries = NULL;
2673 		rd->rd_status = NFSERR_NOTDIR;
2674 		return;
2675 	}
2676 
2677 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2678 
2679 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2680 
2681 	if (error) {
2682 		rd->rd_entries = NULL;
2683 		goto bad;
2684 	}
2685 
2686 	if (rda->rda_count == 0) {
2687 		rd->rd_entries = NULL;
2688 		rd->rd_size = 0;
2689 		rd->rd_eof = FALSE;
2690 		goto bad;
2691 	}
2692 
2693 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2694 
2695 	/*
2696 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2697 	 */
2698 	rd->rd_bufsize = (uint_t)rda->rda_count;
2699 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2700 
2701 	/*
2702 	 * Set up io vector to read directory data
2703 	 */
2704 	iov.iov_base = (caddr_t)rd->rd_entries;
2705 	iov.iov_len = rda->rda_count;
2706 	uio.uio_iov = &iov;
2707 	uio.uio_iovcnt = 1;
2708 	uio.uio_segflg = UIO_SYSSPACE;
2709 	uio.uio_extflg = UIO_COPY_CACHED;
2710 	uio.uio_loffset = (offset_t)rda->rda_offset;
2711 	uio.uio_resid = rda->rda_count;
2712 
2713 	/*
2714 	 * read directory
2715 	 */
2716 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2717 
2718 	/*
2719 	 * Clean up
2720 	 */
2721 	if (!error) {
2722 		/*
2723 		 * set size and eof
2724 		 */
2725 		if (uio.uio_resid == rda->rda_count) {
2726 			rd->rd_size = 0;
2727 			rd->rd_eof = TRUE;
2728 		} else {
2729 			rd->rd_size = (uint32_t)(rda->rda_count -
2730 			    uio.uio_resid);
2731 			rd->rd_eof = iseof ? TRUE : FALSE;
2732 		}
2733 	}
2734 
2735 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2736 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2737 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2738 	    rda->rda_count, &ndata);
2739 
2740 	if (ret != 0) {
2741 		size_t dropbytes;
2742 		/*
2743 		 * We had to drop one or more entries in order to fit
2744 		 * during the character conversion.  We need to patch
2745 		 * up the size and eof info.
2746 		 */
2747 		if (rd->rd_eof)
2748 			rd->rd_eof = FALSE;
2749 		dropbytes = nfscmd_dropped_entrysize(
2750 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2751 		rd->rd_size -= dropbytes;
2752 	}
2753 	if (ndata == NULL) {
2754 		ndata = (char *)rd->rd_entries;
2755 	} else if (ndata != (char *)rd->rd_entries) {
2756 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2757 		rd->rd_entries = (void *)ndata;
2758 		rd->rd_bufsize = rda->rda_count;
2759 	}
2760 
2761 bad:
2762 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2763 
2764 #if 0 /* notyet */
2765 	/*
2766 	 * Don't do this.  It causes local disk writes when just
2767 	 * reading the file and the overhead is deemed larger
2768 	 * than the benefit.
2769 	 */
2770 	/*
2771 	 * Force modified metadata out to stable storage.
2772 	 */
2773 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2774 #endif
2775 
2776 	VN_RELE(vp);
2777 
2778 	rd->rd_status = puterrno(error);
2779 
2780 }
2781 void *
2782 rfs_readdir_getfh(struct nfsrddirargs *rda)
2783 {
2784 	return (&rda->rda_fh);
2785 }
2786 void
2787 rfs_rddirfree(struct nfsrddirres *rd)
2788 {
2789 	if (rd->rd_entries != NULL)
2790 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2791 }
2792 
2793 /* ARGSUSED */
2794 void
2795 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2796     struct svc_req *req, cred_t *cr, bool_t ro)
2797 {
2798 	int error;
2799 	struct statvfs64 sb;
2800 	vnode_t *vp;
2801 
2802 	vp = nfs_fhtovp(fh, exi);
2803 	if (vp == NULL) {
2804 		fs->fs_status = NFSERR_STALE;
2805 		return;
2806 	}
2807 
2808 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2809 
2810 	if (!error) {
2811 		fs->fs_tsize = nfstsize();
2812 		fs->fs_bsize = sb.f_frsize;
2813 		fs->fs_blocks = sb.f_blocks;
2814 		fs->fs_bfree = sb.f_bfree;
2815 		fs->fs_bavail = sb.f_bavail;
2816 	}
2817 
2818 	VN_RELE(vp);
2819 
2820 	fs->fs_status = puterrno(error);
2821 
2822 }
2823 void *
2824 rfs_statfs_getfh(fhandle_t *fh)
2825 {
2826 	return (fh);
2827 }
2828 
2829 static int
2830 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2831 {
2832 	vap->va_mask = 0;
2833 
2834 	/*
2835 	 * There was a sign extension bug in some VFS based systems
2836 	 * which stored the mode as a short.  When it would get
2837 	 * assigned to a u_long, no sign extension would occur.
2838 	 * It needed to, but this wasn't noticed because sa_mode
2839 	 * would then get assigned back to the short, thus ignoring
2840 	 * the upper 16 bits of sa_mode.
2841 	 *
2842 	 * To make this implementation work for both broken
2843 	 * clients and good clients, we check for both versions
2844 	 * of the mode.
2845 	 */
2846 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2847 	    sa->sa_mode != (uint32_t)-1) {
2848 		vap->va_mask |= AT_MODE;
2849 		vap->va_mode = sa->sa_mode;
2850 	}
2851 	if (sa->sa_uid != (uint32_t)-1) {
2852 		vap->va_mask |= AT_UID;
2853 		vap->va_uid = sa->sa_uid;
2854 	}
2855 	if (sa->sa_gid != (uint32_t)-1) {
2856 		vap->va_mask |= AT_GID;
2857 		vap->va_gid = sa->sa_gid;
2858 	}
2859 	if (sa->sa_size != (uint32_t)-1) {
2860 		vap->va_mask |= AT_SIZE;
2861 		vap->va_size = sa->sa_size;
2862 	}
2863 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2864 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2865 #ifndef _LP64
2866 		/* return error if time overflow */
2867 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2868 			return (EOVERFLOW);
2869 #endif
2870 		vap->va_mask |= AT_ATIME;
2871 		/*
2872 		 * nfs protocol defines times as unsigned so don't extend sign,
2873 		 * unless sysadmin set nfs_allow_preepoch_time.
2874 		 */
2875 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2876 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2877 	}
2878 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2879 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2880 #ifndef _LP64
2881 		/* return error if time overflow */
2882 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2883 			return (EOVERFLOW);
2884 #endif
2885 		vap->va_mask |= AT_MTIME;
2886 		/*
2887 		 * nfs protocol defines times as unsigned so don't extend sign,
2888 		 * unless sysadmin set nfs_allow_preepoch_time.
2889 		 */
2890 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2891 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2892 	}
2893 	return (0);
2894 }
2895 
2896 static const enum nfsftype vt_to_nf[] = {
2897 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2898 };
2899 
2900 /*
2901  * check the following fields for overflow: nodeid, size, and time.
2902  * There could be a problem when converting 64-bit LP64 fields
2903  * into 32-bit ones.  Return an error if there is an overflow.
2904  */
2905 int
2906 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2907 {
2908 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2909 	na->na_type = vt_to_nf[vap->va_type];
2910 
2911 	if (vap->va_mode == (unsigned short) -1)
2912 		na->na_mode = (uint32_t)-1;
2913 	else
2914 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2915 
2916 	if (vap->va_uid == (unsigned short)(-1))
2917 		na->na_uid = (uint32_t)(-1);
2918 	else if (vap->va_uid == UID_NOBODY)
2919 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2920 	else
2921 		na->na_uid = vap->va_uid;
2922 
2923 	if (vap->va_gid == (unsigned short)(-1))
2924 		na->na_gid = (uint32_t)-1;
2925 	else if (vap->va_gid == GID_NOBODY)
2926 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2927 	else
2928 		na->na_gid = vap->va_gid;
2929 
2930 	/*
2931 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2932 	 * vattr, but are bigger than 32 bit values supported?
2933 	 */
2934 	na->na_fsid = vap->va_fsid;
2935 
2936 	na->na_nodeid = vap->va_nodeid;
2937 
2938 	/*
2939 	 * Check to make sure that the nodeid is representable over the
2940 	 * wire without losing bits.
2941 	 */
2942 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2943 		return (EFBIG);
2944 	na->na_nlink = vap->va_nlink;
2945 
2946 	/*
2947 	 * Check for big files here, instead of at the caller.  See
2948 	 * comments in cstat for large special file explanation.
2949 	 */
2950 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2951 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2952 			return (EFBIG);
2953 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2954 			/* UNKNOWN_SIZE | OVERFLOW */
2955 			na->na_size = MAXOFF32_T;
2956 		} else
2957 			na->na_size = vap->va_size;
2958 	} else
2959 		na->na_size = vap->va_size;
2960 
2961 	/*
2962 	 * If the vnode times overflow the 32-bit times that NFS2
2963 	 * uses on the wire then return an error.
2964 	 */
2965 	if (!NFS_VAP_TIME_OK(vap)) {
2966 		return (EOVERFLOW);
2967 	}
2968 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2969 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2970 
2971 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2972 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2973 
2974 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2975 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2976 
2977 	/*
2978 	 * If the dev_t will fit into 16 bits then compress
2979 	 * it, otherwise leave it alone. See comments in
2980 	 * nfs_client.c.
2981 	 */
2982 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2983 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2984 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2985 	else
2986 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2987 
2988 	na->na_blocks = vap->va_nblocks;
2989 	na->na_blocksize = vap->va_blksize;
2990 
2991 	/*
2992 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2993 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2994 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2995 	 *
2996 	 * BUYER BEWARE:
2997 	 *  If you are porting the NFS to a non-Sun server, you probably
2998 	 *  don't want to include the following block of code.  The
2999 	 *  over-the-wire special file types will be changing with the
3000 	 *  NFS Protocol Revision.
3001 	 */
3002 	if (vap->va_type == VFIFO)
3003 		NA_SETFIFO(na);
3004 	return (0);
3005 }
3006 
3007 /*
3008  * acl v2 support: returns approximate permission.
3009  *	default: returns minimal permission (more restrictive)
3010  *	aclok: returns maximal permission (less restrictive)
3011  *	This routine changes the permissions that are alaredy in *va.
3012  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3013  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3014  */
3015 static void
3016 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3017 {
3018 	vsecattr_t	vsa;
3019 	int		aclcnt;
3020 	aclent_t	*aclentp;
3021 	mode_t		mask_perm;
3022 	mode_t		grp_perm;
3023 	mode_t		other_perm;
3024 	mode_t		other_orig;
3025 	int		error;
3026 
3027 	/* dont care default acl */
3028 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3029 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3030 
3031 	if (!error) {
3032 		aclcnt = vsa.vsa_aclcnt;
3033 		if (aclcnt > MIN_ACL_ENTRIES) {
3034 			/* non-trivial ACL */
3035 			aclentp = vsa.vsa_aclentp;
3036 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3037 				/* maximal permissions */
3038 				grp_perm = 0;
3039 				other_perm = 0;
3040 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3041 					switch (aclentp->a_type) {
3042 					case USER_OBJ:
3043 						break;
3044 					case USER:
3045 						grp_perm |=
3046 						    aclentp->a_perm << 3;
3047 						other_perm |= aclentp->a_perm;
3048 						break;
3049 					case GROUP_OBJ:
3050 						grp_perm |=
3051 						    aclentp->a_perm << 3;
3052 						break;
3053 					case GROUP:
3054 						other_perm |= aclentp->a_perm;
3055 						break;
3056 					case OTHER_OBJ:
3057 						other_orig = aclentp->a_perm;
3058 						break;
3059 					case CLASS_OBJ:
3060 						mask_perm = aclentp->a_perm;
3061 						break;
3062 					default:
3063 						break;
3064 					}
3065 				}
3066 				grp_perm &= mask_perm << 3;
3067 				other_perm &= mask_perm;
3068 				other_perm |= other_orig;
3069 
3070 			} else {
3071 				/* minimal permissions */
3072 				grp_perm = 070;
3073 				other_perm = 07;
3074 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3075 					switch (aclentp->a_type) {
3076 					case USER_OBJ:
3077 						break;
3078 					case USER:
3079 					case CLASS_OBJ:
3080 						grp_perm &=
3081 						    aclentp->a_perm << 3;
3082 						other_perm &=
3083 						    aclentp->a_perm;
3084 						break;
3085 					case GROUP_OBJ:
3086 						grp_perm &=
3087 						    aclentp->a_perm << 3;
3088 						break;
3089 					case GROUP:
3090 						other_perm &=
3091 						    aclentp->a_perm;
3092 						break;
3093 					case OTHER_OBJ:
3094 						other_perm &=
3095 						    aclentp->a_perm;
3096 						break;
3097 					default:
3098 						break;
3099 					}
3100 				}
3101 			}
3102 			/* copy to va */
3103 			va->va_mode &= ~077;
3104 			va->va_mode |= grp_perm | other_perm;
3105 		}
3106 		if (vsa.vsa_aclcnt)
3107 			kmem_free(vsa.vsa_aclentp,
3108 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3109 	}
3110 }
3111 
3112 void
3113 rfs_srvrinit(void)
3114 {
3115 	nfs2_srv_caller_id = fs_new_caller_id();
3116 }
3117 
/*
 * Counterpart of rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	/* Nothing to undo. */
}
3122 
3123 /* ARGSUSED */
3124 void
3125 rfs_srv_zone_init(nfs_globals_t *ng)
3126 {
3127 	nfs_srv_t *ns;
3128 
3129 	ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3130 
3131 	mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3132 	ns->write_async = 1;
3133 
3134 	ng->nfs_srv = ns;
3135 }
3136 
3137 /* ARGSUSED */
3138 void
3139 rfs_srv_zone_fini(nfs_globals_t *ng)
3140 {
3141 	nfs_srv_t *ns = ng->nfs_srv;
3142 
3143 	ng->nfs_srv = NULL;
3144 
3145 	mutex_destroy(&ns->async_write_lock);
3146 	kmem_free(ns, sizeof (*ns));
3147 }
3148 
3149 static int
3150 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3151 {
3152 	struct clist	*wcl;
3153 	int		wlist_len;
3154 	uint32_t	count = rr->rr_count;
3155 
3156 	wcl = ra->ra_wlist;
3157 
3158 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3159 		return (FALSE);
3160 	}
3161 
3162 	wcl = ra->ra_wlist;
3163 	rr->rr_ok.rrok_wlist_len = wlist_len;
3164 	rr->rr_ok.rrok_wlist = wcl;
3165 
3166 	return (TRUE);
3167 }
3168