xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 8e7248e505faa19396d4e853604e3fa7cd2cb3b5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
81 /*
82  * Some "over the wire" UNIX file types.  These are encoded
83  * into the mode.  This needs to be fixed in the next rev.
84  */
85 #define	IFMT		0170000		/* type of file */
86 #define	IFCHR		0020000		/* character special */
87 #define	IFBLK		0060000		/* block special */
88 #define	IFSOCK		0140000		/* socket */
89 
90 u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		acl_perm(vp, exi, &va, cr);
121 		error = vattr_to_nattr(&va, &ns->ns_attr);
122 	}
123 
124 	VN_RELE(vp);
125 
126 	ns->ns_status = puterrno(error);
127 }
128 void *
129 rfs_getattr_getfh(fhandle_t *fhp)
130 {
131 	return (fhp);
132 }
133 
134 /*
135  * Set file attributes.
136  * Sets the attributes of the file with the given fhandle.  Returns
137  * the new attributes.
138  */
139 void
140 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
141 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
142 {
143 	int error;
144 	int flag;
145 	int in_crit = 0;
146 	vnode_t *vp;
147 	struct vattr va;
148 	struct vattr bva;
149 	struct flock64 bf;
150 	caller_context_t ct;
151 
152 
153 	vp = nfs_fhtovp(&args->saa_fh, exi);
154 	if (vp == NULL) {
155 		ns->ns_status = NFSERR_STALE;
156 		return;
157 	}
158 
159 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
160 		VN_RELE(vp);
161 		ns->ns_status = NFSERR_ROFS;
162 		return;
163 	}
164 
165 	error = sattr_to_vattr(&args->saa_sa, &va);
166 	if (error) {
167 		VN_RELE(vp);
168 		ns->ns_status = puterrno(error);
169 		return;
170 	}
171 
172 	/*
173 	 * If the client is requesting a change to the mtime,
174 	 * but the nanosecond field is set to 1 billion, then
175 	 * this is a flag to the server that it should set the
176 	 * atime and mtime fields to the server's current time.
177 	 * The 1 billion number actually came from the client
178 	 * as 1 million, but the units in the over the wire
179 	 * request are microseconds instead of nanoseconds.
180 	 *
181 	 * This is an overload of the protocol and should be
182 	 * documented in the NFS Version 2 protocol specification.
183 	 */
184 	if (va.va_mask & AT_MTIME) {
185 		if (va.va_mtime.tv_nsec == 1000000000) {
186 			gethrestime(&va.va_mtime);
187 			va.va_atime = va.va_mtime;
188 			va.va_mask |= AT_ATIME;
189 			flag = 0;
190 		} else
191 			flag = ATTR_UTIME;
192 	} else
193 		flag = 0;
194 
195 	/*
196 	 * If the filesystem is exported with nosuid, then mask off
197 	 * the setuid and setgid bits.
198 	 */
199 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
200 	    (exi->exi_export.ex_flags & EX_NOSUID))
201 		va.va_mode &= ~(VSUID | VSGID);
202 
203 	ct.cc_sysid = 0;
204 	ct.cc_pid = 0;
205 	ct.cc_caller_id = nfs2_srv_caller_id;
206 	ct.cc_flags = CC_DONTBLOCK;
207 
208 	/*
209 	 * We need to specially handle size changes because it is
210 	 * possible for the client to create a file with modes
211 	 * which indicate read-only, but with the file opened for
212 	 * writing.  If the client then tries to set the size of
213 	 * the file, then the normal access checking done in
214 	 * VOP_SETATTR would prevent the client from doing so,
215 	 * although it should be legal for it to do so.  To get
216 	 * around this, we do the access checking for ourselves
217 	 * and then use VOP_SPACE which doesn't do the access
218 	 * checking which VOP_SETATTR does. VOP_SPACE can only
219 	 * operate on VREG files, let VOP_SETATTR handle the other
220 	 * extremely rare cases.
221 	 * Also the client should not be allowed to change the
222 	 * size of the file if there is a conflicting non-blocking
223 	 * mandatory lock in the region of change.
224 	 */
225 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
226 		if (nbl_need_check(vp)) {
227 			nbl_start_crit(vp, RW_READER);
228 			in_crit = 1;
229 		}
230 
231 		bva.va_mask = AT_UID | AT_SIZE;
232 
233 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
234 
235 		if (error) {
236 			if (in_crit)
237 				nbl_end_crit(vp);
238 			VN_RELE(vp);
239 			ns->ns_status = puterrno(error);
240 			return;
241 		}
242 
243 		if (in_crit) {
244 			u_offset_t offset;
245 			ssize_t length;
246 
247 			if (va.va_size < bva.va_size) {
248 				offset = va.va_size;
249 				length = bva.va_size - va.va_size;
250 			} else {
251 				offset = bva.va_size;
252 				length = va.va_size - bva.va_size;
253 			}
254 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
255 			    NULL)) {
256 				error = EACCES;
257 			}
258 		}
259 
260 		if (crgetuid(cr) == bva.va_uid && !error &&
261 		    va.va_size != bva.va_size) {
262 			va.va_mask &= ~AT_SIZE;
263 			bf.l_type = F_WRLCK;
264 			bf.l_whence = 0;
265 			bf.l_start = (off64_t)va.va_size;
266 			bf.l_len = 0;
267 			bf.l_sysid = 0;
268 			bf.l_pid = 0;
269 
270 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
271 			    (offset_t)va.va_size, cr, &ct);
272 		}
273 		if (in_crit)
274 			nbl_end_crit(vp);
275 	} else
276 		error = 0;
277 
278 	/*
279 	 * Do the setattr.
280 	 */
281 	if (!error && va.va_mask) {
282 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
283 	}
284 
285 	/*
286 	 * check if the monitor on either vop_space or vop_setattr detected
287 	 * a delegation conflict and if so, mark the thread flag as
288 	 * wouldblock so that the response is dropped and the client will
289 	 * try again.
290 	 */
291 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
292 		VN_RELE(vp);
293 		curthread->t_flag |= T_WOULDBLOCK;
294 		return;
295 	}
296 
297 	if (!error) {
298 		va.va_mask = AT_ALL;	/* get everything */
299 
300 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
301 
302 		/* check for overflows */
303 		if (!error) {
304 			acl_perm(vp, exi, &va, cr);
305 			error = vattr_to_nattr(&va, &ns->ns_attr);
306 		}
307 	}
308 
309 	ct.cc_flags = 0;
310 
311 	/*
312 	 * Force modified metadata out to stable storage.
313 	 */
314 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
315 
316 	VN_RELE(vp);
317 
318 	ns->ns_status = puterrno(error);
319 }
320 void *
321 rfs_setattr_getfh(struct nfssaargs *args)
322 {
323 	return (&args->saa_fh);
324 }
325 
326 /*
327  * Directory lookup.
328  * Returns an fhandle and file attributes for file name in a directory.
329  */
330 /* ARGSUSED */
331 void
332 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
333 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
334 {
335 	int error;
336 	vnode_t *dvp;
337 	vnode_t *vp;
338 	struct vattr va;
339 	fhandle_t *fhp = da->da_fhandle;
340 	struct sec_ol sec = {0, 0};
341 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
342 	char *name;
343 	struct sockaddr *ca;
344 
345 	/*
346 	 * Trusted Extension doesn't support NFSv2. MOUNT
347 	 * will reject v2 clients. Need to prevent v2 client
348 	 * access via WebNFS here.
349 	 */
350 	if (is_system_labeled() && req->rq_vers == 2) {
351 		dr->dr_status = NFSERR_ACCES;
352 		return;
353 	}
354 
355 	/*
356 	 * Disallow NULL paths
357 	 */
358 	if (da->da_name == NULL || *da->da_name == '\0') {
359 		dr->dr_status = NFSERR_ACCES;
360 		return;
361 	}
362 
363 	/*
364 	 * Allow lookups from the root - the default
365 	 * location of the public filehandle.
366 	 */
367 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
368 		dvp = rootdir;
369 		VN_HOLD(dvp);
370 	} else {
371 		dvp = nfs_fhtovp(fhp, exi);
372 		if (dvp == NULL) {
373 			dr->dr_status = NFSERR_STALE;
374 			return;
375 		}
376 	}
377 
378 	/*
379 	 * Not allow lookup beyond root.
380 	 * If the filehandle matches a filehandle of the exi,
381 	 * then the ".." refers beyond the root of an exported filesystem.
382 	 */
383 	if (strcmp(da->da_name, "..") == 0 &&
384 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
385 		VN_RELE(dvp);
386 		dr->dr_status = NFSERR_NOENT;
387 		return;
388 	}
389 
390 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
391 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
392 	    MAXPATHLEN);
393 
394 	if (name == NULL) {
395 		dr->dr_status = NFSERR_ACCES;
396 		return;
397 	}
398 
399 	/*
400 	 * If the public filehandle is used then allow
401 	 * a multi-component lookup, i.e. evaluate
402 	 * a pathname and follow symbolic links if
403 	 * necessary.
404 	 *
405 	 * This may result in a vnode in another filesystem
406 	 * which is OK as long as the filesystem is exported.
407 	 */
408 	if (PUBLIC_FH2(fhp)) {
409 		publicfh_flag = TRUE;
410 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
411 		    &sec);
412 	} else {
413 		/*
414 		 * Do a normal single component lookup.
415 		 */
416 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
417 		    NULL, NULL, NULL);
418 	}
419 
420 	if (name != da->da_name)
421 		kmem_free(name, MAXPATHLEN);
422 
423 
424 	if (!error) {
425 		va.va_mask = AT_ALL;	/* we want everything */
426 
427 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
428 
429 		/* check for overflows */
430 		if (!error) {
431 			acl_perm(vp, exi, &va, cr);
432 			error = vattr_to_nattr(&va, &dr->dr_attr);
433 			if (!error) {
434 				if (sec.sec_flags & SEC_QUERY)
435 					error = makefh_ol(&dr->dr_fhandle, exi,
436 					    sec.sec_index);
437 				else {
438 					error = makefh(&dr->dr_fhandle, vp,
439 					    exi);
440 					if (!error && publicfh_flag &&
441 					    !chk_clnt_sec(exi, req))
442 						auth_weak = TRUE;
443 				}
444 			}
445 		}
446 		VN_RELE(vp);
447 	}
448 
449 	VN_RELE(dvp);
450 
451 	/*
452 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
453 	 * and have obtained a new exportinfo in exi which needs to be
454 	 * released. Note the the original exportinfo pointed to by exi
455 	 * will be released by the caller, comon_dispatch.
456 	 */
457 	if (publicfh_flag && exi != NULL)
458 		exi_rele(exi);
459 
460 	/*
461 	 * If it's public fh, no 0x81, and client's flavor is
462 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
463 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
464 	 */
465 	if (auth_weak)
466 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
467 	else
468 		dr->dr_status = puterrno(error);
469 }
470 void *
471 rfs_lookup_getfh(struct nfsdiropargs *da)
472 {
473 	return (da->da_fhandle);
474 }
475 
476 /*
477  * Read symbolic link.
478  * Returns the string in the symbolic link at the given fhandle.
479  */
480 /* ARGSUSED */
481 void
482 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
483 	struct svc_req *req, cred_t *cr)
484 {
485 	int error;
486 	struct iovec iov;
487 	struct uio uio;
488 	vnode_t *vp;
489 	struct vattr va;
490 	struct sockaddr *ca;
491 	char *name = NULL;
492 
493 	vp = nfs_fhtovp(fhp, exi);
494 	if (vp == NULL) {
495 		rl->rl_data = NULL;
496 		rl->rl_status = NFSERR_STALE;
497 		return;
498 	}
499 
500 	va.va_mask = AT_MODE;
501 
502 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
503 
504 	if (error) {
505 		VN_RELE(vp);
506 		rl->rl_data = NULL;
507 		rl->rl_status = puterrno(error);
508 		return;
509 	}
510 
511 	if (MANDLOCK(vp, va.va_mode)) {
512 		VN_RELE(vp);
513 		rl->rl_data = NULL;
514 		rl->rl_status = NFSERR_ACCES;
515 		return;
516 	}
517 
518 	/*
519 	 * XNFS and RFC1094 require us to return ENXIO if argument
520 	 * is not a link. BUGID 1138002.
521 	 */
522 	if (vp->v_type != VLNK) {
523 		VN_RELE(vp);
524 		rl->rl_data = NULL;
525 		rl->rl_status = NFSERR_NXIO;
526 		return;
527 	}
528 
529 	/*
530 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
531 	 */
532 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
533 
534 	/*
535 	 * Set up io vector to read sym link data
536 	 */
537 	iov.iov_base = rl->rl_data;
538 	iov.iov_len = NFS_MAXPATHLEN;
539 	uio.uio_iov = &iov;
540 	uio.uio_iovcnt = 1;
541 	uio.uio_segflg = UIO_SYSSPACE;
542 	uio.uio_extflg = UIO_COPY_CACHED;
543 	uio.uio_loffset = (offset_t)0;
544 	uio.uio_resid = NFS_MAXPATHLEN;
545 
546 	/*
547 	 * Do the readlink.
548 	 */
549 	error = VOP_READLINK(vp, &uio, cr, NULL);
550 
551 	VN_RELE(vp);
552 
553 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
554 	rl->rl_data[rl->rl_count] = '\0';
555 
556 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
557 	name = nfscmd_convname(ca, exi, rl->rl_data,
558 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
559 
560 	if (name != NULL && name != rl->rl_data) {
561 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
562 		rl->rl_data = name;
563 	}
564 
565 	/*
566 	 * XNFS and RFC1094 require us to return ENXIO if argument
567 	 * is not a link. UFS returns EINVAL if this is the case,
568 	 * so we do the mapping here. BUGID 1138002.
569 	 */
570 	if (error == EINVAL)
571 		rl->rl_status = NFSERR_NXIO;
572 	else
573 		rl->rl_status = puterrno(error);
574 
575 }
576 void *
577 rfs_readlink_getfh(fhandle_t *fhp)
578 {
579 	return (fhp);
580 }
581 /*
582  * Free data allocated by rfs_readlink
583  */
584 void
585 rfs_rlfree(struct nfsrdlnres *rl)
586 {
587 	if (rl->rl_data != NULL)
588 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
589 }
590 
591 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
592 
593 /*
594  * Read data.
595  * Returns some data read from the file at the given fhandle.
596  */
597 /* ARGSUSED */
598 void
599 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
600 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
601 {
602 	vnode_t *vp;
603 	int error;
604 	struct vattr va;
605 	struct iovec iov;
606 	struct uio uio;
607 	mblk_t *mp;
608 	int alloc_err = 0;
609 	int in_crit = 0;
610 	caller_context_t ct;
611 
612 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
613 	if (vp == NULL) {
614 		rr->rr_data = NULL;
615 		rr->rr_status = NFSERR_STALE;
616 		return;
617 	}
618 
619 	if (vp->v_type != VREG) {
620 		VN_RELE(vp);
621 		rr->rr_data = NULL;
622 		rr->rr_status = NFSERR_ISDIR;
623 		return;
624 	}
625 
626 	ct.cc_sysid = 0;
627 	ct.cc_pid = 0;
628 	ct.cc_caller_id = nfs2_srv_caller_id;
629 	ct.cc_flags = CC_DONTBLOCK;
630 
631 	/*
632 	 * Enter the critical region before calling VOP_RWLOCK
633 	 * to avoid a deadlock with write requests.
634 	 */
635 	if (nbl_need_check(vp)) {
636 		nbl_start_crit(vp, RW_READER);
637 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
638 		    0, NULL)) {
639 			nbl_end_crit(vp);
640 			VN_RELE(vp);
641 			rr->rr_data = NULL;
642 			rr->rr_status = NFSERR_ACCES;
643 			return;
644 		}
645 		in_crit = 1;
646 	}
647 
648 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
649 
650 	/* check if a monitor detected a delegation conflict */
651 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
652 		VN_RELE(vp);
653 		/* mark as wouldblock so response is dropped */
654 		curthread->t_flag |= T_WOULDBLOCK;
655 
656 		rr->rr_data = NULL;
657 		return;
658 	}
659 
660 	va.va_mask = AT_ALL;
661 
662 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
663 
664 	if (error) {
665 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
666 		if (in_crit)
667 			nbl_end_crit(vp);
668 
669 		VN_RELE(vp);
670 		rr->rr_data = NULL;
671 		rr->rr_status = puterrno(error);
672 
673 		return;
674 	}
675 
676 	/*
677 	 * This is a kludge to allow reading of files created
678 	 * with no read permission.  The owner of the file
679 	 * is always allowed to read it.
680 	 */
681 	if (crgetuid(cr) != va.va_uid) {
682 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
683 
684 		if (error) {
685 			/*
686 			 * Exec is the same as read over the net because
687 			 * of demand loading.
688 			 */
689 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
690 		}
691 		if (error) {
692 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
693 			if (in_crit)
694 				nbl_end_crit(vp);
695 			VN_RELE(vp);
696 			rr->rr_data = NULL;
697 			rr->rr_status = puterrno(error);
698 
699 			return;
700 		}
701 	}
702 
703 	if (MANDLOCK(vp, va.va_mode)) {
704 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
705 		if (in_crit)
706 			nbl_end_crit(vp);
707 
708 		VN_RELE(vp);
709 		rr->rr_data = NULL;
710 		rr->rr_status = NFSERR_ACCES;
711 
712 		return;
713 	}
714 
715 	rr->rr_ok.rrok_wlist_len = 0;
716 	rr->rr_ok.rrok_wlist = NULL;
717 
718 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
719 		rr->rr_count = 0;
720 		rr->rr_data = NULL;
721 		/*
722 		 * In this case, status is NFS_OK, but there is no data
723 		 * to encode. So set rr_mp to NULL.
724 		 */
725 		rr->rr_mp = NULL;
726 		goto done;
727 	}
728 
729 	if (ra->ra_wlist) {
730 		mp = NULL;
731 		rr->rr_mp = NULL;
732 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
733 	} else {
734 		/*
735 		 * mp will contain the data to be sent out in the read reply.
736 		 * This will be freed after the reply has been sent out (by the
737 		 * driver).
738 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
739 		 * that the call to xdrmblk_putmblk() never fails.
740 		 */
741 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
742 		    &alloc_err);
743 		ASSERT(mp != NULL);
744 		ASSERT(alloc_err == 0);
745 
746 		rr->rr_mp = mp;
747 
748 		/*
749 		 * Set up io vector
750 		 */
751 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
752 		iov.iov_len = ra->ra_count;
753 	}
754 
755 	uio.uio_iov = &iov;
756 	uio.uio_iovcnt = 1;
757 	uio.uio_segflg = UIO_SYSSPACE;
758 	uio.uio_extflg = UIO_COPY_CACHED;
759 	uio.uio_loffset = (offset_t)ra->ra_offset;
760 	uio.uio_resid = ra->ra_count;
761 
762 	error = VOP_READ(vp, &uio, 0, cr, &ct);
763 
764 	if (error) {
765 		if (mp)
766 			freeb(mp);
767 
768 		/*
769 		 * check if a monitor detected a delegation conflict and
770 		 * mark as wouldblock so response is dropped
771 		 */
772 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
773 			curthread->t_flag |= T_WOULDBLOCK;
774 		else
775 			rr->rr_status = puterrno(error);
776 
777 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
778 		if (in_crit)
779 			nbl_end_crit(vp);
780 
781 		VN_RELE(vp);
782 		rr->rr_data = NULL;
783 
784 		return;
785 	}
786 
787 	/*
788 	 * Get attributes again so we can send the latest access
789 	 * time to the client side for his cache.
790 	 */
791 	va.va_mask = AT_ALL;
792 
793 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
794 
795 	if (error) {
796 		if (mp)
797 			freeb(mp);
798 
799 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
800 		if (in_crit)
801 			nbl_end_crit(vp);
802 
803 		VN_RELE(vp);
804 		rr->rr_data = NULL;
805 		rr->rr_status = puterrno(error);
806 
807 		return;
808 	}
809 
810 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
811 
812 	if (mp) {
813 		rr->rr_data = (char *)mp->b_datap->db_base;
814 	} else {
815 		if (ra->ra_wlist) {
816 			rr->rr_data = (caddr_t)iov.iov_base;
817 			if (!rdma_setup_read_data2(ra, rr)) {
818 				rr->rr_data = NULL;
819 				rr->rr_status = puterrno(NFSERR_INVAL);
820 			}
821 		}
822 	}
823 done:
824 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
825 	if (in_crit)
826 		nbl_end_crit(vp);
827 
828 	acl_perm(vp, exi, &va, cr);
829 
830 	/* check for overflows */
831 	error = vattr_to_nattr(&va, &rr->rr_attr);
832 
833 	VN_RELE(vp);
834 
835 	rr->rr_status = puterrno(error);
836 }
837 
838 /*
839  * Free data allocated by rfs_read
840  */
841 void
842 rfs_rdfree(struct nfsrdresult *rr)
843 {
844 	mblk_t *mp;
845 
846 	if (rr->rr_status == NFS_OK) {
847 		mp = rr->rr_mp;
848 		if (mp != NULL)
849 			freeb(mp);
850 	}
851 }
852 
853 void *
854 rfs_read_getfh(struct nfsreadargs *ra)
855 {
856 	return (&ra->ra_fhandle);
857 }
858 
859 #define	MAX_IOVECS	12
860 
861 #ifdef DEBUG
862 static int rfs_write_sync_hits = 0;
863 static int rfs_write_sync_misses = 0;
864 #endif
865 
866 /*
867  * Write data to file.
868  * Returns attributes of a file after writing some data to it.
869  *
870  * Any changes made here, especially in error handling might have
871  * to also be done in rfs_write (which clusters write requests).
872  */
873 void
874 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
875 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
876 {
877 	int error;
878 	vnode_t *vp;
879 	rlim64_t rlimit;
880 	struct vattr va;
881 	struct uio uio;
882 	struct iovec iov[MAX_IOVECS];
883 	mblk_t *m;
884 	struct iovec *iovp;
885 	int iovcnt;
886 	cred_t *savecred;
887 	int in_crit = 0;
888 	caller_context_t ct;
889 
890 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
891 	if (vp == NULL) {
892 		ns->ns_status = NFSERR_STALE;
893 		return;
894 	}
895 
896 	if (rdonly(exi, req)) {
897 		VN_RELE(vp);
898 		ns->ns_status = NFSERR_ROFS;
899 		return;
900 	}
901 
902 	if (vp->v_type != VREG) {
903 		VN_RELE(vp);
904 		ns->ns_status = NFSERR_ISDIR;
905 		return;
906 	}
907 
908 	ct.cc_sysid = 0;
909 	ct.cc_pid = 0;
910 	ct.cc_caller_id = nfs2_srv_caller_id;
911 	ct.cc_flags = CC_DONTBLOCK;
912 
913 	va.va_mask = AT_UID|AT_MODE;
914 
915 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
916 
917 	if (error) {
918 		VN_RELE(vp);
919 		ns->ns_status = puterrno(error);
920 
921 		return;
922 	}
923 
924 	if (crgetuid(cr) != va.va_uid) {
925 		/*
926 		 * This is a kludge to allow writes of files created
927 		 * with read only permission.  The owner of the file
928 		 * is always allowed to write it.
929 		 */
930 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
931 
932 		if (error) {
933 			VN_RELE(vp);
934 			ns->ns_status = puterrno(error);
935 			return;
936 		}
937 	}
938 
939 	/*
940 	 * Can't access a mandatory lock file.  This might cause
941 	 * the NFS service thread to block forever waiting for a
942 	 * lock to be released that will never be released.
943 	 */
944 	if (MANDLOCK(vp, va.va_mode)) {
945 		VN_RELE(vp);
946 		ns->ns_status = NFSERR_ACCES;
947 		return;
948 	}
949 
950 	/*
951 	 * We have to enter the critical region before calling VOP_RWLOCK
952 	 * to avoid a deadlock with ufs.
953 	 */
954 	if (nbl_need_check(vp)) {
955 		nbl_start_crit(vp, RW_READER);
956 		in_crit = 1;
957 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
958 		    wa->wa_count, 0, NULL)) {
959 			error = EACCES;
960 			goto out;
961 		}
962 	}
963 
964 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
965 
966 	/* check if a monitor detected a delegation conflict */
967 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
968 		VN_RELE(vp);
969 		/* mark as wouldblock so response is dropped */
970 		curthread->t_flag |= T_WOULDBLOCK;
971 		return;
972 	}
973 
974 	if (wa->wa_data || wa->wa_rlist) {
975 		/* Do the RDMA thing if necessary */
976 		if (wa->wa_rlist) {
977 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
978 			iov[0].iov_len = wa->wa_count;
979 		} else  {
980 			iov[0].iov_base = wa->wa_data;
981 			iov[0].iov_len = wa->wa_count;
982 		}
983 		uio.uio_iov = iov;
984 		uio.uio_iovcnt = 1;
985 		uio.uio_segflg = UIO_SYSSPACE;
986 		uio.uio_extflg = UIO_COPY_DEFAULT;
987 		uio.uio_loffset = (offset_t)wa->wa_offset;
988 		uio.uio_resid = wa->wa_count;
989 		/*
990 		 * The limit is checked on the client. We
991 		 * should allow any size writes here.
992 		 */
993 		uio.uio_llimit = curproc->p_fsz_ctl;
994 		rlimit = uio.uio_llimit - wa->wa_offset;
995 		if (rlimit < (rlim64_t)uio.uio_resid)
996 			uio.uio_resid = (uint_t)rlimit;
997 
998 		/*
999 		 * for now we assume no append mode
1000 		 */
1001 		/*
1002 		 * We're changing creds because VM may fault and we need
1003 		 * the cred of the current thread to be used if quota
1004 		 * checking is enabled.
1005 		 */
1006 		savecred = curthread->t_cred;
1007 		curthread->t_cred = cr;
1008 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1009 		curthread->t_cred = savecred;
1010 	} else {
1011 		iovcnt = 0;
1012 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1013 			iovcnt++;
1014 		if (iovcnt <= MAX_IOVECS) {
1015 #ifdef DEBUG
1016 			rfs_write_sync_hits++;
1017 #endif
1018 			iovp = iov;
1019 		} else {
1020 #ifdef DEBUG
1021 			rfs_write_sync_misses++;
1022 #endif
1023 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1024 		}
1025 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1026 		uio.uio_iov = iovp;
1027 		uio.uio_iovcnt = iovcnt;
1028 		uio.uio_segflg = UIO_SYSSPACE;
1029 		uio.uio_extflg = UIO_COPY_DEFAULT;
1030 		uio.uio_loffset = (offset_t)wa->wa_offset;
1031 		uio.uio_resid = wa->wa_count;
1032 		/*
1033 		 * The limit is checked on the client. We
1034 		 * should allow any size writes here.
1035 		 */
1036 		uio.uio_llimit = curproc->p_fsz_ctl;
1037 		rlimit = uio.uio_llimit - wa->wa_offset;
1038 		if (rlimit < (rlim64_t)uio.uio_resid)
1039 			uio.uio_resid = (uint_t)rlimit;
1040 
1041 		/*
1042 		 * For now we assume no append mode.
1043 		 */
1044 		/*
1045 		 * We're changing creds because VM may fault and we need
1046 		 * the cred of the current thread to be used if quota
1047 		 * checking is enabled.
1048 		 */
1049 		savecred = curthread->t_cred;
1050 		curthread->t_cred = cr;
1051 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1052 		curthread->t_cred = savecred;
1053 
1054 		if (iovp != iov)
1055 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1056 	}
1057 
1058 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1059 
1060 	if (!error) {
1061 		/*
1062 		 * Get attributes again so we send the latest mod
1063 		 * time to the client side for his cache.
1064 		 */
1065 		va.va_mask = AT_ALL;	/* now we want everything */
1066 
1067 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1068 
1069 		/* check for overflows */
1070 		if (!error) {
1071 			acl_perm(vp, exi, &va, cr);
1072 			error = vattr_to_nattr(&va, &ns->ns_attr);
1073 		}
1074 	}
1075 
1076 out:
1077 	if (in_crit)
1078 		nbl_end_crit(vp);
1079 	VN_RELE(vp);
1080 
1081 	/* check if a monitor detected a delegation conflict */
1082 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1083 		/* mark as wouldblock so response is dropped */
1084 		curthread->t_flag |= T_WOULDBLOCK;
1085 	else
1086 		ns->ns_status = puterrno(error);
1087 
1088 }
1089 
1090 struct rfs_async_write {
1091 	struct nfswriteargs *wa;
1092 	struct nfsattrstat *ns;
1093 	struct svc_req *req;
1094 	cred_t *cr;
1095 	kthread_t *thread;
1096 	struct rfs_async_write *list;
1097 };
1098 
1099 struct rfs_async_write_list {
1100 	fhandle_t *fhp;
1101 	kcondvar_t cv;
1102 	struct rfs_async_write *list;
1103 	struct rfs_async_write_list *next;
1104 };
1105 
1106 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1107 static kmutex_t rfs_async_write_lock;
1108 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1109 
1110 #define	MAXCLIOVECS	42
1111 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1112 
1113 #ifdef DEBUG
1114 static int rfs_write_hits = 0;
1115 static int rfs_write_misses = 0;
1116 #endif
1117 
1118 /*
1119  * Write data to file.
1120  * Returns attributes of a file after writing some data to it.
1121  */
1122 void
1123 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1124 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1125 {
1126 	int error;
1127 	vnode_t *vp;
1128 	rlim64_t rlimit;
1129 	struct vattr va;
1130 	struct uio uio;
1131 	struct rfs_async_write_list *lp;
1132 	struct rfs_async_write_list *nlp;
1133 	struct rfs_async_write *rp;
1134 	struct rfs_async_write *nrp;
1135 	struct rfs_async_write *trp;
1136 	struct rfs_async_write *lrp;
1137 	int data_written;
1138 	int iovcnt;
1139 	mblk_t *m;
1140 	struct iovec *iovp;
1141 	struct iovec *niovp;
1142 	struct iovec iov[MAXCLIOVECS];
1143 	int count;
1144 	int rcount;
1145 	uint_t off;
1146 	uint_t len;
1147 	struct rfs_async_write nrpsp;
1148 	struct rfs_async_write_list nlpsp;
1149 	ushort_t t_flag;
1150 	cred_t *savecred;
1151 	int in_crit = 0;
1152 	caller_context_t ct;
1153 
1154 	if (!rfs_write_async) {
1155 		rfs_write_sync(wa, ns, exi, req, cr);
1156 		return;
1157 	}
1158 
1159 	/*
1160 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1161 	 * is considered an OK.
1162 	 */
1163 	ns->ns_status = RFSWRITE_INITVAL;
1164 
1165 	nrp = &nrpsp;
1166 	nrp->wa = wa;
1167 	nrp->ns = ns;
1168 	nrp->req = req;
1169 	nrp->cr = cr;
1170 	nrp->thread = curthread;
1171 
1172 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1173 
1174 	/*
1175 	 * Look to see if there is already a cluster started
1176 	 * for this file.
1177 	 */
1178 	mutex_enter(&rfs_async_write_lock);
1179 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1180 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1181 		    sizeof (fhandle_t)) == 0)
1182 			break;
1183 	}
1184 
1185 	/*
1186 	 * If lp is non-NULL, then there is already a cluster
1187 	 * started.  We need to place ourselves in the cluster
1188 	 * list in the right place as determined by starting
1189 	 * offset.  Conflicts with non-blocking mandatory locked
1190 	 * regions will be checked when the cluster is processed.
1191 	 */
1192 	if (lp != NULL) {
1193 		rp = lp->list;
1194 		trp = NULL;
1195 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1196 			trp = rp;
1197 			rp = rp->list;
1198 		}
1199 		nrp->list = rp;
1200 		if (trp == NULL)
1201 			lp->list = nrp;
1202 		else
1203 			trp->list = nrp;
1204 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1205 			cv_wait(&lp->cv, &rfs_async_write_lock);
1206 		mutex_exit(&rfs_async_write_lock);
1207 
1208 		return;
1209 	}
1210 
1211 	/*
1212 	 * No cluster started yet, start one and add ourselves
1213 	 * to the list of clusters.
1214 	 */
1215 	nrp->list = NULL;
1216 
1217 	nlp = &nlpsp;
1218 	nlp->fhp = &wa->wa_fhandle;
1219 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1220 	nlp->list = nrp;
1221 	nlp->next = NULL;
1222 
1223 	if (rfs_async_write_head == NULL) {
1224 		rfs_async_write_head = nlp;
1225 	} else {
1226 		lp = rfs_async_write_head;
1227 		while (lp->next != NULL)
1228 			lp = lp->next;
1229 		lp->next = nlp;
1230 	}
1231 	mutex_exit(&rfs_async_write_lock);
1232 
1233 	/*
1234 	 * Convert the file handle common to all of the requests
1235 	 * in this cluster to a vnode.
1236 	 */
1237 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1238 	if (vp == NULL) {
1239 		mutex_enter(&rfs_async_write_lock);
1240 		if (rfs_async_write_head == nlp)
1241 			rfs_async_write_head = nlp->next;
1242 		else {
1243 			lp = rfs_async_write_head;
1244 			while (lp->next != nlp)
1245 				lp = lp->next;
1246 			lp->next = nlp->next;
1247 		}
1248 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1249 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1250 			rp->ns->ns_status = NFSERR_STALE;
1251 			rp->thread->t_flag |= t_flag;
1252 		}
1253 		cv_broadcast(&nlp->cv);
1254 		mutex_exit(&rfs_async_write_lock);
1255 
1256 		return;
1257 	}
1258 
1259 	/*
1260 	 * Can only write regular files.  Attempts to write any
1261 	 * other file types fail with EISDIR.
1262 	 */
1263 	if (vp->v_type != VREG) {
1264 		VN_RELE(vp);
1265 		mutex_enter(&rfs_async_write_lock);
1266 		if (rfs_async_write_head == nlp)
1267 			rfs_async_write_head = nlp->next;
1268 		else {
1269 			lp = rfs_async_write_head;
1270 			while (lp->next != nlp)
1271 				lp = lp->next;
1272 			lp->next = nlp->next;
1273 		}
1274 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1275 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1276 			rp->ns->ns_status = NFSERR_ISDIR;
1277 			rp->thread->t_flag |= t_flag;
1278 		}
1279 		cv_broadcast(&nlp->cv);
1280 		mutex_exit(&rfs_async_write_lock);
1281 
1282 		return;
1283 	}
1284 
1285 	/*
1286 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1287 	 * deadlock with ufs.
1288 	 */
1289 	if (nbl_need_check(vp)) {
1290 		nbl_start_crit(vp, RW_READER);
1291 		in_crit = 1;
1292 	}
1293 
1294 	ct.cc_sysid = 0;
1295 	ct.cc_pid = 0;
1296 	ct.cc_caller_id = nfs2_srv_caller_id;
1297 	ct.cc_flags = CC_DONTBLOCK;
1298 
1299 	/*
1300 	 * Lock the file for writing.  This operation provides
1301 	 * the delay which allows clusters to grow.
1302 	 */
1303 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1304 
1305 	/* check if a monitor detected a delegation conflict */
1306 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1307 		VN_RELE(vp);
1308 		/* mark as wouldblock so response is dropped */
1309 		curthread->t_flag |= T_WOULDBLOCK;
1310 		mutex_enter(&rfs_async_write_lock);
1311 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1312 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1313 				rp->ns->ns_status = puterrno(error);
1314 				rp->thread->t_flag |= T_WOULDBLOCK;
1315 			}
1316 		}
1317 		cv_broadcast(&nlp->cv);
1318 		mutex_exit(&rfs_async_write_lock);
1319 
1320 		return;
1321 	}
1322 
1323 	/*
1324 	 * Disconnect this cluster from the list of clusters.
1325 	 * The cluster that is being dealt with must be fixed
1326 	 * in size after this point, so there is no reason
1327 	 * to leave it on the list so that new requests can
1328 	 * find it.
1329 	 *
1330 	 * The algorithm is that the first write request will
1331 	 * create a cluster, convert the file handle to a
1332 	 * vnode pointer, and then lock the file for writing.
1333 	 * This request is not likely to be clustered with
1334 	 * any others.  However, the next request will create
1335 	 * a new cluster and be blocked in VOP_RWLOCK while
1336 	 * the first request is being processed.  This delay
1337 	 * will allow more requests to be clustered in this
1338 	 * second cluster.
1339 	 */
1340 	mutex_enter(&rfs_async_write_lock);
1341 	if (rfs_async_write_head == nlp)
1342 		rfs_async_write_head = nlp->next;
1343 	else {
1344 		lp = rfs_async_write_head;
1345 		while (lp->next != nlp)
1346 			lp = lp->next;
1347 		lp->next = nlp->next;
1348 	}
1349 	mutex_exit(&rfs_async_write_lock);
1350 
1351 	/*
1352 	 * Step through the list of requests in this cluster.
1353 	 * We need to check permissions to make sure that all
1354 	 * of the requests have sufficient permission to write
1355 	 * the file.  A cluster can be composed of requests
1356 	 * from different clients and different users on each
1357 	 * client.
1358 	 *
1359 	 * As a side effect, we also calculate the size of the
1360 	 * byte range that this cluster encompasses.
1361 	 */
1362 	rp = nlp->list;
1363 	off = rp->wa->wa_offset;
1364 	len = (uint_t)0;
1365 	do {
1366 		if (rdonly(exi, rp->req)) {
1367 			rp->ns->ns_status = NFSERR_ROFS;
1368 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1369 			rp->thread->t_flag |= t_flag;
1370 			continue;
1371 		}
1372 
1373 		va.va_mask = AT_UID|AT_MODE;
1374 
1375 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1376 
1377 		if (!error) {
1378 			if (crgetuid(rp->cr) != va.va_uid) {
1379 				/*
1380 				 * This is a kludge to allow writes of files
1381 				 * created with read only permission.  The
1382 				 * owner of the file is always allowed to
1383 				 * write it.
1384 				 */
1385 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1386 			}
1387 			if (!error && MANDLOCK(vp, va.va_mode))
1388 				error = EACCES;
1389 		}
1390 
1391 		/*
1392 		 * Check for a conflict with a nbmand-locked region.
1393 		 */
1394 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1395 		    rp->wa->wa_count, 0, NULL)) {
1396 			error = EACCES;
1397 		}
1398 
1399 		if (error) {
1400 			rp->ns->ns_status = puterrno(error);
1401 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1402 			rp->thread->t_flag |= t_flag;
1403 			continue;
1404 		}
1405 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1406 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1407 	} while ((rp = rp->list) != NULL);
1408 
1409 	/*
1410 	 * Step through the cluster attempting to gather as many
1411 	 * requests which are contiguous as possible.  These
1412 	 * contiguous requests are handled via one call to VOP_WRITE
1413 	 * instead of different calls to VOP_WRITE.  We also keep
1414 	 * track of the fact that any data was written.
1415 	 */
1416 	rp = nlp->list;
1417 	data_written = 0;
1418 	do {
1419 		/*
1420 		 * Skip any requests which are already marked as having an
1421 		 * error.
1422 		 */
1423 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1424 			rp = rp->list;
1425 			continue;
1426 		}
1427 
1428 		/*
1429 		 * Count the number of iovec's which are required
1430 		 * to handle this set of requests.  One iovec is
1431 		 * needed for each data buffer, whether addressed
1432 		 * by wa_data or by the b_rptr pointers in the
1433 		 * mblk chains.
1434 		 */
1435 		iovcnt = 0;
1436 		lrp = rp;
1437 		for (;;) {
1438 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1439 				iovcnt++;
1440 			else {
1441 				m = lrp->wa->wa_mblk;
1442 				while (m != NULL) {
1443 					iovcnt++;
1444 					m = m->b_cont;
1445 				}
1446 			}
1447 			if (lrp->list == NULL ||
1448 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1449 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1450 			    lrp->list->wa->wa_offset) {
1451 				lrp = lrp->list;
1452 				break;
1453 			}
1454 			lrp = lrp->list;
1455 		}
1456 
1457 		if (iovcnt <= MAXCLIOVECS) {
1458 #ifdef DEBUG
1459 			rfs_write_hits++;
1460 #endif
1461 			niovp = iov;
1462 		} else {
1463 #ifdef DEBUG
1464 			rfs_write_misses++;
1465 #endif
1466 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1467 		}
1468 		/*
1469 		 * Put together the scatter/gather iovecs.
1470 		 */
1471 		iovp = niovp;
1472 		trp = rp;
1473 		count = 0;
1474 		do {
1475 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1476 				if (trp->wa->wa_rlist) {
1477 					iovp->iov_base =
1478 					    (char *)((trp->wa->wa_rlist)->
1479 					    u.c_daddr3);
1480 					iovp->iov_len = trp->wa->wa_count;
1481 				} else  {
1482 					iovp->iov_base = trp->wa->wa_data;
1483 					iovp->iov_len = trp->wa->wa_count;
1484 				}
1485 				iovp++;
1486 			} else {
1487 				m = trp->wa->wa_mblk;
1488 				rcount = trp->wa->wa_count;
1489 				while (m != NULL) {
1490 					iovp->iov_base = (caddr_t)m->b_rptr;
1491 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1492 					rcount -= iovp->iov_len;
1493 					if (rcount < 0)
1494 						iovp->iov_len += rcount;
1495 					iovp++;
1496 					if (rcount <= 0)
1497 						break;
1498 					m = m->b_cont;
1499 				}
1500 			}
1501 			count += trp->wa->wa_count;
1502 			trp = trp->list;
1503 		} while (trp != lrp);
1504 
1505 		uio.uio_iov = niovp;
1506 		uio.uio_iovcnt = iovcnt;
1507 		uio.uio_segflg = UIO_SYSSPACE;
1508 		uio.uio_extflg = UIO_COPY_DEFAULT;
1509 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1510 		uio.uio_resid = count;
1511 		/*
1512 		 * The limit is checked on the client. We
1513 		 * should allow any size writes here.
1514 		 */
1515 		uio.uio_llimit = curproc->p_fsz_ctl;
1516 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1517 		if (rlimit < (rlim64_t)uio.uio_resid)
1518 			uio.uio_resid = (uint_t)rlimit;
1519 
1520 		/*
1521 		 * For now we assume no append mode.
1522 		 */
1523 
1524 		/*
1525 		 * We're changing creds because VM may fault
1526 		 * and we need the cred of the current
1527 		 * thread to be used if quota * checking is
1528 		 * enabled.
1529 		 */
1530 		savecred = curthread->t_cred;
1531 		curthread->t_cred = cr;
1532 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1533 		curthread->t_cred = savecred;
1534 
1535 		/* check if a monitor detected a delegation conflict */
1536 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1537 			/* mark as wouldblock so response is dropped */
1538 			curthread->t_flag |= T_WOULDBLOCK;
1539 
1540 		if (niovp != iov)
1541 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1542 
1543 		if (!error) {
1544 			data_written = 1;
1545 			/*
1546 			 * Get attributes again so we send the latest mod
1547 			 * time to the client side for his cache.
1548 			 */
1549 			va.va_mask = AT_ALL;	/* now we want everything */
1550 
1551 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1552 
1553 			if (!error)
1554 				acl_perm(vp, exi, &va, rp->cr);
1555 		}
1556 
1557 		/*
1558 		 * Fill in the status responses for each request
1559 		 * which was just handled.  Also, copy the latest
1560 		 * attributes in to the attribute responses if
1561 		 * appropriate.
1562 		 */
1563 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1564 		do {
1565 			rp->thread->t_flag |= t_flag;
1566 			/* check for overflows */
1567 			if (!error) {
1568 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1569 			}
1570 			rp->ns->ns_status = puterrno(error);
1571 			rp = rp->list;
1572 		} while (rp != lrp);
1573 	} while (rp != NULL);
1574 
1575 	/*
1576 	 * If any data was written at all, then we need to flush
1577 	 * the data and metadata to stable storage.
1578 	 */
1579 	if (data_written) {
1580 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1581 
1582 		if (!error) {
1583 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1584 		}
1585 	}
1586 
1587 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1588 
1589 	if (in_crit)
1590 		nbl_end_crit(vp);
1591 	VN_RELE(vp);
1592 
1593 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1594 	mutex_enter(&rfs_async_write_lock);
1595 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1596 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1597 			rp->ns->ns_status = puterrno(error);
1598 			rp->thread->t_flag |= t_flag;
1599 		}
1600 	}
1601 	cv_broadcast(&nlp->cv);
1602 	mutex_exit(&rfs_async_write_lock);
1603 
1604 }
1605 
1606 void *
1607 rfs_write_getfh(struct nfswriteargs *wa)
1608 {
1609 	return (&wa->wa_fhandle);
1610 }
1611 
1612 /*
1613  * Create a file.
1614  * Creates a file with given attributes and returns those attributes
1615  * and an fhandle for the new file.
1616  */
1617 void
1618 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1619 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1620 {
1621 	int error;
1622 	int lookuperr;
1623 	int in_crit = 0;
1624 	struct vattr va;
1625 	vnode_t *vp;
1626 	vnode_t *realvp;
1627 	vnode_t *dvp;
1628 	char *name = args->ca_da.da_name;
1629 	vnode_t *tvp = NULL;
1630 	int mode;
1631 	int lookup_ok;
1632 	bool_t trunc;
1633 	struct sockaddr *ca;
1634 
1635 	/*
1636 	 * Disallow NULL paths
1637 	 */
1638 	if (name == NULL || *name == '\0') {
1639 		dr->dr_status = NFSERR_ACCES;
1640 		return;
1641 	}
1642 
1643 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1644 	if (dvp == NULL) {
1645 		dr->dr_status = NFSERR_STALE;
1646 		return;
1647 	}
1648 
1649 	error = sattr_to_vattr(args->ca_sa, &va);
1650 	if (error) {
1651 		dr->dr_status = puterrno(error);
1652 		return;
1653 	}
1654 
1655 	/*
1656 	 * Must specify the mode.
1657 	 */
1658 	if (!(va.va_mask & AT_MODE)) {
1659 		VN_RELE(dvp);
1660 		dr->dr_status = NFSERR_INVAL;
1661 		return;
1662 	}
1663 
1664 	/*
1665 	 * This is a completely gross hack to make mknod
1666 	 * work over the wire until we can wack the protocol
1667 	 */
1668 	if ((va.va_mode & IFMT) == IFCHR) {
1669 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1670 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1671 		else {
1672 			va.va_type = VCHR;
1673 			/*
1674 			 * uncompress the received dev_t
1675 			 * if the top half is zero indicating a request
1676 			 * from an `older style' OS.
1677 			 */
1678 			if ((va.va_size & 0xffff0000) == 0)
1679 				va.va_rdev = nfsv2_expdev(va.va_size);
1680 			else
1681 				va.va_rdev = (dev_t)va.va_size;
1682 		}
1683 		va.va_mask &= ~AT_SIZE;
1684 	} else if ((va.va_mode & IFMT) == IFBLK) {
1685 		va.va_type = VBLK;
1686 		/*
1687 		 * uncompress the received dev_t
1688 		 * if the top half is zero indicating a request
1689 		 * from an `older style' OS.
1690 		 */
1691 		if ((va.va_size & 0xffff0000) == 0)
1692 			va.va_rdev = nfsv2_expdev(va.va_size);
1693 		else
1694 			va.va_rdev = (dev_t)va.va_size;
1695 		va.va_mask &= ~AT_SIZE;
1696 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1697 		va.va_type = VSOCK;
1698 	} else {
1699 		va.va_type = VREG;
1700 	}
1701 	va.va_mode &= ~IFMT;
1702 	va.va_mask |= AT_TYPE;
1703 
1704 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1705 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1706 	    MAXPATHLEN);
1707 	if (name == NULL) {
1708 		dr->dr_status = puterrno(EINVAL);
1709 		return;
1710 	}
1711 
1712 	/*
1713 	 * Why was the choice made to use VWRITE as the mode to the
1714 	 * call to VOP_CREATE ? This results in a bug.  When a client
1715 	 * opens a file that already exists and is RDONLY, the second
1716 	 * open fails with an EACESS because of the mode.
1717 	 * bug ID 1054648.
1718 	 */
1719 	lookup_ok = 0;
1720 	mode = VWRITE;
1721 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1722 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1723 		    NULL, NULL, NULL);
1724 		if (!error) {
1725 			struct vattr at;
1726 
1727 			lookup_ok = 1;
1728 			at.va_mask = AT_MODE;
1729 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1730 			if (!error)
1731 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1732 			VN_RELE(tvp);
1733 			tvp = NULL;
1734 		}
1735 	}
1736 
1737 	if (!lookup_ok) {
1738 		if (rdonly(exi, req)) {
1739 			error = EROFS;
1740 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1741 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1742 			error = EPERM;
1743 		} else {
1744 			error = 0;
1745 		}
1746 	}
1747 
1748 	/*
1749 	 * If file size is being modified on an already existing file
1750 	 * make sure that there are no conflicting non-blocking mandatory
1751 	 * locks in the region being manipulated. Return EACCES if there
1752 	 * are conflicting locks.
1753 	 */
1754 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1755 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1756 		    NULL, NULL, NULL);
1757 
1758 		if (!lookuperr &&
1759 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1760 			VN_RELE(tvp);
1761 			curthread->t_flag |= T_WOULDBLOCK;
1762 			goto out;
1763 		}
1764 
1765 		if (!lookuperr && nbl_need_check(tvp)) {
1766 			/*
1767 			 * The file exists. Now check if it has any
1768 			 * conflicting non-blocking mandatory locks
1769 			 * in the region being changed.
1770 			 */
1771 			struct vattr bva;
1772 			u_offset_t offset;
1773 			ssize_t length;
1774 
1775 			nbl_start_crit(tvp, RW_READER);
1776 			in_crit = 1;
1777 
1778 			bva.va_mask = AT_SIZE;
1779 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1780 			if (!error) {
1781 				if (va.va_size < bva.va_size) {
1782 					offset = va.va_size;
1783 					length = bva.va_size - va.va_size;
1784 				} else {
1785 					offset = bva.va_size;
1786 					length = va.va_size - bva.va_size;
1787 				}
1788 				if (length) {
1789 					if (nbl_conflict(tvp, NBL_WRITE,
1790 					    offset, length, 0, NULL)) {
1791 						error = EACCES;
1792 					}
1793 				}
1794 			}
1795 			if (error) {
1796 				nbl_end_crit(tvp);
1797 				VN_RELE(tvp);
1798 				in_crit = 0;
1799 			}
1800 		} else if (tvp != NULL) {
1801 			VN_RELE(tvp);
1802 		}
1803 	}
1804 
1805 	if (!error) {
1806 		/*
1807 		 * If filesystem is shared with nosuid the remove any
1808 		 * setuid/setgid bits on create.
1809 		 */
1810 		if (va.va_type == VREG &&
1811 		    exi->exi_export.ex_flags & EX_NOSUID)
1812 			va.va_mode &= ~(VSUID | VSGID);
1813 
1814 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1815 		    NULL, NULL);
1816 
1817 		if (!error) {
1818 
1819 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1820 				trunc = TRUE;
1821 			else
1822 				trunc = FALSE;
1823 
1824 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1825 				VN_RELE(vp);
1826 				curthread->t_flag |= T_WOULDBLOCK;
1827 				goto out;
1828 			}
1829 			va.va_mask = AT_ALL;
1830 
1831 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1832 
1833 			/* check for overflows */
1834 			if (!error) {
1835 				acl_perm(vp, exi, &va, cr);
1836 				error = vattr_to_nattr(&va, &dr->dr_attr);
1837 				if (!error) {
1838 					error = makefh(&dr->dr_fhandle, vp,
1839 					    exi);
1840 				}
1841 			}
1842 			/*
1843 			 * Force modified metadata out to stable storage.
1844 			 *
1845 			 * if a underlying vp exists, pass it to VOP_FSYNC
1846 			 */
1847 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1848 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1849 			else
1850 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1851 			VN_RELE(vp);
1852 		}
1853 
1854 		if (in_crit) {
1855 			nbl_end_crit(tvp);
1856 			VN_RELE(tvp);
1857 		}
1858 	}
1859 
1860 	/*
1861 	 * Force modified data and metadata out to stable storage.
1862 	 */
1863 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1864 
1865 out:
1866 
1867 	VN_RELE(dvp);
1868 
1869 	dr->dr_status = puterrno(error);
1870 
1871 	if (name != args->ca_da.da_name)
1872 		kmem_free(name, MAXPATHLEN);
1873 }
1874 void *
1875 rfs_create_getfh(struct nfscreatargs *args)
1876 {
1877 	return (args->ca_da.da_fhandle);
1878 }
1879 
1880 /*
1881  * Remove a file.
1882  * Remove named file from parent directory.
1883  */
1884 void
1885 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1886 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1887 {
1888 	int error = 0;
1889 	vnode_t *vp;
1890 	vnode_t *targvp;
1891 	int in_crit = 0;
1892 
1893 	/*
1894 	 * Disallow NULL paths
1895 	 */
1896 	if (da->da_name == NULL || *da->da_name == '\0') {
1897 		*status = NFSERR_ACCES;
1898 		return;
1899 	}
1900 
1901 	vp = nfs_fhtovp(da->da_fhandle, exi);
1902 	if (vp == NULL) {
1903 		*status = NFSERR_STALE;
1904 		return;
1905 	}
1906 
1907 	if (rdonly(exi, req)) {
1908 		VN_RELE(vp);
1909 		*status = NFSERR_ROFS;
1910 		return;
1911 	}
1912 
1913 	/*
1914 	 * Check for a conflict with a non-blocking mandatory share reservation.
1915 	 */
1916 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1917 	    NULL, cr, NULL, NULL, NULL);
1918 	if (error != 0) {
1919 		VN_RELE(vp);
1920 		*status = puterrno(error);
1921 		return;
1922 	}
1923 
1924 	/*
1925 	 * If the file is delegated to an v4 client, then initiate
1926 	 * recall and drop this request (by setting T_WOULDBLOCK).
1927 	 * The client will eventually re-transmit the request and
1928 	 * (hopefully), by then, the v4 client will have returned
1929 	 * the delegation.
1930 	 */
1931 
1932 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1933 		VN_RELE(vp);
1934 		VN_RELE(targvp);
1935 		curthread->t_flag |= T_WOULDBLOCK;
1936 		return;
1937 	}
1938 
1939 	if (nbl_need_check(targvp)) {
1940 		nbl_start_crit(targvp, RW_READER);
1941 		in_crit = 1;
1942 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1943 			error = EACCES;
1944 			goto out;
1945 		}
1946 	}
1947 
1948 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
1949 
1950 	/*
1951 	 * Force modified data and metadata out to stable storage.
1952 	 */
1953 	(void) VOP_FSYNC(vp, 0, cr, NULL);
1954 
1955 out:
1956 	if (in_crit)
1957 		nbl_end_crit(targvp);
1958 	VN_RELE(targvp);
1959 	VN_RELE(vp);
1960 
1961 	*status = puterrno(error);
1962 
1963 }
1964 
1965 void *
1966 rfs_remove_getfh(struct nfsdiropargs *da)
1967 {
1968 	return (da->da_fhandle);
1969 }
1970 
1971 /*
1972  * rename a file
1973  * Give a file (from) a new name (to).
1974  */
1975 void
1976 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
1977 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1978 {
1979 	int error = 0;
1980 	vnode_t *fromvp;
1981 	vnode_t *tovp;
1982 	struct exportinfo *to_exi;
1983 	fhandle_t *fh;
1984 	vnode_t *srcvp;
1985 	vnode_t *targvp;
1986 	int in_crit = 0;
1987 
1988 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
1989 	if (fromvp == NULL) {
1990 		*status = NFSERR_STALE;
1991 		return;
1992 	}
1993 
1994 	fh = args->rna_to.da_fhandle;
1995 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
1996 	if (to_exi == NULL) {
1997 		VN_RELE(fromvp);
1998 		*status = NFSERR_ACCES;
1999 		return;
2000 	}
2001 	exi_rele(to_exi);
2002 
2003 	if (to_exi != exi) {
2004 		VN_RELE(fromvp);
2005 		*status = NFSERR_XDEV;
2006 		return;
2007 	}
2008 
2009 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2010 	if (tovp == NULL) {
2011 		VN_RELE(fromvp);
2012 		*status = NFSERR_STALE;
2013 		return;
2014 	}
2015 
2016 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2017 		VN_RELE(tovp);
2018 		VN_RELE(fromvp);
2019 		*status = NFSERR_NOTDIR;
2020 		return;
2021 	}
2022 
2023 	/*
2024 	 * Disallow NULL paths
2025 	 */
2026 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2027 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2028 		VN_RELE(tovp);
2029 		VN_RELE(fromvp);
2030 		*status = NFSERR_ACCES;
2031 		return;
2032 	}
2033 
2034 	if (rdonly(exi, req)) {
2035 		VN_RELE(tovp);
2036 		VN_RELE(fromvp);
2037 		*status = NFSERR_ROFS;
2038 		return;
2039 	}
2040 
2041 	/*
2042 	 * Check for a conflict with a non-blocking mandatory share reservation.
2043 	 */
2044 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2045 	    NULL, cr, NULL, NULL, NULL);
2046 	if (error != 0) {
2047 		VN_RELE(tovp);
2048 		VN_RELE(fromvp);
2049 		*status = puterrno(error);
2050 		return;
2051 	}
2052 
2053 	/* Check for delegations on the source file */
2054 
2055 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2056 		VN_RELE(tovp);
2057 		VN_RELE(fromvp);
2058 		VN_RELE(srcvp);
2059 		curthread->t_flag |= T_WOULDBLOCK;
2060 		return;
2061 	}
2062 
2063 	/* Check for delegation on the file being renamed over, if it exists */
2064 
2065 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2066 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2067 	    NULL, NULL, NULL) == 0) {
2068 
2069 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2070 			VN_RELE(tovp);
2071 			VN_RELE(fromvp);
2072 			VN_RELE(srcvp);
2073 			VN_RELE(targvp);
2074 			curthread->t_flag |= T_WOULDBLOCK;
2075 			return;
2076 		}
2077 		VN_RELE(targvp);
2078 	}
2079 
2080 
2081 	if (nbl_need_check(srcvp)) {
2082 		nbl_start_crit(srcvp, RW_READER);
2083 		in_crit = 1;
2084 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2085 			error = EACCES;
2086 			goto out;
2087 		}
2088 	}
2089 
2090 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2091 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2092 
2093 	if (error == 0)
2094 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2095 		    strlen(args->rna_to.da_name));
2096 
2097 	/*
2098 	 * Force modified data and metadata out to stable storage.
2099 	 */
2100 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2101 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2102 
2103 out:
2104 	if (in_crit)
2105 		nbl_end_crit(srcvp);
2106 	VN_RELE(srcvp);
2107 	VN_RELE(tovp);
2108 	VN_RELE(fromvp);
2109 
2110 	*status = puterrno(error);
2111 
2112 }
2113 void *
2114 rfs_rename_getfh(struct nfsrnmargs *args)
2115 {
2116 	return (args->rna_from.da_fhandle);
2117 }
2118 
2119 /*
2120  * Link to a file.
2121  * Create a file (to) which is a hard link to the given file (from).
2122  */
2123 void
2124 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2125 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2126 {
2127 	int error;
2128 	vnode_t *fromvp;
2129 	vnode_t *tovp;
2130 	struct exportinfo *to_exi;
2131 	fhandle_t *fh;
2132 
2133 	fromvp = nfs_fhtovp(args->la_from, exi);
2134 	if (fromvp == NULL) {
2135 		*status = NFSERR_STALE;
2136 		return;
2137 	}
2138 
2139 	fh = args->la_to.da_fhandle;
2140 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2141 	if (to_exi == NULL) {
2142 		VN_RELE(fromvp);
2143 		*status = NFSERR_ACCES;
2144 		return;
2145 	}
2146 	exi_rele(to_exi);
2147 
2148 	if (to_exi != exi) {
2149 		VN_RELE(fromvp);
2150 		*status = NFSERR_XDEV;
2151 		return;
2152 	}
2153 
2154 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2155 	if (tovp == NULL) {
2156 		VN_RELE(fromvp);
2157 		*status = NFSERR_STALE;
2158 		return;
2159 	}
2160 
2161 	if (tovp->v_type != VDIR) {
2162 		VN_RELE(tovp);
2163 		VN_RELE(fromvp);
2164 		*status = NFSERR_NOTDIR;
2165 		return;
2166 	}
2167 	/*
2168 	 * Disallow NULL paths
2169 	 */
2170 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2171 		VN_RELE(tovp);
2172 		VN_RELE(fromvp);
2173 		*status = NFSERR_ACCES;
2174 		return;
2175 	}
2176 
2177 	if (rdonly(exi, req)) {
2178 		VN_RELE(tovp);
2179 		VN_RELE(fromvp);
2180 		*status = NFSERR_ROFS;
2181 		return;
2182 	}
2183 
2184 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2185 
2186 	/*
2187 	 * Force modified data and metadata out to stable storage.
2188 	 */
2189 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2190 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2191 
2192 	VN_RELE(tovp);
2193 	VN_RELE(fromvp);
2194 
2195 	*status = puterrno(error);
2196 
2197 }
2198 void *
2199 rfs_link_getfh(struct nfslinkargs *args)
2200 {
2201 	return (args->la_from);
2202 }
2203 
2204 /*
2205  * Symbolicly link to a file.
2206  * Create a file (to) with the given attributes which is a symbolic link
2207  * to the given path name (to).
2208  */
2209 void
2210 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2211 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2212 {
2213 	int error;
2214 	struct vattr va;
2215 	vnode_t *vp;
2216 	vnode_t *svp;
2217 	int lerror;
2218 	struct sockaddr *ca;
2219 	char *name = NULL;
2220 
2221 	/*
2222 	 * Disallow NULL paths
2223 	 */
2224 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2225 		*status = NFSERR_ACCES;
2226 		return;
2227 	}
2228 
2229 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2230 	if (vp == NULL) {
2231 		*status = NFSERR_STALE;
2232 		return;
2233 	}
2234 
2235 	if (rdonly(exi, req)) {
2236 		VN_RELE(vp);
2237 		*status = NFSERR_ROFS;
2238 		return;
2239 	}
2240 
2241 	error = sattr_to_vattr(args->sla_sa, &va);
2242 	if (error) {
2243 		VN_RELE(vp);
2244 		*status = puterrno(error);
2245 		return;
2246 	}
2247 
2248 	if (!(va.va_mask & AT_MODE)) {
2249 		VN_RELE(vp);
2250 		*status = NFSERR_INVAL;
2251 		return;
2252 	}
2253 
2254 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2255 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2256 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2257 
2258 	if (name == NULL) {
2259 		*status = NFSERR_ACCES;
2260 		return;
2261 	}
2262 
2263 	va.va_type = VLNK;
2264 	va.va_mask |= AT_TYPE;
2265 
2266 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2267 
2268 	/*
2269 	 * Force new data and metadata out to stable storage.
2270 	 */
2271 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2272 	    NULL, cr, NULL, NULL, NULL);
2273 
2274 	if (!lerror) {
2275 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2276 		VN_RELE(svp);
2277 	}
2278 
2279 	/*
2280 	 * Force modified data and metadata out to stable storage.
2281 	 */
2282 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2283 
2284 	VN_RELE(vp);
2285 
2286 	*status = puterrno(error);
2287 	if (name != args->sla_tnm)
2288 		kmem_free(name, MAXPATHLEN);
2289 
2290 }
2291 void *
2292 rfs_symlink_getfh(struct nfsslargs *args)
2293 {
2294 	return (args->sla_from.da_fhandle);
2295 }
2296 
2297 /*
2298  * Make a directory.
2299  * Create a directory with the given name, parent directory, and attributes.
2300  * Returns a file handle and attributes for the new directory.
2301  */
2302 void
2303 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2304 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2305 {
2306 	int error;
2307 	struct vattr va;
2308 	vnode_t *dvp = NULL;
2309 	vnode_t *vp;
2310 	char *name = args->ca_da.da_name;
2311 
2312 	/*
2313 	 * Disallow NULL paths
2314 	 */
2315 	if (name == NULL || *name == '\0') {
2316 		dr->dr_status = NFSERR_ACCES;
2317 		return;
2318 	}
2319 
2320 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2321 	if (vp == NULL) {
2322 		dr->dr_status = NFSERR_STALE;
2323 		return;
2324 	}
2325 
2326 	if (rdonly(exi, req)) {
2327 		VN_RELE(vp);
2328 		dr->dr_status = NFSERR_ROFS;
2329 		return;
2330 	}
2331 
2332 	error = sattr_to_vattr(args->ca_sa, &va);
2333 	if (error) {
2334 		VN_RELE(vp);
2335 		dr->dr_status = puterrno(error);
2336 		return;
2337 	}
2338 
2339 	if (!(va.va_mask & AT_MODE)) {
2340 		VN_RELE(vp);
2341 		dr->dr_status = NFSERR_INVAL;
2342 		return;
2343 	}
2344 
2345 	va.va_type = VDIR;
2346 	va.va_mask |= AT_TYPE;
2347 
2348 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2349 
2350 	if (!error) {
2351 		/*
2352 		 * Attribtutes of the newly created directory should
2353 		 * be returned to the client.
2354 		 */
2355 		va.va_mask = AT_ALL; /* We want everything */
2356 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2357 
2358 		/* check for overflows */
2359 		if (!error) {
2360 			acl_perm(vp, exi, &va, cr);
2361 			error = vattr_to_nattr(&va, &dr->dr_attr);
2362 			if (!error) {
2363 				error = makefh(&dr->dr_fhandle, dvp, exi);
2364 			}
2365 		}
2366 		/*
2367 		 * Force new data and metadata out to stable storage.
2368 		 */
2369 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2370 		VN_RELE(dvp);
2371 	}
2372 
2373 	/*
2374 	 * Force modified data and metadata out to stable storage.
2375 	 */
2376 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2377 
2378 	VN_RELE(vp);
2379 
2380 	dr->dr_status = puterrno(error);
2381 
2382 }
2383 void *
2384 rfs_mkdir_getfh(struct nfscreatargs *args)
2385 {
2386 	return (args->ca_da.da_fhandle);
2387 }
2388 
2389 /*
2390  * Remove a directory.
2391  * Remove the given directory name from the given parent directory.
2392  */
2393 void
2394 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2395 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2396 {
2397 	int error;
2398 	vnode_t *vp;
2399 
2400 
2401 	/*
2402 	 * Disallow NULL paths
2403 	 */
2404 	if (da->da_name == NULL || *da->da_name == '\0') {
2405 		*status = NFSERR_ACCES;
2406 		return;
2407 	}
2408 
2409 	vp = nfs_fhtovp(da->da_fhandle, exi);
2410 	if (vp == NULL) {
2411 		*status = NFSERR_STALE;
2412 		return;
2413 	}
2414 
2415 	if (rdonly(exi, req)) {
2416 		VN_RELE(vp);
2417 		*status = NFSERR_ROFS;
2418 		return;
2419 	}
2420 
2421 	/*
2422 	 * VOP_RMDIR now takes a new third argument (the current
2423 	 * directory of the process).  That's because someone
2424 	 * wants to return EINVAL if one tries to remove ".".
2425 	 * Of course, NFS servers have no idea what their
2426 	 * clients' current directories are.  We fake it by
2427 	 * supplying a vnode known to exist and illegal to
2428 	 * remove.
2429 	 */
2430 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2431 
2432 	/*
2433 	 * Force modified data and metadata out to stable storage.
2434 	 */
2435 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2436 
2437 	VN_RELE(vp);
2438 
2439 	/*
2440 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2441 	 * if the directory is not empty.  A System V NFS server
2442 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2443 	 * over the wire.
2444 	 */
2445 	if (error == EEXIST)
2446 		*status = NFSERR_NOTEMPTY;
2447 	else
2448 		*status = puterrno(error);
2449 
2450 }
2451 void *
2452 rfs_rmdir_getfh(struct nfsdiropargs *da)
2453 {
2454 	return (da->da_fhandle);
2455 }
2456 
2457 /* ARGSUSED */
2458 void
2459 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2460 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2461 {
2462 	int error;
2463 	int iseof;
2464 	struct iovec iov;
2465 	struct uio uio;
2466 	vnode_t *vp;
2467 	char *ndata = NULL;
2468 	struct sockaddr *ca;
2469 	size_t nents;
2470 	int ret;
2471 
2472 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2473 	if (vp == NULL) {
2474 		rd->rd_entries = NULL;
2475 		rd->rd_status = NFSERR_STALE;
2476 		return;
2477 	}
2478 
2479 	if (vp->v_type != VDIR) {
2480 		VN_RELE(vp);
2481 		rd->rd_entries = NULL;
2482 		rd->rd_status = NFSERR_NOTDIR;
2483 		return;
2484 	}
2485 
2486 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2487 
2488 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2489 
2490 	if (error) {
2491 		rd->rd_entries = NULL;
2492 		goto bad;
2493 	}
2494 
2495 	if (rda->rda_count == 0) {
2496 		rd->rd_entries = NULL;
2497 		rd->rd_size = 0;
2498 		rd->rd_eof = FALSE;
2499 		goto bad;
2500 	}
2501 
2502 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2503 
2504 	/*
2505 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2506 	 */
2507 	rd->rd_bufsize = (uint_t)rda->rda_count;
2508 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2509 
2510 	/*
2511 	 * Set up io vector to read directory data
2512 	 */
2513 	iov.iov_base = (caddr_t)rd->rd_entries;
2514 	iov.iov_len = rda->rda_count;
2515 	uio.uio_iov = &iov;
2516 	uio.uio_iovcnt = 1;
2517 	uio.uio_segflg = UIO_SYSSPACE;
2518 	uio.uio_extflg = UIO_COPY_CACHED;
2519 	uio.uio_loffset = (offset_t)rda->rda_offset;
2520 	uio.uio_resid = rda->rda_count;
2521 
2522 	/*
2523 	 * read directory
2524 	 */
2525 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2526 
2527 	/*
2528 	 * Clean up
2529 	 */
2530 	if (!error) {
2531 		/*
2532 		 * set size and eof
2533 		 */
2534 		if (uio.uio_resid == rda->rda_count) {
2535 			rd->rd_size = 0;
2536 			rd->rd_eof = TRUE;
2537 		} else {
2538 			rd->rd_size = (uint32_t)(rda->rda_count -
2539 			    uio.uio_resid);
2540 			rd->rd_eof = iseof ? TRUE : FALSE;
2541 		}
2542 	}
2543 
2544 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2545 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2546 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2547 	    rda->rda_count, &ndata);
2548 
2549 	if (ret != 0) {
2550 		size_t dropbytes;
2551 		/*
2552 		 * We had to drop one or more entries in order to fit
2553 		 * during the character conversion.  We need to patch
2554 		 * up the size and eof info.
2555 		 */
2556 		if (rd->rd_eof)
2557 			rd->rd_eof = FALSE;
2558 		dropbytes = nfscmd_dropped_entrysize(
2559 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2560 		rd->rd_size -= dropbytes;
2561 	}
2562 	if (ndata == NULL) {
2563 		ndata = (char *)rd->rd_entries;
2564 	} else if (ndata != (char *)rd->rd_entries) {
2565 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2566 		rd->rd_entries = (void *)ndata;
2567 		rd->rd_bufsize = rda->rda_count;
2568 	}
2569 
2570 bad:
2571 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2572 
2573 #if 0 /* notyet */
2574 	/*
2575 	 * Don't do this.  It causes local disk writes when just
2576 	 * reading the file and the overhead is deemed larger
2577 	 * than the benefit.
2578 	 */
2579 	/*
2580 	 * Force modified metadata out to stable storage.
2581 	 */
2582 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2583 #endif
2584 
2585 	VN_RELE(vp);
2586 
2587 	rd->rd_status = puterrno(error);
2588 
2589 }
2590 void *
2591 rfs_readdir_getfh(struct nfsrddirargs *rda)
2592 {
2593 	return (&rda->rda_fh);
2594 }
2595 void
2596 rfs_rddirfree(struct nfsrddirres *rd)
2597 {
2598 	if (rd->rd_entries != NULL)
2599 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2600 }
2601 
2602 /* ARGSUSED */
2603 void
2604 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2605 	struct svc_req *req, cred_t *cr)
2606 {
2607 	int error;
2608 	struct statvfs64 sb;
2609 	vnode_t *vp;
2610 
2611 	vp = nfs_fhtovp(fh, exi);
2612 	if (vp == NULL) {
2613 		fs->fs_status = NFSERR_STALE;
2614 		return;
2615 	}
2616 
2617 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2618 
2619 	if (!error) {
2620 		fs->fs_tsize = nfstsize();
2621 		fs->fs_bsize = sb.f_frsize;
2622 		fs->fs_blocks = sb.f_blocks;
2623 		fs->fs_bfree = sb.f_bfree;
2624 		fs->fs_bavail = sb.f_bavail;
2625 	}
2626 
2627 	VN_RELE(vp);
2628 
2629 	fs->fs_status = puterrno(error);
2630 
2631 }
2632 void *
2633 rfs_statfs_getfh(fhandle_t *fh)
2634 {
2635 	return (fh);
2636 }
2637 
2638 static int
2639 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2640 {
2641 	vap->va_mask = 0;
2642 
2643 	/*
2644 	 * There was a sign extension bug in some VFS based systems
2645 	 * which stored the mode as a short.  When it would get
2646 	 * assigned to a u_long, no sign extension would occur.
2647 	 * It needed to, but this wasn't noticed because sa_mode
2648 	 * would then get assigned back to the short, thus ignoring
2649 	 * the upper 16 bits of sa_mode.
2650 	 *
2651 	 * To make this implementation work for both broken
2652 	 * clients and good clients, we check for both versions
2653 	 * of the mode.
2654 	 */
2655 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2656 	    sa->sa_mode != (uint32_t)-1) {
2657 		vap->va_mask |= AT_MODE;
2658 		vap->va_mode = sa->sa_mode;
2659 	}
2660 	if (sa->sa_uid != (uint32_t)-1) {
2661 		vap->va_mask |= AT_UID;
2662 		vap->va_uid = sa->sa_uid;
2663 	}
2664 	if (sa->sa_gid != (uint32_t)-1) {
2665 		vap->va_mask |= AT_GID;
2666 		vap->va_gid = sa->sa_gid;
2667 	}
2668 	if (sa->sa_size != (uint32_t)-1) {
2669 		vap->va_mask |= AT_SIZE;
2670 		vap->va_size = sa->sa_size;
2671 	}
2672 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2673 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2674 #ifndef _LP64
2675 		/* return error if time overflow */
2676 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2677 			return (EOVERFLOW);
2678 #endif
2679 		vap->va_mask |= AT_ATIME;
2680 		/*
2681 		 * nfs protocol defines times as unsigned so don't extend sign,
2682 		 * unless sysadmin set nfs_allow_preepoch_time.
2683 		 */
2684 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2685 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2686 	}
2687 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2688 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2689 #ifndef _LP64
2690 		/* return error if time overflow */
2691 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2692 			return (EOVERFLOW);
2693 #endif
2694 		vap->va_mask |= AT_MTIME;
2695 		/*
2696 		 * nfs protocol defines times as unsigned so don't extend sign,
2697 		 * unless sysadmin set nfs_allow_preepoch_time.
2698 		 */
2699 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2700 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2701 	}
2702 	return (0);
2703 }
2704 
2705 static enum nfsftype vt_to_nf[] = {
2706 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2707 };
2708 
2709 /*
2710  * check the following fields for overflow: nodeid, size, and time.
2711  * There could be a problem when converting 64-bit LP64 fields
2712  * into 32-bit ones.  Return an error if there is an overflow.
2713  */
2714 int
2715 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2716 {
2717 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2718 	na->na_type = vt_to_nf[vap->va_type];
2719 
2720 	if (vap->va_mode == (unsigned short) -1)
2721 		na->na_mode = (uint32_t)-1;
2722 	else
2723 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2724 
2725 	if (vap->va_uid == (unsigned short)(-1))
2726 		na->na_uid = (uint32_t)(-1);
2727 	else if (vap->va_uid == UID_NOBODY)
2728 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2729 	else
2730 		na->na_uid = vap->va_uid;
2731 
2732 	if (vap->va_gid == (unsigned short)(-1))
2733 		na->na_gid = (uint32_t)-1;
2734 	else if (vap->va_gid == GID_NOBODY)
2735 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2736 	else
2737 		na->na_gid = vap->va_gid;
2738 
2739 	/*
2740 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2741 	 * vattr, but are bigger than 32 bit values supported?
2742 	 */
2743 	na->na_fsid = vap->va_fsid;
2744 
2745 	na->na_nodeid = vap->va_nodeid;
2746 
2747 	/*
2748 	 * Check to make sure that the nodeid is representable over the
2749 	 * wire without losing bits.
2750 	 */
2751 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2752 		return (EFBIG);
2753 	na->na_nlink = vap->va_nlink;
2754 
2755 	/*
2756 	 * Check for big files here, instead of at the caller.  See
2757 	 * comments in cstat for large special file explanation.
2758 	 */
2759 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2760 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2761 			return (EFBIG);
2762 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2763 			/* UNKNOWN_SIZE | OVERFLOW */
2764 			na->na_size = MAXOFF32_T;
2765 		} else
2766 			na->na_size = vap->va_size;
2767 	} else
2768 		na->na_size = vap->va_size;
2769 
2770 	/*
2771 	 * If the vnode times overflow the 32-bit times that NFS2
2772 	 * uses on the wire then return an error.
2773 	 */
2774 	if (!NFS_VAP_TIME_OK(vap)) {
2775 		return (EOVERFLOW);
2776 	}
2777 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2778 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2779 
2780 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2781 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2782 
2783 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2784 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2785 
2786 	/*
2787 	 * If the dev_t will fit into 16 bits then compress
2788 	 * it, otherwise leave it alone. See comments in
2789 	 * nfs_client.c.
2790 	 */
2791 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2792 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2793 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2794 	else
2795 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2796 
2797 	na->na_blocks = vap->va_nblocks;
2798 	na->na_blocksize = vap->va_blksize;
2799 
2800 	/*
2801 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2802 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2803 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2804 	 *
2805 	 * BUYER BEWARE:
2806 	 *  If you are porting the NFS to a non-Sun server, you probably
2807 	 *  don't want to include the following block of code.  The
2808 	 *  over-the-wire special file types will be changing with the
2809 	 *  NFS Protocol Revision.
2810 	 */
2811 	if (vap->va_type == VFIFO)
2812 		NA_SETFIFO(na);
2813 	return (0);
2814 }
2815 
2816 /*
2817  * acl v2 support: returns approximate permission.
2818  *	default: returns minimal permission (more restrictive)
2819  *	aclok: returns maximal permission (less restrictive)
2820  *	This routine changes the permissions that are alaredy in *va.
2821  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2822  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2823  */
2824 static void
2825 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2826 {
2827 	vsecattr_t	vsa;
2828 	int		aclcnt;
2829 	aclent_t	*aclentp;
2830 	mode_t		mask_perm;
2831 	mode_t		grp_perm;
2832 	mode_t		other_perm;
2833 	mode_t		other_orig;
2834 	int		error;
2835 
2836 	/* dont care default acl */
2837 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2838 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2839 
2840 	if (!error) {
2841 		aclcnt = vsa.vsa_aclcnt;
2842 		if (aclcnt > MIN_ACL_ENTRIES) {
2843 			/* non-trivial ACL */
2844 			aclentp = vsa.vsa_aclentp;
2845 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2846 				/* maximal permissions */
2847 				grp_perm = 0;
2848 				other_perm = 0;
2849 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2850 					switch (aclentp->a_type) {
2851 					case USER_OBJ:
2852 						break;
2853 					case USER:
2854 						grp_perm |=
2855 						    aclentp->a_perm << 3;
2856 						other_perm |= aclentp->a_perm;
2857 						break;
2858 					case GROUP_OBJ:
2859 						grp_perm |=
2860 						    aclentp->a_perm << 3;
2861 						break;
2862 					case GROUP:
2863 						other_perm |= aclentp->a_perm;
2864 						break;
2865 					case OTHER_OBJ:
2866 						other_orig = aclentp->a_perm;
2867 						break;
2868 					case CLASS_OBJ:
2869 						mask_perm = aclentp->a_perm;
2870 						break;
2871 					default:
2872 						break;
2873 					}
2874 				}
2875 				grp_perm &= mask_perm << 3;
2876 				other_perm &= mask_perm;
2877 				other_perm |= other_orig;
2878 
2879 			} else {
2880 				/* minimal permissions */
2881 				grp_perm = 070;
2882 				other_perm = 07;
2883 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2884 					switch (aclentp->a_type) {
2885 					case USER_OBJ:
2886 						break;
2887 					case USER:
2888 					case CLASS_OBJ:
2889 						grp_perm &=
2890 						    aclentp->a_perm << 3;
2891 						other_perm &=
2892 						    aclentp->a_perm;
2893 						break;
2894 					case GROUP_OBJ:
2895 						grp_perm &=
2896 						    aclentp->a_perm << 3;
2897 						break;
2898 					case GROUP:
2899 						other_perm &=
2900 						    aclentp->a_perm;
2901 						break;
2902 					case OTHER_OBJ:
2903 						other_perm &=
2904 						    aclentp->a_perm;
2905 						break;
2906 					default:
2907 						break;
2908 					}
2909 				}
2910 			}
2911 			/* copy to va */
2912 			va->va_mode &= ~077;
2913 			va->va_mode |= grp_perm | other_perm;
2914 		}
2915 		if (vsa.vsa_aclcnt)
2916 			kmem_free(vsa.vsa_aclentp,
2917 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2918 	}
2919 }
2920 
2921 void
2922 rfs_srvrinit(void)
2923 {
2924 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2925 	nfs2_srv_caller_id = fs_new_caller_id();
2926 }
2927 
2928 void
2929 rfs_srvrfini(void)
2930 {
2931 	mutex_destroy(&rfs_async_write_lock);
2932 }
2933 
2934 static int
2935 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2936 {
2937 	struct clist	*wcl;
2938 	int		data_len, avail_len, num;
2939 	uint32_t	count = rr->rr_count;
2940 
2941 	data_len = num = avail_len = 0;
2942 
2943 	wcl = ra->ra_wlist;
2944 	while (wcl != NULL) {
2945 		if (wcl->c_dmemhandle.mrc_rmr == 0)
2946 			break;
2947 
2948 		avail_len += wcl->c_len;
2949 		if (wcl->c_len < count) {
2950 			data_len += wcl->c_len;
2951 		} else {
2952 			/* Can make the rest chunks all 0-len */
2953 			data_len += count;
2954 			wcl->c_len = count;
2955 		}
2956 		count -= wcl->c_len;
2957 		num ++;
2958 		wcl = wcl->c_next;
2959 	}
2960 
2961 	/*
2962 	 * MUST fail if there are still more data
2963 	 */
2964 	if (count > 0) {
2965 		DTRACE_PROBE2(nfss__e__read__wlist__fail,
2966 		    int, data_len, int, count);
2967 		return (FALSE);
2968 	}
2969 
2970 	wcl = ra->ra_wlist;
2971 	rr->rr_count = data_len;
2972 	rr->rr_ok.rrok_wlist_len = data_len;
2973 	rr->rr_ok.rrok_wlist = wcl;
2974 
2975 	return (TRUE);
2976 }
2977