xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision af79d7d9cd4d7b7b3b52261fc6f4be7426335485)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
/*
 * Forward declarations of file-local helpers used by the service
 * routines below.  Their bodies are not part of this section.
 */
/* Converts the settable attributes of a request into a vattr; returns an errno. */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
/*
 * Called on a freshly fetched vattr before it is converted for the reply.
 * NOTE(review): presumably adjusts the mode bits according to the file's
 * ACL -- confirm against the definition.
 */
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id stored in the cc_caller_id field of every caller_context_t
 * that the routines in this file hand to the VOP layer.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		acl_perm(vp, exi, &va, cr);
121 		error = vattr_to_nattr(&va, &ns->ns_attr);
122 	}
123 
124 	VN_RELE(vp);
125 
126 	ns->ns_status = puterrno(error);
127 }
128 void *
129 rfs_getattr_getfh(fhandle_t *fhp)
130 {
131 	return (fhp);
132 }
133 
134 /*
135  * Set file attributes.
136  * Sets the attributes of the file with the given fhandle.  Returns
137  * the new attributes.
138  */
139 void
140 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
141 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
142 {
143 	int error;
144 	int flag;
145 	int in_crit = 0;
146 	vnode_t *vp;
147 	struct vattr va;
148 	struct vattr bva;
149 	struct flock64 bf;
150 	caller_context_t ct;
151 
152 
153 	vp = nfs_fhtovp(&args->saa_fh, exi);
154 	if (vp == NULL) {
155 		ns->ns_status = NFSERR_STALE;
156 		return;
157 	}
158 
159 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
160 		VN_RELE(vp);
161 		ns->ns_status = NFSERR_ROFS;
162 		return;
163 	}
164 
165 	error = sattr_to_vattr(&args->saa_sa, &va);
166 	if (error) {
167 		VN_RELE(vp);
168 		ns->ns_status = puterrno(error);
169 		return;
170 	}
171 
172 	/*
173 	 * If the client is requesting a change to the mtime,
174 	 * but the nanosecond field is set to 1 billion, then
175 	 * this is a flag to the server that it should set the
176 	 * atime and mtime fields to the server's current time.
177 	 * The 1 billion number actually came from the client
178 	 * as 1 million, but the units in the over the wire
179 	 * request are microseconds instead of nanoseconds.
180 	 *
181 	 * This is an overload of the protocol and should be
182 	 * documented in the NFS Version 2 protocol specification.
183 	 */
184 	if (va.va_mask & AT_MTIME) {
185 		if (va.va_mtime.tv_nsec == 1000000000) {
186 			gethrestime(&va.va_mtime);
187 			va.va_atime = va.va_mtime;
188 			va.va_mask |= AT_ATIME;
189 			flag = 0;
190 		} else
191 			flag = ATTR_UTIME;
192 	} else
193 		flag = 0;
194 
195 	/*
196 	 * If the filesystem is exported with nosuid, then mask off
197 	 * the setuid and setgid bits.
198 	 */
199 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
200 	    (exi->exi_export.ex_flags & EX_NOSUID))
201 		va.va_mode &= ~(VSUID | VSGID);
202 
203 	ct.cc_sysid = 0;
204 	ct.cc_pid = 0;
205 	ct.cc_caller_id = nfs2_srv_caller_id;
206 	ct.cc_flags = CC_DONTBLOCK;
207 
208 	/*
209 	 * We need to specially handle size changes because it is
210 	 * possible for the client to create a file with modes
211 	 * which indicate read-only, but with the file opened for
212 	 * writing.  If the client then tries to set the size of
213 	 * the file, then the normal access checking done in
214 	 * VOP_SETATTR would prevent the client from doing so,
215 	 * although it should be legal for it to do so.  To get
216 	 * around this, we do the access checking for ourselves
217 	 * and then use VOP_SPACE which doesn't do the access
218 	 * checking which VOP_SETATTR does. VOP_SPACE can only
219 	 * operate on VREG files, let VOP_SETATTR handle the other
220 	 * extremely rare cases.
221 	 * Also the client should not be allowed to change the
222 	 * size of the file if there is a conflicting non-blocking
223 	 * mandatory lock in the region of change.
224 	 */
225 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
226 		if (nbl_need_check(vp)) {
227 			nbl_start_crit(vp, RW_READER);
228 			in_crit = 1;
229 		}
230 
231 		bva.va_mask = AT_UID | AT_SIZE;
232 
233 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
234 
235 		if (error) {
236 			if (in_crit)
237 				nbl_end_crit(vp);
238 			VN_RELE(vp);
239 			ns->ns_status = puterrno(error);
240 			return;
241 		}
242 
243 		if (in_crit) {
244 			u_offset_t offset;
245 			ssize_t length;
246 
247 			if (va.va_size < bva.va_size) {
248 				offset = va.va_size;
249 				length = bva.va_size - va.va_size;
250 			} else {
251 				offset = bva.va_size;
252 				length = va.va_size - bva.va_size;
253 			}
254 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
255 			    NULL)) {
256 				error = EACCES;
257 			}
258 		}
259 
260 		if (crgetuid(cr) == bva.va_uid && !error &&
261 		    va.va_size != bva.va_size) {
262 			va.va_mask &= ~AT_SIZE;
263 			bf.l_type = F_WRLCK;
264 			bf.l_whence = 0;
265 			bf.l_start = (off64_t)va.va_size;
266 			bf.l_len = 0;
267 			bf.l_sysid = 0;
268 			bf.l_pid = 0;
269 
270 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
271 			    (offset_t)va.va_size, cr, &ct);
272 		}
273 		if (in_crit)
274 			nbl_end_crit(vp);
275 	} else
276 		error = 0;
277 
278 	/*
279 	 * Do the setattr.
280 	 */
281 	if (!error && va.va_mask) {
282 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
283 	}
284 
285 	/*
286 	 * check if the monitor on either vop_space or vop_setattr detected
287 	 * a delegation conflict and if so, mark the thread flag as
288 	 * wouldblock so that the response is dropped and the client will
289 	 * try again.
290 	 */
291 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
292 		VN_RELE(vp);
293 		curthread->t_flag |= T_WOULDBLOCK;
294 		return;
295 	}
296 
297 	if (!error) {
298 		va.va_mask = AT_ALL;	/* get everything */
299 
300 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
301 
302 		/* check for overflows */
303 		if (!error) {
304 			acl_perm(vp, exi, &va, cr);
305 			error = vattr_to_nattr(&va, &ns->ns_attr);
306 		}
307 	}
308 
309 	ct.cc_flags = 0;
310 
311 	/*
312 	 * Force modified metadata out to stable storage.
313 	 */
314 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
315 
316 	VN_RELE(vp);
317 
318 	ns->ns_status = puterrno(error);
319 }
320 void *
321 rfs_setattr_getfh(struct nfssaargs *args)
322 {
323 	return (&args->saa_fh);
324 }
325 
326 /*
327  * Directory lookup.
328  * Returns an fhandle and file attributes for file name in a directory.
329  */
330 /* ARGSUSED */
331 void
332 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
333 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
334 {
335 	int error;
336 	vnode_t *dvp;
337 	vnode_t *vp;
338 	struct vattr va;
339 	fhandle_t *fhp = da->da_fhandle;
340 	struct sec_ol sec = {0, 0};
341 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
342 	char *name;
343 	struct sockaddr *ca;
344 
345 	/*
346 	 * Trusted Extension doesn't support NFSv2. MOUNT
347 	 * will reject v2 clients. Need to prevent v2 client
348 	 * access via WebNFS here.
349 	 */
350 	if (is_system_labeled() && req->rq_vers == 2) {
351 		dr->dr_status = NFSERR_ACCES;
352 		return;
353 	}
354 
355 	/*
356 	 * Disallow NULL paths
357 	 */
358 	if (da->da_name == NULL || *da->da_name == '\0') {
359 		dr->dr_status = NFSERR_ACCES;
360 		return;
361 	}
362 
363 	/*
364 	 * Allow lookups from the root - the default
365 	 * location of the public filehandle.
366 	 */
367 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
368 		dvp = rootdir;
369 		VN_HOLD(dvp);
370 	} else {
371 		dvp = nfs_fhtovp(fhp, exi);
372 		if (dvp == NULL) {
373 			dr->dr_status = NFSERR_STALE;
374 			return;
375 		}
376 	}
377 
378 	/*
379 	 * Not allow lookup beyond root.
380 	 * If the filehandle matches a filehandle of the exi,
381 	 * then the ".." refers beyond the root of an exported filesystem.
382 	 */
383 	if (strcmp(da->da_name, "..") == 0 &&
384 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
385 		VN_RELE(dvp);
386 		dr->dr_status = NFSERR_NOENT;
387 		return;
388 	}
389 
390 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
391 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
392 	    MAXPATHLEN);
393 
394 	if (name == NULL) {
395 		dr->dr_status = NFSERR_ACCES;
396 		return;
397 	}
398 
399 	/*
400 	 * If the public filehandle is used then allow
401 	 * a multi-component lookup, i.e. evaluate
402 	 * a pathname and follow symbolic links if
403 	 * necessary.
404 	 *
405 	 * This may result in a vnode in another filesystem
406 	 * which is OK as long as the filesystem is exported.
407 	 */
408 	if (PUBLIC_FH2(fhp)) {
409 		publicfh_flag = TRUE;
410 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
411 		    &sec);
412 	} else {
413 		/*
414 		 * Do a normal single component lookup.
415 		 */
416 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
417 		    NULL, NULL, NULL);
418 	}
419 
420 	if (name != da->da_name)
421 		kmem_free(name, MAXPATHLEN);
422 
423 
424 	if (!error) {
425 		va.va_mask = AT_ALL;	/* we want everything */
426 
427 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
428 
429 		/* check for overflows */
430 		if (!error) {
431 			acl_perm(vp, exi, &va, cr);
432 			error = vattr_to_nattr(&va, &dr->dr_attr);
433 			if (!error) {
434 				if (sec.sec_flags & SEC_QUERY)
435 					error = makefh_ol(&dr->dr_fhandle, exi,
436 					    sec.sec_index);
437 				else {
438 					error = makefh(&dr->dr_fhandle, vp,
439 					    exi);
440 					if (!error && publicfh_flag &&
441 					    !chk_clnt_sec(exi, req))
442 						auth_weak = TRUE;
443 				}
444 			}
445 		}
446 		VN_RELE(vp);
447 	}
448 
449 	VN_RELE(dvp);
450 
451 	/*
452 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
453 	 * and have obtained a new exportinfo in exi which needs to be
454 	 * released. Note the the original exportinfo pointed to by exi
455 	 * will be released by the caller, comon_dispatch.
456 	 */
457 	if (publicfh_flag && exi != NULL)
458 		exi_rele(exi);
459 
460 	/*
461 	 * If it's public fh, no 0x81, and client's flavor is
462 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
463 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
464 	 */
465 	if (auth_weak)
466 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
467 	else
468 		dr->dr_status = puterrno(error);
469 }
470 void *
471 rfs_lookup_getfh(struct nfsdiropargs *da)
472 {
473 	return (da->da_fhandle);
474 }
475 
476 /*
477  * Read symbolic link.
478  * Returns the string in the symbolic link at the given fhandle.
479  */
480 /* ARGSUSED */
481 void
482 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
483 	struct svc_req *req, cred_t *cr)
484 {
485 	int error;
486 	struct iovec iov;
487 	struct uio uio;
488 	vnode_t *vp;
489 	struct vattr va;
490 	struct sockaddr *ca;
491 	char *name = NULL;
492 
493 	vp = nfs_fhtovp(fhp, exi);
494 	if (vp == NULL) {
495 		rl->rl_data = NULL;
496 		rl->rl_status = NFSERR_STALE;
497 		return;
498 	}
499 
500 	va.va_mask = AT_MODE;
501 
502 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
503 
504 	if (error) {
505 		VN_RELE(vp);
506 		rl->rl_data = NULL;
507 		rl->rl_status = puterrno(error);
508 		return;
509 	}
510 
511 	if (MANDLOCK(vp, va.va_mode)) {
512 		VN_RELE(vp);
513 		rl->rl_data = NULL;
514 		rl->rl_status = NFSERR_ACCES;
515 		return;
516 	}
517 
518 	/*
519 	 * XNFS and RFC1094 require us to return ENXIO if argument
520 	 * is not a link. BUGID 1138002.
521 	 */
522 	if (vp->v_type != VLNK) {
523 		VN_RELE(vp);
524 		rl->rl_data = NULL;
525 		rl->rl_status = NFSERR_NXIO;
526 		return;
527 	}
528 
529 	/*
530 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
531 	 */
532 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
533 
534 	/*
535 	 * Set up io vector to read sym link data
536 	 */
537 	iov.iov_base = rl->rl_data;
538 	iov.iov_len = NFS_MAXPATHLEN;
539 	uio.uio_iov = &iov;
540 	uio.uio_iovcnt = 1;
541 	uio.uio_segflg = UIO_SYSSPACE;
542 	uio.uio_extflg = UIO_COPY_CACHED;
543 	uio.uio_loffset = (offset_t)0;
544 	uio.uio_resid = NFS_MAXPATHLEN;
545 
546 	/*
547 	 * Do the readlink.
548 	 */
549 	error = VOP_READLINK(vp, &uio, cr, NULL);
550 
551 	VN_RELE(vp);
552 
553 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
554 	rl->rl_data[rl->rl_count] = '\0';
555 
556 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
557 	name = nfscmd_convname(ca, exi, rl->rl_data,
558 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
559 
560 	if (name != NULL && name != rl->rl_data) {
561 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
562 		rl->rl_data = name;
563 	}
564 
565 	/*
566 	 * XNFS and RFC1094 require us to return ENXIO if argument
567 	 * is not a link. UFS returns EINVAL if this is the case,
568 	 * so we do the mapping here. BUGID 1138002.
569 	 */
570 	if (error == EINVAL)
571 		rl->rl_status = NFSERR_NXIO;
572 	else
573 		rl->rl_status = puterrno(error);
574 
575 }
576 void *
577 rfs_readlink_getfh(fhandle_t *fhp)
578 {
579 	return (fhp);
580 }
581 /*
582  * Free data allocated by rfs_readlink
583  */
584 void
585 rfs_rlfree(struct nfsrdlnres *rl)
586 {
587 	if (rl->rl_data != NULL)
588 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
589 }
590 
591 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
592 
593 /*
594  * Read data.
595  * Returns some data read from the file at the given fhandle.
596  */
597 /* ARGSUSED */
598 void
599 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
600 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
601 {
602 	vnode_t *vp;
603 	int error;
604 	struct vattr va;
605 	struct iovec iov;
606 	struct uio uio;
607 	mblk_t *mp;
608 	int alloc_err = 0;
609 	int in_crit = 0;
610 	caller_context_t ct;
611 
612 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
613 	if (vp == NULL) {
614 		rr->rr_data = NULL;
615 		rr->rr_status = NFSERR_STALE;
616 		return;
617 	}
618 
619 	if (vp->v_type != VREG) {
620 		VN_RELE(vp);
621 		rr->rr_data = NULL;
622 		rr->rr_status = NFSERR_ISDIR;
623 		return;
624 	}
625 
626 	ct.cc_sysid = 0;
627 	ct.cc_pid = 0;
628 	ct.cc_caller_id = nfs2_srv_caller_id;
629 	ct.cc_flags = CC_DONTBLOCK;
630 
631 	/*
632 	 * Enter the critical region before calling VOP_RWLOCK
633 	 * to avoid a deadlock with write requests.
634 	 */
635 	if (nbl_need_check(vp)) {
636 		nbl_start_crit(vp, RW_READER);
637 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
638 		    0, NULL)) {
639 			nbl_end_crit(vp);
640 			VN_RELE(vp);
641 			rr->rr_data = NULL;
642 			rr->rr_status = NFSERR_ACCES;
643 			return;
644 		}
645 		in_crit = 1;
646 	}
647 
648 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
649 
650 	/* check if a monitor detected a delegation conflict */
651 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
652 		VN_RELE(vp);
653 		/* mark as wouldblock so response is dropped */
654 		curthread->t_flag |= T_WOULDBLOCK;
655 
656 		rr->rr_data = NULL;
657 		return;
658 	}
659 
660 	va.va_mask = AT_ALL;
661 
662 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
663 
664 	if (error) {
665 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
666 		if (in_crit)
667 			nbl_end_crit(vp);
668 
669 		VN_RELE(vp);
670 		rr->rr_data = NULL;
671 		rr->rr_status = puterrno(error);
672 
673 		return;
674 	}
675 
676 	/*
677 	 * This is a kludge to allow reading of files created
678 	 * with no read permission.  The owner of the file
679 	 * is always allowed to read it.
680 	 */
681 	if (crgetuid(cr) != va.va_uid) {
682 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
683 
684 		if (error) {
685 			/*
686 			 * Exec is the same as read over the net because
687 			 * of demand loading.
688 			 */
689 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
690 		}
691 		if (error) {
692 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
693 			if (in_crit)
694 				nbl_end_crit(vp);
695 			VN_RELE(vp);
696 			rr->rr_data = NULL;
697 			rr->rr_status = puterrno(error);
698 
699 			return;
700 		}
701 	}
702 
703 	if (MANDLOCK(vp, va.va_mode)) {
704 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
705 		if (in_crit)
706 			nbl_end_crit(vp);
707 
708 		VN_RELE(vp);
709 		rr->rr_data = NULL;
710 		rr->rr_status = NFSERR_ACCES;
711 
712 		return;
713 	}
714 
715 	rr->rr_ok.rrok_wlist_len = 0;
716 	rr->rr_ok.rrok_wlist = NULL;
717 
718 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
719 		rr->rr_count = 0;
720 		rr->rr_data = NULL;
721 		/*
722 		 * In this case, status is NFS_OK, but there is no data
723 		 * to encode. So set rr_mp to NULL.
724 		 */
725 		rr->rr_mp = NULL;
726 		goto done;
727 	}
728 
729 	if (ra->ra_wlist) {
730 		mp = NULL;
731 		rr->rr_mp = NULL;
732 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
733 	} else {
734 		/*
735 		 * mp will contain the data to be sent out in the read reply.
736 		 * This will be freed after the reply has been sent out (by the
737 		 * driver).
738 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
739 		 * that the call to xdrmblk_putmblk() never fails.
740 		 */
741 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
742 		    &alloc_err);
743 		ASSERT(mp != NULL);
744 		ASSERT(alloc_err == 0);
745 
746 		rr->rr_mp = mp;
747 
748 		/*
749 		 * Set up io vector
750 		 */
751 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
752 		iov.iov_len = ra->ra_count;
753 	}
754 
755 	uio.uio_iov = &iov;
756 	uio.uio_iovcnt = 1;
757 	uio.uio_segflg = UIO_SYSSPACE;
758 	uio.uio_extflg = UIO_COPY_CACHED;
759 	uio.uio_loffset = (offset_t)ra->ra_offset;
760 	uio.uio_resid = ra->ra_count;
761 
762 	error = VOP_READ(vp, &uio, 0, cr, &ct);
763 
764 	if (error) {
765 		if (mp)
766 			freeb(mp);
767 
768 		/*
769 		 * check if a monitor detected a delegation conflict and
770 		 * mark as wouldblock so response is dropped
771 		 */
772 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
773 			curthread->t_flag |= T_WOULDBLOCK;
774 		else
775 			rr->rr_status = puterrno(error);
776 
777 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
778 		if (in_crit)
779 			nbl_end_crit(vp);
780 
781 		VN_RELE(vp);
782 		rr->rr_data = NULL;
783 
784 		return;
785 	}
786 
787 	/*
788 	 * Get attributes again so we can send the latest access
789 	 * time to the client side for his cache.
790 	 */
791 	va.va_mask = AT_ALL;
792 
793 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
794 
795 	if (error) {
796 		if (mp)
797 			freeb(mp);
798 
799 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
800 		if (in_crit)
801 			nbl_end_crit(vp);
802 
803 		VN_RELE(vp);
804 		rr->rr_data = NULL;
805 		rr->rr_status = puterrno(error);
806 
807 		return;
808 	}
809 
810 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
811 
812 	if (mp) {
813 		rr->rr_data = (char *)mp->b_datap->db_base;
814 	} else {
815 		if (ra->ra_wlist) {
816 			rr->rr_data = (caddr_t)iov.iov_base;
817 			if (!rdma_setup_read_data2(ra, rr)) {
818 				rr->rr_data = NULL;
819 				rr->rr_status = puterrno(NFSERR_INVAL);
820 			}
821 		}
822 	}
823 done:
824 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
825 	if (in_crit)
826 		nbl_end_crit(vp);
827 
828 	acl_perm(vp, exi, &va, cr);
829 
830 	/* check for overflows */
831 	error = vattr_to_nattr(&va, &rr->rr_attr);
832 
833 	VN_RELE(vp);
834 
835 	rr->rr_status = puterrno(error);
836 }
837 
838 /*
839  * Free data allocated by rfs_read
840  */
841 void
842 rfs_rdfree(struct nfsrdresult *rr)
843 {
844 	mblk_t *mp;
845 
846 	if (rr->rr_status == NFS_OK) {
847 		mp = rr->rr_mp;
848 		if (mp != NULL)
849 			freeb(mp);
850 	}
851 }
852 
853 void *
854 rfs_read_getfh(struct nfsreadargs *ra)
855 {
856 	return (&ra->ra_fhandle);
857 }
858 
859 #define	MAX_IOVECS	12
860 
861 #ifdef DEBUG
862 static int rfs_write_sync_hits = 0;
863 static int rfs_write_sync_misses = 0;
864 #endif
865 
866 /*
867  * Write data to file.
868  * Returns attributes of a file after writing some data to it.
869  *
870  * Any changes made here, especially in error handling might have
871  * to also be done in rfs_write (which clusters write requests).
872  */
873 void
874 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
875 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
876 {
877 	int error;
878 	vnode_t *vp;
879 	rlim64_t rlimit;
880 	struct vattr va;
881 	struct uio uio;
882 	struct iovec iov[MAX_IOVECS];
883 	mblk_t *m;
884 	struct iovec *iovp;
885 	int iovcnt;
886 	cred_t *savecred;
887 	int in_crit = 0;
888 	caller_context_t ct;
889 
890 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
891 	if (vp == NULL) {
892 		ns->ns_status = NFSERR_STALE;
893 		return;
894 	}
895 
896 	if (rdonly(exi, req)) {
897 		VN_RELE(vp);
898 		ns->ns_status = NFSERR_ROFS;
899 		return;
900 	}
901 
902 	if (vp->v_type != VREG) {
903 		VN_RELE(vp);
904 		ns->ns_status = NFSERR_ISDIR;
905 		return;
906 	}
907 
908 	ct.cc_sysid = 0;
909 	ct.cc_pid = 0;
910 	ct.cc_caller_id = nfs2_srv_caller_id;
911 	ct.cc_flags = CC_DONTBLOCK;
912 
913 	va.va_mask = AT_UID|AT_MODE;
914 
915 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
916 
917 	if (error) {
918 		VN_RELE(vp);
919 		ns->ns_status = puterrno(error);
920 
921 		return;
922 	}
923 
924 	if (crgetuid(cr) != va.va_uid) {
925 		/*
926 		 * This is a kludge to allow writes of files created
927 		 * with read only permission.  The owner of the file
928 		 * is always allowed to write it.
929 		 */
930 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
931 
932 		if (error) {
933 			VN_RELE(vp);
934 			ns->ns_status = puterrno(error);
935 			return;
936 		}
937 	}
938 
939 	/*
940 	 * Can't access a mandatory lock file.  This might cause
941 	 * the NFS service thread to block forever waiting for a
942 	 * lock to be released that will never be released.
943 	 */
944 	if (MANDLOCK(vp, va.va_mode)) {
945 		VN_RELE(vp);
946 		ns->ns_status = NFSERR_ACCES;
947 		return;
948 	}
949 
950 	/*
951 	 * We have to enter the critical region before calling VOP_RWLOCK
952 	 * to avoid a deadlock with ufs.
953 	 */
954 	if (nbl_need_check(vp)) {
955 		nbl_start_crit(vp, RW_READER);
956 		in_crit = 1;
957 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
958 		    wa->wa_count, 0, NULL)) {
959 			error = EACCES;
960 			goto out;
961 		}
962 	}
963 
964 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
965 
966 	/* check if a monitor detected a delegation conflict */
967 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
968 		VN_RELE(vp);
969 		/* mark as wouldblock so response is dropped */
970 		curthread->t_flag |= T_WOULDBLOCK;
971 		return;
972 	}
973 
974 	if (wa->wa_data || wa->wa_rlist) {
975 		/* Do the RDMA thing if necessary */
976 		if (wa->wa_rlist) {
977 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
978 			iov[0].iov_len = wa->wa_count;
979 		} else  {
980 			iov[0].iov_base = wa->wa_data;
981 			iov[0].iov_len = wa->wa_count;
982 		}
983 		uio.uio_iov = iov;
984 		uio.uio_iovcnt = 1;
985 		uio.uio_segflg = UIO_SYSSPACE;
986 		uio.uio_extflg = UIO_COPY_DEFAULT;
987 		uio.uio_loffset = (offset_t)wa->wa_offset;
988 		uio.uio_resid = wa->wa_count;
989 		/*
990 		 * The limit is checked on the client. We
991 		 * should allow any size writes here.
992 		 */
993 		uio.uio_llimit = curproc->p_fsz_ctl;
994 		rlimit = uio.uio_llimit - wa->wa_offset;
995 		if (rlimit < (rlim64_t)uio.uio_resid)
996 			uio.uio_resid = (uint_t)rlimit;
997 
998 		/*
999 		 * for now we assume no append mode
1000 		 */
1001 		/*
1002 		 * We're changing creds because VM may fault and we need
1003 		 * the cred of the current thread to be used if quota
1004 		 * checking is enabled.
1005 		 */
1006 		savecred = curthread->t_cred;
1007 		curthread->t_cred = cr;
1008 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1009 		curthread->t_cred = savecred;
1010 	} else {
1011 		iovcnt = 0;
1012 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1013 			iovcnt++;
1014 		if (iovcnt <= MAX_IOVECS) {
1015 #ifdef DEBUG
1016 			rfs_write_sync_hits++;
1017 #endif
1018 			iovp = iov;
1019 		} else {
1020 #ifdef DEBUG
1021 			rfs_write_sync_misses++;
1022 #endif
1023 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1024 		}
1025 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1026 		uio.uio_iov = iovp;
1027 		uio.uio_iovcnt = iovcnt;
1028 		uio.uio_segflg = UIO_SYSSPACE;
1029 		uio.uio_extflg = UIO_COPY_DEFAULT;
1030 		uio.uio_loffset = (offset_t)wa->wa_offset;
1031 		uio.uio_resid = wa->wa_count;
1032 		/*
1033 		 * The limit is checked on the client. We
1034 		 * should allow any size writes here.
1035 		 */
1036 		uio.uio_llimit = curproc->p_fsz_ctl;
1037 		rlimit = uio.uio_llimit - wa->wa_offset;
1038 		if (rlimit < (rlim64_t)uio.uio_resid)
1039 			uio.uio_resid = (uint_t)rlimit;
1040 
1041 		/*
1042 		 * For now we assume no append mode.
1043 		 */
1044 		/*
1045 		 * We're changing creds because VM may fault and we need
1046 		 * the cred of the current thread to be used if quota
1047 		 * checking is enabled.
1048 		 */
1049 		savecred = curthread->t_cred;
1050 		curthread->t_cred = cr;
1051 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1052 		curthread->t_cred = savecred;
1053 
1054 		if (iovp != iov)
1055 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1056 	}
1057 
1058 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1059 
1060 	if (!error) {
1061 		/*
1062 		 * Get attributes again so we send the latest mod
1063 		 * time to the client side for his cache.
1064 		 */
1065 		va.va_mask = AT_ALL;	/* now we want everything */
1066 
1067 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1068 
1069 		/* check for overflows */
1070 		if (!error) {
1071 			acl_perm(vp, exi, &va, cr);
1072 			error = vattr_to_nattr(&va, &ns->ns_attr);
1073 		}
1074 	}
1075 
1076 out:
1077 	if (in_crit)
1078 		nbl_end_crit(vp);
1079 	VN_RELE(vp);
1080 
1081 	/* check if a monitor detected a delegation conflict */
1082 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1083 		/* mark as wouldblock so response is dropped */
1084 		curthread->t_flag |= T_WOULDBLOCK;
1085 	else
1086 		ns->ns_status = puterrno(error);
1087 
1088 }
1089 
/*
 * One write request taking part in a write cluster.  rfs_write builds
 * one of these on its stack for every request it is called with.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the request */
	struct nfsattrstat *ns;		/* result; ns_status stays at */
					/* RFSWRITE_INITVAL until set */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the caller */
	kthread_t *thread;		/* thread that issued the request */
	struct rfs_async_write *list;	/* next request of the cluster, */
					/* ordered by ascending wa_offset */
};
1098 
/*
 * One write cluster: all pending write requests for a single file
 * handle.  Clusters are chained off rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* requests that joined the cluster */
					/* wait here for their status */
	struct rfs_async_write *list;	/* first request of the cluster */
	struct rfs_async_write_list *next;	/* next cluster */
};
1105 
1106 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1107 static kmutex_t rfs_async_write_lock;
1108 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1109 
1110 #define	MAXCLIOVECS	42
1111 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1112 
1113 #ifdef DEBUG
1114 static int rfs_write_hits = 0;
1115 static int rfs_write_misses = 0;
1116 #endif
1117 
1118 /*
1119  * Write data to file.
1120  * Returns attributes of a file after writing some data to it.
1121  */
1122 void
1123 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1124 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1125 {
1126 	int error;
1127 	vnode_t *vp;
1128 	rlim64_t rlimit;
1129 	struct vattr va;
1130 	struct uio uio;
1131 	struct rfs_async_write_list *lp;
1132 	struct rfs_async_write_list *nlp;
1133 	struct rfs_async_write *rp;
1134 	struct rfs_async_write *nrp;
1135 	struct rfs_async_write *trp;
1136 	struct rfs_async_write *lrp;
1137 	int data_written;
1138 	int iovcnt;
1139 	mblk_t *m;
1140 	struct iovec *iovp;
1141 	struct iovec *niovp;
1142 	struct iovec iov[MAXCLIOVECS];
1143 	int count;
1144 	int rcount;
1145 	uint_t off;
1146 	uint_t len;
1147 	struct rfs_async_write nrpsp;
1148 	struct rfs_async_write_list nlpsp;
1149 	ushort_t t_flag;
1150 	cred_t *savecred;
1151 	int in_crit = 0;
1152 	caller_context_t ct;
1153 
1154 	if (!rfs_write_async) {
1155 		rfs_write_sync(wa, ns, exi, req, cr);
1156 		return;
1157 	}
1158 
1159 	/*
1160 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1161 	 * is considered an OK.
1162 	 */
1163 	ns->ns_status = RFSWRITE_INITVAL;
1164 
1165 	nrp = &nrpsp;
1166 	nrp->wa = wa;
1167 	nrp->ns = ns;
1168 	nrp->req = req;
1169 	nrp->cr = cr;
1170 	nrp->thread = curthread;
1171 
1172 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1173 
1174 	/*
1175 	 * Look to see if there is already a cluster started
1176 	 * for this file.
1177 	 */
1178 	mutex_enter(&rfs_async_write_lock);
1179 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1180 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1181 		    sizeof (fhandle_t)) == 0)
1182 			break;
1183 	}
1184 
1185 	/*
1186 	 * If lp is non-NULL, then there is already a cluster
1187 	 * started.  We need to place ourselves in the cluster
1188 	 * list in the right place as determined by starting
1189 	 * offset.  Conflicts with non-blocking mandatory locked
1190 	 * regions will be checked when the cluster is processed.
1191 	 */
1192 	if (lp != NULL) {
1193 		rp = lp->list;
1194 		trp = NULL;
1195 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1196 			trp = rp;
1197 			rp = rp->list;
1198 		}
1199 		nrp->list = rp;
1200 		if (trp == NULL)
1201 			lp->list = nrp;
1202 		else
1203 			trp->list = nrp;
1204 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1205 			cv_wait(&lp->cv, &rfs_async_write_lock);
1206 		mutex_exit(&rfs_async_write_lock);
1207 
1208 		return;
1209 	}
1210 
1211 	/*
1212 	 * No cluster started yet, start one and add ourselves
1213 	 * to the list of clusters.
1214 	 */
1215 	nrp->list = NULL;
1216 
1217 	nlp = &nlpsp;
1218 	nlp->fhp = &wa->wa_fhandle;
1219 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1220 	nlp->list = nrp;
1221 	nlp->next = NULL;
1222 
1223 	if (rfs_async_write_head == NULL) {
1224 		rfs_async_write_head = nlp;
1225 	} else {
1226 		lp = rfs_async_write_head;
1227 		while (lp->next != NULL)
1228 			lp = lp->next;
1229 		lp->next = nlp;
1230 	}
1231 	mutex_exit(&rfs_async_write_lock);
1232 
1233 	/*
1234 	 * Convert the file handle common to all of the requests
1235 	 * in this cluster to a vnode.
1236 	 */
1237 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1238 	if (vp == NULL) {
1239 		mutex_enter(&rfs_async_write_lock);
1240 		if (rfs_async_write_head == nlp)
1241 			rfs_async_write_head = nlp->next;
1242 		else {
1243 			lp = rfs_async_write_head;
1244 			while (lp->next != nlp)
1245 				lp = lp->next;
1246 			lp->next = nlp->next;
1247 		}
1248 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1249 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1250 			rp->ns->ns_status = NFSERR_STALE;
1251 			rp->thread->t_flag |= t_flag;
1252 		}
1253 		cv_broadcast(&nlp->cv);
1254 		mutex_exit(&rfs_async_write_lock);
1255 
1256 		return;
1257 	}
1258 
1259 	/*
1260 	 * Can only write regular files.  Attempts to write any
1261 	 * other file types fail with EISDIR.
1262 	 */
1263 	if (vp->v_type != VREG) {
1264 		VN_RELE(vp);
1265 		mutex_enter(&rfs_async_write_lock);
1266 		if (rfs_async_write_head == nlp)
1267 			rfs_async_write_head = nlp->next;
1268 		else {
1269 			lp = rfs_async_write_head;
1270 			while (lp->next != nlp)
1271 				lp = lp->next;
1272 			lp->next = nlp->next;
1273 		}
1274 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1275 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1276 			rp->ns->ns_status = NFSERR_ISDIR;
1277 			rp->thread->t_flag |= t_flag;
1278 		}
1279 		cv_broadcast(&nlp->cv);
1280 		mutex_exit(&rfs_async_write_lock);
1281 
1282 		return;
1283 	}
1284 
1285 	/*
1286 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1287 	 * deadlock with ufs.
1288 	 */
1289 	if (nbl_need_check(vp)) {
1290 		nbl_start_crit(vp, RW_READER);
1291 		in_crit = 1;
1292 	}
1293 
1294 	ct.cc_sysid = 0;
1295 	ct.cc_pid = 0;
1296 	ct.cc_caller_id = nfs2_srv_caller_id;
1297 	ct.cc_flags = CC_DONTBLOCK;
1298 
1299 	/*
1300 	 * Lock the file for writing.  This operation provides
1301 	 * the delay which allows clusters to grow.
1302 	 */
1303 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1304 
1305 	/* check if a monitor detected a delegation conflict */
1306 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1307 		if (in_crit)
1308 			nbl_end_crit(vp);
1309 		VN_RELE(vp);
1310 		/* mark as wouldblock so response is dropped */
1311 		curthread->t_flag |= T_WOULDBLOCK;
1312 		mutex_enter(&rfs_async_write_lock);
1313 		if (rfs_async_write_head == nlp)
1314 			rfs_async_write_head = nlp->next;
1315 		else {
1316 			lp = rfs_async_write_head;
1317 			while (lp->next != nlp)
1318 				lp = lp->next;
1319 			lp->next = nlp->next;
1320 		}
1321 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1322 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1323 				rp->ns->ns_status = puterrno(error);
1324 				rp->thread->t_flag |= T_WOULDBLOCK;
1325 			}
1326 		}
1327 		cv_broadcast(&nlp->cv);
1328 		mutex_exit(&rfs_async_write_lock);
1329 
1330 		return;
1331 	}
1332 
1333 	/*
1334 	 * Disconnect this cluster from the list of clusters.
1335 	 * The cluster that is being dealt with must be fixed
1336 	 * in size after this point, so there is no reason
1337 	 * to leave it on the list so that new requests can
1338 	 * find it.
1339 	 *
1340 	 * The algorithm is that the first write request will
1341 	 * create a cluster, convert the file handle to a
1342 	 * vnode pointer, and then lock the file for writing.
1343 	 * This request is not likely to be clustered with
1344 	 * any others.  However, the next request will create
1345 	 * a new cluster and be blocked in VOP_RWLOCK while
1346 	 * the first request is being processed.  This delay
1347 	 * will allow more requests to be clustered in this
1348 	 * second cluster.
1349 	 */
1350 	mutex_enter(&rfs_async_write_lock);
1351 	if (rfs_async_write_head == nlp)
1352 		rfs_async_write_head = nlp->next;
1353 	else {
1354 		lp = rfs_async_write_head;
1355 		while (lp->next != nlp)
1356 			lp = lp->next;
1357 		lp->next = nlp->next;
1358 	}
1359 	mutex_exit(&rfs_async_write_lock);
1360 
1361 	/*
1362 	 * Step through the list of requests in this cluster.
1363 	 * We need to check permissions to make sure that all
1364 	 * of the requests have sufficient permission to write
1365 	 * the file.  A cluster can be composed of requests
1366 	 * from different clients and different users on each
1367 	 * client.
1368 	 *
1369 	 * As a side effect, we also calculate the size of the
1370 	 * byte range that this cluster encompasses.
1371 	 */
1372 	rp = nlp->list;
1373 	off = rp->wa->wa_offset;
1374 	len = (uint_t)0;
1375 	do {
1376 		if (rdonly(exi, rp->req)) {
1377 			rp->ns->ns_status = NFSERR_ROFS;
1378 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1379 			rp->thread->t_flag |= t_flag;
1380 			continue;
1381 		}
1382 
1383 		va.va_mask = AT_UID|AT_MODE;
1384 
1385 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1386 
1387 		if (!error) {
1388 			if (crgetuid(rp->cr) != va.va_uid) {
1389 				/*
1390 				 * This is a kludge to allow writes of files
1391 				 * created with read only permission.  The
1392 				 * owner of the file is always allowed to
1393 				 * write it.
1394 				 */
1395 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1396 			}
1397 			if (!error && MANDLOCK(vp, va.va_mode))
1398 				error = EACCES;
1399 		}
1400 
1401 		/*
1402 		 * Check for a conflict with a nbmand-locked region.
1403 		 */
1404 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1405 		    rp->wa->wa_count, 0, NULL)) {
1406 			error = EACCES;
1407 		}
1408 
1409 		if (error) {
1410 			rp->ns->ns_status = puterrno(error);
1411 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1412 			rp->thread->t_flag |= t_flag;
1413 			continue;
1414 		}
1415 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1416 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1417 	} while ((rp = rp->list) != NULL);
1418 
1419 	/*
1420 	 * Step through the cluster attempting to gather as many
1421 	 * requests which are contiguous as possible.  These
1422 	 * contiguous requests are handled via one call to VOP_WRITE
1423 	 * instead of different calls to VOP_WRITE.  We also keep
1424 	 * track of the fact that any data was written.
1425 	 */
1426 	rp = nlp->list;
1427 	data_written = 0;
1428 	do {
1429 		/*
1430 		 * Skip any requests which are already marked as having an
1431 		 * error.
1432 		 */
1433 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1434 			rp = rp->list;
1435 			continue;
1436 		}
1437 
1438 		/*
1439 		 * Count the number of iovec's which are required
1440 		 * to handle this set of requests.  One iovec is
1441 		 * needed for each data buffer, whether addressed
1442 		 * by wa_data or by the b_rptr pointers in the
1443 		 * mblk chains.
1444 		 */
1445 		iovcnt = 0;
1446 		lrp = rp;
1447 		for (;;) {
1448 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1449 				iovcnt++;
1450 			else {
1451 				m = lrp->wa->wa_mblk;
1452 				while (m != NULL) {
1453 					iovcnt++;
1454 					m = m->b_cont;
1455 				}
1456 			}
1457 			if (lrp->list == NULL ||
1458 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1459 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1460 			    lrp->list->wa->wa_offset) {
1461 				lrp = lrp->list;
1462 				break;
1463 			}
1464 			lrp = lrp->list;
1465 		}
1466 
1467 		if (iovcnt <= MAXCLIOVECS) {
1468 #ifdef DEBUG
1469 			rfs_write_hits++;
1470 #endif
1471 			niovp = iov;
1472 		} else {
1473 #ifdef DEBUG
1474 			rfs_write_misses++;
1475 #endif
1476 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1477 		}
1478 		/*
1479 		 * Put together the scatter/gather iovecs.
1480 		 */
1481 		iovp = niovp;
1482 		trp = rp;
1483 		count = 0;
1484 		do {
1485 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1486 				if (trp->wa->wa_rlist) {
1487 					iovp->iov_base =
1488 					    (char *)((trp->wa->wa_rlist)->
1489 					    u.c_daddr3);
1490 					iovp->iov_len = trp->wa->wa_count;
1491 				} else  {
1492 					iovp->iov_base = trp->wa->wa_data;
1493 					iovp->iov_len = trp->wa->wa_count;
1494 				}
1495 				iovp++;
1496 			} else {
1497 				m = trp->wa->wa_mblk;
1498 				rcount = trp->wa->wa_count;
1499 				while (m != NULL) {
1500 					iovp->iov_base = (caddr_t)m->b_rptr;
1501 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1502 					rcount -= iovp->iov_len;
1503 					if (rcount < 0)
1504 						iovp->iov_len += rcount;
1505 					iovp++;
1506 					if (rcount <= 0)
1507 						break;
1508 					m = m->b_cont;
1509 				}
1510 			}
1511 			count += trp->wa->wa_count;
1512 			trp = trp->list;
1513 		} while (trp != lrp);
1514 
1515 		uio.uio_iov = niovp;
1516 		uio.uio_iovcnt = iovcnt;
1517 		uio.uio_segflg = UIO_SYSSPACE;
1518 		uio.uio_extflg = UIO_COPY_DEFAULT;
1519 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1520 		uio.uio_resid = count;
1521 		/*
1522 		 * The limit is checked on the client. We
1523 		 * should allow any size writes here.
1524 		 */
1525 		uio.uio_llimit = curproc->p_fsz_ctl;
1526 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1527 		if (rlimit < (rlim64_t)uio.uio_resid)
1528 			uio.uio_resid = (uint_t)rlimit;
1529 
1530 		/*
1531 		 * For now we assume no append mode.
1532 		 */
1533 
1534 		/*
1535 		 * We're changing creds because VM may fault
1536 		 * and we need the cred of the current
1537 		 * thread to be used if quota * checking is
1538 		 * enabled.
1539 		 */
1540 		savecred = curthread->t_cred;
1541 		curthread->t_cred = cr;
1542 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1543 		curthread->t_cred = savecred;
1544 
1545 		/* check if a monitor detected a delegation conflict */
1546 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1547 			/* mark as wouldblock so response is dropped */
1548 			curthread->t_flag |= T_WOULDBLOCK;
1549 
1550 		if (niovp != iov)
1551 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1552 
1553 		if (!error) {
1554 			data_written = 1;
1555 			/*
1556 			 * Get attributes again so we send the latest mod
1557 			 * time to the client side for his cache.
1558 			 */
1559 			va.va_mask = AT_ALL;	/* now we want everything */
1560 
1561 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1562 
1563 			if (!error)
1564 				acl_perm(vp, exi, &va, rp->cr);
1565 		}
1566 
1567 		/*
1568 		 * Fill in the status responses for each request
1569 		 * which was just handled.  Also, copy the latest
1570 		 * attributes in to the attribute responses if
1571 		 * appropriate.
1572 		 */
1573 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1574 		do {
1575 			rp->thread->t_flag |= t_flag;
1576 			/* check for overflows */
1577 			if (!error) {
1578 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1579 			}
1580 			rp->ns->ns_status = puterrno(error);
1581 			rp = rp->list;
1582 		} while (rp != lrp);
1583 	} while (rp != NULL);
1584 
1585 	/*
1586 	 * If any data was written at all, then we need to flush
1587 	 * the data and metadata to stable storage.
1588 	 */
1589 	if (data_written) {
1590 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1591 
1592 		if (!error) {
1593 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1594 		}
1595 	}
1596 
1597 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1598 
1599 	if (in_crit)
1600 		nbl_end_crit(vp);
1601 	VN_RELE(vp);
1602 
1603 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1604 	mutex_enter(&rfs_async_write_lock);
1605 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1606 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1607 			rp->ns->ns_status = puterrno(error);
1608 			rp->thread->t_flag |= t_flag;
1609 		}
1610 	}
1611 	cv_broadcast(&nlp->cv);
1612 	mutex_exit(&rfs_async_write_lock);
1613 
1614 }
1615 
1616 void *
1617 rfs_write_getfh(struct nfswriteargs *wa)
1618 {
1619 	return (&wa->wa_fhandle);
1620 }
1621 
1622 /*
1623  * Create a file.
1624  * Creates a file with given attributes and returns those attributes
1625  * and an fhandle for the new file.
1626  */
1627 void
1628 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1629 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1630 {
1631 	int error;
1632 	int lookuperr;
1633 	int in_crit = 0;
1634 	struct vattr va;
1635 	vnode_t *vp;
1636 	vnode_t *realvp;
1637 	vnode_t *dvp;
1638 	char *name = args->ca_da.da_name;
1639 	vnode_t *tvp = NULL;
1640 	int mode;
1641 	int lookup_ok;
1642 	bool_t trunc;
1643 	struct sockaddr *ca;
1644 
1645 	/*
1646 	 * Disallow NULL paths
1647 	 */
1648 	if (name == NULL || *name == '\0') {
1649 		dr->dr_status = NFSERR_ACCES;
1650 		return;
1651 	}
1652 
1653 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1654 	if (dvp == NULL) {
1655 		dr->dr_status = NFSERR_STALE;
1656 		return;
1657 	}
1658 
1659 	error = sattr_to_vattr(args->ca_sa, &va);
1660 	if (error) {
1661 		dr->dr_status = puterrno(error);
1662 		return;
1663 	}
1664 
1665 	/*
1666 	 * Must specify the mode.
1667 	 */
1668 	if (!(va.va_mask & AT_MODE)) {
1669 		VN_RELE(dvp);
1670 		dr->dr_status = NFSERR_INVAL;
1671 		return;
1672 	}
1673 
1674 	/*
1675 	 * This is a completely gross hack to make mknod
1676 	 * work over the wire until we can wack the protocol
1677 	 */
1678 	if ((va.va_mode & IFMT) == IFCHR) {
1679 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1680 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1681 		else {
1682 			va.va_type = VCHR;
1683 			/*
1684 			 * uncompress the received dev_t
1685 			 * if the top half is zero indicating a request
1686 			 * from an `older style' OS.
1687 			 */
1688 			if ((va.va_size & 0xffff0000) == 0)
1689 				va.va_rdev = nfsv2_expdev(va.va_size);
1690 			else
1691 				va.va_rdev = (dev_t)va.va_size;
1692 		}
1693 		va.va_mask &= ~AT_SIZE;
1694 	} else if ((va.va_mode & IFMT) == IFBLK) {
1695 		va.va_type = VBLK;
1696 		/*
1697 		 * uncompress the received dev_t
1698 		 * if the top half is zero indicating a request
1699 		 * from an `older style' OS.
1700 		 */
1701 		if ((va.va_size & 0xffff0000) == 0)
1702 			va.va_rdev = nfsv2_expdev(va.va_size);
1703 		else
1704 			va.va_rdev = (dev_t)va.va_size;
1705 		va.va_mask &= ~AT_SIZE;
1706 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1707 		va.va_type = VSOCK;
1708 	} else {
1709 		va.va_type = VREG;
1710 	}
1711 	va.va_mode &= ~IFMT;
1712 	va.va_mask |= AT_TYPE;
1713 
1714 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1715 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1716 	    MAXPATHLEN);
1717 	if (name == NULL) {
1718 		dr->dr_status = puterrno(EINVAL);
1719 		return;
1720 	}
1721 
1722 	/*
1723 	 * Why was the choice made to use VWRITE as the mode to the
1724 	 * call to VOP_CREATE ? This results in a bug.  When a client
1725 	 * opens a file that already exists and is RDONLY, the second
1726 	 * open fails with an EACESS because of the mode.
1727 	 * bug ID 1054648.
1728 	 */
1729 	lookup_ok = 0;
1730 	mode = VWRITE;
1731 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1732 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1733 		    NULL, NULL, NULL);
1734 		if (!error) {
1735 			struct vattr at;
1736 
1737 			lookup_ok = 1;
1738 			at.va_mask = AT_MODE;
1739 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1740 			if (!error)
1741 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1742 			VN_RELE(tvp);
1743 			tvp = NULL;
1744 		}
1745 	}
1746 
1747 	if (!lookup_ok) {
1748 		if (rdonly(exi, req)) {
1749 			error = EROFS;
1750 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1751 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1752 			error = EPERM;
1753 		} else {
1754 			error = 0;
1755 		}
1756 	}
1757 
1758 	/*
1759 	 * If file size is being modified on an already existing file
1760 	 * make sure that there are no conflicting non-blocking mandatory
1761 	 * locks in the region being manipulated. Return EACCES if there
1762 	 * are conflicting locks.
1763 	 */
1764 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1765 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1766 		    NULL, NULL, NULL);
1767 
1768 		if (!lookuperr &&
1769 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1770 			VN_RELE(tvp);
1771 			curthread->t_flag |= T_WOULDBLOCK;
1772 			goto out;
1773 		}
1774 
1775 		if (!lookuperr && nbl_need_check(tvp)) {
1776 			/*
1777 			 * The file exists. Now check if it has any
1778 			 * conflicting non-blocking mandatory locks
1779 			 * in the region being changed.
1780 			 */
1781 			struct vattr bva;
1782 			u_offset_t offset;
1783 			ssize_t length;
1784 
1785 			nbl_start_crit(tvp, RW_READER);
1786 			in_crit = 1;
1787 
1788 			bva.va_mask = AT_SIZE;
1789 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1790 			if (!error) {
1791 				if (va.va_size < bva.va_size) {
1792 					offset = va.va_size;
1793 					length = bva.va_size - va.va_size;
1794 				} else {
1795 					offset = bva.va_size;
1796 					length = va.va_size - bva.va_size;
1797 				}
1798 				if (length) {
1799 					if (nbl_conflict(tvp, NBL_WRITE,
1800 					    offset, length, 0, NULL)) {
1801 						error = EACCES;
1802 					}
1803 				}
1804 			}
1805 			if (error) {
1806 				nbl_end_crit(tvp);
1807 				VN_RELE(tvp);
1808 				in_crit = 0;
1809 			}
1810 		} else if (tvp != NULL) {
1811 			VN_RELE(tvp);
1812 		}
1813 	}
1814 
1815 	if (!error) {
1816 		/*
1817 		 * If filesystem is shared with nosuid the remove any
1818 		 * setuid/setgid bits on create.
1819 		 */
1820 		if (va.va_type == VREG &&
1821 		    exi->exi_export.ex_flags & EX_NOSUID)
1822 			va.va_mode &= ~(VSUID | VSGID);
1823 
1824 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1825 		    NULL, NULL);
1826 
1827 		if (!error) {
1828 
1829 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1830 				trunc = TRUE;
1831 			else
1832 				trunc = FALSE;
1833 
1834 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1835 				VN_RELE(vp);
1836 				curthread->t_flag |= T_WOULDBLOCK;
1837 				goto out;
1838 			}
1839 			va.va_mask = AT_ALL;
1840 
1841 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1842 
1843 			/* check for overflows */
1844 			if (!error) {
1845 				acl_perm(vp, exi, &va, cr);
1846 				error = vattr_to_nattr(&va, &dr->dr_attr);
1847 				if (!error) {
1848 					error = makefh(&dr->dr_fhandle, vp,
1849 					    exi);
1850 				}
1851 			}
1852 			/*
1853 			 * Force modified metadata out to stable storage.
1854 			 *
1855 			 * if a underlying vp exists, pass it to VOP_FSYNC
1856 			 */
1857 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1858 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1859 			else
1860 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1861 			VN_RELE(vp);
1862 		}
1863 
1864 		if (in_crit) {
1865 			nbl_end_crit(tvp);
1866 			VN_RELE(tvp);
1867 		}
1868 	}
1869 
1870 	/*
1871 	 * Force modified data and metadata out to stable storage.
1872 	 */
1873 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1874 
1875 out:
1876 
1877 	VN_RELE(dvp);
1878 
1879 	dr->dr_status = puterrno(error);
1880 
1881 	if (name != args->ca_da.da_name)
1882 		kmem_free(name, MAXPATHLEN);
1883 }
1884 void *
1885 rfs_create_getfh(struct nfscreatargs *args)
1886 {
1887 	return (args->ca_da.da_fhandle);
1888 }
1889 
1890 /*
1891  * Remove a file.
1892  * Remove named file from parent directory.
1893  */
1894 void
1895 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1896 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1897 {
1898 	int error = 0;
1899 	vnode_t *vp;
1900 	vnode_t *targvp;
1901 	int in_crit = 0;
1902 
1903 	/*
1904 	 * Disallow NULL paths
1905 	 */
1906 	if (da->da_name == NULL || *da->da_name == '\0') {
1907 		*status = NFSERR_ACCES;
1908 		return;
1909 	}
1910 
1911 	vp = nfs_fhtovp(da->da_fhandle, exi);
1912 	if (vp == NULL) {
1913 		*status = NFSERR_STALE;
1914 		return;
1915 	}
1916 
1917 	if (rdonly(exi, req)) {
1918 		VN_RELE(vp);
1919 		*status = NFSERR_ROFS;
1920 		return;
1921 	}
1922 
1923 	/*
1924 	 * Check for a conflict with a non-blocking mandatory share reservation.
1925 	 */
1926 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1927 	    NULL, cr, NULL, NULL, NULL);
1928 	if (error != 0) {
1929 		VN_RELE(vp);
1930 		*status = puterrno(error);
1931 		return;
1932 	}
1933 
1934 	/*
1935 	 * If the file is delegated to an v4 client, then initiate
1936 	 * recall and drop this request (by setting T_WOULDBLOCK).
1937 	 * The client will eventually re-transmit the request and
1938 	 * (hopefully), by then, the v4 client will have returned
1939 	 * the delegation.
1940 	 */
1941 
1942 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1943 		VN_RELE(vp);
1944 		VN_RELE(targvp);
1945 		curthread->t_flag |= T_WOULDBLOCK;
1946 		return;
1947 	}
1948 
1949 	if (nbl_need_check(targvp)) {
1950 		nbl_start_crit(targvp, RW_READER);
1951 		in_crit = 1;
1952 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1953 			error = EACCES;
1954 			goto out;
1955 		}
1956 	}
1957 
1958 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
1959 
1960 	/*
1961 	 * Force modified data and metadata out to stable storage.
1962 	 */
1963 	(void) VOP_FSYNC(vp, 0, cr, NULL);
1964 
1965 out:
1966 	if (in_crit)
1967 		nbl_end_crit(targvp);
1968 	VN_RELE(targvp);
1969 	VN_RELE(vp);
1970 
1971 	*status = puterrno(error);
1972 
1973 }
1974 
1975 void *
1976 rfs_remove_getfh(struct nfsdiropargs *da)
1977 {
1978 	return (da->da_fhandle);
1979 }
1980 
1981 /*
1982  * rename a file
1983  * Give a file (from) a new name (to).
1984  */
1985 void
1986 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
1987 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1988 {
1989 	int error = 0;
1990 	vnode_t *fromvp;
1991 	vnode_t *tovp;
1992 	struct exportinfo *to_exi;
1993 	fhandle_t *fh;
1994 	vnode_t *srcvp;
1995 	vnode_t *targvp;
1996 	int in_crit = 0;
1997 
1998 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
1999 	if (fromvp == NULL) {
2000 		*status = NFSERR_STALE;
2001 		return;
2002 	}
2003 
2004 	fh = args->rna_to.da_fhandle;
2005 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2006 	if (to_exi == NULL) {
2007 		VN_RELE(fromvp);
2008 		*status = NFSERR_ACCES;
2009 		return;
2010 	}
2011 	exi_rele(to_exi);
2012 
2013 	if (to_exi != exi) {
2014 		VN_RELE(fromvp);
2015 		*status = NFSERR_XDEV;
2016 		return;
2017 	}
2018 
2019 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2020 	if (tovp == NULL) {
2021 		VN_RELE(fromvp);
2022 		*status = NFSERR_STALE;
2023 		return;
2024 	}
2025 
2026 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2027 		VN_RELE(tovp);
2028 		VN_RELE(fromvp);
2029 		*status = NFSERR_NOTDIR;
2030 		return;
2031 	}
2032 
2033 	/*
2034 	 * Disallow NULL paths
2035 	 */
2036 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2037 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2038 		VN_RELE(tovp);
2039 		VN_RELE(fromvp);
2040 		*status = NFSERR_ACCES;
2041 		return;
2042 	}
2043 
2044 	if (rdonly(exi, req)) {
2045 		VN_RELE(tovp);
2046 		VN_RELE(fromvp);
2047 		*status = NFSERR_ROFS;
2048 		return;
2049 	}
2050 
2051 	/*
2052 	 * Check for a conflict with a non-blocking mandatory share reservation.
2053 	 */
2054 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2055 	    NULL, cr, NULL, NULL, NULL);
2056 	if (error != 0) {
2057 		VN_RELE(tovp);
2058 		VN_RELE(fromvp);
2059 		*status = puterrno(error);
2060 		return;
2061 	}
2062 
2063 	/* Check for delegations on the source file */
2064 
2065 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2066 		VN_RELE(tovp);
2067 		VN_RELE(fromvp);
2068 		VN_RELE(srcvp);
2069 		curthread->t_flag |= T_WOULDBLOCK;
2070 		return;
2071 	}
2072 
2073 	/* Check for delegation on the file being renamed over, if it exists */
2074 
2075 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2076 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2077 	    NULL, NULL, NULL) == 0) {
2078 
2079 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2080 			VN_RELE(tovp);
2081 			VN_RELE(fromvp);
2082 			VN_RELE(srcvp);
2083 			VN_RELE(targvp);
2084 			curthread->t_flag |= T_WOULDBLOCK;
2085 			return;
2086 		}
2087 		VN_RELE(targvp);
2088 	}
2089 
2090 
2091 	if (nbl_need_check(srcvp)) {
2092 		nbl_start_crit(srcvp, RW_READER);
2093 		in_crit = 1;
2094 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2095 			error = EACCES;
2096 			goto out;
2097 		}
2098 	}
2099 
2100 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2101 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2102 
2103 	if (error == 0)
2104 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2105 		    strlen(args->rna_to.da_name));
2106 
2107 	/*
2108 	 * Force modified data and metadata out to stable storage.
2109 	 */
2110 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2111 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2112 
2113 out:
2114 	if (in_crit)
2115 		nbl_end_crit(srcvp);
2116 	VN_RELE(srcvp);
2117 	VN_RELE(tovp);
2118 	VN_RELE(fromvp);
2119 
2120 	*status = puterrno(error);
2121 
2122 }
2123 void *
2124 rfs_rename_getfh(struct nfsrnmargs *args)
2125 {
2126 	return (args->rna_from.da_fhandle);
2127 }
2128 
2129 /*
2130  * Link to a file.
2131  * Create a file (to) which is a hard link to the given file (from).
2132  */
2133 void
2134 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2135 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2136 {
2137 	int error;
2138 	vnode_t *fromvp;
2139 	vnode_t *tovp;
2140 	struct exportinfo *to_exi;
2141 	fhandle_t *fh;
2142 
2143 	fromvp = nfs_fhtovp(args->la_from, exi);
2144 	if (fromvp == NULL) {
2145 		*status = NFSERR_STALE;
2146 		return;
2147 	}
2148 
2149 	fh = args->la_to.da_fhandle;
2150 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2151 	if (to_exi == NULL) {
2152 		VN_RELE(fromvp);
2153 		*status = NFSERR_ACCES;
2154 		return;
2155 	}
2156 	exi_rele(to_exi);
2157 
2158 	if (to_exi != exi) {
2159 		VN_RELE(fromvp);
2160 		*status = NFSERR_XDEV;
2161 		return;
2162 	}
2163 
2164 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2165 	if (tovp == NULL) {
2166 		VN_RELE(fromvp);
2167 		*status = NFSERR_STALE;
2168 		return;
2169 	}
2170 
2171 	if (tovp->v_type != VDIR) {
2172 		VN_RELE(tovp);
2173 		VN_RELE(fromvp);
2174 		*status = NFSERR_NOTDIR;
2175 		return;
2176 	}
2177 	/*
2178 	 * Disallow NULL paths
2179 	 */
2180 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2181 		VN_RELE(tovp);
2182 		VN_RELE(fromvp);
2183 		*status = NFSERR_ACCES;
2184 		return;
2185 	}
2186 
2187 	if (rdonly(exi, req)) {
2188 		VN_RELE(tovp);
2189 		VN_RELE(fromvp);
2190 		*status = NFSERR_ROFS;
2191 		return;
2192 	}
2193 
2194 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2195 
2196 	/*
2197 	 * Force modified data and metadata out to stable storage.
2198 	 */
2199 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2200 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2201 
2202 	VN_RELE(tovp);
2203 	VN_RELE(fromvp);
2204 
2205 	*status = puterrno(error);
2206 
2207 }
2208 void *
2209 rfs_link_getfh(struct nfslinkargs *args)
2210 {
2211 	return (args->la_from);
2212 }
2213 
2214 /*
2215  * Symbolicly link to a file.
2216  * Create a file (to) with the given attributes which is a symbolic link
2217  * to the given path name (to).
2218  */
2219 void
2220 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2221 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2222 {
2223 	int error;
2224 	struct vattr va;
2225 	vnode_t *vp;
2226 	vnode_t *svp;
2227 	int lerror;
2228 	struct sockaddr *ca;
2229 	char *name = NULL;
2230 
2231 	/*
2232 	 * Disallow NULL paths
2233 	 */
2234 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2235 		*status = NFSERR_ACCES;
2236 		return;
2237 	}
2238 
2239 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2240 	if (vp == NULL) {
2241 		*status = NFSERR_STALE;
2242 		return;
2243 	}
2244 
2245 	if (rdonly(exi, req)) {
2246 		VN_RELE(vp);
2247 		*status = NFSERR_ROFS;
2248 		return;
2249 	}
2250 
2251 	error = sattr_to_vattr(args->sla_sa, &va);
2252 	if (error) {
2253 		VN_RELE(vp);
2254 		*status = puterrno(error);
2255 		return;
2256 	}
2257 
2258 	if (!(va.va_mask & AT_MODE)) {
2259 		VN_RELE(vp);
2260 		*status = NFSERR_INVAL;
2261 		return;
2262 	}
2263 
2264 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2265 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2266 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2267 
2268 	if (name == NULL) {
2269 		*status = NFSERR_ACCES;
2270 		return;
2271 	}
2272 
2273 	va.va_type = VLNK;
2274 	va.va_mask |= AT_TYPE;
2275 
2276 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2277 
2278 	/*
2279 	 * Force new data and metadata out to stable storage.
2280 	 */
2281 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2282 	    NULL, cr, NULL, NULL, NULL);
2283 
2284 	if (!lerror) {
2285 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2286 		VN_RELE(svp);
2287 	}
2288 
2289 	/*
2290 	 * Force modified data and metadata out to stable storage.
2291 	 */
2292 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2293 
2294 	VN_RELE(vp);
2295 
2296 	*status = puterrno(error);
2297 	if (name != args->sla_tnm)
2298 		kmem_free(name, MAXPATHLEN);
2299 
2300 }
2301 void *
2302 rfs_symlink_getfh(struct nfsslargs *args)
2303 {
2304 	return (args->sla_from.da_fhandle);
2305 }
2306 
2307 /*
2308  * Make a directory.
2309  * Create a directory with the given name, parent directory, and attributes.
2310  * Returns a file handle and attributes for the new directory.
2311  */
2312 void
2313 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2314 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2315 {
2316 	int error;
2317 	struct vattr va;
2318 	vnode_t *dvp = NULL;
2319 	vnode_t *vp;
2320 	char *name = args->ca_da.da_name;
2321 
2322 	/*
2323 	 * Disallow NULL paths
2324 	 */
2325 	if (name == NULL || *name == '\0') {
2326 		dr->dr_status = NFSERR_ACCES;
2327 		return;
2328 	}
2329 
2330 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2331 	if (vp == NULL) {
2332 		dr->dr_status = NFSERR_STALE;
2333 		return;
2334 	}
2335 
2336 	if (rdonly(exi, req)) {
2337 		VN_RELE(vp);
2338 		dr->dr_status = NFSERR_ROFS;
2339 		return;
2340 	}
2341 
2342 	error = sattr_to_vattr(args->ca_sa, &va);
2343 	if (error) {
2344 		VN_RELE(vp);
2345 		dr->dr_status = puterrno(error);
2346 		return;
2347 	}
2348 
2349 	if (!(va.va_mask & AT_MODE)) {
2350 		VN_RELE(vp);
2351 		dr->dr_status = NFSERR_INVAL;
2352 		return;
2353 	}
2354 
2355 	va.va_type = VDIR;
2356 	va.va_mask |= AT_TYPE;
2357 
2358 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2359 
2360 	if (!error) {
2361 		/*
2362 		 * Attribtutes of the newly created directory should
2363 		 * be returned to the client.
2364 		 */
2365 		va.va_mask = AT_ALL; /* We want everything */
2366 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2367 
2368 		/* check for overflows */
2369 		if (!error) {
2370 			acl_perm(vp, exi, &va, cr);
2371 			error = vattr_to_nattr(&va, &dr->dr_attr);
2372 			if (!error) {
2373 				error = makefh(&dr->dr_fhandle, dvp, exi);
2374 			}
2375 		}
2376 		/*
2377 		 * Force new data and metadata out to stable storage.
2378 		 */
2379 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2380 		VN_RELE(dvp);
2381 	}
2382 
2383 	/*
2384 	 * Force modified data and metadata out to stable storage.
2385 	 */
2386 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2387 
2388 	VN_RELE(vp);
2389 
2390 	dr->dr_status = puterrno(error);
2391 
2392 }
2393 void *
2394 rfs_mkdir_getfh(struct nfscreatargs *args)
2395 {
2396 	return (args->ca_da.da_fhandle);
2397 }
2398 
2399 /*
2400  * Remove a directory.
2401  * Remove the given directory name from the given parent directory.
2402  */
2403 void
2404 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2405 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2406 {
2407 	int error;
2408 	vnode_t *vp;
2409 
2410 
2411 	/*
2412 	 * Disallow NULL paths
2413 	 */
2414 	if (da->da_name == NULL || *da->da_name == '\0') {
2415 		*status = NFSERR_ACCES;
2416 		return;
2417 	}
2418 
2419 	vp = nfs_fhtovp(da->da_fhandle, exi);
2420 	if (vp == NULL) {
2421 		*status = NFSERR_STALE;
2422 		return;
2423 	}
2424 
2425 	if (rdonly(exi, req)) {
2426 		VN_RELE(vp);
2427 		*status = NFSERR_ROFS;
2428 		return;
2429 	}
2430 
2431 	/*
2432 	 * VOP_RMDIR now takes a new third argument (the current
2433 	 * directory of the process).  That's because someone
2434 	 * wants to return EINVAL if one tries to remove ".".
2435 	 * Of course, NFS servers have no idea what their
2436 	 * clients' current directories are.  We fake it by
2437 	 * supplying a vnode known to exist and illegal to
2438 	 * remove.
2439 	 */
2440 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2441 
2442 	/*
2443 	 * Force modified data and metadata out to stable storage.
2444 	 */
2445 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2446 
2447 	VN_RELE(vp);
2448 
2449 	/*
2450 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2451 	 * if the directory is not empty.  A System V NFS server
2452 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2453 	 * over the wire.
2454 	 */
2455 	if (error == EEXIST)
2456 		*status = NFSERR_NOTEMPTY;
2457 	else
2458 		*status = puterrno(error);
2459 
2460 }
2461 void *
2462 rfs_rmdir_getfh(struct nfsdiropargs *da)
2463 {
2464 	return (da->da_fhandle);
2465 }
2466 
2467 /* ARGSUSED */
2468 void
2469 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2470 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2471 {
2472 	int error;
2473 	int iseof;
2474 	struct iovec iov;
2475 	struct uio uio;
2476 	vnode_t *vp;
2477 	char *ndata = NULL;
2478 	struct sockaddr *ca;
2479 	size_t nents;
2480 	int ret;
2481 
2482 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2483 	if (vp == NULL) {
2484 		rd->rd_entries = NULL;
2485 		rd->rd_status = NFSERR_STALE;
2486 		return;
2487 	}
2488 
2489 	if (vp->v_type != VDIR) {
2490 		VN_RELE(vp);
2491 		rd->rd_entries = NULL;
2492 		rd->rd_status = NFSERR_NOTDIR;
2493 		return;
2494 	}
2495 
2496 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2497 
2498 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2499 
2500 	if (error) {
2501 		rd->rd_entries = NULL;
2502 		goto bad;
2503 	}
2504 
2505 	if (rda->rda_count == 0) {
2506 		rd->rd_entries = NULL;
2507 		rd->rd_size = 0;
2508 		rd->rd_eof = FALSE;
2509 		goto bad;
2510 	}
2511 
2512 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2513 
2514 	/*
2515 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2516 	 */
2517 	rd->rd_bufsize = (uint_t)rda->rda_count;
2518 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2519 
2520 	/*
2521 	 * Set up io vector to read directory data
2522 	 */
2523 	iov.iov_base = (caddr_t)rd->rd_entries;
2524 	iov.iov_len = rda->rda_count;
2525 	uio.uio_iov = &iov;
2526 	uio.uio_iovcnt = 1;
2527 	uio.uio_segflg = UIO_SYSSPACE;
2528 	uio.uio_extflg = UIO_COPY_CACHED;
2529 	uio.uio_loffset = (offset_t)rda->rda_offset;
2530 	uio.uio_resid = rda->rda_count;
2531 
2532 	/*
2533 	 * read directory
2534 	 */
2535 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2536 
2537 	/*
2538 	 * Clean up
2539 	 */
2540 	if (!error) {
2541 		/*
2542 		 * set size and eof
2543 		 */
2544 		if (uio.uio_resid == rda->rda_count) {
2545 			rd->rd_size = 0;
2546 			rd->rd_eof = TRUE;
2547 		} else {
2548 			rd->rd_size = (uint32_t)(rda->rda_count -
2549 			    uio.uio_resid);
2550 			rd->rd_eof = iseof ? TRUE : FALSE;
2551 		}
2552 	}
2553 
2554 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2555 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2556 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2557 	    rda->rda_count, &ndata);
2558 
2559 	if (ret != 0) {
2560 		size_t dropbytes;
2561 		/*
2562 		 * We had to drop one or more entries in order to fit
2563 		 * during the character conversion.  We need to patch
2564 		 * up the size and eof info.
2565 		 */
2566 		if (rd->rd_eof)
2567 			rd->rd_eof = FALSE;
2568 		dropbytes = nfscmd_dropped_entrysize(
2569 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2570 		rd->rd_size -= dropbytes;
2571 	}
2572 	if (ndata == NULL) {
2573 		ndata = (char *)rd->rd_entries;
2574 	} else if (ndata != (char *)rd->rd_entries) {
2575 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2576 		rd->rd_entries = (void *)ndata;
2577 		rd->rd_bufsize = rda->rda_count;
2578 	}
2579 
2580 bad:
2581 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2582 
2583 #if 0 /* notyet */
2584 	/*
2585 	 * Don't do this.  It causes local disk writes when just
2586 	 * reading the file and the overhead is deemed larger
2587 	 * than the benefit.
2588 	 */
2589 	/*
2590 	 * Force modified metadata out to stable storage.
2591 	 */
2592 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2593 #endif
2594 
2595 	VN_RELE(vp);
2596 
2597 	rd->rd_status = puterrno(error);
2598 
2599 }
2600 void *
2601 rfs_readdir_getfh(struct nfsrddirargs *rda)
2602 {
2603 	return (&rda->rda_fh);
2604 }
2605 void
2606 rfs_rddirfree(struct nfsrddirres *rd)
2607 {
2608 	if (rd->rd_entries != NULL)
2609 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2610 }
2611 
2612 /* ARGSUSED */
2613 void
2614 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2615 	struct svc_req *req, cred_t *cr)
2616 {
2617 	int error;
2618 	struct statvfs64 sb;
2619 	vnode_t *vp;
2620 
2621 	vp = nfs_fhtovp(fh, exi);
2622 	if (vp == NULL) {
2623 		fs->fs_status = NFSERR_STALE;
2624 		return;
2625 	}
2626 
2627 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2628 
2629 	if (!error) {
2630 		fs->fs_tsize = nfstsize();
2631 		fs->fs_bsize = sb.f_frsize;
2632 		fs->fs_blocks = sb.f_blocks;
2633 		fs->fs_bfree = sb.f_bfree;
2634 		fs->fs_bavail = sb.f_bavail;
2635 	}
2636 
2637 	VN_RELE(vp);
2638 
2639 	fs->fs_status = puterrno(error);
2640 
2641 }
2642 void *
2643 rfs_statfs_getfh(fhandle_t *fh)
2644 {
2645 	return (fh);
2646 }
2647 
2648 static int
2649 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2650 {
2651 	vap->va_mask = 0;
2652 
2653 	/*
2654 	 * There was a sign extension bug in some VFS based systems
2655 	 * which stored the mode as a short.  When it would get
2656 	 * assigned to a u_long, no sign extension would occur.
2657 	 * It needed to, but this wasn't noticed because sa_mode
2658 	 * would then get assigned back to the short, thus ignoring
2659 	 * the upper 16 bits of sa_mode.
2660 	 *
2661 	 * To make this implementation work for both broken
2662 	 * clients and good clients, we check for both versions
2663 	 * of the mode.
2664 	 */
2665 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2666 	    sa->sa_mode != (uint32_t)-1) {
2667 		vap->va_mask |= AT_MODE;
2668 		vap->va_mode = sa->sa_mode;
2669 	}
2670 	if (sa->sa_uid != (uint32_t)-1) {
2671 		vap->va_mask |= AT_UID;
2672 		vap->va_uid = sa->sa_uid;
2673 	}
2674 	if (sa->sa_gid != (uint32_t)-1) {
2675 		vap->va_mask |= AT_GID;
2676 		vap->va_gid = sa->sa_gid;
2677 	}
2678 	if (sa->sa_size != (uint32_t)-1) {
2679 		vap->va_mask |= AT_SIZE;
2680 		vap->va_size = sa->sa_size;
2681 	}
2682 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2683 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2684 #ifndef _LP64
2685 		/* return error if time overflow */
2686 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2687 			return (EOVERFLOW);
2688 #endif
2689 		vap->va_mask |= AT_ATIME;
2690 		/*
2691 		 * nfs protocol defines times as unsigned so don't extend sign,
2692 		 * unless sysadmin set nfs_allow_preepoch_time.
2693 		 */
2694 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2695 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2696 	}
2697 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2698 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2699 #ifndef _LP64
2700 		/* return error if time overflow */
2701 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2702 			return (EOVERFLOW);
2703 #endif
2704 		vap->va_mask |= AT_MTIME;
2705 		/*
2706 		 * nfs protocol defines times as unsigned so don't extend sign,
2707 		 * unless sysadmin set nfs_allow_preepoch_time.
2708 		 */
2709 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2710 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2711 	}
2712 	return (0);
2713 }
2714 
2715 static enum nfsftype vt_to_nf[] = {
2716 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2717 };
2718 
2719 /*
2720  * check the following fields for overflow: nodeid, size, and time.
2721  * There could be a problem when converting 64-bit LP64 fields
2722  * into 32-bit ones.  Return an error if there is an overflow.
2723  */
2724 int
2725 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2726 {
2727 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2728 	na->na_type = vt_to_nf[vap->va_type];
2729 
2730 	if (vap->va_mode == (unsigned short) -1)
2731 		na->na_mode = (uint32_t)-1;
2732 	else
2733 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2734 
2735 	if (vap->va_uid == (unsigned short)(-1))
2736 		na->na_uid = (uint32_t)(-1);
2737 	else if (vap->va_uid == UID_NOBODY)
2738 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2739 	else
2740 		na->na_uid = vap->va_uid;
2741 
2742 	if (vap->va_gid == (unsigned short)(-1))
2743 		na->na_gid = (uint32_t)-1;
2744 	else if (vap->va_gid == GID_NOBODY)
2745 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2746 	else
2747 		na->na_gid = vap->va_gid;
2748 
2749 	/*
2750 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2751 	 * vattr, but are bigger than 32 bit values supported?
2752 	 */
2753 	na->na_fsid = vap->va_fsid;
2754 
2755 	na->na_nodeid = vap->va_nodeid;
2756 
2757 	/*
2758 	 * Check to make sure that the nodeid is representable over the
2759 	 * wire without losing bits.
2760 	 */
2761 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2762 		return (EFBIG);
2763 	na->na_nlink = vap->va_nlink;
2764 
2765 	/*
2766 	 * Check for big files here, instead of at the caller.  See
2767 	 * comments in cstat for large special file explanation.
2768 	 */
2769 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2770 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2771 			return (EFBIG);
2772 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2773 			/* UNKNOWN_SIZE | OVERFLOW */
2774 			na->na_size = MAXOFF32_T;
2775 		} else
2776 			na->na_size = vap->va_size;
2777 	} else
2778 		na->na_size = vap->va_size;
2779 
2780 	/*
2781 	 * If the vnode times overflow the 32-bit times that NFS2
2782 	 * uses on the wire then return an error.
2783 	 */
2784 	if (!NFS_VAP_TIME_OK(vap)) {
2785 		return (EOVERFLOW);
2786 	}
2787 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2788 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2789 
2790 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2791 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2792 
2793 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2794 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2795 
2796 	/*
2797 	 * If the dev_t will fit into 16 bits then compress
2798 	 * it, otherwise leave it alone. See comments in
2799 	 * nfs_client.c.
2800 	 */
2801 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2802 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2803 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2804 	else
2805 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2806 
2807 	na->na_blocks = vap->va_nblocks;
2808 	na->na_blocksize = vap->va_blksize;
2809 
2810 	/*
2811 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2812 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2813 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2814 	 *
2815 	 * BUYER BEWARE:
2816 	 *  If you are porting the NFS to a non-Sun server, you probably
2817 	 *  don't want to include the following block of code.  The
2818 	 *  over-the-wire special file types will be changing with the
2819 	 *  NFS Protocol Revision.
2820 	 */
2821 	if (vap->va_type == VFIFO)
2822 		NA_SETFIFO(na);
2823 	return (0);
2824 }
2825 
2826 /*
2827  * acl v2 support: returns approximate permission.
2828  *	default: returns minimal permission (more restrictive)
2829  *	aclok: returns maximal permission (less restrictive)
2830  *	This routine changes the permissions that are alaredy in *va.
2831  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2832  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2833  */
2834 static void
2835 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2836 {
2837 	vsecattr_t	vsa;
2838 	int		aclcnt;
2839 	aclent_t	*aclentp;
2840 	mode_t		mask_perm;
2841 	mode_t		grp_perm;
2842 	mode_t		other_perm;
2843 	mode_t		other_orig;
2844 	int		error;
2845 
2846 	/* dont care default acl */
2847 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2848 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2849 
2850 	if (!error) {
2851 		aclcnt = vsa.vsa_aclcnt;
2852 		if (aclcnt > MIN_ACL_ENTRIES) {
2853 			/* non-trivial ACL */
2854 			aclentp = vsa.vsa_aclentp;
2855 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2856 				/* maximal permissions */
2857 				grp_perm = 0;
2858 				other_perm = 0;
2859 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2860 					switch (aclentp->a_type) {
2861 					case USER_OBJ:
2862 						break;
2863 					case USER:
2864 						grp_perm |=
2865 						    aclentp->a_perm << 3;
2866 						other_perm |= aclentp->a_perm;
2867 						break;
2868 					case GROUP_OBJ:
2869 						grp_perm |=
2870 						    aclentp->a_perm << 3;
2871 						break;
2872 					case GROUP:
2873 						other_perm |= aclentp->a_perm;
2874 						break;
2875 					case OTHER_OBJ:
2876 						other_orig = aclentp->a_perm;
2877 						break;
2878 					case CLASS_OBJ:
2879 						mask_perm = aclentp->a_perm;
2880 						break;
2881 					default:
2882 						break;
2883 					}
2884 				}
2885 				grp_perm &= mask_perm << 3;
2886 				other_perm &= mask_perm;
2887 				other_perm |= other_orig;
2888 
2889 			} else {
2890 				/* minimal permissions */
2891 				grp_perm = 070;
2892 				other_perm = 07;
2893 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2894 					switch (aclentp->a_type) {
2895 					case USER_OBJ:
2896 						break;
2897 					case USER:
2898 					case CLASS_OBJ:
2899 						grp_perm &=
2900 						    aclentp->a_perm << 3;
2901 						other_perm &=
2902 						    aclentp->a_perm;
2903 						break;
2904 					case GROUP_OBJ:
2905 						grp_perm &=
2906 						    aclentp->a_perm << 3;
2907 						break;
2908 					case GROUP:
2909 						other_perm &=
2910 						    aclentp->a_perm;
2911 						break;
2912 					case OTHER_OBJ:
2913 						other_perm &=
2914 						    aclentp->a_perm;
2915 						break;
2916 					default:
2917 						break;
2918 					}
2919 				}
2920 			}
2921 			/* copy to va */
2922 			va->va_mode &= ~077;
2923 			va->va_mode |= grp_perm | other_perm;
2924 		}
2925 		if (vsa.vsa_aclcnt)
2926 			kmem_free(vsa.vsa_aclentp,
2927 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2928 	}
2929 }
2930 
2931 void
2932 rfs_srvrinit(void)
2933 {
2934 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2935 	nfs2_srv_caller_id = fs_new_caller_id();
2936 }
2937 
2938 void
2939 rfs_srvrfini(void)
2940 {
2941 	mutex_destroy(&rfs_async_write_lock);
2942 }
2943 
2944 static int
2945 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2946 {
2947 	struct clist	*wcl;
2948 	int		data_len, avail_len, num;
2949 	uint32_t	count = rr->rr_count;
2950 
2951 	data_len = num = avail_len = 0;
2952 
2953 	wcl = ra->ra_wlist;
2954 	while (wcl != NULL) {
2955 		if (wcl->c_dmemhandle.mrc_rmr == 0)
2956 			break;
2957 
2958 		avail_len += wcl->c_len;
2959 		if (wcl->c_len < count) {
2960 			data_len += wcl->c_len;
2961 		} else {
2962 			/* Can make the rest chunks all 0-len */
2963 			data_len += count;
2964 			wcl->c_len = count;
2965 		}
2966 		count -= wcl->c_len;
2967 		num ++;
2968 		wcl = wcl->c_next;
2969 	}
2970 
2971 	/*
2972 	 * MUST fail if there are still more data
2973 	 */
2974 	if (count > 0) {
2975 		DTRACE_PROBE2(nfss__e__read__wlist__fail,
2976 		    int, data_len, int, count);
2977 		return (FALSE);
2978 	}
2979 
2980 	wcl = ra->ra_wlist;
2981 	rr->rr_count = data_len;
2982 	rr->rr_ok.rrok_wlist_len = data_len;
2983 	rr->rr_ok.rrok_wlist = wcl;
2984 
2985 	return (TRUE);
2986 }
2987