xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs_srv.c (revision dcda19f50b2b80bfc622fff718ac04fb0e1cb670)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
/*
 * Forward declarations of file-local helpers used by the service routines
 * below.  Their definitions are not in this part of the file.
 *
 * sattr_to_vattr() is called with the settable attributes from a request
 * and a vattr to fill in; callers treat a non-zero return as an errno.
 * acl_perm() is called on freshly fetched attributes just before they are
 * converted to the over-the-wire form with vattr_to_nattr().
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);
80 
/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 *
 * IFMT is the mask covering the file type bits; IFCHR, IFBLK and IFSOCK
 * are individual type values that all lie inside that mask.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */
89 
/*
 * Caller id that the service routines in this file store in the
 * cc_caller_id field of the caller_context_t they hand to VOP calls.
 * It is not assigned anywhere in this part of the file.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		acl_perm(vp, exi, &va, cr);
121 		error = vattr_to_nattr(&va, &ns->ns_attr);
122 	}
123 
124 	VN_RELE(vp);
125 
126 	ns->ns_status = puterrno(error);
127 }
128 void *
129 rfs_getattr_getfh(fhandle_t *fhp)
130 {
131 	return (fhp);
132 }
133 
134 /*
135  * Set file attributes.
136  * Sets the attributes of the file with the given fhandle.  Returns
137  * the new attributes.
138  */
139 void
140 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
141 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
142 {
143 	int error;
144 	int flag;
145 	int in_crit = 0;
146 	vnode_t *vp;
147 	struct vattr va;
148 	struct vattr bva;
149 	struct flock64 bf;
150 	caller_context_t ct;
151 
152 
153 	vp = nfs_fhtovp(&args->saa_fh, exi);
154 	if (vp == NULL) {
155 		ns->ns_status = NFSERR_STALE;
156 		return;
157 	}
158 
159 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
160 		VN_RELE(vp);
161 		ns->ns_status = NFSERR_ROFS;
162 		return;
163 	}
164 
165 	error = sattr_to_vattr(&args->saa_sa, &va);
166 	if (error) {
167 		VN_RELE(vp);
168 		ns->ns_status = puterrno(error);
169 		return;
170 	}
171 
172 	/*
173 	 * If the client is requesting a change to the mtime,
174 	 * but the nanosecond field is set to 1 billion, then
175 	 * this is a flag to the server that it should set the
176 	 * atime and mtime fields to the server's current time.
177 	 * The 1 billion number actually came from the client
178 	 * as 1 million, but the units in the over the wire
179 	 * request are microseconds instead of nanoseconds.
180 	 *
181 	 * This is an overload of the protocol and should be
182 	 * documented in the NFS Version 2 protocol specification.
183 	 */
184 	if (va.va_mask & AT_MTIME) {
185 		if (va.va_mtime.tv_nsec == 1000000000) {
186 			gethrestime(&va.va_mtime);
187 			va.va_atime = va.va_mtime;
188 			va.va_mask |= AT_ATIME;
189 			flag = 0;
190 		} else
191 			flag = ATTR_UTIME;
192 	} else
193 		flag = 0;
194 
195 	/*
196 	 * If the filesystem is exported with nosuid, then mask off
197 	 * the setuid and setgid bits.
198 	 */
199 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
200 	    (exi->exi_export.ex_flags & EX_NOSUID))
201 		va.va_mode &= ~(VSUID | VSGID);
202 
203 	ct.cc_sysid = 0;
204 	ct.cc_pid = 0;
205 	ct.cc_caller_id = nfs2_srv_caller_id;
206 	ct.cc_flags = CC_DONTBLOCK;
207 
208 	/*
209 	 * We need to specially handle size changes because it is
210 	 * possible for the client to create a file with modes
211 	 * which indicate read-only, but with the file opened for
212 	 * writing.  If the client then tries to set the size of
213 	 * the file, then the normal access checking done in
214 	 * VOP_SETATTR would prevent the client from doing so,
215 	 * although it should be legal for it to do so.  To get
216 	 * around this, we do the access checking for ourselves
217 	 * and then use VOP_SPACE which doesn't do the access
218 	 * checking which VOP_SETATTR does. VOP_SPACE can only
219 	 * operate on VREG files, let VOP_SETATTR handle the other
220 	 * extremely rare cases.
221 	 * Also the client should not be allowed to change the
222 	 * size of the file if there is a conflicting non-blocking
223 	 * mandatory lock in the region of change.
224 	 */
225 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
226 		if (nbl_need_check(vp)) {
227 			nbl_start_crit(vp, RW_READER);
228 			in_crit = 1;
229 		}
230 
231 		bva.va_mask = AT_UID | AT_SIZE;
232 
233 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
234 
235 		if (error) {
236 			if (in_crit)
237 				nbl_end_crit(vp);
238 			VN_RELE(vp);
239 			ns->ns_status = puterrno(error);
240 			return;
241 		}
242 
243 		if (in_crit) {
244 			u_offset_t offset;
245 			ssize_t length;
246 
247 			if (va.va_size < bva.va_size) {
248 				offset = va.va_size;
249 				length = bva.va_size - va.va_size;
250 			} else {
251 				offset = bva.va_size;
252 				length = va.va_size - bva.va_size;
253 			}
254 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
255 			    NULL)) {
256 				error = EACCES;
257 			}
258 		}
259 
260 		if (crgetuid(cr) == bva.va_uid && !error &&
261 		    va.va_size != bva.va_size) {
262 			va.va_mask &= ~AT_SIZE;
263 			bf.l_type = F_WRLCK;
264 			bf.l_whence = 0;
265 			bf.l_start = (off64_t)va.va_size;
266 			bf.l_len = 0;
267 			bf.l_sysid = 0;
268 			bf.l_pid = 0;
269 
270 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
271 			    (offset_t)va.va_size, cr, &ct);
272 		}
273 		if (in_crit)
274 			nbl_end_crit(vp);
275 	} else
276 		error = 0;
277 
278 	/*
279 	 * Do the setattr.
280 	 */
281 	if (!error && va.va_mask) {
282 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
283 	}
284 
285 	/*
286 	 * check if the monitor on either vop_space or vop_setattr detected
287 	 * a delegation conflict and if so, mark the thread flag as
288 	 * wouldblock so that the response is dropped and the client will
289 	 * try again.
290 	 */
291 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
292 		VN_RELE(vp);
293 		curthread->t_flag |= T_WOULDBLOCK;
294 		return;
295 	}
296 
297 	if (!error) {
298 		va.va_mask = AT_ALL;	/* get everything */
299 
300 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
301 
302 		/* check for overflows */
303 		if (!error) {
304 			acl_perm(vp, exi, &va, cr);
305 			error = vattr_to_nattr(&va, &ns->ns_attr);
306 		}
307 	}
308 
309 	ct.cc_flags = 0;
310 
311 	/*
312 	 * Force modified metadata out to stable storage.
313 	 */
314 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
315 
316 	VN_RELE(vp);
317 
318 	ns->ns_status = puterrno(error);
319 }
320 void *
321 rfs_setattr_getfh(struct nfssaargs *args)
322 {
323 	return (&args->saa_fh);
324 }
325 
326 /*
327  * Directory lookup.
328  * Returns an fhandle and file attributes for file name in a directory.
329  */
330 /* ARGSUSED */
331 void
332 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
333 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
334 {
335 	int error;
336 	vnode_t *dvp;
337 	vnode_t *vp;
338 	struct vattr va;
339 	fhandle_t *fhp = da->da_fhandle;
340 	struct sec_ol sec = {0, 0};
341 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
342 	char *name;
343 	struct sockaddr *ca;
344 
345 	/*
346 	 * Trusted Extension doesn't support NFSv2. MOUNT
347 	 * will reject v2 clients. Need to prevent v2 client
348 	 * access via WebNFS here.
349 	 */
350 	if (is_system_labeled() && req->rq_vers == 2) {
351 		dr->dr_status = NFSERR_ACCES;
352 		return;
353 	}
354 
355 	/*
356 	 * Disallow NULL paths
357 	 */
358 	if (da->da_name == NULL || *da->da_name == '\0') {
359 		dr->dr_status = NFSERR_ACCES;
360 		return;
361 	}
362 
363 	/*
364 	 * Allow lookups from the root - the default
365 	 * location of the public filehandle.
366 	 */
367 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
368 		dvp = rootdir;
369 		VN_HOLD(dvp);
370 	} else {
371 		dvp = nfs_fhtovp(fhp, exi);
372 		if (dvp == NULL) {
373 			dr->dr_status = NFSERR_STALE;
374 			return;
375 		}
376 	}
377 
378 	/*
379 	 * Not allow lookup beyond root.
380 	 * If the filehandle matches a filehandle of the exi,
381 	 * then the ".." refers beyond the root of an exported filesystem.
382 	 */
383 	if (strcmp(da->da_name, "..") == 0 &&
384 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
385 		VN_RELE(dvp);
386 		dr->dr_status = NFSERR_NOENT;
387 		return;
388 	}
389 
390 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
391 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
392 	    MAXPATHLEN);
393 
394 	if (name == NULL) {
395 		dr->dr_status = NFSERR_ACCES;
396 		return;
397 	}
398 
399 	/*
400 	 * If the public filehandle is used then allow
401 	 * a multi-component lookup, i.e. evaluate
402 	 * a pathname and follow symbolic links if
403 	 * necessary.
404 	 *
405 	 * This may result in a vnode in another filesystem
406 	 * which is OK as long as the filesystem is exported.
407 	 */
408 	if (PUBLIC_FH2(fhp)) {
409 		publicfh_flag = TRUE;
410 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
411 		    &sec);
412 	} else {
413 		/*
414 		 * Do a normal single component lookup.
415 		 */
416 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
417 		    NULL, NULL, NULL);
418 	}
419 
420 	if (name != da->da_name)
421 		kmem_free(name, MAXPATHLEN);
422 
423 
424 	if (!error) {
425 		va.va_mask = AT_ALL;	/* we want everything */
426 
427 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
428 
429 		/* check for overflows */
430 		if (!error) {
431 			acl_perm(vp, exi, &va, cr);
432 			error = vattr_to_nattr(&va, &dr->dr_attr);
433 			if (!error) {
434 				if (sec.sec_flags & SEC_QUERY)
435 					error = makefh_ol(&dr->dr_fhandle, exi,
436 					    sec.sec_index);
437 				else {
438 					error = makefh(&dr->dr_fhandle, vp,
439 					    exi);
440 					if (!error && publicfh_flag &&
441 					    !chk_clnt_sec(exi, req))
442 						auth_weak = TRUE;
443 				}
444 			}
445 		}
446 		VN_RELE(vp);
447 	}
448 
449 	VN_RELE(dvp);
450 
451 	/*
452 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
453 	 * and have obtained a new exportinfo in exi which needs to be
454 	 * released. Note the the original exportinfo pointed to by exi
455 	 * will be released by the caller, comon_dispatch.
456 	 */
457 	if (publicfh_flag && exi != NULL)
458 		exi_rele(exi);
459 
460 	/*
461 	 * If it's public fh, no 0x81, and client's flavor is
462 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
463 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
464 	 */
465 	if (auth_weak)
466 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
467 	else
468 		dr->dr_status = puterrno(error);
469 }
470 void *
471 rfs_lookup_getfh(struct nfsdiropargs *da)
472 {
473 	return (da->da_fhandle);
474 }
475 
476 /*
477  * Read symbolic link.
478  * Returns the string in the symbolic link at the given fhandle.
479  */
480 /* ARGSUSED */
481 void
482 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
483 	struct svc_req *req, cred_t *cr)
484 {
485 	int error;
486 	struct iovec iov;
487 	struct uio uio;
488 	vnode_t *vp;
489 	struct vattr va;
490 	struct sockaddr *ca;
491 	char *name = NULL;
492 
493 	vp = nfs_fhtovp(fhp, exi);
494 	if (vp == NULL) {
495 		rl->rl_data = NULL;
496 		rl->rl_status = NFSERR_STALE;
497 		return;
498 	}
499 
500 	va.va_mask = AT_MODE;
501 
502 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
503 
504 	if (error) {
505 		VN_RELE(vp);
506 		rl->rl_data = NULL;
507 		rl->rl_status = puterrno(error);
508 		return;
509 	}
510 
511 	if (MANDLOCK(vp, va.va_mode)) {
512 		VN_RELE(vp);
513 		rl->rl_data = NULL;
514 		rl->rl_status = NFSERR_ACCES;
515 		return;
516 	}
517 
518 	/*
519 	 * XNFS and RFC1094 require us to return ENXIO if argument
520 	 * is not a link. BUGID 1138002.
521 	 */
522 	if (vp->v_type != VLNK) {
523 		VN_RELE(vp);
524 		rl->rl_data = NULL;
525 		rl->rl_status = NFSERR_NXIO;
526 		return;
527 	}
528 
529 	/*
530 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
531 	 */
532 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
533 
534 	/*
535 	 * Set up io vector to read sym link data
536 	 */
537 	iov.iov_base = rl->rl_data;
538 	iov.iov_len = NFS_MAXPATHLEN;
539 	uio.uio_iov = &iov;
540 	uio.uio_iovcnt = 1;
541 	uio.uio_segflg = UIO_SYSSPACE;
542 	uio.uio_extflg = UIO_COPY_CACHED;
543 	uio.uio_loffset = (offset_t)0;
544 	uio.uio_resid = NFS_MAXPATHLEN;
545 
546 	/*
547 	 * Do the readlink.
548 	 */
549 	error = VOP_READLINK(vp, &uio, cr, NULL);
550 
551 	VN_RELE(vp);
552 
553 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
554 	rl->rl_data[rl->rl_count] = '\0';
555 
556 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
557 	name = nfscmd_convname(ca, exi, rl->rl_data,
558 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
559 
560 	if (name != NULL && name != rl->rl_data) {
561 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
562 		rl->rl_data = name;
563 	}
564 
565 	/*
566 	 * XNFS and RFC1094 require us to return ENXIO if argument
567 	 * is not a link. UFS returns EINVAL if this is the case,
568 	 * so we do the mapping here. BUGID 1138002.
569 	 */
570 	if (error == EINVAL)
571 		rl->rl_status = NFSERR_NXIO;
572 	else
573 		rl->rl_status = puterrno(error);
574 
575 }
576 void *
577 rfs_readlink_getfh(fhandle_t *fhp)
578 {
579 	return (fhp);
580 }
581 /*
582  * Free data allocated by rfs_readlink
583  */
584 void
585 rfs_rlfree(struct nfsrdlnres *rl)
586 {
587 	if (rl->rl_data != NULL)
588 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
589 }
590 
/*
 * Forward declaration; the definition is not in this part of the file.
 * rfs_read() calls it after reading into an RDMA write chunk and treats
 * a zero return as failure.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
592 
593 /*
594  * Read data.
595  * Returns some data read from the file at the given fhandle.
596  */
597 /* ARGSUSED */
598 void
599 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
600 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
601 {
602 	vnode_t *vp;
603 	int error;
604 	struct vattr va;
605 	struct iovec iov;
606 	struct uio uio;
607 	mblk_t *mp;
608 	int alloc_err = 0;
609 	int in_crit = 0;
610 	caller_context_t ct;
611 
612 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
613 	if (vp == NULL) {
614 		rr->rr_data = NULL;
615 		rr->rr_status = NFSERR_STALE;
616 		return;
617 	}
618 
619 	if (vp->v_type != VREG) {
620 		VN_RELE(vp);
621 		rr->rr_data = NULL;
622 		rr->rr_status = NFSERR_ISDIR;
623 		return;
624 	}
625 
626 	ct.cc_sysid = 0;
627 	ct.cc_pid = 0;
628 	ct.cc_caller_id = nfs2_srv_caller_id;
629 	ct.cc_flags = CC_DONTBLOCK;
630 
631 	/*
632 	 * Enter the critical region before calling VOP_RWLOCK
633 	 * to avoid a deadlock with write requests.
634 	 */
635 	if (nbl_need_check(vp)) {
636 		nbl_start_crit(vp, RW_READER);
637 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
638 		    0, NULL)) {
639 			nbl_end_crit(vp);
640 			VN_RELE(vp);
641 			rr->rr_data = NULL;
642 			rr->rr_status = NFSERR_ACCES;
643 			return;
644 		}
645 		in_crit = 1;
646 	}
647 
648 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
649 
650 	/* check if a monitor detected a delegation conflict */
651 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
652 		VN_RELE(vp);
653 		/* mark as wouldblock so response is dropped */
654 		curthread->t_flag |= T_WOULDBLOCK;
655 
656 		rr->rr_data = NULL;
657 		return;
658 	}
659 
660 	va.va_mask = AT_ALL;
661 
662 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
663 
664 	if (error) {
665 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
666 		if (in_crit)
667 			nbl_end_crit(vp);
668 
669 		VN_RELE(vp);
670 		rr->rr_data = NULL;
671 		rr->rr_status = puterrno(error);
672 
673 		return;
674 	}
675 
676 	/*
677 	 * This is a kludge to allow reading of files created
678 	 * with no read permission.  The owner of the file
679 	 * is always allowed to read it.
680 	 */
681 	if (crgetuid(cr) != va.va_uid) {
682 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
683 
684 		if (error) {
685 			/*
686 			 * Exec is the same as read over the net because
687 			 * of demand loading.
688 			 */
689 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
690 		}
691 		if (error) {
692 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
693 			if (in_crit)
694 				nbl_end_crit(vp);
695 			VN_RELE(vp);
696 			rr->rr_data = NULL;
697 			rr->rr_status = puterrno(error);
698 
699 			return;
700 		}
701 	}
702 
703 	if (MANDLOCK(vp, va.va_mode)) {
704 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
705 		if (in_crit)
706 			nbl_end_crit(vp);
707 
708 		VN_RELE(vp);
709 		rr->rr_data = NULL;
710 		rr->rr_status = NFSERR_ACCES;
711 
712 		return;
713 	}
714 
715 	rr->rr_ok.rrok_wlist_len = 0;
716 	rr->rr_ok.rrok_wlist = NULL;
717 
718 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
719 		rr->rr_count = 0;
720 		rr->rr_data = NULL;
721 		/*
722 		 * In this case, status is NFS_OK, but there is no data
723 		 * to encode. So set rr_mp to NULL.
724 		 */
725 		rr->rr_mp = NULL;
726 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
727 		if (rr->rr_ok.rrok_wlist)
728 			clist_zero_len(rr->rr_ok.rrok_wlist);
729 		goto done;
730 	}
731 
732 	if (ra->ra_wlist) {
733 		mp = NULL;
734 		rr->rr_mp = NULL;
735 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
736 	} else {
737 		/*
738 		 * mp will contain the data to be sent out in the read reply.
739 		 * This will be freed after the reply has been sent out (by the
740 		 * driver).
741 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
742 		 * that the call to xdrmblk_putmblk() never fails.
743 		 */
744 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
745 		    &alloc_err);
746 		ASSERT(mp != NULL);
747 		ASSERT(alloc_err == 0);
748 
749 		rr->rr_mp = mp;
750 
751 		/*
752 		 * Set up io vector
753 		 */
754 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
755 		iov.iov_len = ra->ra_count;
756 	}
757 
758 	uio.uio_iov = &iov;
759 	uio.uio_iovcnt = 1;
760 	uio.uio_segflg = UIO_SYSSPACE;
761 	uio.uio_extflg = UIO_COPY_CACHED;
762 	uio.uio_loffset = (offset_t)ra->ra_offset;
763 	uio.uio_resid = ra->ra_count;
764 
765 	error = VOP_READ(vp, &uio, 0, cr, &ct);
766 
767 	if (error) {
768 		if (mp)
769 			freeb(mp);
770 
771 		/*
772 		 * check if a monitor detected a delegation conflict and
773 		 * mark as wouldblock so response is dropped
774 		 */
775 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
776 			curthread->t_flag |= T_WOULDBLOCK;
777 		else
778 			rr->rr_status = puterrno(error);
779 
780 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
781 		if (in_crit)
782 			nbl_end_crit(vp);
783 
784 		VN_RELE(vp);
785 		rr->rr_data = NULL;
786 
787 		return;
788 	}
789 
790 	/*
791 	 * Get attributes again so we can send the latest access
792 	 * time to the client side for his cache.
793 	 */
794 	va.va_mask = AT_ALL;
795 
796 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
797 
798 	if (error) {
799 		if (mp)
800 			freeb(mp);
801 
802 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
803 		if (in_crit)
804 			nbl_end_crit(vp);
805 
806 		VN_RELE(vp);
807 		rr->rr_data = NULL;
808 		rr->rr_status = puterrno(error);
809 
810 		return;
811 	}
812 
813 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
814 
815 	if (mp) {
816 		rr->rr_data = (char *)mp->b_datap->db_base;
817 	} else {
818 		if (ra->ra_wlist) {
819 			rr->rr_data = (caddr_t)iov.iov_base;
820 			if (!rdma_setup_read_data2(ra, rr)) {
821 				rr->rr_data = NULL;
822 				rr->rr_status = puterrno(NFSERR_INVAL);
823 			}
824 		}
825 	}
826 done:
827 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
828 	if (in_crit)
829 		nbl_end_crit(vp);
830 
831 	acl_perm(vp, exi, &va, cr);
832 
833 	/* check for overflows */
834 	error = vattr_to_nattr(&va, &rr->rr_attr);
835 
836 	VN_RELE(vp);
837 
838 	rr->rr_status = puterrno(error);
839 }
840 
841 /*
842  * Free data allocated by rfs_read
843  */
844 void
845 rfs_rdfree(struct nfsrdresult *rr)
846 {
847 	mblk_t *mp;
848 
849 	if (rr->rr_status == NFS_OK) {
850 		mp = rr->rr_mp;
851 		if (mp != NULL)
852 			freeb(mp);
853 	}
854 }
855 
856 void *
857 rfs_read_getfh(struct nfsreadargs *ra)
858 {
859 	return (&ra->ra_fhandle);
860 }
861 
/*
 * Size of the on-stack iovec array in rfs_write_sync().  A request whose
 * mblk chain has more segments than this gets a kmem_alloc'ed array.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * DEBUG-only counters kept by rfs_write_sync(): "hits" counts requests
 * that fit in the on-stack iovec array, "misses" those that needed an
 * allocated one.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
868 
869 /*
870  * Write data to file.
871  * Returns attributes of a file after writing some data to it.
872  *
873  * Any changes made here, especially in error handling might have
874  * to also be done in rfs_write (which clusters write requests).
875  */
876 void
877 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
878 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
879 {
880 	int error;
881 	vnode_t *vp;
882 	rlim64_t rlimit;
883 	struct vattr va;
884 	struct uio uio;
885 	struct iovec iov[MAX_IOVECS];
886 	mblk_t *m;
887 	struct iovec *iovp;
888 	int iovcnt;
889 	cred_t *savecred;
890 	int in_crit = 0;
891 	caller_context_t ct;
892 
893 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
894 	if (vp == NULL) {
895 		ns->ns_status = NFSERR_STALE;
896 		return;
897 	}
898 
899 	if (rdonly(exi, req)) {
900 		VN_RELE(vp);
901 		ns->ns_status = NFSERR_ROFS;
902 		return;
903 	}
904 
905 	if (vp->v_type != VREG) {
906 		VN_RELE(vp);
907 		ns->ns_status = NFSERR_ISDIR;
908 		return;
909 	}
910 
911 	ct.cc_sysid = 0;
912 	ct.cc_pid = 0;
913 	ct.cc_caller_id = nfs2_srv_caller_id;
914 	ct.cc_flags = CC_DONTBLOCK;
915 
916 	va.va_mask = AT_UID|AT_MODE;
917 
918 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
919 
920 	if (error) {
921 		VN_RELE(vp);
922 		ns->ns_status = puterrno(error);
923 
924 		return;
925 	}
926 
927 	if (crgetuid(cr) != va.va_uid) {
928 		/*
929 		 * This is a kludge to allow writes of files created
930 		 * with read only permission.  The owner of the file
931 		 * is always allowed to write it.
932 		 */
933 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
934 
935 		if (error) {
936 			VN_RELE(vp);
937 			ns->ns_status = puterrno(error);
938 			return;
939 		}
940 	}
941 
942 	/*
943 	 * Can't access a mandatory lock file.  This might cause
944 	 * the NFS service thread to block forever waiting for a
945 	 * lock to be released that will never be released.
946 	 */
947 	if (MANDLOCK(vp, va.va_mode)) {
948 		VN_RELE(vp);
949 		ns->ns_status = NFSERR_ACCES;
950 		return;
951 	}
952 
953 	/*
954 	 * We have to enter the critical region before calling VOP_RWLOCK
955 	 * to avoid a deadlock with ufs.
956 	 */
957 	if (nbl_need_check(vp)) {
958 		nbl_start_crit(vp, RW_READER);
959 		in_crit = 1;
960 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
961 		    wa->wa_count, 0, NULL)) {
962 			error = EACCES;
963 			goto out;
964 		}
965 	}
966 
967 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
968 
969 	/* check if a monitor detected a delegation conflict */
970 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
971 		VN_RELE(vp);
972 		/* mark as wouldblock so response is dropped */
973 		curthread->t_flag |= T_WOULDBLOCK;
974 		return;
975 	}
976 
977 	if (wa->wa_data || wa->wa_rlist) {
978 		/* Do the RDMA thing if necessary */
979 		if (wa->wa_rlist) {
980 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
981 			iov[0].iov_len = wa->wa_count;
982 		} else  {
983 			iov[0].iov_base = wa->wa_data;
984 			iov[0].iov_len = wa->wa_count;
985 		}
986 		uio.uio_iov = iov;
987 		uio.uio_iovcnt = 1;
988 		uio.uio_segflg = UIO_SYSSPACE;
989 		uio.uio_extflg = UIO_COPY_DEFAULT;
990 		uio.uio_loffset = (offset_t)wa->wa_offset;
991 		uio.uio_resid = wa->wa_count;
992 		/*
993 		 * The limit is checked on the client. We
994 		 * should allow any size writes here.
995 		 */
996 		uio.uio_llimit = curproc->p_fsz_ctl;
997 		rlimit = uio.uio_llimit - wa->wa_offset;
998 		if (rlimit < (rlim64_t)uio.uio_resid)
999 			uio.uio_resid = (uint_t)rlimit;
1000 
1001 		/*
1002 		 * for now we assume no append mode
1003 		 */
1004 		/*
1005 		 * We're changing creds because VM may fault and we need
1006 		 * the cred of the current thread to be used if quota
1007 		 * checking is enabled.
1008 		 */
1009 		savecred = curthread->t_cred;
1010 		curthread->t_cred = cr;
1011 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1012 		curthread->t_cred = savecred;
1013 	} else {
1014 		iovcnt = 0;
1015 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1016 			iovcnt++;
1017 		if (iovcnt <= MAX_IOVECS) {
1018 #ifdef DEBUG
1019 			rfs_write_sync_hits++;
1020 #endif
1021 			iovp = iov;
1022 		} else {
1023 #ifdef DEBUG
1024 			rfs_write_sync_misses++;
1025 #endif
1026 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1027 		}
1028 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1029 		uio.uio_iov = iovp;
1030 		uio.uio_iovcnt = iovcnt;
1031 		uio.uio_segflg = UIO_SYSSPACE;
1032 		uio.uio_extflg = UIO_COPY_DEFAULT;
1033 		uio.uio_loffset = (offset_t)wa->wa_offset;
1034 		uio.uio_resid = wa->wa_count;
1035 		/*
1036 		 * The limit is checked on the client. We
1037 		 * should allow any size writes here.
1038 		 */
1039 		uio.uio_llimit = curproc->p_fsz_ctl;
1040 		rlimit = uio.uio_llimit - wa->wa_offset;
1041 		if (rlimit < (rlim64_t)uio.uio_resid)
1042 			uio.uio_resid = (uint_t)rlimit;
1043 
1044 		/*
1045 		 * For now we assume no append mode.
1046 		 */
1047 		/*
1048 		 * We're changing creds because VM may fault and we need
1049 		 * the cred of the current thread to be used if quota
1050 		 * checking is enabled.
1051 		 */
1052 		savecred = curthread->t_cred;
1053 		curthread->t_cred = cr;
1054 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1055 		curthread->t_cred = savecred;
1056 
1057 		if (iovp != iov)
1058 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1059 	}
1060 
1061 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1062 
1063 	if (!error) {
1064 		/*
1065 		 * Get attributes again so we send the latest mod
1066 		 * time to the client side for his cache.
1067 		 */
1068 		va.va_mask = AT_ALL;	/* now we want everything */
1069 
1070 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1071 
1072 		/* check for overflows */
1073 		if (!error) {
1074 			acl_perm(vp, exi, &va, cr);
1075 			error = vattr_to_nattr(&va, &ns->ns_attr);
1076 		}
1077 	}
1078 
1079 out:
1080 	if (in_crit)
1081 		nbl_end_crit(vp);
1082 	VN_RELE(vp);
1083 
1084 	/* check if a monitor detected a delegation conflict */
1085 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1086 		/* mark as wouldblock so response is dropped */
1087 		curthread->t_flag |= T_WOULDBLOCK;
1088 	else
1089 		ns->ns_status = puterrno(error);
1090 
1091 }
1092 
/*
 * One write request taking part in a write cluster.  rfs_write() fills
 * one of these in on its own stack for the request it is servicing and
 * links it into the cluster for the request's file handle.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the request */
	struct nfsattrstat *ns;		/* result; ns_status is set when done */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the request */
	kthread_t *thread;		/* thread servicing the request */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1101 
/*
 * A cluster: the write requests currently queued for one file handle.
 * rfs_write() inserts requests into "list" in order of increasing starting
 * offset, and requests that join an existing cluster wait on "cv" until
 * their status has been filled in.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* joiners wait here for completion */
	struct rfs_async_write *list;	/* requests in the cluster */
	/* next cluster on the rfs_async_write_head list */
	struct rfs_async_write_list *next;
};
1108 
1109 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1110 static kmutex_t rfs_async_write_lock;
1111 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1112 
1113 #define	MAXCLIOVECS	42
1114 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1115 
1116 #ifdef DEBUG
1117 static int rfs_write_hits = 0;
1118 static int rfs_write_misses = 0;
1119 #endif
1120 
1121 /*
1122  * Write data to file.
1123  * Returns attributes of a file after writing some data to it.
1124  */
1125 void
1126 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1127 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1128 {
1129 	int error;
1130 	vnode_t *vp;
1131 	rlim64_t rlimit;
1132 	struct vattr va;
1133 	struct uio uio;
1134 	struct rfs_async_write_list *lp;
1135 	struct rfs_async_write_list *nlp;
1136 	struct rfs_async_write *rp;
1137 	struct rfs_async_write *nrp;
1138 	struct rfs_async_write *trp;
1139 	struct rfs_async_write *lrp;
1140 	int data_written;
1141 	int iovcnt;
1142 	mblk_t *m;
1143 	struct iovec *iovp;
1144 	struct iovec *niovp;
1145 	struct iovec iov[MAXCLIOVECS];
1146 	int count;
1147 	int rcount;
1148 	uint_t off;
1149 	uint_t len;
1150 	struct rfs_async_write nrpsp;
1151 	struct rfs_async_write_list nlpsp;
1152 	ushort_t t_flag;
1153 	cred_t *savecred;
1154 	int in_crit = 0;
1155 	caller_context_t ct;
1156 
1157 	if (!rfs_write_async) {
1158 		rfs_write_sync(wa, ns, exi, req, cr);
1159 		return;
1160 	}
1161 
1162 	/*
1163 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1164 	 * is considered an OK.
1165 	 */
1166 	ns->ns_status = RFSWRITE_INITVAL;
1167 
1168 	nrp = &nrpsp;
1169 	nrp->wa = wa;
1170 	nrp->ns = ns;
1171 	nrp->req = req;
1172 	nrp->cr = cr;
1173 	nrp->thread = curthread;
1174 
1175 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1176 
1177 	/*
1178 	 * Look to see if there is already a cluster started
1179 	 * for this file.
1180 	 */
1181 	mutex_enter(&rfs_async_write_lock);
1182 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1183 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1184 		    sizeof (fhandle_t)) == 0)
1185 			break;
1186 	}
1187 
1188 	/*
1189 	 * If lp is non-NULL, then there is already a cluster
1190 	 * started.  We need to place ourselves in the cluster
1191 	 * list in the right place as determined by starting
1192 	 * offset.  Conflicts with non-blocking mandatory locked
1193 	 * regions will be checked when the cluster is processed.
1194 	 */
1195 	if (lp != NULL) {
1196 		rp = lp->list;
1197 		trp = NULL;
1198 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1199 			trp = rp;
1200 			rp = rp->list;
1201 		}
1202 		nrp->list = rp;
1203 		if (trp == NULL)
1204 			lp->list = nrp;
1205 		else
1206 			trp->list = nrp;
1207 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1208 			cv_wait(&lp->cv, &rfs_async_write_lock);
1209 		mutex_exit(&rfs_async_write_lock);
1210 
1211 		return;
1212 	}
1213 
1214 	/*
1215 	 * No cluster started yet, start one and add ourselves
1216 	 * to the list of clusters.
1217 	 */
1218 	nrp->list = NULL;
1219 
1220 	nlp = &nlpsp;
1221 	nlp->fhp = &wa->wa_fhandle;
1222 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1223 	nlp->list = nrp;
1224 	nlp->next = NULL;
1225 
1226 	if (rfs_async_write_head == NULL) {
1227 		rfs_async_write_head = nlp;
1228 	} else {
1229 		lp = rfs_async_write_head;
1230 		while (lp->next != NULL)
1231 			lp = lp->next;
1232 		lp->next = nlp;
1233 	}
1234 	mutex_exit(&rfs_async_write_lock);
1235 
1236 	/*
1237 	 * Convert the file handle common to all of the requests
1238 	 * in this cluster to a vnode.
1239 	 */
1240 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1241 	if (vp == NULL) {
1242 		mutex_enter(&rfs_async_write_lock);
1243 		if (rfs_async_write_head == nlp)
1244 			rfs_async_write_head = nlp->next;
1245 		else {
1246 			lp = rfs_async_write_head;
1247 			while (lp->next != nlp)
1248 				lp = lp->next;
1249 			lp->next = nlp->next;
1250 		}
1251 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1252 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1253 			rp->ns->ns_status = NFSERR_STALE;
1254 			rp->thread->t_flag |= t_flag;
1255 		}
1256 		cv_broadcast(&nlp->cv);
1257 		mutex_exit(&rfs_async_write_lock);
1258 
1259 		return;
1260 	}
1261 
1262 	/*
1263 	 * Can only write regular files.  Attempts to write any
1264 	 * other file types fail with EISDIR.
1265 	 */
1266 	if (vp->v_type != VREG) {
1267 		VN_RELE(vp);
1268 		mutex_enter(&rfs_async_write_lock);
1269 		if (rfs_async_write_head == nlp)
1270 			rfs_async_write_head = nlp->next;
1271 		else {
1272 			lp = rfs_async_write_head;
1273 			while (lp->next != nlp)
1274 				lp = lp->next;
1275 			lp->next = nlp->next;
1276 		}
1277 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1278 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1279 			rp->ns->ns_status = NFSERR_ISDIR;
1280 			rp->thread->t_flag |= t_flag;
1281 		}
1282 		cv_broadcast(&nlp->cv);
1283 		mutex_exit(&rfs_async_write_lock);
1284 
1285 		return;
1286 	}
1287 
1288 	/*
1289 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1290 	 * deadlock with ufs.
1291 	 */
1292 	if (nbl_need_check(vp)) {
1293 		nbl_start_crit(vp, RW_READER);
1294 		in_crit = 1;
1295 	}
1296 
1297 	ct.cc_sysid = 0;
1298 	ct.cc_pid = 0;
1299 	ct.cc_caller_id = nfs2_srv_caller_id;
1300 	ct.cc_flags = CC_DONTBLOCK;
1301 
1302 	/*
1303 	 * Lock the file for writing.  This operation provides
1304 	 * the delay which allows clusters to grow.
1305 	 */
1306 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1307 
1308 	/* check if a monitor detected a delegation conflict */
1309 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1310 		if (in_crit)
1311 			nbl_end_crit(vp);
1312 		VN_RELE(vp);
1313 		/* mark as wouldblock so response is dropped */
1314 		curthread->t_flag |= T_WOULDBLOCK;
1315 		mutex_enter(&rfs_async_write_lock);
1316 		if (rfs_async_write_head == nlp)
1317 			rfs_async_write_head = nlp->next;
1318 		else {
1319 			lp = rfs_async_write_head;
1320 			while (lp->next != nlp)
1321 				lp = lp->next;
1322 			lp->next = nlp->next;
1323 		}
1324 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1325 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1326 				rp->ns->ns_status = puterrno(error);
1327 				rp->thread->t_flag |= T_WOULDBLOCK;
1328 			}
1329 		}
1330 		cv_broadcast(&nlp->cv);
1331 		mutex_exit(&rfs_async_write_lock);
1332 
1333 		return;
1334 	}
1335 
1336 	/*
1337 	 * Disconnect this cluster from the list of clusters.
1338 	 * The cluster that is being dealt with must be fixed
1339 	 * in size after this point, so there is no reason
1340 	 * to leave it on the list so that new requests can
1341 	 * find it.
1342 	 *
1343 	 * The algorithm is that the first write request will
1344 	 * create a cluster, convert the file handle to a
1345 	 * vnode pointer, and then lock the file for writing.
1346 	 * This request is not likely to be clustered with
1347 	 * any others.  However, the next request will create
1348 	 * a new cluster and be blocked in VOP_RWLOCK while
1349 	 * the first request is being processed.  This delay
1350 	 * will allow more requests to be clustered in this
1351 	 * second cluster.
1352 	 */
1353 	mutex_enter(&rfs_async_write_lock);
1354 	if (rfs_async_write_head == nlp)
1355 		rfs_async_write_head = nlp->next;
1356 	else {
1357 		lp = rfs_async_write_head;
1358 		while (lp->next != nlp)
1359 			lp = lp->next;
1360 		lp->next = nlp->next;
1361 	}
1362 	mutex_exit(&rfs_async_write_lock);
1363 
1364 	/*
1365 	 * Step through the list of requests in this cluster.
1366 	 * We need to check permissions to make sure that all
1367 	 * of the requests have sufficient permission to write
1368 	 * the file.  A cluster can be composed of requests
1369 	 * from different clients and different users on each
1370 	 * client.
1371 	 *
1372 	 * As a side effect, we also calculate the size of the
1373 	 * byte range that this cluster encompasses.
1374 	 */
1375 	rp = nlp->list;
1376 	off = rp->wa->wa_offset;
1377 	len = (uint_t)0;
1378 	do {
1379 		if (rdonly(exi, rp->req)) {
1380 			rp->ns->ns_status = NFSERR_ROFS;
1381 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1382 			rp->thread->t_flag |= t_flag;
1383 			continue;
1384 		}
1385 
1386 		va.va_mask = AT_UID|AT_MODE;
1387 
1388 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1389 
1390 		if (!error) {
1391 			if (crgetuid(rp->cr) != va.va_uid) {
1392 				/*
1393 				 * This is a kludge to allow writes of files
1394 				 * created with read only permission.  The
1395 				 * owner of the file is always allowed to
1396 				 * write it.
1397 				 */
1398 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1399 			}
1400 			if (!error && MANDLOCK(vp, va.va_mode))
1401 				error = EACCES;
1402 		}
1403 
1404 		/*
1405 		 * Check for a conflict with a nbmand-locked region.
1406 		 */
1407 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1408 		    rp->wa->wa_count, 0, NULL)) {
1409 			error = EACCES;
1410 		}
1411 
1412 		if (error) {
1413 			rp->ns->ns_status = puterrno(error);
1414 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1415 			rp->thread->t_flag |= t_flag;
1416 			continue;
1417 		}
1418 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1419 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1420 	} while ((rp = rp->list) != NULL);
1421 
1422 	/*
1423 	 * Step through the cluster attempting to gather as many
1424 	 * requests which are contiguous as possible.  These
1425 	 * contiguous requests are handled via one call to VOP_WRITE
1426 	 * instead of different calls to VOP_WRITE.  We also keep
1427 	 * track of the fact that any data was written.
1428 	 */
1429 	rp = nlp->list;
1430 	data_written = 0;
1431 	do {
1432 		/*
1433 		 * Skip any requests which are already marked as having an
1434 		 * error.
1435 		 */
1436 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1437 			rp = rp->list;
1438 			continue;
1439 		}
1440 
1441 		/*
1442 		 * Count the number of iovec's which are required
1443 		 * to handle this set of requests.  One iovec is
1444 		 * needed for each data buffer, whether addressed
1445 		 * by wa_data or by the b_rptr pointers in the
1446 		 * mblk chains.
1447 		 */
1448 		iovcnt = 0;
1449 		lrp = rp;
1450 		for (;;) {
1451 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1452 				iovcnt++;
1453 			else {
1454 				m = lrp->wa->wa_mblk;
1455 				while (m != NULL) {
1456 					iovcnt++;
1457 					m = m->b_cont;
1458 				}
1459 			}
1460 			if (lrp->list == NULL ||
1461 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1462 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1463 			    lrp->list->wa->wa_offset) {
1464 				lrp = lrp->list;
1465 				break;
1466 			}
1467 			lrp = lrp->list;
1468 		}
1469 
1470 		if (iovcnt <= MAXCLIOVECS) {
1471 #ifdef DEBUG
1472 			rfs_write_hits++;
1473 #endif
1474 			niovp = iov;
1475 		} else {
1476 #ifdef DEBUG
1477 			rfs_write_misses++;
1478 #endif
1479 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1480 		}
1481 		/*
1482 		 * Put together the scatter/gather iovecs.
1483 		 */
1484 		iovp = niovp;
1485 		trp = rp;
1486 		count = 0;
1487 		do {
1488 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1489 				if (trp->wa->wa_rlist) {
1490 					iovp->iov_base =
1491 					    (char *)((trp->wa->wa_rlist)->
1492 					    u.c_daddr3);
1493 					iovp->iov_len = trp->wa->wa_count;
1494 				} else  {
1495 					iovp->iov_base = trp->wa->wa_data;
1496 					iovp->iov_len = trp->wa->wa_count;
1497 				}
1498 				iovp++;
1499 			} else {
1500 				m = trp->wa->wa_mblk;
1501 				rcount = trp->wa->wa_count;
1502 				while (m != NULL) {
1503 					iovp->iov_base = (caddr_t)m->b_rptr;
1504 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1505 					rcount -= iovp->iov_len;
1506 					if (rcount < 0)
1507 						iovp->iov_len += rcount;
1508 					iovp++;
1509 					if (rcount <= 0)
1510 						break;
1511 					m = m->b_cont;
1512 				}
1513 			}
1514 			count += trp->wa->wa_count;
1515 			trp = trp->list;
1516 		} while (trp != lrp);
1517 
1518 		uio.uio_iov = niovp;
1519 		uio.uio_iovcnt = iovcnt;
1520 		uio.uio_segflg = UIO_SYSSPACE;
1521 		uio.uio_extflg = UIO_COPY_DEFAULT;
1522 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1523 		uio.uio_resid = count;
1524 		/*
1525 		 * The limit is checked on the client. We
1526 		 * should allow any size writes here.
1527 		 */
1528 		uio.uio_llimit = curproc->p_fsz_ctl;
1529 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1530 		if (rlimit < (rlim64_t)uio.uio_resid)
1531 			uio.uio_resid = (uint_t)rlimit;
1532 
1533 		/*
1534 		 * For now we assume no append mode.
1535 		 */
1536 
1537 		/*
1538 		 * We're changing creds because VM may fault
1539 		 * and we need the cred of the current
1540 		 * thread to be used if quota * checking is
1541 		 * enabled.
1542 		 */
1543 		savecred = curthread->t_cred;
1544 		curthread->t_cred = cr;
1545 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1546 		curthread->t_cred = savecred;
1547 
1548 		/* check if a monitor detected a delegation conflict */
1549 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1550 			/* mark as wouldblock so response is dropped */
1551 			curthread->t_flag |= T_WOULDBLOCK;
1552 
1553 		if (niovp != iov)
1554 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1555 
1556 		if (!error) {
1557 			data_written = 1;
1558 			/*
1559 			 * Get attributes again so we send the latest mod
1560 			 * time to the client side for his cache.
1561 			 */
1562 			va.va_mask = AT_ALL;	/* now we want everything */
1563 
1564 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1565 
1566 			if (!error)
1567 				acl_perm(vp, exi, &va, rp->cr);
1568 		}
1569 
1570 		/*
1571 		 * Fill in the status responses for each request
1572 		 * which was just handled.  Also, copy the latest
1573 		 * attributes in to the attribute responses if
1574 		 * appropriate.
1575 		 */
1576 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1577 		do {
1578 			rp->thread->t_flag |= t_flag;
1579 			/* check for overflows */
1580 			if (!error) {
1581 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1582 			}
1583 			rp->ns->ns_status = puterrno(error);
1584 			rp = rp->list;
1585 		} while (rp != lrp);
1586 	} while (rp != NULL);
1587 
1588 	/*
1589 	 * If any data was written at all, then we need to flush
1590 	 * the data and metadata to stable storage.
1591 	 */
1592 	if (data_written) {
1593 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1594 
1595 		if (!error) {
1596 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1597 		}
1598 	}
1599 
1600 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1601 
1602 	if (in_crit)
1603 		nbl_end_crit(vp);
1604 	VN_RELE(vp);
1605 
1606 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1607 	mutex_enter(&rfs_async_write_lock);
1608 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1609 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1610 			rp->ns->ns_status = puterrno(error);
1611 			rp->thread->t_flag |= t_flag;
1612 		}
1613 	}
1614 	cv_broadcast(&nlp->cv);
1615 	mutex_exit(&rfs_async_write_lock);
1616 
1617 }
1618 
1619 void *
1620 rfs_write_getfh(struct nfswriteargs *wa)
1621 {
1622 	return (&wa->wa_fhandle);
1623 }
1624 
1625 /*
1626  * Create a file.
1627  * Creates a file with given attributes and returns those attributes
1628  * and an fhandle for the new file.
1629  */
1630 void
1631 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1632 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1633 {
1634 	int error;
1635 	int lookuperr;
1636 	int in_crit = 0;
1637 	struct vattr va;
1638 	vnode_t *vp;
1639 	vnode_t *realvp;
1640 	vnode_t *dvp;
1641 	char *name = args->ca_da.da_name;
1642 	vnode_t *tvp = NULL;
1643 	int mode;
1644 	int lookup_ok;
1645 	bool_t trunc;
1646 	struct sockaddr *ca;
1647 
1648 	/*
1649 	 * Disallow NULL paths
1650 	 */
1651 	if (name == NULL || *name == '\0') {
1652 		dr->dr_status = NFSERR_ACCES;
1653 		return;
1654 	}
1655 
1656 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1657 	if (dvp == NULL) {
1658 		dr->dr_status = NFSERR_STALE;
1659 		return;
1660 	}
1661 
1662 	error = sattr_to_vattr(args->ca_sa, &va);
1663 	if (error) {
1664 		dr->dr_status = puterrno(error);
1665 		return;
1666 	}
1667 
1668 	/*
1669 	 * Must specify the mode.
1670 	 */
1671 	if (!(va.va_mask & AT_MODE)) {
1672 		VN_RELE(dvp);
1673 		dr->dr_status = NFSERR_INVAL;
1674 		return;
1675 	}
1676 
1677 	/*
1678 	 * This is a completely gross hack to make mknod
1679 	 * work over the wire until we can wack the protocol
1680 	 */
1681 	if ((va.va_mode & IFMT) == IFCHR) {
1682 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1683 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1684 		else {
1685 			va.va_type = VCHR;
1686 			/*
1687 			 * uncompress the received dev_t
1688 			 * if the top half is zero indicating a request
1689 			 * from an `older style' OS.
1690 			 */
1691 			if ((va.va_size & 0xffff0000) == 0)
1692 				va.va_rdev = nfsv2_expdev(va.va_size);
1693 			else
1694 				va.va_rdev = (dev_t)va.va_size;
1695 		}
1696 		va.va_mask &= ~AT_SIZE;
1697 	} else if ((va.va_mode & IFMT) == IFBLK) {
1698 		va.va_type = VBLK;
1699 		/*
1700 		 * uncompress the received dev_t
1701 		 * if the top half is zero indicating a request
1702 		 * from an `older style' OS.
1703 		 */
1704 		if ((va.va_size & 0xffff0000) == 0)
1705 			va.va_rdev = nfsv2_expdev(va.va_size);
1706 		else
1707 			va.va_rdev = (dev_t)va.va_size;
1708 		va.va_mask &= ~AT_SIZE;
1709 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1710 		va.va_type = VSOCK;
1711 	} else {
1712 		va.va_type = VREG;
1713 	}
1714 	va.va_mode &= ~IFMT;
1715 	va.va_mask |= AT_TYPE;
1716 
1717 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1718 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1719 	    MAXPATHLEN);
1720 	if (name == NULL) {
1721 		dr->dr_status = puterrno(EINVAL);
1722 		return;
1723 	}
1724 
1725 	/*
1726 	 * Why was the choice made to use VWRITE as the mode to the
1727 	 * call to VOP_CREATE ? This results in a bug.  When a client
1728 	 * opens a file that already exists and is RDONLY, the second
1729 	 * open fails with an EACESS because of the mode.
1730 	 * bug ID 1054648.
1731 	 */
1732 	lookup_ok = 0;
1733 	mode = VWRITE;
1734 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1735 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1736 		    NULL, NULL, NULL);
1737 		if (!error) {
1738 			struct vattr at;
1739 
1740 			lookup_ok = 1;
1741 			at.va_mask = AT_MODE;
1742 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1743 			if (!error)
1744 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1745 			VN_RELE(tvp);
1746 			tvp = NULL;
1747 		}
1748 	}
1749 
1750 	if (!lookup_ok) {
1751 		if (rdonly(exi, req)) {
1752 			error = EROFS;
1753 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1754 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1755 			error = EPERM;
1756 		} else {
1757 			error = 0;
1758 		}
1759 	}
1760 
1761 	/*
1762 	 * If file size is being modified on an already existing file
1763 	 * make sure that there are no conflicting non-blocking mandatory
1764 	 * locks in the region being manipulated. Return EACCES if there
1765 	 * are conflicting locks.
1766 	 */
1767 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1768 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1769 		    NULL, NULL, NULL);
1770 
1771 		if (!lookuperr &&
1772 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1773 			VN_RELE(tvp);
1774 			curthread->t_flag |= T_WOULDBLOCK;
1775 			goto out;
1776 		}
1777 
1778 		if (!lookuperr && nbl_need_check(tvp)) {
1779 			/*
1780 			 * The file exists. Now check if it has any
1781 			 * conflicting non-blocking mandatory locks
1782 			 * in the region being changed.
1783 			 */
1784 			struct vattr bva;
1785 			u_offset_t offset;
1786 			ssize_t length;
1787 
1788 			nbl_start_crit(tvp, RW_READER);
1789 			in_crit = 1;
1790 
1791 			bva.va_mask = AT_SIZE;
1792 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1793 			if (!error) {
1794 				if (va.va_size < bva.va_size) {
1795 					offset = va.va_size;
1796 					length = bva.va_size - va.va_size;
1797 				} else {
1798 					offset = bva.va_size;
1799 					length = va.va_size - bva.va_size;
1800 				}
1801 				if (length) {
1802 					if (nbl_conflict(tvp, NBL_WRITE,
1803 					    offset, length, 0, NULL)) {
1804 						error = EACCES;
1805 					}
1806 				}
1807 			}
1808 			if (error) {
1809 				nbl_end_crit(tvp);
1810 				VN_RELE(tvp);
1811 				in_crit = 0;
1812 			}
1813 		} else if (tvp != NULL) {
1814 			VN_RELE(tvp);
1815 		}
1816 	}
1817 
1818 	if (!error) {
1819 		/*
1820 		 * If filesystem is shared with nosuid the remove any
1821 		 * setuid/setgid bits on create.
1822 		 */
1823 		if (va.va_type == VREG &&
1824 		    exi->exi_export.ex_flags & EX_NOSUID)
1825 			va.va_mode &= ~(VSUID | VSGID);
1826 
1827 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1828 		    NULL, NULL);
1829 
1830 		if (!error) {
1831 
1832 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1833 				trunc = TRUE;
1834 			else
1835 				trunc = FALSE;
1836 
1837 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1838 				VN_RELE(vp);
1839 				curthread->t_flag |= T_WOULDBLOCK;
1840 				goto out;
1841 			}
1842 			va.va_mask = AT_ALL;
1843 
1844 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1845 
1846 			/* check for overflows */
1847 			if (!error) {
1848 				acl_perm(vp, exi, &va, cr);
1849 				error = vattr_to_nattr(&va, &dr->dr_attr);
1850 				if (!error) {
1851 					error = makefh(&dr->dr_fhandle, vp,
1852 					    exi);
1853 				}
1854 			}
1855 			/*
1856 			 * Force modified metadata out to stable storage.
1857 			 *
1858 			 * if a underlying vp exists, pass it to VOP_FSYNC
1859 			 */
1860 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1861 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1862 			else
1863 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1864 			VN_RELE(vp);
1865 		}
1866 
1867 		if (in_crit) {
1868 			nbl_end_crit(tvp);
1869 			VN_RELE(tvp);
1870 		}
1871 	}
1872 
1873 	/*
1874 	 * Force modified data and metadata out to stable storage.
1875 	 */
1876 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1877 
1878 out:
1879 
1880 	VN_RELE(dvp);
1881 
1882 	dr->dr_status = puterrno(error);
1883 
1884 	if (name != args->ca_da.da_name)
1885 		kmem_free(name, MAXPATHLEN);
1886 }
1887 void *
1888 rfs_create_getfh(struct nfscreatargs *args)
1889 {
1890 	return (args->ca_da.da_fhandle);
1891 }
1892 
1893 /*
1894  * Remove a file.
1895  * Remove named file from parent directory.
1896  */
1897 void
1898 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1899 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1900 {
1901 	int error = 0;
1902 	vnode_t *vp;
1903 	vnode_t *targvp;
1904 	int in_crit = 0;
1905 
1906 	/*
1907 	 * Disallow NULL paths
1908 	 */
1909 	if (da->da_name == NULL || *da->da_name == '\0') {
1910 		*status = NFSERR_ACCES;
1911 		return;
1912 	}
1913 
1914 	vp = nfs_fhtovp(da->da_fhandle, exi);
1915 	if (vp == NULL) {
1916 		*status = NFSERR_STALE;
1917 		return;
1918 	}
1919 
1920 	if (rdonly(exi, req)) {
1921 		VN_RELE(vp);
1922 		*status = NFSERR_ROFS;
1923 		return;
1924 	}
1925 
1926 	/*
1927 	 * Check for a conflict with a non-blocking mandatory share reservation.
1928 	 */
1929 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1930 	    NULL, cr, NULL, NULL, NULL);
1931 	if (error != 0) {
1932 		VN_RELE(vp);
1933 		*status = puterrno(error);
1934 		return;
1935 	}
1936 
1937 	/*
1938 	 * If the file is delegated to an v4 client, then initiate
1939 	 * recall and drop this request (by setting T_WOULDBLOCK).
1940 	 * The client will eventually re-transmit the request and
1941 	 * (hopefully), by then, the v4 client will have returned
1942 	 * the delegation.
1943 	 */
1944 
1945 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1946 		VN_RELE(vp);
1947 		VN_RELE(targvp);
1948 		curthread->t_flag |= T_WOULDBLOCK;
1949 		return;
1950 	}
1951 
1952 	if (nbl_need_check(targvp)) {
1953 		nbl_start_crit(targvp, RW_READER);
1954 		in_crit = 1;
1955 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1956 			error = EACCES;
1957 			goto out;
1958 		}
1959 	}
1960 
1961 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
1962 
1963 	/*
1964 	 * Force modified data and metadata out to stable storage.
1965 	 */
1966 	(void) VOP_FSYNC(vp, 0, cr, NULL);
1967 
1968 out:
1969 	if (in_crit)
1970 		nbl_end_crit(targvp);
1971 	VN_RELE(targvp);
1972 	VN_RELE(vp);
1973 
1974 	*status = puterrno(error);
1975 
1976 }
1977 
1978 void *
1979 rfs_remove_getfh(struct nfsdiropargs *da)
1980 {
1981 	return (da->da_fhandle);
1982 }
1983 
1984 /*
1985  * rename a file
1986  * Give a file (from) a new name (to).
1987  */
1988 void
1989 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
1990 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1991 {
1992 	int error = 0;
1993 	vnode_t *fromvp;
1994 	vnode_t *tovp;
1995 	struct exportinfo *to_exi;
1996 	fhandle_t *fh;
1997 	vnode_t *srcvp;
1998 	vnode_t *targvp;
1999 	int in_crit = 0;
2000 
2001 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2002 	if (fromvp == NULL) {
2003 		*status = NFSERR_STALE;
2004 		return;
2005 	}
2006 
2007 	fh = args->rna_to.da_fhandle;
2008 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2009 	if (to_exi == NULL) {
2010 		VN_RELE(fromvp);
2011 		*status = NFSERR_ACCES;
2012 		return;
2013 	}
2014 	exi_rele(to_exi);
2015 
2016 	if (to_exi != exi) {
2017 		VN_RELE(fromvp);
2018 		*status = NFSERR_XDEV;
2019 		return;
2020 	}
2021 
2022 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2023 	if (tovp == NULL) {
2024 		VN_RELE(fromvp);
2025 		*status = NFSERR_STALE;
2026 		return;
2027 	}
2028 
2029 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2030 		VN_RELE(tovp);
2031 		VN_RELE(fromvp);
2032 		*status = NFSERR_NOTDIR;
2033 		return;
2034 	}
2035 
2036 	/*
2037 	 * Disallow NULL paths
2038 	 */
2039 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2040 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2041 		VN_RELE(tovp);
2042 		VN_RELE(fromvp);
2043 		*status = NFSERR_ACCES;
2044 		return;
2045 	}
2046 
2047 	if (rdonly(exi, req)) {
2048 		VN_RELE(tovp);
2049 		VN_RELE(fromvp);
2050 		*status = NFSERR_ROFS;
2051 		return;
2052 	}
2053 
2054 	/*
2055 	 * Check for a conflict with a non-blocking mandatory share reservation.
2056 	 */
2057 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2058 	    NULL, cr, NULL, NULL, NULL);
2059 	if (error != 0) {
2060 		VN_RELE(tovp);
2061 		VN_RELE(fromvp);
2062 		*status = puterrno(error);
2063 		return;
2064 	}
2065 
2066 	/* Check for delegations on the source file */
2067 
2068 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2069 		VN_RELE(tovp);
2070 		VN_RELE(fromvp);
2071 		VN_RELE(srcvp);
2072 		curthread->t_flag |= T_WOULDBLOCK;
2073 		return;
2074 	}
2075 
2076 	/* Check for delegation on the file being renamed over, if it exists */
2077 
2078 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2079 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2080 	    NULL, NULL, NULL) == 0) {
2081 
2082 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2083 			VN_RELE(tovp);
2084 			VN_RELE(fromvp);
2085 			VN_RELE(srcvp);
2086 			VN_RELE(targvp);
2087 			curthread->t_flag |= T_WOULDBLOCK;
2088 			return;
2089 		}
2090 		VN_RELE(targvp);
2091 	}
2092 
2093 
2094 	if (nbl_need_check(srcvp)) {
2095 		nbl_start_crit(srcvp, RW_READER);
2096 		in_crit = 1;
2097 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2098 			error = EACCES;
2099 			goto out;
2100 		}
2101 	}
2102 
2103 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2104 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2105 
2106 	if (error == 0)
2107 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2108 		    strlen(args->rna_to.da_name));
2109 
2110 	/*
2111 	 * Force modified data and metadata out to stable storage.
2112 	 */
2113 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2114 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2115 
2116 out:
2117 	if (in_crit)
2118 		nbl_end_crit(srcvp);
2119 	VN_RELE(srcvp);
2120 	VN_RELE(tovp);
2121 	VN_RELE(fromvp);
2122 
2123 	*status = puterrno(error);
2124 
2125 }
2126 void *
2127 rfs_rename_getfh(struct nfsrnmargs *args)
2128 {
2129 	return (args->rna_from.da_fhandle);
2130 }
2131 
2132 /*
2133  * Link to a file.
2134  * Create a file (to) which is a hard link to the given file (from).
2135  */
2136 void
2137 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2138 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2139 {
2140 	int error;
2141 	vnode_t *fromvp;
2142 	vnode_t *tovp;
2143 	struct exportinfo *to_exi;
2144 	fhandle_t *fh;
2145 
2146 	fromvp = nfs_fhtovp(args->la_from, exi);
2147 	if (fromvp == NULL) {
2148 		*status = NFSERR_STALE;
2149 		return;
2150 	}
2151 
2152 	fh = args->la_to.da_fhandle;
2153 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2154 	if (to_exi == NULL) {
2155 		VN_RELE(fromvp);
2156 		*status = NFSERR_ACCES;
2157 		return;
2158 	}
2159 	exi_rele(to_exi);
2160 
2161 	if (to_exi != exi) {
2162 		VN_RELE(fromvp);
2163 		*status = NFSERR_XDEV;
2164 		return;
2165 	}
2166 
2167 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2168 	if (tovp == NULL) {
2169 		VN_RELE(fromvp);
2170 		*status = NFSERR_STALE;
2171 		return;
2172 	}
2173 
2174 	if (tovp->v_type != VDIR) {
2175 		VN_RELE(tovp);
2176 		VN_RELE(fromvp);
2177 		*status = NFSERR_NOTDIR;
2178 		return;
2179 	}
2180 	/*
2181 	 * Disallow NULL paths
2182 	 */
2183 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2184 		VN_RELE(tovp);
2185 		VN_RELE(fromvp);
2186 		*status = NFSERR_ACCES;
2187 		return;
2188 	}
2189 
2190 	if (rdonly(exi, req)) {
2191 		VN_RELE(tovp);
2192 		VN_RELE(fromvp);
2193 		*status = NFSERR_ROFS;
2194 		return;
2195 	}
2196 
2197 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2198 
2199 	/*
2200 	 * Force modified data and metadata out to stable storage.
2201 	 */
2202 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2203 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2204 
2205 	VN_RELE(tovp);
2206 	VN_RELE(fromvp);
2207 
2208 	*status = puterrno(error);
2209 
2210 }
2211 void *
2212 rfs_link_getfh(struct nfslinkargs *args)
2213 {
2214 	return (args->la_from);
2215 }
2216 
2217 /*
2218  * Symbolicly link to a file.
2219  * Create a file (to) with the given attributes which is a symbolic link
2220  * to the given path name (to).
2221  */
2222 void
2223 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2224 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2225 {
2226 	int error;
2227 	struct vattr va;
2228 	vnode_t *vp;
2229 	vnode_t *svp;
2230 	int lerror;
2231 	struct sockaddr *ca;
2232 	char *name = NULL;
2233 
2234 	/*
2235 	 * Disallow NULL paths
2236 	 */
2237 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2238 		*status = NFSERR_ACCES;
2239 		return;
2240 	}
2241 
2242 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2243 	if (vp == NULL) {
2244 		*status = NFSERR_STALE;
2245 		return;
2246 	}
2247 
2248 	if (rdonly(exi, req)) {
2249 		VN_RELE(vp);
2250 		*status = NFSERR_ROFS;
2251 		return;
2252 	}
2253 
2254 	error = sattr_to_vattr(args->sla_sa, &va);
2255 	if (error) {
2256 		VN_RELE(vp);
2257 		*status = puterrno(error);
2258 		return;
2259 	}
2260 
2261 	if (!(va.va_mask & AT_MODE)) {
2262 		VN_RELE(vp);
2263 		*status = NFSERR_INVAL;
2264 		return;
2265 	}
2266 
2267 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2268 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2269 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2270 
2271 	if (name == NULL) {
2272 		*status = NFSERR_ACCES;
2273 		return;
2274 	}
2275 
2276 	va.va_type = VLNK;
2277 	va.va_mask |= AT_TYPE;
2278 
2279 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2280 
2281 	/*
2282 	 * Force new data and metadata out to stable storage.
2283 	 */
2284 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2285 	    NULL, cr, NULL, NULL, NULL);
2286 
2287 	if (!lerror) {
2288 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2289 		VN_RELE(svp);
2290 	}
2291 
2292 	/*
2293 	 * Force modified data and metadata out to stable storage.
2294 	 */
2295 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2296 
2297 	VN_RELE(vp);
2298 
2299 	*status = puterrno(error);
2300 	if (name != args->sla_tnm)
2301 		kmem_free(name, MAXPATHLEN);
2302 
2303 }
2304 void *
2305 rfs_symlink_getfh(struct nfsslargs *args)
2306 {
2307 	return (args->sla_from.da_fhandle);
2308 }
2309 
2310 /*
2311  * Make a directory.
2312  * Create a directory with the given name, parent directory, and attributes.
2313  * Returns a file handle and attributes for the new directory.
2314  */
2315 void
2316 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2317 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2318 {
2319 	int error;
2320 	struct vattr va;
2321 	vnode_t *dvp = NULL;
2322 	vnode_t *vp;
2323 	char *name = args->ca_da.da_name;
2324 
2325 	/*
2326 	 * Disallow NULL paths
2327 	 */
2328 	if (name == NULL || *name == '\0') {
2329 		dr->dr_status = NFSERR_ACCES;
2330 		return;
2331 	}
2332 
2333 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2334 	if (vp == NULL) {
2335 		dr->dr_status = NFSERR_STALE;
2336 		return;
2337 	}
2338 
2339 	if (rdonly(exi, req)) {
2340 		VN_RELE(vp);
2341 		dr->dr_status = NFSERR_ROFS;
2342 		return;
2343 	}
2344 
2345 	error = sattr_to_vattr(args->ca_sa, &va);
2346 	if (error) {
2347 		VN_RELE(vp);
2348 		dr->dr_status = puterrno(error);
2349 		return;
2350 	}
2351 
2352 	if (!(va.va_mask & AT_MODE)) {
2353 		VN_RELE(vp);
2354 		dr->dr_status = NFSERR_INVAL;
2355 		return;
2356 	}
2357 
2358 	va.va_type = VDIR;
2359 	va.va_mask |= AT_TYPE;
2360 
2361 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2362 
2363 	if (!error) {
2364 		/*
2365 		 * Attribtutes of the newly created directory should
2366 		 * be returned to the client.
2367 		 */
2368 		va.va_mask = AT_ALL; /* We want everything */
2369 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2370 
2371 		/* check for overflows */
2372 		if (!error) {
2373 			acl_perm(vp, exi, &va, cr);
2374 			error = vattr_to_nattr(&va, &dr->dr_attr);
2375 			if (!error) {
2376 				error = makefh(&dr->dr_fhandle, dvp, exi);
2377 			}
2378 		}
2379 		/*
2380 		 * Force new data and metadata out to stable storage.
2381 		 */
2382 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2383 		VN_RELE(dvp);
2384 	}
2385 
2386 	/*
2387 	 * Force modified data and metadata out to stable storage.
2388 	 */
2389 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2390 
2391 	VN_RELE(vp);
2392 
2393 	dr->dr_status = puterrno(error);
2394 
2395 }
2396 void *
2397 rfs_mkdir_getfh(struct nfscreatargs *args)
2398 {
2399 	return (args->ca_da.da_fhandle);
2400 }
2401 
2402 /*
2403  * Remove a directory.
2404  * Remove the given directory name from the given parent directory.
2405  */
2406 void
2407 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2408 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2409 {
2410 	int error;
2411 	vnode_t *vp;
2412 
2413 
2414 	/*
2415 	 * Disallow NULL paths
2416 	 */
2417 	if (da->da_name == NULL || *da->da_name == '\0') {
2418 		*status = NFSERR_ACCES;
2419 		return;
2420 	}
2421 
2422 	vp = nfs_fhtovp(da->da_fhandle, exi);
2423 	if (vp == NULL) {
2424 		*status = NFSERR_STALE;
2425 		return;
2426 	}
2427 
2428 	if (rdonly(exi, req)) {
2429 		VN_RELE(vp);
2430 		*status = NFSERR_ROFS;
2431 		return;
2432 	}
2433 
2434 	/*
2435 	 * VOP_RMDIR now takes a new third argument (the current
2436 	 * directory of the process).  That's because someone
2437 	 * wants to return EINVAL if one tries to remove ".".
2438 	 * Of course, NFS servers have no idea what their
2439 	 * clients' current directories are.  We fake it by
2440 	 * supplying a vnode known to exist and illegal to
2441 	 * remove.
2442 	 */
2443 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2444 
2445 	/*
2446 	 * Force modified data and metadata out to stable storage.
2447 	 */
2448 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2449 
2450 	VN_RELE(vp);
2451 
2452 	/*
2453 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2454 	 * if the directory is not empty.  A System V NFS server
2455 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2456 	 * over the wire.
2457 	 */
2458 	if (error == EEXIST)
2459 		*status = NFSERR_NOTEMPTY;
2460 	else
2461 		*status = puterrno(error);
2462 
2463 }
2464 void *
2465 rfs_rmdir_getfh(struct nfsdiropargs *da)
2466 {
2467 	return (da->da_fhandle);
2468 }
2469 
2470 /* ARGSUSED */
2471 void
2472 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2473 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2474 {
2475 	int error;
2476 	int iseof;
2477 	struct iovec iov;
2478 	struct uio uio;
2479 	vnode_t *vp;
2480 	char *ndata = NULL;
2481 	struct sockaddr *ca;
2482 	size_t nents;
2483 	int ret;
2484 
2485 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2486 	if (vp == NULL) {
2487 		rd->rd_entries = NULL;
2488 		rd->rd_status = NFSERR_STALE;
2489 		return;
2490 	}
2491 
2492 	if (vp->v_type != VDIR) {
2493 		VN_RELE(vp);
2494 		rd->rd_entries = NULL;
2495 		rd->rd_status = NFSERR_NOTDIR;
2496 		return;
2497 	}
2498 
2499 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2500 
2501 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2502 
2503 	if (error) {
2504 		rd->rd_entries = NULL;
2505 		goto bad;
2506 	}
2507 
2508 	if (rda->rda_count == 0) {
2509 		rd->rd_entries = NULL;
2510 		rd->rd_size = 0;
2511 		rd->rd_eof = FALSE;
2512 		goto bad;
2513 	}
2514 
2515 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2516 
2517 	/*
2518 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2519 	 */
2520 	rd->rd_bufsize = (uint_t)rda->rda_count;
2521 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2522 
2523 	/*
2524 	 * Set up io vector to read directory data
2525 	 */
2526 	iov.iov_base = (caddr_t)rd->rd_entries;
2527 	iov.iov_len = rda->rda_count;
2528 	uio.uio_iov = &iov;
2529 	uio.uio_iovcnt = 1;
2530 	uio.uio_segflg = UIO_SYSSPACE;
2531 	uio.uio_extflg = UIO_COPY_CACHED;
2532 	uio.uio_loffset = (offset_t)rda->rda_offset;
2533 	uio.uio_resid = rda->rda_count;
2534 
2535 	/*
2536 	 * read directory
2537 	 */
2538 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2539 
2540 	/*
2541 	 * Clean up
2542 	 */
2543 	if (!error) {
2544 		/*
2545 		 * set size and eof
2546 		 */
2547 		if (uio.uio_resid == rda->rda_count) {
2548 			rd->rd_size = 0;
2549 			rd->rd_eof = TRUE;
2550 		} else {
2551 			rd->rd_size = (uint32_t)(rda->rda_count -
2552 			    uio.uio_resid);
2553 			rd->rd_eof = iseof ? TRUE : FALSE;
2554 		}
2555 	}
2556 
2557 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2558 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2559 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2560 	    rda->rda_count, &ndata);
2561 
2562 	if (ret != 0) {
2563 		size_t dropbytes;
2564 		/*
2565 		 * We had to drop one or more entries in order to fit
2566 		 * during the character conversion.  We need to patch
2567 		 * up the size and eof info.
2568 		 */
2569 		if (rd->rd_eof)
2570 			rd->rd_eof = FALSE;
2571 		dropbytes = nfscmd_dropped_entrysize(
2572 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2573 		rd->rd_size -= dropbytes;
2574 	}
2575 	if (ndata == NULL) {
2576 		ndata = (char *)rd->rd_entries;
2577 	} else if (ndata != (char *)rd->rd_entries) {
2578 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2579 		rd->rd_entries = (void *)ndata;
2580 		rd->rd_bufsize = rda->rda_count;
2581 	}
2582 
2583 bad:
2584 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2585 
2586 #if 0 /* notyet */
2587 	/*
2588 	 * Don't do this.  It causes local disk writes when just
2589 	 * reading the file and the overhead is deemed larger
2590 	 * than the benefit.
2591 	 */
2592 	/*
2593 	 * Force modified metadata out to stable storage.
2594 	 */
2595 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2596 #endif
2597 
2598 	VN_RELE(vp);
2599 
2600 	rd->rd_status = puterrno(error);
2601 
2602 }
2603 void *
2604 rfs_readdir_getfh(struct nfsrddirargs *rda)
2605 {
2606 	return (&rda->rda_fh);
2607 }
2608 void
2609 rfs_rddirfree(struct nfsrddirres *rd)
2610 {
2611 	if (rd->rd_entries != NULL)
2612 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2613 }
2614 
2615 /* ARGSUSED */
2616 void
2617 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2618 	struct svc_req *req, cred_t *cr)
2619 {
2620 	int error;
2621 	struct statvfs64 sb;
2622 	vnode_t *vp;
2623 
2624 	vp = nfs_fhtovp(fh, exi);
2625 	if (vp == NULL) {
2626 		fs->fs_status = NFSERR_STALE;
2627 		return;
2628 	}
2629 
2630 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2631 
2632 	if (!error) {
2633 		fs->fs_tsize = nfstsize();
2634 		fs->fs_bsize = sb.f_frsize;
2635 		fs->fs_blocks = sb.f_blocks;
2636 		fs->fs_bfree = sb.f_bfree;
2637 		fs->fs_bavail = sb.f_bavail;
2638 	}
2639 
2640 	VN_RELE(vp);
2641 
2642 	fs->fs_status = puterrno(error);
2643 
2644 }
2645 void *
2646 rfs_statfs_getfh(fhandle_t *fh)
2647 {
2648 	return (fh);
2649 }
2650 
2651 static int
2652 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2653 {
2654 	vap->va_mask = 0;
2655 
2656 	/*
2657 	 * There was a sign extension bug in some VFS based systems
2658 	 * which stored the mode as a short.  When it would get
2659 	 * assigned to a u_long, no sign extension would occur.
2660 	 * It needed to, but this wasn't noticed because sa_mode
2661 	 * would then get assigned back to the short, thus ignoring
2662 	 * the upper 16 bits of sa_mode.
2663 	 *
2664 	 * To make this implementation work for both broken
2665 	 * clients and good clients, we check for both versions
2666 	 * of the mode.
2667 	 */
2668 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2669 	    sa->sa_mode != (uint32_t)-1) {
2670 		vap->va_mask |= AT_MODE;
2671 		vap->va_mode = sa->sa_mode;
2672 	}
2673 	if (sa->sa_uid != (uint32_t)-1) {
2674 		vap->va_mask |= AT_UID;
2675 		vap->va_uid = sa->sa_uid;
2676 	}
2677 	if (sa->sa_gid != (uint32_t)-1) {
2678 		vap->va_mask |= AT_GID;
2679 		vap->va_gid = sa->sa_gid;
2680 	}
2681 	if (sa->sa_size != (uint32_t)-1) {
2682 		vap->va_mask |= AT_SIZE;
2683 		vap->va_size = sa->sa_size;
2684 	}
2685 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2686 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2687 #ifndef _LP64
2688 		/* return error if time overflow */
2689 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2690 			return (EOVERFLOW);
2691 #endif
2692 		vap->va_mask |= AT_ATIME;
2693 		/*
2694 		 * nfs protocol defines times as unsigned so don't extend sign,
2695 		 * unless sysadmin set nfs_allow_preepoch_time.
2696 		 */
2697 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2698 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2699 	}
2700 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2701 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2702 #ifndef _LP64
2703 		/* return error if time overflow */
2704 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2705 			return (EOVERFLOW);
2706 #endif
2707 		vap->va_mask |= AT_MTIME;
2708 		/*
2709 		 * nfs protocol defines times as unsigned so don't extend sign,
2710 		 * unless sysadmin set nfs_allow_preepoch_time.
2711 		 */
2712 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2713 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2714 	}
2715 	return (0);
2716 }
2717 
2718 static enum nfsftype vt_to_nf[] = {
2719 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2720 };
2721 
2722 /*
2723  * check the following fields for overflow: nodeid, size, and time.
2724  * There could be a problem when converting 64-bit LP64 fields
2725  * into 32-bit ones.  Return an error if there is an overflow.
2726  */
2727 int
2728 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2729 {
2730 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2731 	na->na_type = vt_to_nf[vap->va_type];
2732 
2733 	if (vap->va_mode == (unsigned short) -1)
2734 		na->na_mode = (uint32_t)-1;
2735 	else
2736 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2737 
2738 	if (vap->va_uid == (unsigned short)(-1))
2739 		na->na_uid = (uint32_t)(-1);
2740 	else if (vap->va_uid == UID_NOBODY)
2741 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2742 	else
2743 		na->na_uid = vap->va_uid;
2744 
2745 	if (vap->va_gid == (unsigned short)(-1))
2746 		na->na_gid = (uint32_t)-1;
2747 	else if (vap->va_gid == GID_NOBODY)
2748 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2749 	else
2750 		na->na_gid = vap->va_gid;
2751 
2752 	/*
2753 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2754 	 * vattr, but are bigger than 32 bit values supported?
2755 	 */
2756 	na->na_fsid = vap->va_fsid;
2757 
2758 	na->na_nodeid = vap->va_nodeid;
2759 
2760 	/*
2761 	 * Check to make sure that the nodeid is representable over the
2762 	 * wire without losing bits.
2763 	 */
2764 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2765 		return (EFBIG);
2766 	na->na_nlink = vap->va_nlink;
2767 
2768 	/*
2769 	 * Check for big files here, instead of at the caller.  See
2770 	 * comments in cstat for large special file explanation.
2771 	 */
2772 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2773 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2774 			return (EFBIG);
2775 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2776 			/* UNKNOWN_SIZE | OVERFLOW */
2777 			na->na_size = MAXOFF32_T;
2778 		} else
2779 			na->na_size = vap->va_size;
2780 	} else
2781 		na->na_size = vap->va_size;
2782 
2783 	/*
2784 	 * If the vnode times overflow the 32-bit times that NFS2
2785 	 * uses on the wire then return an error.
2786 	 */
2787 	if (!NFS_VAP_TIME_OK(vap)) {
2788 		return (EOVERFLOW);
2789 	}
2790 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2791 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2792 
2793 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2794 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2795 
2796 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2797 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2798 
2799 	/*
2800 	 * If the dev_t will fit into 16 bits then compress
2801 	 * it, otherwise leave it alone. See comments in
2802 	 * nfs_client.c.
2803 	 */
2804 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2805 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2806 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2807 	else
2808 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2809 
2810 	na->na_blocks = vap->va_nblocks;
2811 	na->na_blocksize = vap->va_blksize;
2812 
2813 	/*
2814 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2815 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2816 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2817 	 *
2818 	 * BUYER BEWARE:
2819 	 *  If you are porting the NFS to a non-Sun server, you probably
2820 	 *  don't want to include the following block of code.  The
2821 	 *  over-the-wire special file types will be changing with the
2822 	 *  NFS Protocol Revision.
2823 	 */
2824 	if (vap->va_type == VFIFO)
2825 		NA_SETFIFO(na);
2826 	return (0);
2827 }
2828 
2829 /*
2830  * acl v2 support: returns approximate permission.
2831  *	default: returns minimal permission (more restrictive)
2832  *	aclok: returns maximal permission (less restrictive)
2833  *	This routine changes the permissions that are alaredy in *va.
2834  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2835  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2836  */
2837 static void
2838 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2839 {
2840 	vsecattr_t	vsa;
2841 	int		aclcnt;
2842 	aclent_t	*aclentp;
2843 	mode_t		mask_perm;
2844 	mode_t		grp_perm;
2845 	mode_t		other_perm;
2846 	mode_t		other_orig;
2847 	int		error;
2848 
2849 	/* dont care default acl */
2850 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2851 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2852 
2853 	if (!error) {
2854 		aclcnt = vsa.vsa_aclcnt;
2855 		if (aclcnt > MIN_ACL_ENTRIES) {
2856 			/* non-trivial ACL */
2857 			aclentp = vsa.vsa_aclentp;
2858 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2859 				/* maximal permissions */
2860 				grp_perm = 0;
2861 				other_perm = 0;
2862 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2863 					switch (aclentp->a_type) {
2864 					case USER_OBJ:
2865 						break;
2866 					case USER:
2867 						grp_perm |=
2868 						    aclentp->a_perm << 3;
2869 						other_perm |= aclentp->a_perm;
2870 						break;
2871 					case GROUP_OBJ:
2872 						grp_perm |=
2873 						    aclentp->a_perm << 3;
2874 						break;
2875 					case GROUP:
2876 						other_perm |= aclentp->a_perm;
2877 						break;
2878 					case OTHER_OBJ:
2879 						other_orig = aclentp->a_perm;
2880 						break;
2881 					case CLASS_OBJ:
2882 						mask_perm = aclentp->a_perm;
2883 						break;
2884 					default:
2885 						break;
2886 					}
2887 				}
2888 				grp_perm &= mask_perm << 3;
2889 				other_perm &= mask_perm;
2890 				other_perm |= other_orig;
2891 
2892 			} else {
2893 				/* minimal permissions */
2894 				grp_perm = 070;
2895 				other_perm = 07;
2896 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2897 					switch (aclentp->a_type) {
2898 					case USER_OBJ:
2899 						break;
2900 					case USER:
2901 					case CLASS_OBJ:
2902 						grp_perm &=
2903 						    aclentp->a_perm << 3;
2904 						other_perm &=
2905 						    aclentp->a_perm;
2906 						break;
2907 					case GROUP_OBJ:
2908 						grp_perm &=
2909 						    aclentp->a_perm << 3;
2910 						break;
2911 					case GROUP:
2912 						other_perm &=
2913 						    aclentp->a_perm;
2914 						break;
2915 					case OTHER_OBJ:
2916 						other_perm &=
2917 						    aclentp->a_perm;
2918 						break;
2919 					default:
2920 						break;
2921 					}
2922 				}
2923 			}
2924 			/* copy to va */
2925 			va->va_mode &= ~077;
2926 			va->va_mode |= grp_perm | other_perm;
2927 		}
2928 		if (vsa.vsa_aclcnt)
2929 			kmem_free(vsa.vsa_aclentp,
2930 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2931 	}
2932 }
2933 
2934 void
2935 rfs_srvrinit(void)
2936 {
2937 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2938 	nfs2_srv_caller_id = fs_new_caller_id();
2939 }
2940 
2941 void
2942 rfs_srvrfini(void)
2943 {
2944 	mutex_destroy(&rfs_async_write_lock);
2945 }
2946 
2947 static int
2948 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2949 {
2950 	struct clist	*wcl;
2951 	int		wlist_len;
2952 	uint32_t	count = rr->rr_count;
2953 
2954 	wcl = ra->ra_wlist;
2955 
2956 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2957 		return (FALSE);
2958 	}
2959 
2960 	wcl = ra->ra_wlist;
2961 	rr->rr_ok.rrok_wlist_len = wlist_len;
2962 	rr->rr_ok.rrok_wlist = wcl;
2963 
2964 	return (TRUE);
2965 }
2966