xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 15721462a45d32950a5b1f05d7b7a0fe34f81811)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
/*
 * Forward declarations for helpers used by the routines below.
 * sattr_to_vattr() is used by rfs_setattr() to turn the over-the-wire
 * settable attributes into a vattr; a non-zero return is treated as an
 * errno.  acl_perm() is called on a vattr just before that vattr is
 * converted into the reply attributes.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id stored in cc_caller_id of the caller_context_t that the
 * NFSv2 server routines hand to the VOP layer.
 * NOTE(review): presumably assigned once when the server is initialized;
 * the assignment is not in this part of the file -- confirm.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		/* Lie about the object type for a referral */
121 		if (vn_is_nfs_reparse(vp, cr))
122 			va.va_type = VLNK;
123 
124 		acl_perm(vp, exi, &va, cr);
125 		error = vattr_to_nattr(&va, &ns->ns_attr);
126 	}
127 
128 	VN_RELE(vp);
129 
130 	ns->ns_status = puterrno(error);
131 }
132 void *
133 rfs_getattr_getfh(fhandle_t *fhp)
134 {
135 	return (fhp);
136 }
137 
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 *	args - request: file handle (saa_fh) and attributes to set (saa_sa)
 *	ns   - reply: ns_status, plus ns_attr when the operation succeeds
 *	exi  - export the file handle is resolved against
 *	req  - RPC request, consulted for the read-only export check
 *	cr   - credentials the operation is performed with
 *
 * When a delegation conflict is reported (EAGAIN together with
 * CC_WOULDBLOCK) no status is stored in ns; the current thread is
 * flagged T_WOULDBLOCK instead.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;		/* flags argument for VOP_SETATTR */
	int in_crit = 0;	/* set while inside the nbl critical region */
	vnode_t *vp;
	struct vattr va;	/* requested attributes, later the reply */
	struct vattr bva;	/* current owner and size of the file */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Refuse changes on a read-only export or a read-only vnode. */
	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/* Caller context handed to every VOP call made below. */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region to check lies between the smaller
			 * and the larger of the old and new sizes.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner gets the VOP_SPACE treatment; AT_SIZE
		 * is then dropped from the mask so that VOP_SETATTR
		 * below does not handle the size as well.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/* Clear CC_DONTBLOCK before the fsync below. */
	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 * The result of the fsync is deliberately ignored.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
324 void *
325 rfs_setattr_getfh(struct nfssaargs *args)
326 {
327 	return (&args->saa_fh);
328 }
329 
330 /*
331  * Directory lookup.
332  * Returns an fhandle and file attributes for file name in a directory.
333  */
334 /* ARGSUSED */
335 void
336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
337 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
338 {
339 	int error;
340 	vnode_t *dvp;
341 	vnode_t *vp;
342 	struct vattr va;
343 	fhandle_t *fhp = da->da_fhandle;
344 	struct sec_ol sec = {0, 0};
345 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
346 	char *name;
347 	struct sockaddr *ca;
348 
349 	/*
350 	 * Trusted Extension doesn't support NFSv2. MOUNT
351 	 * will reject v2 clients. Need to prevent v2 client
352 	 * access via WebNFS here.
353 	 */
354 	if (is_system_labeled() && req->rq_vers == 2) {
355 		dr->dr_status = NFSERR_ACCES;
356 		return;
357 	}
358 
359 	/*
360 	 * Disallow NULL paths
361 	 */
362 	if (da->da_name == NULL || *da->da_name == '\0') {
363 		dr->dr_status = NFSERR_ACCES;
364 		return;
365 	}
366 
367 	/*
368 	 * Allow lookups from the root - the default
369 	 * location of the public filehandle.
370 	 */
371 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
372 		dvp = rootdir;
373 		VN_HOLD(dvp);
374 	} else {
375 		dvp = nfs_fhtovp(fhp, exi);
376 		if (dvp == NULL) {
377 			dr->dr_status = NFSERR_STALE;
378 			return;
379 		}
380 	}
381 
382 	/*
383 	 * Not allow lookup beyond root.
384 	 * If the filehandle matches a filehandle of the exi,
385 	 * then the ".." refers beyond the root of an exported filesystem.
386 	 */
387 	if (strcmp(da->da_name, "..") == 0 &&
388 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
389 		VN_RELE(dvp);
390 		dr->dr_status = NFSERR_NOENT;
391 		return;
392 	}
393 
394 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
395 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
396 	    MAXPATHLEN);
397 
398 	if (name == NULL) {
399 		dr->dr_status = NFSERR_ACCES;
400 		return;
401 	}
402 
403 	exi_hold(exi);
404 
405 	/*
406 	 * If the public filehandle is used then allow
407 	 * a multi-component lookup, i.e. evaluate
408 	 * a pathname and follow symbolic links if
409 	 * necessary.
410 	 *
411 	 * This may result in a vnode in another filesystem
412 	 * which is OK as long as the filesystem is exported.
413 	 */
414 	if (PUBLIC_FH2(fhp)) {
415 		struct exportinfo *new;
416 
417 		publicfh_flag = TRUE;
418 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &new,
419 		    &sec);
420 
421 		if (error == 0) {
422 			exi_rele(exi);
423 			exi = new;
424 		}
425 	} else {
426 		/*
427 		 * Do a normal single component lookup.
428 		 */
429 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
430 		    NULL, NULL, NULL);
431 	}
432 
433 	if (name != da->da_name)
434 		kmem_free(name, MAXPATHLEN);
435 
436 
437 	if (!error) {
438 		va.va_mask = AT_ALL;	/* we want everything */
439 
440 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
441 
442 		/* check for overflows */
443 		if (!error) {
444 			acl_perm(vp, exi, &va, cr);
445 			error = vattr_to_nattr(&va, &dr->dr_attr);
446 			if (!error) {
447 				if (sec.sec_flags & SEC_QUERY)
448 					error = makefh_ol(&dr->dr_fhandle, exi,
449 					    sec.sec_index);
450 				else {
451 					error = makefh(&dr->dr_fhandle, vp,
452 					    exi);
453 					if (!error && publicfh_flag &&
454 					    !chk_clnt_sec(exi, req))
455 						auth_weak = TRUE;
456 				}
457 			}
458 		}
459 		VN_RELE(vp);
460 	}
461 
462 	VN_RELE(dvp);
463 
464 	/*
465 	 * The passed argument exportinfo is released by the
466 	 * caller, comon_dispatch
467 	 */
468 	exi_rele(exi);
469 
470 	/*
471 	 * If it's public fh, no 0x81, and client's flavor is
472 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
473 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
474 	 */
475 	if (auth_weak)
476 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
477 	else
478 		dr->dr_status = puterrno(error);
479 }
480 void *
481 rfs_lookup_getfh(struct nfsdiropargs *da)
482 {
483 	return (da->da_fhandle);
484 }
485 
486 /*
487  * Read symbolic link.
488  * Returns the string in the symbolic link at the given fhandle.
489  */
490 /* ARGSUSED */
491 void
492 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
493 	struct svc_req *req, cred_t *cr)
494 {
495 	int error;
496 	struct iovec iov;
497 	struct uio uio;
498 	vnode_t *vp;
499 	struct vattr va;
500 	struct sockaddr *ca;
501 	char *name = NULL;
502 	int is_referral = 0;
503 
504 	vp = nfs_fhtovp(fhp, exi);
505 	if (vp == NULL) {
506 		rl->rl_data = NULL;
507 		rl->rl_status = NFSERR_STALE;
508 		return;
509 	}
510 
511 	va.va_mask = AT_MODE;
512 
513 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
514 
515 	if (error) {
516 		VN_RELE(vp);
517 		rl->rl_data = NULL;
518 		rl->rl_status = puterrno(error);
519 		return;
520 	}
521 
522 	if (MANDLOCK(vp, va.va_mode)) {
523 		VN_RELE(vp);
524 		rl->rl_data = NULL;
525 		rl->rl_status = NFSERR_ACCES;
526 		return;
527 	}
528 
529 	/* We lied about the object type for a referral */
530 	if (vn_is_nfs_reparse(vp, cr))
531 		is_referral = 1;
532 
533 	/*
534 	 * XNFS and RFC1094 require us to return ENXIO if argument
535 	 * is not a link. BUGID 1138002.
536 	 */
537 	if (vp->v_type != VLNK && !is_referral) {
538 		VN_RELE(vp);
539 		rl->rl_data = NULL;
540 		rl->rl_status = NFSERR_NXIO;
541 		return;
542 	}
543 
544 	/*
545 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
546 	 */
547 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
548 
549 	if (is_referral) {
550 		char *s;
551 		size_t strsz;
552 
553 		/* Get an artificial symlink based on a referral */
554 		s = build_symlink(vp, cr, &strsz);
555 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
556 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
557 		    vnode_t *, vp, char *, s);
558 		if (s == NULL)
559 			error = EINVAL;
560 		else {
561 			error = 0;
562 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
563 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
564 			kmem_free(s, strsz);
565 		}
566 
567 	} else {
568 
569 		/*
570 		 * Set up io vector to read sym link data
571 		 */
572 		iov.iov_base = rl->rl_data;
573 		iov.iov_len = NFS_MAXPATHLEN;
574 		uio.uio_iov = &iov;
575 		uio.uio_iovcnt = 1;
576 		uio.uio_segflg = UIO_SYSSPACE;
577 		uio.uio_extflg = UIO_COPY_CACHED;
578 		uio.uio_loffset = (offset_t)0;
579 		uio.uio_resid = NFS_MAXPATHLEN;
580 
581 		/*
582 		 * Do the readlink.
583 		 */
584 		error = VOP_READLINK(vp, &uio, cr, NULL);
585 
586 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
587 
588 		if (!error)
589 			rl->rl_data[rl->rl_count] = '\0';
590 
591 	}
592 
593 
594 	VN_RELE(vp);
595 
596 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
597 	name = nfscmd_convname(ca, exi, rl->rl_data,
598 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
599 
600 	if (name != NULL && name != rl->rl_data) {
601 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
602 		rl->rl_data = name;
603 	}
604 
605 	/*
606 	 * XNFS and RFC1094 require us to return ENXIO if argument
607 	 * is not a link. UFS returns EINVAL if this is the case,
608 	 * so we do the mapping here. BUGID 1138002.
609 	 */
610 	if (error == EINVAL)
611 		rl->rl_status = NFSERR_NXIO;
612 	else
613 		rl->rl_status = puterrno(error);
614 
615 }
616 void *
617 rfs_readlink_getfh(fhandle_t *fhp)
618 {
619 	return (fhp);
620 }
621 /*
622  * Free data allocated by rfs_readlink
623  */
624 void
625 rfs_rlfree(struct nfsrdlnres *rl)
626 {
627 	if (rl->rl_data != NULL)
628 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
629 }
630 
/*
 * Helper used by rfs_read() on the RDMA write-list path; rfs_read()
 * treats a zero return as failure.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
632 
633 /*
634  * Read data.
635  * Returns some data read from the file at the given fhandle.
636  */
637 /* ARGSUSED */
638 void
639 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
640 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
641 {
642 	vnode_t *vp;
643 	int error;
644 	struct vattr va;
645 	struct iovec iov;
646 	struct uio uio;
647 	mblk_t *mp;
648 	int alloc_err = 0;
649 	int in_crit = 0;
650 	caller_context_t ct;
651 
652 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
653 	if (vp == NULL) {
654 		rr->rr_data = NULL;
655 		rr->rr_status = NFSERR_STALE;
656 		return;
657 	}
658 
659 	if (vp->v_type != VREG) {
660 		VN_RELE(vp);
661 		rr->rr_data = NULL;
662 		rr->rr_status = NFSERR_ISDIR;
663 		return;
664 	}
665 
666 	ct.cc_sysid = 0;
667 	ct.cc_pid = 0;
668 	ct.cc_caller_id = nfs2_srv_caller_id;
669 	ct.cc_flags = CC_DONTBLOCK;
670 
671 	/*
672 	 * Enter the critical region before calling VOP_RWLOCK
673 	 * to avoid a deadlock with write requests.
674 	 */
675 	if (nbl_need_check(vp)) {
676 		nbl_start_crit(vp, RW_READER);
677 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
678 		    0, NULL)) {
679 			nbl_end_crit(vp);
680 			VN_RELE(vp);
681 			rr->rr_data = NULL;
682 			rr->rr_status = NFSERR_ACCES;
683 			return;
684 		}
685 		in_crit = 1;
686 	}
687 
688 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
689 
690 	/* check if a monitor detected a delegation conflict */
691 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
692 		VN_RELE(vp);
693 		/* mark as wouldblock so response is dropped */
694 		curthread->t_flag |= T_WOULDBLOCK;
695 
696 		rr->rr_data = NULL;
697 		return;
698 	}
699 
700 	va.va_mask = AT_ALL;
701 
702 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
703 
704 	if (error) {
705 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
706 		if (in_crit)
707 			nbl_end_crit(vp);
708 
709 		VN_RELE(vp);
710 		rr->rr_data = NULL;
711 		rr->rr_status = puterrno(error);
712 
713 		return;
714 	}
715 
716 	/*
717 	 * This is a kludge to allow reading of files created
718 	 * with no read permission.  The owner of the file
719 	 * is always allowed to read it.
720 	 */
721 	if (crgetuid(cr) != va.va_uid) {
722 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
723 
724 		if (error) {
725 			/*
726 			 * Exec is the same as read over the net because
727 			 * of demand loading.
728 			 */
729 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
730 		}
731 		if (error) {
732 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
733 			if (in_crit)
734 				nbl_end_crit(vp);
735 			VN_RELE(vp);
736 			rr->rr_data = NULL;
737 			rr->rr_status = puterrno(error);
738 
739 			return;
740 		}
741 	}
742 
743 	if (MANDLOCK(vp, va.va_mode)) {
744 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
745 		if (in_crit)
746 			nbl_end_crit(vp);
747 
748 		VN_RELE(vp);
749 		rr->rr_data = NULL;
750 		rr->rr_status = NFSERR_ACCES;
751 
752 		return;
753 	}
754 
755 	rr->rr_ok.rrok_wlist_len = 0;
756 	rr->rr_ok.rrok_wlist = NULL;
757 
758 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
759 		rr->rr_count = 0;
760 		rr->rr_data = NULL;
761 		/*
762 		 * In this case, status is NFS_OK, but there is no data
763 		 * to encode. So set rr_mp to NULL.
764 		 */
765 		rr->rr_mp = NULL;
766 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
767 		if (rr->rr_ok.rrok_wlist)
768 			clist_zero_len(rr->rr_ok.rrok_wlist);
769 		goto done;
770 	}
771 
772 	if (ra->ra_wlist) {
773 		mp = NULL;
774 		rr->rr_mp = NULL;
775 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
776 		if (ra->ra_count > iov.iov_len) {
777 			rr->rr_data = NULL;
778 			rr->rr_status = NFSERR_INVAL;
779 			goto done;
780 		}
781 	} else {
782 		/*
783 		 * mp will contain the data to be sent out in the read reply.
784 		 * This will be freed after the reply has been sent out (by the
785 		 * driver).
786 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
787 		 * that the call to xdrmblk_putmblk() never fails.
788 		 */
789 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
790 		    &alloc_err);
791 		ASSERT(mp != NULL);
792 		ASSERT(alloc_err == 0);
793 
794 		rr->rr_mp = mp;
795 
796 		/*
797 		 * Set up io vector
798 		 */
799 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
800 		iov.iov_len = ra->ra_count;
801 	}
802 
803 	uio.uio_iov = &iov;
804 	uio.uio_iovcnt = 1;
805 	uio.uio_segflg = UIO_SYSSPACE;
806 	uio.uio_extflg = UIO_COPY_CACHED;
807 	uio.uio_loffset = (offset_t)ra->ra_offset;
808 	uio.uio_resid = ra->ra_count;
809 
810 	error = VOP_READ(vp, &uio, 0, cr, &ct);
811 
812 	if (error) {
813 		if (mp)
814 			freeb(mp);
815 
816 		/*
817 		 * check if a monitor detected a delegation conflict and
818 		 * mark as wouldblock so response is dropped
819 		 */
820 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
821 			curthread->t_flag |= T_WOULDBLOCK;
822 		else
823 			rr->rr_status = puterrno(error);
824 
825 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
826 		if (in_crit)
827 			nbl_end_crit(vp);
828 
829 		VN_RELE(vp);
830 		rr->rr_data = NULL;
831 
832 		return;
833 	}
834 
835 	/*
836 	 * Get attributes again so we can send the latest access
837 	 * time to the client side for his cache.
838 	 */
839 	va.va_mask = AT_ALL;
840 
841 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
842 
843 	if (error) {
844 		if (mp)
845 			freeb(mp);
846 
847 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
848 		if (in_crit)
849 			nbl_end_crit(vp);
850 
851 		VN_RELE(vp);
852 		rr->rr_data = NULL;
853 		rr->rr_status = puterrno(error);
854 
855 		return;
856 	}
857 
858 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
859 
860 	if (mp) {
861 		rr->rr_data = (char *)mp->b_datap->db_base;
862 	} else {
863 		if (ra->ra_wlist) {
864 			rr->rr_data = (caddr_t)iov.iov_base;
865 			if (!rdma_setup_read_data2(ra, rr)) {
866 				rr->rr_data = NULL;
867 				rr->rr_status = puterrno(NFSERR_INVAL);
868 			}
869 		}
870 	}
871 done:
872 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
873 	if (in_crit)
874 		nbl_end_crit(vp);
875 
876 	acl_perm(vp, exi, &va, cr);
877 
878 	/* check for overflows */
879 	error = vattr_to_nattr(&va, &rr->rr_attr);
880 
881 	VN_RELE(vp);
882 
883 	rr->rr_status = puterrno(error);
884 }
885 
886 /*
887  * Free data allocated by rfs_read
888  */
889 void
890 rfs_rdfree(struct nfsrdresult *rr)
891 {
892 	mblk_t *mp;
893 
894 	if (rr->rr_status == NFS_OK) {
895 		mp = rr->rr_mp;
896 		if (mp != NULL)
897 			freeb(mp);
898 	}
899 }
900 
901 void *
902 rfs_read_getfh(struct nfsreadargs *ra)
903 {
904 	return (&ra->ra_fhandle);
905 }
906 
/*
 * Size of the on-stack iovec array in rfs_write_sync().  A request
 * whose mblk chain has more segments than this gets a temporary
 * array from kmem_alloc() instead.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * Counters for rfs_write_sync(): "hits" counts mblk chains that fit
 * into the on-stack iovec array, "misses" those that did not.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
913 
914 /*
915  * Write data to file.
916  * Returns attributes of a file after writing some data to it.
917  *
918  * Any changes made here, especially in error handling might have
919  * to also be done in rfs_write (which clusters write requests).
920  */
921 void
922 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
923 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
924 {
925 	int error;
926 	vnode_t *vp;
927 	rlim64_t rlimit;
928 	struct vattr va;
929 	struct uio uio;
930 	struct iovec iov[MAX_IOVECS];
931 	mblk_t *m;
932 	struct iovec *iovp;
933 	int iovcnt;
934 	cred_t *savecred;
935 	int in_crit = 0;
936 	caller_context_t ct;
937 
938 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
939 	if (vp == NULL) {
940 		ns->ns_status = NFSERR_STALE;
941 		return;
942 	}
943 
944 	if (rdonly(exi, req)) {
945 		VN_RELE(vp);
946 		ns->ns_status = NFSERR_ROFS;
947 		return;
948 	}
949 
950 	if (vp->v_type != VREG) {
951 		VN_RELE(vp);
952 		ns->ns_status = NFSERR_ISDIR;
953 		return;
954 	}
955 
956 	ct.cc_sysid = 0;
957 	ct.cc_pid = 0;
958 	ct.cc_caller_id = nfs2_srv_caller_id;
959 	ct.cc_flags = CC_DONTBLOCK;
960 
961 	va.va_mask = AT_UID|AT_MODE;
962 
963 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
964 
965 	if (error) {
966 		VN_RELE(vp);
967 		ns->ns_status = puterrno(error);
968 
969 		return;
970 	}
971 
972 	if (crgetuid(cr) != va.va_uid) {
973 		/*
974 		 * This is a kludge to allow writes of files created
975 		 * with read only permission.  The owner of the file
976 		 * is always allowed to write it.
977 		 */
978 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
979 
980 		if (error) {
981 			VN_RELE(vp);
982 			ns->ns_status = puterrno(error);
983 			return;
984 		}
985 	}
986 
987 	/*
988 	 * Can't access a mandatory lock file.  This might cause
989 	 * the NFS service thread to block forever waiting for a
990 	 * lock to be released that will never be released.
991 	 */
992 	if (MANDLOCK(vp, va.va_mode)) {
993 		VN_RELE(vp);
994 		ns->ns_status = NFSERR_ACCES;
995 		return;
996 	}
997 
998 	/*
999 	 * We have to enter the critical region before calling VOP_RWLOCK
1000 	 * to avoid a deadlock with ufs.
1001 	 */
1002 	if (nbl_need_check(vp)) {
1003 		nbl_start_crit(vp, RW_READER);
1004 		in_crit = 1;
1005 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1006 		    wa->wa_count, 0, NULL)) {
1007 			error = EACCES;
1008 			goto out;
1009 		}
1010 	}
1011 
1012 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1013 
1014 	/* check if a monitor detected a delegation conflict */
1015 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1016 		VN_RELE(vp);
1017 		/* mark as wouldblock so response is dropped */
1018 		curthread->t_flag |= T_WOULDBLOCK;
1019 		return;
1020 	}
1021 
1022 	if (wa->wa_data || wa->wa_rlist) {
1023 		/* Do the RDMA thing if necessary */
1024 		if (wa->wa_rlist) {
1025 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1026 			iov[0].iov_len = wa->wa_count;
1027 		} else  {
1028 			iov[0].iov_base = wa->wa_data;
1029 			iov[0].iov_len = wa->wa_count;
1030 		}
1031 		uio.uio_iov = iov;
1032 		uio.uio_iovcnt = 1;
1033 		uio.uio_segflg = UIO_SYSSPACE;
1034 		uio.uio_extflg = UIO_COPY_DEFAULT;
1035 		uio.uio_loffset = (offset_t)wa->wa_offset;
1036 		uio.uio_resid = wa->wa_count;
1037 		/*
1038 		 * The limit is checked on the client. We
1039 		 * should allow any size writes here.
1040 		 */
1041 		uio.uio_llimit = curproc->p_fsz_ctl;
1042 		rlimit = uio.uio_llimit - wa->wa_offset;
1043 		if (rlimit < (rlim64_t)uio.uio_resid)
1044 			uio.uio_resid = (uint_t)rlimit;
1045 
1046 		/*
1047 		 * for now we assume no append mode
1048 		 */
1049 		/*
1050 		 * We're changing creds because VM may fault and we need
1051 		 * the cred of the current thread to be used if quota
1052 		 * checking is enabled.
1053 		 */
1054 		savecred = curthread->t_cred;
1055 		curthread->t_cred = cr;
1056 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1057 		curthread->t_cred = savecred;
1058 	} else {
1059 		iovcnt = 0;
1060 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1061 			iovcnt++;
1062 		if (iovcnt <= MAX_IOVECS) {
1063 #ifdef DEBUG
1064 			rfs_write_sync_hits++;
1065 #endif
1066 			iovp = iov;
1067 		} else {
1068 #ifdef DEBUG
1069 			rfs_write_sync_misses++;
1070 #endif
1071 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1072 		}
1073 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1074 		uio.uio_iov = iovp;
1075 		uio.uio_iovcnt = iovcnt;
1076 		uio.uio_segflg = UIO_SYSSPACE;
1077 		uio.uio_extflg = UIO_COPY_DEFAULT;
1078 		uio.uio_loffset = (offset_t)wa->wa_offset;
1079 		uio.uio_resid = wa->wa_count;
1080 		/*
1081 		 * The limit is checked on the client. We
1082 		 * should allow any size writes here.
1083 		 */
1084 		uio.uio_llimit = curproc->p_fsz_ctl;
1085 		rlimit = uio.uio_llimit - wa->wa_offset;
1086 		if (rlimit < (rlim64_t)uio.uio_resid)
1087 			uio.uio_resid = (uint_t)rlimit;
1088 
1089 		/*
1090 		 * For now we assume no append mode.
1091 		 */
1092 		/*
1093 		 * We're changing creds because VM may fault and we need
1094 		 * the cred of the current thread to be used if quota
1095 		 * checking is enabled.
1096 		 */
1097 		savecred = curthread->t_cred;
1098 		curthread->t_cred = cr;
1099 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1100 		curthread->t_cred = savecred;
1101 
1102 		if (iovp != iov)
1103 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1104 	}
1105 
1106 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1107 
1108 	if (!error) {
1109 		/*
1110 		 * Get attributes again so we send the latest mod
1111 		 * time to the client side for his cache.
1112 		 */
1113 		va.va_mask = AT_ALL;	/* now we want everything */
1114 
1115 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1116 
1117 		/* check for overflows */
1118 		if (!error) {
1119 			acl_perm(vp, exi, &va, cr);
1120 			error = vattr_to_nattr(&va, &ns->ns_attr);
1121 		}
1122 	}
1123 
1124 out:
1125 	if (in_crit)
1126 		nbl_end_crit(vp);
1127 	VN_RELE(vp);
1128 
1129 	/* check if a monitor detected a delegation conflict */
1130 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1131 		/* mark as wouldblock so response is dropped */
1132 		curthread->t_flag |= T_WOULDBLOCK;
1133 	else
1134 		ns->ns_status = puterrno(error);
1135 
1136 }
1137 
/*
 * One write request taking part in a write cluster.  rfs_write() fills
 * this in from its own arguments and links it into the cluster for the
 * request's file handle.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the request */
	struct nfsattrstat *ns;		/* where the request's result goes */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the request */
	kthread_t *thread;		/* thread that queued the request */
	struct rfs_async_write *list;	/* next request, by rising offset */
};
1146 
/*
 * A write cluster: the requests queued against one file handle.
 * Clusters are chained from rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle the cluster is for */
	kcondvar_t cv;			/* queued requests wait here */
	struct rfs_async_write *list;	/* requests in the cluster */
	struct rfs_async_write_list *next;	/* next cluster */
};
1153 
/*
 * Head of the list of active write clusters.  rfs_write() walks and
 * modifies the list while holding rfs_async_write_lock.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Size of the on-stack iovec array declared in rfs_write(). */
#define	MAXCLIOVECS	42
/*
 * Status value rfs_write() stores in a request's result before the
 * request has been processed; 0 cannot be used because it means OK.
 */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/*
 * NOTE(review): presumably the clustered-write counterparts of
 * rfs_write_sync_hits/misses; their use is not in this part of the
 * file -- confirm.
 */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1165 
1166 /*
1167  * Write data to file.
1168  * Returns attributes of a file after writing some data to it.
 *
 * When rfs_write_async is zero the request is simply handed to
 * rfs_write_sync().  Otherwise requests for the same file handle are
 * gathered into a "cluster":
 *
 *  - The first thread to arrive for a file handle links a cluster entry
 *    onto rfs_async_write_head and goes on to process every request
 *    that joins that cluster.
 *  - A thread that finds an existing cluster inserts its request into
 *    the cluster's list in ascending wa_offset order and sleeps on the
 *    cluster's condition variable until its ns_status is no longer
 *    RFSWRITE_INITVAL.
 *
 * The processing thread converts the file handle to a vnode, takes the
 * vnode's write lock, unlinks the cluster from the global chain, checks
 * permissions for every request, issues one VOP_WRITE per run of
 * contiguous requests, flushes with VOP_PUTPAGE/VOP_FSYNC if anything
 * was written, fills in any status still unset and finally wakes the
 * sleepers with cv_broadcast().
 *
 * Both the request record (nrpsp) and the cluster record (nlpsp) are
 * automatic variables of the calling thread.
1169  */
1170 void
1171 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1172 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1173 {
1174 	int error;
1175 	vnode_t *vp;
1176 	rlim64_t rlimit;
1177 	struct vattr va;
1178 	struct uio uio;
1179 	struct rfs_async_write_list *lp;
1180 	struct rfs_async_write_list *nlp;
1181 	struct rfs_async_write *rp;
1182 	struct rfs_async_write *nrp;
1183 	struct rfs_async_write *trp;
1184 	struct rfs_async_write *lrp;
1185 	int data_written;
1186 	int iovcnt;
1187 	mblk_t *m;
1188 	struct iovec *iovp;
1189 	struct iovec *niovp;
1190 	struct iovec iov[MAXCLIOVECS];
1191 	int count;
1192 	int rcount;
1193 	uint_t off;
1194 	uint_t len;
1195 	struct rfs_async_write nrpsp;
1196 	struct rfs_async_write_list nlpsp;
1197 	ushort_t t_flag;
1198 	cred_t *savecred;
1199 	int in_crit = 0;
1200 	caller_context_t ct;
1201 
1202 	if (!rfs_write_async) {
1203 		rfs_write_sync(wa, ns, exi, req, cr);
1204 		return;
1205 	}
1206 
1207 	/*
1208 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1209 	 * is considered an OK.
1210 	 */
1211 	ns->ns_status = RFSWRITE_INITVAL;
1212 
	/* Describe this request in a record that lives on our own stack. */
1213 	nrp = &nrpsp;
1214 	nrp->wa = wa;
1215 	nrp->ns = ns;
1216 	nrp->req = req;
1217 	nrp->cr = cr;
1218 	nrp->thread = curthread;
1219 
1220 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1221 
1222 	/*
1223 	 * Look to see if there is already a cluster started
1224 	 * for this file.
1225 	 */
1226 	mutex_enter(&rfs_async_write_lock);
1227 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1228 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1229 		    sizeof (fhandle_t)) == 0)
1230 			break;
1231 	}
1232 
1233 	/*
1234 	 * If lp is non-NULL, then there is already a cluster
1235 	 * started.  We need to place ourselves in the cluster
1236 	 * list in the right place as determined by starting
1237 	 * offset.  Conflicts with non-blocking mandatory locked
1238 	 * regions will be checked when the cluster is processed.
1239 	 */
1240 	if (lp != NULL) {
1241 		rp = lp->list;
1242 		trp = NULL;
		/* Find the first entry whose offset is not below ours. */
1243 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1244 			trp = rp;
1245 			rp = rp->list;
1246 		}
1247 		nrp->list = rp;
1248 		if (trp == NULL)
1249 			lp->list = nrp;
1250 		else
1251 			trp->list = nrp;
		/*
		 * Sleep until the thread processing this cluster has
		 * replaced our status; nothing else is done here.
		 */
1252 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1253 			cv_wait(&lp->cv, &rfs_async_write_lock);
1254 		mutex_exit(&rfs_async_write_lock);
1255 
1256 		return;
1257 	}
1258 
1259 	/*
1260 	 * No cluster started yet, start one and add ourselves
1261 	 * to the list of clusters.
1262 	 */
1263 	nrp->list = NULL;
1264 
1265 	nlp = &nlpsp;
1266 	nlp->fhp = &wa->wa_fhandle;
1267 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1268 	nlp->list = nrp;
1269 	nlp->next = NULL;
1270 
	/* Append the new cluster at the tail of the global chain. */
1271 	if (rfs_async_write_head == NULL) {
1272 		rfs_async_write_head = nlp;
1273 	} else {
1274 		lp = rfs_async_write_head;
1275 		while (lp->next != NULL)
1276 			lp = lp->next;
1277 		lp->next = nlp;
1278 	}
1279 	mutex_exit(&rfs_async_write_lock);
1280 
1281 	/*
1282 	 * Convert the file handle common to all of the requests
1283 	 * in this cluster to a vnode.
1284 	 */
1285 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1286 	if (vp == NULL) {
		/*
		 * Unlink the cluster, fail every queued request with
		 * NFSERR_STALE and wake the sleepers.
		 */
1287 		mutex_enter(&rfs_async_write_lock);
1288 		if (rfs_async_write_head == nlp)
1289 			rfs_async_write_head = nlp->next;
1290 		else {
1291 			lp = rfs_async_write_head;
1292 			while (lp->next != nlp)
1293 				lp = lp->next;
1294 			lp->next = nlp->next;
1295 		}
		/* Copy our T_WOULDBLOCK bit, if set, to each request's thread. */
1296 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1297 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1298 			rp->ns->ns_status = NFSERR_STALE;
1299 			rp->thread->t_flag |= t_flag;
1300 		}
1301 		cv_broadcast(&nlp->cv);
1302 		mutex_exit(&rfs_async_write_lock);
1303 
1304 		return;
1305 	}
1306 
1307 	/*
1308 	 * Can only write regular files.  Attempts to write any
1309 	 * other file types fail with EISDIR.
1310 	 */
1311 	if (vp->v_type != VREG) {
1312 		VN_RELE(vp);
1313 		mutex_enter(&rfs_async_write_lock);
1314 		if (rfs_async_write_head == nlp)
1315 			rfs_async_write_head = nlp->next;
1316 		else {
1317 			lp = rfs_async_write_head;
1318 			while (lp->next != nlp)
1319 				lp = lp->next;
1320 			lp->next = nlp->next;
1321 		}
1322 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1323 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1324 			rp->ns->ns_status = NFSERR_ISDIR;
1325 			rp->thread->t_flag |= t_flag;
1326 		}
1327 		cv_broadcast(&nlp->cv);
1328 		mutex_exit(&rfs_async_write_lock);
1329 
1330 		return;
1331 	}
1332 
1333 	/*
1334 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1335 	 * deadlock with ufs.
1336 	 */
1337 	if (nbl_need_check(vp)) {
1338 		nbl_start_crit(vp, RW_READER);
1339 		in_crit = 1;
1340 	}
1341 
	/* Caller context handed to every VOP call made below. */
1342 	ct.cc_sysid = 0;
1343 	ct.cc_pid = 0;
1344 	ct.cc_caller_id = nfs2_srv_caller_id;
1345 	ct.cc_flags = CC_DONTBLOCK;
1346 
1347 	/*
1348 	 * Lock the file for writing.  This operation provides
1349 	 * the delay which allows clusters to grow.
1350 	 */
1351 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1352 
1353 	/* check if a monitor detected a delegation conflict */
1354 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1355 		if (in_crit)
1356 			nbl_end_crit(vp);
1357 		VN_RELE(vp);
1358 		/* mark as wouldblock so response is dropped */
1359 		curthread->t_flag |= T_WOULDBLOCK;
1360 		mutex_enter(&rfs_async_write_lock);
1361 		if (rfs_async_write_head == nlp)
1362 			rfs_async_write_head = nlp->next;
1363 		else {
1364 			lp = rfs_async_write_head;
1365 			while (lp->next != nlp)
1366 				lp = lp->next;
1367 			lp->next = nlp->next;
1368 		}
1369 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1370 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1371 				rp->ns->ns_status = puterrno(error);
1372 				rp->thread->t_flag |= T_WOULDBLOCK;
1373 			}
1374 		}
1375 		cv_broadcast(&nlp->cv);
1376 		mutex_exit(&rfs_async_write_lock);
1377 
1378 		return;
1379 	}
1380 
1381 	/*
1382 	 * Disconnect this cluster from the list of clusters.
1383 	 * The cluster that is being dealt with must be fixed
1384 	 * in size after this point, so there is no reason
1385 	 * to leave it on the list so that new requests can
1386 	 * find it.
1387 	 *
1388 	 * The algorithm is that the first write request will
1389 	 * create a cluster, convert the file handle to a
1390 	 * vnode pointer, and then lock the file for writing.
1391 	 * This request is not likely to be clustered with
1392 	 * any others.  However, the next request will create
1393 	 * a new cluster and be blocked in VOP_RWLOCK while
1394 	 * the first request is being processed.  This delay
1395 	 * will allow more requests to be clustered in this
1396 	 * second cluster.
1397 	 */
1398 	mutex_enter(&rfs_async_write_lock);
1399 	if (rfs_async_write_head == nlp)
1400 		rfs_async_write_head = nlp->next;
1401 	else {
1402 		lp = rfs_async_write_head;
1403 		while (lp->next != nlp)
1404 			lp = lp->next;
1405 		lp->next = nlp->next;
1406 	}
1407 	mutex_exit(&rfs_async_write_lock);
1408 
1409 	/*
1410 	 * Step through the list of requests in this cluster.
1411 	 * We need to check permissions to make sure that all
1412 	 * of the requests have sufficient permission to write
1413 	 * the file.  A cluster can be composed of requests
1414 	 * from different clients and different users on each
1415 	 * client.
1416 	 *
1417 	 * As a side effect, we also calculate the size of the
1418 	 * byte range that this cluster encompasses.
1419 	 */
1420 	rp = nlp->list;
1421 	off = rp->wa->wa_offset;
1422 	len = (uint_t)0;
1423 	do {
1424 		if (rdonly(exi, rp->req)) {
1425 			rp->ns->ns_status = NFSERR_ROFS;
1426 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1427 			rp->thread->t_flag |= t_flag;
			/*
			 * `continue' in a do/while jumps to the loop
			 * condition, which advances rp to the next request.
			 */
1428 			continue;
1429 		}
1430 
1431 		va.va_mask = AT_UID|AT_MODE;
1432 
1433 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1434 
1435 		if (!error) {
1436 			if (crgetuid(rp->cr) != va.va_uid) {
1437 				/*
1438 				 * This is a kludge to allow writes of files
1439 				 * created with read only permission.  The
1440 				 * owner of the file is always allowed to
1441 				 * write it.
1442 				 */
1443 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1444 			}
1445 			if (!error && MANDLOCK(vp, va.va_mode))
1446 				error = EACCES;
1447 		}
1448 
1449 		/*
1450 		 * Check for a conflict with a nbmand-locked region.
1451 		 */
1452 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1453 		    rp->wa->wa_count, 0, NULL)) {
1454 			error = EACCES;
1455 		}
1456 
1457 		if (error) {
1458 			rp->ns->ns_status = puterrno(error);
1459 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1460 			rp->thread->t_flag |= t_flag;
1461 			continue;
1462 		}
		/*
		 * Grow len so that [off, off + len) reaches the end of
		 * this request; off is the offset of the first request.
		 */
1463 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1464 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1465 	} while ((rp = rp->list) != NULL);
1466 
1467 	/*
1468 	 * Step through the cluster attempting to gather as many
1469 	 * requests which are contiguous as possible.  These
1470 	 * contiguous requests are handled via one call to VOP_WRITE
1471 	 * instead of different calls to VOP_WRITE.  We also keep
1472 	 * track of the fact that any data was written.
1473 	 */
1474 	rp = nlp->list;
1475 	data_written = 0;
1476 	do {
1477 		/*
1478 		 * Skip any requests which are already marked as having an
1479 		 * error.
1480 		 */
1481 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1482 			rp = rp->list;
1483 			continue;
1484 		}
1485 
1486 		/*
1487 		 * Count the number of iovec's which are required
1488 		 * to handle this set of requests.  One iovec is
1489 		 * needed for each data buffer, whether addressed
1490 		 * by wa_data or by the b_rptr pointers in the
1491 		 * mblk chains.
1492 		 */
1493 		iovcnt = 0;
1494 		lrp = rp;
		/*
		 * The run ends at the end of the list, at a request that
		 * already has a status, or at a gap in the offsets.  On
		 * exit lrp is the first request after the run (or NULL).
		 */
1495 		for (;;) {
1496 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1497 				iovcnt++;
1498 			else {
1499 				m = lrp->wa->wa_mblk;
1500 				while (m != NULL) {
1501 					iovcnt++;
1502 					m = m->b_cont;
1503 				}
1504 			}
1505 			if (lrp->list == NULL ||
1506 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1507 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1508 			    lrp->list->wa->wa_offset) {
1509 				lrp = lrp->list;
1510 				break;
1511 			}
1512 			lrp = lrp->list;
1513 		}
1514 
		/*
		 * Use the on-stack iovec array when it is large enough;
		 * otherwise allocate one (freed after VOP_WRITE below).
		 */
1515 		if (iovcnt <= MAXCLIOVECS) {
1516 #ifdef DEBUG
1517 			rfs_write_hits++;
1518 #endif
1519 			niovp = iov;
1520 		} else {
1521 #ifdef DEBUG
1522 			rfs_write_misses++;
1523 #endif
1524 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1525 		}
1526 		/*
1527 		 * Put together the scatter/gather iovecs.
1528 		 */
1529 		iovp = niovp;
1530 		trp = rp;
1531 		count = 0;
1532 		do {
1533 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1534 				if (trp->wa->wa_rlist) {
1535 					iovp->iov_base =
1536 					    (char *)((trp->wa->wa_rlist)->
1537 					    u.c_daddr3);
1538 					iovp->iov_len = trp->wa->wa_count;
1539 				} else  {
1540 					iovp->iov_base = trp->wa->wa_data;
1541 					iovp->iov_len = trp->wa->wa_count;
1542 				}
1543 				iovp++;
1544 			} else {
1545 				m = trp->wa->wa_mblk;
1546 				rcount = trp->wa->wa_count;
				/*
				 * rcount is the part of wa_count not yet
				 * covered; the last iovec is trimmed when
				 * the mblk holds more than what remains.
				 */
1547 				while (m != NULL) {
1548 					iovp->iov_base = (caddr_t)m->b_rptr;
1549 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1550 					rcount -= iovp->iov_len;
1551 					if (rcount < 0)
1552 						iovp->iov_len += rcount;
1553 					iovp++;
1554 					if (rcount <= 0)
1555 						break;
1556 					m = m->b_cont;
1557 				}
1558 			}
1559 			count += trp->wa->wa_count;
1560 			trp = trp->list;
1561 		} while (trp != lrp);
1562 
1563 		uio.uio_iov = niovp;
1564 		uio.uio_iovcnt = iovcnt;
1565 		uio.uio_segflg = UIO_SYSSPACE;
1566 		uio.uio_extflg = UIO_COPY_DEFAULT;
1567 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1568 		uio.uio_resid = count;
1569 		/*
1570 		 * The limit is checked on the client. We
1571 		 * should allow any size writes here.
1572 		 */
1573 		uio.uio_llimit = curproc->p_fsz_ctl;
1574 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1575 		if (rlimit < (rlim64_t)uio.uio_resid)
1576 			uio.uio_resid = (uint_t)rlimit;
1577 
1578 		/*
1579 		 * For now we assume no append mode.
1580 		 */
1581 
1582 		/*
1583 		 * We're changing creds because VM may fault
1584 		 * and we need the cred of the current
1585 		 * thread to be used if quota * checking is
1586 		 * enabled.
1587 		 */
1588 		savecred = curthread->t_cred;
1589 		curthread->t_cred = cr;
1590 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1591 		curthread->t_cred = savecred;
1592 
1593 		/* check if a monitor detected a delegation conflict */
1594 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1595 			/* mark as wouldblock so response is dropped */
1596 			curthread->t_flag |= T_WOULDBLOCK;
1597 
1598 		if (niovp != iov)
1599 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1600 
1601 		if (!error) {
1602 			data_written = 1;
1603 			/*
1604 			 * Get attributes again so we send the latest mod
1605 			 * time to the client side for his cache.
1606 			 */
1607 			va.va_mask = AT_ALL;	/* now we want everything */
1608 
1609 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1610 
1611 			if (!error)
1612 				acl_perm(vp, exi, &va, rp->cr);
1613 		}
1614 
1615 		/*
1616 		 * Fill in the status responses for each request
1617 		 * which was just handled.  Also, copy the latest
1618 		 * attributes in to the attribute responses if
1619 		 * appropriate.
1620 		 */
1621 		t_flag = curthread->t_flag & T_WOULDBLOCK;
		/*
		 * Note that error is shared across the run: once it is
		 * non-zero (including from vattr_to_nattr) every later
		 * request in the run receives the same status.
		 */
1622 		do {
1623 			rp->thread->t_flag |= t_flag;
1624 			/* check for overflows */
1625 			if (!error) {
1626 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1627 			}
1628 			rp->ns->ns_status = puterrno(error);
1629 			rp = rp->list;
1630 		} while (rp != lrp);
1631 	} while (rp != NULL);
1632 
1633 	/*
1634 	 * If any data was written at all, then we need to flush
1635 	 * the data and metadata to stable storage.
1636 	 */
1637 	if (data_written) {
1638 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1639 
1640 		if (!error) {
1641 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1642 		}
1643 	}
1644 
1645 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1646 
1647 	if (in_crit)
1648 		nbl_end_crit(vp);
1649 	VN_RELE(vp);
1650 
	/*
	 * Give any request whose status is still unset the value of
	 * puterrno(error), then wake every thread sleeping on the cluster.
	 */
1651 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1652 	mutex_enter(&rfs_async_write_lock);
1653 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1654 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1655 			rp->ns->ns_status = puterrno(error);
1656 			rp->thread->t_flag |= t_flag;
1657 		}
1658 	}
1659 	cv_broadcast(&nlp->cv);
1660 	mutex_exit(&rfs_async_write_lock);
1661 
1662 }
1663 
1664 void *
1665 rfs_write_getfh(struct nfswriteargs *wa)
1666 {
1667 	return (&wa->wa_fhandle);
1668 }
1669 
1670 /*
1671  * Create a file.
1672  * Creates a file with given attributes and returns those attributes
1673  * and an fhandle for the new file.
1674  */
1675 void
1676 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1677 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1678 {
1679 	int error;
1680 	int lookuperr;
1681 	int in_crit = 0;
1682 	struct vattr va;
1683 	vnode_t *vp;
1684 	vnode_t *realvp;
1685 	vnode_t *dvp;
1686 	char *name = args->ca_da.da_name;
1687 	vnode_t *tvp = NULL;
1688 	int mode;
1689 	int lookup_ok;
1690 	bool_t trunc;
1691 	struct sockaddr *ca;
1692 
1693 	/*
1694 	 * Disallow NULL paths
1695 	 */
1696 	if (name == NULL || *name == '\0') {
1697 		dr->dr_status = NFSERR_ACCES;
1698 		return;
1699 	}
1700 
1701 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1702 	if (dvp == NULL) {
1703 		dr->dr_status = NFSERR_STALE;
1704 		return;
1705 	}
1706 
1707 	error = sattr_to_vattr(args->ca_sa, &va);
1708 	if (error) {
1709 		dr->dr_status = puterrno(error);
1710 		return;
1711 	}
1712 
1713 	/*
1714 	 * Must specify the mode.
1715 	 */
1716 	if (!(va.va_mask & AT_MODE)) {
1717 		VN_RELE(dvp);
1718 		dr->dr_status = NFSERR_INVAL;
1719 		return;
1720 	}
1721 
1722 	/*
1723 	 * This is a completely gross hack to make mknod
1724 	 * work over the wire until we can wack the protocol
1725 	 */
1726 	if ((va.va_mode & IFMT) == IFCHR) {
1727 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1728 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1729 		else {
1730 			va.va_type = VCHR;
1731 			/*
1732 			 * uncompress the received dev_t
1733 			 * if the top half is zero indicating a request
1734 			 * from an `older style' OS.
1735 			 */
1736 			if ((va.va_size & 0xffff0000) == 0)
1737 				va.va_rdev = nfsv2_expdev(va.va_size);
1738 			else
1739 				va.va_rdev = (dev_t)va.va_size;
1740 		}
1741 		va.va_mask &= ~AT_SIZE;
1742 	} else if ((va.va_mode & IFMT) == IFBLK) {
1743 		va.va_type = VBLK;
1744 		/*
1745 		 * uncompress the received dev_t
1746 		 * if the top half is zero indicating a request
1747 		 * from an `older style' OS.
1748 		 */
1749 		if ((va.va_size & 0xffff0000) == 0)
1750 			va.va_rdev = nfsv2_expdev(va.va_size);
1751 		else
1752 			va.va_rdev = (dev_t)va.va_size;
1753 		va.va_mask &= ~AT_SIZE;
1754 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1755 		va.va_type = VSOCK;
1756 	} else {
1757 		va.va_type = VREG;
1758 	}
1759 	va.va_mode &= ~IFMT;
1760 	va.va_mask |= AT_TYPE;
1761 
1762 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1763 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1764 	    MAXPATHLEN);
1765 	if (name == NULL) {
1766 		dr->dr_status = puterrno(EINVAL);
1767 		return;
1768 	}
1769 
1770 	/*
1771 	 * Why was the choice made to use VWRITE as the mode to the
1772 	 * call to VOP_CREATE ? This results in a bug.  When a client
1773 	 * opens a file that already exists and is RDONLY, the second
1774 	 * open fails with an EACESS because of the mode.
1775 	 * bug ID 1054648.
1776 	 */
1777 	lookup_ok = 0;
1778 	mode = VWRITE;
1779 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1780 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1781 		    NULL, NULL, NULL);
1782 		if (!error) {
1783 			struct vattr at;
1784 
1785 			lookup_ok = 1;
1786 			at.va_mask = AT_MODE;
1787 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1788 			if (!error)
1789 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1790 			VN_RELE(tvp);
1791 			tvp = NULL;
1792 		}
1793 	}
1794 
1795 	if (!lookup_ok) {
1796 		if (rdonly(exi, req)) {
1797 			error = EROFS;
1798 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1799 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1800 			error = EPERM;
1801 		} else {
1802 			error = 0;
1803 		}
1804 	}
1805 
1806 	/*
1807 	 * If file size is being modified on an already existing file
1808 	 * make sure that there are no conflicting non-blocking mandatory
1809 	 * locks in the region being manipulated. Return EACCES if there
1810 	 * are conflicting locks.
1811 	 */
1812 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1813 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1814 		    NULL, NULL, NULL);
1815 
1816 		if (!lookuperr &&
1817 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1818 			VN_RELE(tvp);
1819 			curthread->t_flag |= T_WOULDBLOCK;
1820 			goto out;
1821 		}
1822 
1823 		if (!lookuperr && nbl_need_check(tvp)) {
1824 			/*
1825 			 * The file exists. Now check if it has any
1826 			 * conflicting non-blocking mandatory locks
1827 			 * in the region being changed.
1828 			 */
1829 			struct vattr bva;
1830 			u_offset_t offset;
1831 			ssize_t length;
1832 
1833 			nbl_start_crit(tvp, RW_READER);
1834 			in_crit = 1;
1835 
1836 			bva.va_mask = AT_SIZE;
1837 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1838 			if (!error) {
1839 				if (va.va_size < bva.va_size) {
1840 					offset = va.va_size;
1841 					length = bva.va_size - va.va_size;
1842 				} else {
1843 					offset = bva.va_size;
1844 					length = va.va_size - bva.va_size;
1845 				}
1846 				if (length) {
1847 					if (nbl_conflict(tvp, NBL_WRITE,
1848 					    offset, length, 0, NULL)) {
1849 						error = EACCES;
1850 					}
1851 				}
1852 			}
1853 			if (error) {
1854 				nbl_end_crit(tvp);
1855 				VN_RELE(tvp);
1856 				in_crit = 0;
1857 			}
1858 		} else if (tvp != NULL) {
1859 			VN_RELE(tvp);
1860 		}
1861 	}
1862 
1863 	if (!error) {
1864 		/*
1865 		 * If filesystem is shared with nosuid the remove any
1866 		 * setuid/setgid bits on create.
1867 		 */
1868 		if (va.va_type == VREG &&
1869 		    exi->exi_export.ex_flags & EX_NOSUID)
1870 			va.va_mode &= ~(VSUID | VSGID);
1871 
1872 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1873 		    NULL, NULL);
1874 
1875 		if (!error) {
1876 
1877 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1878 				trunc = TRUE;
1879 			else
1880 				trunc = FALSE;
1881 
1882 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1883 				VN_RELE(vp);
1884 				curthread->t_flag |= T_WOULDBLOCK;
1885 				goto out;
1886 			}
1887 			va.va_mask = AT_ALL;
1888 
1889 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1890 
1891 			/* check for overflows */
1892 			if (!error) {
1893 				acl_perm(vp, exi, &va, cr);
1894 				error = vattr_to_nattr(&va, &dr->dr_attr);
1895 				if (!error) {
1896 					error = makefh(&dr->dr_fhandle, vp,
1897 					    exi);
1898 				}
1899 			}
1900 			/*
1901 			 * Force modified metadata out to stable storage.
1902 			 *
1903 			 * if a underlying vp exists, pass it to VOP_FSYNC
1904 			 */
1905 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1906 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1907 			else
1908 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1909 			VN_RELE(vp);
1910 		}
1911 
1912 		if (in_crit) {
1913 			nbl_end_crit(tvp);
1914 			VN_RELE(tvp);
1915 		}
1916 	}
1917 
1918 	/*
1919 	 * Force modified data and metadata out to stable storage.
1920 	 */
1921 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1922 
1923 out:
1924 
1925 	VN_RELE(dvp);
1926 
1927 	dr->dr_status = puterrno(error);
1928 
1929 	if (name != args->ca_da.da_name)
1930 		kmem_free(name, MAXPATHLEN);
1931 }
1932 void *
1933 rfs_create_getfh(struct nfscreatargs *args)
1934 {
1935 	return (args->ca_da.da_fhandle);
1936 }
1937 
1938 /*
1939  * Remove a file.
1940  * Remove named file from parent directory.
1941  */
1942 void
1943 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1944 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1945 {
1946 	int error = 0;
1947 	vnode_t *vp;
1948 	vnode_t *targvp;
1949 	int in_crit = 0;
1950 
1951 	/*
1952 	 * Disallow NULL paths
1953 	 */
1954 	if (da->da_name == NULL || *da->da_name == '\0') {
1955 		*status = NFSERR_ACCES;
1956 		return;
1957 	}
1958 
1959 	vp = nfs_fhtovp(da->da_fhandle, exi);
1960 	if (vp == NULL) {
1961 		*status = NFSERR_STALE;
1962 		return;
1963 	}
1964 
1965 	if (rdonly(exi, req)) {
1966 		VN_RELE(vp);
1967 		*status = NFSERR_ROFS;
1968 		return;
1969 	}
1970 
1971 	/*
1972 	 * Check for a conflict with a non-blocking mandatory share reservation.
1973 	 */
1974 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1975 	    NULL, cr, NULL, NULL, NULL);
1976 	if (error != 0) {
1977 		VN_RELE(vp);
1978 		*status = puterrno(error);
1979 		return;
1980 	}
1981 
1982 	/*
1983 	 * If the file is delegated to an v4 client, then initiate
1984 	 * recall and drop this request (by setting T_WOULDBLOCK).
1985 	 * The client will eventually re-transmit the request and
1986 	 * (hopefully), by then, the v4 client will have returned
1987 	 * the delegation.
1988 	 */
1989 
1990 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1991 		VN_RELE(vp);
1992 		VN_RELE(targvp);
1993 		curthread->t_flag |= T_WOULDBLOCK;
1994 		return;
1995 	}
1996 
1997 	if (nbl_need_check(targvp)) {
1998 		nbl_start_crit(targvp, RW_READER);
1999 		in_crit = 1;
2000 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2001 			error = EACCES;
2002 			goto out;
2003 		}
2004 	}
2005 
2006 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2007 
2008 	/*
2009 	 * Force modified data and metadata out to stable storage.
2010 	 */
2011 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2012 
2013 out:
2014 	if (in_crit)
2015 		nbl_end_crit(targvp);
2016 	VN_RELE(targvp);
2017 	VN_RELE(vp);
2018 
2019 	*status = puterrno(error);
2020 
2021 }
2022 
2023 void *
2024 rfs_remove_getfh(struct nfsdiropargs *da)
2025 {
2026 	return (da->da_fhandle);
2027 }
2028 
2029 /*
2030  * rename a file
2031  * Give a file (from) a new name (to).
2032  */
2033 void
2034 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2035 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2036 {
2037 	int error = 0;
2038 	vnode_t *fromvp;
2039 	vnode_t *tovp;
2040 	struct exportinfo *to_exi;
2041 	fhandle_t *fh;
2042 	vnode_t *srcvp;
2043 	vnode_t *targvp;
2044 	int in_crit = 0;
2045 
2046 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2047 	if (fromvp == NULL) {
2048 		*status = NFSERR_STALE;
2049 		return;
2050 	}
2051 
2052 	fh = args->rna_to.da_fhandle;
2053 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2054 	if (to_exi == NULL) {
2055 		VN_RELE(fromvp);
2056 		*status = NFSERR_ACCES;
2057 		return;
2058 	}
2059 	exi_rele(to_exi);
2060 
2061 	if (to_exi != exi) {
2062 		VN_RELE(fromvp);
2063 		*status = NFSERR_XDEV;
2064 		return;
2065 	}
2066 
2067 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2068 	if (tovp == NULL) {
2069 		VN_RELE(fromvp);
2070 		*status = NFSERR_STALE;
2071 		return;
2072 	}
2073 
2074 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2075 		VN_RELE(tovp);
2076 		VN_RELE(fromvp);
2077 		*status = NFSERR_NOTDIR;
2078 		return;
2079 	}
2080 
2081 	/*
2082 	 * Disallow NULL paths
2083 	 */
2084 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2085 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2086 		VN_RELE(tovp);
2087 		VN_RELE(fromvp);
2088 		*status = NFSERR_ACCES;
2089 		return;
2090 	}
2091 
2092 	if (rdonly(exi, req)) {
2093 		VN_RELE(tovp);
2094 		VN_RELE(fromvp);
2095 		*status = NFSERR_ROFS;
2096 		return;
2097 	}
2098 
2099 	/*
2100 	 * Check for a conflict with a non-blocking mandatory share reservation.
2101 	 */
2102 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2103 	    NULL, cr, NULL, NULL, NULL);
2104 	if (error != 0) {
2105 		VN_RELE(tovp);
2106 		VN_RELE(fromvp);
2107 		*status = puterrno(error);
2108 		return;
2109 	}
2110 
2111 	/* Check for delegations on the source file */
2112 
2113 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2114 		VN_RELE(tovp);
2115 		VN_RELE(fromvp);
2116 		VN_RELE(srcvp);
2117 		curthread->t_flag |= T_WOULDBLOCK;
2118 		return;
2119 	}
2120 
2121 	/* Check for delegation on the file being renamed over, if it exists */
2122 
2123 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2124 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2125 	    NULL, NULL, NULL) == 0) {
2126 
2127 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2128 			VN_RELE(tovp);
2129 			VN_RELE(fromvp);
2130 			VN_RELE(srcvp);
2131 			VN_RELE(targvp);
2132 			curthread->t_flag |= T_WOULDBLOCK;
2133 			return;
2134 		}
2135 		VN_RELE(targvp);
2136 	}
2137 
2138 
2139 	if (nbl_need_check(srcvp)) {
2140 		nbl_start_crit(srcvp, RW_READER);
2141 		in_crit = 1;
2142 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2143 			error = EACCES;
2144 			goto out;
2145 		}
2146 	}
2147 
2148 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2149 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2150 
2151 	if (error == 0)
2152 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2153 		    strlen(args->rna_to.da_name));
2154 
2155 	/*
2156 	 * Force modified data and metadata out to stable storage.
2157 	 */
2158 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2159 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2160 
2161 out:
2162 	if (in_crit)
2163 		nbl_end_crit(srcvp);
2164 	VN_RELE(srcvp);
2165 	VN_RELE(tovp);
2166 	VN_RELE(fromvp);
2167 
2168 	*status = puterrno(error);
2169 
2170 }
2171 void *
2172 rfs_rename_getfh(struct nfsrnmargs *args)
2173 {
2174 	return (args->rna_from.da_fhandle);
2175 }
2176 
2177 /*
2178  * Link to a file.
2179  * Create a file (to) which is a hard link to the given file (from).
2180  */
2181 void
2182 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2183 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2184 {
2185 	int error;
2186 	vnode_t *fromvp;
2187 	vnode_t *tovp;
2188 	struct exportinfo *to_exi;
2189 	fhandle_t *fh;
2190 
2191 	fromvp = nfs_fhtovp(args->la_from, exi);
2192 	if (fromvp == NULL) {
2193 		*status = NFSERR_STALE;
2194 		return;
2195 	}
2196 
2197 	fh = args->la_to.da_fhandle;
2198 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2199 	if (to_exi == NULL) {
2200 		VN_RELE(fromvp);
2201 		*status = NFSERR_ACCES;
2202 		return;
2203 	}
2204 	exi_rele(to_exi);
2205 
2206 	if (to_exi != exi) {
2207 		VN_RELE(fromvp);
2208 		*status = NFSERR_XDEV;
2209 		return;
2210 	}
2211 
2212 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2213 	if (tovp == NULL) {
2214 		VN_RELE(fromvp);
2215 		*status = NFSERR_STALE;
2216 		return;
2217 	}
2218 
2219 	if (tovp->v_type != VDIR) {
2220 		VN_RELE(tovp);
2221 		VN_RELE(fromvp);
2222 		*status = NFSERR_NOTDIR;
2223 		return;
2224 	}
2225 	/*
2226 	 * Disallow NULL paths
2227 	 */
2228 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2229 		VN_RELE(tovp);
2230 		VN_RELE(fromvp);
2231 		*status = NFSERR_ACCES;
2232 		return;
2233 	}
2234 
2235 	if (rdonly(exi, req)) {
2236 		VN_RELE(tovp);
2237 		VN_RELE(fromvp);
2238 		*status = NFSERR_ROFS;
2239 		return;
2240 	}
2241 
2242 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2243 
2244 	/*
2245 	 * Force modified data and metadata out to stable storage.
2246 	 */
2247 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2248 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2249 
2250 	VN_RELE(tovp);
2251 	VN_RELE(fromvp);
2252 
2253 	*status = puterrno(error);
2254 
2255 }
2256 void *
2257 rfs_link_getfh(struct nfslinkargs *args)
2258 {
2259 	return (args->la_from);
2260 }
2261 
2262 /*
2263  * Symbolicly link to a file.
2264  * Create a file (to) with the given attributes which is a symbolic link
2265  * to the given path name (to).
2266  */
2267 void
2268 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2269 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2270 {
2271 	int error;
2272 	struct vattr va;
2273 	vnode_t *vp;
2274 	vnode_t *svp;
2275 	int lerror;
2276 	struct sockaddr *ca;
2277 	char *name = NULL;
2278 
2279 	/*
2280 	 * Disallow NULL paths
2281 	 */
2282 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2283 		*status = NFSERR_ACCES;
2284 		return;
2285 	}
2286 
2287 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2288 	if (vp == NULL) {
2289 		*status = NFSERR_STALE;
2290 		return;
2291 	}
2292 
2293 	if (rdonly(exi, req)) {
2294 		VN_RELE(vp);
2295 		*status = NFSERR_ROFS;
2296 		return;
2297 	}
2298 
2299 	error = sattr_to_vattr(args->sla_sa, &va);
2300 	if (error) {
2301 		VN_RELE(vp);
2302 		*status = puterrno(error);
2303 		return;
2304 	}
2305 
2306 	if (!(va.va_mask & AT_MODE)) {
2307 		VN_RELE(vp);
2308 		*status = NFSERR_INVAL;
2309 		return;
2310 	}
2311 
2312 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2313 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2314 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2315 
2316 	if (name == NULL) {
2317 		*status = NFSERR_ACCES;
2318 		return;
2319 	}
2320 
2321 	va.va_type = VLNK;
2322 	va.va_mask |= AT_TYPE;
2323 
2324 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2325 
2326 	/*
2327 	 * Force new data and metadata out to stable storage.
2328 	 */
2329 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2330 	    NULL, cr, NULL, NULL, NULL);
2331 
2332 	if (!lerror) {
2333 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2334 		VN_RELE(svp);
2335 	}
2336 
2337 	/*
2338 	 * Force modified data and metadata out to stable storage.
2339 	 */
2340 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2341 
2342 	VN_RELE(vp);
2343 
2344 	*status = puterrno(error);
2345 	if (name != args->sla_tnm)
2346 		kmem_free(name, MAXPATHLEN);
2347 
2348 }
2349 void *
2350 rfs_symlink_getfh(struct nfsslargs *args)
2351 {
2352 	return (args->sla_from.da_fhandle);
2353 }
2354 
2355 /*
2356  * Make a directory.
2357  * Create a directory with the given name, parent directory, and attributes.
2358  * Returns a file handle and attributes for the new directory.
2359  */
2360 void
2361 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2362 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2363 {
2364 	int error;
2365 	struct vattr va;
2366 	vnode_t *dvp = NULL;
2367 	vnode_t *vp;
2368 	char *name = args->ca_da.da_name;
2369 
2370 	/*
2371 	 * Disallow NULL paths
2372 	 */
2373 	if (name == NULL || *name == '\0') {
2374 		dr->dr_status = NFSERR_ACCES;
2375 		return;
2376 	}
2377 
2378 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2379 	if (vp == NULL) {
2380 		dr->dr_status = NFSERR_STALE;
2381 		return;
2382 	}
2383 
2384 	if (rdonly(exi, req)) {
2385 		VN_RELE(vp);
2386 		dr->dr_status = NFSERR_ROFS;
2387 		return;
2388 	}
2389 
2390 	error = sattr_to_vattr(args->ca_sa, &va);
2391 	if (error) {
2392 		VN_RELE(vp);
2393 		dr->dr_status = puterrno(error);
2394 		return;
2395 	}
2396 
2397 	if (!(va.va_mask & AT_MODE)) {
2398 		VN_RELE(vp);
2399 		dr->dr_status = NFSERR_INVAL;
2400 		return;
2401 	}
2402 
2403 	va.va_type = VDIR;
2404 	va.va_mask |= AT_TYPE;
2405 
2406 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2407 
2408 	if (!error) {
2409 		/*
2410 		 * Attribtutes of the newly created directory should
2411 		 * be returned to the client.
2412 		 */
2413 		va.va_mask = AT_ALL; /* We want everything */
2414 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2415 
2416 		/* check for overflows */
2417 		if (!error) {
2418 			acl_perm(vp, exi, &va, cr);
2419 			error = vattr_to_nattr(&va, &dr->dr_attr);
2420 			if (!error) {
2421 				error = makefh(&dr->dr_fhandle, dvp, exi);
2422 			}
2423 		}
2424 		/*
2425 		 * Force new data and metadata out to stable storage.
2426 		 */
2427 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2428 		VN_RELE(dvp);
2429 	}
2430 
2431 	/*
2432 	 * Force modified data and metadata out to stable storage.
2433 	 */
2434 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2435 
2436 	VN_RELE(vp);
2437 
2438 	dr->dr_status = puterrno(error);
2439 
2440 }
2441 void *
2442 rfs_mkdir_getfh(struct nfscreatargs *args)
2443 {
2444 	return (args->ca_da.da_fhandle);
2445 }
2446 
2447 /*
2448  * Remove a directory.
2449  * Remove the given directory name from the given parent directory.
2450  */
2451 void
2452 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2453 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2454 {
2455 	int error;
2456 	vnode_t *vp;
2457 
2458 
2459 	/*
2460 	 * Disallow NULL paths
2461 	 */
2462 	if (da->da_name == NULL || *da->da_name == '\0') {
2463 		*status = NFSERR_ACCES;
2464 		return;
2465 	}
2466 
2467 	vp = nfs_fhtovp(da->da_fhandle, exi);
2468 	if (vp == NULL) {
2469 		*status = NFSERR_STALE;
2470 		return;
2471 	}
2472 
2473 	if (rdonly(exi, req)) {
2474 		VN_RELE(vp);
2475 		*status = NFSERR_ROFS;
2476 		return;
2477 	}
2478 
2479 	/*
2480 	 * VOP_RMDIR now takes a new third argument (the current
2481 	 * directory of the process).  That's because someone
2482 	 * wants to return EINVAL if one tries to remove ".".
2483 	 * Of course, NFS servers have no idea what their
2484 	 * clients' current directories are.  We fake it by
2485 	 * supplying a vnode known to exist and illegal to
2486 	 * remove.
2487 	 */
2488 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2489 
2490 	/*
2491 	 * Force modified data and metadata out to stable storage.
2492 	 */
2493 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2494 
2495 	VN_RELE(vp);
2496 
2497 	/*
2498 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2499 	 * if the directory is not empty.  A System V NFS server
2500 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2501 	 * over the wire.
2502 	 */
2503 	if (error == EEXIST)
2504 		*status = NFSERR_NOTEMPTY;
2505 	else
2506 		*status = puterrno(error);
2507 
2508 }
2509 void *
2510 rfs_rmdir_getfh(struct nfsdiropargs *da)
2511 {
2512 	return (da->da_fhandle);
2513 }
2514 
2515 /* ARGSUSED */
2516 void
2517 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2518 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2519 {
2520 	int error;
2521 	int iseof;
2522 	struct iovec iov;
2523 	struct uio uio;
2524 	vnode_t *vp;
2525 	char *ndata = NULL;
2526 	struct sockaddr *ca;
2527 	size_t nents;
2528 	int ret;
2529 
2530 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2531 	if (vp == NULL) {
2532 		rd->rd_entries = NULL;
2533 		rd->rd_status = NFSERR_STALE;
2534 		return;
2535 	}
2536 
2537 	if (vp->v_type != VDIR) {
2538 		VN_RELE(vp);
2539 		rd->rd_entries = NULL;
2540 		rd->rd_status = NFSERR_NOTDIR;
2541 		return;
2542 	}
2543 
2544 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2545 
2546 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2547 
2548 	if (error) {
2549 		rd->rd_entries = NULL;
2550 		goto bad;
2551 	}
2552 
2553 	if (rda->rda_count == 0) {
2554 		rd->rd_entries = NULL;
2555 		rd->rd_size = 0;
2556 		rd->rd_eof = FALSE;
2557 		goto bad;
2558 	}
2559 
2560 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2561 
2562 	/*
2563 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2564 	 */
2565 	rd->rd_bufsize = (uint_t)rda->rda_count;
2566 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2567 
2568 	/*
2569 	 * Set up io vector to read directory data
2570 	 */
2571 	iov.iov_base = (caddr_t)rd->rd_entries;
2572 	iov.iov_len = rda->rda_count;
2573 	uio.uio_iov = &iov;
2574 	uio.uio_iovcnt = 1;
2575 	uio.uio_segflg = UIO_SYSSPACE;
2576 	uio.uio_extflg = UIO_COPY_CACHED;
2577 	uio.uio_loffset = (offset_t)rda->rda_offset;
2578 	uio.uio_resid = rda->rda_count;
2579 
2580 	/*
2581 	 * read directory
2582 	 */
2583 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2584 
2585 	/*
2586 	 * Clean up
2587 	 */
2588 	if (!error) {
2589 		/*
2590 		 * set size and eof
2591 		 */
2592 		if (uio.uio_resid == rda->rda_count) {
2593 			rd->rd_size = 0;
2594 			rd->rd_eof = TRUE;
2595 		} else {
2596 			rd->rd_size = (uint32_t)(rda->rda_count -
2597 			    uio.uio_resid);
2598 			rd->rd_eof = iseof ? TRUE : FALSE;
2599 		}
2600 	}
2601 
2602 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2603 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2604 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2605 	    rda->rda_count, &ndata);
2606 
2607 	if (ret != 0) {
2608 		size_t dropbytes;
2609 		/*
2610 		 * We had to drop one or more entries in order to fit
2611 		 * during the character conversion.  We need to patch
2612 		 * up the size and eof info.
2613 		 */
2614 		if (rd->rd_eof)
2615 			rd->rd_eof = FALSE;
2616 		dropbytes = nfscmd_dropped_entrysize(
2617 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2618 		rd->rd_size -= dropbytes;
2619 	}
2620 	if (ndata == NULL) {
2621 		ndata = (char *)rd->rd_entries;
2622 	} else if (ndata != (char *)rd->rd_entries) {
2623 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2624 		rd->rd_entries = (void *)ndata;
2625 		rd->rd_bufsize = rda->rda_count;
2626 	}
2627 
2628 bad:
2629 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2630 
2631 #if 0 /* notyet */
2632 	/*
2633 	 * Don't do this.  It causes local disk writes when just
2634 	 * reading the file and the overhead is deemed larger
2635 	 * than the benefit.
2636 	 */
2637 	/*
2638 	 * Force modified metadata out to stable storage.
2639 	 */
2640 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2641 #endif
2642 
2643 	VN_RELE(vp);
2644 
2645 	rd->rd_status = puterrno(error);
2646 
2647 }
2648 void *
2649 rfs_readdir_getfh(struct nfsrddirargs *rda)
2650 {
2651 	return (&rda->rda_fh);
2652 }
2653 void
2654 rfs_rddirfree(struct nfsrddirres *rd)
2655 {
2656 	if (rd->rd_entries != NULL)
2657 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2658 }
2659 
2660 /* ARGSUSED */
2661 void
2662 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2663 	struct svc_req *req, cred_t *cr)
2664 {
2665 	int error;
2666 	struct statvfs64 sb;
2667 	vnode_t *vp;
2668 
2669 	vp = nfs_fhtovp(fh, exi);
2670 	if (vp == NULL) {
2671 		fs->fs_status = NFSERR_STALE;
2672 		return;
2673 	}
2674 
2675 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2676 
2677 	if (!error) {
2678 		fs->fs_tsize = nfstsize();
2679 		fs->fs_bsize = sb.f_frsize;
2680 		fs->fs_blocks = sb.f_blocks;
2681 		fs->fs_bfree = sb.f_bfree;
2682 		fs->fs_bavail = sb.f_bavail;
2683 	}
2684 
2685 	VN_RELE(vp);
2686 
2687 	fs->fs_status = puterrno(error);
2688 
2689 }
2690 void *
2691 rfs_statfs_getfh(fhandle_t *fh)
2692 {
2693 	return (fh);
2694 }
2695 
2696 static int
2697 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2698 {
2699 	vap->va_mask = 0;
2700 
2701 	/*
2702 	 * There was a sign extension bug in some VFS based systems
2703 	 * which stored the mode as a short.  When it would get
2704 	 * assigned to a u_long, no sign extension would occur.
2705 	 * It needed to, but this wasn't noticed because sa_mode
2706 	 * would then get assigned back to the short, thus ignoring
2707 	 * the upper 16 bits of sa_mode.
2708 	 *
2709 	 * To make this implementation work for both broken
2710 	 * clients and good clients, we check for both versions
2711 	 * of the mode.
2712 	 */
2713 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2714 	    sa->sa_mode != (uint32_t)-1) {
2715 		vap->va_mask |= AT_MODE;
2716 		vap->va_mode = sa->sa_mode;
2717 	}
2718 	if (sa->sa_uid != (uint32_t)-1) {
2719 		vap->va_mask |= AT_UID;
2720 		vap->va_uid = sa->sa_uid;
2721 	}
2722 	if (sa->sa_gid != (uint32_t)-1) {
2723 		vap->va_mask |= AT_GID;
2724 		vap->va_gid = sa->sa_gid;
2725 	}
2726 	if (sa->sa_size != (uint32_t)-1) {
2727 		vap->va_mask |= AT_SIZE;
2728 		vap->va_size = sa->sa_size;
2729 	}
2730 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2731 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2732 #ifndef _LP64
2733 		/* return error if time overflow */
2734 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2735 			return (EOVERFLOW);
2736 #endif
2737 		vap->va_mask |= AT_ATIME;
2738 		/*
2739 		 * nfs protocol defines times as unsigned so don't extend sign,
2740 		 * unless sysadmin set nfs_allow_preepoch_time.
2741 		 */
2742 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2743 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2744 	}
2745 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2746 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2747 #ifndef _LP64
2748 		/* return error if time overflow */
2749 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2750 			return (EOVERFLOW);
2751 #endif
2752 		vap->va_mask |= AT_MTIME;
2753 		/*
2754 		 * nfs protocol defines times as unsigned so don't extend sign,
2755 		 * unless sysadmin set nfs_allow_preepoch_time.
2756 		 */
2757 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2758 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2759 	}
2760 	return (0);
2761 }
2762 
2763 static enum nfsftype vt_to_nf[] = {
2764 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2765 };
2766 
2767 /*
2768  * check the following fields for overflow: nodeid, size, and time.
2769  * There could be a problem when converting 64-bit LP64 fields
2770  * into 32-bit ones.  Return an error if there is an overflow.
2771  */
2772 int
2773 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2774 {
2775 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2776 	na->na_type = vt_to_nf[vap->va_type];
2777 
2778 	if (vap->va_mode == (unsigned short) -1)
2779 		na->na_mode = (uint32_t)-1;
2780 	else
2781 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2782 
2783 	if (vap->va_uid == (unsigned short)(-1))
2784 		na->na_uid = (uint32_t)(-1);
2785 	else if (vap->va_uid == UID_NOBODY)
2786 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2787 	else
2788 		na->na_uid = vap->va_uid;
2789 
2790 	if (vap->va_gid == (unsigned short)(-1))
2791 		na->na_gid = (uint32_t)-1;
2792 	else if (vap->va_gid == GID_NOBODY)
2793 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2794 	else
2795 		na->na_gid = vap->va_gid;
2796 
2797 	/*
2798 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2799 	 * vattr, but are bigger than 32 bit values supported?
2800 	 */
2801 	na->na_fsid = vap->va_fsid;
2802 
2803 	na->na_nodeid = vap->va_nodeid;
2804 
2805 	/*
2806 	 * Check to make sure that the nodeid is representable over the
2807 	 * wire without losing bits.
2808 	 */
2809 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2810 		return (EFBIG);
2811 	na->na_nlink = vap->va_nlink;
2812 
2813 	/*
2814 	 * Check for big files here, instead of at the caller.  See
2815 	 * comments in cstat for large special file explanation.
2816 	 */
2817 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2818 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2819 			return (EFBIG);
2820 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2821 			/* UNKNOWN_SIZE | OVERFLOW */
2822 			na->na_size = MAXOFF32_T;
2823 		} else
2824 			na->na_size = vap->va_size;
2825 	} else
2826 		na->na_size = vap->va_size;
2827 
2828 	/*
2829 	 * If the vnode times overflow the 32-bit times that NFS2
2830 	 * uses on the wire then return an error.
2831 	 */
2832 	if (!NFS_VAP_TIME_OK(vap)) {
2833 		return (EOVERFLOW);
2834 	}
2835 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2836 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2837 
2838 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2839 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2840 
2841 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2842 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2843 
2844 	/*
2845 	 * If the dev_t will fit into 16 bits then compress
2846 	 * it, otherwise leave it alone. See comments in
2847 	 * nfs_client.c.
2848 	 */
2849 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2850 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2851 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2852 	else
2853 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2854 
2855 	na->na_blocks = vap->va_nblocks;
2856 	na->na_blocksize = vap->va_blksize;
2857 
2858 	/*
2859 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2860 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2861 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2862 	 *
2863 	 * BUYER BEWARE:
2864 	 *  If you are porting the NFS to a non-Sun server, you probably
2865 	 *  don't want to include the following block of code.  The
2866 	 *  over-the-wire special file types will be changing with the
2867 	 *  NFS Protocol Revision.
2868 	 */
2869 	if (vap->va_type == VFIFO)
2870 		NA_SETFIFO(na);
2871 	return (0);
2872 }
2873 
2874 /*
2875  * acl v2 support: returns approximate permission.
2876  *	default: returns minimal permission (more restrictive)
2877  *	aclok: returns maximal permission (less restrictive)
2878  *	This routine changes the permissions that are alaredy in *va.
2879  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2880  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2881  */
2882 static void
2883 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2884 {
2885 	vsecattr_t	vsa;
2886 	int		aclcnt;
2887 	aclent_t	*aclentp;
2888 	mode_t		mask_perm;
2889 	mode_t		grp_perm;
2890 	mode_t		other_perm;
2891 	mode_t		other_orig;
2892 	int		error;
2893 
2894 	/* dont care default acl */
2895 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2896 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2897 
2898 	if (!error) {
2899 		aclcnt = vsa.vsa_aclcnt;
2900 		if (aclcnt > MIN_ACL_ENTRIES) {
2901 			/* non-trivial ACL */
2902 			aclentp = vsa.vsa_aclentp;
2903 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2904 				/* maximal permissions */
2905 				grp_perm = 0;
2906 				other_perm = 0;
2907 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2908 					switch (aclentp->a_type) {
2909 					case USER_OBJ:
2910 						break;
2911 					case USER:
2912 						grp_perm |=
2913 						    aclentp->a_perm << 3;
2914 						other_perm |= aclentp->a_perm;
2915 						break;
2916 					case GROUP_OBJ:
2917 						grp_perm |=
2918 						    aclentp->a_perm << 3;
2919 						break;
2920 					case GROUP:
2921 						other_perm |= aclentp->a_perm;
2922 						break;
2923 					case OTHER_OBJ:
2924 						other_orig = aclentp->a_perm;
2925 						break;
2926 					case CLASS_OBJ:
2927 						mask_perm = aclentp->a_perm;
2928 						break;
2929 					default:
2930 						break;
2931 					}
2932 				}
2933 				grp_perm &= mask_perm << 3;
2934 				other_perm &= mask_perm;
2935 				other_perm |= other_orig;
2936 
2937 			} else {
2938 				/* minimal permissions */
2939 				grp_perm = 070;
2940 				other_perm = 07;
2941 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2942 					switch (aclentp->a_type) {
2943 					case USER_OBJ:
2944 						break;
2945 					case USER:
2946 					case CLASS_OBJ:
2947 						grp_perm &=
2948 						    aclentp->a_perm << 3;
2949 						other_perm &=
2950 						    aclentp->a_perm;
2951 						break;
2952 					case GROUP_OBJ:
2953 						grp_perm &=
2954 						    aclentp->a_perm << 3;
2955 						break;
2956 					case GROUP:
2957 						other_perm &=
2958 						    aclentp->a_perm;
2959 						break;
2960 					case OTHER_OBJ:
2961 						other_perm &=
2962 						    aclentp->a_perm;
2963 						break;
2964 					default:
2965 						break;
2966 					}
2967 				}
2968 			}
2969 			/* copy to va */
2970 			va->va_mode &= ~077;
2971 			va->va_mode |= grp_perm | other_perm;
2972 		}
2973 		if (vsa.vsa_aclcnt)
2974 			kmem_free(vsa.vsa_aclentp,
2975 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2976 	}
2977 }
2978 
2979 void
2980 rfs_srvrinit(void)
2981 {
2982 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2983 	nfs2_srv_caller_id = fs_new_caller_id();
2984 }
2985 
2986 void
2987 rfs_srvrfini(void)
2988 {
2989 	mutex_destroy(&rfs_async_write_lock);
2990 }
2991 
2992 static int
2993 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2994 {
2995 	struct clist	*wcl;
2996 	int		wlist_len;
2997 	uint32_t	count = rr->rr_count;
2998 
2999 	wcl = ra->ra_wlist;
3000 
3001 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3002 		return (FALSE);
3003 	}
3004 
3005 	wcl = ra->ra_wlist;
3006 	rr->rr_ok.rrok_wlist_len = wlist_len;
3007 	rr->rr_ok.rrok_wlist = wcl;
3008 
3009 	return (TRUE);
3010 }
3011