xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 5e01956f3000408c2a2c5a08c8d0acf2c2a9d8ee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
27  *	All rights reserved.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/cred.h>
34 #include <sys/buf.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/uio.h>
38 #include <sys/stat.h>
39 #include <sys/errno.h>
40 #include <sys/sysmacros.h>
41 #include <sys/statvfs.h>
42 #include <sys/kmem.h>
43 #include <sys/kstat.h>
44 #include <sys/dirent.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <sys/vtrace.h>
48 #include <sys/mode.h>
49 #include <sys/acl.h>
50 #include <sys/nbmlock.h>
51 #include <sys/policy.h>
52 #include <sys/sdt.h>
53 
54 #include <rpc/types.h>
55 #include <rpc/auth.h>
56 #include <rpc/svc.h>
57 
58 #include <nfs/nfs.h>
59 #include <nfs/export.h>
60 #include <nfs/nfs_cmd.h>
61 
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_map.h>
66 #include <vm/seg_kmem.h>
67 
68 #include <sys/strsubr.h>
69 
70 /*
71  * These are the interface routines for the server side of the
72  * Network File System.  See the NFS version 2 protocol specification
73  * for a description of this interface.
74  */
75 
/*
 * Forward declarations of file-local helpers used by the routines below.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id stored in cc_caller_id of every caller_context_t that the
 * NFS version 2 server routines in this file hand to the VOP layer.
 */
u_longlong_t nfs2_srv_caller_id;
90 
91 /*
92  * Get file attributes.
93  * Returns the current attributes of the file with the given fhandle.
94  */
95 /* ARGSUSED */
96 void
97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
98 	struct svc_req *req, cred_t *cr)
99 {
100 	int error;
101 	vnode_t *vp;
102 	struct vattr va;
103 
104 	vp = nfs_fhtovp(fhp, exi);
105 	if (vp == NULL) {
106 		ns->ns_status = NFSERR_STALE;
107 		return;
108 	}
109 
110 	/*
111 	 * Do the getattr.
112 	 */
113 	va.va_mask = AT_ALL;	/* we want all the attributes */
114 
115 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
116 
117 	/* check for overflows */
118 	if (!error) {
119 		/* Lie about the object type for a referral */
120 		if (vn_is_nfs_reparse(vp, cr))
121 			va.va_type = VLNK;
122 
123 		acl_perm(vp, exi, &va, cr);
124 		error = vattr_to_nattr(&va, &ns->ns_attr);
125 	}
126 
127 	VN_RELE(vp);
128 
129 	ns->ns_status = puterrno(error);
130 }
131 void *
132 rfs_getattr_getfh(fhandle_t *fhp)
133 {
134 	return (fhp);
135 }
136 
137 /*
138  * Set file attributes.
139  * Sets the attributes of the file with the given fhandle.  Returns
140  * the new attributes.
141  */
142 void
143 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
144 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
145 {
146 	int error;
147 	int flag;
148 	int in_crit = 0;
149 	vnode_t *vp;
150 	struct vattr va;
151 	struct vattr bva;
152 	struct flock64 bf;
153 	caller_context_t ct;
154 
155 
156 	vp = nfs_fhtovp(&args->saa_fh, exi);
157 	if (vp == NULL) {
158 		ns->ns_status = NFSERR_STALE;
159 		return;
160 	}
161 
162 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
163 		VN_RELE(vp);
164 		ns->ns_status = NFSERR_ROFS;
165 		return;
166 	}
167 
168 	error = sattr_to_vattr(&args->saa_sa, &va);
169 	if (error) {
170 		VN_RELE(vp);
171 		ns->ns_status = puterrno(error);
172 		return;
173 	}
174 
175 	/*
176 	 * If the client is requesting a change to the mtime,
177 	 * but the nanosecond field is set to 1 billion, then
178 	 * this is a flag to the server that it should set the
179 	 * atime and mtime fields to the server's current time.
180 	 * The 1 billion number actually came from the client
181 	 * as 1 million, but the units in the over the wire
182 	 * request are microseconds instead of nanoseconds.
183 	 *
184 	 * This is an overload of the protocol and should be
185 	 * documented in the NFS Version 2 protocol specification.
186 	 */
187 	if (va.va_mask & AT_MTIME) {
188 		if (va.va_mtime.tv_nsec == 1000000000) {
189 			gethrestime(&va.va_mtime);
190 			va.va_atime = va.va_mtime;
191 			va.va_mask |= AT_ATIME;
192 			flag = 0;
193 		} else
194 			flag = ATTR_UTIME;
195 	} else
196 		flag = 0;
197 
198 	/*
199 	 * If the filesystem is exported with nosuid, then mask off
200 	 * the setuid and setgid bits.
201 	 */
202 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
203 	    (exi->exi_export.ex_flags & EX_NOSUID))
204 		va.va_mode &= ~(VSUID | VSGID);
205 
206 	ct.cc_sysid = 0;
207 	ct.cc_pid = 0;
208 	ct.cc_caller_id = nfs2_srv_caller_id;
209 	ct.cc_flags = CC_DONTBLOCK;
210 
211 	/*
212 	 * We need to specially handle size changes because it is
213 	 * possible for the client to create a file with modes
214 	 * which indicate read-only, but with the file opened for
215 	 * writing.  If the client then tries to set the size of
216 	 * the file, then the normal access checking done in
217 	 * VOP_SETATTR would prevent the client from doing so,
218 	 * although it should be legal for it to do so.  To get
219 	 * around this, we do the access checking for ourselves
220 	 * and then use VOP_SPACE which doesn't do the access
221 	 * checking which VOP_SETATTR does. VOP_SPACE can only
222 	 * operate on VREG files, let VOP_SETATTR handle the other
223 	 * extremely rare cases.
224 	 * Also the client should not be allowed to change the
225 	 * size of the file if there is a conflicting non-blocking
226 	 * mandatory lock in the region of change.
227 	 */
228 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
229 		if (nbl_need_check(vp)) {
230 			nbl_start_crit(vp, RW_READER);
231 			in_crit = 1;
232 		}
233 
234 		bva.va_mask = AT_UID | AT_SIZE;
235 
236 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
237 
238 		if (error) {
239 			if (in_crit)
240 				nbl_end_crit(vp);
241 			VN_RELE(vp);
242 			ns->ns_status = puterrno(error);
243 			return;
244 		}
245 
246 		if (in_crit) {
247 			u_offset_t offset;
248 			ssize_t length;
249 
250 			if (va.va_size < bva.va_size) {
251 				offset = va.va_size;
252 				length = bva.va_size - va.va_size;
253 			} else {
254 				offset = bva.va_size;
255 				length = va.va_size - bva.va_size;
256 			}
257 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
258 			    NULL)) {
259 				error = EACCES;
260 			}
261 		}
262 
263 		if (crgetuid(cr) == bva.va_uid && !error &&
264 		    va.va_size != bva.va_size) {
265 			va.va_mask &= ~AT_SIZE;
266 			bf.l_type = F_WRLCK;
267 			bf.l_whence = 0;
268 			bf.l_start = (off64_t)va.va_size;
269 			bf.l_len = 0;
270 			bf.l_sysid = 0;
271 			bf.l_pid = 0;
272 
273 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
274 			    (offset_t)va.va_size, cr, &ct);
275 		}
276 		if (in_crit)
277 			nbl_end_crit(vp);
278 	} else
279 		error = 0;
280 
281 	/*
282 	 * Do the setattr.
283 	 */
284 	if (!error && va.va_mask) {
285 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
286 	}
287 
288 	/*
289 	 * check if the monitor on either vop_space or vop_setattr detected
290 	 * a delegation conflict and if so, mark the thread flag as
291 	 * wouldblock so that the response is dropped and the client will
292 	 * try again.
293 	 */
294 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
295 		VN_RELE(vp);
296 		curthread->t_flag |= T_WOULDBLOCK;
297 		return;
298 	}
299 
300 	if (!error) {
301 		va.va_mask = AT_ALL;	/* get everything */
302 
303 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
304 
305 		/* check for overflows */
306 		if (!error) {
307 			acl_perm(vp, exi, &va, cr);
308 			error = vattr_to_nattr(&va, &ns->ns_attr);
309 		}
310 	}
311 
312 	ct.cc_flags = 0;
313 
314 	/*
315 	 * Force modified metadata out to stable storage.
316 	 */
317 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
318 
319 	VN_RELE(vp);
320 
321 	ns->ns_status = puterrno(error);
322 }
323 void *
324 rfs_setattr_getfh(struct nfssaargs *args)
325 {
326 	return (&args->saa_fh);
327 }
328 
329 /*
330  * Directory lookup.
331  * Returns an fhandle and file attributes for file name in a directory.
332  */
333 /* ARGSUSED */
334 void
335 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
336 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
337 {
338 	int error;
339 	vnode_t *dvp;
340 	vnode_t *vp;
341 	struct vattr va;
342 	fhandle_t *fhp = da->da_fhandle;
343 	struct sec_ol sec = {0, 0};
344 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
345 	char *name;
346 	struct sockaddr *ca;
347 
348 	/*
349 	 * Trusted Extension doesn't support NFSv2. MOUNT
350 	 * will reject v2 clients. Need to prevent v2 client
351 	 * access via WebNFS here.
352 	 */
353 	if (is_system_labeled() && req->rq_vers == 2) {
354 		dr->dr_status = NFSERR_ACCES;
355 		return;
356 	}
357 
358 	/*
359 	 * Disallow NULL paths
360 	 */
361 	if (da->da_name == NULL || *da->da_name == '\0') {
362 		dr->dr_status = NFSERR_ACCES;
363 		return;
364 	}
365 
366 	/*
367 	 * Allow lookups from the root - the default
368 	 * location of the public filehandle.
369 	 */
370 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
371 		dvp = rootdir;
372 		VN_HOLD(dvp);
373 	} else {
374 		dvp = nfs_fhtovp(fhp, exi);
375 		if (dvp == NULL) {
376 			dr->dr_status = NFSERR_STALE;
377 			return;
378 		}
379 	}
380 
381 	/*
382 	 * Not allow lookup beyond root.
383 	 * If the filehandle matches a filehandle of the exi,
384 	 * then the ".." refers beyond the root of an exported filesystem.
385 	 */
386 	if (strcmp(da->da_name, "..") == 0 &&
387 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
388 		VN_RELE(dvp);
389 		dr->dr_status = NFSERR_NOENT;
390 		return;
391 	}
392 
393 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
394 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
395 	    MAXPATHLEN);
396 
397 	if (name == NULL) {
398 		dr->dr_status = NFSERR_ACCES;
399 		return;
400 	}
401 
402 	/*
403 	 * If the public filehandle is used then allow
404 	 * a multi-component lookup, i.e. evaluate
405 	 * a pathname and follow symbolic links if
406 	 * necessary.
407 	 *
408 	 * This may result in a vnode in another filesystem
409 	 * which is OK as long as the filesystem is exported.
410 	 */
411 	if (PUBLIC_FH2(fhp)) {
412 		publicfh_flag = TRUE;
413 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
414 		    &sec);
415 	} else {
416 		/*
417 		 * Do a normal single component lookup.
418 		 */
419 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
420 		    NULL, NULL, NULL);
421 	}
422 
423 	if (name != da->da_name)
424 		kmem_free(name, MAXPATHLEN);
425 
426 
427 	if (!error) {
428 		va.va_mask = AT_ALL;	/* we want everything */
429 
430 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
431 
432 		/* check for overflows */
433 		if (!error) {
434 			acl_perm(vp, exi, &va, cr);
435 			error = vattr_to_nattr(&va, &dr->dr_attr);
436 			if (!error) {
437 				if (sec.sec_flags & SEC_QUERY)
438 					error = makefh_ol(&dr->dr_fhandle, exi,
439 					    sec.sec_index);
440 				else {
441 					error = makefh(&dr->dr_fhandle, vp,
442 					    exi);
443 					if (!error && publicfh_flag &&
444 					    !chk_clnt_sec(exi, req))
445 						auth_weak = TRUE;
446 				}
447 			}
448 		}
449 		VN_RELE(vp);
450 	}
451 
452 	VN_RELE(dvp);
453 
454 	/*
455 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
456 	 * and have obtained a new exportinfo in exi which needs to be
457 	 * released. Note the the original exportinfo pointed to by exi
458 	 * will be released by the caller, comon_dispatch.
459 	 */
460 	if (publicfh_flag && exi != NULL)
461 		exi_rele(exi);
462 
463 	/*
464 	 * If it's public fh, no 0x81, and client's flavor is
465 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
466 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
467 	 */
468 	if (auth_weak)
469 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
470 	else
471 		dr->dr_status = puterrno(error);
472 }
473 void *
474 rfs_lookup_getfh(struct nfsdiropargs *da)
475 {
476 	return (da->da_fhandle);
477 }
478 
479 /*
480  * Read symbolic link.
481  * Returns the string in the symbolic link at the given fhandle.
482  */
483 /* ARGSUSED */
484 void
485 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
486 	struct svc_req *req, cred_t *cr)
487 {
488 	int error;
489 	struct iovec iov;
490 	struct uio uio;
491 	vnode_t *vp;
492 	struct vattr va;
493 	struct sockaddr *ca;
494 	char *name = NULL;
495 	int is_referral = 0;
496 
497 	vp = nfs_fhtovp(fhp, exi);
498 	if (vp == NULL) {
499 		rl->rl_data = NULL;
500 		rl->rl_status = NFSERR_STALE;
501 		return;
502 	}
503 
504 	va.va_mask = AT_MODE;
505 
506 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
507 
508 	if (error) {
509 		VN_RELE(vp);
510 		rl->rl_data = NULL;
511 		rl->rl_status = puterrno(error);
512 		return;
513 	}
514 
515 	if (MANDLOCK(vp, va.va_mode)) {
516 		VN_RELE(vp);
517 		rl->rl_data = NULL;
518 		rl->rl_status = NFSERR_ACCES;
519 		return;
520 	}
521 
522 	/* We lied about the object type for a referral */
523 	if (vn_is_nfs_reparse(vp, cr))
524 		is_referral = 1;
525 
526 	/*
527 	 * XNFS and RFC1094 require us to return ENXIO if argument
528 	 * is not a link. BUGID 1138002.
529 	 */
530 	if (vp->v_type != VLNK && !is_referral) {
531 		VN_RELE(vp);
532 		rl->rl_data = NULL;
533 		rl->rl_status = NFSERR_NXIO;
534 		return;
535 	}
536 
537 	/*
538 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
539 	 */
540 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
541 
542 	if (is_referral) {
543 		char *s;
544 		size_t strsz;
545 
546 		/* Get an artificial symlink based on a referral */
547 		s = build_symlink(vp, cr, &strsz);
548 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
549 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
550 		    vnode_t *, vp, char *, s);
551 		if (s == NULL)
552 			error = EINVAL;
553 		else {
554 			error = 0;
555 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
556 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
557 			kmem_free(s, strsz);
558 		}
559 
560 	} else {
561 
562 		/*
563 		 * Set up io vector to read sym link data
564 		 */
565 		iov.iov_base = rl->rl_data;
566 		iov.iov_len = NFS_MAXPATHLEN;
567 		uio.uio_iov = &iov;
568 		uio.uio_iovcnt = 1;
569 		uio.uio_segflg = UIO_SYSSPACE;
570 		uio.uio_extflg = UIO_COPY_CACHED;
571 		uio.uio_loffset = (offset_t)0;
572 		uio.uio_resid = NFS_MAXPATHLEN;
573 
574 		/*
575 		 * Do the readlink.
576 		 */
577 		error = VOP_READLINK(vp, &uio, cr, NULL);
578 
579 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
580 
581 		if (!error)
582 			rl->rl_data[rl->rl_count] = '\0';
583 
584 	}
585 
586 
587 	VN_RELE(vp);
588 
589 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
590 	name = nfscmd_convname(ca, exi, rl->rl_data,
591 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
592 
593 	if (name != NULL && name != rl->rl_data) {
594 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
595 		rl->rl_data = name;
596 	}
597 
598 	/*
599 	 * XNFS and RFC1094 require us to return ENXIO if argument
600 	 * is not a link. UFS returns EINVAL if this is the case,
601 	 * so we do the mapping here. BUGID 1138002.
602 	 */
603 	if (error == EINVAL)
604 		rl->rl_status = NFSERR_NXIO;
605 	else
606 		rl->rl_status = puterrno(error);
607 
608 }
609 void *
610 rfs_readlink_getfh(fhandle_t *fhp)
611 {
612 	return (fhp);
613 }
614 /*
615  * Free data allocated by rfs_readlink
616  */
617 void
618 rfs_rlfree(struct nfsrdlnres *rl)
619 {
620 	if (rl->rl_data != NULL)
621 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
622 }
623 
/*
 * File-local helper called by rfs_read for requests that carry an RDMA
 * write list (ra_wlist); rfs_read treats a zero return as failure.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
625 
626 /*
627  * Read data.
628  * Returns some data read from the file at the given fhandle.
629  */
630 /* ARGSUSED */
631 void
632 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
633 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
634 {
635 	vnode_t *vp;
636 	int error;
637 	struct vattr va;
638 	struct iovec iov;
639 	struct uio uio;
640 	mblk_t *mp;
641 	int alloc_err = 0;
642 	int in_crit = 0;
643 	caller_context_t ct;
644 
645 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
646 	if (vp == NULL) {
647 		rr->rr_data = NULL;
648 		rr->rr_status = NFSERR_STALE;
649 		return;
650 	}
651 
652 	if (vp->v_type != VREG) {
653 		VN_RELE(vp);
654 		rr->rr_data = NULL;
655 		rr->rr_status = NFSERR_ISDIR;
656 		return;
657 	}
658 
659 	ct.cc_sysid = 0;
660 	ct.cc_pid = 0;
661 	ct.cc_caller_id = nfs2_srv_caller_id;
662 	ct.cc_flags = CC_DONTBLOCK;
663 
664 	/*
665 	 * Enter the critical region before calling VOP_RWLOCK
666 	 * to avoid a deadlock with write requests.
667 	 */
668 	if (nbl_need_check(vp)) {
669 		nbl_start_crit(vp, RW_READER);
670 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
671 		    0, NULL)) {
672 			nbl_end_crit(vp);
673 			VN_RELE(vp);
674 			rr->rr_data = NULL;
675 			rr->rr_status = NFSERR_ACCES;
676 			return;
677 		}
678 		in_crit = 1;
679 	}
680 
681 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
682 
683 	/* check if a monitor detected a delegation conflict */
684 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
685 		VN_RELE(vp);
686 		/* mark as wouldblock so response is dropped */
687 		curthread->t_flag |= T_WOULDBLOCK;
688 
689 		rr->rr_data = NULL;
690 		return;
691 	}
692 
693 	va.va_mask = AT_ALL;
694 
695 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
696 
697 	if (error) {
698 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
699 		if (in_crit)
700 			nbl_end_crit(vp);
701 
702 		VN_RELE(vp);
703 		rr->rr_data = NULL;
704 		rr->rr_status = puterrno(error);
705 
706 		return;
707 	}
708 
709 	/*
710 	 * This is a kludge to allow reading of files created
711 	 * with no read permission.  The owner of the file
712 	 * is always allowed to read it.
713 	 */
714 	if (crgetuid(cr) != va.va_uid) {
715 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
716 
717 		if (error) {
718 			/*
719 			 * Exec is the same as read over the net because
720 			 * of demand loading.
721 			 */
722 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
723 		}
724 		if (error) {
725 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
726 			if (in_crit)
727 				nbl_end_crit(vp);
728 			VN_RELE(vp);
729 			rr->rr_data = NULL;
730 			rr->rr_status = puterrno(error);
731 
732 			return;
733 		}
734 	}
735 
736 	if (MANDLOCK(vp, va.va_mode)) {
737 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
738 		if (in_crit)
739 			nbl_end_crit(vp);
740 
741 		VN_RELE(vp);
742 		rr->rr_data = NULL;
743 		rr->rr_status = NFSERR_ACCES;
744 
745 		return;
746 	}
747 
748 	rr->rr_ok.rrok_wlist_len = 0;
749 	rr->rr_ok.rrok_wlist = NULL;
750 
751 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
752 		rr->rr_count = 0;
753 		rr->rr_data = NULL;
754 		/*
755 		 * In this case, status is NFS_OK, but there is no data
756 		 * to encode. So set rr_mp to NULL.
757 		 */
758 		rr->rr_mp = NULL;
759 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
760 		if (rr->rr_ok.rrok_wlist)
761 			clist_zero_len(rr->rr_ok.rrok_wlist);
762 		goto done;
763 	}
764 
765 	if (ra->ra_wlist) {
766 		mp = NULL;
767 		rr->rr_mp = NULL;
768 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
769 		if (ra->ra_count > iov.iov_len) {
770 			rr->rr_data = NULL;
771 			rr->rr_status = NFSERR_INVAL;
772 			goto done;
773 		}
774 	} else {
775 		/*
776 		 * mp will contain the data to be sent out in the read reply.
777 		 * This will be freed after the reply has been sent out (by the
778 		 * driver).
779 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
780 		 * that the call to xdrmblk_putmblk() never fails.
781 		 */
782 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
783 		    &alloc_err);
784 		ASSERT(mp != NULL);
785 		ASSERT(alloc_err == 0);
786 
787 		rr->rr_mp = mp;
788 
789 		/*
790 		 * Set up io vector
791 		 */
792 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
793 		iov.iov_len = ra->ra_count;
794 	}
795 
796 	uio.uio_iov = &iov;
797 	uio.uio_iovcnt = 1;
798 	uio.uio_segflg = UIO_SYSSPACE;
799 	uio.uio_extflg = UIO_COPY_CACHED;
800 	uio.uio_loffset = (offset_t)ra->ra_offset;
801 	uio.uio_resid = ra->ra_count;
802 
803 	error = VOP_READ(vp, &uio, 0, cr, &ct);
804 
805 	if (error) {
806 		if (mp)
807 			freeb(mp);
808 
809 		/*
810 		 * check if a monitor detected a delegation conflict and
811 		 * mark as wouldblock so response is dropped
812 		 */
813 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
814 			curthread->t_flag |= T_WOULDBLOCK;
815 		else
816 			rr->rr_status = puterrno(error);
817 
818 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
819 		if (in_crit)
820 			nbl_end_crit(vp);
821 
822 		VN_RELE(vp);
823 		rr->rr_data = NULL;
824 
825 		return;
826 	}
827 
828 	/*
829 	 * Get attributes again so we can send the latest access
830 	 * time to the client side for his cache.
831 	 */
832 	va.va_mask = AT_ALL;
833 
834 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
835 
836 	if (error) {
837 		if (mp)
838 			freeb(mp);
839 
840 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
841 		if (in_crit)
842 			nbl_end_crit(vp);
843 
844 		VN_RELE(vp);
845 		rr->rr_data = NULL;
846 		rr->rr_status = puterrno(error);
847 
848 		return;
849 	}
850 
851 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
852 
853 	if (mp) {
854 		rr->rr_data = (char *)mp->b_datap->db_base;
855 	} else {
856 		if (ra->ra_wlist) {
857 			rr->rr_data = (caddr_t)iov.iov_base;
858 			if (!rdma_setup_read_data2(ra, rr)) {
859 				rr->rr_data = NULL;
860 				rr->rr_status = puterrno(NFSERR_INVAL);
861 			}
862 		}
863 	}
864 done:
865 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
866 	if (in_crit)
867 		nbl_end_crit(vp);
868 
869 	acl_perm(vp, exi, &va, cr);
870 
871 	/* check for overflows */
872 	error = vattr_to_nattr(&va, &rr->rr_attr);
873 
874 	VN_RELE(vp);
875 
876 	rr->rr_status = puterrno(error);
877 }
878 
879 /*
880  * Free data allocated by rfs_read
881  */
882 void
883 rfs_rdfree(struct nfsrdresult *rr)
884 {
885 	mblk_t *mp;
886 
887 	if (rr->rr_status == NFS_OK) {
888 		mp = rr->rr_mp;
889 		if (mp != NULL)
890 			freeb(mp);
891 	}
892 }
893 
894 void *
895 rfs_read_getfh(struct nfsreadargs *ra)
896 {
897 	return (&ra->ra_fhandle);
898 }
899 
/*
 * Number of iovec entries rfs_write_sync keeps on its stack.  An mblk
 * chain with more segments than this gets a kmem_alloc'ed iovec array.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * DEBUG-only counters: a "hit" is an mblk chain that fit in the
 * on-stack iovec array of rfs_write_sync, a "miss" is one that did not.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
906 
907 /*
908  * Write data to file.
909  * Returns attributes of a file after writing some data to it.
910  *
911  * Any changes made here, especially in error handling might have
912  * to also be done in rfs_write (which clusters write requests).
913  */
914 void
915 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
916 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
917 {
918 	int error;
919 	vnode_t *vp;
920 	rlim64_t rlimit;
921 	struct vattr va;
922 	struct uio uio;
923 	struct iovec iov[MAX_IOVECS];
924 	mblk_t *m;
925 	struct iovec *iovp;
926 	int iovcnt;
927 	cred_t *savecred;
928 	int in_crit = 0;
929 	caller_context_t ct;
930 
931 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
932 	if (vp == NULL) {
933 		ns->ns_status = NFSERR_STALE;
934 		return;
935 	}
936 
937 	if (rdonly(exi, req)) {
938 		VN_RELE(vp);
939 		ns->ns_status = NFSERR_ROFS;
940 		return;
941 	}
942 
943 	if (vp->v_type != VREG) {
944 		VN_RELE(vp);
945 		ns->ns_status = NFSERR_ISDIR;
946 		return;
947 	}
948 
949 	ct.cc_sysid = 0;
950 	ct.cc_pid = 0;
951 	ct.cc_caller_id = nfs2_srv_caller_id;
952 	ct.cc_flags = CC_DONTBLOCK;
953 
954 	va.va_mask = AT_UID|AT_MODE;
955 
956 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
957 
958 	if (error) {
959 		VN_RELE(vp);
960 		ns->ns_status = puterrno(error);
961 
962 		return;
963 	}
964 
965 	if (crgetuid(cr) != va.va_uid) {
966 		/*
967 		 * This is a kludge to allow writes of files created
968 		 * with read only permission.  The owner of the file
969 		 * is always allowed to write it.
970 		 */
971 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
972 
973 		if (error) {
974 			VN_RELE(vp);
975 			ns->ns_status = puterrno(error);
976 			return;
977 		}
978 	}
979 
980 	/*
981 	 * Can't access a mandatory lock file.  This might cause
982 	 * the NFS service thread to block forever waiting for a
983 	 * lock to be released that will never be released.
984 	 */
985 	if (MANDLOCK(vp, va.va_mode)) {
986 		VN_RELE(vp);
987 		ns->ns_status = NFSERR_ACCES;
988 		return;
989 	}
990 
991 	/*
992 	 * We have to enter the critical region before calling VOP_RWLOCK
993 	 * to avoid a deadlock with ufs.
994 	 */
995 	if (nbl_need_check(vp)) {
996 		nbl_start_crit(vp, RW_READER);
997 		in_crit = 1;
998 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
999 		    wa->wa_count, 0, NULL)) {
1000 			error = EACCES;
1001 			goto out;
1002 		}
1003 	}
1004 
1005 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1006 
1007 	/* check if a monitor detected a delegation conflict */
1008 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1009 		VN_RELE(vp);
1010 		/* mark as wouldblock so response is dropped */
1011 		curthread->t_flag |= T_WOULDBLOCK;
1012 		return;
1013 	}
1014 
1015 	if (wa->wa_data || wa->wa_rlist) {
1016 		/* Do the RDMA thing if necessary */
1017 		if (wa->wa_rlist) {
1018 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1019 			iov[0].iov_len = wa->wa_count;
1020 		} else  {
1021 			iov[0].iov_base = wa->wa_data;
1022 			iov[0].iov_len = wa->wa_count;
1023 		}
1024 		uio.uio_iov = iov;
1025 		uio.uio_iovcnt = 1;
1026 		uio.uio_segflg = UIO_SYSSPACE;
1027 		uio.uio_extflg = UIO_COPY_DEFAULT;
1028 		uio.uio_loffset = (offset_t)wa->wa_offset;
1029 		uio.uio_resid = wa->wa_count;
1030 		/*
1031 		 * The limit is checked on the client. We
1032 		 * should allow any size writes here.
1033 		 */
1034 		uio.uio_llimit = curproc->p_fsz_ctl;
1035 		rlimit = uio.uio_llimit - wa->wa_offset;
1036 		if (rlimit < (rlim64_t)uio.uio_resid)
1037 			uio.uio_resid = (uint_t)rlimit;
1038 
1039 		/*
1040 		 * for now we assume no append mode
1041 		 */
1042 		/*
1043 		 * We're changing creds because VM may fault and we need
1044 		 * the cred of the current thread to be used if quota
1045 		 * checking is enabled.
1046 		 */
1047 		savecred = curthread->t_cred;
1048 		curthread->t_cred = cr;
1049 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1050 		curthread->t_cred = savecred;
1051 	} else {
1052 		iovcnt = 0;
1053 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1054 			iovcnt++;
1055 		if (iovcnt <= MAX_IOVECS) {
1056 #ifdef DEBUG
1057 			rfs_write_sync_hits++;
1058 #endif
1059 			iovp = iov;
1060 		} else {
1061 #ifdef DEBUG
1062 			rfs_write_sync_misses++;
1063 #endif
1064 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1065 		}
1066 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1067 		uio.uio_iov = iovp;
1068 		uio.uio_iovcnt = iovcnt;
1069 		uio.uio_segflg = UIO_SYSSPACE;
1070 		uio.uio_extflg = UIO_COPY_DEFAULT;
1071 		uio.uio_loffset = (offset_t)wa->wa_offset;
1072 		uio.uio_resid = wa->wa_count;
1073 		/*
1074 		 * The limit is checked on the client. We
1075 		 * should allow any size writes here.
1076 		 */
1077 		uio.uio_llimit = curproc->p_fsz_ctl;
1078 		rlimit = uio.uio_llimit - wa->wa_offset;
1079 		if (rlimit < (rlim64_t)uio.uio_resid)
1080 			uio.uio_resid = (uint_t)rlimit;
1081 
1082 		/*
1083 		 * For now we assume no append mode.
1084 		 */
1085 		/*
1086 		 * We're changing creds because VM may fault and we need
1087 		 * the cred of the current thread to be used if quota
1088 		 * checking is enabled.
1089 		 */
1090 		savecred = curthread->t_cred;
1091 		curthread->t_cred = cr;
1092 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1093 		curthread->t_cred = savecred;
1094 
1095 		if (iovp != iov)
1096 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1097 	}
1098 
1099 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1100 
1101 	if (!error) {
1102 		/*
1103 		 * Get attributes again so we send the latest mod
1104 		 * time to the client side for his cache.
1105 		 */
1106 		va.va_mask = AT_ALL;	/* now we want everything */
1107 
1108 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1109 
1110 		/* check for overflows */
1111 		if (!error) {
1112 			acl_perm(vp, exi, &va, cr);
1113 			error = vattr_to_nattr(&va, &ns->ns_attr);
1114 		}
1115 	}
1116 
1117 out:
1118 	if (in_crit)
1119 		nbl_end_crit(vp);
1120 	VN_RELE(vp);
1121 
1122 	/* check if a monitor detected a delegation conflict */
1123 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1124 		/* mark as wouldblock so response is dropped */
1125 		curthread->t_flag |= T_WOULDBLOCK;
1126 	else
1127 		ns->ns_status = puterrno(error);
1128 
1129 }
1130 
/*
 * One write request taking part in a write cluster.  rfs_write fills
 * one of these in from its own arguments and links it into the cluster
 * for the file being written.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the request */
	struct nfsattrstat *ns;		/* where the result is stored */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the request */
	kthread_t *thread;		/* thread that queued the request */
	struct rfs_async_write *list;	/* next request, by wa_offset */
};
1139 
/*
 * A write cluster: the requests queued against one file handle.
 * Clusters are chained from rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* joiners wait here for ns_status */
	struct rfs_async_write *list;	/* requests in the cluster */
	struct rfs_async_write_list *next;	/* next cluster */
};
1146 
/*
 * List of write clusters currently in progress; rfs_write walks and
 * modifies it while holding rfs_async_write_lock.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Size of the on-stack iovec array declared in rfs_write. */
#define	MAXCLIOVECS	42
/*
 * Placeholder stored in ns_status while a clustered request is pending;
 * 0 cannot be used because it is considered an OK.
 */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* DEBUG-only counters for rfs_write. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1158 
1159 /*
1160  * Write data to file.
1161  * Returns attributes of a file after writing some data to it.
1162  */
1163 void
1164 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1165 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1166 {
1167 	int error;
1168 	vnode_t *vp;
1169 	rlim64_t rlimit;
1170 	struct vattr va;
1171 	struct uio uio;
1172 	struct rfs_async_write_list *lp;
1173 	struct rfs_async_write_list *nlp;
1174 	struct rfs_async_write *rp;
1175 	struct rfs_async_write *nrp;
1176 	struct rfs_async_write *trp;
1177 	struct rfs_async_write *lrp;
1178 	int data_written;
1179 	int iovcnt;
1180 	mblk_t *m;
1181 	struct iovec *iovp;
1182 	struct iovec *niovp;
1183 	struct iovec iov[MAXCLIOVECS];
1184 	int count;
1185 	int rcount;
1186 	uint_t off;
1187 	uint_t len;
1188 	struct rfs_async_write nrpsp;
1189 	struct rfs_async_write_list nlpsp;
1190 	ushort_t t_flag;
1191 	cred_t *savecred;
1192 	int in_crit = 0;
1193 	caller_context_t ct;
1194 
1195 	if (!rfs_write_async) {
1196 		rfs_write_sync(wa, ns, exi, req, cr);
1197 		return;
1198 	}
1199 
1200 	/*
1201 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1202 	 * is considered an OK.
1203 	 */
1204 	ns->ns_status = RFSWRITE_INITVAL;
1205 
1206 	nrp = &nrpsp;
1207 	nrp->wa = wa;
1208 	nrp->ns = ns;
1209 	nrp->req = req;
1210 	nrp->cr = cr;
1211 	nrp->thread = curthread;
1212 
1213 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1214 
1215 	/*
1216 	 * Look to see if there is already a cluster started
1217 	 * for this file.
1218 	 */
1219 	mutex_enter(&rfs_async_write_lock);
1220 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1221 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1222 		    sizeof (fhandle_t)) == 0)
1223 			break;
1224 	}
1225 
1226 	/*
1227 	 * If lp is non-NULL, then there is already a cluster
1228 	 * started.  We need to place ourselves in the cluster
1229 	 * list in the right place as determined by starting
1230 	 * offset.  Conflicts with non-blocking mandatory locked
1231 	 * regions will be checked when the cluster is processed.
1232 	 */
1233 	if (lp != NULL) {
1234 		rp = lp->list;
1235 		trp = NULL;
1236 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1237 			trp = rp;
1238 			rp = rp->list;
1239 		}
1240 		nrp->list = rp;
1241 		if (trp == NULL)
1242 			lp->list = nrp;
1243 		else
1244 			trp->list = nrp;
1245 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1246 			cv_wait(&lp->cv, &rfs_async_write_lock);
1247 		mutex_exit(&rfs_async_write_lock);
1248 
1249 		return;
1250 	}
1251 
1252 	/*
1253 	 * No cluster started yet, start one and add ourselves
1254 	 * to the list of clusters.
1255 	 */
1256 	nrp->list = NULL;
1257 
1258 	nlp = &nlpsp;
1259 	nlp->fhp = &wa->wa_fhandle;
1260 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1261 	nlp->list = nrp;
1262 	nlp->next = NULL;
1263 
1264 	if (rfs_async_write_head == NULL) {
1265 		rfs_async_write_head = nlp;
1266 	} else {
1267 		lp = rfs_async_write_head;
1268 		while (lp->next != NULL)
1269 			lp = lp->next;
1270 		lp->next = nlp;
1271 	}
1272 	mutex_exit(&rfs_async_write_lock);
1273 
1274 	/*
1275 	 * Convert the file handle common to all of the requests
1276 	 * in this cluster to a vnode.
1277 	 */
1278 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1279 	if (vp == NULL) {
1280 		mutex_enter(&rfs_async_write_lock);
1281 		if (rfs_async_write_head == nlp)
1282 			rfs_async_write_head = nlp->next;
1283 		else {
1284 			lp = rfs_async_write_head;
1285 			while (lp->next != nlp)
1286 				lp = lp->next;
1287 			lp->next = nlp->next;
1288 		}
1289 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1290 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1291 			rp->ns->ns_status = NFSERR_STALE;
1292 			rp->thread->t_flag |= t_flag;
1293 		}
1294 		cv_broadcast(&nlp->cv);
1295 		mutex_exit(&rfs_async_write_lock);
1296 
1297 		return;
1298 	}
1299 
1300 	/*
1301 	 * Can only write regular files.  Attempts to write any
1302 	 * other file types fail with EISDIR.
1303 	 */
1304 	if (vp->v_type != VREG) {
1305 		VN_RELE(vp);
1306 		mutex_enter(&rfs_async_write_lock);
1307 		if (rfs_async_write_head == nlp)
1308 			rfs_async_write_head = nlp->next;
1309 		else {
1310 			lp = rfs_async_write_head;
1311 			while (lp->next != nlp)
1312 				lp = lp->next;
1313 			lp->next = nlp->next;
1314 		}
1315 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1316 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1317 			rp->ns->ns_status = NFSERR_ISDIR;
1318 			rp->thread->t_flag |= t_flag;
1319 		}
1320 		cv_broadcast(&nlp->cv);
1321 		mutex_exit(&rfs_async_write_lock);
1322 
1323 		return;
1324 	}
1325 
1326 	/*
1327 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1328 	 * deadlock with ufs.
1329 	 */
1330 	if (nbl_need_check(vp)) {
1331 		nbl_start_crit(vp, RW_READER);
1332 		in_crit = 1;
1333 	}
1334 
1335 	ct.cc_sysid = 0;
1336 	ct.cc_pid = 0;
1337 	ct.cc_caller_id = nfs2_srv_caller_id;
1338 	ct.cc_flags = CC_DONTBLOCK;
1339 
1340 	/*
1341 	 * Lock the file for writing.  This operation provides
1342 	 * the delay which allows clusters to grow.
1343 	 */
1344 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1345 
1346 	/* check if a monitor detected a delegation conflict */
1347 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1348 		if (in_crit)
1349 			nbl_end_crit(vp);
1350 		VN_RELE(vp);
1351 		/* mark as wouldblock so response is dropped */
1352 		curthread->t_flag |= T_WOULDBLOCK;
1353 		mutex_enter(&rfs_async_write_lock);
1354 		if (rfs_async_write_head == nlp)
1355 			rfs_async_write_head = nlp->next;
1356 		else {
1357 			lp = rfs_async_write_head;
1358 			while (lp->next != nlp)
1359 				lp = lp->next;
1360 			lp->next = nlp->next;
1361 		}
1362 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1363 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1364 				rp->ns->ns_status = puterrno(error);
1365 				rp->thread->t_flag |= T_WOULDBLOCK;
1366 			}
1367 		}
1368 		cv_broadcast(&nlp->cv);
1369 		mutex_exit(&rfs_async_write_lock);
1370 
1371 		return;
1372 	}
1373 
1374 	/*
1375 	 * Disconnect this cluster from the list of clusters.
1376 	 * The cluster that is being dealt with must be fixed
1377 	 * in size after this point, so there is no reason
1378 	 * to leave it on the list so that new requests can
1379 	 * find it.
1380 	 *
1381 	 * The algorithm is that the first write request will
1382 	 * create a cluster, convert the file handle to a
1383 	 * vnode pointer, and then lock the file for writing.
1384 	 * This request is not likely to be clustered with
1385 	 * any others.  However, the next request will create
1386 	 * a new cluster and be blocked in VOP_RWLOCK while
1387 	 * the first request is being processed.  This delay
1388 	 * will allow more requests to be clustered in this
1389 	 * second cluster.
1390 	 */
1391 	mutex_enter(&rfs_async_write_lock);
1392 	if (rfs_async_write_head == nlp)
1393 		rfs_async_write_head = nlp->next;
1394 	else {
1395 		lp = rfs_async_write_head;
1396 		while (lp->next != nlp)
1397 			lp = lp->next;
1398 		lp->next = nlp->next;
1399 	}
1400 	mutex_exit(&rfs_async_write_lock);
1401 
1402 	/*
1403 	 * Step through the list of requests in this cluster.
1404 	 * We need to check permissions to make sure that all
1405 	 * of the requests have sufficient permission to write
1406 	 * the file.  A cluster can be composed of requests
1407 	 * from different clients and different users on each
1408 	 * client.
1409 	 *
1410 	 * As a side effect, we also calculate the size of the
1411 	 * byte range that this cluster encompasses.
1412 	 */
1413 	rp = nlp->list;
1414 	off = rp->wa->wa_offset;
1415 	len = (uint_t)0;
1416 	do {
1417 		if (rdonly(exi, rp->req)) {
1418 			rp->ns->ns_status = NFSERR_ROFS;
1419 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1420 			rp->thread->t_flag |= t_flag;
1421 			continue;
1422 		}
1423 
1424 		va.va_mask = AT_UID|AT_MODE;
1425 
1426 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1427 
1428 		if (!error) {
1429 			if (crgetuid(rp->cr) != va.va_uid) {
1430 				/*
1431 				 * This is a kludge to allow writes of files
1432 				 * created with read only permission.  The
1433 				 * owner of the file is always allowed to
1434 				 * write it.
1435 				 */
1436 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1437 			}
1438 			if (!error && MANDLOCK(vp, va.va_mode))
1439 				error = EACCES;
1440 		}
1441 
1442 		/*
1443 		 * Check for a conflict with a nbmand-locked region.
1444 		 */
1445 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1446 		    rp->wa->wa_count, 0, NULL)) {
1447 			error = EACCES;
1448 		}
1449 
1450 		if (error) {
1451 			rp->ns->ns_status = puterrno(error);
1452 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1453 			rp->thread->t_flag |= t_flag;
1454 			continue;
1455 		}
1456 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1457 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1458 	} while ((rp = rp->list) != NULL);
1459 
1460 	/*
1461 	 * Step through the cluster attempting to gather as many
1462 	 * requests which are contiguous as possible.  These
1463 	 * contiguous requests are handled via one call to VOP_WRITE
1464 	 * instead of different calls to VOP_WRITE.  We also keep
1465 	 * track of the fact that any data was written.
1466 	 */
1467 	rp = nlp->list;
1468 	data_written = 0;
1469 	do {
1470 		/*
1471 		 * Skip any requests which are already marked as having an
1472 		 * error.
1473 		 */
1474 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1475 			rp = rp->list;
1476 			continue;
1477 		}
1478 
1479 		/*
1480 		 * Count the number of iovec's which are required
1481 		 * to handle this set of requests.  One iovec is
1482 		 * needed for each data buffer, whether addressed
1483 		 * by wa_data or by the b_rptr pointers in the
1484 		 * mblk chains.
1485 		 */
1486 		iovcnt = 0;
1487 		lrp = rp;
1488 		for (;;) {
1489 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1490 				iovcnt++;
1491 			else {
1492 				m = lrp->wa->wa_mblk;
1493 				while (m != NULL) {
1494 					iovcnt++;
1495 					m = m->b_cont;
1496 				}
1497 			}
1498 			if (lrp->list == NULL ||
1499 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1500 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1501 			    lrp->list->wa->wa_offset) {
1502 				lrp = lrp->list;
1503 				break;
1504 			}
1505 			lrp = lrp->list;
1506 		}
1507 
1508 		if (iovcnt <= MAXCLIOVECS) {
1509 #ifdef DEBUG
1510 			rfs_write_hits++;
1511 #endif
1512 			niovp = iov;
1513 		} else {
1514 #ifdef DEBUG
1515 			rfs_write_misses++;
1516 #endif
1517 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1518 		}
1519 		/*
1520 		 * Put together the scatter/gather iovecs.
1521 		 */
1522 		iovp = niovp;
1523 		trp = rp;
1524 		count = 0;
1525 		do {
1526 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1527 				if (trp->wa->wa_rlist) {
1528 					iovp->iov_base =
1529 					    (char *)((trp->wa->wa_rlist)->
1530 					    u.c_daddr3);
1531 					iovp->iov_len = trp->wa->wa_count;
1532 				} else  {
1533 					iovp->iov_base = trp->wa->wa_data;
1534 					iovp->iov_len = trp->wa->wa_count;
1535 				}
1536 				iovp++;
1537 			} else {
1538 				m = trp->wa->wa_mblk;
1539 				rcount = trp->wa->wa_count;
1540 				while (m != NULL) {
1541 					iovp->iov_base = (caddr_t)m->b_rptr;
1542 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1543 					rcount -= iovp->iov_len;
1544 					if (rcount < 0)
1545 						iovp->iov_len += rcount;
1546 					iovp++;
1547 					if (rcount <= 0)
1548 						break;
1549 					m = m->b_cont;
1550 				}
1551 			}
1552 			count += trp->wa->wa_count;
1553 			trp = trp->list;
1554 		} while (trp != lrp);
1555 
1556 		uio.uio_iov = niovp;
1557 		uio.uio_iovcnt = iovcnt;
1558 		uio.uio_segflg = UIO_SYSSPACE;
1559 		uio.uio_extflg = UIO_COPY_DEFAULT;
1560 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1561 		uio.uio_resid = count;
1562 		/*
1563 		 * The limit is checked on the client. We
1564 		 * should allow any size writes here.
1565 		 */
1566 		uio.uio_llimit = curproc->p_fsz_ctl;
1567 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1568 		if (rlimit < (rlim64_t)uio.uio_resid)
1569 			uio.uio_resid = (uint_t)rlimit;
1570 
1571 		/*
1572 		 * For now we assume no append mode.
1573 		 */
1574 
1575 		/*
1576 		 * We're changing creds because VM may fault
1577 		 * and we need the cred of the current
1578 		 * thread to be used if quota * checking is
1579 		 * enabled.
1580 		 */
1581 		savecred = curthread->t_cred;
1582 		curthread->t_cred = cr;
1583 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1584 		curthread->t_cred = savecred;
1585 
1586 		/* check if a monitor detected a delegation conflict */
1587 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1588 			/* mark as wouldblock so response is dropped */
1589 			curthread->t_flag |= T_WOULDBLOCK;
1590 
1591 		if (niovp != iov)
1592 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1593 
1594 		if (!error) {
1595 			data_written = 1;
1596 			/*
1597 			 * Get attributes again so we send the latest mod
1598 			 * time to the client side for his cache.
1599 			 */
1600 			va.va_mask = AT_ALL;	/* now we want everything */
1601 
1602 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1603 
1604 			if (!error)
1605 				acl_perm(vp, exi, &va, rp->cr);
1606 		}
1607 
1608 		/*
1609 		 * Fill in the status responses for each request
1610 		 * which was just handled.  Also, copy the latest
1611 		 * attributes in to the attribute responses if
1612 		 * appropriate.
1613 		 */
1614 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1615 		do {
1616 			rp->thread->t_flag |= t_flag;
1617 			/* check for overflows */
1618 			if (!error) {
1619 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1620 			}
1621 			rp->ns->ns_status = puterrno(error);
1622 			rp = rp->list;
1623 		} while (rp != lrp);
1624 	} while (rp != NULL);
1625 
1626 	/*
1627 	 * If any data was written at all, then we need to flush
1628 	 * the data and metadata to stable storage.
1629 	 */
1630 	if (data_written) {
1631 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1632 
1633 		if (!error) {
1634 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1635 		}
1636 	}
1637 
1638 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1639 
1640 	if (in_crit)
1641 		nbl_end_crit(vp);
1642 	VN_RELE(vp);
1643 
1644 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1645 	mutex_enter(&rfs_async_write_lock);
1646 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1647 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1648 			rp->ns->ns_status = puterrno(error);
1649 			rp->thread->t_flag |= t_flag;
1650 		}
1651 	}
1652 	cv_broadcast(&nlp->cv);
1653 	mutex_exit(&rfs_async_write_lock);
1654 
1655 }
1656 
1657 void *
1658 rfs_write_getfh(struct nfswriteargs *wa)
1659 {
1660 	return (&wa->wa_fhandle);
1661 }
1662 
1663 /*
1664  * Create a file.
1665  * Creates a file with given attributes and returns those attributes
1666  * and an fhandle for the new file.
1667  */
1668 void
1669 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1670 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1671 {
1672 	int error;
1673 	int lookuperr;
1674 	int in_crit = 0;
1675 	struct vattr va;
1676 	vnode_t *vp;
1677 	vnode_t *realvp;
1678 	vnode_t *dvp;
1679 	char *name = args->ca_da.da_name;
1680 	vnode_t *tvp = NULL;
1681 	int mode;
1682 	int lookup_ok;
1683 	bool_t trunc;
1684 	struct sockaddr *ca;
1685 
1686 	/*
1687 	 * Disallow NULL paths
1688 	 */
1689 	if (name == NULL || *name == '\0') {
1690 		dr->dr_status = NFSERR_ACCES;
1691 		return;
1692 	}
1693 
1694 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1695 	if (dvp == NULL) {
1696 		dr->dr_status = NFSERR_STALE;
1697 		return;
1698 	}
1699 
1700 	error = sattr_to_vattr(args->ca_sa, &va);
1701 	if (error) {
1702 		dr->dr_status = puterrno(error);
1703 		return;
1704 	}
1705 
1706 	/*
1707 	 * Must specify the mode.
1708 	 */
1709 	if (!(va.va_mask & AT_MODE)) {
1710 		VN_RELE(dvp);
1711 		dr->dr_status = NFSERR_INVAL;
1712 		return;
1713 	}
1714 
1715 	/*
1716 	 * This is a completely gross hack to make mknod
1717 	 * work over the wire until we can wack the protocol
1718 	 */
1719 	if ((va.va_mode & IFMT) == IFCHR) {
1720 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1721 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1722 		else {
1723 			va.va_type = VCHR;
1724 			/*
1725 			 * uncompress the received dev_t
1726 			 * if the top half is zero indicating a request
1727 			 * from an `older style' OS.
1728 			 */
1729 			if ((va.va_size & 0xffff0000) == 0)
1730 				va.va_rdev = nfsv2_expdev(va.va_size);
1731 			else
1732 				va.va_rdev = (dev_t)va.va_size;
1733 		}
1734 		va.va_mask &= ~AT_SIZE;
1735 	} else if ((va.va_mode & IFMT) == IFBLK) {
1736 		va.va_type = VBLK;
1737 		/*
1738 		 * uncompress the received dev_t
1739 		 * if the top half is zero indicating a request
1740 		 * from an `older style' OS.
1741 		 */
1742 		if ((va.va_size & 0xffff0000) == 0)
1743 			va.va_rdev = nfsv2_expdev(va.va_size);
1744 		else
1745 			va.va_rdev = (dev_t)va.va_size;
1746 		va.va_mask &= ~AT_SIZE;
1747 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1748 		va.va_type = VSOCK;
1749 	} else {
1750 		va.va_type = VREG;
1751 	}
1752 	va.va_mode &= ~IFMT;
1753 	va.va_mask |= AT_TYPE;
1754 
1755 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1756 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1757 	    MAXPATHLEN);
1758 	if (name == NULL) {
1759 		dr->dr_status = puterrno(EINVAL);
1760 		return;
1761 	}
1762 
1763 	/*
1764 	 * Why was the choice made to use VWRITE as the mode to the
1765 	 * call to VOP_CREATE ? This results in a bug.  When a client
1766 	 * opens a file that already exists and is RDONLY, the second
1767 	 * open fails with an EACESS because of the mode.
1768 	 * bug ID 1054648.
1769 	 */
1770 	lookup_ok = 0;
1771 	mode = VWRITE;
1772 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1773 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1774 		    NULL, NULL, NULL);
1775 		if (!error) {
1776 			struct vattr at;
1777 
1778 			lookup_ok = 1;
1779 			at.va_mask = AT_MODE;
1780 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1781 			if (!error)
1782 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1783 			VN_RELE(tvp);
1784 			tvp = NULL;
1785 		}
1786 	}
1787 
1788 	if (!lookup_ok) {
1789 		if (rdonly(exi, req)) {
1790 			error = EROFS;
1791 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1792 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1793 			error = EPERM;
1794 		} else {
1795 			error = 0;
1796 		}
1797 	}
1798 
1799 	/*
1800 	 * If file size is being modified on an already existing file
1801 	 * make sure that there are no conflicting non-blocking mandatory
1802 	 * locks in the region being manipulated. Return EACCES if there
1803 	 * are conflicting locks.
1804 	 */
1805 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1806 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1807 		    NULL, NULL, NULL);
1808 
1809 		if (!lookuperr &&
1810 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1811 			VN_RELE(tvp);
1812 			curthread->t_flag |= T_WOULDBLOCK;
1813 			goto out;
1814 		}
1815 
1816 		if (!lookuperr && nbl_need_check(tvp)) {
1817 			/*
1818 			 * The file exists. Now check if it has any
1819 			 * conflicting non-blocking mandatory locks
1820 			 * in the region being changed.
1821 			 */
1822 			struct vattr bva;
1823 			u_offset_t offset;
1824 			ssize_t length;
1825 
1826 			nbl_start_crit(tvp, RW_READER);
1827 			in_crit = 1;
1828 
1829 			bva.va_mask = AT_SIZE;
1830 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1831 			if (!error) {
1832 				if (va.va_size < bva.va_size) {
1833 					offset = va.va_size;
1834 					length = bva.va_size - va.va_size;
1835 				} else {
1836 					offset = bva.va_size;
1837 					length = va.va_size - bva.va_size;
1838 				}
1839 				if (length) {
1840 					if (nbl_conflict(tvp, NBL_WRITE,
1841 					    offset, length, 0, NULL)) {
1842 						error = EACCES;
1843 					}
1844 				}
1845 			}
1846 			if (error) {
1847 				nbl_end_crit(tvp);
1848 				VN_RELE(tvp);
1849 				in_crit = 0;
1850 			}
1851 		} else if (tvp != NULL) {
1852 			VN_RELE(tvp);
1853 		}
1854 	}
1855 
1856 	if (!error) {
1857 		/*
1858 		 * If filesystem is shared with nosuid the remove any
1859 		 * setuid/setgid bits on create.
1860 		 */
1861 		if (va.va_type == VREG &&
1862 		    exi->exi_export.ex_flags & EX_NOSUID)
1863 			va.va_mode &= ~(VSUID | VSGID);
1864 
1865 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1866 		    NULL, NULL);
1867 
1868 		if (!error) {
1869 
1870 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1871 				trunc = TRUE;
1872 			else
1873 				trunc = FALSE;
1874 
1875 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1876 				VN_RELE(vp);
1877 				curthread->t_flag |= T_WOULDBLOCK;
1878 				goto out;
1879 			}
1880 			va.va_mask = AT_ALL;
1881 
1882 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1883 
1884 			/* check for overflows */
1885 			if (!error) {
1886 				acl_perm(vp, exi, &va, cr);
1887 				error = vattr_to_nattr(&va, &dr->dr_attr);
1888 				if (!error) {
1889 					error = makefh(&dr->dr_fhandle, vp,
1890 					    exi);
1891 				}
1892 			}
1893 			/*
1894 			 * Force modified metadata out to stable storage.
1895 			 *
1896 			 * if a underlying vp exists, pass it to VOP_FSYNC
1897 			 */
1898 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1899 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1900 			else
1901 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1902 			VN_RELE(vp);
1903 		}
1904 
1905 		if (in_crit) {
1906 			nbl_end_crit(tvp);
1907 			VN_RELE(tvp);
1908 		}
1909 	}
1910 
1911 	/*
1912 	 * Force modified data and metadata out to stable storage.
1913 	 */
1914 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1915 
1916 out:
1917 
1918 	VN_RELE(dvp);
1919 
1920 	dr->dr_status = puterrno(error);
1921 
1922 	if (name != args->ca_da.da_name)
1923 		kmem_free(name, MAXPATHLEN);
1924 }
1925 void *
1926 rfs_create_getfh(struct nfscreatargs *args)
1927 {
1928 	return (args->ca_da.da_fhandle);
1929 }
1930 
1931 /*
1932  * Remove a file.
1933  * Remove named file from parent directory.
1934  */
1935 void
1936 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1937 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1938 {
1939 	int error = 0;
1940 	vnode_t *vp;
1941 	vnode_t *targvp;
1942 	int in_crit = 0;
1943 
1944 	/*
1945 	 * Disallow NULL paths
1946 	 */
1947 	if (da->da_name == NULL || *da->da_name == '\0') {
1948 		*status = NFSERR_ACCES;
1949 		return;
1950 	}
1951 
1952 	vp = nfs_fhtovp(da->da_fhandle, exi);
1953 	if (vp == NULL) {
1954 		*status = NFSERR_STALE;
1955 		return;
1956 	}
1957 
1958 	if (rdonly(exi, req)) {
1959 		VN_RELE(vp);
1960 		*status = NFSERR_ROFS;
1961 		return;
1962 	}
1963 
1964 	/*
1965 	 * Check for a conflict with a non-blocking mandatory share reservation.
1966 	 */
1967 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1968 	    NULL, cr, NULL, NULL, NULL);
1969 	if (error != 0) {
1970 		VN_RELE(vp);
1971 		*status = puterrno(error);
1972 		return;
1973 	}
1974 
1975 	/*
1976 	 * If the file is delegated to an v4 client, then initiate
1977 	 * recall and drop this request (by setting T_WOULDBLOCK).
1978 	 * The client will eventually re-transmit the request and
1979 	 * (hopefully), by then, the v4 client will have returned
1980 	 * the delegation.
1981 	 */
1982 
1983 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1984 		VN_RELE(vp);
1985 		VN_RELE(targvp);
1986 		curthread->t_flag |= T_WOULDBLOCK;
1987 		return;
1988 	}
1989 
1990 	if (nbl_need_check(targvp)) {
1991 		nbl_start_crit(targvp, RW_READER);
1992 		in_crit = 1;
1993 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1994 			error = EACCES;
1995 			goto out;
1996 		}
1997 	}
1998 
1999 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2000 
2001 	/*
2002 	 * Force modified data and metadata out to stable storage.
2003 	 */
2004 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2005 
2006 out:
2007 	if (in_crit)
2008 		nbl_end_crit(targvp);
2009 	VN_RELE(targvp);
2010 	VN_RELE(vp);
2011 
2012 	*status = puterrno(error);
2013 
2014 }
2015 
2016 void *
2017 rfs_remove_getfh(struct nfsdiropargs *da)
2018 {
2019 	return (da->da_fhandle);
2020 }
2021 
2022 /*
2023  * rename a file
2024  * Give a file (from) a new name (to).
2025  */
2026 void
2027 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2028 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2029 {
2030 	int error = 0;
2031 	vnode_t *fromvp;
2032 	vnode_t *tovp;
2033 	struct exportinfo *to_exi;
2034 	fhandle_t *fh;
2035 	vnode_t *srcvp;
2036 	vnode_t *targvp;
2037 	int in_crit = 0;
2038 
2039 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2040 	if (fromvp == NULL) {
2041 		*status = NFSERR_STALE;
2042 		return;
2043 	}
2044 
2045 	fh = args->rna_to.da_fhandle;
2046 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2047 	if (to_exi == NULL) {
2048 		VN_RELE(fromvp);
2049 		*status = NFSERR_ACCES;
2050 		return;
2051 	}
2052 	exi_rele(to_exi);
2053 
2054 	if (to_exi != exi) {
2055 		VN_RELE(fromvp);
2056 		*status = NFSERR_XDEV;
2057 		return;
2058 	}
2059 
2060 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2061 	if (tovp == NULL) {
2062 		VN_RELE(fromvp);
2063 		*status = NFSERR_STALE;
2064 		return;
2065 	}
2066 
2067 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2068 		VN_RELE(tovp);
2069 		VN_RELE(fromvp);
2070 		*status = NFSERR_NOTDIR;
2071 		return;
2072 	}
2073 
2074 	/*
2075 	 * Disallow NULL paths
2076 	 */
2077 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2078 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2079 		VN_RELE(tovp);
2080 		VN_RELE(fromvp);
2081 		*status = NFSERR_ACCES;
2082 		return;
2083 	}
2084 
2085 	if (rdonly(exi, req)) {
2086 		VN_RELE(tovp);
2087 		VN_RELE(fromvp);
2088 		*status = NFSERR_ROFS;
2089 		return;
2090 	}
2091 
2092 	/*
2093 	 * Check for a conflict with a non-blocking mandatory share reservation.
2094 	 */
2095 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2096 	    NULL, cr, NULL, NULL, NULL);
2097 	if (error != 0) {
2098 		VN_RELE(tovp);
2099 		VN_RELE(fromvp);
2100 		*status = puterrno(error);
2101 		return;
2102 	}
2103 
2104 	/* Check for delegations on the source file */
2105 
2106 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2107 		VN_RELE(tovp);
2108 		VN_RELE(fromvp);
2109 		VN_RELE(srcvp);
2110 		curthread->t_flag |= T_WOULDBLOCK;
2111 		return;
2112 	}
2113 
2114 	/* Check for delegation on the file being renamed over, if it exists */
2115 
2116 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2117 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2118 	    NULL, NULL, NULL) == 0) {
2119 
2120 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2121 			VN_RELE(tovp);
2122 			VN_RELE(fromvp);
2123 			VN_RELE(srcvp);
2124 			VN_RELE(targvp);
2125 			curthread->t_flag |= T_WOULDBLOCK;
2126 			return;
2127 		}
2128 		VN_RELE(targvp);
2129 	}
2130 
2131 
2132 	if (nbl_need_check(srcvp)) {
2133 		nbl_start_crit(srcvp, RW_READER);
2134 		in_crit = 1;
2135 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2136 			error = EACCES;
2137 			goto out;
2138 		}
2139 	}
2140 
2141 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2142 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2143 
2144 	if (error == 0)
2145 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2146 		    strlen(args->rna_to.da_name));
2147 
2148 	/*
2149 	 * Force modified data and metadata out to stable storage.
2150 	 */
2151 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2152 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2153 
2154 out:
2155 	if (in_crit)
2156 		nbl_end_crit(srcvp);
2157 	VN_RELE(srcvp);
2158 	VN_RELE(tovp);
2159 	VN_RELE(fromvp);
2160 
2161 	*status = puterrno(error);
2162 
2163 }
2164 void *
2165 rfs_rename_getfh(struct nfsrnmargs *args)
2166 {
2167 	return (args->rna_from.da_fhandle);
2168 }
2169 
2170 /*
2171  * Link to a file.
2172  * Create a file (to) which is a hard link to the given file (from).
2173  */
2174 void
2175 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2176 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2177 {
2178 	int error;
2179 	vnode_t *fromvp;
2180 	vnode_t *tovp;
2181 	struct exportinfo *to_exi;
2182 	fhandle_t *fh;
2183 
2184 	fromvp = nfs_fhtovp(args->la_from, exi);
2185 	if (fromvp == NULL) {
2186 		*status = NFSERR_STALE;
2187 		return;
2188 	}
2189 
2190 	fh = args->la_to.da_fhandle;
2191 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2192 	if (to_exi == NULL) {
2193 		VN_RELE(fromvp);
2194 		*status = NFSERR_ACCES;
2195 		return;
2196 	}
2197 	exi_rele(to_exi);
2198 
2199 	if (to_exi != exi) {
2200 		VN_RELE(fromvp);
2201 		*status = NFSERR_XDEV;
2202 		return;
2203 	}
2204 
2205 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2206 	if (tovp == NULL) {
2207 		VN_RELE(fromvp);
2208 		*status = NFSERR_STALE;
2209 		return;
2210 	}
2211 
2212 	if (tovp->v_type != VDIR) {
2213 		VN_RELE(tovp);
2214 		VN_RELE(fromvp);
2215 		*status = NFSERR_NOTDIR;
2216 		return;
2217 	}
2218 	/*
2219 	 * Disallow NULL paths
2220 	 */
2221 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2222 		VN_RELE(tovp);
2223 		VN_RELE(fromvp);
2224 		*status = NFSERR_ACCES;
2225 		return;
2226 	}
2227 
2228 	if (rdonly(exi, req)) {
2229 		VN_RELE(tovp);
2230 		VN_RELE(fromvp);
2231 		*status = NFSERR_ROFS;
2232 		return;
2233 	}
2234 
2235 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2236 
2237 	/*
2238 	 * Force modified data and metadata out to stable storage.
2239 	 */
2240 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2241 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2242 
2243 	VN_RELE(tovp);
2244 	VN_RELE(fromvp);
2245 
2246 	*status = puterrno(error);
2247 
2248 }
2249 void *
2250 rfs_link_getfh(struct nfslinkargs *args)
2251 {
2252 	return (args->la_from);
2253 }
2254 
2255 /*
2256  * Symbolicly link to a file.
2257  * Create a file (to) with the given attributes which is a symbolic link
2258  * to the given path name (to).
2259  */
2260 void
2261 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2262 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2263 {
2264 	int error;
2265 	struct vattr va;
2266 	vnode_t *vp;
2267 	vnode_t *svp;
2268 	int lerror;
2269 	struct sockaddr *ca;
2270 	char *name = NULL;
2271 
2272 	/*
2273 	 * Disallow NULL paths
2274 	 */
2275 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2276 		*status = NFSERR_ACCES;
2277 		return;
2278 	}
2279 
2280 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2281 	if (vp == NULL) {
2282 		*status = NFSERR_STALE;
2283 		return;
2284 	}
2285 
2286 	if (rdonly(exi, req)) {
2287 		VN_RELE(vp);
2288 		*status = NFSERR_ROFS;
2289 		return;
2290 	}
2291 
2292 	error = sattr_to_vattr(args->sla_sa, &va);
2293 	if (error) {
2294 		VN_RELE(vp);
2295 		*status = puterrno(error);
2296 		return;
2297 	}
2298 
2299 	if (!(va.va_mask & AT_MODE)) {
2300 		VN_RELE(vp);
2301 		*status = NFSERR_INVAL;
2302 		return;
2303 	}
2304 
2305 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2306 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2307 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2308 
2309 	if (name == NULL) {
2310 		*status = NFSERR_ACCES;
2311 		return;
2312 	}
2313 
2314 	va.va_type = VLNK;
2315 	va.va_mask |= AT_TYPE;
2316 
2317 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2318 
2319 	/*
2320 	 * Force new data and metadata out to stable storage.
2321 	 */
2322 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2323 	    NULL, cr, NULL, NULL, NULL);
2324 
2325 	if (!lerror) {
2326 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2327 		VN_RELE(svp);
2328 	}
2329 
2330 	/*
2331 	 * Force modified data and metadata out to stable storage.
2332 	 */
2333 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2334 
2335 	VN_RELE(vp);
2336 
2337 	*status = puterrno(error);
2338 	if (name != args->sla_tnm)
2339 		kmem_free(name, MAXPATHLEN);
2340 
2341 }
2342 void *
2343 rfs_symlink_getfh(struct nfsslargs *args)
2344 {
2345 	return (args->sla_from.da_fhandle);
2346 }
2347 
2348 /*
2349  * Make a directory.
2350  * Create a directory with the given name, parent directory, and attributes.
2351  * Returns a file handle and attributes for the new directory.
2352  */
2353 void
2354 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2355 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2356 {
2357 	int error;
2358 	struct vattr va;
2359 	vnode_t *dvp = NULL;
2360 	vnode_t *vp;
2361 	char *name = args->ca_da.da_name;
2362 
2363 	/*
2364 	 * Disallow NULL paths
2365 	 */
2366 	if (name == NULL || *name == '\0') {
2367 		dr->dr_status = NFSERR_ACCES;
2368 		return;
2369 	}
2370 
2371 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2372 	if (vp == NULL) {
2373 		dr->dr_status = NFSERR_STALE;
2374 		return;
2375 	}
2376 
2377 	if (rdonly(exi, req)) {
2378 		VN_RELE(vp);
2379 		dr->dr_status = NFSERR_ROFS;
2380 		return;
2381 	}
2382 
2383 	error = sattr_to_vattr(args->ca_sa, &va);
2384 	if (error) {
2385 		VN_RELE(vp);
2386 		dr->dr_status = puterrno(error);
2387 		return;
2388 	}
2389 
2390 	if (!(va.va_mask & AT_MODE)) {
2391 		VN_RELE(vp);
2392 		dr->dr_status = NFSERR_INVAL;
2393 		return;
2394 	}
2395 
2396 	va.va_type = VDIR;
2397 	va.va_mask |= AT_TYPE;
2398 
2399 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2400 
2401 	if (!error) {
2402 		/*
2403 		 * Attribtutes of the newly created directory should
2404 		 * be returned to the client.
2405 		 */
2406 		va.va_mask = AT_ALL; /* We want everything */
2407 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2408 
2409 		/* check for overflows */
2410 		if (!error) {
2411 			acl_perm(vp, exi, &va, cr);
2412 			error = vattr_to_nattr(&va, &dr->dr_attr);
2413 			if (!error) {
2414 				error = makefh(&dr->dr_fhandle, dvp, exi);
2415 			}
2416 		}
2417 		/*
2418 		 * Force new data and metadata out to stable storage.
2419 		 */
2420 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2421 		VN_RELE(dvp);
2422 	}
2423 
2424 	/*
2425 	 * Force modified data and metadata out to stable storage.
2426 	 */
2427 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2428 
2429 	VN_RELE(vp);
2430 
2431 	dr->dr_status = puterrno(error);
2432 
2433 }
2434 void *
2435 rfs_mkdir_getfh(struct nfscreatargs *args)
2436 {
2437 	return (args->ca_da.da_fhandle);
2438 }
2439 
2440 /*
2441  * Remove a directory.
2442  * Remove the given directory name from the given parent directory.
2443  */
2444 void
2445 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2446 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2447 {
2448 	int error;
2449 	vnode_t *vp;
2450 
2451 
2452 	/*
2453 	 * Disallow NULL paths
2454 	 */
2455 	if (da->da_name == NULL || *da->da_name == '\0') {
2456 		*status = NFSERR_ACCES;
2457 		return;
2458 	}
2459 
2460 	vp = nfs_fhtovp(da->da_fhandle, exi);
2461 	if (vp == NULL) {
2462 		*status = NFSERR_STALE;
2463 		return;
2464 	}
2465 
2466 	if (rdonly(exi, req)) {
2467 		VN_RELE(vp);
2468 		*status = NFSERR_ROFS;
2469 		return;
2470 	}
2471 
2472 	/*
2473 	 * VOP_RMDIR now takes a new third argument (the current
2474 	 * directory of the process).  That's because someone
2475 	 * wants to return EINVAL if one tries to remove ".".
2476 	 * Of course, NFS servers have no idea what their
2477 	 * clients' current directories are.  We fake it by
2478 	 * supplying a vnode known to exist and illegal to
2479 	 * remove.
2480 	 */
2481 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2482 
2483 	/*
2484 	 * Force modified data and metadata out to stable storage.
2485 	 */
2486 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2487 
2488 	VN_RELE(vp);
2489 
2490 	/*
2491 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2492 	 * if the directory is not empty.  A System V NFS server
2493 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2494 	 * over the wire.
2495 	 */
2496 	if (error == EEXIST)
2497 		*status = NFSERR_NOTEMPTY;
2498 	else
2499 		*status = puterrno(error);
2500 
2501 }
2502 void *
2503 rfs_rmdir_getfh(struct nfsdiropargs *da)
2504 {
2505 	return (da->da_fhandle);
2506 }
2507 
2508 /* ARGSUSED */
2509 void
2510 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2511 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2512 {
2513 	int error;
2514 	int iseof;
2515 	struct iovec iov;
2516 	struct uio uio;
2517 	vnode_t *vp;
2518 	char *ndata = NULL;
2519 	struct sockaddr *ca;
2520 	size_t nents;
2521 	int ret;
2522 
2523 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2524 	if (vp == NULL) {
2525 		rd->rd_entries = NULL;
2526 		rd->rd_status = NFSERR_STALE;
2527 		return;
2528 	}
2529 
2530 	if (vp->v_type != VDIR) {
2531 		VN_RELE(vp);
2532 		rd->rd_entries = NULL;
2533 		rd->rd_status = NFSERR_NOTDIR;
2534 		return;
2535 	}
2536 
2537 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2538 
2539 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2540 
2541 	if (error) {
2542 		rd->rd_entries = NULL;
2543 		goto bad;
2544 	}
2545 
2546 	if (rda->rda_count == 0) {
2547 		rd->rd_entries = NULL;
2548 		rd->rd_size = 0;
2549 		rd->rd_eof = FALSE;
2550 		goto bad;
2551 	}
2552 
2553 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2554 
2555 	/*
2556 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2557 	 */
2558 	rd->rd_bufsize = (uint_t)rda->rda_count;
2559 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2560 
2561 	/*
2562 	 * Set up io vector to read directory data
2563 	 */
2564 	iov.iov_base = (caddr_t)rd->rd_entries;
2565 	iov.iov_len = rda->rda_count;
2566 	uio.uio_iov = &iov;
2567 	uio.uio_iovcnt = 1;
2568 	uio.uio_segflg = UIO_SYSSPACE;
2569 	uio.uio_extflg = UIO_COPY_CACHED;
2570 	uio.uio_loffset = (offset_t)rda->rda_offset;
2571 	uio.uio_resid = rda->rda_count;
2572 
2573 	/*
2574 	 * read directory
2575 	 */
2576 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2577 
2578 	/*
2579 	 * Clean up
2580 	 */
2581 	if (!error) {
2582 		/*
2583 		 * set size and eof
2584 		 */
2585 		if (uio.uio_resid == rda->rda_count) {
2586 			rd->rd_size = 0;
2587 			rd->rd_eof = TRUE;
2588 		} else {
2589 			rd->rd_size = (uint32_t)(rda->rda_count -
2590 			    uio.uio_resid);
2591 			rd->rd_eof = iseof ? TRUE : FALSE;
2592 		}
2593 	}
2594 
2595 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2596 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2597 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2598 	    rda->rda_count, &ndata);
2599 
2600 	if (ret != 0) {
2601 		size_t dropbytes;
2602 		/*
2603 		 * We had to drop one or more entries in order to fit
2604 		 * during the character conversion.  We need to patch
2605 		 * up the size and eof info.
2606 		 */
2607 		if (rd->rd_eof)
2608 			rd->rd_eof = FALSE;
2609 		dropbytes = nfscmd_dropped_entrysize(
2610 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2611 		rd->rd_size -= dropbytes;
2612 	}
2613 	if (ndata == NULL) {
2614 		ndata = (char *)rd->rd_entries;
2615 	} else if (ndata != (char *)rd->rd_entries) {
2616 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2617 		rd->rd_entries = (void *)ndata;
2618 		rd->rd_bufsize = rda->rda_count;
2619 	}
2620 
2621 bad:
2622 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2623 
2624 #if 0 /* notyet */
2625 	/*
2626 	 * Don't do this.  It causes local disk writes when just
2627 	 * reading the file and the overhead is deemed larger
2628 	 * than the benefit.
2629 	 */
2630 	/*
2631 	 * Force modified metadata out to stable storage.
2632 	 */
2633 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2634 #endif
2635 
2636 	VN_RELE(vp);
2637 
2638 	rd->rd_status = puterrno(error);
2639 
2640 }
2641 void *
2642 rfs_readdir_getfh(struct nfsrddirargs *rda)
2643 {
2644 	return (&rda->rda_fh);
2645 }
2646 void
2647 rfs_rddirfree(struct nfsrddirres *rd)
2648 {
2649 	if (rd->rd_entries != NULL)
2650 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2651 }
2652 
2653 /* ARGSUSED */
2654 void
2655 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2656 	struct svc_req *req, cred_t *cr)
2657 {
2658 	int error;
2659 	struct statvfs64 sb;
2660 	vnode_t *vp;
2661 
2662 	vp = nfs_fhtovp(fh, exi);
2663 	if (vp == NULL) {
2664 		fs->fs_status = NFSERR_STALE;
2665 		return;
2666 	}
2667 
2668 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2669 
2670 	if (!error) {
2671 		fs->fs_tsize = nfstsize();
2672 		fs->fs_bsize = sb.f_frsize;
2673 		fs->fs_blocks = sb.f_blocks;
2674 		fs->fs_bfree = sb.f_bfree;
2675 		fs->fs_bavail = sb.f_bavail;
2676 	}
2677 
2678 	VN_RELE(vp);
2679 
2680 	fs->fs_status = puterrno(error);
2681 
2682 }
2683 void *
2684 rfs_statfs_getfh(fhandle_t *fh)
2685 {
2686 	return (fh);
2687 }
2688 
2689 static int
2690 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2691 {
2692 	vap->va_mask = 0;
2693 
2694 	/*
2695 	 * There was a sign extension bug in some VFS based systems
2696 	 * which stored the mode as a short.  When it would get
2697 	 * assigned to a u_long, no sign extension would occur.
2698 	 * It needed to, but this wasn't noticed because sa_mode
2699 	 * would then get assigned back to the short, thus ignoring
2700 	 * the upper 16 bits of sa_mode.
2701 	 *
2702 	 * To make this implementation work for both broken
2703 	 * clients and good clients, we check for both versions
2704 	 * of the mode.
2705 	 */
2706 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2707 	    sa->sa_mode != (uint32_t)-1) {
2708 		vap->va_mask |= AT_MODE;
2709 		vap->va_mode = sa->sa_mode;
2710 	}
2711 	if (sa->sa_uid != (uint32_t)-1) {
2712 		vap->va_mask |= AT_UID;
2713 		vap->va_uid = sa->sa_uid;
2714 	}
2715 	if (sa->sa_gid != (uint32_t)-1) {
2716 		vap->va_mask |= AT_GID;
2717 		vap->va_gid = sa->sa_gid;
2718 	}
2719 	if (sa->sa_size != (uint32_t)-1) {
2720 		vap->va_mask |= AT_SIZE;
2721 		vap->va_size = sa->sa_size;
2722 	}
2723 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2724 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2725 #ifndef _LP64
2726 		/* return error if time overflow */
2727 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2728 			return (EOVERFLOW);
2729 #endif
2730 		vap->va_mask |= AT_ATIME;
2731 		/*
2732 		 * nfs protocol defines times as unsigned so don't extend sign,
2733 		 * unless sysadmin set nfs_allow_preepoch_time.
2734 		 */
2735 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2736 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2737 	}
2738 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2739 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2740 #ifndef _LP64
2741 		/* return error if time overflow */
2742 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2743 			return (EOVERFLOW);
2744 #endif
2745 		vap->va_mask |= AT_MTIME;
2746 		/*
2747 		 * nfs protocol defines times as unsigned so don't extend sign,
2748 		 * unless sysadmin set nfs_allow_preepoch_time.
2749 		 */
2750 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2751 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2752 	}
2753 	return (0);
2754 }
2755 
2756 static enum nfsftype vt_to_nf[] = {
2757 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2758 };
2759 
2760 /*
2761  * check the following fields for overflow: nodeid, size, and time.
2762  * There could be a problem when converting 64-bit LP64 fields
2763  * into 32-bit ones.  Return an error if there is an overflow.
2764  */
2765 int
2766 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2767 {
2768 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2769 	na->na_type = vt_to_nf[vap->va_type];
2770 
2771 	if (vap->va_mode == (unsigned short) -1)
2772 		na->na_mode = (uint32_t)-1;
2773 	else
2774 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2775 
2776 	if (vap->va_uid == (unsigned short)(-1))
2777 		na->na_uid = (uint32_t)(-1);
2778 	else if (vap->va_uid == UID_NOBODY)
2779 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2780 	else
2781 		na->na_uid = vap->va_uid;
2782 
2783 	if (vap->va_gid == (unsigned short)(-1))
2784 		na->na_gid = (uint32_t)-1;
2785 	else if (vap->va_gid == GID_NOBODY)
2786 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2787 	else
2788 		na->na_gid = vap->va_gid;
2789 
2790 	/*
2791 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2792 	 * vattr, but are bigger than 32 bit values supported?
2793 	 */
2794 	na->na_fsid = vap->va_fsid;
2795 
2796 	na->na_nodeid = vap->va_nodeid;
2797 
2798 	/*
2799 	 * Check to make sure that the nodeid is representable over the
2800 	 * wire without losing bits.
2801 	 */
2802 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2803 		return (EFBIG);
2804 	na->na_nlink = vap->va_nlink;
2805 
2806 	/*
2807 	 * Check for big files here, instead of at the caller.  See
2808 	 * comments in cstat for large special file explanation.
2809 	 */
2810 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2811 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2812 			return (EFBIG);
2813 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2814 			/* UNKNOWN_SIZE | OVERFLOW */
2815 			na->na_size = MAXOFF32_T;
2816 		} else
2817 			na->na_size = vap->va_size;
2818 	} else
2819 		na->na_size = vap->va_size;
2820 
2821 	/*
2822 	 * If the vnode times overflow the 32-bit times that NFS2
2823 	 * uses on the wire then return an error.
2824 	 */
2825 	if (!NFS_VAP_TIME_OK(vap)) {
2826 		return (EOVERFLOW);
2827 	}
2828 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2829 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2830 
2831 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2832 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2833 
2834 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2835 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2836 
2837 	/*
2838 	 * If the dev_t will fit into 16 bits then compress
2839 	 * it, otherwise leave it alone. See comments in
2840 	 * nfs_client.c.
2841 	 */
2842 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2843 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2844 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2845 	else
2846 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2847 
2848 	na->na_blocks = vap->va_nblocks;
2849 	na->na_blocksize = vap->va_blksize;
2850 
2851 	/*
2852 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2853 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2854 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2855 	 *
2856 	 * BUYER BEWARE:
2857 	 *  If you are porting the NFS to a non-Sun server, you probably
2858 	 *  don't want to include the following block of code.  The
2859 	 *  over-the-wire special file types will be changing with the
2860 	 *  NFS Protocol Revision.
2861 	 */
2862 	if (vap->va_type == VFIFO)
2863 		NA_SETFIFO(na);
2864 	return (0);
2865 }
2866 
2867 /*
2868  * acl v2 support: returns approximate permission.
2869  *	default: returns minimal permission (more restrictive)
2870  *	aclok: returns maximal permission (less restrictive)
2871  *	This routine changes the permissions that are alaredy in *va.
2872  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2873  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2874  */
2875 static void
2876 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2877 {
2878 	vsecattr_t	vsa;
2879 	int		aclcnt;
2880 	aclent_t	*aclentp;
2881 	mode_t		mask_perm;
2882 	mode_t		grp_perm;
2883 	mode_t		other_perm;
2884 	mode_t		other_orig;
2885 	int		error;
2886 
2887 	/* dont care default acl */
2888 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2889 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2890 
2891 	if (!error) {
2892 		aclcnt = vsa.vsa_aclcnt;
2893 		if (aclcnt > MIN_ACL_ENTRIES) {
2894 			/* non-trivial ACL */
2895 			aclentp = vsa.vsa_aclentp;
2896 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2897 				/* maximal permissions */
2898 				grp_perm = 0;
2899 				other_perm = 0;
2900 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2901 					switch (aclentp->a_type) {
2902 					case USER_OBJ:
2903 						break;
2904 					case USER:
2905 						grp_perm |=
2906 						    aclentp->a_perm << 3;
2907 						other_perm |= aclentp->a_perm;
2908 						break;
2909 					case GROUP_OBJ:
2910 						grp_perm |=
2911 						    aclentp->a_perm << 3;
2912 						break;
2913 					case GROUP:
2914 						other_perm |= aclentp->a_perm;
2915 						break;
2916 					case OTHER_OBJ:
2917 						other_orig = aclentp->a_perm;
2918 						break;
2919 					case CLASS_OBJ:
2920 						mask_perm = aclentp->a_perm;
2921 						break;
2922 					default:
2923 						break;
2924 					}
2925 				}
2926 				grp_perm &= mask_perm << 3;
2927 				other_perm &= mask_perm;
2928 				other_perm |= other_orig;
2929 
2930 			} else {
2931 				/* minimal permissions */
2932 				grp_perm = 070;
2933 				other_perm = 07;
2934 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2935 					switch (aclentp->a_type) {
2936 					case USER_OBJ:
2937 						break;
2938 					case USER:
2939 					case CLASS_OBJ:
2940 						grp_perm &=
2941 						    aclentp->a_perm << 3;
2942 						other_perm &=
2943 						    aclentp->a_perm;
2944 						break;
2945 					case GROUP_OBJ:
2946 						grp_perm &=
2947 						    aclentp->a_perm << 3;
2948 						break;
2949 					case GROUP:
2950 						other_perm &=
2951 						    aclentp->a_perm;
2952 						break;
2953 					case OTHER_OBJ:
2954 						other_perm &=
2955 						    aclentp->a_perm;
2956 						break;
2957 					default:
2958 						break;
2959 					}
2960 				}
2961 			}
2962 			/* copy to va */
2963 			va->va_mode &= ~077;
2964 			va->va_mode |= grp_perm | other_perm;
2965 		}
2966 		if (vsa.vsa_aclcnt)
2967 			kmem_free(vsa.vsa_aclentp,
2968 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2969 	}
2970 }
2971 
2972 void
2973 rfs_srvrinit(void)
2974 {
2975 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2976 	nfs2_srv_caller_id = fs_new_caller_id();
2977 }
2978 
2979 void
2980 rfs_srvrfini(void)
2981 {
2982 	mutex_destroy(&rfs_async_write_lock);
2983 }
2984 
2985 static int
2986 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2987 {
2988 	struct clist	*wcl;
2989 	int		wlist_len;
2990 	uint32_t	count = rr->rr_count;
2991 
2992 	wcl = ra->ra_wlist;
2993 
2994 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2995 		return (FALSE);
2996 	}
2997 
2998 	wcl = ra->ra_wlist;
2999 	rr->rr_ok.rrok_wlist_len = wlist_len;
3000 	rr->rr_ok.rrok_wlist = wcl;
3001 
3002 	return (TRUE);
3003 }
3004