xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs_srv.c (revision f3312ec0e8acbd249df97358fb8c3ca92f4e089c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
/*
 * Forward declarations of file-local helpers used by the routines below.
 *
 * sattr_to_vattr() is used by rfs_setattr() to turn the settable
 * attributes from the request into a struct vattr; a nonzero return is
 * treated as an errno value.
 *
 * acl_perm() is called on freshly fetched attributes before they are
 * converted to over-the-wire form.
 * NOTE(review): presumably it adjusts the reported mode bits to reflect
 * the file's ACL -- confirm against its definition.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id that the NFSv2 server routines place in the cc_caller_id
 * field of the caller_context_t they pass down to the VOP calls.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99     struct svc_req *req, cred_t *cr, bool_t ro)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		/* Lie about the object type for a referral */
121 		if (vn_is_nfs_reparse(vp, cr))
122 			va.va_type = VLNK;
123 
124 		acl_perm(vp, exi, &va, cr);
125 		error = vattr_to_nattr(&va, &ns->ns_attr);
126 	}
127 
128 	VN_RELE(vp);
129 
130 	ns->ns_status = puterrno(error);
131 }
132 void *
133 rfs_getattr_getfh(fhandle_t *fhp)
134 {
135 	return (fhp);
136 }
137 
138 /*
139  * Set file attributes.
140  * Sets the attributes of the file with the given fhandle.  Returns
141  * the new attributes.
142  */
143 /* ARGSUSED */
144 void
145 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
146     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
147 {
148 	int error;
149 	int flag;
150 	int in_crit = 0;
151 	vnode_t *vp;
152 	struct vattr va;
153 	struct vattr bva;
154 	struct flock64 bf;
155 	caller_context_t ct;
156 
157 
158 	vp = nfs_fhtovp(&args->saa_fh, exi);
159 	if (vp == NULL) {
160 		ns->ns_status = NFSERR_STALE;
161 		return;
162 	}
163 
164 	if (rdonly(ro, vp)) {
165 		VN_RELE(vp);
166 		ns->ns_status = NFSERR_ROFS;
167 		return;
168 	}
169 
170 	error = sattr_to_vattr(&args->saa_sa, &va);
171 	if (error) {
172 		VN_RELE(vp);
173 		ns->ns_status = puterrno(error);
174 		return;
175 	}
176 
177 	/*
178 	 * If the client is requesting a change to the mtime,
179 	 * but the nanosecond field is set to 1 billion, then
180 	 * this is a flag to the server that it should set the
181 	 * atime and mtime fields to the server's current time.
182 	 * The 1 billion number actually came from the client
183 	 * as 1 million, but the units in the over the wire
184 	 * request are microseconds instead of nanoseconds.
185 	 *
186 	 * This is an overload of the protocol and should be
187 	 * documented in the NFS Version 2 protocol specification.
188 	 */
189 	if (va.va_mask & AT_MTIME) {
190 		if (va.va_mtime.tv_nsec == 1000000000) {
191 			gethrestime(&va.va_mtime);
192 			va.va_atime = va.va_mtime;
193 			va.va_mask |= AT_ATIME;
194 			flag = 0;
195 		} else
196 			flag = ATTR_UTIME;
197 	} else
198 		flag = 0;
199 
200 	/*
201 	 * If the filesystem is exported with nosuid, then mask off
202 	 * the setuid and setgid bits.
203 	 */
204 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
205 	    (exi->exi_export.ex_flags & EX_NOSUID))
206 		va.va_mode &= ~(VSUID | VSGID);
207 
208 	ct.cc_sysid = 0;
209 	ct.cc_pid = 0;
210 	ct.cc_caller_id = nfs2_srv_caller_id;
211 	ct.cc_flags = CC_DONTBLOCK;
212 
213 	/*
214 	 * We need to specially handle size changes because it is
215 	 * possible for the client to create a file with modes
216 	 * which indicate read-only, but with the file opened for
217 	 * writing.  If the client then tries to set the size of
218 	 * the file, then the normal access checking done in
219 	 * VOP_SETATTR would prevent the client from doing so,
220 	 * although it should be legal for it to do so.  To get
221 	 * around this, we do the access checking for ourselves
222 	 * and then use VOP_SPACE which doesn't do the access
223 	 * checking which VOP_SETATTR does. VOP_SPACE can only
224 	 * operate on VREG files, let VOP_SETATTR handle the other
225 	 * extremely rare cases.
226 	 * Also the client should not be allowed to change the
227 	 * size of the file if there is a conflicting non-blocking
228 	 * mandatory lock in the region of change.
229 	 */
230 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
231 		if (nbl_need_check(vp)) {
232 			nbl_start_crit(vp, RW_READER);
233 			in_crit = 1;
234 		}
235 
236 		bva.va_mask = AT_UID | AT_SIZE;
237 
238 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
239 
240 		if (error) {
241 			if (in_crit)
242 				nbl_end_crit(vp);
243 			VN_RELE(vp);
244 			ns->ns_status = puterrno(error);
245 			return;
246 		}
247 
248 		if (in_crit) {
249 			u_offset_t offset;
250 			ssize_t length;
251 
252 			if (va.va_size < bva.va_size) {
253 				offset = va.va_size;
254 				length = bva.va_size - va.va_size;
255 			} else {
256 				offset = bva.va_size;
257 				length = va.va_size - bva.va_size;
258 			}
259 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
260 			    NULL)) {
261 				error = EACCES;
262 			}
263 		}
264 
265 		if (crgetuid(cr) == bva.va_uid && !error &&
266 		    va.va_size != bva.va_size) {
267 			va.va_mask &= ~AT_SIZE;
268 			bf.l_type = F_WRLCK;
269 			bf.l_whence = 0;
270 			bf.l_start = (off64_t)va.va_size;
271 			bf.l_len = 0;
272 			bf.l_sysid = 0;
273 			bf.l_pid = 0;
274 
275 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
276 			    (offset_t)va.va_size, cr, &ct);
277 		}
278 		if (in_crit)
279 			nbl_end_crit(vp);
280 	} else
281 		error = 0;
282 
283 	/*
284 	 * Do the setattr.
285 	 */
286 	if (!error && va.va_mask) {
287 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
288 	}
289 
290 	/*
291 	 * check if the monitor on either vop_space or vop_setattr detected
292 	 * a delegation conflict and if so, mark the thread flag as
293 	 * wouldblock so that the response is dropped and the client will
294 	 * try again.
295 	 */
296 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
297 		VN_RELE(vp);
298 		curthread->t_flag |= T_WOULDBLOCK;
299 		return;
300 	}
301 
302 	if (!error) {
303 		va.va_mask = AT_ALL;	/* get everything */
304 
305 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
306 
307 		/* check for overflows */
308 		if (!error) {
309 			acl_perm(vp, exi, &va, cr);
310 			error = vattr_to_nattr(&va, &ns->ns_attr);
311 		}
312 	}
313 
314 	ct.cc_flags = 0;
315 
316 	/*
317 	 * Force modified metadata out to stable storage.
318 	 */
319 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
320 
321 	VN_RELE(vp);
322 
323 	ns->ns_status = puterrno(error);
324 }
325 void *
326 rfs_setattr_getfh(struct nfssaargs *args)
327 {
328 	return (&args->saa_fh);
329 }
330 
331 /*
332  * Directory lookup.
333  * Returns an fhandle and file attributes for file name in a directory.
334  */
335 /* ARGSUSED */
336 void
337 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
338     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
339 {
340 	int error;
341 	vnode_t *dvp;
342 	vnode_t *vp;
343 	struct vattr va;
344 	fhandle_t *fhp = da->da_fhandle;
345 	struct sec_ol sec = {0, 0};
346 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
347 	char *name;
348 	struct sockaddr *ca;
349 
350 	/*
351 	 * Trusted Extension doesn't support NFSv2. MOUNT
352 	 * will reject v2 clients. Need to prevent v2 client
353 	 * access via WebNFS here.
354 	 */
355 	if (is_system_labeled() && req->rq_vers == 2) {
356 		dr->dr_status = NFSERR_ACCES;
357 		return;
358 	}
359 
360 	/*
361 	 * Disallow NULL paths
362 	 */
363 	if (da->da_name == NULL || *da->da_name == '\0') {
364 		dr->dr_status = NFSERR_ACCES;
365 		return;
366 	}
367 
368 	/*
369 	 * Allow lookups from the root - the default
370 	 * location of the public filehandle.
371 	 */
372 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
373 		dvp = rootdir;
374 		VN_HOLD(dvp);
375 	} else {
376 		dvp = nfs_fhtovp(fhp, exi);
377 		if (dvp == NULL) {
378 			dr->dr_status = NFSERR_STALE;
379 			return;
380 		}
381 	}
382 
383 	/*
384 	 * Not allow lookup beyond root.
385 	 * If the filehandle matches a filehandle of the exi,
386 	 * then the ".." refers beyond the root of an exported filesystem.
387 	 */
388 	if (strcmp(da->da_name, "..") == 0 &&
389 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
390 		VN_RELE(dvp);
391 		dr->dr_status = NFSERR_NOENT;
392 		return;
393 	}
394 
395 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
396 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
397 	    MAXPATHLEN);
398 
399 	if (name == NULL) {
400 		dr->dr_status = NFSERR_ACCES;
401 		return;
402 	}
403 
404 	/*
405 	 * If the public filehandle is used then allow
406 	 * a multi-component lookup, i.e. evaluate
407 	 * a pathname and follow symbolic links if
408 	 * necessary.
409 	 *
410 	 * This may result in a vnode in another filesystem
411 	 * which is OK as long as the filesystem is exported.
412 	 */
413 	if (PUBLIC_FH2(fhp)) {
414 		publicfh_flag = TRUE;
415 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
416 		    &sec);
417 	} else {
418 		/*
419 		 * Do a normal single component lookup.
420 		 */
421 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
422 		    NULL, NULL, NULL);
423 	}
424 
425 	if (name != da->da_name)
426 		kmem_free(name, MAXPATHLEN);
427 
428 
429 	if (!error) {
430 		va.va_mask = AT_ALL;	/* we want everything */
431 
432 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
433 
434 		/* check for overflows */
435 		if (!error) {
436 			acl_perm(vp, exi, &va, cr);
437 			error = vattr_to_nattr(&va, &dr->dr_attr);
438 			if (!error) {
439 				if (sec.sec_flags & SEC_QUERY)
440 					error = makefh_ol(&dr->dr_fhandle, exi,
441 					    sec.sec_index);
442 				else {
443 					error = makefh(&dr->dr_fhandle, vp,
444 					    exi);
445 					if (!error && publicfh_flag &&
446 					    !chk_clnt_sec(exi, req))
447 						auth_weak = TRUE;
448 				}
449 			}
450 		}
451 		VN_RELE(vp);
452 	}
453 
454 	VN_RELE(dvp);
455 
456 	/*
457 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
458 	 * and have obtained a new exportinfo in exi which needs to be
459 	 * released. Note the the original exportinfo pointed to by exi
460 	 * will be released by the caller, comon_dispatch.
461 	 */
462 	if (publicfh_flag && exi != NULL)
463 		exi_rele(exi);
464 
465 	/*
466 	 * If it's public fh, no 0x81, and client's flavor is
467 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
468 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
469 	 */
470 	if (auth_weak)
471 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
472 	else
473 		dr->dr_status = puterrno(error);
474 }
475 void *
476 rfs_lookup_getfh(struct nfsdiropargs *da)
477 {
478 	return (da->da_fhandle);
479 }
480 
481 /*
482  * Read symbolic link.
483  * Returns the string in the symbolic link at the given fhandle.
484  */
485 /* ARGSUSED */
486 void
487 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
488     struct svc_req *req, cred_t *cr, bool_t ro)
489 {
490 	int error;
491 	struct iovec iov;
492 	struct uio uio;
493 	vnode_t *vp;
494 	struct vattr va;
495 	struct sockaddr *ca;
496 	char *name = NULL;
497 	int is_referral = 0;
498 
499 	vp = nfs_fhtovp(fhp, exi);
500 	if (vp == NULL) {
501 		rl->rl_data = NULL;
502 		rl->rl_status = NFSERR_STALE;
503 		return;
504 	}
505 
506 	va.va_mask = AT_MODE;
507 
508 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
509 
510 	if (error) {
511 		VN_RELE(vp);
512 		rl->rl_data = NULL;
513 		rl->rl_status = puterrno(error);
514 		return;
515 	}
516 
517 	if (MANDLOCK(vp, va.va_mode)) {
518 		VN_RELE(vp);
519 		rl->rl_data = NULL;
520 		rl->rl_status = NFSERR_ACCES;
521 		return;
522 	}
523 
524 	/* We lied about the object type for a referral */
525 	if (vn_is_nfs_reparse(vp, cr))
526 		is_referral = 1;
527 
528 	/*
529 	 * XNFS and RFC1094 require us to return ENXIO if argument
530 	 * is not a link. BUGID 1138002.
531 	 */
532 	if (vp->v_type != VLNK && !is_referral) {
533 		VN_RELE(vp);
534 		rl->rl_data = NULL;
535 		rl->rl_status = NFSERR_NXIO;
536 		return;
537 	}
538 
539 	/*
540 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
541 	 */
542 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
543 
544 	if (is_referral) {
545 		char *s;
546 		size_t strsz;
547 
548 		/* Get an artificial symlink based on a referral */
549 		s = build_symlink(vp, cr, &strsz);
550 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
551 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
552 		    vnode_t *, vp, char *, s);
553 		if (s == NULL)
554 			error = EINVAL;
555 		else {
556 			error = 0;
557 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
558 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
559 			kmem_free(s, strsz);
560 		}
561 
562 	} else {
563 
564 		/*
565 		 * Set up io vector to read sym link data
566 		 */
567 		iov.iov_base = rl->rl_data;
568 		iov.iov_len = NFS_MAXPATHLEN;
569 		uio.uio_iov = &iov;
570 		uio.uio_iovcnt = 1;
571 		uio.uio_segflg = UIO_SYSSPACE;
572 		uio.uio_extflg = UIO_COPY_CACHED;
573 		uio.uio_loffset = (offset_t)0;
574 		uio.uio_resid = NFS_MAXPATHLEN;
575 
576 		/*
577 		 * Do the readlink.
578 		 */
579 		error = VOP_READLINK(vp, &uio, cr, NULL);
580 
581 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
582 
583 		if (!error)
584 			rl->rl_data[rl->rl_count] = '\0';
585 
586 	}
587 
588 
589 	VN_RELE(vp);
590 
591 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
592 	name = nfscmd_convname(ca, exi, rl->rl_data,
593 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
594 
595 	if (name != NULL && name != rl->rl_data) {
596 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
597 		rl->rl_data = name;
598 	}
599 
600 	/*
601 	 * XNFS and RFC1094 require us to return ENXIO if argument
602 	 * is not a link. UFS returns EINVAL if this is the case,
603 	 * so we do the mapping here. BUGID 1138002.
604 	 */
605 	if (error == EINVAL)
606 		rl->rl_status = NFSERR_NXIO;
607 	else
608 		rl->rl_status = puterrno(error);
609 
610 }
611 void *
612 rfs_readlink_getfh(fhandle_t *fhp)
613 {
614 	return (fhp);
615 }
616 /*
617  * Free data allocated by rfs_readlink
618  */
619 void
620 rfs_rlfree(struct nfsrdlnres *rl)
621 {
622 	if (rl->rl_data != NULL)
623 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
624 }
625 
626 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
627 
628 /*
629  * Read data.
630  * Returns some data read from the file at the given fhandle.
631  */
632 /* ARGSUSED */
633 void
634 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
635     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
636 {
637 	vnode_t *vp;
638 	int error;
639 	struct vattr va;
640 	struct iovec iov;
641 	struct uio uio;
642 	mblk_t *mp;
643 	int alloc_err = 0;
644 	int in_crit = 0;
645 	caller_context_t ct;
646 
647 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
648 	if (vp == NULL) {
649 		rr->rr_data = NULL;
650 		rr->rr_status = NFSERR_STALE;
651 		return;
652 	}
653 
654 	if (vp->v_type != VREG) {
655 		VN_RELE(vp);
656 		rr->rr_data = NULL;
657 		rr->rr_status = NFSERR_ISDIR;
658 		return;
659 	}
660 
661 	ct.cc_sysid = 0;
662 	ct.cc_pid = 0;
663 	ct.cc_caller_id = nfs2_srv_caller_id;
664 	ct.cc_flags = CC_DONTBLOCK;
665 
666 	/*
667 	 * Enter the critical region before calling VOP_RWLOCK
668 	 * to avoid a deadlock with write requests.
669 	 */
670 	if (nbl_need_check(vp)) {
671 		nbl_start_crit(vp, RW_READER);
672 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
673 		    0, NULL)) {
674 			nbl_end_crit(vp);
675 			VN_RELE(vp);
676 			rr->rr_data = NULL;
677 			rr->rr_status = NFSERR_ACCES;
678 			return;
679 		}
680 		in_crit = 1;
681 	}
682 
683 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
684 
685 	/* check if a monitor detected a delegation conflict */
686 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
687 		VN_RELE(vp);
688 		/* mark as wouldblock so response is dropped */
689 		curthread->t_flag |= T_WOULDBLOCK;
690 
691 		rr->rr_data = NULL;
692 		return;
693 	}
694 
695 	va.va_mask = AT_ALL;
696 
697 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
698 
699 	if (error) {
700 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
701 		if (in_crit)
702 			nbl_end_crit(vp);
703 
704 		VN_RELE(vp);
705 		rr->rr_data = NULL;
706 		rr->rr_status = puterrno(error);
707 
708 		return;
709 	}
710 
711 	/*
712 	 * This is a kludge to allow reading of files created
713 	 * with no read permission.  The owner of the file
714 	 * is always allowed to read it.
715 	 */
716 	if (crgetuid(cr) != va.va_uid) {
717 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
718 
719 		if (error) {
720 			/*
721 			 * Exec is the same as read over the net because
722 			 * of demand loading.
723 			 */
724 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
725 		}
726 		if (error) {
727 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
728 			if (in_crit)
729 				nbl_end_crit(vp);
730 			VN_RELE(vp);
731 			rr->rr_data = NULL;
732 			rr->rr_status = puterrno(error);
733 
734 			return;
735 		}
736 	}
737 
738 	if (MANDLOCK(vp, va.va_mode)) {
739 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
740 		if (in_crit)
741 			nbl_end_crit(vp);
742 
743 		VN_RELE(vp);
744 		rr->rr_data = NULL;
745 		rr->rr_status = NFSERR_ACCES;
746 
747 		return;
748 	}
749 
750 	rr->rr_ok.rrok_wlist_len = 0;
751 	rr->rr_ok.rrok_wlist = NULL;
752 
753 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
754 		rr->rr_count = 0;
755 		rr->rr_data = NULL;
756 		/*
757 		 * In this case, status is NFS_OK, but there is no data
758 		 * to encode. So set rr_mp to NULL.
759 		 */
760 		rr->rr_mp = NULL;
761 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
762 		if (rr->rr_ok.rrok_wlist)
763 			clist_zero_len(rr->rr_ok.rrok_wlist);
764 		goto done;
765 	}
766 
767 	if (ra->ra_wlist) {
768 		mp = NULL;
769 		rr->rr_mp = NULL;
770 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
771 		if (ra->ra_count > iov.iov_len) {
772 			rr->rr_data = NULL;
773 			rr->rr_status = NFSERR_INVAL;
774 			goto done;
775 		}
776 	} else {
777 		/*
778 		 * mp will contain the data to be sent out in the read reply.
779 		 * This will be freed after the reply has been sent out (by the
780 		 * driver).
781 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
782 		 * that the call to xdrmblk_putmblk() never fails.
783 		 */
784 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
785 		    &alloc_err);
786 		ASSERT(mp != NULL);
787 		ASSERT(alloc_err == 0);
788 
789 		rr->rr_mp = mp;
790 
791 		/*
792 		 * Set up io vector
793 		 */
794 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
795 		iov.iov_len = ra->ra_count;
796 	}
797 
798 	uio.uio_iov = &iov;
799 	uio.uio_iovcnt = 1;
800 	uio.uio_segflg = UIO_SYSSPACE;
801 	uio.uio_extflg = UIO_COPY_CACHED;
802 	uio.uio_loffset = (offset_t)ra->ra_offset;
803 	uio.uio_resid = ra->ra_count;
804 
805 	error = VOP_READ(vp, &uio, 0, cr, &ct);
806 
807 	if (error) {
808 		if (mp)
809 			freeb(mp);
810 
811 		/*
812 		 * check if a monitor detected a delegation conflict and
813 		 * mark as wouldblock so response is dropped
814 		 */
815 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
816 			curthread->t_flag |= T_WOULDBLOCK;
817 		else
818 			rr->rr_status = puterrno(error);
819 
820 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
821 		if (in_crit)
822 			nbl_end_crit(vp);
823 
824 		VN_RELE(vp);
825 		rr->rr_data = NULL;
826 
827 		return;
828 	}
829 
830 	/*
831 	 * Get attributes again so we can send the latest access
832 	 * time to the client side for his cache.
833 	 */
834 	va.va_mask = AT_ALL;
835 
836 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
837 
838 	if (error) {
839 		if (mp)
840 			freeb(mp);
841 
842 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
843 		if (in_crit)
844 			nbl_end_crit(vp);
845 
846 		VN_RELE(vp);
847 		rr->rr_data = NULL;
848 		rr->rr_status = puterrno(error);
849 
850 		return;
851 	}
852 
853 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
854 
855 	if (mp) {
856 		rr->rr_data = (char *)mp->b_datap->db_base;
857 	} else {
858 		if (ra->ra_wlist) {
859 			rr->rr_data = (caddr_t)iov.iov_base;
860 			if (!rdma_setup_read_data2(ra, rr)) {
861 				rr->rr_data = NULL;
862 				rr->rr_status = puterrno(NFSERR_INVAL);
863 			}
864 		}
865 	}
866 done:
867 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
868 	if (in_crit)
869 		nbl_end_crit(vp);
870 
871 	acl_perm(vp, exi, &va, cr);
872 
873 	/* check for overflows */
874 	error = vattr_to_nattr(&va, &rr->rr_attr);
875 
876 	VN_RELE(vp);
877 
878 	rr->rr_status = puterrno(error);
879 }
880 
881 /*
882  * Free data allocated by rfs_read
883  */
884 void
885 rfs_rdfree(struct nfsrdresult *rr)
886 {
887 	mblk_t *mp;
888 
889 	if (rr->rr_status == NFS_OK) {
890 		mp = rr->rr_mp;
891 		if (mp != NULL)
892 			freeb(mp);
893 	}
894 }
895 
896 void *
897 rfs_read_getfh(struct nfsreadargs *ra)
898 {
899 	return (&ra->ra_fhandle);
900 }
901 
902 #define	MAX_IOVECS	12
903 
904 #ifdef DEBUG
905 static int rfs_write_sync_hits = 0;
906 static int rfs_write_sync_misses = 0;
907 #endif
908 
909 /*
910  * Write data to file.
911  * Returns attributes of a file after writing some data to it.
912  *
913  * Any changes made here, especially in error handling might have
914  * to also be done in rfs_write (which clusters write requests).
915  */
916 /* ARGSUSED */
917 void
918 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
919     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
920 {
921 	int error;
922 	vnode_t *vp;
923 	rlim64_t rlimit;
924 	struct vattr va;
925 	struct uio uio;
926 	struct iovec iov[MAX_IOVECS];
927 	mblk_t *m;
928 	struct iovec *iovp;
929 	int iovcnt;
930 	cred_t *savecred;
931 	int in_crit = 0;
932 	caller_context_t ct;
933 
934 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
935 	if (vp == NULL) {
936 		ns->ns_status = NFSERR_STALE;
937 		return;
938 	}
939 
940 	if (rdonly(ro, vp)) {
941 		VN_RELE(vp);
942 		ns->ns_status = NFSERR_ROFS;
943 		return;
944 	}
945 
946 	if (vp->v_type != VREG) {
947 		VN_RELE(vp);
948 		ns->ns_status = NFSERR_ISDIR;
949 		return;
950 	}
951 
952 	ct.cc_sysid = 0;
953 	ct.cc_pid = 0;
954 	ct.cc_caller_id = nfs2_srv_caller_id;
955 	ct.cc_flags = CC_DONTBLOCK;
956 
957 	va.va_mask = AT_UID|AT_MODE;
958 
959 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
960 
961 	if (error) {
962 		VN_RELE(vp);
963 		ns->ns_status = puterrno(error);
964 
965 		return;
966 	}
967 
968 	if (crgetuid(cr) != va.va_uid) {
969 		/*
970 		 * This is a kludge to allow writes of files created
971 		 * with read only permission.  The owner of the file
972 		 * is always allowed to write it.
973 		 */
974 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
975 
976 		if (error) {
977 			VN_RELE(vp);
978 			ns->ns_status = puterrno(error);
979 			return;
980 		}
981 	}
982 
983 	/*
984 	 * Can't access a mandatory lock file.  This might cause
985 	 * the NFS service thread to block forever waiting for a
986 	 * lock to be released that will never be released.
987 	 */
988 	if (MANDLOCK(vp, va.va_mode)) {
989 		VN_RELE(vp);
990 		ns->ns_status = NFSERR_ACCES;
991 		return;
992 	}
993 
994 	/*
995 	 * We have to enter the critical region before calling VOP_RWLOCK
996 	 * to avoid a deadlock with ufs.
997 	 */
998 	if (nbl_need_check(vp)) {
999 		nbl_start_crit(vp, RW_READER);
1000 		in_crit = 1;
1001 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1002 		    wa->wa_count, 0, NULL)) {
1003 			error = EACCES;
1004 			goto out;
1005 		}
1006 	}
1007 
1008 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1009 
1010 	/* check if a monitor detected a delegation conflict */
1011 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1012 		VN_RELE(vp);
1013 		/* mark as wouldblock so response is dropped */
1014 		curthread->t_flag |= T_WOULDBLOCK;
1015 		return;
1016 	}
1017 
1018 	if (wa->wa_data || wa->wa_rlist) {
1019 		/* Do the RDMA thing if necessary */
1020 		if (wa->wa_rlist) {
1021 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1022 			iov[0].iov_len = wa->wa_count;
1023 		} else  {
1024 			iov[0].iov_base = wa->wa_data;
1025 			iov[0].iov_len = wa->wa_count;
1026 		}
1027 		uio.uio_iov = iov;
1028 		uio.uio_iovcnt = 1;
1029 		uio.uio_segflg = UIO_SYSSPACE;
1030 		uio.uio_extflg = UIO_COPY_DEFAULT;
1031 		uio.uio_loffset = (offset_t)wa->wa_offset;
1032 		uio.uio_resid = wa->wa_count;
1033 		/*
1034 		 * The limit is checked on the client. We
1035 		 * should allow any size writes here.
1036 		 */
1037 		uio.uio_llimit = curproc->p_fsz_ctl;
1038 		rlimit = uio.uio_llimit - wa->wa_offset;
1039 		if (rlimit < (rlim64_t)uio.uio_resid)
1040 			uio.uio_resid = (uint_t)rlimit;
1041 
1042 		/*
1043 		 * for now we assume no append mode
1044 		 */
1045 		/*
1046 		 * We're changing creds because VM may fault and we need
1047 		 * the cred of the current thread to be used if quota
1048 		 * checking is enabled.
1049 		 */
1050 		savecred = curthread->t_cred;
1051 		curthread->t_cred = cr;
1052 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1053 		curthread->t_cred = savecred;
1054 	} else {
1055 		iovcnt = 0;
1056 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1057 			iovcnt++;
1058 		if (iovcnt <= MAX_IOVECS) {
1059 #ifdef DEBUG
1060 			rfs_write_sync_hits++;
1061 #endif
1062 			iovp = iov;
1063 		} else {
1064 #ifdef DEBUG
1065 			rfs_write_sync_misses++;
1066 #endif
1067 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1068 		}
1069 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1070 		uio.uio_iov = iovp;
1071 		uio.uio_iovcnt = iovcnt;
1072 		uio.uio_segflg = UIO_SYSSPACE;
1073 		uio.uio_extflg = UIO_COPY_DEFAULT;
1074 		uio.uio_loffset = (offset_t)wa->wa_offset;
1075 		uio.uio_resid = wa->wa_count;
1076 		/*
1077 		 * The limit is checked on the client. We
1078 		 * should allow any size writes here.
1079 		 */
1080 		uio.uio_llimit = curproc->p_fsz_ctl;
1081 		rlimit = uio.uio_llimit - wa->wa_offset;
1082 		if (rlimit < (rlim64_t)uio.uio_resid)
1083 			uio.uio_resid = (uint_t)rlimit;
1084 
1085 		/*
1086 		 * For now we assume no append mode.
1087 		 */
1088 		/*
1089 		 * We're changing creds because VM may fault and we need
1090 		 * the cred of the current thread to be used if quota
1091 		 * checking is enabled.
1092 		 */
1093 		savecred = curthread->t_cred;
1094 		curthread->t_cred = cr;
1095 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1096 		curthread->t_cred = savecred;
1097 
1098 		if (iovp != iov)
1099 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1100 	}
1101 
1102 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1103 
1104 	if (!error) {
1105 		/*
1106 		 * Get attributes again so we send the latest mod
1107 		 * time to the client side for his cache.
1108 		 */
1109 		va.va_mask = AT_ALL;	/* now we want everything */
1110 
1111 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1112 
1113 		/* check for overflows */
1114 		if (!error) {
1115 			acl_perm(vp, exi, &va, cr);
1116 			error = vattr_to_nattr(&va, &ns->ns_attr);
1117 		}
1118 	}
1119 
1120 out:
1121 	if (in_crit)
1122 		nbl_end_crit(vp);
1123 	VN_RELE(vp);
1124 
1125 	/* check if a monitor detected a delegation conflict */
1126 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1127 		/* mark as wouldblock so response is dropped */
1128 		curthread->t_flag |= T_WOULDBLOCK;
1129 	else
1130 		ns->ns_status = puterrno(error);
1131 
1132 }
1133 
/*
 * One write request queued on a write cluster.  rfs_write() fills one
 * of these in from its own arguments for every request it handles.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* write arguments of the request */
	struct nfsattrstat *ns;		/* where the request's result goes */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the caller */
	bool_t ro;			/* ro argument given to rfs_write */
	kthread_t *thread;		/* thread that queued the request */
	struct rfs_async_write *list;	/* next request, by ascending offset */
};
1143 
/*
 * A write cluster: the requests queued for a single file handle.
 * Clusters are chained together starting at rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle the cluster is for */
	kcondvar_t cv;			/* queued requests wait here for status */
	struct rfs_async_write *list;	/* requests, by ascending offset */
	struct rfs_async_write_list *next;	/* next cluster */
};
1150 
/* First write cluster in progress; rfs_write() appends new ones at the tail. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
/* Held by rfs_write() while it searches and modifies the cluster lists. */
static kmutex_t rfs_async_write_lock;
/* When zero, rfs_write() hands every request straight to rfs_write_sync(). */
static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1154 
/* Size of the on-stack iovec array declared by rfs_write(). */
#define	MAXCLIOVECS	42
/*
 * Status value rfs_write() stores in a request's result before the
 * request has been processed; 0 cannot be used because it means OK.
 * The expansion is parenthesized so that it stays a single operand
 * wherever the macro is used.
 */
#define	RFSWRITE_INITVAL ((enum nfsstat)-1)
1157 
#ifdef DEBUG
/*
 * Counters kept only in DEBUG builds.
 * NOTE(review): presumably the rfs_write() counterparts of
 * rfs_write_sync_hits/rfs_write_sync_misses, i.e. whether a cluster
 * fit in the on-stack iovec array -- confirm against rfs_write().
 */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1162 
1163 /*
1164  * Write data to file.
1165  * Returns attributes of a file after writing some data to it.
1166  */
1167 void
1168 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1169     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1170 {
1171 	int error;
1172 	vnode_t *vp;
1173 	rlim64_t rlimit;
1174 	struct vattr va;
1175 	struct uio uio;
1176 	struct rfs_async_write_list *lp;
1177 	struct rfs_async_write_list *nlp;
1178 	struct rfs_async_write *rp;
1179 	struct rfs_async_write *nrp;
1180 	struct rfs_async_write *trp;
1181 	struct rfs_async_write *lrp;
1182 	int data_written;
1183 	int iovcnt;
1184 	mblk_t *m;
1185 	struct iovec *iovp;
1186 	struct iovec *niovp;
1187 	struct iovec iov[MAXCLIOVECS];
1188 	int count;
1189 	int rcount;
1190 	uint_t off;
1191 	uint_t len;
1192 	struct rfs_async_write nrpsp;
1193 	struct rfs_async_write_list nlpsp;
1194 	ushort_t t_flag;
1195 	cred_t *savecred;
1196 	int in_crit = 0;
1197 	caller_context_t ct;
1198 
1199 	if (!rfs_write_async) {
1200 		rfs_write_sync(wa, ns, exi, req, cr, ro);
1201 		return;
1202 	}
1203 
1204 	/*
1205 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1206 	 * is considered an OK.
1207 	 */
1208 	ns->ns_status = RFSWRITE_INITVAL;
1209 
1210 	nrp = &nrpsp;
1211 	nrp->wa = wa;
1212 	nrp->ns = ns;
1213 	nrp->req = req;
1214 	nrp->cr = cr;
1215 	nrp->ro = ro;
1216 	nrp->thread = curthread;
1217 
1218 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1219 
1220 	/*
1221 	 * Look to see if there is already a cluster started
1222 	 * for this file.
1223 	 */
1224 	mutex_enter(&rfs_async_write_lock);
1225 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1226 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1227 		    sizeof (fhandle_t)) == 0)
1228 			break;
1229 	}
1230 
1231 	/*
1232 	 * If lp is non-NULL, then there is already a cluster
1233 	 * started.  We need to place ourselves in the cluster
1234 	 * list in the right place as determined by starting
1235 	 * offset.  Conflicts with non-blocking mandatory locked
1236 	 * regions will be checked when the cluster is processed.
1237 	 */
1238 	if (lp != NULL) {
1239 		rp = lp->list;
1240 		trp = NULL;
1241 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1242 			trp = rp;
1243 			rp = rp->list;
1244 		}
1245 		nrp->list = rp;
1246 		if (trp == NULL)
1247 			lp->list = nrp;
1248 		else
1249 			trp->list = nrp;
1250 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1251 			cv_wait(&lp->cv, &rfs_async_write_lock);
1252 		mutex_exit(&rfs_async_write_lock);
1253 
1254 		return;
1255 	}
1256 
1257 	/*
1258 	 * No cluster started yet, start one and add ourselves
1259 	 * to the list of clusters.
1260 	 */
1261 	nrp->list = NULL;
1262 
1263 	nlp = &nlpsp;
1264 	nlp->fhp = &wa->wa_fhandle;
1265 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1266 	nlp->list = nrp;
1267 	nlp->next = NULL;
1268 
1269 	if (rfs_async_write_head == NULL) {
1270 		rfs_async_write_head = nlp;
1271 	} else {
1272 		lp = rfs_async_write_head;
1273 		while (lp->next != NULL)
1274 			lp = lp->next;
1275 		lp->next = nlp;
1276 	}
1277 	mutex_exit(&rfs_async_write_lock);
1278 
1279 	/*
1280 	 * Convert the file handle common to all of the requests
1281 	 * in this cluster to a vnode.
1282 	 */
1283 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1284 	if (vp == NULL) {
1285 		mutex_enter(&rfs_async_write_lock);
1286 		if (rfs_async_write_head == nlp)
1287 			rfs_async_write_head = nlp->next;
1288 		else {
1289 			lp = rfs_async_write_head;
1290 			while (lp->next != nlp)
1291 				lp = lp->next;
1292 			lp->next = nlp->next;
1293 		}
1294 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1295 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1296 			rp->ns->ns_status = NFSERR_STALE;
1297 			rp->thread->t_flag |= t_flag;
1298 		}
1299 		cv_broadcast(&nlp->cv);
1300 		mutex_exit(&rfs_async_write_lock);
1301 
1302 		return;
1303 	}
1304 
1305 	/*
1306 	 * Can only write regular files.  Attempts to write any
1307 	 * other file types fail with EISDIR.
1308 	 */
1309 	if (vp->v_type != VREG) {
1310 		VN_RELE(vp);
1311 		mutex_enter(&rfs_async_write_lock);
1312 		if (rfs_async_write_head == nlp)
1313 			rfs_async_write_head = nlp->next;
1314 		else {
1315 			lp = rfs_async_write_head;
1316 			while (lp->next != nlp)
1317 				lp = lp->next;
1318 			lp->next = nlp->next;
1319 		}
1320 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1321 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1322 			rp->ns->ns_status = NFSERR_ISDIR;
1323 			rp->thread->t_flag |= t_flag;
1324 		}
1325 		cv_broadcast(&nlp->cv);
1326 		mutex_exit(&rfs_async_write_lock);
1327 
1328 		return;
1329 	}
1330 
1331 	/*
1332 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1333 	 * deadlock with ufs.
1334 	 */
1335 	if (nbl_need_check(vp)) {
1336 		nbl_start_crit(vp, RW_READER);
1337 		in_crit = 1;
1338 	}
1339 
1340 	ct.cc_sysid = 0;
1341 	ct.cc_pid = 0;
1342 	ct.cc_caller_id = nfs2_srv_caller_id;
1343 	ct.cc_flags = CC_DONTBLOCK;
1344 
1345 	/*
1346 	 * Lock the file for writing.  This operation provides
1347 	 * the delay which allows clusters to grow.
1348 	 */
1349 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1350 
1351 	/* check if a monitor detected a delegation conflict */
1352 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1353 		if (in_crit)
1354 			nbl_end_crit(vp);
1355 		VN_RELE(vp);
1356 		/* mark as wouldblock so response is dropped */
1357 		curthread->t_flag |= T_WOULDBLOCK;
1358 		mutex_enter(&rfs_async_write_lock);
1359 		if (rfs_async_write_head == nlp)
1360 			rfs_async_write_head = nlp->next;
1361 		else {
1362 			lp = rfs_async_write_head;
1363 			while (lp->next != nlp)
1364 				lp = lp->next;
1365 			lp->next = nlp->next;
1366 		}
1367 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1368 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1369 				rp->ns->ns_status = puterrno(error);
1370 				rp->thread->t_flag |= T_WOULDBLOCK;
1371 			}
1372 		}
1373 		cv_broadcast(&nlp->cv);
1374 		mutex_exit(&rfs_async_write_lock);
1375 
1376 		return;
1377 	}
1378 
1379 	/*
1380 	 * Disconnect this cluster from the list of clusters.
1381 	 * The cluster that is being dealt with must be fixed
1382 	 * in size after this point, so there is no reason
1383 	 * to leave it on the list so that new requests can
1384 	 * find it.
1385 	 *
1386 	 * The algorithm is that the first write request will
1387 	 * create a cluster, convert the file handle to a
1388 	 * vnode pointer, and then lock the file for writing.
1389 	 * This request is not likely to be clustered with
1390 	 * any others.  However, the next request will create
1391 	 * a new cluster and be blocked in VOP_RWLOCK while
1392 	 * the first request is being processed.  This delay
1393 	 * will allow more requests to be clustered in this
1394 	 * second cluster.
1395 	 */
1396 	mutex_enter(&rfs_async_write_lock);
1397 	if (rfs_async_write_head == nlp)
1398 		rfs_async_write_head = nlp->next;
1399 	else {
1400 		lp = rfs_async_write_head;
1401 		while (lp->next != nlp)
1402 			lp = lp->next;
1403 		lp->next = nlp->next;
1404 	}
1405 	mutex_exit(&rfs_async_write_lock);
1406 
1407 	/*
1408 	 * Step through the list of requests in this cluster.
1409 	 * We need to check permissions to make sure that all
1410 	 * of the requests have sufficient permission to write
1411 	 * the file.  A cluster can be composed of requests
1412 	 * from different clients and different users on each
1413 	 * client.
1414 	 *
1415 	 * As a side effect, we also calculate the size of the
1416 	 * byte range that this cluster encompasses.
1417 	 */
1418 	rp = nlp->list;
1419 	off = rp->wa->wa_offset;
1420 	len = (uint_t)0;
1421 	do {
1422 		if (rdonly(rp->ro, vp)) {
1423 			rp->ns->ns_status = NFSERR_ROFS;
1424 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1425 			rp->thread->t_flag |= t_flag;
1426 			continue;
1427 		}
1428 
1429 		va.va_mask = AT_UID|AT_MODE;
1430 
1431 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1432 
1433 		if (!error) {
1434 			if (crgetuid(rp->cr) != va.va_uid) {
1435 				/*
1436 				 * This is a kludge to allow writes of files
1437 				 * created with read only permission.  The
1438 				 * owner of the file is always allowed to
1439 				 * write it.
1440 				 */
1441 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1442 			}
1443 			if (!error && MANDLOCK(vp, va.va_mode))
1444 				error = EACCES;
1445 		}
1446 
1447 		/*
1448 		 * Check for a conflict with a nbmand-locked region.
1449 		 */
1450 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1451 		    rp->wa->wa_count, 0, NULL)) {
1452 			error = EACCES;
1453 		}
1454 
1455 		if (error) {
1456 			rp->ns->ns_status = puterrno(error);
1457 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1458 			rp->thread->t_flag |= t_flag;
1459 			continue;
1460 		}
1461 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1462 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1463 	} while ((rp = rp->list) != NULL);
1464 
1465 	/*
1466 	 * Step through the cluster attempting to gather as many
1467 	 * requests which are contiguous as possible.  These
1468 	 * contiguous requests are handled via one call to VOP_WRITE
1469 	 * instead of different calls to VOP_WRITE.  We also keep
1470 	 * track of the fact that any data was written.
1471 	 */
1472 	rp = nlp->list;
1473 	data_written = 0;
1474 	do {
1475 		/*
1476 		 * Skip any requests which are already marked as having an
1477 		 * error.
1478 		 */
1479 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1480 			rp = rp->list;
1481 			continue;
1482 		}
1483 
1484 		/*
1485 		 * Count the number of iovec's which are required
1486 		 * to handle this set of requests.  One iovec is
1487 		 * needed for each data buffer, whether addressed
1488 		 * by wa_data or by the b_rptr pointers in the
1489 		 * mblk chains.
1490 		 */
1491 		iovcnt = 0;
1492 		lrp = rp;
1493 		for (;;) {
1494 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1495 				iovcnt++;
1496 			else {
1497 				m = lrp->wa->wa_mblk;
1498 				while (m != NULL) {
1499 					iovcnt++;
1500 					m = m->b_cont;
1501 				}
1502 			}
1503 			if (lrp->list == NULL ||
1504 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1505 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1506 			    lrp->list->wa->wa_offset) {
1507 				lrp = lrp->list;
1508 				break;
1509 			}
1510 			lrp = lrp->list;
1511 		}
1512 
1513 		if (iovcnt <= MAXCLIOVECS) {
1514 #ifdef DEBUG
1515 			rfs_write_hits++;
1516 #endif
1517 			niovp = iov;
1518 		} else {
1519 #ifdef DEBUG
1520 			rfs_write_misses++;
1521 #endif
1522 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1523 		}
1524 		/*
1525 		 * Put together the scatter/gather iovecs.
1526 		 */
1527 		iovp = niovp;
1528 		trp = rp;
1529 		count = 0;
1530 		do {
1531 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1532 				if (trp->wa->wa_rlist) {
1533 					iovp->iov_base =
1534 					    (char *)((trp->wa->wa_rlist)->
1535 					    u.c_daddr3);
1536 					iovp->iov_len = trp->wa->wa_count;
1537 				} else  {
1538 					iovp->iov_base = trp->wa->wa_data;
1539 					iovp->iov_len = trp->wa->wa_count;
1540 				}
1541 				iovp++;
1542 			} else {
1543 				m = trp->wa->wa_mblk;
1544 				rcount = trp->wa->wa_count;
1545 				while (m != NULL) {
1546 					iovp->iov_base = (caddr_t)m->b_rptr;
1547 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1548 					rcount -= iovp->iov_len;
1549 					if (rcount < 0)
1550 						iovp->iov_len += rcount;
1551 					iovp++;
1552 					if (rcount <= 0)
1553 						break;
1554 					m = m->b_cont;
1555 				}
1556 			}
1557 			count += trp->wa->wa_count;
1558 			trp = trp->list;
1559 		} while (trp != lrp);
1560 
1561 		uio.uio_iov = niovp;
1562 		uio.uio_iovcnt = iovcnt;
1563 		uio.uio_segflg = UIO_SYSSPACE;
1564 		uio.uio_extflg = UIO_COPY_DEFAULT;
1565 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1566 		uio.uio_resid = count;
1567 		/*
1568 		 * The limit is checked on the client. We
1569 		 * should allow any size writes here.
1570 		 */
1571 		uio.uio_llimit = curproc->p_fsz_ctl;
1572 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1573 		if (rlimit < (rlim64_t)uio.uio_resid)
1574 			uio.uio_resid = (uint_t)rlimit;
1575 
1576 		/*
1577 		 * For now we assume no append mode.
1578 		 */
1579 
1580 		/*
1581 		 * We're changing creds because VM may fault
1582 		 * and we need the cred of the current
1583 		 * thread to be used if quota * checking is
1584 		 * enabled.
1585 		 */
1586 		savecred = curthread->t_cred;
1587 		curthread->t_cred = cr;
1588 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1589 		curthread->t_cred = savecred;
1590 
1591 		/* check if a monitor detected a delegation conflict */
1592 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1593 			/* mark as wouldblock so response is dropped */
1594 			curthread->t_flag |= T_WOULDBLOCK;
1595 
1596 		if (niovp != iov)
1597 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1598 
1599 		if (!error) {
1600 			data_written = 1;
1601 			/*
1602 			 * Get attributes again so we send the latest mod
1603 			 * time to the client side for his cache.
1604 			 */
1605 			va.va_mask = AT_ALL;	/* now we want everything */
1606 
1607 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1608 
1609 			if (!error)
1610 				acl_perm(vp, exi, &va, rp->cr);
1611 		}
1612 
1613 		/*
1614 		 * Fill in the status responses for each request
1615 		 * which was just handled.  Also, copy the latest
1616 		 * attributes in to the attribute responses if
1617 		 * appropriate.
1618 		 */
1619 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1620 		do {
1621 			rp->thread->t_flag |= t_flag;
1622 			/* check for overflows */
1623 			if (!error) {
1624 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1625 			}
1626 			rp->ns->ns_status = puterrno(error);
1627 			rp = rp->list;
1628 		} while (rp != lrp);
1629 	} while (rp != NULL);
1630 
1631 	/*
1632 	 * If any data was written at all, then we need to flush
1633 	 * the data and metadata to stable storage.
1634 	 */
1635 	if (data_written) {
1636 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1637 
1638 		if (!error) {
1639 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1640 		}
1641 	}
1642 
1643 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1644 
1645 	if (in_crit)
1646 		nbl_end_crit(vp);
1647 	VN_RELE(vp);
1648 
1649 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1650 	mutex_enter(&rfs_async_write_lock);
1651 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1652 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1653 			rp->ns->ns_status = puterrno(error);
1654 			rp->thread->t_flag |= t_flag;
1655 		}
1656 	}
1657 	cv_broadcast(&nlp->cv);
1658 	mutex_exit(&rfs_async_write_lock);
1659 
1660 }
1661 
1662 void *
1663 rfs_write_getfh(struct nfswriteargs *wa)
1664 {
1665 	return (&wa->wa_fhandle);
1666 }
1667 
1668 /*
1669  * Create a file.
1670  * Creates a file with given attributes and returns those attributes
1671  * and an fhandle for the new file.
1672  */
1673 void
1674 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1675     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1676 {
1677 	int error;
1678 	int lookuperr;
1679 	int in_crit = 0;
1680 	struct vattr va;
1681 	vnode_t *vp;
1682 	vnode_t *realvp;
1683 	vnode_t *dvp;
1684 	char *name = args->ca_da.da_name;
1685 	vnode_t *tvp = NULL;
1686 	int mode;
1687 	int lookup_ok;
1688 	bool_t trunc;
1689 	struct sockaddr *ca;
1690 
1691 	/*
1692 	 * Disallow NULL paths
1693 	 */
1694 	if (name == NULL || *name == '\0') {
1695 		dr->dr_status = NFSERR_ACCES;
1696 		return;
1697 	}
1698 
1699 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1700 	if (dvp == NULL) {
1701 		dr->dr_status = NFSERR_STALE;
1702 		return;
1703 	}
1704 
1705 	error = sattr_to_vattr(args->ca_sa, &va);
1706 	if (error) {
1707 		dr->dr_status = puterrno(error);
1708 		return;
1709 	}
1710 
1711 	/*
1712 	 * Must specify the mode.
1713 	 */
1714 	if (!(va.va_mask & AT_MODE)) {
1715 		VN_RELE(dvp);
1716 		dr->dr_status = NFSERR_INVAL;
1717 		return;
1718 	}
1719 
1720 	/*
1721 	 * This is a completely gross hack to make mknod
1722 	 * work over the wire until we can wack the protocol
1723 	 */
1724 	if ((va.va_mode & IFMT) == IFCHR) {
1725 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1726 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1727 		else {
1728 			va.va_type = VCHR;
1729 			/*
1730 			 * uncompress the received dev_t
1731 			 * if the top half is zero indicating a request
1732 			 * from an `older style' OS.
1733 			 */
1734 			if ((va.va_size & 0xffff0000) == 0)
1735 				va.va_rdev = nfsv2_expdev(va.va_size);
1736 			else
1737 				va.va_rdev = (dev_t)va.va_size;
1738 		}
1739 		va.va_mask &= ~AT_SIZE;
1740 	} else if ((va.va_mode & IFMT) == IFBLK) {
1741 		va.va_type = VBLK;
1742 		/*
1743 		 * uncompress the received dev_t
1744 		 * if the top half is zero indicating a request
1745 		 * from an `older style' OS.
1746 		 */
1747 		if ((va.va_size & 0xffff0000) == 0)
1748 			va.va_rdev = nfsv2_expdev(va.va_size);
1749 		else
1750 			va.va_rdev = (dev_t)va.va_size;
1751 		va.va_mask &= ~AT_SIZE;
1752 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1753 		va.va_type = VSOCK;
1754 	} else {
1755 		va.va_type = VREG;
1756 	}
1757 	va.va_mode &= ~IFMT;
1758 	va.va_mask |= AT_TYPE;
1759 
1760 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1761 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1762 	    MAXPATHLEN);
1763 	if (name == NULL) {
1764 		dr->dr_status = puterrno(EINVAL);
1765 		return;
1766 	}
1767 
1768 	/*
1769 	 * Why was the choice made to use VWRITE as the mode to the
1770 	 * call to VOP_CREATE ? This results in a bug.  When a client
1771 	 * opens a file that already exists and is RDONLY, the second
1772 	 * open fails with an EACESS because of the mode.
1773 	 * bug ID 1054648.
1774 	 */
1775 	lookup_ok = 0;
1776 	mode = VWRITE;
1777 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1778 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1779 		    NULL, NULL, NULL);
1780 		if (!error) {
1781 			struct vattr at;
1782 
1783 			lookup_ok = 1;
1784 			at.va_mask = AT_MODE;
1785 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1786 			if (!error)
1787 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1788 			VN_RELE(tvp);
1789 			tvp = NULL;
1790 		}
1791 	}
1792 
1793 	if (!lookup_ok) {
1794 		if (rdonly(ro, dvp)) {
1795 			error = EROFS;
1796 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1797 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1798 			error = EPERM;
1799 		} else {
1800 			error = 0;
1801 		}
1802 	}
1803 
1804 	/*
1805 	 * If file size is being modified on an already existing file
1806 	 * make sure that there are no conflicting non-blocking mandatory
1807 	 * locks in the region being manipulated. Return EACCES if there
1808 	 * are conflicting locks.
1809 	 */
1810 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1811 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1812 		    NULL, NULL, NULL);
1813 
1814 		if (!lookuperr &&
1815 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1816 			VN_RELE(tvp);
1817 			curthread->t_flag |= T_WOULDBLOCK;
1818 			goto out;
1819 		}
1820 
1821 		if (!lookuperr && nbl_need_check(tvp)) {
1822 			/*
1823 			 * The file exists. Now check if it has any
1824 			 * conflicting non-blocking mandatory locks
1825 			 * in the region being changed.
1826 			 */
1827 			struct vattr bva;
1828 			u_offset_t offset;
1829 			ssize_t length;
1830 
1831 			nbl_start_crit(tvp, RW_READER);
1832 			in_crit = 1;
1833 
1834 			bva.va_mask = AT_SIZE;
1835 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1836 			if (!error) {
1837 				if (va.va_size < bva.va_size) {
1838 					offset = va.va_size;
1839 					length = bva.va_size - va.va_size;
1840 				} else {
1841 					offset = bva.va_size;
1842 					length = va.va_size - bva.va_size;
1843 				}
1844 				if (length) {
1845 					if (nbl_conflict(tvp, NBL_WRITE,
1846 					    offset, length, 0, NULL)) {
1847 						error = EACCES;
1848 					}
1849 				}
1850 			}
1851 			if (error) {
1852 				nbl_end_crit(tvp);
1853 				VN_RELE(tvp);
1854 				in_crit = 0;
1855 			}
1856 		} else if (tvp != NULL) {
1857 			VN_RELE(tvp);
1858 		}
1859 	}
1860 
1861 	if (!error) {
1862 		/*
1863 		 * If filesystem is shared with nosuid the remove any
1864 		 * setuid/setgid bits on create.
1865 		 */
1866 		if (va.va_type == VREG &&
1867 		    exi->exi_export.ex_flags & EX_NOSUID)
1868 			va.va_mode &= ~(VSUID | VSGID);
1869 
1870 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1871 		    NULL, NULL);
1872 
1873 		if (!error) {
1874 
1875 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1876 				trunc = TRUE;
1877 			else
1878 				trunc = FALSE;
1879 
1880 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1881 				VN_RELE(vp);
1882 				curthread->t_flag |= T_WOULDBLOCK;
1883 				goto out;
1884 			}
1885 			va.va_mask = AT_ALL;
1886 
1887 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1888 
1889 			/* check for overflows */
1890 			if (!error) {
1891 				acl_perm(vp, exi, &va, cr);
1892 				error = vattr_to_nattr(&va, &dr->dr_attr);
1893 				if (!error) {
1894 					error = makefh(&dr->dr_fhandle, vp,
1895 					    exi);
1896 				}
1897 			}
1898 			/*
1899 			 * Force modified metadata out to stable storage.
1900 			 *
1901 			 * if a underlying vp exists, pass it to VOP_FSYNC
1902 			 */
1903 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1904 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1905 			else
1906 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1907 			VN_RELE(vp);
1908 		}
1909 
1910 		if (in_crit) {
1911 			nbl_end_crit(tvp);
1912 			VN_RELE(tvp);
1913 		}
1914 	}
1915 
1916 	/*
1917 	 * Force modified data and metadata out to stable storage.
1918 	 */
1919 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1920 
1921 out:
1922 
1923 	VN_RELE(dvp);
1924 
1925 	dr->dr_status = puterrno(error);
1926 
1927 	if (name != args->ca_da.da_name)
1928 		kmem_free(name, MAXPATHLEN);
1929 }
1930 void *
1931 rfs_create_getfh(struct nfscreatargs *args)
1932 {
1933 	return (args->ca_da.da_fhandle);
1934 }
1935 
1936 /*
1937  * Remove a file.
1938  * Remove named file from parent directory.
1939  */
1940 /* ARGSUSED */
1941 void
1942 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1943     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1944 {
1945 	int error = 0;
1946 	vnode_t *vp;
1947 	vnode_t *targvp;
1948 	int in_crit = 0;
1949 
1950 	/*
1951 	 * Disallow NULL paths
1952 	 */
1953 	if (da->da_name == NULL || *da->da_name == '\0') {
1954 		*status = NFSERR_ACCES;
1955 		return;
1956 	}
1957 
1958 	vp = nfs_fhtovp(da->da_fhandle, exi);
1959 	if (vp == NULL) {
1960 		*status = NFSERR_STALE;
1961 		return;
1962 	}
1963 
1964 	if (rdonly(ro, vp)) {
1965 		VN_RELE(vp);
1966 		*status = NFSERR_ROFS;
1967 		return;
1968 	}
1969 
1970 	/*
1971 	 * Check for a conflict with a non-blocking mandatory share reservation.
1972 	 */
1973 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1974 	    NULL, cr, NULL, NULL, NULL);
1975 	if (error != 0) {
1976 		VN_RELE(vp);
1977 		*status = puterrno(error);
1978 		return;
1979 	}
1980 
1981 	/*
1982 	 * If the file is delegated to an v4 client, then initiate
1983 	 * recall and drop this request (by setting T_WOULDBLOCK).
1984 	 * The client will eventually re-transmit the request and
1985 	 * (hopefully), by then, the v4 client will have returned
1986 	 * the delegation.
1987 	 */
1988 
1989 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1990 		VN_RELE(vp);
1991 		VN_RELE(targvp);
1992 		curthread->t_flag |= T_WOULDBLOCK;
1993 		return;
1994 	}
1995 
1996 	if (nbl_need_check(targvp)) {
1997 		nbl_start_crit(targvp, RW_READER);
1998 		in_crit = 1;
1999 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2000 			error = EACCES;
2001 			goto out;
2002 		}
2003 	}
2004 
2005 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2006 
2007 	/*
2008 	 * Force modified data and metadata out to stable storage.
2009 	 */
2010 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2011 
2012 out:
2013 	if (in_crit)
2014 		nbl_end_crit(targvp);
2015 	VN_RELE(targvp);
2016 	VN_RELE(vp);
2017 
2018 	*status = puterrno(error);
2019 
2020 }
2021 
2022 void *
2023 rfs_remove_getfh(struct nfsdiropargs *da)
2024 {
2025 	return (da->da_fhandle);
2026 }
2027 
2028 /*
2029  * rename a file
2030  * Give a file (from) a new name (to).
2031  */
2032 /* ARGSUSED */
2033 void
2034 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2035     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2036 {
2037 	int error = 0;
2038 	vnode_t *fromvp;
2039 	vnode_t *tovp;
2040 	struct exportinfo *to_exi;
2041 	fhandle_t *fh;
2042 	vnode_t *srcvp;
2043 	vnode_t *targvp;
2044 	int in_crit = 0;
2045 
2046 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2047 	if (fromvp == NULL) {
2048 		*status = NFSERR_STALE;
2049 		return;
2050 	}
2051 
2052 	fh = args->rna_to.da_fhandle;
2053 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2054 	if (to_exi == NULL) {
2055 		VN_RELE(fromvp);
2056 		*status = NFSERR_ACCES;
2057 		return;
2058 	}
2059 	exi_rele(to_exi);
2060 
2061 	if (to_exi != exi) {
2062 		VN_RELE(fromvp);
2063 		*status = NFSERR_XDEV;
2064 		return;
2065 	}
2066 
2067 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2068 	if (tovp == NULL) {
2069 		VN_RELE(fromvp);
2070 		*status = NFSERR_STALE;
2071 		return;
2072 	}
2073 
2074 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2075 		VN_RELE(tovp);
2076 		VN_RELE(fromvp);
2077 		*status = NFSERR_NOTDIR;
2078 		return;
2079 	}
2080 
2081 	/*
2082 	 * Disallow NULL paths
2083 	 */
2084 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2085 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2086 		VN_RELE(tovp);
2087 		VN_RELE(fromvp);
2088 		*status = NFSERR_ACCES;
2089 		return;
2090 	}
2091 
2092 	if (rdonly(ro, tovp)) {
2093 		VN_RELE(tovp);
2094 		VN_RELE(fromvp);
2095 		*status = NFSERR_ROFS;
2096 		return;
2097 	}
2098 
2099 	/*
2100 	 * Check for a conflict with a non-blocking mandatory share reservation.
2101 	 */
2102 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2103 	    NULL, cr, NULL, NULL, NULL);
2104 	if (error != 0) {
2105 		VN_RELE(tovp);
2106 		VN_RELE(fromvp);
2107 		*status = puterrno(error);
2108 		return;
2109 	}
2110 
2111 	/* Check for delegations on the source file */
2112 
2113 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2114 		VN_RELE(tovp);
2115 		VN_RELE(fromvp);
2116 		VN_RELE(srcvp);
2117 		curthread->t_flag |= T_WOULDBLOCK;
2118 		return;
2119 	}
2120 
2121 	/* Check for delegation on the file being renamed over, if it exists */
2122 
2123 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2124 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2125 	    NULL, NULL, NULL) == 0) {
2126 
2127 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2128 			VN_RELE(tovp);
2129 			VN_RELE(fromvp);
2130 			VN_RELE(srcvp);
2131 			VN_RELE(targvp);
2132 			curthread->t_flag |= T_WOULDBLOCK;
2133 			return;
2134 		}
2135 		VN_RELE(targvp);
2136 	}
2137 
2138 
2139 	if (nbl_need_check(srcvp)) {
2140 		nbl_start_crit(srcvp, RW_READER);
2141 		in_crit = 1;
2142 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2143 			error = EACCES;
2144 			goto out;
2145 		}
2146 	}
2147 
2148 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2149 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2150 
2151 	if (error == 0)
2152 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2153 		    strlen(args->rna_to.da_name));
2154 
2155 	/*
2156 	 * Force modified data and metadata out to stable storage.
2157 	 */
2158 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2159 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2160 
2161 out:
2162 	if (in_crit)
2163 		nbl_end_crit(srcvp);
2164 	VN_RELE(srcvp);
2165 	VN_RELE(tovp);
2166 	VN_RELE(fromvp);
2167 
2168 	*status = puterrno(error);
2169 
2170 }
2171 void *
2172 rfs_rename_getfh(struct nfsrnmargs *args)
2173 {
2174 	return (args->rna_from.da_fhandle);
2175 }
2176 
2177 /*
2178  * Link to a file.
2179  * Create a file (to) which is a hard link to the given file (from).
2180  */
2181 /* ARGSUSED */
2182 void
2183 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2184     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2185 {
2186 	int error;
2187 	vnode_t *fromvp;
2188 	vnode_t *tovp;
2189 	struct exportinfo *to_exi;
2190 	fhandle_t *fh;
2191 
2192 	fromvp = nfs_fhtovp(args->la_from, exi);
2193 	if (fromvp == NULL) {
2194 		*status = NFSERR_STALE;
2195 		return;
2196 	}
2197 
2198 	fh = args->la_to.da_fhandle;
2199 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2200 	if (to_exi == NULL) {
2201 		VN_RELE(fromvp);
2202 		*status = NFSERR_ACCES;
2203 		return;
2204 	}
2205 	exi_rele(to_exi);
2206 
2207 	if (to_exi != exi) {
2208 		VN_RELE(fromvp);
2209 		*status = NFSERR_XDEV;
2210 		return;
2211 	}
2212 
2213 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2214 	if (tovp == NULL) {
2215 		VN_RELE(fromvp);
2216 		*status = NFSERR_STALE;
2217 		return;
2218 	}
2219 
2220 	if (tovp->v_type != VDIR) {
2221 		VN_RELE(tovp);
2222 		VN_RELE(fromvp);
2223 		*status = NFSERR_NOTDIR;
2224 		return;
2225 	}
2226 	/*
2227 	 * Disallow NULL paths
2228 	 */
2229 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2230 		VN_RELE(tovp);
2231 		VN_RELE(fromvp);
2232 		*status = NFSERR_ACCES;
2233 		return;
2234 	}
2235 
2236 	if (rdonly(ro, tovp)) {
2237 		VN_RELE(tovp);
2238 		VN_RELE(fromvp);
2239 		*status = NFSERR_ROFS;
2240 		return;
2241 	}
2242 
2243 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2244 
2245 	/*
2246 	 * Force modified data and metadata out to stable storage.
2247 	 */
2248 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2249 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2250 
2251 	VN_RELE(tovp);
2252 	VN_RELE(fromvp);
2253 
2254 	*status = puterrno(error);
2255 
2256 }
2257 void *
2258 rfs_link_getfh(struct nfslinkargs *args)
2259 {
2260 	return (args->la_from);
2261 }
2262 
2263 /*
2264  * Symbolicly link to a file.
2265  * Create a file (to) with the given attributes which is a symbolic link
2266  * to the given path name (to).
2267  */
2268 void
2269 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2270     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2271 {
2272 	int error;
2273 	struct vattr va;
2274 	vnode_t *vp;
2275 	vnode_t *svp;
2276 	int lerror;
2277 	struct sockaddr *ca;
2278 	char *name = NULL;
2279 
2280 	/*
2281 	 * Disallow NULL paths
2282 	 */
2283 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2284 		*status = NFSERR_ACCES;
2285 		return;
2286 	}
2287 
2288 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2289 	if (vp == NULL) {
2290 		*status = NFSERR_STALE;
2291 		return;
2292 	}
2293 
2294 	if (rdonly(ro, vp)) {
2295 		VN_RELE(vp);
2296 		*status = NFSERR_ROFS;
2297 		return;
2298 	}
2299 
2300 	error = sattr_to_vattr(args->sla_sa, &va);
2301 	if (error) {
2302 		VN_RELE(vp);
2303 		*status = puterrno(error);
2304 		return;
2305 	}
2306 
2307 	if (!(va.va_mask & AT_MODE)) {
2308 		VN_RELE(vp);
2309 		*status = NFSERR_INVAL;
2310 		return;
2311 	}
2312 
2313 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2314 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2315 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2316 
2317 	if (name == NULL) {
2318 		*status = NFSERR_ACCES;
2319 		return;
2320 	}
2321 
2322 	va.va_type = VLNK;
2323 	va.va_mask |= AT_TYPE;
2324 
2325 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2326 
2327 	/*
2328 	 * Force new data and metadata out to stable storage.
2329 	 */
2330 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2331 	    NULL, cr, NULL, NULL, NULL);
2332 
2333 	if (!lerror) {
2334 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2335 		VN_RELE(svp);
2336 	}
2337 
2338 	/*
2339 	 * Force modified data and metadata out to stable storage.
2340 	 */
2341 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2342 
2343 	VN_RELE(vp);
2344 
2345 	*status = puterrno(error);
2346 	if (name != args->sla_tnm)
2347 		kmem_free(name, MAXPATHLEN);
2348 
2349 }
2350 void *
2351 rfs_symlink_getfh(struct nfsslargs *args)
2352 {
2353 	return (args->sla_from.da_fhandle);
2354 }
2355 
2356 /*
2357  * Make a directory.
2358  * Create a directory with the given name, parent directory, and attributes.
2359  * Returns a file handle and attributes for the new directory.
2360  */
2361 /* ARGSUSED */
2362 void
2363 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2364     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2365 {
2366 	int error;
2367 	struct vattr va;
2368 	vnode_t *dvp = NULL;
2369 	vnode_t *vp;
2370 	char *name = args->ca_da.da_name;
2371 
2372 	/*
2373 	 * Disallow NULL paths
2374 	 */
2375 	if (name == NULL || *name == '\0') {
2376 		dr->dr_status = NFSERR_ACCES;
2377 		return;
2378 	}
2379 
2380 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2381 	if (vp == NULL) {
2382 		dr->dr_status = NFSERR_STALE;
2383 		return;
2384 	}
2385 
2386 	if (rdonly(ro, vp)) {
2387 		VN_RELE(vp);
2388 		dr->dr_status = NFSERR_ROFS;
2389 		return;
2390 	}
2391 
2392 	error = sattr_to_vattr(args->ca_sa, &va);
2393 	if (error) {
2394 		VN_RELE(vp);
2395 		dr->dr_status = puterrno(error);
2396 		return;
2397 	}
2398 
2399 	if (!(va.va_mask & AT_MODE)) {
2400 		VN_RELE(vp);
2401 		dr->dr_status = NFSERR_INVAL;
2402 		return;
2403 	}
2404 
2405 	va.va_type = VDIR;
2406 	va.va_mask |= AT_TYPE;
2407 
2408 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2409 
2410 	if (!error) {
2411 		/*
2412 		 * Attribtutes of the newly created directory should
2413 		 * be returned to the client.
2414 		 */
2415 		va.va_mask = AT_ALL; /* We want everything */
2416 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2417 
2418 		/* check for overflows */
2419 		if (!error) {
2420 			acl_perm(vp, exi, &va, cr);
2421 			error = vattr_to_nattr(&va, &dr->dr_attr);
2422 			if (!error) {
2423 				error = makefh(&dr->dr_fhandle, dvp, exi);
2424 			}
2425 		}
2426 		/*
2427 		 * Force new data and metadata out to stable storage.
2428 		 */
2429 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2430 		VN_RELE(dvp);
2431 	}
2432 
2433 	/*
2434 	 * Force modified data and metadata out to stable storage.
2435 	 */
2436 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2437 
2438 	VN_RELE(vp);
2439 
2440 	dr->dr_status = puterrno(error);
2441 
2442 }
2443 void *
2444 rfs_mkdir_getfh(struct nfscreatargs *args)
2445 {
2446 	return (args->ca_da.da_fhandle);
2447 }
2448 
2449 /*
2450  * Remove a directory.
2451  * Remove the given directory name from the given parent directory.
2452  */
2453 /* ARGSUSED */
2454 void
2455 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2456     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2457 {
2458 	int error;
2459 	vnode_t *vp;
2460 
2461 	/*
2462 	 * Disallow NULL paths
2463 	 */
2464 	if (da->da_name == NULL || *da->da_name == '\0') {
2465 		*status = NFSERR_ACCES;
2466 		return;
2467 	}
2468 
2469 	vp = nfs_fhtovp(da->da_fhandle, exi);
2470 	if (vp == NULL) {
2471 		*status = NFSERR_STALE;
2472 		return;
2473 	}
2474 
2475 	if (rdonly(ro, vp)) {
2476 		VN_RELE(vp);
2477 		*status = NFSERR_ROFS;
2478 		return;
2479 	}
2480 
2481 	/*
2482 	 * VOP_RMDIR takes a third argument (the current
2483 	 * directory of the process).  That's because someone
2484 	 * wants to return EINVAL if one tries to remove ".".
2485 	 * Of course, NFS servers have no idea what their
2486 	 * clients' current directories are.  We fake it by
2487 	 * supplying a vnode known to exist and illegal to
2488 	 * remove.
2489 	 */
2490 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2491 
2492 	/*
2493 	 * Force modified data and metadata out to stable storage.
2494 	 */
2495 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2496 
2497 	VN_RELE(vp);
2498 
2499 	/*
2500 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2501 	 * if the directory is not empty.  A System V NFS server
2502 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2503 	 * over the wire.
2504 	 */
2505 	if (error == EEXIST)
2506 		*status = NFSERR_NOTEMPTY;
2507 	else
2508 		*status = puterrno(error);
2509 
2510 }
2511 void *
2512 rfs_rmdir_getfh(struct nfsdiropargs *da)
2513 {
2514 	return (da->da_fhandle);
2515 }
2516 
2517 /* ARGSUSED */
2518 void
2519 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2520     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2521 {
2522 	int error;
2523 	int iseof;
2524 	struct iovec iov;
2525 	struct uio uio;
2526 	vnode_t *vp;
2527 	char *ndata = NULL;
2528 	struct sockaddr *ca;
2529 	size_t nents;
2530 	int ret;
2531 
2532 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2533 	if (vp == NULL) {
2534 		rd->rd_entries = NULL;
2535 		rd->rd_status = NFSERR_STALE;
2536 		return;
2537 	}
2538 
2539 	if (vp->v_type != VDIR) {
2540 		VN_RELE(vp);
2541 		rd->rd_entries = NULL;
2542 		rd->rd_status = NFSERR_NOTDIR;
2543 		return;
2544 	}
2545 
2546 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2547 
2548 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2549 
2550 	if (error) {
2551 		rd->rd_entries = NULL;
2552 		goto bad;
2553 	}
2554 
2555 	if (rda->rda_count == 0) {
2556 		rd->rd_entries = NULL;
2557 		rd->rd_size = 0;
2558 		rd->rd_eof = FALSE;
2559 		goto bad;
2560 	}
2561 
2562 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2563 
2564 	/*
2565 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2566 	 */
2567 	rd->rd_bufsize = (uint_t)rda->rda_count;
2568 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2569 
2570 	/*
2571 	 * Set up io vector to read directory data
2572 	 */
2573 	iov.iov_base = (caddr_t)rd->rd_entries;
2574 	iov.iov_len = rda->rda_count;
2575 	uio.uio_iov = &iov;
2576 	uio.uio_iovcnt = 1;
2577 	uio.uio_segflg = UIO_SYSSPACE;
2578 	uio.uio_extflg = UIO_COPY_CACHED;
2579 	uio.uio_loffset = (offset_t)rda->rda_offset;
2580 	uio.uio_resid = rda->rda_count;
2581 
2582 	/*
2583 	 * read directory
2584 	 */
2585 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2586 
2587 	/*
2588 	 * Clean up
2589 	 */
2590 	if (!error) {
2591 		/*
2592 		 * set size and eof
2593 		 */
2594 		if (uio.uio_resid == rda->rda_count) {
2595 			rd->rd_size = 0;
2596 			rd->rd_eof = TRUE;
2597 		} else {
2598 			rd->rd_size = (uint32_t)(rda->rda_count -
2599 			    uio.uio_resid);
2600 			rd->rd_eof = iseof ? TRUE : FALSE;
2601 		}
2602 	}
2603 
2604 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2605 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2606 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2607 	    rda->rda_count, &ndata);
2608 
2609 	if (ret != 0) {
2610 		size_t dropbytes;
2611 		/*
2612 		 * We had to drop one or more entries in order to fit
2613 		 * during the character conversion.  We need to patch
2614 		 * up the size and eof info.
2615 		 */
2616 		if (rd->rd_eof)
2617 			rd->rd_eof = FALSE;
2618 		dropbytes = nfscmd_dropped_entrysize(
2619 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2620 		rd->rd_size -= dropbytes;
2621 	}
2622 	if (ndata == NULL) {
2623 		ndata = (char *)rd->rd_entries;
2624 	} else if (ndata != (char *)rd->rd_entries) {
2625 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2626 		rd->rd_entries = (void *)ndata;
2627 		rd->rd_bufsize = rda->rda_count;
2628 	}
2629 
2630 bad:
2631 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2632 
2633 #if 0 /* notyet */
2634 	/*
2635 	 * Don't do this.  It causes local disk writes when just
2636 	 * reading the file and the overhead is deemed larger
2637 	 * than the benefit.
2638 	 */
2639 	/*
2640 	 * Force modified metadata out to stable storage.
2641 	 */
2642 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2643 #endif
2644 
2645 	VN_RELE(vp);
2646 
2647 	rd->rd_status = puterrno(error);
2648 
2649 }
2650 void *
2651 rfs_readdir_getfh(struct nfsrddirargs *rda)
2652 {
2653 	return (&rda->rda_fh);
2654 }
2655 void
2656 rfs_rddirfree(struct nfsrddirres *rd)
2657 {
2658 	if (rd->rd_entries != NULL)
2659 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2660 }
2661 
2662 /* ARGSUSED */
2663 void
2664 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2665     struct svc_req *req, cred_t *cr, bool_t ro)
2666 {
2667 	int error;
2668 	struct statvfs64 sb;
2669 	vnode_t *vp;
2670 
2671 	vp = nfs_fhtovp(fh, exi);
2672 	if (vp == NULL) {
2673 		fs->fs_status = NFSERR_STALE;
2674 		return;
2675 	}
2676 
2677 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2678 
2679 	if (!error) {
2680 		fs->fs_tsize = nfstsize();
2681 		fs->fs_bsize = sb.f_frsize;
2682 		fs->fs_blocks = sb.f_blocks;
2683 		fs->fs_bfree = sb.f_bfree;
2684 		fs->fs_bavail = sb.f_bavail;
2685 	}
2686 
2687 	VN_RELE(vp);
2688 
2689 	fs->fs_status = puterrno(error);
2690 
2691 }
2692 void *
2693 rfs_statfs_getfh(fhandle_t *fh)
2694 {
2695 	return (fh);
2696 }
2697 
2698 static int
2699 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2700 {
2701 	vap->va_mask = 0;
2702 
2703 	/*
2704 	 * There was a sign extension bug in some VFS based systems
2705 	 * which stored the mode as a short.  When it would get
2706 	 * assigned to a u_long, no sign extension would occur.
2707 	 * It needed to, but this wasn't noticed because sa_mode
2708 	 * would then get assigned back to the short, thus ignoring
2709 	 * the upper 16 bits of sa_mode.
2710 	 *
2711 	 * To make this implementation work for both broken
2712 	 * clients and good clients, we check for both versions
2713 	 * of the mode.
2714 	 */
2715 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2716 	    sa->sa_mode != (uint32_t)-1) {
2717 		vap->va_mask |= AT_MODE;
2718 		vap->va_mode = sa->sa_mode;
2719 	}
2720 	if (sa->sa_uid != (uint32_t)-1) {
2721 		vap->va_mask |= AT_UID;
2722 		vap->va_uid = sa->sa_uid;
2723 	}
2724 	if (sa->sa_gid != (uint32_t)-1) {
2725 		vap->va_mask |= AT_GID;
2726 		vap->va_gid = sa->sa_gid;
2727 	}
2728 	if (sa->sa_size != (uint32_t)-1) {
2729 		vap->va_mask |= AT_SIZE;
2730 		vap->va_size = sa->sa_size;
2731 	}
2732 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2733 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2734 #ifndef _LP64
2735 		/* return error if time overflow */
2736 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2737 			return (EOVERFLOW);
2738 #endif
2739 		vap->va_mask |= AT_ATIME;
2740 		/*
2741 		 * nfs protocol defines times as unsigned so don't extend sign,
2742 		 * unless sysadmin set nfs_allow_preepoch_time.
2743 		 */
2744 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2745 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2746 	}
2747 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2748 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2749 #ifndef _LP64
2750 		/* return error if time overflow */
2751 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2752 			return (EOVERFLOW);
2753 #endif
2754 		vap->va_mask |= AT_MTIME;
2755 		/*
2756 		 * nfs protocol defines times as unsigned so don't extend sign,
2757 		 * unless sysadmin set nfs_allow_preepoch_time.
2758 		 */
2759 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2760 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2761 	}
2762 	return (0);
2763 }
2764 
2765 static enum nfsftype vt_to_nf[] = {
2766 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2767 };
2768 
2769 /*
2770  * check the following fields for overflow: nodeid, size, and time.
2771  * There could be a problem when converting 64-bit LP64 fields
2772  * into 32-bit ones.  Return an error if there is an overflow.
2773  */
2774 int
2775 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2776 {
2777 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2778 	na->na_type = vt_to_nf[vap->va_type];
2779 
2780 	if (vap->va_mode == (unsigned short) -1)
2781 		na->na_mode = (uint32_t)-1;
2782 	else
2783 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2784 
2785 	if (vap->va_uid == (unsigned short)(-1))
2786 		na->na_uid = (uint32_t)(-1);
2787 	else if (vap->va_uid == UID_NOBODY)
2788 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2789 	else
2790 		na->na_uid = vap->va_uid;
2791 
2792 	if (vap->va_gid == (unsigned short)(-1))
2793 		na->na_gid = (uint32_t)-1;
2794 	else if (vap->va_gid == GID_NOBODY)
2795 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2796 	else
2797 		na->na_gid = vap->va_gid;
2798 
2799 	/*
2800 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2801 	 * vattr, but are bigger than 32 bit values supported?
2802 	 */
2803 	na->na_fsid = vap->va_fsid;
2804 
2805 	na->na_nodeid = vap->va_nodeid;
2806 
2807 	/*
2808 	 * Check to make sure that the nodeid is representable over the
2809 	 * wire without losing bits.
2810 	 */
2811 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2812 		return (EFBIG);
2813 	na->na_nlink = vap->va_nlink;
2814 
2815 	/*
2816 	 * Check for big files here, instead of at the caller.  See
2817 	 * comments in cstat for large special file explanation.
2818 	 */
2819 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2820 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2821 			return (EFBIG);
2822 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2823 			/* UNKNOWN_SIZE | OVERFLOW */
2824 			na->na_size = MAXOFF32_T;
2825 		} else
2826 			na->na_size = vap->va_size;
2827 	} else
2828 		na->na_size = vap->va_size;
2829 
2830 	/*
2831 	 * If the vnode times overflow the 32-bit times that NFS2
2832 	 * uses on the wire then return an error.
2833 	 */
2834 	if (!NFS_VAP_TIME_OK(vap)) {
2835 		return (EOVERFLOW);
2836 	}
2837 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2838 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2839 
2840 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2841 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2842 
2843 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2844 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2845 
2846 	/*
2847 	 * If the dev_t will fit into 16 bits then compress
2848 	 * it, otherwise leave it alone. See comments in
2849 	 * nfs_client.c.
2850 	 */
2851 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2852 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2853 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2854 	else
2855 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2856 
2857 	na->na_blocks = vap->va_nblocks;
2858 	na->na_blocksize = vap->va_blksize;
2859 
2860 	/*
2861 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2862 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2863 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2864 	 *
2865 	 * BUYER BEWARE:
2866 	 *  If you are porting the NFS to a non-Sun server, you probably
2867 	 *  don't want to include the following block of code.  The
2868 	 *  over-the-wire special file types will be changing with the
2869 	 *  NFS Protocol Revision.
2870 	 */
2871 	if (vap->va_type == VFIFO)
2872 		NA_SETFIFO(na);
2873 	return (0);
2874 }
2875 
2876 /*
2877  * acl v2 support: returns approximate permission.
2878  *	default: returns minimal permission (more restrictive)
2879  *	aclok: returns maximal permission (less restrictive)
2880  *	This routine changes the permissions that are alaredy in *va.
2881  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2882  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2883  */
2884 static void
2885 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2886 {
2887 	vsecattr_t	vsa;
2888 	int		aclcnt;
2889 	aclent_t	*aclentp;
2890 	mode_t		mask_perm;
2891 	mode_t		grp_perm;
2892 	mode_t		other_perm;
2893 	mode_t		other_orig;
2894 	int		error;
2895 
2896 	/* dont care default acl */
2897 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2898 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2899 
2900 	if (!error) {
2901 		aclcnt = vsa.vsa_aclcnt;
2902 		if (aclcnt > MIN_ACL_ENTRIES) {
2903 			/* non-trivial ACL */
2904 			aclentp = vsa.vsa_aclentp;
2905 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2906 				/* maximal permissions */
2907 				grp_perm = 0;
2908 				other_perm = 0;
2909 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2910 					switch (aclentp->a_type) {
2911 					case USER_OBJ:
2912 						break;
2913 					case USER:
2914 						grp_perm |=
2915 						    aclentp->a_perm << 3;
2916 						other_perm |= aclentp->a_perm;
2917 						break;
2918 					case GROUP_OBJ:
2919 						grp_perm |=
2920 						    aclentp->a_perm << 3;
2921 						break;
2922 					case GROUP:
2923 						other_perm |= aclentp->a_perm;
2924 						break;
2925 					case OTHER_OBJ:
2926 						other_orig = aclentp->a_perm;
2927 						break;
2928 					case CLASS_OBJ:
2929 						mask_perm = aclentp->a_perm;
2930 						break;
2931 					default:
2932 						break;
2933 					}
2934 				}
2935 				grp_perm &= mask_perm << 3;
2936 				other_perm &= mask_perm;
2937 				other_perm |= other_orig;
2938 
2939 			} else {
2940 				/* minimal permissions */
2941 				grp_perm = 070;
2942 				other_perm = 07;
2943 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2944 					switch (aclentp->a_type) {
2945 					case USER_OBJ:
2946 						break;
2947 					case USER:
2948 					case CLASS_OBJ:
2949 						grp_perm &=
2950 						    aclentp->a_perm << 3;
2951 						other_perm &=
2952 						    aclentp->a_perm;
2953 						break;
2954 					case GROUP_OBJ:
2955 						grp_perm &=
2956 						    aclentp->a_perm << 3;
2957 						break;
2958 					case GROUP:
2959 						other_perm &=
2960 						    aclentp->a_perm;
2961 						break;
2962 					case OTHER_OBJ:
2963 						other_perm &=
2964 						    aclentp->a_perm;
2965 						break;
2966 					default:
2967 						break;
2968 					}
2969 				}
2970 			}
2971 			/* copy to va */
2972 			va->va_mode &= ~077;
2973 			va->va_mode |= grp_perm | other_perm;
2974 		}
2975 		if (vsa.vsa_aclcnt)
2976 			kmem_free(vsa.vsa_aclentp,
2977 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2978 	}
2979 }
2980 
2981 void
2982 rfs_srvrinit(void)
2983 {
2984 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2985 	nfs2_srv_caller_id = fs_new_caller_id();
2986 }
2987 
2988 void
2989 rfs_srvrfini(void)
2990 {
2991 	mutex_destroy(&rfs_async_write_lock);
2992 }
2993 
2994 static int
2995 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2996 {
2997 	struct clist	*wcl;
2998 	int		wlist_len;
2999 	uint32_t	count = rr->rr_count;
3000 
3001 	wcl = ra->ra_wlist;
3002 
3003 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3004 		return (FALSE);
3005 	}
3006 
3007 	wcl = ra->ra_wlist;
3008 	rr->rr_ok.rrok_wlist_len = wlist_len;
3009 	rr->rr_ok.rrok_wlist = wcl;
3010 
3011 	return (TRUE);
3012 }
3013