xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs_srv.c (revision c39526b769298791ff5b0b6c5e761f49aabaeb4e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
/*
 * Forward declarations for helpers used by the routines below; their
 * definitions are not in this part of the file.
 *
 * sattr_to_vattr: used by rfs_setattr to turn the over-the-wire sattr
 * into a vattr; a non-zero return is treated as an errno.
 * acl_perm: called on freshly fetched attributes just before they are
 * converted to wire form with vattr_to_nattr.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Caller id that the NFSv2 service routines place in the cc_caller_id
 * field of the caller_context_t they pass to VOP calls.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		/* Lie about the object type for a referral */
121 		if (vn_is_nfs_reparse(vp, cr))
122 			va.va_type = VLNK;
123 
124 		acl_perm(vp, exi, &va, cr);
125 		error = vattr_to_nattr(&va, &ns->ns_attr);
126 	}
127 
128 	VN_RELE(vp);
129 
130 	ns->ns_status = puterrno(error);
131 }
132 void *
133 rfs_getattr_getfh(fhandle_t *fhp)
134 {
135 	return (fhp);
136 }
137 
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * args: request carrying the file handle (saa_fh) and the attributes
 *       to apply (saa_sa).
 * ns:   reply; ns_status receives the NFS status and, on success,
 *       ns_attr the attributes re-read after the change.
 * exi:  export the handle belongs to; consulted for read-only and
 *       nosuid handling.
 * req:  RPC request, used for the read-only export check.
 * cr:   credentials the VOP calls are made with.
 *
 * When a delegation conflict is reported (EAGAIN with CC_WOULDBLOCK)
 * no status is stored; the thread is flagged T_WOULDBLOCK instead.
 */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int flag;
	int in_crit = 0;
	vnode_t *vp;
	struct vattr va;
	struct vattr bva;
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* Refuse changes on a read-only export or read-only vnode. */
	if (rdonly(exi, req) || vn_is_readonly(vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	/* Convert the wire attributes; a failure is returned as is. */
	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	/*
	 * Caller context handed to the VOP calls below; CC_DONTBLOCK is
	 * set, and CC_WOULDBLOCK is examined after the operations.
	 */
	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		/* Fetch the current owner and size for the checks below. */
		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region of change lies between the old and
			 * the new size, whichever of the two is smaller
			 * being its start.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE path, and only when
		 * the size really changes; AT_SIZE is then cleared so the
		 * VOP_SETATTR below does not apply it a second time.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	/* The fsync below is issued with no caller-context flags set. */
	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 * This is done whether or not the operations above succeeded,
	 * and its result is deliberately ignored.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
324 void *
325 rfs_setattr_getfh(struct nfssaargs *args)
326 {
327 	return (&args->saa_fh);
328 }
329 
330 /*
331  * Directory lookup.
332  * Returns an fhandle and file attributes for file name in a directory.
333  */
334 /* ARGSUSED */
335 void
336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
337 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
338 {
339 	int error;
340 	vnode_t *dvp;
341 	vnode_t *vp;
342 	struct vattr va;
343 	fhandle_t *fhp = da->da_fhandle;
344 	struct sec_ol sec = {0, 0};
345 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
346 	char *name;
347 	struct sockaddr *ca;
348 
349 	/*
350 	 * Trusted Extension doesn't support NFSv2. MOUNT
351 	 * will reject v2 clients. Need to prevent v2 client
352 	 * access via WebNFS here.
353 	 */
354 	if (is_system_labeled() && req->rq_vers == 2) {
355 		dr->dr_status = NFSERR_ACCES;
356 		return;
357 	}
358 
359 	/*
360 	 * Disallow NULL paths
361 	 */
362 	if (da->da_name == NULL || *da->da_name == '\0') {
363 		dr->dr_status = NFSERR_ACCES;
364 		return;
365 	}
366 
367 	/*
368 	 * Allow lookups from the root - the default
369 	 * location of the public filehandle.
370 	 */
371 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
372 		dvp = rootdir;
373 		VN_HOLD(dvp);
374 	} else {
375 		dvp = nfs_fhtovp(fhp, exi);
376 		if (dvp == NULL) {
377 			dr->dr_status = NFSERR_STALE;
378 			return;
379 		}
380 	}
381 
382 	/*
383 	 * Not allow lookup beyond root.
384 	 * If the filehandle matches a filehandle of the exi,
385 	 * then the ".." refers beyond the root of an exported filesystem.
386 	 */
387 	if (strcmp(da->da_name, "..") == 0 &&
388 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
389 		VN_RELE(dvp);
390 		dr->dr_status = NFSERR_NOENT;
391 		return;
392 	}
393 
394 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
395 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
396 	    MAXPATHLEN);
397 
398 	if (name == NULL) {
399 		dr->dr_status = NFSERR_ACCES;
400 		return;
401 	}
402 
403 	/*
404 	 * If the public filehandle is used then allow
405 	 * a multi-component lookup, i.e. evaluate
406 	 * a pathname and follow symbolic links if
407 	 * necessary.
408 	 *
409 	 * This may result in a vnode in another filesystem
410 	 * which is OK as long as the filesystem is exported.
411 	 */
412 	if (PUBLIC_FH2(fhp)) {
413 		publicfh_flag = TRUE;
414 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
415 		    &sec);
416 	} else {
417 		/*
418 		 * Do a normal single component lookup.
419 		 */
420 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
421 		    NULL, NULL, NULL);
422 	}
423 
424 	if (name != da->da_name)
425 		kmem_free(name, MAXPATHLEN);
426 
427 
428 	if (!error) {
429 		va.va_mask = AT_ALL;	/* we want everything */
430 
431 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
432 
433 		/* check for overflows */
434 		if (!error) {
435 			acl_perm(vp, exi, &va, cr);
436 			error = vattr_to_nattr(&va, &dr->dr_attr);
437 			if (!error) {
438 				if (sec.sec_flags & SEC_QUERY)
439 					error = makefh_ol(&dr->dr_fhandle, exi,
440 					    sec.sec_index);
441 				else {
442 					error = makefh(&dr->dr_fhandle, vp,
443 					    exi);
444 					if (!error && publicfh_flag &&
445 					    !chk_clnt_sec(exi, req))
446 						auth_weak = TRUE;
447 				}
448 			}
449 		}
450 		VN_RELE(vp);
451 	}
452 
453 	VN_RELE(dvp);
454 
455 	/*
456 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
457 	 * and have obtained a new exportinfo in exi which needs to be
458 	 * released. Note the the original exportinfo pointed to by exi
459 	 * will be released by the caller, comon_dispatch.
460 	 */
461 	if (publicfh_flag && exi != NULL)
462 		exi_rele(exi);
463 
464 	/*
465 	 * If it's public fh, no 0x81, and client's flavor is
466 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
467 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
468 	 */
469 	if (auth_weak)
470 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
471 	else
472 		dr->dr_status = puterrno(error);
473 }
474 void *
475 rfs_lookup_getfh(struct nfsdiropargs *da)
476 {
477 	return (da->da_fhandle);
478 }
479 
480 /*
481  * Read symbolic link.
482  * Returns the string in the symbolic link at the given fhandle.
483  */
484 /* ARGSUSED */
485 void
486 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
487 	struct svc_req *req, cred_t *cr)
488 {
489 	int error;
490 	struct iovec iov;
491 	struct uio uio;
492 	vnode_t *vp;
493 	struct vattr va;
494 	struct sockaddr *ca;
495 	char *name = NULL;
496 	int is_referral = 0;
497 
498 	vp = nfs_fhtovp(fhp, exi);
499 	if (vp == NULL) {
500 		rl->rl_data = NULL;
501 		rl->rl_status = NFSERR_STALE;
502 		return;
503 	}
504 
505 	va.va_mask = AT_MODE;
506 
507 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
508 
509 	if (error) {
510 		VN_RELE(vp);
511 		rl->rl_data = NULL;
512 		rl->rl_status = puterrno(error);
513 		return;
514 	}
515 
516 	if (MANDLOCK(vp, va.va_mode)) {
517 		VN_RELE(vp);
518 		rl->rl_data = NULL;
519 		rl->rl_status = NFSERR_ACCES;
520 		return;
521 	}
522 
523 	/* We lied about the object type for a referral */
524 	if (vn_is_nfs_reparse(vp, cr))
525 		is_referral = 1;
526 
527 	/*
528 	 * XNFS and RFC1094 require us to return ENXIO if argument
529 	 * is not a link. BUGID 1138002.
530 	 */
531 	if (vp->v_type != VLNK && !is_referral) {
532 		VN_RELE(vp);
533 		rl->rl_data = NULL;
534 		rl->rl_status = NFSERR_NXIO;
535 		return;
536 	}
537 
538 	/*
539 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
540 	 */
541 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
542 
543 	if (is_referral) {
544 		char *s;
545 		size_t strsz;
546 
547 		/* Get an artificial symlink based on a referral */
548 		s = build_symlink(vp, cr, &strsz);
549 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
550 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
551 		    vnode_t *, vp, char *, s);
552 		if (s == NULL)
553 			error = EINVAL;
554 		else {
555 			error = 0;
556 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
557 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
558 			kmem_free(s, strsz);
559 		}
560 
561 	} else {
562 
563 		/*
564 		 * Set up io vector to read sym link data
565 		 */
566 		iov.iov_base = rl->rl_data;
567 		iov.iov_len = NFS_MAXPATHLEN;
568 		uio.uio_iov = &iov;
569 		uio.uio_iovcnt = 1;
570 		uio.uio_segflg = UIO_SYSSPACE;
571 		uio.uio_extflg = UIO_COPY_CACHED;
572 		uio.uio_loffset = (offset_t)0;
573 		uio.uio_resid = NFS_MAXPATHLEN;
574 
575 		/*
576 		 * Do the readlink.
577 		 */
578 		error = VOP_READLINK(vp, &uio, cr, NULL);
579 
580 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
581 
582 		if (!error)
583 			rl->rl_data[rl->rl_count] = '\0';
584 
585 	}
586 
587 
588 	VN_RELE(vp);
589 
590 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
591 	name = nfscmd_convname(ca, exi, rl->rl_data,
592 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
593 
594 	if (name != NULL && name != rl->rl_data) {
595 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
596 		rl->rl_data = name;
597 	}
598 
599 	/*
600 	 * XNFS and RFC1094 require us to return ENXIO if argument
601 	 * is not a link. UFS returns EINVAL if this is the case,
602 	 * so we do the mapping here. BUGID 1138002.
603 	 */
604 	if (error == EINVAL)
605 		rl->rl_status = NFSERR_NXIO;
606 	else
607 		rl->rl_status = puterrno(error);
608 
609 }
610 void *
611 rfs_readlink_getfh(fhandle_t *fhp)
612 {
613 	return (fhp);
614 }
615 /*
616  * Free data allocated by rfs_readlink
617  */
618 void
619 rfs_rlfree(struct nfsrdlnres *rl)
620 {
621 	if (rl->rl_data != NULL)
622 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
623 }
624 
/*
 * Forward declaration; called by rfs_read on the RDMA reply path,
 * where a zero return is treated as failure.  The definition is not
 * in this part of the file.
 */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
626 
627 /*
628  * Read data.
629  * Returns some data read from the file at the given fhandle.
630  */
631 /* ARGSUSED */
632 void
633 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
634 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
635 {
636 	vnode_t *vp;
637 	int error;
638 	struct vattr va;
639 	struct iovec iov;
640 	struct uio uio;
641 	mblk_t *mp;
642 	int alloc_err = 0;
643 	int in_crit = 0;
644 	caller_context_t ct;
645 
646 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
647 	if (vp == NULL) {
648 		rr->rr_data = NULL;
649 		rr->rr_status = NFSERR_STALE;
650 		return;
651 	}
652 
653 	if (vp->v_type != VREG) {
654 		VN_RELE(vp);
655 		rr->rr_data = NULL;
656 		rr->rr_status = NFSERR_ISDIR;
657 		return;
658 	}
659 
660 	ct.cc_sysid = 0;
661 	ct.cc_pid = 0;
662 	ct.cc_caller_id = nfs2_srv_caller_id;
663 	ct.cc_flags = CC_DONTBLOCK;
664 
665 	/*
666 	 * Enter the critical region before calling VOP_RWLOCK
667 	 * to avoid a deadlock with write requests.
668 	 */
669 	if (nbl_need_check(vp)) {
670 		nbl_start_crit(vp, RW_READER);
671 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
672 		    0, NULL)) {
673 			nbl_end_crit(vp);
674 			VN_RELE(vp);
675 			rr->rr_data = NULL;
676 			rr->rr_status = NFSERR_ACCES;
677 			return;
678 		}
679 		in_crit = 1;
680 	}
681 
682 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
683 
684 	/* check if a monitor detected a delegation conflict */
685 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
686 		VN_RELE(vp);
687 		/* mark as wouldblock so response is dropped */
688 		curthread->t_flag |= T_WOULDBLOCK;
689 
690 		rr->rr_data = NULL;
691 		return;
692 	}
693 
694 	va.va_mask = AT_ALL;
695 
696 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
697 
698 	if (error) {
699 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
700 		if (in_crit)
701 			nbl_end_crit(vp);
702 
703 		VN_RELE(vp);
704 		rr->rr_data = NULL;
705 		rr->rr_status = puterrno(error);
706 
707 		return;
708 	}
709 
710 	/*
711 	 * This is a kludge to allow reading of files created
712 	 * with no read permission.  The owner of the file
713 	 * is always allowed to read it.
714 	 */
715 	if (crgetuid(cr) != va.va_uid) {
716 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
717 
718 		if (error) {
719 			/*
720 			 * Exec is the same as read over the net because
721 			 * of demand loading.
722 			 */
723 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
724 		}
725 		if (error) {
726 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
727 			if (in_crit)
728 				nbl_end_crit(vp);
729 			VN_RELE(vp);
730 			rr->rr_data = NULL;
731 			rr->rr_status = puterrno(error);
732 
733 			return;
734 		}
735 	}
736 
737 	if (MANDLOCK(vp, va.va_mode)) {
738 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
739 		if (in_crit)
740 			nbl_end_crit(vp);
741 
742 		VN_RELE(vp);
743 		rr->rr_data = NULL;
744 		rr->rr_status = NFSERR_ACCES;
745 
746 		return;
747 	}
748 
749 	rr->rr_ok.rrok_wlist_len = 0;
750 	rr->rr_ok.rrok_wlist = NULL;
751 
752 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
753 		rr->rr_count = 0;
754 		rr->rr_data = NULL;
755 		/*
756 		 * In this case, status is NFS_OK, but there is no data
757 		 * to encode. So set rr_mp to NULL.
758 		 */
759 		rr->rr_mp = NULL;
760 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
761 		if (rr->rr_ok.rrok_wlist)
762 			clist_zero_len(rr->rr_ok.rrok_wlist);
763 		goto done;
764 	}
765 
766 	if (ra->ra_wlist) {
767 		mp = NULL;
768 		rr->rr_mp = NULL;
769 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
770 	} else {
771 		/*
772 		 * mp will contain the data to be sent out in the read reply.
773 		 * This will be freed after the reply has been sent out (by the
774 		 * driver).
775 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
776 		 * that the call to xdrmblk_putmblk() never fails.
777 		 */
778 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
779 		    &alloc_err);
780 		ASSERT(mp != NULL);
781 		ASSERT(alloc_err == 0);
782 
783 		rr->rr_mp = mp;
784 
785 		/*
786 		 * Set up io vector
787 		 */
788 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
789 		iov.iov_len = ra->ra_count;
790 	}
791 
792 	uio.uio_iov = &iov;
793 	uio.uio_iovcnt = 1;
794 	uio.uio_segflg = UIO_SYSSPACE;
795 	uio.uio_extflg = UIO_COPY_CACHED;
796 	uio.uio_loffset = (offset_t)ra->ra_offset;
797 	uio.uio_resid = ra->ra_count;
798 
799 	error = VOP_READ(vp, &uio, 0, cr, &ct);
800 
801 	if (error) {
802 		if (mp)
803 			freeb(mp);
804 
805 		/*
806 		 * check if a monitor detected a delegation conflict and
807 		 * mark as wouldblock so response is dropped
808 		 */
809 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
810 			curthread->t_flag |= T_WOULDBLOCK;
811 		else
812 			rr->rr_status = puterrno(error);
813 
814 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
815 		if (in_crit)
816 			nbl_end_crit(vp);
817 
818 		VN_RELE(vp);
819 		rr->rr_data = NULL;
820 
821 		return;
822 	}
823 
824 	/*
825 	 * Get attributes again so we can send the latest access
826 	 * time to the client side for his cache.
827 	 */
828 	va.va_mask = AT_ALL;
829 
830 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
831 
832 	if (error) {
833 		if (mp)
834 			freeb(mp);
835 
836 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
837 		if (in_crit)
838 			nbl_end_crit(vp);
839 
840 		VN_RELE(vp);
841 		rr->rr_data = NULL;
842 		rr->rr_status = puterrno(error);
843 
844 		return;
845 	}
846 
847 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
848 
849 	if (mp) {
850 		rr->rr_data = (char *)mp->b_datap->db_base;
851 	} else {
852 		if (ra->ra_wlist) {
853 			rr->rr_data = (caddr_t)iov.iov_base;
854 			if (!rdma_setup_read_data2(ra, rr)) {
855 				rr->rr_data = NULL;
856 				rr->rr_status = puterrno(NFSERR_INVAL);
857 			}
858 		}
859 	}
860 done:
861 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
862 	if (in_crit)
863 		nbl_end_crit(vp);
864 
865 	acl_perm(vp, exi, &va, cr);
866 
867 	/* check for overflows */
868 	error = vattr_to_nattr(&va, &rr->rr_attr);
869 
870 	VN_RELE(vp);
871 
872 	rr->rr_status = puterrno(error);
873 }
874 
875 /*
876  * Free data allocated by rfs_read
877  */
878 void
879 rfs_rdfree(struct nfsrdresult *rr)
880 {
881 	mblk_t *mp;
882 
883 	if (rr->rr_status == NFS_OK) {
884 		mp = rr->rr_mp;
885 		if (mp != NULL)
886 			freeb(mp);
887 	}
888 }
889 
890 void *
891 rfs_read_getfh(struct nfsreadargs *ra)
892 {
893 	return (&ra->ra_fhandle);
894 }
895 
/*
 * Size of the on-stack iovec array in rfs_write_sync; a request whose
 * mblk chain has more entries than this gets a kmem_alloc'ed array.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * Counters bumped by rfs_write_sync: "hits" when the on-stack iovec
 * array was large enough, "misses" when one had to be allocated.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
902 
903 /*
904  * Write data to file.
905  * Returns attributes of a file after writing some data to it.
906  *
907  * Any changes made here, especially in error handling might have
908  * to also be done in rfs_write (which clusters write requests).
909  */
910 void
911 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
912 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
913 {
914 	int error;
915 	vnode_t *vp;
916 	rlim64_t rlimit;
917 	struct vattr va;
918 	struct uio uio;
919 	struct iovec iov[MAX_IOVECS];
920 	mblk_t *m;
921 	struct iovec *iovp;
922 	int iovcnt;
923 	cred_t *savecred;
924 	int in_crit = 0;
925 	caller_context_t ct;
926 
927 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
928 	if (vp == NULL) {
929 		ns->ns_status = NFSERR_STALE;
930 		return;
931 	}
932 
933 	if (rdonly(exi, req)) {
934 		VN_RELE(vp);
935 		ns->ns_status = NFSERR_ROFS;
936 		return;
937 	}
938 
939 	if (vp->v_type != VREG) {
940 		VN_RELE(vp);
941 		ns->ns_status = NFSERR_ISDIR;
942 		return;
943 	}
944 
945 	ct.cc_sysid = 0;
946 	ct.cc_pid = 0;
947 	ct.cc_caller_id = nfs2_srv_caller_id;
948 	ct.cc_flags = CC_DONTBLOCK;
949 
950 	va.va_mask = AT_UID|AT_MODE;
951 
952 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
953 
954 	if (error) {
955 		VN_RELE(vp);
956 		ns->ns_status = puterrno(error);
957 
958 		return;
959 	}
960 
961 	if (crgetuid(cr) != va.va_uid) {
962 		/*
963 		 * This is a kludge to allow writes of files created
964 		 * with read only permission.  The owner of the file
965 		 * is always allowed to write it.
966 		 */
967 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
968 
969 		if (error) {
970 			VN_RELE(vp);
971 			ns->ns_status = puterrno(error);
972 			return;
973 		}
974 	}
975 
976 	/*
977 	 * Can't access a mandatory lock file.  This might cause
978 	 * the NFS service thread to block forever waiting for a
979 	 * lock to be released that will never be released.
980 	 */
981 	if (MANDLOCK(vp, va.va_mode)) {
982 		VN_RELE(vp);
983 		ns->ns_status = NFSERR_ACCES;
984 		return;
985 	}
986 
987 	/*
988 	 * We have to enter the critical region before calling VOP_RWLOCK
989 	 * to avoid a deadlock with ufs.
990 	 */
991 	if (nbl_need_check(vp)) {
992 		nbl_start_crit(vp, RW_READER);
993 		in_crit = 1;
994 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
995 		    wa->wa_count, 0, NULL)) {
996 			error = EACCES;
997 			goto out;
998 		}
999 	}
1000 
1001 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1002 
1003 	/* check if a monitor detected a delegation conflict */
1004 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1005 		VN_RELE(vp);
1006 		/* mark as wouldblock so response is dropped */
1007 		curthread->t_flag |= T_WOULDBLOCK;
1008 		return;
1009 	}
1010 
1011 	if (wa->wa_data || wa->wa_rlist) {
1012 		/* Do the RDMA thing if necessary */
1013 		if (wa->wa_rlist) {
1014 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1015 			iov[0].iov_len = wa->wa_count;
1016 		} else  {
1017 			iov[0].iov_base = wa->wa_data;
1018 			iov[0].iov_len = wa->wa_count;
1019 		}
1020 		uio.uio_iov = iov;
1021 		uio.uio_iovcnt = 1;
1022 		uio.uio_segflg = UIO_SYSSPACE;
1023 		uio.uio_extflg = UIO_COPY_DEFAULT;
1024 		uio.uio_loffset = (offset_t)wa->wa_offset;
1025 		uio.uio_resid = wa->wa_count;
1026 		/*
1027 		 * The limit is checked on the client. We
1028 		 * should allow any size writes here.
1029 		 */
1030 		uio.uio_llimit = curproc->p_fsz_ctl;
1031 		rlimit = uio.uio_llimit - wa->wa_offset;
1032 		if (rlimit < (rlim64_t)uio.uio_resid)
1033 			uio.uio_resid = (uint_t)rlimit;
1034 
1035 		/*
1036 		 * for now we assume no append mode
1037 		 */
1038 		/*
1039 		 * We're changing creds because VM may fault and we need
1040 		 * the cred of the current thread to be used if quota
1041 		 * checking is enabled.
1042 		 */
1043 		savecred = curthread->t_cred;
1044 		curthread->t_cred = cr;
1045 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1046 		curthread->t_cred = savecred;
1047 	} else {
1048 		iovcnt = 0;
1049 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1050 			iovcnt++;
1051 		if (iovcnt <= MAX_IOVECS) {
1052 #ifdef DEBUG
1053 			rfs_write_sync_hits++;
1054 #endif
1055 			iovp = iov;
1056 		} else {
1057 #ifdef DEBUG
1058 			rfs_write_sync_misses++;
1059 #endif
1060 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1061 		}
1062 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1063 		uio.uio_iov = iovp;
1064 		uio.uio_iovcnt = iovcnt;
1065 		uio.uio_segflg = UIO_SYSSPACE;
1066 		uio.uio_extflg = UIO_COPY_DEFAULT;
1067 		uio.uio_loffset = (offset_t)wa->wa_offset;
1068 		uio.uio_resid = wa->wa_count;
1069 		/*
1070 		 * The limit is checked on the client. We
1071 		 * should allow any size writes here.
1072 		 */
1073 		uio.uio_llimit = curproc->p_fsz_ctl;
1074 		rlimit = uio.uio_llimit - wa->wa_offset;
1075 		if (rlimit < (rlim64_t)uio.uio_resid)
1076 			uio.uio_resid = (uint_t)rlimit;
1077 
1078 		/*
1079 		 * For now we assume no append mode.
1080 		 */
1081 		/*
1082 		 * We're changing creds because VM may fault and we need
1083 		 * the cred of the current thread to be used if quota
1084 		 * checking is enabled.
1085 		 */
1086 		savecred = curthread->t_cred;
1087 		curthread->t_cred = cr;
1088 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1089 		curthread->t_cred = savecred;
1090 
1091 		if (iovp != iov)
1092 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1093 	}
1094 
1095 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1096 
1097 	if (!error) {
1098 		/*
1099 		 * Get attributes again so we send the latest mod
1100 		 * time to the client side for his cache.
1101 		 */
1102 		va.va_mask = AT_ALL;	/* now we want everything */
1103 
1104 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1105 
1106 		/* check for overflows */
1107 		if (!error) {
1108 			acl_perm(vp, exi, &va, cr);
1109 			error = vattr_to_nattr(&va, &ns->ns_attr);
1110 		}
1111 	}
1112 
1113 out:
1114 	if (in_crit)
1115 		nbl_end_crit(vp);
1116 	VN_RELE(vp);
1117 
1118 	/* check if a monitor detected a delegation conflict */
1119 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1120 		/* mark as wouldblock so response is dropped */
1121 		curthread->t_flag |= T_WOULDBLOCK;
1122 	else
1123 		ns->ns_status = puterrno(error);
1124 
1125 }
1126 
/*
 * One write request taking part in a write cluster.  rfs_write fills
 * one of these in for each request it handles.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the request */
	struct nfsattrstat *ns;		/* where the reply is stored */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the request */
	kthread_t *thread;		/* thread that queued the request */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1135 
/*
 * A cluster of write requests that share one file handle.  Clusters
 * are chained off rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* joiners wait here for their status */
	struct rfs_async_write *list;	/* requests, ordered by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster */
};
1142 
1143 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1144 static kmutex_t rfs_async_write_lock;
1145 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1146 
1147 #define	MAXCLIOVECS	42
1148 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1149 
1150 #ifdef DEBUG
1151 static int rfs_write_hits = 0;
1152 static int rfs_write_misses = 0;
1153 #endif
1154 
1155 /*
1156  * Write data to file.
1157  * Returns attributes of a file after writing some data to it.
 *
 * When rfs_write_async is zero the request is simply handed to
 * rfs_write_sync().  Otherwise requests for the same file handle are
 * gathered into a cluster.  A thread that finds an existing cluster for
 * its file handle inserts its request (ordered by wa_offset) and sleeps
 * on the cluster's condition variable until its ns_status is no longer
 * RFSWRITE_INITVAL.  A thread that finds no cluster creates one, using
 * descriptors on its own stack (nrpsp, nlpsp), and then processes every
 * request that joined the cluster: it checks permissions per request,
 * issues one VOP_WRITE per run of contiguous requests, flushes with
 * VOP_PUTPAGE/VOP_FSYNC if anything was written, stores the status and
 * attributes in each request's ns, and finally cv_broadcast()s the waiters.
 *
 * T_WOULDBLOCK set on the processing thread is copied to the t_flag of
 * the threads whose requests it handled.
 *
 *	wa	write arguments of this request
 *	ns	result (status and attributes) for this request
 *	exi	export the file handle belongs to
 *	req	RPC request, used for the read-only export check
 *	cr	credentials of the caller
1158  */
1159 void
1160 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1161 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1162 {
1163 	int error;
1164 	vnode_t *vp;
1165 	rlim64_t rlimit;
1166 	struct vattr va;
1167 	struct uio uio;
1168 	struct rfs_async_write_list *lp;
1169 	struct rfs_async_write_list *nlp;
1170 	struct rfs_async_write *rp;
1171 	struct rfs_async_write *nrp;
1172 	struct rfs_async_write *trp;
1173 	struct rfs_async_write *lrp;
1174 	int data_written;
1175 	int iovcnt;
1176 	mblk_t *m;
1177 	struct iovec *iovp;
1178 	struct iovec *niovp;
1179 	struct iovec iov[MAXCLIOVECS];
1180 	int count;
1181 	int rcount;
1182 	uint_t off;
1183 	uint_t len;
1184 	struct rfs_async_write nrpsp;
1185 	struct rfs_async_write_list nlpsp;
1186 	ushort_t t_flag;
1187 	cred_t *savecred;
1188 	int in_crit = 0;
1189 	caller_context_t ct;
1190 
1191 	if (!rfs_write_async) {
1192 		rfs_write_sync(wa, ns, exi, req, cr);
1193 		return;
1194 	}
1195 
1196 	/*
1197 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1198 	 * is considered an OK.
1199 	 */
1200 	ns->ns_status = RFSWRITE_INITVAL;
1201 
	/* Describe this request; the descriptor lives on this thread's stack. */
1202 	nrp = &nrpsp;
1203 	nrp->wa = wa;
1204 	nrp->ns = ns;
1205 	nrp->req = req;
1206 	nrp->cr = cr;
1207 	nrp->thread = curthread;
1208 
1209 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1210 
1211 	/*
1212 	 * Look to see if there is already a cluster started
1213 	 * for this file.
1214 	 */
1215 	mutex_enter(&rfs_async_write_lock);
1216 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1217 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1218 		    sizeof (fhandle_t)) == 0)
1219 			break;
1220 	}
1221 
1222 	/*
1223 	 * If lp is non-NULL, then there is already a cluster
1224 	 * started.  We need to place ourselves in the cluster
1225 	 * list in the right place as determined by starting
1226 	 * offset.  Conflicts with non-blocking mandatory locked
1227 	 * regions will be checked when the cluster is processed.
1228 	 */
1229 	if (lp != NULL) {
1230 		rp = lp->list;
1231 		trp = NULL;
1232 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1233 			trp = rp;
1234 			rp = rp->list;
1235 		}
1236 		nrp->list = rp;
1237 		if (trp == NULL)
1238 			lp->list = nrp;
1239 		else
1240 			trp->list = nrp;
		/*
		 * Sleep until the thread processing the cluster has
		 * stored a status for this request.
		 */
1241 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1242 			cv_wait(&lp->cv, &rfs_async_write_lock);
1243 		mutex_exit(&rfs_async_write_lock);
1244 
1245 		return;
1246 	}
1247 
1248 	/*
1249 	 * No cluster started yet, start one and add ourselves
1250 	 * to the list of clusters.
1251 	 */
1252 	nrp->list = NULL;
1253 
1254 	nlp = &nlpsp;
1255 	nlp->fhp = &wa->wa_fhandle;
1256 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1257 	nlp->list = nrp;
1258 	nlp->next = NULL;
1259 
	/* Append the new cluster to the tail of the cluster list. */
1260 	if (rfs_async_write_head == NULL) {
1261 		rfs_async_write_head = nlp;
1262 	} else {
1263 		lp = rfs_async_write_head;
1264 		while (lp->next != NULL)
1265 			lp = lp->next;
1266 		lp->next = nlp;
1267 	}
1268 	mutex_exit(&rfs_async_write_lock);
1269 
1270 	/*
1271 	 * Convert the file handle common to all of the requests
1272 	 * in this cluster to a vnode.
1273 	 */
1274 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1275 	if (vp == NULL) {
		/*
		 * Unlink the cluster, fail every request in it with
		 * NFSERR_STALE and wake the waiting threads.
		 */
1276 		mutex_enter(&rfs_async_write_lock);
1277 		if (rfs_async_write_head == nlp)
1278 			rfs_async_write_head = nlp->next;
1279 		else {
1280 			lp = rfs_async_write_head;
1281 			while (lp->next != nlp)
1282 				lp = lp->next;
1283 			lp->next = nlp->next;
1284 		}
1285 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1286 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1287 			rp->ns->ns_status = NFSERR_STALE;
1288 			rp->thread->t_flag |= t_flag;
1289 		}
1290 		cv_broadcast(&nlp->cv);
1291 		mutex_exit(&rfs_async_write_lock);
1292 
1293 		return;
1294 	}
1295 
1296 	/*
1297 	 * Can only write regular files.  Attempts to write any
1298 	 * other file types fail with EISDIR.
1299 	 */
1300 	if (vp->v_type != VREG) {
1301 		VN_RELE(vp);
1302 		mutex_enter(&rfs_async_write_lock);
1303 		if (rfs_async_write_head == nlp)
1304 			rfs_async_write_head = nlp->next;
1305 		else {
1306 			lp = rfs_async_write_head;
1307 			while (lp->next != nlp)
1308 				lp = lp->next;
1309 			lp->next = nlp->next;
1310 		}
1311 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1312 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1313 			rp->ns->ns_status = NFSERR_ISDIR;
1314 			rp->thread->t_flag |= t_flag;
1315 		}
1316 		cv_broadcast(&nlp->cv);
1317 		mutex_exit(&rfs_async_write_lock);
1318 
1319 		return;
1320 	}
1321 
1322 	/*
1323 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1324 	 * deadlock with ufs.
1325 	 */
1326 	if (nbl_need_check(vp)) {
1327 		nbl_start_crit(vp, RW_READER);
1328 		in_crit = 1;
1329 	}
1330 
1331 	ct.cc_sysid = 0;
1332 	ct.cc_pid = 0;
1333 	ct.cc_caller_id = nfs2_srv_caller_id;
1334 	ct.cc_flags = CC_DONTBLOCK;
1335 
1336 	/*
1337 	 * Lock the file for writing.  This operation provides
1338 	 * the delay which allows clusters to grow.
1339 	 */
1340 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1341 
1342 	/* check if a monitor detected a delegation conflict */
1343 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1344 		if (in_crit)
1345 			nbl_end_crit(vp);
1346 		VN_RELE(vp);
1347 		/* mark as wouldblock so response is dropped */
1348 		curthread->t_flag |= T_WOULDBLOCK;
1349 		mutex_enter(&rfs_async_write_lock);
1350 		if (rfs_async_write_head == nlp)
1351 			rfs_async_write_head = nlp->next;
1352 		else {
1353 			lp = rfs_async_write_head;
1354 			while (lp->next != nlp)
1355 				lp = lp->next;
1356 			lp->next = nlp->next;
1357 		}
1358 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1359 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1360 				rp->ns->ns_status = puterrno(error);
1361 				rp->thread->t_flag |= T_WOULDBLOCK;
1362 			}
1363 		}
1364 		cv_broadcast(&nlp->cv);
1365 		mutex_exit(&rfs_async_write_lock);
1366 
1367 		return;
1368 	}
1369 
1370 	/*
1371 	 * Disconnect this cluster from the list of clusters.
1372 	 * The cluster that is being dealt with must be fixed
1373 	 * in size after this point, so there is no reason
1374 	 * to leave it on the list so that new requests can
1375 	 * find it.
1376 	 *
1377 	 * The algorithm is that the first write request will
1378 	 * create a cluster, convert the file handle to a
1379 	 * vnode pointer, and then lock the file for writing.
1380 	 * This request is not likely to be clustered with
1381 	 * any others.  However, the next request will create
1382 	 * a new cluster and be blocked in VOP_RWLOCK while
1383 	 * the first request is being processed.  This delay
1384 	 * will allow more requests to be clustered in this
1385 	 * second cluster.
1386 	 */
1387 	mutex_enter(&rfs_async_write_lock);
1388 	if (rfs_async_write_head == nlp)
1389 		rfs_async_write_head = nlp->next;
1390 	else {
1391 		lp = rfs_async_write_head;
1392 		while (lp->next != nlp)
1393 			lp = lp->next;
1394 		lp->next = nlp->next;
1395 	}
1396 	mutex_exit(&rfs_async_write_lock);
1397 
1398 	/*
1399 	 * Step through the list of requests in this cluster.
1400 	 * We need to check permissions to make sure that all
1401 	 * of the requests have sufficient permission to write
1402 	 * the file.  A cluster can be composed of requests
1403 	 * from different clients and different users on each
1404 	 * client.
1405 	 *
1406 	 * As a side effect, we also calculate the size of the
1407 	 * byte range that this cluster encompasses.
	 *
	 * Note that "continue" in this do/while jumps to the loop
	 * condition, which is what advances rp to the next request.
1408 	 */
1409 	rp = nlp->list;
1410 	off = rp->wa->wa_offset;
1411 	len = (uint_t)0;
1412 	do {
1413 		if (rdonly(exi, rp->req)) {
1414 			rp->ns->ns_status = NFSERR_ROFS;
1415 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1416 			rp->thread->t_flag |= t_flag;
1417 			continue;
1418 		}
1419 
1420 		va.va_mask = AT_UID|AT_MODE;
1421 
1422 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1423 
1424 		if (!error) {
1425 			if (crgetuid(rp->cr) != va.va_uid) {
1426 				/*
1427 				 * This is a kludge to allow writes of files
1428 				 * created with read only permission.  The
1429 				 * owner of the file is always allowed to
1430 				 * write it.
1431 				 */
1432 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1433 			}
1434 			if (!error && MANDLOCK(vp, va.va_mode))
1435 				error = EACCES;
1436 		}
1437 
1438 		/*
1439 		 * Check for a conflict with a nbmand-locked region.
1440 		 */
1441 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1442 		    rp->wa->wa_count, 0, NULL)) {
1443 			error = EACCES;
1444 		}
1445 
1446 		if (error) {
1447 			rp->ns->ns_status = puterrno(error);
1448 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1449 			rp->thread->t_flag |= t_flag;
1450 			continue;
1451 		}
		/* Grow len so that off + len reaches the end of this request. */
1452 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1453 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1454 	} while ((rp = rp->list) != NULL);
1455 
1456 	/*
1457 	 * Step through the cluster attempting to gather as many
1458 	 * requests which are contiguous as possible.  These
1459 	 * contiguous requests are handled via one call to VOP_WRITE
1460 	 * instead of different calls to VOP_WRITE.  We also keep
1461 	 * track of the fact that any data was written.
1462 	 */
1463 	rp = nlp->list;
1464 	data_written = 0;
1465 	do {
1466 		/*
1467 		 * Skip any requests which are already marked as having an
1468 		 * error.
1469 		 */
1470 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1471 			rp = rp->list;
1472 			continue;
1473 		}
1474 
1475 		/*
1476 		 * Count the number of iovec's which are required
1477 		 * to handle this set of requests.  One iovec is
1478 		 * needed for each data buffer, whether addressed
1479 		 * by wa_data or by the b_rptr pointers in the
1480 		 * mblk chains.
		 *
		 * On exit from the loop lrp is the first request that
		 * is not part of this contiguous run (or NULL).
1481 		 */
1482 		iovcnt = 0;
1483 		lrp = rp;
1484 		for (;;) {
1485 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1486 				iovcnt++;
1487 			else {
1488 				m = lrp->wa->wa_mblk;
1489 				while (m != NULL) {
1490 					iovcnt++;
1491 					m = m->b_cont;
1492 				}
1493 			}
1494 			if (lrp->list == NULL ||
1495 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1496 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1497 			    lrp->list->wa->wa_offset) {
1498 				lrp = lrp->list;
1499 				break;
1500 			}
1501 			lrp = lrp->list;
1502 		}
1503 
		/* Use the on-stack iovec array when it is large enough. */
1504 		if (iovcnt <= MAXCLIOVECS) {
1505 #ifdef DEBUG
1506 			rfs_write_hits++;
1507 #endif
1508 			niovp = iov;
1509 		} else {
1510 #ifdef DEBUG
1511 			rfs_write_misses++;
1512 #endif
1513 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1514 		}
1515 		/*
1516 		 * Put together the scatter/gather iovecs.
1517 		 */
1518 		iovp = niovp;
1519 		trp = rp;
1520 		count = 0;
1521 		do {
1522 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1523 				if (trp->wa->wa_rlist) {
1524 					iovp->iov_base =
1525 					    (char *)((trp->wa->wa_rlist)->
1526 					    u.c_daddr3);
1527 					iovp->iov_len = trp->wa->wa_count;
1528 				} else  {
1529 					iovp->iov_base = trp->wa->wa_data;
1530 					iovp->iov_len = trp->wa->wa_count;
1531 				}
1532 				iovp++;
1533 			} else {
1534 				m = trp->wa->wa_mblk;
1535 				rcount = trp->wa->wa_count;
1536 				while (m != NULL) {
1537 					iovp->iov_base = (caddr_t)m->b_rptr;
1538 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1539 					rcount -= iovp->iov_len;
					/*
					 * A negative rcount means this mblk
					 * holds more than the remaining
					 * wa_count; trim the iovec by the
					 * excess.
					 */
1540 					if (rcount < 0)
1541 						iovp->iov_len += rcount;
1542 					iovp++;
1543 					if (rcount <= 0)
1544 						break;
1545 					m = m->b_cont;
1546 				}
1547 			}
1548 			count += trp->wa->wa_count;
1549 			trp = trp->list;
1550 		} while (trp != lrp);
1551 
1552 		uio.uio_iov = niovp;
1553 		uio.uio_iovcnt = iovcnt;
1554 		uio.uio_segflg = UIO_SYSSPACE;
1555 		uio.uio_extflg = UIO_COPY_DEFAULT;
1556 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1557 		uio.uio_resid = count;
1558 		/*
1559 		 * The limit is checked on the client. We
1560 		 * should allow any size writes here.
1561 		 */
1562 		uio.uio_llimit = curproc->p_fsz_ctl;
1563 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1564 		if (rlimit < (rlim64_t)uio.uio_resid)
1565 			uio.uio_resid = (uint_t)rlimit;
1566 
1567 		/*
1568 		 * For now we assume no append mode.
1569 		 */
1570 
1571 		/*
1572 		 * We're changing creds because VM may fault
1573 		 * and we need the cred of the current
1574 		 * thread to be used if quota * checking is
1575 		 * enabled.
1576 		 */
1577 		savecred = curthread->t_cred;
1578 		curthread->t_cred = cr;
1579 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1580 		curthread->t_cred = savecred;
1581 
1582 		/* check if a monitor detected a delegation conflict */
1583 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1584 			/* mark as wouldblock so response is dropped */
1585 			curthread->t_flag |= T_WOULDBLOCK;
1586 
1587 		if (niovp != iov)
1588 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1589 
1590 		if (!error) {
1591 			data_written = 1;
1592 			/*
1593 			 * Get attributes again so we send the latest mod
1594 			 * time to the client side for his cache.
1595 			 */
1596 			va.va_mask = AT_ALL;	/* now we want everything */
1597 
1598 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1599 
1600 			if (!error)
1601 				acl_perm(vp, exi, &va, rp->cr);
1602 		}
1603 
1604 		/*
1605 		 * Fill in the status responses for each request
1606 		 * which was just handled.  Also, copy the latest
1607 		 * attributes in to the attribute responses if
1608 		 * appropriate.
		 *
		 * Once error becomes non-zero (including from
		 * vattr_to_nattr()) it is reported for the remaining
		 * requests of this run as well.
1609 		 */
1610 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1611 		do {
1612 			rp->thread->t_flag |= t_flag;
1613 			/* check for overflows */
1614 			if (!error) {
1615 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1616 			}
1617 			rp->ns->ns_status = puterrno(error);
1618 			rp = rp->list;
1619 		} while (rp != lrp);
1620 	} while (rp != NULL);
1621 
1622 	/*
1623 	 * If any data was written at all, then we need to flush
1624 	 * the data and metadata to stable storage.
1625 	 */
1626 	if (data_written) {
1627 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1628 
1629 		if (!error) {
1630 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1631 		}
1632 	}
1633 
1634 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1635 
1636 	if (in_crit)
1637 		nbl_end_crit(vp);
1638 	VN_RELE(vp);
1639 
	/*
	 * Give any request that still has no status the current value
	 * of error, then wake every thread waiting on this cluster.
	 */
1640 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1641 	mutex_enter(&rfs_async_write_lock);
1642 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1643 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1644 			rp->ns->ns_status = puterrno(error);
1645 			rp->thread->t_flag |= t_flag;
1646 		}
1647 	}
1648 	cv_broadcast(&nlp->cv);
1649 	mutex_exit(&rfs_async_write_lock);
1650 
1651 }
1652 
1653 void *
1654 rfs_write_getfh(struct nfswriteargs *wa)
1655 {
1656 	return (&wa->wa_fhandle);
1657 }
1658 
1659 /*
1660  * Create a file.
1661  * Creates a file with given attributes and returns those attributes
1662  * and an fhandle for the new file.
1663  */
1664 void
1665 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1666 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1667 {
1668 	int error;
1669 	int lookuperr;
1670 	int in_crit = 0;
1671 	struct vattr va;
1672 	vnode_t *vp;
1673 	vnode_t *realvp;
1674 	vnode_t *dvp;
1675 	char *name = args->ca_da.da_name;
1676 	vnode_t *tvp = NULL;
1677 	int mode;
1678 	int lookup_ok;
1679 	bool_t trunc;
1680 	struct sockaddr *ca;
1681 
1682 	/*
1683 	 * Disallow NULL paths
1684 	 */
1685 	if (name == NULL || *name == '\0') {
1686 		dr->dr_status = NFSERR_ACCES;
1687 		return;
1688 	}
1689 
1690 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1691 	if (dvp == NULL) {
1692 		dr->dr_status = NFSERR_STALE;
1693 		return;
1694 	}
1695 
1696 	error = sattr_to_vattr(args->ca_sa, &va);
1697 	if (error) {
1698 		dr->dr_status = puterrno(error);
1699 		return;
1700 	}
1701 
1702 	/*
1703 	 * Must specify the mode.
1704 	 */
1705 	if (!(va.va_mask & AT_MODE)) {
1706 		VN_RELE(dvp);
1707 		dr->dr_status = NFSERR_INVAL;
1708 		return;
1709 	}
1710 
1711 	/*
1712 	 * This is a completely gross hack to make mknod
1713 	 * work over the wire until we can wack the protocol
1714 	 */
1715 	if ((va.va_mode & IFMT) == IFCHR) {
1716 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1717 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1718 		else {
1719 			va.va_type = VCHR;
1720 			/*
1721 			 * uncompress the received dev_t
1722 			 * if the top half is zero indicating a request
1723 			 * from an `older style' OS.
1724 			 */
1725 			if ((va.va_size & 0xffff0000) == 0)
1726 				va.va_rdev = nfsv2_expdev(va.va_size);
1727 			else
1728 				va.va_rdev = (dev_t)va.va_size;
1729 		}
1730 		va.va_mask &= ~AT_SIZE;
1731 	} else if ((va.va_mode & IFMT) == IFBLK) {
1732 		va.va_type = VBLK;
1733 		/*
1734 		 * uncompress the received dev_t
1735 		 * if the top half is zero indicating a request
1736 		 * from an `older style' OS.
1737 		 */
1738 		if ((va.va_size & 0xffff0000) == 0)
1739 			va.va_rdev = nfsv2_expdev(va.va_size);
1740 		else
1741 			va.va_rdev = (dev_t)va.va_size;
1742 		va.va_mask &= ~AT_SIZE;
1743 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1744 		va.va_type = VSOCK;
1745 	} else {
1746 		va.va_type = VREG;
1747 	}
1748 	va.va_mode &= ~IFMT;
1749 	va.va_mask |= AT_TYPE;
1750 
1751 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1752 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1753 	    MAXPATHLEN);
1754 	if (name == NULL) {
1755 		dr->dr_status = puterrno(EINVAL);
1756 		return;
1757 	}
1758 
1759 	/*
1760 	 * Why was the choice made to use VWRITE as the mode to the
1761 	 * call to VOP_CREATE ? This results in a bug.  When a client
1762 	 * opens a file that already exists and is RDONLY, the second
1763 	 * open fails with an EACESS because of the mode.
1764 	 * bug ID 1054648.
1765 	 */
1766 	lookup_ok = 0;
1767 	mode = VWRITE;
1768 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1769 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1770 		    NULL, NULL, NULL);
1771 		if (!error) {
1772 			struct vattr at;
1773 
1774 			lookup_ok = 1;
1775 			at.va_mask = AT_MODE;
1776 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1777 			if (!error)
1778 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1779 			VN_RELE(tvp);
1780 			tvp = NULL;
1781 		}
1782 	}
1783 
1784 	if (!lookup_ok) {
1785 		if (rdonly(exi, req)) {
1786 			error = EROFS;
1787 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1788 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1789 			error = EPERM;
1790 		} else {
1791 			error = 0;
1792 		}
1793 	}
1794 
1795 	/*
1796 	 * If file size is being modified on an already existing file
1797 	 * make sure that there are no conflicting non-blocking mandatory
1798 	 * locks in the region being manipulated. Return EACCES if there
1799 	 * are conflicting locks.
1800 	 */
1801 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1802 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1803 		    NULL, NULL, NULL);
1804 
1805 		if (!lookuperr &&
1806 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1807 			VN_RELE(tvp);
1808 			curthread->t_flag |= T_WOULDBLOCK;
1809 			goto out;
1810 		}
1811 
1812 		if (!lookuperr && nbl_need_check(tvp)) {
1813 			/*
1814 			 * The file exists. Now check if it has any
1815 			 * conflicting non-blocking mandatory locks
1816 			 * in the region being changed.
1817 			 */
1818 			struct vattr bva;
1819 			u_offset_t offset;
1820 			ssize_t length;
1821 
1822 			nbl_start_crit(tvp, RW_READER);
1823 			in_crit = 1;
1824 
1825 			bva.va_mask = AT_SIZE;
1826 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1827 			if (!error) {
1828 				if (va.va_size < bva.va_size) {
1829 					offset = va.va_size;
1830 					length = bva.va_size - va.va_size;
1831 				} else {
1832 					offset = bva.va_size;
1833 					length = va.va_size - bva.va_size;
1834 				}
1835 				if (length) {
1836 					if (nbl_conflict(tvp, NBL_WRITE,
1837 					    offset, length, 0, NULL)) {
1838 						error = EACCES;
1839 					}
1840 				}
1841 			}
1842 			if (error) {
1843 				nbl_end_crit(tvp);
1844 				VN_RELE(tvp);
1845 				in_crit = 0;
1846 			}
1847 		} else if (tvp != NULL) {
1848 			VN_RELE(tvp);
1849 		}
1850 	}
1851 
1852 	if (!error) {
1853 		/*
1854 		 * If filesystem is shared with nosuid the remove any
1855 		 * setuid/setgid bits on create.
1856 		 */
1857 		if (va.va_type == VREG &&
1858 		    exi->exi_export.ex_flags & EX_NOSUID)
1859 			va.va_mode &= ~(VSUID | VSGID);
1860 
1861 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1862 		    NULL, NULL);
1863 
1864 		if (!error) {
1865 
1866 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1867 				trunc = TRUE;
1868 			else
1869 				trunc = FALSE;
1870 
1871 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1872 				VN_RELE(vp);
1873 				curthread->t_flag |= T_WOULDBLOCK;
1874 				goto out;
1875 			}
1876 			va.va_mask = AT_ALL;
1877 
1878 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1879 
1880 			/* check for overflows */
1881 			if (!error) {
1882 				acl_perm(vp, exi, &va, cr);
1883 				error = vattr_to_nattr(&va, &dr->dr_attr);
1884 				if (!error) {
1885 					error = makefh(&dr->dr_fhandle, vp,
1886 					    exi);
1887 				}
1888 			}
1889 			/*
1890 			 * Force modified metadata out to stable storage.
1891 			 *
1892 			 * if a underlying vp exists, pass it to VOP_FSYNC
1893 			 */
1894 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1895 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1896 			else
1897 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1898 			VN_RELE(vp);
1899 		}
1900 
1901 		if (in_crit) {
1902 			nbl_end_crit(tvp);
1903 			VN_RELE(tvp);
1904 		}
1905 	}
1906 
1907 	/*
1908 	 * Force modified data and metadata out to stable storage.
1909 	 */
1910 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1911 
1912 out:
1913 
1914 	VN_RELE(dvp);
1915 
1916 	dr->dr_status = puterrno(error);
1917 
1918 	if (name != args->ca_da.da_name)
1919 		kmem_free(name, MAXPATHLEN);
1920 }
1921 void *
1922 rfs_create_getfh(struct nfscreatargs *args)
1923 {
1924 	return (args->ca_da.da_fhandle);
1925 }
1926 
1927 /*
1928  * Remove a file.
1929  * Remove named file from parent directory.
1930  */
1931 void
1932 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1933 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1934 {
1935 	int error = 0;
1936 	vnode_t *vp;
1937 	vnode_t *targvp;
1938 	int in_crit = 0;
1939 
1940 	/*
1941 	 * Disallow NULL paths
1942 	 */
1943 	if (da->da_name == NULL || *da->da_name == '\0') {
1944 		*status = NFSERR_ACCES;
1945 		return;
1946 	}
1947 
1948 	vp = nfs_fhtovp(da->da_fhandle, exi);
1949 	if (vp == NULL) {
1950 		*status = NFSERR_STALE;
1951 		return;
1952 	}
1953 
1954 	if (rdonly(exi, req)) {
1955 		VN_RELE(vp);
1956 		*status = NFSERR_ROFS;
1957 		return;
1958 	}
1959 
1960 	/*
1961 	 * Check for a conflict with a non-blocking mandatory share reservation.
1962 	 */
1963 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1964 	    NULL, cr, NULL, NULL, NULL);
1965 	if (error != 0) {
1966 		VN_RELE(vp);
1967 		*status = puterrno(error);
1968 		return;
1969 	}
1970 
1971 	/*
1972 	 * If the file is delegated to an v4 client, then initiate
1973 	 * recall and drop this request (by setting T_WOULDBLOCK).
1974 	 * The client will eventually re-transmit the request and
1975 	 * (hopefully), by then, the v4 client will have returned
1976 	 * the delegation.
1977 	 */
1978 
1979 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1980 		VN_RELE(vp);
1981 		VN_RELE(targvp);
1982 		curthread->t_flag |= T_WOULDBLOCK;
1983 		return;
1984 	}
1985 
1986 	if (nbl_need_check(targvp)) {
1987 		nbl_start_crit(targvp, RW_READER);
1988 		in_crit = 1;
1989 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1990 			error = EACCES;
1991 			goto out;
1992 		}
1993 	}
1994 
1995 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
1996 
1997 	/*
1998 	 * Force modified data and metadata out to stable storage.
1999 	 */
2000 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2001 
2002 out:
2003 	if (in_crit)
2004 		nbl_end_crit(targvp);
2005 	VN_RELE(targvp);
2006 	VN_RELE(vp);
2007 
2008 	*status = puterrno(error);
2009 
2010 }
2011 
2012 void *
2013 rfs_remove_getfh(struct nfsdiropargs *da)
2014 {
2015 	return (da->da_fhandle);
2016 }
2017 
2018 /*
2019  * rename a file
2020  * Give a file (from) a new name (to).
2021  */
2022 void
2023 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2024 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2025 {
2026 	int error = 0;
2027 	vnode_t *fromvp;
2028 	vnode_t *tovp;
2029 	struct exportinfo *to_exi;
2030 	fhandle_t *fh;
2031 	vnode_t *srcvp;
2032 	vnode_t *targvp;
2033 	int in_crit = 0;
2034 
2035 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2036 	if (fromvp == NULL) {
2037 		*status = NFSERR_STALE;
2038 		return;
2039 	}
2040 
2041 	fh = args->rna_to.da_fhandle;
2042 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2043 	if (to_exi == NULL) {
2044 		VN_RELE(fromvp);
2045 		*status = NFSERR_ACCES;
2046 		return;
2047 	}
2048 	exi_rele(to_exi);
2049 
2050 	if (to_exi != exi) {
2051 		VN_RELE(fromvp);
2052 		*status = NFSERR_XDEV;
2053 		return;
2054 	}
2055 
2056 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2057 	if (tovp == NULL) {
2058 		VN_RELE(fromvp);
2059 		*status = NFSERR_STALE;
2060 		return;
2061 	}
2062 
2063 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2064 		VN_RELE(tovp);
2065 		VN_RELE(fromvp);
2066 		*status = NFSERR_NOTDIR;
2067 		return;
2068 	}
2069 
2070 	/*
2071 	 * Disallow NULL paths
2072 	 */
2073 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2074 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2075 		VN_RELE(tovp);
2076 		VN_RELE(fromvp);
2077 		*status = NFSERR_ACCES;
2078 		return;
2079 	}
2080 
2081 	if (rdonly(exi, req)) {
2082 		VN_RELE(tovp);
2083 		VN_RELE(fromvp);
2084 		*status = NFSERR_ROFS;
2085 		return;
2086 	}
2087 
2088 	/*
2089 	 * Check for a conflict with a non-blocking mandatory share reservation.
2090 	 */
2091 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2092 	    NULL, cr, NULL, NULL, NULL);
2093 	if (error != 0) {
2094 		VN_RELE(tovp);
2095 		VN_RELE(fromvp);
2096 		*status = puterrno(error);
2097 		return;
2098 	}
2099 
2100 	/* Check for delegations on the source file */
2101 
2102 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2103 		VN_RELE(tovp);
2104 		VN_RELE(fromvp);
2105 		VN_RELE(srcvp);
2106 		curthread->t_flag |= T_WOULDBLOCK;
2107 		return;
2108 	}
2109 
2110 	/* Check for delegation on the file being renamed over, if it exists */
2111 
2112 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2113 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2114 	    NULL, NULL, NULL) == 0) {
2115 
2116 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117 			VN_RELE(tovp);
2118 			VN_RELE(fromvp);
2119 			VN_RELE(srcvp);
2120 			VN_RELE(targvp);
2121 			curthread->t_flag |= T_WOULDBLOCK;
2122 			return;
2123 		}
2124 		VN_RELE(targvp);
2125 	}
2126 
2127 
2128 	if (nbl_need_check(srcvp)) {
2129 		nbl_start_crit(srcvp, RW_READER);
2130 		in_crit = 1;
2131 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2132 			error = EACCES;
2133 			goto out;
2134 		}
2135 	}
2136 
2137 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2138 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2139 
2140 	if (error == 0)
2141 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2142 		    strlen(args->rna_to.da_name));
2143 
2144 	/*
2145 	 * Force modified data and metadata out to stable storage.
2146 	 */
2147 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2148 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2149 
2150 out:
2151 	if (in_crit)
2152 		nbl_end_crit(srcvp);
2153 	VN_RELE(srcvp);
2154 	VN_RELE(tovp);
2155 	VN_RELE(fromvp);
2156 
2157 	*status = puterrno(error);
2158 
2159 }
2160 void *
2161 rfs_rename_getfh(struct nfsrnmargs *args)
2162 {
2163 	return (args->rna_from.da_fhandle);
2164 }
2165 
2166 /*
2167  * Link to a file.
2168  * Create a file (to) which is a hard link to the given file (from).
2169  */
2170 void
2171 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2172 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2173 {
2174 	int error;
2175 	vnode_t *fromvp;
2176 	vnode_t *tovp;
2177 	struct exportinfo *to_exi;
2178 	fhandle_t *fh;
2179 
2180 	fromvp = nfs_fhtovp(args->la_from, exi);
2181 	if (fromvp == NULL) {
2182 		*status = NFSERR_STALE;
2183 		return;
2184 	}
2185 
2186 	fh = args->la_to.da_fhandle;
2187 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2188 	if (to_exi == NULL) {
2189 		VN_RELE(fromvp);
2190 		*status = NFSERR_ACCES;
2191 		return;
2192 	}
2193 	exi_rele(to_exi);
2194 
2195 	if (to_exi != exi) {
2196 		VN_RELE(fromvp);
2197 		*status = NFSERR_XDEV;
2198 		return;
2199 	}
2200 
2201 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2202 	if (tovp == NULL) {
2203 		VN_RELE(fromvp);
2204 		*status = NFSERR_STALE;
2205 		return;
2206 	}
2207 
2208 	if (tovp->v_type != VDIR) {
2209 		VN_RELE(tovp);
2210 		VN_RELE(fromvp);
2211 		*status = NFSERR_NOTDIR;
2212 		return;
2213 	}
2214 	/*
2215 	 * Disallow NULL paths
2216 	 */
2217 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2218 		VN_RELE(tovp);
2219 		VN_RELE(fromvp);
2220 		*status = NFSERR_ACCES;
2221 		return;
2222 	}
2223 
2224 	if (rdonly(exi, req)) {
2225 		VN_RELE(tovp);
2226 		VN_RELE(fromvp);
2227 		*status = NFSERR_ROFS;
2228 		return;
2229 	}
2230 
2231 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2232 
2233 	/*
2234 	 * Force modified data and metadata out to stable storage.
2235 	 */
2236 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2237 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2238 
2239 	VN_RELE(tovp);
2240 	VN_RELE(fromvp);
2241 
2242 	*status = puterrno(error);
2243 
2244 }
2245 void *
2246 rfs_link_getfh(struct nfslinkargs *args)
2247 {
2248 	return (args->la_from);
2249 }
2250 
2251 /*
2252  * Symbolicly link to a file.
2253  * Create a file (to) with the given attributes which is a symbolic link
2254  * to the given path name (to).
2255  */
2256 void
2257 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2258 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2259 {
2260 	int error;
2261 	struct vattr va;
2262 	vnode_t *vp;
2263 	vnode_t *svp;
2264 	int lerror;
2265 	struct sockaddr *ca;
2266 	char *name = NULL;
2267 
2268 	/*
2269 	 * Disallow NULL paths
2270 	 */
2271 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2272 		*status = NFSERR_ACCES;
2273 		return;
2274 	}
2275 
2276 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2277 	if (vp == NULL) {
2278 		*status = NFSERR_STALE;
2279 		return;
2280 	}
2281 
2282 	if (rdonly(exi, req)) {
2283 		VN_RELE(vp);
2284 		*status = NFSERR_ROFS;
2285 		return;
2286 	}
2287 
2288 	error = sattr_to_vattr(args->sla_sa, &va);
2289 	if (error) {
2290 		VN_RELE(vp);
2291 		*status = puterrno(error);
2292 		return;
2293 	}
2294 
2295 	if (!(va.va_mask & AT_MODE)) {
2296 		VN_RELE(vp);
2297 		*status = NFSERR_INVAL;
2298 		return;
2299 	}
2300 
2301 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2302 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2303 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2304 
2305 	if (name == NULL) {
2306 		*status = NFSERR_ACCES;
2307 		return;
2308 	}
2309 
2310 	va.va_type = VLNK;
2311 	va.va_mask |= AT_TYPE;
2312 
2313 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2314 
2315 	/*
2316 	 * Force new data and metadata out to stable storage.
2317 	 */
2318 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2319 	    NULL, cr, NULL, NULL, NULL);
2320 
2321 	if (!lerror) {
2322 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2323 		VN_RELE(svp);
2324 	}
2325 
2326 	/*
2327 	 * Force modified data and metadata out to stable storage.
2328 	 */
2329 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2330 
2331 	VN_RELE(vp);
2332 
2333 	*status = puterrno(error);
2334 	if (name != args->sla_tnm)
2335 		kmem_free(name, MAXPATHLEN);
2336 
2337 }
2338 void *
2339 rfs_symlink_getfh(struct nfsslargs *args)
2340 {
2341 	return (args->sla_from.da_fhandle);
2342 }
2343 
2344 /*
2345  * Make a directory.
2346  * Create a directory with the given name, parent directory, and attributes.
2347  * Returns a file handle and attributes for the new directory.
2348  */
2349 void
2350 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2351 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2352 {
2353 	int error;
2354 	struct vattr va;
2355 	vnode_t *dvp = NULL;
2356 	vnode_t *vp;
2357 	char *name = args->ca_da.da_name;
2358 
2359 	/*
2360 	 * Disallow NULL paths
2361 	 */
2362 	if (name == NULL || *name == '\0') {
2363 		dr->dr_status = NFSERR_ACCES;
2364 		return;
2365 	}
2366 
2367 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2368 	if (vp == NULL) {
2369 		dr->dr_status = NFSERR_STALE;
2370 		return;
2371 	}
2372 
2373 	if (rdonly(exi, req)) {
2374 		VN_RELE(vp);
2375 		dr->dr_status = NFSERR_ROFS;
2376 		return;
2377 	}
2378 
2379 	error = sattr_to_vattr(args->ca_sa, &va);
2380 	if (error) {
2381 		VN_RELE(vp);
2382 		dr->dr_status = puterrno(error);
2383 		return;
2384 	}
2385 
2386 	if (!(va.va_mask & AT_MODE)) {
2387 		VN_RELE(vp);
2388 		dr->dr_status = NFSERR_INVAL;
2389 		return;
2390 	}
2391 
2392 	va.va_type = VDIR;
2393 	va.va_mask |= AT_TYPE;
2394 
2395 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2396 
2397 	if (!error) {
2398 		/*
2399 		 * Attribtutes of the newly created directory should
2400 		 * be returned to the client.
2401 		 */
2402 		va.va_mask = AT_ALL; /* We want everything */
2403 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2404 
2405 		/* check for overflows */
2406 		if (!error) {
2407 			acl_perm(vp, exi, &va, cr);
2408 			error = vattr_to_nattr(&va, &dr->dr_attr);
2409 			if (!error) {
2410 				error = makefh(&dr->dr_fhandle, dvp, exi);
2411 			}
2412 		}
2413 		/*
2414 		 * Force new data and metadata out to stable storage.
2415 		 */
2416 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2417 		VN_RELE(dvp);
2418 	}
2419 
2420 	/*
2421 	 * Force modified data and metadata out to stable storage.
2422 	 */
2423 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2424 
2425 	VN_RELE(vp);
2426 
2427 	dr->dr_status = puterrno(error);
2428 
2429 }
2430 void *
2431 rfs_mkdir_getfh(struct nfscreatargs *args)
2432 {
2433 	return (args->ca_da.da_fhandle);
2434 }
2435 
2436 /*
2437  * Remove a directory.
2438  * Remove the given directory name from the given parent directory.
2439  */
2440 void
2441 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2442 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2443 {
2444 	int error;
2445 	vnode_t *vp;
2446 
2447 
2448 	/*
2449 	 * Disallow NULL paths
2450 	 */
2451 	if (da->da_name == NULL || *da->da_name == '\0') {
2452 		*status = NFSERR_ACCES;
2453 		return;
2454 	}
2455 
2456 	vp = nfs_fhtovp(da->da_fhandle, exi);
2457 	if (vp == NULL) {
2458 		*status = NFSERR_STALE;
2459 		return;
2460 	}
2461 
2462 	if (rdonly(exi, req)) {
2463 		VN_RELE(vp);
2464 		*status = NFSERR_ROFS;
2465 		return;
2466 	}
2467 
2468 	/*
2469 	 * VOP_RMDIR now takes a new third argument (the current
2470 	 * directory of the process).  That's because someone
2471 	 * wants to return EINVAL if one tries to remove ".".
2472 	 * Of course, NFS servers have no idea what their
2473 	 * clients' current directories are.  We fake it by
2474 	 * supplying a vnode known to exist and illegal to
2475 	 * remove.
2476 	 */
2477 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2478 
2479 	/*
2480 	 * Force modified data and metadata out to stable storage.
2481 	 */
2482 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2483 
2484 	VN_RELE(vp);
2485 
2486 	/*
2487 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2488 	 * if the directory is not empty.  A System V NFS server
2489 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2490 	 * over the wire.
2491 	 */
2492 	if (error == EEXIST)
2493 		*status = NFSERR_NOTEMPTY;
2494 	else
2495 		*status = puterrno(error);
2496 
2497 }
2498 void *
2499 rfs_rmdir_getfh(struct nfsdiropargs *da)
2500 {
2501 	return (da->da_fhandle);
2502 }
2503 
2504 /* ARGSUSED */
2505 void
2506 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2507 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2508 {
2509 	int error;
2510 	int iseof;
2511 	struct iovec iov;
2512 	struct uio uio;
2513 	vnode_t *vp;
2514 	char *ndata = NULL;
2515 	struct sockaddr *ca;
2516 	size_t nents;
2517 	int ret;
2518 
2519 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2520 	if (vp == NULL) {
2521 		rd->rd_entries = NULL;
2522 		rd->rd_status = NFSERR_STALE;
2523 		return;
2524 	}
2525 
2526 	if (vp->v_type != VDIR) {
2527 		VN_RELE(vp);
2528 		rd->rd_entries = NULL;
2529 		rd->rd_status = NFSERR_NOTDIR;
2530 		return;
2531 	}
2532 
2533 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2534 
2535 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2536 
2537 	if (error) {
2538 		rd->rd_entries = NULL;
2539 		goto bad;
2540 	}
2541 
2542 	if (rda->rda_count == 0) {
2543 		rd->rd_entries = NULL;
2544 		rd->rd_size = 0;
2545 		rd->rd_eof = FALSE;
2546 		goto bad;
2547 	}
2548 
2549 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2550 
2551 	/*
2552 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2553 	 */
2554 	rd->rd_bufsize = (uint_t)rda->rda_count;
2555 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2556 
2557 	/*
2558 	 * Set up io vector to read directory data
2559 	 */
2560 	iov.iov_base = (caddr_t)rd->rd_entries;
2561 	iov.iov_len = rda->rda_count;
2562 	uio.uio_iov = &iov;
2563 	uio.uio_iovcnt = 1;
2564 	uio.uio_segflg = UIO_SYSSPACE;
2565 	uio.uio_extflg = UIO_COPY_CACHED;
2566 	uio.uio_loffset = (offset_t)rda->rda_offset;
2567 	uio.uio_resid = rda->rda_count;
2568 
2569 	/*
2570 	 * read directory
2571 	 */
2572 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2573 
2574 	/*
2575 	 * Clean up
2576 	 */
2577 	if (!error) {
2578 		/*
2579 		 * set size and eof
2580 		 */
2581 		if (uio.uio_resid == rda->rda_count) {
2582 			rd->rd_size = 0;
2583 			rd->rd_eof = TRUE;
2584 		} else {
2585 			rd->rd_size = (uint32_t)(rda->rda_count -
2586 			    uio.uio_resid);
2587 			rd->rd_eof = iseof ? TRUE : FALSE;
2588 		}
2589 	}
2590 
2591 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2592 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2593 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2594 	    rda->rda_count, &ndata);
2595 
2596 	if (ret != 0) {
2597 		size_t dropbytes;
2598 		/*
2599 		 * We had to drop one or more entries in order to fit
2600 		 * during the character conversion.  We need to patch
2601 		 * up the size and eof info.
2602 		 */
2603 		if (rd->rd_eof)
2604 			rd->rd_eof = FALSE;
2605 		dropbytes = nfscmd_dropped_entrysize(
2606 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2607 		rd->rd_size -= dropbytes;
2608 	}
2609 	if (ndata == NULL) {
2610 		ndata = (char *)rd->rd_entries;
2611 	} else if (ndata != (char *)rd->rd_entries) {
2612 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2613 		rd->rd_entries = (void *)ndata;
2614 		rd->rd_bufsize = rda->rda_count;
2615 	}
2616 
2617 bad:
2618 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2619 
2620 #if 0 /* notyet */
2621 	/*
2622 	 * Don't do this.  It causes local disk writes when just
2623 	 * reading the file and the overhead is deemed larger
2624 	 * than the benefit.
2625 	 */
2626 	/*
2627 	 * Force modified metadata out to stable storage.
2628 	 */
2629 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2630 #endif
2631 
2632 	VN_RELE(vp);
2633 
2634 	rd->rd_status = puterrno(error);
2635 
2636 }
2637 void *
2638 rfs_readdir_getfh(struct nfsrddirargs *rda)
2639 {
2640 	return (&rda->rda_fh);
2641 }
2642 void
2643 rfs_rddirfree(struct nfsrddirres *rd)
2644 {
2645 	if (rd->rd_entries != NULL)
2646 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2647 }
2648 
2649 /* ARGSUSED */
2650 void
2651 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2652 	struct svc_req *req, cred_t *cr)
2653 {
2654 	int error;
2655 	struct statvfs64 sb;
2656 	vnode_t *vp;
2657 
2658 	vp = nfs_fhtovp(fh, exi);
2659 	if (vp == NULL) {
2660 		fs->fs_status = NFSERR_STALE;
2661 		return;
2662 	}
2663 
2664 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2665 
2666 	if (!error) {
2667 		fs->fs_tsize = nfstsize();
2668 		fs->fs_bsize = sb.f_frsize;
2669 		fs->fs_blocks = sb.f_blocks;
2670 		fs->fs_bfree = sb.f_bfree;
2671 		fs->fs_bavail = sb.f_bavail;
2672 	}
2673 
2674 	VN_RELE(vp);
2675 
2676 	fs->fs_status = puterrno(error);
2677 
2678 }
2679 void *
2680 rfs_statfs_getfh(fhandle_t *fh)
2681 {
2682 	return (fh);
2683 }
2684 
2685 static int
2686 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2687 {
2688 	vap->va_mask = 0;
2689 
2690 	/*
2691 	 * There was a sign extension bug in some VFS based systems
2692 	 * which stored the mode as a short.  When it would get
2693 	 * assigned to a u_long, no sign extension would occur.
2694 	 * It needed to, but this wasn't noticed because sa_mode
2695 	 * would then get assigned back to the short, thus ignoring
2696 	 * the upper 16 bits of sa_mode.
2697 	 *
2698 	 * To make this implementation work for both broken
2699 	 * clients and good clients, we check for both versions
2700 	 * of the mode.
2701 	 */
2702 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2703 	    sa->sa_mode != (uint32_t)-1) {
2704 		vap->va_mask |= AT_MODE;
2705 		vap->va_mode = sa->sa_mode;
2706 	}
2707 	if (sa->sa_uid != (uint32_t)-1) {
2708 		vap->va_mask |= AT_UID;
2709 		vap->va_uid = sa->sa_uid;
2710 	}
2711 	if (sa->sa_gid != (uint32_t)-1) {
2712 		vap->va_mask |= AT_GID;
2713 		vap->va_gid = sa->sa_gid;
2714 	}
2715 	if (sa->sa_size != (uint32_t)-1) {
2716 		vap->va_mask |= AT_SIZE;
2717 		vap->va_size = sa->sa_size;
2718 	}
2719 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2720 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2721 #ifndef _LP64
2722 		/* return error if time overflow */
2723 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2724 			return (EOVERFLOW);
2725 #endif
2726 		vap->va_mask |= AT_ATIME;
2727 		/*
2728 		 * nfs protocol defines times as unsigned so don't extend sign,
2729 		 * unless sysadmin set nfs_allow_preepoch_time.
2730 		 */
2731 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2732 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2733 	}
2734 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2735 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2736 #ifndef _LP64
2737 		/* return error if time overflow */
2738 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2739 			return (EOVERFLOW);
2740 #endif
2741 		vap->va_mask |= AT_MTIME;
2742 		/*
2743 		 * nfs protocol defines times as unsigned so don't extend sign,
2744 		 * unless sysadmin set nfs_allow_preepoch_time.
2745 		 */
2746 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2747 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2748 	}
2749 	return (0);
2750 }
2751 
2752 static enum nfsftype vt_to_nf[] = {
2753 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2754 };
2755 
2756 /*
2757  * check the following fields for overflow: nodeid, size, and time.
2758  * There could be a problem when converting 64-bit LP64 fields
2759  * into 32-bit ones.  Return an error if there is an overflow.
2760  */
2761 int
2762 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2763 {
2764 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2765 	na->na_type = vt_to_nf[vap->va_type];
2766 
2767 	if (vap->va_mode == (unsigned short) -1)
2768 		na->na_mode = (uint32_t)-1;
2769 	else
2770 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2771 
2772 	if (vap->va_uid == (unsigned short)(-1))
2773 		na->na_uid = (uint32_t)(-1);
2774 	else if (vap->va_uid == UID_NOBODY)
2775 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2776 	else
2777 		na->na_uid = vap->va_uid;
2778 
2779 	if (vap->va_gid == (unsigned short)(-1))
2780 		na->na_gid = (uint32_t)-1;
2781 	else if (vap->va_gid == GID_NOBODY)
2782 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2783 	else
2784 		na->na_gid = vap->va_gid;
2785 
2786 	/*
2787 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2788 	 * vattr, but are bigger than 32 bit values supported?
2789 	 */
2790 	na->na_fsid = vap->va_fsid;
2791 
2792 	na->na_nodeid = vap->va_nodeid;
2793 
2794 	/*
2795 	 * Check to make sure that the nodeid is representable over the
2796 	 * wire without losing bits.
2797 	 */
2798 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2799 		return (EFBIG);
2800 	na->na_nlink = vap->va_nlink;
2801 
2802 	/*
2803 	 * Check for big files here, instead of at the caller.  See
2804 	 * comments in cstat for large special file explanation.
2805 	 */
2806 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2807 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2808 			return (EFBIG);
2809 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2810 			/* UNKNOWN_SIZE | OVERFLOW */
2811 			na->na_size = MAXOFF32_T;
2812 		} else
2813 			na->na_size = vap->va_size;
2814 	} else
2815 		na->na_size = vap->va_size;
2816 
2817 	/*
2818 	 * If the vnode times overflow the 32-bit times that NFS2
2819 	 * uses on the wire then return an error.
2820 	 */
2821 	if (!NFS_VAP_TIME_OK(vap)) {
2822 		return (EOVERFLOW);
2823 	}
2824 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2825 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2826 
2827 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2828 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2829 
2830 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2831 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2832 
2833 	/*
2834 	 * If the dev_t will fit into 16 bits then compress
2835 	 * it, otherwise leave it alone. See comments in
2836 	 * nfs_client.c.
2837 	 */
2838 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2839 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2840 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2841 	else
2842 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2843 
2844 	na->na_blocks = vap->va_nblocks;
2845 	na->na_blocksize = vap->va_blksize;
2846 
2847 	/*
2848 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2849 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2850 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2851 	 *
2852 	 * BUYER BEWARE:
2853 	 *  If you are porting the NFS to a non-Sun server, you probably
2854 	 *  don't want to include the following block of code.  The
2855 	 *  over-the-wire special file types will be changing with the
2856 	 *  NFS Protocol Revision.
2857 	 */
2858 	if (vap->va_type == VFIFO)
2859 		NA_SETFIFO(na);
2860 	return (0);
2861 }
2862 
2863 /*
2864  * acl v2 support: returns approximate permission.
2865  *	default: returns minimal permission (more restrictive)
2866  *	aclok: returns maximal permission (less restrictive)
2867  *	This routine changes the permissions that are alaredy in *va.
2868  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2869  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2870  */
2871 static void
2872 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2873 {
2874 	vsecattr_t	vsa;
2875 	int		aclcnt;
2876 	aclent_t	*aclentp;
2877 	mode_t		mask_perm;
2878 	mode_t		grp_perm;
2879 	mode_t		other_perm;
2880 	mode_t		other_orig;
2881 	int		error;
2882 
2883 	/* dont care default acl */
2884 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2885 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2886 
2887 	if (!error) {
2888 		aclcnt = vsa.vsa_aclcnt;
2889 		if (aclcnt > MIN_ACL_ENTRIES) {
2890 			/* non-trivial ACL */
2891 			aclentp = vsa.vsa_aclentp;
2892 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2893 				/* maximal permissions */
2894 				grp_perm = 0;
2895 				other_perm = 0;
2896 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2897 					switch (aclentp->a_type) {
2898 					case USER_OBJ:
2899 						break;
2900 					case USER:
2901 						grp_perm |=
2902 						    aclentp->a_perm << 3;
2903 						other_perm |= aclentp->a_perm;
2904 						break;
2905 					case GROUP_OBJ:
2906 						grp_perm |=
2907 						    aclentp->a_perm << 3;
2908 						break;
2909 					case GROUP:
2910 						other_perm |= aclentp->a_perm;
2911 						break;
2912 					case OTHER_OBJ:
2913 						other_orig = aclentp->a_perm;
2914 						break;
2915 					case CLASS_OBJ:
2916 						mask_perm = aclentp->a_perm;
2917 						break;
2918 					default:
2919 						break;
2920 					}
2921 				}
2922 				grp_perm &= mask_perm << 3;
2923 				other_perm &= mask_perm;
2924 				other_perm |= other_orig;
2925 
2926 			} else {
2927 				/* minimal permissions */
2928 				grp_perm = 070;
2929 				other_perm = 07;
2930 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2931 					switch (aclentp->a_type) {
2932 					case USER_OBJ:
2933 						break;
2934 					case USER:
2935 					case CLASS_OBJ:
2936 						grp_perm &=
2937 						    aclentp->a_perm << 3;
2938 						other_perm &=
2939 						    aclentp->a_perm;
2940 						break;
2941 					case GROUP_OBJ:
2942 						grp_perm &=
2943 						    aclentp->a_perm << 3;
2944 						break;
2945 					case GROUP:
2946 						other_perm &=
2947 						    aclentp->a_perm;
2948 						break;
2949 					case OTHER_OBJ:
2950 						other_perm &=
2951 						    aclentp->a_perm;
2952 						break;
2953 					default:
2954 						break;
2955 					}
2956 				}
2957 			}
2958 			/* copy to va */
2959 			va->va_mode &= ~077;
2960 			va->va_mode |= grp_perm | other_perm;
2961 		}
2962 		if (vsa.vsa_aclcnt)
2963 			kmem_free(vsa.vsa_aclentp,
2964 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2965 	}
2966 }
2967 
2968 void
2969 rfs_srvrinit(void)
2970 {
2971 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2972 	nfs2_srv_caller_id = fs_new_caller_id();
2973 }
2974 
2975 void
2976 rfs_srvrfini(void)
2977 {
2978 	mutex_destroy(&rfs_async_write_lock);
2979 }
2980 
2981 static int
2982 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2983 {
2984 	struct clist	*wcl;
2985 	int		wlist_len;
2986 	uint32_t	count = rr->rr_count;
2987 
2988 	wcl = ra->ra_wlist;
2989 
2990 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2991 		return (FALSE);
2992 	}
2993 
2994 	wcl = ra->ra_wlist;
2995 	rr->rr_ok.rrok_wlist_len = wlist_len;
2996 	rr->rr_ok.rrok_wlist = wcl;
2997 
2998 	return (TRUE);
2999 }
3000