xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 0b1b4412cfd6c4ac5467dbe6f4088dcec4f55fe8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_map.h>
66 #include <vm/seg_kmem.h>
67 
68 #include <sys/strsubr.h>
69 
70 /*
71  * These are the interface routines for the server side of the
72  * Network File System.  See the NFS version 2 protocol specification
73  * for a description of this interface.
74  */
75 
76 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
77 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
78 			cred_t *);
79 
80 /*
81  * Some "over the wire" UNIX file types.  These are encoded
82  * into the mode.  This needs to be fixed in the next rev.
83  */
84 #define	IFMT		0170000		/* type of file */
85 #define	IFCHR		0020000		/* character special */
86 #define	IFBLK		0060000		/* block special */
87 #define	IFSOCK		0140000		/* socket */
88 
89 u_longlong_t nfs2_srv_caller_id;
90 
91 /*
92  * Get file attributes.
93  * Returns the current attributes of the file with the given fhandle.
94  */
95 /* ARGSUSED */
96 void
97 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
98 	struct svc_req *req, cred_t *cr)
99 {
100 	int error;
101 	vnode_t *vp;
102 	struct vattr va;
103 
104 	vp = nfs_fhtovp(fhp, exi);
105 	if (vp == NULL) {
106 		ns->ns_status = NFSERR_STALE;
107 		return;
108 	}
109 
110 	/*
111 	 * Do the getattr.
112 	 */
113 	va.va_mask = AT_ALL;	/* we want all the attributes */
114 
115 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
116 
117 	/* check for overflows */
118 	if (!error) {
119 		acl_perm(vp, exi, &va, cr);
120 		error = vattr_to_nattr(&va, &ns->ns_attr);
121 	}
122 
123 	VN_RELE(vp);
124 
125 	ns->ns_status = puterrno(error);
126 }
127 void *
128 rfs_getattr_getfh(fhandle_t *fhp)
129 {
130 	return (fhp);
131 }
132 
133 /*
134  * Set file attributes.
135  * Sets the attributes of the file with the given fhandle.  Returns
136  * the new attributes.
137  */
138 void
139 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
140 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
141 {
142 	int error;
143 	int flag;
144 	int in_crit = 0;
145 	vnode_t *vp;
146 	struct vattr va;
147 	struct vattr bva;
148 	struct flock64 bf;
149 	caller_context_t ct;
150 
151 
152 	vp = nfs_fhtovp(&args->saa_fh, exi);
153 	if (vp == NULL) {
154 		ns->ns_status = NFSERR_STALE;
155 		return;
156 	}
157 
158 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
159 		VN_RELE(vp);
160 		ns->ns_status = NFSERR_ROFS;
161 		return;
162 	}
163 
164 	error = sattr_to_vattr(&args->saa_sa, &va);
165 	if (error) {
166 		VN_RELE(vp);
167 		ns->ns_status = puterrno(error);
168 		return;
169 	}
170 
171 	/*
172 	 * If the client is requesting a change to the mtime,
173 	 * but the nanosecond field is set to 1 billion, then
174 	 * this is a flag to the server that it should set the
175 	 * atime and mtime fields to the server's current time.
176 	 * The 1 billion number actually came from the client
177 	 * as 1 million, but the units in the over the wire
178 	 * request are microseconds instead of nanoseconds.
179 	 *
180 	 * This is an overload of the protocol and should be
181 	 * documented in the NFS Version 2 protocol specification.
182 	 */
183 	if (va.va_mask & AT_MTIME) {
184 		if (va.va_mtime.tv_nsec == 1000000000) {
185 			gethrestime(&va.va_mtime);
186 			va.va_atime = va.va_mtime;
187 			va.va_mask |= AT_ATIME;
188 			flag = 0;
189 		} else
190 			flag = ATTR_UTIME;
191 	} else
192 		flag = 0;
193 
194 	/*
195 	 * If the filesystem is exported with nosuid, then mask off
196 	 * the setuid and setgid bits.
197 	 */
198 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
199 	    (exi->exi_export.ex_flags & EX_NOSUID))
200 		va.va_mode &= ~(VSUID | VSGID);
201 
202 	ct.cc_sysid = 0;
203 	ct.cc_pid = 0;
204 	ct.cc_caller_id = nfs2_srv_caller_id;
205 	ct.cc_flags = CC_DONTBLOCK;
206 
207 	/*
208 	 * We need to specially handle size changes because it is
209 	 * possible for the client to create a file with modes
210 	 * which indicate read-only, but with the file opened for
211 	 * writing.  If the client then tries to set the size of
212 	 * the file, then the normal access checking done in
213 	 * VOP_SETATTR would prevent the client from doing so,
214 	 * although it should be legal for it to do so.  To get
215 	 * around this, we do the access checking for ourselves
216 	 * and then use VOP_SPACE which doesn't do the access
217 	 * checking which VOP_SETATTR does. VOP_SPACE can only
218 	 * operate on VREG files, let VOP_SETATTR handle the other
219 	 * extremely rare cases.
220 	 * Also the client should not be allowed to change the
221 	 * size of the file if there is a conflicting non-blocking
222 	 * mandatory lock in the region of change.
223 	 */
224 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
225 		if (nbl_need_check(vp)) {
226 			nbl_start_crit(vp, RW_READER);
227 			in_crit = 1;
228 		}
229 
230 		bva.va_mask = AT_UID | AT_SIZE;
231 
232 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
233 
234 		if (error) {
235 			if (in_crit)
236 				nbl_end_crit(vp);
237 			VN_RELE(vp);
238 			ns->ns_status = puterrno(error);
239 			return;
240 		}
241 
242 		if (in_crit) {
243 			u_offset_t offset;
244 			ssize_t length;
245 
246 			if (va.va_size < bva.va_size) {
247 				offset = va.va_size;
248 				length = bva.va_size - va.va_size;
249 			} else {
250 				offset = bva.va_size;
251 				length = va.va_size - bva.va_size;
252 			}
253 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
254 			    NULL)) {
255 				error = EACCES;
256 			}
257 		}
258 
259 		if (crgetuid(cr) == bva.va_uid && !error &&
260 		    va.va_size != bva.va_size) {
261 			va.va_mask &= ~AT_SIZE;
262 			bf.l_type = F_WRLCK;
263 			bf.l_whence = 0;
264 			bf.l_start = (off64_t)va.va_size;
265 			bf.l_len = 0;
266 			bf.l_sysid = 0;
267 			bf.l_pid = 0;
268 
269 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
270 			    (offset_t)va.va_size, cr, &ct);
271 		}
272 		if (in_crit)
273 			nbl_end_crit(vp);
274 	} else
275 		error = 0;
276 
277 	/*
278 	 * Do the setattr.
279 	 */
280 	if (!error && va.va_mask) {
281 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
282 	}
283 
284 	/*
285 	 * check if the monitor on either vop_space or vop_setattr detected
286 	 * a delegation conflict and if so, mark the thread flag as
287 	 * wouldblock so that the response is dropped and the client will
288 	 * try again.
289 	 */
290 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
291 		VN_RELE(vp);
292 		curthread->t_flag |= T_WOULDBLOCK;
293 		return;
294 	}
295 
296 	if (!error) {
297 		va.va_mask = AT_ALL;	/* get everything */
298 
299 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
300 
301 		/* check for overflows */
302 		if (!error) {
303 			acl_perm(vp, exi, &va, cr);
304 			error = vattr_to_nattr(&va, &ns->ns_attr);
305 		}
306 	}
307 
308 	ct.cc_flags = 0;
309 
310 	/*
311 	 * Force modified metadata out to stable storage.
312 	 */
313 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
314 
315 	VN_RELE(vp);
316 
317 	ns->ns_status = puterrno(error);
318 }
319 void *
320 rfs_setattr_getfh(struct nfssaargs *args)
321 {
322 	return (&args->saa_fh);
323 }
324 
325 /*
326  * Directory lookup.
327  * Returns an fhandle and file attributes for file name in a directory.
328  */
329 /* ARGSUSED */
330 void
331 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
332 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
333 {
334 	int error;
335 	vnode_t *dvp;
336 	vnode_t *vp;
337 	struct vattr va;
338 	fhandle_t *fhp = da->da_fhandle;
339 	struct sec_ol sec = {0, 0};
340 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
341 
342 	/*
343 	 * Trusted Extension doesn't support NFSv2. MOUNT
344 	 * will reject v2 clients. Need to prevent v2 client
345 	 * access via WebNFS here.
346 	 */
347 	if (is_system_labeled() && req->rq_vers == 2) {
348 		dr->dr_status = NFSERR_ACCES;
349 		return;
350 	}
351 
352 	/*
353 	 * Disallow NULL paths
354 	 */
355 	if (da->da_name == NULL || *da->da_name == '\0') {
356 		dr->dr_status = NFSERR_ACCES;
357 		return;
358 	}
359 
360 	/*
361 	 * Allow lookups from the root - the default
362 	 * location of the public filehandle.
363 	 */
364 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
365 		dvp = rootdir;
366 		VN_HOLD(dvp);
367 	} else {
368 		dvp = nfs_fhtovp(fhp, exi);
369 		if (dvp == NULL) {
370 			dr->dr_status = NFSERR_STALE;
371 			return;
372 		}
373 	}
374 
375 	/*
376 	 * Not allow lookup beyond root.
377 	 * If the filehandle matches a filehandle of the exi,
378 	 * then the ".." refers beyond the root of an exported filesystem.
379 	 */
380 	if (strcmp(da->da_name, "..") == 0 &&
381 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
382 		VN_RELE(dvp);
383 		dr->dr_status = NFSERR_NOENT;
384 		return;
385 	}
386 
387 	/*
388 	 * If the public filehandle is used then allow
389 	 * a multi-component lookup, i.e. evaluate
390 	 * a pathname and follow symbolic links if
391 	 * necessary.
392 	 *
393 	 * This may result in a vnode in another filesystem
394 	 * which is OK as long as the filesystem is exported.
395 	 */
396 	if (PUBLIC_FH2(fhp)) {
397 		publicfh_flag = TRUE;
398 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
399 		    &sec);
400 	} else {
401 		/*
402 		 * Do a normal single component lookup.
403 		 */
404 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
405 		    NULL, NULL, NULL);
406 	}
407 
408 	if (!error) {
409 		va.va_mask = AT_ALL;	/* we want everything */
410 
411 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
412 
413 		/* check for overflows */
414 		if (!error) {
415 			acl_perm(vp, exi, &va, cr);
416 			error = vattr_to_nattr(&va, &dr->dr_attr);
417 			if (!error) {
418 				if (sec.sec_flags & SEC_QUERY)
419 					error = makefh_ol(&dr->dr_fhandle, exi,
420 					    sec.sec_index);
421 				else {
422 					error = makefh(&dr->dr_fhandle, vp,
423 					    exi);
424 					if (!error && publicfh_flag &&
425 					    !chk_clnt_sec(exi, req))
426 						auth_weak = TRUE;
427 				}
428 			}
429 		}
430 		VN_RELE(vp);
431 	}
432 
433 	VN_RELE(dvp);
434 
435 	/*
436 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
437 	 * and have obtained a new exportinfo in exi which needs to be
438 	 * released. Note the the original exportinfo pointed to by exi
439 	 * will be released by the caller, comon_dispatch.
440 	 */
441 	if (publicfh_flag && exi != NULL)
442 		exi_rele(exi);
443 
444 	/*
445 	 * If it's public fh, no 0x81, and client's flavor is
446 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
447 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
448 	 */
449 	if (auth_weak)
450 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
451 	else
452 		dr->dr_status = puterrno(error);
453 }
454 void *
455 rfs_lookup_getfh(struct nfsdiropargs *da)
456 {
457 	return (da->da_fhandle);
458 }
459 
460 /*
461  * Read symbolic link.
462  * Returns the string in the symbolic link at the given fhandle.
463  */
464 /* ARGSUSED */
465 void
466 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
467 	struct svc_req *req, cred_t *cr)
468 {
469 	int error;
470 	struct iovec iov;
471 	struct uio uio;
472 	vnode_t *vp;
473 	struct vattr va;
474 
475 	vp = nfs_fhtovp(fhp, exi);
476 	if (vp == NULL) {
477 		rl->rl_data = NULL;
478 		rl->rl_status = NFSERR_STALE;
479 		return;
480 	}
481 
482 	va.va_mask = AT_MODE;
483 
484 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
485 
486 	if (error) {
487 		VN_RELE(vp);
488 		rl->rl_data = NULL;
489 		rl->rl_status = puterrno(error);
490 		return;
491 	}
492 
493 	if (MANDLOCK(vp, va.va_mode)) {
494 		VN_RELE(vp);
495 		rl->rl_data = NULL;
496 		rl->rl_status = NFSERR_ACCES;
497 		return;
498 	}
499 
500 	/*
501 	 * XNFS and RFC1094 require us to return ENXIO if argument
502 	 * is not a link. BUGID 1138002.
503 	 */
504 	if (vp->v_type != VLNK) {
505 		VN_RELE(vp);
506 		rl->rl_data = NULL;
507 		rl->rl_status = NFSERR_NXIO;
508 		return;
509 	}
510 
511 	/*
512 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
513 	 */
514 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
515 
516 	/*
517 	 * Set up io vector to read sym link data
518 	 */
519 	iov.iov_base = rl->rl_data;
520 	iov.iov_len = NFS_MAXPATHLEN;
521 	uio.uio_iov = &iov;
522 	uio.uio_iovcnt = 1;
523 	uio.uio_segflg = UIO_SYSSPACE;
524 	uio.uio_extflg = UIO_COPY_CACHED;
525 	uio.uio_loffset = (offset_t)0;
526 	uio.uio_resid = NFS_MAXPATHLEN;
527 
528 	/*
529 	 * Do the readlink.
530 	 */
531 	error = VOP_READLINK(vp, &uio, cr, NULL);
532 
533 	VN_RELE(vp);
534 
535 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
536 
537 	/*
538 	 * XNFS and RFC1094 require us to return ENXIO if argument
539 	 * is not a link. UFS returns EINVAL if this is the case,
540 	 * so we do the mapping here. BUGID 1138002.
541 	 */
542 	if (error == EINVAL)
543 		rl->rl_status = NFSERR_NXIO;
544 	else
545 		rl->rl_status = puterrno(error);
546 
547 }
548 void *
549 rfs_readlink_getfh(fhandle_t *fhp)
550 {
551 	return (fhp);
552 }
553 /*
554  * Free data allocated by rfs_readlink
555  */
556 void
557 rfs_rlfree(struct nfsrdlnres *rl)
558 {
559 	if (rl->rl_data != NULL)
560 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
561 }
562 
563 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
564 
565 /*
566  * Read data.
567  * Returns some data read from the file at the given fhandle.
568  */
569 /* ARGSUSED */
570 void
571 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
572 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
573 {
574 	vnode_t *vp;
575 	int error;
576 	struct vattr va;
577 	struct iovec iov;
578 	struct uio uio;
579 	mblk_t *mp;
580 	int alloc_err = 0;
581 	int in_crit = 0;
582 	caller_context_t ct;
583 
584 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
585 	if (vp == NULL) {
586 		rr->rr_data = NULL;
587 		rr->rr_status = NFSERR_STALE;
588 		return;
589 	}
590 
591 	if (vp->v_type != VREG) {
592 		VN_RELE(vp);
593 		rr->rr_data = NULL;
594 		rr->rr_status = NFSERR_ISDIR;
595 		return;
596 	}
597 
598 	ct.cc_sysid = 0;
599 	ct.cc_pid = 0;
600 	ct.cc_caller_id = nfs2_srv_caller_id;
601 	ct.cc_flags = CC_DONTBLOCK;
602 
603 	/*
604 	 * Enter the critical region before calling VOP_RWLOCK
605 	 * to avoid a deadlock with write requests.
606 	 */
607 	if (nbl_need_check(vp)) {
608 		nbl_start_crit(vp, RW_READER);
609 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
610 		    0, NULL)) {
611 			nbl_end_crit(vp);
612 			VN_RELE(vp);
613 			rr->rr_data = NULL;
614 			rr->rr_status = NFSERR_ACCES;
615 			return;
616 		}
617 		in_crit = 1;
618 	}
619 
620 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
621 
622 	/* check if a monitor detected a delegation conflict */
623 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
624 		VN_RELE(vp);
625 		/* mark as wouldblock so response is dropped */
626 		curthread->t_flag |= T_WOULDBLOCK;
627 
628 		rr->rr_data = NULL;
629 		return;
630 	}
631 
632 	va.va_mask = AT_ALL;
633 
634 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
635 
636 	if (error) {
637 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
638 		if (in_crit)
639 			nbl_end_crit(vp);
640 
641 		VN_RELE(vp);
642 		rr->rr_data = NULL;
643 		rr->rr_status = puterrno(error);
644 
645 		return;
646 	}
647 
648 	/*
649 	 * This is a kludge to allow reading of files created
650 	 * with no read permission.  The owner of the file
651 	 * is always allowed to read it.
652 	 */
653 	if (crgetuid(cr) != va.va_uid) {
654 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
655 
656 		if (error) {
657 			/*
658 			 * Exec is the same as read over the net because
659 			 * of demand loading.
660 			 */
661 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
662 		}
663 		if (error) {
664 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
665 			if (in_crit)
666 				nbl_end_crit(vp);
667 			VN_RELE(vp);
668 			rr->rr_data = NULL;
669 			rr->rr_status = puterrno(error);
670 
671 			return;
672 		}
673 	}
674 
675 	if (MANDLOCK(vp, va.va_mode)) {
676 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
677 		if (in_crit)
678 			nbl_end_crit(vp);
679 
680 		VN_RELE(vp);
681 		rr->rr_data = NULL;
682 		rr->rr_status = NFSERR_ACCES;
683 
684 		return;
685 	}
686 
687 	rr->rr_ok.rrok_wlist_len = 0;
688 	rr->rr_ok.rrok_wlist = NULL;
689 
690 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
691 		rr->rr_count = 0;
692 		rr->rr_data = NULL;
693 		/*
694 		 * In this case, status is NFS_OK, but there is no data
695 		 * to encode. So set rr_mp to NULL.
696 		 */
697 		rr->rr_mp = NULL;
698 		goto done;
699 	}
700 
701 	if (ra->ra_wlist) {
702 		mp = NULL;
703 		rr->rr_mp = NULL;
704 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
705 	} else {
706 		/*
707 		 * mp will contain the data to be sent out in the read reply.
708 		 * This will be freed after the reply has been sent out (by the
709 		 * driver).
710 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
711 		 * that the call to xdrmblk_putmblk() never fails.
712 		 */
713 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
714 		    &alloc_err);
715 		ASSERT(mp != NULL);
716 		ASSERT(alloc_err == 0);
717 
718 		rr->rr_mp = mp;
719 
720 		/*
721 		 * Set up io vector
722 		 */
723 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
724 		iov.iov_len = ra->ra_count;
725 	}
726 
727 	uio.uio_iov = &iov;
728 	uio.uio_iovcnt = 1;
729 	uio.uio_segflg = UIO_SYSSPACE;
730 	uio.uio_extflg = UIO_COPY_CACHED;
731 	uio.uio_loffset = (offset_t)ra->ra_offset;
732 	uio.uio_resid = ra->ra_count;
733 
734 	error = VOP_READ(vp, &uio, 0, cr, &ct);
735 
736 	if (error) {
737 		if (mp)
738 			freeb(mp);
739 
740 		/*
741 		 * check if a monitor detected a delegation conflict and
742 		 * mark as wouldblock so response is dropped
743 		 */
744 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
745 			curthread->t_flag |= T_WOULDBLOCK;
746 		else
747 			rr->rr_status = puterrno(error);
748 
749 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
750 		if (in_crit)
751 			nbl_end_crit(vp);
752 
753 		VN_RELE(vp);
754 		rr->rr_data = NULL;
755 
756 		return;
757 	}
758 
759 	/*
760 	 * Get attributes again so we can send the latest access
761 	 * time to the client side for his cache.
762 	 */
763 	va.va_mask = AT_ALL;
764 
765 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
766 
767 	if (error) {
768 		if (mp)
769 			freeb(mp);
770 
771 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
772 		if (in_crit)
773 			nbl_end_crit(vp);
774 
775 		VN_RELE(vp);
776 		rr->rr_data = NULL;
777 		rr->rr_status = puterrno(error);
778 
779 		return;
780 	}
781 
782 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
783 
784 	if (mp) {
785 		rr->rr_data = (char *)mp->b_datap->db_base;
786 	} else {
787 		if (ra->ra_wlist) {
788 			rr->rr_data = (caddr_t)iov.iov_base;
789 			if (!rdma_setup_read_data2(ra, rr)) {
790 				rr->rr_data = NULL;
791 				rr->rr_status = puterrno(NFSERR_INVAL);
792 			}
793 		}
794 	}
795 done:
796 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
797 	if (in_crit)
798 		nbl_end_crit(vp);
799 
800 	acl_perm(vp, exi, &va, cr);
801 
802 	/* check for overflows */
803 	error = vattr_to_nattr(&va, &rr->rr_attr);
804 
805 	VN_RELE(vp);
806 
807 	rr->rr_status = puterrno(error);
808 }
809 
810 /*
811  * Free data allocated by rfs_read
812  */
813 void
814 rfs_rdfree(struct nfsrdresult *rr)
815 {
816 	mblk_t *mp;
817 
818 	if (rr->rr_status == NFS_OK) {
819 		mp = rr->rr_mp;
820 		if (mp != NULL)
821 			freeb(mp);
822 	}
823 }
824 
825 void *
826 rfs_read_getfh(struct nfsreadargs *ra)
827 {
828 	return (&ra->ra_fhandle);
829 }
830 
831 #define	MAX_IOVECS	12
832 
833 #ifdef DEBUG
834 static int rfs_write_sync_hits = 0;
835 static int rfs_write_sync_misses = 0;
836 #endif
837 
838 /*
839  * Write data to file.
840  * Returns attributes of a file after writing some data to it.
841  *
842  * Any changes made here, especially in error handling might have
843  * to also be done in rfs_write (which clusters write requests).
844  */
845 void
846 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
847 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
848 {
849 	int error;
850 	vnode_t *vp;
851 	rlim64_t rlimit;
852 	struct vattr va;
853 	struct uio uio;
854 	struct iovec iov[MAX_IOVECS];
855 	mblk_t *m;
856 	struct iovec *iovp;
857 	int iovcnt;
858 	cred_t *savecred;
859 	int in_crit = 0;
860 	caller_context_t ct;
861 
862 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
863 	if (vp == NULL) {
864 		ns->ns_status = NFSERR_STALE;
865 		return;
866 	}
867 
868 	if (rdonly(exi, req)) {
869 		VN_RELE(vp);
870 		ns->ns_status = NFSERR_ROFS;
871 		return;
872 	}
873 
874 	if (vp->v_type != VREG) {
875 		VN_RELE(vp);
876 		ns->ns_status = NFSERR_ISDIR;
877 		return;
878 	}
879 
880 	ct.cc_sysid = 0;
881 	ct.cc_pid = 0;
882 	ct.cc_caller_id = nfs2_srv_caller_id;
883 	ct.cc_flags = CC_DONTBLOCK;
884 
885 	va.va_mask = AT_UID|AT_MODE;
886 
887 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
888 
889 	if (error) {
890 		VN_RELE(vp);
891 		ns->ns_status = puterrno(error);
892 
893 		return;
894 	}
895 
896 	if (crgetuid(cr) != va.va_uid) {
897 		/*
898 		 * This is a kludge to allow writes of files created
899 		 * with read only permission.  The owner of the file
900 		 * is always allowed to write it.
901 		 */
902 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
903 
904 		if (error) {
905 			VN_RELE(vp);
906 			ns->ns_status = puterrno(error);
907 			return;
908 		}
909 	}
910 
911 	/*
912 	 * Can't access a mandatory lock file.  This might cause
913 	 * the NFS service thread to block forever waiting for a
914 	 * lock to be released that will never be released.
915 	 */
916 	if (MANDLOCK(vp, va.va_mode)) {
917 		VN_RELE(vp);
918 		ns->ns_status = NFSERR_ACCES;
919 		return;
920 	}
921 
922 	/*
923 	 * We have to enter the critical region before calling VOP_RWLOCK
924 	 * to avoid a deadlock with ufs.
925 	 */
926 	if (nbl_need_check(vp)) {
927 		nbl_start_crit(vp, RW_READER);
928 		in_crit = 1;
929 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
930 		    wa->wa_count, 0, NULL)) {
931 			error = EACCES;
932 			goto out;
933 		}
934 	}
935 
936 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
937 
938 	/* check if a monitor detected a delegation conflict */
939 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
940 		VN_RELE(vp);
941 		/* mark as wouldblock so response is dropped */
942 		curthread->t_flag |= T_WOULDBLOCK;
943 		return;
944 	}
945 
946 	if (wa->wa_data || wa->wa_rlist) {
947 		/* Do the RDMA thing if necessary */
948 		if (wa->wa_rlist) {
949 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
950 			iov[0].iov_len = wa->wa_count;
951 		} else  {
952 			iov[0].iov_base = wa->wa_data;
953 			iov[0].iov_len = wa->wa_count;
954 		}
955 		uio.uio_iov = iov;
956 		uio.uio_iovcnt = 1;
957 		uio.uio_segflg = UIO_SYSSPACE;
958 		uio.uio_extflg = UIO_COPY_DEFAULT;
959 		uio.uio_loffset = (offset_t)wa->wa_offset;
960 		uio.uio_resid = wa->wa_count;
961 		/*
962 		 * The limit is checked on the client. We
963 		 * should allow any size writes here.
964 		 */
965 		uio.uio_llimit = curproc->p_fsz_ctl;
966 		rlimit = uio.uio_llimit - wa->wa_offset;
967 		if (rlimit < (rlim64_t)uio.uio_resid)
968 			uio.uio_resid = (uint_t)rlimit;
969 
970 		/*
971 		 * for now we assume no append mode
972 		 */
973 		/*
974 		 * We're changing creds because VM may fault and we need
975 		 * the cred of the current thread to be used if quota
976 		 * checking is enabled.
977 		 */
978 		savecred = curthread->t_cred;
979 		curthread->t_cred = cr;
980 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
981 		curthread->t_cred = savecred;
982 	} else {
983 		iovcnt = 0;
984 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
985 			iovcnt++;
986 		if (iovcnt <= MAX_IOVECS) {
987 #ifdef DEBUG
988 			rfs_write_sync_hits++;
989 #endif
990 			iovp = iov;
991 		} else {
992 #ifdef DEBUG
993 			rfs_write_sync_misses++;
994 #endif
995 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
996 		}
997 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
998 		uio.uio_iov = iovp;
999 		uio.uio_iovcnt = iovcnt;
1000 		uio.uio_segflg = UIO_SYSSPACE;
1001 		uio.uio_extflg = UIO_COPY_DEFAULT;
1002 		uio.uio_loffset = (offset_t)wa->wa_offset;
1003 		uio.uio_resid = wa->wa_count;
1004 		/*
1005 		 * The limit is checked on the client. We
1006 		 * should allow any size writes here.
1007 		 */
1008 		uio.uio_llimit = curproc->p_fsz_ctl;
1009 		rlimit = uio.uio_llimit - wa->wa_offset;
1010 		if (rlimit < (rlim64_t)uio.uio_resid)
1011 			uio.uio_resid = (uint_t)rlimit;
1012 
1013 		/*
1014 		 * For now we assume no append mode.
1015 		 */
1016 		/*
1017 		 * We're changing creds because VM may fault and we need
1018 		 * the cred of the current thread to be used if quota
1019 		 * checking is enabled.
1020 		 */
1021 		savecred = curthread->t_cred;
1022 		curthread->t_cred = cr;
1023 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1024 		curthread->t_cred = savecred;
1025 
1026 		if (iovp != iov)
1027 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1028 	}
1029 
1030 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1031 
1032 	if (!error) {
1033 		/*
1034 		 * Get attributes again so we send the latest mod
1035 		 * time to the client side for his cache.
1036 		 */
1037 		va.va_mask = AT_ALL;	/* now we want everything */
1038 
1039 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1040 
1041 		/* check for overflows */
1042 		if (!error) {
1043 			acl_perm(vp, exi, &va, cr);
1044 			error = vattr_to_nattr(&va, &ns->ns_attr);
1045 		}
1046 	}
1047 
1048 out:
1049 	if (in_crit)
1050 		nbl_end_crit(vp);
1051 	VN_RELE(vp);
1052 
1053 	/* check if a monitor detected a delegation conflict */
1054 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1055 		/* mark as wouldblock so response is dropped */
1056 		curthread->t_flag |= T_WOULDBLOCK;
1057 	else
1058 		ns->ns_status = puterrno(error);
1059 
1060 }
1061 
1062 struct rfs_async_write {
1063 	struct nfswriteargs *wa;
1064 	struct nfsattrstat *ns;
1065 	struct svc_req *req;
1066 	cred_t *cr;
1067 	kthread_t *thread;
1068 	struct rfs_async_write *list;
1069 };
1070 
1071 struct rfs_async_write_list {
1072 	fhandle_t *fhp;
1073 	kcondvar_t cv;
1074 	struct rfs_async_write *list;
1075 	struct rfs_async_write_list *next;
1076 };
1077 
1078 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1079 static kmutex_t rfs_async_write_lock;
1080 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1081 
1082 #define	MAXCLIOVECS	42
1083 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1084 
1085 #ifdef DEBUG
1086 static int rfs_write_hits = 0;
1087 static int rfs_write_misses = 0;
1088 #endif
1089 
1090 /*
1091  * Write data to file.
1092  * Returns attributes of a file after writing some data to it.
1093  */
1094 void
1095 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1096 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1097 {
1098 	int error;
1099 	vnode_t *vp;
1100 	rlim64_t rlimit;
1101 	struct vattr va;
1102 	struct uio uio;
1103 	struct rfs_async_write_list *lp;
1104 	struct rfs_async_write_list *nlp;
1105 	struct rfs_async_write *rp;
1106 	struct rfs_async_write *nrp;
1107 	struct rfs_async_write *trp;
1108 	struct rfs_async_write *lrp;
1109 	int data_written;
1110 	int iovcnt;
1111 	mblk_t *m;
1112 	struct iovec *iovp;
1113 	struct iovec *niovp;
1114 	struct iovec iov[MAXCLIOVECS];
1115 	int count;
1116 	int rcount;
1117 	uint_t off;
1118 	uint_t len;
1119 	struct rfs_async_write nrpsp;
1120 	struct rfs_async_write_list nlpsp;
1121 	ushort_t t_flag;
1122 	cred_t *savecred;
1123 	int in_crit = 0;
1124 	caller_context_t ct;
1125 
1126 	if (!rfs_write_async) {
1127 		rfs_write_sync(wa, ns, exi, req, cr);
1128 		return;
1129 	}
1130 
1131 	/*
1132 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1133 	 * is considered an OK.
1134 	 */
1135 	ns->ns_status = RFSWRITE_INITVAL;
1136 
1137 	nrp = &nrpsp;
1138 	nrp->wa = wa;
1139 	nrp->ns = ns;
1140 	nrp->req = req;
1141 	nrp->cr = cr;
1142 	nrp->thread = curthread;
1143 
1144 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1145 
1146 	/*
1147 	 * Look to see if there is already a cluster started
1148 	 * for this file.
1149 	 */
1150 	mutex_enter(&rfs_async_write_lock);
1151 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1152 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1153 		    sizeof (fhandle_t)) == 0)
1154 			break;
1155 	}
1156 
1157 	/*
1158 	 * If lp is non-NULL, then there is already a cluster
1159 	 * started.  We need to place ourselves in the cluster
1160 	 * list in the right place as determined by starting
1161 	 * offset.  Conflicts with non-blocking mandatory locked
1162 	 * regions will be checked when the cluster is processed.
1163 	 */
1164 	if (lp != NULL) {
1165 		rp = lp->list;
1166 		trp = NULL;
1167 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1168 			trp = rp;
1169 			rp = rp->list;
1170 		}
1171 		nrp->list = rp;
1172 		if (trp == NULL)
1173 			lp->list = nrp;
1174 		else
1175 			trp->list = nrp;
1176 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1177 			cv_wait(&lp->cv, &rfs_async_write_lock);
1178 		mutex_exit(&rfs_async_write_lock);
1179 
1180 		return;
1181 	}
1182 
1183 	/*
1184 	 * No cluster started yet, start one and add ourselves
1185 	 * to the list of clusters.
1186 	 */
1187 	nrp->list = NULL;
1188 
1189 	nlp = &nlpsp;
1190 	nlp->fhp = &wa->wa_fhandle;
1191 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1192 	nlp->list = nrp;
1193 	nlp->next = NULL;
1194 
1195 	if (rfs_async_write_head == NULL) {
1196 		rfs_async_write_head = nlp;
1197 	} else {
1198 		lp = rfs_async_write_head;
1199 		while (lp->next != NULL)
1200 			lp = lp->next;
1201 		lp->next = nlp;
1202 	}
1203 	mutex_exit(&rfs_async_write_lock);
1204 
1205 	/*
1206 	 * Convert the file handle common to all of the requests
1207 	 * in this cluster to a vnode.
1208 	 */
1209 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1210 	if (vp == NULL) {
1211 		mutex_enter(&rfs_async_write_lock);
1212 		if (rfs_async_write_head == nlp)
1213 			rfs_async_write_head = nlp->next;
1214 		else {
1215 			lp = rfs_async_write_head;
1216 			while (lp->next != nlp)
1217 				lp = lp->next;
1218 			lp->next = nlp->next;
1219 		}
1220 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1221 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1222 			rp->ns->ns_status = NFSERR_STALE;
1223 			rp->thread->t_flag |= t_flag;
1224 		}
1225 		cv_broadcast(&nlp->cv);
1226 		mutex_exit(&rfs_async_write_lock);
1227 
1228 		return;
1229 	}
1230 
1231 	/*
1232 	 * Can only write regular files.  Attempts to write any
1233 	 * other file types fail with EISDIR.
1234 	 */
1235 	if (vp->v_type != VREG) {
1236 		VN_RELE(vp);
1237 		mutex_enter(&rfs_async_write_lock);
1238 		if (rfs_async_write_head == nlp)
1239 			rfs_async_write_head = nlp->next;
1240 		else {
1241 			lp = rfs_async_write_head;
1242 			while (lp->next != nlp)
1243 				lp = lp->next;
1244 			lp->next = nlp->next;
1245 		}
1246 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1247 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1248 			rp->ns->ns_status = NFSERR_ISDIR;
1249 			rp->thread->t_flag |= t_flag;
1250 		}
1251 		cv_broadcast(&nlp->cv);
1252 		mutex_exit(&rfs_async_write_lock);
1253 
1254 		return;
1255 	}
1256 
1257 	/*
1258 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1259 	 * deadlock with ufs.
1260 	 */
1261 	if (nbl_need_check(vp)) {
1262 		nbl_start_crit(vp, RW_READER);
1263 		in_crit = 1;
1264 	}
1265 
1266 	ct.cc_sysid = 0;
1267 	ct.cc_pid = 0;
1268 	ct.cc_caller_id = nfs2_srv_caller_id;
1269 	ct.cc_flags = CC_DONTBLOCK;
1270 
1271 	/*
1272 	 * Lock the file for writing.  This operation provides
1273 	 * the delay which allows clusters to grow.
1274 	 */
1275 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1276 
1277 	/* check if a monitor detected a delegation conflict */
1278 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1279 		VN_RELE(vp);
1280 		/* mark as wouldblock so response is dropped */
1281 		curthread->t_flag |= T_WOULDBLOCK;
1282 		mutex_enter(&rfs_async_write_lock);
1283 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1284 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1285 				rp->ns->ns_status = puterrno(error);
1286 				rp->thread->t_flag |= T_WOULDBLOCK;
1287 			}
1288 		}
1289 		cv_broadcast(&nlp->cv);
1290 		mutex_exit(&rfs_async_write_lock);
1291 
1292 		return;
1293 	}
1294 
1295 	/*
1296 	 * Disconnect this cluster from the list of clusters.
1297 	 * The cluster that is being dealt with must be fixed
1298 	 * in size after this point, so there is no reason
1299 	 * to leave it on the list so that new requests can
1300 	 * find it.
1301 	 *
1302 	 * The algorithm is that the first write request will
1303 	 * create a cluster, convert the file handle to a
1304 	 * vnode pointer, and then lock the file for writing.
1305 	 * This request is not likely to be clustered with
1306 	 * any others.  However, the next request will create
1307 	 * a new cluster and be blocked in VOP_RWLOCK while
1308 	 * the first request is being processed.  This delay
1309 	 * will allow more requests to be clustered in this
1310 	 * second cluster.
1311 	 */
1312 	mutex_enter(&rfs_async_write_lock);
1313 	if (rfs_async_write_head == nlp)
1314 		rfs_async_write_head = nlp->next;
1315 	else {
1316 		lp = rfs_async_write_head;
1317 		while (lp->next != nlp)
1318 			lp = lp->next;
1319 		lp->next = nlp->next;
1320 	}
1321 	mutex_exit(&rfs_async_write_lock);
1322 
1323 	/*
1324 	 * Step through the list of requests in this cluster.
1325 	 * We need to check permissions to make sure that all
1326 	 * of the requests have sufficient permission to write
1327 	 * the file.  A cluster can be composed of requests
1328 	 * from different clients and different users on each
1329 	 * client.
1330 	 *
1331 	 * As a side effect, we also calculate the size of the
1332 	 * byte range that this cluster encompasses.
1333 	 */
1334 	rp = nlp->list;
1335 	off = rp->wa->wa_offset;
1336 	len = (uint_t)0;
1337 	do {
1338 		if (rdonly(exi, rp->req)) {
1339 			rp->ns->ns_status = NFSERR_ROFS;
1340 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1341 			rp->thread->t_flag |= t_flag;
1342 			continue;
1343 		}
1344 
1345 		va.va_mask = AT_UID|AT_MODE;
1346 
1347 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1348 
1349 		if (!error) {
1350 			if (crgetuid(rp->cr) != va.va_uid) {
1351 				/*
1352 				 * This is a kludge to allow writes of files
1353 				 * created with read only permission.  The
1354 				 * owner of the file is always allowed to
1355 				 * write it.
1356 				 */
1357 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1358 			}
1359 			if (!error && MANDLOCK(vp, va.va_mode))
1360 				error = EACCES;
1361 		}
1362 
1363 		/*
1364 		 * Check for a conflict with a nbmand-locked region.
1365 		 */
1366 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1367 		    rp->wa->wa_count, 0, NULL)) {
1368 			error = EACCES;
1369 		}
1370 
1371 		if (error) {
1372 			rp->ns->ns_status = puterrno(error);
1373 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1374 			rp->thread->t_flag |= t_flag;
1375 			continue;
1376 		}
1377 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1378 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1379 	} while ((rp = rp->list) != NULL);
1380 
1381 	/*
1382 	 * Step through the cluster attempting to gather as many
1383 	 * requests which are contiguous as possible.  These
1384 	 * contiguous requests are handled via one call to VOP_WRITE
1385 	 * instead of different calls to VOP_WRITE.  We also keep
1386 	 * track of the fact that any data was written.
1387 	 */
1388 	rp = nlp->list;
1389 	data_written = 0;
1390 	do {
1391 		/*
1392 		 * Skip any requests which are already marked as having an
1393 		 * error.
1394 		 */
1395 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1396 			rp = rp->list;
1397 			continue;
1398 		}
1399 
1400 		/*
1401 		 * Count the number of iovec's which are required
1402 		 * to handle this set of requests.  One iovec is
1403 		 * needed for each data buffer, whether addressed
1404 		 * by wa_data or by the b_rptr pointers in the
1405 		 * mblk chains.
1406 		 */
1407 		iovcnt = 0;
1408 		lrp = rp;
1409 		for (;;) {
1410 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1411 				iovcnt++;
1412 			else {
1413 				m = lrp->wa->wa_mblk;
1414 				while (m != NULL) {
1415 					iovcnt++;
1416 					m = m->b_cont;
1417 				}
1418 			}
1419 			if (lrp->list == NULL ||
1420 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1421 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1422 			    lrp->list->wa->wa_offset) {
1423 				lrp = lrp->list;
1424 				break;
1425 			}
1426 			lrp = lrp->list;
1427 		}
1428 
1429 		if (iovcnt <= MAXCLIOVECS) {
1430 #ifdef DEBUG
1431 			rfs_write_hits++;
1432 #endif
1433 			niovp = iov;
1434 		} else {
1435 #ifdef DEBUG
1436 			rfs_write_misses++;
1437 #endif
1438 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1439 		}
1440 		/*
1441 		 * Put together the scatter/gather iovecs.
1442 		 */
1443 		iovp = niovp;
1444 		trp = rp;
1445 		count = 0;
1446 		do {
1447 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1448 				if (trp->wa->wa_rlist) {
1449 					iovp->iov_base =
1450 					    (char *)((trp->wa->wa_rlist)->
1451 					    u.c_daddr3);
1452 					iovp->iov_len = trp->wa->wa_count;
1453 				} else  {
1454 					iovp->iov_base = trp->wa->wa_data;
1455 					iovp->iov_len = trp->wa->wa_count;
1456 				}
1457 				iovp++;
1458 			} else {
1459 				m = trp->wa->wa_mblk;
1460 				rcount = trp->wa->wa_count;
1461 				while (m != NULL) {
1462 					iovp->iov_base = (caddr_t)m->b_rptr;
1463 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1464 					rcount -= iovp->iov_len;
1465 					if (rcount < 0)
1466 						iovp->iov_len += rcount;
1467 					iovp++;
1468 					if (rcount <= 0)
1469 						break;
1470 					m = m->b_cont;
1471 				}
1472 			}
1473 			count += trp->wa->wa_count;
1474 			trp = trp->list;
1475 		} while (trp != lrp);
1476 
1477 		uio.uio_iov = niovp;
1478 		uio.uio_iovcnt = iovcnt;
1479 		uio.uio_segflg = UIO_SYSSPACE;
1480 		uio.uio_extflg = UIO_COPY_DEFAULT;
1481 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1482 		uio.uio_resid = count;
1483 		/*
1484 		 * The limit is checked on the client. We
1485 		 * should allow any size writes here.
1486 		 */
1487 		uio.uio_llimit = curproc->p_fsz_ctl;
1488 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1489 		if (rlimit < (rlim64_t)uio.uio_resid)
1490 			uio.uio_resid = (uint_t)rlimit;
1491 
1492 		/*
1493 		 * For now we assume no append mode.
1494 		 */
1495 
1496 		/*
1497 		 * We're changing creds because VM may fault
1498 		 * and we need the cred of the current
1499 		 * thread to be used if quota * checking is
1500 		 * enabled.
1501 		 */
1502 		savecred = curthread->t_cred;
1503 		curthread->t_cred = cr;
1504 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1505 		curthread->t_cred = savecred;
1506 
1507 		/* check if a monitor detected a delegation conflict */
1508 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1509 			/* mark as wouldblock so response is dropped */
1510 			curthread->t_flag |= T_WOULDBLOCK;
1511 
1512 		if (niovp != iov)
1513 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1514 
1515 		if (!error) {
1516 			data_written = 1;
1517 			/*
1518 			 * Get attributes again so we send the latest mod
1519 			 * time to the client side for his cache.
1520 			 */
1521 			va.va_mask = AT_ALL;	/* now we want everything */
1522 
1523 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1524 
1525 			if (!error)
1526 				acl_perm(vp, exi, &va, rp->cr);
1527 		}
1528 
1529 		/*
1530 		 * Fill in the status responses for each request
1531 		 * which was just handled.  Also, copy the latest
1532 		 * attributes in to the attribute responses if
1533 		 * appropriate.
1534 		 */
1535 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1536 		do {
1537 			rp->thread->t_flag |= t_flag;
1538 			/* check for overflows */
1539 			if (!error) {
1540 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1541 			}
1542 			rp->ns->ns_status = puterrno(error);
1543 			rp = rp->list;
1544 		} while (rp != lrp);
1545 	} while (rp != NULL);
1546 
1547 	/*
1548 	 * If any data was written at all, then we need to flush
1549 	 * the data and metadata to stable storage.
1550 	 */
1551 	if (data_written) {
1552 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1553 
1554 		if (!error) {
1555 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1556 		}
1557 	}
1558 
1559 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1560 
1561 	if (in_crit)
1562 		nbl_end_crit(vp);
1563 	VN_RELE(vp);
1564 
1565 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1566 	mutex_enter(&rfs_async_write_lock);
1567 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1568 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1569 			rp->ns->ns_status = puterrno(error);
1570 			rp->thread->t_flag |= t_flag;
1571 		}
1572 	}
1573 	cv_broadcast(&nlp->cv);
1574 	mutex_exit(&rfs_async_write_lock);
1575 
1576 }
1577 
1578 void *
1579 rfs_write_getfh(struct nfswriteargs *wa)
1580 {
1581 	return (&wa->wa_fhandle);
1582 }
1583 
1584 /*
1585  * Create a file.
1586  * Creates a file with given attributes and returns those attributes
1587  * and an fhandle for the new file.
1588  */
1589 void
1590 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1591 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1592 {
1593 	int error;
1594 	int lookuperr;
1595 	int in_crit = 0;
1596 	struct vattr va;
1597 	vnode_t *vp;
1598 	vnode_t *realvp;
1599 	vnode_t *dvp;
1600 	char *name = args->ca_da.da_name;
1601 	vnode_t *tvp = NULL;
1602 	int mode;
1603 	int lookup_ok;
1604 	bool_t trunc;
1605 
1606 	/*
1607 	 * Disallow NULL paths
1608 	 */
1609 	if (name == NULL || *name == '\0') {
1610 		dr->dr_status = NFSERR_ACCES;
1611 		return;
1612 	}
1613 
1614 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1615 	if (dvp == NULL) {
1616 		dr->dr_status = NFSERR_STALE;
1617 		return;
1618 	}
1619 
1620 	error = sattr_to_vattr(args->ca_sa, &va);
1621 	if (error) {
1622 		dr->dr_status = puterrno(error);
1623 		return;
1624 	}
1625 
1626 	/*
1627 	 * Must specify the mode.
1628 	 */
1629 	if (!(va.va_mask & AT_MODE)) {
1630 		VN_RELE(dvp);
1631 		dr->dr_status = NFSERR_INVAL;
1632 		return;
1633 	}
1634 
1635 	/*
1636 	 * This is a completely gross hack to make mknod
1637 	 * work over the wire until we can wack the protocol
1638 	 */
1639 	if ((va.va_mode & IFMT) == IFCHR) {
1640 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1641 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1642 		else {
1643 			va.va_type = VCHR;
1644 			/*
1645 			 * uncompress the received dev_t
1646 			 * if the top half is zero indicating a request
1647 			 * from an `older style' OS.
1648 			 */
1649 			if ((va.va_size & 0xffff0000) == 0)
1650 				va.va_rdev = nfsv2_expdev(va.va_size);
1651 			else
1652 				va.va_rdev = (dev_t)va.va_size;
1653 		}
1654 		va.va_mask &= ~AT_SIZE;
1655 	} else if ((va.va_mode & IFMT) == IFBLK) {
1656 		va.va_type = VBLK;
1657 		/*
1658 		 * uncompress the received dev_t
1659 		 * if the top half is zero indicating a request
1660 		 * from an `older style' OS.
1661 		 */
1662 		if ((va.va_size & 0xffff0000) == 0)
1663 			va.va_rdev = nfsv2_expdev(va.va_size);
1664 		else
1665 			va.va_rdev = (dev_t)va.va_size;
1666 		va.va_mask &= ~AT_SIZE;
1667 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1668 		va.va_type = VSOCK;
1669 	} else
1670 		va.va_type = VREG;
1671 	va.va_mode &= ~IFMT;
1672 	va.va_mask |= AT_TYPE;
1673 
1674 	/*
1675 	 * Why was the choice made to use VWRITE as the mode to the
1676 	 * call to VOP_CREATE ? This results in a bug.  When a client
1677 	 * opens a file that already exists and is RDONLY, the second
1678 	 * open fails with an EACESS because of the mode.
1679 	 * bug ID 1054648.
1680 	 */
1681 	lookup_ok = 0;
1682 	mode = VWRITE;
1683 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1684 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1685 		    NULL, NULL, NULL);
1686 		if (!error) {
1687 			struct vattr at;
1688 
1689 			lookup_ok = 1;
1690 			at.va_mask = AT_MODE;
1691 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1692 			if (!error)
1693 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1694 			VN_RELE(tvp);
1695 			tvp = NULL;
1696 		}
1697 	}
1698 
1699 	if (!lookup_ok) {
1700 		if (rdonly(exi, req)) {
1701 			error = EROFS;
1702 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1703 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1704 			error = EPERM;
1705 		} else {
1706 			error = 0;
1707 		}
1708 	}
1709 
1710 	/*
1711 	 * If file size is being modified on an already existing file
1712 	 * make sure that there are no conflicting non-blocking mandatory
1713 	 * locks in the region being manipulated. Return EACCES if there
1714 	 * are conflicting locks.
1715 	 */
1716 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1717 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1718 		    NULL, NULL, NULL);
1719 
1720 		if (!lookuperr &&
1721 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1722 			VN_RELE(tvp);
1723 			curthread->t_flag |= T_WOULDBLOCK;
1724 			goto out;
1725 		}
1726 
1727 		if (!lookuperr && nbl_need_check(tvp)) {
1728 			/*
1729 			 * The file exists. Now check if it has any
1730 			 * conflicting non-blocking mandatory locks
1731 			 * in the region being changed.
1732 			 */
1733 			struct vattr bva;
1734 			u_offset_t offset;
1735 			ssize_t length;
1736 
1737 			nbl_start_crit(tvp, RW_READER);
1738 			in_crit = 1;
1739 
1740 			bva.va_mask = AT_SIZE;
1741 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1742 			if (!error) {
1743 				if (va.va_size < bva.va_size) {
1744 					offset = va.va_size;
1745 					length = bva.va_size - va.va_size;
1746 				} else {
1747 					offset = bva.va_size;
1748 					length = va.va_size - bva.va_size;
1749 				}
1750 				if (length) {
1751 					if (nbl_conflict(tvp, NBL_WRITE,
1752 					    offset, length, 0, NULL)) {
1753 						error = EACCES;
1754 					}
1755 				}
1756 			}
1757 			if (error) {
1758 				nbl_end_crit(tvp);
1759 				VN_RELE(tvp);
1760 				in_crit = 0;
1761 			}
1762 		} else if (tvp != NULL) {
1763 			VN_RELE(tvp);
1764 		}
1765 	}
1766 
1767 	if (!error) {
1768 		/*
1769 		 * If filesystem is shared with nosuid the remove any
1770 		 * setuid/setgid bits on create.
1771 		 */
1772 		if (va.va_type == VREG &&
1773 		    exi->exi_export.ex_flags & EX_NOSUID)
1774 			va.va_mode &= ~(VSUID | VSGID);
1775 
1776 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1777 		    NULL, NULL);
1778 
1779 		if (!error) {
1780 
1781 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1782 				trunc = TRUE;
1783 			else
1784 				trunc = FALSE;
1785 
1786 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1787 				VN_RELE(vp);
1788 				curthread->t_flag |= T_WOULDBLOCK;
1789 				goto out;
1790 			}
1791 			va.va_mask = AT_ALL;
1792 
1793 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1794 
1795 			/* check for overflows */
1796 			if (!error) {
1797 				acl_perm(vp, exi, &va, cr);
1798 				error = vattr_to_nattr(&va, &dr->dr_attr);
1799 				if (!error) {
1800 					error = makefh(&dr->dr_fhandle, vp,
1801 					    exi);
1802 				}
1803 			}
1804 			/*
1805 			 * Force modified metadata out to stable storage.
1806 			 *
1807 			 * if a underlying vp exists, pass it to VOP_FSYNC
1808 			 */
1809 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1810 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1811 			else
1812 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1813 			VN_RELE(vp);
1814 		}
1815 
1816 		if (in_crit) {
1817 			nbl_end_crit(tvp);
1818 			VN_RELE(tvp);
1819 		}
1820 	}
1821 
1822 	/*
1823 	 * Force modified data and metadata out to stable storage.
1824 	 */
1825 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1826 
1827 out:
1828 
1829 	VN_RELE(dvp);
1830 
1831 	dr->dr_status = puterrno(error);
1832 
1833 }
1834 void *
1835 rfs_create_getfh(struct nfscreatargs *args)
1836 {
1837 	return (args->ca_da.da_fhandle);
1838 }
1839 
1840 /*
1841  * Remove a file.
1842  * Remove named file from parent directory.
1843  */
1844 void
1845 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1846 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1847 {
1848 	int error = 0;
1849 	vnode_t *vp;
1850 	vnode_t *targvp;
1851 	int in_crit = 0;
1852 
1853 	/*
1854 	 * Disallow NULL paths
1855 	 */
1856 	if (da->da_name == NULL || *da->da_name == '\0') {
1857 		*status = NFSERR_ACCES;
1858 		return;
1859 	}
1860 
1861 	vp = nfs_fhtovp(da->da_fhandle, exi);
1862 	if (vp == NULL) {
1863 		*status = NFSERR_STALE;
1864 		return;
1865 	}
1866 
1867 	if (rdonly(exi, req)) {
1868 		VN_RELE(vp);
1869 		*status = NFSERR_ROFS;
1870 		return;
1871 	}
1872 
1873 	/*
1874 	 * Check for a conflict with a non-blocking mandatory share reservation.
1875 	 */
1876 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1877 	    NULL, cr, NULL, NULL, NULL);
1878 	if (error != 0) {
1879 		VN_RELE(vp);
1880 		*status = puterrno(error);
1881 		return;
1882 	}
1883 
1884 	/*
1885 	 * If the file is delegated to an v4 client, then initiate
1886 	 * recall and drop this request (by setting T_WOULDBLOCK).
1887 	 * The client will eventually re-transmit the request and
1888 	 * (hopefully), by then, the v4 client will have returned
1889 	 * the delegation.
1890 	 */
1891 
1892 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1893 		VN_RELE(vp);
1894 		VN_RELE(targvp);
1895 		curthread->t_flag |= T_WOULDBLOCK;
1896 		return;
1897 	}
1898 
1899 	if (nbl_need_check(targvp)) {
1900 		nbl_start_crit(targvp, RW_READER);
1901 		in_crit = 1;
1902 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1903 			error = EACCES;
1904 			goto out;
1905 		}
1906 	}
1907 
1908 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
1909 
1910 	/*
1911 	 * Force modified data and metadata out to stable storage.
1912 	 */
1913 	(void) VOP_FSYNC(vp, 0, cr, NULL);
1914 
1915 out:
1916 	if (in_crit)
1917 		nbl_end_crit(targvp);
1918 	VN_RELE(targvp);
1919 	VN_RELE(vp);
1920 
1921 	*status = puterrno(error);
1922 
1923 }
1924 
1925 void *
1926 rfs_remove_getfh(struct nfsdiropargs *da)
1927 {
1928 	return (da->da_fhandle);
1929 }
1930 
1931 /*
1932  * rename a file
1933  * Give a file (from) a new name (to).
1934  */
1935 void
1936 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
1937 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1938 {
1939 	int error = 0;
1940 	vnode_t *fromvp;
1941 	vnode_t *tovp;
1942 	struct exportinfo *to_exi;
1943 	fhandle_t *fh;
1944 	vnode_t *srcvp;
1945 	vnode_t *targvp;
1946 	int in_crit = 0;
1947 
1948 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
1949 	if (fromvp == NULL) {
1950 		*status = NFSERR_STALE;
1951 		return;
1952 	}
1953 
1954 	fh = args->rna_to.da_fhandle;
1955 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
1956 	if (to_exi == NULL) {
1957 		VN_RELE(fromvp);
1958 		*status = NFSERR_ACCES;
1959 		return;
1960 	}
1961 	exi_rele(to_exi);
1962 
1963 	if (to_exi != exi) {
1964 		VN_RELE(fromvp);
1965 		*status = NFSERR_XDEV;
1966 		return;
1967 	}
1968 
1969 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
1970 	if (tovp == NULL) {
1971 		VN_RELE(fromvp);
1972 		*status = NFSERR_STALE;
1973 		return;
1974 	}
1975 
1976 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
1977 		VN_RELE(tovp);
1978 		VN_RELE(fromvp);
1979 		*status = NFSERR_NOTDIR;
1980 		return;
1981 	}
1982 
1983 	/*
1984 	 * Disallow NULL paths
1985 	 */
1986 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
1987 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
1988 		VN_RELE(tovp);
1989 		VN_RELE(fromvp);
1990 		*status = NFSERR_ACCES;
1991 		return;
1992 	}
1993 
1994 	if (rdonly(exi, req)) {
1995 		VN_RELE(tovp);
1996 		VN_RELE(fromvp);
1997 		*status = NFSERR_ROFS;
1998 		return;
1999 	}
2000 
2001 	/*
2002 	 * Check for a conflict with a non-blocking mandatory share reservation.
2003 	 */
2004 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2005 	    NULL, cr, NULL, NULL, NULL);
2006 	if (error != 0) {
2007 		VN_RELE(tovp);
2008 		VN_RELE(fromvp);
2009 		*status = puterrno(error);
2010 		return;
2011 	}
2012 
2013 	/* Check for delegations on the source file */
2014 
2015 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2016 		VN_RELE(tovp);
2017 		VN_RELE(fromvp);
2018 		VN_RELE(srcvp);
2019 		curthread->t_flag |= T_WOULDBLOCK;
2020 		return;
2021 	}
2022 
2023 	/* Check for delegation on the file being renamed over, if it exists */
2024 
2025 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2026 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2027 	    NULL, NULL, NULL) == 0) {
2028 
2029 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2030 			VN_RELE(tovp);
2031 			VN_RELE(fromvp);
2032 			VN_RELE(srcvp);
2033 			VN_RELE(targvp);
2034 			curthread->t_flag |= T_WOULDBLOCK;
2035 			return;
2036 		}
2037 		VN_RELE(targvp);
2038 	}
2039 
2040 
2041 	if (nbl_need_check(srcvp)) {
2042 		nbl_start_crit(srcvp, RW_READER);
2043 		in_crit = 1;
2044 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2045 			error = EACCES;
2046 			goto out;
2047 		}
2048 	}
2049 
2050 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2051 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2052 
2053 	if (error == 0)
2054 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2055 		    strlen(args->rna_to.da_name));
2056 
2057 	/*
2058 	 * Force modified data and metadata out to stable storage.
2059 	 */
2060 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2061 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2062 
2063 out:
2064 	if (in_crit)
2065 		nbl_end_crit(srcvp);
2066 	VN_RELE(srcvp);
2067 	VN_RELE(tovp);
2068 	VN_RELE(fromvp);
2069 
2070 	*status = puterrno(error);
2071 
2072 }
2073 void *
2074 rfs_rename_getfh(struct nfsrnmargs *args)
2075 {
2076 	return (args->rna_from.da_fhandle);
2077 }
2078 
2079 /*
2080  * Link to a file.
2081  * Create a file (to) which is a hard link to the given file (from).
2082  */
2083 void
2084 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2085 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2086 {
2087 	int error;
2088 	vnode_t *fromvp;
2089 	vnode_t *tovp;
2090 	struct exportinfo *to_exi;
2091 	fhandle_t *fh;
2092 
2093 	fromvp = nfs_fhtovp(args->la_from, exi);
2094 	if (fromvp == NULL) {
2095 		*status = NFSERR_STALE;
2096 		return;
2097 	}
2098 
2099 	fh = args->la_to.da_fhandle;
2100 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2101 	if (to_exi == NULL) {
2102 		VN_RELE(fromvp);
2103 		*status = NFSERR_ACCES;
2104 		return;
2105 	}
2106 	exi_rele(to_exi);
2107 
2108 	if (to_exi != exi) {
2109 		VN_RELE(fromvp);
2110 		*status = NFSERR_XDEV;
2111 		return;
2112 	}
2113 
2114 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2115 	if (tovp == NULL) {
2116 		VN_RELE(fromvp);
2117 		*status = NFSERR_STALE;
2118 		return;
2119 	}
2120 
2121 	if (tovp->v_type != VDIR) {
2122 		VN_RELE(tovp);
2123 		VN_RELE(fromvp);
2124 		*status = NFSERR_NOTDIR;
2125 		return;
2126 	}
2127 	/*
2128 	 * Disallow NULL paths
2129 	 */
2130 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2131 		VN_RELE(tovp);
2132 		VN_RELE(fromvp);
2133 		*status = NFSERR_ACCES;
2134 		return;
2135 	}
2136 
2137 	if (rdonly(exi, req)) {
2138 		VN_RELE(tovp);
2139 		VN_RELE(fromvp);
2140 		*status = NFSERR_ROFS;
2141 		return;
2142 	}
2143 
2144 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2145 
2146 	/*
2147 	 * Force modified data and metadata out to stable storage.
2148 	 */
2149 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2150 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2151 
2152 	VN_RELE(tovp);
2153 	VN_RELE(fromvp);
2154 
2155 	*status = puterrno(error);
2156 
2157 }
2158 void *
2159 rfs_link_getfh(struct nfslinkargs *args)
2160 {
2161 	return (args->la_from);
2162 }
2163 
2164 /*
2165  * Symbolicly link to a file.
2166  * Create a file (to) with the given attributes which is a symbolic link
2167  * to the given path name (to).
2168  */
2169 void
2170 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2171 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2172 {
2173 	int error;
2174 	struct vattr va;
2175 	vnode_t *vp;
2176 	vnode_t *svp;
2177 	int lerror;
2178 
2179 	/*
2180 	 * Disallow NULL paths
2181 	 */
2182 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2183 		*status = NFSERR_ACCES;
2184 		return;
2185 	}
2186 
2187 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2188 	if (vp == NULL) {
2189 		*status = NFSERR_STALE;
2190 		return;
2191 	}
2192 
2193 	if (rdonly(exi, req)) {
2194 		VN_RELE(vp);
2195 		*status = NFSERR_ROFS;
2196 		return;
2197 	}
2198 
2199 	error = sattr_to_vattr(args->sla_sa, &va);
2200 	if (error) {
2201 		VN_RELE(vp);
2202 		*status = puterrno(error);
2203 		return;
2204 	}
2205 
2206 	if (!(va.va_mask & AT_MODE)) {
2207 		VN_RELE(vp);
2208 		*status = NFSERR_INVAL;
2209 		return;
2210 	}
2211 
2212 	va.va_type = VLNK;
2213 	va.va_mask |= AT_TYPE;
2214 
2215 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
2216 	    NULL, 0);
2217 
2218 	/*
2219 	 * Force new data and metadata out to stable storage.
2220 	 */
2221 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2222 	    0, NULL, cr, NULL, NULL, NULL);
2223 
2224 	if (!lerror) {
2225 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2226 		VN_RELE(svp);
2227 	}
2228 
2229 	/*
2230 	 * Force modified data and metadata out to stable storage.
2231 	 */
2232 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2233 
2234 	VN_RELE(vp);
2235 
2236 	*status = puterrno(error);
2237 
2238 }
2239 void *
2240 rfs_symlink_getfh(struct nfsslargs *args)
2241 {
2242 	return (args->sla_from.da_fhandle);
2243 }
2244 
2245 /*
2246  * Make a directory.
2247  * Create a directory with the given name, parent directory, and attributes.
2248  * Returns a file handle and attributes for the new directory.
2249  */
2250 void
2251 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2252 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2253 {
2254 	int error;
2255 	struct vattr va;
2256 	vnode_t *dvp = NULL;
2257 	vnode_t *vp;
2258 	char *name = args->ca_da.da_name;
2259 
2260 	/*
2261 	 * Disallow NULL paths
2262 	 */
2263 	if (name == NULL || *name == '\0') {
2264 		dr->dr_status = NFSERR_ACCES;
2265 		return;
2266 	}
2267 
2268 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2269 	if (vp == NULL) {
2270 		dr->dr_status = NFSERR_STALE;
2271 		return;
2272 	}
2273 
2274 	if (rdonly(exi, req)) {
2275 		VN_RELE(vp);
2276 		dr->dr_status = NFSERR_ROFS;
2277 		return;
2278 	}
2279 
2280 	error = sattr_to_vattr(args->ca_sa, &va);
2281 	if (error) {
2282 		VN_RELE(vp);
2283 		dr->dr_status = puterrno(error);
2284 		return;
2285 	}
2286 
2287 	if (!(va.va_mask & AT_MODE)) {
2288 		VN_RELE(vp);
2289 		dr->dr_status = NFSERR_INVAL;
2290 		return;
2291 	}
2292 
2293 	va.va_type = VDIR;
2294 	va.va_mask |= AT_TYPE;
2295 
2296 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2297 
2298 	if (!error) {
2299 		/*
2300 		 * Attribtutes of the newly created directory should
2301 		 * be returned to the client.
2302 		 */
2303 		va.va_mask = AT_ALL; /* We want everything */
2304 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2305 
2306 		/* check for overflows */
2307 		if (!error) {
2308 			acl_perm(vp, exi, &va, cr);
2309 			error = vattr_to_nattr(&va, &dr->dr_attr);
2310 			if (!error) {
2311 				error = makefh(&dr->dr_fhandle, dvp, exi);
2312 			}
2313 		}
2314 		/*
2315 		 * Force new data and metadata out to stable storage.
2316 		 */
2317 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2318 		VN_RELE(dvp);
2319 	}
2320 
2321 	/*
2322 	 * Force modified data and metadata out to stable storage.
2323 	 */
2324 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2325 
2326 	VN_RELE(vp);
2327 
2328 	dr->dr_status = puterrno(error);
2329 
2330 }
2331 void *
2332 rfs_mkdir_getfh(struct nfscreatargs *args)
2333 {
2334 	return (args->ca_da.da_fhandle);
2335 }
2336 
2337 /*
2338  * Remove a directory.
2339  * Remove the given directory name from the given parent directory.
2340  */
2341 void
2342 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2343 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2344 {
2345 	int error;
2346 	vnode_t *vp;
2347 
2348 
2349 	/*
2350 	 * Disallow NULL paths
2351 	 */
2352 	if (da->da_name == NULL || *da->da_name == '\0') {
2353 		*status = NFSERR_ACCES;
2354 		return;
2355 	}
2356 
2357 	vp = nfs_fhtovp(da->da_fhandle, exi);
2358 	if (vp == NULL) {
2359 		*status = NFSERR_STALE;
2360 		return;
2361 	}
2362 
2363 	if (rdonly(exi, req)) {
2364 		VN_RELE(vp);
2365 		*status = NFSERR_ROFS;
2366 		return;
2367 	}
2368 
2369 	/*
2370 	 * VOP_RMDIR now takes a new third argument (the current
2371 	 * directory of the process).  That's because someone
2372 	 * wants to return EINVAL if one tries to remove ".".
2373 	 * Of course, NFS servers have no idea what their
2374 	 * clients' current directories are.  We fake it by
2375 	 * supplying a vnode known to exist and illegal to
2376 	 * remove.
2377 	 */
2378 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2379 
2380 	/*
2381 	 * Force modified data and metadata out to stable storage.
2382 	 */
2383 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2384 
2385 	VN_RELE(vp);
2386 
2387 	/*
2388 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2389 	 * if the directory is not empty.  A System V NFS server
2390 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2391 	 * over the wire.
2392 	 */
2393 	if (error == EEXIST)
2394 		*status = NFSERR_NOTEMPTY;
2395 	else
2396 		*status = puterrno(error);
2397 
2398 }
2399 void *
2400 rfs_rmdir_getfh(struct nfsdiropargs *da)
2401 {
2402 	return (da->da_fhandle);
2403 }
2404 
2405 /* ARGSUSED */
2406 void
2407 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2408 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2409 {
2410 	int error;
2411 	int iseof;
2412 	struct iovec iov;
2413 	struct uio uio;
2414 	vnode_t *vp;
2415 
2416 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2417 	if (vp == NULL) {
2418 		rd->rd_entries = NULL;
2419 		rd->rd_status = NFSERR_STALE;
2420 		return;
2421 	}
2422 
2423 	if (vp->v_type != VDIR) {
2424 		VN_RELE(vp);
2425 		rd->rd_entries = NULL;
2426 		rd->rd_status = NFSERR_NOTDIR;
2427 		return;
2428 	}
2429 
2430 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2431 
2432 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2433 
2434 	if (error) {
2435 		rd->rd_entries = NULL;
2436 		goto bad;
2437 	}
2438 
2439 	if (rda->rda_count == 0) {
2440 		rd->rd_entries = NULL;
2441 		rd->rd_size = 0;
2442 		rd->rd_eof = FALSE;
2443 		goto bad;
2444 	}
2445 
2446 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2447 
2448 	/*
2449 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2450 	 */
2451 	rd->rd_bufsize = (uint_t)rda->rda_count;
2452 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2453 
2454 	/*
2455 	 * Set up io vector to read directory data
2456 	 */
2457 	iov.iov_base = (caddr_t)rd->rd_entries;
2458 	iov.iov_len = rda->rda_count;
2459 	uio.uio_iov = &iov;
2460 	uio.uio_iovcnt = 1;
2461 	uio.uio_segflg = UIO_SYSSPACE;
2462 	uio.uio_extflg = UIO_COPY_CACHED;
2463 	uio.uio_loffset = (offset_t)rda->rda_offset;
2464 	uio.uio_resid = rda->rda_count;
2465 
2466 	/*
2467 	 * read directory
2468 	 */
2469 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2470 
2471 	/*
2472 	 * Clean up
2473 	 */
2474 	if (!error) {
2475 		/*
2476 		 * set size and eof
2477 		 */
2478 		if (uio.uio_resid == rda->rda_count) {
2479 			rd->rd_size = 0;
2480 			rd->rd_eof = TRUE;
2481 		} else {
2482 			rd->rd_size = (uint32_t)(rda->rda_count -
2483 			    uio.uio_resid);
2484 			rd->rd_eof = iseof ? TRUE : FALSE;
2485 		}
2486 	}
2487 
2488 bad:
2489 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2490 
2491 #if 0 /* notyet */
2492 	/*
2493 	 * Don't do this.  It causes local disk writes when just
2494 	 * reading the file and the overhead is deemed larger
2495 	 * than the benefit.
2496 	 */
2497 	/*
2498 	 * Force modified metadata out to stable storage.
2499 	 */
2500 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2501 #endif
2502 
2503 	VN_RELE(vp);
2504 
2505 	rd->rd_status = puterrno(error);
2506 
2507 }
2508 void *
2509 rfs_readdir_getfh(struct nfsrddirargs *rda)
2510 {
2511 	return (&rda->rda_fh);
2512 }
2513 void
2514 rfs_rddirfree(struct nfsrddirres *rd)
2515 {
2516 	if (rd->rd_entries != NULL)
2517 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2518 }
2519 
2520 /* ARGSUSED */
2521 void
2522 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2523 	struct svc_req *req, cred_t *cr)
2524 {
2525 	int error;
2526 	struct statvfs64 sb;
2527 	vnode_t *vp;
2528 
2529 	vp = nfs_fhtovp(fh, exi);
2530 	if (vp == NULL) {
2531 		fs->fs_status = NFSERR_STALE;
2532 		return;
2533 	}
2534 
2535 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2536 
2537 	if (!error) {
2538 		fs->fs_tsize = nfstsize();
2539 		fs->fs_bsize = sb.f_frsize;
2540 		fs->fs_blocks = sb.f_blocks;
2541 		fs->fs_bfree = sb.f_bfree;
2542 		fs->fs_bavail = sb.f_bavail;
2543 	}
2544 
2545 	VN_RELE(vp);
2546 
2547 	fs->fs_status = puterrno(error);
2548 
2549 }
2550 void *
2551 rfs_statfs_getfh(fhandle_t *fh)
2552 {
2553 	return (fh);
2554 }
2555 
2556 static int
2557 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2558 {
2559 	vap->va_mask = 0;
2560 
2561 	/*
2562 	 * There was a sign extension bug in some VFS based systems
2563 	 * which stored the mode as a short.  When it would get
2564 	 * assigned to a u_long, no sign extension would occur.
2565 	 * It needed to, but this wasn't noticed because sa_mode
2566 	 * would then get assigned back to the short, thus ignoring
2567 	 * the upper 16 bits of sa_mode.
2568 	 *
2569 	 * To make this implementation work for both broken
2570 	 * clients and good clients, we check for both versions
2571 	 * of the mode.
2572 	 */
2573 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2574 	    sa->sa_mode != (uint32_t)-1) {
2575 		vap->va_mask |= AT_MODE;
2576 		vap->va_mode = sa->sa_mode;
2577 	}
2578 	if (sa->sa_uid != (uint32_t)-1) {
2579 		vap->va_mask |= AT_UID;
2580 		vap->va_uid = sa->sa_uid;
2581 	}
2582 	if (sa->sa_gid != (uint32_t)-1) {
2583 		vap->va_mask |= AT_GID;
2584 		vap->va_gid = sa->sa_gid;
2585 	}
2586 	if (sa->sa_size != (uint32_t)-1) {
2587 		vap->va_mask |= AT_SIZE;
2588 		vap->va_size = sa->sa_size;
2589 	}
2590 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2591 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2592 #ifndef _LP64
2593 		/* return error if time overflow */
2594 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2595 			return (EOVERFLOW);
2596 #endif
2597 		vap->va_mask |= AT_ATIME;
2598 		/*
2599 		 * nfs protocol defines times as unsigned so don't extend sign,
2600 		 * unless sysadmin set nfs_allow_preepoch_time.
2601 		 */
2602 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2603 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2604 	}
2605 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2606 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2607 #ifndef _LP64
2608 		/* return error if time overflow */
2609 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2610 			return (EOVERFLOW);
2611 #endif
2612 		vap->va_mask |= AT_MTIME;
2613 		/*
2614 		 * nfs protocol defines times as unsigned so don't extend sign,
2615 		 * unless sysadmin set nfs_allow_preepoch_time.
2616 		 */
2617 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2618 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2619 	}
2620 	return (0);
2621 }
2622 
2623 static enum nfsftype vt_to_nf[] = {
2624 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2625 };
2626 
2627 /*
2628  * check the following fields for overflow: nodeid, size, and time.
2629  * There could be a problem when converting 64-bit LP64 fields
2630  * into 32-bit ones.  Return an error if there is an overflow.
2631  */
2632 int
2633 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2634 {
2635 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2636 	na->na_type = vt_to_nf[vap->va_type];
2637 
2638 	if (vap->va_mode == (unsigned short) -1)
2639 		na->na_mode = (uint32_t)-1;
2640 	else
2641 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2642 
2643 	if (vap->va_uid == (unsigned short)(-1))
2644 		na->na_uid = (uint32_t)(-1);
2645 	else if (vap->va_uid == UID_NOBODY)
2646 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2647 	else
2648 		na->na_uid = vap->va_uid;
2649 
2650 	if (vap->va_gid == (unsigned short)(-1))
2651 		na->na_gid = (uint32_t)-1;
2652 	else if (vap->va_gid == GID_NOBODY)
2653 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2654 	else
2655 		na->na_gid = vap->va_gid;
2656 
2657 	/*
2658 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2659 	 * vattr, but are bigger than 32 bit values supported?
2660 	 */
2661 	na->na_fsid = vap->va_fsid;
2662 
2663 	na->na_nodeid = vap->va_nodeid;
2664 
2665 	/*
2666 	 * Check to make sure that the nodeid is representable over the
2667 	 * wire without losing bits.
2668 	 */
2669 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2670 		return (EFBIG);
2671 	na->na_nlink = vap->va_nlink;
2672 
2673 	/*
2674 	 * Check for big files here, instead of at the caller.  See
2675 	 * comments in cstat for large special file explanation.
2676 	 */
2677 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2678 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2679 			return (EFBIG);
2680 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2681 			/* UNKNOWN_SIZE | OVERFLOW */
2682 			na->na_size = MAXOFF32_T;
2683 		} else
2684 			na->na_size = vap->va_size;
2685 	} else
2686 		na->na_size = vap->va_size;
2687 
2688 	/*
2689 	 * If the vnode times overflow the 32-bit times that NFS2
2690 	 * uses on the wire then return an error.
2691 	 */
2692 	if (!NFS_VAP_TIME_OK(vap)) {
2693 		return (EOVERFLOW);
2694 	}
2695 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2696 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2697 
2698 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2699 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2700 
2701 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2702 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2703 
2704 	/*
2705 	 * If the dev_t will fit into 16 bits then compress
2706 	 * it, otherwise leave it alone. See comments in
2707 	 * nfs_client.c.
2708 	 */
2709 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2710 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2711 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2712 	else
2713 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2714 
2715 	na->na_blocks = vap->va_nblocks;
2716 	na->na_blocksize = vap->va_blksize;
2717 
2718 	/*
2719 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2720 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2721 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2722 	 *
2723 	 * BUYER BEWARE:
2724 	 *  If you are porting the NFS to a non-Sun server, you probably
2725 	 *  don't want to include the following block of code.  The
2726 	 *  over-the-wire special file types will be changing with the
2727 	 *  NFS Protocol Revision.
2728 	 */
2729 	if (vap->va_type == VFIFO)
2730 		NA_SETFIFO(na);
2731 	return (0);
2732 }
2733 
2734 /*
2735  * acl v2 support: returns approximate permission.
2736  *	default: returns minimal permission (more restrictive)
2737  *	aclok: returns maximal permission (less restrictive)
2738  *	This routine changes the permissions that are alaredy in *va.
2739  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2740  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2741  */
2742 static void
2743 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2744 {
2745 	vsecattr_t	vsa;
2746 	int		aclcnt;
2747 	aclent_t	*aclentp;
2748 	mode_t		mask_perm;
2749 	mode_t		grp_perm;
2750 	mode_t		other_perm;
2751 	mode_t		other_orig;
2752 	int		error;
2753 
2754 	/* dont care default acl */
2755 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2756 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2757 
2758 	if (!error) {
2759 		aclcnt = vsa.vsa_aclcnt;
2760 		if (aclcnt > MIN_ACL_ENTRIES) {
2761 			/* non-trivial ACL */
2762 			aclentp = vsa.vsa_aclentp;
2763 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2764 				/* maximal permissions */
2765 				grp_perm = 0;
2766 				other_perm = 0;
2767 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2768 					switch (aclentp->a_type) {
2769 					case USER_OBJ:
2770 						break;
2771 					case USER:
2772 						grp_perm |=
2773 						    aclentp->a_perm << 3;
2774 						other_perm |= aclentp->a_perm;
2775 						break;
2776 					case GROUP_OBJ:
2777 						grp_perm |=
2778 						    aclentp->a_perm << 3;
2779 						break;
2780 					case GROUP:
2781 						other_perm |= aclentp->a_perm;
2782 						break;
2783 					case OTHER_OBJ:
2784 						other_orig = aclentp->a_perm;
2785 						break;
2786 					case CLASS_OBJ:
2787 						mask_perm = aclentp->a_perm;
2788 						break;
2789 					default:
2790 						break;
2791 					}
2792 				}
2793 				grp_perm &= mask_perm << 3;
2794 				other_perm &= mask_perm;
2795 				other_perm |= other_orig;
2796 
2797 			} else {
2798 				/* minimal permissions */
2799 				grp_perm = 070;
2800 				other_perm = 07;
2801 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2802 					switch (aclentp->a_type) {
2803 					case USER_OBJ:
2804 						break;
2805 					case USER:
2806 					case CLASS_OBJ:
2807 						grp_perm &=
2808 						    aclentp->a_perm << 3;
2809 						other_perm &=
2810 						    aclentp->a_perm;
2811 						break;
2812 					case GROUP_OBJ:
2813 						grp_perm &=
2814 						    aclentp->a_perm << 3;
2815 						break;
2816 					case GROUP:
2817 						other_perm &=
2818 						    aclentp->a_perm;
2819 						break;
2820 					case OTHER_OBJ:
2821 						other_perm &=
2822 						    aclentp->a_perm;
2823 						break;
2824 					default:
2825 						break;
2826 					}
2827 				}
2828 			}
2829 			/* copy to va */
2830 			va->va_mode &= ~077;
2831 			va->va_mode |= grp_perm | other_perm;
2832 		}
2833 		if (vsa.vsa_aclcnt)
2834 			kmem_free(vsa.vsa_aclentp,
2835 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2836 	}
2837 }
2838 
2839 void
2840 rfs_srvrinit(void)
2841 {
2842 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2843 	nfs2_srv_caller_id = fs_new_caller_id();
2844 }
2845 
2846 void
2847 rfs_srvrfini(void)
2848 {
2849 	mutex_destroy(&rfs_async_write_lock);
2850 }
2851 
2852 static int
2853 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2854 {
2855 	struct clist	*wcl;
2856 	int		data_len, avail_len, num;
2857 	uint32_t	count = rr->rr_count;
2858 
2859 	data_len = num = avail_len = 0;
2860 
2861 	wcl = ra->ra_wlist;
2862 	while (wcl != NULL) {
2863 		if (wcl->c_dmemhandle.mrc_rmr == 0)
2864 			break;
2865 
2866 		avail_len += wcl->c_len;
2867 		if (wcl->c_len < count) {
2868 			data_len += wcl->c_len;
2869 		} else {
2870 			/* Can make the rest chunks all 0-len */
2871 			data_len += count;
2872 			wcl->c_len = count;
2873 		}
2874 		count -= wcl->c_len;
2875 		num ++;
2876 		wcl = wcl->c_next;
2877 	}
2878 
2879 	/*
2880 	 * MUST fail if there are still more data
2881 	 */
2882 	if (count > 0) {
2883 		DTRACE_PROBE2(nfss__e__read__wlist__fail,
2884 		    int, data_len, int, count);
2885 		return (FALSE);
2886 	}
2887 
2888 	wcl = ra->ra_wlist;
2889 	rr->rr_count = data_len;
2890 	rr->rr_ok.rrok_wlist_len = data_len;
2891 	rr->rr_ok.rrok_wlist = wcl;
2892 
2893 	return (TRUE);
2894 }
2895