xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_srv.c (revision b1e2e3fb17324e9ddf43db264a0c64da7756d9e6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  */
26 
27 /*
28  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
29  *	All rights reserved.
30  */
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/buf.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/uio.h>
40 #include <sys/stat.h>
41 #include <sys/errno.h>
42 #include <sys/sysmacros.h>
43 #include <sys/statvfs.h>
44 #include <sys/kmem.h>
45 #include <sys/kstat.h>
46 #include <sys/dirent.h>
47 #include <sys/cmn_err.h>
48 #include <sys/debug.h>
49 #include <sys/vtrace.h>
50 #include <sys/mode.h>
51 #include <sys/acl.h>
52 #include <sys/nbmlock.h>
53 #include <sys/policy.h>
54 #include <sys/sdt.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 #include <nfs/nfs_cmd.h>
63 
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_map.h>
68 #include <vm/seg_kmem.h>
69 
70 #include <sys/strsubr.h>
71 
72 /*
73  * These are the interface routines for the server side of the
74  * Network File System.  See the NFS version 2 protocol specification
75  * for a description of this interface.
76  */
77 
78 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
79 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
80 			cred_t *);
81 
82 /*
83  * Some "over the wire" UNIX file types.  These are encoded
84  * into the mode.  This needs to be fixed in the next rev.
85  */
86 #define	IFMT		0170000		/* type of file */
87 #define	IFCHR		0020000		/* character special */
88 #define	IFBLK		0060000		/* block special */
89 #define	IFSOCK		0140000		/* socket */
90 
91 u_longlong_t nfs2_srv_caller_id;
92 
93 /*
94  * Get file attributes.
95  * Returns the current attributes of the file with the given fhandle.
96  */
97 /* ARGSUSED */
98 void
99 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
100     struct svc_req *req, cred_t *cr, bool_t ro)
101 {
102 	int error;
103 	vnode_t *vp;
104 	struct vattr va;
105 
106 	vp = nfs_fhtovp(fhp, exi);
107 	if (vp == NULL) {
108 		ns->ns_status = NFSERR_STALE;
109 		return;
110 	}
111 
112 	/*
113 	 * Do the getattr.
114 	 */
115 	va.va_mask = AT_ALL;	/* we want all the attributes */
116 
117 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
118 
119 	/* check for overflows */
120 	if (!error) {
121 		/* Lie about the object type for a referral */
122 		if (vn_is_nfs_reparse(vp, cr))
123 			va.va_type = VLNK;
124 
125 		acl_perm(vp, exi, &va, cr);
126 		error = vattr_to_nattr(&va, &ns->ns_attr);
127 	}
128 
129 	VN_RELE(vp);
130 
131 	ns->ns_status = puterrno(error);
132 }
/*
 * Dispatch helper: return the filehandle from the GETATTR arguments so
 * common dispatch code can locate the export before calling rfs_getattr().
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
138 
139 /*
140  * Set file attributes.
141  * Sets the attributes of the file with the given fhandle.  Returns
142  * the new attributes.
143  */
144 /* ARGSUSED */
145 void
146 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
147     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
148 {
149 	int error;
150 	int flag;
151 	int in_crit = 0;
152 	vnode_t *vp;
153 	struct vattr va;
154 	struct vattr bva;
155 	struct flock64 bf;
156 	caller_context_t ct;
157 
158 
159 	vp = nfs_fhtovp(&args->saa_fh, exi);
160 	if (vp == NULL) {
161 		ns->ns_status = NFSERR_STALE;
162 		return;
163 	}
164 
165 	if (rdonly(ro, vp)) {
166 		VN_RELE(vp);
167 		ns->ns_status = NFSERR_ROFS;
168 		return;
169 	}
170 
171 	error = sattr_to_vattr(&args->saa_sa, &va);
172 	if (error) {
173 		VN_RELE(vp);
174 		ns->ns_status = puterrno(error);
175 		return;
176 	}
177 
178 	/*
179 	 * If the client is requesting a change to the mtime,
180 	 * but the nanosecond field is set to 1 billion, then
181 	 * this is a flag to the server that it should set the
182 	 * atime and mtime fields to the server's current time.
183 	 * The 1 billion number actually came from the client
184 	 * as 1 million, but the units in the over the wire
185 	 * request are microseconds instead of nanoseconds.
186 	 *
187 	 * This is an overload of the protocol and should be
188 	 * documented in the NFS Version 2 protocol specification.
189 	 */
190 	if (va.va_mask & AT_MTIME) {
191 		if (va.va_mtime.tv_nsec == 1000000000) {
192 			gethrestime(&va.va_mtime);
193 			va.va_atime = va.va_mtime;
194 			va.va_mask |= AT_ATIME;
195 			flag = 0;
196 		} else
197 			flag = ATTR_UTIME;
198 	} else
199 		flag = 0;
200 
201 	/*
202 	 * If the filesystem is exported with nosuid, then mask off
203 	 * the setuid and setgid bits.
204 	 */
205 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
206 	    (exi->exi_export.ex_flags & EX_NOSUID))
207 		va.va_mode &= ~(VSUID | VSGID);
208 
209 	ct.cc_sysid = 0;
210 	ct.cc_pid = 0;
211 	ct.cc_caller_id = nfs2_srv_caller_id;
212 	ct.cc_flags = CC_DONTBLOCK;
213 
214 	/*
215 	 * We need to specially handle size changes because it is
216 	 * possible for the client to create a file with modes
217 	 * which indicate read-only, but with the file opened for
218 	 * writing.  If the client then tries to set the size of
219 	 * the file, then the normal access checking done in
220 	 * VOP_SETATTR would prevent the client from doing so,
221 	 * although it should be legal for it to do so.  To get
222 	 * around this, we do the access checking for ourselves
223 	 * and then use VOP_SPACE which doesn't do the access
224 	 * checking which VOP_SETATTR does. VOP_SPACE can only
225 	 * operate on VREG files, let VOP_SETATTR handle the other
226 	 * extremely rare cases.
227 	 * Also the client should not be allowed to change the
228 	 * size of the file if there is a conflicting non-blocking
229 	 * mandatory lock in the region of change.
230 	 */
231 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
232 		if (nbl_need_check(vp)) {
233 			nbl_start_crit(vp, RW_READER);
234 			in_crit = 1;
235 		}
236 
237 		bva.va_mask = AT_UID | AT_SIZE;
238 
239 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
240 
241 		if (error) {
242 			if (in_crit)
243 				nbl_end_crit(vp);
244 			VN_RELE(vp);
245 			ns->ns_status = puterrno(error);
246 			return;
247 		}
248 
249 		if (in_crit) {
250 			u_offset_t offset;
251 			ssize_t length;
252 
253 			if (va.va_size < bva.va_size) {
254 				offset = va.va_size;
255 				length = bva.va_size - va.va_size;
256 			} else {
257 				offset = bva.va_size;
258 				length = va.va_size - bva.va_size;
259 			}
260 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
261 			    NULL)) {
262 				error = EACCES;
263 			}
264 		}
265 
266 		if (crgetuid(cr) == bva.va_uid && !error &&
267 		    va.va_size != bva.va_size) {
268 			va.va_mask &= ~AT_SIZE;
269 			bf.l_type = F_WRLCK;
270 			bf.l_whence = 0;
271 			bf.l_start = (off64_t)va.va_size;
272 			bf.l_len = 0;
273 			bf.l_sysid = 0;
274 			bf.l_pid = 0;
275 
276 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
277 			    (offset_t)va.va_size, cr, &ct);
278 		}
279 		if (in_crit)
280 			nbl_end_crit(vp);
281 	} else
282 		error = 0;
283 
284 	/*
285 	 * Do the setattr.
286 	 */
287 	if (!error && va.va_mask) {
288 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
289 	}
290 
291 	/*
292 	 * check if the monitor on either vop_space or vop_setattr detected
293 	 * a delegation conflict and if so, mark the thread flag as
294 	 * wouldblock so that the response is dropped and the client will
295 	 * try again.
296 	 */
297 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
298 		VN_RELE(vp);
299 		curthread->t_flag |= T_WOULDBLOCK;
300 		return;
301 	}
302 
303 	if (!error) {
304 		va.va_mask = AT_ALL;	/* get everything */
305 
306 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
307 
308 		/* check for overflows */
309 		if (!error) {
310 			acl_perm(vp, exi, &va, cr);
311 			error = vattr_to_nattr(&va, &ns->ns_attr);
312 		}
313 	}
314 
315 	ct.cc_flags = 0;
316 
317 	/*
318 	 * Force modified metadata out to stable storage.
319 	 */
320 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
321 
322 	VN_RELE(vp);
323 
324 	ns->ns_status = puterrno(error);
325 }
/*
 * Dispatch helper: return the filehandle embedded in the SETATTR
 * arguments for the common dispatch code.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
331 
332 /*
333  * Directory lookup.
334  * Returns an fhandle and file attributes for file name in a directory.
335  */
336 /* ARGSUSED */
337 void
338 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
339     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
340 {
341 	int error;
342 	vnode_t *dvp;
343 	vnode_t *vp;
344 	struct vattr va;
345 	fhandle_t *fhp = da->da_fhandle;
346 	struct sec_ol sec = {0, 0};
347 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
348 	char *name;
349 	struct sockaddr *ca;
350 
351 	/*
352 	 * Trusted Extension doesn't support NFSv2. MOUNT
353 	 * will reject v2 clients. Need to prevent v2 client
354 	 * access via WebNFS here.
355 	 */
356 	if (is_system_labeled() && req->rq_vers == 2) {
357 		dr->dr_status = NFSERR_ACCES;
358 		return;
359 	}
360 
361 	/*
362 	 * Disallow NULL paths
363 	 */
364 	if (da->da_name == NULL || *da->da_name == '\0') {
365 		dr->dr_status = NFSERR_ACCES;
366 		return;
367 	}
368 
369 	/*
370 	 * Allow lookups from the root - the default
371 	 * location of the public filehandle.
372 	 */
373 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
374 		dvp = rootdir;
375 		VN_HOLD(dvp);
376 	} else {
377 		dvp = nfs_fhtovp(fhp, exi);
378 		if (dvp == NULL) {
379 			dr->dr_status = NFSERR_STALE;
380 			return;
381 		}
382 	}
383 
384 	/*
385 	 * Not allow lookup beyond root.
386 	 * If the filehandle matches a filehandle of the exi,
387 	 * then the ".." refers beyond the root of an exported filesystem.
388 	 */
389 	if (strcmp(da->da_name, "..") == 0 &&
390 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
391 		VN_RELE(dvp);
392 		dr->dr_status = NFSERR_NOENT;
393 		return;
394 	}
395 
396 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
397 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
398 	    MAXPATHLEN);
399 
400 	if (name == NULL) {
401 		dr->dr_status = NFSERR_ACCES;
402 		return;
403 	}
404 
405 	/*
406 	 * If the public filehandle is used then allow
407 	 * a multi-component lookup, i.e. evaluate
408 	 * a pathname and follow symbolic links if
409 	 * necessary.
410 	 *
411 	 * This may result in a vnode in another filesystem
412 	 * which is OK as long as the filesystem is exported.
413 	 */
414 	if (PUBLIC_FH2(fhp)) {
415 		publicfh_flag = TRUE;
416 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
417 		    &sec);
418 	} else {
419 		/*
420 		 * Do a normal single component lookup.
421 		 */
422 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
423 		    NULL, NULL, NULL);
424 	}
425 
426 	if (name != da->da_name)
427 		kmem_free(name, MAXPATHLEN);
428 
429 
430 	if (!error) {
431 		va.va_mask = AT_ALL;	/* we want everything */
432 
433 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
434 
435 		/* check for overflows */
436 		if (!error) {
437 			acl_perm(vp, exi, &va, cr);
438 			error = vattr_to_nattr(&va, &dr->dr_attr);
439 			if (!error) {
440 				if (sec.sec_flags & SEC_QUERY)
441 					error = makefh_ol(&dr->dr_fhandle, exi,
442 					    sec.sec_index);
443 				else {
444 					error = makefh(&dr->dr_fhandle, vp,
445 					    exi);
446 					if (!error && publicfh_flag &&
447 					    !chk_clnt_sec(exi, req))
448 						auth_weak = TRUE;
449 				}
450 			}
451 		}
452 		VN_RELE(vp);
453 	}
454 
455 	VN_RELE(dvp);
456 
457 	/*
458 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
459 	 * and have obtained a new exportinfo in exi which needs to be
460 	 * released. Note the the original exportinfo pointed to by exi
461 	 * will be released by the caller, comon_dispatch.
462 	 */
463 	if (publicfh_flag && exi != NULL)
464 		exi_rele(exi);
465 
466 	/*
467 	 * If it's public fh, no 0x81, and client's flavor is
468 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
469 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
470 	 */
471 	if (auth_weak)
472 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
473 	else
474 		dr->dr_status = puterrno(error);
475 }
/*
 * Dispatch helper: return the directory filehandle from the LOOKUP
 * arguments for the common dispatch code.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
481 
482 /*
483  * Read symbolic link.
484  * Returns the string in the symbolic link at the given fhandle.
485  */
486 /* ARGSUSED */
487 void
488 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
489     struct svc_req *req, cred_t *cr, bool_t ro)
490 {
491 	int error;
492 	struct iovec iov;
493 	struct uio uio;
494 	vnode_t *vp;
495 	struct vattr va;
496 	struct sockaddr *ca;
497 	char *name = NULL;
498 	int is_referral = 0;
499 
500 	vp = nfs_fhtovp(fhp, exi);
501 	if (vp == NULL) {
502 		rl->rl_data = NULL;
503 		rl->rl_status = NFSERR_STALE;
504 		return;
505 	}
506 
507 	va.va_mask = AT_MODE;
508 
509 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
510 
511 	if (error) {
512 		VN_RELE(vp);
513 		rl->rl_data = NULL;
514 		rl->rl_status = puterrno(error);
515 		return;
516 	}
517 
518 	if (MANDLOCK(vp, va.va_mode)) {
519 		VN_RELE(vp);
520 		rl->rl_data = NULL;
521 		rl->rl_status = NFSERR_ACCES;
522 		return;
523 	}
524 
525 	/* We lied about the object type for a referral */
526 	if (vn_is_nfs_reparse(vp, cr))
527 		is_referral = 1;
528 
529 	/*
530 	 * XNFS and RFC1094 require us to return ENXIO if argument
531 	 * is not a link. BUGID 1138002.
532 	 */
533 	if (vp->v_type != VLNK && !is_referral) {
534 		VN_RELE(vp);
535 		rl->rl_data = NULL;
536 		rl->rl_status = NFSERR_NXIO;
537 		return;
538 	}
539 
540 	/*
541 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
542 	 */
543 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
544 
545 	if (is_referral) {
546 		char *s;
547 		size_t strsz;
548 
549 		/* Get an artificial symlink based on a referral */
550 		s = build_symlink(vp, cr, &strsz);
551 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
552 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
553 		    vnode_t *, vp, char *, s);
554 		if (s == NULL)
555 			error = EINVAL;
556 		else {
557 			error = 0;
558 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
559 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
560 			kmem_free(s, strsz);
561 		}
562 
563 	} else {
564 
565 		/*
566 		 * Set up io vector to read sym link data
567 		 */
568 		iov.iov_base = rl->rl_data;
569 		iov.iov_len = NFS_MAXPATHLEN;
570 		uio.uio_iov = &iov;
571 		uio.uio_iovcnt = 1;
572 		uio.uio_segflg = UIO_SYSSPACE;
573 		uio.uio_extflg = UIO_COPY_CACHED;
574 		uio.uio_loffset = (offset_t)0;
575 		uio.uio_resid = NFS_MAXPATHLEN;
576 
577 		/*
578 		 * Do the readlink.
579 		 */
580 		error = VOP_READLINK(vp, &uio, cr, NULL);
581 
582 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
583 
584 		if (!error)
585 			rl->rl_data[rl->rl_count] = '\0';
586 
587 	}
588 
589 
590 	VN_RELE(vp);
591 
592 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
593 	name = nfscmd_convname(ca, exi, rl->rl_data,
594 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
595 
596 	if (name != NULL && name != rl->rl_data) {
597 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
598 		rl->rl_data = name;
599 	}
600 
601 	/*
602 	 * XNFS and RFC1094 require us to return ENXIO if argument
603 	 * is not a link. UFS returns EINVAL if this is the case,
604 	 * so we do the mapping here. BUGID 1138002.
605 	 */
606 	if (error == EINVAL)
607 		rl->rl_status = NFSERR_NXIO;
608 	else
609 		rl->rl_status = puterrno(error);
610 
611 }
/*
 * Dispatch helper: return the filehandle for the READLINK request.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
617 /*
618  * Free data allocated by rfs_readlink
619  */
620 void
621 rfs_rlfree(struct nfsrdlnres *rl)
622 {
623 	if (rl->rl_data != NULL)
624 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
625 }
626 
627 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
628 
629 /*
630  * Read data.
631  * Returns some data read from the file at the given fhandle.
632  */
633 /* ARGSUSED */
634 void
635 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
636     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
637 {
638 	vnode_t *vp;
639 	int error;
640 	struct vattr va;
641 	struct iovec iov;
642 	struct uio uio;
643 	mblk_t *mp;
644 	int alloc_err = 0;
645 	int in_crit = 0;
646 	caller_context_t ct;
647 
648 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
649 	if (vp == NULL) {
650 		rr->rr_data = NULL;
651 		rr->rr_status = NFSERR_STALE;
652 		return;
653 	}
654 
655 	if (vp->v_type != VREG) {
656 		VN_RELE(vp);
657 		rr->rr_data = NULL;
658 		rr->rr_status = NFSERR_ISDIR;
659 		return;
660 	}
661 
662 	ct.cc_sysid = 0;
663 	ct.cc_pid = 0;
664 	ct.cc_caller_id = nfs2_srv_caller_id;
665 	ct.cc_flags = CC_DONTBLOCK;
666 
667 	/*
668 	 * Enter the critical region before calling VOP_RWLOCK
669 	 * to avoid a deadlock with write requests.
670 	 */
671 	if (nbl_need_check(vp)) {
672 		nbl_start_crit(vp, RW_READER);
673 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
674 		    0, NULL)) {
675 			nbl_end_crit(vp);
676 			VN_RELE(vp);
677 			rr->rr_data = NULL;
678 			rr->rr_status = NFSERR_ACCES;
679 			return;
680 		}
681 		in_crit = 1;
682 	}
683 
684 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
685 
686 	/* check if a monitor detected a delegation conflict */
687 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
688 		VN_RELE(vp);
689 		/* mark as wouldblock so response is dropped */
690 		curthread->t_flag |= T_WOULDBLOCK;
691 
692 		rr->rr_data = NULL;
693 		return;
694 	}
695 
696 	va.va_mask = AT_ALL;
697 
698 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
699 
700 	if (error) {
701 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
702 		if (in_crit)
703 			nbl_end_crit(vp);
704 
705 		VN_RELE(vp);
706 		rr->rr_data = NULL;
707 		rr->rr_status = puterrno(error);
708 
709 		return;
710 	}
711 
712 	/*
713 	 * This is a kludge to allow reading of files created
714 	 * with no read permission.  The owner of the file
715 	 * is always allowed to read it.
716 	 */
717 	if (crgetuid(cr) != va.va_uid) {
718 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
719 
720 		if (error) {
721 			/*
722 			 * Exec is the same as read over the net because
723 			 * of demand loading.
724 			 */
725 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
726 		}
727 		if (error) {
728 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
729 			if (in_crit)
730 				nbl_end_crit(vp);
731 			VN_RELE(vp);
732 			rr->rr_data = NULL;
733 			rr->rr_status = puterrno(error);
734 
735 			return;
736 		}
737 	}
738 
739 	if (MANDLOCK(vp, va.va_mode)) {
740 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
741 		if (in_crit)
742 			nbl_end_crit(vp);
743 
744 		VN_RELE(vp);
745 		rr->rr_data = NULL;
746 		rr->rr_status = NFSERR_ACCES;
747 
748 		return;
749 	}
750 
751 	rr->rr_ok.rrok_wlist_len = 0;
752 	rr->rr_ok.rrok_wlist = NULL;
753 
754 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
755 		rr->rr_count = 0;
756 		rr->rr_data = NULL;
757 		/*
758 		 * In this case, status is NFS_OK, but there is no data
759 		 * to encode. So set rr_mp to NULL.
760 		 */
761 		rr->rr_mp = NULL;
762 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
763 		if (rr->rr_ok.rrok_wlist)
764 			clist_zero_len(rr->rr_ok.rrok_wlist);
765 		goto done;
766 	}
767 
768 	if (ra->ra_wlist) {
769 		mp = NULL;
770 		rr->rr_mp = NULL;
771 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
772 		if (ra->ra_count > iov.iov_len) {
773 			rr->rr_data = NULL;
774 			rr->rr_status = NFSERR_INVAL;
775 			goto done;
776 		}
777 	} else {
778 		/*
779 		 * mp will contain the data to be sent out in the read reply.
780 		 * This will be freed after the reply has been sent out (by the
781 		 * driver).
782 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
783 		 * that the call to xdrmblk_putmblk() never fails.
784 		 */
785 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
786 		    &alloc_err);
787 		ASSERT(mp != NULL);
788 		ASSERT(alloc_err == 0);
789 
790 		rr->rr_mp = mp;
791 
792 		/*
793 		 * Set up io vector
794 		 */
795 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
796 		iov.iov_len = ra->ra_count;
797 	}
798 
799 	uio.uio_iov = &iov;
800 	uio.uio_iovcnt = 1;
801 	uio.uio_segflg = UIO_SYSSPACE;
802 	uio.uio_extflg = UIO_COPY_CACHED;
803 	uio.uio_loffset = (offset_t)ra->ra_offset;
804 	uio.uio_resid = ra->ra_count;
805 
806 	error = VOP_READ(vp, &uio, 0, cr, &ct);
807 
808 	if (error) {
809 		if (mp)
810 			freeb(mp);
811 
812 		/*
813 		 * check if a monitor detected a delegation conflict and
814 		 * mark as wouldblock so response is dropped
815 		 */
816 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
817 			curthread->t_flag |= T_WOULDBLOCK;
818 		else
819 			rr->rr_status = puterrno(error);
820 
821 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
822 		if (in_crit)
823 			nbl_end_crit(vp);
824 
825 		VN_RELE(vp);
826 		rr->rr_data = NULL;
827 
828 		return;
829 	}
830 
831 	/*
832 	 * Get attributes again so we can send the latest access
833 	 * time to the client side for its cache.
834 	 */
835 	va.va_mask = AT_ALL;
836 
837 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
838 
839 	if (error) {
840 		if (mp)
841 			freeb(mp);
842 
843 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
844 		if (in_crit)
845 			nbl_end_crit(vp);
846 
847 		VN_RELE(vp);
848 		rr->rr_data = NULL;
849 		rr->rr_status = puterrno(error);
850 
851 		return;
852 	}
853 
854 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
855 
856 	if (mp) {
857 		rr->rr_data = (char *)mp->b_datap->db_base;
858 	} else {
859 		if (ra->ra_wlist) {
860 			rr->rr_data = (caddr_t)iov.iov_base;
861 			if (!rdma_setup_read_data2(ra, rr)) {
862 				rr->rr_data = NULL;
863 				rr->rr_status = puterrno(NFSERR_INVAL);
864 			}
865 		}
866 	}
867 done:
868 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
869 	if (in_crit)
870 		nbl_end_crit(vp);
871 
872 	acl_perm(vp, exi, &va, cr);
873 
874 	/* check for overflows */
875 	error = vattr_to_nattr(&va, &rr->rr_attr);
876 
877 	VN_RELE(vp);
878 
879 	rr->rr_status = puterrno(error);
880 }
881 
882 /*
883  * Free data allocated by rfs_read
884  */
885 void
886 rfs_rdfree(struct nfsrdresult *rr)
887 {
888 	mblk_t *mp;
889 
890 	if (rr->rr_status == NFS_OK) {
891 		mp = rr->rr_mp;
892 		if (mp != NULL)
893 			freeb(mp);
894 	}
895 }
896 
/*
 * Dispatch helper: return the filehandle from the READ arguments for
 * the common dispatch code.
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
902 
903 #define	MAX_IOVECS	12
904 
905 #ifdef DEBUG
906 static int rfs_write_sync_hits = 0;
907 static int rfs_write_sync_misses = 0;
908 #endif
909 
910 /*
911  * Write data to file.
912  * Returns attributes of a file after writing some data to it.
913  *
914  * Any changes made here, especially in error handling might have
915  * to also be done in rfs_write (which clusters write requests).
916  */
917 /* ARGSUSED */
918 void
919 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
920     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
921 {
922 	int error;
923 	vnode_t *vp;
924 	rlim64_t rlimit;
925 	struct vattr va;
926 	struct uio uio;
927 	struct iovec iov[MAX_IOVECS];
928 	mblk_t *m;
929 	struct iovec *iovp;
930 	int iovcnt;
931 	cred_t *savecred;
932 	int in_crit = 0;
933 	caller_context_t ct;
934 
935 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
936 	if (vp == NULL) {
937 		ns->ns_status = NFSERR_STALE;
938 		return;
939 	}
940 
941 	if (rdonly(ro, vp)) {
942 		VN_RELE(vp);
943 		ns->ns_status = NFSERR_ROFS;
944 		return;
945 	}
946 
947 	if (vp->v_type != VREG) {
948 		VN_RELE(vp);
949 		ns->ns_status = NFSERR_ISDIR;
950 		return;
951 	}
952 
953 	ct.cc_sysid = 0;
954 	ct.cc_pid = 0;
955 	ct.cc_caller_id = nfs2_srv_caller_id;
956 	ct.cc_flags = CC_DONTBLOCK;
957 
958 	va.va_mask = AT_UID|AT_MODE;
959 
960 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
961 
962 	if (error) {
963 		VN_RELE(vp);
964 		ns->ns_status = puterrno(error);
965 
966 		return;
967 	}
968 
969 	if (crgetuid(cr) != va.va_uid) {
970 		/*
971 		 * This is a kludge to allow writes of files created
972 		 * with read only permission.  The owner of the file
973 		 * is always allowed to write it.
974 		 */
975 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
976 
977 		if (error) {
978 			VN_RELE(vp);
979 			ns->ns_status = puterrno(error);
980 			return;
981 		}
982 	}
983 
984 	/*
985 	 * Can't access a mandatory lock file.  This might cause
986 	 * the NFS service thread to block forever waiting for a
987 	 * lock to be released that will never be released.
988 	 */
989 	if (MANDLOCK(vp, va.va_mode)) {
990 		VN_RELE(vp);
991 		ns->ns_status = NFSERR_ACCES;
992 		return;
993 	}
994 
995 	/*
996 	 * We have to enter the critical region before calling VOP_RWLOCK
997 	 * to avoid a deadlock with ufs.
998 	 */
999 	if (nbl_need_check(vp)) {
1000 		nbl_start_crit(vp, RW_READER);
1001 		in_crit = 1;
1002 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1003 		    wa->wa_count, 0, NULL)) {
1004 			error = EACCES;
1005 			goto out;
1006 		}
1007 	}
1008 
1009 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1010 
1011 	/* check if a monitor detected a delegation conflict */
1012 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1013 		VN_RELE(vp);
1014 		/* mark as wouldblock so response is dropped */
1015 		curthread->t_flag |= T_WOULDBLOCK;
1016 		return;
1017 	}
1018 
1019 	if (wa->wa_data || wa->wa_rlist) {
1020 		/* Do the RDMA thing if necessary */
1021 		if (wa->wa_rlist) {
1022 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1023 			iov[0].iov_len = wa->wa_count;
1024 		} else  {
1025 			iov[0].iov_base = wa->wa_data;
1026 			iov[0].iov_len = wa->wa_count;
1027 		}
1028 		uio.uio_iov = iov;
1029 		uio.uio_iovcnt = 1;
1030 		uio.uio_segflg = UIO_SYSSPACE;
1031 		uio.uio_extflg = UIO_COPY_DEFAULT;
1032 		uio.uio_loffset = (offset_t)wa->wa_offset;
1033 		uio.uio_resid = wa->wa_count;
1034 		/*
1035 		 * The limit is checked on the client. We
1036 		 * should allow any size writes here.
1037 		 */
1038 		uio.uio_llimit = curproc->p_fsz_ctl;
1039 		rlimit = uio.uio_llimit - wa->wa_offset;
1040 		if (rlimit < (rlim64_t)uio.uio_resid)
1041 			uio.uio_resid = (uint_t)rlimit;
1042 
1043 		/*
1044 		 * for now we assume no append mode
1045 		 */
1046 		/*
1047 		 * We're changing creds because VM may fault and we need
1048 		 * the cred of the current thread to be used if quota
1049 		 * checking is enabled.
1050 		 */
1051 		savecred = curthread->t_cred;
1052 		curthread->t_cred = cr;
1053 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1054 		curthread->t_cred = savecred;
1055 	} else {
1056 		iovcnt = 0;
1057 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1058 			iovcnt++;
1059 		if (iovcnt <= MAX_IOVECS) {
1060 #ifdef DEBUG
1061 			rfs_write_sync_hits++;
1062 #endif
1063 			iovp = iov;
1064 		} else {
1065 #ifdef DEBUG
1066 			rfs_write_sync_misses++;
1067 #endif
1068 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1069 		}
1070 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1071 		uio.uio_iov = iovp;
1072 		uio.uio_iovcnt = iovcnt;
1073 		uio.uio_segflg = UIO_SYSSPACE;
1074 		uio.uio_extflg = UIO_COPY_DEFAULT;
1075 		uio.uio_loffset = (offset_t)wa->wa_offset;
1076 		uio.uio_resid = wa->wa_count;
1077 		/*
1078 		 * The limit is checked on the client. We
1079 		 * should allow any size writes here.
1080 		 */
1081 		uio.uio_llimit = curproc->p_fsz_ctl;
1082 		rlimit = uio.uio_llimit - wa->wa_offset;
1083 		if (rlimit < (rlim64_t)uio.uio_resid)
1084 			uio.uio_resid = (uint_t)rlimit;
1085 
1086 		/*
1087 		 * For now we assume no append mode.
1088 		 */
1089 		/*
1090 		 * We're changing creds because VM may fault and we need
1091 		 * the cred of the current thread to be used if quota
1092 		 * checking is enabled.
1093 		 */
1094 		savecred = curthread->t_cred;
1095 		curthread->t_cred = cr;
1096 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1097 		curthread->t_cred = savecred;
1098 
1099 		if (iovp != iov)
1100 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1101 	}
1102 
1103 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1104 
1105 	if (!error) {
1106 		/*
1107 		 * Get attributes again so we send the latest mod
1108 		 * time to the client side for its cache.
1109 		 */
1110 		va.va_mask = AT_ALL;	/* now we want everything */
1111 
1112 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1113 
1114 		/* check for overflows */
1115 		if (!error) {
1116 			acl_perm(vp, exi, &va, cr);
1117 			error = vattr_to_nattr(&va, &ns->ns_attr);
1118 		}
1119 	}
1120 
1121 out:
1122 	if (in_crit)
1123 		nbl_end_crit(vp);
1124 	VN_RELE(vp);
1125 
1126 	/* check if a monitor detected a delegation conflict */
1127 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1128 		/* mark as wouldblock so response is dropped */
1129 		curthread->t_flag |= T_WOULDBLOCK;
1130 	else
1131 		ns->ns_status = puterrno(error);
1132 
1133 }
1134 
/*
 * One pending WRITE request queued for clustered (async) processing.
 * Requests against the same filehandle are chained through `list' in
 * ascending offset order; the thread that starts a cluster performs
 * the I/O for every entry and wakes the waiters.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* the client's WRITE arguments */
	struct nfsattrstat *ns;		/* where to store the reply/status */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* credentials for the write */
	bool_t ro;			/* export is read-only for this req */
	kthread_t *thread;		/* service thread parked on this entry */
	struct rfs_async_write *list;	/* next request in the same cluster */
};

/*
 * A cluster of pending writes to a single file, linked on
 * rfs_async_write_head.  Waiting threads sleep on `cv' until the
 * cluster owner fills in their ns_status.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file all entries apply to */
	kcondvar_t cv;			/* waiters signalled when work done */
	struct rfs_async_write *list;	/* requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};

/* Active clusters; both protected by rfs_async_write_lock. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1163 
1164 /*
1165  * Write data to file.
1166  * Returns attributes of a file after writing some data to it.
1167  */
1168 void
1169 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1170     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1171 {
1172 	int error;
1173 	vnode_t *vp;
1174 	rlim64_t rlimit;
1175 	struct vattr va;
1176 	struct uio uio;
1177 	struct rfs_async_write_list *lp;
1178 	struct rfs_async_write_list *nlp;
1179 	struct rfs_async_write *rp;
1180 	struct rfs_async_write *nrp;
1181 	struct rfs_async_write *trp;
1182 	struct rfs_async_write *lrp;
1183 	int data_written;
1184 	int iovcnt;
1185 	mblk_t *m;
1186 	struct iovec *iovp;
1187 	struct iovec *niovp;
1188 	struct iovec iov[MAXCLIOVECS];
1189 	int count;
1190 	int rcount;
1191 	uint_t off;
1192 	uint_t len;
1193 	struct rfs_async_write nrpsp;
1194 	struct rfs_async_write_list nlpsp;
1195 	ushort_t t_flag;
1196 	cred_t *savecred;
1197 	int in_crit = 0;
1198 	caller_context_t ct;
1199 
1200 	if (!rfs_write_async) {
1201 		rfs_write_sync(wa, ns, exi, req, cr, ro);
1202 		return;
1203 	}
1204 
1205 	/*
1206 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1207 	 * is considered an OK.
1208 	 */
1209 	ns->ns_status = RFSWRITE_INITVAL;
1210 
1211 	nrp = &nrpsp;
1212 	nrp->wa = wa;
1213 	nrp->ns = ns;
1214 	nrp->req = req;
1215 	nrp->cr = cr;
1216 	nrp->ro = ro;
1217 	nrp->thread = curthread;
1218 
1219 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1220 
1221 	/*
1222 	 * Look to see if there is already a cluster started
1223 	 * for this file.
1224 	 */
1225 	mutex_enter(&rfs_async_write_lock);
1226 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1227 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1228 		    sizeof (fhandle_t)) == 0)
1229 			break;
1230 	}
1231 
1232 	/*
1233 	 * If lp is non-NULL, then there is already a cluster
1234 	 * started.  We need to place ourselves in the cluster
1235 	 * list in the right place as determined by starting
1236 	 * offset.  Conflicts with non-blocking mandatory locked
1237 	 * regions will be checked when the cluster is processed.
1238 	 */
1239 	if (lp != NULL) {
1240 		rp = lp->list;
1241 		trp = NULL;
1242 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1243 			trp = rp;
1244 			rp = rp->list;
1245 		}
1246 		nrp->list = rp;
1247 		if (trp == NULL)
1248 			lp->list = nrp;
1249 		else
1250 			trp->list = nrp;
1251 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1252 			cv_wait(&lp->cv, &rfs_async_write_lock);
1253 		mutex_exit(&rfs_async_write_lock);
1254 
1255 		return;
1256 	}
1257 
1258 	/*
1259 	 * No cluster started yet, start one and add ourselves
1260 	 * to the list of clusters.
1261 	 */
1262 	nrp->list = NULL;
1263 
1264 	nlp = &nlpsp;
1265 	nlp->fhp = &wa->wa_fhandle;
1266 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1267 	nlp->list = nrp;
1268 	nlp->next = NULL;
1269 
1270 	if (rfs_async_write_head == NULL) {
1271 		rfs_async_write_head = nlp;
1272 	} else {
1273 		lp = rfs_async_write_head;
1274 		while (lp->next != NULL)
1275 			lp = lp->next;
1276 		lp->next = nlp;
1277 	}
1278 	mutex_exit(&rfs_async_write_lock);
1279 
1280 	/*
1281 	 * Convert the file handle common to all of the requests
1282 	 * in this cluster to a vnode.
1283 	 */
1284 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1285 	if (vp == NULL) {
1286 		mutex_enter(&rfs_async_write_lock);
1287 		if (rfs_async_write_head == nlp)
1288 			rfs_async_write_head = nlp->next;
1289 		else {
1290 			lp = rfs_async_write_head;
1291 			while (lp->next != nlp)
1292 				lp = lp->next;
1293 			lp->next = nlp->next;
1294 		}
1295 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1296 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1297 			rp->ns->ns_status = NFSERR_STALE;
1298 			rp->thread->t_flag |= t_flag;
1299 		}
1300 		cv_broadcast(&nlp->cv);
1301 		mutex_exit(&rfs_async_write_lock);
1302 
1303 		return;
1304 	}
1305 
1306 	/*
1307 	 * Can only write regular files.  Attempts to write any
1308 	 * other file types fail with EISDIR.
1309 	 */
1310 	if (vp->v_type != VREG) {
1311 		VN_RELE(vp);
1312 		mutex_enter(&rfs_async_write_lock);
1313 		if (rfs_async_write_head == nlp)
1314 			rfs_async_write_head = nlp->next;
1315 		else {
1316 			lp = rfs_async_write_head;
1317 			while (lp->next != nlp)
1318 				lp = lp->next;
1319 			lp->next = nlp->next;
1320 		}
1321 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1322 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1323 			rp->ns->ns_status = NFSERR_ISDIR;
1324 			rp->thread->t_flag |= t_flag;
1325 		}
1326 		cv_broadcast(&nlp->cv);
1327 		mutex_exit(&rfs_async_write_lock);
1328 
1329 		return;
1330 	}
1331 
1332 	/*
1333 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1334 	 * deadlock with ufs.
1335 	 */
1336 	if (nbl_need_check(vp)) {
1337 		nbl_start_crit(vp, RW_READER);
1338 		in_crit = 1;
1339 	}
1340 
1341 	ct.cc_sysid = 0;
1342 	ct.cc_pid = 0;
1343 	ct.cc_caller_id = nfs2_srv_caller_id;
1344 	ct.cc_flags = CC_DONTBLOCK;
1345 
1346 	/*
1347 	 * Lock the file for writing.  This operation provides
1348 	 * the delay which allows clusters to grow.
1349 	 */
1350 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1351 
1352 	/* check if a monitor detected a delegation conflict */
1353 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1354 		if (in_crit)
1355 			nbl_end_crit(vp);
1356 		VN_RELE(vp);
1357 		/* mark as wouldblock so response is dropped */
1358 		curthread->t_flag |= T_WOULDBLOCK;
1359 		mutex_enter(&rfs_async_write_lock);
1360 		if (rfs_async_write_head == nlp)
1361 			rfs_async_write_head = nlp->next;
1362 		else {
1363 			lp = rfs_async_write_head;
1364 			while (lp->next != nlp)
1365 				lp = lp->next;
1366 			lp->next = nlp->next;
1367 		}
1368 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1369 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1370 				rp->ns->ns_status = puterrno(error);
1371 				rp->thread->t_flag |= T_WOULDBLOCK;
1372 			}
1373 		}
1374 		cv_broadcast(&nlp->cv);
1375 		mutex_exit(&rfs_async_write_lock);
1376 
1377 		return;
1378 	}
1379 
1380 	/*
1381 	 * Disconnect this cluster from the list of clusters.
1382 	 * The cluster that is being dealt with must be fixed
1383 	 * in size after this point, so there is no reason
1384 	 * to leave it on the list so that new requests can
1385 	 * find it.
1386 	 *
1387 	 * The algorithm is that the first write request will
1388 	 * create a cluster, convert the file handle to a
1389 	 * vnode pointer, and then lock the file for writing.
1390 	 * This request is not likely to be clustered with
1391 	 * any others.  However, the next request will create
1392 	 * a new cluster and be blocked in VOP_RWLOCK while
1393 	 * the first request is being processed.  This delay
1394 	 * will allow more requests to be clustered in this
1395 	 * second cluster.
1396 	 */
1397 	mutex_enter(&rfs_async_write_lock);
1398 	if (rfs_async_write_head == nlp)
1399 		rfs_async_write_head = nlp->next;
1400 	else {
1401 		lp = rfs_async_write_head;
1402 		while (lp->next != nlp)
1403 			lp = lp->next;
1404 		lp->next = nlp->next;
1405 	}
1406 	mutex_exit(&rfs_async_write_lock);
1407 
1408 	/*
1409 	 * Step through the list of requests in this cluster.
1410 	 * We need to check permissions to make sure that all
1411 	 * of the requests have sufficient permission to write
1412 	 * the file.  A cluster can be composed of requests
1413 	 * from different clients and different users on each
1414 	 * client.
1415 	 *
1416 	 * As a side effect, we also calculate the size of the
1417 	 * byte range that this cluster encompasses.
1418 	 */
1419 	rp = nlp->list;
1420 	off = rp->wa->wa_offset;
1421 	len = (uint_t)0;
1422 	do {
1423 		if (rdonly(rp->ro, vp)) {
1424 			rp->ns->ns_status = NFSERR_ROFS;
1425 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1426 			rp->thread->t_flag |= t_flag;
1427 			continue;
1428 		}
1429 
1430 		va.va_mask = AT_UID|AT_MODE;
1431 
1432 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1433 
1434 		if (!error) {
1435 			if (crgetuid(rp->cr) != va.va_uid) {
1436 				/*
1437 				 * This is a kludge to allow writes of files
1438 				 * created with read only permission.  The
1439 				 * owner of the file is always allowed to
1440 				 * write it.
1441 				 */
1442 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1443 			}
1444 			if (!error && MANDLOCK(vp, va.va_mode))
1445 				error = EACCES;
1446 		}
1447 
1448 		/*
1449 		 * Check for a conflict with a nbmand-locked region.
1450 		 */
1451 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1452 		    rp->wa->wa_count, 0, NULL)) {
1453 			error = EACCES;
1454 		}
1455 
1456 		if (error) {
1457 			rp->ns->ns_status = puterrno(error);
1458 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1459 			rp->thread->t_flag |= t_flag;
1460 			continue;
1461 		}
1462 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1463 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1464 	} while ((rp = rp->list) != NULL);
1465 
1466 	/*
1467 	 * Step through the cluster attempting to gather as many
1468 	 * requests which are contiguous as possible.  These
1469 	 * contiguous requests are handled via one call to VOP_WRITE
1470 	 * instead of different calls to VOP_WRITE.  We also keep
1471 	 * track of the fact that any data was written.
1472 	 */
1473 	rp = nlp->list;
1474 	data_written = 0;
1475 	do {
1476 		/*
1477 		 * Skip any requests which are already marked as having an
1478 		 * error.
1479 		 */
1480 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1481 			rp = rp->list;
1482 			continue;
1483 		}
1484 
1485 		/*
1486 		 * Count the number of iovec's which are required
1487 		 * to handle this set of requests.  One iovec is
1488 		 * needed for each data buffer, whether addressed
1489 		 * by wa_data or by the b_rptr pointers in the
1490 		 * mblk chains.
1491 		 */
1492 		iovcnt = 0;
1493 		lrp = rp;
1494 		for (;;) {
1495 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1496 				iovcnt++;
1497 			else {
1498 				m = lrp->wa->wa_mblk;
1499 				while (m != NULL) {
1500 					iovcnt++;
1501 					m = m->b_cont;
1502 				}
1503 			}
1504 			if (lrp->list == NULL ||
1505 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1506 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1507 			    lrp->list->wa->wa_offset) {
1508 				lrp = lrp->list;
1509 				break;
1510 			}
1511 			lrp = lrp->list;
1512 		}
1513 
1514 		if (iovcnt <= MAXCLIOVECS) {
1515 #ifdef DEBUG
1516 			rfs_write_hits++;
1517 #endif
1518 			niovp = iov;
1519 		} else {
1520 #ifdef DEBUG
1521 			rfs_write_misses++;
1522 #endif
1523 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1524 		}
1525 		/*
1526 		 * Put together the scatter/gather iovecs.
1527 		 */
1528 		iovp = niovp;
1529 		trp = rp;
1530 		count = 0;
1531 		do {
1532 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1533 				if (trp->wa->wa_rlist) {
1534 					iovp->iov_base =
1535 					    (char *)((trp->wa->wa_rlist)->
1536 					    u.c_daddr3);
1537 					iovp->iov_len = trp->wa->wa_count;
1538 				} else  {
1539 					iovp->iov_base = trp->wa->wa_data;
1540 					iovp->iov_len = trp->wa->wa_count;
1541 				}
1542 				iovp++;
1543 			} else {
1544 				m = trp->wa->wa_mblk;
1545 				rcount = trp->wa->wa_count;
1546 				while (m != NULL) {
1547 					iovp->iov_base = (caddr_t)m->b_rptr;
1548 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1549 					rcount -= iovp->iov_len;
1550 					if (rcount < 0)
1551 						iovp->iov_len += rcount;
1552 					iovp++;
1553 					if (rcount <= 0)
1554 						break;
1555 					m = m->b_cont;
1556 				}
1557 			}
1558 			count += trp->wa->wa_count;
1559 			trp = trp->list;
1560 		} while (trp != lrp);
1561 
1562 		uio.uio_iov = niovp;
1563 		uio.uio_iovcnt = iovcnt;
1564 		uio.uio_segflg = UIO_SYSSPACE;
1565 		uio.uio_extflg = UIO_COPY_DEFAULT;
1566 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1567 		uio.uio_resid = count;
1568 		/*
1569 		 * The limit is checked on the client. We
1570 		 * should allow any size writes here.
1571 		 */
1572 		uio.uio_llimit = curproc->p_fsz_ctl;
1573 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1574 		if (rlimit < (rlim64_t)uio.uio_resid)
1575 			uio.uio_resid = (uint_t)rlimit;
1576 
1577 		/*
1578 		 * For now we assume no append mode.
1579 		 */
1580 
1581 		/*
1582 		 * We're changing creds because VM may fault
1583 		 * and we need the cred of the current
1584 		 * thread to be used if quota * checking is
1585 		 * enabled.
1586 		 */
1587 		savecred = curthread->t_cred;
1588 		curthread->t_cred = cr;
1589 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1590 		curthread->t_cred = savecred;
1591 
1592 		/* check if a monitor detected a delegation conflict */
1593 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1594 			/* mark as wouldblock so response is dropped */
1595 			curthread->t_flag |= T_WOULDBLOCK;
1596 
1597 		if (niovp != iov)
1598 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1599 
1600 		if (!error) {
1601 			data_written = 1;
1602 			/*
1603 			 * Get attributes again so we send the latest mod
1604 			 * time to the client side for its cache.
1605 			 */
1606 			va.va_mask = AT_ALL;	/* now we want everything */
1607 
1608 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1609 
1610 			if (!error)
1611 				acl_perm(vp, exi, &va, rp->cr);
1612 		}
1613 
1614 		/*
1615 		 * Fill in the status responses for each request
1616 		 * which was just handled.  Also, copy the latest
1617 		 * attributes in to the attribute responses if
1618 		 * appropriate.
1619 		 */
1620 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1621 		do {
1622 			rp->thread->t_flag |= t_flag;
1623 			/* check for overflows */
1624 			if (!error) {
1625 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1626 			}
1627 			rp->ns->ns_status = puterrno(error);
1628 			rp = rp->list;
1629 		} while (rp != lrp);
1630 	} while (rp != NULL);
1631 
1632 	/*
1633 	 * If any data was written at all, then we need to flush
1634 	 * the data and metadata to stable storage.
1635 	 */
1636 	if (data_written) {
1637 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1638 
1639 		if (!error) {
1640 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1641 		}
1642 	}
1643 
1644 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1645 
1646 	if (in_crit)
1647 		nbl_end_crit(vp);
1648 	VN_RELE(vp);
1649 
1650 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1651 	mutex_enter(&rfs_async_write_lock);
1652 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1653 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1654 			rp->ns->ns_status = puterrno(error);
1655 			rp->thread->t_flag |= t_flag;
1656 		}
1657 	}
1658 	cv_broadcast(&nlp->cv);
1659 	mutex_exit(&rfs_async_write_lock);
1660 
1661 }
1662 
1663 void *
1664 rfs_write_getfh(struct nfswriteargs *wa)
1665 {
1666 	return (&wa->wa_fhandle);
1667 }
1668 
1669 /*
1670  * Create a file.
1671  * Creates a file with given attributes and returns those attributes
1672  * and an fhandle for the new file.
1673  */
1674 void
1675 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1676     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1677 {
1678 	int error;
1679 	int lookuperr;
1680 	int in_crit = 0;
1681 	struct vattr va;
1682 	vnode_t *vp;
1683 	vnode_t *realvp;
1684 	vnode_t *dvp;
1685 	char *name = args->ca_da.da_name;
1686 	vnode_t *tvp = NULL;
1687 	int mode;
1688 	int lookup_ok;
1689 	bool_t trunc;
1690 	struct sockaddr *ca;
1691 
1692 	/*
1693 	 * Disallow NULL paths
1694 	 */
1695 	if (name == NULL || *name == '\0') {
1696 		dr->dr_status = NFSERR_ACCES;
1697 		return;
1698 	}
1699 
1700 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1701 	if (dvp == NULL) {
1702 		dr->dr_status = NFSERR_STALE;
1703 		return;
1704 	}
1705 
1706 	error = sattr_to_vattr(args->ca_sa, &va);
1707 	if (error) {
1708 		dr->dr_status = puterrno(error);
1709 		return;
1710 	}
1711 
1712 	/*
1713 	 * Must specify the mode.
1714 	 */
1715 	if (!(va.va_mask & AT_MODE)) {
1716 		VN_RELE(dvp);
1717 		dr->dr_status = NFSERR_INVAL;
1718 		return;
1719 	}
1720 
1721 	/*
1722 	 * This is a completely gross hack to make mknod
1723 	 * work over the wire until we can wack the protocol
1724 	 */
1725 	if ((va.va_mode & IFMT) == IFCHR) {
1726 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1727 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1728 		else {
1729 			va.va_type = VCHR;
1730 			/*
1731 			 * uncompress the received dev_t
1732 			 * if the top half is zero indicating a request
1733 			 * from an `older style' OS.
1734 			 */
1735 			if ((va.va_size & 0xffff0000) == 0)
1736 				va.va_rdev = nfsv2_expdev(va.va_size);
1737 			else
1738 				va.va_rdev = (dev_t)va.va_size;
1739 		}
1740 		va.va_mask &= ~AT_SIZE;
1741 	} else if ((va.va_mode & IFMT) == IFBLK) {
1742 		va.va_type = VBLK;
1743 		/*
1744 		 * uncompress the received dev_t
1745 		 * if the top half is zero indicating a request
1746 		 * from an `older style' OS.
1747 		 */
1748 		if ((va.va_size & 0xffff0000) == 0)
1749 			va.va_rdev = nfsv2_expdev(va.va_size);
1750 		else
1751 			va.va_rdev = (dev_t)va.va_size;
1752 		va.va_mask &= ~AT_SIZE;
1753 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1754 		va.va_type = VSOCK;
1755 	} else {
1756 		va.va_type = VREG;
1757 	}
1758 	va.va_mode &= ~IFMT;
1759 	va.va_mask |= AT_TYPE;
1760 
1761 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1762 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1763 	    MAXPATHLEN);
1764 	if (name == NULL) {
1765 		dr->dr_status = puterrno(EINVAL);
1766 		return;
1767 	}
1768 
1769 	/*
1770 	 * Why was the choice made to use VWRITE as the mode to the
1771 	 * call to VOP_CREATE ? This results in a bug.  When a client
1772 	 * opens a file that already exists and is RDONLY, the second
1773 	 * open fails with an EACESS because of the mode.
1774 	 * bug ID 1054648.
1775 	 */
1776 	lookup_ok = 0;
1777 	mode = VWRITE;
1778 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1779 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1780 		    NULL, NULL, NULL);
1781 		if (!error) {
1782 			struct vattr at;
1783 
1784 			lookup_ok = 1;
1785 			at.va_mask = AT_MODE;
1786 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1787 			if (!error)
1788 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1789 			VN_RELE(tvp);
1790 			tvp = NULL;
1791 		}
1792 	}
1793 
1794 	if (!lookup_ok) {
1795 		if (rdonly(ro, dvp)) {
1796 			error = EROFS;
1797 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1798 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1799 			error = EPERM;
1800 		} else {
1801 			error = 0;
1802 		}
1803 	}
1804 
1805 	/*
1806 	 * If file size is being modified on an already existing file
1807 	 * make sure that there are no conflicting non-blocking mandatory
1808 	 * locks in the region being manipulated. Return EACCES if there
1809 	 * are conflicting locks.
1810 	 */
1811 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1812 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1813 		    NULL, NULL, NULL);
1814 
1815 		if (!lookuperr &&
1816 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1817 			VN_RELE(tvp);
1818 			curthread->t_flag |= T_WOULDBLOCK;
1819 			goto out;
1820 		}
1821 
1822 		if (!lookuperr && nbl_need_check(tvp)) {
1823 			/*
1824 			 * The file exists. Now check if it has any
1825 			 * conflicting non-blocking mandatory locks
1826 			 * in the region being changed.
1827 			 */
1828 			struct vattr bva;
1829 			u_offset_t offset;
1830 			ssize_t length;
1831 
1832 			nbl_start_crit(tvp, RW_READER);
1833 			in_crit = 1;
1834 
1835 			bva.va_mask = AT_SIZE;
1836 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1837 			if (!error) {
1838 				if (va.va_size < bva.va_size) {
1839 					offset = va.va_size;
1840 					length = bva.va_size - va.va_size;
1841 				} else {
1842 					offset = bva.va_size;
1843 					length = va.va_size - bva.va_size;
1844 				}
1845 				if (length) {
1846 					if (nbl_conflict(tvp, NBL_WRITE,
1847 					    offset, length, 0, NULL)) {
1848 						error = EACCES;
1849 					}
1850 				}
1851 			}
1852 			if (error) {
1853 				nbl_end_crit(tvp);
1854 				VN_RELE(tvp);
1855 				in_crit = 0;
1856 			}
1857 		} else if (tvp != NULL) {
1858 			VN_RELE(tvp);
1859 		}
1860 	}
1861 
1862 	if (!error) {
1863 		/*
1864 		 * If filesystem is shared with nosuid the remove any
1865 		 * setuid/setgid bits on create.
1866 		 */
1867 		if (va.va_type == VREG &&
1868 		    exi->exi_export.ex_flags & EX_NOSUID)
1869 			va.va_mode &= ~(VSUID | VSGID);
1870 
1871 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1872 		    NULL, NULL);
1873 
1874 		if (!error) {
1875 
1876 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1877 				trunc = TRUE;
1878 			else
1879 				trunc = FALSE;
1880 
1881 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1882 				VN_RELE(vp);
1883 				curthread->t_flag |= T_WOULDBLOCK;
1884 				goto out;
1885 			}
1886 			va.va_mask = AT_ALL;
1887 
1888 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1889 
1890 			/* check for overflows */
1891 			if (!error) {
1892 				acl_perm(vp, exi, &va, cr);
1893 				error = vattr_to_nattr(&va, &dr->dr_attr);
1894 				if (!error) {
1895 					error = makefh(&dr->dr_fhandle, vp,
1896 					    exi);
1897 				}
1898 			}
1899 			/*
1900 			 * Force modified metadata out to stable storage.
1901 			 *
1902 			 * if a underlying vp exists, pass it to VOP_FSYNC
1903 			 */
1904 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1905 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1906 			else
1907 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1908 			VN_RELE(vp);
1909 		}
1910 
1911 		if (in_crit) {
1912 			nbl_end_crit(tvp);
1913 			VN_RELE(tvp);
1914 		}
1915 	}
1916 
1917 	/*
1918 	 * Force modified data and metadata out to stable storage.
1919 	 */
1920 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1921 
1922 out:
1923 
1924 	VN_RELE(dvp);
1925 
1926 	dr->dr_status = puterrno(error);
1927 
1928 	if (name != args->ca_da.da_name)
1929 		kmem_free(name, MAXPATHLEN);
1930 }
1931 void *
1932 rfs_create_getfh(struct nfscreatargs *args)
1933 {
1934 	return (args->ca_da.da_fhandle);
1935 }
1936 
1937 /*
1938  * Remove a file.
1939  * Remove named file from parent directory.
1940  */
1941 /* ARGSUSED */
1942 void
1943 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1944     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1945 {
1946 	int error = 0;
1947 	vnode_t *vp;
1948 	vnode_t *targvp;
1949 	int in_crit = 0;
1950 
1951 	/*
1952 	 * Disallow NULL paths
1953 	 */
1954 	if (da->da_name == NULL || *da->da_name == '\0') {
1955 		*status = NFSERR_ACCES;
1956 		return;
1957 	}
1958 
1959 	vp = nfs_fhtovp(da->da_fhandle, exi);
1960 	if (vp == NULL) {
1961 		*status = NFSERR_STALE;
1962 		return;
1963 	}
1964 
1965 	if (rdonly(ro, vp)) {
1966 		VN_RELE(vp);
1967 		*status = NFSERR_ROFS;
1968 		return;
1969 	}
1970 
1971 	/*
1972 	 * Check for a conflict with a non-blocking mandatory share reservation.
1973 	 */
1974 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1975 	    NULL, cr, NULL, NULL, NULL);
1976 	if (error != 0) {
1977 		VN_RELE(vp);
1978 		*status = puterrno(error);
1979 		return;
1980 	}
1981 
1982 	/*
1983 	 * If the file is delegated to an v4 client, then initiate
1984 	 * recall and drop this request (by setting T_WOULDBLOCK).
1985 	 * The client will eventually re-transmit the request and
1986 	 * (hopefully), by then, the v4 client will have returned
1987 	 * the delegation.
1988 	 */
1989 
1990 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1991 		VN_RELE(vp);
1992 		VN_RELE(targvp);
1993 		curthread->t_flag |= T_WOULDBLOCK;
1994 		return;
1995 	}
1996 
1997 	if (nbl_need_check(targvp)) {
1998 		nbl_start_crit(targvp, RW_READER);
1999 		in_crit = 1;
2000 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2001 			error = EACCES;
2002 			goto out;
2003 		}
2004 	}
2005 
2006 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2007 
2008 	/*
2009 	 * Force modified data and metadata out to stable storage.
2010 	 */
2011 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2012 
2013 out:
2014 	if (in_crit)
2015 		nbl_end_crit(targvp);
2016 	VN_RELE(targvp);
2017 	VN_RELE(vp);
2018 
2019 	*status = puterrno(error);
2020 
2021 }
2022 
2023 void *
2024 rfs_remove_getfh(struct nfsdiropargs *da)
2025 {
2026 	return (da->da_fhandle);
2027 }
2028 
2029 /*
2030  * rename a file
2031  * Give a file (from) a new name (to).
2032  */
2033 /* ARGSUSED */
2034 void
2035 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2036     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2037 {
2038 	int error = 0;
2039 	vnode_t *fromvp;
2040 	vnode_t *tovp;
2041 	struct exportinfo *to_exi;
2042 	fhandle_t *fh;
2043 	vnode_t *srcvp;
2044 	vnode_t *targvp;
2045 	int in_crit = 0;
2046 
2047 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2048 	if (fromvp == NULL) {
2049 		*status = NFSERR_STALE;
2050 		return;
2051 	}
2052 
2053 	fh = args->rna_to.da_fhandle;
2054 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2055 	if (to_exi == NULL) {
2056 		VN_RELE(fromvp);
2057 		*status = NFSERR_ACCES;
2058 		return;
2059 	}
2060 	exi_rele(to_exi);
2061 
2062 	if (to_exi != exi) {
2063 		VN_RELE(fromvp);
2064 		*status = NFSERR_XDEV;
2065 		return;
2066 	}
2067 
2068 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2069 	if (tovp == NULL) {
2070 		VN_RELE(fromvp);
2071 		*status = NFSERR_STALE;
2072 		return;
2073 	}
2074 
2075 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2076 		VN_RELE(tovp);
2077 		VN_RELE(fromvp);
2078 		*status = NFSERR_NOTDIR;
2079 		return;
2080 	}
2081 
2082 	/*
2083 	 * Disallow NULL paths
2084 	 */
2085 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2086 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2087 		VN_RELE(tovp);
2088 		VN_RELE(fromvp);
2089 		*status = NFSERR_ACCES;
2090 		return;
2091 	}
2092 
2093 	if (rdonly(ro, tovp)) {
2094 		VN_RELE(tovp);
2095 		VN_RELE(fromvp);
2096 		*status = NFSERR_ROFS;
2097 		return;
2098 	}
2099 
2100 	/*
2101 	 * Check for a conflict with a non-blocking mandatory share reservation.
2102 	 */
2103 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2104 	    NULL, cr, NULL, NULL, NULL);
2105 	if (error != 0) {
2106 		VN_RELE(tovp);
2107 		VN_RELE(fromvp);
2108 		*status = puterrno(error);
2109 		return;
2110 	}
2111 
2112 	/* Check for delegations on the source file */
2113 
2114 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2115 		VN_RELE(tovp);
2116 		VN_RELE(fromvp);
2117 		VN_RELE(srcvp);
2118 		curthread->t_flag |= T_WOULDBLOCK;
2119 		return;
2120 	}
2121 
2122 	/* Check for delegation on the file being renamed over, if it exists */
2123 
2124 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2125 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2126 	    NULL, NULL, NULL) == 0) {
2127 
2128 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2129 			VN_RELE(tovp);
2130 			VN_RELE(fromvp);
2131 			VN_RELE(srcvp);
2132 			VN_RELE(targvp);
2133 			curthread->t_flag |= T_WOULDBLOCK;
2134 			return;
2135 		}
2136 		VN_RELE(targvp);
2137 	}
2138 
2139 
2140 	if (nbl_need_check(srcvp)) {
2141 		nbl_start_crit(srcvp, RW_READER);
2142 		in_crit = 1;
2143 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2144 			error = EACCES;
2145 			goto out;
2146 		}
2147 	}
2148 
2149 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2150 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2151 
2152 	if (error == 0)
2153 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2154 		    strlen(args->rna_to.da_name));
2155 
2156 	/*
2157 	 * Force modified data and metadata out to stable storage.
2158 	 */
2159 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2160 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2161 
2162 out:
2163 	if (in_crit)
2164 		nbl_end_crit(srcvp);
2165 	VN_RELE(srcvp);
2166 	VN_RELE(tovp);
2167 	VN_RELE(fromvp);
2168 
2169 	*status = puterrno(error);
2170 
2171 }
2172 void *
2173 rfs_rename_getfh(struct nfsrnmargs *args)
2174 {
2175 	return (args->rna_from.da_fhandle);
2176 }
2177 
2178 /*
2179  * Link to a file.
2180  * Create a file (to) which is a hard link to the given file (from).
2181  */
2182 /* ARGSUSED */
2183 void
2184 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2185     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2186 {
2187 	int error;
2188 	vnode_t *fromvp;
2189 	vnode_t *tovp;
2190 	struct exportinfo *to_exi;
2191 	fhandle_t *fh;
2192 
2193 	fromvp = nfs_fhtovp(args->la_from, exi);
2194 	if (fromvp == NULL) {
2195 		*status = NFSERR_STALE;
2196 		return;
2197 	}
2198 
2199 	fh = args->la_to.da_fhandle;
2200 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2201 	if (to_exi == NULL) {
2202 		VN_RELE(fromvp);
2203 		*status = NFSERR_ACCES;
2204 		return;
2205 	}
2206 	exi_rele(to_exi);
2207 
2208 	if (to_exi != exi) {
2209 		VN_RELE(fromvp);
2210 		*status = NFSERR_XDEV;
2211 		return;
2212 	}
2213 
2214 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2215 	if (tovp == NULL) {
2216 		VN_RELE(fromvp);
2217 		*status = NFSERR_STALE;
2218 		return;
2219 	}
2220 
2221 	if (tovp->v_type != VDIR) {
2222 		VN_RELE(tovp);
2223 		VN_RELE(fromvp);
2224 		*status = NFSERR_NOTDIR;
2225 		return;
2226 	}
2227 	/*
2228 	 * Disallow NULL paths
2229 	 */
2230 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2231 		VN_RELE(tovp);
2232 		VN_RELE(fromvp);
2233 		*status = NFSERR_ACCES;
2234 		return;
2235 	}
2236 
2237 	if (rdonly(ro, tovp)) {
2238 		VN_RELE(tovp);
2239 		VN_RELE(fromvp);
2240 		*status = NFSERR_ROFS;
2241 		return;
2242 	}
2243 
2244 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2245 
2246 	/*
2247 	 * Force modified data and metadata out to stable storage.
2248 	 */
2249 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2250 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2251 
2252 	VN_RELE(tovp);
2253 	VN_RELE(fromvp);
2254 
2255 	*status = puterrno(error);
2256 
2257 }
2258 void *
2259 rfs_link_getfh(struct nfslinkargs *args)
2260 {
2261 	return (args->la_from);
2262 }
2263 
2264 /*
2265  * Symbolicly link to a file.
2266  * Create a file (to) with the given attributes which is a symbolic link
2267  * to the given path name (to).
2268  */
2269 void
2270 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2271     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2272 {
2273 	int error;
2274 	struct vattr va;
2275 	vnode_t *vp;
2276 	vnode_t *svp;
2277 	int lerror;
2278 	struct sockaddr *ca;
2279 	char *name = NULL;
2280 
2281 	/*
2282 	 * Disallow NULL paths
2283 	 */
2284 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2285 		*status = NFSERR_ACCES;
2286 		return;
2287 	}
2288 
2289 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2290 	if (vp == NULL) {
2291 		*status = NFSERR_STALE;
2292 		return;
2293 	}
2294 
2295 	if (rdonly(ro, vp)) {
2296 		VN_RELE(vp);
2297 		*status = NFSERR_ROFS;
2298 		return;
2299 	}
2300 
2301 	error = sattr_to_vattr(args->sla_sa, &va);
2302 	if (error) {
2303 		VN_RELE(vp);
2304 		*status = puterrno(error);
2305 		return;
2306 	}
2307 
2308 	if (!(va.va_mask & AT_MODE)) {
2309 		VN_RELE(vp);
2310 		*status = NFSERR_INVAL;
2311 		return;
2312 	}
2313 
2314 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2315 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2316 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2317 
2318 	if (name == NULL) {
2319 		*status = NFSERR_ACCES;
2320 		return;
2321 	}
2322 
2323 	va.va_type = VLNK;
2324 	va.va_mask |= AT_TYPE;
2325 
2326 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2327 
2328 	/*
2329 	 * Force new data and metadata out to stable storage.
2330 	 */
2331 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2332 	    NULL, cr, NULL, NULL, NULL);
2333 
2334 	if (!lerror) {
2335 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2336 		VN_RELE(svp);
2337 	}
2338 
2339 	/*
2340 	 * Force modified data and metadata out to stable storage.
2341 	 */
2342 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2343 
2344 	VN_RELE(vp);
2345 
2346 	*status = puterrno(error);
2347 	if (name != args->sla_tnm)
2348 		kmem_free(name, MAXPATHLEN);
2349 
2350 }
2351 void *
2352 rfs_symlink_getfh(struct nfsslargs *args)
2353 {
2354 	return (args->sla_from.da_fhandle);
2355 }
2356 
2357 /*
2358  * Make a directory.
2359  * Create a directory with the given name, parent directory, and attributes.
2360  * Returns a file handle and attributes for the new directory.
2361  */
2362 /* ARGSUSED */
2363 void
2364 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2365     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2366 {
2367 	int error;
2368 	struct vattr va;
2369 	vnode_t *dvp = NULL;
2370 	vnode_t *vp;
2371 	char *name = args->ca_da.da_name;
2372 
2373 	/*
2374 	 * Disallow NULL paths
2375 	 */
2376 	if (name == NULL || *name == '\0') {
2377 		dr->dr_status = NFSERR_ACCES;
2378 		return;
2379 	}
2380 
2381 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2382 	if (vp == NULL) {
2383 		dr->dr_status = NFSERR_STALE;
2384 		return;
2385 	}
2386 
2387 	if (rdonly(ro, vp)) {
2388 		VN_RELE(vp);
2389 		dr->dr_status = NFSERR_ROFS;
2390 		return;
2391 	}
2392 
2393 	error = sattr_to_vattr(args->ca_sa, &va);
2394 	if (error) {
2395 		VN_RELE(vp);
2396 		dr->dr_status = puterrno(error);
2397 		return;
2398 	}
2399 
2400 	if (!(va.va_mask & AT_MODE)) {
2401 		VN_RELE(vp);
2402 		dr->dr_status = NFSERR_INVAL;
2403 		return;
2404 	}
2405 
2406 	va.va_type = VDIR;
2407 	va.va_mask |= AT_TYPE;
2408 
2409 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2410 
2411 	if (!error) {
2412 		/*
2413 		 * Attribtutes of the newly created directory should
2414 		 * be returned to the client.
2415 		 */
2416 		va.va_mask = AT_ALL; /* We want everything */
2417 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2418 
2419 		/* check for overflows */
2420 		if (!error) {
2421 			acl_perm(vp, exi, &va, cr);
2422 			error = vattr_to_nattr(&va, &dr->dr_attr);
2423 			if (!error) {
2424 				error = makefh(&dr->dr_fhandle, dvp, exi);
2425 			}
2426 		}
2427 		/*
2428 		 * Force new data and metadata out to stable storage.
2429 		 */
2430 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2431 		VN_RELE(dvp);
2432 	}
2433 
2434 	/*
2435 	 * Force modified data and metadata out to stable storage.
2436 	 */
2437 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2438 
2439 	VN_RELE(vp);
2440 
2441 	dr->dr_status = puterrno(error);
2442 
2443 }
/*
 * Dispatcher hook: return the file handle embedded in the MKDIR
 * arguments (the parent directory).
 */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
2449 
2450 /*
2451  * Remove a directory.
2452  * Remove the given directory name from the given parent directory.
2453  */
2454 /* ARGSUSED */
2455 void
2456 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2457     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2458 {
2459 	int error;
2460 	vnode_t *vp;
2461 
2462 	/*
2463 	 * Disallow NULL paths
2464 	 */
2465 	if (da->da_name == NULL || *da->da_name == '\0') {
2466 		*status = NFSERR_ACCES;
2467 		return;
2468 	}
2469 
2470 	vp = nfs_fhtovp(da->da_fhandle, exi);
2471 	if (vp == NULL) {
2472 		*status = NFSERR_STALE;
2473 		return;
2474 	}
2475 
2476 	if (rdonly(ro, vp)) {
2477 		VN_RELE(vp);
2478 		*status = NFSERR_ROFS;
2479 		return;
2480 	}
2481 
2482 	/*
2483 	 * VOP_RMDIR takes a third argument (the current
2484 	 * directory of the process).  That's because someone
2485 	 * wants to return EINVAL if one tries to remove ".".
2486 	 * Of course, NFS servers have no idea what their
2487 	 * clients' current directories are.  We fake it by
2488 	 * supplying a vnode known to exist and illegal to
2489 	 * remove.
2490 	 */
2491 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2492 
2493 	/*
2494 	 * Force modified data and metadata out to stable storage.
2495 	 */
2496 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2497 
2498 	VN_RELE(vp);
2499 
2500 	/*
2501 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2502 	 * if the directory is not empty.  A System V NFS server
2503 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2504 	 * over the wire.
2505 	 */
2506 	if (error == EEXIST)
2507 		*status = NFSERR_NOTEMPTY;
2508 	else
2509 		*status = puterrno(error);
2510 
2511 }
/*
 * Dispatcher hook: return the file handle embedded in the RMDIR
 * arguments (the parent directory).
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2517 
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* charset-converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	/* READDIR is only meaningful on directories. */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Hold the vnode's rwlock as reader while scanning entries. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-length request gets an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the request to the protocol's maximum transfer size. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* No bytes consumed: report EOF with no entries. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert entry names for this client (keyed by its address);
	 * conversion may be forced to drop entries that no longer fit.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/*
	 * If conversion produced a fresh buffer, release the original
	 * and hand the converted one to the reply; rfs_rddirfree will
	 * free whichever buffer ends up in rd_entries.
	 */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
/*
 * Dispatcher hook: return the file handle embedded in the READDIR
 * arguments (the directory to list).
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
2656 void
2657 rfs_rddirfree(struct nfsrddirres *rd)
2658 {
2659 	if (rd->rd_entries != NULL)
2660 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2661 }
2662 
2663 /* ARGSUSED */
2664 void
2665 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2666     struct svc_req *req, cred_t *cr, bool_t ro)
2667 {
2668 	int error;
2669 	struct statvfs64 sb;
2670 	vnode_t *vp;
2671 
2672 	vp = nfs_fhtovp(fh, exi);
2673 	if (vp == NULL) {
2674 		fs->fs_status = NFSERR_STALE;
2675 		return;
2676 	}
2677 
2678 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2679 
2680 	if (!error) {
2681 		fs->fs_tsize = nfstsize();
2682 		fs->fs_bsize = sb.f_frsize;
2683 		fs->fs_blocks = sb.f_blocks;
2684 		fs->fs_bfree = sb.f_bfree;
2685 		fs->fs_bavail = sb.f_bavail;
2686 	}
2687 
2688 	VN_RELE(vp);
2689 
2690 	fs->fs_status = puterrno(error);
2691 
2692 }
/*
 * Dispatcher hook: the STATFS argument is itself the file handle.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2698 
/*
 * Convert NFSv2 settable attributes (nfssattr) into a vattr, setting
 * va_mask bits only for the fields the client actually supplied; the
 * protocol marks "unset" fields with all-ones sentinel values.
 * Returns 0, or EOVERFLOW on 32-bit kernels when a supplied time
 * cannot be represented in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Times count as "set" only when both seconds and usecs are set. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2765 
/*
 * Table mapping vtype_t values to NFSv2 on-the-wire file types,
 * indexed by va_type (vattr_to_nattr asserts VNON..VBAD).  Vnode
 * types with no NFSv2 representation map to 0.
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2769 
2770 /*
2771  * check the following fields for overflow: nodeid, size, and time.
2772  * There could be a problem when converting 64-bit LP64 fields
2773  * into 32-bit ones.  Return an error if there is an overflow.
2774  */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/*
	 * Pass the (ushort)-1 "unknown" mode through unchanged;
	 * otherwise merge the file type bits into the mode word as
	 * the v2 wire format expects.
	 */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map "unknown" and the local nobody ids to their wire values. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* v2 carries microseconds on the wire; vattr holds nanoseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2876 
2877 /*
2878  * acl v2 support: returns approximate permission.
2879  *	default: returns minimal permission (more restrictive)
2880  *	aclok: returns maximal permission (less restrictive)
 *	This routine changes the permissions that are already in *va.
2882  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2883  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2884  */
2885 static void
2886 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2887 {
2888 	vsecattr_t	vsa;
2889 	int		aclcnt;
2890 	aclent_t	*aclentp;
2891 	mode_t		mask_perm;
2892 	mode_t		grp_perm;
2893 	mode_t		other_perm;
2894 	mode_t		other_orig;
2895 	int		error;
2896 
2897 	/* dont care default acl */
2898 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2899 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2900 
2901 	if (!error) {
2902 		aclcnt = vsa.vsa_aclcnt;
2903 		if (aclcnt > MIN_ACL_ENTRIES) {
2904 			/* non-trivial ACL */
2905 			aclentp = vsa.vsa_aclentp;
2906 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2907 				/* maximal permissions */
2908 				grp_perm = 0;
2909 				other_perm = 0;
2910 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2911 					switch (aclentp->a_type) {
2912 					case USER_OBJ:
2913 						break;
2914 					case USER:
2915 						grp_perm |=
2916 						    aclentp->a_perm << 3;
2917 						other_perm |= aclentp->a_perm;
2918 						break;
2919 					case GROUP_OBJ:
2920 						grp_perm |=
2921 						    aclentp->a_perm << 3;
2922 						break;
2923 					case GROUP:
2924 						other_perm |= aclentp->a_perm;
2925 						break;
2926 					case OTHER_OBJ:
2927 						other_orig = aclentp->a_perm;
2928 						break;
2929 					case CLASS_OBJ:
2930 						mask_perm = aclentp->a_perm;
2931 						break;
2932 					default:
2933 						break;
2934 					}
2935 				}
2936 				grp_perm &= mask_perm << 3;
2937 				other_perm &= mask_perm;
2938 				other_perm |= other_orig;
2939 
2940 			} else {
2941 				/* minimal permissions */
2942 				grp_perm = 070;
2943 				other_perm = 07;
2944 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2945 					switch (aclentp->a_type) {
2946 					case USER_OBJ:
2947 						break;
2948 					case USER:
2949 					case CLASS_OBJ:
2950 						grp_perm &=
2951 						    aclentp->a_perm << 3;
2952 						other_perm &=
2953 						    aclentp->a_perm;
2954 						break;
2955 					case GROUP_OBJ:
2956 						grp_perm &=
2957 						    aclentp->a_perm << 3;
2958 						break;
2959 					case GROUP:
2960 						other_perm &=
2961 						    aclentp->a_perm;
2962 						break;
2963 					case OTHER_OBJ:
2964 						other_perm &=
2965 						    aclentp->a_perm;
2966 						break;
2967 					default:
2968 						break;
2969 					}
2970 				}
2971 			}
2972 			/* copy to va */
2973 			va->va_mode &= ~077;
2974 			va->va_mode |= grp_perm | other_perm;
2975 		}
2976 		if (vsa.vsa_aclcnt)
2977 			kmem_free(vsa.vsa_aclentp,
2978 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2979 	}
2980 }
2981 
/*
 * One-time NFSv2 server module initialization.
 */
void
rfs_srvrinit(void)
{
	/* Protects the async-write clustering state. */
	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	/* Caller id for this server, from fs_new_caller_id(). */
	nfs2_srv_caller_id = fs_new_caller_id();
}
2988 
/*
 * Teardown counterpart of rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
2994 
2995 static int
2996 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2997 {
2998 	struct clist	*wcl;
2999 	int		wlist_len;
3000 	uint32_t	count = rr->rr_count;
3001 
3002 	wcl = ra->ra_wlist;
3003 
3004 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3005 		return (FALSE);
3006 	}
3007 
3008 	wcl = ra->ra_wlist;
3009 	rr->rr_ok.rrok_wlist_len = wlist_len;
3010 	rr->rr_ok.rrok_wlist = wcl;
3011 
3012 	return (TRUE);
3013 }
3014