xref: /titanic_44/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 6907ca4bc759c5fd81b8a60fc5e1a1c7d1972e6a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
54 
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
58 
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
81 /*
82  * Some "over the wire" UNIX file types.  These are encoded
83  * into the mode.  This needs to be fixed in the next rev.
84  */
85 #define	IFMT		0170000		/* type of file */
86 #define	IFCHR		0020000		/* character special */
87 #define	IFBLK		0060000		/* block special */
88 #define	IFSOCK		0140000		/* socket */
89 
/*
 * Caller id that the NFSv2 server routines in this file store in the
 * cc_caller_id field of the caller_context_t they hand to VOP calls.
 * NOTE(review): where this value is assigned is not visible here.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	vp = nfs_fhtovp(fhp, exi);
106 	if (vp == NULL) {
107 		ns->ns_status = NFSERR_STALE;
108 		return;
109 	}
110 
111 	/*
112 	 * Do the getattr.
113 	 */
114 	va.va_mask = AT_ALL;	/* we want all the attributes */
115 
116 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
117 
118 	/* check for overflows */
119 	if (!error) {
120 		/* Lie about the object type for a referral */
121 		if (vn_is_nfs_reparse(vp, cr))
122 			va.va_type = VLNK;
123 
124 		acl_perm(vp, exi, &va, cr);
125 		error = vattr_to_nattr(&va, &ns->ns_attr);
126 	}
127 
128 	VN_RELE(vp);
129 
130 	ns->ns_status = puterrno(error);
131 }
/*
 * Return the file handle of a GETATTR request.  The request argument is
 * itself the file handle, so it is returned unchanged.
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
137 
138 /*
139  * Set file attributes.
140  * Sets the attributes of the file with the given fhandle.  Returns
141  * the new attributes.
142  */
143 void
144 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
145 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
146 {
147 	int error;
148 	int flag;
149 	int in_crit = 0;
150 	vnode_t *vp;
151 	struct vattr va;
152 	struct vattr bva;
153 	struct flock64 bf;
154 	caller_context_t ct;
155 
156 
157 	vp = nfs_fhtovp(&args->saa_fh, exi);
158 	if (vp == NULL) {
159 		ns->ns_status = NFSERR_STALE;
160 		return;
161 	}
162 
163 	if (rdonly(exi, vp, req)) {
164 		VN_RELE(vp);
165 		ns->ns_status = NFSERR_ROFS;
166 		return;
167 	}
168 
169 	error = sattr_to_vattr(&args->saa_sa, &va);
170 	if (error) {
171 		VN_RELE(vp);
172 		ns->ns_status = puterrno(error);
173 		return;
174 	}
175 
176 	/*
177 	 * If the client is requesting a change to the mtime,
178 	 * but the nanosecond field is set to 1 billion, then
179 	 * this is a flag to the server that it should set the
180 	 * atime and mtime fields to the server's current time.
181 	 * The 1 billion number actually came from the client
182 	 * as 1 million, but the units in the over the wire
183 	 * request are microseconds instead of nanoseconds.
184 	 *
185 	 * This is an overload of the protocol and should be
186 	 * documented in the NFS Version 2 protocol specification.
187 	 */
188 	if (va.va_mask & AT_MTIME) {
189 		if (va.va_mtime.tv_nsec == 1000000000) {
190 			gethrestime(&va.va_mtime);
191 			va.va_atime = va.va_mtime;
192 			va.va_mask |= AT_ATIME;
193 			flag = 0;
194 		} else
195 			flag = ATTR_UTIME;
196 	} else
197 		flag = 0;
198 
199 	/*
200 	 * If the filesystem is exported with nosuid, then mask off
201 	 * the setuid and setgid bits.
202 	 */
203 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
204 	    (exi->exi_export.ex_flags & EX_NOSUID))
205 		va.va_mode &= ~(VSUID | VSGID);
206 
207 	ct.cc_sysid = 0;
208 	ct.cc_pid = 0;
209 	ct.cc_caller_id = nfs2_srv_caller_id;
210 	ct.cc_flags = CC_DONTBLOCK;
211 
212 	/*
213 	 * We need to specially handle size changes because it is
214 	 * possible for the client to create a file with modes
215 	 * which indicate read-only, but with the file opened for
216 	 * writing.  If the client then tries to set the size of
217 	 * the file, then the normal access checking done in
218 	 * VOP_SETATTR would prevent the client from doing so,
219 	 * although it should be legal for it to do so.  To get
220 	 * around this, we do the access checking for ourselves
221 	 * and then use VOP_SPACE which doesn't do the access
222 	 * checking which VOP_SETATTR does. VOP_SPACE can only
223 	 * operate on VREG files, let VOP_SETATTR handle the other
224 	 * extremely rare cases.
225 	 * Also the client should not be allowed to change the
226 	 * size of the file if there is a conflicting non-blocking
227 	 * mandatory lock in the region of change.
228 	 */
229 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
230 		if (nbl_need_check(vp)) {
231 			nbl_start_crit(vp, RW_READER);
232 			in_crit = 1;
233 		}
234 
235 		bva.va_mask = AT_UID | AT_SIZE;
236 
237 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
238 
239 		if (error) {
240 			if (in_crit)
241 				nbl_end_crit(vp);
242 			VN_RELE(vp);
243 			ns->ns_status = puterrno(error);
244 			return;
245 		}
246 
247 		if (in_crit) {
248 			u_offset_t offset;
249 			ssize_t length;
250 
251 			if (va.va_size < bva.va_size) {
252 				offset = va.va_size;
253 				length = bva.va_size - va.va_size;
254 			} else {
255 				offset = bva.va_size;
256 				length = va.va_size - bva.va_size;
257 			}
258 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
259 			    NULL)) {
260 				error = EACCES;
261 			}
262 		}
263 
264 		if (crgetuid(cr) == bva.va_uid && !error &&
265 		    va.va_size != bva.va_size) {
266 			va.va_mask &= ~AT_SIZE;
267 			bf.l_type = F_WRLCK;
268 			bf.l_whence = 0;
269 			bf.l_start = (off64_t)va.va_size;
270 			bf.l_len = 0;
271 			bf.l_sysid = 0;
272 			bf.l_pid = 0;
273 
274 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
275 			    (offset_t)va.va_size, cr, &ct);
276 		}
277 		if (in_crit)
278 			nbl_end_crit(vp);
279 	} else
280 		error = 0;
281 
282 	/*
283 	 * Do the setattr.
284 	 */
285 	if (!error && va.va_mask) {
286 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
287 	}
288 
289 	/*
290 	 * check if the monitor on either vop_space or vop_setattr detected
291 	 * a delegation conflict and if so, mark the thread flag as
292 	 * wouldblock so that the response is dropped and the client will
293 	 * try again.
294 	 */
295 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
296 		VN_RELE(vp);
297 		curthread->t_flag |= T_WOULDBLOCK;
298 		return;
299 	}
300 
301 	if (!error) {
302 		va.va_mask = AT_ALL;	/* get everything */
303 
304 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
305 
306 		/* check for overflows */
307 		if (!error) {
308 			acl_perm(vp, exi, &va, cr);
309 			error = vattr_to_nattr(&va, &ns->ns_attr);
310 		}
311 	}
312 
313 	ct.cc_flags = 0;
314 
315 	/*
316 	 * Force modified metadata out to stable storage.
317 	 */
318 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
319 
320 	VN_RELE(vp);
321 
322 	ns->ns_status = puterrno(error);
323 }
/*
 * Return the address of the file handle (saa_fh) embedded in the
 * SETATTR arguments.
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
329 
330 /*
331  * Directory lookup.
332  * Returns an fhandle and file attributes for file name in a directory.
333  */
334 /* ARGSUSED */
335 void
336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
337 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
338 {
339 	int error;
340 	vnode_t *dvp;
341 	vnode_t *vp;
342 	struct vattr va;
343 	fhandle_t *fhp = da->da_fhandle;
344 	struct sec_ol sec = {0, 0};
345 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
346 	char *name;
347 	struct sockaddr *ca;
348 
349 	/*
350 	 * Trusted Extension doesn't support NFSv2. MOUNT
351 	 * will reject v2 clients. Need to prevent v2 client
352 	 * access via WebNFS here.
353 	 */
354 	if (is_system_labeled() && req->rq_vers == 2) {
355 		dr->dr_status = NFSERR_ACCES;
356 		return;
357 	}
358 
359 	/*
360 	 * Disallow NULL paths
361 	 */
362 	if (da->da_name == NULL || *da->da_name == '\0') {
363 		dr->dr_status = NFSERR_ACCES;
364 		return;
365 	}
366 
367 	/*
368 	 * Allow lookups from the root - the default
369 	 * location of the public filehandle.
370 	 */
371 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
372 		dvp = rootdir;
373 		VN_HOLD(dvp);
374 	} else {
375 		dvp = nfs_fhtovp(fhp, exi);
376 		if (dvp == NULL) {
377 			dr->dr_status = NFSERR_STALE;
378 			return;
379 		}
380 	}
381 
382 	/*
383 	 * Not allow lookup beyond root.
384 	 * If the filehandle matches a filehandle of the exi,
385 	 * then the ".." refers beyond the root of an exported filesystem.
386 	 */
387 	if (strcmp(da->da_name, "..") == 0 &&
388 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
389 		VN_RELE(dvp);
390 		dr->dr_status = NFSERR_NOENT;
391 		return;
392 	}
393 
394 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
395 	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
396 	    MAXPATHLEN);
397 
398 	if (name == NULL) {
399 		dr->dr_status = NFSERR_ACCES;
400 		return;
401 	}
402 
403 	/*
404 	 * If the public filehandle is used then allow
405 	 * a multi-component lookup, i.e. evaluate
406 	 * a pathname and follow symbolic links if
407 	 * necessary.
408 	 *
409 	 * This may result in a vnode in another filesystem
410 	 * which is OK as long as the filesystem is exported.
411 	 */
412 	if (PUBLIC_FH2(fhp)) {
413 		publicfh_flag = TRUE;
414 		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
415 		    &sec);
416 	} else {
417 		/*
418 		 * Do a normal single component lookup.
419 		 */
420 		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
421 		    NULL, NULL, NULL);
422 	}
423 
424 	if (name != da->da_name)
425 		kmem_free(name, MAXPATHLEN);
426 
427 
428 	if (!error) {
429 		va.va_mask = AT_ALL;	/* we want everything */
430 
431 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
432 
433 		/* check for overflows */
434 		if (!error) {
435 			acl_perm(vp, exi, &va, cr);
436 			error = vattr_to_nattr(&va, &dr->dr_attr);
437 			if (!error) {
438 				if (sec.sec_flags & SEC_QUERY)
439 					error = makefh_ol(&dr->dr_fhandle, exi,
440 					    sec.sec_index);
441 				else {
442 					error = makefh(&dr->dr_fhandle, vp,
443 					    exi);
444 					if (!error && publicfh_flag &&
445 					    !chk_clnt_sec(exi, req))
446 						auth_weak = TRUE;
447 				}
448 			}
449 		}
450 		VN_RELE(vp);
451 	}
452 
453 	VN_RELE(dvp);
454 
455 	/*
456 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
457 	 * and have obtained a new exportinfo in exi which needs to be
458 	 * released. Note the the original exportinfo pointed to by exi
459 	 * will be released by the caller, comon_dispatch.
460 	 */
461 	if (publicfh_flag && exi != NULL)
462 		exi_rele(exi);
463 
464 	/*
465 	 * If it's public fh, no 0x81, and client's flavor is
466 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
467 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
468 	 */
469 	if (auth_weak)
470 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
471 	else
472 		dr->dr_status = puterrno(error);
473 }
/*
 * Return the directory file handle (da_fhandle) carried in the LOOKUP
 * arguments.
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
479 
480 /*
481  * Read symbolic link.
482  * Returns the string in the symbolic link at the given fhandle.
483  */
484 /* ARGSUSED */
485 void
486 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
487 	struct svc_req *req, cred_t *cr)
488 {
489 	int error;
490 	struct iovec iov;
491 	struct uio uio;
492 	vnode_t *vp;
493 	struct vattr va;
494 	struct sockaddr *ca;
495 	char *name = NULL;
496 	int is_referral = 0;
497 
498 	vp = nfs_fhtovp(fhp, exi);
499 	if (vp == NULL) {
500 		rl->rl_data = NULL;
501 		rl->rl_status = NFSERR_STALE;
502 		return;
503 	}
504 
505 	va.va_mask = AT_MODE;
506 
507 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
508 
509 	if (error) {
510 		VN_RELE(vp);
511 		rl->rl_data = NULL;
512 		rl->rl_status = puterrno(error);
513 		return;
514 	}
515 
516 	if (MANDLOCK(vp, va.va_mode)) {
517 		VN_RELE(vp);
518 		rl->rl_data = NULL;
519 		rl->rl_status = NFSERR_ACCES;
520 		return;
521 	}
522 
523 	/* We lied about the object type for a referral */
524 	if (vn_is_nfs_reparse(vp, cr))
525 		is_referral = 1;
526 
527 	/*
528 	 * XNFS and RFC1094 require us to return ENXIO if argument
529 	 * is not a link. BUGID 1138002.
530 	 */
531 	if (vp->v_type != VLNK && !is_referral) {
532 		VN_RELE(vp);
533 		rl->rl_data = NULL;
534 		rl->rl_status = NFSERR_NXIO;
535 		return;
536 	}
537 
538 	/*
539 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
540 	 */
541 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
542 
543 	if (is_referral) {
544 		char *s;
545 		size_t strsz;
546 
547 		/* Get an artificial symlink based on a referral */
548 		s = build_symlink(vp, cr, &strsz);
549 		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
550 		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
551 		    vnode_t *, vp, char *, s);
552 		if (s == NULL)
553 			error = EINVAL;
554 		else {
555 			error = 0;
556 			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
557 			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
558 			kmem_free(s, strsz);
559 		}
560 
561 	} else {
562 
563 		/*
564 		 * Set up io vector to read sym link data
565 		 */
566 		iov.iov_base = rl->rl_data;
567 		iov.iov_len = NFS_MAXPATHLEN;
568 		uio.uio_iov = &iov;
569 		uio.uio_iovcnt = 1;
570 		uio.uio_segflg = UIO_SYSSPACE;
571 		uio.uio_extflg = UIO_COPY_CACHED;
572 		uio.uio_loffset = (offset_t)0;
573 		uio.uio_resid = NFS_MAXPATHLEN;
574 
575 		/*
576 		 * Do the readlink.
577 		 */
578 		error = VOP_READLINK(vp, &uio, cr, NULL);
579 
580 		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
581 
582 		if (!error)
583 			rl->rl_data[rl->rl_count] = '\0';
584 
585 	}
586 
587 
588 	VN_RELE(vp);
589 
590 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
591 	name = nfscmd_convname(ca, exi, rl->rl_data,
592 	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
593 
594 	if (name != NULL && name != rl->rl_data) {
595 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
596 		rl->rl_data = name;
597 	}
598 
599 	/*
600 	 * XNFS and RFC1094 require us to return ENXIO if argument
601 	 * is not a link. UFS returns EINVAL if this is the case,
602 	 * so we do the mapping here. BUGID 1138002.
603 	 */
604 	if (error == EINVAL)
605 		rl->rl_status = NFSERR_NXIO;
606 	else
607 		rl->rl_status = puterrno(error);
608 
609 }
/*
 * Return the file handle of a READLINK request.  The request argument
 * is itself the file handle, so it is returned unchanged.
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
615 /*
616  * Free data allocated by rfs_readlink
617  */
618 void
619 rfs_rlfree(struct nfsrdlnres *rl)
620 {
621 	if (rl->rl_data != NULL)
622 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
623 }
624 
625 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
626 
627 /*
628  * Read data.
629  * Returns some data read from the file at the given fhandle.
630  */
631 /* ARGSUSED */
632 void
633 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
634 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
635 {
636 	vnode_t *vp;
637 	int error;
638 	struct vattr va;
639 	struct iovec iov;
640 	struct uio uio;
641 	mblk_t *mp;
642 	int alloc_err = 0;
643 	int in_crit = 0;
644 	caller_context_t ct;
645 
646 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
647 	if (vp == NULL) {
648 		rr->rr_data = NULL;
649 		rr->rr_status = NFSERR_STALE;
650 		return;
651 	}
652 
653 	if (vp->v_type != VREG) {
654 		VN_RELE(vp);
655 		rr->rr_data = NULL;
656 		rr->rr_status = NFSERR_ISDIR;
657 		return;
658 	}
659 
660 	ct.cc_sysid = 0;
661 	ct.cc_pid = 0;
662 	ct.cc_caller_id = nfs2_srv_caller_id;
663 	ct.cc_flags = CC_DONTBLOCK;
664 
665 	/*
666 	 * Enter the critical region before calling VOP_RWLOCK
667 	 * to avoid a deadlock with write requests.
668 	 */
669 	if (nbl_need_check(vp)) {
670 		nbl_start_crit(vp, RW_READER);
671 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
672 		    0, NULL)) {
673 			nbl_end_crit(vp);
674 			VN_RELE(vp);
675 			rr->rr_data = NULL;
676 			rr->rr_status = NFSERR_ACCES;
677 			return;
678 		}
679 		in_crit = 1;
680 	}
681 
682 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
683 
684 	/* check if a monitor detected a delegation conflict */
685 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
686 		VN_RELE(vp);
687 		/* mark as wouldblock so response is dropped */
688 		curthread->t_flag |= T_WOULDBLOCK;
689 
690 		rr->rr_data = NULL;
691 		return;
692 	}
693 
694 	va.va_mask = AT_ALL;
695 
696 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
697 
698 	if (error) {
699 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
700 		if (in_crit)
701 			nbl_end_crit(vp);
702 
703 		VN_RELE(vp);
704 		rr->rr_data = NULL;
705 		rr->rr_status = puterrno(error);
706 
707 		return;
708 	}
709 
710 	/*
711 	 * This is a kludge to allow reading of files created
712 	 * with no read permission.  The owner of the file
713 	 * is always allowed to read it.
714 	 */
715 	if (crgetuid(cr) != va.va_uid) {
716 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
717 
718 		if (error) {
719 			/*
720 			 * Exec is the same as read over the net because
721 			 * of demand loading.
722 			 */
723 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
724 		}
725 		if (error) {
726 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
727 			if (in_crit)
728 				nbl_end_crit(vp);
729 			VN_RELE(vp);
730 			rr->rr_data = NULL;
731 			rr->rr_status = puterrno(error);
732 
733 			return;
734 		}
735 	}
736 
737 	if (MANDLOCK(vp, va.va_mode)) {
738 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
739 		if (in_crit)
740 			nbl_end_crit(vp);
741 
742 		VN_RELE(vp);
743 		rr->rr_data = NULL;
744 		rr->rr_status = NFSERR_ACCES;
745 
746 		return;
747 	}
748 
749 	rr->rr_ok.rrok_wlist_len = 0;
750 	rr->rr_ok.rrok_wlist = NULL;
751 
752 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
753 		rr->rr_count = 0;
754 		rr->rr_data = NULL;
755 		/*
756 		 * In this case, status is NFS_OK, but there is no data
757 		 * to encode. So set rr_mp to NULL.
758 		 */
759 		rr->rr_mp = NULL;
760 		rr->rr_ok.rrok_wlist = ra->ra_wlist;
761 		if (rr->rr_ok.rrok_wlist)
762 			clist_zero_len(rr->rr_ok.rrok_wlist);
763 		goto done;
764 	}
765 
766 	if (ra->ra_wlist) {
767 		mp = NULL;
768 		rr->rr_mp = NULL;
769 		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
770 		if (ra->ra_count > iov.iov_len) {
771 			rr->rr_data = NULL;
772 			rr->rr_status = NFSERR_INVAL;
773 			goto done;
774 		}
775 	} else {
776 		/*
777 		 * mp will contain the data to be sent out in the read reply.
778 		 * This will be freed after the reply has been sent out (by the
779 		 * driver).
780 		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
781 		 * that the call to xdrmblk_putmblk() never fails.
782 		 */
783 		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
784 		    &alloc_err);
785 		ASSERT(mp != NULL);
786 		ASSERT(alloc_err == 0);
787 
788 		rr->rr_mp = mp;
789 
790 		/*
791 		 * Set up io vector
792 		 */
793 		iov.iov_base = (caddr_t)mp->b_datap->db_base;
794 		iov.iov_len = ra->ra_count;
795 	}
796 
797 	uio.uio_iov = &iov;
798 	uio.uio_iovcnt = 1;
799 	uio.uio_segflg = UIO_SYSSPACE;
800 	uio.uio_extflg = UIO_COPY_CACHED;
801 	uio.uio_loffset = (offset_t)ra->ra_offset;
802 	uio.uio_resid = ra->ra_count;
803 
804 	error = VOP_READ(vp, &uio, 0, cr, &ct);
805 
806 	if (error) {
807 		if (mp)
808 			freeb(mp);
809 
810 		/*
811 		 * check if a monitor detected a delegation conflict and
812 		 * mark as wouldblock so response is dropped
813 		 */
814 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
815 			curthread->t_flag |= T_WOULDBLOCK;
816 		else
817 			rr->rr_status = puterrno(error);
818 
819 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
820 		if (in_crit)
821 			nbl_end_crit(vp);
822 
823 		VN_RELE(vp);
824 		rr->rr_data = NULL;
825 
826 		return;
827 	}
828 
829 	/*
830 	 * Get attributes again so we can send the latest access
831 	 * time to the client side for his cache.
832 	 */
833 	va.va_mask = AT_ALL;
834 
835 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
836 
837 	if (error) {
838 		if (mp)
839 			freeb(mp);
840 
841 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
842 		if (in_crit)
843 			nbl_end_crit(vp);
844 
845 		VN_RELE(vp);
846 		rr->rr_data = NULL;
847 		rr->rr_status = puterrno(error);
848 
849 		return;
850 	}
851 
852 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
853 
854 	if (mp) {
855 		rr->rr_data = (char *)mp->b_datap->db_base;
856 	} else {
857 		if (ra->ra_wlist) {
858 			rr->rr_data = (caddr_t)iov.iov_base;
859 			if (!rdma_setup_read_data2(ra, rr)) {
860 				rr->rr_data = NULL;
861 				rr->rr_status = puterrno(NFSERR_INVAL);
862 			}
863 		}
864 	}
865 done:
866 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
867 	if (in_crit)
868 		nbl_end_crit(vp);
869 
870 	acl_perm(vp, exi, &va, cr);
871 
872 	/* check for overflows */
873 	error = vattr_to_nattr(&va, &rr->rr_attr);
874 
875 	VN_RELE(vp);
876 
877 	rr->rr_status = puterrno(error);
878 }
879 
880 /*
881  * Free data allocated by rfs_read
882  */
883 void
884 rfs_rdfree(struct nfsrdresult *rr)
885 {
886 	mblk_t *mp;
887 
888 	if (rr->rr_status == NFS_OK) {
889 		mp = rr->rr_mp;
890 		if (mp != NULL)
891 			freeb(mp);
892 	}
893 }
894 
/*
 * Return the address of the file handle (ra_fhandle) embedded in the
 * READ arguments.
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
900 
/*
 * Size of the on-stack iovec array in rfs_write_sync.  An mblk chain
 * with more segments than this makes rfs_write_sync allocate its iovec
 * array with kmem_alloc instead.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * Counters bumped by rfs_write_sync for mblk-chain writes: a hit when
 * the chain fit in the on-stack iovec array, a miss when it did not.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
907 
908 /*
909  * Write data to file.
910  * Returns attributes of a file after writing some data to it.
911  *
912  * Any changes made here, especially in error handling might have
913  * to also be done in rfs_write (which clusters write requests).
914  */
915 void
916 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
917 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
918 {
919 	int error;
920 	vnode_t *vp;
921 	rlim64_t rlimit;
922 	struct vattr va;
923 	struct uio uio;
924 	struct iovec iov[MAX_IOVECS];
925 	mblk_t *m;
926 	struct iovec *iovp;
927 	int iovcnt;
928 	cred_t *savecred;
929 	int in_crit = 0;
930 	caller_context_t ct;
931 
932 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
933 	if (vp == NULL) {
934 		ns->ns_status = NFSERR_STALE;
935 		return;
936 	}
937 
938 	if (rdonly(exi, vp, req)) {
939 		VN_RELE(vp);
940 		ns->ns_status = NFSERR_ROFS;
941 		return;
942 	}
943 
944 	if (vp->v_type != VREG) {
945 		VN_RELE(vp);
946 		ns->ns_status = NFSERR_ISDIR;
947 		return;
948 	}
949 
950 	ct.cc_sysid = 0;
951 	ct.cc_pid = 0;
952 	ct.cc_caller_id = nfs2_srv_caller_id;
953 	ct.cc_flags = CC_DONTBLOCK;
954 
955 	va.va_mask = AT_UID|AT_MODE;
956 
957 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
958 
959 	if (error) {
960 		VN_RELE(vp);
961 		ns->ns_status = puterrno(error);
962 
963 		return;
964 	}
965 
966 	if (crgetuid(cr) != va.va_uid) {
967 		/*
968 		 * This is a kludge to allow writes of files created
969 		 * with read only permission.  The owner of the file
970 		 * is always allowed to write it.
971 		 */
972 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
973 
974 		if (error) {
975 			VN_RELE(vp);
976 			ns->ns_status = puterrno(error);
977 			return;
978 		}
979 	}
980 
981 	/*
982 	 * Can't access a mandatory lock file.  This might cause
983 	 * the NFS service thread to block forever waiting for a
984 	 * lock to be released that will never be released.
985 	 */
986 	if (MANDLOCK(vp, va.va_mode)) {
987 		VN_RELE(vp);
988 		ns->ns_status = NFSERR_ACCES;
989 		return;
990 	}
991 
992 	/*
993 	 * We have to enter the critical region before calling VOP_RWLOCK
994 	 * to avoid a deadlock with ufs.
995 	 */
996 	if (nbl_need_check(vp)) {
997 		nbl_start_crit(vp, RW_READER);
998 		in_crit = 1;
999 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1000 		    wa->wa_count, 0, NULL)) {
1001 			error = EACCES;
1002 			goto out;
1003 		}
1004 	}
1005 
1006 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1007 
1008 	/* check if a monitor detected a delegation conflict */
1009 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1010 		VN_RELE(vp);
1011 		/* mark as wouldblock so response is dropped */
1012 		curthread->t_flag |= T_WOULDBLOCK;
1013 		return;
1014 	}
1015 
1016 	if (wa->wa_data || wa->wa_rlist) {
1017 		/* Do the RDMA thing if necessary */
1018 		if (wa->wa_rlist) {
1019 			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1020 			iov[0].iov_len = wa->wa_count;
1021 		} else  {
1022 			iov[0].iov_base = wa->wa_data;
1023 			iov[0].iov_len = wa->wa_count;
1024 		}
1025 		uio.uio_iov = iov;
1026 		uio.uio_iovcnt = 1;
1027 		uio.uio_segflg = UIO_SYSSPACE;
1028 		uio.uio_extflg = UIO_COPY_DEFAULT;
1029 		uio.uio_loffset = (offset_t)wa->wa_offset;
1030 		uio.uio_resid = wa->wa_count;
1031 		/*
1032 		 * The limit is checked on the client. We
1033 		 * should allow any size writes here.
1034 		 */
1035 		uio.uio_llimit = curproc->p_fsz_ctl;
1036 		rlimit = uio.uio_llimit - wa->wa_offset;
1037 		if (rlimit < (rlim64_t)uio.uio_resid)
1038 			uio.uio_resid = (uint_t)rlimit;
1039 
1040 		/*
1041 		 * for now we assume no append mode
1042 		 */
1043 		/*
1044 		 * We're changing creds because VM may fault and we need
1045 		 * the cred of the current thread to be used if quota
1046 		 * checking is enabled.
1047 		 */
1048 		savecred = curthread->t_cred;
1049 		curthread->t_cred = cr;
1050 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1051 		curthread->t_cred = savecred;
1052 	} else {
1053 		iovcnt = 0;
1054 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1055 			iovcnt++;
1056 		if (iovcnt <= MAX_IOVECS) {
1057 #ifdef DEBUG
1058 			rfs_write_sync_hits++;
1059 #endif
1060 			iovp = iov;
1061 		} else {
1062 #ifdef DEBUG
1063 			rfs_write_sync_misses++;
1064 #endif
1065 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1066 		}
1067 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1068 		uio.uio_iov = iovp;
1069 		uio.uio_iovcnt = iovcnt;
1070 		uio.uio_segflg = UIO_SYSSPACE;
1071 		uio.uio_extflg = UIO_COPY_DEFAULT;
1072 		uio.uio_loffset = (offset_t)wa->wa_offset;
1073 		uio.uio_resid = wa->wa_count;
1074 		/*
1075 		 * The limit is checked on the client. We
1076 		 * should allow any size writes here.
1077 		 */
1078 		uio.uio_llimit = curproc->p_fsz_ctl;
1079 		rlimit = uio.uio_llimit - wa->wa_offset;
1080 		if (rlimit < (rlim64_t)uio.uio_resid)
1081 			uio.uio_resid = (uint_t)rlimit;
1082 
1083 		/*
1084 		 * For now we assume no append mode.
1085 		 */
1086 		/*
1087 		 * We're changing creds because VM may fault and we need
1088 		 * the cred of the current thread to be used if quota
1089 		 * checking is enabled.
1090 		 */
1091 		savecred = curthread->t_cred;
1092 		curthread->t_cred = cr;
1093 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1094 		curthread->t_cred = savecred;
1095 
1096 		if (iovp != iov)
1097 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1098 	}
1099 
1100 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1101 
1102 	if (!error) {
1103 		/*
1104 		 * Get attributes again so we send the latest mod
1105 		 * time to the client side for his cache.
1106 		 */
1107 		va.va_mask = AT_ALL;	/* now we want everything */
1108 
1109 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1110 
1111 		/* check for overflows */
1112 		if (!error) {
1113 			acl_perm(vp, exi, &va, cr);
1114 			error = vattr_to_nattr(&va, &ns->ns_attr);
1115 		}
1116 	}
1117 
1118 out:
1119 	if (in_crit)
1120 		nbl_end_crit(vp);
1121 	VN_RELE(vp);
1122 
1123 	/* check if a monitor detected a delegation conflict */
1124 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1125 		/* mark as wouldblock so response is dropped */
1126 		curthread->t_flag |= T_WOULDBLOCK;
1127 	else
1128 		ns->ns_status = puterrno(error);
1129 
1130 }
1131 
/*
 * One write request queued in a write cluster.  rfs_write fills one of
 * these in for each request and links it into the cluster for its file.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of the write request */
	struct nfsattrstat *ns;		/* reply; ns_status shows completion */
	struct svc_req *req;		/* RPC request */
	cred_t *cr;			/* credentials of the caller */
	kthread_t *thread;		/* thread that queued the request */
	struct rfs_async_write *list;	/* next request, ascending wa_offset */
};
1140 
/*
 * One write cluster: the requests queued for a single file handle.
 * Clusters are chained together starting at rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* queued requests wait here */
	struct rfs_async_write *list;	/* requests in the cluster */
	struct rfs_async_write_list *next;	/* next cluster */
};
1147 
/* First write cluster; rfs_write appends new clusters at the tail. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
/* Held by rfs_write while it searches and modifies the cluster list. */
static kmutex_t rfs_async_write_lock;
/* When 0, rfs_write hands every request straight to rfs_write_sync. */
static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1151 
1152 #define	MAXCLIOVECS	42
/*
 * Status value meaning "not yet processed"; 0 cannot be used because it
 * is considered an OK.  The expansion is fully parenthesized so that it
 * stays a single operand wherever it is used.
 */
#define	RFSWRITE_INITVAL ((enum nfsstat)-1)
1154 
1155 #ifdef DEBUG
1156 static int rfs_write_hits = 0;
1157 static int rfs_write_misses = 0;
1158 #endif
1159 
1160 /*
1161  * Write data to file.
1162  * Returns attributes of a file after writing some data to it.
1163  */
1164 void
1165 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1166 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1167 {
1168 	int error;
1169 	vnode_t *vp;
1170 	rlim64_t rlimit;
1171 	struct vattr va;
1172 	struct uio uio;
1173 	struct rfs_async_write_list *lp;
1174 	struct rfs_async_write_list *nlp;
1175 	struct rfs_async_write *rp;
1176 	struct rfs_async_write *nrp;
1177 	struct rfs_async_write *trp;
1178 	struct rfs_async_write *lrp;
1179 	int data_written;
1180 	int iovcnt;
1181 	mblk_t *m;
1182 	struct iovec *iovp;
1183 	struct iovec *niovp;
1184 	struct iovec iov[MAXCLIOVECS];
1185 	int count;
1186 	int rcount;
1187 	uint_t off;
1188 	uint_t len;
1189 	struct rfs_async_write nrpsp;
1190 	struct rfs_async_write_list nlpsp;
1191 	ushort_t t_flag;
1192 	cred_t *savecred;
1193 	int in_crit = 0;
1194 	caller_context_t ct;
1195 
1196 	if (!rfs_write_async) {
1197 		rfs_write_sync(wa, ns, exi, req, cr);
1198 		return;
1199 	}
1200 
1201 	/*
1202 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1203 	 * is considered an OK.
1204 	 */
1205 	ns->ns_status = RFSWRITE_INITVAL;
1206 
1207 	nrp = &nrpsp;
1208 	nrp->wa = wa;
1209 	nrp->ns = ns;
1210 	nrp->req = req;
1211 	nrp->cr = cr;
1212 	nrp->thread = curthread;
1213 
1214 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1215 
1216 	/*
1217 	 * Look to see if there is already a cluster started
1218 	 * for this file.
1219 	 */
1220 	mutex_enter(&rfs_async_write_lock);
1221 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1222 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1223 		    sizeof (fhandle_t)) == 0)
1224 			break;
1225 	}
1226 
1227 	/*
1228 	 * If lp is non-NULL, then there is already a cluster
1229 	 * started.  We need to place ourselves in the cluster
1230 	 * list in the right place as determined by starting
1231 	 * offset.  Conflicts with non-blocking mandatory locked
1232 	 * regions will be checked when the cluster is processed.
1233 	 */
1234 	if (lp != NULL) {
1235 		rp = lp->list;
1236 		trp = NULL;
1237 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1238 			trp = rp;
1239 			rp = rp->list;
1240 		}
1241 		nrp->list = rp;
1242 		if (trp == NULL)
1243 			lp->list = nrp;
1244 		else
1245 			trp->list = nrp;
1246 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1247 			cv_wait(&lp->cv, &rfs_async_write_lock);
1248 		mutex_exit(&rfs_async_write_lock);
1249 
1250 		return;
1251 	}
1252 
1253 	/*
1254 	 * No cluster started yet, start one and add ourselves
1255 	 * to the list of clusters.
1256 	 */
1257 	nrp->list = NULL;
1258 
1259 	nlp = &nlpsp;
1260 	nlp->fhp = &wa->wa_fhandle;
1261 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1262 	nlp->list = nrp;
1263 	nlp->next = NULL;
1264 
1265 	if (rfs_async_write_head == NULL) {
1266 		rfs_async_write_head = nlp;
1267 	} else {
1268 		lp = rfs_async_write_head;
1269 		while (lp->next != NULL)
1270 			lp = lp->next;
1271 		lp->next = nlp;
1272 	}
1273 	mutex_exit(&rfs_async_write_lock);
1274 
1275 	/*
1276 	 * Convert the file handle common to all of the requests
1277 	 * in this cluster to a vnode.
1278 	 */
1279 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1280 	if (vp == NULL) {
1281 		mutex_enter(&rfs_async_write_lock);
1282 		if (rfs_async_write_head == nlp)
1283 			rfs_async_write_head = nlp->next;
1284 		else {
1285 			lp = rfs_async_write_head;
1286 			while (lp->next != nlp)
1287 				lp = lp->next;
1288 			lp->next = nlp->next;
1289 		}
1290 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1291 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1292 			rp->ns->ns_status = NFSERR_STALE;
1293 			rp->thread->t_flag |= t_flag;
1294 		}
1295 		cv_broadcast(&nlp->cv);
1296 		mutex_exit(&rfs_async_write_lock);
1297 
1298 		return;
1299 	}
1300 
1301 	/*
1302 	 * Can only write regular files.  Attempts to write any
1303 	 * other file types fail with EISDIR.
1304 	 */
1305 	if (vp->v_type != VREG) {
1306 		VN_RELE(vp);
1307 		mutex_enter(&rfs_async_write_lock);
1308 		if (rfs_async_write_head == nlp)
1309 			rfs_async_write_head = nlp->next;
1310 		else {
1311 			lp = rfs_async_write_head;
1312 			while (lp->next != nlp)
1313 				lp = lp->next;
1314 			lp->next = nlp->next;
1315 		}
1316 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1317 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1318 			rp->ns->ns_status = NFSERR_ISDIR;
1319 			rp->thread->t_flag |= t_flag;
1320 		}
1321 		cv_broadcast(&nlp->cv);
1322 		mutex_exit(&rfs_async_write_lock);
1323 
1324 		return;
1325 	}
1326 
1327 	/*
1328 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1329 	 * deadlock with ufs.
1330 	 */
1331 	if (nbl_need_check(vp)) {
1332 		nbl_start_crit(vp, RW_READER);
1333 		in_crit = 1;
1334 	}
1335 
1336 	ct.cc_sysid = 0;
1337 	ct.cc_pid = 0;
1338 	ct.cc_caller_id = nfs2_srv_caller_id;
1339 	ct.cc_flags = CC_DONTBLOCK;
1340 
1341 	/*
1342 	 * Lock the file for writing.  This operation provides
1343 	 * the delay which allows clusters to grow.
1344 	 */
1345 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1346 
1347 	/* check if a monitor detected a delegation conflict */
1348 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1349 		if (in_crit)
1350 			nbl_end_crit(vp);
1351 		VN_RELE(vp);
1352 		/* mark as wouldblock so response is dropped */
1353 		curthread->t_flag |= T_WOULDBLOCK;
1354 		mutex_enter(&rfs_async_write_lock);
1355 		if (rfs_async_write_head == nlp)
1356 			rfs_async_write_head = nlp->next;
1357 		else {
1358 			lp = rfs_async_write_head;
1359 			while (lp->next != nlp)
1360 				lp = lp->next;
1361 			lp->next = nlp->next;
1362 		}
1363 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1364 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1365 				rp->ns->ns_status = puterrno(error);
1366 				rp->thread->t_flag |= T_WOULDBLOCK;
1367 			}
1368 		}
1369 		cv_broadcast(&nlp->cv);
1370 		mutex_exit(&rfs_async_write_lock);
1371 
1372 		return;
1373 	}
1374 
1375 	/*
1376 	 * Disconnect this cluster from the list of clusters.
1377 	 * The cluster that is being dealt with must be fixed
1378 	 * in size after this point, so there is no reason
1379 	 * to leave it on the list so that new requests can
1380 	 * find it.
1381 	 *
1382 	 * The algorithm is that the first write request will
1383 	 * create a cluster, convert the file handle to a
1384 	 * vnode pointer, and then lock the file for writing.
1385 	 * This request is not likely to be clustered with
1386 	 * any others.  However, the next request will create
1387 	 * a new cluster and be blocked in VOP_RWLOCK while
1388 	 * the first request is being processed.  This delay
1389 	 * will allow more requests to be clustered in this
1390 	 * second cluster.
1391 	 */
1392 	mutex_enter(&rfs_async_write_lock);
1393 	if (rfs_async_write_head == nlp)
1394 		rfs_async_write_head = nlp->next;
1395 	else {
1396 		lp = rfs_async_write_head;
1397 		while (lp->next != nlp)
1398 			lp = lp->next;
1399 		lp->next = nlp->next;
1400 	}
1401 	mutex_exit(&rfs_async_write_lock);
1402 
1403 	/*
1404 	 * Step through the list of requests in this cluster.
1405 	 * We need to check permissions to make sure that all
1406 	 * of the requests have sufficient permission to write
1407 	 * the file.  A cluster can be composed of requests
1408 	 * from different clients and different users on each
1409 	 * client.
1410 	 *
1411 	 * As a side effect, we also calculate the size of the
1412 	 * byte range that this cluster encompasses.
1413 	 */
1414 	rp = nlp->list;
1415 	off = rp->wa->wa_offset;
1416 	len = (uint_t)0;
1417 	do {
1418 		if (rdonly(exi, vp, rp->req)) {
1419 			rp->ns->ns_status = NFSERR_ROFS;
1420 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1421 			rp->thread->t_flag |= t_flag;
1422 			continue;
1423 		}
1424 
1425 		va.va_mask = AT_UID|AT_MODE;
1426 
1427 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1428 
1429 		if (!error) {
1430 			if (crgetuid(rp->cr) != va.va_uid) {
1431 				/*
1432 				 * This is a kludge to allow writes of files
1433 				 * created with read only permission.  The
1434 				 * owner of the file is always allowed to
1435 				 * write it.
1436 				 */
1437 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1438 			}
1439 			if (!error && MANDLOCK(vp, va.va_mode))
1440 				error = EACCES;
1441 		}
1442 
1443 		/*
1444 		 * Check for a conflict with a nbmand-locked region.
1445 		 */
1446 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1447 		    rp->wa->wa_count, 0, NULL)) {
1448 			error = EACCES;
1449 		}
1450 
1451 		if (error) {
1452 			rp->ns->ns_status = puterrno(error);
1453 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1454 			rp->thread->t_flag |= t_flag;
1455 			continue;
1456 		}
1457 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1458 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1459 	} while ((rp = rp->list) != NULL);
1460 
1461 	/*
1462 	 * Step through the cluster attempting to gather as many
1463 	 * requests which are contiguous as possible.  These
1464 	 * contiguous requests are handled via one call to VOP_WRITE
1465 	 * instead of different calls to VOP_WRITE.  We also keep
1466 	 * track of the fact that any data was written.
1467 	 */
1468 	rp = nlp->list;
1469 	data_written = 0;
1470 	do {
1471 		/*
1472 		 * Skip any requests which are already marked as having an
1473 		 * error.
1474 		 */
1475 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1476 			rp = rp->list;
1477 			continue;
1478 		}
1479 
1480 		/*
1481 		 * Count the number of iovec's which are required
1482 		 * to handle this set of requests.  One iovec is
1483 		 * needed for each data buffer, whether addressed
1484 		 * by wa_data or by the b_rptr pointers in the
1485 		 * mblk chains.
1486 		 */
1487 		iovcnt = 0;
1488 		lrp = rp;
1489 		for (;;) {
1490 			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1491 				iovcnt++;
1492 			else {
1493 				m = lrp->wa->wa_mblk;
1494 				while (m != NULL) {
1495 					iovcnt++;
1496 					m = m->b_cont;
1497 				}
1498 			}
1499 			if (lrp->list == NULL ||
1500 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1501 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1502 			    lrp->list->wa->wa_offset) {
1503 				lrp = lrp->list;
1504 				break;
1505 			}
1506 			lrp = lrp->list;
1507 		}
1508 
1509 		if (iovcnt <= MAXCLIOVECS) {
1510 #ifdef DEBUG
1511 			rfs_write_hits++;
1512 #endif
1513 			niovp = iov;
1514 		} else {
1515 #ifdef DEBUG
1516 			rfs_write_misses++;
1517 #endif
1518 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1519 		}
1520 		/*
1521 		 * Put together the scatter/gather iovecs.
1522 		 */
1523 		iovp = niovp;
1524 		trp = rp;
1525 		count = 0;
1526 		do {
1527 			if (trp->wa->wa_data || trp->wa->wa_rlist) {
1528 				if (trp->wa->wa_rlist) {
1529 					iovp->iov_base =
1530 					    (char *)((trp->wa->wa_rlist)->
1531 					    u.c_daddr3);
1532 					iovp->iov_len = trp->wa->wa_count;
1533 				} else  {
1534 					iovp->iov_base = trp->wa->wa_data;
1535 					iovp->iov_len = trp->wa->wa_count;
1536 				}
1537 				iovp++;
1538 			} else {
1539 				m = trp->wa->wa_mblk;
1540 				rcount = trp->wa->wa_count;
1541 				while (m != NULL) {
1542 					iovp->iov_base = (caddr_t)m->b_rptr;
1543 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1544 					rcount -= iovp->iov_len;
1545 					if (rcount < 0)
1546 						iovp->iov_len += rcount;
1547 					iovp++;
1548 					if (rcount <= 0)
1549 						break;
1550 					m = m->b_cont;
1551 				}
1552 			}
1553 			count += trp->wa->wa_count;
1554 			trp = trp->list;
1555 		} while (trp != lrp);
1556 
1557 		uio.uio_iov = niovp;
1558 		uio.uio_iovcnt = iovcnt;
1559 		uio.uio_segflg = UIO_SYSSPACE;
1560 		uio.uio_extflg = UIO_COPY_DEFAULT;
1561 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1562 		uio.uio_resid = count;
1563 		/*
1564 		 * The limit is checked on the client. We
1565 		 * should allow any size writes here.
1566 		 */
1567 		uio.uio_llimit = curproc->p_fsz_ctl;
1568 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1569 		if (rlimit < (rlim64_t)uio.uio_resid)
1570 			uio.uio_resid = (uint_t)rlimit;
1571 
1572 		/*
1573 		 * For now we assume no append mode.
1574 		 */
1575 
1576 		/*
1577 		 * We're changing creds because VM may fault
1578 		 * and we need the cred of the current
1579 		 * thread to be used if quota * checking is
1580 		 * enabled.
1581 		 */
1582 		savecred = curthread->t_cred;
1583 		curthread->t_cred = cr;
1584 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1585 		curthread->t_cred = savecred;
1586 
1587 		/* check if a monitor detected a delegation conflict */
1588 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1589 			/* mark as wouldblock so response is dropped */
1590 			curthread->t_flag |= T_WOULDBLOCK;
1591 
1592 		if (niovp != iov)
1593 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1594 
1595 		if (!error) {
1596 			data_written = 1;
1597 			/*
1598 			 * Get attributes again so we send the latest mod
1599 			 * time to the client side for his cache.
1600 			 */
1601 			va.va_mask = AT_ALL;	/* now we want everything */
1602 
1603 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1604 
1605 			if (!error)
1606 				acl_perm(vp, exi, &va, rp->cr);
1607 		}
1608 
1609 		/*
1610 		 * Fill in the status responses for each request
1611 		 * which was just handled.  Also, copy the latest
1612 		 * attributes in to the attribute responses if
1613 		 * appropriate.
1614 		 */
1615 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1616 		do {
1617 			rp->thread->t_flag |= t_flag;
1618 			/* check for overflows */
1619 			if (!error) {
1620 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1621 			}
1622 			rp->ns->ns_status = puterrno(error);
1623 			rp = rp->list;
1624 		} while (rp != lrp);
1625 	} while (rp != NULL);
1626 
1627 	/*
1628 	 * If any data was written at all, then we need to flush
1629 	 * the data and metadata to stable storage.
1630 	 */
1631 	if (data_written) {
1632 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1633 
1634 		if (!error) {
1635 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1636 		}
1637 	}
1638 
1639 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1640 
1641 	if (in_crit)
1642 		nbl_end_crit(vp);
1643 	VN_RELE(vp);
1644 
1645 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1646 	mutex_enter(&rfs_async_write_lock);
1647 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1648 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1649 			rp->ns->ns_status = puterrno(error);
1650 			rp->thread->t_flag |= t_flag;
1651 		}
1652 	}
1653 	cv_broadcast(&nlp->cv);
1654 	mutex_exit(&rfs_async_write_lock);
1655 
1656 }
1657 
1658 void *
1659 rfs_write_getfh(struct nfswriteargs *wa)
1660 {
1661 	return (&wa->wa_fhandle);
1662 }
1663 
1664 /*
1665  * Create a file.
1666  * Creates a file with given attributes and returns those attributes
1667  * and an fhandle for the new file.
1668  */
1669 void
1670 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1671 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1672 {
1673 	int error;
1674 	int lookuperr;
1675 	int in_crit = 0;
1676 	struct vattr va;
1677 	vnode_t *vp;
1678 	vnode_t *realvp;
1679 	vnode_t *dvp;
1680 	char *name = args->ca_da.da_name;
1681 	vnode_t *tvp = NULL;
1682 	int mode;
1683 	int lookup_ok;
1684 	bool_t trunc;
1685 	struct sockaddr *ca;
1686 
1687 	/*
1688 	 * Disallow NULL paths
1689 	 */
1690 	if (name == NULL || *name == '\0') {
1691 		dr->dr_status = NFSERR_ACCES;
1692 		return;
1693 	}
1694 
1695 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1696 	if (dvp == NULL) {
1697 		dr->dr_status = NFSERR_STALE;
1698 		return;
1699 	}
1700 
1701 	error = sattr_to_vattr(args->ca_sa, &va);
1702 	if (error) {
1703 		dr->dr_status = puterrno(error);
1704 		return;
1705 	}
1706 
1707 	/*
1708 	 * Must specify the mode.
1709 	 */
1710 	if (!(va.va_mask & AT_MODE)) {
1711 		VN_RELE(dvp);
1712 		dr->dr_status = NFSERR_INVAL;
1713 		return;
1714 	}
1715 
1716 	/*
1717 	 * This is a completely gross hack to make mknod
1718 	 * work over the wire until we can wack the protocol
1719 	 */
1720 	if ((va.va_mode & IFMT) == IFCHR) {
1721 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1722 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1723 		else {
1724 			va.va_type = VCHR;
1725 			/*
1726 			 * uncompress the received dev_t
1727 			 * if the top half is zero indicating a request
1728 			 * from an `older style' OS.
1729 			 */
1730 			if ((va.va_size & 0xffff0000) == 0)
1731 				va.va_rdev = nfsv2_expdev(va.va_size);
1732 			else
1733 				va.va_rdev = (dev_t)va.va_size;
1734 		}
1735 		va.va_mask &= ~AT_SIZE;
1736 	} else if ((va.va_mode & IFMT) == IFBLK) {
1737 		va.va_type = VBLK;
1738 		/*
1739 		 * uncompress the received dev_t
1740 		 * if the top half is zero indicating a request
1741 		 * from an `older style' OS.
1742 		 */
1743 		if ((va.va_size & 0xffff0000) == 0)
1744 			va.va_rdev = nfsv2_expdev(va.va_size);
1745 		else
1746 			va.va_rdev = (dev_t)va.va_size;
1747 		va.va_mask &= ~AT_SIZE;
1748 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1749 		va.va_type = VSOCK;
1750 	} else {
1751 		va.va_type = VREG;
1752 	}
1753 	va.va_mode &= ~IFMT;
1754 	va.va_mask |= AT_TYPE;
1755 
1756 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1757 	name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1758 	    MAXPATHLEN);
1759 	if (name == NULL) {
1760 		dr->dr_status = puterrno(EINVAL);
1761 		return;
1762 	}
1763 
1764 	/*
1765 	 * Why was the choice made to use VWRITE as the mode to the
1766 	 * call to VOP_CREATE ? This results in a bug.  When a client
1767 	 * opens a file that already exists and is RDONLY, the second
1768 	 * open fails with an EACESS because of the mode.
1769 	 * bug ID 1054648.
1770 	 */
1771 	lookup_ok = 0;
1772 	mode = VWRITE;
1773 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1774 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1775 		    NULL, NULL, NULL);
1776 		if (!error) {
1777 			struct vattr at;
1778 
1779 			lookup_ok = 1;
1780 			at.va_mask = AT_MODE;
1781 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1782 			if (!error)
1783 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1784 			VN_RELE(tvp);
1785 			tvp = NULL;
1786 		}
1787 	}
1788 
1789 	if (!lookup_ok) {
1790 		if (rdonly(exi, dvp, req)) {
1791 			error = EROFS;
1792 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1793 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1794 			error = EPERM;
1795 		} else {
1796 			error = 0;
1797 		}
1798 	}
1799 
1800 	/*
1801 	 * If file size is being modified on an already existing file
1802 	 * make sure that there are no conflicting non-blocking mandatory
1803 	 * locks in the region being manipulated. Return EACCES if there
1804 	 * are conflicting locks.
1805 	 */
1806 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1807 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1808 		    NULL, NULL, NULL);
1809 
1810 		if (!lookuperr &&
1811 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1812 			VN_RELE(tvp);
1813 			curthread->t_flag |= T_WOULDBLOCK;
1814 			goto out;
1815 		}
1816 
1817 		if (!lookuperr && nbl_need_check(tvp)) {
1818 			/*
1819 			 * The file exists. Now check if it has any
1820 			 * conflicting non-blocking mandatory locks
1821 			 * in the region being changed.
1822 			 */
1823 			struct vattr bva;
1824 			u_offset_t offset;
1825 			ssize_t length;
1826 
1827 			nbl_start_crit(tvp, RW_READER);
1828 			in_crit = 1;
1829 
1830 			bva.va_mask = AT_SIZE;
1831 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1832 			if (!error) {
1833 				if (va.va_size < bva.va_size) {
1834 					offset = va.va_size;
1835 					length = bva.va_size - va.va_size;
1836 				} else {
1837 					offset = bva.va_size;
1838 					length = va.va_size - bva.va_size;
1839 				}
1840 				if (length) {
1841 					if (nbl_conflict(tvp, NBL_WRITE,
1842 					    offset, length, 0, NULL)) {
1843 						error = EACCES;
1844 					}
1845 				}
1846 			}
1847 			if (error) {
1848 				nbl_end_crit(tvp);
1849 				VN_RELE(tvp);
1850 				in_crit = 0;
1851 			}
1852 		} else if (tvp != NULL) {
1853 			VN_RELE(tvp);
1854 		}
1855 	}
1856 
1857 	if (!error) {
1858 		/*
1859 		 * If filesystem is shared with nosuid the remove any
1860 		 * setuid/setgid bits on create.
1861 		 */
1862 		if (va.va_type == VREG &&
1863 		    exi->exi_export.ex_flags & EX_NOSUID)
1864 			va.va_mode &= ~(VSUID | VSGID);
1865 
1866 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1867 		    NULL, NULL);
1868 
1869 		if (!error) {
1870 
1871 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1872 				trunc = TRUE;
1873 			else
1874 				trunc = FALSE;
1875 
1876 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1877 				VN_RELE(vp);
1878 				curthread->t_flag |= T_WOULDBLOCK;
1879 				goto out;
1880 			}
1881 			va.va_mask = AT_ALL;
1882 
1883 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1884 
1885 			/* check for overflows */
1886 			if (!error) {
1887 				acl_perm(vp, exi, &va, cr);
1888 				error = vattr_to_nattr(&va, &dr->dr_attr);
1889 				if (!error) {
1890 					error = makefh(&dr->dr_fhandle, vp,
1891 					    exi);
1892 				}
1893 			}
1894 			/*
1895 			 * Force modified metadata out to stable storage.
1896 			 *
1897 			 * if a underlying vp exists, pass it to VOP_FSYNC
1898 			 */
1899 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1900 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1901 			else
1902 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1903 			VN_RELE(vp);
1904 		}
1905 
1906 		if (in_crit) {
1907 			nbl_end_crit(tvp);
1908 			VN_RELE(tvp);
1909 		}
1910 	}
1911 
1912 	/*
1913 	 * Force modified data and metadata out to stable storage.
1914 	 */
1915 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1916 
1917 out:
1918 
1919 	VN_RELE(dvp);
1920 
1921 	dr->dr_status = puterrno(error);
1922 
1923 	if (name != args->ca_da.da_name)
1924 		kmem_free(name, MAXPATHLEN);
1925 }
1926 void *
1927 rfs_create_getfh(struct nfscreatargs *args)
1928 {
1929 	return (args->ca_da.da_fhandle);
1930 }
1931 
1932 /*
1933  * Remove a file.
1934  * Remove named file from parent directory.
1935  */
1936 void
1937 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1938 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1939 {
1940 	int error = 0;
1941 	vnode_t *vp;
1942 	vnode_t *targvp;
1943 	int in_crit = 0;
1944 
1945 	/*
1946 	 * Disallow NULL paths
1947 	 */
1948 	if (da->da_name == NULL || *da->da_name == '\0') {
1949 		*status = NFSERR_ACCES;
1950 		return;
1951 	}
1952 
1953 	vp = nfs_fhtovp(da->da_fhandle, exi);
1954 	if (vp == NULL) {
1955 		*status = NFSERR_STALE;
1956 		return;
1957 	}
1958 
1959 	if (rdonly(exi, vp, req)) {
1960 		VN_RELE(vp);
1961 		*status = NFSERR_ROFS;
1962 		return;
1963 	}
1964 
1965 	/*
1966 	 * Check for a conflict with a non-blocking mandatory share reservation.
1967 	 */
1968 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1969 	    NULL, cr, NULL, NULL, NULL);
1970 	if (error != 0) {
1971 		VN_RELE(vp);
1972 		*status = puterrno(error);
1973 		return;
1974 	}
1975 
1976 	/*
1977 	 * If the file is delegated to an v4 client, then initiate
1978 	 * recall and drop this request (by setting T_WOULDBLOCK).
1979 	 * The client will eventually re-transmit the request and
1980 	 * (hopefully), by then, the v4 client will have returned
1981 	 * the delegation.
1982 	 */
1983 
1984 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1985 		VN_RELE(vp);
1986 		VN_RELE(targvp);
1987 		curthread->t_flag |= T_WOULDBLOCK;
1988 		return;
1989 	}
1990 
1991 	if (nbl_need_check(targvp)) {
1992 		nbl_start_crit(targvp, RW_READER);
1993 		in_crit = 1;
1994 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1995 			error = EACCES;
1996 			goto out;
1997 		}
1998 	}
1999 
2000 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2001 
2002 	/*
2003 	 * Force modified data and metadata out to stable storage.
2004 	 */
2005 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2006 
2007 out:
2008 	if (in_crit)
2009 		nbl_end_crit(targvp);
2010 	VN_RELE(targvp);
2011 	VN_RELE(vp);
2012 
2013 	*status = puterrno(error);
2014 
2015 }
2016 
2017 void *
2018 rfs_remove_getfh(struct nfsdiropargs *da)
2019 {
2020 	return (da->da_fhandle);
2021 }
2022 
2023 /*
2024  * rename a file
2025  * Give a file (from) a new name (to).
2026  */
2027 void
2028 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2029 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2030 {
2031 	int error = 0;
2032 	vnode_t *fromvp;
2033 	vnode_t *tovp;
2034 	struct exportinfo *to_exi;
2035 	fhandle_t *fh;
2036 	vnode_t *srcvp;
2037 	vnode_t *targvp;
2038 	int in_crit = 0;
2039 
2040 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2041 	if (fromvp == NULL) {
2042 		*status = NFSERR_STALE;
2043 		return;
2044 	}
2045 
2046 	fh = args->rna_to.da_fhandle;
2047 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2048 	if (to_exi == NULL) {
2049 		VN_RELE(fromvp);
2050 		*status = NFSERR_ACCES;
2051 		return;
2052 	}
2053 	exi_rele(to_exi);
2054 
2055 	if (to_exi != exi) {
2056 		VN_RELE(fromvp);
2057 		*status = NFSERR_XDEV;
2058 		return;
2059 	}
2060 
2061 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2062 	if (tovp == NULL) {
2063 		VN_RELE(fromvp);
2064 		*status = NFSERR_STALE;
2065 		return;
2066 	}
2067 
2068 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2069 		VN_RELE(tovp);
2070 		VN_RELE(fromvp);
2071 		*status = NFSERR_NOTDIR;
2072 		return;
2073 	}
2074 
2075 	/*
2076 	 * Disallow NULL paths
2077 	 */
2078 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2079 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2080 		VN_RELE(tovp);
2081 		VN_RELE(fromvp);
2082 		*status = NFSERR_ACCES;
2083 		return;
2084 	}
2085 
2086 	if (rdonly(exi, tovp, req)) {
2087 		VN_RELE(tovp);
2088 		VN_RELE(fromvp);
2089 		*status = NFSERR_ROFS;
2090 		return;
2091 	}
2092 
2093 	/*
2094 	 * Check for a conflict with a non-blocking mandatory share reservation.
2095 	 */
2096 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2097 	    NULL, cr, NULL, NULL, NULL);
2098 	if (error != 0) {
2099 		VN_RELE(tovp);
2100 		VN_RELE(fromvp);
2101 		*status = puterrno(error);
2102 		return;
2103 	}
2104 
2105 	/* Check for delegations on the source file */
2106 
2107 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2108 		VN_RELE(tovp);
2109 		VN_RELE(fromvp);
2110 		VN_RELE(srcvp);
2111 		curthread->t_flag |= T_WOULDBLOCK;
2112 		return;
2113 	}
2114 
2115 	/* Check for delegation on the file being renamed over, if it exists */
2116 
2117 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2118 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2119 	    NULL, NULL, NULL) == 0) {
2120 
2121 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2122 			VN_RELE(tovp);
2123 			VN_RELE(fromvp);
2124 			VN_RELE(srcvp);
2125 			VN_RELE(targvp);
2126 			curthread->t_flag |= T_WOULDBLOCK;
2127 			return;
2128 		}
2129 		VN_RELE(targvp);
2130 	}
2131 
2132 
2133 	if (nbl_need_check(srcvp)) {
2134 		nbl_start_crit(srcvp, RW_READER);
2135 		in_crit = 1;
2136 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2137 			error = EACCES;
2138 			goto out;
2139 		}
2140 	}
2141 
2142 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2143 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2144 
2145 	if (error == 0)
2146 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2147 		    strlen(args->rna_to.da_name));
2148 
2149 	/*
2150 	 * Force modified data and metadata out to stable storage.
2151 	 */
2152 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2153 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2154 
2155 out:
2156 	if (in_crit)
2157 		nbl_end_crit(srcvp);
2158 	VN_RELE(srcvp);
2159 	VN_RELE(tovp);
2160 	VN_RELE(fromvp);
2161 
2162 	*status = puterrno(error);
2163 
2164 }
2165 void *
2166 rfs_rename_getfh(struct nfsrnmargs *args)
2167 {
2168 	return (args->rna_from.da_fhandle);
2169 }
2170 
2171 /*
2172  * Link to a file.
2173  * Create a file (to) which is a hard link to the given file (from).
2174  */
2175 void
2176 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2177 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2178 {
2179 	int error;
2180 	vnode_t *fromvp;
2181 	vnode_t *tovp;
2182 	struct exportinfo *to_exi;
2183 	fhandle_t *fh;
2184 
2185 	fromvp = nfs_fhtovp(args->la_from, exi);
2186 	if (fromvp == NULL) {
2187 		*status = NFSERR_STALE;
2188 		return;
2189 	}
2190 
2191 	fh = args->la_to.da_fhandle;
2192 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2193 	if (to_exi == NULL) {
2194 		VN_RELE(fromvp);
2195 		*status = NFSERR_ACCES;
2196 		return;
2197 	}
2198 	exi_rele(to_exi);
2199 
2200 	if (to_exi != exi) {
2201 		VN_RELE(fromvp);
2202 		*status = NFSERR_XDEV;
2203 		return;
2204 	}
2205 
2206 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2207 	if (tovp == NULL) {
2208 		VN_RELE(fromvp);
2209 		*status = NFSERR_STALE;
2210 		return;
2211 	}
2212 
2213 	if (tovp->v_type != VDIR) {
2214 		VN_RELE(tovp);
2215 		VN_RELE(fromvp);
2216 		*status = NFSERR_NOTDIR;
2217 		return;
2218 	}
2219 	/*
2220 	 * Disallow NULL paths
2221 	 */
2222 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2223 		VN_RELE(tovp);
2224 		VN_RELE(fromvp);
2225 		*status = NFSERR_ACCES;
2226 		return;
2227 	}
2228 
2229 	if (rdonly(exi, tovp, req)) {
2230 		VN_RELE(tovp);
2231 		VN_RELE(fromvp);
2232 		*status = NFSERR_ROFS;
2233 		return;
2234 	}
2235 
2236 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2237 
2238 	/*
2239 	 * Force modified data and metadata out to stable storage.
2240 	 */
2241 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2242 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2243 
2244 	VN_RELE(tovp);
2245 	VN_RELE(fromvp);
2246 
2247 	*status = puterrno(error);
2248 
2249 }
2250 void *
2251 rfs_link_getfh(struct nfslinkargs *args)
2252 {
2253 	return (args->la_from);
2254 }
2255 
2256 /*
2257  * Symbolicly link to a file.
2258  * Create a file (to) with the given attributes which is a symbolic link
2259  * to the given path name (to).
2260  */
2261 void
2262 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2263 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2264 {
2265 	int error;
2266 	struct vattr va;
2267 	vnode_t *vp;
2268 	vnode_t *svp;
2269 	int lerror;
2270 	struct sockaddr *ca;
2271 	char *name = NULL;
2272 
2273 	/*
2274 	 * Disallow NULL paths
2275 	 */
2276 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2277 		*status = NFSERR_ACCES;
2278 		return;
2279 	}
2280 
2281 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2282 	if (vp == NULL) {
2283 		*status = NFSERR_STALE;
2284 		return;
2285 	}
2286 
2287 	if (rdonly(exi, vp, req)) {
2288 		VN_RELE(vp);
2289 		*status = NFSERR_ROFS;
2290 		return;
2291 	}
2292 
2293 	error = sattr_to_vattr(args->sla_sa, &va);
2294 	if (error) {
2295 		VN_RELE(vp);
2296 		*status = puterrno(error);
2297 		return;
2298 	}
2299 
2300 	if (!(va.va_mask & AT_MODE)) {
2301 		VN_RELE(vp);
2302 		*status = NFSERR_INVAL;
2303 		return;
2304 	}
2305 
2306 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2307 	name = nfscmd_convname(ca, exi, args->sla_tnm,
2308 	    NFSCMD_CONV_INBOUND, MAXPATHLEN);
2309 
2310 	if (name == NULL) {
2311 		*status = NFSERR_ACCES;
2312 		return;
2313 	}
2314 
2315 	va.va_type = VLNK;
2316 	va.va_mask |= AT_TYPE;
2317 
2318 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2319 
2320 	/*
2321 	 * Force new data and metadata out to stable storage.
2322 	 */
2323 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2324 	    NULL, cr, NULL, NULL, NULL);
2325 
2326 	if (!lerror) {
2327 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2328 		VN_RELE(svp);
2329 	}
2330 
2331 	/*
2332 	 * Force modified data and metadata out to stable storage.
2333 	 */
2334 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2335 
2336 	VN_RELE(vp);
2337 
2338 	*status = puterrno(error);
2339 	if (name != args->sla_tnm)
2340 		kmem_free(name, MAXPATHLEN);
2341 
2342 }
2343 void *
2344 rfs_symlink_getfh(struct nfsslargs *args)
2345 {
2346 	return (args->sla_from.da_fhandle);
2347 }
2348 
2349 /*
2350  * Make a directory.
2351  * Create a directory with the given name, parent directory, and attributes.
2352  * Returns a file handle and attributes for the new directory.
2353  */
2354 void
2355 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2356 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2357 {
2358 	int error;
2359 	struct vattr va;
2360 	vnode_t *dvp = NULL;
2361 	vnode_t *vp;
2362 	char *name = args->ca_da.da_name;
2363 
2364 	/*
2365 	 * Disallow NULL paths
2366 	 */
2367 	if (name == NULL || *name == '\0') {
2368 		dr->dr_status = NFSERR_ACCES;
2369 		return;
2370 	}
2371 
2372 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2373 	if (vp == NULL) {
2374 		dr->dr_status = NFSERR_STALE;
2375 		return;
2376 	}
2377 
2378 	if (rdonly(exi, vp, req)) {
2379 		VN_RELE(vp);
2380 		dr->dr_status = NFSERR_ROFS;
2381 		return;
2382 	}
2383 
2384 	error = sattr_to_vattr(args->ca_sa, &va);
2385 	if (error) {
2386 		VN_RELE(vp);
2387 		dr->dr_status = puterrno(error);
2388 		return;
2389 	}
2390 
2391 	if (!(va.va_mask & AT_MODE)) {
2392 		VN_RELE(vp);
2393 		dr->dr_status = NFSERR_INVAL;
2394 		return;
2395 	}
2396 
2397 	va.va_type = VDIR;
2398 	va.va_mask |= AT_TYPE;
2399 
2400 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2401 
2402 	if (!error) {
2403 		/*
2404 		 * Attribtutes of the newly created directory should
2405 		 * be returned to the client.
2406 		 */
2407 		va.va_mask = AT_ALL; /* We want everything */
2408 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2409 
2410 		/* check for overflows */
2411 		if (!error) {
2412 			acl_perm(vp, exi, &va, cr);
2413 			error = vattr_to_nattr(&va, &dr->dr_attr);
2414 			if (!error) {
2415 				error = makefh(&dr->dr_fhandle, dvp, exi);
2416 			}
2417 		}
2418 		/*
2419 		 * Force new data and metadata out to stable storage.
2420 		 */
2421 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2422 		VN_RELE(dvp);
2423 	}
2424 
2425 	/*
2426 	 * Force modified data and metadata out to stable storage.
2427 	 */
2428 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2429 
2430 	VN_RELE(vp);
2431 
2432 	dr->dr_status = puterrno(error);
2433 
2434 }
2435 void *
2436 rfs_mkdir_getfh(struct nfscreatargs *args)
2437 {
2438 	return (args->ca_da.da_fhandle);
2439 }
2440 
2441 /*
2442  * Remove a directory.
2443  * Remove the given directory name from the given parent directory.
2444  */
2445 void
2446 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2447 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2448 {
2449 	int error;
2450 	vnode_t *vp;
2451 
2452 	/*
2453 	 * Disallow NULL paths
2454 	 */
2455 	if (da->da_name == NULL || *da->da_name == '\0') {
2456 		*status = NFSERR_ACCES;
2457 		return;
2458 	}
2459 
2460 	vp = nfs_fhtovp(da->da_fhandle, exi);
2461 	if (vp == NULL) {
2462 		*status = NFSERR_STALE;
2463 		return;
2464 	}
2465 
2466 	if (rdonly(exi, vp, req)) {
2467 		VN_RELE(vp);
2468 		*status = NFSERR_ROFS;
2469 		return;
2470 	}
2471 
2472 	/*
2473 	 * VOP_RMDIR takes a third argument (the current
2474 	 * directory of the process).  That's because someone
2475 	 * wants to return EINVAL if one tries to remove ".".
2476 	 * Of course, NFS servers have no idea what their
2477 	 * clients' current directories are.  We fake it by
2478 	 * supplying a vnode known to exist and illegal to
2479 	 * remove.
2480 	 */
2481 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2482 
2483 	/*
2484 	 * Force modified data and metadata out to stable storage.
2485 	 */
2486 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2487 
2488 	VN_RELE(vp);
2489 
2490 	/*
2491 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2492 	 * if the directory is not empty.  A System V NFS server
2493 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2494 	 * over the wire.
2495 	 */
2496 	if (error == EEXIST)
2497 		*status = NFSERR_NOTEMPTY;
2498 	else
2499 		*status = puterrno(error);
2500 
2501 }
2502 void *
2503 rfs_rmdir_getfh(struct nfsdiropargs *da)
2504 {
2505 	return (da->da_fhandle);
2506 }
2507 
2508 /* ARGSUSED */
2509 void
2510 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2511 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2512 {
2513 	int error;
2514 	int iseof;
2515 	struct iovec iov;
2516 	struct uio uio;
2517 	vnode_t *vp;
2518 	char *ndata = NULL;
2519 	struct sockaddr *ca;
2520 	size_t nents;
2521 	int ret;
2522 
2523 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2524 	if (vp == NULL) {
2525 		rd->rd_entries = NULL;
2526 		rd->rd_status = NFSERR_STALE;
2527 		return;
2528 	}
2529 
2530 	if (vp->v_type != VDIR) {
2531 		VN_RELE(vp);
2532 		rd->rd_entries = NULL;
2533 		rd->rd_status = NFSERR_NOTDIR;
2534 		return;
2535 	}
2536 
2537 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2538 
2539 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2540 
2541 	if (error) {
2542 		rd->rd_entries = NULL;
2543 		goto bad;
2544 	}
2545 
2546 	if (rda->rda_count == 0) {
2547 		rd->rd_entries = NULL;
2548 		rd->rd_size = 0;
2549 		rd->rd_eof = FALSE;
2550 		goto bad;
2551 	}
2552 
2553 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2554 
2555 	/*
2556 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2557 	 */
2558 	rd->rd_bufsize = (uint_t)rda->rda_count;
2559 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2560 
2561 	/*
2562 	 * Set up io vector to read directory data
2563 	 */
2564 	iov.iov_base = (caddr_t)rd->rd_entries;
2565 	iov.iov_len = rda->rda_count;
2566 	uio.uio_iov = &iov;
2567 	uio.uio_iovcnt = 1;
2568 	uio.uio_segflg = UIO_SYSSPACE;
2569 	uio.uio_extflg = UIO_COPY_CACHED;
2570 	uio.uio_loffset = (offset_t)rda->rda_offset;
2571 	uio.uio_resid = rda->rda_count;
2572 
2573 	/*
2574 	 * read directory
2575 	 */
2576 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2577 
2578 	/*
2579 	 * Clean up
2580 	 */
2581 	if (!error) {
2582 		/*
2583 		 * set size and eof
2584 		 */
2585 		if (uio.uio_resid == rda->rda_count) {
2586 			rd->rd_size = 0;
2587 			rd->rd_eof = TRUE;
2588 		} else {
2589 			rd->rd_size = (uint32_t)(rda->rda_count -
2590 			    uio.uio_resid);
2591 			rd->rd_eof = iseof ? TRUE : FALSE;
2592 		}
2593 	}
2594 
2595 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2596 	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2597 	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2598 	    rda->rda_count, &ndata);
2599 
2600 	if (ret != 0) {
2601 		size_t dropbytes;
2602 		/*
2603 		 * We had to drop one or more entries in order to fit
2604 		 * during the character conversion.  We need to patch
2605 		 * up the size and eof info.
2606 		 */
2607 		if (rd->rd_eof)
2608 			rd->rd_eof = FALSE;
2609 		dropbytes = nfscmd_dropped_entrysize(
2610 		    (struct dirent64 *)rd->rd_entries, nents, ret);
2611 		rd->rd_size -= dropbytes;
2612 	}
2613 	if (ndata == NULL) {
2614 		ndata = (char *)rd->rd_entries;
2615 	} else if (ndata != (char *)rd->rd_entries) {
2616 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2617 		rd->rd_entries = (void *)ndata;
2618 		rd->rd_bufsize = rda->rda_count;
2619 	}
2620 
2621 bad:
2622 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2623 
2624 #if 0 /* notyet */
2625 	/*
2626 	 * Don't do this.  It causes local disk writes when just
2627 	 * reading the file and the overhead is deemed larger
2628 	 * than the benefit.
2629 	 */
2630 	/*
2631 	 * Force modified metadata out to stable storage.
2632 	 */
2633 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2634 #endif
2635 
2636 	VN_RELE(vp);
2637 
2638 	rd->rd_status = puterrno(error);
2639 
2640 }
2641 void *
2642 rfs_readdir_getfh(struct nfsrddirargs *rda)
2643 {
2644 	return (&rda->rda_fh);
2645 }
2646 void
2647 rfs_rddirfree(struct nfsrddirres *rd)
2648 {
2649 	if (rd->rd_entries != NULL)
2650 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2651 }
2652 
2653 /* ARGSUSED */
2654 void
2655 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2656 	struct svc_req *req, cred_t *cr)
2657 {
2658 	int error;
2659 	struct statvfs64 sb;
2660 	vnode_t *vp;
2661 
2662 	vp = nfs_fhtovp(fh, exi);
2663 	if (vp == NULL) {
2664 		fs->fs_status = NFSERR_STALE;
2665 		return;
2666 	}
2667 
2668 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2669 
2670 	if (!error) {
2671 		fs->fs_tsize = nfstsize();
2672 		fs->fs_bsize = sb.f_frsize;
2673 		fs->fs_blocks = sb.f_blocks;
2674 		fs->fs_bfree = sb.f_bfree;
2675 		fs->fs_bavail = sb.f_bavail;
2676 	}
2677 
2678 	VN_RELE(vp);
2679 
2680 	fs->fs_status = puterrno(error);
2681 
2682 }
2683 void *
2684 rfs_statfs_getfh(fhandle_t *fh)
2685 {
2686 	return (fh);
2687 }
2688 
2689 static int
2690 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2691 {
2692 	vap->va_mask = 0;
2693 
2694 	/*
2695 	 * There was a sign extension bug in some VFS based systems
2696 	 * which stored the mode as a short.  When it would get
2697 	 * assigned to a u_long, no sign extension would occur.
2698 	 * It needed to, but this wasn't noticed because sa_mode
2699 	 * would then get assigned back to the short, thus ignoring
2700 	 * the upper 16 bits of sa_mode.
2701 	 *
2702 	 * To make this implementation work for both broken
2703 	 * clients and good clients, we check for both versions
2704 	 * of the mode.
2705 	 */
2706 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2707 	    sa->sa_mode != (uint32_t)-1) {
2708 		vap->va_mask |= AT_MODE;
2709 		vap->va_mode = sa->sa_mode;
2710 	}
2711 	if (sa->sa_uid != (uint32_t)-1) {
2712 		vap->va_mask |= AT_UID;
2713 		vap->va_uid = sa->sa_uid;
2714 	}
2715 	if (sa->sa_gid != (uint32_t)-1) {
2716 		vap->va_mask |= AT_GID;
2717 		vap->va_gid = sa->sa_gid;
2718 	}
2719 	if (sa->sa_size != (uint32_t)-1) {
2720 		vap->va_mask |= AT_SIZE;
2721 		vap->va_size = sa->sa_size;
2722 	}
2723 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2724 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2725 #ifndef _LP64
2726 		/* return error if time overflow */
2727 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2728 			return (EOVERFLOW);
2729 #endif
2730 		vap->va_mask |= AT_ATIME;
2731 		/*
2732 		 * nfs protocol defines times as unsigned so don't extend sign,
2733 		 * unless sysadmin set nfs_allow_preepoch_time.
2734 		 */
2735 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2736 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2737 	}
2738 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2739 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2740 #ifndef _LP64
2741 		/* return error if time overflow */
2742 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2743 			return (EOVERFLOW);
2744 #endif
2745 		vap->va_mask |= AT_MTIME;
2746 		/*
2747 		 * nfs protocol defines times as unsigned so don't extend sign,
2748 		 * unless sysadmin set nfs_allow_preepoch_time.
2749 		 */
2750 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2751 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2752 	}
2753 	return (0);
2754 }
2755 
2756 static enum nfsftype vt_to_nf[] = {
2757 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2758 };
2759 
2760 /*
2761  * check the following fields for overflow: nodeid, size, and time.
2762  * There could be a problem when converting 64-bit LP64 fields
2763  * into 32-bit ones.  Return an error if there is an overflow.
2764  */
2765 int
2766 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2767 {
2768 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2769 	na->na_type = vt_to_nf[vap->va_type];
2770 
2771 	if (vap->va_mode == (unsigned short) -1)
2772 		na->na_mode = (uint32_t)-1;
2773 	else
2774 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2775 
2776 	if (vap->va_uid == (unsigned short)(-1))
2777 		na->na_uid = (uint32_t)(-1);
2778 	else if (vap->va_uid == UID_NOBODY)
2779 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2780 	else
2781 		na->na_uid = vap->va_uid;
2782 
2783 	if (vap->va_gid == (unsigned short)(-1))
2784 		na->na_gid = (uint32_t)-1;
2785 	else if (vap->va_gid == GID_NOBODY)
2786 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2787 	else
2788 		na->na_gid = vap->va_gid;
2789 
2790 	/*
2791 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2792 	 * vattr, but are bigger than 32 bit values supported?
2793 	 */
2794 	na->na_fsid = vap->va_fsid;
2795 
2796 	na->na_nodeid = vap->va_nodeid;
2797 
2798 	/*
2799 	 * Check to make sure that the nodeid is representable over the
2800 	 * wire without losing bits.
2801 	 */
2802 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2803 		return (EFBIG);
2804 	na->na_nlink = vap->va_nlink;
2805 
2806 	/*
2807 	 * Check for big files here, instead of at the caller.  See
2808 	 * comments in cstat for large special file explanation.
2809 	 */
2810 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2811 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2812 			return (EFBIG);
2813 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2814 			/* UNKNOWN_SIZE | OVERFLOW */
2815 			na->na_size = MAXOFF32_T;
2816 		} else
2817 			na->na_size = vap->va_size;
2818 	} else
2819 		na->na_size = vap->va_size;
2820 
2821 	/*
2822 	 * If the vnode times overflow the 32-bit times that NFS2
2823 	 * uses on the wire then return an error.
2824 	 */
2825 	if (!NFS_VAP_TIME_OK(vap)) {
2826 		return (EOVERFLOW);
2827 	}
2828 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2829 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2830 
2831 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2832 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2833 
2834 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2835 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2836 
2837 	/*
2838 	 * If the dev_t will fit into 16 bits then compress
2839 	 * it, otherwise leave it alone. See comments in
2840 	 * nfs_client.c.
2841 	 */
2842 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2843 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2844 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2845 	else
2846 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2847 
2848 	na->na_blocks = vap->va_nblocks;
2849 	na->na_blocksize = vap->va_blksize;
2850 
2851 	/*
2852 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2853 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2854 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2855 	 *
2856 	 * BUYER BEWARE:
2857 	 *  If you are porting the NFS to a non-Sun server, you probably
2858 	 *  don't want to include the following block of code.  The
2859 	 *  over-the-wire special file types will be changing with the
2860 	 *  NFS Protocol Revision.
2861 	 */
2862 	if (vap->va_type == VFIFO)
2863 		NA_SETFIFO(na);
2864 	return (0);
2865 }
2866 
2867 /*
2868  * acl v2 support: returns approximate permission.
2869  *	default: returns minimal permission (more restrictive)
2870  *	aclok: returns maximal permission (less restrictive)
2871  *	This routine changes the permissions that are alaredy in *va.
2872  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2873  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
2874  */
2875 static void
2876 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2877 {
2878 	vsecattr_t	vsa;
2879 	int		aclcnt;
2880 	aclent_t	*aclentp;
2881 	mode_t		mask_perm;
2882 	mode_t		grp_perm;
2883 	mode_t		other_perm;
2884 	mode_t		other_orig;
2885 	int		error;
2886 
2887 	/* dont care default acl */
2888 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2889 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2890 
2891 	if (!error) {
2892 		aclcnt = vsa.vsa_aclcnt;
2893 		if (aclcnt > MIN_ACL_ENTRIES) {
2894 			/* non-trivial ACL */
2895 			aclentp = vsa.vsa_aclentp;
2896 			if (exi->exi_export.ex_flags & EX_ACLOK) {
2897 				/* maximal permissions */
2898 				grp_perm = 0;
2899 				other_perm = 0;
2900 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2901 					switch (aclentp->a_type) {
2902 					case USER_OBJ:
2903 						break;
2904 					case USER:
2905 						grp_perm |=
2906 						    aclentp->a_perm << 3;
2907 						other_perm |= aclentp->a_perm;
2908 						break;
2909 					case GROUP_OBJ:
2910 						grp_perm |=
2911 						    aclentp->a_perm << 3;
2912 						break;
2913 					case GROUP:
2914 						other_perm |= aclentp->a_perm;
2915 						break;
2916 					case OTHER_OBJ:
2917 						other_orig = aclentp->a_perm;
2918 						break;
2919 					case CLASS_OBJ:
2920 						mask_perm = aclentp->a_perm;
2921 						break;
2922 					default:
2923 						break;
2924 					}
2925 				}
2926 				grp_perm &= mask_perm << 3;
2927 				other_perm &= mask_perm;
2928 				other_perm |= other_orig;
2929 
2930 			} else {
2931 				/* minimal permissions */
2932 				grp_perm = 070;
2933 				other_perm = 07;
2934 				for (; aclcnt > 0; aclcnt--, aclentp++) {
2935 					switch (aclentp->a_type) {
2936 					case USER_OBJ:
2937 						break;
2938 					case USER:
2939 					case CLASS_OBJ:
2940 						grp_perm &=
2941 						    aclentp->a_perm << 3;
2942 						other_perm &=
2943 						    aclentp->a_perm;
2944 						break;
2945 					case GROUP_OBJ:
2946 						grp_perm &=
2947 						    aclentp->a_perm << 3;
2948 						break;
2949 					case GROUP:
2950 						other_perm &=
2951 						    aclentp->a_perm;
2952 						break;
2953 					case OTHER_OBJ:
2954 						other_perm &=
2955 						    aclentp->a_perm;
2956 						break;
2957 					default:
2958 						break;
2959 					}
2960 				}
2961 			}
2962 			/* copy to va */
2963 			va->va_mode &= ~077;
2964 			va->va_mode |= grp_perm | other_perm;
2965 		}
2966 		if (vsa.vsa_aclcnt)
2967 			kmem_free(vsa.vsa_aclentp,
2968 			    vsa.vsa_aclcnt * sizeof (aclent_t));
2969 	}
2970 }
2971 
2972 void
2973 rfs_srvrinit(void)
2974 {
2975 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2976 	nfs2_srv_caller_id = fs_new_caller_id();
2977 }
2978 
2979 void
2980 rfs_srvrfini(void)
2981 {
2982 	mutex_destroy(&rfs_async_write_lock);
2983 }
2984 
2985 static int
2986 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2987 {
2988 	struct clist	*wcl;
2989 	int		wlist_len;
2990 	uint32_t	count = rr->rr_count;
2991 
2992 	wcl = ra->ra_wlist;
2993 
2994 	if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2995 		return (FALSE);
2996 	}
2997 
2998 	wcl = ra->ra_wlist;
2999 	rr->rr_ok.rrok_wlist_len = wlist_len;
3000 	rr->rr_ok.rrok_wlist = wcl;
3001 
3002 	return (TRUE);
3003 }
3004