xref: /titanic_51/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 15d9d0b528387242011cdcc6190c9e598cfe3a07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
/*
 * Forward declarations for static helpers used by the routines below.
 * NOTE(review): their definitions are not visible in this part of the
 * file -- presumably they appear further down.
 */
static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
			cred_t *);

/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT		0170000		/* type of file */
#define	IFCHR		0020000		/* character special */
#define	IFBLK		0060000		/* block special */
#define	IFSOCK		0140000		/* socket */

/*
 * Value the NFSv2 server routines in this file store into
 * caller_context_t.cc_caller_id before issuing VOP calls.
 */
u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");
106 
107 	vp = nfs_fhtovp(fhp, exi);
108 	if (vp == NULL) {
109 		ns->ns_status = NFSERR_STALE;
110 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
111 		    "rfs_getattr_end:(%S)", "stale");
112 		return;
113 	}
114 
115 	/*
116 	 * Do the getattr.
117 	 */
118 	va.va_mask = AT_ALL;	/* we want all the attributes */
119 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
120 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
121 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
122 
123 	/* check for overflows */
124 	if (!error) {
125 		acl_perm(vp, exi, &va, cr);
126 		error = vattr_to_nattr(&va, &ns->ns_attr);
127 	}
128 
129 	VN_RELE(vp);
130 
131 	ns->ns_status = puterrno(error);
132 
133 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
134 }
135 void *
136 rfs_getattr_getfh(fhandle_t *fhp)
137 {
138 	return (fhp);
139 }
140 
141 /*
142  * Set file attributes.
143  * Sets the attributes of the file with the given fhandle.  Returns
144  * the new attributes.
145  */
146 void
147 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
148 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
149 {
150 	int error;
151 	int flag;
152 	int in_crit = 0;
153 	vnode_t *vp;
154 	struct vattr va;
155 	struct vattr bva;
156 	struct flock64 bf;
157 	caller_context_t ct;
158 
159 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");
160 
161 	vp = nfs_fhtovp(&args->saa_fh, exi);
162 	if (vp == NULL) {
163 		ns->ns_status = NFSERR_STALE;
164 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
165 		    "rfs_setattr_end:(%S)", "stale");
166 		return;
167 	}
168 
169 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
170 		VN_RELE(vp);
171 		ns->ns_status = NFSERR_ROFS;
172 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
173 		    "rfs_setattr_end:(%S)", "rofs");
174 		return;
175 	}
176 
177 	error = sattr_to_vattr(&args->saa_sa, &va);
178 	if (error) {
179 		VN_RELE(vp);
180 		ns->ns_status = puterrno(error);
181 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
182 		    "rfs_setattr_end:(%S)", "sattr");
183 		return;
184 	}
185 
186 	/*
187 	 * If the client is requesting a change to the mtime,
188 	 * but the nanosecond field is set to 1 billion, then
189 	 * this is a flag to the server that it should set the
190 	 * atime and mtime fields to the server's current time.
191 	 * The 1 billion number actually came from the client
192 	 * as 1 million, but the units in the over the wire
193 	 * request are microseconds instead of nanoseconds.
194 	 *
195 	 * This is an overload of the protocol and should be
196 	 * documented in the NFS Version 2 protocol specification.
197 	 */
198 	if (va.va_mask & AT_MTIME) {
199 		if (va.va_mtime.tv_nsec == 1000000000) {
200 			gethrestime(&va.va_mtime);
201 			va.va_atime = va.va_mtime;
202 			va.va_mask |= AT_ATIME;
203 			flag = 0;
204 		} else
205 			flag = ATTR_UTIME;
206 	} else
207 		flag = 0;
208 
209 	/*
210 	 * If the filesystem is exported with nosuid, then mask off
211 	 * the setuid and setgid bits.
212 	 */
213 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
214 	    (exi->exi_export.ex_flags & EX_NOSUID))
215 		va.va_mode &= ~(VSUID | VSGID);
216 
217 	ct.cc_sysid = 0;
218 	ct.cc_pid = 0;
219 	ct.cc_caller_id = nfs2_srv_caller_id;
220 	ct.cc_flags = CC_DONTBLOCK;
221 
222 	/*
223 	 * We need to specially handle size changes because it is
224 	 * possible for the client to create a file with modes
225 	 * which indicate read-only, but with the file opened for
226 	 * writing.  If the client then tries to set the size of
227 	 * the file, then the normal access checking done in
228 	 * VOP_SETATTR would prevent the client from doing so,
229 	 * although it should be legal for it to do so.  To get
230 	 * around this, we do the access checking for ourselves
231 	 * and then use VOP_SPACE which doesn't do the access
232 	 * checking which VOP_SETATTR does. VOP_SPACE can only
233 	 * operate on VREG files, let VOP_SETATTR handle the other
234 	 * extremely rare cases.
235 	 * Also the client should not be allowed to change the
236 	 * size of the file if there is a conflicting non-blocking
237 	 * mandatory lock in the region of change.
238 	 */
239 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
240 		if (nbl_need_check(vp)) {
241 			nbl_start_crit(vp, RW_READER);
242 			in_crit = 1;
243 		}
244 
245 		bva.va_mask = AT_UID | AT_SIZE;
246 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
247 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
248 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
249 		if (error) {
250 			if (in_crit)
251 				nbl_end_crit(vp);
252 			VN_RELE(vp);
253 			ns->ns_status = puterrno(error);
254 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
255 			    "rfs_setattr_end:(%S)", "getattr");
256 			return;
257 		}
258 
259 		if (in_crit) {
260 			u_offset_t offset;
261 			ssize_t length;
262 
263 			if (va.va_size < bva.va_size) {
264 				offset = va.va_size;
265 				length = bva.va_size - va.va_size;
266 			} else {
267 				offset = bva.va_size;
268 				length = va.va_size - bva.va_size;
269 			}
270 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
271 			    NULL)) {
272 				error = EACCES;
273 			}
274 		}
275 
276 		if (crgetuid(cr) == bva.va_uid && !error &&
277 		    va.va_size != bva.va_size) {
278 			va.va_mask &= ~AT_SIZE;
279 			bf.l_type = F_WRLCK;
280 			bf.l_whence = 0;
281 			bf.l_start = (off64_t)va.va_size;
282 			bf.l_len = 0;
283 			bf.l_sysid = 0;
284 			bf.l_pid = 0;
285 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
286 			    "vop_space_start:");
287 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
288 			    (offset_t)va.va_size, cr, &ct);
289 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
290 		}
291 		if (in_crit)
292 			nbl_end_crit(vp);
293 	} else
294 		error = 0;
295 
296 	/*
297 	 * Do the setattr.
298 	 */
299 	if (!error && va.va_mask) {
300 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
301 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
302 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
303 	}
304 
305 	/*
306 	 * check if the monitor on either vop_space or vop_setattr detected
307 	 * a delegation conflict and if so, mark the thread flag as
308 	 * wouldblock so that the response is dropped and the client will
309 	 * try again.
310 	 */
311 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
312 		VN_RELE(vp);
313 		curthread->t_flag |= T_WOULDBLOCK;
314 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
315 		    "rfs_setattr_end:(%S)", "delegated");
316 		return;
317 	}
318 
319 	if (!error) {
320 		va.va_mask = AT_ALL;	/* get everything */
321 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
322 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
323 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
324 
325 		/* check for overflows */
326 		if (!error) {
327 			acl_perm(vp, exi, &va, cr);
328 			error = vattr_to_nattr(&va, &ns->ns_attr);
329 		}
330 	}
331 
332 	ct.cc_flags = 0;
333 
334 	/*
335 	 * Force modified metadata out to stable storage.
336 	 */
337 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
338 
339 	VN_RELE(vp);
340 
341 	ns->ns_status = puterrno(error);
342 
343 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
344 }
345 void *
346 rfs_setattr_getfh(struct nfssaargs *args)
347 {
348 	return (&args->saa_fh);
349 }
350 
351 /*
352  * Directory lookup.
353  * Returns an fhandle and file attributes for file name in a directory.
354  */
355 /* ARGSUSED */
356 void
357 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
358 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
359 {
360 	int error;
361 	vnode_t *dvp;
362 	vnode_t *vp;
363 	struct vattr va;
364 	fhandle_t *fhp = da->da_fhandle;
365 	struct sec_ol sec = {0, 0};
366 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
367 
368 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");
369 
370 	/*
371 	 * Trusted Extension doesn't support NFSv2. MOUNT
372 	 * will reject v2 clients. Need to prevent v2 client
373 	 * access via WebNFS here.
374 	 */
375 	if (is_system_labeled() && req->rq_vers == 2) {
376 		dr->dr_status = NFSERR_ACCES;
377 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
378 		    "rfs_lookup_end:(%S)", "access");
379 		return;
380 	}
381 
382 	/*
383 	 * Disallow NULL paths
384 	 */
385 	if (da->da_name == NULL || *da->da_name == '\0') {
386 		dr->dr_status = NFSERR_ACCES;
387 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
388 		    "rfs_lookup_end:(%S)", "access");
389 		return;
390 	}
391 
392 	/*
393 	 * Allow lookups from the root - the default
394 	 * location of the public filehandle.
395 	 */
396 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
397 		dvp = rootdir;
398 		VN_HOLD(dvp);
399 	} else {
400 		dvp = nfs_fhtovp(fhp, exi);
401 		if (dvp == NULL) {
402 			dr->dr_status = NFSERR_STALE;
403 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
404 			    "rfs_lookup_end:(%S)", "stale");
405 			return;
406 		}
407 	}
408 
409 	/*
410 	 * Not allow lookup beyond root.
411 	 * If the filehandle matches a filehandle of the exi,
412 	 * then the ".." refers beyond the root of an exported filesystem.
413 	 */
414 	if (strcmp(da->da_name, "..") == 0 &&
415 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
416 		VN_RELE(dvp);
417 		dr->dr_status = NFSERR_NOENT;
418 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
419 		    "rfs_lookup_end:(%S)", "noent");
420 		return;
421 	}
422 
423 	/*
424 	 * If the public filehandle is used then allow
425 	 * a multi-component lookup, i.e. evaluate
426 	 * a pathname and follow symbolic links if
427 	 * necessary.
428 	 *
429 	 * This may result in a vnode in another filesystem
430 	 * which is OK as long as the filesystem is exported.
431 	 */
432 	if (PUBLIC_FH2(fhp)) {
433 		publicfh_flag = TRUE;
434 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
435 		    &sec);
436 	} else {
437 		/*
438 		 * Do a normal single component lookup.
439 		 */
440 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
441 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
442 		    NULL, NULL, NULL);
443 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
444 	}
445 
446 	if (!error) {
447 		va.va_mask = AT_ALL;	/* we want everything */
448 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
449 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
450 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
451 		/* check for overflows */
452 		if (!error) {
453 			acl_perm(vp, exi, &va, cr);
454 			error = vattr_to_nattr(&va, &dr->dr_attr);
455 			if (!error) {
456 				if (sec.sec_flags & SEC_QUERY)
457 					error = makefh_ol(&dr->dr_fhandle, exi,
458 					    sec.sec_index);
459 				else {
460 					error = makefh(&dr->dr_fhandle, vp,
461 					    exi);
462 					if (!error && publicfh_flag &&
463 					    !chk_clnt_sec(exi, req))
464 						auth_weak = TRUE;
465 				}
466 			}
467 		}
468 		VN_RELE(vp);
469 	}
470 
471 	VN_RELE(dvp);
472 
473 	/*
474 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
475 	 * and have obtained a new exportinfo in exi which needs to be
476 	 * released. Note the the original exportinfo pointed to by exi
477 	 * will be released by the caller, comon_dispatch.
478 	 */
479 	if (publicfh_flag && exi != NULL)
480 		exi_rele(exi);
481 
482 	/*
483 	 * If it's public fh, no 0x81, and client's flavor is
484 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
485 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
486 	 */
487 	if (auth_weak)
488 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
489 	else
490 		dr->dr_status = puterrno(error);
491 
492 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
493 }
494 void *
495 rfs_lookup_getfh(struct nfsdiropargs *da)
496 {
497 	return (da->da_fhandle);
498 }
499 
500 /*
501  * Read symbolic link.
502  * Returns the string in the symbolic link at the given fhandle.
503  */
504 /* ARGSUSED */
505 void
506 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
507 	struct svc_req *req, cred_t *cr)
508 {
509 	int error;
510 	struct iovec iov;
511 	struct uio uio;
512 	vnode_t *vp;
513 	struct vattr va;
514 
515 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");
516 
517 	vp = nfs_fhtovp(fhp, exi);
518 	if (vp == NULL) {
519 		rl->rl_data = NULL;
520 		rl->rl_status = NFSERR_STALE;
521 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
522 		    "rfs_readlink_end:(%S)", "stale");
523 		return;
524 	}
525 
526 	va.va_mask = AT_MODE;
527 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
528 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
529 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
530 
531 	if (error) {
532 		VN_RELE(vp);
533 		rl->rl_data = NULL;
534 		rl->rl_status = puterrno(error);
535 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
536 		    "rfs_readlink_end:(%S)", "getattr error");
537 		return;
538 	}
539 
540 	if (MANDLOCK(vp, va.va_mode)) {
541 		VN_RELE(vp);
542 		rl->rl_data = NULL;
543 		rl->rl_status = NFSERR_ACCES;
544 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
545 		    "rfs_readlink_end:(%S)", "access");
546 		return;
547 	}
548 
549 	/*
550 	 * XNFS and RFC1094 require us to return ENXIO if argument
551 	 * is not a link. BUGID 1138002.
552 	 */
553 	if (vp->v_type != VLNK) {
554 		VN_RELE(vp);
555 		rl->rl_data = NULL;
556 		rl->rl_status = NFSERR_NXIO;
557 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
558 		    "rfs_readlink_end:(%S)", "nxio");
559 		return;
560 	}
561 
562 	/*
563 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
564 	 */
565 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
566 
567 	/*
568 	 * Set up io vector to read sym link data
569 	 */
570 	iov.iov_base = rl->rl_data;
571 	iov.iov_len = NFS_MAXPATHLEN;
572 	uio.uio_iov = &iov;
573 	uio.uio_iovcnt = 1;
574 	uio.uio_segflg = UIO_SYSSPACE;
575 	uio.uio_extflg = UIO_COPY_CACHED;
576 	uio.uio_loffset = (offset_t)0;
577 	uio.uio_resid = NFS_MAXPATHLEN;
578 
579 	/*
580 	 * Do the readlink.
581 	 */
582 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
583 	error = VOP_READLINK(vp, &uio, cr, NULL);
584 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");
585 
586 #if 0 /* notyet */
587 	/*
588 	 * Don't do this.  It causes local disk writes when just
589 	 * reading the file and the overhead is deemed larger
590 	 * than the benefit.
591 	 */
592 	/*
593 	 * Force modified metadata out to stable storage.
594 	 */
595 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
596 #endif
597 
598 	VN_RELE(vp);
599 
600 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
601 
602 	/*
603 	 * XNFS and RFC1094 require us to return ENXIO if argument
604 	 * is not a link. UFS returns EINVAL if this is the case,
605 	 * so we do the mapping here. BUGID 1138002.
606 	 */
607 	if (error == EINVAL)
608 		rl->rl_status = NFSERR_NXIO;
609 	else
610 		rl->rl_status = puterrno(error);
611 
612 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
613 	    "rfs_readlink_end:(%S)", "done");
614 }
615 void *
616 rfs_readlink_getfh(fhandle_t *fhp)
617 {
618 	return (fhp);
619 }
620 /*
621  * Free data allocated by rfs_readlink
622  */
623 void
624 rfs_rlfree(struct nfsrdlnres *rl)
625 {
626 	if (rl->rl_data != NULL)
627 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
628 }
629 
630 /*
631  * Read data.
632  * Returns some data read from the file at the given fhandle.
633  */
634 /* ARGSUSED */
635 void
636 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
637 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
638 {
639 	vnode_t *vp;
640 	int error;
641 	struct vattr va;
642 	struct iovec iov;
643 	struct uio uio;
644 	mblk_t *mp;
645 	int alloc_err = 0;
646 	int in_crit = 0;
647 	caller_context_t ct;
648 
649 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");
650 
651 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
652 	if (vp == NULL) {
653 		rr->rr_data = NULL;
654 		rr->rr_status = NFSERR_STALE;
655 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
656 		    "rfs_read_end:(%S)", "stale");
657 		return;
658 	}
659 
660 	if (vp->v_type != VREG) {
661 		VN_RELE(vp);
662 		rr->rr_data = NULL;
663 		rr->rr_status = NFSERR_ISDIR;
664 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
665 		    "rfs_read_end:(%S)", "isdir");
666 		return;
667 	}
668 
669 	ct.cc_sysid = 0;
670 	ct.cc_pid = 0;
671 	ct.cc_caller_id = nfs2_srv_caller_id;
672 	ct.cc_flags = CC_DONTBLOCK;
673 
674 	/*
675 	 * Enter the critical region before calling VOP_RWLOCK
676 	 * to avoid a deadlock with write requests.
677 	 */
678 	if (nbl_need_check(vp)) {
679 		nbl_start_crit(vp, RW_READER);
680 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
681 		    0, NULL)) {
682 			nbl_end_crit(vp);
683 			VN_RELE(vp);
684 			rr->rr_data = NULL;
685 			rr->rr_status = NFSERR_ACCES;
686 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
687 			    "rfs_read_end:(%S)", " csf access error");
688 			return;
689 		}
690 		in_crit = 1;
691 	}
692 
693 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
694 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
695 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
696 
697 	/* check if a monitor detected a delegation conflict */
698 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
699 		VN_RELE(vp);
700 		/* mark as wouldblock so response is dropped */
701 		curthread->t_flag |= T_WOULDBLOCK;
702 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
703 		    "rfs_read_end:(%S)", "delegated");
704 		rr->rr_data = NULL;
705 		return;
706 	}
707 
708 	va.va_mask = AT_ALL;
709 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
710 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
711 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
712 
713 	if (error) {
714 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
715 		    "vop_rwunlock_start:");
716 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
717 		if (in_crit)
718 			nbl_end_crit(vp);
719 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
720 		VN_RELE(vp);
721 		rr->rr_data = NULL;
722 		rr->rr_status = puterrno(error);
723 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
724 		    "rfs_read_end:(%S)", "getattr error");
725 		return;
726 	}
727 
728 	/*
729 	 * This is a kludge to allow reading of files created
730 	 * with no read permission.  The owner of the file
731 	 * is always allowed to read it.
732 	 */
733 	if (crgetuid(cr) != va.va_uid) {
734 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
735 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
736 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
737 		if (error) {
738 			/*
739 			 * Exec is the same as read over the net because
740 			 * of demand loading.
741 			 */
742 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
743 			    "vop_access_start:");
744 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
745 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
746 			    "vop_access_end:");
747 		}
748 		if (error) {
749 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
750 			    "vop_rwunlock_start:");
751 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
752 			if (in_crit)
753 				nbl_end_crit(vp);
754 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
755 			    "vop_rwunlock_end:");
756 			VN_RELE(vp);
757 			rr->rr_data = NULL;
758 			rr->rr_status = puterrno(error);
759 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
760 			    "rfs_read_end:(%S)", "access error");
761 			return;
762 		}
763 	}
764 
765 	if (MANDLOCK(vp, va.va_mode)) {
766 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
767 		    "vop_rwunlock_start:");
768 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
769 		if (in_crit)
770 			nbl_end_crit(vp);
771 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
772 		VN_RELE(vp);
773 		rr->rr_data = NULL;
774 		rr->rr_status = NFSERR_ACCES;
775 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
776 		    "rfs_read_end:(%S)", "mand lock");
777 		return;
778 	}
779 
780 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
781 		rr->rr_count = 0;
782 		rr->rr_data = NULL;
783 		/*
784 		 * In this case, status is NFS_OK, but there is no data
785 		 * to encode. So set rr_mp to NULL.
786 		 */
787 		rr->rr_mp = NULL;
788 		goto done;
789 	}
790 
791 	/*
792 	 * mp will contain the data to be sent out in the read reply.
793 	 * This will be freed after the reply has been sent out (by the
794 	 * driver).
795 	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
796 	 * that the call to xdrmblk_putmblk() never fails.
797 	 */
798 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
799 	    &alloc_err);
800 	ASSERT(mp != NULL);
801 	ASSERT(alloc_err == 0);
802 
803 	rr->rr_mp = mp;
804 
805 	/*
806 	 * Set up io vector
807 	 */
808 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
809 	iov.iov_len = ra->ra_count;
810 	uio.uio_iov = &iov;
811 	uio.uio_iovcnt = 1;
812 	uio.uio_segflg = UIO_SYSSPACE;
813 	uio.uio_extflg = UIO_COPY_CACHED;
814 	uio.uio_loffset = (offset_t)ra->ra_offset;
815 	uio.uio_resid = ra->ra_count;
816 
817 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
818 	error = VOP_READ(vp, &uio, 0, cr, &ct);
819 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");
820 
821 	if (error) {
822 		freeb(mp);
823 
824 		/*
825 		 * check if a monitor detected a delegation conflict and
826 		 * mark as wouldblock so response is dropped
827 		 */
828 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
829 			curthread->t_flag |= T_WOULDBLOCK;
830 		else
831 			rr->rr_status = puterrno(error);
832 
833 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
834 		    "vop_rwunlock_start:");
835 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
836 		if (in_crit)
837 			nbl_end_crit(vp);
838 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
839 		VN_RELE(vp);
840 		rr->rr_data = NULL;
841 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
842 		    "rfs_read_end:(%S)", "read error");
843 		return;
844 	}
845 
846 	/*
847 	 * Get attributes again so we can send the latest access
848 	 * time to the client side for his cache.
849 	 */
850 	va.va_mask = AT_ALL;
851 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
852 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
853 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
854 	if (error) {
855 		freeb(mp);
856 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
857 		    "vop_rwunlock_start:");
858 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
859 		if (in_crit)
860 			nbl_end_crit(vp);
861 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
862 		    "vop_rwunlock_end:");
863 		VN_RELE(vp);
864 		rr->rr_data = NULL;
865 		rr->rr_status = puterrno(error);
866 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
867 		    "rfs_read_end:(%S)", "read error");
868 		return;
869 	}
870 
871 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
872 
873 	rr->rr_data = (char *)mp->b_datap->db_base;
874 
875 done:
876 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
877 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
878 	if (in_crit)
879 		nbl_end_crit(vp);
880 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
881 
882 	acl_perm(vp, exi, &va, cr);
883 
884 	/* check for overflows */
885 	error = vattr_to_nattr(&va, &rr->rr_attr);
886 
887 #if 0 /* notyet */
888 	/*
889 	 * Don't do this.  It causes local disk writes when just
890 	 * reading the file and the overhead is deemed larger
891 	 * than the benefit.
892 	 */
893 	/*
894 	 * Force modified metadata out to stable storage.
895 	 */
896 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
897 #endif
898 
899 	VN_RELE(vp);
900 
901 	rr->rr_status = puterrno(error);
902 
903 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
904 }
905 
906 /*
907  * Free data allocated by rfs_read
908  */
909 void
910 rfs_rdfree(struct nfsrdresult *rr)
911 {
912 	mblk_t *mp;
913 
914 	if (rr->rr_status == NFS_OK) {
915 		mp = rr->rr_mp;
916 		if (mp != NULL)
917 			freeb(mp);
918 	}
919 }
920 
921 void *
922 rfs_read_getfh(struct nfsreadargs *ra)
923 {
924 	return (&ra->ra_fhandle);
925 }
926 
/*
 * Size of the on-stack iovec array in rfs_write_sync.  An mblk chain
 * with more segments than this gets a kmem_alloc'd array instead.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/*
 * DEBUG-only counters bumped by rfs_write_sync: "hits" when the mblk
 * chain fits in MAX_IOVECS entries, "misses" when it does not.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
933 
934 /*
935  * Write data to file.
936  * Returns attributes of a file after writing some data to it.
937  *
938  * Any changes made here, especially in error handling might have
939  * to also be done in rfs_write (which clusters write requests).
940  */
941 void
942 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
943 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
944 {
945 	int error;
946 	vnode_t *vp;
947 	rlim64_t rlimit;
948 	struct vattr va;
949 	struct uio uio;
950 	struct iovec iov[MAX_IOVECS];
951 	mblk_t *m;
952 	struct iovec *iovp;
953 	int iovcnt;
954 	cred_t *savecred;
955 	int in_crit = 0;
956 	caller_context_t ct;
957 
958 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");
959 
960 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
961 	if (vp == NULL) {
962 		ns->ns_status = NFSERR_STALE;
963 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
964 		    "rfs_write_end:(%S)", "stale");
965 		return;
966 	}
967 
968 	if (rdonly(exi, req)) {
969 		VN_RELE(vp);
970 		ns->ns_status = NFSERR_ROFS;
971 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
972 		    "rfs_write_end:(%S)", "rofs");
973 		return;
974 	}
975 
976 	if (vp->v_type != VREG) {
977 		VN_RELE(vp);
978 		ns->ns_status = NFSERR_ISDIR;
979 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
980 		    "rfs_write_end:(%S)", "isdir");
981 		return;
982 	}
983 
984 	ct.cc_sysid = 0;
985 	ct.cc_pid = 0;
986 	ct.cc_caller_id = nfs2_srv_caller_id;
987 	ct.cc_flags = CC_DONTBLOCK;
988 
989 	va.va_mask = AT_UID|AT_MODE;
990 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
991 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
992 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
993 
994 	if (error) {
995 		VN_RELE(vp);
996 		ns->ns_status = puterrno(error);
997 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
998 		    "rfs_write_end:(%S)", "getattr error");
999 		return;
1000 	}
1001 
1002 	if (crgetuid(cr) != va.va_uid) {
1003 		/*
1004 		 * This is a kludge to allow writes of files created
1005 		 * with read only permission.  The owner of the file
1006 		 * is always allowed to write it.
1007 		 */
1008 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
1009 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1010 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
1011 		if (error) {
1012 			VN_RELE(vp);
1013 			ns->ns_status = puterrno(error);
1014 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1015 			    "rfs_write_end:(%S)", "access error");
1016 			return;
1017 		}
1018 	}
1019 
1020 	/*
1021 	 * Can't access a mandatory lock file.  This might cause
1022 	 * the NFS service thread to block forever waiting for a
1023 	 * lock to be released that will never be released.
1024 	 */
1025 	if (MANDLOCK(vp, va.va_mode)) {
1026 		VN_RELE(vp);
1027 		ns->ns_status = NFSERR_ACCES;
1028 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1029 		    "rfs_write_end:(%S)", "mand lock");
1030 		return;
1031 	}
1032 
1033 	/*
1034 	 * We have to enter the critical region before calling VOP_RWLOCK
1035 	 * to avoid a deadlock with ufs.
1036 	 */
1037 	if (nbl_need_check(vp)) {
1038 		nbl_start_crit(vp, RW_READER);
1039 		in_crit = 1;
1040 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1041 		    wa->wa_count, 0, NULL)) {
1042 			error = EACCES;
1043 			goto out;
1044 		}
1045 	}
1046 
1047 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
1048 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1049 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
1050 
1051 	/* check if a monitor detected a delegation conflict */
1052 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1053 		VN_RELE(vp);
1054 		/* mark as wouldblock so response is dropped */
1055 		curthread->t_flag |= T_WOULDBLOCK;
1056 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
1057 		    "rfs_write_end:(%S)", "delegated");
1058 		return;
1059 	}
1060 
1061 	if (wa->wa_data) {
1062 		iov[0].iov_base = wa->wa_data;
1063 		iov[0].iov_len = wa->wa_count;
1064 		uio.uio_iov = iov;
1065 		uio.uio_iovcnt = 1;
1066 		uio.uio_segflg = UIO_SYSSPACE;
1067 		uio.uio_extflg = UIO_COPY_DEFAULT;
1068 		uio.uio_loffset = (offset_t)wa->wa_offset;
1069 		uio.uio_resid = wa->wa_count;
1070 		/*
1071 		 * The limit is checked on the client. We
1072 		 * should allow any size writes here.
1073 		 */
1074 		uio.uio_llimit = curproc->p_fsz_ctl;
1075 		rlimit = uio.uio_llimit - wa->wa_offset;
1076 		if (rlimit < (rlim64_t)uio.uio_resid)
1077 			uio.uio_resid = (uint_t)rlimit;
1078 
1079 		/*
1080 		 * for now we assume no append mode
1081 		 */
1082 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1083 		    "vop_write_start:(%S)", "sync");
1084 		/*
1085 		 * We're changing creds because VM may fault and we need
1086 		 * the cred of the current thread to be used if quota
1087 		 * checking is enabled.
1088 		 */
1089 		savecred = curthread->t_cred;
1090 		curthread->t_cred = cr;
1091 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1092 		curthread->t_cred = savecred;
1093 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1094 	} else {
1095 		iovcnt = 0;
1096 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1097 			iovcnt++;
1098 		if (iovcnt <= MAX_IOVECS) {
1099 #ifdef DEBUG
1100 			rfs_write_sync_hits++;
1101 #endif
1102 			iovp = iov;
1103 		} else {
1104 #ifdef DEBUG
1105 			rfs_write_sync_misses++;
1106 #endif
1107 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1108 		}
1109 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1110 		uio.uio_iov = iovp;
1111 		uio.uio_iovcnt = iovcnt;
1112 		uio.uio_segflg = UIO_SYSSPACE;
1113 		uio.uio_extflg = UIO_COPY_DEFAULT;
1114 		uio.uio_loffset = (offset_t)wa->wa_offset;
1115 		uio.uio_resid = wa->wa_count;
1116 		/*
1117 		 * The limit is checked on the client. We
1118 		 * should allow any size writes here.
1119 		 */
1120 		uio.uio_llimit = curproc->p_fsz_ctl;
1121 		rlimit = uio.uio_llimit - wa->wa_offset;
1122 		if (rlimit < (rlim64_t)uio.uio_resid)
1123 			uio.uio_resid = (uint_t)rlimit;
1124 
1125 		/*
1126 		 * For now we assume no append mode.
1127 		 */
1128 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1129 		    "vop_write_start:(%S)", "iov sync");
1130 		/*
1131 		 * We're changing creds because VM may fault and we need
1132 		 * the cred of the current thread to be used if quota
1133 		 * checking is enabled.
1134 		 */
1135 		savecred = curthread->t_cred;
1136 		curthread->t_cred = cr;
1137 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1138 		curthread->t_cred = savecred;
1139 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1140 
1141 		if (iovp != iov)
1142 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1143 	}
1144 
1145 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1146 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1147 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1148 
1149 	if (!error) {
1150 		/*
1151 		 * Get attributes again so we send the latest mod
1152 		 * time to the client side for his cache.
1153 		 */
1154 		va.va_mask = AT_ALL;	/* now we want everything */
1155 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1156 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1157 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1158 		/* check for overflows */
1159 		if (!error) {
1160 			acl_perm(vp, exi, &va, cr);
1161 			error = vattr_to_nattr(&va, &ns->ns_attr);
1162 		}
1163 	}
1164 
1165 out:
1166 	if (in_crit)
1167 		nbl_end_crit(vp);
1168 	VN_RELE(vp);
1169 
1170 	/* check if a monitor detected a delegation conflict */
1171 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1172 		/* mark as wouldblock so response is dropped */
1173 		curthread->t_flag |= T_WOULDBLOCK;
1174 	else
1175 		ns->ns_status = puterrno(error);
1176 
1177 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
1178 }
1179 
/*
 * One pending NFSv2 WRITE request that has been queued on a write cluster.
 * rfs_write() fills one of these in for every request it handles when
 * write clustering is enabled.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* arguments of this WRITE request */
	struct nfsattrstat *ns;		/* reply; ns_status is the done flag */
	struct svc_req *req;		/* RPC request, used for rdonly() */
	cred_t *cr;			/* credentials of the requester */
	kthread_t *thread;		/* thread that queued the request */
	struct rfs_async_write *list;	/* next request, by ascending offset */
};
1188 
/*
 * A write cluster: the set of queued WRITE requests that share one file
 * handle.  Clusters are chained through `next' from rfs_async_write_head.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* queued requests wait here for status */
	struct rfs_async_write *list;	/* requests, sorted by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster in global list */
};
1195 
1196 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1197 static kmutex_t rfs_async_write_lock;
1198 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1199 
1200 #define	MAXCLIOVECS	42
1201 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1202 
1203 #ifdef DEBUG
1204 static int rfs_write_hits = 0;
1205 static int rfs_write_misses = 0;
1206 #endif
1207 
1208 /*
1209  * Write data to file.
1210  * Returns attributes of a file after writing some data to it.
1211  */
1212 void
1213 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1214 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1215 {
1216 	int error;
1217 	vnode_t *vp;
1218 	rlim64_t rlimit;
1219 	struct vattr va;
1220 	struct uio uio;
1221 	struct rfs_async_write_list *lp;
1222 	struct rfs_async_write_list *nlp;
1223 	struct rfs_async_write *rp;
1224 	struct rfs_async_write *nrp;
1225 	struct rfs_async_write *trp;
1226 	struct rfs_async_write *lrp;
1227 	int data_written;
1228 	int iovcnt;
1229 	mblk_t *m;
1230 	struct iovec *iovp;
1231 	struct iovec *niovp;
1232 	struct iovec iov[MAXCLIOVECS];
1233 	int count;
1234 	int rcount;
1235 	uint_t off;
1236 	uint_t len;
1237 	struct rfs_async_write nrpsp;
1238 	struct rfs_async_write_list nlpsp;
1239 	ushort_t t_flag;
1240 	cred_t *savecred;
1241 	int in_crit = 0;
1242 	caller_context_t ct;
1243 
1244 	if (!rfs_write_async) {
1245 		rfs_write_sync(wa, ns, exi, req, cr);
1246 		return;
1247 	}
1248 
1249 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1250 	    "rfs_write_start:(%S)", "async");
1251 
1252 	/*
1253 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1254 	 * is considered an OK.
1255 	 */
1256 	ns->ns_status = RFSWRITE_INITVAL;
1257 
1258 	nrp = &nrpsp;
1259 	nrp->wa = wa;
1260 	nrp->ns = ns;
1261 	nrp->req = req;
1262 	nrp->cr = cr;
1263 	nrp->thread = curthread;
1264 
1265 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1266 
1267 	/*
1268 	 * Look to see if there is already a cluster started
1269 	 * for this file.
1270 	 */
1271 	mutex_enter(&rfs_async_write_lock);
1272 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1273 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1274 		    sizeof (fhandle_t)) == 0)
1275 			break;
1276 	}
1277 
1278 	/*
1279 	 * If lp is non-NULL, then there is already a cluster
1280 	 * started.  We need to place ourselves in the cluster
1281 	 * list in the right place as determined by starting
1282 	 * offset.  Conflicts with non-blocking mandatory locked
1283 	 * regions will be checked when the cluster is processed.
1284 	 */
1285 	if (lp != NULL) {
1286 		rp = lp->list;
1287 		trp = NULL;
1288 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1289 			trp = rp;
1290 			rp = rp->list;
1291 		}
1292 		nrp->list = rp;
1293 		if (trp == NULL)
1294 			lp->list = nrp;
1295 		else
1296 			trp->list = nrp;
1297 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1298 			cv_wait(&lp->cv, &rfs_async_write_lock);
1299 		mutex_exit(&rfs_async_write_lock);
1300 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1301 		    "rfs_write_end:(%S)", "cluster child");
1302 		return;
1303 	}
1304 
1305 	/*
1306 	 * No cluster started yet, start one and add ourselves
1307 	 * to the list of clusters.
1308 	 */
1309 	nrp->list = NULL;
1310 
1311 	nlp = &nlpsp;
1312 	nlp->fhp = &wa->wa_fhandle;
1313 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1314 	nlp->list = nrp;
1315 	nlp->next = NULL;
1316 
1317 	if (rfs_async_write_head == NULL) {
1318 		rfs_async_write_head = nlp;
1319 	} else {
1320 		lp = rfs_async_write_head;
1321 		while (lp->next != NULL)
1322 			lp = lp->next;
1323 		lp->next = nlp;
1324 	}
1325 	mutex_exit(&rfs_async_write_lock);
1326 
1327 	/*
1328 	 * Convert the file handle common to all of the requests
1329 	 * in this cluster to a vnode.
1330 	 */
1331 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1332 	if (vp == NULL) {
1333 		mutex_enter(&rfs_async_write_lock);
1334 		if (rfs_async_write_head == nlp)
1335 			rfs_async_write_head = nlp->next;
1336 		else {
1337 			lp = rfs_async_write_head;
1338 			while (lp->next != nlp)
1339 				lp = lp->next;
1340 			lp->next = nlp->next;
1341 		}
1342 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1343 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1344 			rp->ns->ns_status = NFSERR_STALE;
1345 			rp->thread->t_flag |= t_flag;
1346 		}
1347 		cv_broadcast(&nlp->cv);
1348 		mutex_exit(&rfs_async_write_lock);
1349 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1350 		    "rfs_write_end:(%S)", "stale");
1351 		return;
1352 	}
1353 
1354 	/*
1355 	 * Can only write regular files.  Attempts to write any
1356 	 * other file types fail with EISDIR.
1357 	 */
1358 	if (vp->v_type != VREG) {
1359 		VN_RELE(vp);
1360 		mutex_enter(&rfs_async_write_lock);
1361 		if (rfs_async_write_head == nlp)
1362 			rfs_async_write_head = nlp->next;
1363 		else {
1364 			lp = rfs_async_write_head;
1365 			while (lp->next != nlp)
1366 				lp = lp->next;
1367 			lp->next = nlp->next;
1368 		}
1369 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1370 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1371 			rp->ns->ns_status = NFSERR_ISDIR;
1372 			rp->thread->t_flag |= t_flag;
1373 		}
1374 		cv_broadcast(&nlp->cv);
1375 		mutex_exit(&rfs_async_write_lock);
1376 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1377 		    "rfs_write_end:(%S)", "isdir");
1378 		return;
1379 	}
1380 
1381 	/*
1382 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1383 	 * deadlock with ufs.
1384 	 */
1385 	if (nbl_need_check(vp)) {
1386 		nbl_start_crit(vp, RW_READER);
1387 		in_crit = 1;
1388 	}
1389 
1390 	ct.cc_sysid = 0;
1391 	ct.cc_pid = 0;
1392 	ct.cc_caller_id = nfs2_srv_caller_id;
1393 	ct.cc_flags = CC_DONTBLOCK;
1394 
1395 	/*
1396 	 * Lock the file for writing.  This operation provides
1397 	 * the delay which allows clusters to grow.
1398 	 */
1399 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
1400 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1401 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");
1402 
1403 	/* check if a monitor detected a delegation conflict */
1404 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1405 		VN_RELE(vp);
1406 		/* mark as wouldblock so response is dropped */
1407 		curthread->t_flag |= T_WOULDBLOCK;
1408 		mutex_enter(&rfs_async_write_lock);
1409 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1410 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1411 				rp->ns->ns_status = puterrno(error);
1412 				rp->thread->t_flag |= T_WOULDBLOCK;
1413 			}
1414 		}
1415 		cv_broadcast(&nlp->cv);
1416 		mutex_exit(&rfs_async_write_lock);
1417 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1418 		    "rfs_write_end:(%S)", "delegated");
1419 		return;
1420 	}
1421 
1422 	/*
1423 	 * Disconnect this cluster from the list of clusters.
1424 	 * The cluster that is being dealt with must be fixed
1425 	 * in size after this point, so there is no reason
1426 	 * to leave it on the list so that new requests can
1427 	 * find it.
1428 	 *
1429 	 * The algorithm is that the first write request will
1430 	 * create a cluster, convert the file handle to a
1431 	 * vnode pointer, and then lock the file for writing.
1432 	 * This request is not likely to be clustered with
1433 	 * any others.  However, the next request will create
1434 	 * a new cluster and be blocked in VOP_RWLOCK while
1435 	 * the first request is being processed.  This delay
1436 	 * will allow more requests to be clustered in this
1437 	 * second cluster.
1438 	 */
1439 	mutex_enter(&rfs_async_write_lock);
1440 	if (rfs_async_write_head == nlp)
1441 		rfs_async_write_head = nlp->next;
1442 	else {
1443 		lp = rfs_async_write_head;
1444 		while (lp->next != nlp)
1445 			lp = lp->next;
1446 		lp->next = nlp->next;
1447 	}
1448 	mutex_exit(&rfs_async_write_lock);
1449 
1450 	/*
1451 	 * Step through the list of requests in this cluster.
1452 	 * We need to check permissions to make sure that all
1453 	 * of the requests have sufficient permission to write
1454 	 * the file.  A cluster can be composed of requests
1455 	 * from different clients and different users on each
1456 	 * client.
1457 	 *
1458 	 * As a side effect, we also calculate the size of the
1459 	 * byte range that this cluster encompasses.
1460 	 */
1461 	rp = nlp->list;
1462 	off = rp->wa->wa_offset;
1463 	len = (uint_t)0;
1464 	do {
1465 		if (rdonly(exi, rp->req)) {
1466 			rp->ns->ns_status = NFSERR_ROFS;
1467 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1468 			rp->thread->t_flag |= t_flag;
1469 			continue;
1470 		}
1471 
1472 		va.va_mask = AT_UID|AT_MODE;
1473 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1474 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1475 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1476 		if (!error) {
1477 			if (crgetuid(rp->cr) != va.va_uid) {
1478 				/*
1479 				 * This is a kludge to allow writes of files
1480 				 * created with read only permission.  The
1481 				 * owner of the file is always allowed to
1482 				 * write it.
1483 				 */
1484 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1485 				    "vop_access_start:");
1486 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1487 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1488 				    "vop_access_end:");
1489 			}
1490 			if (!error && MANDLOCK(vp, va.va_mode))
1491 				error = EACCES;
1492 		}
1493 
1494 		/*
1495 		 * Check for a conflict with a nbmand-locked region.
1496 		 */
1497 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1498 		    rp->wa->wa_count, 0, NULL)) {
1499 			error = EACCES;
1500 		}
1501 
1502 		if (error) {
1503 			rp->ns->ns_status = puterrno(error);
1504 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1505 			rp->thread->t_flag |= t_flag;
1506 			continue;
1507 		}
1508 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1509 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1510 	} while ((rp = rp->list) != NULL);
1511 
1512 	/*
1513 	 * Step through the cluster attempting to gather as many
1514 	 * requests which are contiguous as possible.  These
1515 	 * contiguous requests are handled via one call to VOP_WRITE
1516 	 * instead of different calls to VOP_WRITE.  We also keep
1517 	 * track of the fact that any data was written.
1518 	 */
1519 	rp = nlp->list;
1520 	data_written = 0;
1521 	do {
1522 		/*
1523 		 * Skip any requests which are already marked as having an
1524 		 * error.
1525 		 */
1526 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1527 			rp = rp->list;
1528 			continue;
1529 		}
1530 
1531 		/*
1532 		 * Count the number of iovec's which are required
1533 		 * to handle this set of requests.  One iovec is
1534 		 * needed for each data buffer, whether addressed
1535 		 * by wa_data or by the b_rptr pointers in the
1536 		 * mblk chains.
1537 		 */
1538 		iovcnt = 0;
1539 		lrp = rp;
1540 		for (;;) {
1541 			if (lrp->wa->wa_data)
1542 				iovcnt++;
1543 			else {
1544 				m = lrp->wa->wa_mblk;
1545 				while (m != NULL) {
1546 					iovcnt++;
1547 					m = m->b_cont;
1548 				}
1549 			}
1550 			if (lrp->list == NULL ||
1551 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1552 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1553 			    lrp->list->wa->wa_offset) {
1554 				lrp = lrp->list;
1555 				break;
1556 			}
1557 			lrp = lrp->list;
1558 		}
1559 
1560 		if (iovcnt <= MAXCLIOVECS) {
1561 #ifdef DEBUG
1562 			rfs_write_hits++;
1563 #endif
1564 			niovp = iov;
1565 		} else {
1566 #ifdef DEBUG
1567 			rfs_write_misses++;
1568 #endif
1569 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1570 		}
1571 		/*
1572 		 * Put together the scatter/gather iovecs.
1573 		 */
1574 		iovp = niovp;
1575 		trp = rp;
1576 		count = 0;
1577 		do {
1578 			if (trp->wa->wa_data) {
1579 				iovp->iov_base = trp->wa->wa_data;
1580 				iovp->iov_len = trp->wa->wa_count;
1581 				iovp++;
1582 			} else {
1583 				m = trp->wa->wa_mblk;
1584 				rcount = trp->wa->wa_count;
1585 				while (m != NULL) {
1586 					iovp->iov_base = (caddr_t)m->b_rptr;
1587 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1588 					rcount -= iovp->iov_len;
1589 					if (rcount < 0)
1590 						iovp->iov_len += rcount;
1591 					iovp++;
1592 					if (rcount <= 0)
1593 						break;
1594 					m = m->b_cont;
1595 				}
1596 			}
1597 			count += trp->wa->wa_count;
1598 			trp = trp->list;
1599 		} while (trp != lrp);
1600 
1601 		uio.uio_iov = niovp;
1602 		uio.uio_iovcnt = iovcnt;
1603 		uio.uio_segflg = UIO_SYSSPACE;
1604 		uio.uio_extflg = UIO_COPY_DEFAULT;
1605 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1606 		uio.uio_resid = count;
1607 		/*
1608 		 * The limit is checked on the client. We
1609 		 * should allow any size writes here.
1610 		 */
1611 		uio.uio_llimit = curproc->p_fsz_ctl;
1612 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1613 		if (rlimit < (rlim64_t)uio.uio_resid)
1614 			uio.uio_resid = (uint_t)rlimit;
1615 
1616 		/*
1617 		 * For now we assume no append mode.
1618 		 */
1619 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1620 		    "vop_write_start:(%S)", "async");
1621 
1622 		/*
1623 		 * We're changing creds because VM may fault
1624 		 * and we need the cred of the current
1625 		 * thread to be used if quota * checking is
1626 		 * enabled.
1627 		 */
1628 		savecred = curthread->t_cred;
1629 		curthread->t_cred = cr;
1630 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1631 		curthread->t_cred = savecred;
1632 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1633 
1634 		/* check if a monitor detected a delegation conflict */
1635 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1636 			/* mark as wouldblock so response is dropped */
1637 			curthread->t_flag |= T_WOULDBLOCK;
1638 
1639 		if (niovp != iov)
1640 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1641 
1642 		if (!error) {
1643 			data_written = 1;
1644 			/*
1645 			 * Get attributes again so we send the latest mod
1646 			 * time to the client side for his cache.
1647 			 */
1648 			va.va_mask = AT_ALL;	/* now we want everything */
1649 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1650 			    "vop_getattr_start:");
1651 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1652 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1653 			    "vop_getattr_end:");
1654 			if (!error)
1655 				acl_perm(vp, exi, &va, rp->cr);
1656 		}
1657 
1658 		/*
1659 		 * Fill in the status responses for each request
1660 		 * which was just handled.  Also, copy the latest
1661 		 * attributes in to the attribute responses if
1662 		 * appropriate.
1663 		 */
1664 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1665 		do {
1666 			rp->thread->t_flag |= t_flag;
1667 			/* check for overflows */
1668 			if (!error) {
1669 				error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1670 			}
1671 			rp->ns->ns_status = puterrno(error);
1672 			rp = rp->list;
1673 		} while (rp != lrp);
1674 	} while (rp != NULL);
1675 
1676 	/*
1677 	 * If any data was written at all, then we need to flush
1678 	 * the data and metadata to stable storage.
1679 	 */
1680 	if (data_written) {
1681 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
1682 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1683 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
1684 		if (!error) {
1685 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1686 			    "vop_fsync_start:");
1687 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1688 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
1689 		}
1690 	}
1691 
1692 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1693 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1694 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1695 
1696 	if (in_crit)
1697 		nbl_end_crit(vp);
1698 	VN_RELE(vp);
1699 
1700 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1701 	mutex_enter(&rfs_async_write_lock);
1702 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1703 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1704 			rp->ns->ns_status = puterrno(error);
1705 			rp->thread->t_flag |= t_flag;
1706 		}
1707 	}
1708 	cv_broadcast(&nlp->cv);
1709 	mutex_exit(&rfs_async_write_lock);
1710 
1711 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
1712 }
1713 
1714 void *
1715 rfs_write_getfh(struct nfswriteargs *wa)
1716 {
1717 	return (&wa->wa_fhandle);
1718 }
1719 
1720 /*
1721  * Create a file.
1722  * Creates a file with given attributes and returns those attributes
1723  * and an fhandle for the new file.
1724  */
1725 void
1726 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1727 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1728 {
1729 	int error;
1730 	int lookuperr;
1731 	int in_crit = 0;
1732 	struct vattr va;
1733 	vnode_t *vp;
1734 	vnode_t *dvp;
1735 	char *name = args->ca_da.da_name;
1736 	vnode_t *tvp = NULL;
1737 	int mode;
1738 	int lookup_ok;
1739 	bool_t trunc;
1740 
1741 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");
1742 
1743 	/*
1744 	 * Disallow NULL paths
1745 	 */
1746 	if (name == NULL || *name == '\0') {
1747 		dr->dr_status = NFSERR_ACCES;
1748 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1749 		    "rfs_create_end:(%S)", "access");
1750 		return;
1751 	}
1752 
1753 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1754 	if (dvp == NULL) {
1755 		dr->dr_status = NFSERR_STALE;
1756 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1757 		    "rfs_create_end:(%S)", "stale");
1758 		return;
1759 	}
1760 
1761 	error = sattr_to_vattr(args->ca_sa, &va);
1762 	if (error) {
1763 		dr->dr_status = puterrno(error);
1764 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1765 		    "rfs_create_end:(%S)", "sattr");
1766 		return;
1767 	}
1768 
1769 	/*
1770 	 * Must specify the mode.
1771 	 */
1772 	if (!(va.va_mask & AT_MODE)) {
1773 		VN_RELE(dvp);
1774 		dr->dr_status = NFSERR_INVAL;
1775 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1776 		    "rfs_create_end:(%S)", "no mode");
1777 		return;
1778 	}
1779 
1780 	/*
1781 	 * This is a completely gross hack to make mknod
1782 	 * work over the wire until we can wack the protocol
1783 	 */
1784 	if ((va.va_mode & IFMT) == IFCHR) {
1785 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1786 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1787 		else {
1788 			va.va_type = VCHR;
1789 			/*
1790 			 * uncompress the received dev_t
1791 			 * if the top half is zero indicating a request
1792 			 * from an `older style' OS.
1793 			 */
1794 			if ((va.va_size & 0xffff0000) == 0)
1795 				va.va_rdev = nfsv2_expdev(va.va_size);
1796 			else
1797 				va.va_rdev = (dev_t)va.va_size;
1798 		}
1799 		va.va_mask &= ~AT_SIZE;
1800 	} else if ((va.va_mode & IFMT) == IFBLK) {
1801 		va.va_type = VBLK;
1802 		/*
1803 		 * uncompress the received dev_t
1804 		 * if the top half is zero indicating a request
1805 		 * from an `older style' OS.
1806 		 */
1807 		if ((va.va_size & 0xffff0000) == 0)
1808 			va.va_rdev = nfsv2_expdev(va.va_size);
1809 		else
1810 			va.va_rdev = (dev_t)va.va_size;
1811 		va.va_mask &= ~AT_SIZE;
1812 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1813 		va.va_type = VSOCK;
1814 	} else
1815 		va.va_type = VREG;
1816 	va.va_mode &= ~IFMT;
1817 	va.va_mask |= AT_TYPE;
1818 
1819 	/*
1820 	 * Why was the choice made to use VWRITE as the mode to the
1821 	 * call to VOP_CREATE ? This results in a bug.  When a client
1822 	 * opens a file that already exists and is RDONLY, the second
1823 	 * open fails with an EACESS because of the mode.
1824 	 * bug ID 1054648.
1825 	 */
1826 	lookup_ok = 0;
1827 	mode = VWRITE;
1828 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1829 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
1830 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1831 		    NULL, NULL, NULL);
1832 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
1833 		if (!error) {
1834 			struct vattr at;
1835 
1836 			lookup_ok = 1;
1837 			at.va_mask = AT_MODE;
1838 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1839 			    "vop_getattr_start:");
1840 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1841 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1842 			    "vop_getattr_end:");
1843 			if (!error)
1844 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1845 			VN_RELE(tvp);
1846 			tvp = NULL;
1847 		}
1848 	}
1849 
1850 	if (!lookup_ok) {
1851 		if (rdonly(exi, req)) {
1852 			error = EROFS;
1853 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1854 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1855 			error = EPERM;
1856 		} else {
1857 			error = 0;
1858 		}
1859 	}
1860 
1861 	/*
1862 	 * If file size is being modified on an already existing file
1863 	 * make sure that there are no conflicting non-blocking mandatory
1864 	 * locks in the region being manipulated. Return EACCES if there
1865 	 * are conflicting locks.
1866 	 */
1867 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1868 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1869 		    NULL, NULL, NULL);
1870 
1871 		if (!lookuperr &&
1872 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1873 			VN_RELE(tvp);
1874 			curthread->t_flag |= T_WOULDBLOCK;
1875 			goto out;
1876 		}
1877 
1878 		if (!lookuperr && nbl_need_check(tvp)) {
1879 			/*
1880 			 * The file exists. Now check if it has any
1881 			 * conflicting non-blocking mandatory locks
1882 			 * in the region being changed.
1883 			 */
1884 			struct vattr bva;
1885 			u_offset_t offset;
1886 			ssize_t length;
1887 
1888 			nbl_start_crit(tvp, RW_READER);
1889 			in_crit = 1;
1890 
1891 			bva.va_mask = AT_SIZE;
1892 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1893 			if (!error) {
1894 				if (va.va_size < bva.va_size) {
1895 					offset = va.va_size;
1896 					length = bva.va_size - va.va_size;
1897 				} else {
1898 					offset = bva.va_size;
1899 					length = va.va_size - bva.va_size;
1900 				}
1901 				if (length) {
1902 					if (nbl_conflict(tvp, NBL_WRITE,
1903 					    offset, length, 0, NULL)) {
1904 						error = EACCES;
1905 					}
1906 				}
1907 			}
1908 			if (error) {
1909 				nbl_end_crit(tvp);
1910 				VN_RELE(tvp);
1911 				in_crit = 0;
1912 			}
1913 		} else if (tvp != NULL) {
1914 			VN_RELE(tvp);
1915 		}
1916 	}
1917 
1918 	if (!error) {
1919 		/*
1920 		 * If filesystem is shared with nosuid the remove any
1921 		 * setuid/setgid bits on create.
1922 		 */
1923 		if (va.va_type == VREG &&
1924 		    exi->exi_export.ex_flags & EX_NOSUID)
1925 			va.va_mode &= ~(VSUID | VSGID);
1926 
1927 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
1928 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1929 		    NULL, NULL);
1930 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");
1931 
1932 		if (!error) {
1933 
1934 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1935 				trunc = TRUE;
1936 			else
1937 				trunc = FALSE;
1938 
1939 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1940 				VN_RELE(vp);
1941 				curthread->t_flag |= T_WOULDBLOCK;
1942 				goto out;
1943 			}
1944 			va.va_mask = AT_ALL;
1945 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1946 			    "vop_getattr_start:");
1947 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1948 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1949 			    "vop_getattr_end:");
1950 			/* check for overflows */
1951 			if (!error) {
1952 				acl_perm(vp, exi, &va, cr);
1953 				error = vattr_to_nattr(&va, &dr->dr_attr);
1954 				if (!error) {
1955 					error = makefh(&dr->dr_fhandle, vp,
1956 					    exi);
1957 				}
1958 			}
1959 			/*
1960 			 * Force modified metadata out to stable storage.
1961 			 */
1962 			(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1963 			VN_RELE(vp);
1964 		}
1965 
1966 		if (in_crit) {
1967 			nbl_end_crit(tvp);
1968 			VN_RELE(tvp);
1969 		}
1970 	}
1971 
1972 	/*
1973 	 * Force modified data and metadata out to stable storage.
1974 	 */
1975 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1976 
1977 out:
1978 
1979 	VN_RELE(dvp);
1980 
1981 	dr->dr_status = puterrno(error);
1982 
1983 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
1984 }
1985 void *
1986 rfs_create_getfh(struct nfscreatargs *args)
1987 {
1988 	return (args->ca_da.da_fhandle);
1989 }
1990 
1991 /*
1992  * Remove a file.
1993  * Remove named file from parent directory.
1994  */
1995 void
1996 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1997 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1998 {
1999 	int error = 0;
2000 	vnode_t *vp;
2001 	vnode_t *targvp;
2002 	int in_crit = 0;
2003 
2004 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");
2005 
2006 	/*
2007 	 * Disallow NULL paths
2008 	 */
2009 	if (da->da_name == NULL || *da->da_name == '\0') {
2010 		*status = NFSERR_ACCES;
2011 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2012 		    "rfs_remove_end:(%S)", "access");
2013 		return;
2014 	}
2015 
2016 	vp = nfs_fhtovp(da->da_fhandle, exi);
2017 	if (vp == NULL) {
2018 		*status = NFSERR_STALE;
2019 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2020 		    "rfs_remove_end:(%S)", "stale");
2021 		return;
2022 	}
2023 
2024 	if (rdonly(exi, req)) {
2025 		VN_RELE(vp);
2026 		*status = NFSERR_ROFS;
2027 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2028 		    "rfs_remove_end:(%S)", "rofs");
2029 		return;
2030 	}
2031 
2032 	/*
2033 	 * Check for a conflict with a non-blocking mandatory share reservation.
2034 	 */
2035 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2036 	    NULL, cr, NULL, NULL, NULL);
2037 	if (error != 0) {
2038 		VN_RELE(vp);
2039 		*status = puterrno(error);
2040 		return;
2041 	}
2042 
2043 	/*
2044 	 * If the file is delegated to an v4 client, then initiate
2045 	 * recall and drop this request (by setting T_WOULDBLOCK).
2046 	 * The client will eventually re-transmit the request and
2047 	 * (hopefully), by then, the v4 client will have returned
2048 	 * the delegation.
2049 	 */
2050 
2051 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2052 		VN_RELE(vp);
2053 		VN_RELE(targvp);
2054 		curthread->t_flag |= T_WOULDBLOCK;
2055 		return;
2056 	}
2057 
2058 	if (nbl_need_check(targvp)) {
2059 		nbl_start_crit(targvp, RW_READER);
2060 		in_crit = 1;
2061 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2062 			error = EACCES;
2063 			goto out;
2064 		}
2065 	}
2066 
2067 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
2068 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2069 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");
2070 
2071 	/*
2072 	 * Force modified data and metadata out to stable storage.
2073 	 */
2074 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2075 
2076 out:
2077 	if (in_crit)
2078 		nbl_end_crit(targvp);
2079 	VN_RELE(targvp);
2080 	VN_RELE(vp);
2081 
2082 	*status = puterrno(error);
2083 
2084 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
2085 }
2086 
2087 void *
2088 rfs_remove_getfh(struct nfsdiropargs *da)
2089 {
2090 	return (da->da_fhandle);
2091 }
2092 
2093 /*
2094  * rename a file
2095  * Give a file (from) a new name (to).
2096  */
2097 void
2098 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2099 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2100 {
2101 	int error = 0;
2102 	vnode_t *fromvp;
2103 	vnode_t *tovp;
2104 	struct exportinfo *to_exi;
2105 	fhandle_t *fh;
2106 	vnode_t *srcvp;
2107 	vnode_t *targvp;
2108 	int in_crit = 0;
2109 
2110 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
2111 
2112 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2113 	if (fromvp == NULL) {
2114 		*status = NFSERR_STALE;
2115 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2116 		    "rfs_rename_end:(%S)", "from stale");
2117 		return;
2118 	}
2119 
2120 	fh = args->rna_to.da_fhandle;
2121 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2122 	if (to_exi == NULL) {
2123 		VN_RELE(fromvp);
2124 		*status = NFSERR_ACCES;
2125 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2126 		    "rfs_rename_end:(%S)", "cross device");
2127 		return;
2128 	}
2129 	exi_rele(to_exi);
2130 
2131 	if (to_exi != exi) {
2132 		VN_RELE(fromvp);
2133 		*status = NFSERR_XDEV;
2134 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2135 		    "rfs_rename_end:(%S)", "from stale");
2136 		return;
2137 	}
2138 
2139 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2140 	if (tovp == NULL) {
2141 		VN_RELE(fromvp);
2142 		*status = NFSERR_STALE;
2143 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2144 		    "rfs_rename_end:(%S)", "to stale");
2145 		return;
2146 	}
2147 
2148 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2149 		VN_RELE(tovp);
2150 		VN_RELE(fromvp);
2151 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2152 		    "rfs_rename_end:(%S)", "not dir");
2153 		*status = NFSERR_NOTDIR;
2154 		return;
2155 	}
2156 
2157 	/*
2158 	 * Disallow NULL paths
2159 	 */
2160 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2161 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2162 		VN_RELE(tovp);
2163 		VN_RELE(fromvp);
2164 		*status = NFSERR_ACCES;
2165 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2166 		    "rfs_rename_end:(%S)", "access");
2167 		return;
2168 	}
2169 
2170 	if (rdonly(exi, req)) {
2171 		VN_RELE(tovp);
2172 		VN_RELE(fromvp);
2173 		*status = NFSERR_ROFS;
2174 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2175 		    "rfs_rename_end:(%S)", "rofs");
2176 		return;
2177 	}
2178 
2179 	/*
2180 	 * Check for a conflict with a non-blocking mandatory share reservation.
2181 	 */
2182 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2183 	    NULL, cr, NULL, NULL, NULL);
2184 	if (error != 0) {
2185 		VN_RELE(tovp);
2186 		VN_RELE(fromvp);
2187 		*status = puterrno(error);
2188 		return;
2189 	}
2190 
2191 	/* Check for delegations on the source file */
2192 
2193 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2194 		VN_RELE(tovp);
2195 		VN_RELE(fromvp);
2196 		VN_RELE(srcvp);
2197 		curthread->t_flag |= T_WOULDBLOCK;
2198 		return;
2199 	}
2200 
2201 	/* Check for delegation on the file being renamed over, if it exists */
2202 
2203 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2204 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2205 	    NULL, NULL, NULL) == 0) {
2206 
2207 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2208 			VN_RELE(tovp);
2209 			VN_RELE(fromvp);
2210 			VN_RELE(srcvp);
2211 			VN_RELE(targvp);
2212 			curthread->t_flag |= T_WOULDBLOCK;
2213 			return;
2214 		}
2215 		VN_RELE(targvp);
2216 	}
2217 
2218 
2219 	if (nbl_need_check(srcvp)) {
2220 		nbl_start_crit(srcvp, RW_READER);
2221 		in_crit = 1;
2222 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2223 			error = EACCES;
2224 			goto out;
2225 		}
2226 	}
2227 
2228 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
2229 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2230 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2231 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");
2232 
2233 	if (error == 0) {
2234 		char *tmp;
2235 
2236 		/* fix the path name for the renamed file */
2237 		mutex_enter(&srcvp->v_lock);
2238 		tmp = srcvp->v_path;
2239 		srcvp->v_path = NULL;
2240 		mutex_exit(&srcvp->v_lock);
2241 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2242 		    strlen(args->rna_to.da_name));
2243 		if (tmp != NULL)
2244 			kmem_free(tmp, strlen(tmp) + 1);
2245 	}
2246 
2247 	/*
2248 	 * Force modified data and metadata out to stable storage.
2249 	 */
2250 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2251 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2252 
2253 out:
2254 	if (in_crit)
2255 		nbl_end_crit(srcvp);
2256 	VN_RELE(srcvp);
2257 	VN_RELE(tovp);
2258 	VN_RELE(fromvp);
2259 
2260 	*status = puterrno(error);
2261 
2262 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
2263 }
2264 void *
2265 rfs_rename_getfh(struct nfsrnmargs *args)
2266 {
2267 	return (args->rna_from.da_fhandle);
2268 }
2269 
2270 /*
2271  * Link to a file.
2272  * Create a file (to) which is a hard link to the given file (from).
2273  */
2274 void
2275 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2276 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2277 {
2278 	int error;
2279 	vnode_t *fromvp;
2280 	vnode_t *tovp;
2281 	struct exportinfo *to_exi;
2282 	fhandle_t *fh;
2283 
2284 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");
2285 
2286 	fromvp = nfs_fhtovp(args->la_from, exi);
2287 	if (fromvp == NULL) {
2288 		*status = NFSERR_STALE;
2289 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2290 		    "rfs_link_end:(%S)", "from stale");
2291 		return;
2292 	}
2293 
2294 	fh = args->la_to.da_fhandle;
2295 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2296 	if (to_exi == NULL) {
2297 		VN_RELE(fromvp);
2298 		*status = NFSERR_ACCES;
2299 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2300 		    "rfs_link_end:(%S)", "cross device");
2301 		return;
2302 	}
2303 	exi_rele(to_exi);
2304 
2305 	if (to_exi != exi) {
2306 		VN_RELE(fromvp);
2307 		*status = NFSERR_XDEV;
2308 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2309 		    "rfs_link_end:(%S)", "cross device");
2310 		return;
2311 	}
2312 
2313 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2314 	if (tovp == NULL) {
2315 		VN_RELE(fromvp);
2316 		*status = NFSERR_STALE;
2317 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2318 		    "rfs_link_end:(%S)", "to stale");
2319 		return;
2320 	}
2321 
2322 	if (tovp->v_type != VDIR) {
2323 		VN_RELE(tovp);
2324 		VN_RELE(fromvp);
2325 		*status = NFSERR_NOTDIR;
2326 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2327 		    "rfs_link_end:(%S)", "not dir");
2328 		return;
2329 	}
2330 	/*
2331 	 * Disallow NULL paths
2332 	 */
2333 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2334 		VN_RELE(tovp);
2335 		VN_RELE(fromvp);
2336 		*status = NFSERR_ACCES;
2337 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2338 		    "rfs_link_end:(%S)", "access");
2339 		return;
2340 	}
2341 
2342 	if (rdonly(exi, req)) {
2343 		VN_RELE(tovp);
2344 		VN_RELE(fromvp);
2345 		*status = NFSERR_ROFS;
2346 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2347 		    "rfs_link_end:(%S)", "rofs");
2348 		return;
2349 	}
2350 
2351 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
2352 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2353 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");
2354 
2355 	/*
2356 	 * Force modified data and metadata out to stable storage.
2357 	 */
2358 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2359 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2360 
2361 	VN_RELE(tovp);
2362 	VN_RELE(fromvp);
2363 
2364 	*status = puterrno(error);
2365 
2366 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
2367 }
2368 void *
2369 rfs_link_getfh(struct nfslinkargs *args)
2370 {
2371 	return (args->la_from);
2372 }
2373 
2374 /*
2375  * Symbolicly link to a file.
2376  * Create a file (to) with the given attributes which is a symbolic link
2377  * to the given path name (to).
2378  */
2379 void
2380 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2381 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2382 {
2383 	int error;
2384 	struct vattr va;
2385 	vnode_t *vp;
2386 	vnode_t *svp;
2387 	int lerror;
2388 
2389 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");
2390 
2391 	/*
2392 	 * Disallow NULL paths
2393 	 */
2394 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2395 		*status = NFSERR_ACCES;
2396 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2397 		    "rfs_symlink_end:(%S)", "access");
2398 		return;
2399 	}
2400 
2401 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2402 	if (vp == NULL) {
2403 		*status = NFSERR_STALE;
2404 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2405 		    "rfs_symlink_end:(%S)", "stale");
2406 		return;
2407 	}
2408 
2409 	if (rdonly(exi, req)) {
2410 		VN_RELE(vp);
2411 		*status = NFSERR_ROFS;
2412 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2413 		    "rfs_symlink_end:(%S)", "rofs");
2414 		return;
2415 	}
2416 
2417 	error = sattr_to_vattr(args->sla_sa, &va);
2418 	if (error) {
2419 		VN_RELE(vp);
2420 		*status = puterrno(error);
2421 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2422 		    "rfs_symlink_end:(%S)", "sattr");
2423 		return;
2424 	}
2425 
2426 	if (!(va.va_mask & AT_MODE)) {
2427 		VN_RELE(vp);
2428 		*status = NFSERR_INVAL;
2429 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2430 		    "rfs_symlink_end:(%S)", "no mode");
2431 		return;
2432 	}
2433 
2434 	va.va_type = VLNK;
2435 	va.va_mask |= AT_TYPE;
2436 
2437 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
2438 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
2439 	    NULL, 0);
2440 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");
2441 
2442 	/*
2443 	 * Force new data and metadata out to stable storage.
2444 	 */
2445 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
2446 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2447 	    0, NULL, cr, NULL, NULL, NULL);
2448 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
2449 	if (!lerror) {
2450 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2451 		VN_RELE(svp);
2452 	}
2453 
2454 	/*
2455 	 * Force modified data and metadata out to stable storage.
2456 	 */
2457 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2458 
2459 	VN_RELE(vp);
2460 
2461 	*status = puterrno(error);
2462 
2463 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
2464 }
2465 void *
2466 rfs_symlink_getfh(struct nfsslargs *args)
2467 {
2468 	return (args->sla_from.da_fhandle);
2469 }
2470 
2471 /*
2472  * Make a directory.
2473  * Create a directory with the given name, parent directory, and attributes.
2474  * Returns a file handle and attributes for the new directory.
2475  */
2476 void
2477 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2478 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2479 {
2480 	int error;
2481 	struct vattr va;
2482 	vnode_t *dvp = NULL;
2483 	vnode_t *vp;
2484 	char *name = args->ca_da.da_name;
2485 
2486 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");
2487 
2488 	/*
2489 	 * Disallow NULL paths
2490 	 */
2491 	if (name == NULL || *name == '\0') {
2492 		dr->dr_status = NFSERR_ACCES;
2493 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2494 		    "rfs_mkdir_end:(%S)", "access");
2495 		return;
2496 	}
2497 
2498 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2499 	if (vp == NULL) {
2500 		dr->dr_status = NFSERR_STALE;
2501 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2502 		    "rfs_mkdir_end:(%S)", "stale");
2503 		return;
2504 	}
2505 
2506 	if (rdonly(exi, req)) {
2507 		VN_RELE(vp);
2508 		dr->dr_status = NFSERR_ROFS;
2509 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2510 		    "rfs_mkdir_end:(%S)", "rofs");
2511 		return;
2512 	}
2513 
2514 	error = sattr_to_vattr(args->ca_sa, &va);
2515 	if (error) {
2516 		VN_RELE(vp);
2517 		dr->dr_status = puterrno(error);
2518 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2519 		    "rfs_mkdir_end:(%S)", "sattr");
2520 		return;
2521 	}
2522 
2523 	if (!(va.va_mask & AT_MODE)) {
2524 		VN_RELE(vp);
2525 		dr->dr_status = NFSERR_INVAL;
2526 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2527 		    "rfs_mkdir_end:(%S)", "no mode");
2528 		return;
2529 	}
2530 
2531 	va.va_type = VDIR;
2532 	va.va_mask |= AT_TYPE;
2533 
2534 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
2535 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2536 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");
2537 
2538 	if (!error) {
2539 		/*
2540 		 * Attribtutes of the newly created directory should
2541 		 * be returned to the client.
2542 		 */
2543 		va.va_mask = AT_ALL; /* We want everything */
2544 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
2545 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2546 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
2547 		/* check for overflows */
2548 		if (!error) {
2549 			acl_perm(vp, exi, &va, cr);
2550 			error = vattr_to_nattr(&va, &dr->dr_attr);
2551 			if (!error) {
2552 				error = makefh(&dr->dr_fhandle, dvp, exi);
2553 			}
2554 		}
2555 		/*
2556 		 * Force new data and metadata out to stable storage.
2557 		 */
2558 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2559 		VN_RELE(dvp);
2560 	}
2561 
2562 	/*
2563 	 * Force modified data and metadata out to stable storage.
2564 	 */
2565 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2566 
2567 	VN_RELE(vp);
2568 
2569 	dr->dr_status = puterrno(error);
2570 
2571 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
2572 }
2573 void *
2574 rfs_mkdir_getfh(struct nfscreatargs *args)
2575 {
2576 	return (args->ca_da.da_fhandle);
2577 }
2578 
2579 /*
2580  * Remove a directory.
2581  * Remove the given directory name from the given parent directory.
2582  */
2583 void
2584 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2585 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2586 {
2587 	int error;
2588 	vnode_t *vp;
2589 
2590 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");
2591 
2592 	/*
2593 	 * Disallow NULL paths
2594 	 */
2595 	if (da->da_name == NULL || *da->da_name == '\0') {
2596 		*status = NFSERR_ACCES;
2597 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2598 		    "rfs_rmdir_end:(%S)", "access");
2599 		return;
2600 	}
2601 
2602 	vp = nfs_fhtovp(da->da_fhandle, exi);
2603 	if (vp == NULL) {
2604 		*status = NFSERR_STALE;
2605 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2606 		    "rfs_rmdir_end:(%S)", "stale");
2607 		return;
2608 	}
2609 
2610 	if (rdonly(exi, req)) {
2611 		VN_RELE(vp);
2612 		*status = NFSERR_ROFS;
2613 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2614 		    "rfs_rmdir_end:(%S)", "rofs");
2615 		return;
2616 	}
2617 
2618 	/*
2619 	 * VOP_RMDIR now takes a new third argument (the current
2620 	 * directory of the process).  That's because someone
2621 	 * wants to return EINVAL if one tries to remove ".".
2622 	 * Of course, NFS servers have no idea what their
2623 	 * clients' current directories are.  We fake it by
2624 	 * supplying a vnode known to exist and illegal to
2625 	 * remove.
2626 	 */
2627 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
2628 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2629 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");
2630 
2631 	/*
2632 	 * Force modified data and metadata out to stable storage.
2633 	 */
2634 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2635 
2636 	VN_RELE(vp);
2637 
2638 	/*
2639 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2640 	 * if the directory is not empty.  A System V NFS server
2641 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2642 	 * over the wire.
2643 	 */
2644 	if (error == EEXIST)
2645 		*status = NFSERR_NOTEMPTY;
2646 	else
2647 		*status = puterrno(error);
2648 
2649 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
2650 }
2651 void *
2652 rfs_rmdir_getfh(struct nfsdiropargs *da)
2653 {
2654 	return (da->da_fhandle);
2655 }
2656 
2657 /* ARGSUSED */
2658 void
2659 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2660 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2661 {
2662 	int error;
2663 	int iseof;
2664 	struct iovec iov;
2665 	struct uio uio;
2666 	vnode_t *vp;
2667 
2668 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");
2669 
2670 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2671 	if (vp == NULL) {
2672 		rd->rd_entries = NULL;
2673 		rd->rd_status = NFSERR_STALE;
2674 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2675 		    "rfs_readdir_end:(%S)", "stale");
2676 		return;
2677 	}
2678 
2679 	if (vp->v_type != VDIR) {
2680 		VN_RELE(vp);
2681 		rd->rd_entries = NULL;
2682 		rd->rd_status = NFSERR_NOTDIR;
2683 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2684 		    "rfs_readdir_end:(%S)", "notdir");
2685 		return;
2686 	}
2687 
2688 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
2689 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2690 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
2691 
2692 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
2693 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2694 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
2695 	if (error) {
2696 		rd->rd_entries = NULL;
2697 		goto bad;
2698 	}
2699 
2700 	if (rda->rda_count == 0) {
2701 		rd->rd_entries = NULL;
2702 		rd->rd_size = 0;
2703 		rd->rd_eof = FALSE;
2704 		goto bad;
2705 	}
2706 
2707 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2708 
2709 	/*
2710 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2711 	 */
2712 	rd->rd_bufsize = (uint_t)rda->rda_count;
2713 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2714 
2715 	/*
2716 	 * Set up io vector to read directory data
2717 	 */
2718 	iov.iov_base = (caddr_t)rd->rd_entries;
2719 	iov.iov_len = rda->rda_count;
2720 	uio.uio_iov = &iov;
2721 	uio.uio_iovcnt = 1;
2722 	uio.uio_segflg = UIO_SYSSPACE;
2723 	uio.uio_extflg = UIO_COPY_CACHED;
2724 	uio.uio_loffset = (offset_t)rda->rda_offset;
2725 	uio.uio_resid = rda->rda_count;
2726 
2727 	/*
2728 	 * read directory
2729 	 */
2730 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
2731 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2732 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");
2733 
2734 	/*
2735 	 * Clean up
2736 	 */
2737 	if (!error) {
2738 		/*
2739 		 * set size and eof
2740 		 */
2741 		if (uio.uio_resid == rda->rda_count) {
2742 			rd->rd_size = 0;
2743 			rd->rd_eof = TRUE;
2744 		} else {
2745 			rd->rd_size = (uint32_t)(rda->rda_count -
2746 			    uio.uio_resid);
2747 			rd->rd_eof = iseof ? TRUE : FALSE;
2748 		}
2749 	}
2750 
2751 bad:
2752 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
2753 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2754 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
2755 
2756 #if 0 /* notyet */
2757 	/*
2758 	 * Don't do this.  It causes local disk writes when just
2759 	 * reading the file and the overhead is deemed larger
2760 	 * than the benefit.
2761 	 */
2762 	/*
2763 	 * Force modified metadata out to stable storage.
2764 	 */
2765 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2766 #endif
2767 
2768 	VN_RELE(vp);
2769 
2770 	rd->rd_status = puterrno(error);
2771 
2772 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
2773 }
2774 void *
2775 rfs_readdir_getfh(struct nfsrddirargs *rda)
2776 {
2777 	return (&rda->rda_fh);
2778 }
2779 void
2780 rfs_rddirfree(struct nfsrddirres *rd)
2781 {
2782 	if (rd->rd_entries != NULL)
2783 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2784 }
2785 
2786 /* ARGSUSED */
2787 void
2788 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2789 	struct svc_req *req, cred_t *cr)
2790 {
2791 	int error;
2792 	struct statvfs64 sb;
2793 	vnode_t *vp;
2794 
2795 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");
2796 
2797 	vp = nfs_fhtovp(fh, exi);
2798 	if (vp == NULL) {
2799 		fs->fs_status = NFSERR_STALE;
2800 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2801 		    "rfs_statfs_end:(%S)", "stale");
2802 		return;
2803 	}
2804 
2805 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2806 
2807 	if (!error) {
2808 		fs->fs_tsize = nfstsize();
2809 		fs->fs_bsize = sb.f_frsize;
2810 		fs->fs_blocks = sb.f_blocks;
2811 		fs->fs_bfree = sb.f_bfree;
2812 		fs->fs_bavail = sb.f_bavail;
2813 	}
2814 
2815 	VN_RELE(vp);
2816 
2817 	fs->fs_status = puterrno(error);
2818 
2819 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
2820 }
2821 void *
2822 rfs_statfs_getfh(fhandle_t *fh)
2823 {
2824 	return (fh);
2825 }
2826 
2827 static int
2828 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2829 {
2830 	vap->va_mask = 0;
2831 
2832 	/*
2833 	 * There was a sign extension bug in some VFS based systems
2834 	 * which stored the mode as a short.  When it would get
2835 	 * assigned to a u_long, no sign extension would occur.
2836 	 * It needed to, but this wasn't noticed because sa_mode
2837 	 * would then get assigned back to the short, thus ignoring
2838 	 * the upper 16 bits of sa_mode.
2839 	 *
2840 	 * To make this implementation work for both broken
2841 	 * clients and good clients, we check for both versions
2842 	 * of the mode.
2843 	 */
2844 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2845 	    sa->sa_mode != (uint32_t)-1) {
2846 		vap->va_mask |= AT_MODE;
2847 		vap->va_mode = sa->sa_mode;
2848 	}
2849 	if (sa->sa_uid != (uint32_t)-1) {
2850 		vap->va_mask |= AT_UID;
2851 		vap->va_uid = sa->sa_uid;
2852 	}
2853 	if (sa->sa_gid != (uint32_t)-1) {
2854 		vap->va_mask |= AT_GID;
2855 		vap->va_gid = sa->sa_gid;
2856 	}
2857 	if (sa->sa_size != (uint32_t)-1) {
2858 		vap->va_mask |= AT_SIZE;
2859 		vap->va_size = sa->sa_size;
2860 	}
2861 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2862 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2863 #ifndef _LP64
2864 		/* return error if time overflow */
2865 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2866 			return (EOVERFLOW);
2867 #endif
2868 		vap->va_mask |= AT_ATIME;
2869 		/*
2870 		 * nfs protocol defines times as unsigned so don't extend sign,
2871 		 * unless sysadmin set nfs_allow_preepoch_time.
2872 		 */
2873 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2874 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2875 	}
2876 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2877 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2878 #ifndef _LP64
2879 		/* return error if time overflow */
2880 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2881 			return (EOVERFLOW);
2882 #endif
2883 		vap->va_mask |= AT_MTIME;
2884 		/*
2885 		 * nfs protocol defines times as unsigned so don't extend sign,
2886 		 * unless sysadmin set nfs_allow_preepoch_time.
2887 		 */
2888 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2889 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2890 	}
2891 	return (0);
2892 }
2893 
2894 static enum nfsftype vt_to_nf[] = {
2895 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2896 };
2897 
2898 /*
2899  * check the following fields for overflow: nodeid, size, and time.
2900  * There could be a problem when converting 64-bit LP64 fields
2901  * into 32-bit ones.  Return an error if there is an overflow.
2902  */
2903 int
2904 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2905 {
2906 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2907 	na->na_type = vt_to_nf[vap->va_type];
2908 
2909 	if (vap->va_mode == (unsigned short) -1)
2910 		na->na_mode = (uint32_t)-1;
2911 	else
2912 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2913 
2914 	if (vap->va_uid == (unsigned short)(-1))
2915 		na->na_uid = (uint32_t)(-1);
2916 	else if (vap->va_uid == UID_NOBODY)
2917 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2918 	else
2919 		na->na_uid = vap->va_uid;
2920 
2921 	if (vap->va_gid == (unsigned short)(-1))
2922 		na->na_gid = (uint32_t)-1;
2923 	else if (vap->va_gid == GID_NOBODY)
2924 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2925 	else
2926 		na->na_gid = vap->va_gid;
2927 
2928 	/*
2929 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2930 	 * vattr, but are bigger than 32 bit values supported?
2931 	 */
2932 	na->na_fsid = vap->va_fsid;
2933 
2934 	na->na_nodeid = vap->va_nodeid;
2935 
2936 	/*
2937 	 * Check to make sure that the nodeid is representable over the
2938 	 * wire without losing bits.
2939 	 */
2940 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2941 		return (EFBIG);
2942 	na->na_nlink = vap->va_nlink;
2943 
2944 	/*
2945 	 * Check for big files here, instead of at the caller.  See
2946 	 * comments in cstat for large special file explanation.
2947 	 */
2948 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2949 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2950 			return (EFBIG);
2951 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2952 			/* UNKNOWN_SIZE | OVERFLOW */
2953 			na->na_size = MAXOFF32_T;
2954 		} else
2955 			na->na_size = vap->va_size;
2956 	} else
2957 		na->na_size = vap->va_size;
2958 
2959 	/*
2960 	 * If the vnode times overflow the 32-bit times that NFS2
2961 	 * uses on the wire then return an error.
2962 	 */
2963 	if (!NFS_VAP_TIME_OK(vap)) {
2964 		return (EOVERFLOW);
2965 	}
2966 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2967 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2968 
2969 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2970 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2971 
2972 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2973 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2974 
2975 	/*
2976 	 * If the dev_t will fit into 16 bits then compress
2977 	 * it, otherwise leave it alone. See comments in
2978 	 * nfs_client.c.
2979 	 */
2980 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2981 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2982 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2983 	else
2984 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2985 
2986 	na->na_blocks = vap->va_nblocks;
2987 	na->na_blocksize = vap->va_blksize;
2988 
2989 	/*
2990 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2991 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2992 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2993 	 *
2994 	 * BUYER BEWARE:
2995 	 *  If you are porting the NFS to a non-Sun server, you probably
2996 	 *  don't want to include the following block of code.  The
2997 	 *  over-the-wire special file types will be changing with the
2998 	 *  NFS Protocol Revision.
2999 	 */
3000 	if (vap->va_type == VFIFO)
3001 		NA_SETFIFO(na);
3002 	return (0);
3003 }
3004 
3005 /*
3006  * acl v2 support: returns approximate permission.
3007  *	default: returns minimal permission (more restrictive)
3008  *	aclok: returns maximal permission (less restrictive)
3009  *	This routine changes the permissions that are alaredy in *va.
3010  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3011  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3012  */
3013 static void
3014 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3015 {
3016 	vsecattr_t	vsa;
3017 	int		aclcnt;
3018 	aclent_t	*aclentp;
3019 	mode_t		mask_perm;
3020 	mode_t		grp_perm;
3021 	mode_t		other_perm;
3022 	mode_t		other_orig;
3023 	int		error;
3024 
3025 	/* dont care default acl */
3026 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3027 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3028 
3029 	if (!error) {
3030 		aclcnt = vsa.vsa_aclcnt;
3031 		if (aclcnt > MIN_ACL_ENTRIES) {
3032 			/* non-trivial ACL */
3033 			aclentp = vsa.vsa_aclentp;
3034 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3035 				/* maximal permissions */
3036 				grp_perm = 0;
3037 				other_perm = 0;
3038 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3039 					switch (aclentp->a_type) {
3040 					case USER_OBJ:
3041 						break;
3042 					case USER:
3043 						grp_perm |=
3044 						    aclentp->a_perm << 3;
3045 						other_perm |= aclentp->a_perm;
3046 						break;
3047 					case GROUP_OBJ:
3048 						grp_perm |=
3049 						    aclentp->a_perm << 3;
3050 						break;
3051 					case GROUP:
3052 						other_perm |= aclentp->a_perm;
3053 						break;
3054 					case OTHER_OBJ:
3055 						other_orig = aclentp->a_perm;
3056 						break;
3057 					case CLASS_OBJ:
3058 						mask_perm = aclentp->a_perm;
3059 						break;
3060 					default:
3061 						break;
3062 					}
3063 				}
3064 				grp_perm &= mask_perm << 3;
3065 				other_perm &= mask_perm;
3066 				other_perm |= other_orig;
3067 
3068 			} else {
3069 				/* minimal permissions */
3070 				grp_perm = 070;
3071 				other_perm = 07;
3072 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3073 					switch (aclentp->a_type) {
3074 					case USER_OBJ:
3075 						break;
3076 					case USER:
3077 					case CLASS_OBJ:
3078 						grp_perm &=
3079 						    aclentp->a_perm << 3;
3080 						other_perm &=
3081 						    aclentp->a_perm;
3082 						break;
3083 					case GROUP_OBJ:
3084 						grp_perm &=
3085 						    aclentp->a_perm << 3;
3086 						break;
3087 					case GROUP:
3088 						other_perm &=
3089 						    aclentp->a_perm;
3090 						break;
3091 					case OTHER_OBJ:
3092 						other_perm &=
3093 						    aclentp->a_perm;
3094 						break;
3095 					default:
3096 						break;
3097 					}
3098 				}
3099 			}
3100 			/* copy to va */
3101 			va->va_mode &= ~077;
3102 			va->va_mode |= grp_perm | other_perm;
3103 		}
3104 		if (vsa.vsa_aclcnt)
3105 			kmem_free(vsa.vsa_aclentp,
3106 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3107 	}
3108 }
3109 
3110 void
3111 rfs_srvrinit(void)
3112 {
3113 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3114 	nfs2_srv_caller_id = fs_new_caller_id();
3115 }
3116 
3117 void
3118 rfs_srvrfini(void)
3119 {
3120 	mutex_destroy(&rfs_async_write_lock);
3121 }
3122