xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs_srv.c (revision 76396fcc7c8d694e8cb7822a4aeda9cf2b30fb3a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All rights reserved.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/buf.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/uio.h>
41 #include <sys/stat.h>
42 #include <sys/errno.h>
43 #include <sys/sysmacros.h>
44 #include <sys/statvfs.h>
45 #include <sys/kmem.h>
46 #include <sys/kstat.h>
47 #include <sys/dirent.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/vtrace.h>
51 #include <sys/mode.h>
52 #include <sys/acl.h>
53 #include <sys/nbmlock.h>
54 #include <sys/policy.h>
55 
56 #include <rpc/types.h>
57 #include <rpc/auth.h>
58 #include <rpc/svc.h>
59 
60 #include <nfs/nfs.h>
61 #include <nfs/export.h>
62 
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
68 
69 #include <sys/strsubr.h>
70 
71 /*
72  * These are the interface routines for the server side of the
73  * Network File System.  See the NFS version 2 protocol specification
74  * for a description of this interface.
75  */
76 
77 static int	sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void	acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 			cred_t *);
80 
81 /*
82  * Some "over the wire" UNIX file types.  These are encoded
83  * into the mode.  This needs to be fixed in the next rev.
84  */
85 #define	IFMT		0170000		/* type of file */
86 #define	IFCHR		0020000		/* character special */
87 #define	IFBLK		0060000		/* block special */
88 #define	IFSOCK		0140000		/* socket */
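/*
 * Illustrative sketch (comment only): rfs_create() decodes these bits by
 * masking the over-the-wire mode with IFMT and then stripping the type
 * bits from the permission bits.  A client sending mode 0020644, i.e.
 * "character special, rw-r--r--", is handled roughly as:
 *
 *	if ((va.va_mode & IFMT) == IFCHR)
 *		va.va_type = VCHR;
 *	va.va_mode &= ~IFMT;		now just 0644
 */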
89 
90 u_longlong_t nfs2_srv_caller_id;
91 
92 /*
93  * Get file attributes.
94  * Returns the current attributes of the file with the given fhandle.
95  */
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 	struct svc_req *req, cred_t *cr)
100 {
101 	int error;
102 	vnode_t *vp;
103 	struct vattr va;
104 
105 	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");
106 
107 	vp = nfs_fhtovp(fhp, exi);
108 	if (vp == NULL) {
109 		ns->ns_status = NFSERR_STALE;
110 		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
111 		    "rfs_getattr_end:(%S)", "stale");
112 		return;
113 	}
114 
115 	/*
116 	 * Do the getattr.
117 	 */
118 	va.va_mask = AT_ALL;	/* we want all the attributes */
119 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
120 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
121 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
122 
123 	/* check for overflows */
124 	if (!error) {
125 		acl_perm(vp, exi, &va, cr);
126 		error = vattr_to_nattr(&va, &ns->ns_attr);
127 	}
128 
129 	VN_RELE(vp);
130 
131 	ns->ns_status = puterrno(error);
132 
133 	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
134 }
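/*
 * The *_getfh() helpers below are assumed to be called by the common
 * dispatch code to extract the filehandle from the decoded arguments
 * (e.g. to locate the export for a request before the operation itself
 * runs); each simply returns a pointer into its argument structure.
 */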
135 void *
136 rfs_getattr_getfh(fhandle_t *fhp)
137 {
138 	return (fhp);
139 }
140 
141 /*
142  * Set file attributes.
143  * Sets the attributes of the file with the given fhandle.  Returns
144  * the new attributes.
145  */
146 void
147 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
148 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
149 {
150 	int error;
151 	int flag;
152 	int in_crit = 0;
153 	vnode_t *vp;
154 	struct vattr va;
155 	struct vattr bva;
156 	struct flock64 bf;
157 	caller_context_t ct;
158 
159 	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");
160 
161 	vp = nfs_fhtovp(&args->saa_fh, exi);
162 	if (vp == NULL) {
163 		ns->ns_status = NFSERR_STALE;
164 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
165 		    "rfs_setattr_end:(%S)", "stale");
166 		return;
167 	}
168 
169 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
170 		VN_RELE(vp);
171 		ns->ns_status = NFSERR_ROFS;
172 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
173 		    "rfs_setattr_end:(%S)", "rofs");
174 		return;
175 	}
176 
177 	error = sattr_to_vattr(&args->saa_sa, &va);
178 	if (error) {
179 		VN_RELE(vp);
180 		ns->ns_status = puterrno(error);
181 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
182 		    "rfs_setattr_end:(%S)", "sattr");
183 		return;
184 	}
185 
186 	/*
187 	 * If the client is requesting a change to the mtime,
188 	 * but the nanosecond field is set to 1 billion, then
189 	 * this is a flag to the server that it should set the
190 	 * atime and mtime fields to the server's current time.
191 	 * The 1 billion number actually came from the client
192 	 * as 1 million, but the units in the over the wire
193 	 * request are microseconds instead of nanoseconds.
194 	 *
195 	 * This is an overload of the protocol and should be
196 	 * documented in the NFS Version 2 protocol specification.
197 	 */
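	/*
	 * Worked example of the encoding described above (illustrative):
	 * a client asking for "set to server time" sends an mtime whose
	 * useconds field is 1000000; XDR decoding scales microseconds to
	 * nanoseconds, so it arrives here as tv_nsec == 1000000000 -- a
	 * value that can never occur naturally, since a valid tv_nsec is
	 * always less than 1000000000.
	 */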
198 	if (va.va_mask & AT_MTIME) {
199 		if (va.va_mtime.tv_nsec == 1000000000) {
200 			gethrestime(&va.va_mtime);
201 			va.va_atime = va.va_mtime;
202 			va.va_mask |= AT_ATIME;
203 			flag = 0;
204 		} else
205 			flag = ATTR_UTIME;
206 	} else
207 		flag = 0;
208 
209 	/*
210 	 * If the filesystem is exported with nosuid, then mask off
211 	 * the setuid and setgid bits.
212 	 */
213 	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
214 	    (exi->exi_export.ex_flags & EX_NOSUID))
215 		va.va_mode &= ~(VSUID | VSGID);
216 
217 	ct.cc_sysid = 0;
218 	ct.cc_pid = 0;
219 	ct.cc_caller_id = nfs2_srv_caller_id;
220 	ct.cc_flags = CC_DONTBLOCK;
221 
222 	/*
223 	 * We need to specially handle size changes because it is
224 	 * possible for the client to create a file with modes
225 	 * which indicate read-only, but with the file opened for
226 	 * writing.  If the client then tries to set the size of
227 	 * the file, then the normal access checking done in
228 	 * VOP_SETATTR would prevent the client from doing so,
229 	 * although it should be legal for it to do so.  To get
230 	 * around this, we do the access checking for ourselves
231 	 * and then use VOP_SPACE which doesn't do the access
232 	 * checking which VOP_SETATTR does. VOP_SPACE can only
233 	 * operate on VREG files; let VOP_SETATTR handle the other
234 	 * extremely rare cases.
235 	 * Also the client should not be allowed to change the
236 	 * size of the file if there is a conflicting non-blocking
237 	 * mandatory lock in the region of change.
238 	 */
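	/*
	 * For illustration: the VOP_SPACE(F_FREESP) call below expresses
	 * the truncation as a flock64 whose l_start is the new size and
	 * whose l_len of 0 means "through end-of-file", leaving the file
	 * at exactly va.va_size bytes -- the moral equivalent of an
	 * ftruncate() to that size.
	 */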
239 	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
240 		if (nbl_need_check(vp)) {
241 			nbl_start_crit(vp, RW_READER);
242 			in_crit = 1;
243 		}
244 
245 		bva.va_mask = AT_UID | AT_SIZE;
246 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
247 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
248 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
249 		if (error) {
250 			if (in_crit)
251 				nbl_end_crit(vp);
252 			VN_RELE(vp);
253 			ns->ns_status = puterrno(error);
254 			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
255 			    "rfs_setattr_end:(%S)", "getattr");
256 			return;
257 		}
258 
259 		if (in_crit) {
260 			u_offset_t offset;
261 			ssize_t length;
262 
263 			if (va.va_size < bva.va_size) {
264 				offset = va.va_size;
265 				length = bva.va_size - va.va_size;
266 			} else {
267 				offset = bva.va_size;
268 				length = va.va_size - bva.va_size;
269 			}
270 			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
271 			    NULL)) {
272 				error = EACCES;
273 			}
274 		}
275 
276 		if (crgetuid(cr) == bva.va_uid && !error &&
277 		    va.va_size != bva.va_size) {
278 			va.va_mask &= ~AT_SIZE;
279 			bf.l_type = F_WRLCK;
280 			bf.l_whence = 0;
281 			bf.l_start = (off64_t)va.va_size;
282 			bf.l_len = 0;
283 			bf.l_sysid = 0;
284 			bf.l_pid = 0;
285 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
286 			    "vop_space_start:");
287 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
288 			    (offset_t)va.va_size, cr, &ct);
289 			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
290 		}
291 		if (in_crit)
292 			nbl_end_crit(vp);
293 	} else
294 		error = 0;
295 
296 	/*
297 	 * Do the setattr.
298 	 */
299 	if (!error && va.va_mask) {
300 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
301 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
302 		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
303 	}
304 
305 	/*
306 	 * check if the monitor on either vop_space or vop_setattr detected
307 	 * a delegation conflict and if so, mark the thread flag as
308 	 * wouldblock so that the response is dropped and the client will
309 	 * try again.
310 	 */
311 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
312 		VN_RELE(vp);
313 		curthread->t_flag |= T_WOULDBLOCK;
314 		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
315 		    "rfs_setattr_end:(%S)", "delegated");
316 		return;
317 	}
318 
319 	if (!error) {
320 		va.va_mask = AT_ALL;	/* get everything */
321 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
322 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
323 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
324 
325 		/* check for overflows */
326 		if (!error) {
327 			acl_perm(vp, exi, &va, cr);
328 			error = vattr_to_nattr(&va, &ns->ns_attr);
329 		}
330 	}
331 
332 	ct.cc_flags = 0;
333 
334 	/*
335 	 * Force modified metadata out to stable storage.
336 	 */
337 	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
338 
339 	VN_RELE(vp);
340 
341 	ns->ns_status = puterrno(error);
342 
343 	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
344 }
345 void *
346 rfs_setattr_getfh(struct nfssaargs *args)
347 {
348 	return (&args->saa_fh);
349 }
350 
351 /*
352  * Directory lookup.
353  * Returns an fhandle and file attributes for file name in a directory.
354  */
355 /* ARGSUSED */
356 void
357 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
358 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
359 {
360 	int error;
361 	vnode_t *dvp;
362 	vnode_t *vp;
363 	struct vattr va;
364 	fhandle_t *fhp = da->da_fhandle;
365 	struct sec_ol sec = {0, 0};
366 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
367 
368 	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");
369 
370 	/*
371 	 * Trusted Extensions doesn't support NFSv2. MOUNT
372 	 * will reject v2 clients. Need to prevent v2 client
373 	 * access via WebNFS here.
374 	 */
375 	if (is_system_labeled() && req->rq_vers == 2) {
376 		dr->dr_status = NFSERR_ACCES;
377 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
378 		    "rfs_lookup_end:(%S)", "access");
379 		return;
380 	}
381 
382 	/*
383 	 * Disallow NULL paths
384 	 */
385 	if (da->da_name == NULL || *da->da_name == '\0') {
386 		dr->dr_status = NFSERR_ACCES;
387 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
388 		    "rfs_lookup_end:(%S)", "access");
389 		return;
390 	}
391 
392 	/*
393 	 * Allow lookups from the root - the default
394 	 * location of the public filehandle.
395 	 */
396 	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
397 		dvp = rootdir;
398 		VN_HOLD(dvp);
399 	} else {
400 		dvp = nfs_fhtovp(fhp, exi);
401 		if (dvp == NULL) {
402 			dr->dr_status = NFSERR_STALE;
403 			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
404 			    "rfs_lookup_end:(%S)", "stale");
405 			return;
406 		}
407 	}
408 
409 	/*
410 	 * Do not allow lookup beyond the root.
411 	 * If the filehandle matches a filehandle of the exi,
412 	 * then the ".." refers beyond the root of an exported filesystem.
413 	 */
414 	if (strcmp(da->da_name, "..") == 0 &&
415 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
416 		VN_RELE(dvp);
417 		dr->dr_status = NFSERR_NOENT;
418 		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
419 		    "rfs_lookup_end:(%S)", "noent");
420 		return;
421 	}
422 
423 	/*
424 	 * If the public filehandle is used then allow
425 	 * a multi-component lookup, i.e. evaluate
426 	 * a pathname and follow symbolic links if
427 	 * necessary.
428 	 *
429 	 * This may result in a vnode in another filesystem
430 	 * which is OK as long as the filesystem is exported.
431 	 */
432 	if (PUBLIC_FH2(fhp)) {
433 		publicfh_flag = TRUE;
434 		error = rfs_publicfh_mclookup(da->da_name, dvp, cr, &vp, &exi,
435 		    &sec);
436 	} else {
437 		/*
438 		 * Do a normal single component lookup.
439 		 */
440 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
441 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
442 		    NULL, NULL, NULL);
443 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
444 	}
445 
446 	if (!error) {
447 		va.va_mask = AT_ALL;	/* we want everything */
448 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
449 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
450 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
451 		/* check for overflows */
452 		if (!error) {
453 			acl_perm(vp, exi, &va, cr);
454 			error = vattr_to_nattr(&va, &dr->dr_attr);
455 			if (!error) {
456 				if (sec.sec_flags & SEC_QUERY)
457 					error = makefh_ol(&dr->dr_fhandle, exi,
458 					    sec.sec_index);
459 				else {
460 					error = makefh(&dr->dr_fhandle, vp,
461 					    exi);
462 					if (!error && publicfh_flag &&
463 					    !chk_clnt_sec(exi, req))
464 						auth_weak = TRUE;
465 				}
466 			}
467 		}
468 		VN_RELE(vp);
469 	}
470 
471 	VN_RELE(dvp);
472 
473 	/*
474 	 * If publicfh_flag is true then we have called rfs_publicfh_mclookup
475 	 * and have obtained a new exportinfo in exi which needs to be
476 	 * released. Note that the original exportinfo pointed to by exi
477 	 * will be released by the caller, common_dispatch.
478 	 */
479 	if (publicfh_flag && exi != NULL)
480 		exi_rele(exi);
481 
482 	/*
483 	 * If it's public fh, no 0x81, and client's flavor is
484 	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
485 	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
486 	 */
487 	if (auth_weak)
488 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
489 	else
490 		dr->dr_status = puterrno(error);
491 
492 	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
493 }
494 void *
495 rfs_lookup_getfh(struct nfsdiropargs *da)
496 {
497 	return (da->da_fhandle);
498 }
499 
500 /*
501  * Read symbolic link.
502  * Returns the string in the symbolic link at the given fhandle.
503  */
504 /* ARGSUSED */
505 void
506 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
507 	struct svc_req *req, cred_t *cr)
508 {
509 	int error;
510 	struct iovec iov;
511 	struct uio uio;
512 	vnode_t *vp;
513 	struct vattr va;
514 
515 	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");
516 
517 	vp = nfs_fhtovp(fhp, exi);
518 	if (vp == NULL) {
519 		rl->rl_data = NULL;
520 		rl->rl_status = NFSERR_STALE;
521 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
522 		    "rfs_readlink_end:(%S)", "stale");
523 		return;
524 	}
525 
526 	va.va_mask = AT_MODE;
527 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
528 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
529 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
530 
531 	if (error) {
532 		VN_RELE(vp);
533 		rl->rl_data = NULL;
534 		rl->rl_status = puterrno(error);
535 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
536 		    "rfs_readlink_end:(%S)", "getattr error");
537 		return;
538 	}
539 
540 	if (MANDLOCK(vp, va.va_mode)) {
541 		VN_RELE(vp);
542 		rl->rl_data = NULL;
543 		rl->rl_status = NFSERR_ACCES;
544 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
545 		    "rfs_readlink_end:(%S)", "access");
546 		return;
547 	}
548 
549 	/*
550 	 * XNFS and RFC1094 require us to return ENXIO if argument
551 	 * is not a link. BUGID 1138002.
552 	 */
553 	if (vp->v_type != VLNK) {
554 		VN_RELE(vp);
555 		rl->rl_data = NULL;
556 		rl->rl_status = NFSERR_NXIO;
557 		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
558 		    "rfs_readlink_end:(%S)", "nxio");
559 		return;
560 	}
561 
562 	/*
563 	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
564 	 */
565 	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
566 
567 	/*
568 	 * Set up io vector to read sym link data
569 	 */
570 	iov.iov_base = rl->rl_data;
571 	iov.iov_len = NFS_MAXPATHLEN;
572 	uio.uio_iov = &iov;
573 	uio.uio_iovcnt = 1;
574 	uio.uio_segflg = UIO_SYSSPACE;
575 	uio.uio_extflg = UIO_COPY_CACHED;
576 	uio.uio_loffset = (offset_t)0;
577 	uio.uio_resid = NFS_MAXPATHLEN;
578 
579 	/*
580 	 * Do the readlink.
581 	 */
582 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
583 	error = VOP_READLINK(vp, &uio, cr, NULL);
584 	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");
585 
586 #if 0 /* notyet */
587 	/*
588 	 * Don't do this.  It causes local disk writes when just
589 	 * reading the file and the overhead is deemed larger
590 	 * than the benefit.
591 	 */
592 	/*
593 	 * Force modified metadata out to stable storage.
594 	 */
595 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
596 #endif
597 
598 	VN_RELE(vp);
599 
600 	rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
601 
602 	/*
603 	 * XNFS and RFC1094 require us to return ENXIO if argument
604 	 * is not a link. UFS returns EINVAL if this is the case,
605 	 * so we do the mapping here. BUGID 1138002.
606 	 */
607 	if (error == EINVAL)
608 		rl->rl_status = NFSERR_NXIO;
609 	else
610 		rl->rl_status = puterrno(error);
611 
612 	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
613 	    "rfs_readlink_end:(%S)", "done");
614 }
615 void *
616 rfs_readlink_getfh(fhandle_t *fhp)
617 {
618 	return (fhp);
619 }
620 /*
621  * Free data allocated by rfs_readlink
622  */
623 void
624 rfs_rlfree(struct nfsrdlnres *rl)
625 {
626 	if (rl->rl_data != NULL)
627 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
628 }
629 
630 /*
631  * Read data.
632  * Returns some data read from the file at the given fhandle.
633  */
634 /* ARGSUSED */
635 void
636 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
637 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
638 {
639 	vnode_t *vp;
640 	int error;
641 	struct vattr va;
642 	struct iovec iov;
643 	struct uio uio;
644 	mblk_t *mp;
645 	int alloc_err = 0;
646 	int in_crit = 0;
647 	caller_context_t ct;
648 
649 	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");
650 
651 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
652 	if (vp == NULL) {
653 		rr->rr_data = NULL;
654 		rr->rr_status = NFSERR_STALE;
655 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
656 		    "rfs_read_end:(%S)", "stale");
657 		return;
658 	}
659 
660 	if (vp->v_type != VREG) {
661 		VN_RELE(vp);
662 		rr->rr_data = NULL;
663 		rr->rr_status = NFSERR_ISDIR;
664 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
665 		    "rfs_read_end:(%S)", "isdir");
666 		return;
667 	}
668 
669 	ct.cc_sysid = 0;
670 	ct.cc_pid = 0;
671 	ct.cc_caller_id = nfs2_srv_caller_id;
672 	ct.cc_flags = CC_DONTBLOCK;
673 
674 	/*
675 	 * Enter the critical region before calling VOP_RWLOCK
676 	 * to avoid a deadlock with write requests.
677 	 */
678 	if (nbl_need_check(vp)) {
679 		nbl_start_crit(vp, RW_READER);
680 		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
681 		    0, NULL)) {
682 			nbl_end_crit(vp);
683 			VN_RELE(vp);
684 			rr->rr_data = NULL;
685 			rr->rr_status = NFSERR_ACCES;
686 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
687 			    "rfs_read_end:(%S)", " csf access error");
688 			return;
689 		}
690 		in_crit = 1;
691 	}
692 
693 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
694 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
695 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
696 
697 	/* check if a monitor detected a delegation conflict */
698 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
699 		VN_RELE(vp);
700 		/* mark as wouldblock so response is dropped */
701 		curthread->t_flag |= T_WOULDBLOCK;
702 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
703 		    "rfs_read_end:(%S)", "delegated");
704 		rr->rr_data = NULL;
705 		return;
706 	}
707 
708 	va.va_mask = AT_ALL;
709 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
710 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
711 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
712 
713 	if (error) {
714 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
715 		    "vop_rwunlock_start:");
716 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
717 		if (in_crit)
718 			nbl_end_crit(vp);
719 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
720 		VN_RELE(vp);
721 		rr->rr_data = NULL;
722 		rr->rr_status = puterrno(error);
723 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
724 		    "rfs_read_end:(%S)", "getattr error");
725 		return;
726 	}
727 
728 	/*
729 	 * This is a kludge to allow reading of files created
730 	 * with no read permission.  The owner of the file
731 	 * is always allowed to read it.
732 	 */
733 	if (crgetuid(cr) != va.va_uid) {
734 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
735 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
736 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
737 		if (error) {
738 			/*
739 			 * Exec is the same as read over the net because
740 			 * of demand loading.
741 			 */
742 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
743 			    "vop_access_start:");
744 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
745 			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
746 			    "vop_access_end:");
747 		}
748 		if (error) {
749 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
750 			    "vop_rwunlock_start:");
751 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
752 			if (in_crit)
753 				nbl_end_crit(vp);
754 			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
755 			    "vop_rwunlock_end:");
756 			VN_RELE(vp);
757 			rr->rr_data = NULL;
758 			rr->rr_status = puterrno(error);
759 			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
760 			    "rfs_read_end:(%S)", "access error");
761 			return;
762 		}
763 	}
764 
765 	if (MANDLOCK(vp, va.va_mode)) {
766 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
767 		    "vop_rwunlock_start:");
768 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
769 		if (in_crit)
770 			nbl_end_crit(vp);
771 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
772 		VN_RELE(vp);
773 		rr->rr_data = NULL;
774 		rr->rr_status = NFSERR_ACCES;
775 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
776 		    "rfs_read_end:(%S)", "mand lock");
777 		return;
778 	}
779 
780 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
781 		rr->rr_count = 0;
782 		rr->rr_data = NULL;
783 		/*
784 		 * In this case, status is NFS_OK, but there is no data
785 		 * to encode. So set rr_mp to NULL.
786 		 */
787 		rr->rr_mp = NULL;
788 		goto done;
789 	}
790 
791 	/*
792 	 * mp will contain the data to be sent out in the read reply.
793 	 * This will be freed after the reply has been sent out (by the
794 	 * driver).
795 	 * Let's round the data up to a BYTES_PER_XDR_UNIT multiple, so
796 	 * that the call to xdrmblk_putmblk() never fails.
797 	 */
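	/*
	 * Example (illustrative): BYTES_PER_XDR_UNIT is 4, so a 5 byte
	 * request allocates RNDUP(5) == 8 bytes; xdrmblk_putmblk() wants
	 * the mblk length aligned to the XDR unit, hence the roundup.
	 */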
798 	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
799 	    &alloc_err);
800 	ASSERT(mp != NULL);
801 	ASSERT(alloc_err == 0);
802 
803 	rr->rr_mp = mp;
804 
805 	/*
806 	 * Set up io vector
807 	 */
808 	iov.iov_base = (caddr_t)mp->b_datap->db_base;
809 	iov.iov_len = ra->ra_count;
810 	uio.uio_iov = &iov;
811 	uio.uio_iovcnt = 1;
812 	uio.uio_segflg = UIO_SYSSPACE;
813 	uio.uio_extflg = UIO_COPY_CACHED;
814 	uio.uio_loffset = (offset_t)ra->ra_offset;
815 	uio.uio_resid = ra->ra_count;
816 
817 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
818 	error = VOP_READ(vp, &uio, 0, cr, &ct);
819 	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");
820 
821 	if (error) {
822 		freeb(mp);
823 
824 		/*
825 		 * check if a monitor detected a delegation conflict and
826 		 * mark as wouldblock so response is dropped
827 		 */
828 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
829 			curthread->t_flag |= T_WOULDBLOCK;
830 		else
831 			rr->rr_status = puterrno(error);
832 
833 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
834 		    "vop_rwunlock_start:");
835 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
836 		if (in_crit)
837 			nbl_end_crit(vp);
838 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
839 		VN_RELE(vp);
840 		rr->rr_data = NULL;
841 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
842 		    "rfs_read_end:(%S)", "read error");
843 		return;
844 	}
845 
846 	/*
847 	 * Get attributes again so we can send the latest access
848 	 * time to the client side for its cache.
849 	 */
850 	va.va_mask = AT_ALL;
851 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
852 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
853 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
854 	if (error) {
855 		freeb(mp);
856 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
857 		    "vop_rwunlock_start:");
858 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
859 		if (in_crit)
860 			nbl_end_crit(vp);
861 		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
862 		    "vop_rwunlock_end:");
863 		VN_RELE(vp);
864 		rr->rr_data = NULL;
865 		rr->rr_status = puterrno(error);
866 		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
867 		    "rfs_read_end:(%S)", "getattr error");
868 		return;
869 	}
870 
871 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
872 
873 	rr->rr_data = (char *)mp->b_datap->db_base;
874 
875 done:
876 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
877 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
878 	if (in_crit)
879 		nbl_end_crit(vp);
880 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
881 
882 	acl_perm(vp, exi, &va, cr);
883 
884 	/* check for overflows */
885 	error = vattr_to_nattr(&va, &rr->rr_attr);
886 
887 #if 0 /* notyet */
888 	/*
889 	 * Don't do this.  It causes local disk writes when just
890 	 * reading the file and the overhead is deemed larger
891 	 * than the benefit.
892 	 */
893 	/*
894 	 * Force modified metadata out to stable storage.
895 	 */
896 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
897 #endif
898 
899 	VN_RELE(vp);
900 
901 	rr->rr_status = puterrno(error);
902 
903 	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
904 }
905 
906 /*
907  * Free data allocated by rfs_read
908  */
909 void
910 rfs_rdfree(struct nfsrdresult *rr)
911 {
912 	mblk_t *mp;
913 
914 	if (rr->rr_status == NFS_OK) {
915 		mp = rr->rr_mp;
916 		if (mp != NULL)
917 			freeb(mp);
918 	}
919 }
920 
921 void *
922 rfs_read_getfh(struct nfsreadargs *ra)
923 {
924 	return (&ra->ra_fhandle);
925 }
926 
927 #define	MAX_IOVECS	12
928 
929 #ifdef DEBUG
930 static int rfs_write_sync_hits = 0;
931 static int rfs_write_sync_misses = 0;
932 #endif
933 
934 /*
935  * Write data to file.
936  * Returns attributes of a file after writing some data to it.
937  *
938  * Any changes made here, especially in error handling, might also
939  * have to be made in rfs_write (which clusters write requests).
940  */
941 void
942 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
943 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
944 {
945 	int error;
946 	vnode_t *vp;
947 	rlim64_t rlimit;
948 	struct vattr va;
949 	struct uio uio;
950 	struct iovec iov[MAX_IOVECS];
951 	mblk_t *m;
952 	struct iovec *iovp;
953 	int iovcnt;
954 	cred_t *savecred;
955 	int in_crit = 0;
956 	caller_context_t ct;
957 
958 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");
959 
960 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
961 	if (vp == NULL) {
962 		ns->ns_status = NFSERR_STALE;
963 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
964 		    "rfs_write_end:(%S)", "stale");
965 		return;
966 	}
967 
968 	if (rdonly(exi, req)) {
969 		VN_RELE(vp);
970 		ns->ns_status = NFSERR_ROFS;
971 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
972 		    "rfs_write_end:(%S)", "rofs");
973 		return;
974 	}
975 
976 	if (vp->v_type != VREG) {
977 		VN_RELE(vp);
978 		ns->ns_status = NFSERR_ISDIR;
979 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
980 		    "rfs_write_end:(%S)", "isdir");
981 		return;
982 	}
983 
984 	ct.cc_sysid = 0;
985 	ct.cc_pid = 0;
986 	ct.cc_caller_id = nfs2_srv_caller_id;
987 	ct.cc_flags = CC_DONTBLOCK;
988 
989 	va.va_mask = AT_UID|AT_MODE;
990 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
991 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
992 	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
993 
994 	if (error) {
995 		VN_RELE(vp);
996 		ns->ns_status = puterrno(error);
997 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
998 		    "rfs_write_end:(%S)", "getattr error");
999 		return;
1000 	}
1001 
1002 	if (crgetuid(cr) != va.va_uid) {
1003 		/*
1004 		 * This is a kludge to allow writes of files created
1005 		 * with read only permission.  The owner of the file
1006 		 * is always allowed to write it.
1007 		 */
1008 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
1009 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1010 		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
1011 		if (error) {
1012 			VN_RELE(vp);
1013 			ns->ns_status = puterrno(error);
1014 			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1015 			    "rfs_write_end:(%S)", "access error");
1016 			return;
1017 		}
1018 	}
1019 
1020 	/*
1021 	 * Can't access a mandatory lock file.  This might cause
1022 	 * Can't access a file under mandatory locking.  This might cause
1023 	 * lock to be released that will never be released.
1024 	 */
1025 	if (MANDLOCK(vp, va.va_mode)) {
1026 		VN_RELE(vp);
1027 		ns->ns_status = NFSERR_ACCES;
1028 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1029 		    "rfs_write_end:(%S)", "mand lock");
1030 		return;
1031 	}
1032 
1033 	/*
1034 	 * We have to enter the critical region before calling VOP_RWLOCK
1035 	 * to avoid a deadlock with ufs.
1036 	 */
1037 	if (nbl_need_check(vp)) {
1038 		nbl_start_crit(vp, RW_READER);
1039 		in_crit = 1;
1040 		if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1041 		    wa->wa_count, 0, NULL)) {
1042 			error = EACCES;
1043 			goto out;
1044 		}
1045 	}
1046 
1047 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
1048 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1049 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
1050 
1051 	/* check if a monitor detected a delegation conflict */
1052 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
1053 		VN_RELE(vp);
1054 		/* mark as wouldblock so response is dropped */
1055 		curthread->t_flag |= T_WOULDBLOCK;
1056 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1057 		    "rfs_write_end:(%S)", "delegated");
1058 		return;
1059 	}
1060 
1061 	if (wa->wa_data) {
1062 		iov[0].iov_base = wa->wa_data;
1063 		iov[0].iov_len = wa->wa_count;
1064 		uio.uio_iov = iov;
1065 		uio.uio_iovcnt = 1;
1066 		uio.uio_segflg = UIO_SYSSPACE;
1067 		uio.uio_extflg = UIO_COPY_DEFAULT;
1068 		uio.uio_loffset = (offset_t)wa->wa_offset;
1069 		uio.uio_resid = wa->wa_count;
1070 		/*
1071 		 * The limit is checked on the client. We
1072 		 * should allow any size writes here.
1073 		 */
1074 		uio.uio_llimit = curproc->p_fsz_ctl;
1075 		rlimit = uio.uio_llimit - wa->wa_offset;
1076 		if (rlimit < (rlim64_t)uio.uio_resid)
1077 			uio.uio_resid = (uint_t)rlimit;
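		/*
		 * Illustrative numbers: with a file-size limit of 2GB and
		 * a write starting at offset 2GB - 4KB, an 8KB request is
		 * clamped to a 4KB resid so the limit is never exceeded.
		 */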
1078 
1079 		/*
1080 		 * For now we assume no append mode.
1081 		 */
1082 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1083 		    "vop_write_start:(%S)", "sync");
1084 		/*
1085 		 * We're changing creds because VM may fault and we need
1086 		 * the cred of the current thread to be used if quota
1087 		 * checking is enabled.
1088 		 */
1089 		savecred = curthread->t_cred;
1090 		curthread->t_cred = cr;
1091 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1092 		curthread->t_cred = savecred;
1093 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1094 	} else {
1095 		iovcnt = 0;
1096 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1097 			iovcnt++;
1098 		if (iovcnt <= MAX_IOVECS) {
1099 #ifdef DEBUG
1100 			rfs_write_sync_hits++;
1101 #endif
1102 			iovp = iov;
1103 		} else {
1104 #ifdef DEBUG
1105 			rfs_write_sync_misses++;
1106 #endif
1107 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1108 		}
1109 		mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1110 		uio.uio_iov = iovp;
1111 		uio.uio_iovcnt = iovcnt;
1112 		uio.uio_segflg = UIO_SYSSPACE;
1113 		uio.uio_extflg = UIO_COPY_DEFAULT;
1114 		uio.uio_loffset = (offset_t)wa->wa_offset;
1115 		uio.uio_resid = wa->wa_count;
1116 		/*
1117 		 * The limit is checked on the client. We
1118 		 * should allow any size writes here.
1119 		 */
1120 		uio.uio_llimit = curproc->p_fsz_ctl;
1121 		rlimit = uio.uio_llimit - wa->wa_offset;
1122 		if (rlimit < (rlim64_t)uio.uio_resid)
1123 			uio.uio_resid = (uint_t)rlimit;
1124 
1125 		/*
1126 		 * For now we assume no append mode.
1127 		 */
1128 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1129 		    "vop_write_start:(%S)", "iov sync");
1130 		/*
1131 		 * We're changing creds because VM may fault and we need
1132 		 * the cred of the current thread to be used if quota
1133 		 * checking is enabled.
1134 		 */
1135 		savecred = curthread->t_cred;
1136 		curthread->t_cred = cr;
1137 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1138 		curthread->t_cred = savecred;
1139 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1140 
1141 		if (iovp != iov)
1142 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
1143 	}
1144 
1145 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1146 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1147 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1148 
1149 	if (!error) {
1150 		/*
1151 		 * Get attributes again so we send the latest mod
1152 		 * time to the client side for its cache.
1153 		 */
1154 		va.va_mask = AT_ALL;	/* now we want everything */
1155 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1156 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1157 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1158 		/* check for overflows */
1159 		if (!error) {
1160 			acl_perm(vp, exi, &va, cr);
1161 			error = vattr_to_nattr(&va, &ns->ns_attr);
1162 		}
1163 	}
1164 
1165 out:
1166 	if (in_crit)
1167 		nbl_end_crit(vp);
1168 	VN_RELE(vp);
1169 
1170 	/* check if a monitor detected a delegation conflict */
1171 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1172 		/* mark as wouldblock so response is dropped */
1173 		curthread->t_flag |= T_WOULDBLOCK;
1174 	else
1175 		ns->ns_status = puterrno(error);
1176 
1177 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
1178 }
1179 
1180 struct rfs_async_write {
1181 	struct nfswriteargs *wa;
1182 	struct nfsattrstat *ns;
1183 	struct svc_req *req;
1184 	cred_t *cr;
1185 	kthread_t *thread;
1186 	struct rfs_async_write *list;
1187 };
1188 
1189 struct rfs_async_write_list {
1190 	fhandle_t *fhp;
1191 	kcondvar_t cv;
1192 	struct rfs_async_write *list;
1193 	struct rfs_async_write_list *next;
1194 };
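/*
 * Sketch of the clustering structures above (layout for illustration):
 *
 *	rfs_async_write_head --> [fh A] --> [fh B] --> NULL
 *	                           |
 *	                         list --> req(off 0) --> req(off 8K) --> NULL
 *
 * There is one rfs_async_write_list node per file currently being written;
 * each carries that file's pending requests sorted by starting offset so
 * that contiguous runs can later be gathered into a single VOP_WRITE.
 */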
1195 
1196 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1197 static kmutex_t rfs_async_write_lock;
1198 static int rfs_write_async = 1;	/* enables write clustering if == 1 */
1199 
1200 #define	MAXCLIOVECS	42
1201 #define	RFSWRITE_INITVAL (enum nfsstat) -1
1202 
1203 #ifdef DEBUG
1204 static int rfs_write_hits = 0;
1205 static int rfs_write_misses = 0;
1206 #endif
1207 
1208 /*
1209  * Write data to file.
1210  * Returns attributes of a file after writing some data to it.
1211  */
1212 void
1213 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1214 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1215 {
1216 	int error;
1217 	vnode_t *vp;
1218 	rlim64_t rlimit;
1219 	struct vattr va;
1220 	struct uio uio;
1221 	struct rfs_async_write_list *lp;
1222 	struct rfs_async_write_list *nlp;
1223 	struct rfs_async_write *rp;
1224 	struct rfs_async_write *nrp;
1225 	struct rfs_async_write *trp;
1226 	struct rfs_async_write *lrp;
1227 	int data_written;
1228 	int iovcnt;
1229 	mblk_t *m;
1230 	struct iovec *iovp;
1231 	struct iovec *niovp;
1232 	struct iovec iov[MAXCLIOVECS];
1233 	int count;
1234 	int rcount;
1235 	uint_t off;
1236 	uint_t len;
1237 	struct rfs_async_write nrpsp;
1238 	struct rfs_async_write_list nlpsp;
1239 	ushort_t t_flag;
1240 	cred_t *savecred;
1241 	int in_crit = 0;
1242 	caller_context_t ct;
1243 
1244 	if (!rfs_write_async) {
1245 		rfs_write_sync(wa, ns, exi, req, cr);
1246 		return;
1247 	}
1248 
1249 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
1250 	    "rfs_write_start:(%S)", "async");
1251 
1252 	/*
1253 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since a value
1254 	 * of 0 is considered NFS_OK.
1255 	 */
1256 	ns->ns_status = RFSWRITE_INITVAL;
1257 
1258 	nrp = &nrpsp;
1259 	nrp->wa = wa;
1260 	nrp->ns = ns;
1261 	nrp->req = req;
1262 	nrp->cr = cr;
1263 	nrp->thread = curthread;
1264 
1265 	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1266 
1267 	/*
1268 	 * Look to see if there is already a cluster started
1269 	 * for this file.
1270 	 */
1271 	mutex_enter(&rfs_async_write_lock);
1272 	for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1273 		if (bcmp(&wa->wa_fhandle, lp->fhp,
1274 		    sizeof (fhandle_t)) == 0)
1275 			break;
1276 	}
1277 
1278 	/*
1279 	 * If lp is non-NULL, then there is already a cluster
1280 	 * started.  We need to place ourselves in the cluster
1281 	 * list in the right place as determined by starting
1282 	 * offset.  Conflicts with non-blocking mandatory locked
1283 	 * regions will be checked when the cluster is processed.
1284 	 */
1285 	if (lp != NULL) {
1286 		rp = lp->list;
1287 		trp = NULL;
1288 		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1289 			trp = rp;
1290 			rp = rp->list;
1291 		}
1292 		nrp->list = rp;
1293 		if (trp == NULL)
1294 			lp->list = nrp;
1295 		else
1296 			trp->list = nrp;
1297 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1298 			cv_wait(&lp->cv, &rfs_async_write_lock);
1299 		mutex_exit(&rfs_async_write_lock);
1300 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1301 		    "rfs_write_end:(%S)", "cluster child");
1302 		return;
1303 	}
1304 
1305 	/*
1306 	 * No cluster has been started yet; start one and add ourselves
1307 	 * to the list of clusters.
1308 	 */
1309 	nrp->list = NULL;
1310 
1311 	nlp = &nlpsp;
1312 	nlp->fhp = &wa->wa_fhandle;
1313 	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1314 	nlp->list = nrp;
1315 	nlp->next = NULL;
1316 
1317 	if (rfs_async_write_head == NULL) {
1318 		rfs_async_write_head = nlp;
1319 	} else {
1320 		lp = rfs_async_write_head;
1321 		while (lp->next != NULL)
1322 			lp = lp->next;
1323 		lp->next = nlp;
1324 	}
1325 	mutex_exit(&rfs_async_write_lock);
1326 
1327 	/*
1328 	 * Convert the file handle common to all of the requests
1329 	 * in this cluster to a vnode.
1330 	 */
1331 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1332 	if (vp == NULL) {
1333 		mutex_enter(&rfs_async_write_lock);
1334 		if (rfs_async_write_head == nlp)
1335 			rfs_async_write_head = nlp->next;
1336 		else {
1337 			lp = rfs_async_write_head;
1338 			while (lp->next != nlp)
1339 				lp = lp->next;
1340 			lp->next = nlp->next;
1341 		}
1342 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1343 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1344 			rp->ns->ns_status = NFSERR_STALE;
1345 			rp->thread->t_flag |= t_flag;
1346 		}
1347 		cv_broadcast(&nlp->cv);
1348 		mutex_exit(&rfs_async_write_lock);
1349 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1350 		    "rfs_write_end:(%S)", "stale");
1351 		return;
1352 	}
1353 
1354 	/*
1355 	 * Can only write regular files.  Attempts to write any
1356 	 * other file types fail with EISDIR.
1357 	 */
1358 	if (vp->v_type != VREG) {
1359 		VN_RELE(vp);
1360 		mutex_enter(&rfs_async_write_lock);
1361 		if (rfs_async_write_head == nlp)
1362 			rfs_async_write_head = nlp->next;
1363 		else {
1364 			lp = rfs_async_write_head;
1365 			while (lp->next != nlp)
1366 				lp = lp->next;
1367 			lp->next = nlp->next;
1368 		}
1369 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1370 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1371 			rp->ns->ns_status = NFSERR_ISDIR;
1372 			rp->thread->t_flag |= t_flag;
1373 		}
1374 		cv_broadcast(&nlp->cv);
1375 		mutex_exit(&rfs_async_write_lock);
1376 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1377 		    "rfs_write_end:(%S)", "isdir");
1378 		return;
1379 	}
1380 
1381 	/*
1382 	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1383 	 * deadlock with ufs.
1384 	 */
1385 	if (nbl_need_check(vp)) {
1386 		nbl_start_crit(vp, RW_READER);
1387 		in_crit = 1;
1388 	}
1389 
1390 	ct.cc_sysid = 0;
1391 	ct.cc_pid = 0;
1392 	ct.cc_caller_id = nfs2_srv_caller_id;
1393 	ct.cc_flags = CC_DONTBLOCK;
1394 
1395 	/*
1396 	 * Lock the file for writing.  This operation provides
1397 	 * the delay which allows clusters to grow.
1398 	 */
1399 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
1400 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1401 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");
1402 
1403 	/* check if a monitor detected a delegation conflict */
1404 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
1405 		VN_RELE(vp);
1406 		/* mark as wouldblock so response is dropped */
1407 		curthread->t_flag |= T_WOULDBLOCK;
1408 		mutex_enter(&rfs_async_write_lock);
1409 		for (rp = nlp->list; rp != NULL; rp = rp->list) {
1410 			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1411 				rp->ns->ns_status = puterrno(error);
1412 				rp->thread->t_flag |= T_WOULDBLOCK;
1413 			}
1414 		}
1415 		cv_broadcast(&nlp->cv);
1416 		mutex_exit(&rfs_async_write_lock);
1417 		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
1418 		    "rfs_write_end:(%S)", "delegated");
1419 		return;
1420 	}
1421 
1422 	/*
1423 	 * Disconnect this cluster from the list of clusters.
1424 	 * The cluster that is being dealt with must be fixed
1425 	 * in size after this point, so there is no reason
1426 	 * to leave it on the list so that new requests can
1427 	 * find it.
1428 	 *
1429 	 * The algorithm is that the first write request will
1430 	 * create a cluster, convert the file handle to a
1431 	 * vnode pointer, and then lock the file for writing.
1432 	 * This request is not likely to be clustered with
1433 	 * any others.  However, the next request will create
1434 	 * a new cluster and be blocked in VOP_RWLOCK while
1435 	 * the first request is being processed.  This delay
1436 	 * will allow more requests to be clustered in this
1437 	 * second cluster.
1438 	 */
1439 	mutex_enter(&rfs_async_write_lock);
1440 	if (rfs_async_write_head == nlp)
1441 		rfs_async_write_head = nlp->next;
1442 	else {
1443 		lp = rfs_async_write_head;
1444 		while (lp->next != nlp)
1445 			lp = lp->next;
1446 		lp->next = nlp->next;
1447 	}
1448 	mutex_exit(&rfs_async_write_lock);
1449 
1450 	/*
1451 	 * Step through the list of requests in this cluster.
1452 	 * We need to check permissions to make sure that all
1453 	 * of the requests have sufficient permission to write
1454 	 * the file.  A cluster can be composed of requests
1455 	 * from different clients and different users on each
1456 	 * client.
1457 	 *
1458 	 * As a side effect, we also calculate the size of the
1459 	 * byte range that this cluster encompasses.
1460 	 */
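	/*
	 * Worked example: two requests covering [0, 8K) and [8K, 16K)
	 * yield off == 0 and len == 16K -- the union of the cluster's
	 * byte range, which the VOP_PUTPAGE below must later flush.
	 */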
1461 	rp = nlp->list;
1462 	off = rp->wa->wa_offset;
1463 	len = (uint_t)0;
1464 	do {
1465 		if (rdonly(exi, rp->req)) {
1466 			rp->ns->ns_status = NFSERR_ROFS;
1467 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1468 			rp->thread->t_flag |= t_flag;
1469 			continue;
1470 		}
1471 
1472 		va.va_mask = AT_UID|AT_MODE;
1473 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
1474 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1475 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
1476 		if (!error) {
1477 			if (crgetuid(rp->cr) != va.va_uid) {
1478 				/*
1479 				 * This is a kludge to allow writes of files
1480 				 * created with read only permission.  The
1481 				 * owner of the file is always allowed to
1482 				 * write it.
1483 				 */
1484 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
1485 				    "vop_access_start:");
1486 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1487 				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
1488 				    "vop_access_end:");
1489 			}
1490 			if (!error && MANDLOCK(vp, va.va_mode))
1491 				error = EACCES;
1492 		}
1493 
1494 		/*
1495 		 * Check for a conflict with a nbmand-locked region.
1496 		 */
1497 		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1498 		    rp->wa->wa_count, 0, NULL)) {
1499 			error = EACCES;
1500 		}
1501 
1502 		if (error) {
1503 			rp->ns->ns_status = puterrno(error);
1504 			t_flag = curthread->t_flag & T_WOULDBLOCK;
1505 			rp->thread->t_flag |= t_flag;
1506 			continue;
1507 		}
1508 		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1509 			len = rp->wa->wa_offset + rp->wa->wa_count - off;
1510 	} while ((rp = rp->list) != NULL);
1511 
1512 	/*
1513 	 * Step through the cluster attempting to gather as many
1514 	 * requests which are contiguous as possible.  These
1515 	 * contiguous requests are handled with a single call to VOP_WRITE
1516 	 * instead of a separate call per request.  We also keep
1517 	 * track of the fact that any data was written.
1518 	 */
1519 	rp = nlp->list;
1520 	data_written = 0;
1521 	do {
1522 		/*
1523 		 * Skip any requests which are already marked as having an
1524 		 * error.
1525 		 */
1526 		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1527 			rp = rp->list;
1528 			continue;
1529 		}
1530 
1531 		/*
1532 		 * Count the number of iovec's which are required
1533 		 * to handle this set of requests.  One iovec is
1534 		 * needed for each data buffer, whether addressed
1535 		 * by wa_data or by the b_rptr pointers in the
1536 		 * mblk chains.
1537 		 */
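		/*
		 * Example (illustrative): a contiguous run of three
		 * requests where the first supplies wa_data (1 iovec)
		 * and the other two each carry a two-mblk chain (2
		 * iovecs apiece) requires iovcnt == 5.
		 */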
1538 		iovcnt = 0;
1539 		lrp = rp;
1540 		for (;;) {
1541 			if (lrp->wa->wa_data)
1542 				iovcnt++;
1543 			else {
1544 				m = lrp->wa->wa_mblk;
1545 				while (m != NULL) {
1546 					iovcnt++;
1547 					m = m->b_cont;
1548 				}
1549 			}
1550 			if (lrp->list == NULL ||
1551 			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1552 			    lrp->wa->wa_offset + lrp->wa->wa_count !=
1553 			    lrp->list->wa->wa_offset) {
1554 				lrp = lrp->list;
1555 				break;
1556 			}
1557 			lrp = lrp->list;
1558 		}
1559 
1560 		if (iovcnt <= MAXCLIOVECS) {
1561 #ifdef DEBUG
1562 			rfs_write_hits++;
1563 #endif
1564 			niovp = iov;
1565 		} else {
1566 #ifdef DEBUG
1567 			rfs_write_misses++;
1568 #endif
1569 			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1570 		}
1571 		/*
1572 		 * Put together the scatter/gather iovecs.
1573 		 */
1574 		iovp = niovp;
1575 		trp = rp;
1576 		count = 0;
1577 		do {
1578 			if (trp->wa->wa_data) {
1579 				iovp->iov_base = trp->wa->wa_data;
1580 				iovp->iov_len = trp->wa->wa_count;
1581 				iovp++;
1582 			} else {
1583 				m = trp->wa->wa_mblk;
1584 				rcount = trp->wa->wa_count;
1585 				while (m != NULL) {
1586 					iovp->iov_base = (caddr_t)m->b_rptr;
1587 					iovp->iov_len = (m->b_wptr - m->b_rptr);
1588 					rcount -= iovp->iov_len;
1589 					if (rcount < 0)
1590 						iovp->iov_len += rcount;
1591 					iovp++;
1592 					if (rcount <= 0)
1593 						break;
1594 					m = m->b_cont;
1595 				}
1596 			}
1597 			count += trp->wa->wa_count;
1598 			trp = trp->list;
1599 		} while (trp != lrp);
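		/*
		 * Note (illustrative): an mblk chain may hold slightly
		 * more data than wa_count, since XDR pads opaque data to
		 * 4-byte units; that is why the loop above trims the
		 * final iov_len with rcount rather than trusting b_wptr
		 * alone.
		 */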
1600 
1601 		uio.uio_iov = niovp;
1602 		uio.uio_iovcnt = iovcnt;
1603 		uio.uio_segflg = UIO_SYSSPACE;
1604 		uio.uio_extflg = UIO_COPY_DEFAULT;
1605 		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1606 		uio.uio_resid = count;
1607 		/*
1608 		 * The limit is checked on the client. We
1609 		 * should allow any size writes here.
1610 		 */
1611 		uio.uio_llimit = curproc->p_fsz_ctl;
1612 		rlimit = uio.uio_llimit - rp->wa->wa_offset;
1613 		if (rlimit < (rlim64_t)uio.uio_resid)
1614 			uio.uio_resid = (uint_t)rlimit;
1615 
1616 		/*
1617 		 * For now we assume no append mode.
1618 		 */
1619 		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
1620 		    "vop_write_start:(%S)", "async");
1621 
1622 		/*
1623 		 * We're changing creds because VM may fault
1624 		 * and we need the cred of the current
1625 		 * thread to be used if quota checking is
1626 		 * enabled.
1627 		 */
1628 		savecred = curthread->t_cred;
1629 		curthread->t_cred = cr;
1630 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1631 		curthread->t_cred = savecred;
1632 		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
1633 
1634 		/* check if a monitor detected a delegation conflict */
1635 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1636 			/* mark as wouldblock so response is dropped */
1637 			curthread->t_flag |= T_WOULDBLOCK;
1638 
1639 		if (niovp != iov)
1640 			kmem_free(niovp, sizeof (*niovp) * iovcnt);
1641 
1642 		if (!error) {
1643 			data_written = 1;
1644 			/*
1645 			 * Get attributes again so we send the latest mod
1646 			 * time to the client side for its cache.
1647 			 */
1648 			va.va_mask = AT_ALL;	/* now we want everything */
1649 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1650 			    "vop_getattr_start:");
1651 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1652 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1653 			    "vop_getattr_end:");
1654 			if (!error)
1655 				acl_perm(vp, exi, &va, rp->cr);
1656 		}
1657 
1658 		/*
1659 		 * Fill in the status responses for each request
1660 		 * which was just handled.  Also, copy the latest
1661 		 * attributes into the attribute responses if
1662 		 * appropriate.
1663 		 */
1664 		t_flag = curthread->t_flag & T_WOULDBLOCK;
1665 		do {
1666 			rp->thread->t_flag |= t_flag;
1667 			/* check for overflows */
1668 			if (!error) {
1669 				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1670 			}
1671 			rp->ns->ns_status = puterrno(error);
1672 			rp = rp->list;
1673 		} while (rp != lrp);
1674 	} while (rp != NULL);
1675 
1676 	/*
1677 	 * If any data was written at all, then we need to flush
1678 	 * the data and metadata to stable storage.
1679 	 */
1680 	if (data_written) {
1681 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
1682 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1683 		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
1684 		if (!error) {
1685 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
1686 			    "vop_fsync_start:");
1687 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1688 			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
1689 		}
1690 	}
1691 
1692 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
1693 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1694 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
1695 
1696 	if (in_crit)
1697 		nbl_end_crit(vp);
1698 	VN_RELE(vp);
1699 
1700 	t_flag = curthread->t_flag & T_WOULDBLOCK;
1701 	mutex_enter(&rfs_async_write_lock);
1702 	for (rp = nlp->list; rp != NULL; rp = rp->list) {
1703 		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1704 			rp->ns->ns_status = puterrno(error);
1705 			rp->thread->t_flag |= t_flag;
1706 		}
1707 	}
1708 	cv_broadcast(&nlp->cv);
1709 	mutex_exit(&rfs_async_write_lock);
1710 
1711 	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
1712 }
1713 
1714 void *
1715 rfs_write_getfh(struct nfswriteargs *wa)
1716 {
1717 	return (&wa->wa_fhandle);
1718 }
1719 
1720 /*
1721  * Create a file.
1722  * Creates a file with given attributes and returns those attributes
1723  * and an fhandle for the new file.
1724  */
1725 void
1726 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1727 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1728 {
1729 	int error;
1730 	int lookuperr;
1731 	int in_crit = 0;
1732 	struct vattr va;
1733 	vnode_t *vp;
1734 	vnode_t *realvp;
1735 	vnode_t *dvp;
1736 	char *name = args->ca_da.da_name;
1737 	vnode_t *tvp = NULL;
1738 	int mode;
1739 	int lookup_ok;
1740 	bool_t trunc;
1741 
1742 	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");
1743 
1744 	/*
1745 	 * Disallow NULL paths
1746 	 */
1747 	if (name == NULL || *name == '\0') {
1748 		dr->dr_status = NFSERR_ACCES;
1749 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1750 		    "rfs_create_end:(%S)", "access");
1751 		return;
1752 	}
1753 
1754 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1755 	if (dvp == NULL) {
1756 		dr->dr_status = NFSERR_STALE;
1757 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1758 		    "rfs_create_end:(%S)", "stale");
1759 		return;
1760 	}
1761 
1762 	error = sattr_to_vattr(args->ca_sa, &va);
1763 	if (error) {
		VN_RELE(dvp);
1764 		dr->dr_status = puterrno(error);
1765 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1766 		    "rfs_create_end:(%S)", "sattr");
1767 		return;
1768 	}
1769 
1770 	/*
1771 	 * Must specify the mode.
1772 	 */
1773 	if (!(va.va_mask & AT_MODE)) {
1774 		VN_RELE(dvp);
1775 		dr->dr_status = NFSERR_INVAL;
1776 		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
1777 		    "rfs_create_end:(%S)", "no mode");
1778 		return;
1779 	}
1780 
1781 	/*
1782 	 * This is a completely gross hack to make mknod
1783 	 * work over the wire until we can whack the protocol
1784 	 */
1785 	if ((va.va_mode & IFMT) == IFCHR) {
1786 		if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1787 			va.va_type = VFIFO;	/* xtra kludge for named pipe */
1788 		else {
1789 			va.va_type = VCHR;
1790 			/*
1791 			 * uncompress the received dev_t
1792 			 * if the top half is zero indicating a request
1793 			 * from an `older style' OS.
1794 			 */
1795 			if ((va.va_size & 0xffff0000) == 0)
1796 				va.va_rdev = nfsv2_expdev(va.va_size);
1797 			else
1798 				va.va_rdev = (dev_t)va.va_size;
1799 		}
1800 		va.va_mask &= ~AT_SIZE;
1801 	} else if ((va.va_mode & IFMT) == IFBLK) {
1802 		va.va_type = VBLK;
1803 		/*
1804 		 * uncompress the received dev_t
1805 		 * if the top half is zero indicating a request
1806 		 * from an `older style' OS.
1807 		 */
1808 		if ((va.va_size & 0xffff0000) == 0)
1809 			va.va_rdev = nfsv2_expdev(va.va_size);
1810 		else
1811 			va.va_rdev = (dev_t)va.va_size;
1812 		va.va_mask &= ~AT_SIZE;
1813 	} else if ((va.va_mode & IFMT) == IFSOCK) {
1814 		va.va_type = VSOCK;
1815 	} else
1816 		va.va_type = VREG;
1817 	va.va_mode &= ~IFMT;
1818 	va.va_mask |= AT_TYPE;
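	/*
	 * Examples of the decode above (illustrative): a client creating
	 * a named pipe sends IFCHR with sa_size == NFS_FIFO_DEV and gets
	 * VFIFO; a block device whose dev_t arrives with a zero top half
	 * (e.g. 0x0102, an `older style' 16-bit packed dev) is expanded
	 * by nfsv2_expdev() into a native dev_t.
	 */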
1819 
1820 	/*
1821 	 * Why was the choice made to use VWRITE as the mode to the
1822 	 * call to VOP_CREATE ? This results in a bug.  When a client
1823 	 * opens a file that already exists and is RDONLY, the second
1824 	 * open fails with an EACCES because of the mode.
1825 	 * bug ID 1054648.
1826 	 */
1827 	lookup_ok = 0;
1828 	mode = VWRITE;
1829 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1830 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
1831 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1832 		    NULL, NULL, NULL);
1833 		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
1834 		if (!error) {
1835 			struct vattr at;
1836 
1837 			lookup_ok = 1;
1838 			at.va_mask = AT_MODE;
1839 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1840 			    "vop_getattr_start:");
1841 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1842 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1843 			    "vop_getattr_end:");
1844 			if (!error)
1845 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1846 			VN_RELE(tvp);
1847 			tvp = NULL;
1848 		}
1849 	}
1850 
1851 	if (!lookup_ok) {
1852 		if (rdonly(exi, req)) {
1853 			error = EROFS;
1854 		} else if (va.va_type != VREG && va.va_type != VFIFO &&
1855 		    va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1856 			error = EPERM;
1857 		} else {
1858 			error = 0;
1859 		}
1860 	}
1861 
1862 	/*
1863 	 * If file size is being modified on an already existing file
1864 	 * make sure that there are no conflicting non-blocking mandatory
1865 	 * locks in the region being manipulated. Return EACCES if there
1866 	 * are conflicting locks.
1867 	 */
1868 	if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1869 		lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1870 		    NULL, NULL, NULL);
1871 
1872 		if (!lookuperr &&
1873 		    rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1874 			VN_RELE(tvp);
1875 			curthread->t_flag |= T_WOULDBLOCK;
1876 			goto out;
1877 		}
1878 
1879 		if (!lookuperr && nbl_need_check(tvp)) {
1880 			/*
1881 			 * The file exists. Now check if it has any
1882 			 * conflicting non-blocking mandatory locks
1883 			 * in the region being changed.
1884 			 */
1885 			struct vattr bva;
1886 			u_offset_t offset;
1887 			ssize_t length;
1888 
1889 			nbl_start_crit(tvp, RW_READER);
1890 			in_crit = 1;
1891 
1892 			bva.va_mask = AT_SIZE;
1893 			error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1894 			if (!error) {
1895 				if (va.va_size < bva.va_size) {
1896 					offset = va.va_size;
1897 					length = bva.va_size - va.va_size;
1898 				} else {
1899 					offset = bva.va_size;
1900 					length = va.va_size - bva.va_size;
1901 				}
1902 				if (length) {
1903 					if (nbl_conflict(tvp, NBL_WRITE,
1904 					    offset, length, 0, NULL)) {
1905 						error = EACCES;
1906 					}
1907 				}
1908 			}
1909 			if (error) {
1910 				nbl_end_crit(tvp);
1911 				VN_RELE(tvp);
1912 				in_crit = 0;
1913 			}
1914 		} else if (tvp != NULL) {
1915 			VN_RELE(tvp);
1916 		}
1917 	}
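
	/*
	 * Worked example of the range checked above: truncating an
	 * existing 100-byte file to 0 checks [0, 100) for conflicting
	 * locks; growing it to 150 checks offset 100 for length 50.
	 * When the old and new sizes are equal the length is 0 and no
	 * conflict check is made.
	 */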
1918 
1919 	if (!error) {
1920 		/*
1921 		 * If the filesystem is shared with nosuid, then remove any
1922 		 * setuid/setgid bits on create.
1923 		 */
1924 		if (va.va_type == VREG &&
1925 		    exi->exi_export.ex_flags & EX_NOSUID)
1926 			va.va_mode &= ~(VSUID | VSGID);
1927 
1928 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
1929 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1930 		    NULL, NULL);
1931 		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");
1932 
1933 		if (!error) {
1934 
1935 			if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1936 				trunc = TRUE;
1937 			else
1938 				trunc = FALSE;
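
			/*
			 * Example: a v2 client expresses O_TRUNC as a
			 * create with sa_size 0, so trunc is TRUE here
			 * and the delegation check below treats the
			 * create as a truncating write.
			 */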
1939 
1940 			if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1941 				VN_RELE(vp);
1942 				curthread->t_flag |= T_WOULDBLOCK;
1943 				goto out;
1944 			}
1945 			va.va_mask = AT_ALL;
1946 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
1947 			    "vop_getattr_start:");
1948 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1949 			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
1950 			    "vop_getattr_end:");
1951 			/* check for overflows */
1952 			if (!error) {
1953 				acl_perm(vp, exi, &va, cr);
1954 				error = vattr_to_nattr(&va, &dr->dr_attr);
1955 				if (!error) {
1956 					error = makefh(&dr->dr_fhandle, vp,
1957 					    exi);
1958 				}
1959 			}
1960 			/*
1961 			 * Force modified metadata out to stable storage.
1962 			 *
1963 		 * If an underlying vp exists, pass it to VOP_FSYNC.
1964 			 */
1965 			if (VOP_REALVP(vp, &realvp, NULL) == 0)
1966 				(void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1967 			else
1968 				(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1969 			VN_RELE(vp);
1970 		}
1971 
1972 		if (in_crit) {
1973 			nbl_end_crit(tvp);
1974 			VN_RELE(tvp);
1975 		}
1976 	}
1977 
1978 	/*
1979 	 * Force modified data and metadata out to stable storage.
1980 	 */
1981 	(void) VOP_FSYNC(dvp, 0, cr, NULL);
1982 
1983 out:
1984 
1985 	VN_RELE(dvp);
1986 
1987 	dr->dr_status = puterrno(error);
1988 
1989 	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
1990 }
1991 void *
1992 rfs_create_getfh(struct nfscreatargs *args)
1993 {
1994 	return (args->ca_da.da_fhandle);
1995 }
1996 
1997 /*
1998  * Remove a file.
1999  * Remove named file from parent directory.
2000  */
2001 void
2002 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2003 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2004 {
2005 	int error = 0;
2006 	vnode_t *vp;
2007 	vnode_t *targvp;
2008 	int in_crit = 0;
2009 
2010 	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");
2011 
2012 	/*
2013 	 * Disallow NULL paths
2014 	 */
2015 	if (da->da_name == NULL || *da->da_name == '\0') {
2016 		*status = NFSERR_ACCES;
2017 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2018 		    "rfs_remove_end:(%S)", "access");
2019 		return;
2020 	}
2021 
2022 	vp = nfs_fhtovp(da->da_fhandle, exi);
2023 	if (vp == NULL) {
2024 		*status = NFSERR_STALE;
2025 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2026 		    "rfs_remove_end:(%S)", "stale");
2027 		return;
2028 	}
2029 
2030 	if (rdonly(exi, req)) {
2031 		VN_RELE(vp);
2032 		*status = NFSERR_ROFS;
2033 		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
2034 		    "rfs_remove_end:(%S)", "rofs");
2035 		return;
2036 	}
2037 
2038 	/*
2039 	 * Check for a conflict with a non-blocking mandatory share reservation.
2040 	 */
2041 	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2042 	    NULL, cr, NULL, NULL, NULL);
2043 	if (error != 0) {
2044 		VN_RELE(vp);
2045 		*status = puterrno(error);
2046 		return;
2047 	}
2048 
2049 	/*
2050 	 * If the file is delegated to a v4 client, then initiate
2051 	 * recall and drop this request (by setting T_WOULDBLOCK).
2052 	 * The client will eventually re-transmit the request and
2053 	 * (hopefully), by then, the v4 client will have returned
2054 	 * the delegation.
2055 	 */
2056 
2057 	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2058 		VN_RELE(vp);
2059 		VN_RELE(targvp);
2060 		curthread->t_flag |= T_WOULDBLOCK;
2061 		return;
2062 	}
2063 
2064 	if (nbl_need_check(targvp)) {
2065 		nbl_start_crit(targvp, RW_READER);
2066 		in_crit = 1;
2067 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2068 			error = EACCES;
2069 			goto out;
2070 		}
2071 	}
2072 
2073 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
2074 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2075 	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");
2076 
2077 	/*
2078 	 * Force modified data and metadata out to stable storage.
2079 	 */
2080 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2081 
2082 out:
2083 	if (in_crit)
2084 		nbl_end_crit(targvp);
2085 	VN_RELE(targvp);
2086 	VN_RELE(vp);
2087 
2088 	*status = puterrno(error);
2089 
2090 	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
2091 }
2092 
2093 void *
2094 rfs_remove_getfh(struct nfsdiropargs *da)
2095 {
2096 	return (da->da_fhandle);
2097 }
2098 
2099 /*
2100  * Rename a file.
2101  * Give a file (from) a new name (to).
2102  */
2103 void
2104 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2105 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2106 {
2107 	int error = 0;
2108 	vnode_t *fromvp;
2109 	vnode_t *tovp;
2110 	struct exportinfo *to_exi;
2111 	fhandle_t *fh;
2112 	vnode_t *srcvp;
2113 	vnode_t *targvp;
2114 	int in_crit = 0;
2115 
2116 	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
2117 
2118 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2119 	if (fromvp == NULL) {
2120 		*status = NFSERR_STALE;
2121 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2122 		    "rfs_rename_end:(%S)", "from stale");
2123 		return;
2124 	}
2125 
2126 	fh = args->rna_to.da_fhandle;
2127 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2128 	if (to_exi == NULL) {
2129 		VN_RELE(fromvp);
2130 		*status = NFSERR_ACCES;
2131 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2132 		    "rfs_rename_end:(%S)", "cross device");
2133 		return;
2134 	}
2135 	exi_rele(to_exi);
2136 
2137 	if (to_exi != exi) {
2138 		VN_RELE(fromvp);
2139 		*status = NFSERR_XDEV;
2140 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2141 		    "rfs_rename_end:(%S)", "cross device");
2142 		return;
2143 	}
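
	/*
	 * Example (hypothetical share names): a rename whose source
	 * handle resolves under /export/a but whose target handle
	 * resolves under /export/b yields two different exportinfo
	 * structures, so the request fails with NFSERR_XDEV.
	 */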
2144 
2145 	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2146 	if (tovp == NULL) {
2147 		VN_RELE(fromvp);
2148 		*status = NFSERR_STALE;
2149 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2150 		    "rfs_rename_end:(%S)", "to stale");
2151 		return;
2152 	}
2153 
2154 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2155 		VN_RELE(tovp);
2156 		VN_RELE(fromvp);
2157 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2158 		    "rfs_rename_end:(%S)", "not dir");
2159 		*status = NFSERR_NOTDIR;
2160 		return;
2161 	}
2162 
2163 	/*
2164 	 * Disallow NULL paths
2165 	 */
2166 	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2167 	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2168 		VN_RELE(tovp);
2169 		VN_RELE(fromvp);
2170 		*status = NFSERR_ACCES;
2171 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2172 		    "rfs_rename_end:(%S)", "access");
2173 		return;
2174 	}
2175 
2176 	if (rdonly(exi, req)) {
2177 		VN_RELE(tovp);
2178 		VN_RELE(fromvp);
2179 		*status = NFSERR_ROFS;
2180 		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
2181 		    "rfs_rename_end:(%S)", "rofs");
2182 		return;
2183 	}
2184 
2185 	/*
2186 	 * Check for a conflict with a non-blocking mandatory share reservation.
2187 	 */
2188 	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2189 	    NULL, cr, NULL, NULL, NULL);
2190 	if (error != 0) {
2191 		VN_RELE(tovp);
2192 		VN_RELE(fromvp);
2193 		*status = puterrno(error);
2194 		return;
2195 	}
2196 
2197 	/* Check for delegations on the source file */
2198 
2199 	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2200 		VN_RELE(tovp);
2201 		VN_RELE(fromvp);
2202 		VN_RELE(srcvp);
2203 		curthread->t_flag |= T_WOULDBLOCK;
2204 		return;
2205 	}
2206 
2207 	/* Check for delegation on the file being renamed over, if it exists */
2208 
2209 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2210 	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2211 	    NULL, NULL, NULL) == 0) {
2212 
2213 		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2214 			VN_RELE(tovp);
2215 			VN_RELE(fromvp);
2216 			VN_RELE(srcvp);
2217 			VN_RELE(targvp);
2218 			curthread->t_flag |= T_WOULDBLOCK;
2219 			return;
2220 		}
2221 		VN_RELE(targvp);
2222 	}
2223 
2224 
2225 	if (nbl_need_check(srcvp)) {
2226 		nbl_start_crit(srcvp, RW_READER);
2227 		in_crit = 1;
2228 		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2229 			error = EACCES;
2230 			goto out;
2231 		}
2232 	}
2233 
2234 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
2235 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
2236 	    tovp, args->rna_to.da_name, cr, NULL, 0);
2237 	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");
2238 
2239 	if (error == 0) {
2240 		char *tmp;
2241 
2242 		/* fix the path name for the renamed file */
2243 		mutex_enter(&srcvp->v_lock);
2244 		tmp = srcvp->v_path;
2245 		srcvp->v_path = NULL;
2246 		mutex_exit(&srcvp->v_lock);
2247 		vn_setpath(rootdir, tovp, srcvp, args->rna_to.da_name,
2248 		    strlen(args->rna_to.da_name));
2249 		if (tmp != NULL)
2250 			kmem_free(tmp, strlen(tmp) + 1);
2251 	}
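
	/*
	 * Example (hypothetical paths): renaming "b" to "c" in a
	 * directory cached as "/export/a" frees the stale v_path
	 * "/export/a/b" and lets vn_setpath() rebuild it as
	 * "/export/a/c".
	 */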
2252 
2253 	/*
2254 	 * Force modified data and metadata out to stable storage.
2255 	 */
2256 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2257 	(void) VOP_FSYNC(fromvp, 0, cr, NULL);
2258 
2259 out:
2260 	if (in_crit)
2261 		nbl_end_crit(srcvp);
2262 	VN_RELE(srcvp);
2263 	VN_RELE(tovp);
2264 	VN_RELE(fromvp);
2265 
2266 	*status = puterrno(error);
2267 
2268 	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
2269 }
2270 void *
2271 rfs_rename_getfh(struct nfsrnmargs *args)
2272 {
2273 	return (args->rna_from.da_fhandle);
2274 }
2275 
2276 /*
2277  * Link to a file.
2278  * Create a file (to) which is a hard link to the given file (from).
2279  */
2280 void
2281 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2282 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2283 {
2284 	int error;
2285 	vnode_t *fromvp;
2286 	vnode_t *tovp;
2287 	struct exportinfo *to_exi;
2288 	fhandle_t *fh;
2289 
2290 	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");
2291 
2292 	fromvp = nfs_fhtovp(args->la_from, exi);
2293 	if (fromvp == NULL) {
2294 		*status = NFSERR_STALE;
2295 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2296 		    "rfs_link_end:(%S)", "from stale");
2297 		return;
2298 	}
2299 
2300 	fh = args->la_to.da_fhandle;
2301 	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2302 	if (to_exi == NULL) {
2303 		VN_RELE(fromvp);
2304 		*status = NFSERR_ACCES;
2305 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2306 		    "rfs_link_end:(%S)", "cross device");
2307 		return;
2308 	}
2309 	exi_rele(to_exi);
2310 
2311 	if (to_exi != exi) {
2312 		VN_RELE(fromvp);
2313 		*status = NFSERR_XDEV;
2314 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2315 		    "rfs_link_end:(%S)", "cross device");
2316 		return;
2317 	}
2318 
2319 	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2320 	if (tovp == NULL) {
2321 		VN_RELE(fromvp);
2322 		*status = NFSERR_STALE;
2323 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2324 		    "rfs_link_end:(%S)", "to stale");
2325 		return;
2326 	}
2327 
2328 	if (tovp->v_type != VDIR) {
2329 		VN_RELE(tovp);
2330 		VN_RELE(fromvp);
2331 		*status = NFSERR_NOTDIR;
2332 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2333 		    "rfs_link_end:(%S)", "not dir");
2334 		return;
2335 	}
2336 	/*
2337 	 * Disallow NULL paths
2338 	 */
2339 	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2340 		VN_RELE(tovp);
2341 		VN_RELE(fromvp);
2342 		*status = NFSERR_ACCES;
2343 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2344 		    "rfs_link_end:(%S)", "access");
2345 		return;
2346 	}
2347 
2348 	if (rdonly(exi, req)) {
2349 		VN_RELE(tovp);
2350 		VN_RELE(fromvp);
2351 		*status = NFSERR_ROFS;
2352 		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
2353 		    "rfs_link_end:(%S)", "rofs");
2354 		return;
2355 	}
2356 
2357 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
2358 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2359 	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");
2360 
2361 	/*
2362 	 * Force modified data and metadata out to stable storage.
2363 	 */
2364 	(void) VOP_FSYNC(tovp, 0, cr, NULL);
2365 	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2366 
2367 	VN_RELE(tovp);
2368 	VN_RELE(fromvp);
2369 
2370 	*status = puterrno(error);
2371 
2372 	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
2373 }
2374 void *
2375 rfs_link_getfh(struct nfslinkargs *args)
2376 {
2377 	return (args->la_from);
2378 }
2379 
2380 /*
2381  * Symbolically link to a file.
2382  * Create a file (from) with the given attributes which is a symbolic link
2383  * to the given path name (tnm).
2384  */
2385 void
2386 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2387 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2388 {
2389 	int error;
2390 	struct vattr va;
2391 	vnode_t *vp;
2392 	vnode_t *svp;
2393 	int lerror;
2394 
2395 	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");
2396 
2397 	/*
2398 	 * Disallow NULL paths
2399 	 */
2400 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2401 		*status = NFSERR_ACCES;
2402 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2403 		    "rfs_symlink_end:(%S)", "access");
2404 		return;
2405 	}
2406 
2407 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2408 	if (vp == NULL) {
2409 		*status = NFSERR_STALE;
2410 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2411 		    "rfs_symlink_end:(%S)", "stale");
2412 		return;
2413 	}
2414 
2415 	if (rdonly(exi, req)) {
2416 		VN_RELE(vp);
2417 		*status = NFSERR_ROFS;
2418 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2419 		    "rfs_symlink_end:(%S)", "rofs");
2420 		return;
2421 	}
2422 
2423 	error = sattr_to_vattr(args->sla_sa, &va);
2424 	if (error) {
2425 		VN_RELE(vp);
2426 		*status = puterrno(error);
2427 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2428 		    "rfs_symlink_end:(%S)", "sattr");
2429 		return;
2430 	}
2431 
2432 	if (!(va.va_mask & AT_MODE)) {
2433 		VN_RELE(vp);
2434 		*status = NFSERR_INVAL;
2435 		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
2436 		    "rfs_symlink_end:(%S)", "no mode");
2437 		return;
2438 	}
2439 
2440 	va.va_type = VLNK;
2441 	va.va_mask |= AT_TYPE;
2442 
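	/*
	 * Illustrative mapping for the VOP_SYMLINK() call below
	 * (names are hypothetical): for a client running
	 * "ln -s /a/target newlink", sla_from.da_fhandle names the
	 * directory, sla_from.da_name is "newlink", and sla_tnm is
	 * the string "/a/target" stored in the new symlink.
	 */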
2443 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
2444 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
2445 	    NULL, 0);
2446 	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");
2447 
2448 	/*
2449 	 * Force new data and metadata out to stable storage.
2450 	 */
2451 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
2452 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
2453 	    0, NULL, cr, NULL, NULL, NULL);
2454 	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
2455 	if (!lerror) {
2456 		(void) VOP_FSYNC(svp, 0, cr, NULL);
2457 		VN_RELE(svp);
2458 	}
2459 
2460 	/*
2461 	 * Force modified data and metadata out to stable storage.
2462 	 */
2463 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2464 
2465 	VN_RELE(vp);
2466 
2467 	*status = puterrno(error);
2468 
2469 	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
2470 }
2471 void *
2472 rfs_symlink_getfh(struct nfsslargs *args)
2473 {
2474 	return (args->sla_from.da_fhandle);
2475 }
2476 
2477 /*
2478  * Make a directory.
2479  * Create a directory with the given name, parent directory, and attributes.
2480  * Returns a file handle and attributes for the new directory.
2481  */
2482 void
2483 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2484 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2485 {
2486 	int error;
2487 	struct vattr va;
2488 	vnode_t *dvp = NULL;
2489 	vnode_t *vp;
2490 	char *name = args->ca_da.da_name;
2491 
2492 	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");
2493 
2494 	/*
2495 	 * Disallow NULL paths
2496 	 */
2497 	if (name == NULL || *name == '\0') {
2498 		dr->dr_status = NFSERR_ACCES;
2499 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2500 		    "rfs_mkdir_end:(%S)", "access");
2501 		return;
2502 	}
2503 
2504 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2505 	if (vp == NULL) {
2506 		dr->dr_status = NFSERR_STALE;
2507 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2508 		    "rfs_mkdir_end:(%S)", "stale");
2509 		return;
2510 	}
2511 
2512 	if (rdonly(exi, req)) {
2513 		VN_RELE(vp);
2514 		dr->dr_status = NFSERR_ROFS;
2515 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2516 		    "rfs_mkdir_end:(%S)", "rofs");
2517 		return;
2518 	}
2519 
2520 	error = sattr_to_vattr(args->ca_sa, &va);
2521 	if (error) {
2522 		VN_RELE(vp);
2523 		dr->dr_status = puterrno(error);
2524 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2525 		    "rfs_mkdir_end:(%S)", "sattr");
2526 		return;
2527 	}
2528 
2529 	if (!(va.va_mask & AT_MODE)) {
2530 		VN_RELE(vp);
2531 		dr->dr_status = NFSERR_INVAL;
2532 		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
2533 		    "rfs_mkdir_end:(%S)", "no mode");
2534 		return;
2535 	}
2536 
2537 	va.va_type = VDIR;
2538 	va.va_mask |= AT_TYPE;
2539 
2540 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
2541 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2542 	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");
2543 
2544 	if (!error) {
2545 		/*
2546 		 * Attributes of the newly created directory should
2547 		 * be returned to the client.
2548 		 */
2549 		va.va_mask = AT_ALL; /* We want everything */
2550 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
2551 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2552 		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
2553 		/* check for overflows */
2554 		if (!error) {
2555 			acl_perm(vp, exi, &va, cr);
2556 			error = vattr_to_nattr(&va, &dr->dr_attr);
2557 			if (!error) {
2558 				error = makefh(&dr->dr_fhandle, dvp, exi);
2559 			}
2560 		}
2561 		/*
2562 		 * Force new data and metadata out to stable storage.
2563 		 */
2564 		(void) VOP_FSYNC(dvp, 0, cr, NULL);
2565 		VN_RELE(dvp);
2566 	}
2567 
2568 	/*
2569 	 * Force modified data and metadata out to stable storage.
2570 	 */
2571 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2572 
2573 	VN_RELE(vp);
2574 
2575 	dr->dr_status = puterrno(error);
2576 
2577 	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
2578 }
2579 void *
2580 rfs_mkdir_getfh(struct nfscreatargs *args)
2581 {
2582 	return (args->ca_da.da_fhandle);
2583 }
2584 
2585 /*
2586  * Remove a directory.
2587  * Remove the given directory name from the given parent directory.
2588  */
2589 void
2590 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2591 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2592 {
2593 	int error;
2594 	vnode_t *vp;
2595 
2596 	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");
2597 
2598 	/*
2599 	 * Disallow NULL paths
2600 	 */
2601 	if (da->da_name == NULL || *da->da_name == '\0') {
2602 		*status = NFSERR_ACCES;
2603 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2604 		    "rfs_rmdir_end:(%S)", "access");
2605 		return;
2606 	}
2607 
2608 	vp = nfs_fhtovp(da->da_fhandle, exi);
2609 	if (vp == NULL) {
2610 		*status = NFSERR_STALE;
2611 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2612 		    "rfs_rmdir_end:(%S)", "stale");
2613 		return;
2614 	}
2615 
2616 	if (rdonly(exi, req)) {
2617 		VN_RELE(vp);
2618 		*status = NFSERR_ROFS;
2619 		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
2620 		    "rfs_rmdir_end:(%S)", "rofs");
2621 		return;
2622 	}
2623 
2624 	/*
2625 	 * VOP_RMDIR now takes a new third argument (the current
2626 	 * directory of the process).  That's because someone
2627 	 * wants to return EINVAL if one tries to remove ".".
2628 	 * Of course, NFS servers have no idea what their
2629 	 * clients' current directories are.  We fake it by
2630 	 * supplying a vnode known to exist and illegal to
2631 	 * remove.
2632 	 */
2633 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
2634 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2635 	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");
2636 
2637 	/*
2638 	 * Force modified data and metadata out to stable storage.
2639 	 */
2640 	(void) VOP_FSYNC(vp, 0, cr, NULL);
2641 
2642 	VN_RELE(vp);
2643 
2644 	/*
2645 	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2646 	 * if the directory is not empty.  A System V NFS server
2647 	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2648 	 * over the wire.
2649 	 */
2650 	if (error == EEXIST)
2651 		*status = NFSERR_NOTEMPTY;
2652 	else
2653 		*status = puterrno(error);
2654 
2655 	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
2656 }
2657 void *
2658 rfs_rmdir_getfh(struct nfsdiropargs *da)
2659 {
2660 	return (da->da_fhandle);
2661 }
2662 
2663 /* ARGSUSED */
2664 void
2665 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2666 	struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2667 {
2668 	int error;
2669 	int iseof;
2670 	struct iovec iov;
2671 	struct uio uio;
2672 	vnode_t *vp;
2673 
2674 	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");
2675 
2676 	vp = nfs_fhtovp(&rda->rda_fh, exi);
2677 	if (vp == NULL) {
2678 		rd->rd_entries = NULL;
2679 		rd->rd_status = NFSERR_STALE;
2680 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2681 		    "rfs_readdir_end:(%S)", "stale");
2682 		return;
2683 	}
2684 
2685 	if (vp->v_type != VDIR) {
2686 		VN_RELE(vp);
2687 		rd->rd_entries = NULL;
2688 		rd->rd_status = NFSERR_NOTDIR;
2689 		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
2690 		    "rfs_readdir_end:(%S)", "notdir");
2691 		return;
2692 	}
2693 
2694 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
2695 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2696 	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
2697 
2698 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
2699 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2700 	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
2701 	if (error) {
2702 		rd->rd_entries = NULL;
2703 		goto bad;
2704 	}
2705 
2706 	if (rda->rda_count == 0) {
2707 		rd->rd_entries = NULL;
2708 		rd->rd_size = 0;
2709 		rd->rd_eof = FALSE;
2710 		goto bad;
2711 	}
2712 
2713 	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2714 
2715 	/*
2716 	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
2717 	 */
2718 	rd->rd_bufsize = (uint_t)rda->rda_count;
2719 	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2720 
2721 	/*
2722 	 * Set up io vector to read directory data
2723 	 */
2724 	iov.iov_base = (caddr_t)rd->rd_entries;
2725 	iov.iov_len = rda->rda_count;
2726 	uio.uio_iov = &iov;
2727 	uio.uio_iovcnt = 1;
2728 	uio.uio_segflg = UIO_SYSSPACE;
2729 	uio.uio_extflg = UIO_COPY_CACHED;
2730 	uio.uio_loffset = (offset_t)rda->rda_offset;
2731 	uio.uio_resid = rda->rda_count;
2732 
2733 	/*
2734 	 * read directory
2735 	 */
2736 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
2737 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2738 	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");
2739 
2740 	/*
2741 	 * Clean up
2742 	 */
2743 	if (!error) {
2744 		/*
2745 		 * set size and eof
2746 		 */
2747 		if (uio.uio_resid == rda->rda_count) {
2748 			rd->rd_size = 0;
2749 			rd->rd_eof = TRUE;
2750 		} else {
2751 			rd->rd_size = (uint32_t)(rda->rda_count -
2752 			    uio.uio_resid);
2753 			rd->rd_eof = iseof ? TRUE : FALSE;
2754 		}
2755 	}
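
	/*
	 * Worked example of the size/eof logic above: for an rda_count
	 * of 1024, a VOP_READDIR() that leaves uio_resid at 1024 read
	 * nothing, so eof is forced TRUE; one that leaves 400 bytes
	 * unused yields rd_size 624 with rd_eof taken from iseof.
	 */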
2756 
2757 bad:
2758 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
2759 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2760 	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
2761 
2762 #if 0 /* notyet */
2763 	/*
2764 	 * Don't do this.  It causes local disk writes when just
2765 	 * reading the file and the overhead is deemed larger
2766 	 * than the benefit.
2767 	 */
2768 	/*
2769 	 * Force modified metadata out to stable storage.
2770 	 */
2771 	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2772 #endif
2773 
2774 	VN_RELE(vp);
2775 
2776 	rd->rd_status = puterrno(error);
2777 
2778 	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
2779 }
2780 void *
2781 rfs_readdir_getfh(struct nfsrddirargs *rda)
2782 {
2783 	return (&rda->rda_fh);
2784 }
2785 void
2786 rfs_rddirfree(struct nfsrddirres *rd)
2787 {
2788 	if (rd->rd_entries != NULL)
2789 		kmem_free(rd->rd_entries, rd->rd_bufsize);
2790 }
2791 
2792 /* ARGSUSED */
2793 void
2794 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2795 	struct svc_req *req, cred_t *cr)
2796 {
2797 	int error;
2798 	struct statvfs64 sb;
2799 	vnode_t *vp;
2800 
2801 	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");
2802 
2803 	vp = nfs_fhtovp(fh, exi);
2804 	if (vp == NULL) {
2805 		fs->fs_status = NFSERR_STALE;
2806 		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
2807 		    "rfs_statfs_end:(%S)", "stale");
2808 		return;
2809 	}
2810 
2811 	error = VFS_STATVFS(vp->v_vfsp, &sb);
2812 
2813 	if (!error) {
2814 		fs->fs_tsize = nfstsize();
2815 		fs->fs_bsize = sb.f_frsize;
2816 		fs->fs_blocks = sb.f_blocks;
2817 		fs->fs_bfree = sb.f_bfree;
2818 		fs->fs_bavail = sb.f_bavail;
2819 	}
2820 
2821 	VN_RELE(vp);
2822 
2823 	fs->fs_status = puterrno(error);
2824 
2825 	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
2826 }
2827 void *
2828 rfs_statfs_getfh(fhandle_t *fh)
2829 {
2830 	return (fh);
2831 }
2832 
2833 static int
2834 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2835 {
2836 	vap->va_mask = 0;
2837 
2838 	/*
2839 	 * There was a sign extension bug in some VFS based systems
2840 	 * which stored the mode as a short.  When it would get
2841 	 * assigned to a u_long, no sign extension would occur.
2842 	 * It needed to, but this wasn't noticed because sa_mode
2843 	 * would then get assigned back to the short, thus ignoring
2844 	 * the upper 16 bits of sa_mode.
2845 	 *
2846 	 * To make this implementation work for both broken
2847 	 * clients and good clients, we check for both versions
2848 	 * of the mode.
2849 	 */
2850 	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2851 	    sa->sa_mode != (uint32_t)-1) {
2852 		vap->va_mask |= AT_MODE;
2853 		vap->va_mode = sa->sa_mode;
2854 	}
2855 	if (sa->sa_uid != (uint32_t)-1) {
2856 		vap->va_mask |= AT_UID;
2857 		vap->va_uid = sa->sa_uid;
2858 	}
2859 	if (sa->sa_gid != (uint32_t)-1) {
2860 		vap->va_mask |= AT_GID;
2861 		vap->va_gid = sa->sa_gid;
2862 	}
2863 	if (sa->sa_size != (uint32_t)-1) {
2864 		vap->va_mask |= AT_SIZE;
2865 		vap->va_size = sa->sa_size;
2866 	}
2867 	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2868 	    sa->sa_atime.tv_usec != (int32_t)-1) {
2869 #ifndef _LP64
2870 		/* return error if time overflow */
2871 		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2872 			return (EOVERFLOW);
2873 #endif
2874 		vap->va_mask |= AT_ATIME;
2875 		/*
2876 		 * The NFS protocol defines times as unsigned, so don't extend
2877 		 * the sign unless the sysadmin set nfs_allow_preepoch_time.
2878 		 */
2879 		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2880 		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2881 	}
2882 	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2883 	    sa->sa_mtime.tv_usec != (int32_t)-1) {
2884 #ifndef _LP64
2885 		/* return error if time overflow */
2886 		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2887 			return (EOVERFLOW);
2888 #endif
2889 		vap->va_mask |= AT_MTIME;
2890 		/*
2891 		 * The NFS protocol defines times as unsigned, so don't extend
2892 		 * the sign unless the sysadmin set nfs_allow_preepoch_time.
2893 		 */
2894 		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2895 		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2896 	}
2897 	return (0);
2898 }
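
/*
 * Sketch of the `don't set' convention decoded above (what a client
 * sends, for illustration only): a SETATTR that changes just the mode
 * carries -1 in every other field:
 *
 *	sa.sa_mode = 0644;
 *	sa.sa_uid = sa.sa_gid = sa.sa_size = (uint32_t)-1;
 *	sa.sa_atime.tv_sec = sa.sa_atime.tv_usec = (int32_t)-1;
 *	sa.sa_mtime.tv_sec = sa.sa_mtime.tv_usec = (int32_t)-1;
 *
 * The mode check also accepts 0xffff, since broken clients truncated
 * the mode to a short before sending it.
 */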
2899 
2900 static enum nfsftype vt_to_nf[] = {
2901 	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2902 };
2903 
2904 /*
2905  * Check the following fields for overflow: nodeid, size, and time.
2906  * There could be a problem when converting 64-bit LP64 fields
2907  * into 32-bit ones.  Return an error if there is an overflow.
2908  */
2909 int
2910 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2911 {
2912 	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2913 	na->na_type = vt_to_nf[vap->va_type];
2914 
2915 	if (vap->va_mode == (unsigned short) -1)
2916 		na->na_mode = (uint32_t)-1;
2917 	else
2918 		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2919 
2920 	if (vap->va_uid == (unsigned short)(-1))
2921 		na->na_uid = (uint32_t)(-1);
2922 	else if (vap->va_uid == UID_NOBODY)
2923 		na->na_uid = (uint32_t)NFS_UID_NOBODY;
2924 	else
2925 		na->na_uid = vap->va_uid;
2926 
2927 	if (vap->va_gid == (unsigned short)(-1))
2928 		na->na_gid = (uint32_t)-1;
2929 	else if (vap->va_gid == GID_NOBODY)
2930 		na->na_gid = (uint32_t)NFS_GID_NOBODY;
2931 	else
2932 		na->na_gid = vap->va_gid;
2933 
2934 	/*
2935 	 * Do we need to check fsid for overflow?  It is 64-bit in the
2936 	 * vattr, but are values bigger than 32 bits supported?
2937 	 */
2938 	na->na_fsid = vap->va_fsid;
2939 
2940 	na->na_nodeid = vap->va_nodeid;
2941 
2942 	/*
2943 	 * Check to make sure that the nodeid is representable over the
2944 	 * wire without losing bits.
2945 	 */
2946 	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2947 		return (EFBIG);
2948 	na->na_nlink = vap->va_nlink;
2949 
2950 	/*
2951 	 * Check for big files here, instead of at the caller.  See
2952 	 * comments in cstat for large special file explanation.
2953 	 */
2954 	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2955 		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2956 			return (EFBIG);
2957 		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2958 			/* UNKNOWN_SIZE | OVERFLOW */
2959 			na->na_size = MAXOFF32_T;
2960 		} else
2961 			na->na_size = vap->va_size;
2962 	} else
2963 		na->na_size = vap->va_size;
2964 
2965 	/*
2966 	 * If the vnode times overflow the 32-bit times that NFS2
2967 	 * uses on the wire then return an error.
2968 	 */
2969 	if (!NFS_VAP_TIME_OK(vap)) {
2970 		return (EOVERFLOW);
2971 	}
2972 	na->na_atime.tv_sec = vap->va_atime.tv_sec;
2973 	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2974 
2975 	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2976 	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2977 
2978 	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2979 	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2980 
2981 	/*
2982 	 * If the dev_t will fit into 16 bits then compress
2983 	 * it, otherwise leave it alone. See comments in
2984 	 * nfs_client.c.
2985 	 */
2986 	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2987 	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2988 		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2989 	else
2990 		(void) cmpldev(&na->na_rdev, vap->va_rdev);
2991 
2992 	na->na_blocks = vap->va_nblocks;
2993 	na->na_blocksize = vap->va_blksize;
2994 
2995 	/*
2996 	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2997 	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
2998 	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2999 	 *
3000 	 * BUYER BEWARE:
3001 	 *  If you are porting the NFS to a non-Sun server, you probably
3002 	 *  don't want to include the following block of code.  The
3003 	 *  over-the-wire special file types will be changing with the
3004 	 *  NFS Protocol Revision.
3005 	 */
3006 	if (vap->va_type == VFIFO)
3007 		NA_SETFIFO(na);
3008 	return (0);
3009 }
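
/*
 * The inverse of this kludge lives in rfs_create() above: an IFCHR
 * create whose sa_size equals NFS_FIFO_DEV (presumably the value
 * NA_SETFIFO puts on the wire; see the note in nfs.h) is mapped back
 * to VFIFO.
 */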
3010 
3011 /*
3012  * acl v2 support: returns approximate permission.
3013  *	default: returns minimal permission (more restrictive)
3014  *	aclok: returns maximal permission (less restrictive)
3015  *	This routine changes the permissions that are already in *va.
3016  *	If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3017  *	CLASS_OBJ is always the same as GROUP_OBJ entry.
3018  */
3019 static void
3020 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3021 {
3022 	vsecattr_t	vsa;
3023 	int		aclcnt;
3024 	aclent_t	*aclentp;
3025 	mode_t		mask_perm;
3026 	mode_t		grp_perm;
3027 	mode_t		other_perm;
3028 	mode_t		other_orig;
3029 	int		error;
3030 
3031 	/* don't care about the default ACL */
3032 	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3033 	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3034 
3035 	if (!error) {
3036 		aclcnt = vsa.vsa_aclcnt;
3037 		if (aclcnt > MIN_ACL_ENTRIES) {
3038 			/* non-trivial ACL */
3039 			aclentp = vsa.vsa_aclentp;
3040 			if (exi->exi_export.ex_flags & EX_ACLOK) {
3041 				/* maximal permissions */
3042 				grp_perm = 0;
3043 				other_perm = 0;
3044 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3045 					switch (aclentp->a_type) {
3046 					case USER_OBJ:
3047 						break;
3048 					case USER:
3049 						grp_perm |=
3050 						    aclentp->a_perm << 3;
3051 						other_perm |= aclentp->a_perm;
3052 						break;
3053 					case GROUP_OBJ:
3054 						grp_perm |=
3055 						    aclentp->a_perm << 3;
3056 						break;
3057 					case GROUP:
3058 						other_perm |= aclentp->a_perm;
3059 						break;
3060 					case OTHER_OBJ:
3061 						other_orig = aclentp->a_perm;
3062 						break;
3063 					case CLASS_OBJ:
3064 						mask_perm = aclentp->a_perm;
3065 						break;
3066 					default:
3067 						break;
3068 					}
3069 				}
3070 				grp_perm &= mask_perm << 3;
3071 				other_perm &= mask_perm;
3072 				other_perm |= other_orig;
3073 
3074 			} else {
3075 				/* minimal permissions */
3076 				grp_perm = 070;
3077 				other_perm = 07;
3078 				for (; aclcnt > 0; aclcnt--, aclentp++) {
3079 					switch (aclentp->a_type) {
3080 					case USER_OBJ:
3081 						break;
3082 					case USER:
3083 					case CLASS_OBJ:
3084 						grp_perm &=
3085 						    aclentp->a_perm << 3;
3086 						other_perm &=
3087 						    aclentp->a_perm;
3088 						break;
3089 					case GROUP_OBJ:
3090 						grp_perm &=
3091 						    aclentp->a_perm << 3;
3092 						break;
3093 					case GROUP:
3094 						other_perm &=
3095 						    aclentp->a_perm;
3096 						break;
3097 					case OTHER_OBJ:
3098 						other_perm &=
3099 						    aclentp->a_perm;
3100 						break;
3101 					default:
3102 						break;
3103 					}
3104 				}
3105 			}
3106 			/* copy to va */
3107 			va->va_mode &= ~077;
3108 			va->va_mode |= grp_perm | other_perm;
3109 		}
3110 		if (vsa.vsa_aclcnt)
3111 			kmem_free(vsa.vsa_aclentp,
3112 			    vsa.vsa_aclcnt * sizeof (aclent_t));
3113 	}
3114 }
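
/*
 * Worked example of the two policies above, for a non-trivial ACL
 *
 *	user::rwx, user:u1:rw-, group::r--, mask:rw-, other:---
 *
 * The aclok (maximal) pass reports group rw- and other rw-, since any
 * requester might be u1; the default (minimal) pass reports group r--
 * and other ---, the most restrictive view.
 */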
3115 
3116 void
3117 rfs_srvrinit(void)
3118 {
3119 	mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3120 	nfs2_srv_caller_id = fs_new_caller_id();
3121 }
3122 
3123 void
3124 rfs_srvrfini(void)
3125 {
3126 	mutex_destroy(&rfs_async_write_lock);
3127 }
3128