xref: /illumos-gate/usr/src/uts/common/fs/fs_subr.c (revision 8c0b080c8ed055a259d8cd26b9f005211c6a9753)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22 /*	All Rights Reserved	*/
23 
24 
25 /*
26  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
27  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
28  * Copyright 2017 Joyent, Inc.
29  * Copyright 2022 Oxide Computer Company
30  */
31 
32 /*
33  * Generic vnode operations.
34  */
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/errno.h>
39 #include <sys/fcntl.h>
40 #include <sys/flock.h>
41 #include <sys/statvfs.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/proc.h>
45 #include <sys/user.h>
46 #include <sys/unistd.h>
47 #include <sys/cred.h>
48 #include <sys/poll.h>
49 #include <sys/debug.h>
50 #include <sys/cmn_err.h>
51 #include <sys/stream.h>
52 #include <fs/fs_subr.h>
53 #include <fs/fs_reparse.h>
54 #include <sys/door.h>
55 #include <sys/acl.h>
56 #include <sys/share.h>
57 #include <sys/file.h>
58 #include <sys/kmem.h>
59 #include <sys/file.h>
60 #include <sys/nbmlock.h>
61 #include <acl/acl_common.h>
62 #include <sys/pathname.h>
63 
64 /* required for fs_reject_epoll */
65 #include <sys/poll_impl.h>
66 
/* Forward declaration; the definition follows fs_frlock(), which uses it. */
static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);

/*
 * Tunable limiting the number of retries made when recovering from a
 * stale-file error.  Compared against the caller's retry count in
 * fs_need_estale_retry().
 */
int fs_estale_retry = 5;

/*
 * Support for the reparse point door upcall: the cached door handle and
 * the mutex that is held whenever that handle is read or updated.
 */
static door_handle_t reparsed_door;
static kmutex_t reparsed_door_lock;
79 
/*
 * Generic handler for an operation the file system does not support.
 * Takes no notice of its caller's arguments and always reports ENOSYS.
 */
int
fs_nosys()
{
	int error = ENOSYS;

	return (error);
}
88 
/*
 * Generic handler for an operation that is invalid on this vnode.
 * Always reports EINVAL.
 */
int
fs_inval()
{
	int error = EINVAL;

	return (error);
}
97 
/*
 * Generic handler for an operation that is only valid on directories.
 * Always reports ENOTDIR.
 */
int
fs_notdir()
{
	int error = ENOTDIR;

	return (error);
}
106 
/*
 * Free the file system specific resources.  For file systems that do not
 * support forced unmount this is a no-op, and that is exactly what this
 * generic implementation provides: vfsp is accepted and ignored.
 */

/*ARGSUSED*/
void
fs_freevfs(vfs_t *vfsp)
{
}
117 
/*
 * "Not supported" stub, presumably installed as the vnode map operation
 * (going by its name and signature).  Every argument is ignored and
 * ENOSYS is returned.
 */
/* ARGSUSED */
int
fs_nosys_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
    caller_context_t *ct)
{
	return (ENOSYS);
}
126 
/*
 * "Not supported" stub, presumably installed as the vnode addmap operation
 * (going by its name and signature).  Every argument is ignored and
 * ENOSYS is returned.
 */
/* ARGSUSED */
int
fs_nosys_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
    caller_context_t *ct)
{
	return (ENOSYS);
}
135 
/*
 * "Not supported" stub with the same signature as fs_poll().  Every
 * argument is ignored (neither *reventsp nor *phpp is written) and ENOSYS
 * is returned.
 */
/* ARGSUSED */
int
fs_nosys_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
    struct pollhead **phpp, caller_context_t *ct)
{
	return (ENOSYS);
}
143 
144 
/*
 * The file system has nothing to sync to disk.  However, the
 * VFS_SYNC operation must not fail, so all arguments are ignored and
 * success (0) is returned unconditionally.
 */
/* ARGSUSED */
int
fs_sync(struct vfs *vfspp, short flag, cred_t *cr)
{
	return (0);
}
155 
/*
 * Does nothing, but VOP_FSYNC must not fail: all arguments are ignored
 * and success (0) is returned unconditionally.
 */
/* ARGSUSED */
int
fs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	return (0);
}
165 
/*
 * Does nothing, but VOP_PUTPAGE must not fail: all arguments are ignored
 * and success (0) is returned unconditionally.
 */
/* ARGSUSED */
int
fs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ctp)
{
	return (0);
}
176 
/*
 * Does nothing, but VOP_IOCTL must not fail: all arguments are ignored
 * (*rvalp is not written) and success (0) is returned unconditionally.
 */
/* ARGSUSED */
int
fs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp)
{
	return (0);
}
187 
/*
 * Read/write lock.  Does nothing: no lock is acquired and the arguments
 * are ignored.
 *
 * Always returns -1.
 * NOTE(review): -1 presumably tells the caller that no reader or writer
 * lock was taken -- confirm against the VOP_RWLOCK consumers.
 */
/* ARGSUSED */
int
fs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	return (-1);
}
197 
/*
 * Read/write unlock.  Does nothing: the arguments are ignored and no lock
 * is released.
 */
/* ARGSUSED */
void
fs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
}
203 
204 /*
205  * Compare two vnodes.
206  */
207 /*ARGSUSED2*/
208 int
209 fs_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
210 {
211 	return (vp1 == vp2);
212 }
213 
214 /*
215  * No-op seek operation.
216  */
217 /* ARGSUSED */
218 int
219 fs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
220 {
221 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
222 }
223 
/*
 * File and record locking.
 *
 * Translates the lock command `cmd' into the flag word (frcmd) handed to
 * reclock() and, when nbl_need_check() says so, performs the request
 * inside the vnode's nbl critical region.
 *
 *	vp	vnode the request applies to
 *	cmd	F_GETLK, F_O_GETLK, F_SETLK, F_SETLK_NBMAND, F_SETLKW,
 *		F_HASREMOTELOCKS, or one of the OFD/flock commands
 *	bfp	lock description; for requests that are neither
 *		F_REMOTELOCK nor F_PXFSLOCK, l_pid is overwritten with the
 *		current process id and l_sysid with 0
 *	flag	F_REMOTELOCK and F_PXFSLOCK are examined here; the whole
 *		word is also passed to reclock()
 *	offset	passed through to reclock()
 *	flk_cbp	caller's callback list; for blocking, serialized requests
 *		a local callback is added in front of it via
 *		flk_add_callback() before reclock() is called
 *	cr, ct	unused
 *
 * The OFD and flock commands (F_OFD_GETLK, F_OFD_SETLK, F_OFD_SETLKW,
 * F_FLOCK, F_FLOCKW) are accepted for local requests but reclock() is not
 * called for them here.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EINVAL		unknown command, or F_SETLK_NBMAND on a vnode whose
 *			vfs lacks VFS_NBMAND or whose type is not VREG
 *	EOPNOTSUPP	OFD/flock command combined with F_REMOTELOCK or
 *			F_PXFSLOCK
 *	EAGAIN		NBMAND lock requested while vn_is_mapped() reports
 *			a conflicting mapping
 *	other		whatever reclock() returns
 */
/* ARGSUSED */
int
fs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	int frcmd;
	int nlmid;
	int error = 0;
	boolean_t skip_lock = B_FALSE;
	flk_callback_t serialize_callback;
	int serialize = 0;
	v_mode_t mode;

	/*
	 * Note: frcmd is only assigned on the paths that leave skip_lock
	 * B_FALSE.  On the skip_lock paths `serialize' stays 0, so the
	 * (frcmd & SLPFLCK) tests below are never evaluated.
	 */
	switch (cmd) {

	case F_GETLK:
	case F_O_GETLK:
		if (flag & F_REMOTELOCK) {
			frcmd = RCMDLCK;
		} else if (flag & F_PXFSLOCK) {
			frcmd = PCMDLCK;
		} else {
			/* local request: identify the caller in the flock */
			frcmd = 0;
			bfp->l_pid = ttoproc(curthread)->p_pid;
			bfp->l_sysid = 0;
		}
		break;

	case F_OFD_GETLK:
		/*
		 * TBD we do not support remote OFD locks at this time.
		 */
		if (flag & (F_REMOTELOCK | F_PXFSLOCK)) {
			error = EOPNOTSUPP;
			goto done;
		}
		skip_lock = B_TRUE;
		break;

	case F_SETLK_NBMAND:
		/*
		 * Are NBMAND locks allowed on this file?
		 */
		if (!vp->v_vfsp ||
		    !(vp->v_vfsp->vfs_flag & VFS_NBMAND)) {
			error = EINVAL;
			goto done;
		}
		if (vp->v_type != VREG) {
			error = EINVAL;
			goto done;
		}
		/*FALLTHROUGH*/

	case F_SETLK:
		if (flag & F_REMOTELOCK) {
			frcmd = SETFLCK|RCMDLCK;
		} else if (flag & F_PXFSLOCK) {
			frcmd = SETFLCK|PCMDLCK;
		} else {
			frcmd = SETFLCK;
			bfp->l_pid = ttoproc(curthread)->p_pid;
			bfp->l_sysid = 0;
		}
		/* only read/write locks become NBMAND locks, not unlocks */
		if (cmd == F_SETLK_NBMAND &&
		    (bfp->l_type == F_RDLCK || bfp->l_type == F_WRLCK)) {
			frcmd |= NBMLCK;
		}

		if (nbl_need_check(vp)) {
			/* released at `done' because serialize is set */
			nbl_start_crit(vp, RW_WRITER);
			serialize = 1;
			if (frcmd & NBMLCK) {
				/*
				 * Refuse the NBMAND lock if the file is
				 * mapped in a mode the lock would conflict
				 * with.
				 */
				mode = (bfp->l_type == F_RDLCK) ?
				    V_READ : V_RDANDWR;
				if (vn_is_mapped(vp, mode)) {
					error = EAGAIN;
					goto done;
				}
			}
		}
		break;

	case F_SETLKW:
		/* same as F_SETLK, plus SLPFLCK to request a blocking lock */
		if (flag & F_REMOTELOCK) {
			frcmd = SETFLCK|SLPFLCK|RCMDLCK;
		} else if (flag & F_PXFSLOCK) {
			frcmd = SETFLCK|SLPFLCK|PCMDLCK;
		} else {
			frcmd = SETFLCK|SLPFLCK;
			bfp->l_pid = ttoproc(curthread)->p_pid;
			bfp->l_sysid = 0;
		}

		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_WRITER);
			serialize = 1;
		}
		break;

	case F_OFD_SETLK:
	case F_OFD_SETLKW:
	case F_FLOCK:
	case F_FLOCKW:
		/*
		 * TBD we do not support remote OFD locks at this time.
		 */
		if (flag & (F_REMOTELOCK | F_PXFSLOCK)) {
			error = EOPNOTSUPP;
			goto done;
		}
		skip_lock = B_TRUE;
		break;

	case F_HASREMOTELOCKS:
		/* a query only: store the answer in the flock and return */
		nlmid = GETNLMID(bfp->l_sysid);
		if (nlmid != 0) {	/* booted as a cluster */
			l_has_rmt(bfp) =
			    cl_flk_has_remote_locks_for_nlmid(vp, nlmid);
		} else {		/* not booted as a cluster */
			l_has_rmt(bfp) = flk_has_remote_locks(vp);
		}

		goto done;

	default:
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is a blocking lock request and we're serializing lock
	 * requests, modify the callback list to leave the critical region
	 * while we're waiting for the lock.
	 */

	if (serialize && (frcmd & SLPFLCK) != 0) {
		flk_add_callback(&serialize_callback,
		    frlock_serialize_blocked, vp, flk_cbp);
		flk_cbp = &serialize_callback;
	}

	if (!skip_lock)
		error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp);

	/* undo the flk_add_callback() above, under the same condition */
	if (serialize && (frcmd & SLPFLCK) != 0)
		flk_del_callback(&serialize_callback);

done:
	/* every exit path funnels through here to leave the critical region */
	if (serialize)
		nbl_end_crit(vp);

	return (error);
}
381 
382 /*
383  * Callback when a lock request blocks and we are serializing requests.  If
384  * before sleeping, leave the critical region.  If after wakeup, reenter
385  * the critical region.
386  */
387 
388 static callb_cpr_t *
389 frlock_serialize_blocked(flk_cb_when_t when, void *infop)
390 {
391 	vnode_t *vp = (vnode_t *)infop;
392 
393 	if (when == FLK_BEFORE_SLEEP)
394 		nbl_end_crit(vp);
395 	else {
396 		nbl_start_crit(vp, RW_WRITER);
397 	}
398 
399 	return (NULL);
400 }
401 
/*
 * Allow any flags: the old and new flag words are not examined and
 * success (0) is returned unconditionally.
 */
/* ARGSUSED */
int
fs_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct)
{
	return (0);
}
411 
412 /*
413  * Unlike poll(2), epoll should reject attempts to add normal files or
414  * directories to a given handle.  Most non-pseudo filesystems rely on
415  * fs_poll() as their implementation of polling behavior.  Exceptions to that
416  * rule (ufs) can use fs_reject_epoll(), so they don't require access to the
417  * inner details of poll.  Potential race conditions related to the poll module
418  * being loaded are avoided by implementing the check here in genunix.
419  */
420 boolean_t
421 fs_reject_epoll()
422 {
423 	/* Check if the currently-active pollcache is epoll-enabled. */
424 	return (curthread->t_pollcache != NULL &&
425 	    (curthread->t_pollcache->pc_flag & PC_EPOLL) != 0);
426 }
427 
428 /* ARGSUSED */
429 int
430 fs_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
431     struct pollhead **phpp, caller_context_t *ct)
432 {
433 	/*
434 	 * Regular filesystems should reject epollers.  On the off chance that
435 	 * a non-epoll consumer expresses the desire for edge-triggered
436 	 * polling, we reject them too.  Yes, the expected error for this
437 	 * really is EPERM.
438 	 */
439 	if (fs_reject_epoll() || (events & POLLET) != 0) {
440 		return (EPERM);
441 	}
442 
443 	*reventsp = 0;
444 	if (events & POLLIN)
445 		*reventsp |= POLLIN;
446 	if (events & POLLRDNORM)
447 		*reventsp |= POLLRDNORM;
448 	if (events & POLLRDBAND)
449 		*reventsp |= POLLRDBAND;
450 	if (events & POLLOUT)
451 		*reventsp |= POLLOUT;
452 	if (events & POLLWRBAND)
453 		*reventsp |= POLLWRBAND;
454 
455 	return (0);
456 }
457 
458 /*
459  * POSIX pathconf() support.
460  */
461 /* ARGSUSED */
462 int
463 fs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
464     caller_context_t *ct)
465 {
466 	ulong_t val;
467 	int error = 0;
468 	struct statvfs64 vfsbuf;
469 
470 	switch (cmd) {
471 
472 	case _PC_LINK_MAX:
473 		val = MAXLINK;
474 		break;
475 
476 	case _PC_MAX_CANON:
477 		val = MAX_CANON;
478 		break;
479 
480 	case _PC_MAX_INPUT:
481 		val = MAX_INPUT;
482 		break;
483 
484 	case _PC_NAME_MAX:
485 		bzero(&vfsbuf, sizeof (vfsbuf));
486 		if (error = VFS_STATVFS(vp->v_vfsp, &vfsbuf))
487 			break;
488 		val = vfsbuf.f_namemax;
489 		break;
490 
491 	case _PC_PATH_MAX:
492 	case _PC_SYMLINK_MAX:
493 		val = MAXPATHLEN;
494 		break;
495 
496 	case _PC_PIPE_BUF:
497 		val = PIPE_BUF;
498 		break;
499 
500 	case _PC_NO_TRUNC:
501 		if (vp->v_vfsp->vfs_flag & VFS_NOTRUNC)
502 			val = 1;	/* NOTRUNC is enabled for vp */
503 		else
504 			val = (ulong_t)-1;
505 		break;
506 
507 	case _PC_VDISABLE:
508 		val = _POSIX_VDISABLE;
509 		break;
510 
511 	case _PC_CHOWN_RESTRICTED:
512 		if (rstchown)
513 			val = rstchown; /* chown restricted enabled */
514 		else
515 			val = (ulong_t)-1;
516 		break;
517 
518 	case _PC_FILESIZEBITS:
519 
520 		/*
521 		 * If ever we come here it means that underlying file system
522 		 * does not recognise the command and therefore this
523 		 * configurable limit cannot be determined. We return -1
524 		 * and don't change errno.
525 		 */
526 
527 		val = (ulong_t)-1;    /* large file support */
528 		break;
529 
530 	case _PC_ACL_ENABLED:
531 		val = 0;
532 		break;
533 
534 	case _PC_CASE_BEHAVIOR:
535 		val = _CASE_SENSITIVE;
536 		if (vfs_has_feature(vp->v_vfsp, VFSFT_CASEINSENSITIVE) == 1)
537 			val |= _CASE_INSENSITIVE;
538 		if (vfs_has_feature(vp->v_vfsp, VFSFT_NOCASESENSITIVE) == 1)
539 			val &= ~_CASE_SENSITIVE;
540 		break;
541 
542 	case _PC_SATTR_ENABLED:
543 	case _PC_SATTR_EXISTS:
544 		val = 0;
545 		break;
546 
547 	case _PC_ACCESS_FILTERING:
548 		val = 0;
549 		break;
550 
551 	default:
552 		error = EINVAL;
553 		break;
554 	}
555 
556 	if (error == 0)
557 		*valp = val;
558 	return (error);
559 }
560 
561 /*
562  * Dispose of a page.
563  */
564 /* ARGSUSED */
565 void
566 fs_dispose(struct vnode *vp, page_t *pp, int fl, int dn, struct cred *cr,
567     caller_context_t *ct)
568 {
569 
570 	ASSERT(fl == B_FREE || fl == B_INVAL);
571 
572 	if (fl == B_FREE)
573 		page_free(pp, dn);
574 	else
575 		page_destroy(pp, dn);
576 }
577 
/*
 * Dispose routine that must never be called: it ignores its arguments
 * and raises a CE_PANIC-level cmn_err() naming itself.
 */
/* ARGSUSED */
void
fs_nodispose(struct vnode *vp, page_t *pp, int fl, int dn, struct cred *cr,
    caller_context_t *ct)
{
	cmn_err(CE_PANIC, "fs_nodispose invoked");
}
585 
586 /*
587  * fabricate acls for file systems that do not support acls.
588  */
589 /* ARGSUSED */
590 int
591 fs_fab_acl(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
592     caller_context_t *ct)
593 {
594 	aclent_t	*aclentp;
595 	struct vattr	vattr;
596 	int		error;
597 	size_t		aclsize;
598 
599 	vsecattr->vsa_aclcnt	= 0;
600 	vsecattr->vsa_aclentsz	= 0;
601 	vsecattr->vsa_aclentp	= NULL;
602 	vsecattr->vsa_dfaclcnt	= 0;	/* Default ACLs are not fabricated */
603 	vsecattr->vsa_dfaclentp	= NULL;
604 
605 	vattr.va_mask = AT_MODE | AT_UID | AT_GID;
606 	if (error = VOP_GETATTR(vp, &vattr, 0, cr, ct))
607 		return (error);
608 
609 	if (vsecattr->vsa_mask & (VSA_ACLCNT | VSA_ACL)) {
610 		aclsize = 4 * sizeof (aclent_t);
611 		vsecattr->vsa_aclcnt	= 4; /* USER, GROUP, OTHER, and CLASS */
612 		vsecattr->vsa_aclentp = kmem_zalloc(aclsize, KM_SLEEP);
613 		aclentp = vsecattr->vsa_aclentp;
614 
615 		aclentp->a_type = USER_OBJ;	/* Owner */
616 		aclentp->a_perm = ((ushort_t)(vattr.va_mode & 0700)) >> 6;
617 		aclentp->a_id = vattr.va_uid;   /* Really undefined */
618 		aclentp++;
619 
620 		aclentp->a_type = GROUP_OBJ;    /* Group */
621 		aclentp->a_perm = ((ushort_t)(vattr.va_mode & 0070)) >> 3;
622 		aclentp->a_id = vattr.va_gid;   /* Really undefined */
623 		aclentp++;
624 
625 		aclentp->a_type = OTHER_OBJ;    /* Other */
626 		aclentp->a_perm = vattr.va_mode & 0007;
627 		aclentp->a_id = (gid_t)-1;	/* Really undefined */
628 		aclentp++;
629 
630 		aclentp->a_type = CLASS_OBJ;    /* Class */
631 		aclentp->a_perm = (ushort_t)(0007);
632 		aclentp->a_id = (gid_t)-1;	/* Really undefined */
633 	} else if (vsecattr->vsa_mask & (VSA_ACECNT | VSA_ACE)) {
634 		VERIFY(0 == acl_trivial_create(vattr.va_mode,
635 		    (vp->v_type == VDIR), (ace_t **)&vsecattr->vsa_aclentp,
636 		    &vsecattr->vsa_aclcnt));
637 		vsecattr->vsa_aclentsz = vsecattr->vsa_aclcnt * sizeof (ace_t);
638 	}
639 
640 	return (error);
641 }
642 
643 /*
644  * Common code for implementing DOS share reservations
645  */
646 /* ARGSUSED4 */
647 int
648 fs_shrlock(struct vnode *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
649     caller_context_t *ct)
650 {
651 	int error;
652 
653 	/*
654 	 * Make sure that the file was opened with permissions appropriate
655 	 * for the request, and make sure the caller isn't trying to sneak
656 	 * in an NBMAND request.
657 	 */
658 	if (cmd == F_SHARE) {
659 		if (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
660 		    ((shr->s_access & F_WRACC) && (flag & FWRITE) == 0))
661 			return (EBADF);
662 		if (shr->s_access & (F_RMACC | F_MDACC))
663 			return (EINVAL);
664 		if (shr->s_deny & (F_MANDDNY | F_RMDNY))
665 			return (EINVAL);
666 	}
667 	if (cmd == F_SHARE_NBMAND) {
668 		/* make sure nbmand is allowed on the file */
669 		if (!vp->v_vfsp ||
670 		    !(vp->v_vfsp->vfs_flag & VFS_NBMAND)) {
671 			return (EINVAL);
672 		}
673 		if (vp->v_type != VREG) {
674 			return (EINVAL);
675 		}
676 	}
677 
678 	nbl_start_crit(vp, RW_WRITER);
679 
680 	switch (cmd) {
681 
682 	case F_SHARE_NBMAND:
683 		shr->s_deny |= F_MANDDNY;
684 		/*FALLTHROUGH*/
685 	case F_SHARE:
686 		error = add_share(vp, shr);
687 		break;
688 
689 	case F_UNSHARE:
690 		error = del_share(vp, shr);
691 		break;
692 
693 	case F_HASREMOTELOCKS:
694 		/*
695 		 * We are overloading this command to refer to remote
696 		 * shares as well as remote locks, despite its name.
697 		 */
698 		shr->s_access = shr_has_remote_shares(vp, shr->s_sysid);
699 		error = 0;
700 		break;
701 
702 	default:
703 		error = EINVAL;
704 		break;
705 	}
706 
707 	nbl_end_crit(vp);
708 	return (error);
709 }
710 
/*
 * vnode event handler for file systems that do not support vnode events.
 * ASSERTs that vp is not NULL, ignores the remaining arguments and
 * returns ENOTSUP.
 */
/*ARGSUSED1*/
int
fs_vnevent_nosupport(vnode_t *vp, vnevent_t e, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	ASSERT(vp != NULL);
	return (ENOTSUP);
}
719 
/*
 * vnode event handler that accepts every event.  ASSERTs that vp is not
 * NULL, ignores the remaining arguments and returns 0.
 */
/*ARGSUSED1*/
int
fs_vnevent_support(vnode_t *vp, vnevent_t e, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	ASSERT(vp != NULL);
	return (0);
}
728 
729 /*
730  * return 1 for non-trivial ACL.
731  *
732  * NB: It is not necessary for the caller to VOP_RWLOCK since
733  *	we only issue VOP_GETSECATTR.
734  *
735  * Returns 0 == trivial
736  *         1 == NOT Trivial
737  *	   <0 could not determine.
738  */
739 int
740 fs_acl_nontrivial(vnode_t *vp, cred_t *cr)
741 {
742 	ulong_t		acl_styles;
743 	ulong_t		acl_flavor;
744 	vsecattr_t	vsecattr;
745 	int		error;
746 	int		isnontrivial;
747 
748 	/* determine the forms of ACLs maintained */
749 	error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr, NULL);
750 
751 	/* clear bits we don't understand and establish default acl_style */
752 	acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED);
753 	if (error || (acl_styles == 0))
754 		acl_styles = _ACL_ACLENT_ENABLED;
755 
756 	vsecattr.vsa_aclentp = NULL;
757 	vsecattr.vsa_dfaclentp = NULL;
758 	vsecattr.vsa_aclcnt = 0;
759 	vsecattr.vsa_dfaclcnt = 0;
760 
761 	while (acl_styles) {
762 		/* select one of the styles as current flavor */
763 		acl_flavor = 0;
764 		if (acl_styles & _ACL_ACLENT_ENABLED) {
765 			acl_flavor = _ACL_ACLENT_ENABLED;
766 			vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
767 		} else if (acl_styles & _ACL_ACE_ENABLED) {
768 			acl_flavor = _ACL_ACE_ENABLED;
769 			vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE;
770 		}
771 
772 		ASSERT(vsecattr.vsa_mask && acl_flavor);
773 		error = VOP_GETSECATTR(vp, &vsecattr, 0, cr, NULL);
774 		if (error == 0)
775 			break;
776 
777 		/* that flavor failed */
778 		acl_styles &= ~acl_flavor;
779 	}
780 
781 	/* if all styles fail then assume trivial */
782 	if (acl_styles == 0)
783 		return (0);
784 
785 	/* process the flavor that worked */
786 	isnontrivial = 0;
787 	if (acl_flavor & _ACL_ACLENT_ENABLED) {
788 		if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES)
789 			isnontrivial = 1;
790 		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
791 			kmem_free(vsecattr.vsa_aclentp,
792 			    vsecattr.vsa_aclcnt * sizeof (aclent_t));
793 		if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL)
794 			kmem_free(vsecattr.vsa_dfaclentp,
795 			    vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
796 	}
797 	if (acl_flavor & _ACL_ACE_ENABLED) {
798 		isnontrivial = ace_trivial(vsecattr.vsa_aclentp,
799 		    vsecattr.vsa_aclcnt);
800 
801 		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
802 			kmem_free(vsecattr.vsa_aclentp,
803 			    vsecattr.vsa_aclcnt * sizeof (ace_t));
804 		/* ACE has no vsecattr.vsa_dfaclcnt */
805 	}
806 	return (isnontrivial);
807 }
808 
809 /*
810  * Check whether we need a retry to recover from STALE error.
811  */
812 int
813 fs_need_estale_retry(int retry_count)
814 {
815 	if (retry_count < fs_estale_retry)
816 		return (1);
817 	else
818 		return (0);
819 }
820 
821 
/*
 * Scanning routine registered by the anti-virus scanner; NULL until
 * fs_vscan_register() is called.  Invoked from fs_vscan().
 */
static int (*fs_av_scan)(vnode_t *, cred_t *, int) = NULL;

/*
 * Routine for anti-virus scanner to call to register its scanning routine.
 * The pointer is stored as given, replacing any previous registration;
 * no locking is done here.
 */
void
fs_vscan_register(int (*av_scan)(vnode_t *, cred_t *, int))
{
	fs_av_scan = av_scan;
}
832 
833 /*
834  * Routine for file systems to call to initiate anti-virus scanning.
835  * Scanning will only be done on REGular files (currently).
836  */
837 int
838 fs_vscan(vnode_t *vp, cred_t *cr, int async)
839 {
840 	int ret = 0;
841 
842 	if (fs_av_scan && vp->v_type == VREG)
843 		ret = (*fs_av_scan)(vp, cr, async);
844 
845 	return (ret);
846 }
847 
848 /*
849  * support functions for reparse point
850  */
851 /*
852  * reparse_vnode_parse
853  *
854  * Read the symlink data of a reparse point specified by the vnode
855  * and return the reparse data as name-value pair in the nvlist.
856  */
857 int
858 reparse_vnode_parse(vnode_t *vp, nvlist_t *nvl)
859 {
860 	int err;
861 	char *lkdata;
862 	struct uio uio;
863 	struct iovec iov;
864 
865 	if (vp == NULL || nvl == NULL)
866 		return (EINVAL);
867 
868 	lkdata = kmem_alloc(MAXREPARSELEN, KM_SLEEP);
869 
870 	/*
871 	 * Set up io vector to read sym link data
872 	 */
873 	iov.iov_base = lkdata;
874 	iov.iov_len = MAXREPARSELEN;
875 	uio.uio_iov = &iov;
876 	uio.uio_iovcnt = 1;
877 	uio.uio_segflg = UIO_SYSSPACE;
878 	uio.uio_extflg = UIO_COPY_CACHED;
879 	uio.uio_loffset = (offset_t)0;
880 	uio.uio_resid = MAXREPARSELEN;
881 
882 	if ((err = VOP_READLINK(vp, &uio, kcred, NULL)) == 0) {
883 		*(lkdata + MAXREPARSELEN - uio.uio_resid) = '\0';
884 		err = reparse_parse(lkdata, nvl);
885 	}
886 	kmem_free(lkdata, MAXREPARSELEN);	/* done with lkdata */
887 
888 	return (err);
889 }
890 
/*
 * One-time setup for reparse point support: initializes the mutex that
 * protects the cached reparsed door handle.
 */
void
reparse_point_init()
{
	mutex_init(&reparsed_door_lock, NULL, MUTEX_DEFAULT, NULL);
}
896 
897 static door_handle_t
898 reparse_door_get_handle()
899 {
900 	door_handle_t dh;
901 
902 	mutex_enter(&reparsed_door_lock);
903 	if ((dh = reparsed_door) == NULL) {
904 		if (door_ki_open(REPARSED_DOOR, &reparsed_door) != 0) {
905 			reparsed_door = NULL;
906 			dh = NULL;
907 		} else
908 			dh = reparsed_door;
909 	}
910 	mutex_exit(&reparsed_door_lock);
911 	return (dh);
912 }
913 
/*
 * Forget the cached reparsed door handle so that the next call to
 * reparse_door_get_handle() reopens the door.
 *
 * NOTE(review): the handle is only cleared, not released with
 * door_ki_rele(); whether the reference obtained by door_ki_open() is
 * dropped elsewhere cannot be told from this file -- confirm.
 */
static void
reparse_door_reset_handle()
{
	mutex_enter(&reparsed_door_lock);
	reparsed_door = NULL;
	mutex_exit(&reparsed_door_lock);
}
921 
/*
 * reparse_kderef
 *
 * Accepts the service-specific item from the reparse point and returns
 * the service-specific data requested.  The caller specifies the size of
 * the buffer provided via *bufsize; the routine will fail with EOVERFLOW
 * if the results will not fit in the buffer, in which case *bufsize will
 * contain the number of bytes needed to hold the results.
 *
 *	svc_type, svc_data	sent to the door server as the single
 *				string "svc_type:svc_data"
 *	buf			caller's buffer; used for the results and,
 *				when large enough, for the door arguments
 *	bufsize			in: size of buf; out: length of the result
 *				(updated only when the door call succeeds
 *				and the server reports status 0)
 *
 * Returns 0 on success and updates *bufsize with the length of the actual
 * result.  Otherwise returns an error code: EINVAL for a NULL argument,
 * EBADF when no door handle can be obtained, the door upcall error, the
 * non-zero status reported by the server, or EOVERFLOW.
 */
int
reparse_kderef(const char *svc_type, const char *svc_data, char *buf,
    size_t *bufsize)
{
	int err, retries, need_free, retried_doorhd;
	size_t dlen, res_len;
	char *darg;
	door_arg_t door_args;
	reparsed_door_res_t *resp;
	door_handle_t rp_door;

	if (svc_type == NULL || svc_data == NULL || buf == NULL ||
	    bufsize == NULL)
		return (EINVAL);

	/* get reparsed's door handle */
	if ((rp_door = reparse_door_get_handle()) == NULL)
		return (EBADF);

	/*
	 * setup buffer for door_call args and results
	 * (dlen covers both strings, the ':' separator and the NUL)
	 */
	dlen = strlen(svc_type) + strlen(svc_data) + 2;
	if (*bufsize < dlen) {
		darg = kmem_alloc(dlen, KM_SLEEP);
		need_free = 1;
	} else {
		darg = buf;	/* use same buffer for door's args & results */
		need_free = 0;
	}

	/* build argument string of door call */
	(void) snprintf(darg, dlen, "%s:%s", svc_type, svc_data);

	/* setup args for door call */
	door_args.data_ptr = darg;
	door_args.data_size = dlen;
	door_args.desc_ptr = NULL;
	door_args.desc_num = 0;
	door_args.rbuf = buf;
	door_args.rsize = *bufsize;

	/*
	 * do the door_call
	 *
	 * EAGAIN/EINTR are retried after a one second delay until
	 * REPARSED_DOORCALL_MAX_RETRY is reached.  EBADF drops the cached
	 * handle and, once only, reopens the door and tries again.  Any
	 * other error ends the loop.
	 */
	retried_doorhd = 0;
	retries = 0;
	door_ki_hold(rp_door);
	while ((err = door_ki_upcall_limited(rp_door, &door_args,
	    NULL, SIZE_MAX, 0)) != 0) {
		if (err == EAGAIN || err == EINTR) {
			if (++retries < REPARSED_DOORCALL_MAX_RETRY) {
				delay(SEC_TO_TICK(1));
				continue;
			}
		} else if (err == EBADF) {
			/* door server goes away... */
			reparse_door_reset_handle();

			if (retried_doorhd == 0) {
				/* drop our hold on the dead handle first */
				door_ki_rele(rp_door);
				retried_doorhd++;
				rp_door = reparse_door_get_handle();
				if (rp_door != NULL) {
					door_ki_hold(rp_door);
					continue;
				}
			}
		}
		break;
	}

	/* rp_door is NULL only if reopening the door failed above */
	if (rp_door)
		door_ki_rele(rp_door);

	if (need_free)
		kmem_free(darg, dlen);		/* done with args buffer */

	if (err != 0)
		return (err);

	resp = (reparsed_door_res_t *)door_args.rbuf;
	if ((err = resp->res_status) == 0) {
		/*
		 * The length of the results has to be saved before the
		 * bcopy below, since that can be an overlapping copy which
		 * overwrites the reparsed_door_res_t structure at the
		 * beginning of the buffer.
		 */
		res_len = (size_t)resp->res_len;

		/* deref call is ok */
		if (res_len > *bufsize)
			err = EOVERFLOW;
		else
			bcopy(resp->res_data, buf, res_len);
		/* on EOVERFLOW this tells the caller how much is needed */
		*bufsize = res_len;
	}
	/*
	 * If the results came back in a buffer other than the caller's,
	 * free it using the size reported in door_args.rsize.
	 */
	if (door_args.rbuf != buf)
		kmem_free(door_args.rbuf, door_args.rsize);

	return (err);
}
1033