xref: /illumos-gate/usr/src/uts/common/fs/fs_subr.c (revision 4763305e3243687c189d755d737d52205b2614ed)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22 /*	All Rights Reserved	*/
23 
24 
25 /*
26  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
27  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
28  * Copyright 2017 Joyent, Inc.
29  * Copyright 2024 Oxide Computer Company
30  */
31 
32 /*
33  * Generic vnode operations.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/errno.h>
40 #include <sys/fcntl.h>
41 #include <sys/flock.h>
42 #include <sys/statvfs.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/proc.h>
46 #include <sys/user.h>
47 #include <sys/unistd.h>
48 #include <sys/cred.h>
49 #include <sys/poll.h>
50 #include <sys/debug.h>
51 #include <sys/cmn_err.h>
52 #include <sys/stream.h>
53 #include <fs/fs_subr.h>
54 #include <fs/fs_reparse.h>
55 #include <sys/door.h>
56 #include <sys/acl.h>
57 #include <sys/share.h>
58 #include <sys/file.h>
59 #include <sys/kmem.h>
60 #include <sys/file.h>
61 #include <sys/nbmlock.h>
62 #include <acl/acl_common.h>
63 #include <sys/pathname.h>
64 
65 /* required for fs_reject_epoll */
66 #include <sys/poll_impl.h>
67 
68 static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);
69 
70 /*
71  * Tunable to limit the number of retry to recover from STALE error.
72  */
73 int fs_estale_retry = 5;
74 
75 /*
76  * supports for reparse point door upcall
77  */
78 static door_handle_t reparsed_door;
79 static kmutex_t reparsed_door_lock;
80 
/*
 * Default entry point for operations the file system does not implement
 * at all; unconditionally fails with ENOSYS.
 */
int
fs_nosys()
{
	return (ENOSYS);
}
89 
90 /*
91  * This is the version of syncfs that a file system gets by default if it
92  * doesn't support the entry point and has a valid sync routine.
93  */
94 int
fs_nosys_syncfs(vfs_t * vfsp,uint64_t flags,cred_t * cr)95 fs_nosys_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr)
96 {
97 	return (ENOSYS);
98 }
99 
/*
 * Default entry point for operations that are invalid on this vnode;
 * unconditionally fails with EINVAL.
 */
int
fs_inval()
{
	return (EINVAL);
}
108 
/*
 * Default entry point for directory-only operations applied to a
 * non-directory; unconditionally fails with ENOTDIR.
 */
int
fs_notdir()
{
	return (ENOTDIR);
}
117 
/*
 * Free the file system specific resources. For the file systems that
 * do not support the forced unmount, it will be a nop function.
 */

/*ARGSUSED*/
void
fs_freevfs(vfs_t *vfsp)
{
	/* Intentionally empty: no per-vfs resources to release here. */
}
128 
129 /* ARGSUSED */
130 int
fs_nosys_map(struct vnode * vp,offset_t off,struct as * as,caddr_t * addrp,size_t len,uchar_t prot,uchar_t maxprot,uint_t flags,struct cred * cr,caller_context_t * ct)131 fs_nosys_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
132     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
133     caller_context_t *ct)
134 {
135 	return (ENOSYS);
136 }
137 
138 /* ARGSUSED */
139 int
fs_nosys_addmap(struct vnode * vp,offset_t off,struct as * as,caddr_t addr,size_t len,uchar_t prot,uchar_t maxprot,uint_t flags,struct cred * cr,caller_context_t * ct)140 fs_nosys_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
141     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
142     caller_context_t *ct)
143 {
144 	return (ENOSYS);
145 }
146 
147 /* ARGSUSED */
148 int
fs_nosys_poll(vnode_t * vp,short events,int anyyet,short * reventsp,struct pollhead ** phpp,caller_context_t * ct)149 fs_nosys_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
150     struct pollhead **phpp, caller_context_t *ct)
151 {
152 	return (ENOSYS);
153 }
154 
155 
156 /*
157  * The file system has nothing to sync to disk.  However, the
158  * VFS_SYNC operation must not fail.
159  */
160 /* ARGSUSED */
161 int
fs_sync(struct vfs * vfspp,short flag,cred_t * cr)162 fs_sync(struct vfs *vfspp, short flag, cred_t *cr)
163 {
164 	return (0);
165 }
166 
167 /*
168  * This should be used for file systems which do not need to support any kind of
169  * sync(2) style operation.
170  */
171 int
fs_syncfs_nop(vfs_t * vfspp,uint64_t flag,cred_t * cr)172 fs_syncfs_nop(vfs_t *vfspp, uint64_t flag, cred_t *cr)
173 {
174 	return (0);
175 }
176 
177 /*
178  * Does nothing but VOP_FSYNC must not fail.
179  */
180 /* ARGSUSED */
181 int
fs_fsync(vnode_t * vp,int syncflag,cred_t * cr,caller_context_t * ct)182 fs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
183 {
184 	return (0);
185 }
186 
187 /*
188  * Does nothing but VOP_PUTPAGE must not fail.
189  */
190 /* ARGSUSED */
191 int
fs_putpage(vnode_t * vp,offset_t off,size_t len,int flags,cred_t * cr,caller_context_t * ctp)192 fs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
193     caller_context_t *ctp)
194 {
195 	return (0);
196 }
197 
198 /*
199  * Does nothing but VOP_IOCTL must not fail.
200  */
201 /* ARGSUSED */
202 int
fs_ioctl(vnode_t * vp,int com,intptr_t data,int flag,cred_t * cred,int * rvalp)203 fs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
204     int *rvalp)
205 {
206 	return (0);
207 }
208 
209 /*
210  * Read/write lock/unlock.  Does nothing.
211  */
212 /* ARGSUSED */
213 int
fs_rwlock(vnode_t * vp,int write_lock,caller_context_t * ctp)214 fs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
215 {
216 	return (-1);
217 }
218 
/* ARGSUSED */
void
fs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	/* Paired with fs_rwlock(); there is no lock state to release. */
}
224 
225 /*
226  * Compare two vnodes.
227  */
228 /*ARGSUSED2*/
229 int
fs_cmp(vnode_t * vp1,vnode_t * vp2,caller_context_t * ct)230 fs_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
231 {
232 	return (vp1 == vp2);
233 }
234 
235 /*
236  * No-op seek operation.
237  */
238 /* ARGSUSED */
239 int
fs_seek(vnode_t * vp,offset_t ooff,offset_t * noffp,caller_context_t * ct)240 fs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
241 {
242 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
243 }
244 
/*
 * File and record locking.
 *
 * Generic VOP_FRLOCK implementation: translate the fcntl(2)-level
 * command into a reclock() command word (frcmd), enforce non-blocking
 * mandatory (NBMAND) lock restrictions, and serialize lock requests
 * against I/O using the nbl critical region when required.  OFD-style
 * and BSD flock-style commands are validated here but the actual lock
 * bookkeeping for them is done by the caller (skip_lock).
 */
/* ARGSUSED */
int
fs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	int frcmd;			/* command word for reclock() */
	int nlmid;
	int error = 0;
	boolean_t skip_lock = B_FALSE;	/* validated, but no reclock() call */
	flk_callback_t serialize_callback;
	int serialize = 0;		/* inside nbl critical region? */
	v_mode_t mode;

	switch (cmd) {

	case F_GETLK:
	case F_O_GETLK:
		if (flag & F_REMOTELOCK) {
			frcmd = RCMDLCK;
		} else if (flag & F_PXFSLOCK) {
			frcmd = PCMDLCK;
		} else {
			/* Local request: stamp in the caller's identity. */
			frcmd = 0;
			bfp->l_pid = ttoproc(curthread)->p_pid;
			bfp->l_sysid = 0;
		}
		break;

	case F_OFD_GETLK:
		/*
		 * TBD we do not support remote OFD locks at this time.
		 */
		if (flag & (F_REMOTELOCK | F_PXFSLOCK)) {
			error = EOPNOTSUPP;
			goto done;
		}
		skip_lock = B_TRUE;
		break;

	case F_SETLK_NBMAND:
		/*
		 * Are NBMAND locks allowed on this file?
		 */
		if (!vp->v_vfsp ||
		    !(vp->v_vfsp->vfs_flag & VFS_NBMAND)) {
			error = EINVAL;
			goto done;
		}
		/* NBMAND locks only make sense on regular files. */
		if (vp->v_type != VREG) {
			error = EINVAL;
			goto done;
		}
		/*FALLTHROUGH*/

	case F_SETLK:
		if (flag & F_REMOTELOCK) {
			frcmd = SETFLCK|RCMDLCK;
		} else if (flag & F_PXFSLOCK) {
			frcmd = SETFLCK|PCMDLCK;
		} else {
			frcmd = SETFLCK;
			bfp->l_pid = ttoproc(curthread)->p_pid;
			bfp->l_sysid = 0;
		}
		if (cmd == F_SETLK_NBMAND &&
		    (bfp->l_type == F_RDLCK || bfp->l_type == F_WRLCK)) {
			frcmd |= NBMLCK;
		}

		if (nbl_need_check(vp)) {
			/*
			 * Enter the critical region so the mapping check
			 * below and the reclock() call see a consistent
			 * view of the vnode.
			 */
			nbl_start_crit(vp, RW_WRITER);
			serialize = 1;
			if (frcmd & NBMLCK) {
				/*
				 * An NBMAND lock may not be granted while
				 * the file is mapped for a conflicting
				 * access mode.
				 */
				mode = (bfp->l_type == F_RDLCK) ?
				    V_READ : V_RDANDWR;
				if (vn_is_mapped(vp, mode)) {
					error = EAGAIN;
					goto done;
				}
			}
		}
		break;

	case F_SETLKW:
		if (flag & F_REMOTELOCK) {
			frcmd = SETFLCK|SLPFLCK|RCMDLCK;
		} else if (flag & F_PXFSLOCK) {
			frcmd = SETFLCK|SLPFLCK|PCMDLCK;
		} else {
			frcmd = SETFLCK|SLPFLCK;
			bfp->l_pid = ttoproc(curthread)->p_pid;
			bfp->l_sysid = 0;
		}

		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_WRITER);
			serialize = 1;
		}
		break;

	case F_OFD_SETLK:
	case F_OFD_SETLKW:
	case F_FLOCK:
	case F_FLOCKW:
		/*
		 * TBD we do not support remote OFD locks at this time.
		 */
		if (flag & (F_REMOTELOCK | F_PXFSLOCK)) {
			error = EOPNOTSUPP;
			goto done;
		}
		skip_lock = B_TRUE;
		break;

	case F_HASREMOTELOCKS:
		nlmid = GETNLMID(bfp->l_sysid);
		if (nlmid != 0) {	/* booted as a cluster */
			l_has_rmt(bfp) =
			    cl_flk_has_remote_locks_for_nlmid(vp, nlmid);
		} else {		/* not booted as a cluster */
			l_has_rmt(bfp) = flk_has_remote_locks(vp);
		}

		goto done;

	default:
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is a blocking lock request and we're serializing lock
	 * requests, modify the callback list to leave the critical region
	 * while we're waiting for the lock.
	 */

	if (serialize && (frcmd & SLPFLCK) != 0) {
		flk_add_callback(&serialize_callback,
		    frlock_serialize_blocked, vp, flk_cbp);
		flk_cbp = &serialize_callback;
	}

	if (!skip_lock)
		error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp);

	/* Undo the callback-list modification made above. */
	if (serialize && (frcmd & SLPFLCK) != 0)
		flk_del_callback(&serialize_callback);

done:
	if (serialize)
		nbl_end_crit(vp);

	return (error);
}
402 
403 /*
404  * Callback when a lock request blocks and we are serializing requests.  If
405  * before sleeping, leave the critical region.  If after wakeup, reenter
406  * the critical region.
407  */
408 
409 static callb_cpr_t *
frlock_serialize_blocked(flk_cb_when_t when,void * infop)410 frlock_serialize_blocked(flk_cb_when_t when, void *infop)
411 {
412 	vnode_t *vp = (vnode_t *)infop;
413 
414 	if (when == FLK_BEFORE_SLEEP)
415 		nbl_end_crit(vp);
416 	else {
417 		nbl_start_crit(vp, RW_WRITER);
418 	}
419 
420 	return (NULL);
421 }
422 
423 /*
424  * Allow any flags.
425  */
426 /* ARGSUSED */
427 int
fs_setfl(vnode_t * vp,int oflags,int nflags,cred_t * cr,caller_context_t * ct)428 fs_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct)
429 {
430 	return (0);
431 }
432 
433 /*
434  * Unlike poll(2), epoll should reject attempts to add normal files or
435  * directories to a given handle.  Most non-pseudo filesystems rely on
436  * fs_poll() as their implementation of polling behavior.  Exceptions to that
437  * rule (ufs) can use fs_reject_epoll(), so they don't require access to the
438  * inner details of poll.  Potential race conditions related to the poll module
439  * being loaded are avoided by implementing the check here in genunix.
440  */
441 boolean_t
fs_reject_epoll()442 fs_reject_epoll()
443 {
444 	/* Check if the currently-active pollcache is epoll-enabled. */
445 	return (curthread->t_pollcache != NULL &&
446 	    (curthread->t_pollcache->pc_flag & PC_EPOLL) != 0);
447 }
448 
449 /* ARGSUSED */
450 int
fs_poll(vnode_t * vp,short events,int anyyet,short * reventsp,struct pollhead ** phpp,caller_context_t * ct)451 fs_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
452     struct pollhead **phpp, caller_context_t *ct)
453 {
454 	/*
455 	 * Regular filesystems should reject epollers.  On the off chance that
456 	 * a non-epoll consumer expresses the desire for edge-triggered
457 	 * polling, we reject them too.  Yes, the expected error for this
458 	 * really is EPERM.
459 	 */
460 	if (fs_reject_epoll() || (events & POLLET) != 0) {
461 		return (EPERM);
462 	}
463 
464 	*reventsp = 0;
465 	if (events & POLLIN)
466 		*reventsp |= POLLIN;
467 	if (events & POLLRDNORM)
468 		*reventsp |= POLLRDNORM;
469 	if (events & POLLRDBAND)
470 		*reventsp |= POLLRDBAND;
471 	if (events & POLLOUT)
472 		*reventsp |= POLLOUT;
473 	if (events & POLLWRBAND)
474 		*reventsp |= POLLWRBAND;
475 
476 	return (0);
477 }
478 
/*
 * POSIX pathconf() support.
 *
 * Generic VOP_PATHCONF implementation supplying system-wide defaults for
 * the standard _PC_* queries.  File systems with their own limits handle
 * the relevant commands themselves and fall back here for the rest.
 * On success the value is stored in *valp; unknown commands get EINVAL.
 */
/* ARGSUSED */
int
fs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	ulong_t val;
	int error = 0;
	struct statvfs64 vfsbuf;

	switch (cmd) {

	case _PC_LINK_MAX:
		val = MAXLINK;
		break;

	case _PC_MAX_CANON:
		val = MAX_CANON;
		break;

	case _PC_MAX_INPUT:
		val = MAX_INPUT;
		break;

	case _PC_NAME_MAX:
		/* Ask the file system itself for its name-length limit. */
		bzero(&vfsbuf, sizeof (vfsbuf));
		if (error = VFS_STATVFS(vp->v_vfsp, &vfsbuf))
			break;
		val = vfsbuf.f_namemax;
		break;

	case _PC_PATH_MAX:
	case _PC_SYMLINK_MAX:
		val = MAXPATHLEN;
		break;

	case _PC_PIPE_BUF:
		val = PIPE_BUF;
		break;

	case _PC_NO_TRUNC:
		if (vp->v_vfsp->vfs_flag & VFS_NOTRUNC)
			val = 1;	/* NOTRUNC is enabled for vp */
		else
			val = (ulong_t)-1;	/* unsupported: -1 per POSIX */
		break;

	case _PC_VDISABLE:
		val = _POSIX_VDISABLE;
		break;

	case _PC_CHOWN_RESTRICTED:
		if (rstchown)
			val = rstchown; /* chown restricted enabled */
		else
			val = (ulong_t)-1;
		break;

	case _PC_FILESIZEBITS:

		/*
		 * If ever we come here it means that underlying file system
		 * does not recognise the command and therefore this
		 * configurable limit cannot be determined. We return -1
		 * and don't change errno.
		 */

		val = (ulong_t)-1;    /* large file support */
		break;

	case _PC_ACL_ENABLED:
		/* No ACL support by default. */
		val = 0;
		break;

	case _PC_CASE_BEHAVIOR:
		/*
		 * Report case sensitivity from the vfs feature flags;
		 * default is case-sensitive only.
		 */
		val = _CASE_SENSITIVE;
		if (vfs_has_feature(vp->v_vfsp, VFSFT_CASEINSENSITIVE) == 1)
			val |= _CASE_INSENSITIVE;
		if (vfs_has_feature(vp->v_vfsp, VFSFT_NOCASESENSITIVE) == 1)
			val &= ~_CASE_SENSITIVE;
		break;

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		/* System attributes not supported by default. */
		val = 0;
		break;

	case _PC_ACCESS_FILTERING:
		val = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0)
		*valp = val;
	return (error);
}
581 
582 /*
583  * Dispose of a page.
584  */
585 /* ARGSUSED */
586 void
fs_dispose(struct vnode * vp,page_t * pp,int fl,int dn,struct cred * cr,caller_context_t * ct)587 fs_dispose(struct vnode *vp, page_t *pp, int fl, int dn, struct cred *cr,
588     caller_context_t *ct)
589 {
590 
591 	ASSERT(fl == B_FREE || fl == B_INVAL);
592 
593 	if (fl == B_FREE)
594 		page_free(pp, dn);
595 	else
596 		page_destroy(pp, dn);
597 }
598 
599 /* ARGSUSED */
600 void
fs_nodispose(struct vnode * vp,page_t * pp,int fl,int dn,struct cred * cr,caller_context_t * ct)601 fs_nodispose(struct vnode *vp, page_t *pp, int fl, int dn, struct cred *cr,
602     caller_context_t *ct)
603 {
604 	cmn_err(CE_PANIC, "fs_nodispose invoked");
605 }
606 
/*
 * fabricate acls for file systems that do not support acls.
 *
 * Builds a synthetic ACL for vp from its mode bits, in whichever flavor
 * the caller asked for via vsecattr->vsa_mask: a 4-entry aclent_t list
 * (USER_OBJ/GROUP_OBJ/OTHER_OBJ/CLASS_OBJ) or a trivial ace_t list.
 * Default ACLs are never fabricated.  The caller owns (and must free)
 * any list allocated into vsa_aclentp.
 */
/* ARGSUSED */
int
fs_fab_acl(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	aclent_t	*aclentp;
	struct vattr	vattr;
	int		error;
	size_t		aclsize;

	/* Start from a clean result in case we fail or fabricate nothing. */
	vsecattr->vsa_aclcnt	= 0;
	vsecattr->vsa_aclentsz	= 0;
	vsecattr->vsa_aclentp	= NULL;
	vsecattr->vsa_dfaclcnt	= 0;	/* Default ACLs are not fabricated */
	vsecattr->vsa_dfaclentp	= NULL;

	/* The fabricated ACL is derived from the file's mode and ownership. */
	vattr.va_mask = AT_MODE | AT_UID | AT_GID;
	if (error = VOP_GETATTR(vp, &vattr, 0, cr, ct))
		return (error);

	if (vsecattr->vsa_mask & (VSA_ACLCNT | VSA_ACL)) {
		/* Old-style (aclent_t) flavor. */
		aclsize = 4 * sizeof (aclent_t);
		vsecattr->vsa_aclcnt	= 4; /* USER, GROUP, OTHER, and CLASS */
		vsecattr->vsa_aclentp = kmem_zalloc(aclsize, KM_SLEEP);
		aclentp = vsecattr->vsa_aclentp;

		aclentp->a_type = USER_OBJ;	/* Owner */
		aclentp->a_perm = ((ushort_t)(vattr.va_mode & 0700)) >> 6;
		aclentp->a_id = vattr.va_uid;   /* Really undefined */
		aclentp++;

		aclentp->a_type = GROUP_OBJ;    /* Group */
		aclentp->a_perm = ((ushort_t)(vattr.va_mode & 0070)) >> 3;
		aclentp->a_id = vattr.va_gid;   /* Really undefined */
		aclentp++;

		aclentp->a_type = OTHER_OBJ;    /* Other */
		aclentp->a_perm = vattr.va_mode & 0007;
		aclentp->a_id = (gid_t)-1;	/* Really undefined */
		aclentp++;

		aclentp->a_type = CLASS_OBJ;    /* Class */
		aclentp->a_perm = (ushort_t)(0007);
		aclentp->a_id = (gid_t)-1;	/* Really undefined */
	} else if (vsecattr->vsa_mask & (VSA_ACECNT | VSA_ACE)) {
		/* NFSv4-style (ace_t) flavor: a trivial ACL from the mode. */
		VERIFY(0 == acl_trivial_create(vattr.va_mode,
		    (vp->v_type == VDIR), (ace_t **)&vsecattr->vsa_aclentp,
		    &vsecattr->vsa_aclcnt));
		vsecattr->vsa_aclentsz = vsecattr->vsa_aclcnt * sizeof (ace_t);
	}

	return (error);
}
663 
/*
 * Common code for implementing DOS share reservations
 *
 * Generic VOP_SHRLOCK: validates the request against the open mode,
 * then adds, removes, or queries share reservations for vp while inside
 * the nbl critical region so the share state is updated atomically with
 * respect to conflicting I/O checks.
 */
/* ARGSUSED4 */
int
fs_shrlock(struct vnode *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	/*
	 * Make sure that the file was opened with permissions appropriate
	 * for the request, and make sure the caller isn't trying to sneak
	 * in an NBMAND request.
	 */
	if (cmd == F_SHARE) {
		if (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
		    ((shr->s_access & F_WRACC) && (flag & FWRITE) == 0))
			return (EBADF);
		if (shr->s_access & (F_RMACC | F_MDACC))
			return (EINVAL);
		if (shr->s_deny & (F_MANDDNY | F_RMDNY))
			return (EINVAL);
	}
	if (cmd == F_SHARE_NBMAND) {
		/* make sure nbmand is allowed on the file */
		if (!vp->v_vfsp ||
		    !(vp->v_vfsp->vfs_flag & VFS_NBMAND)) {
			return (EINVAL);
		}
		/* NBMAND semantics apply to regular files only. */
		if (vp->v_type != VREG) {
			return (EINVAL);
		}
	}

	nbl_start_crit(vp, RW_WRITER);

	switch (cmd) {

	case F_SHARE_NBMAND:
		/* Mark the reservation as mandatory, then add it. */
		shr->s_deny |= F_MANDDNY;
		/*FALLTHROUGH*/
	case F_SHARE:
		error = add_share(vp, shr);
		break;

	case F_UNSHARE:
		error = del_share(vp, shr);
		break;

	case F_HASREMOTELOCKS:
		/*
		 * We are overloading this command to refer to remote
		 * shares as well as remote locks, despite its name.
		 */
		shr->s_access = shr_has_remote_shares(vp, shr->s_sysid);
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	nbl_end_crit(vp);
	return (error);
}
731 
732 /*ARGSUSED1*/
733 int
fs_vnevent_nosupport(vnode_t * vp,vnevent_t e,vnode_t * dvp,char * fnm,caller_context_t * ct)734 fs_vnevent_nosupport(vnode_t *vp, vnevent_t e, vnode_t *dvp, char *fnm,
735     caller_context_t *ct)
736 {
737 	ASSERT(vp != NULL);
738 	return (ENOTSUP);
739 }
740 
741 /*ARGSUSED1*/
742 int
fs_vnevent_support(vnode_t * vp,vnevent_t e,vnode_t * dvp,char * fnm,caller_context_t * ct)743 fs_vnevent_support(vnode_t *vp, vnevent_t e, vnode_t *dvp, char *fnm,
744     caller_context_t *ct)
745 {
746 	ASSERT(vp != NULL);
747 	return (0);
748 }
749 
/*
 * return 1 for non-trivial ACL.
 *
 * NB: It is not necessary for the caller to VOP_RWLOCK since
 *	we only issue VOP_GETSECATTR.
 *
 * Returns 0 == trivial
 *         1 == NOT Trivial
 *	   <0 could not determine.
 *
 * Probes the file system for whichever ACL flavors it supports
 * (_PC_ACL_ENABLED), fetches the ACL in the first flavor that works,
 * decides whether it is trivial (i.e. fully expressible as mode bits),
 * and frees any ACL storage the file system handed back.
 */
int
fs_acl_nontrivial(vnode_t *vp, cred_t *cr)
{
	ulong_t		acl_styles;
	ulong_t		acl_flavor;
	vsecattr_t	vsecattr;
	int		error;
	int		isnontrivial;

	/* determine the forms of ACLs maintained */
	error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr, NULL);

	/* clear bits we don't understand and establish default acl_style */
	acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED);
	if (error || (acl_styles == 0))
		acl_styles = _ACL_ACLENT_ENABLED;

	vsecattr.vsa_aclentp = NULL;
	vsecattr.vsa_dfaclentp = NULL;
	vsecattr.vsa_aclcnt = 0;
	vsecattr.vsa_dfaclcnt = 0;

	/* Try each remaining style until a VOP_GETSECATTR succeeds. */
	while (acl_styles) {
		/* select one of the styles as current flavor */
		acl_flavor = 0;
		if (acl_styles & _ACL_ACLENT_ENABLED) {
			acl_flavor = _ACL_ACLENT_ENABLED;
			vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
		} else if (acl_styles & _ACL_ACE_ENABLED) {
			acl_flavor = _ACL_ACE_ENABLED;
			vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE;
		}

		ASSERT(vsecattr.vsa_mask && acl_flavor);
		error = VOP_GETSECATTR(vp, &vsecattr, 0, cr, NULL);
		if (error == 0)
			break;

		/* that flavor failed */
		acl_styles &= ~acl_flavor;
	}

	/* if all styles fail then assume trivial */
	if (acl_styles == 0)
		return (0);

	/* process the flavor that worked */
	isnontrivial = 0;
	if (acl_flavor & _ACL_ACLENT_ENABLED) {
		/*
		 * An aclent-style ACL with more than the minimum entry
		 * count carries information beyond the mode bits.
		 */
		if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES)
			isnontrivial = 1;
		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
			kmem_free(vsecattr.vsa_aclentp,
			    vsecattr.vsa_aclcnt * sizeof (aclent_t));
		if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL)
			kmem_free(vsecattr.vsa_dfaclentp,
			    vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
	}
	if (acl_flavor & _ACL_ACE_ENABLED) {
		isnontrivial = ace_trivial(vsecattr.vsa_aclentp,
		    vsecattr.vsa_aclcnt);

		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
			kmem_free(vsecattr.vsa_aclentp,
			    vsecattr.vsa_aclcnt * sizeof (ace_t));
		/* ACE has no vsecattr.vsa_dfaclcnt */
	}
	return (isnontrivial);
}
829 
830 /*
831  * Check whether we need a retry to recover from STALE error.
832  */
833 int
fs_need_estale_retry(int retry_count)834 fs_need_estale_retry(int retry_count)
835 {
836 	if (retry_count < fs_estale_retry)
837 		return (1);
838 	else
839 		return (0);
840 }
841 
842 
843 static int (*fs_av_scan)(vnode_t *, cred_t *, int) = NULL;
844 
/*
 * Routine for anti-virus scanner to call to register its scanning routine.
 */
void
fs_vscan_register(int (*av_scan)(vnode_t *, cred_t *, int))
{
	/*
	 * NOTE(review): a plain pointer store with no locking; presumably
	 * registration happens once, before scanning traffic — confirm.
	 */
	fs_av_scan = av_scan;
}
853 
854 /*
855  * Routine for file systems to call to initiate anti-virus scanning.
856  * Scanning will only be done on REGular files (currently).
857  */
858 int
fs_vscan(vnode_t * vp,cred_t * cr,int async)859 fs_vscan(vnode_t *vp, cred_t *cr, int async)
860 {
861 	int ret = 0;
862 
863 	if (fs_av_scan && vp->v_type == VREG)
864 		ret = (*fs_av_scan)(vp, cr, async);
865 
866 	return (ret);
867 }
868 
869 /*
870  * support functions for reparse point
871  */
/*
 * reparse_vnode_parse
 *
 * Read the symlink data of a reparse point specified by the vnode
 * and return the reparse data as name-value pair in the nvlist.
 *
 * Returns 0 on success or an errno value from VOP_READLINK() or
 * reparse_parse().  The caller supplies (and owns) the nvlist.
 */
int
reparse_vnode_parse(vnode_t *vp, nvlist_t *nvl)
{
	int err;
	char *lkdata;
	struct uio uio;
	struct iovec iov;

	if (vp == NULL || nvl == NULL)
		return (EINVAL);

	/* Temporary buffer for the raw symlink contents. */
	lkdata = kmem_alloc(MAXREPARSELEN, KM_SLEEP);

	/*
	 * Set up io vector to read sym link data
	 */
	iov.iov_base = lkdata;
	iov.iov_len = MAXREPARSELEN;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)0;
	uio.uio_resid = MAXREPARSELEN;

	if ((err = VOP_READLINK(vp, &uio, kcred, NULL)) == 0) {
		/*
		 * NUL-terminate exactly after the bytes actually read
		 * (uio_resid is what remains of MAXREPARSELEN), then parse.
		 */
		*(lkdata + MAXREPARSELEN - uio.uio_resid) = '\0';
		err = reparse_parse(lkdata, nvl);
	}
	kmem_free(lkdata, MAXREPARSELEN);	/* done with lkdata */

	return (err);
}
911 
/*
 * One-time initialization of the lock protecting the cached door handle
 * used for reparse point upcalls to the reparsed daemon.
 */
void
reparse_point_init()
{
	mutex_init(&reparsed_door_lock, NULL, MUTEX_DEFAULT, NULL);
}
917 
918 static door_handle_t
reparse_door_get_handle()919 reparse_door_get_handle()
920 {
921 	door_handle_t dh;
922 
923 	mutex_enter(&reparsed_door_lock);
924 	if ((dh = reparsed_door) == NULL) {
925 		if (door_ki_open(REPARSED_DOOR, &reparsed_door) != 0) {
926 			reparsed_door = NULL;
927 			dh = NULL;
928 		} else
929 			dh = reparsed_door;
930 	}
931 	mutex_exit(&reparsed_door_lock);
932 	return (dh);
933 }
934 
/*
 * Drop the cached door handle (e.g. after the door server has gone
 * away); the next reparse_door_get_handle() call will re-open it.
 */
static void
reparse_door_reset_handle()
{
	mutex_enter(&reparsed_door_lock);
	reparsed_door = NULL;
	mutex_exit(&reparsed_door_lock);
}
942 
/*
 * reparse_kderef
 *
 * Accepts the service-specific item from the reparse point and returns
 * the service-specific data requested.  The caller specifies the size of
 * the buffer provided via *bufsz; the routine will fail with EOVERFLOW
 * if the results will not fit in the buffer, in which case, *bufsz will
 * contain the number of bytes needed to hold the results.
 *
 * if ok return 0 and update *bufsize with length of actual result
 * else return error code.
 *
 * Implementation: formats "svc_type:svc_data" and performs a door
 * upcall to the reparsed daemon, retrying on EAGAIN/EINTR and re-opening
 * the door once if the server has gone away (EBADF).
 */
int
reparse_kderef(const char *svc_type, const char *svc_data, char *buf,
    size_t *bufsize)
{
	int err, retries, need_free, retried_doorhd;
	size_t dlen, res_len;
	char *darg;
	door_arg_t door_args;
	reparsed_door_res_t *resp;
	door_handle_t rp_door;

	if (svc_type == NULL || svc_data == NULL || buf == NULL ||
	    bufsize == NULL)
		return (EINVAL);

	/* get reparsed's door handle */
	if ((rp_door = reparse_door_get_handle()) == NULL)
		return (EBADF);

	/* setup buffer for door_call args and results */
	dlen = strlen(svc_type) + strlen(svc_data) + 2;
	if (*bufsize < dlen) {
		/* caller's buffer is too small for the argument string */
		darg = kmem_alloc(dlen, KM_SLEEP);
		need_free = 1;
	} else {
		darg = buf;	/* use same buffer for door's args & results */
		need_free = 0;
	}

	/* build argument string of door call */
	(void) snprintf(darg, dlen, "%s:%s", svc_type, svc_data);

	/* setup args for door call */
	door_args.data_ptr = darg;
	door_args.data_size = dlen;
	door_args.desc_ptr = NULL;
	door_args.desc_num = 0;
	door_args.rbuf = buf;
	door_args.rsize = *bufsize;

	/* do the door_call */
	retried_doorhd = 0;
	retries = 0;
	door_ki_hold(rp_door);
	while ((err = door_ki_upcall_limited(rp_door, &door_args,
	    NULL, SIZE_MAX, 0)) != 0) {
		if (err == EAGAIN || err == EINTR) {
			/* transient failure: back off briefly and retry */
			if (++retries < REPARSED_DOORCALL_MAX_RETRY) {
				delay(SEC_TO_TICK(1));
				continue;
			}
		} else if (err == EBADF) {
			/* door server goes away... */
			reparse_door_reset_handle();

			/* re-open and retry exactly once */
			if (retried_doorhd == 0) {
				door_ki_rele(rp_door);
				retried_doorhd++;
				rp_door = reparse_door_get_handle();
				if (rp_door != NULL) {
					door_ki_hold(rp_door);
					continue;
				}
			}
		}
		break;
	}

	if (rp_door)
		door_ki_rele(rp_door);

	if (need_free)
		kmem_free(darg, dlen);		/* done with args buffer */

	if (err != 0)
		return (err);

	resp = (reparsed_door_res_t *)door_args.rbuf;
	if ((err = resp->res_status) == 0) {
		/*
		 * have to save the length of the results before the
		 * bcopy below since it's can be an overlap copy that
		 * overwrites the reparsed_door_res_t structure at
		 * the beginning of the buffer.
		 */
		res_len = (size_t)resp->res_len;

		/* deref call is ok */
		if (res_len > *bufsize)
			err = EOVERFLOW;
		else
			bcopy(resp->res_data, buf, res_len);
		*bufsize = res_len;
	}
	/* the door may have returned results in its own buffer */
	if (door_args.rbuf != buf)
		kmem_free(door_args.rbuf, door_args.rsize);

	return (err);
}
1054 
1055 /*
1056  * This routine is used to create a single vfs_t that is used globally in the
1057  * system for a psuedo-file system that does not actually ever "mount", like
1058  * sockfs or fifofs. This constructs a single vfs_t that will not be
1059  * accidentally freed nor will it end up on a zone's list of file systems.
1060  * Please do not add new file systems that need to use this. The kmem_zalloc
1061  * explicitly takes care of ensuring the following (amongst others):
1062  *
1063  *  - This vfs_t is explicitly not linked on any list (vfs_next/prev are NULL)
1064  *  - The vnode is not covered and has no flags
1065  *  - There is no mount point, resource, or options
1066  *  - There is no zone that nominally owns this
1067  *  - There is no file system specific data
1068  */
1069 vfs_t *
fs_vfsp_global(struct vfsops * ops,dev_t dev,int fstype,uint_t bsize)1070 fs_vfsp_global(struct vfsops *ops, dev_t dev, int fstype, uint_t bsize)
1071 {
1072 	vfs_t *vfsp = kmem_zalloc(sizeof (struct vfs), KM_SLEEP);
1073 
1074 	vfs_setops(vfsp, ops);
1075 	vfsp->vfs_bsize = bsize;
1076 	vfsp->vfs_fstype = fstype;
1077 	vfs_make_fsid(&vfsp->vfs_fsid, dev, fstype);
1078 	vfsp->vfs_dev = dev;
1079 
1080 	/*
1081 	 * We purposefully bump the reference on this vfs_t to one. This vfs_t
1082 	 * is intended to always exist regardless of surrounding activity.
1083 	 * Importantly this ensures that something that incidentally performs a
1084 	 * VFS_HOLD followed by a VFS_RELE on the vfs_t doesn't end up freeing
1085 	 * this.
1086 	 */
1087 	vfsp->vfs_count = 1;
1088 
1089 	return (vfsp);
1090 }
1091