xref: /illumos-gate/usr/src/uts/common/fs/fs_subr.c (revision 0173c38a73f34277e0c97a19fedfd25d81ba8380)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22 /*	  All Rights Reserved  	*/
23 
24 
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 /*
33  * Generic vnode operations.
34  */
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/errno.h>
39 #include <sys/fcntl.h>
40 #include <sys/flock.h>
41 #include <sys/statvfs.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/proc.h>
45 #include <sys/user.h>
46 #include <sys/unistd.h>
47 #include <sys/cred.h>
48 #include <sys/poll.h>
49 #include <sys/debug.h>
50 #include <sys/cmn_err.h>
51 #include <sys/stream.h>
52 #include <fs/fs_subr.h>
53 #include <sys/acl.h>
54 #include <sys/share.h>
55 #include <sys/file.h>
56 #include <sys/kmem.h>
57 #include <sys/file.h>
58 #include <sys/nbmlock.h>
59 #include <acl/acl_common.h>
60 
61 static callb_cpr_t *frlock_serialize_blocked(flk_cb_when_t, void *);
62 
/*
 * Tunable: maximum number of retries attempted while trying to recover
 * from a STALE (presumably ESTALE) error.  Consulted by
 * fs_need_estale_retry().
 */
int fs_estale_retry = 5;
67 
/*
 * Generic stub for an operation the file system does not implement.
 * Always fails with ENOSYS.
 *
 * NOTE(review): the empty (unprototyped) parameter list looks deliberate,
 * presumably so that the stub can stand in for operations with differing
 * signatures -- confirm before changing it to (void).
 */
int
fs_nosys()
{
	int error = ENOSYS;

	return (error);
}
76 
/*
 * Generic stub for an operation that is not valid on this vnode.
 * Always fails with EINVAL.
 */
int
fs_inval()
{
	int error = EINVAL;

	return (error);
}
85 
/*
 * Generic stub for an operation that only makes sense on a directory.
 * Always fails with ENOTDIR.
 */
int
fs_notdir()
{
	int error = ENOTDIR;

	return (error);
}
94 
95 /*
96  * Free the file system specific resources. For the file systems that
97  * do not support the forced unmount, it will be a nop function.
98  */
99 
100 /*ARGSUSED*/
101 void
102 fs_freevfs(vfs_t *vfsp)
103 {
104 }
105 
106 /* ARGSUSED */
107 int
108 fs_nosys_map(struct vnode *vp,
109 	offset_t off,
110 	struct as *as,
111 	caddr_t *addrp,
112 	size_t len,
113 	uchar_t prot,
114 	uchar_t maxprot,
115 	uint_t flags,
116 	struct cred *cr)
117 {
118 	return (ENOSYS);
119 }
120 
121 /* ARGSUSED */
122 int
123 fs_nosys_addmap(struct vnode *vp,
124 	offset_t off,
125 	struct as *as,
126 	caddr_t addr,
127 	size_t len,
128 	uchar_t prot,
129 	uchar_t maxprot,
130 	uint_t flags,
131 	struct cred *cr)
132 {
133 	return (ENOSYS);
134 }
135 
136 /* ARGSUSED */
137 int
138 fs_nosys_poll(vnode_t *vp,
139 	register short events,
140 	int anyyet,
141 	register short *reventsp,
142 	struct pollhead **phpp)
143 {
144 	return (ENOSYS);
145 }
146 
147 
148 /*
149  * The file system has nothing to sync to disk.  However, the
150  * VFS_SYNC operation must not fail.
151  */
152 /* ARGSUSED */
153 int
154 fs_sync(struct vfs *vfspp, short flag, cred_t *cr)
155 {
156 	return (0);
157 }
158 
159 /*
160  * Read/write lock/unlock.  Does nothing.
161  */
162 /* ARGSUSED */
163 int
164 fs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
165 {
166 	return (-1);
167 }
168 
169 /* ARGSUSED */
170 void
171 fs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
172 {
173 }
174 
175 /*
176  * Compare two vnodes.
177  */
178 int
179 fs_cmp(vnode_t *vp1, vnode_t *vp2)
180 {
181 	return (vp1 == vp2);
182 }
183 
184 /*
185  * No-op seek operation.
186  */
187 /* ARGSUSED */
188 int
189 fs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp)
190 {
191 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
192 }
193 
194 /*
195  * File and record locking.
196  */
197 /* ARGSUSED */
198 int
199 fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
200 	offset_t offset, flk_callback_t *flk_cbp, cred_t *cr)
201 {
202 	int frcmd;
203 	int nlmid;
204 	int error = 0;
205 	flk_callback_t serialize_callback;
206 	int serialize = 0;
207 
208 	switch (cmd) {
209 
210 	case F_GETLK:
211 	case F_O_GETLK:
212 		if (flag & F_REMOTELOCK) {
213 			frcmd = RCMDLCK;
214 			break;
215 		}
216 		if (flag & F_PXFSLOCK) {
217 			frcmd = PCMDLCK;
218 			break;
219 		}
220 		bfp->l_pid = ttoproc(curthread)->p_pid;
221 		bfp->l_sysid = 0;
222 		frcmd = 0;
223 		break;
224 
225 	case F_SETLK_NBMAND:
226 		/*
227 		 * Are NBMAND locks allowed on this file?
228 		 */
229 		if (!vp->v_vfsp ||
230 		    !(vp->v_vfsp->vfs_flag & VFS_NBMAND)) {
231 			error = EINVAL;
232 			goto done;
233 		}
234 		if (vp->v_type != VREG) {
235 			error = EINVAL;
236 			goto done;
237 		}
238 		/*FALLTHROUGH*/
239 
240 	case F_SETLK:
241 		/*
242 		 * Check whether there is an NBMAND share reservation that
243 		 * conflicts with the lock request.
244 		 */
245 		if (nbl_need_check(vp)) {
246 			nbl_start_crit(vp, RW_WRITER);
247 			serialize = 1;
248 			if (share_blocks_lock(vp, bfp)) {
249 				error = EAGAIN;
250 				goto done;
251 			}
252 		}
253 		if (flag & F_REMOTELOCK) {
254 			frcmd = SETFLCK|RCMDLCK;
255 			break;
256 		}
257 		if (flag & F_PXFSLOCK) {
258 			frcmd = SETFLCK|PCMDLCK;
259 			break;
260 		}
261 		bfp->l_pid = ttoproc(curthread)->p_pid;
262 		bfp->l_sysid = 0;
263 		frcmd = SETFLCK;
264 		if (cmd == F_SETLK_NBMAND &&
265 		    (bfp->l_type == F_RDLCK || bfp->l_type == F_WRLCK)) {
266 			/* would check here for conflict with mapped region */
267 			frcmd |= NBMLCK;
268 		}
269 		break;
270 
271 	case F_SETLKW:
272 		/*
273 		 * If there is an NBMAND share reservation that conflicts
274 		 * with the lock request, block until the conflicting share
275 		 * reservation goes away.
276 		 */
277 		if (nbl_need_check(vp)) {
278 			nbl_start_crit(vp, RW_WRITER);
279 			serialize = 1;
280 			if (share_blocks_lock(vp, bfp)) {
281 				error = wait_for_share(vp, bfp);
282 				if (error != 0)
283 					goto done;
284 			}
285 		}
286 		if (flag & F_REMOTELOCK) {
287 			frcmd = SETFLCK|SLPFLCK|RCMDLCK;
288 			break;
289 		}
290 		if (flag & F_PXFSLOCK) {
291 			frcmd = SETFLCK|SLPFLCK|PCMDLCK;
292 			break;
293 		}
294 		bfp->l_pid = ttoproc(curthread)->p_pid;
295 		bfp->l_sysid = 0;
296 		frcmd = SETFLCK|SLPFLCK;
297 		break;
298 
299 	case F_HASREMOTELOCKS:
300 		nlmid = GETNLMID(bfp->l_sysid);
301 		if (nlmid != 0) {	/* booted as a cluster */
302 			l_has_rmt(bfp) =
303 				cl_flk_has_remote_locks_for_nlmid(vp, nlmid);
304 		} else {		/* not booted as a cluster */
305 			l_has_rmt(bfp) = flk_has_remote_locks(vp);
306 		}
307 
308 		goto done;
309 
310 	default:
311 		error = EINVAL;
312 		goto done;
313 	}
314 
315 	/*
316 	 * If this is a blocking lock request and we're serializing lock
317 	 * requests, modify the callback list to leave the critical region
318 	 * while we're waiting for the lock.
319 	 */
320 
321 	if (serialize && (frcmd & SLPFLCK) != 0) {
322 		flk_add_callback(&serialize_callback,
323 				frlock_serialize_blocked, vp, flk_cbp);
324 		flk_cbp = &serialize_callback;
325 	}
326 
327 	error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp);
328 
329 done:
330 	if (serialize)
331 		nbl_end_crit(vp);
332 
333 	return (error);
334 }
335 
336 /*
337  * Callback when a lock request blocks and we are serializing requests.  If
338  * before sleeping, leave the critical region.  If after wakeup, reenter
339  * the critical region.
340  */
341 
342 static callb_cpr_t *
343 frlock_serialize_blocked(flk_cb_when_t when, void *infop)
344 {
345 	vnode_t *vp = (vnode_t *)infop;
346 
347 	if (when == FLK_BEFORE_SLEEP)
348 		nbl_end_crit(vp);
349 	else {
350 		nbl_start_crit(vp, RW_WRITER);
351 	}
352 
353 	return (NULL);
354 }
355 
356 /*
357  * Allow any flags.
358  */
359 /* ARGSUSED */
360 int
361 fs_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr)
362 {
363 	return (0);
364 }
365 
366 /*
367  * Return the answer requested to poll() for non-device files.
368  * Only POLLIN, POLLRDNORM, and POLLOUT are recognized.
369  */
370 struct pollhead fs_pollhd;
371 
372 /* ARGSUSED */
373 int
374 fs_poll(vnode_t *vp,
375 	register short events,
376 	int anyyet,
377 	register short *reventsp,
378 	struct pollhead **phpp)
379 {
380 	*reventsp = 0;
381 	if (events & POLLIN)
382 		*reventsp |= POLLIN;
383 	if (events & POLLRDNORM)
384 		*reventsp |= POLLRDNORM;
385 	if (events & POLLRDBAND)
386 		*reventsp |= POLLRDBAND;
387 	if (events & POLLOUT)
388 		*reventsp |= POLLOUT;
389 	if (events & POLLWRBAND)
390 		*reventsp |= POLLWRBAND;
391 	*phpp = !anyyet && !*reventsp ? &fs_pollhd : (struct pollhead *)NULL;
392 	return (0);
393 }
394 
395 /*
396  * POSIX pathconf() support.
397  */
398 /* ARGSUSED */
399 int
400 fs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
401 {
402 	register ulong_t val;
403 	register int error = 0;
404 	struct statvfs64 vfsbuf;
405 
406 	switch (cmd) {
407 
408 	case _PC_LINK_MAX:
409 		val = MAXLINK;
410 		break;
411 
412 	case _PC_MAX_CANON:
413 		val = MAX_CANON;
414 		break;
415 
416 	case _PC_MAX_INPUT:
417 		val = MAX_INPUT;
418 		break;
419 
420 	case _PC_NAME_MAX:
421 		bzero(&vfsbuf, sizeof (vfsbuf));
422 		if (error = VFS_STATVFS(vp->v_vfsp, &vfsbuf))
423 			break;
424 		val = vfsbuf.f_namemax;
425 		break;
426 
427 	case _PC_PATH_MAX:
428 	case _PC_SYMLINK_MAX:
429 		val = MAXPATHLEN;
430 		break;
431 
432 	case _PC_PIPE_BUF:
433 		val = PIPE_BUF;
434 		break;
435 
436 	case _PC_NO_TRUNC:
437 		if (vp->v_vfsp->vfs_flag & VFS_NOTRUNC)
438 			val = 1;	/* NOTRUNC is enabled for vp */
439 		else
440 			val = (ulong_t)-1;
441 		break;
442 
443 	case _PC_VDISABLE:
444 		val = _POSIX_VDISABLE;
445 		break;
446 
447 	case _PC_CHOWN_RESTRICTED:
448 		if (rstchown)
449 			val = rstchown; /* chown restricted enabled */
450 		else
451 			val = (ulong_t)-1;
452 		break;
453 
454 	case _PC_FILESIZEBITS:
455 
456 		/*
457 		 * If ever we come here it means that underlying file system
458 		 * does not recognise the command and therefore this
459 		 * configurable limit cannot be determined. We return -1
460 		 * and don't change errno.
461 		 */
462 
463 		val = (ulong_t)-1;    /* large file support */
464 		break;
465 
466 	case _PC_ACL_ENABLED:
467 		val = 0;
468 		break;
469 
470 	default:
471 		error = EINVAL;
472 		break;
473 	}
474 
475 	if (error == 0)
476 		*valp = val;
477 	return (error);
478 }
479 
480 /*
481  * Dispose of a page.
482  */
483 /* ARGSUSED */
484 void
485 fs_dispose(struct vnode *vp, page_t *pp, int fl, int dn, struct cred *cr)
486 {
487 
488 	ASSERT(fl == B_FREE || fl == B_INVAL);
489 
490 	if (fl == B_FREE)
491 		page_free(pp, dn);
492 	else
493 		page_destroy(pp, dn);
494 }
495 
496 /* ARGSUSED */
497 void
498 fs_nodispose(struct vnode *vp, page_t *pp, int fl, int dn, struct cred *cr)
499 {
500 	cmn_err(CE_PANIC, "fs_nodispose invoked");
501 }
502 
503 /*
504  * fabricate acls for file systems that do not support acls.
505  */
506 /* ARGSUSED */
507 int
508 fs_fab_acl(vp, vsecattr, flag, cr)
509 vnode_t		*vp;
510 vsecattr_t	*vsecattr;
511 int		flag;
512 cred_t		*cr;
513 {
514 	aclent_t	*aclentp;
515 	ace_t		*acep;
516 	struct vattr	vattr;
517 	int		error;
518 
519 	vsecattr->vsa_aclcnt	= 0;
520 	vsecattr->vsa_aclentp	= NULL;
521 	vsecattr->vsa_dfaclcnt	= 0;	/* Default ACLs are not fabricated */
522 	vsecattr->vsa_dfaclentp	= NULL;
523 
524 	vattr.va_mask = AT_MODE | AT_UID | AT_GID;
525 	if (error = VOP_GETATTR(vp, &vattr, 0, cr))
526 		return (error);
527 
528 	if (vsecattr->vsa_mask & (VSA_ACLCNT | VSA_ACL)) {
529 		vsecattr->vsa_aclcnt	= 4; /* USER, GROUP, OTHER, and CLASS */
530 		vsecattr->vsa_aclentp = kmem_zalloc(4 * sizeof (aclent_t),
531 		    KM_SLEEP);
532 		aclentp = vsecattr->vsa_aclentp;
533 
534 		aclentp->a_type = USER_OBJ;	/* Owner */
535 		aclentp->a_perm = ((ushort_t)(vattr.va_mode & 0700)) >> 6;
536 		aclentp->a_id = vattr.va_uid;   /* Really undefined */
537 		aclentp++;
538 
539 		aclentp->a_type = GROUP_OBJ;    /* Group */
540 		aclentp->a_perm = ((ushort_t)(vattr.va_mode & 0070)) >> 3;
541 		aclentp->a_id = vattr.va_gid;   /* Really undefined */
542 		aclentp++;
543 
544 		aclentp->a_type = OTHER_OBJ;    /* Other */
545 		aclentp->a_perm = vattr.va_mode & 0007;
546 		aclentp->a_id = -1;		/* Really undefined */
547 		aclentp++;
548 
549 		aclentp->a_type = CLASS_OBJ;    /* Class */
550 		aclentp->a_perm = (ushort_t)(0007);
551 		aclentp->a_id = -1;		/* Really undefined */
552 	} else if (vsecattr->vsa_mask & (VSA_ACECNT | VSA_ACE)) {
553 		vsecattr->vsa_aclcnt	= 6;
554 		vsecattr->vsa_aclentp = kmem_zalloc(6 * sizeof (ace_t),
555 		    KM_SLEEP);
556 		acep = vsecattr->vsa_aclentp;
557 		(void) memcpy(acep, trivial_acl, sizeof (ace_t) * 6);
558 		adjust_ace_pair(acep, (vattr.va_mode & 0700) >> 6);
559 		adjust_ace_pair(acep + 2, (vattr.va_mode & 0070) >> 3);
560 		adjust_ace_pair(acep + 4, vattr.va_mode & 0007);
561 	}
562 
563 	return (0);
564 }
565 
566 /*
567  * Common code for implementing DOS share reservations
568  */
569 /* ARGSUSED4 */
570 int
571 fs_shrlock(struct vnode *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr)
572 {
573 	int error;
574 
575 	/*
576 	 * Make sure that the file was opened with permissions appropriate
577 	 * for the request, and make sure the caller isn't trying to sneak
578 	 * in an NBMAND request.
579 	 */
580 	if (cmd == F_SHARE) {
581 		if (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
582 		    ((shr->s_access & F_WRACC) && (flag & FWRITE) == 0))
583 			return (EBADF);
584 		if (shr->s_deny & F_MANDDNY)
585 			return (EINVAL);
586 	}
587 	if (cmd == F_SHARE_NBMAND) {
588 		/* must have write permission to deny read access */
589 		if ((shr->s_deny & F_RDDNY) && (flag & FWRITE) == 0)
590 			return (EBADF);
591 		/* make sure nbmand is allowed on the file */
592 		if (!vp->v_vfsp ||
593 		    !(vp->v_vfsp->vfs_flag & VFS_NBMAND)) {
594 			return (EINVAL);
595 		}
596 		if (vp->v_type != VREG) {
597 			return (EINVAL);
598 		}
599 	}
600 
601 	nbl_start_crit(vp, RW_WRITER);
602 
603 	switch (cmd) {
604 
605 	case F_SHARE_NBMAND:
606 		shr->s_deny |= F_MANDDNY;
607 		/*FALLTHROUGH*/
608 	case F_SHARE:
609 		error = add_share(vp, shr);
610 		break;
611 
612 	case F_UNSHARE:
613 		error = del_share(vp, shr);
614 		break;
615 
616 	case F_HASREMOTELOCKS:
617 		/*
618 		 * We are overloading this command to refer to remote
619 		 * shares as well as remote locks, despite its name.
620 		 */
621 		shr->s_access = shr_has_remote_shares(vp, shr->s_sysid);
622 		error = 0;
623 		break;
624 
625 	default:
626 		error = EINVAL;
627 		break;
628 	}
629 
630 	nbl_end_crit(vp);
631 	return (error);
632 }
633 
634 /*ARGSUSED1*/
635 int
636 fs_vnevent_nosupport(vnode_t *vp, vnevent_t vnevent)
637 {
638 	ASSERT(vp != NULL);
639 	return (ENOTSUP);
640 }
641 
642 /*ARGSUSED1*/
643 int
644 fs_vnevent_support(vnode_t *vp, vnevent_t vnevent)
645 {
646 	ASSERT(vp != NULL);
647 	return (0);
648 }
649 
650 /*
651  * return 1 for non-trivial ACL.
652  *
653  * NB: It is not necessary for the caller to VOP_RWLOCK since
654  *	we only issue VOP_GETSECATTR.
655  *
656  * Returns 0 == trivial
657  *         1 == NOT Trivial
658  *	   <0 could not determine.
659  */
660 int
661 fs_acl_nontrivial(vnode_t *vp, cred_t *cr)
662 {
663 	ulong_t		acl_styles;
664 	ulong_t		acl_flavor;
665 	vsecattr_t 	vsecattr;
666 	int 		error;
667 	int		isnontrivial;
668 
669 	/* determine the forms of ACLs maintained */
670 	error = VOP_PATHCONF(vp, _PC_ACL_ENABLED, &acl_styles, cr);
671 
672 	/* clear bits we don't understand and establish default acl_style */
673 	acl_styles &= (_ACL_ACLENT_ENABLED | _ACL_ACE_ENABLED);
674 	if (error || (acl_styles == 0))
675 		acl_styles = _ACL_ACLENT_ENABLED;
676 
677 	vsecattr.vsa_aclentp = NULL;
678 	vsecattr.vsa_dfaclentp = NULL;
679 	vsecattr.vsa_aclcnt = 0;
680 	vsecattr.vsa_dfaclcnt = 0;
681 
682 	while (acl_styles) {
683 		/* select one of the styles as current flavor */
684 		acl_flavor = 0;
685 		if (acl_styles & _ACL_ACLENT_ENABLED) {
686 			acl_flavor = _ACL_ACLENT_ENABLED;
687 			vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
688 		} else if (acl_styles & _ACL_ACE_ENABLED) {
689 			acl_flavor = _ACL_ACE_ENABLED;
690 			vsecattr.vsa_mask = VSA_ACECNT | VSA_ACE;
691 		}
692 
693 		ASSERT(vsecattr.vsa_mask && acl_flavor);
694 		error = VOP_GETSECATTR(vp, &vsecattr, 0, cr);
695 		if (error == 0)
696 			break;
697 
698 		/* that flavor failed */
699 		acl_styles &= ~acl_flavor;
700 	}
701 
702 	/* if all styles fail then assume trivial */
703 	if (acl_styles == 0)
704 		return (0);
705 
706 	/* process the flavor that worked */
707 	isnontrivial = 0;
708 	if (acl_flavor & _ACL_ACLENT_ENABLED) {
709 		if (vsecattr.vsa_aclcnt > MIN_ACL_ENTRIES)
710 			isnontrivial = 1;
711 		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
712 			kmem_free(vsecattr.vsa_aclentp,
713 			    vsecattr.vsa_aclcnt * sizeof (aclent_t));
714 		if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp != NULL)
715 			kmem_free(vsecattr.vsa_dfaclentp,
716 			    vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
717 	}
718 	if (acl_flavor & _ACL_ACE_ENABLED) {
719 
720 		isnontrivial = ace_trivial(vsecattr.vsa_aclentp,
721 		    vsecattr.vsa_aclcnt);
722 
723 		if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp != NULL)
724 			kmem_free(vsecattr.vsa_aclentp,
725 			    vsecattr.vsa_aclcnt * sizeof (ace_t));
726 		/* ACE has no vsecattr.vsa_dfaclcnt */
727 	}
728 	return (isnontrivial);
729 }
730 
731 /*
732  * Check whether we need a retry to recover from STALE error.
733  */
734 int
735 fs_need_estale_retry(int retry_count)
736 {
737 	if (retry_count < fs_estale_retry)
738 		return (1);
739 	else
740 		return (0);
741 }
742