xref: /titanic_41/usr/src/uts/common/fs/fd/fdops.c (revision 4c06356b0f0fffb4fc1b6eccc8e5d8e2254a84d6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All rights reserved.  	*/
30 
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/cmn_err.h>
35 #include <sys/debug.h>
36 #include <sys/dirent.h>
37 #include <sys/errno.h>
38 #include <sys/file.h>
39 #include <sys/inline.h>
40 #include <sys/kmem.h>
41 #include <sys/pathname.h>
42 #include <sys/resource.h>
43 #include <sys/statvfs.h>
44 #include <sys/mount.h>
45 #include <sys/sysmacros.h>
46 #include <sys/systm.h>
47 #include <sys/uio.h>
48 #include <sys/vfs.h>
49 #include <sys/vfs_opreg.h>
50 #include <sys/vnode.h>
51 #include <sys/cred.h>
52 #include <sys/mntent.h>
53 #include <sys/mount.h>
54 #include <sys/user.h>
55 #include <sys/t_lock.h>
56 #include <sys/modctl.h>
57 #include <sys/policy.h>
58 #include <fs/fs_subr.h>
59 #include <sys/atomic.h>
60 #include <sys/mkdev.h>
61 
/* Round r up to the next multiple of sizeof (int). */
#define	round(r)	(((r)+sizeof (int)-1)&(~(sizeof (int)-1)))
/* Inode number reported for file descriptor n. */
#define	fdtoi(n)	((n)+100)

/*
 * Fixed-size directory record handed out by fdread(); fdreaddir() also
 * advances its offsets in units of this record's size.
 */
#define	FDDIRSIZE 14
struct fddirect {
	short	d_ino;
	char	d_name[FDDIRSIZE];
};

/* Inode number reported for the fdfs directory itself ("." and ".."). */
#define	FDROOTINO	2
/* Size in bytes of one directory slot. */
#define	FDSDSIZE	sizeof (struct fddirect)
/* Name length used to size the readdir buffer and reported as f_namemax. */
#define	FDNSIZE		10

/* File system type index; assigned in fdinit(). */
static int		fdfstype = 0;
/* Major number used for vfs_dev of each fdfs mount. */
static major_t		fdfsmaj;
/* Most recent minor number handed out by fdmount(). */
static minor_t		fdfsmin;
/* Major number used for v_rdev of the per-descriptor vnodes. */
static major_t		fdrmaj;
/* Held by fdmount() while it advances fdfsmin. */
static kmutex_t		fd_minor_lock;

static int fdget(vnode_t *, char *, vnode_t **);
82 
83 /* ARGSUSED */
84 static int
85 fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
86 {
87 	if ((*vpp)->v_type != VDIR) {
88 		mutex_enter(&(*vpp)->v_lock);
89 		(*vpp)->v_flag |= VDUP;
90 		mutex_exit(&(*vpp)->v_lock);
91 	}
92 	return (0);
93 }
94 
/*
 * fdclose: VOP_CLOSE entry point.  There is nothing to tear down, so this
 * always returns 0.
 */
/* ARGSUSED */
static int
fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
	caller_context_t *ct)
{
	return (0);
}
102 
103 /* ARGSUSED */
104 static int
105 fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
106 {
107 	static struct fddirect dotbuf[] = {
108 		{ FDROOTINO, "."  },
109 		{ FDROOTINO, ".." }
110 	};
111 	struct fddirect dirbuf;
112 	int i, n;
113 	int minfd, maxfd, modoff, error = 0;
114 	int nentries;
115 	rctl_qty_t fdno_ctl;
116 	int endoff;
117 
118 	if (vp->v_type != VDIR)
119 		return (ENOSYS);
120 
121 	mutex_enter(&curproc->p_lock);
122 	fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
123 	    curproc->p_rctls, curproc);
124 	nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
125 	mutex_exit(&curproc->p_lock);
126 
127 	endoff = (nentries + 2) * FDSDSIZE;
128 
129 	/*
130 	 * Fake up ".", "..", and the /dev/fd directory entries.
131 	 */
132 	if (uiop->uio_loffset < (offset_t)0 ||
133 	    uiop->uio_loffset >= (offset_t)endoff ||
134 	    uiop->uio_resid <= 0)
135 		return (0);
136 	ASSERT(uiop->uio_loffset <= MAXOFF_T);
137 	if (uiop->uio_offset < 2*FDSDSIZE) {
138 		error = uiomove((caddr_t)dotbuf + uiop->uio_offset,
139 		    MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset),
140 		    UIO_READ, uiop);
141 		if (uiop->uio_resid <= 0 || error)
142 			return (error);
143 	}
144 	minfd = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE;
145 	maxfd = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE;
146 	modoff = uiop->uio_offset % FDSDSIZE;
147 
148 	for (i = 0; i < FDDIRSIZE; i++)
149 		dirbuf.d_name[i] = '\0';
150 	for (i = minfd; i < MIN(maxfd, nentries); i++) {
151 		n = i;
152 		dirbuf.d_ino = fdtoi(n);
153 		numtos((ulong_t)n, dirbuf.d_name);
154 		error = uiomove((caddr_t)&dirbuf + modoff,
155 		    MIN(uiop->uio_resid, FDSDSIZE - modoff),
156 		    UIO_READ, uiop);
157 		if (uiop->uio_resid <= 0 || error)
158 			return (error);
159 		modoff = 0;
160 	}
161 
162 	return (error);
163 }
164 
165 /* ARGSUSED */
166 static int
167 fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
168 	caller_context_t *ct)
169 {
170 	vfs_t *vfsp = vp->v_vfsp;
171 	timestruc_t now;
172 
173 	if (vp->v_type == VDIR) {
174 		vap->va_nlink = 2;
175 		vap->va_size = (u_offset_t)
176 		    ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE);
177 		vap->va_mode = 0555;
178 		vap->va_nodeid = (ino64_t)FDROOTINO;
179 	} else {
180 		vap->va_nlink = 1;
181 		vap->va_size = (u_offset_t)0;
182 		vap->va_mode = 0666;
183 		vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev));
184 	}
185 	vap->va_type = vp->v_type;
186 	vap->va_rdev = vp->v_rdev;
187 	vap->va_blksize = vfsp->vfs_bsize;
188 	vap->va_nblocks = (fsblkcnt64_t)0;
189 	gethrestime(&now);
190 	vap->va_atime = vap->va_mtime = vap->va_ctime = now;
191 	vap->va_uid = 0;
192 	vap->va_gid = 0;
193 	vap->va_fsid = vfsp->vfs_dev;
194 	vap->va_seq = 0;
195 	return (0);
196 }
197 
/*
 * fdaccess: VOP_ACCESS entry point.  No checks are made here; every
 * request is granted by returning 0.
 */
/* ARGSUSED */
static int
fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	return (0);
}
204 
205 /* ARGSUSED */
206 static int
207 fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp,
208 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
209 	int *direntflags, pathname_t *realpnp)
210 {
211 	if (comp[0] == 0 || strcmp(comp, ".") == 0 || strcmp(comp, "..") == 0) {
212 		VN_HOLD(dp);
213 		*vpp = dp;
214 		return (0);
215 	}
216 	return (fdget(dp, comp, vpp));
217 }
218 
/*
 * fdcreate: VOP_CREATE entry point.
 *
 * Nothing is actually created: the attributes, exclusivity and mode
 * arguments are ignored and the name is simply resolved through fdget(),
 * whose result is returned.
 */
/* ARGSUSED */
static int
fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl,
	int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
	vsecattr_t *vsecp)
{
	return (fdget(dvp, comp, vpp));
}
227 
228 /* ARGSUSED */
229 static int
230 fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct,
231 	int flags)
232 {
233 	/* bp holds one dirent structure */
234 	u_offset_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (u_offset_t)];
235 	struct dirent64 *dirent = (struct dirent64 *)bp;
236 	int reclen, nentries;
237 	rctl_qty_t fdno_ctl;
238 	int  n;
239 	int oresid;
240 	off_t off;
241 
242 	if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 ||
243 	    (uiop->uio_offset % FDSDSIZE) != 0)
244 		return (ENOENT);
245 
246 	ASSERT(uiop->uio_loffset <= MAXOFF_T);
247 	oresid = uiop->uio_resid;
248 	bzero(bp, sizeof (bp));
249 
250 	mutex_enter(&curproc->p_lock);
251 	fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
252 	    curproc->p_rctls, curproc);
253 	nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
254 	mutex_exit(&curproc->p_lock);
255 
256 	while (uiop->uio_resid > 0) {
257 		if ((off = uiop->uio_offset) == 0) {	/* "." */
258 			dirent->d_ino = (ino64_t)FDROOTINO;
259 			dirent->d_name[0] = '.';
260 			dirent->d_name[1] = '\0';
261 			reclen = DIRENT64_RECLEN(1);
262 		} else if (off == FDSDSIZE) {		/* ".." */
263 			dirent->d_ino = (ino64_t)FDROOTINO;
264 			dirent->d_name[0] = '.';
265 			dirent->d_name[1] = '.';
266 			dirent->d_name[2] = '\0';
267 			reclen = DIRENT64_RECLEN(2);
268 		} else {
269 			/*
270 			 * Return entries corresponding to the allowable
271 			 * number of file descriptors for this process.
272 			 */
273 			if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries)
274 				break;
275 			dirent->d_ino = (ino64_t)fdtoi(n);
276 			numtos((ulong_t)n, dirent->d_name);
277 			reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
278 		}
279 		dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE);
280 		dirent->d_reclen = (ushort_t)reclen;
281 
282 		if (reclen > uiop->uio_resid) {
283 			/*
284 			 * Error if no entries have been returned yet.
285 			 */
286 			if (uiop->uio_resid == oresid)
287 				return (EINVAL);
288 			break;
289 		}
290 		/*
291 		 * uiomove() updates both resid and offset by the same
292 		 * amount.  But we want offset to change in increments
293 		 * of FDSDSIZE, which is different from the number of bytes
294 		 * being returned to the user.  So we set uio_offset
295 		 * separately, ignoring what uiomove() does.
296 		 */
297 		if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
298 			return (EFAULT);
299 		uiop->uio_offset = off + FDSDSIZE;
300 	}
301 	if (eofp)
302 		*eofp = ((uiop->uio_offset-2*FDSDSIZE)/FDSDSIZE >= nentries);
303 	return (0);
304 }
305 
306 /* ARGSUSED */
307 static void
308 fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
309 {
310 	mutex_enter(&vp->v_lock);
311 	ASSERT(vp->v_count >= 1);
312 	if (--vp->v_count != 0) {
313 		mutex_exit(&vp->v_lock);
314 		return;
315 	}
316 	mutex_exit(&vp->v_lock);
317 	vn_invalid(vp);
318 	vn_free(vp);
319 }
320 
/* Vnode operations vector for fdfs; built from the template in fdinit(). */
static struct vnodeops *fd_vnodeops;

/*
 * Template pairing each supported vnode operation with its fdfs handler.
 * FRLOCK, POLL and DISPOSE are routed to fs_error.  The list is terminated
 * by a NULL entry.
 */
static const fs_operation_def_t fd_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = fdopen },
	VOPNAME_CLOSE,		{ .vop_close = fdclose },
	VOPNAME_READ,		{ .vop_read = fdread },
	VOPNAME_GETATTR,	{ .vop_getattr = fdgetattr },
	VOPNAME_ACCESS,		{ .vop_access = fdaccess },
	VOPNAME_LOOKUP,		{ .vop_lookup = fdlookup },
	VOPNAME_CREATE,		{ .vop_create = fdcreate },
	VOPNAME_READDIR,	{ .vop_readdir = fdreaddir },
	VOPNAME_INACTIVE,	{ .vop_inactive = fdinactive },
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_POLL,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	NULL,			NULL
};
338 
339 static int
340 fdget(struct vnode *dvp, char *comp, struct vnode **vpp)
341 {
342 	int n = 0;
343 	struct vnode *vp;
344 
345 	while (*comp) {
346 		if (*comp < '0' || *comp > '9')
347 			return (ENOENT);
348 		n = 10 * n + *comp++ - '0';
349 	}
350 	vp = vn_alloc(KM_SLEEP);
351 	vp->v_type = VCHR;
352 	vp->v_vfsp = dvp->v_vfsp;
353 	vn_setops(vp, fd_vnodeops);
354 	vp->v_data = NULL;
355 	vp->v_flag = VNOMAP;
356 	vp->v_rdev = makedevice(fdrmaj, n);
357 	vn_exists(vp);
358 	*vpp = vp;
359 	return (0);
360 }
361 
362 /*
363  * fdfs is mounted on /dev/fd, however, there are two interesting
364  * possibilities - two threads racing to do the same mount (protected
365  * by vfs locking), and two threads mounting fdfs in different places.
366  */
367 /*ARGSUSED*/
368 static int
369 fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
370 {
371 	struct vnode *vp;
372 
373 	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
374 		return (EPERM);
375 	if (mvp->v_type != VDIR)
376 		return (ENOTDIR);
377 
378 	mutex_enter(&mvp->v_lock);
379 	if ((uap->flags & MS_OVERLAY) == 0 &&
380 	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
381 		mutex_exit(&mvp->v_lock);
382 		return (EBUSY);
383 	}
384 	mutex_exit(&mvp->v_lock);
385 
386 	/*
387 	 * Having the resource be anything but "fd" doesn't make sense
388 	 */
389 	vfs_setresource(vfsp, "fd");
390 
391 	vp = vn_alloc(KM_SLEEP);
392 	vp->v_vfsp = vfsp;
393 	vn_setops(vp, fd_vnodeops);
394 	vp->v_type = VDIR;
395 	vp->v_data = NULL;
396 	vp->v_flag |= VROOT;
397 	vfsp->vfs_fstype = fdfstype;
398 	vfsp->vfs_data = (char *)vp;
399 	mutex_enter(&fd_minor_lock);
400 	do {
401 		fdfsmin = (fdfsmin + 1) & L_MAXMIN32;
402 		vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin);
403 	} while (vfs_devismounted(vfsp->vfs_dev));
404 	mutex_exit(&fd_minor_lock);
405 	vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype);
406 	vfsp->vfs_bsize = 1024;
407 	return (0);
408 }
409 
410 /* ARGSUSED */
411 static int
412 fdunmount(vfs_t *vfsp, int flag, cred_t *cr)
413 {
414 	vnode_t *rvp;
415 
416 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
417 		return (EPERM);
418 
419 	/*
420 	 * forced unmount is not supported by this file system
421 	 * and thus, ENOTSUP, is being returned.
422 	 */
423 	if (flag & MS_FORCE)
424 		return (ENOTSUP);
425 
426 	rvp = (vnode_t *)vfsp->vfs_data;
427 	if (rvp->v_count > 1)
428 		return (EBUSY);
429 
430 	VN_RELE(rvp);
431 	return (0);
432 }
433 
434 /* ARGSUSED */
435 static int
436 fdroot(vfs_t *vfsp, vnode_t **vpp)
437 {
438 	vnode_t *vp = (vnode_t *)vfsp->vfs_data;
439 
440 	VN_HOLD(vp);
441 	*vpp = vp;
442 	return (0);
443 }
444 
445 /*
446  * No locking required because I held the root vnode before calling this
447  * function so the vfs won't disappear on me.  To be more explicit:
448  * fdvrootp->v_count will be greater than 1 so fdunmount will just return.
449  */
450 static int
451 fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp)
452 {
453 	dev32_t d32;
454 	rctl_qty_t fdno_ctl;
455 
456 	mutex_enter(&curproc->p_lock);
457 	fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
458 	    curproc->p_rctls, curproc);
459 	mutex_exit(&curproc->p_lock);
460 
461 	bzero(sp, sizeof (*sp));
462 	sp->f_bsize = 1024;
463 	sp->f_frsize = 1024;
464 	sp->f_blocks = (fsblkcnt64_t)0;
465 	sp->f_bfree = (fsblkcnt64_t)0;
466 	sp->f_bavail = (fsblkcnt64_t)0;
467 	sp->f_files = (fsfilcnt64_t)
468 	    (MIN(P_FINFO(curproc)->fi_nfiles, fdno_ctl + 2));
469 	sp->f_ffree = (fsfilcnt64_t)0;
470 	sp->f_favail = (fsfilcnt64_t)0;
471 	(void) cmpldev(&d32, vfsp->vfs_dev);
472 	sp->f_fsid = d32;
473 	(void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name);
474 	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
475 	sp->f_namemax = FDNSIZE;
476 	(void) strcpy(sp->f_fstr, "/dev/fd");
477 	(void) strcpy(&sp->f_fstr[8], "/dev/fd");
478 	return (0);
479 }
480 
481 int
482 fdinit(int fstype, char *name)
483 {
484 	static const fs_operation_def_t fd_vfsops_template[] = {
485 		VFSNAME_MOUNT,		{ .vfs_mount = fdmount },
486 		VFSNAME_UNMOUNT,	{ .vfs_unmount = fdunmount },
487 		VFSNAME_ROOT, 		{ .vfs_root = fdroot },
488 		VFSNAME_STATVFS,	{ .vfs_statvfs = fdstatvfs },
489 		NULL,			NULL
490 	};
491 	int error;
492 
493 	fdfstype = fstype;
494 	ASSERT(fdfstype != 0);
495 
496 	/*
497 	 * Associate VFS ops vector with this fstype.
498 	 */
499 	error = vfs_setfsops(fstype, fd_vfsops_template, NULL);
500 	if (error != 0) {
501 		cmn_err(CE_WARN, "fdinit: bad vnode ops template");
502 		return (error);
503 	}
504 
505 	error = vn_make_ops(name, fd_vnodeops_template, &fd_vnodeops);
506 	if (error != 0) {
507 		(void) vfs_freevfsops_by_type(fstype);
508 		cmn_err(CE_WARN, "fdinit: bad vnode ops template");
509 		return (error);
510 	}
511 
512 	/*
513 	 * Assign unique "device" numbers (reported by stat(2)).
514 	 */
515 	fdfsmaj = getudev();
516 	fdrmaj = getudev();
517 	if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) {
518 		cmn_err(CE_WARN, "fdinit: can't get unique device numbers");
519 		if (fdfsmaj == (major_t)-1)
520 			fdfsmaj = 0;
521 		if (fdrmaj == (major_t)-1)
522 			fdrmaj = 0;
523 	}
524 	mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL);
525 	return (0);
526 }
527 
/*
 * FDFS Mount options table
 */
/* Options cancelled when "rw" is specified. */
static char *rw_cancel[] = { MNTOPT_RO, NULL };

/* The recognized options: "rw" (the default) and "ignore". */
static mntopt_t mntopts[] = {
/*
 *	option name		cancel option	default arg	flags
 */
	{ MNTOPT_RW,		rw_cancel,	NULL,		MO_DEFAULT,
		(void *)MNTOPT_NOINTR },
	{ MNTOPT_IGNORE,	NULL,		NULL,		0,
		(void *)0 },
};

/* Option count and table, referenced from the vfsdef below. */
static mntopts_t fdfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
547 
/*
 * File system definition: type name "fd", initialized by fdinit(), with
 * the mount option prototype table above.
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	"fd",
	fdinit,
	VSW_HASPROTO,
	&fdfs_mntopts
};
555 
/* Module linkage element describing fdfs as a file system module. */
static struct modlfs modlfs = {
	&mod_fsops,
	"filesystem for fd",
	&vfw
};
561 
/* NULL-terminated linkage list passed to mod_install() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlfs,
	NULL
};
567 
/*
 * Module load entry point: installs the module described by modlinkage
 * and returns the result of mod_install().
 */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
573 
/*
 * Module information entry point: reports on this module through
 * mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
579