xref: /titanic_44/usr/src/uts/common/fs/fd/fdops.c (revision 1a5e258f5471356ca102c7176637cdce45bac147)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
26 /*	  All rights reserved.  	*/
27 
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/cmn_err.h>
32 #include <sys/debug.h>
33 #include <sys/dirent.h>
34 #include <sys/errno.h>
35 #include <sys/file.h>
36 #include <sys/inline.h>
37 #include <sys/kmem.h>
38 #include <sys/pathname.h>
39 #include <sys/resource.h>
40 #include <sys/statvfs.h>
41 #include <sys/mount.h>
42 #include <sys/sysmacros.h>
43 #include <sys/systm.h>
44 #include <sys/uio.h>
45 #include <sys/vfs.h>
46 #include <sys/vfs_opreg.h>
47 #include <sys/vnode.h>
48 #include <sys/cred.h>
49 #include <sys/mntent.h>
50 #include <sys/mount.h>
51 #include <sys/user.h>
52 #include <sys/t_lock.h>
53 #include <sys/modctl.h>
54 #include <sys/policy.h>
55 #include <fs/fs_subr.h>
56 #include <sys/atomic.h>
57 #include <sys/mkdev.h>
58 
/*
 * round(r) rounds r up to the next multiple of sizeof (int);
 * fdtoi(n) maps file descriptor n to the inode number reported for it.
 */
#define	round(r)	(((r) + sizeof (int) - 1) & ~(sizeof (int) - 1))
#define	fdtoi(n)	((n) + 100)

/*
 * Fixed-size directory record handed out by fdread(): a short inode
 * number followed by a FDDIRSIZE-byte name field.
 */
#define	FDDIRSIZE	14
struct fddirect {
	short	d_ino;
	char	d_name[FDDIRSIZE];
};

#define	FDROOTINO	2	/* inode number reported for the directory */
#define	FDSDSIZE	(sizeof (struct fddirect))	/* bytes per record */
#define	FDNSIZE		10	/* name length used to size dirent buffers */
71 
72 static int		fdfstype = 0;
73 static major_t		fdfsmaj;
74 static minor_t		fdfsmin;
75 static major_t		fdrmaj;
76 static kmutex_t		fd_minor_lock;
77 
78 static int fdget(vnode_t *, char *, vnode_t **);
79 
80 /* ARGSUSED */
81 static int
82 fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
83 {
84 	if ((*vpp)->v_type != VDIR) {
85 		mutex_enter(&(*vpp)->v_lock);
86 		(*vpp)->v_flag |= VDUP;
87 		mutex_exit(&(*vpp)->v_lock);
88 	}
89 	return (0);
90 }
91 
92 /* ARGSUSED */
93 static int
94 fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
95 	caller_context_t *ct)
96 {
97 	return (0);
98 }
99 
100 /* ARGSUSED */
101 static int
102 fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
103 {
104 	static struct fddirect dotbuf[] = {
105 		{ FDROOTINO, "."  },
106 		{ FDROOTINO, ".." }
107 	};
108 	struct fddirect dirbuf;
109 	int i, n;
110 	int minfd, maxfd, modoff, error = 0;
111 	int nentries;
112 	rctl_qty_t fdno_ctl;
113 	int endoff;
114 
115 	if (vp->v_type != VDIR)
116 		return (ENOSYS);
117 
118 	mutex_enter(&curproc->p_lock);
119 	fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
120 	    curproc->p_rctls, curproc);
121 	nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
122 	mutex_exit(&curproc->p_lock);
123 
124 	endoff = (nentries + 2) * FDSDSIZE;
125 
126 	/*
127 	 * Fake up ".", "..", and the /dev/fd directory entries.
128 	 */
129 	if (uiop->uio_loffset < (offset_t)0 ||
130 	    uiop->uio_loffset >= (offset_t)endoff ||
131 	    uiop->uio_resid <= 0)
132 		return (0);
133 	ASSERT(uiop->uio_loffset <= MAXOFF_T);
134 	if (uiop->uio_offset < 2*FDSDSIZE) {
135 		error = uiomove((caddr_t)dotbuf + uiop->uio_offset,
136 		    MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset),
137 		    UIO_READ, uiop);
138 		if (uiop->uio_resid <= 0 || error)
139 			return (error);
140 	}
141 	minfd = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE;
142 	maxfd = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE;
143 	modoff = uiop->uio_offset % FDSDSIZE;
144 
145 	for (i = 0; i < FDDIRSIZE; i++)
146 		dirbuf.d_name[i] = '\0';
147 	for (i = minfd; i < MIN(maxfd, nentries); i++) {
148 		n = i;
149 		dirbuf.d_ino = fdtoi(n);
150 		numtos((ulong_t)n, dirbuf.d_name);
151 		error = uiomove((caddr_t)&dirbuf + modoff,
152 		    MIN(uiop->uio_resid, FDSDSIZE - modoff),
153 		    UIO_READ, uiop);
154 		if (uiop->uio_resid <= 0 || error)
155 			return (error);
156 		modoff = 0;
157 	}
158 
159 	return (error);
160 }
161 
162 /* ARGSUSED */
163 static int
164 fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
165 	caller_context_t *ct)
166 {
167 	vfs_t *vfsp = vp->v_vfsp;
168 	timestruc_t now;
169 
170 	if (vp->v_type == VDIR) {
171 		vap->va_nlink = 2;
172 		vap->va_size = (u_offset_t)
173 		    ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE);
174 		vap->va_mode = 0555;
175 		vap->va_nodeid = (ino64_t)FDROOTINO;
176 	} else {
177 		vap->va_nlink = 1;
178 		vap->va_size = (u_offset_t)0;
179 		vap->va_mode = 0666;
180 		vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev));
181 	}
182 	vap->va_type = vp->v_type;
183 	vap->va_rdev = vp->v_rdev;
184 	vap->va_blksize = vfsp->vfs_bsize;
185 	vap->va_nblocks = (fsblkcnt64_t)0;
186 	gethrestime(&now);
187 	vap->va_atime = vap->va_mtime = vap->va_ctime = now;
188 	vap->va_uid = 0;
189 	vap->va_gid = 0;
190 	vap->va_fsid = vfsp->vfs_dev;
191 	vap->va_seq = 0;
192 	return (0);
193 }
194 
195 /* ARGSUSED */
196 static int
197 fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
198 {
199 	return (0);
200 }
201 
202 /* ARGSUSED */
203 static int
204 fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp,
205 	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
206 	int *direntflags, pathname_t *realpnp)
207 {
208 	if (comp[0] == 0 || strcmp(comp, ".") == 0 || strcmp(comp, "..") == 0) {
209 		VN_HOLD(dp);
210 		*vpp = dp;
211 		return (0);
212 	}
213 	return (fdget(dp, comp, vpp));
214 }
215 
216 /* ARGSUSED */
217 static int
218 fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl,
219 	int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
220 	vsecattr_t *vsecp)
221 {
222 	return (fdget(dvp, comp, vpp));
223 }
224 
225 /* ARGSUSED */
226 static int
227 fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct,
228 	int flags)
229 {
230 	/* bp holds one dirent structure */
231 	u_offset_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (u_offset_t)];
232 	struct dirent64 *dirent = (struct dirent64 *)bp;
233 	int reclen, nentries;
234 	rctl_qty_t fdno_ctl;
235 	int  n;
236 	int oresid;
237 	off_t off;
238 
239 	if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 ||
240 	    (uiop->uio_offset % FDSDSIZE) != 0)
241 		return (ENOENT);
242 
243 	ASSERT(uiop->uio_loffset <= MAXOFF_T);
244 	oresid = uiop->uio_resid;
245 	bzero(bp, sizeof (bp));
246 
247 	mutex_enter(&curproc->p_lock);
248 	fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
249 	    curproc->p_rctls, curproc);
250 	nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
251 	mutex_exit(&curproc->p_lock);
252 
253 	while (uiop->uio_resid > 0) {
254 		if ((off = uiop->uio_offset) == 0) {	/* "." */
255 			dirent->d_ino = (ino64_t)FDROOTINO;
256 			dirent->d_name[0] = '.';
257 			dirent->d_name[1] = '\0';
258 			reclen = DIRENT64_RECLEN(1);
259 		} else if (off == FDSDSIZE) {		/* ".." */
260 			dirent->d_ino = (ino64_t)FDROOTINO;
261 			dirent->d_name[0] = '.';
262 			dirent->d_name[1] = '.';
263 			dirent->d_name[2] = '\0';
264 			reclen = DIRENT64_RECLEN(2);
265 		} else {
266 			/*
267 			 * Return entries corresponding to the allowable
268 			 * number of file descriptors for this process.
269 			 */
270 			if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries)
271 				break;
272 			dirent->d_ino = (ino64_t)fdtoi(n);
273 			numtos((ulong_t)n, dirent->d_name);
274 			reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
275 		}
276 		dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE);
277 		dirent->d_reclen = (ushort_t)reclen;
278 
279 		if (reclen > uiop->uio_resid) {
280 			/*
281 			 * Error if no entries have been returned yet.
282 			 */
283 			if (uiop->uio_resid == oresid)
284 				return (EINVAL);
285 			break;
286 		}
287 		/*
288 		 * uiomove() updates both resid and offset by the same
289 		 * amount.  But we want offset to change in increments
290 		 * of FDSDSIZE, which is different from the number of bytes
291 		 * being returned to the user.  So we set uio_offset
292 		 * separately, ignoring what uiomove() does.
293 		 */
294 		if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
295 			return (EFAULT);
296 		uiop->uio_offset = off + FDSDSIZE;
297 	}
298 	if (eofp)
299 		*eofp = ((uiop->uio_offset-2*FDSDSIZE)/FDSDSIZE >= nentries);
300 	return (0);
301 }
302 
303 /* ARGSUSED */
304 static void
305 fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
306 {
307 	mutex_enter(&vp->v_lock);
308 	ASSERT(vp->v_count >= 1);
309 	if (--vp->v_count != 0) {
310 		mutex_exit(&vp->v_lock);
311 		return;
312 	}
313 	mutex_exit(&vp->v_lock);
314 	vn_invalid(vp);
315 	vn_free(vp);
316 }
317 
318 static struct vnodeops *fd_vnodeops;
319 
320 static const fs_operation_def_t fd_vnodeops_template[] = {
321 	VOPNAME_OPEN,		{ .vop_open = fdopen },
322 	VOPNAME_CLOSE,		{ .vop_close = fdclose },
323 	VOPNAME_READ,		{ .vop_read = fdread },
324 	VOPNAME_GETATTR,	{ .vop_getattr = fdgetattr },
325 	VOPNAME_ACCESS,		{ .vop_access = fdaccess },
326 	VOPNAME_LOOKUP,		{ .vop_lookup = fdlookup },
327 	VOPNAME_CREATE,		{ .vop_create = fdcreate },
328 	VOPNAME_READDIR,	{ .vop_readdir = fdreaddir },
329 	VOPNAME_INACTIVE,	{ .vop_inactive = fdinactive },
330 	VOPNAME_FRLOCK,		{ .error = fs_error },
331 	VOPNAME_POLL,		{ .error = fs_error },
332 	VOPNAME_DISPOSE,	{ .error = fs_error },
333 	NULL,			NULL
334 };
335 
336 static int
337 fdget(struct vnode *dvp, char *comp, struct vnode **vpp)
338 {
339 	int n = 0;
340 	struct vnode *vp;
341 
342 	while (*comp) {
343 		if (*comp < '0' || *comp > '9')
344 			return (ENOENT);
345 		n = 10 * n + *comp++ - '0';
346 	}
347 	vp = vn_alloc(KM_SLEEP);
348 	vp->v_type = VCHR;
349 	vp->v_vfsp = dvp->v_vfsp;
350 	vn_setops(vp, fd_vnodeops);
351 	vp->v_data = NULL;
352 	vp->v_flag = VNOMAP;
353 	vp->v_rdev = makedevice(fdrmaj, n);
354 	vn_exists(vp);
355 	*vpp = vp;
356 	return (0);
357 }
358 
359 /*
360  * fdfs is mounted on /dev/fd, however, there are two interesting
361  * possibilities - two threads racing to do the same mount (protected
362  * by vfs locking), and two threads mounting fdfs in different places.
363  */
364 /*ARGSUSED*/
365 static int
366 fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
367 {
368 	struct vnode *vp;
369 
370 	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
371 		return (EPERM);
372 	if (mvp->v_type != VDIR)
373 		return (ENOTDIR);
374 
375 	mutex_enter(&mvp->v_lock);
376 	if ((uap->flags & MS_OVERLAY) == 0 &&
377 	    (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
378 		mutex_exit(&mvp->v_lock);
379 		return (EBUSY);
380 	}
381 	mutex_exit(&mvp->v_lock);
382 
383 	/*
384 	 * Having the resource be anything but "fd" doesn't make sense
385 	 */
386 	vfs_setresource(vfsp, "fd", 0);
387 
388 	vp = vn_alloc(KM_SLEEP);
389 	vp->v_vfsp = vfsp;
390 	vn_setops(vp, fd_vnodeops);
391 	vp->v_type = VDIR;
392 	vp->v_data = NULL;
393 	vp->v_flag |= VROOT;
394 	vfsp->vfs_fstype = fdfstype;
395 	vfsp->vfs_data = (char *)vp;
396 	mutex_enter(&fd_minor_lock);
397 	do {
398 		fdfsmin = (fdfsmin + 1) & L_MAXMIN32;
399 		vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin);
400 	} while (vfs_devismounted(vfsp->vfs_dev));
401 	mutex_exit(&fd_minor_lock);
402 	vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype);
403 	vfsp->vfs_bsize = 1024;
404 	return (0);
405 }
406 
407 /* ARGSUSED */
408 static int
409 fdunmount(vfs_t *vfsp, int flag, cred_t *cr)
410 {
411 	vnode_t *rvp;
412 
413 	if (secpolicy_fs_unmount(cr, vfsp) != 0)
414 		return (EPERM);
415 
416 	/*
417 	 * forced unmount is not supported by this file system
418 	 * and thus, ENOTSUP, is being returned.
419 	 */
420 	if (flag & MS_FORCE)
421 		return (ENOTSUP);
422 
423 	rvp = (vnode_t *)vfsp->vfs_data;
424 	if (rvp->v_count > 1)
425 		return (EBUSY);
426 
427 	VN_RELE(rvp);
428 	return (0);
429 }
430 
431 /* ARGSUSED */
432 static int
433 fdroot(vfs_t *vfsp, vnode_t **vpp)
434 {
435 	vnode_t *vp = (vnode_t *)vfsp->vfs_data;
436 
437 	VN_HOLD(vp);
438 	*vpp = vp;
439 	return (0);
440 }
441 
442 /*
443  * No locking required because I held the root vnode before calling this
444  * function so the vfs won't disappear on me.  To be more explicit:
445  * fdvrootp->v_count will be greater than 1 so fdunmount will just return.
446  */
447 static int
448 fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp)
449 {
450 	dev32_t d32;
451 	rctl_qty_t fdno_ctl;
452 
453 	mutex_enter(&curproc->p_lock);
454 	fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
455 	    curproc->p_rctls, curproc);
456 	mutex_exit(&curproc->p_lock);
457 
458 	bzero(sp, sizeof (*sp));
459 	sp->f_bsize = 1024;
460 	sp->f_frsize = 1024;
461 	sp->f_blocks = (fsblkcnt64_t)0;
462 	sp->f_bfree = (fsblkcnt64_t)0;
463 	sp->f_bavail = (fsblkcnt64_t)0;
464 	sp->f_files = (fsfilcnt64_t)
465 	    (MIN(P_FINFO(curproc)->fi_nfiles, fdno_ctl + 2));
466 	sp->f_ffree = (fsfilcnt64_t)0;
467 	sp->f_favail = (fsfilcnt64_t)0;
468 	(void) cmpldev(&d32, vfsp->vfs_dev);
469 	sp->f_fsid = d32;
470 	(void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name);
471 	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
472 	sp->f_namemax = FDNSIZE;
473 	(void) strcpy(sp->f_fstr, "/dev/fd");
474 	(void) strcpy(&sp->f_fstr[8], "/dev/fd");
475 	return (0);
476 }
477 
478 int
479 fdinit(int fstype, char *name)
480 {
481 	static const fs_operation_def_t fd_vfsops_template[] = {
482 		VFSNAME_MOUNT,		{ .vfs_mount = fdmount },
483 		VFSNAME_UNMOUNT,	{ .vfs_unmount = fdunmount },
484 		VFSNAME_ROOT, 		{ .vfs_root = fdroot },
485 		VFSNAME_STATVFS,	{ .vfs_statvfs = fdstatvfs },
486 		NULL,			NULL
487 	};
488 	int error;
489 
490 	fdfstype = fstype;
491 	ASSERT(fdfstype != 0);
492 
493 	/*
494 	 * Associate VFS ops vector with this fstype.
495 	 */
496 	error = vfs_setfsops(fstype, fd_vfsops_template, NULL);
497 	if (error != 0) {
498 		cmn_err(CE_WARN, "fdinit: bad vnode ops template");
499 		return (error);
500 	}
501 
502 	error = vn_make_ops(name, fd_vnodeops_template, &fd_vnodeops);
503 	if (error != 0) {
504 		(void) vfs_freevfsops_by_type(fstype);
505 		cmn_err(CE_WARN, "fdinit: bad vnode ops template");
506 		return (error);
507 	}
508 
509 	/*
510 	 * Assign unique "device" numbers (reported by stat(2)).
511 	 */
512 	fdfsmaj = getudev();
513 	fdrmaj = getudev();
514 	if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) {
515 		cmn_err(CE_WARN, "fdinit: can't get unique device numbers");
516 		if (fdfsmaj == (major_t)-1)
517 			fdfsmaj = 0;
518 		if (fdrmaj == (major_t)-1)
519 			fdrmaj = 0;
520 	}
521 	mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL);
522 	return (0);
523 }
524 
525 /*
526  * FDFS Mount options table
527  */
528 static char *rw_cancel[] = { MNTOPT_RO, NULL };
529 
530 static mntopt_t mntopts[] = {
531 /*
532  *	option name		cancel option	default arg	flags
533  */
534 	{ MNTOPT_RW,		rw_cancel,	NULL,		MO_DEFAULT,
535 		(void *)MNTOPT_NOINTR },
536 	{ MNTOPT_IGNORE,	NULL,		NULL,		0,
537 		(void *)0 },
538 };
539 
540 static mntopts_t fdfs_mntopts = {
541 	sizeof (mntopts) / sizeof (mntopt_t),
542 	mntopts
543 };
544 
545 static vfsdef_t vfw = {
546 	VFSDEF_VERSION,
547 	"fd",
548 	fdinit,
549 	VSW_HASPROTO | VSW_ZMOUNT,
550 	&fdfs_mntopts
551 };
552 
553 static struct modlfs modlfs = {
554 	&mod_fsops,
555 	"filesystem for fd",
556 	&vfw
557 };
558 
559 static struct modlinkage modlinkage = {
560 	MODREV_1,
561 	&modlfs,
562 	NULL
563 };
564 
565 int
566 _init(void)
567 {
568 	return (mod_install(&modlinkage));
569 }
570 
571 int
572 _info(struct modinfo *modinfop)
573 {
574 	return (mod_info(&modlinkage, modinfop));
575 }
576