1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2017 by Delphix. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All rights reserved. */
28
29
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/cmn_err.h>
33 #include <sys/debug.h>
34 #include <sys/dirent.h>
35 #include <sys/errno.h>
36 #include <sys/file.h>
37 #include <sys/inline.h>
38 #include <sys/kmem.h>
39 #include <sys/pathname.h>
40 #include <sys/resource.h>
41 #include <sys/statvfs.h>
42 #include <sys/mount.h>
43 #include <sys/sysmacros.h>
44 #include <sys/systm.h>
45 #include <sys/uio.h>
46 #include <sys/vfs.h>
47 #include <sys/vfs_opreg.h>
48 #include <sys/vnode.h>
49 #include <sys/cred.h>
50 #include <sys/mntent.h>
51 #include <sys/mount.h>
52 #include <sys/user.h>
53 #include <sys/t_lock.h>
54 #include <sys/modctl.h>
55 #include <sys/policy.h>
56 #include <fs/fs_subr.h>
57 #include <sys/atomic.h>
58 #include <sys/mkdev.h>
59
/* Round r up to the next multiple of sizeof (int). */
#define	round(r)	(((r) + sizeof (int) - 1) & ~(sizeof (int) - 1))

/* Inode number reported for file descriptor n. */
#define	fdtoi(n)	((n) + 100)

/*
 * Fixed-size directory record handed out by fdread(): a short inode
 * number followed by a NUL-padded name of at most FDDIRSIZE bytes.
 */
#define	FDDIRSIZE	14
struct fddirect {
	short	d_ino;
	char	d_name[FDDIRSIZE];
};

#define	FDROOTINO	2	/* inode number reported for the directory */
#define	FDSDSIZE	(sizeof (struct fddirect))	/* one record */
#define	FDNSIZE		10	/* name length used to size dirent buffers */
72
73 static int fdfstype = 0;
74 static major_t fdfsmaj;
75 static minor_t fdfsmin;
76 static major_t fdrmaj;
77 static kmutex_t fd_minor_lock;
78
79 static int fdget(vnode_t *, char *, vnode_t **);
80
81 /* ARGSUSED */
82 static int
fdopen(vnode_t ** vpp,int mode,cred_t * cr,caller_context_t * ct)83 fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
84 {
85 if ((*vpp)->v_type != VDIR) {
86 mutex_enter(&(*vpp)->v_lock);
87 (*vpp)->v_flag |= VDUP;
88 mutex_exit(&(*vpp)->v_lock);
89 }
90 return (0);
91 }
92
93 /* ARGSUSED */
94 static int
fdclose(vnode_t * vp,int flag,int count,offset_t offset,cred_t * cr,caller_context_t * ct)95 fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
96 caller_context_t *ct)
97 {
98 return (0);
99 }
100
101 /* ARGSUSED */
102 static int
fdread(vnode_t * vp,uio_t * uiop,int ioflag,cred_t * cr,caller_context_t * ct)103 fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
104 {
105 static struct fddirect dotbuf[] = {
106 { FDROOTINO, "." },
107 { FDROOTINO, ".." }
108 };
109 struct fddirect dirbuf;
110 int i, n;
111 int minfd, maxfd, modoff, error = 0;
112 int nentries;
113 rctl_qty_t fdno_ctl;
114 int endoff;
115
116 if (vp->v_type != VDIR)
117 return (ENOSYS);
118
119 mutex_enter(&curproc->p_lock);
120 fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
121 curproc->p_rctls, curproc);
122 nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
123 mutex_exit(&curproc->p_lock);
124
125 endoff = (nentries + 2) * FDSDSIZE;
126
127 /*
128 * Fake up ".", "..", and the /dev/fd directory entries.
129 */
130 if (uiop->uio_loffset < (offset_t)0 ||
131 uiop->uio_loffset >= (offset_t)endoff ||
132 uiop->uio_resid <= 0)
133 return (0);
134 ASSERT(uiop->uio_loffset <= MAXOFF_T);
135 if (uiop->uio_offset < 2*FDSDSIZE) {
136 error = uiomove((caddr_t)dotbuf + uiop->uio_offset,
137 MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset),
138 UIO_READ, uiop);
139 if (uiop->uio_resid <= 0 || error)
140 return (error);
141 }
142 minfd = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE;
143 maxfd = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE;
144 modoff = uiop->uio_offset % FDSDSIZE;
145
146 for (i = 0; i < FDDIRSIZE; i++)
147 dirbuf.d_name[i] = '\0';
148 for (i = minfd; i < MIN(maxfd, nentries); i++) {
149 n = i;
150 dirbuf.d_ino = fdtoi(n);
151 numtos((ulong_t)n, dirbuf.d_name);
152 error = uiomove((caddr_t)&dirbuf + modoff,
153 MIN(uiop->uio_resid, FDSDSIZE - modoff),
154 UIO_READ, uiop);
155 if (uiop->uio_resid <= 0 || error)
156 return (error);
157 modoff = 0;
158 }
159
160 return (error);
161 }
162
163 /* ARGSUSED */
164 static int
fdgetattr(vnode_t * vp,vattr_t * vap,int flags,cred_t * cr,caller_context_t * ct)165 fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
166 caller_context_t *ct)
167 {
168 vfs_t *vfsp = vp->v_vfsp;
169 timestruc_t now;
170
171 if (vp->v_type == VDIR) {
172 vap->va_nlink = 2;
173 vap->va_size = (u_offset_t)
174 ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE);
175 vap->va_mode = 0555;
176 vap->va_nodeid = (ino64_t)FDROOTINO;
177 } else {
178 vap->va_nlink = 1;
179 vap->va_size = (u_offset_t)0;
180 vap->va_mode = 0666;
181 vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev));
182 }
183 vap->va_type = vp->v_type;
184 vap->va_rdev = vp->v_rdev;
185 vap->va_blksize = vfsp->vfs_bsize;
186 vap->va_nblocks = (fsblkcnt64_t)0;
187 gethrestime(&now);
188 vap->va_atime = vap->va_mtime = vap->va_ctime = now;
189 vap->va_uid = 0;
190 vap->va_gid = 0;
191 vap->va_fsid = vfsp->vfs_dev;
192 vap->va_seq = 0;
193 return (0);
194 }
195
196 /* ARGSUSED */
197 static int
fdaccess(vnode_t * vp,int mode,int flags,cred_t * cr,caller_context_t * ct)198 fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
199 {
200 return (0);
201 }
202
203 /* ARGSUSED */
204 static int
fdlookup(vnode_t * dp,char * comp,vnode_t ** vpp,pathname_t * pnp,int flags,vnode_t * rdir,cred_t * cr,caller_context_t * ct,int * direntflags,pathname_t * realpnp)205 fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp, int flags,
206 vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags,
207 pathname_t *realpnp)
208 {
209 if (comp[0] == 0 || strcmp(comp, ".") == 0 || strcmp(comp, "..") == 0) {
210 VN_HOLD(dp);
211 *vpp = dp;
212 return (0);
213 }
214 return (fdget(dp, comp, vpp));
215 }
216
217 /* ARGSUSED */
218 static int
fdcreate(vnode_t * dvp,char * comp,vattr_t * vap,enum vcexcl excl,int mode,vnode_t ** vpp,cred_t * cr,int flag,caller_context_t * ct,vsecattr_t * vsecp)219 fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl, int mode,
220 vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
221 vsecattr_t *vsecp)
222 {
223 return (fdget(dvp, comp, vpp));
224 }
225
226 /* ARGSUSED */
227 static int
fdreaddir(vnode_t * vp,uio_t * uiop,cred_t * cr,int * eofp,caller_context_t * ct,int flags)228 fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct,
229 int flags)
230 {
231 /* bp holds one dirent structure */
232 u_offset_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (u_offset_t)];
233 struct dirent64 *dirent = (struct dirent64 *)bp;
234 int reclen, nentries;
235 rctl_qty_t fdno_ctl;
236 int n;
237 int oresid;
238 off_t off;
239
240 if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 ||
241 (uiop->uio_offset % FDSDSIZE) != 0)
242 return (ENOENT);
243
244 ASSERT(uiop->uio_loffset <= MAXOFF_T);
245 oresid = uiop->uio_resid;
246 bzero(bp, sizeof (bp));
247
248 mutex_enter(&curproc->p_lock);
249 fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
250 curproc->p_rctls, curproc);
251 nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
252 mutex_exit(&curproc->p_lock);
253
254 while (uiop->uio_resid > 0) {
255 if ((off = uiop->uio_offset) == 0) { /* "." */
256 dirent->d_ino = (ino64_t)FDROOTINO;
257 dirent->d_name[0] = '.';
258 dirent->d_name[1] = '\0';
259 reclen = DIRENT64_RECLEN(1);
260 } else if (off == FDSDSIZE) { /* ".." */
261 dirent->d_ino = (ino64_t)FDROOTINO;
262 dirent->d_name[0] = '.';
263 dirent->d_name[1] = '.';
264 dirent->d_name[2] = '\0';
265 reclen = DIRENT64_RECLEN(2);
266 } else {
267 /*
268 * Return entries corresponding to the allowable
269 * number of file descriptors for this process.
270 */
271 if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries)
272 break;
273 dirent->d_ino = (ino64_t)fdtoi(n);
274 numtos((ulong_t)n, dirent->d_name);
275 reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
276 }
277 dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE);
278 dirent->d_reclen = (ushort_t)reclen;
279
280 if (reclen > uiop->uio_resid) {
281 /*
282 * Error if no entries have been returned yet.
283 */
284 if (uiop->uio_resid == oresid)
285 return (EINVAL);
286 break;
287 }
288 /*
289 * uiomove() updates both resid and offset by the same
290 * amount. But we want offset to change in increments
291 * of FDSDSIZE, which is different from the number of bytes
292 * being returned to the user. So we set uio_offset
293 * separately, ignoring what uiomove() does.
294 */
295 if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
296 return (EFAULT);
297 uiop->uio_offset = off + FDSDSIZE;
298 }
299 if (eofp)
300 *eofp = ((uiop->uio_offset-2*FDSDSIZE)/FDSDSIZE >= nentries);
301 return (0);
302 }
303
304 /* ARGSUSED */
305 static void
fdinactive(vnode_t * vp,cred_t * cr,caller_context_t * ct)306 fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
307 {
308 mutex_enter(&vp->v_lock);
309 ASSERT(vp->v_count >= 1);
310 VN_RELE_LOCKED(vp);
311 if (vp->v_count != 0) {
312 mutex_exit(&vp->v_lock);
313 return;
314 }
315 mutex_exit(&vp->v_lock);
316 vn_invalid(vp);
317 vn_free(vp);
318 }
319
320 static struct vnodeops *fd_vnodeops;
321
322 static const fs_operation_def_t fd_vnodeops_template[] = {
323 VOPNAME_OPEN, { .vop_open = fdopen },
324 VOPNAME_CLOSE, { .vop_close = fdclose },
325 VOPNAME_READ, { .vop_read = fdread },
326 VOPNAME_GETATTR, { .vop_getattr = fdgetattr },
327 VOPNAME_ACCESS, { .vop_access = fdaccess },
328 VOPNAME_LOOKUP, { .vop_lookup = fdlookup },
329 VOPNAME_CREATE, { .vop_create = fdcreate },
330 VOPNAME_READDIR, { .vop_readdir = fdreaddir },
331 VOPNAME_INACTIVE, { .vop_inactive = fdinactive },
332 VOPNAME_FRLOCK, { .error = fs_error },
333 VOPNAME_POLL, { .error = fs_error },
334 VOPNAME_DISPOSE, { .error = fs_error },
335 NULL, NULL
336 };
337
338 static int
fdget(struct vnode * dvp,char * comp,struct vnode ** vpp)339 fdget(struct vnode *dvp, char *comp, struct vnode **vpp)
340 {
341 int n = 0;
342 struct vnode *vp;
343
344 while (*comp) {
345 if (*comp < '0' || *comp > '9')
346 return (ENOENT);
347 n = 10 * n + *comp++ - '0';
348 }
349 vp = vn_alloc(KM_SLEEP);
350 vp->v_type = VCHR;
351 vp->v_vfsp = dvp->v_vfsp;
352 vn_setops(vp, fd_vnodeops);
353 vp->v_data = NULL;
354 vp->v_flag = VNOMAP;
355 vp->v_rdev = makedevice(fdrmaj, n);
356 vn_exists(vp);
357 *vpp = vp;
358 return (0);
359 }
360
361 /*
362 * fdfs is mounted on /dev/fd, however, there are two interesting
363 * possibilities - two threads racing to do the same mount (protected
364 * by vfs locking), and two threads mounting fdfs in different places.
365 */
366 /*ARGSUSED*/
367 static int
fdmount(vfs_t * vfsp,vnode_t * mvp,struct mounta * uap,cred_t * cr)368 fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
369 {
370 struct vnode *vp;
371
372 if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
373 return (EPERM);
374 if (mvp->v_type != VDIR)
375 return (ENOTDIR);
376
377 mutex_enter(&mvp->v_lock);
378 if ((uap->flags & MS_OVERLAY) == 0 &&
379 (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
380 mutex_exit(&mvp->v_lock);
381 return (EBUSY);
382 }
383 mutex_exit(&mvp->v_lock);
384
385 /*
386 * Having the resource be anything but "fd" doesn't make sense
387 */
388 vfs_setresource(vfsp, "fd", 0);
389
390 vp = vn_alloc(KM_SLEEP);
391 vp->v_vfsp = vfsp;
392 vn_setops(vp, fd_vnodeops);
393 vp->v_type = VDIR;
394 vp->v_data = NULL;
395 vp->v_flag |= VROOT;
396 vfsp->vfs_fstype = fdfstype;
397 vfsp->vfs_data = (char *)vp;
398 mutex_enter(&fd_minor_lock);
399 do {
400 fdfsmin = (fdfsmin + 1) & L_MAXMIN32;
401 vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin);
402 } while (vfs_devismounted(vfsp->vfs_dev));
403 mutex_exit(&fd_minor_lock);
404 vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype);
405 vfsp->vfs_bsize = 1024;
406 return (0);
407 }
408
409 /* ARGSUSED */
410 static int
fdunmount(vfs_t * vfsp,int flag,cred_t * cr)411 fdunmount(vfs_t *vfsp, int flag, cred_t *cr)
412 {
413 vnode_t *rvp;
414
415 if (secpolicy_fs_unmount(cr, vfsp) != 0)
416 return (EPERM);
417
418 /*
419 * forced unmount is not supported by this file system
420 * and thus, ENOTSUP, is being returned.
421 */
422 if (flag & MS_FORCE)
423 return (ENOTSUP);
424
425 rvp = (vnode_t *)vfsp->vfs_data;
426 if (rvp->v_count > 1)
427 return (EBUSY);
428
429 VN_RELE(rvp);
430 return (0);
431 }
432
433 /* ARGSUSED */
434 static int
fdroot(vfs_t * vfsp,vnode_t ** vpp)435 fdroot(vfs_t *vfsp, vnode_t **vpp)
436 {
437 vnode_t *vp = (vnode_t *)vfsp->vfs_data;
438
439 VN_HOLD(vp);
440 *vpp = vp;
441 return (0);
442 }
443
444 /*
445 * No locking required because I held the root vnode before calling this
446 * function so the vfs won't disappear on me. To be more explicit:
447 * fdvrootp->v_count will be greater than 1 so fdunmount will just return.
448 */
449 static int
fdstatvfs(struct vfs * vfsp,struct statvfs64 * sp)450 fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp)
451 {
452 dev32_t d32;
453 rctl_qty_t fdno_ctl;
454
455 mutex_enter(&curproc->p_lock);
456 fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
457 curproc->p_rctls, curproc);
458 mutex_exit(&curproc->p_lock);
459
460 bzero(sp, sizeof (*sp));
461 sp->f_bsize = 1024;
462 sp->f_frsize = 1024;
463 sp->f_blocks = (fsblkcnt64_t)0;
464 sp->f_bfree = (fsblkcnt64_t)0;
465 sp->f_bavail = (fsblkcnt64_t)0;
466 sp->f_files = (fsfilcnt64_t)
467 (MIN(P_FINFO(curproc)->fi_nfiles, fdno_ctl + 2));
468 sp->f_ffree = (fsfilcnt64_t)0;
469 sp->f_favail = (fsfilcnt64_t)0;
470 (void) cmpldev(&d32, vfsp->vfs_dev);
471 sp->f_fsid = d32;
472 (void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name);
473 sp->f_flag = vf_to_stf(vfsp->vfs_flag);
474 sp->f_namemax = FDNSIZE;
475 (void) strcpy(sp->f_fstr, "/dev/fd");
476 (void) strcpy(&sp->f_fstr[8], "/dev/fd");
477 return (0);
478 }
479
480 int
fdinit(int fstype,char * name)481 fdinit(int fstype, char *name)
482 {
483 static const fs_operation_def_t fd_vfsops_template[] = {
484 VFSNAME_MOUNT, { .vfs_mount = fdmount },
485 VFSNAME_UNMOUNT, { .vfs_unmount = fdunmount },
486 VFSNAME_ROOT, { .vfs_root = fdroot },
487 VFSNAME_STATVFS, { .vfs_statvfs = fdstatvfs },
488 NULL, NULL
489 };
490 int error;
491
492 fdfstype = fstype;
493 ASSERT(fdfstype != 0);
494
495 /*
496 * Associate VFS ops vector with this fstype.
497 */
498 error = vfs_setfsops(fstype, fd_vfsops_template, NULL);
499 if (error != 0) {
500 cmn_err(CE_WARN, "fdinit: bad vnode ops template");
501 return (error);
502 }
503
504 error = vn_make_ops(name, fd_vnodeops_template, &fd_vnodeops);
505 if (error != 0) {
506 (void) vfs_freevfsops_by_type(fstype);
507 cmn_err(CE_WARN, "fdinit: bad vnode ops template");
508 return (error);
509 }
510
511 /*
512 * Assign unique "device" numbers (reported by stat(2)).
513 */
514 fdfsmaj = getudev();
515 fdrmaj = getudev();
516 if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) {
517 cmn_err(CE_WARN, "fdinit: can't get unique device numbers");
518 if (fdfsmaj == (major_t)-1)
519 fdfsmaj = 0;
520 if (fdrmaj == (major_t)-1)
521 fdrmaj = 0;
522 }
523 mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL);
524 return (0);
525 }
526
527 /*
528 * FDFS Mount options table
529 */
530 static char *rw_cancel[] = { MNTOPT_RO, NULL };
531
532 static mntopt_t mntopts[] = {
533 /*
534 * option name cancel option default arg flags
535 */
536 { MNTOPT_RW, rw_cancel, NULL, MO_DEFAULT,
537 (void *)MNTOPT_NOINTR },
538 { MNTOPT_IGNORE, NULL, NULL, 0,
539 (void *)0 },
540 };
541
542 static mntopts_t fdfs_mntopts = {
543 sizeof (mntopts) / sizeof (mntopt_t),
544 mntopts
545 };
546
547 static vfsdef_t vfw = {
548 VFSDEF_VERSION,
549 "fd",
550 fdinit,
551 VSW_HASPROTO | VSW_ZMOUNT,
552 &fdfs_mntopts
553 };
554
555 static struct modlfs modlfs = {
556 &mod_fsops,
557 "filesystem for fd",
558 &vfw
559 };
560
561 static struct modlinkage modlinkage = {
562 MODREV_1,
563 &modlfs,
564 NULL
565 };
566
567 int
_init(void)568 _init(void)
569 {
570 return (mod_install(&modlinkage));
571 }
572
573 int
_info(struct modinfo * modinfop)574 _info(struct modinfo *modinfop)
575 {
576 return (mod_info(&modlinkage, modinfop));
577 }
578