1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2017 by Delphix. All rights reserved. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All rights reserved. */ 28 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/cmn_err.h> 33 #include <sys/debug.h> 34 #include <sys/dirent.h> 35 #include <sys/errno.h> 36 #include <sys/file.h> 37 #include <sys/inline.h> 38 #include <sys/kmem.h> 39 #include <sys/pathname.h> 40 #include <sys/resource.h> 41 #include <sys/statvfs.h> 42 #include <sys/mount.h> 43 #include <sys/sysmacros.h> 44 #include <sys/systm.h> 45 #include <sys/uio.h> 46 #include <sys/vfs.h> 47 #include <sys/vfs_opreg.h> 48 #include <sys/vnode.h> 49 #include <sys/cred.h> 50 #include <sys/mntent.h> 51 #include <sys/mount.h> 52 #include <sys/user.h> 53 #include <sys/t_lock.h> 54 #include <sys/modctl.h> 55 #include <sys/policy.h> 56 #include <fs/fs_subr.h> 57 #include <sys/atomic.h> 58 #include <sys/mkdev.h> 59 60 #define round(r) (((r)+sizeof (int)-1)&(~(sizeof (int)-1))) 61 #define fdtoi(n) ((n)+100) 62 63 #define FDDIRSIZE 14 64 struct fddirect { 65 short d_ino; 66 char d_name[FDDIRSIZE]; 67 }; 68 69 #define FDROOTINO 2 70 #define FDSDSIZE sizeof (struct fddirect) 71 #define FDNSIZE 10 72 73 static int fdfstype = 0; 74 static major_t fdfsmaj; 75 static minor_t fdfsmin; 76 static major_t fdrmaj; 77 static kmutex_t fd_minor_lock; 78 79 static int fdget(vnode_t *, char *, vnode_t **); 80 81 /* ARGSUSED */ 82 static int 83 fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct) 84 { 85 if ((*vpp)->v_type != VDIR) { 86 mutex_enter(&(*vpp)->v_lock); 87 (*vpp)->v_flag |= VDUP; 88 mutex_exit(&(*vpp)->v_lock); 89 } 90 return (0); 91 } 92 93 /* ARGSUSED */ 94 static int 95 fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 96 caller_context_t *ct) 97 { 98 return (0); 99 } 100 101 /* ARGSUSED */ 102 static int 103 fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) 104 { 105 static struct fddirect dotbuf[] = { 106 { FDROOTINO, "." }, 107 { FDROOTINO, ".." } 108 }; 109 struct fddirect dirbuf; 110 int i, n; 111 int minfd, maxfd, modoff, error = 0; 112 int nentries; 113 rctl_qty_t fdno_ctl; 114 int endoff; 115 116 if (vp->v_type != VDIR) 117 return (ENOSYS); 118 119 mutex_enter(&curproc->p_lock); 120 fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE], 121 curproc->p_rctls, curproc); 122 nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl); 123 mutex_exit(&curproc->p_lock); 124 125 endoff = (nentries + 2) * FDSDSIZE; 126 127 /* 128 * Fake up ".", "..", and the /dev/fd directory entries. 129 */ 130 if (uiop->uio_loffset < (offset_t)0 || 131 uiop->uio_loffset >= (offset_t)endoff || 132 uiop->uio_resid <= 0) 133 return (0); 134 ASSERT(uiop->uio_loffset <= MAXOFF_T); 135 if (uiop->uio_offset < 2*FDSDSIZE) { 136 error = uiomove((caddr_t)dotbuf + uiop->uio_offset, 137 MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset), 138 UIO_READ, uiop); 139 if (uiop->uio_resid <= 0 || error) 140 return (error); 141 } 142 minfd = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE; 143 maxfd = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE; 144 modoff = uiop->uio_offset % FDSDSIZE; 145 146 for (i = 0; i < FDDIRSIZE; i++) 147 dirbuf.d_name[i] = '\0'; 148 for (i = minfd; i < MIN(maxfd, nentries); i++) { 149 n = i; 150 dirbuf.d_ino = fdtoi(n); 151 numtos((ulong_t)n, dirbuf.d_name); 152 error = uiomove((caddr_t)&dirbuf + modoff, 153 MIN(uiop->uio_resid, FDSDSIZE - modoff), 154 UIO_READ, uiop); 155 if (uiop->uio_resid <= 0 || error) 156 return (error); 157 modoff = 0; 158 } 159 160 return (error); 161 } 162 163 /* ARGSUSED */ 164 static int 165 fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 166 caller_context_t *ct) 167 { 168 vfs_t *vfsp = vp->v_vfsp; 169 timestruc_t now; 170 171 if (vp->v_type == VDIR) { 172 vap->va_nlink = 2; 173 vap->va_size = (u_offset_t) 174 ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE); 175 vap->va_mode = 0555; 176 vap->va_nodeid = (ino64_t)FDROOTINO; 177 } else { 178 vap->va_nlink = 1; 179 vap->va_size = (u_offset_t)0; 180 vap->va_mode = 0666; 181 vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev)); 182 } 183 vap->va_type = vp->v_type; 184 vap->va_rdev = vp->v_rdev; 185 vap->va_blksize = vfsp->vfs_bsize; 186 vap->va_nblocks = (fsblkcnt64_t)0; 187 gethrestime(&now); 188 vap->va_atime = vap->va_mtime = vap->va_ctime = now; 189 vap->va_uid = 0; 190 vap->va_gid = 0; 191 vap->va_fsid = vfsp->vfs_dev; 192 vap->va_seq = 0; 193 return (0); 194 } 195 196 /* ARGSUSED */ 197 static int 198 fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 199 { 200 return (0); 201 } 202 203 /* ARGSUSED */ 204 static int 205 fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp, int flags, 206 vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, 207 pathname_t *realpnp) 208 { 209 if (comp[0] == 0 || strcmp(comp, ".") == 0 || strcmp(comp, "..") == 0) { 210 VN_HOLD(dp); 211 *vpp = dp; 212 return (0); 213 } 214 return (fdget(dp, comp, vpp)); 215 } 216 217 /* ARGSUSED */ 218 static int 219 fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl, int mode, 220 vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, 221 vsecattr_t *vsecp) 222 { 223 return (fdget(dvp, comp, vpp)); 224 } 225 226 /* ARGSUSED */ 227 static int 228 fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct, 229 int flags) 230 { 231 /* bp holds one dirent structure */ 232 u_offset_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (u_offset_t)]; 233 struct dirent64 *dirent = (struct dirent64 *)bp; 234 int reclen, nentries; 235 rctl_qty_t fdno_ctl; 236 int n; 237 int oresid; 238 off_t off; 239 240 if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 || 241 (uiop->uio_offset % FDSDSIZE) != 0) 242 return (ENOENT); 243 244 ASSERT(uiop->uio_loffset <= MAXOFF_T); 245 oresid = uiop->uio_resid; 246 bzero(bp, sizeof (bp)); 247 248 mutex_enter(&curproc->p_lock); 249 fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE], 250 curproc->p_rctls, curproc); 251 nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl); 252 mutex_exit(&curproc->p_lock); 253 254 while (uiop->uio_resid > 0) { 255 if ((off = uiop->uio_offset) == 0) { /* "." */ 256 dirent->d_ino = (ino64_t)FDROOTINO; 257 dirent->d_name[0] = '.'; 258 dirent->d_name[1] = '\0'; 259 reclen = DIRENT64_RECLEN(1); 260 } else if (off == FDSDSIZE) { /* ".." */ 261 dirent->d_ino = (ino64_t)FDROOTINO; 262 dirent->d_name[0] = '.'; 263 dirent->d_name[1] = '.'; 264 dirent->d_name[2] = '\0'; 265 reclen = DIRENT64_RECLEN(2); 266 } else { 267 /* 268 * Return entries corresponding to the allowable 269 * number of file descriptors for this process. 270 */ 271 if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries) 272 break; 273 dirent->d_ino = (ino64_t)fdtoi(n); 274 numtos((ulong_t)n, dirent->d_name); 275 reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); 276 } 277 dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE); 278 dirent->d_reclen = (ushort_t)reclen; 279 280 if (reclen > uiop->uio_resid) { 281 /* 282 * Error if no entries have been returned yet. 283 */ 284 if (uiop->uio_resid == oresid) 285 return (EINVAL); 286 break; 287 } 288 /* 289 * uiomove() updates both resid and offset by the same 290 * amount. But we want offset to change in increments 291 * of FDSDSIZE, which is different from the number of bytes 292 * being returned to the user. So we set uio_offset 293 * separately, ignoring what uiomove() does. 294 */ 295 if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) 296 return (EFAULT); 297 uiop->uio_offset = off + FDSDSIZE; 298 } 299 if (eofp) 300 *eofp = ((uiop->uio_offset-2*FDSDSIZE)/FDSDSIZE >= nentries); 301 return (0); 302 } 303 304 /* ARGSUSED */ 305 static void 306 fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 307 { 308 mutex_enter(&vp->v_lock); 309 ASSERT(vp->v_count >= 1); 310 VN_RELE_LOCKED(vp); 311 if (vp->v_count != 0) { 312 mutex_exit(&vp->v_lock); 313 return; 314 } 315 mutex_exit(&vp->v_lock); 316 vn_invalid(vp); 317 vn_free(vp); 318 } 319 320 static struct vnodeops *fd_vnodeops; 321 322 static const fs_operation_def_t fd_vnodeops_template[] = { 323 VOPNAME_OPEN, { .vop_open = fdopen }, 324 VOPNAME_CLOSE, { .vop_close = fdclose }, 325 VOPNAME_READ, { .vop_read = fdread }, 326 VOPNAME_GETATTR, { .vop_getattr = fdgetattr }, 327 VOPNAME_ACCESS, { .vop_access = fdaccess }, 328 VOPNAME_LOOKUP, { .vop_lookup = fdlookup }, 329 VOPNAME_CREATE, { .vop_create = fdcreate }, 330 VOPNAME_READDIR, { .vop_readdir = fdreaddir }, 331 VOPNAME_INACTIVE, { .vop_inactive = fdinactive }, 332 VOPNAME_FRLOCK, { .error = fs_error }, 333 VOPNAME_POLL, { .error = fs_error }, 334 VOPNAME_DISPOSE, { .error = fs_error }, 335 NULL, NULL 336 }; 337 338 static int 339 fdget(struct vnode *dvp, char *comp, struct vnode **vpp) 340 { 341 int n = 0; 342 struct vnode *vp; 343 344 while (*comp) { 345 if (*comp < '0' || *comp > '9') 346 return (ENOENT); 347 n = 10 * n + *comp++ - '0'; 348 } 349 vp = vn_alloc(KM_SLEEP); 350 vp->v_type = VCHR; 351 vp->v_vfsp = dvp->v_vfsp; 352 vn_setops(vp, fd_vnodeops); 353 vp->v_data = NULL; 354 vp->v_flag = VNOMAP; 355 vp->v_rdev = makedevice(fdrmaj, n); 356 vn_exists(vp); 357 *vpp = vp; 358 return (0); 359 } 360 361 /* 362 * fdfs is mounted on /dev/fd, however, there are two interesting 363 * possibilities - two threads racing to do the same mount (protected 364 * by vfs locking), and two threads mounting fdfs in different places. 365 */ 366 /*ARGSUSED*/ 367 static int 368 fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 369 { 370 struct vnode *vp; 371 372 if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) 373 return (EPERM); 374 if (mvp->v_type != VDIR) 375 return (ENOTDIR); 376 377 mutex_enter(&mvp->v_lock); 378 if ((uap->flags & MS_OVERLAY) == 0 && 379 (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { 380 mutex_exit(&mvp->v_lock); 381 return (EBUSY); 382 } 383 mutex_exit(&mvp->v_lock); 384 385 /* 386 * Having the resource be anything but "fd" doesn't make sense 387 */ 388 vfs_setresource(vfsp, "fd", 0); 389 390 vp = vn_alloc(KM_SLEEP); 391 vp->v_vfsp = vfsp; 392 vn_setops(vp, fd_vnodeops); 393 vp->v_type = VDIR; 394 vp->v_data = NULL; 395 vp->v_flag |= VROOT; 396 vfsp->vfs_fstype = fdfstype; 397 vfsp->vfs_data = (char *)vp; 398 mutex_enter(&fd_minor_lock); 399 do { 400 fdfsmin = (fdfsmin + 1) & L_MAXMIN32; 401 vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin); 402 } while (vfs_devismounted(vfsp->vfs_dev)); 403 mutex_exit(&fd_minor_lock); 404 vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype); 405 vfsp->vfs_bsize = 1024; 406 return (0); 407 } 408 409 /* ARGSUSED */ 410 static int 411 fdunmount(vfs_t *vfsp, int flag, cred_t *cr) 412 { 413 vnode_t *rvp; 414 415 if (secpolicy_fs_unmount(cr, vfsp) != 0) 416 return (EPERM); 417 418 /* 419 * forced unmount is not supported by this file system 420 * and thus, ENOTSUP, is being returned. 421 */ 422 if (flag & MS_FORCE) 423 return (ENOTSUP); 424 425 rvp = (vnode_t *)vfsp->vfs_data; 426 if (rvp->v_count > 1) 427 return (EBUSY); 428 429 VN_RELE(rvp); 430 return (0); 431 } 432 433 /* ARGSUSED */ 434 static int 435 fdroot(vfs_t *vfsp, vnode_t **vpp) 436 { 437 vnode_t *vp = (vnode_t *)vfsp->vfs_data; 438 439 VN_HOLD(vp); 440 *vpp = vp; 441 return (0); 442 } 443 444 /* 445 * No locking required because I held the root vnode before calling this 446 * function so the vfs won't disappear on me. To be more explicit: 447 * fdvrootp->v_count will be greater than 1 so fdunmount will just return. 448 */ 449 static int 450 fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp) 451 { 452 dev32_t d32; 453 rctl_qty_t fdno_ctl; 454 455 mutex_enter(&curproc->p_lock); 456 fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE], 457 curproc->p_rctls, curproc); 458 mutex_exit(&curproc->p_lock); 459 460 bzero(sp, sizeof (*sp)); 461 sp->f_bsize = 1024; 462 sp->f_frsize = 1024; 463 sp->f_blocks = (fsblkcnt64_t)0; 464 sp->f_bfree = (fsblkcnt64_t)0; 465 sp->f_bavail = (fsblkcnt64_t)0; 466 sp->f_files = (fsfilcnt64_t) 467 (MIN(P_FINFO(curproc)->fi_nfiles, fdno_ctl + 2)); 468 sp->f_ffree = (fsfilcnt64_t)0; 469 sp->f_favail = (fsfilcnt64_t)0; 470 (void) cmpldev(&d32, vfsp->vfs_dev); 471 sp->f_fsid = d32; 472 (void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name); 473 sp->f_flag = vf_to_stf(vfsp->vfs_flag); 474 sp->f_namemax = FDNSIZE; 475 (void) strcpy(sp->f_fstr, "/dev/fd"); 476 (void) strcpy(&sp->f_fstr[8], "/dev/fd"); 477 return (0); 478 } 479 480 int 481 fdinit(int fstype, char *name) 482 { 483 static const fs_operation_def_t fd_vfsops_template[] = { 484 VFSNAME_MOUNT, { .vfs_mount = fdmount }, 485 VFSNAME_UNMOUNT, { .vfs_unmount = fdunmount }, 486 VFSNAME_ROOT, { .vfs_root = fdroot }, 487 VFSNAME_STATVFS, { .vfs_statvfs = fdstatvfs }, 488 NULL, NULL 489 }; 490 int error; 491 492 fdfstype = fstype; 493 ASSERT(fdfstype != 0); 494 495 /* 496 * Associate VFS ops vector with this fstype. 497 */ 498 error = vfs_setfsops(fstype, fd_vfsops_template, NULL); 499 if (error != 0) { 500 cmn_err(CE_WARN, "fdinit: bad vnode ops template"); 501 return (error); 502 } 503 504 error = vn_make_ops(name, fd_vnodeops_template, &fd_vnodeops); 505 if (error != 0) { 506 (void) vfs_freevfsops_by_type(fstype); 507 cmn_err(CE_WARN, "fdinit: bad vnode ops template"); 508 return (error); 509 } 510 511 /* 512 * Assign unique "device" numbers (reported by stat(2)). 513 */ 514 fdfsmaj = getudev(); 515 fdrmaj = getudev(); 516 if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) { 517 cmn_err(CE_WARN, "fdinit: can't get unique device numbers"); 518 if (fdfsmaj == (major_t)-1) 519 fdfsmaj = 0; 520 if (fdrmaj == (major_t)-1) 521 fdrmaj = 0; 522 } 523 mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL); 524 return (0); 525 } 526 527 /* 528 * FDFS Mount options table 529 */ 530 static char *rw_cancel[] = { MNTOPT_RO, NULL }; 531 532 static mntopt_t mntopts[] = { 533 /* 534 * option name cancel option default arg flags 535 */ 536 { MNTOPT_RW, rw_cancel, NULL, MO_DEFAULT, 537 (void *)MNTOPT_NOINTR }, 538 { MNTOPT_IGNORE, NULL, NULL, 0, 539 (void *)0 }, 540 }; 541 542 static mntopts_t fdfs_mntopts = { 543 sizeof (mntopts) / sizeof (mntopt_t), 544 mntopts 545 }; 546 547 static vfsdef_t vfw = { 548 VFSDEF_VERSION, 549 "fd", 550 fdinit, 551 VSW_HASPROTO | VSW_ZMOUNT, 552 &fdfs_mntopts 553 }; 554 555 static struct modlfs modlfs = { 556 &mod_fsops, 557 "filesystem for fd", 558 &vfw 559 }; 560 561 static struct modlinkage modlinkage = { 562 MODREV_1, 563 &modlfs, 564 NULL 565 }; 566 567 int 568 _init(void) 569 { 570 return (mod_install(&modlinkage)); 571 } 572 573 int 574 _info(struct modinfo *modinfop) 575 { 576 return (mod_info(&modlinkage, modinfop)); 577 } 578