1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2017 by Delphix. All rights reserved.
24 * Copyright 2023 Oxide Computer Company
25 */
26
27 /*
28 * vnode ops for the devfs
29 *
30 * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
31 * first because dv_find always performs leaf vnode substitution, returning
32 * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
33 * means that the only leaf special file VOP operations that devfs will see
34 * after VOP_LOOKUP are the ones that specfs forwards.
35 */
36
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/t_lock.h>
40 #include <sys/systm.h>
41 #include <sys/sysmacros.h>
42 #include <sys/user.h>
43 #include <sys/time.h>
44 #include <sys/vfs.h>
45 #include <sys/vnode.h>
46 #include <sys/vfs_opreg.h>
47 #include <sys/file.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/kmem.h>
51 #include <sys/uio.h>
52 #include <sys/errno.h>
53 #include <sys/stat.h>
54 #include <sys/cred.h>
55 #include <sys/dirent.h>
56 #include <sys/pathname.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/policy.h>
60 #include <sys/modctl.h>
61 #include <sys/sunndi.h>
62 #include <fs/fs_subr.h>
63 #include <sys/fs/dv_node.h>
64
65 extern struct vattr dv_vattr_dir, dv_vattr_file;
66 extern dev_t rconsdev;
67
68 /*
69 * Open of devices (leaf nodes) is handled by specfs.
70 * There is nothing to do to open a directory
71 */
72 /*ARGSUSED*/
73 static int
devfs_open(struct vnode ** vpp,int flag,struct cred * cred,caller_context_t * ct)74 devfs_open(struct vnode **vpp, int flag, struct cred *cred,
75 caller_context_t *ct)
76 {
77 struct dv_node *dv = VTODV(*vpp);
78
79 dcmn_err2(("devfs_open %s\n", dv->dv_name));
80 ASSERT((*vpp)->v_type == VDIR);
81 return (0);
82 }
83
84 /*
85 * Close of devices (leaf nodes) is handled by specfs.
86 * There is nothing much to do inorder to close a directory.
87 */
88 /*ARGSUSED1*/
89 static int
devfs_close(struct vnode * vp,int flag,int count,offset_t offset,struct cred * cred,caller_context_t * ct)90 devfs_close(struct vnode *vp, int flag, int count,
91 offset_t offset, struct cred *cred, caller_context_t *ct)
92 {
93 struct dv_node *dv = VTODV(vp);
94
95 dcmn_err2(("devfs_close %s\n", dv->dv_name));
96 ASSERT(vp->v_type == VDIR);
97
98 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
99 cleanshares(vp, ttoproc(curthread)->p_pid);
100 return (0);
101 }
102
103 /*
104 * Read of devices (leaf nodes) is handled by specfs.
105 * Read of directories is not supported.
106 */
107 /*ARGSUSED*/
108 static int
devfs_read(struct vnode * vp,struct uio * uiop,int ioflag,struct cred * cred,struct caller_context * ct)109 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
110 struct caller_context *ct)
111 {
112 dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
113 ASSERT(vp->v_type == VDIR);
114 ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
115 return (EISDIR);
116 }
117
118 /*
119 * Write of devices (leaf nodes) is handled by specfs.
120 * Write of directories is not supported.
121 */
122 /*ARGSUSED*/
123 static int
devfs_write(struct vnode * vp,struct uio * uiop,int ioflag,struct cred * cred,struct caller_context * ct)124 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
125 struct caller_context *ct)
126 {
127 dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
128 ASSERT(vp->v_type == VDIR);
129 ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
130 return (EISDIR);
131 }
132
133 /*
134 * Ioctls to device (leaf nodes) is handled by specfs.
135 * Ioctl to directories is not supported.
136 */
137 /*ARGSUSED*/
138 static int
devfs_ioctl(struct vnode * vp,int cmd,intptr_t arg,int flag,struct cred * cred,int * rvalp,caller_context_t * ct)139 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
140 struct cred *cred, int *rvalp, caller_context_t *ct)
141 {
142 dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
143 ASSERT(vp->v_type == VDIR);
144
145 return (ENOTTY); /* no ioctls supported */
146 }
147
148 /*
149 * We can be asked directly about the attributes of directories, or
150 * (via sp->s_realvp) about the filesystem attributes of special files.
151 *
152 * For directories, we just believe the attribute store
153 * though we mangle the nodeid, fsid, and rdev to convince userland we
154 * really are a different filesystem.
155 *
156 * For special files, a little more fakery is required.
157 *
158 * If the attribute store is not there (read only root), we believe our
159 * memory based attributes.
160 */
161 static int
devfs_getattr(struct vnode * vp,struct vattr * vap,int flags,struct cred * cr,caller_context_t * ct)162 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
163 caller_context_t *ct)
164 {
165 struct dv_node *dv = VTODV(vp);
166 int error = 0;
167 uint_t mask;
168
169 /*
170 * Message goes to console only. Otherwise, the message
171 * causes devfs_getattr to be invoked again... infinite loop
172 */
173 dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
174 ASSERT(dv->dv_attr || dv->dv_attrvp);
175
176 if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
177 cmn_err(CE_WARN, /* panic ? */
178 "?%s: getattr on vnode type %d", dvnm, vp->v_type);
179 return (ENOENT);
180 }
181
182 rw_enter(&dv->dv_contents, RW_READER);
183 if (dv->dv_attr) {
184 /*
185 * obtain from the memory version of attribute.
186 * preserve mask for those that optimize.
187 * devfs specific fields are already merged on creation.
188 */
189 mask = vap->va_mask;
190 *vap = *dv->dv_attr;
191 vap->va_mask = mask;
192 } else {
193 /* obtain from attribute store and merge */
194 error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
195 dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
196 dv_vattr_merge(dv, vap);
197 }
198 rw_exit(&dv->dv_contents);
199
200 /*
201 * Restrict the permissions of the node fronting the console
202 * to 0600 with root as the owner. This prevents a non-root
203 * user from gaining access to a serial terminal (like /dev/term/a)
204 * which is in reality serving as the console device (/dev/console).
205 */
206 if (vp->v_rdev == rconsdev) {
207 mode_t rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
208 vap->va_mode &= (~rconsmask);
209 vap->va_uid = 0;
210 }
211
212 return (error);
213 }
214
215 static int devfs_unlocked_access(void *, int, struct cred *);
216
217 /*ARGSUSED4*/
218 static int
devfs_setattr_dir(struct dv_node * dv,struct vnode * vp,struct vattr * vap,int flags,struct cred * cr)219 devfs_setattr_dir(
220 struct dv_node *dv,
221 struct vnode *vp,
222 struct vattr *vap,
223 int flags,
224 struct cred *cr)
225 {
226 struct vattr *map;
227 uint_t mask;
228 int error = 0;
229 struct vattr vattr;
230
231 ASSERT(dv->dv_attr || dv->dv_attrvp);
232
233 ASSERT(vp->v_type == VDIR);
234 ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);
235
236 if (vap->va_mask & AT_NOSET)
237 return (EINVAL);
238
239 /* to ensure consistency, single thread setting of attributes */
240 rw_enter(&dv->dv_contents, RW_WRITER);
241
242 again: if (dv->dv_attr) {
243
244 error = secpolicy_vnode_setattr(cr, vp, vap,
245 dv->dv_attr, flags, devfs_unlocked_access, dv);
246
247 if (error)
248 goto out;
249
250 /*
251 * Apply changes to the memory based attribute. This code
252 * is modeled after the tmpfs implementation of memory
253 * based vnodes
254 */
255 map = dv->dv_attr;
256 mask = vap->va_mask;
257
258 /* Change file access modes. */
259 if (mask & AT_MODE) {
260 map->va_mode &= S_IFMT;
261 map->va_mode |= vap->va_mode & ~S_IFMT;
262 }
263 if (mask & AT_UID)
264 map->va_uid = vap->va_uid;
265 if (mask & AT_GID)
266 map->va_gid = vap->va_gid;
267 if (mask & AT_ATIME)
268 map->va_atime = vap->va_atime;
269 if (mask & AT_MTIME)
270 map->va_mtime = vap->va_mtime;
271
272 if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
273 gethrestime(&map->va_ctime);
274 } else {
275 /* use the backing attribute store */
276 ASSERT(dv->dv_attrvp);
277
278 /*
279 * See if we are changing something we care about
280 * the persistence of - return success if we don't care.
281 */
282 if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
283 /* Set the attributes */
284 error = VOP_SETATTR(dv->dv_attrvp,
285 vap, flags, cr, NULL);
286 dsysdebug(error,
287 ("vop_setattr %s %d\n", dv->dv_name, error));
288
289 /*
290 * Some file systems may return EROFS for a setattr
291 * on a readonly file system. In this case we create
292 * our own memory based attribute.
293 */
294 if (error == EROFS) {
295 /*
296 * obtain attributes from existing file
297 * that we will modify and switch to memory
298 * based attribute until attribute store is
299 * read/write.
300 */
301 vattr = dv_vattr_dir;
302 if (VOP_GETATTR(dv->dv_attrvp,
303 &vattr, flags, cr, NULL) == 0) {
304 dv->dv_attr = kmem_alloc(
305 sizeof (struct vattr), KM_SLEEP);
306 *dv->dv_attr = vattr;
307 dv_vattr_merge(dv, dv->dv_attr);
308 goto again;
309 }
310 }
311 }
312 }
313 out:
314 rw_exit(&dv->dv_contents);
315 return (error);
316 }
317
318
319 /*
320 * Compare the uid/gid/mode changes requested for a setattr
321 * operation with the same details of a node's default minor
322 * perm information. Return 0 if identical.
323 */
324 static int
dv_setattr_cmp(struct vattr * map,mperm_t * mp)325 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
326 {
327 if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
328 return (1);
329 if (map->va_uid != mp->mp_uid)
330 return (1);
331 if (map->va_gid != mp->mp_gid)
332 return (1);
333 return (0);
334 }
335
336
337 /*ARGSUSED4*/
338 static int
devfs_setattr(struct vnode * vp,struct vattr * vap,int flags,struct cred * cr,caller_context_t * ct)339 devfs_setattr(
340 struct vnode *vp,
341 struct vattr *vap,
342 int flags,
343 struct cred *cr,
344 caller_context_t *ct)
345 {
346 struct dv_node *dv = VTODV(vp);
347 struct dv_node *ddv;
348 struct vnode *dvp;
349 struct vattr *map;
350 uint_t mask;
351 int error = 0;
352 struct vattr *free_vattr = NULL;
353 struct vattr *vattrp = NULL;
354 mperm_t mp;
355 int persist;
356
357 /*
358 * Message goes to console only. Otherwise, the message
359 * causes devfs_getattr to be invoked again... infinite loop
360 */
361 dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
362 ASSERT(dv->dv_attr || dv->dv_attrvp);
363
364 if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
365 cmn_err(CE_WARN, /* panic ? */
366 "?%s: getattr on vnode type %d", dvnm, vp->v_type);
367 return (ENOENT);
368 }
369
370 if (vap->va_mask & AT_NOSET)
371 return (EINVAL);
372
373 /*
374 * If we are changing something we don't care about
375 * the persistence of, return success.
376 */
377 if ((vap->va_mask &
378 (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
379 return (0);
380
381 /*
382 * If driver overrides fs perm, disallow chmod
383 * and do not create attribute nodes.
384 */
385 if (dv->dv_flags & DV_NO_FSPERM) {
386 ASSERT(dv->dv_attr);
387 if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
388 return (EPERM);
389 if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
390 return (0);
391 rw_enter(&dv->dv_contents, RW_WRITER);
392 if (vap->va_mask & AT_ATIME)
393 dv->dv_attr->va_atime = vap->va_atime;
394 if (vap->va_mask & AT_MTIME)
395 dv->dv_attr->va_mtime = vap->va_mtime;
396 rw_exit(&dv->dv_contents);
397 return (0);
398 }
399
400 /*
401 * Directories are always created but device nodes are
402 * only used to persist non-default permissions.
403 */
404 if (vp->v_type == VDIR) {
405 ASSERT(dv->dv_attr || dv->dv_attrvp);
406 return (devfs_setattr_dir(dv, vp, vap, flags, cr));
407 }
408
409 /*
410 * Allocate now before we take any locks
411 */
412 vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
413
414 /* to ensure consistency, single thread setting of attributes */
415 rw_enter(&dv->dv_contents, RW_WRITER);
416
417 /*
418 * We don't need to create an attribute node
419 * to persist access or modification times.
420 */
421 persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
422
423 /*
424 * If persisting something, get the default permissions
425 * for this minor to compare against what the attributes
426 * are now being set to. Default ordering is:
427 * - minor_perm match for this minor
428 * - mode supplied by ddi_create_priv_minor_node
429 * - devfs defaults
430 */
431 if (persist) {
432 if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
433 mp.mp_uid = dv_vattr_file.va_uid;
434 mp.mp_gid = dv_vattr_file.va_gid;
435 mp.mp_mode = dv_vattr_file.va_mode;
436 if (dv->dv_flags & DV_DFLT_MODE) {
437 ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
438 mp.mp_mode &= ~S_IAMB;
439 mp.mp_mode |= dv->dv_dflt_mode;
440 dcmn_err5(("%s: setattr priv default 0%o\n",
441 dv->dv_name, mp.mp_mode));
442 } else {
443 dcmn_err5(("%s: setattr devfs default 0%o\n",
444 dv->dv_name, mp.mp_mode));
445 }
446 } else {
447 dcmn_err5(("%s: setattr minor perm default 0%o\n",
448 dv->dv_name, mp.mp_mode));
449 }
450 }
451
452 /*
453 * If we don't have a vattr for this node, construct one.
454 */
455 if (dv->dv_attr) {
456 free_vattr = vattrp;
457 vattrp = NULL;
458 } else {
459 ASSERT(dv->dv_attrvp);
460 ASSERT(vp->v_type != VDIR);
461 *vattrp = dv_vattr_file;
462 error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
463 dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
464 if (error)
465 goto out;
466 dv->dv_attr = vattrp;
467 dv_vattr_merge(dv, dv->dv_attr);
468 vattrp = NULL;
469 }
470
471 error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
472 flags, devfs_unlocked_access, dv);
473 if (error) {
474 dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
475 dv->dv_name, error));
476 goto out;
477 }
478
479 /*
480 * Apply changes to the memory based attribute. This code
481 * is modeled after the tmpfs implementation of memory
482 * based vnodes
483 */
484 map = dv->dv_attr;
485 mask = vap->va_mask;
486
487 /* Change file access modes. */
488 if (mask & AT_MODE) {
489 map->va_mode &= S_IFMT;
490 map->va_mode |= vap->va_mode & ~S_IFMT;
491 }
492 if (mask & AT_UID)
493 map->va_uid = vap->va_uid;
494 if (mask & AT_GID)
495 map->va_gid = vap->va_gid;
496 if (mask & AT_ATIME)
497 map->va_atime = vap->va_atime;
498 if (mask & AT_MTIME)
499 map->va_mtime = vap->va_mtime;
500
501 if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
502 gethrestime(&map->va_ctime);
503 }
504
505 /*
506 * A setattr to defaults means we no longer need the
507 * shadow node as a persistent store, unless there
508 * are ACLs. Otherwise create a shadow node if one
509 * doesn't exist yet.
510 */
511 if (persist) {
512 if ((dv_setattr_cmp(map, &mp) == 0) &&
513 ((dv->dv_flags & DV_ACL) == 0)) {
514
515 if (dv->dv_attrvp) {
516 ddv = dv->dv_dotdot;
517 ASSERT(ddv->dv_attrvp);
518 error = VOP_REMOVE(ddv->dv_attrvp,
519 dv->dv_name, cr, ct, 0);
520 dsysdebug(error,
521 ("vop_remove %s %s %d\n",
522 ddv->dv_name, dv->dv_name, error));
523
524 if (error == EROFS)
525 error = 0;
526 VN_RELE(dv->dv_attrvp);
527 dv->dv_attrvp = NULL;
528 }
529 ASSERT(dv->dv_attr);
530 } else {
531 if (mask & AT_MODE)
532 dcmn_err5(("%s persisting mode 0%o\n",
533 dv->dv_name, vap->va_mode));
534 if (mask & AT_UID)
535 dcmn_err5(("%s persisting uid %d\n",
536 dv->dv_name, vap->va_uid));
537 if (mask & AT_GID)
538 dcmn_err5(("%s persisting gid %d\n",
539 dv->dv_name, vap->va_gid));
540
541 if (dv->dv_attrvp == NULL) {
542 dvp = DVTOV(dv->dv_dotdot);
543 dv_shadow_node(dvp, dv->dv_name, vp,
544 NULL, NULLVP, cr,
545 DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
546 }
547 if (dv->dv_attrvp) {
548 /* If map still valid do TIME for free. */
549 if (dv->dv_attr == map) {
550 mask = map->va_mask;
551 map->va_mask =
552 vap->va_mask | AT_ATIME | AT_MTIME;
553 error = VOP_SETATTR(dv->dv_attrvp, map,
554 flags, cr, NULL);
555 map->va_mask = mask;
556 } else {
557 error = VOP_SETATTR(dv->dv_attrvp,
558 vap, flags, cr, NULL);
559 }
560 dsysdebug(error, ("vop_setattr %s %d\n",
561 dv->dv_name, error));
562 }
563 /*
564 * Some file systems may return EROFS for a setattr
565 * on a readonly file system. In this case save
566 * as our own memory based attribute.
567 * NOTE: ufs is NOT one of these (see ufs_iupdat).
568 */
569 if (dv->dv_attr && dv->dv_attrvp && error == 0) {
570 vattrp = dv->dv_attr;
571 dv->dv_attr = NULL;
572 } else if (error == EROFS)
573 error = 0;
574 }
575 }
576
577 out:
578 rw_exit(&dv->dv_contents);
579
580 if (vattrp)
581 kmem_free(vattrp, sizeof (*vattrp));
582 if (free_vattr)
583 kmem_free(free_vattr, sizeof (*free_vattr));
584 return (error);
585 }
586
587 static int
devfs_pathconf(vnode_t * vp,int cmd,ulong_t * valp,cred_t * cr,caller_context_t * ct)588 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
589 caller_context_t *ct)
590 {
591 switch (cmd) {
592 case _PC_ACL_ENABLED:
593 /*
594 * We rely on the underlying filesystem for ACLs,
595 * so direct the query for ACL support there.
596 * ACL support isn't relative to the file
597 * and we can't guarantee that the dv node
598 * has an attribute node, so any valid
599 * attribute node will suffice.
600 */
601 ASSERT(dvroot);
602 ASSERT(dvroot->dv_attrvp);
603 return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
604 /*NOTREACHED*/
605 }
606
607 return (fs_pathconf(vp, cmd, valp, cr, ct));
608 }
609
610 /*
611 * Let avp handle security attributes (acl's).
612 */
613 static int
devfs_getsecattr(struct vnode * vp,struct vsecattr * vsap,int flags,struct cred * cr,caller_context_t * ct)614 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
615 struct cred *cr, caller_context_t *ct)
616 {
617 dvnode_t *dv = VTODV(vp);
618 struct vnode *avp;
619 int error;
620
621 dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
622 ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
623
624 rw_enter(&dv->dv_contents, RW_READER);
625
626 avp = dv->dv_attrvp;
627
628 /* fabricate the acl */
629 if (avp == NULL) {
630 error = fs_fab_acl(vp, vsap, flags, cr, ct);
631 rw_exit(&dv->dv_contents);
632 return (error);
633 }
634
635 error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
636 dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
637 rw_exit(&dv->dv_contents);
638 return (error);
639 }
640
641 /*
642 * Set security attributes (acl's)
643 *
644 * Note that the dv_contents lock has already been acquired
645 * by the caller's VOP_RWLOCK.
646 */
647 static int
devfs_setsecattr(struct vnode * vp,struct vsecattr * vsap,int flags,struct cred * cr,caller_context_t * ct)648 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
649 struct cred *cr, caller_context_t *ct)
650 {
651 dvnode_t *dv = VTODV(vp);
652 struct vnode *avp;
653 int error;
654
655 dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
656 ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
657 ASSERT(RW_LOCK_HELD(&dv->dv_contents));
658
659 /*
660 * Not a supported operation on drivers not providing
661 * file system based permissions.
662 */
663 if (dv->dv_flags & DV_NO_FSPERM)
664 return (ENOTSUP);
665
666 /*
667 * To complete, the setsecattr requires an underlying attribute node.
668 */
669 if (dv->dv_attrvp == NULL) {
670 ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
671 dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
672 NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
673 }
674
675 if ((avp = dv->dv_attrvp) == NULL) {
676 dcmn_err2(("devfs_setsecattr %s: "
677 "cannot construct attribute node\n", dv->dv_name));
678 return (fs_nosys());
679 }
680
681 /*
682 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
683 * Since backing file systems expect the lock to be held before seeing
684 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
685 * store before forwarding the ACL.
686 */
687 (void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
688 error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
689 dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
690 VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
691
692 /*
693 * Set DV_ACL if we have a non-trivial set of ACLs. It is not
694 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
695 * VOP_GETSECATTR calls.
696 */
697 if (fs_acl_nontrivial(avp, cr))
698 dv->dv_flags |= DV_ACL;
699 return (error);
700 }
701
702 /*
703 * This function is used for secpolicy_setattr(). It must call an
704 * access() like function while it is already holding the
705 * dv_contents lock. We only care about this when dv_attr != NULL;
706 * so the unlocked access call only concerns itself with that
707 * particular branch of devfs_access().
708 */
709 static int
devfs_unlocked_access(void * vdv,int mode,struct cred * cr)710 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
711 {
712 struct dv_node *dv = vdv;
713 int shift = 0;
714 uid_t owner = dv->dv_attr->va_uid;
715
716 /* Check access based on owner, group and public permissions. */
717 if (crgetuid(cr) != owner) {
718 shift += 3;
719 if (groupmember(dv->dv_attr->va_gid, cr) == 0)
720 shift += 3;
721 }
722
723 return (secpolicy_vnode_access2(cr, DVTOV(dv), owner,
724 dv->dv_attr->va_mode << shift, mode));
725 }
726
727 static int
devfs_access(struct vnode * vp,int mode,int flags,struct cred * cr,caller_context_t * ct)728 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
729 caller_context_t *ct)
730 {
731 struct dv_node *dv = VTODV(vp);
732 int res;
733
734 dcmn_err2(("devfs_access %s\n", dv->dv_name));
735 ASSERT(dv->dv_attr || dv->dv_attrvp);
736
737 /* restrict console access to privileged processes */
738 if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
739 return (EACCES);
740 }
741
742 rw_enter(&dv->dv_contents, RW_READER);
743 if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
744 res = devfs_unlocked_access(dv, mode, cr);
745 } else {
746 res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
747 }
748 rw_exit(&dv->dv_contents);
749 return (res);
750 }
751
752 /*
753 * Lookup
754 *
755 * Given the directory vnode and the name of the component, return
756 * the corresponding held vnode for that component.
757 *
758 * Of course in these fictional filesystems, nothing's ever quite
759 * -that- simple.
760 *
761 * devfs name type shadow (fs attributes) type comments
762 * -------------------------------------------------------------------------
763 * drv[@addr] VDIR drv[@addr] VDIR nexus driver
764 * drv[@addr]:m VCHR/VBLK drv[@addr]:m VREG leaf driver
765 * drv[@addr] VCHR/VBLK drv[@addr]:.default VREG leaf driver
766 * -------------------------------------------------------------------------
767 *
768 * The following names are reserved for the attribute filesystem (which
769 * could easily be another layer on top of this one - we simply need to
770 * hold the vnode of the thing we're looking at)
771 *
772 * attr name type shadow (fs attributes) type comments
773 * -------------------------------------------------------------------------
774 * drv[@addr] VDIR - - attribute dir
775 * minorname VDIR - - minorname
776 * attribute VREG - - attribute
777 * -------------------------------------------------------------------------
778 *
779 * Examples:
780 *
781 * devfs:/devices/.../mm@0:zero VCHR
782 * shadow:/.devices/.../mm@0:zero VREG, fs attrs
783 * devfs:/devices/.../mm@0:/zero/attr VREG, driver attribute
784 *
785 * devfs:/devices/.../sd@0,0:a VBLK
786 * shadow:/.devices/.../sd@0,0:a VREG, fs attrs
787 * devfs:/devices/.../sd@0,0:/a/.type VREG, "ddi_block:chan"
788 *
789 * devfs:/devices/.../mm@0 VCHR
790 * shadow:/.devices/.../mm@0:.default VREG, fs attrs
791 * devfs:/devices/.../mm@0:/.default/attr VREG, driver attribute
792 * devfs:/devices/.../mm@0:/.default/.type VREG, "ddi_pseudo"
793 *
794 * devfs:/devices/.../obio VDIR
795 * shadow:/devices/.../obio VDIR, needed for fs attrs.
796 * devfs:/devices/.../obio:/.default/attr VDIR, driver attribute
797 *
798 * We also need to be able to deal with "old" devices that have gone away,
799 * though I think that provided we return them with readdir, they can
800 * be removed (i.e. they don't have to respond to lookup, though it might
801 * be weird if they didn't ;-)
802 *
803 * Lookup has side-effects.
804 *
805 * - It will create directories and fs attribute files in the shadow hierarchy.
806 * - It should cause non-SID devices to be probed (ask the parent nexi).
807 */
808 /*ARGSUSED3*/
809 static int
devfs_lookup(struct vnode * dvp,char * nm,struct vnode ** vpp,struct pathname * pnp,int flags,struct vnode * rdir,struct cred * cred,caller_context_t * ct,int * direntflags,pathname_t * realpnp)810 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
811 struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
812 caller_context_t *ct, int *direntflags, pathname_t *realpnp)
813 {
814 ASSERT(dvp->v_type == VDIR);
815 dcmn_err2(("devfs_lookup: %s\n", nm));
816 return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
817 }
818
819 /*
820 * devfs nodes can't really be created directly by userland - however,
821 * we do allow creates to find existing nodes:
822 *
823 * - any create fails if the node doesn't exist - EROFS.
824 * - creating an existing directory read-only succeeds, otherwise EISDIR.
825 * - exclusive creates fail if the node already exists - EEXIST.
826 * - failure to create the snode for an existing device - ENOSYS.
827 */
828 /*ARGSUSED2*/
829 static int
devfs_create(struct vnode * dvp,char * nm,struct vattr * vap,vcexcl_t excl,int mode,struct vnode ** vpp,struct cred * cred,int flag,caller_context_t * ct,vsecattr_t * vsecp)830 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
831 int mode, struct vnode **vpp, struct cred *cred, int flag,
832 caller_context_t *ct, vsecattr_t *vsecp)
833 {
834 int error;
835 struct vnode *vp;
836
837 dcmn_err2(("devfs_create %s\n", nm));
838 error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
839 if (error == 0) {
840 if (excl == EXCL)
841 error = EEXIST;
842 else if (vp->v_type == VDIR && (mode & VWRITE))
843 error = EISDIR;
844 else
845 error = VOP_ACCESS(vp, mode, 0, cred, ct);
846
847 if (error) {
848 VN_RELE(vp);
849 } else
850 *vpp = vp;
851 } else if (error == ENOENT)
852 error = EROFS;
853
854 return (error);
855 }
856
857 /*
858 * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
859 * Otherwise, simply return cached dv_node's. Hotplug code always call
860 * devfs_clean() to invalid the dv_node cache.
861 */
862 /*ARGSUSED5*/
863 static int
devfs_readdir(struct vnode * dvp,struct uio * uiop,struct cred * cred,int * eofp,caller_context_t * ct,int flags)864 devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
865 caller_context_t *ct, int flags)
866 {
867 struct dv_node *ddv, *dv;
868 struct dirent64 *de, *bufp;
869 offset_t diroff;
870 offset_t soff;
871 size_t reclen, movesz;
872 int error;
873 struct vattr va;
874 size_t bufsz;
875
876 ddv = VTODV(dvp);
877 dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
878 ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
879 ASSERT(ddv->dv_attr || ddv->dv_attrvp);
880 ASSERT(RW_READ_HELD(&ddv->dv_contents));
881
882 if (uiop->uio_loffset >= MAXOFF_T) {
883 if (eofp)
884 *eofp = 1;
885 return (0);
886 }
887
888 if (uiop->uio_iovcnt != 1)
889 return (EINVAL);
890
891 if (dvp->v_type != VDIR)
892 return (ENOTDIR);
893
894 /* Load the initial contents */
895 if (ddv->dv_flags & DV_BUILD) {
896 if (!rw_tryupgrade(&ddv->dv_contents)) {
897 rw_exit(&ddv->dv_contents);
898 rw_enter(&ddv->dv_contents, RW_WRITER);
899 }
900
901 /* recheck and fill */
902 if (ddv->dv_flags & DV_BUILD)
903 dv_filldir(ddv);
904
905 rw_downgrade(&ddv->dv_contents);
906 }
907
908 /*
909 * Even if the dv node was not stale at entry to this function, it may
910 * be stale now if another process got in between the rw_exit/rw_enter
911 * calls above and unlinked it.
912 */
913 if (DV_STALE(ddv))
914 return (ESTALE);
915
916 soff = uiop->uio_loffset;
917 bufsz = uiop->uio_iov->iov_len;
918 de = bufp = kmem_alloc(bufsz, KM_SLEEP);
919 movesz = 0;
920 dv = (struct dv_node *)-1;
921
922 /*
923 * Move as many entries into the uio structure as it will take.
924 * Special case "." and "..".
925 */
926 diroff = 0;
927 if (soff == 0) { /* . */
928 reclen = DIRENT64_RECLEN(strlen("."));
929 if ((movesz + reclen) > bufsz)
930 goto full;
931 de->d_ino = (ino64_t)ddv->dv_ino;
932 de->d_off = (off64_t)diroff + 1;
933 de->d_reclen = (ushort_t)reclen;
934
935 /* use strncpy(9f) to zero out uninitialized bytes */
936
937 (void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
938 movesz += reclen;
939 de = (dirent64_t *)(intptr_t)((char *)de + reclen);
940 dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
941 "reclen %lu\n", diroff, soff, ".", reclen));
942 }
943
944 diroff++;
945 if (soff <= 1) { /* .. */
946 reclen = DIRENT64_RECLEN(strlen(".."));
947 if ((movesz + reclen) > bufsz)
948 goto full;
949 de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
950 de->d_off = (off64_t)diroff + 1;
951 de->d_reclen = (ushort_t)reclen;
952
953 /* use strncpy(9f) to zero out uninitialized bytes */
954
955 (void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
956 movesz += reclen;
957 de = (dirent64_t *)(intptr_t)((char *)de + reclen);
958 dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
959 "reclen %lu\n", diroff, soff, "..", reclen));
960 }
961
962 diroff++;
963 for (dv = DV_FIRST_ENTRY(ddv); dv;
964 dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
965 /* skip entries until at correct directory offset */
966 if (diroff < soff)
967 continue;
968
969 /*
970 * hidden nodes are skipped (but they still occupy a
971 * directory offset).
972 */
973 if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi))
974 continue;
975
976 /*
977 * DDM_INTERNAL_PATH minor nodes are skipped for readdirs
978 * outside the kernel (but they still occupy a directory
979 * offset).
980 */
981 if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred))
982 continue;
983
984 reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
985 if ((movesz + reclen) > bufsz) {
986 dcmn_err3(("devfs_readdir: C: diroff "
987 "%lld, soff %lld: '%s' reclen %lu\n",
988 diroff, soff, dv->dv_name, reclen));
989 goto full;
990 }
991 de->d_ino = (ino64_t)dv->dv_ino;
992 de->d_off = (off64_t)diroff + 1;
993 de->d_reclen = (ushort_t)reclen;
994
995 /* use strncpy(9f) to zero out uninitialized bytes */
996
997 ASSERT(strlen(dv->dv_name) + 1 <=
998 DIRENT64_NAMELEN(reclen));
999 (void) strncpy(de->d_name, dv->dv_name,
1000 DIRENT64_NAMELEN(reclen));
1001
1002 movesz += reclen;
1003 de = (dirent64_t *)(intptr_t)((char *)de + reclen);
1004 dcmn_err4(("devfs_readdir: D: diroff "
1005 "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
1006 dv->dv_name, reclen));
1007 }
1008
1009 /* the buffer is full, or we exhausted everything */
1010 full: dcmn_err3(("devfs_readdir: moving %lu bytes: "
1011 "diroff %lld, soff %lld, dv %p\n",
1012 movesz, diroff, soff, (void *)dv));
1013
1014 if ((movesz == 0) && dv)
1015 error = EINVAL; /* cannot be represented */
1016 else {
1017 error = uiomove(bufp, movesz, UIO_READ, uiop);
1018 if (error == 0) {
1019 if (eofp)
1020 *eofp = dv ? 0 : 1;
1021 uiop->uio_loffset = diroff;
1022 }
1023
1024 va.va_mask = AT_ATIME;
1025 gethrestime(&va.va_atime);
1026 rw_exit(&ddv->dv_contents);
1027 (void) devfs_setattr(dvp, &va, 0, cred, ct);
1028 rw_enter(&ddv->dv_contents, RW_READER);
1029 }
1030
1031 kmem_free(bufp, bufsz);
1032 return (error);
1033 }
1034
1035 /*ARGSUSED*/
1036 static int
devfs_fsync(struct vnode * vp,int syncflag,struct cred * cred,caller_context_t * ct)1037 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1038 caller_context_t *ct)
1039 {
1040 /*
1041 * Message goes to console only. Otherwise, the message
1042 * causes devfs_fsync to be invoked again... infinite loop
1043 */
1044 dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1045 return (0);
1046 }
1047
1048 /*
1049 * Normally, we leave the dv_node here at count of 0.
1050 * The node will be destroyed when dv_cleandir() is called.
1051 *
1052 * Stale dv_node's are already unlinked from the fs tree,
1053 * so dv_cleandir() won't find them. We destroy such nodes
1054 * immediately.
1055 */
1056 /*ARGSUSED1*/
1057 static void
devfs_inactive(struct vnode * vp,struct cred * cred,caller_context_t * ct)1058 devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1059 {
1060 int destroy;
1061 struct dv_node *dv = VTODV(vp);
1062
1063 dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1064 mutex_enter(&vp->v_lock);
1065 ASSERT(vp->v_count >= 1);
1066 VN_RELE_LOCKED(vp);
1067 destroy = (DV_STALE(dv) && vp->v_count == 0);
1068 mutex_exit(&vp->v_lock);
1069
1070 /* stale nodes cannot be rediscovered, destroy it here */
1071 if (destroy)
1072 dv_destroy(dv, 0);
1073 }
1074
1075 /*
1076 * XXX Why do we need this? NFS mounted /dev directories?
1077 * XXX Talk to peter staubach about this.
1078 */
1079 /*ARGSUSED2*/
1080 static int
devfs_fid(struct vnode * vp,struct fid * fidp,caller_context_t * ct)1081 devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1082 {
1083 struct dv_node *dv = VTODV(vp);
1084 struct dv_fid *dv_fid;
1085
1086 if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1087 fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1088 return (ENOSPC);
1089 }
1090
1091 dv_fid = (struct dv_fid *)fidp;
1092 bzero(dv_fid, sizeof (struct dv_fid));
1093 dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1094 dv_fid->dvfid_ino = dv->dv_ino;
1095 /* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1096
1097 return (0);
1098 }
1099
1100 /*
1101 * This pair of routines bracket all VOP_READ, VOP_WRITE
1102 * and VOP_READDIR requests. The contents lock stops things
1103 * moving around while we're looking at them.
1104 *
1105 * Also used by file and record locking.
1106 */
1107 /*ARGSUSED2*/
1108 static int
devfs_rwlock(struct vnode * vp,int write_flag,caller_context_t * ct)1109 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1110 {
1111 dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1112 rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1113 return (write_flag);
1114 }
1115
1116 /*ARGSUSED1*/
1117 static void
devfs_rwunlock(struct vnode * vp,int write_flag,caller_context_t * ct)1118 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1119 {
1120 dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1121 rw_exit(&VTODV(vp)->dv_contents);
1122 }
1123
1124 /*
1125 * XXX Should probably do a better job of computing the maximum
1126 * offset available in the directory.
1127 */
1128 /*ARGSUSED1*/
1129 static int
devfs_seek(struct vnode * vp,offset_t ooff,offset_t * noffp,caller_context_t * ct)1130 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1131 caller_context_t *ct)
1132 {
1133 ASSERT(vp->v_type == VDIR);
1134 dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1135 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1136 }
1137
1138 vnodeops_t *dv_vnodeops;
1139
1140 const fs_operation_def_t dv_vnodeops_template[] = {
1141 VOPNAME_OPEN, { .vop_open = devfs_open },
1142 VOPNAME_CLOSE, { .vop_close = devfs_close },
1143 VOPNAME_READ, { .vop_read = devfs_read },
1144 VOPNAME_WRITE, { .vop_write = devfs_write },
1145 VOPNAME_IOCTL, { .vop_ioctl = devfs_ioctl },
1146 VOPNAME_GETATTR, { .vop_getattr = devfs_getattr },
1147 VOPNAME_SETATTR, { .vop_setattr = devfs_setattr },
1148 VOPNAME_ACCESS, { .vop_access = devfs_access },
1149 VOPNAME_LOOKUP, { .vop_lookup = devfs_lookup },
1150 VOPNAME_CREATE, { .vop_create = devfs_create },
1151 VOPNAME_READDIR, { .vop_readdir = devfs_readdir },
1152 VOPNAME_FSYNC, { .vop_fsync = devfs_fsync },
1153 VOPNAME_INACTIVE, { .vop_inactive = devfs_inactive },
1154 VOPNAME_FID, { .vop_fid = devfs_fid },
1155 VOPNAME_RWLOCK, { .vop_rwlock = devfs_rwlock },
1156 VOPNAME_RWUNLOCK, { .vop_rwunlock = devfs_rwunlock },
1157 VOPNAME_SEEK, { .vop_seek = devfs_seek },
1158 VOPNAME_PATHCONF, { .vop_pathconf = devfs_pathconf },
1159 VOPNAME_DISPOSE, { .error = fs_error },
1160 VOPNAME_SETSECATTR, { .vop_setsecattr = devfs_setsecattr },
1161 VOPNAME_GETSECATTR, { .vop_getsecattr = devfs_getsecattr },
1162 NULL, NULL
1163 };
1164