xref: /illumos-gate/usr/src/uts/common/fs/devfs/devfs_vnops.c (revision 45744051679350ee063cdc366b66bee5223a11ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2017 by Delphix. All rights reserved.
24  * Copyright 2023 Oxide Computer Company
25  */
26 
27 /*
28  * vnode ops for the devfs
29  *
30  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
31  * first because dv_find always performs leaf vnode substitution, returning
32  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
33  * means that the only leaf special file VOP operations that devfs will see
34  * after VOP_LOOKUP are the ones that specfs forwards.
35  */
36 
37 #include <sys/types.h>
38 #include <sys/param.h>
39 #include <sys/t_lock.h>
40 #include <sys/systm.h>
41 #include <sys/sysmacros.h>
42 #include <sys/user.h>
43 #include <sys/time.h>
44 #include <sys/vfs.h>
45 #include <sys/vnode.h>
46 #include <sys/vfs_opreg.h>
47 #include <sys/file.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/kmem.h>
51 #include <sys/uio.h>
52 #include <sys/errno.h>
53 #include <sys/stat.h>
54 #include <sys/cred.h>
55 #include <sys/dirent.h>
56 #include <sys/pathname.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/policy.h>
60 #include <sys/modctl.h>
61 #include <sys/sunndi.h>
62 #include <fs/fs_subr.h>
63 #include <sys/fs/dv_node.h>
64 
65 extern struct vattr	dv_vattr_dir, dv_vattr_file;
66 extern dev_t rconsdev;
67 
68 /*
69  * Open of devices (leaf nodes) is handled by specfs.
70  * There is nothing to do to open a directory
71  */
72 /*ARGSUSED*/
73 static int
74 devfs_open(struct vnode **vpp, int flag, struct cred *cred,
75     caller_context_t *ct)
76 {
77 	struct dv_node	*dv = VTODV(*vpp);
78 
79 	dcmn_err2(("devfs_open %s\n", dv->dv_name));
80 	ASSERT((*vpp)->v_type == VDIR);
81 	return (0);
82 }
83 
84 /*
85  * Close of devices (leaf nodes) is handled by specfs.
86  * There is nothing much to do inorder to close a directory.
87  */
88 /*ARGSUSED1*/
89 static int
90 devfs_close(struct vnode *vp, int flag, int count,
91     offset_t offset, struct cred *cred, caller_context_t *ct)
92 {
93 	struct dv_node	*dv = VTODV(vp);
94 
95 	dcmn_err2(("devfs_close %s\n", dv->dv_name));
96 	ASSERT(vp->v_type == VDIR);
97 
98 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
99 	cleanshares(vp, ttoproc(curthread)->p_pid);
100 	return (0);
101 }
102 
103 /*
104  * Read of devices (leaf nodes) is handled by specfs.
105  * Read of directories is not supported.
106  */
107 /*ARGSUSED*/
108 static int
109 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
110     struct caller_context *ct)
111 {
112 	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
113 	ASSERT(vp->v_type == VDIR);
114 	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
115 	return (EISDIR);
116 }
117 
118 /*
119  * Write of devices (leaf nodes) is handled by specfs.
120  * Write of directories is not supported.
121  */
122 /*ARGSUSED*/
123 static int
124 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
125     struct caller_context *ct)
126 {
127 	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
128 	ASSERT(vp->v_type == VDIR);
129 	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
130 	return (EISDIR);
131 }
132 
133 /*
134  * Ioctls to device (leaf nodes) is handled by specfs.
135  * Ioctl to directories is not supported.
136  */
137 /*ARGSUSED*/
138 static int
139 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
140     struct cred *cred, int *rvalp, caller_context_t *ct)
141 {
142 	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
143 	ASSERT(vp->v_type == VDIR);
144 
145 	return (ENOTTY);	/* no ioctls supported */
146 }
147 
148 /*
149  * We can be asked directly about the attributes of directories, or
150  * (via sp->s_realvp) about the filesystem attributes of special files.
151  *
152  * For directories, we just believe the attribute store
153  * though we mangle the nodeid, fsid, and rdev to convince userland we
154  * really are a different filesystem.
155  *
156  * For special files, a little more fakery is required.
157  *
158  * If the attribute store is not there (read only root), we believe our
159  * memory based attributes.
160  */
161 static int
162 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
163     caller_context_t *ct)
164 {
165 	struct dv_node	*dv = VTODV(vp);
166 	int		error = 0;
167 	uint_t		mask;
168 
169 	/*
170 	 * Message goes to console only. Otherwise, the message
171 	 * causes devfs_getattr to be invoked again... infinite loop
172 	 */
173 	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
174 	ASSERT(dv->dv_attr || dv->dv_attrvp);
175 
176 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
177 		cmn_err(CE_WARN,	/* panic ? */
178 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
179 		return (ENOENT);
180 	}
181 
182 	rw_enter(&dv->dv_contents, RW_READER);
183 	if (dv->dv_attr) {
184 		/*
185 		 * obtain from the memory version of attribute.
186 		 * preserve mask for those that optimize.
187 		 * devfs specific fields are already merged on creation.
188 		 */
189 		mask = vap->va_mask;
190 		*vap = *dv->dv_attr;
191 		vap->va_mask = mask;
192 	} else {
193 		/* obtain from attribute store and merge */
194 		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
195 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
196 		dv_vattr_merge(dv, vap);
197 	}
198 	rw_exit(&dv->dv_contents);
199 
200 	/*
201 	 * Restrict the permissions of the node fronting the console
202 	 * to 0600 with root as the owner.  This prevents a non-root
203 	 * user from gaining access to a serial terminal (like /dev/term/a)
204 	 * which is in reality serving as the console device (/dev/console).
205 	 */
206 	if (vp->v_rdev == rconsdev) {
207 		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
208 		vap->va_mode &= (~rconsmask);
209 		vap->va_uid = 0;
210 	}
211 
212 	return (error);
213 }
214 
/*
 * Forward declaration: passed below as the access-check callback to
 * secpolicy_vnode_setattr() while dv_contents is already held.
 */
static int devfs_unlocked_access(void *, int, struct cred *);

/*
 * Set attributes on a devfs directory node; called from devfs_setattr()
 * for VDIR vnodes.
 *
 * dv	- the directory's dv_node
 * vp	- the directory's vnode (must be VDIR)
 * vap	- requested attributes; va_mask selects the fields to change
 * flags, cr - passed through to the policy check and the attribute store
 *
 * With memory based attributes (dv_attr) the request is checked with
 * secpolicy_vnode_setattr() and applied to dv_attr.  Otherwise it is
 * forwarded to the backing attribute store (dv_attrvp); if the store
 * answers EROFS, its current attributes are copied into a newly allocated
 * dv_attr and the request is retried against that memory copy.
 *
 * Returns EINVAL if va_mask contains AT_NOSET bits, otherwise 0 or the
 * error from the policy check / attribute store.
 */
/*ARGSUSED4*/
static int
devfs_setattr_dir(
	struct dv_node *dv,
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cr)
{
	struct vattr	*map;
	uint_t		mask;
	int		error = 0;
	struct vattr	vattr;

	ASSERT(dv->dv_attr || dv->dv_attrvp);

	ASSERT(vp->v_type == VDIR);
	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);

	if (vap->va_mask & AT_NOSET)
		return (EINVAL);

	/* to ensure consistency, single thread setting of attributes */
	rw_enter(&dv->dv_contents, RW_WRITER);

	/*
	 * "again" is re-entered (lock still held) after the EROFS fallback
	 * below has installed a memory based dv_attr.
	 */
again:	if (dv->dv_attr) {

		error = secpolicy_vnode_setattr(cr, vp, vap,
		    dv->dv_attr, flags, devfs_unlocked_access, dv);

		if (error)
			goto out;

		/*
		 * Apply changes to the memory based attribute. This code
		 * is modeled after the tmpfs implementation of memory
		 * based vnodes
		 */
		map = dv->dv_attr;
		mask = vap->va_mask;

		/* Change file access modes. */
		if (mask & AT_MODE) {
			/* keep the file type bits, replace everything else */
			map->va_mode &= S_IFMT;
			map->va_mode |= vap->va_mode & ~S_IFMT;
		}
		if (mask & AT_UID)
			map->va_uid = vap->va_uid;
		if (mask & AT_GID)
			map->va_gid = vap->va_gid;
		if (mask & AT_ATIME)
			map->va_atime = vap->va_atime;
		if (mask & AT_MTIME)
			map->va_mtime = vap->va_mtime;

		/* an atime-only change does not touch ctime */
		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
			gethrestime(&map->va_ctime);
	} else {
		/* use the backing attribute store */
		ASSERT(dv->dv_attrvp);

		/*
		 * See if we are changing something we care about
		 * the persistence of - return success if we don't care.
		 */
		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
			/* Set the attributes */
			error = VOP_SETATTR(dv->dv_attrvp,
			    vap, flags, cr, NULL);
			dsysdebug(error,
			    ("vop_setattr %s %d\n", dv->dv_name, error));

			/*
			 * Some file systems may return EROFS for a setattr
			 * on a readonly file system.  In this case we create
			 * our own memory based attribute.
			 */
			if (error == EROFS) {
				/*
				 * obtain attributes from existing file
				 * that we will modify and switch to memory
				 * based attribute until attribute store is
				 * read/write.
				 */
				vattr = dv_vattr_dir;
				/*
				 * If this getattr fails, EROFS is left in
				 * error and returned to the caller.
				 */
				if (VOP_GETATTR(dv->dv_attrvp,
				    &vattr, flags, cr, NULL) == 0) {
					dv->dv_attr = kmem_alloc(
					    sizeof (struct vattr), KM_SLEEP);
					*dv->dv_attr = vattr;
					dv_vattr_merge(dv, dv->dv_attr);
					goto again;
				}
			}
		}
	}
out:
	rw_exit(&dv->dv_contents);
	return (error);
}
317 
318 
319 /*
320  * Compare the uid/gid/mode changes requested for a setattr
321  * operation with the same details of a node's default minor
322  * perm information.  Return 0 if identical.
323  */
324 static int
325 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
326 {
327 	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
328 		return (1);
329 	if (map->va_uid != mp->mp_uid)
330 		return (1);
331 	if (map->va_gid != mp->mp_gid)
332 		return (1);
333 	return (0);
334 }
335 
336 
337 /*ARGSUSED4*/
338 static int
339 devfs_setattr(
340 	struct vnode *vp,
341 	struct vattr *vap,
342 	int flags,
343 	struct cred *cr,
344 	caller_context_t *ct)
345 {
346 	struct dv_node	*dv = VTODV(vp);
347 	struct dv_node	*ddv;
348 	struct vnode	*dvp;
349 	struct vattr	*map;
350 	uint_t		mask;
351 	int		error = 0;
352 	struct vattr	*free_vattr = NULL;
353 	struct vattr	*vattrp = NULL;
354 	mperm_t		mp;
355 	int		persist;
356 
357 	/*
358 	 * Message goes to console only. Otherwise, the message
359 	 * causes devfs_getattr to be invoked again... infinite loop
360 	 */
361 	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
362 	ASSERT(dv->dv_attr || dv->dv_attrvp);
363 
364 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
365 		cmn_err(CE_WARN,	/* panic ? */
366 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
367 		return (ENOENT);
368 	}
369 
370 	if (vap->va_mask & AT_NOSET)
371 		return (EINVAL);
372 
373 	/*
374 	 * If we are changing something we don't care about
375 	 * the persistence of, return success.
376 	 */
377 	if ((vap->va_mask &
378 	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
379 		return (0);
380 
381 	/*
382 	 * If driver overrides fs perm, disallow chmod
383 	 * and do not create attribute nodes.
384 	 */
385 	if (dv->dv_flags & DV_NO_FSPERM) {
386 		ASSERT(dv->dv_attr);
387 		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
388 			return (EPERM);
389 		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
390 			return (0);
391 		rw_enter(&dv->dv_contents, RW_WRITER);
392 		if (vap->va_mask & AT_ATIME)
393 			dv->dv_attr->va_atime = vap->va_atime;
394 		if (vap->va_mask & AT_MTIME)
395 			dv->dv_attr->va_mtime = vap->va_mtime;
396 		rw_exit(&dv->dv_contents);
397 		return (0);
398 	}
399 
400 	/*
401 	 * Directories are always created but device nodes are
402 	 * only used to persist non-default permissions.
403 	 */
404 	if (vp->v_type == VDIR) {
405 		ASSERT(dv->dv_attr || dv->dv_attrvp);
406 		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
407 	}
408 
409 	/*
410 	 * Allocate now before we take any locks
411 	 */
412 	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
413 
414 	/* to ensure consistency, single thread setting of attributes */
415 	rw_enter(&dv->dv_contents, RW_WRITER);
416 
417 	/*
418 	 * We don't need to create an attribute node
419 	 * to persist access or modification times.
420 	 */
421 	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
422 
423 	/*
424 	 * If persisting something, get the default permissions
425 	 * for this minor to compare against what the attributes
426 	 * are now being set to.  Default ordering is:
427 	 *	- minor_perm match for this minor
428 	 *	- mode supplied by ddi_create_priv_minor_node
429 	 *	- devfs defaults
430 	 */
431 	if (persist) {
432 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
433 			mp.mp_uid = dv_vattr_file.va_uid;
434 			mp.mp_gid = dv_vattr_file.va_gid;
435 			mp.mp_mode = dv_vattr_file.va_mode;
436 			if (dv->dv_flags & DV_DFLT_MODE) {
437 				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
438 				mp.mp_mode &= ~S_IAMB;
439 				mp.mp_mode |= dv->dv_dflt_mode;
440 				dcmn_err5(("%s: setattr priv default 0%o\n",
441 				    dv->dv_name, mp.mp_mode));
442 			} else {
443 				dcmn_err5(("%s: setattr devfs default 0%o\n",
444 				    dv->dv_name, mp.mp_mode));
445 			}
446 		} else {
447 			dcmn_err5(("%s: setattr minor perm default 0%o\n",
448 			    dv->dv_name, mp.mp_mode));
449 		}
450 	}
451 
452 	/*
453 	 * If we don't have a vattr for this node, construct one.
454 	 */
455 	if (dv->dv_attr) {
456 		free_vattr = vattrp;
457 		vattrp = NULL;
458 	} else {
459 		ASSERT(dv->dv_attrvp);
460 		ASSERT(vp->v_type != VDIR);
461 		*vattrp = dv_vattr_file;
462 		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
463 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
464 		if (error)
465 			goto out;
466 		dv->dv_attr = vattrp;
467 		dv_vattr_merge(dv, dv->dv_attr);
468 		vattrp = NULL;
469 	}
470 
471 	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
472 	    flags, devfs_unlocked_access, dv);
473 	if (error) {
474 		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
475 		    dv->dv_name, error));
476 		goto out;
477 	}
478 
479 	/*
480 	 * Apply changes to the memory based attribute. This code
481 	 * is modeled after the tmpfs implementation of memory
482 	 * based vnodes
483 	 */
484 	map = dv->dv_attr;
485 	mask = vap->va_mask;
486 
487 	/* Change file access modes. */
488 	if (mask & AT_MODE) {
489 		map->va_mode &= S_IFMT;
490 		map->va_mode |= vap->va_mode & ~S_IFMT;
491 	}
492 	if (mask & AT_UID)
493 		map->va_uid = vap->va_uid;
494 	if (mask & AT_GID)
495 		map->va_gid = vap->va_gid;
496 	if (mask & AT_ATIME)
497 		map->va_atime = vap->va_atime;
498 	if (mask & AT_MTIME)
499 		map->va_mtime = vap->va_mtime;
500 
501 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
502 		gethrestime(&map->va_ctime);
503 	}
504 
505 	/*
506 	 * A setattr to defaults means we no longer need the
507 	 * shadow node as a persistent store, unless there
508 	 * are ACLs.  Otherwise create a shadow node if one
509 	 * doesn't exist yet.
510 	 */
511 	if (persist) {
512 		if ((dv_setattr_cmp(map, &mp) == 0) &&
513 		    ((dv->dv_flags & DV_ACL) == 0)) {
514 
515 			if (dv->dv_attrvp) {
516 				ddv = dv->dv_dotdot;
517 				ASSERT(ddv->dv_attrvp);
518 				error = VOP_REMOVE(ddv->dv_attrvp,
519 				    dv->dv_name, cr, ct, 0);
520 				dsysdebug(error,
521 				    ("vop_remove %s %s %d\n",
522 				    ddv->dv_name, dv->dv_name, error));
523 
524 				if (error == EROFS)
525 					error = 0;
526 				VN_RELE(dv->dv_attrvp);
527 				dv->dv_attrvp = NULL;
528 			}
529 			ASSERT(dv->dv_attr);
530 		} else {
531 			if (mask & AT_MODE)
532 				dcmn_err5(("%s persisting mode 0%o\n",
533 				    dv->dv_name, vap->va_mode));
534 			if (mask & AT_UID)
535 				dcmn_err5(("%s persisting uid %d\n",
536 				    dv->dv_name, vap->va_uid));
537 			if (mask & AT_GID)
538 				dcmn_err5(("%s persisting gid %d\n",
539 				    dv->dv_name, vap->va_gid));
540 
541 			if (dv->dv_attrvp == NULL) {
542 				dvp = DVTOV(dv->dv_dotdot);
543 				dv_shadow_node(dvp, dv->dv_name, vp,
544 				    NULL, NULLVP, cr,
545 				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
546 			}
547 			if (dv->dv_attrvp) {
548 				/* If map still valid do TIME for free. */
549 				if (dv->dv_attr == map) {
550 					mask = map->va_mask;
551 					map->va_mask =
552 					    vap->va_mask | AT_ATIME | AT_MTIME;
553 					error = VOP_SETATTR(dv->dv_attrvp, map,
554 					    flags, cr, NULL);
555 					map->va_mask = mask;
556 				} else {
557 					error = VOP_SETATTR(dv->dv_attrvp,
558 					    vap, flags, cr, NULL);
559 				}
560 				dsysdebug(error, ("vop_setattr %s %d\n",
561 				    dv->dv_name, error));
562 			}
563 			/*
564 			 * Some file systems may return EROFS for a setattr
565 			 * on a readonly file system.  In this case save
566 			 * as our own memory based attribute.
567 			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
568 			 */
569 			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
570 				vattrp = dv->dv_attr;
571 				dv->dv_attr = NULL;
572 			} else if (error == EROFS)
573 				error = 0;
574 		}
575 	}
576 
577 out:
578 	rw_exit(&dv->dv_contents);
579 
580 	if (vattrp)
581 		kmem_free(vattrp, sizeof (*vattrp));
582 	if (free_vattr)
583 		kmem_free(free_vattr, sizeof (*free_vattr));
584 	return (error);
585 }
586 
587 static int
588 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
589     caller_context_t *ct)
590 {
591 	switch (cmd) {
592 	case _PC_ACL_ENABLED:
593 		/*
594 		 * We rely on the underlying filesystem for ACLs,
595 		 * so direct the query for ACL support there.
596 		 * ACL support isn't relative to the file
597 		 * and we can't guarantee that the dv node
598 		 * has an attribute node, so any valid
599 		 * attribute node will suffice.
600 		 */
601 		ASSERT(dvroot);
602 		ASSERT(dvroot->dv_attrvp);
603 		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
604 		/*NOTREACHED*/
605 	}
606 
607 	return (fs_pathconf(vp, cmd, valp, cr, ct));
608 }
609 
610 /*
611  * Let avp handle security attributes (acl's).
612  */
613 static int
614 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
615     struct cred *cr, caller_context_t *ct)
616 {
617 	dvnode_t *dv = VTODV(vp);
618 	struct vnode *avp;
619 	int	error;
620 
621 	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
622 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
623 
624 	rw_enter(&dv->dv_contents, RW_READER);
625 
626 	avp = dv->dv_attrvp;
627 
628 	/* fabricate the acl */
629 	if (avp == NULL) {
630 		error = fs_fab_acl(vp, vsap, flags, cr, ct);
631 		rw_exit(&dv->dv_contents);
632 		return (error);
633 	}
634 
635 	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
636 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
637 	rw_exit(&dv->dv_contents);
638 	return (error);
639 }
640 
641 /*
642  * Set security attributes (acl's)
643  *
644  * Note that the dv_contents lock has already been acquired
645  * by the caller's VOP_RWLOCK.
646  */
647 static int
648 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
649     struct cred *cr, caller_context_t *ct)
650 {
651 	dvnode_t *dv = VTODV(vp);
652 	struct vnode *avp;
653 	int	error;
654 
655 	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
656 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
657 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
658 
659 	/*
660 	 * Not a supported operation on drivers not providing
661 	 * file system based permissions.
662 	 */
663 	if (dv->dv_flags & DV_NO_FSPERM)
664 		return (ENOTSUP);
665 
666 	/*
667 	 * To complete, the setsecattr requires an underlying attribute node.
668 	 */
669 	if (dv->dv_attrvp == NULL) {
670 		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
671 		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
672 		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
673 	}
674 
675 	if ((avp = dv->dv_attrvp) == NULL) {
676 		dcmn_err2(("devfs_setsecattr %s: "
677 		    "cannot construct attribute node\n", dv->dv_name));
678 		return (fs_nosys());
679 	}
680 
681 	/*
682 	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
683 	 * Since backing file systems expect the lock to be held before seeing
684 	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
685 	 * store before forwarding the ACL.
686 	 */
687 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
688 	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
689 	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
690 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
691 
692 	/*
693 	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
694 	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
695 	 * VOP_GETSECATTR calls.
696 	 */
697 	if (fs_acl_nontrivial(avp, cr))
698 		dv->dv_flags |= DV_ACL;
699 	return (error);
700 }
701 
702 /*
703  * This function is used for secpolicy_setattr().  It must call an
704  * access() like function while it is already holding the
705  * dv_contents lock.  We only care about this when dv_attr != NULL;
706  * so the unlocked access call only concerns itself with that
707  * particular branch of devfs_access().
708  */
709 static int
710 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
711 {
712 	struct dv_node *dv = vdv;
713 	int shift = 0;
714 	uid_t owner = dv->dv_attr->va_uid;
715 
716 	/* Check access based on owner, group and public permissions. */
717 	if (crgetuid(cr) != owner) {
718 		shift += 3;
719 		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
720 			shift += 3;
721 	}
722 
723 	return (secpolicy_vnode_access2(cr, DVTOV(dv), owner,
724 	    dv->dv_attr->va_mode << shift, mode));
725 }
726 
727 static int
728 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
729     caller_context_t *ct)
730 {
731 	struct dv_node	*dv = VTODV(vp);
732 	int		res;
733 
734 	dcmn_err2(("devfs_access %s\n", dv->dv_name));
735 	ASSERT(dv->dv_attr || dv->dv_attrvp);
736 
737 	/* restrict console access to privileged processes */
738 	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
739 		return (EACCES);
740 	}
741 
742 	rw_enter(&dv->dv_contents, RW_READER);
743 	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
744 		res = devfs_unlocked_access(dv, mode, cr);
745 	} else {
746 		res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
747 	}
748 	rw_exit(&dv->dv_contents);
749 	return (res);
750 }
751 
752 /*
753  * Lookup
754  *
755  * Given the directory vnode and the name of the component, return
756  * the corresponding held vnode for that component.
757  *
758  * Of course in these fictional filesystems, nothing's ever quite
759  * -that- simple.
760  *
761  * devfs name	type		shadow (fs attributes)	type	comments
762  * -------------------------------------------------------------------------
763  * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
764  * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
765  * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
766  * -------------------------------------------------------------------------
767  *
768  * The following names are reserved for the attribute filesystem (which
769  * could easily be another layer on top of this one - we simply need to
770  * hold the vnode of the thing we're looking at)
771  *
772  * attr name	type		shadow (fs attributes)	type	comments
773  * -------------------------------------------------------------------------
774  * drv[@addr]	VDIR		-			-	attribute dir
775  * minorname	VDIR		-			-	minorname
776  * attribute	VREG		-			-	attribute
777  * -------------------------------------------------------------------------
778  *
779  * Examples:
780  *
781  *	devfs:/devices/.../mm@0:zero		VCHR
782  *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
783  *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
784  *
785  *	devfs:/devices/.../sd@0,0:a		VBLK
786  *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
787  *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
788  *
789  *	devfs:/devices/.../mm@0			VCHR
790  *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
791  *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
792  *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
793  *
794  *	devfs:/devices/.../obio			VDIR
795  *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
796  *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
797  *
 * We also need to be able to deal with "old" devices that have gone away,
 * though I think that provided we return them with readdir, they can
 * be removed (i.e. they don't have to respond to lookup, though it might
 * be weird if they didn't ;-)
802  *
803  * Lookup has side-effects.
804  *
805  * - It will create directories and fs attribute files in the shadow hierarchy.
806  * - It should cause non-SID devices to be probed (ask the parent nexi).
807  */
808 /*ARGSUSED3*/
809 static int
810 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
811     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
812     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
813 {
814 	ASSERT(dvp->v_type == VDIR);
815 	dcmn_err2(("devfs_lookup: %s\n", nm));
816 	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
817 }
818 
819 /*
820  * devfs nodes can't really be created directly by userland - however,
821  * we do allow creates to find existing nodes:
822  *
823  * - any create fails if the node doesn't exist - EROFS.
824  * - creating an existing directory read-only succeeds, otherwise EISDIR.
825  * - exclusive creates fail if the node already exists - EEXIST.
826  * - failure to create the snode for an existing device - ENOSYS.
827  */
828 /*ARGSUSED2*/
829 static int
830 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
831     int mode, struct vnode **vpp, struct cred *cred, int flag,
832     caller_context_t *ct, vsecattr_t *vsecp)
833 {
834 	int error;
835 	struct vnode *vp;
836 
837 	dcmn_err2(("devfs_create %s\n", nm));
838 	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
839 	if (error == 0) {
840 		if (excl == EXCL)
841 			error = EEXIST;
842 		else if (vp->v_type == VDIR && (mode & VWRITE))
843 			error = EISDIR;
844 		else
845 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
846 
847 		if (error) {
848 			VN_RELE(vp);
849 		} else
850 			*vpp = vp;
851 	} else if (error == ENOENT)
852 		error = EROFS;
853 
854 	return (error);
855 }
856 
/*
 * VOP_READDIR entry point.
 *
 * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL
 * (via dv_filldir()).  Otherwise, simply return cached dv_node's. Hotplug
 * code always calls devfs_clean() to invalidate the dv_node cache.
 *
 * The directory's dv_contents lock must be held as reader on entry
 * (asserted) and is held as reader again on return, although it may be
 * dropped and re-acquired in between.
 *
 * Directory offsets are entry indexes: "." is 0, ".." is 1 and the
 * children follow from 2.  Skipped (hidden/internal) children still
 * consume an offset.  Each record's d_off is the offset of the entry
 * after it.
 *
 * Returns:
 *	0	 success (also when the offset is >= MAXOFF_T, with *eofp set)
 *	EINVAL	 more than one iovec, or the next entry does not fit in
 *		 the caller's buffer
 *	ENOTDIR	 dvp is not a directory
 *	ESTALE	 the directory node is stale
 *	or the error from uiomove().
 */
/*ARGSUSED5*/
static int
devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	struct dv_node *ddv, *dv;
	struct dirent64 *de, *bufp;
	offset_t diroff;
	offset_t	soff;
	size_t reclen, movesz;
	int error;
	struct vattr va;
	size_t bufsz;

	ddv = VTODV(dvp);
	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
	ASSERT(RW_READ_HELD(&ddv->dv_contents));

	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Load the initial contents */
	if (ddv->dv_flags & DV_BUILD) {
		/*
		 * Filling needs the writer lock.  If the upgrade fails the
		 * lock is dropped and re-taken, so state may have changed.
		 */
		if (!rw_tryupgrade(&ddv->dv_contents)) {
			rw_exit(&ddv->dv_contents);
			rw_enter(&ddv->dv_contents, RW_WRITER);
		}

		/* recheck and fill */
		if (ddv->dv_flags & DV_BUILD)
			dv_filldir(ddv);

		rw_downgrade(&ddv->dv_contents);
	}

	/*
	 * Even if the dv node was not stale at entry to this function, it may
	 * be stale now if another process got in between the rw_exit/rw_enter
	 * calls above and unlinked it.
	 */
	if (DV_STALE(ddv))
		return (ESTALE);

	soff = uiop->uio_loffset;
	bufsz = uiop->uio_iov->iov_len;
	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
	movesz = 0;
	/*
	 * Non-NULL sentinel: if "." or ".." already overflow the buffer we
	 * jump to "full" with dv != NULL, which is what the EINVAL and
	 * end-of-directory tests there rely on.
	 */
	dv = (struct dv_node *)-1;

	/*
	 * Move as many entries into the uio structure as it will take.
	 * Special case "." and "..".
	 */
	diroff = 0;
	if (soff == 0) {				/* . */
		reclen = DIRENT64_RECLEN(strlen("."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, ".", reclen));
	}

	diroff++;
	if (soff <= 1) {				/* .. */
		reclen = DIRENT64_RECLEN(strlen(".."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, "..", reclen));
	}

	diroff++;
	/* children: dv ends up NULL only if the whole list was walked */
	for (dv = DV_FIRST_ENTRY(ddv); dv;
	    dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
		/* skip entries until at correct directory offset */
		if (diroff < soff)
			continue;

		/*
		 * hidden nodes are skipped (but they still occupy a
		 * directory offset).
		 */
		if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi))
			continue;

		/*
		 * DDM_INTERNAL_PATH minor nodes are skipped for readdirs
		 * outside the kernel (but they still occupy a directory
		 * offset).
		 */
		if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred))
			continue;

		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
		if ((movesz + reclen) > bufsz) {
			dcmn_err3(("devfs_readdir: C: diroff "
			    "%lld, soff %lld: '%s' reclen %lu\n",
			    diroff, soff, dv->dv_name, reclen));
			goto full;
		}
		de->d_ino = (ino64_t)dv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		ASSERT(strlen(dv->dv_name) + 1 <=
		    DIRENT64_NAMELEN(reclen));
		(void) strncpy(de->d_name, dv->dv_name,
		    DIRENT64_NAMELEN(reclen));

		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err4(("devfs_readdir: D: diroff "
		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
		    dv->dv_name, reclen));
	}

	/* the buffer is full, or we exhausted everything */
full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n",
	    movesz, diroff, soff, (void *)dv));

	if ((movesz == 0) && dv)
		error = EINVAL;		/* cannot be represented */
	else {
		error = uiomove(bufp, movesz, UIO_READ, uiop);
		if (error == 0) {
			if (eofp)
				*eofp = dv ? 0 : 1;
			/* resume point: the first entry not copied out */
			uiop->uio_loffset = diroff;
		}

		/*
		 * Record the access time.  The directory path of
		 * devfs_setattr() takes dv_contents as writer, so our reader
		 * hold is dropped around the call and re-taken afterwards.
		 * The result of the update is deliberately ignored.
		 */
		va.va_mask = AT_ATIME;
		gethrestime(&va.va_atime);
		rw_exit(&ddv->dv_contents);
		(void) devfs_setattr(dvp, &va, 0, cred, ct);
		rw_enter(&ddv->dv_contents, RW_READER);
	}

	kmem_free(bufp, bufsz);
	return (error);
}
1034 
1035 /*ARGSUSED*/
1036 static int
1037 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1038     caller_context_t *ct)
1039 {
1040 	/*
1041 	 * Message goes to console only. Otherwise, the message
1042 	 * causes devfs_fsync to be invoked again... infinite loop
1043 	 */
1044 	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1045 	return (0);
1046 }
1047 
1048 /*
1049  * Normally, we leave the dv_node here at count of 0.
1050  * The node will be destroyed when dv_cleandir() is called.
1051  *
1052  * Stale dv_node's are already unlinked from the fs tree,
1053  * so dv_cleandir() won't find them. We destroy such nodes
1054  * immediately.
1055  */
1056 /*ARGSUSED1*/
1057 static void
1058 devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1059 {
1060 	int destroy;
1061 	struct dv_node *dv = VTODV(vp);
1062 
1063 	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1064 	mutex_enter(&vp->v_lock);
1065 	ASSERT(vp->v_count >= 1);
1066 	VN_RELE_LOCKED(vp);
1067 	destroy = (DV_STALE(dv) && vp->v_count == 0);
1068 	mutex_exit(&vp->v_lock);
1069 
1070 	/* stale nodes cannot be rediscovered, destroy it here */
1071 	if (destroy)
1072 		dv_destroy(dv, 0);
1073 }
1074 
1075 /*
1076  * XXX Why do we need this?  NFS mounted /dev directories?
1077  * XXX Talk to peter staubach about this.
1078  */
1079 /*ARGSUSED2*/
1080 static int
1081 devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1082 {
1083 	struct dv_node	*dv = VTODV(vp);
1084 	struct dv_fid	*dv_fid;
1085 
1086 	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1087 		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1088 		return (ENOSPC);
1089 	}
1090 
1091 	dv_fid = (struct dv_fid *)fidp;
1092 	bzero(dv_fid, sizeof (struct dv_fid));
1093 	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1094 	dv_fid->dvfid_ino = dv->dv_ino;
1095 	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1096 
1097 	return (0);
1098 }
1099 
1100 /*
1101  * This pair of routines bracket all VOP_READ, VOP_WRITE
1102  * and VOP_READDIR requests.  The contents lock stops things
1103  * moving around while we're looking at them.
1104  *
1105  * Also used by file and record locking.
1106  */
1107 /*ARGSUSED2*/
1108 static int
1109 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1110 {
1111 	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1112 	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1113 	return (write_flag);
1114 }
1115 
1116 /*ARGSUSED1*/
1117 static void
1118 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1119 {
1120 	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1121 	rw_exit(&VTODV(vp)->dv_contents);
1122 }
1123 
1124 /*
1125  * XXX	Should probably do a better job of computing the maximum
1126  *	offset available in the directory.
1127  */
1128 /*ARGSUSED1*/
1129 static int
1130 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1131     caller_context_t *ct)
1132 {
1133 	ASSERT(vp->v_type == VDIR);
1134 	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1135 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1136 }
1137 
1138 vnodeops_t *dv_vnodeops;
1139 
1140 const fs_operation_def_t dv_vnodeops_template[] = {
1141 	VOPNAME_OPEN,		{ .vop_open = devfs_open },
1142 	VOPNAME_CLOSE,		{ .vop_close = devfs_close },
1143 	VOPNAME_READ,		{ .vop_read = devfs_read },
1144 	VOPNAME_WRITE,		{ .vop_write = devfs_write },
1145 	VOPNAME_IOCTL,		{ .vop_ioctl = devfs_ioctl },
1146 	VOPNAME_GETATTR,	{ .vop_getattr = devfs_getattr },
1147 	VOPNAME_SETATTR,	{ .vop_setattr = devfs_setattr },
1148 	VOPNAME_ACCESS,		{ .vop_access = devfs_access },
1149 	VOPNAME_LOOKUP,		{ .vop_lookup = devfs_lookup },
1150 	VOPNAME_CREATE,		{ .vop_create = devfs_create },
1151 	VOPNAME_READDIR,	{ .vop_readdir = devfs_readdir },
1152 	VOPNAME_FSYNC,		{ .vop_fsync = devfs_fsync },
1153 	VOPNAME_INACTIVE,	{ .vop_inactive = devfs_inactive },
1154 	VOPNAME_FID,		{ .vop_fid = devfs_fid },
1155 	VOPNAME_RWLOCK,		{ .vop_rwlock = devfs_rwlock },
1156 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = devfs_rwunlock },
1157 	VOPNAME_SEEK,		{ .vop_seek = devfs_seek },
1158 	VOPNAME_PATHCONF,	{ .vop_pathconf = devfs_pathconf },
1159 	VOPNAME_DISPOSE,	{ .error = fs_error },
1160 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = devfs_setsecattr },
1161 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = devfs_getsecattr },
1162 	NULL,			NULL
1163 };
1164