xref: /illumos-gate/usr/src/uts/common/fs/devfs/devfs_vnops.c (revision 45ede40b2394db7967e59f19288fae9b62efd4aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2017 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * vnode ops for the devfs
28  *
29  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
30  * first because dv_find always performs leaf vnode substitution, returning
31  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
32  * means that the only leaf special file VOP operations that devfs will see
33  * after VOP_LOOKUP are the ones that specfs forwards.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/param.h>
38 #include <sys/t_lock.h>
39 #include <sys/systm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/user.h>
42 #include <sys/time.h>
43 #include <sys/vfs.h>
44 #include <sys/vnode.h>
45 #include <sys/vfs_opreg.h>
46 #include <sys/file.h>
47 #include <sys/fcntl.h>
48 #include <sys/flock.h>
49 #include <sys/kmem.h>
50 #include <sys/uio.h>
51 #include <sys/errno.h>
52 #include <sys/stat.h>
53 #include <sys/cred.h>
54 #include <sys/dirent.h>
55 #include <sys/pathname.h>
56 #include <sys/cmn_err.h>
57 #include <sys/debug.h>
58 #include <sys/policy.h>
59 #include <sys/modctl.h>
60 #include <sys/sunndi.h>
61 #include <fs/fs_subr.h>
62 #include <sys/fs/dv_node.h>
63 
64 extern struct vattr	dv_vattr_dir, dv_vattr_file;
65 extern dev_t rconsdev;
66 
67 /*
68  * Open of devices (leaf nodes) is handled by specfs.
69  * There is nothing to do to open a directory
70  */
71 /*ARGSUSED*/
72 static int
73 devfs_open(struct vnode **vpp, int flag, struct cred *cred,
74     caller_context_t *ct)
75 {
76 	struct dv_node	*dv = VTODV(*vpp);
77 
78 	dcmn_err2(("devfs_open %s\n", dv->dv_name));
79 	ASSERT((*vpp)->v_type == VDIR);
80 	return (0);
81 }
82 
83 /*
84  * Close of devices (leaf nodes) is handled by specfs.
85  * There is nothing much to do inorder to close a directory.
86  */
87 /*ARGSUSED1*/
88 static int
89 devfs_close(struct vnode *vp, int flag, int count,
90     offset_t offset, struct cred *cred, caller_context_t *ct)
91 {
92 	struct dv_node	*dv = VTODV(vp);
93 
94 	dcmn_err2(("devfs_close %s\n", dv->dv_name));
95 	ASSERT(vp->v_type == VDIR);
96 
97 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
98 	cleanshares(vp, ttoproc(curthread)->p_pid);
99 	return (0);
100 }
101 
102 /*
103  * Read of devices (leaf nodes) is handled by specfs.
104  * Read of directories is not supported.
105  */
106 /*ARGSUSED*/
107 static int
108 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
109     struct caller_context *ct)
110 {
111 	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
112 	ASSERT(vp->v_type == VDIR);
113 	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
114 	return (EISDIR);
115 }
116 
117 /*
118  * Write of devices (leaf nodes) is handled by specfs.
119  * Write of directories is not supported.
120  */
121 /*ARGSUSED*/
122 static int
123 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
124     struct caller_context *ct)
125 {
126 	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
127 	ASSERT(vp->v_type == VDIR);
128 	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
129 	return (EISDIR);
130 }
131 
132 /*
133  * Ioctls to device (leaf nodes) is handled by specfs.
134  * Ioctl to directories is not supported.
135  */
136 /*ARGSUSED*/
137 static int
138 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
139     struct cred *cred, int *rvalp, caller_context_t *ct)
140 {
141 	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
142 	ASSERT(vp->v_type == VDIR);
143 
144 	return (ENOTTY);	/* no ioctls supported */
145 }
146 
147 /*
148  * We can be asked directly about the attributes of directories, or
149  * (via sp->s_realvp) about the filesystem attributes of special files.
150  *
151  * For directories, we just believe the attribute store
152  * though we mangle the nodeid, fsid, and rdev to convince userland we
153  * really are a different filesystem.
154  *
155  * For special files, a little more fakery is required.
156  *
157  * If the attribute store is not there (read only root), we believe our
158  * memory based attributes.
159  */
160 static int
161 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
162     caller_context_t *ct)
163 {
164 	struct dv_node	*dv = VTODV(vp);
165 	int		error = 0;
166 	uint_t		mask;
167 
168 	/*
169 	 * Message goes to console only. Otherwise, the message
170 	 * causes devfs_getattr to be invoked again... infinite loop
171 	 */
172 	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
173 	ASSERT(dv->dv_attr || dv->dv_attrvp);
174 
175 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
176 		cmn_err(CE_WARN,	/* panic ? */
177 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
178 		return (ENOENT);
179 	}
180 
181 	rw_enter(&dv->dv_contents, RW_READER);
182 	if (dv->dv_attr) {
183 		/*
184 		 * obtain from the memory version of attribute.
185 		 * preserve mask for those that optimize.
186 		 * devfs specific fields are already merged on creation.
187 		 */
188 		mask = vap->va_mask;
189 		*vap = *dv->dv_attr;
190 		vap->va_mask = mask;
191 	} else {
192 		/* obtain from attribute store and merge */
193 		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
194 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
195 		dv_vattr_merge(dv, vap);
196 	}
197 	rw_exit(&dv->dv_contents);
198 
199 	/*
200 	 * Restrict the permissions of the node fronting the console
201 	 * to 0600 with root as the owner.  This prevents a non-root
202 	 * user from gaining access to a serial terminal (like /dev/term/a)
203 	 * which is in reality serving as the console device (/dev/console).
204 	 */
205 	if (vp->v_rdev == rconsdev) {
206 		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
207 		vap->va_mode &= (~rconsmask);
208 		vap->va_uid = 0;
209 	}
210 
211 	return (error);
212 }
213 
214 static int devfs_unlocked_access(void *, int, struct cred *);
215 
216 /*ARGSUSED4*/
217 static int
218 devfs_setattr_dir(
219 	struct dv_node *dv,
220 	struct vnode *vp,
221 	struct vattr *vap,
222 	int flags,
223 	struct cred *cr)
224 {
225 	struct vattr	*map;
226 	uint_t		mask;
227 	int		error = 0;
228 	struct vattr	vattr;
229 
230 	ASSERT(dv->dv_attr || dv->dv_attrvp);
231 
232 	ASSERT(vp->v_type == VDIR);
233 	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);
234 
235 	if (vap->va_mask & AT_NOSET)
236 		return (EINVAL);
237 
238 	/* to ensure consistency, single thread setting of attributes */
239 	rw_enter(&dv->dv_contents, RW_WRITER);
240 
241 again:	if (dv->dv_attr) {
242 
243 		error = secpolicy_vnode_setattr(cr, vp, vap,
244 		    dv->dv_attr, flags, devfs_unlocked_access, dv);
245 
246 		if (error)
247 			goto out;
248 
249 		/*
250 		 * Apply changes to the memory based attribute. This code
251 		 * is modeled after the tmpfs implementation of memory
252 		 * based vnodes
253 		 */
254 		map = dv->dv_attr;
255 		mask = vap->va_mask;
256 
257 		/* Change file access modes. */
258 		if (mask & AT_MODE) {
259 			map->va_mode &= S_IFMT;
260 			map->va_mode |= vap->va_mode & ~S_IFMT;
261 		}
262 		if (mask & AT_UID)
263 			map->va_uid = vap->va_uid;
264 		if (mask & AT_GID)
265 			map->va_gid = vap->va_gid;
266 		if (mask & AT_ATIME)
267 			map->va_atime = vap->va_atime;
268 		if (mask & AT_MTIME)
269 			map->va_mtime = vap->va_mtime;
270 
271 		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
272 			gethrestime(&map->va_ctime);
273 	} else {
274 		/* use the backing attribute store */
275 		ASSERT(dv->dv_attrvp);
276 
277 		/*
278 		 * See if we are changing something we care about
279 		 * the persistence of - return success if we don't care.
280 		 */
281 		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
282 			/* Set the attributes */
283 			error = VOP_SETATTR(dv->dv_attrvp,
284 			    vap, flags, cr, NULL);
285 			dsysdebug(error,
286 			    ("vop_setattr %s %d\n", dv->dv_name, error));
287 
288 			/*
289 			 * Some file systems may return EROFS for a setattr
290 			 * on a readonly file system.  In this case we create
291 			 * our own memory based attribute.
292 			 */
293 			if (error == EROFS) {
294 				/*
295 				 * obtain attributes from existing file
296 				 * that we will modify and switch to memory
297 				 * based attribute until attribute store is
298 				 * read/write.
299 				 */
300 				vattr = dv_vattr_dir;
301 				if (VOP_GETATTR(dv->dv_attrvp,
302 				    &vattr, flags, cr, NULL) == 0) {
303 					dv->dv_attr = kmem_alloc(
304 					    sizeof (struct vattr), KM_SLEEP);
305 					*dv->dv_attr = vattr;
306 					dv_vattr_merge(dv, dv->dv_attr);
307 					goto again;
308 				}
309 			}
310 		}
311 	}
312 out:
313 	rw_exit(&dv->dv_contents);
314 	return (error);
315 }
316 
317 
318 /*
319  * Compare the uid/gid/mode changes requested for a setattr
320  * operation with the same details of a node's default minor
321  * perm information.  Return 0 if identical.
322  */
323 static int
324 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
325 {
326 	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
327 		return (1);
328 	if (map->va_uid != mp->mp_uid)
329 		return (1);
330 	if (map->va_gid != mp->mp_gid)
331 		return (1);
332 	return (0);
333 }
334 
335 
336 /*ARGSUSED4*/
337 static int
338 devfs_setattr(
339 	struct vnode *vp,
340 	struct vattr *vap,
341 	int flags,
342 	struct cred *cr,
343 	caller_context_t *ct)
344 {
345 	struct dv_node	*dv = VTODV(vp);
346 	struct dv_node	*ddv;
347 	struct vnode	*dvp;
348 	struct vattr	*map;
349 	uint_t		mask;
350 	int		error = 0;
351 	struct vattr	*free_vattr = NULL;
352 	struct vattr	*vattrp = NULL;
353 	mperm_t		mp;
354 	int		persist;
355 
356 	/*
357 	 * Message goes to console only. Otherwise, the message
358 	 * causes devfs_getattr to be invoked again... infinite loop
359 	 */
360 	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
361 	ASSERT(dv->dv_attr || dv->dv_attrvp);
362 
363 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
364 		cmn_err(CE_WARN,	/* panic ? */
365 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
366 		return (ENOENT);
367 	}
368 
369 	if (vap->va_mask & AT_NOSET)
370 		return (EINVAL);
371 
372 	/*
373 	 * If we are changing something we don't care about
374 	 * the persistence of, return success.
375 	 */
376 	if ((vap->va_mask &
377 	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
378 		return (0);
379 
380 	/*
381 	 * If driver overrides fs perm, disallow chmod
382 	 * and do not create attribute nodes.
383 	 */
384 	if (dv->dv_flags & DV_NO_FSPERM) {
385 		ASSERT(dv->dv_attr);
386 		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
387 			return (EPERM);
388 		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
389 			return (0);
390 		rw_enter(&dv->dv_contents, RW_WRITER);
391 		if (vap->va_mask & AT_ATIME)
392 			dv->dv_attr->va_atime = vap->va_atime;
393 		if (vap->va_mask & AT_MTIME)
394 			dv->dv_attr->va_mtime = vap->va_mtime;
395 		rw_exit(&dv->dv_contents);
396 		return (0);
397 	}
398 
399 	/*
400 	 * Directories are always created but device nodes are
401 	 * only used to persist non-default permissions.
402 	 */
403 	if (vp->v_type == VDIR) {
404 		ASSERT(dv->dv_attr || dv->dv_attrvp);
405 		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
406 	}
407 
408 	/*
409 	 * Allocate now before we take any locks
410 	 */
411 	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
412 
413 	/* to ensure consistency, single thread setting of attributes */
414 	rw_enter(&dv->dv_contents, RW_WRITER);
415 
416 	/*
417 	 * We don't need to create an attribute node
418 	 * to persist access or modification times.
419 	 */
420 	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
421 
422 	/*
423 	 * If persisting something, get the default permissions
424 	 * for this minor to compare against what the attributes
425 	 * are now being set to.  Default ordering is:
426 	 *	- minor_perm match for this minor
427 	 *	- mode supplied by ddi_create_priv_minor_node
428 	 *	- devfs defaults
429 	 */
430 	if (persist) {
431 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
432 			mp.mp_uid = dv_vattr_file.va_uid;
433 			mp.mp_gid = dv_vattr_file.va_gid;
434 			mp.mp_mode = dv_vattr_file.va_mode;
435 			if (dv->dv_flags & DV_DFLT_MODE) {
436 				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
437 				mp.mp_mode &= ~S_IAMB;
438 				mp.mp_mode |= dv->dv_dflt_mode;
439 				dcmn_err5(("%s: setattr priv default 0%o\n",
440 				    dv->dv_name, mp.mp_mode));
441 			} else {
442 				dcmn_err5(("%s: setattr devfs default 0%o\n",
443 				    dv->dv_name, mp.mp_mode));
444 			}
445 		} else {
446 			dcmn_err5(("%s: setattr minor perm default 0%o\n",
447 			    dv->dv_name, mp.mp_mode));
448 		}
449 	}
450 
451 	/*
452 	 * If we don't have a vattr for this node, construct one.
453 	 */
454 	if (dv->dv_attr) {
455 		free_vattr = vattrp;
456 		vattrp = NULL;
457 	} else {
458 		ASSERT(dv->dv_attrvp);
459 		ASSERT(vp->v_type != VDIR);
460 		*vattrp = dv_vattr_file;
461 		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
462 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
463 		if (error)
464 			goto out;
465 		dv->dv_attr = vattrp;
466 		dv_vattr_merge(dv, dv->dv_attr);
467 		vattrp = NULL;
468 	}
469 
470 	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
471 	    flags, devfs_unlocked_access, dv);
472 	if (error) {
473 		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
474 		    dv->dv_name, error));
475 		goto out;
476 	}
477 
478 	/*
479 	 * Apply changes to the memory based attribute. This code
480 	 * is modeled after the tmpfs implementation of memory
481 	 * based vnodes
482 	 */
483 	map = dv->dv_attr;
484 	mask = vap->va_mask;
485 
486 	/* Change file access modes. */
487 	if (mask & AT_MODE) {
488 		map->va_mode &= S_IFMT;
489 		map->va_mode |= vap->va_mode & ~S_IFMT;
490 	}
491 	if (mask & AT_UID)
492 		map->va_uid = vap->va_uid;
493 	if (mask & AT_GID)
494 		map->va_gid = vap->va_gid;
495 	if (mask & AT_ATIME)
496 		map->va_atime = vap->va_atime;
497 	if (mask & AT_MTIME)
498 		map->va_mtime = vap->va_mtime;
499 
500 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
501 		gethrestime(&map->va_ctime);
502 	}
503 
504 	/*
505 	 * A setattr to defaults means we no longer need the
506 	 * shadow node as a persistent store, unless there
507 	 * are ACLs.  Otherwise create a shadow node if one
508 	 * doesn't exist yet.
509 	 */
510 	if (persist) {
511 		if ((dv_setattr_cmp(map, &mp) == 0) &&
512 		    ((dv->dv_flags & DV_ACL) == 0)) {
513 
514 			if (dv->dv_attrvp) {
515 				ddv = dv->dv_dotdot;
516 				ASSERT(ddv->dv_attrvp);
517 				error = VOP_REMOVE(ddv->dv_attrvp,
518 				    dv->dv_name, cr, ct, 0);
519 				dsysdebug(error,
520 				    ("vop_remove %s %s %d\n",
521 				    ddv->dv_name, dv->dv_name, error));
522 
523 				if (error == EROFS)
524 					error = 0;
525 				VN_RELE(dv->dv_attrvp);
526 				dv->dv_attrvp = NULL;
527 			}
528 			ASSERT(dv->dv_attr);
529 		} else {
530 			if (mask & AT_MODE)
531 				dcmn_err5(("%s persisting mode 0%o\n",
532 				    dv->dv_name, vap->va_mode));
533 			if (mask & AT_UID)
534 				dcmn_err5(("%s persisting uid %d\n",
535 				    dv->dv_name, vap->va_uid));
536 			if (mask & AT_GID)
537 				dcmn_err5(("%s persisting gid %d\n",
538 				    dv->dv_name, vap->va_gid));
539 
540 			if (dv->dv_attrvp == NULL) {
541 				dvp = DVTOV(dv->dv_dotdot);
542 				dv_shadow_node(dvp, dv->dv_name, vp,
543 				    NULL, NULLVP, cr,
544 				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
545 			}
546 			if (dv->dv_attrvp) {
547 				/* If map still valid do TIME for free. */
548 				if (dv->dv_attr == map) {
549 					mask = map->va_mask;
550 					map->va_mask =
551 					    vap->va_mask | AT_ATIME | AT_MTIME;
552 					error = VOP_SETATTR(dv->dv_attrvp, map,
553 					    flags, cr, NULL);
554 					map->va_mask = mask;
555 				} else {
556 					error = VOP_SETATTR(dv->dv_attrvp,
557 					    vap, flags, cr, NULL);
558 				}
559 				dsysdebug(error, ("vop_setattr %s %d\n",
560 				    dv->dv_name, error));
561 			}
562 			/*
563 			 * Some file systems may return EROFS for a setattr
564 			 * on a readonly file system.  In this case save
565 			 * as our own memory based attribute.
566 			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
567 			 */
568 			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
569 				vattrp = dv->dv_attr;
570 				dv->dv_attr = NULL;
571 			} else if (error == EROFS)
572 				error = 0;
573 		}
574 	}
575 
576 out:
577 	rw_exit(&dv->dv_contents);
578 
579 	if (vattrp)
580 		kmem_free(vattrp, sizeof (*vattrp));
581 	if (free_vattr)
582 		kmem_free(free_vattr, sizeof (*free_vattr));
583 	return (error);
584 }
585 
586 static int
587 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
588     caller_context_t *ct)
589 {
590 	switch (cmd) {
591 	case _PC_ACL_ENABLED:
592 		/*
593 		 * We rely on the underlying filesystem for ACLs,
594 		 * so direct the query for ACL support there.
595 		 * ACL support isn't relative to the file
596 		 * and we can't guarantee that the dv node
597 		 * has an attribute node, so any valid
598 		 * attribute node will suffice.
599 		 */
600 		ASSERT(dvroot);
601 		ASSERT(dvroot->dv_attrvp);
602 		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
603 		/*NOTREACHED*/
604 	}
605 
606 	return (fs_pathconf(vp, cmd, valp, cr, ct));
607 }
608 
609 /*
610  * Let avp handle security attributes (acl's).
611  */
612 static int
613 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
614     struct cred *cr, caller_context_t *ct)
615 {
616 	dvnode_t *dv = VTODV(vp);
617 	struct vnode *avp;
618 	int	error;
619 
620 	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
621 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
622 
623 	rw_enter(&dv->dv_contents, RW_READER);
624 
625 	avp = dv->dv_attrvp;
626 
627 	/* fabricate the acl */
628 	if (avp == NULL) {
629 		error = fs_fab_acl(vp, vsap, flags, cr, ct);
630 		rw_exit(&dv->dv_contents);
631 		return (error);
632 	}
633 
634 	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
635 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
636 	rw_exit(&dv->dv_contents);
637 	return (error);
638 }
639 
640 /*
641  * Set security attributes (acl's)
642  *
643  * Note that the dv_contents lock has already been acquired
644  * by the caller's VOP_RWLOCK.
645  */
646 static int
647 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
648     struct cred *cr, caller_context_t *ct)
649 {
650 	dvnode_t *dv = VTODV(vp);
651 	struct vnode *avp;
652 	int	error;
653 
654 	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
655 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
656 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
657 
658 	/*
659 	 * Not a supported operation on drivers not providing
660 	 * file system based permissions.
661 	 */
662 	if (dv->dv_flags & DV_NO_FSPERM)
663 		return (ENOTSUP);
664 
665 	/*
666 	 * To complete, the setsecattr requires an underlying attribute node.
667 	 */
668 	if (dv->dv_attrvp == NULL) {
669 		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
670 		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
671 		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
672 	}
673 
674 	if ((avp = dv->dv_attrvp) == NULL) {
675 		dcmn_err2(("devfs_setsecattr %s: "
676 		    "cannot construct attribute node\n", dv->dv_name));
677 		return (fs_nosys());
678 	}
679 
680 	/*
681 	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
682 	 * Since backing file systems expect the lock to be held before seeing
683 	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
684 	 * store before forwarding the ACL.
685 	 */
686 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
687 	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
688 	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
689 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
690 
691 	/*
692 	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
693 	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
694 	 * VOP_GETSECATTR calls.
695 	 */
696 	if (fs_acl_nontrivial(avp, cr))
697 		dv->dv_flags |= DV_ACL;
698 	return (error);
699 }
700 
701 /*
702  * This function is used for secpolicy_setattr().  It must call an
703  * access() like function while it is already holding the
704  * dv_contents lock.  We only care about this when dv_attr != NULL;
705  * so the unlocked access call only concerns itself with that
706  * particular branch of devfs_access().
707  */
708 static int
709 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
710 {
711 	struct dv_node *dv = vdv;
712 	int shift = 0;
713 	uid_t owner = dv->dv_attr->va_uid;
714 
715 	/* Check access based on owner, group and public permissions. */
716 	if (crgetuid(cr) != owner) {
717 		shift += 3;
718 		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
719 			shift += 3;
720 	}
721 
722 	return (secpolicy_vnode_access2(cr, DVTOV(dv), owner,
723 	    dv->dv_attr->va_mode << shift, mode));
724 }
725 
726 static int
727 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
728     caller_context_t *ct)
729 {
730 	struct dv_node	*dv = VTODV(vp);
731 	int		res;
732 
733 	dcmn_err2(("devfs_access %s\n", dv->dv_name));
734 	ASSERT(dv->dv_attr || dv->dv_attrvp);
735 
736 	/* restrict console access to privileged processes */
737 	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
738 		return (EACCES);
739 	}
740 
741 	rw_enter(&dv->dv_contents, RW_READER);
742 	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
743 		res = devfs_unlocked_access(dv, mode, cr);
744 	} else {
745 		res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
746 	}
747 	rw_exit(&dv->dv_contents);
748 	return (res);
749 }
750 
751 /*
752  * Lookup
753  *
754  * Given the directory vnode and the name of the component, return
755  * the corresponding held vnode for that component.
756  *
757  * Of course in these fictional filesystems, nothing's ever quite
758  * -that- simple.
759  *
760  * devfs name	type		shadow (fs attributes)	type	comments
761  * -------------------------------------------------------------------------
762  * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
763  * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
764  * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
765  * -------------------------------------------------------------------------
766  *
767  * The following names are reserved for the attribute filesystem (which
768  * could easily be another layer on top of this one - we simply need to
769  * hold the vnode of the thing we're looking at)
770  *
771  * attr name	type		shadow (fs attributes)	type	comments
772  * -------------------------------------------------------------------------
773  * drv[@addr]	VDIR		-			-	attribute dir
774  * minorname	VDIR		-			-	minorname
775  * attribute	VREG		-			-	attribute
776  * -------------------------------------------------------------------------
777  *
778  * Examples:
779  *
780  *	devfs:/devices/.../mm@0:zero		VCHR
781  *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
782  *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
783  *
784  *	devfs:/devices/.../sd@0,0:a		VBLK
785  *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
786  *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
787  *
788  *	devfs:/devices/.../mm@0			VCHR
789  *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
790  *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
791  *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
792  *
793  *	devfs:/devices/.../obio			VDIR
794  *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
795  *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
796  *
 * We also need to be able to deal with "old" devices that have gone away,
798  * though I think that provided we return them with readdir, they can
799  * be removed (i.e. they don't have to respond to lookup, though it might
800  * be weird if they didn't ;-)
801  *
802  * Lookup has side-effects.
803  *
804  * - It will create directories and fs attribute files in the shadow hierarchy.
805  * - It should cause non-SID devices to be probed (ask the parent nexi).
806  */
807 /*ARGSUSED3*/
808 static int
809 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
810     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
811     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
812 {
813 	ASSERT(dvp->v_type == VDIR);
814 	dcmn_err2(("devfs_lookup: %s\n", nm));
815 	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
816 }
817 
818 /*
819  * devfs nodes can't really be created directly by userland - however,
820  * we do allow creates to find existing nodes:
821  *
822  * - any create fails if the node doesn't exist - EROFS.
823  * - creating an existing directory read-only succeeds, otherwise EISDIR.
824  * - exclusive creates fail if the node already exists - EEXIST.
825  * - failure to create the snode for an existing device - ENOSYS.
826  */
827 /*ARGSUSED2*/
828 static int
829 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
830     int mode, struct vnode **vpp, struct cred *cred, int flag,
831     caller_context_t *ct, vsecattr_t *vsecp)
832 {
833 	int error;
834 	struct vnode *vp;
835 
836 	dcmn_err2(("devfs_create %s\n", nm));
837 	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
838 	if (error == 0) {
839 		if (excl == EXCL)
840 			error = EEXIST;
841 		else if (vp->v_type == VDIR && (mode & VWRITE))
842 			error = EISDIR;
843 		else
844 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
845 
846 		if (error) {
847 			VN_RELE(vp);
848 		} else
849 			*vpp = vp;
850 	} else if (error == ENOENT)
851 		error = EROFS;
852 
853 	return (error);
854 }
855 
856 /*
857  * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
858  * Otherwise, simply return cached dv_node's. Hotplug code always call
859  * devfs_clean() to invalid the dv_node cache.
860  */
861 /*ARGSUSED5*/
862 static int
863 devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
864     caller_context_t *ct, int flags)
865 {
866 	struct dv_node *ddv, *dv;
867 	struct dirent64 *de, *bufp;
868 	offset_t diroff;
869 	offset_t	soff;
870 	size_t reclen, movesz;
871 	int error;
872 	struct vattr va;
873 	size_t bufsz;
874 
875 	ddv = VTODV(dvp);
876 	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
877 	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
878 	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
879 	ASSERT(RW_READ_HELD(&ddv->dv_contents));
880 
881 	if (uiop->uio_loffset >= MAXOFF_T) {
882 		if (eofp)
883 			*eofp = 1;
884 		return (0);
885 	}
886 
887 	if (uiop->uio_iovcnt != 1)
888 		return (EINVAL);
889 
890 	if (dvp->v_type != VDIR)
891 		return (ENOTDIR);
892 
893 	/* Load the initial contents */
894 	if (ddv->dv_flags & DV_BUILD) {
895 		if (!rw_tryupgrade(&ddv->dv_contents)) {
896 			rw_exit(&ddv->dv_contents);
897 			rw_enter(&ddv->dv_contents, RW_WRITER);
898 		}
899 
900 		/* recheck and fill */
901 		if (ddv->dv_flags & DV_BUILD)
902 			dv_filldir(ddv);
903 
904 		rw_downgrade(&ddv->dv_contents);
905 	}
906 
907 	soff = uiop->uio_loffset;
908 	bufsz = uiop->uio_iov->iov_len;
909 	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
910 	movesz = 0;
911 	dv = (struct dv_node *)-1;
912 
913 	/*
914 	 * Move as many entries into the uio structure as it will take.
915 	 * Special case "." and "..".
916 	 */
917 	diroff = 0;
918 	if (soff == 0) {				/* . */
919 		reclen = DIRENT64_RECLEN(strlen("."));
920 		if ((movesz + reclen) > bufsz)
921 			goto full;
922 		de->d_ino = (ino64_t)ddv->dv_ino;
923 		de->d_off = (off64_t)diroff + 1;
924 		de->d_reclen = (ushort_t)reclen;
925 
926 		/* use strncpy(9f) to zero out uninitialized bytes */
927 
928 		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
929 		movesz += reclen;
930 		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
931 		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
932 		    "reclen %lu\n", diroff, soff, ".", reclen));
933 	}
934 
935 	diroff++;
936 	if (soff <= 1) {				/* .. */
937 		reclen = DIRENT64_RECLEN(strlen(".."));
938 		if ((movesz + reclen) > bufsz)
939 			goto full;
940 		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
941 		de->d_off = (off64_t)diroff + 1;
942 		de->d_reclen = (ushort_t)reclen;
943 
944 		/* use strncpy(9f) to zero out uninitialized bytes */
945 
946 		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
947 		movesz += reclen;
948 		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
949 		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
950 		    "reclen %lu\n", diroff, soff, "..", reclen));
951 	}
952 
953 	diroff++;
954 	for (dv = DV_FIRST_ENTRY(ddv); dv;
955 	    dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
956 		/* skip entries until at correct directory offset */
957 		if (diroff < soff)
958 			continue;
959 
960 		/*
961 		 * hidden nodes are skipped (but they still occupy a
962 		 * directory offset).
963 		 */
964 		if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi))
965 			continue;
966 
967 		/*
968 		 * DDM_INTERNAL_PATH minor nodes are skipped for readdirs
969 		 * outside the kernel (but they still occupy a directory
970 		 * offset).
971 		 */
972 		if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred))
973 			continue;
974 
975 		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
976 		if ((movesz + reclen) > bufsz) {
977 			dcmn_err3(("devfs_readdir: C: diroff "
978 			    "%lld, soff %lld: '%s' reclen %lu\n",
979 			    diroff, soff, dv->dv_name, reclen));
980 			goto full;
981 		}
982 		de->d_ino = (ino64_t)dv->dv_ino;
983 		de->d_off = (off64_t)diroff + 1;
984 		de->d_reclen = (ushort_t)reclen;
985 
986 		/* use strncpy(9f) to zero out uninitialized bytes */
987 
988 		ASSERT(strlen(dv->dv_name) + 1 <=
989 		    DIRENT64_NAMELEN(reclen));
990 		(void) strncpy(de->d_name, dv->dv_name,
991 		    DIRENT64_NAMELEN(reclen));
992 
993 		movesz += reclen;
994 		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
995 		dcmn_err4(("devfs_readdir: D: diroff "
996 		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
997 		    dv->dv_name, reclen));
998 	}
999 
1000 	/* the buffer is full, or we exhausted everything */
1001 full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
1002 	    "diroff %lld, soff %lld, dv %p\n",
1003 	    movesz, diroff, soff, (void *)dv));
1004 
1005 	if ((movesz == 0) && dv)
1006 		error = EINVAL;		/* cannot be represented */
1007 	else {
1008 		error = uiomove(bufp, movesz, UIO_READ, uiop);
1009 		if (error == 0) {
1010 			if (eofp)
1011 				*eofp = dv ? 0 : 1;
1012 			uiop->uio_loffset = diroff;
1013 		}
1014 
1015 		va.va_mask = AT_ATIME;
1016 		gethrestime(&va.va_atime);
1017 		rw_exit(&ddv->dv_contents);
1018 		(void) devfs_setattr(dvp, &va, 0, cred, ct);
1019 		rw_enter(&ddv->dv_contents, RW_READER);
1020 	}
1021 
1022 	kmem_free(bufp, bufsz);
1023 	return (error);
1024 }
1025 
1026 /*ARGSUSED*/
1027 static int
1028 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1029     caller_context_t *ct)
1030 {
1031 	/*
1032 	 * Message goes to console only. Otherwise, the message
1033 	 * causes devfs_fsync to be invoked again... infinite loop
1034 	 */
1035 	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1036 	return (0);
1037 }
1038 
1039 /*
1040  * Normally, we leave the dv_node here at count of 0.
1041  * The node will be destroyed when dv_cleandir() is called.
1042  *
1043  * Stale dv_node's are already unlinked from the fs tree,
1044  * so dv_cleandir() won't find them. We destroy such nodes
1045  * immediately.
1046  */
1047 /*ARGSUSED1*/
1048 static void
1049 devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1050 {
1051 	int destroy;
1052 	struct dv_node *dv = VTODV(vp);
1053 
1054 	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1055 	mutex_enter(&vp->v_lock);
1056 	ASSERT(vp->v_count >= 1);
1057 	VN_RELE_LOCKED(vp);
1058 	destroy = (DV_STALE(dv) && vp->v_count == 0);
1059 	mutex_exit(&vp->v_lock);
1060 
1061 	/* stale nodes cannot be rediscovered, destroy it here */
1062 	if (destroy)
1063 		dv_destroy(dv, 0);
1064 }
1065 
1066 /*
1067  * XXX Why do we need this?  NFS mounted /dev directories?
1068  * XXX Talk to peter staubach about this.
1069  */
1070 /*ARGSUSED2*/
1071 static int
1072 devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1073 {
1074 	struct dv_node	*dv = VTODV(vp);
1075 	struct dv_fid	*dv_fid;
1076 
1077 	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1078 		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1079 		return (ENOSPC);
1080 	}
1081 
1082 	dv_fid = (struct dv_fid *)fidp;
1083 	bzero(dv_fid, sizeof (struct dv_fid));
1084 	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1085 	dv_fid->dvfid_ino = dv->dv_ino;
1086 	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1087 
1088 	return (0);
1089 }
1090 
1091 /*
1092  * This pair of routines bracket all VOP_READ, VOP_WRITE
1093  * and VOP_READDIR requests.  The contents lock stops things
1094  * moving around while we're looking at them.
1095  *
1096  * Also used by file and record locking.
1097  */
1098 /*ARGSUSED2*/
1099 static int
1100 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1101 {
1102 	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1103 	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1104 	return (write_flag);
1105 }
1106 
1107 /*ARGSUSED1*/
1108 static void
1109 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1110 {
1111 	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1112 	rw_exit(&VTODV(vp)->dv_contents);
1113 }
1114 
1115 /*
1116  * XXX	Should probably do a better job of computing the maximum
1117  *	offset available in the directory.
1118  */
1119 /*ARGSUSED1*/
1120 static int
1121 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1122     caller_context_t *ct)
1123 {
1124 	ASSERT(vp->v_type == VDIR);
1125 	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1126 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1127 }
1128 
1129 vnodeops_t *dv_vnodeops;
1130 
1131 const fs_operation_def_t dv_vnodeops_template[] = {
1132 	VOPNAME_OPEN,		{ .vop_open = devfs_open },
1133 	VOPNAME_CLOSE,		{ .vop_close = devfs_close },
1134 	VOPNAME_READ,		{ .vop_read = devfs_read },
1135 	VOPNAME_WRITE,		{ .vop_write = devfs_write },
1136 	VOPNAME_IOCTL,		{ .vop_ioctl = devfs_ioctl },
1137 	VOPNAME_GETATTR,	{ .vop_getattr = devfs_getattr },
1138 	VOPNAME_SETATTR,	{ .vop_setattr = devfs_setattr },
1139 	VOPNAME_ACCESS,		{ .vop_access = devfs_access },
1140 	VOPNAME_LOOKUP,		{ .vop_lookup = devfs_lookup },
1141 	VOPNAME_CREATE,		{ .vop_create = devfs_create },
1142 	VOPNAME_READDIR,	{ .vop_readdir = devfs_readdir },
1143 	VOPNAME_FSYNC,		{ .vop_fsync = devfs_fsync },
1144 	VOPNAME_INACTIVE,	{ .vop_inactive = devfs_inactive },
1145 	VOPNAME_FID,		{ .vop_fid = devfs_fid },
1146 	VOPNAME_RWLOCK,		{ .vop_rwlock = devfs_rwlock },
1147 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = devfs_rwunlock },
1148 	VOPNAME_SEEK,		{ .vop_seek = devfs_seek },
1149 	VOPNAME_PATHCONF,	{ .vop_pathconf = devfs_pathconf },
1150 	VOPNAME_DISPOSE,	{ .error = fs_error },
1151 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = devfs_setsecattr },
1152 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = devfs_getsecattr },
1153 	NULL,			NULL
1154 };
1155