xref: /titanic_44/usr/src/uts/common/fs/devfs/devfs_vnops.c (revision 14ea4bb737263733ad80a36b4f73f681c30a6b45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * vnode ops for the devfs
30  *
31  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
32  * first because dv_find always performs leaf vnode substitution, returning
33  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
34  * means that the only leaf special file VOP operations that devfs will see
35  * after VOP_LOOKUP are the ones that specfs forwards.
36  */
37 
38 #include <sys/types.h>
39 #include <sys/param.h>
40 #include <sys/t_lock.h>
41 #include <sys/systm.h>
42 #include <sys/sysmacros.h>
43 #include <sys/user.h>
44 #include <sys/time.h>
45 #include <sys/vfs.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/fcntl.h>
49 #include <sys/flock.h>
50 #include <sys/kmem.h>
51 #include <sys/uio.h>
52 #include <sys/errno.h>
53 #include <sys/stat.h>
54 #include <sys/cred.h>
55 #include <sys/dirent.h>
56 #include <sys/pathname.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/policy.h>
60 #include <sys/modctl.h>
61 
62 #include <fs/fs_subr.h>
63 #include <sys/fs/dv_node.h>
64 
65 extern struct vattr	dv_vattr_dir, dv_vattr_file;
66 extern dev_t rconsdev;
67 
68 /*
69  * Open of devices (leaf nodes) is handled by specfs.
70  * There is nothing to do to open a directory
71  */
72 /*ARGSUSED*/
73 static int
74 devfs_open(struct vnode **vpp, int flag, struct cred *cred)
75 {
76 	struct dv_node	*dv = VTODV(*vpp);
77 
78 	dcmn_err2(("devfs_open %s\n", dv->dv_name));
79 	ASSERT((*vpp)->v_type == VDIR);
80 	return (0);
81 }
82 
83 /*
84  * Close of devices (leaf nodes) is handled by specfs.
85  * There is nothing much to do inorder to close a directory.
86  */
87 /*ARGSUSED1*/
88 static int
89 devfs_close(struct vnode *vp, int flag, int count,
90     offset_t offset, struct cred *cred)
91 {
92 	struct dv_node	*dv = VTODV(vp);
93 
94 	dcmn_err2(("devfs_close %s\n", dv->dv_name));
95 	ASSERT(vp->v_type == VDIR);
96 
97 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
98 	cleanshares(vp, ttoproc(curthread)->p_pid);
99 	return (0);
100 }
101 
102 /*
103  * Read of devices (leaf nodes) is handled by specfs.
104  * Read of directories is not supported.
105  */
106 /*ARGSUSED*/
107 static int
108 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
109 	struct caller_context *ct)
110 {
111 	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
112 	ASSERT(vp->v_type == VDIR);
113 	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
114 	return (EISDIR);
115 }
116 
117 /*
118  * Write of devices (leaf nodes) is handled by specfs.
119  * Write of directories is not supported.
120  */
121 /*ARGSUSED*/
122 static int
123 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
124 	struct caller_context *ct)
125 {
126 	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
127 	ASSERT(vp->v_type == VDIR);
128 	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
129 	return (EISDIR);
130 }
131 
132 /*
133  * Ioctls to device (leaf nodes) is handled by specfs.
134  * Ioctl to directories is not supported.
135  */
136 /*ARGSUSED*/
137 static int
138 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
139     struct cred *cred, int *rvalp)
140 {
141 	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
142 	ASSERT(vp->v_type == VDIR);
143 
144 	return (ENOTTY);	/* no ioctls supported */
145 }
146 
147 /*
148  * We can be asked directly about the attributes of directories, or
149  * (via sp->s_realvp) about the filesystem attributes of special files.
150  *
151  * For directories, we just believe the attribute store
152  * though we mangle the nodeid, fsid, and rdev to convince userland we
153  * really are a different filesystem.
154  *
155  * For special files, a little more fakery is required.
156  *
157  * If the attribute store is not there (read only root), we believe our
158  * memory based attributes.
159  */
160 static int
161 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr)
162 {
163 	struct dv_node	*dv = VTODV(vp);
164 	int		error = 0;
165 	uint_t		mask;
166 
167 	/*
168 	 * Message goes to console only. Otherwise, the message
169 	 * causes devfs_getattr to be invoked again... infinite loop
170 	 */
171 	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
172 	ASSERT(dv->dv_attr || dv->dv_attrvp);
173 
174 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
175 		cmn_err(CE_WARN,	/* panic ? */
176 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
177 		return (ENOENT);
178 	}
179 
180 	if (dv->dv_attr) {
181 		/*
182 		 * obtain from the memory version of attribute.
183 		 * preserve mask for those that optimize.
184 		 * devfs specific fields are already merged on creation.
185 		 */
186 		mask = vap->va_mask;
187 		*vap = *dv->dv_attr;
188 		vap->va_mask = mask;
189 	} else {
190 		/* obtain from attribute store and merge */
191 		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr);
192 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
193 		dv_vattr_merge(dv, vap);
194 	}
195 
196 	/*
197 	 * Restrict the permissions of the node fronting the console
198 	 * to 0600 with root as the owner.  This prevents a non-root
199 	 * user from gaining access to a serial terminal (like /dev/term/a)
200 	 * which is in reality serving as the console device (/dev/console).
201 	 */
202 	if (vp->v_rdev == rconsdev) {
203 		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
204 		vap->va_mode &= (~rconsmask);
205 		vap->va_uid = 0;
206 	}
207 
208 	return (error);
209 }
210 
211 static int devfs_unlocked_access(void *, int, struct cred *);
212 
213 /*ARGSUSED4*/
214 static int
215 devfs_setattr_dir(
216 	struct dv_node *dv,
217 	struct vnode *vp,
218 	struct vattr *vap,
219 	int flags,
220 	struct cred *cr)
221 {
222 	struct vattr	*map;
223 	long int	mask;
224 	int		error = 0;
225 	struct vattr	vattr;
226 
227 	ASSERT(dv->dv_attr || dv->dv_attrvp);
228 
229 	ASSERT(vp->v_type == VDIR);
230 	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);
231 
232 	if (vap->va_mask & AT_NOSET)
233 		return (EINVAL);
234 
235 	/* to ensure consistency, single thread setting of attributes */
236 	rw_enter(&dv->dv_contents, RW_WRITER);
237 
238 again:	if (dv->dv_attr) {
239 
240 		error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
241 					flags, devfs_unlocked_access, dv);
242 
243 		if (error)
244 			goto out;
245 
246 		/*
247 		 * Apply changes to the memory based attribute. This code
248 		 * is modeled after the tmpfs implementation of memory
249 		 * based vnodes
250 		 */
251 		map = dv->dv_attr;
252 		mask = vap->va_mask;
253 
254 		/* Change file access modes. */
255 		if (mask & AT_MODE) {
256 			map->va_mode &= S_IFMT;
257 			map->va_mode |= vap->va_mode & ~S_IFMT;
258 		}
259 		if (mask & AT_UID)
260 			map->va_uid = vap->va_uid;
261 		if (mask & AT_GID)
262 			map->va_gid = vap->va_gid;
263 		if (mask & AT_ATIME)
264 			map->va_atime = vap->va_atime;
265 		if (mask & AT_MTIME)
266 			map->va_mtime = vap->va_mtime;
267 
268 		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
269 			gethrestime(&map->va_ctime);
270 	} else {
271 		/* use the backing attribute store */
272 		ASSERT(dv->dv_attrvp);
273 
274 		/*
275 		 * See if we are changing something we care about
276 		 * the persistence of - return success if we don't care.
277 		 */
278 		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
279 			/* Set the attributes */
280 			error = VOP_SETATTR(dv->dv_attrvp,
281 				vap, flags, cr, NULL);
282 			dsysdebug(error,
283 				("vop_setattr %s %d\n", dv->dv_name, error));
284 
285 			/*
286 			 * Some file systems may return EROFS for a setattr
287 			 * on a readonly file system.  In this case we create
288 			 * our own memory based attribute.
289 			 */
290 			if (error == EROFS) {
291 				/*
292 				 * obtain attributes from existing file
293 				 * that we will modify and switch to memory
294 				 * based attribute until attribute store is
295 				 * read/write.
296 				 */
297 				vattr = dv_vattr_dir;
298 				if (VOP_GETATTR(dv->dv_attrvp, &vattr,
299 				    flags, cr) == 0) {
300 					dv->dv_attr = kmem_alloc(
301 					    sizeof (struct vattr), KM_SLEEP);
302 					*dv->dv_attr = vattr;
303 					dv_vattr_merge(dv, dv->dv_attr);
304 					goto again;
305 				}
306 			}
307 		}
308 	}
309 out:
310 	rw_exit(&dv->dv_contents);
311 	return (error);
312 }
313 
314 
315 /*
316  * Compare the uid/gid/mode changes requested for a setattr
317  * operation with the same details of a node's default minor
318  * perm information.  Return 0 if identical.
319  */
320 static int
321 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
322 {
323 	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
324 		return (1);
325 	if (map->va_uid != mp->mp_uid)
326 		return (1);
327 	if (map->va_gid != mp->mp_gid)
328 		return (1);
329 	return (0);
330 }
331 
332 
333 /*ARGSUSED4*/
334 static int
335 devfs_setattr(
336 	struct vnode *vp,
337 	struct vattr *vap,
338 	int flags,
339 	struct cred *cr,
340 	caller_context_t *ct)
341 {
342 	struct dv_node	*dv = VTODV(vp);
343 	struct dv_node	*ddv;
344 	struct vnode	*dvp;
345 	struct vattr	*map;
346 	long int	mask;
347 	int		error = 0;
348 	struct vattr	*free_vattr = NULL;
349 	struct vattr	*vattrp = NULL;
350 	mperm_t		mp;
351 	int		persist;
352 
353 	/*
354 	 * Message goes to console only. Otherwise, the message
355 	 * causes devfs_getattr to be invoked again... infinite loop
356 	 */
357 	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
358 	ASSERT(dv->dv_attr || dv->dv_attrvp);
359 
360 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
361 		cmn_err(CE_WARN,	/* panic ? */
362 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
363 		return (ENOENT);
364 	}
365 
366 	if (vap->va_mask & AT_NOSET)
367 		return (EINVAL);
368 
369 	/*
370 	 * If we are changing something we don't care about
371 	 * the persistence of, return success.
372 	 */
373 	if ((vap->va_mask &
374 	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
375 		return (0);
376 
377 	/*
378 	 * If driver overrides fs perm, disallow chmod
379 	 * and do not create attribute nodes.
380 	 */
381 	if (dv->dv_flags & DV_NO_FSPERM) {
382 		ASSERT(dv->dv_attr);
383 		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
384 			return (EPERM);
385 		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
386 			return (0);
387 		rw_enter(&dv->dv_contents, RW_WRITER);
388 		if (vap->va_mask & AT_ATIME)
389 			dv->dv_attr->va_atime = vap->va_atime;
390 		if (vap->va_mask & AT_MTIME)
391 			dv->dv_attr->va_mtime = vap->va_mtime;
392 		rw_exit(&dv->dv_contents);
393 		return (0);
394 	}
395 
396 	/*
397 	 * Directories are always created but device nodes are
398 	 * only used to persist non-default permissions.
399 	 */
400 	if (vp->v_type == VDIR) {
401 		ASSERT(dv->dv_attr || dv->dv_attrvp);
402 		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
403 	}
404 
405 	/*
406 	 * Allocate now before we take any locks
407 	 */
408 	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
409 
410 	/* to ensure consistency, single thread setting of attributes */
411 	rw_enter(&dv->dv_contents, RW_WRITER);
412 
413 	/*
414 	 * We don't need to create an attribute node
415 	 * to persist access or modification times.
416 	 */
417 	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
418 
419 	/*
420 	 * If persisting something, get the default permissions
421 	 * for this minor to compare against what the attributes
422 	 * are now being set to.  Default ordering is:
423 	 *	- minor_perm match for this minor
424 	 *	- mode supplied by ddi_create_priv_minor_node
425 	 *	- devfs defaults
426 	 */
427 	if (persist) {
428 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
429 			mp.mp_uid = dv_vattr_file.va_uid;
430 			mp.mp_gid = dv_vattr_file.va_gid;
431 			mp.mp_mode = dv_vattr_file.va_mode;
432 			if (dv->dv_flags & DV_DFLT_MODE) {
433 				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
434 				mp.mp_mode &= ~S_IAMB;
435 				mp.mp_mode |= dv->dv_dflt_mode;
436 				dcmn_err5(("%s: setattr priv default 0%o\n",
437 				    dv->dv_name, mp.mp_mode));
438 			} else {
439 				dcmn_err5(("%s: setattr devfs default 0%o\n",
440 				    dv->dv_name, mp.mp_mode));
441 			}
442 		} else {
443 			dcmn_err5(("%s: setattr minor perm default 0%o\n",
444 			    dv->dv_name, mp.mp_mode));
445 		}
446 	}
447 
448 	/*
449 	 * If we don't have a vattr for this node, construct one.
450 	 */
451 	if (dv->dv_attr) {
452 		free_vattr = vattrp;
453 		vattrp = NULL;
454 	} else {
455 		ASSERT(dv->dv_attrvp);
456 		ASSERT(vp->v_type != VDIR);
457 		*vattrp = dv_vattr_file;
458 		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr);
459 		dsysdebug(error, ("vop_getattr %s %d\n",
460 			dv->dv_name, error));
461 		if (error)
462 			goto out;
463 		dv->dv_attr = vattrp;
464 		dv_vattr_merge(dv, dv->dv_attr);
465 		vattrp = NULL;
466 	}
467 
468 	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
469 					flags, devfs_unlocked_access, dv);
470 	if (error) {
471 		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
472 			dv->dv_name, error));
473 		goto out;
474 	}
475 
476 	/*
477 	 * Apply changes to the memory based attribute. This code
478 	 * is modeled after the tmpfs implementation of memory
479 	 * based vnodes
480 	 */
481 	map = dv->dv_attr;
482 	mask = vap->va_mask;
483 
484 	/* Change file access modes. */
485 	if (mask & AT_MODE) {
486 		map->va_mode &= S_IFMT;
487 		map->va_mode |= vap->va_mode & ~S_IFMT;
488 	}
489 	if (mask & AT_UID)
490 		map->va_uid = vap->va_uid;
491 	if (mask & AT_GID)
492 		map->va_gid = vap->va_gid;
493 	if (mask & AT_ATIME)
494 		map->va_atime = vap->va_atime;
495 	if (mask & AT_MTIME)
496 		map->va_mtime = vap->va_mtime;
497 
498 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
499 		gethrestime(&map->va_ctime);
500 	}
501 
502 	/*
503 	 * A setattr to defaults means we no longer need the
504 	 * shadow node as a persistent store, unless there
505 	 * are ACLs.  Otherwise create a shadow node if one
506 	 * doesn't exist yet.
507 	 */
508 	if (persist) {
509 		if ((dv_setattr_cmp(map, &mp) == 0) &&
510 		    ((dv->dv_flags & DV_ACL) == 0)) {
511 
512 			if (dv->dv_attrvp) {
513 				ddv = dv->dv_dotdot;
514 				ASSERT(ddv->dv_attrvp);
515 				error = VOP_REMOVE(ddv->dv_attrvp,
516 				    dv->dv_name, cr);
517 				dsysdebug(error,
518 				    ("vop_remove %s %s %d\n",
519 				    ddv->dv_name, dv->dv_name, error));
520 
521 				if (error == EROFS)
522 					error = 0;
523 				VN_RELE(dv->dv_attrvp);
524 				dv->dv_attrvp = NULL;
525 			}
526 			ASSERT(dv->dv_attr);
527 		} else {
528 			if (mask & AT_MODE)
529 				dcmn_err5(("%s persisting mode 0%o\n",
530 					dv->dv_name, vap->va_mode));
531 			if (mask & AT_UID)
532 				dcmn_err5(("%s persisting uid %d\n",
533 					dv->dv_name, vap->va_uid));
534 			if (mask & AT_GID)
535 				dcmn_err5(("%s persisting gid %d\n",
536 					dv->dv_name, vap->va_gid));
537 
538 			if (dv->dv_attrvp == NULL) {
539 				dvp = DVTOV(dv->dv_dotdot);
540 				dv_shadow_node(dvp, dv->dv_name, vp,
541 				    NULL, NULLVP, cr,
542 				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
543 			}
544 			if (dv->dv_attrvp) {
545 				error = VOP_SETATTR(dv->dv_attrvp,
546 				    vap, flags, cr, NULL);
547 				dsysdebug(error, ("vop_setattr %s %d\n",
548 				    dv->dv_name, error));
549 			}
550 			/*
551 			 * Some file systems may return EROFS for a setattr
552 			 * on a readonly file system.  In this case save
553 			 * as our own memory based attribute.
554 			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
555 			 */
556 			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
557 				vattrp = dv->dv_attr;
558 				dv->dv_attr = NULL;
559 			} else if (error == EROFS)
560 				error = 0;
561 		}
562 	}
563 
564 out:
565 	rw_exit(&dv->dv_contents);
566 
567 	if (vattrp)
568 		kmem_free(vattrp, sizeof (*vattrp));
569 	if (free_vattr)
570 		kmem_free(free_vattr, sizeof (*free_vattr));
571 	return (error);
572 }
573 
574 static int
575 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
576 {
577 	switch (cmd) {
578 	case _PC_ACL_ENABLED:
579 		/*
580 		 * We rely on the underlying filesystem for ACLs,
581 		 * so direct the query for ACL support there.
582 		 * ACL support isn't relative to the file
583 		 * and we can't guarantee that the dv node
584 		 * has an attribute node, so any valid
585 		 * attribute node will suffice.
586 		 */
587 		ASSERT(dvroot);
588 		ASSERT(dvroot->dv_attrvp);
589 		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr));
590 		/*NOTREACHED*/
591 	}
592 
593 	return (fs_pathconf(vp, cmd, valp, cr));
594 }
595 
596 /*
597  * Let avp handle security attributes (acl's).
598  */
599 static int
600 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
601     struct cred *cr)
602 {
603 	dvnode_t *dv = VTODV(vp);
604 	struct vnode *avp;
605 	int	error;
606 
607 	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
608 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
609 
610 	rw_enter(&dv->dv_contents, RW_READER);
611 
612 	avp = dv->dv_attrvp;
613 
614 	/* fabricate the acl */
615 	if (avp == NULL) {
616 		error = fs_fab_acl(vp, vsap, flags, cr);
617 		rw_exit(&dv->dv_contents);
618 		return (error);
619 	}
620 
621 	error = VOP_GETSECATTR(avp, vsap, flags, cr);
622 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
623 	rw_exit(&dv->dv_contents);
624 	return (error);
625 }
626 
627 /*
628  * Set security attributes (acl's)
629  *
630  * Note that the dv_contents lock has already been acquired
631  * by the caller's VOP_RWLOCK.
632  */
633 static int
634 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
635     struct cred *cr)
636 {
637 	dvnode_t *dv = VTODV(vp);
638 	struct vnode *avp;
639 	int	error;
640 
641 	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
642 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
643 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
644 
645 	/*
646 	 * Not a supported operation on drivers not providing
647 	 * file system based permissions.
648 	 */
649 	if (dv->dv_flags & DV_NO_FSPERM)
650 		return (ENOTSUP);
651 
652 	/*
653 	 * To complete, the setsecattr requires an underlying attribute node.
654 	 */
655 	if (dv->dv_attrvp == NULL) {
656 		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
657 		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
658 		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
659 	}
660 
661 	if ((avp = dv->dv_attrvp) == NULL) {
662 		dcmn_err2(("devfs_setsecattr %s: "
663 		    "cannot construct attribute node\n", dv->dv_name));
664 		return (fs_nosys());
665 	}
666 
667 	/*
668 	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
669 	 * Since backing file systems expect the lock to be held before seeing
670 	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
671 	 * store before forwarding the ACL.
672 	 */
673 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
674 	error = VOP_SETSECATTR(avp, vsap, flags, cr);
675 	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
676 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
677 
678 	/*
679 	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
680 	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
681 	 * VOP_GETSECATTR calls.
682 	 */
683 	if (fs_acl_nontrivial(avp, cr))
684 		dv->dv_flags |= DV_ACL;
685 	return (error);
686 }
687 
688 /*
689  * This function is used for secpolicy_setattr().  It must call an
690  * access() like function while it is already holding the
691  * dv_contents lock.  We only care about this when dv_attr != NULL;
692  * so the unlocked access call only concerns itself with that
693  * particular branch of devfs_access().
694  */
695 static int
696 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
697 {
698 	struct dv_node *dv = vdv;
699 	int shift = 0;
700 	uid_t owner = dv->dv_attr->va_uid;
701 
702 	/* Check access based on owner, group and public permissions. */
703 	if (crgetuid(cr) != owner) {
704 		shift += 3;
705 		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
706 			shift += 3;
707 	}
708 
709 	/* compute missing mode bits */
710 	mode &= ~(dv->dv_attr->va_mode << shift);
711 
712 	if (mode == 0)
713 		return (0);
714 
715 	return (secpolicy_vnode_access(cr, DVTOV(dv), owner, mode));
716 }
717 
718 static int
719 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr)
720 {
721 	struct dv_node	*dv = VTODV(vp);
722 	int		res;
723 
724 	dcmn_err2(("devfs_access %s\n", dv->dv_name));
725 	ASSERT(dv->dv_attr || dv->dv_attrvp);
726 
727 	/* restrict console access to privileged processes */
728 	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
729 		return (EACCES);
730 	}
731 
732 	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
733 		rw_enter(&dv->dv_contents, RW_READER);
734 		if (dv->dv_attr) {
735 			res = devfs_unlocked_access(dv, mode, cr);
736 			rw_exit(&dv->dv_contents);
737 			return (res);
738 		}
739 		rw_exit(&dv->dv_contents);
740 	}
741 	return (VOP_ACCESS(dv->dv_attrvp, mode, flags, cr));
742 }
743 
744 /*
745  * Lookup
746  *
747  * Given the directory vnode and the name of the component, return
748  * the corresponding held vnode for that component.
749  *
750  * Of course in these fictional filesystems, nothing's ever quite
751  * -that- simple.
752  *
753  * devfs name	type		shadow (fs attributes)	type	comments
754  * -------------------------------------------------------------------------
755  * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
756  * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
757  * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
758  * -------------------------------------------------------------------------
759  *
760  * The following names are reserved for the attribute filesystem (which
761  * could easily be another layer on top of this one - we simply need to
762  * hold the vnode of the thing we're looking at)
763  *
764  * attr name	type		shadow (fs attributes)	type	comments
765  * -------------------------------------------------------------------------
766  * drv[@addr]	VDIR		-			-	attribute dir
767  * minorname	VDIR		-			-	minorname
768  * attribute	VREG		-			-	attribute
769  * -------------------------------------------------------------------------
770  *
771  * Examples:
772  *
773  *	devfs:/devices/.../mm@0:zero		VCHR
774  *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
775  *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
776  *
777  *	devfs:/devices/.../sd@0,0:a		VBLK
778  *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
779  *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
780  *
781  *	devfs:/devices/.../mm@0			VCHR
782  *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
783  *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
784  *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
785  *
786  *	devfs:/devices/.../obio			VDIR
787  *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
788  *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
789  *
790  * We also need to be able deal with "old" devices that have gone away,
791  * though I think that provided we return them with readdir, they can
792  * be removed (i.e. they don't have to respond to lookup, though it might
793  * be weird if they didn't ;-)
794  *
795  * Lookup has side-effects.
796  *
797  * - It will create directories and fs attribute files in the shadow hierarchy.
798  * - It should cause non-SID devices to be probed (ask the parent nexi).
799  */
800 /*ARGSUSED3*/
801 static int
802 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
803     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred)
804 {
805 	ASSERT(dvp->v_type == VDIR);
806 	dcmn_err2(("devfs_lookup: %s\n", nm));
807 	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
808 }
809 
810 /*
811  * devfs nodes can't really be created directly by userland - however,
812  * we do allow creates to find existing nodes:
813  *
814  * - any create fails if the node doesn't exist - EROFS.
815  * - creating an existing directory read-only succeeds, otherwise EISDIR.
816  * - exclusive creates fail if the node already exists - EEXIST.
817  * - failure to create the snode for an existing device - ENOSYS.
818  */
819 /*ARGSUSED2*/
820 static int
821 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
822     int mode, struct vnode **vpp, struct cred *cred, int flag)
823 {
824 	int error;
825 	struct vnode *vp;
826 
827 	dcmn_err2(("devfs_create %s\n", nm));
828 	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
829 	if (error == 0) {
830 		if (excl == EXCL)
831 			error = EEXIST;
832 		else if (vp->v_type == VDIR && (mode & VWRITE))
833 			error = EISDIR;
834 		else
835 			error = VOP_ACCESS(vp, mode, 0, cred);
836 
837 		if (error) {
838 			VN_RELE(vp);
839 		} else
840 			*vpp = vp;
841 	} else if (error == ENOENT)
842 		error = EROFS;
843 
844 	return (error);
845 }
846 
847 /*
848  * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
849  * Otherwise, simply return cached dv_node's. Hotplug code always call
850  * devfs_clean() to invalid the dv_node cache.
851  */
852 static int
853 devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp)
854 {
855 	struct dv_node *ddv, *dv;
856 	struct dirent64 *de, *bufp;
857 	offset_t diroff;
858 	offset_t	soff;
859 	size_t reclen, movesz;
860 	int error;
861 	struct vattr va;
862 	size_t bufsz;
863 
864 	ddv = VTODV(dvp);
865 	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
866 	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
867 	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
868 	ASSERT(RW_READ_HELD(&ddv->dv_contents));
869 
870 	if (uiop->uio_loffset >= MAXOFF_T) {
871 		if (eofp)
872 			*eofp = 1;
873 		return (0);
874 	}
875 
876 	if (uiop->uio_iovcnt != 1)
877 		return (EINVAL);
878 
879 	if (dvp->v_type != VDIR)
880 		return (ENOTDIR);
881 
882 	/* Load the initial contents */
883 	if (ddv->dv_flags & DV_BUILD) {
884 		if (!rw_tryupgrade(&ddv->dv_contents)) {
885 			rw_exit(&ddv->dv_contents);
886 			rw_enter(&ddv->dv_contents, RW_WRITER);
887 		}
888 
889 		/* recheck and fill */
890 		if (ddv->dv_flags & DV_BUILD)
891 			dv_filldir(ddv);
892 
893 		rw_downgrade(&ddv->dv_contents);
894 	}
895 
896 	soff = uiop->uio_loffset;
897 	bufsz = uiop->uio_iov->iov_len;
898 	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
899 	movesz = 0;
900 	dv = (struct dv_node *)-1;
901 
902 	/*
903 	 * Move as many entries into the uio structure as it will take.
904 	 * Special case "." and "..".
905 	 */
906 	diroff = 0;
907 	if (soff == 0) {				/* . */
908 		reclen = DIRENT64_RECLEN(strlen("."));
909 		if ((movesz + reclen) > bufsz)
910 			goto full;
911 		de->d_ino = (ino64_t)ddv->dv_ino;
912 		de->d_off = (off64_t)diroff + 1;
913 		de->d_reclen = (ushort_t)reclen;
914 
915 		/* use strncpy(9f) to zero out uninitialized bytes */
916 
917 		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
918 		movesz += reclen;
919 		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
920 		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
921 		    "reclen %lu\n", diroff, soff, ".", reclen));
922 	}
923 
924 	diroff++;
925 	if (soff <= 1) {				/* .. */
926 		reclen = DIRENT64_RECLEN(strlen(".."));
927 		if ((movesz + reclen) > bufsz)
928 			goto full;
929 		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
930 		de->d_off = (off64_t)diroff + 1;
931 		de->d_reclen = (ushort_t)reclen;
932 
933 		/* use strncpy(9f) to zero out uninitialized bytes */
934 
935 		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
936 		movesz += reclen;
937 		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
938 		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
939 		    "reclen %lu\n", diroff, soff, "..", reclen));
940 	}
941 
942 	diroff++;
943 	for (dv = ddv->dv_dot; dv; dv = dv->dv_next, diroff++) {
944 		/*
945 		 * although DDM_INTERNAL_PATH minor nodes are skipped for
946 		 * readdirs outside the kernel, they still occupy directory
947 		 * offsets
948 		 */
949 		if (diroff < soff ||
950 		    ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)))
951 			continue;
952 
953 		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
954 		if ((movesz + reclen) > bufsz) {
955 			dcmn_err3(("devfs_readdir: C: diroff "
956 			    "%lld, soff %lld: '%s' reclen %lu\n",
957 			    diroff, soff, dv->dv_name, reclen));
958 			goto full;
959 		}
960 		de->d_ino = (ino64_t)dv->dv_ino;
961 		de->d_off = (off64_t)diroff + 1;
962 		de->d_reclen = (ushort_t)reclen;
963 
964 		/* use strncpy(9f) to zero out uninitialized bytes */
965 
966 		ASSERT(strlen(dv->dv_name) + 1 <=
967 		    DIRENT64_NAMELEN(reclen));
968 		(void) strncpy(de->d_name, dv->dv_name,
969 		    DIRENT64_NAMELEN(reclen));
970 
971 		movesz += reclen;
972 		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
973 		dcmn_err4(("devfs_readdir: D: diroff "
974 		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
975 		    dv->dv_name, reclen));
976 	}
977 
978 	/* the buffer is full, or we exhausted everything */
979 full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
980 	    "diroff %lld, soff %lld, dv %p\n",
981 	    movesz, diroff, soff, (void *)dv));
982 
983 	if ((movesz == 0) && dv)
984 		error = EINVAL;		/* cannot be represented */
985 	else {
986 		error = uiomove(bufp, movesz, UIO_READ, uiop);
987 		if (error == 0) {
988 			if (eofp)
989 				*eofp = dv ? 0 : 1;
990 			uiop->uio_loffset = diroff;
991 		}
992 
993 		va.va_mask = AT_ATIME;
994 		gethrestime(&va.va_atime);
995 		rw_exit(&ddv->dv_contents);
996 		(void) devfs_setattr(dvp, &va, 0, cred, NULL);
997 		rw_enter(&ddv->dv_contents, RW_READER);
998 	}
999 
1000 	kmem_free(bufp, bufsz);
1001 	return (error);
1002 }
1003 
1004 /*ARGSUSED*/
1005 static int
1006 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred)
1007 {
1008 	/*
1009 	 * Message goes to console only. Otherwise, the message
1010 	 * causes devfs_fsync to be invoked again... infinite loop
1011 	 */
1012 	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1013 	return (0);
1014 }
1015 
1016 /*
1017  * Normally, we leave the dv_node here at count of 0.
1018  * The node will be destroyed when dv_cleandir() is called.
1019  *
1020  * Stale dv_node's are already unlinked from the fs tree,
1021  * so dv_cleandir() won't find them. We destroy such nodes
1022  * immediately.
1023  */
1024 /*ARGSUSED1*/
1025 static void
1026 devfs_inactive(struct vnode *vp, struct cred *cred)
1027 {
1028 	int destroy;
1029 	struct dv_node *dv = VTODV(vp);
1030 
1031 	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1032 	mutex_enter(&vp->v_lock);
1033 	ASSERT(vp->v_count >= 1);
1034 	--vp->v_count;
1035 	destroy = (DV_STALE(dv) && vp->v_count == 0);
1036 	mutex_exit(&vp->v_lock);
1037 
1038 	/* stale nodes cannot be rediscovered, destroy it here */
1039 	if (destroy)
1040 		dv_destroy(dv, 0);
1041 }
1042 
1043 /*
1044  * XXX Why do we need this?  NFS mounted /dev directories?
1045  * XXX Talk to peter staubach about this.
1046  */
1047 static int
1048 devfs_fid(struct vnode *vp, struct fid *fidp)
1049 {
1050 	struct dv_node	*dv = VTODV(vp);
1051 	struct dv_fid	*dv_fid;
1052 
1053 	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1054 		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1055 		return (ENOSPC);
1056 	}
1057 
1058 	dv_fid = (struct dv_fid *)fidp;
1059 	bzero(dv_fid, sizeof (struct dv_fid));
1060 	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1061 	dv_fid->dvfid_ino = dv->dv_ino;
1062 	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1063 
1064 	return (0);
1065 }
1066 
1067 /*
1068  * This pair of routines bracket all VOP_READ, VOP_WRITE
1069  * and VOP_READDIR requests.  The contents lock stops things
1070  * moving around while we're looking at them.
1071  *
1072  * Also used by file and record locking.
1073  */
1074 /*ARGSUSED2*/
1075 static int
1076 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1077 {
1078 	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1079 	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1080 	return (write_flag);
1081 }
1082 
1083 /*ARGSUSED1*/
1084 static void
1085 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1086 {
1087 	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1088 	rw_exit(&VTODV(vp)->dv_contents);
1089 }
1090 
1091 /*
1092  * XXX	Should probably do a better job of computing the maximum
1093  *	offset available in the directory.
1094  */
1095 /*ARGSUSED1*/
1096 static int
1097 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
1098 {
1099 	ASSERT(vp->v_type == VDIR);
1100 	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1101 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1102 }
1103 
1104 vnodeops_t *dv_vnodeops;
1105 
1106 const fs_operation_def_t dv_vnodeops_template[] = {
1107 	VOPNAME_OPEN, devfs_open,
1108 	VOPNAME_CLOSE, devfs_close,
1109 	VOPNAME_READ, devfs_read,
1110 	VOPNAME_WRITE, devfs_write,
1111 	VOPNAME_IOCTL, devfs_ioctl,
1112 	VOPNAME_GETATTR, devfs_getattr,
1113 	VOPNAME_SETATTR, devfs_setattr,
1114 	VOPNAME_ACCESS, devfs_access,
1115 	VOPNAME_LOOKUP, devfs_lookup,
1116 	VOPNAME_CREATE, devfs_create,
1117 	VOPNAME_READDIR, devfs_readdir,
1118 	VOPNAME_FSYNC, devfs_fsync,
1119 	VOPNAME_INACTIVE, (fs_generic_func_p) devfs_inactive,
1120 	VOPNAME_FID, devfs_fid,
1121 	VOPNAME_RWLOCK, devfs_rwlock,
1122 	VOPNAME_RWUNLOCK, (fs_generic_func_p) devfs_rwunlock,
1123 	VOPNAME_SEEK, devfs_seek,
1124 	VOPNAME_PATHCONF, devfs_pathconf,
1125 	VOPNAME_DISPOSE, fs_error,
1126 	VOPNAME_SETSECATTR, devfs_setsecattr,
1127 	VOPNAME_GETSECATTR, devfs_getsecattr,
1128 	NULL, NULL
1129 };
1130