xref: /titanic_44/usr/src/uts/common/fs/devfs/devfs_vnops.c (revision 87308b749adb04ab3f564c765b089962af3edb4d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * vnode ops for the devfs
27  *
28  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
29  * first because dv_find always performs leaf vnode substitution, returning
30  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
31  * means that the only leaf special file VOP operations that devfs will see
32  * after VOP_LOOKUP are the ones that specfs forwards.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/t_lock.h>
38 #include <sys/systm.h>
39 #include <sys/sysmacros.h>
40 #include <sys/user.h>
41 #include <sys/time.h>
42 #include <sys/vfs.h>
43 #include <sys/vnode.h>
44 #include <sys/vfs_opreg.h>
45 #include <sys/file.h>
46 #include <sys/fcntl.h>
47 #include <sys/flock.h>
48 #include <sys/kmem.h>
49 #include <sys/uio.h>
50 #include <sys/errno.h>
51 #include <sys/stat.h>
52 #include <sys/cred.h>
53 #include <sys/dirent.h>
54 #include <sys/pathname.h>
55 #include <sys/cmn_err.h>
56 #include <sys/debug.h>
57 #include <sys/policy.h>
58 #include <sys/modctl.h>
59 #include <sys/sunndi.h>
60 #include <fs/fs_subr.h>
61 #include <sys/fs/dv_node.h>
62 
/*
 * Template attributes defined elsewhere in devfs.  In this file they are
 * copied into a vattr before the backing attribute store is queried
 * (dv_vattr_dir for directories, dv_vattr_file for device nodes).
 */
extern struct vattr	dv_vattr_dir, dv_vattr_file;
/* Compared against v_rdev below to recognize the node fronting the console. */
extern dev_t rconsdev;
65 
66 /*
67  * Open of devices (leaf nodes) is handled by specfs.
68  * There is nothing to do to open a directory
69  */
70 /*ARGSUSED*/
71 static int
72 devfs_open(struct vnode **vpp, int flag, struct cred *cred,
73     caller_context_t *ct)
74 {
75 	struct dv_node	*dv = VTODV(*vpp);
76 
77 	dcmn_err2(("devfs_open %s\n", dv->dv_name));
78 	ASSERT((*vpp)->v_type == VDIR);
79 	return (0);
80 }
81 
82 /*
83  * Close of devices (leaf nodes) is handled by specfs.
84  * There is nothing much to do inorder to close a directory.
85  */
86 /*ARGSUSED1*/
87 static int
88 devfs_close(struct vnode *vp, int flag, int count,
89     offset_t offset, struct cred *cred, caller_context_t *ct)
90 {
91 	struct dv_node	*dv = VTODV(vp);
92 
93 	dcmn_err2(("devfs_close %s\n", dv->dv_name));
94 	ASSERT(vp->v_type == VDIR);
95 
96 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
97 	cleanshares(vp, ttoproc(curthread)->p_pid);
98 	return (0);
99 }
100 
101 /*
102  * Read of devices (leaf nodes) is handled by specfs.
103  * Read of directories is not supported.
104  */
105 /*ARGSUSED*/
106 static int
107 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
108 	struct caller_context *ct)
109 {
110 	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
111 	ASSERT(vp->v_type == VDIR);
112 	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
113 	return (EISDIR);
114 }
115 
116 /*
117  * Write of devices (leaf nodes) is handled by specfs.
118  * Write of directories is not supported.
119  */
120 /*ARGSUSED*/
121 static int
122 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
123 	struct caller_context *ct)
124 {
125 	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
126 	ASSERT(vp->v_type == VDIR);
127 	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
128 	return (EISDIR);
129 }
130 
131 /*
132  * Ioctls to device (leaf nodes) is handled by specfs.
133  * Ioctl to directories is not supported.
134  */
135 /*ARGSUSED*/
136 static int
137 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
138     struct cred *cred, int *rvalp, caller_context_t *ct)
139 {
140 	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
141 	ASSERT(vp->v_type == VDIR);
142 
143 	return (ENOTTY);	/* no ioctls supported */
144 }
145 
146 /*
147  * We can be asked directly about the attributes of directories, or
148  * (via sp->s_realvp) about the filesystem attributes of special files.
149  *
150  * For directories, we just believe the attribute store
151  * though we mangle the nodeid, fsid, and rdev to convince userland we
152  * really are a different filesystem.
153  *
154  * For special files, a little more fakery is required.
155  *
156  * If the attribute store is not there (read only root), we believe our
157  * memory based attributes.
158  */
159 static int
160 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
161     caller_context_t *ct)
162 {
163 	struct dv_node	*dv = VTODV(vp);
164 	int		error = 0;
165 	uint_t		mask;
166 
167 	/*
168 	 * Message goes to console only. Otherwise, the message
169 	 * causes devfs_getattr to be invoked again... infinite loop
170 	 */
171 	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
172 	ASSERT(dv->dv_attr || dv->dv_attrvp);
173 
174 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
175 		cmn_err(CE_WARN,	/* panic ? */
176 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
177 		return (ENOENT);
178 	}
179 
180 	rw_enter(&dv->dv_contents, RW_READER);
181 	if (dv->dv_attr) {
182 		/*
183 		 * obtain from the memory version of attribute.
184 		 * preserve mask for those that optimize.
185 		 * devfs specific fields are already merged on creation.
186 		 */
187 		mask = vap->va_mask;
188 		*vap = *dv->dv_attr;
189 		vap->va_mask = mask;
190 	} else {
191 		/* obtain from attribute store and merge */
192 		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
193 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
194 		dv_vattr_merge(dv, vap);
195 	}
196 	rw_exit(&dv->dv_contents);
197 
198 	/*
199 	 * Restrict the permissions of the node fronting the console
200 	 * to 0600 with root as the owner.  This prevents a non-root
201 	 * user from gaining access to a serial terminal (like /dev/term/a)
202 	 * which is in reality serving as the console device (/dev/console).
203 	 */
204 	if (vp->v_rdev == rconsdev) {
205 		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
206 		vap->va_mode &= (~rconsmask);
207 		vap->va_uid = 0;
208 	}
209 
210 	return (error);
211 }
212 
213 static int devfs_unlocked_access(void *, int, struct cred *);
214 
215 /*ARGSUSED4*/
216 static int
217 devfs_setattr_dir(
218 	struct dv_node *dv,
219 	struct vnode *vp,
220 	struct vattr *vap,
221 	int flags,
222 	struct cred *cr)
223 {
224 	struct vattr	*map;
225 	uint_t		mask;
226 	int		error = 0;
227 	struct vattr	vattr;
228 
229 	ASSERT(dv->dv_attr || dv->dv_attrvp);
230 
231 	ASSERT(vp->v_type == VDIR);
232 	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);
233 
234 	if (vap->va_mask & AT_NOSET)
235 		return (EINVAL);
236 
237 	/* to ensure consistency, single thread setting of attributes */
238 	rw_enter(&dv->dv_contents, RW_WRITER);
239 
240 again:	if (dv->dv_attr) {
241 
242 		error = secpolicy_vnode_setattr(cr, vp, vap,
243 		    dv->dv_attr, flags, devfs_unlocked_access, dv);
244 
245 		if (error)
246 			goto out;
247 
248 		/*
249 		 * Apply changes to the memory based attribute. This code
250 		 * is modeled after the tmpfs implementation of memory
251 		 * based vnodes
252 		 */
253 		map = dv->dv_attr;
254 		mask = vap->va_mask;
255 
256 		/* Change file access modes. */
257 		if (mask & AT_MODE) {
258 			map->va_mode &= S_IFMT;
259 			map->va_mode |= vap->va_mode & ~S_IFMT;
260 		}
261 		if (mask & AT_UID)
262 			map->va_uid = vap->va_uid;
263 		if (mask & AT_GID)
264 			map->va_gid = vap->va_gid;
265 		if (mask & AT_ATIME)
266 			map->va_atime = vap->va_atime;
267 		if (mask & AT_MTIME)
268 			map->va_mtime = vap->va_mtime;
269 
270 		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
271 			gethrestime(&map->va_ctime);
272 	} else {
273 		/* use the backing attribute store */
274 		ASSERT(dv->dv_attrvp);
275 
276 		/*
277 		 * See if we are changing something we care about
278 		 * the persistence of - return success if we don't care.
279 		 */
280 		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
281 			/* Set the attributes */
282 			error = VOP_SETATTR(dv->dv_attrvp,
283 			    vap, flags, cr, NULL);
284 			dsysdebug(error,
285 			    ("vop_setattr %s %d\n", dv->dv_name, error));
286 
287 			/*
288 			 * Some file systems may return EROFS for a setattr
289 			 * on a readonly file system.  In this case we create
290 			 * our own memory based attribute.
291 			 */
292 			if (error == EROFS) {
293 				/*
294 				 * obtain attributes from existing file
295 				 * that we will modify and switch to memory
296 				 * based attribute until attribute store is
297 				 * read/write.
298 				 */
299 				vattr = dv_vattr_dir;
300 				if (VOP_GETATTR(dv->dv_attrvp,
301 				    &vattr, flags, cr, NULL) == 0) {
302 					dv->dv_attr = kmem_alloc(
303 					    sizeof (struct vattr), KM_SLEEP);
304 					*dv->dv_attr = vattr;
305 					dv_vattr_merge(dv, dv->dv_attr);
306 					goto again;
307 				}
308 			}
309 		}
310 	}
311 out:
312 	rw_exit(&dv->dv_contents);
313 	return (error);
314 }
315 
316 
317 /*
318  * Compare the uid/gid/mode changes requested for a setattr
319  * operation with the same details of a node's default minor
320  * perm information.  Return 0 if identical.
321  */
322 static int
323 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
324 {
325 	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
326 		return (1);
327 	if (map->va_uid != mp->mp_uid)
328 		return (1);
329 	if (map->va_gid != mp->mp_gid)
330 		return (1);
331 	return (0);
332 }
333 
334 
335 /*ARGSUSED4*/
336 static int
337 devfs_setattr(
338 	struct vnode *vp,
339 	struct vattr *vap,
340 	int flags,
341 	struct cred *cr,
342 	caller_context_t *ct)
343 {
344 	struct dv_node	*dv = VTODV(vp);
345 	struct dv_node	*ddv;
346 	struct vnode	*dvp;
347 	struct vattr	*map;
348 	uint_t		mask;
349 	int		error = 0;
350 	struct vattr	*free_vattr = NULL;
351 	struct vattr	*vattrp = NULL;
352 	mperm_t		mp;
353 	int		persist;
354 
355 	/*
356 	 * Message goes to console only. Otherwise, the message
357 	 * causes devfs_getattr to be invoked again... infinite loop
358 	 */
359 	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
360 	ASSERT(dv->dv_attr || dv->dv_attrvp);
361 
362 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
363 		cmn_err(CE_WARN,	/* panic ? */
364 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
365 		return (ENOENT);
366 	}
367 
368 	if (vap->va_mask & AT_NOSET)
369 		return (EINVAL);
370 
371 	/*
372 	 * If we are changing something we don't care about
373 	 * the persistence of, return success.
374 	 */
375 	if ((vap->va_mask &
376 	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
377 		return (0);
378 
379 	/*
380 	 * If driver overrides fs perm, disallow chmod
381 	 * and do not create attribute nodes.
382 	 */
383 	if (dv->dv_flags & DV_NO_FSPERM) {
384 		ASSERT(dv->dv_attr);
385 		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
386 			return (EPERM);
387 		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
388 			return (0);
389 		rw_enter(&dv->dv_contents, RW_WRITER);
390 		if (vap->va_mask & AT_ATIME)
391 			dv->dv_attr->va_atime = vap->va_atime;
392 		if (vap->va_mask & AT_MTIME)
393 			dv->dv_attr->va_mtime = vap->va_mtime;
394 		rw_exit(&dv->dv_contents);
395 		return (0);
396 	}
397 
398 	/*
399 	 * Directories are always created but device nodes are
400 	 * only used to persist non-default permissions.
401 	 */
402 	if (vp->v_type == VDIR) {
403 		ASSERT(dv->dv_attr || dv->dv_attrvp);
404 		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
405 	}
406 
407 	/*
408 	 * Allocate now before we take any locks
409 	 */
410 	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
411 
412 	/* to ensure consistency, single thread setting of attributes */
413 	rw_enter(&dv->dv_contents, RW_WRITER);
414 
415 	/*
416 	 * We don't need to create an attribute node
417 	 * to persist access or modification times.
418 	 */
419 	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
420 
421 	/*
422 	 * If persisting something, get the default permissions
423 	 * for this minor to compare against what the attributes
424 	 * are now being set to.  Default ordering is:
425 	 *	- minor_perm match for this minor
426 	 *	- mode supplied by ddi_create_priv_minor_node
427 	 *	- devfs defaults
428 	 */
429 	if (persist) {
430 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
431 			mp.mp_uid = dv_vattr_file.va_uid;
432 			mp.mp_gid = dv_vattr_file.va_gid;
433 			mp.mp_mode = dv_vattr_file.va_mode;
434 			if (dv->dv_flags & DV_DFLT_MODE) {
435 				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
436 				mp.mp_mode &= ~S_IAMB;
437 				mp.mp_mode |= dv->dv_dflt_mode;
438 				dcmn_err5(("%s: setattr priv default 0%o\n",
439 				    dv->dv_name, mp.mp_mode));
440 			} else {
441 				dcmn_err5(("%s: setattr devfs default 0%o\n",
442 				    dv->dv_name, mp.mp_mode));
443 			}
444 		} else {
445 			dcmn_err5(("%s: setattr minor perm default 0%o\n",
446 			    dv->dv_name, mp.mp_mode));
447 		}
448 	}
449 
450 	/*
451 	 * If we don't have a vattr for this node, construct one.
452 	 */
453 	if (dv->dv_attr) {
454 		free_vattr = vattrp;
455 		vattrp = NULL;
456 	} else {
457 		ASSERT(dv->dv_attrvp);
458 		ASSERT(vp->v_type != VDIR);
459 		*vattrp = dv_vattr_file;
460 		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
461 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
462 		if (error)
463 			goto out;
464 		dv->dv_attr = vattrp;
465 		dv_vattr_merge(dv, dv->dv_attr);
466 		vattrp = NULL;
467 	}
468 
469 	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
470 	    flags, devfs_unlocked_access, dv);
471 	if (error) {
472 		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
473 		    dv->dv_name, error));
474 		goto out;
475 	}
476 
477 	/*
478 	 * Apply changes to the memory based attribute. This code
479 	 * is modeled after the tmpfs implementation of memory
480 	 * based vnodes
481 	 */
482 	map = dv->dv_attr;
483 	mask = vap->va_mask;
484 
485 	/* Change file access modes. */
486 	if (mask & AT_MODE) {
487 		map->va_mode &= S_IFMT;
488 		map->va_mode |= vap->va_mode & ~S_IFMT;
489 	}
490 	if (mask & AT_UID)
491 		map->va_uid = vap->va_uid;
492 	if (mask & AT_GID)
493 		map->va_gid = vap->va_gid;
494 	if (mask & AT_ATIME)
495 		map->va_atime = vap->va_atime;
496 	if (mask & AT_MTIME)
497 		map->va_mtime = vap->va_mtime;
498 
499 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
500 		gethrestime(&map->va_ctime);
501 	}
502 
503 	/*
504 	 * A setattr to defaults means we no longer need the
505 	 * shadow node as a persistent store, unless there
506 	 * are ACLs.  Otherwise create a shadow node if one
507 	 * doesn't exist yet.
508 	 */
509 	if (persist) {
510 		if ((dv_setattr_cmp(map, &mp) == 0) &&
511 		    ((dv->dv_flags & DV_ACL) == 0)) {
512 
513 			if (dv->dv_attrvp) {
514 				ddv = dv->dv_dotdot;
515 				ASSERT(ddv->dv_attrvp);
516 				error = VOP_REMOVE(ddv->dv_attrvp,
517 				    dv->dv_name, cr, ct, 0);
518 				dsysdebug(error,
519 				    ("vop_remove %s %s %d\n",
520 				    ddv->dv_name, dv->dv_name, error));
521 
522 				if (error == EROFS)
523 					error = 0;
524 				VN_RELE(dv->dv_attrvp);
525 				dv->dv_attrvp = NULL;
526 			}
527 			ASSERT(dv->dv_attr);
528 		} else {
529 			if (mask & AT_MODE)
530 				dcmn_err5(("%s persisting mode 0%o\n",
531 				    dv->dv_name, vap->va_mode));
532 			if (mask & AT_UID)
533 				dcmn_err5(("%s persisting uid %d\n",
534 				    dv->dv_name, vap->va_uid));
535 			if (mask & AT_GID)
536 				dcmn_err5(("%s persisting gid %d\n",
537 				    dv->dv_name, vap->va_gid));
538 
539 			if (dv->dv_attrvp == NULL) {
540 				dvp = DVTOV(dv->dv_dotdot);
541 				dv_shadow_node(dvp, dv->dv_name, vp,
542 				    NULL, NULLVP, cr,
543 				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
544 			}
545 			if (dv->dv_attrvp) {
546 				/* If map still valid do TIME for free. */
547 				if (dv->dv_attr == map) {
548 					mask = map->va_mask;
549 					map->va_mask =
550 					    vap->va_mask | AT_ATIME | AT_MTIME;
551 					error = VOP_SETATTR(dv->dv_attrvp, map,
552 					    flags, cr, NULL);
553 					map->va_mask = mask;
554 				} else {
555 					error = VOP_SETATTR(dv->dv_attrvp,
556 					    vap, flags, cr, NULL);
557 				}
558 				dsysdebug(error, ("vop_setattr %s %d\n",
559 				    dv->dv_name, error));
560 			}
561 			/*
562 			 * Some file systems may return EROFS for a setattr
563 			 * on a readonly file system.  In this case save
564 			 * as our own memory based attribute.
565 			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
566 			 */
567 			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
568 				vattrp = dv->dv_attr;
569 				dv->dv_attr = NULL;
570 			} else if (error == EROFS)
571 				error = 0;
572 		}
573 	}
574 
575 out:
576 	rw_exit(&dv->dv_contents);
577 
578 	if (vattrp)
579 		kmem_free(vattrp, sizeof (*vattrp));
580 	if (free_vattr)
581 		kmem_free(free_vattr, sizeof (*free_vattr));
582 	return (error);
583 }
584 
585 static int
586 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
587     caller_context_t *ct)
588 {
589 	switch (cmd) {
590 	case _PC_ACL_ENABLED:
591 		/*
592 		 * We rely on the underlying filesystem for ACLs,
593 		 * so direct the query for ACL support there.
594 		 * ACL support isn't relative to the file
595 		 * and we can't guarantee that the dv node
596 		 * has an attribute node, so any valid
597 		 * attribute node will suffice.
598 		 */
599 		ASSERT(dvroot);
600 		ASSERT(dvroot->dv_attrvp);
601 		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
602 		/*NOTREACHED*/
603 	}
604 
605 	return (fs_pathconf(vp, cmd, valp, cr, ct));
606 }
607 
608 /*
609  * Let avp handle security attributes (acl's).
610  */
611 static int
612 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
613     struct cred *cr, caller_context_t *ct)
614 {
615 	dvnode_t *dv = VTODV(vp);
616 	struct vnode *avp;
617 	int	error;
618 
619 	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
620 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
621 
622 	rw_enter(&dv->dv_contents, RW_READER);
623 
624 	avp = dv->dv_attrvp;
625 
626 	/* fabricate the acl */
627 	if (avp == NULL) {
628 		error = fs_fab_acl(vp, vsap, flags, cr, ct);
629 		rw_exit(&dv->dv_contents);
630 		return (error);
631 	}
632 
633 	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
634 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
635 	rw_exit(&dv->dv_contents);
636 	return (error);
637 }
638 
639 /*
640  * Set security attributes (acl's)
641  *
642  * Note that the dv_contents lock has already been acquired
643  * by the caller's VOP_RWLOCK.
644  */
645 static int
646 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
647     struct cred *cr, caller_context_t *ct)
648 {
649 	dvnode_t *dv = VTODV(vp);
650 	struct vnode *avp;
651 	int	error;
652 
653 	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
654 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
655 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
656 
657 	/*
658 	 * Not a supported operation on drivers not providing
659 	 * file system based permissions.
660 	 */
661 	if (dv->dv_flags & DV_NO_FSPERM)
662 		return (ENOTSUP);
663 
664 	/*
665 	 * To complete, the setsecattr requires an underlying attribute node.
666 	 */
667 	if (dv->dv_attrvp == NULL) {
668 		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
669 		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
670 		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
671 	}
672 
673 	if ((avp = dv->dv_attrvp) == NULL) {
674 		dcmn_err2(("devfs_setsecattr %s: "
675 		    "cannot construct attribute node\n", dv->dv_name));
676 		return (fs_nosys());
677 	}
678 
679 	/*
680 	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
681 	 * Since backing file systems expect the lock to be held before seeing
682 	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
683 	 * store before forwarding the ACL.
684 	 */
685 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
686 	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
687 	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
688 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
689 
690 	/*
691 	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
692 	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
693 	 * VOP_GETSECATTR calls.
694 	 */
695 	if (fs_acl_nontrivial(avp, cr))
696 		dv->dv_flags |= DV_ACL;
697 	return (error);
698 }
699 
700 /*
701  * This function is used for secpolicy_setattr().  It must call an
702  * access() like function while it is already holding the
703  * dv_contents lock.  We only care about this when dv_attr != NULL;
704  * so the unlocked access call only concerns itself with that
705  * particular branch of devfs_access().
706  */
707 static int
708 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
709 {
710 	struct dv_node *dv = vdv;
711 	int shift = 0;
712 	uid_t owner = dv->dv_attr->va_uid;
713 
714 	/* Check access based on owner, group and public permissions. */
715 	if (crgetuid(cr) != owner) {
716 		shift += 3;
717 		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
718 			shift += 3;
719 	}
720 
721 	return (secpolicy_vnode_access2(cr, DVTOV(dv), owner,
722 	    dv->dv_attr->va_mode << shift, mode));
723 }
724 
725 static int
726 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
727     caller_context_t *ct)
728 {
729 	struct dv_node	*dv = VTODV(vp);
730 	int		res;
731 
732 	dcmn_err2(("devfs_access %s\n", dv->dv_name));
733 	ASSERT(dv->dv_attr || dv->dv_attrvp);
734 
735 	/* restrict console access to privileged processes */
736 	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
737 		return (EACCES);
738 	}
739 
740 	rw_enter(&dv->dv_contents, RW_READER);
741 	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
742 		res = devfs_unlocked_access(dv, mode, cr);
743 	} else {
744 		res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
745 	}
746 	rw_exit(&dv->dv_contents);
747 	return (res);
748 }
749 
750 /*
751  * Lookup
752  *
753  * Given the directory vnode and the name of the component, return
754  * the corresponding held vnode for that component.
755  *
756  * Of course in these fictional filesystems, nothing's ever quite
757  * -that- simple.
758  *
759  * devfs name	type		shadow (fs attributes)	type	comments
760  * -------------------------------------------------------------------------
761  * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
762  * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
763  * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
764  * -------------------------------------------------------------------------
765  *
766  * The following names are reserved for the attribute filesystem (which
767  * could easily be another layer on top of this one - we simply need to
768  * hold the vnode of the thing we're looking at)
769  *
770  * attr name	type		shadow (fs attributes)	type	comments
771  * -------------------------------------------------------------------------
772  * drv[@addr]	VDIR		-			-	attribute dir
773  * minorname	VDIR		-			-	minorname
774  * attribute	VREG		-			-	attribute
775  * -------------------------------------------------------------------------
776  *
777  * Examples:
778  *
779  *	devfs:/devices/.../mm@0:zero		VCHR
780  *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
781  *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
782  *
783  *	devfs:/devices/.../sd@0,0:a		VBLK
784  *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
785  *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
786  *
787  *	devfs:/devices/.../mm@0			VCHR
788  *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
789  *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
790  *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
791  *
792  *	devfs:/devices/.../obio			VDIR
793  *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
794  *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
795  *
 * We also need to be able to deal with "old" devices that have gone away,
 * though I think that provided we return them with readdir, they can
 * be removed (i.e. they don't have to respond to lookup, though it might
 * be weird if they didn't ;-)
800  *
801  * Lookup has side-effects.
802  *
803  * - It will create directories and fs attribute files in the shadow hierarchy.
804  * - It should cause non-SID devices to be probed (ask the parent nexi).
805  */
806 /*ARGSUSED3*/
807 static int
808 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
809     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
810     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
811 {
812 	ASSERT(dvp->v_type == VDIR);
813 	dcmn_err2(("devfs_lookup: %s\n", nm));
814 	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
815 }
816 
817 /*
818  * devfs nodes can't really be created directly by userland - however,
819  * we do allow creates to find existing nodes:
820  *
821  * - any create fails if the node doesn't exist - EROFS.
822  * - creating an existing directory read-only succeeds, otherwise EISDIR.
823  * - exclusive creates fail if the node already exists - EEXIST.
824  * - failure to create the snode for an existing device - ENOSYS.
825  */
826 /*ARGSUSED2*/
827 static int
828 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
829     int mode, struct vnode **vpp, struct cred *cred, int flag,
830     caller_context_t *ct, vsecattr_t *vsecp)
831 {
832 	int error;
833 	struct vnode *vp;
834 
835 	dcmn_err2(("devfs_create %s\n", nm));
836 	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
837 	if (error == 0) {
838 		if (excl == EXCL)
839 			error = EEXIST;
840 		else if (vp->v_type == VDIR && (mode & VWRITE))
841 			error = EISDIR;
842 		else
843 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
844 
845 		if (error) {
846 			VN_RELE(vp);
847 		} else
848 			*vpp = vp;
849 	} else if (error == ENOENT)
850 		error = EROFS;
851 
852 	return (error);
853 }
854 
/*
 * Read directory entries from a devfs directory.
 *
 * If DV_BUILD is set, we call into the nexus driver to do a
 * BUS_CONFIG_ALL.  Otherwise, we simply return cached dv_nodes.  Hotplug
 * code always calls devfs_clean() to invalidate the dv_node cache.
 *
 * Directory offsets: 0 is ".", 1 is "..", and the directory's dv_node
 * entries follow starting at offset 2.  Skipped entries still consume an
 * offset.
 *
 * The caller must hold ddv->dv_contents as reader.  The lock may be
 * upgraded or dropped and re-taken inside this function; it is held as
 * reader again on return.
 *
 * Returns EINVAL if the uio has more than one iovec or if not even one
 * entry fits in the supplied buffer, ENOTDIR if dvp is not a directory,
 * otherwise the result of uiomove().  On success *eofp (when non-NULL)
 * is set to 1 if all entries were consumed and to 0 otherwise.
 */
/*ARGSUSED5*/
static int
devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	struct dv_node *ddv, *dv;
	struct dirent64 *de, *bufp;
	offset_t diroff;
	offset_t	soff;
	size_t reclen, movesz;
	int error;
	struct vattr va;
	size_t bufsz;

	ddv = VTODV(dvp);
	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
	ASSERT(RW_READ_HELD(&ddv->dv_contents));

	/* an offset at or beyond MAXOFF_T is reported as end of directory */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Load the initial contents */
	if (ddv->dv_flags & DV_BUILD) {
		/*
		 * Filling the directory needs the writer lock.  If the
		 * upgrade fails the lock is dropped and re-taken, so
		 * DV_BUILD is rechecked below.
		 */
		if (!rw_tryupgrade(&ddv->dv_contents)) {
			rw_exit(&ddv->dv_contents);
			rw_enter(&ddv->dv_contents, RW_WRITER);
		}

		/* recheck and fill */
		if (ddv->dv_flags & DV_BUILD)
			dv_filldir(ddv);

		rw_downgrade(&ddv->dv_contents);
	}

	/* entries are staged in a kernel buffer as large as the caller's */
	soff = uiop->uio_loffset;
	bufsz = uiop->uio_iov->iov_len;
	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
	movesz = 0;
	/*
	 * Non-NULL sentinel: if we jump to "full" before the entry loop
	 * starts, dv must still read as "more entries remain".
	 */
	dv = (struct dv_node *)-1;

	/*
	 * Move as many entries into the uio structure as it will take.
	 * Special case "." and "..".
	 */
	diroff = 0;
	if (soff == 0) {				/* . */
		reclen = DIRENT64_RECLEN(strlen("."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, ".", reclen));
	}

	diroff++;
	if (soff <= 1) {				/* .. */
		reclen = DIRENT64_RECLEN(strlen(".."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, "..", reclen));
	}

	diroff++;
	/* diroff advances for every entry, including those skipped below */
	for (dv = DV_FIRST_ENTRY(ddv); dv;
	    dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
		/* skip entries until at correct directory offset */
		if (diroff < soff)
			continue;

		/*
		 * hidden nodes are skipped (but they still occupy a
		 * directory offset).
		 */
		if (dv->dv_devi && ndi_dev_is_hidden_node(dv->dv_devi))
			continue;

		/*
		 * DDM_INTERNAL_PATH minor nodes are skipped for readdirs
		 * outside the kernel (but they still occupy a directory
		 * offset).
		 */
		if ((dv->dv_flags & DV_INTERNAL) && (cred != kcred))
			continue;

		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
		if ((movesz + reclen) > bufsz) {
			dcmn_err3(("devfs_readdir: C: diroff "
			    "%lld, soff %lld: '%s' reclen %lu\n",
			    diroff, soff, dv->dv_name, reclen));
			/* diroff stays at this entry so the next call resumes here */
			goto full;
		}
		de->d_ino = (ino64_t)dv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		ASSERT(strlen(dv->dv_name) + 1 <=
		    DIRENT64_NAMELEN(reclen));
		(void) strncpy(de->d_name, dv->dv_name,
		    DIRENT64_NAMELEN(reclen));

		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err4(("devfs_readdir: D: diroff "
		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
		    dv->dv_name, reclen));
	}

	/* the buffer is full, or we exhausted everything */
full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n",
	    movesz, diroff, soff, (void *)dv));

	/* dv is non-NULL here only if we stopped before the end of the list */
	if ((movesz == 0) && dv)
		error = EINVAL;		/* cannot be represented */
	else {
		error = uiomove(bufp, movesz, UIO_READ, uiop);
		if (error == 0) {
			if (eofp)
				*eofp = dv ? 0 : 1;
			uiop->uio_loffset = diroff;
		}

		/*
		 * Update the directory's access time.  devfs_setattr()
		 * takes dv_contents itself, so the reader lock is dropped
		 * around the call and re-taken for the caller afterwards.
		 */
		va.va_mask = AT_ATIME;
		gethrestime(&va.va_atime);
		rw_exit(&ddv->dv_contents);
		(void) devfs_setattr(dvp, &va, 0, cred, ct);
		rw_enter(&ddv->dv_contents, RW_READER);
	}

	kmem_free(bufp, bufsz);
	return (error);
}
1024 
1025 /*ARGSUSED*/
1026 static int
1027 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1028     caller_context_t *ct)
1029 {
1030 	/*
1031 	 * Message goes to console only. Otherwise, the message
1032 	 * causes devfs_fsync to be invoked again... infinite loop
1033 	 */
1034 	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1035 	return (0);
1036 }
1037 
1038 /*
1039  * Normally, we leave the dv_node here at count of 0.
1040  * The node will be destroyed when dv_cleandir() is called.
1041  *
1042  * Stale dv_node's are already unlinked from the fs tree,
1043  * so dv_cleandir() won't find them. We destroy such nodes
1044  * immediately.
1045  */
1046 /*ARGSUSED1*/
1047 static void
1048 devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1049 {
1050 	int destroy;
1051 	struct dv_node *dv = VTODV(vp);
1052 
1053 	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1054 	mutex_enter(&vp->v_lock);
1055 	ASSERT(vp->v_count >= 1);
1056 	--vp->v_count;
1057 	destroy = (DV_STALE(dv) && vp->v_count == 0);
1058 	mutex_exit(&vp->v_lock);
1059 
1060 	/* stale nodes cannot be rediscovered, destroy it here */
1061 	if (destroy)
1062 		dv_destroy(dv, 0);
1063 }
1064 
1065 /*
1066  * XXX Why do we need this?  NFS mounted /dev directories?
1067  * XXX Talk to peter staubach about this.
1068  */
1069 /*ARGSUSED2*/
1070 static int
1071 devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1072 {
1073 	struct dv_node	*dv = VTODV(vp);
1074 	struct dv_fid	*dv_fid;
1075 
1076 	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1077 		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1078 		return (ENOSPC);
1079 	}
1080 
1081 	dv_fid = (struct dv_fid *)fidp;
1082 	bzero(dv_fid, sizeof (struct dv_fid));
1083 	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1084 	dv_fid->dvfid_ino = dv->dv_ino;
1085 	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1086 
1087 	return (0);
1088 }
1089 
1090 /*
1091  * This pair of routines bracket all VOP_READ, VOP_WRITE
1092  * and VOP_READDIR requests.  The contents lock stops things
1093  * moving around while we're looking at them.
1094  *
1095  * Also used by file and record locking.
1096  */
1097 /*ARGSUSED2*/
1098 static int
1099 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1100 {
1101 	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1102 	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1103 	return (write_flag);
1104 }
1105 
1106 /*ARGSUSED1*/
1107 static void
1108 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1109 {
1110 	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1111 	rw_exit(&VTODV(vp)->dv_contents);
1112 }
1113 
1114 /*
1115  * XXX	Should probably do a better job of computing the maximum
1116  *	offset available in the directory.
1117  */
1118 /*ARGSUSED1*/
1119 static int
1120 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1121     caller_context_t *ct)
1122 {
1123 	ASSERT(vp->v_type == VDIR);
1124 	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1125 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1126 }
1127 
1128 vnodeops_t *dv_vnodeops;
1129 
1130 const fs_operation_def_t dv_vnodeops_template[] = {
1131 	VOPNAME_OPEN,		{ .vop_open = devfs_open },
1132 	VOPNAME_CLOSE,		{ .vop_close = devfs_close },
1133 	VOPNAME_READ,		{ .vop_read = devfs_read },
1134 	VOPNAME_WRITE,		{ .vop_write = devfs_write },
1135 	VOPNAME_IOCTL,		{ .vop_ioctl = devfs_ioctl },
1136 	VOPNAME_GETATTR,	{ .vop_getattr = devfs_getattr },
1137 	VOPNAME_SETATTR,	{ .vop_setattr = devfs_setattr },
1138 	VOPNAME_ACCESS,		{ .vop_access = devfs_access },
1139 	VOPNAME_LOOKUP,		{ .vop_lookup = devfs_lookup },
1140 	VOPNAME_CREATE,		{ .vop_create = devfs_create },
1141 	VOPNAME_READDIR,	{ .vop_readdir = devfs_readdir },
1142 	VOPNAME_FSYNC,		{ .vop_fsync = devfs_fsync },
1143 	VOPNAME_INACTIVE,	{ .vop_inactive = devfs_inactive },
1144 	VOPNAME_FID,		{ .vop_fid = devfs_fid },
1145 	VOPNAME_RWLOCK,		{ .vop_rwlock = devfs_rwlock },
1146 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = devfs_rwunlock },
1147 	VOPNAME_SEEK,		{ .vop_seek = devfs_seek },
1148 	VOPNAME_PATHCONF,	{ .vop_pathconf = devfs_pathconf },
1149 	VOPNAME_DISPOSE,	{ .error = fs_error },
1150 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = devfs_setsecattr },
1151 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = devfs_getsecattr },
1152 	NULL,			NULL
1153 };
1154