xref: /titanic_50/usr/src/uts/common/fs/specfs/specvnops.c (revision 80868c5387b92f32fe0e8ea709e36cb535287e03)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 
41 #pragma ident	"%Z%%M%	%I%	%E% SMI"
42 
43 #include <sys/types.h>
44 #include <sys/thread.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/bitmap.h>
49 #include <sys/buf.h>
50 #include <sys/cmn_err.h>
51 #include <sys/conf.h>
52 #include <sys/ddi.h>
53 #include <sys/debug.h>
54 #include <sys/errno.h>
55 #include <sys/time.h>
56 #include <sys/fcntl.h>
57 #include <sys/flock.h>
58 #include <sys/file.h>
59 #include <sys/kmem.h>
60 #include <sys/mman.h>
61 #include <sys/open.h>
62 #include <sys/swap.h>
63 #include <sys/sysmacros.h>
64 #include <sys/uio.h>
65 #include <sys/vfs.h>
66 #include <sys/vnode.h>
67 #include <sys/stat.h>
68 #include <sys/poll.h>
69 #include <sys/stream.h>
70 #include <sys/strsubr.h>
71 #include <sys/policy.h>
72 #include <sys/devpolicy.h>
73 
74 #include <sys/proc.h>
75 #include <sys/user.h>
76 #include <sys/session.h>
77 #include <sys/vmsystm.h>
78 #include <sys/vtrace.h>
79 #include <sys/pathname.h>
80 
81 #include <sys/fs/snode.h>
82 
83 #include <vm/seg.h>
84 #include <vm/seg_map.h>
85 #include <vm/page.h>
86 #include <vm/pvn.h>
87 #include <vm/seg_dev.h>
88 #include <vm/seg_vn.h>
89 
90 #include <fs/fs_subr.h>
91 
92 #include <sys/esunddi.h>
93 #include <sys/autoconf.h>
94 #include <sys/sunndi.h>
95 
96 
/*
 * Forward declarations for the file-local routines defined in this file.
 * Most of these are installed in spec_vnodeops_template[] below; the
 * remainder (spec_putapage, spec_startio, spec_getapage) are not entries
 * in that table.
 */
static int spec_open(struct vnode **, int, struct cred *);
static int spec_close(struct vnode *, int, int, offset_t, struct cred *);
static int spec_read(struct vnode *, struct uio *, int, struct cred *,
	struct caller_context *);
static int spec_write(struct vnode *, struct uio *, int, struct cred *,
	struct caller_context *);
static int spec_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *);
static int spec_getattr(struct vnode *, struct vattr *, int, struct cred *);
static int spec_setattr(struct vnode *, struct vattr *, int, struct cred *,
	caller_context_t *);
static int spec_access(struct vnode *, int, int, struct cred *);
static int spec_create(struct vnode *, char *, vattr_t *, enum vcexcl,
    int, struct vnode **, struct cred *, int);
static int spec_fsync(struct vnode *, int, struct cred *);
static void spec_inactive(struct vnode *, struct cred *);
static int spec_fid(struct vnode *, struct fid *);
static int spec_seek(struct vnode *, offset_t, offset_t *);
static int spec_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    struct flk_callback *, struct cred *);
static int spec_realvp(struct vnode *, struct vnode **);

/* paging and mapping support */
static int spec_getpage(struct vnode *, offset_t, size_t, uint_t *, page_t **,
    size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int spec_putapage(struct vnode *, page_t *, u_offset_t *, size_t *, int,
	struct cred *);
static struct buf *spec_startio(struct vnode *, page_t *, u_offset_t, size_t,
	int);
static int spec_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
    page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int spec_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    uchar_t, uchar_t, uint_t, struct cred *);
static int spec_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uchar_t, uchar_t, uint_t, struct cred *);
static int spec_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
    uint_t, uint_t, uint_t, struct cred *);

/* poll, dump and direct page I/O */
static int spec_poll(struct vnode *, short, int, short *, struct pollhead **);
static int spec_dump(struct vnode *, caddr_t, int, int);
static int spec_pageio(struct vnode *, page_t *, u_offset_t, size_t, int,
    cred_t *);

/* security attributes and pathconf */
static int spec_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *);
static int spec_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *);
static int spec_pathconf(struct	vnode *, int, ulong_t *, struct cred *);
141 
/*
 * Take an open reference on a common snode: increment s_count while
 * holding s_lock.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and is safe in an unbraced if/else.  The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define	SN_HOLD(csp)	do { \
	mutex_enter(&(csp)->s_lock); \
	(csp)->s_count++; \
	mutex_exit(&(csp)->s_lock); \
	/*CONSTCOND*/ \
} while (0)
147 
/*
 * Drop an open reference on a common snode: decrement s_count while
 * holding s_lock.  The ASSERT checks that the count is still positive
 * unless the snode's vnode has no stream attached.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and is safe in an unbraced if/else.  The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define	SN_RELE(csp)	do { \
	mutex_enter(&(csp)->s_lock); \
	(csp)->s_count--; \
	ASSERT(((csp)->s_count > 0) || ((csp)->s_vnode->v_stream == NULL)); \
	mutex_exit(&(csp)->s_lock); \
	/*CONSTCOND*/ \
} while (0)
154 
/*
 * Pointer to the specfs vnode operations vector; handed out by
 * spec_getvnodeops().  Where it gets initialized is not visible in this
 * part of the file.
 */
struct vnodeops *spec_vnodeops;

/*
 * Table of (operation name, implementation) pairs for specfs, terminated
 * by a NULL, NULL pair.  Entries whose prototype differs from the generic
 * function pointer type are cast with fs_generic_func_p.
 *
 * NOTE(review): spec_putpage has no static prototype in this file, so it
 * is presumably declared in a header -- confirm.
 */
const fs_operation_def_t spec_vnodeops_template[] = {
	VOPNAME_OPEN, spec_open,
	VOPNAME_CLOSE, spec_close,
	VOPNAME_READ, spec_read,
	VOPNAME_WRITE, spec_write,
	VOPNAME_IOCTL, spec_ioctl,
	VOPNAME_GETATTR, spec_getattr,
	VOPNAME_SETATTR, spec_setattr,
	VOPNAME_ACCESS, spec_access,
	VOPNAME_CREATE, spec_create,
	VOPNAME_FSYNC, spec_fsync,
	VOPNAME_INACTIVE, (fs_generic_func_p) spec_inactive,
	VOPNAME_FID, spec_fid,
	VOPNAME_SEEK, spec_seek,
	VOPNAME_PATHCONF, spec_pathconf,
	VOPNAME_FRLOCK, spec_frlock,
	VOPNAME_REALVP, spec_realvp,
	VOPNAME_GETPAGE, spec_getpage,
	VOPNAME_PUTPAGE, spec_putpage,
	VOPNAME_MAP, (fs_generic_func_p) spec_map,
	VOPNAME_ADDMAP, (fs_generic_func_p) spec_addmap,
	VOPNAME_DELMAP, spec_delmap,
	VOPNAME_POLL, (fs_generic_func_p) spec_poll,
	VOPNAME_DUMP, spec_dump,
	VOPNAME_PAGEIO, spec_pageio,
	VOPNAME_SETSECATTR, spec_setsecattr,
	VOPNAME_GETSECATTR, spec_getsecattr,
	NULL, NULL
};
186 
187 /*
188  * Return address of spec_vnodeops
189  */
190 struct vnodeops *
191 spec_getvnodeops(void)
192 {
193 	return (spec_vnodeops);
194 }
195 
/* NOTE(review): defined elsewhere; presumably the console vnode -- confirm. */
extern vnode_t *rconsvp;
197 
/*
 * Acquire the serial lock on the common snode.
 *
 * Under s_lock, wait on s_cv (marking SWANT) for as long as SLOCKED is
 * set, then set SLOCKED and drop s_lock.  The wait is not interruptible
 * by signals; use LOCK_CSP_SIG for that.
 *
 * The body is wrapped in do { } while (0) so that the whole sequence is
 * one statement: as a bare statement list, only the first mutex_enter()
 * would be controlled by an unbraced if.  The argument is evaluated more
 * than once, so it must not have side effects.
 */
#define	LOCK_CSP(csp)	do {					\
	mutex_enter(&(csp)->s_lock);			\
	while ((csp)->s_flag & SLOCKED) {		\
		(csp)->s_flag |= SWANT;			\
		cv_wait(&(csp)->s_cv, &(csp)->s_lock);	\
	}						\
	(csp)->s_flag |= SLOCKED;			\
	mutex_exit(&(csp)->s_lock);			\
	/*CONSTCOND*/					\
} while (0)

/* Signal-interruptible variant: evaluates to 1 if locked, 0 if interrupted. */
#define	LOCK_CSP_SIG(csp)	lock_csp_sig(csp)
211 
212 /*
213  * Acquire the serial lock on the common snode checking for a signal.
214  * cv_wait_sig is used to allow signals to pull us out.
215  * Return 1 if locked, 0 if interrupted
216  */
217 static int
218 lock_csp_sig(struct snode *csp)
219 {
220 	mutex_enter(&csp->s_lock);
221 	while (csp->s_flag & SLOCKED) {
222 		csp->s_flag |= SWANT;
223 		if (!cv_wait_sig(&csp->s_cv, &csp->s_lock)) {
224 			mutex_exit(&csp->s_lock);
225 			/* interrupted */
226 			return (0);
227 		}
228 	}
229 	csp->s_flag |= SLOCKED;
230 	mutex_exit(&csp->s_lock);
231 
232 	return (1);
233 }
234 
/*
 * Unlock the serial lock on the common snode.
 *
 * UNLOCK_CSP_LOCK_HELD is for callers that already hold s_lock: it wakes
 * all waiters if SWANT is set and then clears both SWANT and SLOCKED.
 * UNLOCK_CSP does the same but takes and drops s_lock itself.
 *
 * Both bodies are wrapped in do { } while (0) so that each expands to
 * exactly one statement: as bare statement lists, everything after the
 * first statement would escape an unbraced if.  The argument is evaluated
 * more than once, so it must not have side effects.
 */
#define	UNLOCK_CSP_LOCK_HELD(csp)	do {			\
	ASSERT(mutex_owned(&(csp)->s_lock));		\
	if ((csp)->s_flag & SWANT)			\
		cv_broadcast(&(csp)->s_cv);		\
	(csp)->s_flag &= ~(SWANT|SLOCKED);		\
	/*CONSTCOND*/					\
} while (0)

#define	UNLOCK_CSP(csp)	do {					\
	mutex_enter(&(csp)->s_lock);			\
	UNLOCK_CSP_LOCK_HELD(csp);			\
	mutex_exit(&(csp)->s_lock);			\
	/*CONSTCOND*/					\
} while (0)
248 
/*
 * compute/return the size of the device
 *
 * Yields the cached s_size when SSIZEVALID is set in s_flag, otherwise
 * the result of spec_size().  The flag test and the s_size read here are
 * done without taking s_lock.  The argument is evaluated more than once,
 * so it must not have side effects.
 */
#define	SPEC_SIZE(csp)	\
	(((csp)->s_flag & SSIZEVALID) ? (csp)->s_size : spec_size(csp))
254 
255 /*
256  * Compute and return the size.  If the size in the common snode is valid then
257  * return it.  If not valid then get the size from the driver and set size in
258  * the common snode.  If the device has not been attached then we don't ask for
259  * an update from the driver- for non-streams SSIZEVALID stays unset until the
260  * device is attached. A stat of a mknod outside /devices (non-devfs) may
261  * report UNKNOWN_SIZE because the device may not be attached yet (SDIPSET not
262  * established in mknod until open time). An stat in /devices will report the
263  * size correctly.  Specfs should always call SPEC_SIZE instead of referring
264  * directly to s_size to initialize/retrieve the size of a device.
265  *
266  * XXX There is an inconsistency between block and raw - "unknown" is
267  * UNKNOWN_SIZE for VBLK and 0 for VCHR(raw).
268  */
269 static u_offset_t
270 spec_size(struct snode *csp)
271 {
272 	struct vnode	*cvp = STOV(csp);
273 	u_offset_t	size;
274 	int		plen;
275 	uint32_t	size32;
276 	dev_t		dev;
277 	dev_info_t	*devi;
278 	major_t		maj;
279 
280 	ASSERT((csp)->s_commonvp == cvp);	/* must be common node */
281 
282 	/* return cached value */
283 	mutex_enter(&csp->s_lock);
284 	if (csp->s_flag & SSIZEVALID) {
285 		mutex_exit(&csp->s_lock);
286 		return (csp->s_size);
287 	}
288 
289 	/* VOP_GETATTR of mknod has not had devcnt restriction applied */
290 	dev = cvp->v_rdev;
291 	maj = getmajor(dev);
292 	if (maj >= devcnt) {
293 		/* return non-cached UNKNOWN_SIZE */
294 		mutex_exit(&csp->s_lock);
295 		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
296 	}
297 
298 	/* establish cached zero size for streams */
299 	if (STREAMSTAB(maj)) {
300 		csp->s_size = 0;
301 		csp->s_flag |= SSIZEVALID;
302 		mutex_exit(&csp->s_lock);
303 		return (0);
304 	}
305 
306 	/*
307 	 * Return non-cached UNKNOWN_SIZE if not open.
308 	 *
309 	 * NB: This check is bogus, calling prop_op(9E) should be gated by
310 	 * attach, not open. Not having this check however opens up a new
311 	 * context under which a driver's prop_op(9E) could be called. Calling
312 	 * prop_op(9E) in this new context has been shown to expose latent
313 	 * driver bugs (insufficient NULL pointer checks that lead to panic).
314 	 * We are keeping this open check for now to avoid these panics.
315 	 */
316 	if (csp->s_count == 0) {
317 		mutex_exit(&csp->s_lock);
318 		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
319 	}
320 
321 	/* Return non-cached UNKNOWN_SIZE if not attached. */
322 	if (((csp->s_flag & SDIPSET) == 0) || (csp->s_dip == NULL) ||
323 	    (i_ddi_node_state(csp->s_dip) < DS_ATTACHED)) {
324 		mutex_exit(&csp->s_lock);
325 		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
326 	}
327 
328 	devi = csp->s_dip;
329 
330 	/*
331 	 * Established cached size obtained from the attached driver. Since we
332 	 * know the devinfo node, for efficiency we use cdev_prop_op directly
333 	 * instead of [cb]dev_[Ss]size.
334 	 */
335 	if (cvp->v_type == VCHR) {
336 		size = 0;
337 		plen = sizeof (size);
338 		if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
339 		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS |
340 		    DDI_PROP_CONSUMER_TYPED, "Size", (caddr_t)&size,
341 		    &plen) != DDI_PROP_SUCCESS) {
342 			plen = sizeof (size32);
343 			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
344 			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
345 			    "size", (caddr_t)&size32, &plen) ==
346 			    DDI_PROP_SUCCESS)
347 				size = size32;
348 		}
349 	} else {
350 		size = UNKNOWN_SIZE;
351 		plen = sizeof (size);
352 		if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
353 		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS |
354 		    DDI_PROP_CONSUMER_TYPED, "Nblocks", (caddr_t)&size,
355 		    &plen) != DDI_PROP_SUCCESS) {
356 			plen = sizeof (size32);
357 			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
358 			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
359 			    "nblocks", (caddr_t)&size32, &plen) ==
360 			    DDI_PROP_SUCCESS)
361 				size = size32;
362 		}
363 
364 		if (size != UNKNOWN_SIZE) {
365 			/* convert from block size to byte size */
366 			if (size < (MAXOFFSET_T >> DEV_BSHIFT))
367 				size = size << DEV_BSHIFT;
368 			else
369 				size = UNKNOWN_SIZE;
370 		}
371 	}
372 
373 	csp->s_size = size;
374 	csp->s_flag |= SSIZEVALID;
375 
376 	mutex_exit(&csp->s_lock);
377 	return (size);
378 }
379 
380 /*
381  * This function deal with vnode substitution in the case of
382  * device cloning.
383  */
384 static int
385 spec_clone(struct vnode **vpp, dev_t newdev, int vtype, struct stdata *stp)
386 {
387 	dev_t		dev = (*vpp)->v_rdev;
388 	major_t		maj = getmajor(dev);
389 	major_t 	newmaj = getmajor(newdev);
390 	int		sysclone = (maj == clone_major);
391 	int		qassociate_used = 0;
392 	struct snode	*oldsp, *oldcsp;
393 	struct snode	*newsp, *newcsp;
394 	struct vnode	*newvp, *newcvp;
395 	dev_info_t	*dip;
396 	queue_t		*dq;
397 
398 	ASSERT(dev != newdev);
399 
400 	/*
401 	 * Check for cloning across different drivers.
402 	 * We only support this under the system provided clone driver
403 	 */
404 	if ((maj != newmaj) && !sysclone) {
405 		cmn_err(CE_NOTE,
406 		    "unsupported clone open maj = %u, newmaj = %u",
407 		    maj, newmaj);
408 		return (ENXIO);
409 	}
410 
411 	/* old */
412 	oldsp = VTOS(*vpp);
413 	oldcsp = VTOS(oldsp->s_commonvp);
414 
415 	/* new */
416 	newvp = makespecvp(newdev, vtype);
417 	ASSERT(newvp != NULL);
418 	newsp = VTOS(newvp);
419 	newcvp = newsp->s_commonvp;
420 	newcsp = VTOS(newcvp);
421 
422 	/*
423 	 * Clones inherit fsid, realvp, and dip.
424 	 * XXX realvp inherit is not occurring, does fstat of clone work?
425 	 */
426 	newsp->s_fsid = oldsp->s_fsid;
427 	if (sysclone) {
428 		newsp->s_flag |= SCLONE;
429 		dip = NULL;
430 	} else {
431 		newsp->s_flag |= SSELFCLONE;
432 		dip = oldcsp->s_dip;
433 	}
434 
435 	/*
436 	 * If we cloned to an opened newdev that already has called
437 	 * spec_assoc_vp_with_devi (SDIPSET set) then the association is
438 	 * already established.
439 	 */
440 	if (!(newcsp->s_flag & SDIPSET)) {
441 		/*
442 		 * Establish s_dip association for newdev.
443 		 *
444 		 * If we trusted the getinfo(9E) DDI_INFO_DEVT2INSTANCE
445 		 * implementation of all cloning drivers  (SCLONE and SELFCLONE)
446 		 * we would always use e_ddi_hold_devi_by_dev().  We know that
447 		 * many drivers have had (still have?) problems with
448 		 * DDI_INFO_DEVT2INSTANCE, so we try to minimize reliance by
449 		 * detecting drivers that use QASSOCIATE (by looking down the
450 		 * stream) and setting their s_dip association to NULL.
451 		 */
452 		qassociate_used = 0;
453 		if (stp) {
454 			for (dq = stp->sd_wrq; dq; dq = dq->q_next) {
455 				if (_RD(dq)->q_flag & _QASSOCIATED) {
456 					qassociate_used = 1;
457 					dip = NULL;
458 					break;
459 				}
460 			}
461 		}
462 
463 		if (dip || qassociate_used) {
464 			spec_assoc_vp_with_devi(newvp, dip);
465 		} else {
466 			/* derive association from newdev */
467 			dip = e_ddi_hold_devi_by_dev(newdev, 0);
468 			spec_assoc_vp_with_devi(newvp, dip);
469 			if (dip)
470 				ddi_release_devi(dip);
471 		}
472 	}
473 
474 	SN_HOLD(newcsp);
475 
476 	/* deal with stream stuff */
477 	if (stp != NULL) {
478 		LOCK_CSP(newcsp);	/* synchronize stream open/close */
479 		mutex_enter(&newcsp->s_lock);
480 		newcvp->v_stream = newvp->v_stream = stp;
481 		stp->sd_vnode = newcvp;
482 		stp->sd_strtab = STREAMSTAB(newmaj);
483 		mutex_exit(&newcsp->s_lock);
484 		UNLOCK_CSP(newcsp);
485 	}
486 
487 	/* substitute the vnode */
488 	SN_RELE(oldcsp);
489 	VN_RELE(*vpp);
490 	*vpp = newvp;
491 
492 	return (0);
493 }
494 
/*
 * Open a character or block special file.
 *
 * vpp	in/out: vnode being opened; replaced (via spec_clone) when the
 *	open hands back a dev_t different from the one requested.
 * flag	open flags; FCREAT is stripped before use.
 * cr	caller credentials, checked by secpolicy_spec_open().
 *
 * Returns 0 on success.  Returns ENXIO if the vnode is not VCHR/VBLK, if
 * the underlying mount has VFS_NODEVICES set, if the devinfo node cannot
 * be held, or if a streams major is opened through a non-VCHR vnode;
 * EINTR if a signal interrupts the wait for the serial lock on the
 * streams path; otherwise the error from secpolicy_spec_open(),
 * dev_open(), stropen() or spec_clone().
 */
static int
spec_open(struct vnode **vpp, int flag, struct cred *cr)
{
	major_t maj;
	dev_t dev, newdev;
	struct vnode *vp, *cvp;
	struct snode *sp, *csp;
	struct stdata *stp;
	dev_info_t *dip;
	int error, type;

	flag &= ~FCREAT;		/* paranoia */

	vp = *vpp;
	sp = VTOS(vp);
	ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
	if ((vp->v_type != VCHR) && (vp->v_type != VBLK))
		return (ENXIO);

	/*
	 * If the VFS_NODEVICES bit was set for the mount,
	 * do not allow opens of special devices.
	 */
	if (sp->s_realvp && (sp->s_realvp->v_vfsp->vfs_flag & VFS_NODEVICES))
		return (ENXIO);

	newdev = dev = vp->v_rdev;

	/*
	 * If we are opening a node that has not had spec_assoc_vp_with_devi
	 * called against it (mknod outside /devices or a non-dacf makespecvp
	 * node) then SDIPSET will not be set. In this case we call an
	 * interface which will reconstruct the path and lookup (drive attach)
	 * through devfs (e_ddi_hold_devi_by_dev -> e_ddi_hold_devi_by_path ->
	 * devfs_lookupname).  For support of broken drivers that don't call
	 * ddi_create_minor_node for all minor nodes in their instance space,
	 * we call interfaces that operate at the directory/devinfo
	 * (major/instance) level instead of to the leaf/minor node level.
	 * After finding and attaching the dip we associate it with the
	 * common specfs vnode (s_dip), which sets SDIPSET.  A DL_DETACH_REQ
	 * to style-2 stream driver may set s_dip to NULL with SDIPSET set.
	 *
	 * NOTE: Although e_ddi_hold_devi_by_dev takes a dev_t argument, its
	 * implementation operates at the major/instance level since it only
	 * needs to return a dip.
	 */
	cvp = sp->s_commonvp;
	csp = VTOS(cvp);
	if (!(csp->s_flag & SDIPSET)) {
		/* try to attach, return error if we fail */
		if ((dip = e_ddi_hold_devi_by_dev(dev, 0)) == NULL)
			return (ENXIO);

		/* associate dip with the common snode s_dip */
		spec_assoc_vp_with_devi(vp, dip);
		ddi_release_devi(dip);	/* from e_ddi_hold_devi_by_dev */
	}

#ifdef  DEBUG
	/* verify attach/open exclusion guarantee */
	dip = csp->s_dip;
	ASSERT((dip == NULL) || (i_ddi_node_state(dip) >= DS_ATTACHED));
#endif  /* DEBUG */

	if ((error = secpolicy_spec_open(cr, cvp, flag)) != 0)
		return (error);

	maj = getmajor(dev);
	if (STREAMSTAB(maj))
		goto streams_open;

	SN_HOLD(csp);			/* increment open count */

	/* non streams open */
	type = (vp->v_type == VBLK ? OTYP_BLK : OTYP_CHR);
	error = dev_open(&newdev, flag, type, cr);

	/* deal with clone case */
	if (error == 0 && dev != newdev) {
		error = spec_clone(vpp, newdev, vp->v_type, NULL);
		/*
		 * bail on clone failure, further processing
		 * results in undefined behaviors.
		 */
		if (error != 0)
			return (error);
		/* continue with the snodes of the substituted vnode */
		sp = VTOS(*vpp);
		csp = VTOS(sp->s_commonvp);
	}

	if (error == 0) {
		sp->s_size = SPEC_SIZE(csp);

		/* first successful open on this common snode */
		if ((csp->s_flag & SNEEDCLOSE) == 0) {
			int nmaj = getmajor(newdev);
			mutex_enter(&csp->s_lock);
			/* successful open needs a close later */
			csp->s_flag |= SNEEDCLOSE;

			/*
			 * Invalidate possible cached "unknown" size
			 * established by a VOP_GETATTR while open was in
			 * progress, and the driver might fail prop_op(9E).
			 */
			if (((cvp->v_type == VCHR) && (csp->s_size == 0)) ||
			    ((cvp->v_type == VBLK) &&
			    (csp->s_size == UNKNOWN_SIZE)))
				csp->s_flag &= ~SSIZEVALID;

			/* record the driver's offset capabilities */
			if (devopsp[nmaj]->devo_cb_ops->cb_flag & D_64BIT)
				csp->s_flag |= SLOFFSET;
			if (devopsp[nmaj]->devo_cb_ops->cb_flag & D_U64BIT)
				csp->s_flag |= SLOFFSET | SANYOFFSET;
			mutex_exit(&csp->s_lock);
		}
		return (0);
	}

	/*
	 * Open failed. If we missed a close operation because
	 * we were trying to get the device open and it is the
	 * last in progress open that is failing then call close.
	 *
	 * NOTE: Only non-streams open has this race condition.
	 */
	mutex_enter(&csp->s_lock);
	csp->s_count--;			/* decrement open count : SN_RELE */
	if ((csp->s_count == 0) &&	/* no outstanding open */
	    (csp->s_mapcnt == 0) &&	/* no mapping */
	    (csp->s_flag & SNEEDCLOSE)) { /* need a close */
		csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

		/* See comment in spec_close() */
		if (csp->s_flag & (SCLONE | SSELFCLONE))
			csp->s_flag &= ~SDIPSET;

		mutex_exit(&csp->s_lock);
		ASSERT(*vpp != NULL);
		(void) device_close(*vpp, flag, cr);
	} else {
		mutex_exit(&csp->s_lock);
	}
	return (error);

streams_open:
	if (vp->v_type != VCHR)
		return (ENXIO);

	/*
	 * Lock common snode to prevent any new clone opens
	 * on this stream while one is in progress.
	 * This is necessary since the stream currently
	 * associated with the clone device will not be part
	 * of it after the clone open completes.
	 * Unfortunately we don't know in advance if this is
	 * a clone device so we have to lock all opens.
	 *
	 * If we fail, it's because of an interrupt.
	 */
	if (LOCK_CSP_SIG(csp) == 0)
		return (EINTR);

	SN_HOLD(csp);			/* increment open count */

	error = stropen(cvp, &newdev, flag, cr);
	stp = cvp->v_stream;

	/* deal with the clone case */
	if ((error == 0) && (dev != newdev)) {
		/* detach the stream here; spec_clone attaches it to newdev */
		vp->v_stream = cvp->v_stream = NULL;
		UNLOCK_CSP(csp);
		error = spec_clone(vpp, newdev, vp->v_type, stp);
		/*
		 * bail on clone failure, further processing
		 * results in undefined behaviors.
		 */
		if (error != 0)
			return (error);
		sp = VTOS(*vpp);
		csp = VTOS(sp->s_commonvp);
	} else if (error == 0) {
		vp->v_stream = stp;
		UNLOCK_CSP(csp);
	}

	if (error == 0) {
		/* STREAMS devices don't have a size */
		sp->s_size = csp->s_size = 0;

		/*
		 * try to allocate it as a controlling terminal
		 */
		if ((stp->sd_flag & STRISTTY) && !(flag & FNOCTTY))
			stralloctty(stp);

		return (0);
	}

	/*
	 * Deal with stropen failure.
	 *
	 * sd_flag in the stream head cannot change since the
	 * common snode is locked before the call to stropen().
	 */
	if ((stp != NULL) && (stp->sd_flag & STREOPENFAIL)) {
		/*
		 * Open failed part way through.
		 */
		mutex_enter(&stp->sd_lock);
		stp->sd_flag &= ~STREOPENFAIL;
		mutex_exit(&stp->sd_lock);

		/* spec_close takes the serial lock itself and drops s_count */
		UNLOCK_CSP(csp);
		(void) spec_close(vp, flag, 1, 0, cr);
	} else {
		UNLOCK_CSP(csp);
		SN_RELE(csp);
	}

	return (error);
}
716 
/*
 * Close a special file.
 *
 * vp		vnode being closed.
 * flag		open flags; when FKLYR is not set, the calling process's
 *		locks and shares on vp are cleaned up and, for a stream,
 *		strclean() is called.
 * count	when greater than 1 nothing further is done and 0 is
 *		returned.
 * offset	unused.
 * cr		credentials passed through to device_close().
 *
 * Drops one open reference on the common snode and, when both s_count and
 * s_mapcnt have reached zero, calls device_close().  Returns the result of
 * device_close() if it was called, otherwise 0.
 */
/*ARGSUSED2*/
static int
spec_close(
	struct vnode	*vp,
	int		flag,
	int		count,
	offset_t	offset,
	struct cred	*cr)
{
	struct vnode *cvp;
	struct snode *sp, *csp;
	enum vtype type;
	dev_t dev;
	int error = 0;
	int sysclone;

	if (!(flag & FKLYR)) {
		/* this only applies to closes of devices from userland */
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
		if (vp->v_stream)
			strclean(vp);
	}
	if (count > 1)
		return (0);

	sp = VTOS(vp);
	cvp = sp->s_commonvp;

	dev = sp->s_dev;
	type = vp->v_type;

	ASSERT(type == VCHR || type == VBLK);

	/*
	 * Prevent close/close and close/open races by serializing closes
	 * on this common snode. Clone opens are held up until after
	 * we have closed this device so the streams linkage is maintained
	 */
	csp = VTOS(cvp);

	LOCK_CSP(csp);
	mutex_enter(&csp->s_lock);

	csp->s_count--;			/* one fewer open reference : SN_RELE */
	sysclone = sp->s_flag & SCLONE;

	/*
	 * Invalidate size on each close.
	 *
	 * XXX We do this on each close because we don't have interfaces that
	 * allow a driver to invalidate the size.  Since clearing this on each
	 * close causes property overhead we skip /dev/null and
	 * /dev/zero to avoid degrading kenbus performance.
	 */
	if (getmajor(dev) != mm_major)
		csp->s_flag &= ~SSIZEVALID;

	/*
	 * Only call the close routine when the last open reference through
	 * any [s, v]node goes away.  This can be checked by looking at
	 * s_count on the common vnode.
	 */
	if ((csp->s_count == 0) && (csp->s_mapcnt == 0)) {
		/* we don't need a close */
		csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

		/*
		 * A cloning driver may open-clone to the same dev_t that we
		 * are closing before spec_inactive destroys the common snode.
		 * If this occurs the s_dip association needs to be reevaluated.
		 * We clear SDIPSET to force reevaluation in this case.  When
		 * reevaluation occurs (by spec_clone after open), if the
		 * devinfo association has changed then the old association
		 * will be released as the new association is established by
		 * spec_assoc_vp_with_devi().
		 */
		if (csp->s_flag & (SCLONE | SSELFCLONE))
			csp->s_flag &= ~SDIPSET;

		/* s_lock is dropped across the call to device_close() */
		mutex_exit(&csp->s_lock);
		error = device_close(vp, flag, cr);

		/*
		 * Decrement the devops held in clnopen()
		 */
		if (sysclone) {
			ddi_rele_driver(getmajor(dev));
		}
		mutex_enter(&csp->s_lock);
	}

	/* release the serial lock taken by LOCK_CSP above */
	UNLOCK_CSP_LOCK_HELD(csp);
	mutex_exit(&csp->s_lock);

	return (error);
}
814 
/*
 * Read from a special file.
 *
 * vp		vnode of the device (VCHR or VBLK, asserted).
 * uiop		describes the destination and the offset/length to read.
 * ioflag	unused.
 * cr		credentials passed to strread()/cdev_read().
 * ct		unused.
 *
 * Streams devices are handed to strread() and plain character devices to
 * cdev_read().  Block devices are read through segkmap in windows of at
 * most MAXBSIZE, stopping at the device size.
 *
 * Returns 0 on success (including a zero-length request on a non-stream
 * device), EINVAL if the request is negative or extends past the maximum
 * offset for the device, or the error from the underlying routine.
 */
/*ARGSUSED2*/
static int
spec_read(
	struct vnode	*vp,
	struct uio	*uiop,
	int		ioflag,
	struct cred	*cr,
	struct caller_context *ct)
{
	int error;
	struct snode *sp = VTOS(vp);
	dev_t dev = sp->s_dev;
	size_t n;
	ulong_t on;
	u_offset_t bdevsize;
	offset_t maxoff;
	offset_t off;
	struct vnode *blkvp;

	ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

	if (STREAMSTAB(getmajor(dev))) {	/* stream */
		ASSERT(vp->v_type == VCHR);
		smark(sp, SACC);
		return (strread(vp, uiop, cr));
	}

	if (uiop->uio_resid == 0)
		return (0);

	/*
	 * Plain old character devices that set D_U64BIT can have
	 * unrestricted offsets.
	 */
	maxoff = spec_maxoffset(vp);
	ASSERT(maxoff != -1 || vp->v_type == VCHR);

	/* a maxoff of -1 means the offset is not range checked */
	if (maxoff != -1 && (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > maxoff))
		return (EINVAL);

	if (vp->v_type == VCHR) {
		smark(sp, SACC);
		ASSERT(STREAMSTAB(getmajor(dev)) == 0);
		return (cdev_read(dev, uiop, cr));
	}

	/*
	 * Block device.
	 */
	error = 0;
	blkvp = sp->s_commonvp;
	bdevsize = SPEC_SIZE(VTOS(blkvp));

	do {
		caddr_t base;
		offset_t diff;

		/*
		 * off is the MAXBSIZE-aligned start of the window, on the
		 * offset within it and n the byte count for this pass.
		 */
		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = (size_t)(uiop->uio_loffset & MAXBOFFSET);
		n = (size_t)MIN(MAXBSIZE - on, uiop->uio_resid);
		diff = bdevsize - uiop->uio_loffset;

		/* at or past the end of the device: nothing more to read */
		if (diff <= 0)
			break;
		/* clip the transfer at the end of the device */
		if (diff < n)
			n = (size_t)diff;

		base = segmap_getmapflt(segkmap, blkvp,
			(u_offset_t)(off + on), n, 1, S_READ);

		if ((error = uiomove(base + on, n, UIO_READ, uiop)) == 0) {
			int flags = 0;
			/*
			 * If we read a whole block, we won't need this
			 * buffer again soon.
			 */
			if (n + on == MAXBSIZE)
				flags = SM_DONTNEED | SM_FREE;
			error = segmap_release(segkmap, base, flags);
		} else {
			(void) segmap_release(segkmap, base, 0);
			/* with an unknown size the error is not reported */
			if (bdevsize == UNKNOWN_SIZE) {
				error = 0;
				break;
			}
		}
	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}
906 
/*
 * Write to a special file.
 *
 * vp		vnode of the device (VCHR or VBLK, asserted).
 * uiop		describes the source and the offset/length to write.
 * ioflag	FSYNC/FDSYNC force a synchronous write back of each block
 *		window on the block device path.
 * cr		credentials passed to strwrite()/cdev_write().
 * ct		unused.
 *
 * Streams devices are handed to strwrite() and plain character devices to
 * cdev_write().  Block devices are written through segkmap in windows of
 * at most MAXBSIZE.
 *
 * Returns 0 on success, EINVAL if the request is negative or extends past
 * the maximum offset for the device, ENXIO if a block device write starts
 * at or beyond the device size, or the error from the underlying routine.
 */
/*ARGSUSED*/
static int
spec_write(
	struct vnode *vp,
	struct uio *uiop,
	int ioflag,
	struct cred *cr,
	struct caller_context *ct)
{
	int error;
	struct snode *sp = VTOS(vp);
	dev_t dev = sp->s_dev;
	size_t n;
	ulong_t on;
	u_offset_t bdevsize;
	offset_t maxoff;
	offset_t off;
	struct vnode *blkvp;

	ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

	if (STREAMSTAB(getmajor(dev))) {
		ASSERT(vp->v_type == VCHR);
		smark(sp, SUPD);
		return (strwrite(vp, uiop, cr));
	}

	/*
	 * Plain old character devices that set D_U64BIT can have
	 * unrestricted offsets.
	 */
	maxoff = spec_maxoffset(vp);
	ASSERT(maxoff != -1 || vp->v_type == VCHR);

	/* a maxoff of -1 means the offset is not range checked */
	if (maxoff != -1 && (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > maxoff))
		return (EINVAL);

	if (vp->v_type == VCHR) {
		smark(sp, SUPD);
		ASSERT(STREAMSTAB(getmajor(dev)) == 0);
		return (cdev_write(dev, uiop, cr));
	}

	/* block device: a zero-length write is a no-op */
	if (uiop->uio_resid == 0)
		return (0);

	error = 0;
	blkvp = sp->s_commonvp;
	bdevsize = SPEC_SIZE(VTOS(blkvp));

	do {
		int pagecreate;
		int newpage;
		caddr_t base;
		offset_t diff;

		/*
		 * off is the MAXBSIZE-aligned start of the window, on the
		 * offset within it and n the byte count for this pass.
		 */
		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = (ulong_t)(uiop->uio_loffset & MAXBOFFSET);
		n = (size_t)MIN(MAXBSIZE - on, uiop->uio_resid);
		pagecreate = 0;

		diff = bdevsize - uiop->uio_loffset;
		if (diff <= 0) {
			error = ENXIO;
			break;
		}
		/* clip the transfer at the end of the device */
		if (diff < n)
			n = (size_t)diff;

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to end of the device
		 * from the beginning of the mapping.
		 */
		if (n == MAXBSIZE || (on == 0 && (off + n) == bdevsize))
			pagecreate = 1;

		base = segmap_getmapflt(segkmap, blkvp,
		    (u_offset_t)(off + on), n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;

		if (pagecreate)
			newpage = segmap_pagecreate(segkmap, base + on,
				n, 0);

		error = uiomove(base + on, n, UIO_WRITE, uiop);

		if (pagecreate &&
		    uiop->uio_loffset <
		    P2ROUNDUP_TYPED(off + on + n, PAGESIZE, offset_t)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This can happen if we write to the end of the device
			 * or if we had some sort of error during the uiomove.
			 */
			long nzero;
			offset_t nmoved;

			/* bytes uiomove actually transferred this pass */
			nmoved = (uiop->uio_loffset - (off + on));
			if (nmoved < 0 || nmoved > n) {
				panic("spec_write: nmoved bogus");
				/*NOTREACHED*/
			}
			/* bytes from the end of the data to the page boundary */
			nzero = (long)P2ROUNDUP(on + n, PAGESIZE) -
			    (on + nmoved);
			if (nzero < 0 || (on + nmoved + nzero > MAXBSIZE)) {
				panic("spec_write: nzero bogus");
				/*NOTREACHED*/
			}
			(void) kzero(base + on + nmoved, (size_t)nzero);
		}

		/*
		 * Unlock the pages which have been allocated by
		 * page_create_va() in segmap_pagecreate().
		 */
		if (newpage)
			segmap_pageunlock(segkmap, base + on,
				(size_t)n, S_WRITE);

		if (error == 0) {
			int flags = 0;

			/*
			 * Force write back for synchronous write cases.
			 */
			if (ioflag & (FSYNC|FDSYNC))
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 * Push swap files here, since it
				 * won't happen anywhere else.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			}
			smark(sp, SUPD|SCHG);
			error = segmap_release(segkmap, base, flags);
		} else
			/* the copy failed: release the window with SM_INVAL */
			(void) segmap_release(segkmap, base, SM_INVAL);

	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}
1064 
1065 static int
1066 spec_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, struct cred *cr,
1067     int *rvalp)
1068 {
1069 	struct snode *sp;
1070 	dev_t dev;
1071 	int error;
1072 
1073 	if (vp->v_type != VCHR)
1074 		return (ENOTTY);
1075 	sp = VTOS(vp);
1076 	dev = sp->s_dev;
1077 	if (STREAMSTAB(getmajor(dev))) {
1078 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
1079 	} else {
1080 		error = cdev_ioctl(dev, cmd, arg, mode, cr, rvalp);
1081 	}
1082 	return (error);
1083 }
1084 
1085 static int
1086 spec_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr)
1087 {
1088 	int error;
1089 	struct snode *sp;
1090 	struct vnode *realvp;
1091 
1092 	/* With ATTR_COMM we will not get attributes from realvp */
1093 	if (flags & ATTR_COMM) {
1094 		sp = VTOS(vp);
1095 		vp = sp->s_commonvp;
1096 	}
1097 	sp = VTOS(vp);
1098 	realvp = sp->s_realvp;
1099 
1100 	if (realvp == NULL) {
1101 		static int snode_shift	= 0;
1102 
1103 		/*
1104 		 * Calculate the amount of bitshift to a snode pointer which
1105 		 * will still keep it unique.  See below.
1106 		 */
1107 		if (snode_shift == 0)
1108 			snode_shift = highbit(sizeof (struct snode));
1109 		ASSERT(snode_shift > 0);
1110 
1111 		/*
1112 		 * No real vnode behind this one.  Fill in the fields
1113 		 * from the snode.
1114 		 *
1115 		 * This code should be refined to return only the
1116 		 * attributes asked for instead of all of them.
1117 		 */
1118 		vap->va_type = vp->v_type;
1119 		vap->va_mode = 0;
1120 		vap->va_uid = vap->va_gid = 0;
1121 		vap->va_fsid = sp->s_fsid;
1122 
1123 		/*
1124 		 * If the va_nodeid is > MAX_USHORT, then i386 stats might
1125 		 * fail. So we shift down the snode pointer to try and get
1126 		 * the most uniqueness into 16-bits.
1127 		 */
1128 		vap->va_nodeid = ((ino64_t)(uintptr_t)sp >> snode_shift) &
1129 		    0xFFFF;
1130 		vap->va_nlink = 0;
1131 		vap->va_rdev = sp->s_dev;
1132 
1133 		/*
1134 		 * va_nblocks is the number of 512 byte blocks used to store
1135 		 * the mknod for the device, not the number of blocks on the
1136 		 * device itself.  This is typically zero since the mknod is
1137 		 * represented directly in the inode itself.
1138 		 */
1139 		vap->va_nblocks = 0;
1140 	} else {
1141 		error = VOP_GETATTR(realvp, vap, flags, cr);
1142 		if (error != 0)
1143 			return (error);
1144 	}
1145 
1146 	/* set the size from the snode */
1147 	vap->va_size = SPEC_SIZE(VTOS(sp->s_commonvp));
1148 	vap->va_blksize = MAXBSIZE;
1149 
1150 	mutex_enter(&sp->s_lock);
1151 	vap->va_atime.tv_sec = sp->s_atime;
1152 	vap->va_mtime.tv_sec = sp->s_mtime;
1153 	vap->va_ctime.tv_sec = sp->s_ctime;
1154 	mutex_exit(&sp->s_lock);
1155 
1156 	vap->va_atime.tv_nsec = 0;
1157 	vap->va_mtime.tv_nsec = 0;
1158 	vap->va_ctime.tv_nsec = 0;
1159 	vap->va_seq = 0;
1160 
1161 	return (0);
1162 }
1163 
1164 static int
1165 spec_setattr(
1166 	struct vnode *vp,
1167 	struct vattr *vap,
1168 	int flags,
1169 	struct cred *cr,
1170 	caller_context_t *ctp)
1171 {
1172 	struct snode *sp = VTOS(vp);
1173 	struct vnode *realvp;
1174 	int error;
1175 
1176 	if (vp->v_type == VCHR && vp->v_stream && (vap->va_mask & AT_SIZE)) {
1177 		/*
1178 		 * 1135080:	O_TRUNC should have no effect on
1179 		 *		named pipes and terminal devices.
1180 		 */
1181 		ASSERT(vap->va_mask == AT_SIZE);
1182 		return (0);
1183 	}
1184 
1185 	if ((realvp = sp->s_realvp) == NULL)
1186 		error = 0;	/* no real vnode to update */
1187 	else
1188 		error = VOP_SETATTR(realvp, vap, flags, cr, ctp);
1189 	if (error == 0) {
1190 		/*
1191 		 * If times were changed, update snode.
1192 		 */
1193 		mutex_enter(&sp->s_lock);
1194 		if (vap->va_mask & AT_ATIME)
1195 			sp->s_atime = vap->va_atime.tv_sec;
1196 		if (vap->va_mask & AT_MTIME) {
1197 			sp->s_mtime = vap->va_mtime.tv_sec;
1198 			sp->s_ctime = gethrestime_sec();
1199 		}
1200 		mutex_exit(&sp->s_lock);
1201 	}
1202 	return (error);
1203 }
1204 
1205 static int
1206 spec_access(struct vnode *vp, int mode, int flags, struct cred *cr)
1207 {
1208 	struct vnode *realvp;
1209 	struct snode *sp = VTOS(vp);
1210 
1211 	if ((realvp = sp->s_realvp) != NULL)
1212 		return (VOP_ACCESS(realvp, mode, flags, cr));
1213 	else
1214 		return (0);	/* Allow all access. */
1215 }
1216 
1217 /*
1218  * This can be called if creat or an open with O_CREAT is done on the root
1219  * of a lofs mount where the mounted entity is a special file.
1220  */
1221 /*ARGSUSED*/
1222 static int
1223 spec_create(struct vnode *dvp, char *name, vattr_t *vap, enum vcexcl excl,
1224     int mode, struct vnode **vpp, struct cred *cr, int flag)
1225 {
1226 	int error;
1227 
1228 	ASSERT(dvp && (dvp->v_flag & VROOT) && *name == '\0');
1229 	if (excl == NONEXCL) {
1230 		if (mode && (error = spec_access(dvp, mode, 0, cr)))
1231 			return (error);
1232 		VN_HOLD(dvp);
1233 		return (0);
1234 	}
1235 	return (EEXIST);
1236 }
1237 
1238 /*
1239  * In order to sync out the snode times without multi-client problems,
1240  * make sure the times written out are never earlier than the times
1241  * already set in the vnode.
1242  */
1243 static int
1244 spec_fsync(struct vnode *vp, int syncflag, struct cred *cr)
1245 {
1246 	struct snode *sp = VTOS(vp);
1247 	struct vnode *realvp;
1248 	struct vnode *cvp;
1249 	struct vattr va, vatmp;
1250 
1251 	/* If times didn't change, don't flush anything. */
1252 	mutex_enter(&sp->s_lock);
1253 	if ((sp->s_flag & (SACC|SUPD|SCHG)) == 0 && vp->v_type != VBLK) {
1254 		mutex_exit(&sp->s_lock);
1255 		return (0);
1256 	}
1257 	sp->s_flag &= ~(SACC|SUPD|SCHG);
1258 	mutex_exit(&sp->s_lock);
1259 	cvp = sp->s_commonvp;
1260 	realvp = sp->s_realvp;
1261 
1262 	if (vp->v_type == VBLK && cvp != vp && vn_has_cached_data(cvp) &&
1263 	    (cvp->v_flag & VISSWAP) == 0)
1264 		(void) VOP_PUTPAGE(cvp, (offset_t)0, 0, 0, cr);
1265 
1266 	/*
1267 	 * If no real vnode to update, don't flush anything.
1268 	 */
1269 	if (realvp == NULL)
1270 		return (0);
1271 
1272 	vatmp.va_mask = AT_ATIME|AT_MTIME;
1273 	if (VOP_GETATTR(realvp, &vatmp, 0, cr) == 0) {
1274 
1275 		mutex_enter(&sp->s_lock);
1276 		if (vatmp.va_atime.tv_sec > sp->s_atime)
1277 			va.va_atime = vatmp.va_atime;
1278 		else {
1279 			va.va_atime.tv_sec = sp->s_atime;
1280 			va.va_atime.tv_nsec = 0;
1281 		}
1282 		if (vatmp.va_mtime.tv_sec > sp->s_mtime)
1283 			va.va_mtime = vatmp.va_mtime;
1284 		else {
1285 			va.va_mtime.tv_sec = sp->s_mtime;
1286 			va.va_mtime.tv_nsec = 0;
1287 		}
1288 		mutex_exit(&sp->s_lock);
1289 
1290 		va.va_mask = AT_ATIME|AT_MTIME;
1291 		(void) VOP_SETATTR(realvp, &va, 0, cr, NULL);
1292 	}
1293 	(void) VOP_FSYNC(realvp, syncflag, cr);
1294 	return (0);
1295 }
1296 
/*
 * Drop a hold on vp and, if it was the last one, destroy the snode.
 *
 * The hold is dropped with stable_lock held so that the snode can be
 * taken out of the snode table (sdelete) before anyone else finds it.
 * After that the changed times are pushed to the real vnode, the holds
 * on the vfs, real vnode, common vnode, devinfo node and device policy
 * are released, and the snode goes back to snode_cache.
 */
1297 /*ARGSUSED*/
1298 static void
1299 spec_inactive(struct vnode *vp, struct cred *cr)
1300 {
1301 	struct snode *sp = VTOS(vp);
1302 	struct vnode *cvp;
1303 	struct vnode *rvp;
1304 
1305 	/*
1306 	 * If no one has reclaimed the vnode, remove from the
1307 	 * cache now.  A count below one here is a fatal inconsistency.
1308 	 */
1309 	if (vp->v_count < 1) {
1310 		panic("spec_inactive: Bad v_count");
1311 		/*NOTREACHED*/
1312 	}
	/* stable_lock is taken before v_lock and held across sdelete(). */
1313 	mutex_enter(&stable_lock);
1314 
1315 	mutex_enter(&vp->v_lock);
1316 	/*
1317 	 * Drop the temporary hold by vn_rele now.  If other holds
	 * remain, the vnode is still in use and nothing is torn down.
1318 	 */
1319 	if (--vp->v_count != 0) {
1320 		mutex_exit(&vp->v_lock);
1321 		mutex_exit(&stable_lock);
1322 		return;
1323 	}
1324 	mutex_exit(&vp->v_lock);
1325 
1326 	sdelete(sp);
1327 	mutex_exit(&stable_lock);
1328 
1329 	/* We are the sole owner of sp now */
1330 	cvp = sp->s_commonvp;
1331 	rvp = sp->s_realvp;
1332 
1333 	if (rvp) {
1334 		/*
1335 		 * If the snode times changed, then update the times
1336 		 * associated with the "realvp".
1337 		 */
1338 		if ((sp->s_flag & (SACC|SUPD|SCHG)) != 0) {
1339 
1340 			struct vattr va, vatmp;
1341 
1342 			mutex_enter(&sp->s_lock);
1343 			sp->s_flag &= ~(SACC|SUPD|SCHG);
1344 			mutex_exit(&sp->s_lock);
1345 			vatmp.va_mask = AT_ATIME|AT_MTIME;
1346 			/*
1347 			 * The user may not own the device, but we
1348 			 * want to update the attributes anyway.
			 * Hence kcred rather than the caller's cr.
1349 			 */
1350 			if (VOP_GETATTR(rvp, &vatmp, 0, kcred) == 0) {
				/* Keep the later of the two times. */
1351 				if (vatmp.va_atime.tv_sec > sp->s_atime)
1352 					va.va_atime = vatmp.va_atime;
1353 				else {
1354 					va.va_atime.tv_sec = sp->s_atime;
1355 					va.va_atime.tv_nsec = 0;
1356 				}
1357 				if (vatmp.va_mtime.tv_sec > sp->s_mtime)
1358 					va.va_mtime = vatmp.va_mtime;
1359 				else {
1360 					va.va_mtime.tv_sec = sp->s_mtime;
1361 					va.va_mtime.tv_nsec = 0;
1362 				}
1363 
1364 				va.va_mask = AT_ATIME|AT_MTIME;
1365 				(void) VOP_SETATTR(rvp, &va, 0, kcred, NULL);
1366 			}
1367 		}
1368 	}
1369 	ASSERT(!vn_has_cached_data(vp));
1370 	vn_invalid(vp);
1371 
1372 	/* if we are sharing another file system's vfs, release it */
1373 	if (vp->v_vfsp && (vp->v_vfsp != &spec_vfs))
1374 		VFS_RELE(vp->v_vfsp);
1375 
1376 	/* if we have a realvp, release the realvp */
1377 	if (rvp)
1378 		VN_RELE(rvp);
1379 
1380 	/* if we have a common, release the common */
1381 	if (cvp && (cvp != vp)) {
1382 		VN_RELE(cvp);
1383 #ifdef DEBUG
1384 	} else if (cvp) {
1385 		/*
1386 		 * if this is the last reference to a common vnode, any
1387 		 * associated stream had better have been closed
1388 		 */
1389 		ASSERT(cvp == vp);
1390 		ASSERT(cvp->v_stream == NULL);
1391 #endif /* DEBUG */
1392 	}
1393 
1394 	/*
1395 	 * if we have a hold on a devinfo node (established by
1396 	 * spec_assoc_vp_with_devi), release the hold
1397 	 */
1398 	if (sp->s_dip)
1399 		ddi_release_devi(sp->s_dip);
1400 
1401 	/*
1402 	 * If we have an associated device policy, release it.
1403 	 */
1404 	if (sp->s_plcy != NULL)
1405 		dpfree(sp->s_plcy);
1406 
1407 	/*
1408 	 * If all holds on the devinfo node are through specfs/devfs
1409 	 * and we just destroyed the last specfs node associated with the
1410 	 * device, then the devinfo node reference count should now be
1411 	 * zero.  We can't check this because there may be other holds
1412 	 * on the node from non file system sources: ddi_hold_devi_by_instance
1413 	 * for example.
1414 	 */
1415 	kmem_cache_free(snode_cache, sp);
1416 }
1417 
1418 static int
1419 spec_fid(struct vnode *vp, struct fid *fidp)
1420 {
1421 	struct vnode *realvp;
1422 	struct snode *sp = VTOS(vp);
1423 
1424 	if ((realvp = sp->s_realvp) != NULL)
1425 		return (VOP_FID(realvp, fidp));
1426 	else
1427 		return (EINVAL);
1428 }
1429 
1430 /*ARGSUSED1*/
1431 static int
1432 spec_seek(struct vnode *vp, offset_t ooff, offset_t *noffp)
1433 {
1434 	offset_t maxoff = spec_maxoffset(vp);
1435 
1436 	if (maxoff == -1 || *noffp <= maxoff)
1437 		return (0);
1438 	else
1439 		return (EINVAL);
1440 }
1441 
1442 static int
1443 spec_frlock(
1444 	struct vnode *vp,
1445 	int		cmd,
1446 	struct flock64	*bfp,
1447 	int		flag,
1448 	offset_t	offset,
1449 	struct flk_callback *flk_cbp,
1450 	struct cred	*cr)
1451 {
1452 	struct snode *sp = VTOS(vp);
1453 	struct snode *csp;
1454 
1455 	csp = VTOS(sp->s_commonvp);
1456 	/*
1457 	 * If file is being mapped, disallow frlock.
1458 	 */
1459 	if (csp->s_mapcnt > 0)
1460 		return (EAGAIN);
1461 
1462 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr));
1463 }
1464 
1465 static int
1466 spec_realvp(struct vnode *vp, struct vnode **vpp)
1467 {
1468 	struct vnode *rvp;
1469 
1470 	if ((rvp = VTOS(vp)->s_realvp) != NULL) {
1471 		vp = rvp;
1472 		if (VOP_REALVP(vp, &rvp) == 0)
1473 			vp = rvp;
1474 	}
1475 
1476 	*vpp = vp;
1477 	return (0);
1478 }
1479 
1480 /*
1481  * Return all the pages from [off..off + len] in block
1482  * or character device.
1483  */
1484 static int
1485 spec_getpage(
1486 	struct vnode	*vp,
1487 	offset_t	off,
1488 	size_t		len,
1489 	uint_t		*protp,
1490 	page_t		*pl[],
1491 	size_t		plsz,
1492 	struct seg	*seg,
1493 	caddr_t		addr,
1494 	enum seg_rw	rw,
1495 	struct cred	*cr)
1496 {
1497 	struct snode *sp = VTOS(vp);
1498 	int err;
1499 
1500 	ASSERT(sp->s_commonvp == vp);
1501 
1502 	/*
1503 	 * XXX	Given the above assertion, this might not do
1504 	 *	what is wanted here.
1505 	 */
1506 	if (vp->v_flag & VNOMAP)
1507 		return (ENOSYS);
1508 	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_GETPAGE,
1509 		"specfs getpage:vp %p off %llx len %ld snode %p",
1510 		vp, off, len, sp);
1511 
1512 	switch (vp->v_type) {
1513 	case VBLK:
1514 		if (protp != NULL)
1515 			*protp = PROT_ALL;
1516 
1517 		if (((u_offset_t)off + len) > (SPEC_SIZE(sp) + PAGEOFFSET))
1518 			return (EFAULT);	/* beyond EOF */
1519 
1520 		if (len <= PAGESIZE)
1521 			err = spec_getapage(vp, (u_offset_t)off, len, protp, pl,
1522 			    plsz, seg, addr, rw, cr);
1523 		else
1524 			err = pvn_getpages(spec_getapage, vp, (u_offset_t)off,
1525 			    len, protp, pl, plsz, seg, addr, rw, cr);
1526 		break;
1527 
1528 	case VCHR:
1529 		cmn_err(CE_NOTE, "spec_getpage called for character device. "
1530 		    "Check any non-ON consolidation drivers");
1531 		err = 0;
1532 		pl[0] = (page_t *)0;
1533 		break;
1534 
1535 	default:
1536 		panic("spec_getpage: bad v_type 0x%x", vp->v_type);
1537 		/*NOTREACHED*/
1538 	}
1539 
1540 	return (err);
1541 }
1542 
/*
 * Kluster size used by the paging routines in this file: spec_getapage()
 * uses it when reading ahead and spec_putapage() when writing.
 */
1543 extern int klustsize;	/* set in machdep.c */
1544 
/* Read-ahead switch: spec_getapage() reads ahead only when non-zero. */
1545 int spec_ra = 1;
1546 int spec_lostpage;	/* number of times we lost original page */
1547 
/*
 * Get the page at offset "off" of vp and return it through pl.
 *
 * If the page is not in the page cache it is either created outright
 * (rw == S_CREATE) or read in, together with its kluster, through
 * pvn_read_kluster() and spec_startio().  When read-ahead applies (spec_ra
 * is set, the device size is known and this access starts at s_nextr),
 * the following kluster is started asynchronously as well.
 *
 * With pl == NULL the read is issued with B_ASYNC and the function
 * returns without waiting.  Otherwise it waits for the read, returning
 * the biowait() error if there is one, and fills in pl.  If a page that
 * was found in the cache has vanished by the time it is locked, the
 * whole operation restarts.
 */
1548 /*ARGSUSED2*/
1549 static int
1550 spec_getapage(
1551 	struct vnode *vp,
1552 	u_offset_t	off,
1553 	size_t		len,
1554 	uint_t		*protp,
1555 	page_t		*pl[],
1556 	size_t		plsz,
1557 	struct seg	*seg,
1558 	caddr_t		addr,
1559 	enum seg_rw	rw,
1560 	struct cred	*cr)
1561 {
1562 	struct snode *sp;
1563 	struct buf *bp;
1564 	page_t *pp, *pp2;
1565 	u_offset_t io_off1, io_off2;
1566 	size_t io_len1;
1567 	size_t io_len2;
1568 	size_t blksz;
1569 	u_offset_t blkoff;
1570 	int dora, err;
1571 	page_t *pagefound;
1572 	uint_t xlen;
1573 	size_t adj_klustsize;
1574 	u_offset_t size;
1575 	u_offset_t tmpoff;
1576 
1577 	sp = VTOS(vp);
1578 	TRACE_3(TR_FAC_SPECFS, TR_SPECFS_GETAPAGE,
1579 		"specfs getapage:vp %p off %llx snode %p", vp, off, sp);
	/* Restart point when a cached page disappears before it is locked. */
1580 reread:
1581 
1582 	err = 0;
1583 	bp = NULL;
1584 	pp = NULL;
1585 	pp2 = NULL;
1586 
1587 	if (pl != NULL)
1588 		pl[0] = NULL;
1589 
1590 	size = SPEC_SIZE(VTOS(sp->s_commonvp));
1591 
	/*
	 * Read ahead only when it is enabled and this access starts
	 * exactly where the previous one ended (s_nextr).
	 */
1592 	if (spec_ra && sp->s_nextr == off)
1593 		dora = 1;
1594 	else
1595 		dora = 0;
1596 
	/* An unknown device size rules out read-ahead and klustering. */
1597 	if (size == UNKNOWN_SIZE) {
1598 		dora = 0;
1599 		adj_klustsize = PAGESIZE;
1600 	} else {
1601 		adj_klustsize = dora ? klustsize : PAGESIZE;
1602 	}
1603 
	/* Retry point when pvn_read_kluster() returns no page. */
1604 again:
1605 	if ((pagefound = page_exists(vp, off)) == NULL) {
1606 		if (rw == S_CREATE) {
1607 			/*
1608 			 * We're allocating a swap slot and its
1609 			 * associated page was not found, so allocate
1610 			 * and return it.
1611 			 */
1612 			if ((pp = page_create_va(vp, off,
1613 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1614 				panic("spec_getapage: page_create");
1615 				/*NOTREACHED*/
1616 			}
1617 			io_len1 = PAGESIZE;
1618 			sp->s_nextr = off + PAGESIZE;
1619 		} else {
1620 			/*
1621 			 * Need to really do disk I/O to get the page(s).
			 * The kluster starts at "off" rounded down to a
			 * multiple of adj_klustsize and is trimmed so that
			 * it does not run past the end of the device.
1622 			 */
1623 			blkoff = (off / adj_klustsize) * adj_klustsize;
1624 			if (size == UNKNOWN_SIZE) {
1625 				blksz = PAGESIZE;
1626 			} else {
1627 				if (blkoff + adj_klustsize <= size)
1628 					blksz = adj_klustsize;
1629 				else
1630 					blksz =
1631 					    MIN(size - blkoff, adj_klustsize);
1632 			}
1633 
1634 			pp = pvn_read_kluster(vp, off, seg, addr, &tmpoff,
1635 			    &io_len1, blkoff, blksz, 0);
1636 			io_off1 = tmpoff;
1637 			/*
1638 			 * Make sure the page didn't sneak into the
1639 			 * cache while we blocked in pvn_read_kluster.
1640 			 */
1641 			if (pp == NULL)
1642 				goto again;
1643 
1644 			/*
1645 			 * Zero part of page which we are not
1646 			 * going to be reading from disk now.
1647 			 */
1648 			xlen = (uint_t)(io_len1 & PAGEOFFSET);
1649 			if (xlen != 0)
1650 				pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
1651 
			/* No page list means nobody waits: read async. */
1652 			bp = spec_startio(vp, pp, io_off1, io_len1,
1653 			    pl == NULL ? (B_ASYNC | B_READ) : B_READ);
1654 			sp->s_nextr = io_off1 + io_len1;
1655 		}
1656 	}
1657 
	/* Start read-ahead of the kluster following the one holding off. */
1658 	if (dora && rw != S_CREATE) {
1659 		u_offset_t off2;
1660 		caddr_t addr2;
1661 
1662 		off2 = ((off / adj_klustsize) + 1) * adj_klustsize;
1663 		addr2 = addr + (off2 - off);
1664 
1665 		pp2 = NULL;
1666 		/*
1667 		 * If we are past EOF then don't bother trying
1668 		 * with read-ahead.
1669 		 */
1670 		if (off2 >= size)
1671 			pp2 = NULL;
1672 		else {
1673 			if (off2 + adj_klustsize <= size)
1674 				blksz = adj_klustsize;
1675 			else
1676 				blksz = MIN(size - off2, adj_klustsize);
1677 
1678 			pp2 = pvn_read_kluster(vp, off2, seg, addr2, &tmpoff,
1679 			    &io_len2, off2, blksz, 1);
1680 			io_off2 = tmpoff;
1681 		}
1682 
1683 		if (pp2 != NULL) {
1684 			/*
1685 			 * Zero part of page which we are not
1686 			 * going to be reading from disk now.
1687 			 */
1688 			xlen = (uint_t)(io_len2 & PAGEOFFSET);
1689 			if (xlen != 0)
1690 				pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);
1691 
			/* The read-ahead buf is never waited for here. */
1692 			(void) spec_startio(vp, pp2, io_off2, io_len2,
1693 			    B_READ | B_ASYNC);
1694 		}
1695 	}
1696 
	/* No page list to fill in: return without waiting for the I/O. */
1697 	if (pl == NULL)
1698 		return (err);
1699 
	/* Wait for the synchronous read started above, if any. */
1700 	if (bp != NULL) {
1701 		err = biowait(bp);
1702 		pageio_done(bp);
1703 
1704 		if (err) {
1705 			if (pp != NULL)
1706 				pvn_read_done(pp, B_ERROR);
1707 			return (err);
1708 		}
1709 	}
1710 
1711 	if (pagefound) {
1712 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
1713 		/*
1714 		 * Page exists in the cache, acquire the appropriate
1715 		 * lock.  If this fails, start all over again.
1716 		 */
1717 
1718 		if ((pp = page_lookup(vp, off, se)) == NULL) {
1719 			spec_lostpage++;
1720 			goto reread;
1721 		}
1722 		pl[0] = pp;
1723 		pl[1] = NULL;
1724 
1725 		sp->s_nextr = off + PAGESIZE;
1726 		return (0);
1727 	}
1728 
	/* Hand the freshly created or read pages back through pl. */
1729 	if (pp != NULL)
1730 		pvn_plist_init(pp, pl, plsz, off, io_len1, rw);
1731 	return (0);
1732 }
1733 
1734 /*
1735  * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED, B_FORCE}.
1736  * If len == 0, do from off to EOF.
1737  *
1738  * The normal cases should be len == 0 & off == 0 (entire vp list),
1739  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
1740  * (from pageout).
1741  */
1742 int
1743 spec_putpage(
1744 	struct vnode *vp,
1745 	offset_t	off,
1746 	size_t		len,
1747 	int		flags,
1748 	struct cred	*cr)
1749 {
1750 	struct snode *sp = VTOS(vp);
1751 	struct vnode *cvp;
1752 	page_t *pp;
1753 	u_offset_t io_off;
1754 	size_t io_len = 0;	/* for lint */
1755 	int err = 0;
1756 	u_offset_t size;
1757 	u_offset_t tmpoff;
1758 
1759 	ASSERT(vp->v_count != 0);
1760 
1761 	if (vp->v_flag & VNOMAP)
1762 		return (ENOSYS);
1763 
1764 	cvp = sp->s_commonvp;
1765 	size = SPEC_SIZE(VTOS(cvp));
1766 
1767 	if (!vn_has_cached_data(vp) || off >= size)
1768 		return (0);
1769 
1770 	ASSERT(vp->v_type == VBLK && cvp == vp);
1771 	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTPAGE,
1772 		"specfs putpage:vp %p off %llx len %ld snode %p",
1773 		vp, off, len, sp);
1774 
1775 	if (len == 0) {
1776 		/*
1777 		 * Search the entire vp list for pages >= off.
1778 		 */
1779 		err = pvn_vplist_dirty(vp, off, spec_putapage,
1780 		    flags, cr);
1781 	} else {
1782 		u_offset_t eoff;
1783 
1784 		/*
1785 		 * Loop over all offsets in the range [off...off + len]
1786 		 * looking for pages to deal with.  We set limits so
1787 		 * that we kluster to klustsize boundaries.
1788 		 */
1789 		eoff = off + len;
1790 		for (io_off = off; io_off < eoff && io_off < size;
1791 		    io_off += io_len) {
1792 			/*
1793 			 * If we are not invalidating, synchronously
1794 			 * freeing or writing pages use the routine
1795 			 * page_lookup_nowait() to prevent reclaiming
1796 			 * them from the free list.
1797 			 */
1798 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
1799 				pp = page_lookup(vp, io_off,
1800 					(flags & (B_INVAL | B_FREE)) ?
1801 					    SE_EXCL : SE_SHARED);
1802 			} else {
1803 				pp = page_lookup_nowait(vp, io_off,
1804 					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
1805 			}
1806 
1807 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
1808 				io_len = PAGESIZE;
1809 			else {
1810 				err = spec_putapage(vp, pp, &tmpoff, &io_len,
1811 				    flags, cr);
1812 				io_off = tmpoff;
1813 				if (err != 0)
1814 					break;
1815 				/*
1816 				 * "io_off" and "io_len" are returned as
1817 				 * the range of pages we actually wrote.
1818 				 * This allows us to skip ahead more quickly
1819 				 * since several pages may've been dealt
1820 				 * with by this iteration of the loop.
1821 				 */
1822 			}
1823 		}
1824 	}
1825 	return (err);
1826 }
1827 
1828 
1829 /*
1830  * Write out a single page, possibly klustering adjacent
1831  * dirty pages.
1832  */
1833 /*ARGSUSED5*/
1834 static int
1835 spec_putapage(
1836 	struct vnode	*vp,
1837 	page_t		*pp,
1838 	u_offset_t	*offp,		/* return value */
1839 	size_t		*lenp,		/* return value */
1840 	int		flags,
1841 	struct cred	*cr)
1842 {
1843 	struct snode *sp = VTOS(vp);
1844 	u_offset_t io_off;
1845 	size_t io_len;
1846 	size_t blksz;
1847 	u_offset_t blkoff;
1848 	int err = 0;
1849 	struct buf *bp;
1850 	u_offset_t size;
1851 	size_t adj_klustsize;
1852 	u_offset_t tmpoff;
1853 
1854 	/*
1855 	 * Destroy read ahead value since we are really going to write.
1856 	 */
1857 	sp->s_nextr = 0;
1858 	size = SPEC_SIZE(VTOS(sp->s_commonvp));
1859 
1860 	adj_klustsize = klustsize;
1861 
1862 	blkoff = (pp->p_offset / adj_klustsize) * adj_klustsize;
1863 
1864 	if (blkoff + adj_klustsize <= size)
1865 		blksz = adj_klustsize;
1866 	else
1867 		blksz = size - blkoff;
1868 
1869 	/*
1870 	 * Find a kluster that fits in one contiguous chunk.
1871 	 */
1872 	pp = pvn_write_kluster(vp, pp, &tmpoff, &io_len, blkoff,
1873 		blksz, flags);
1874 	io_off = tmpoff;
1875 
1876 	/*
1877 	 * Check for page length rounding problems
1878 	 * XXX - Is this necessary?
1879 	 */
1880 	if (io_off + io_len > size) {
1881 		ASSERT((io_off + io_len) - size < PAGESIZE);
1882 		io_len = size - io_off;
1883 	}
1884 
1885 	bp = spec_startio(vp, pp, io_off, io_len, B_WRITE | flags);
1886 
1887 	/*
1888 	 * Wait for i/o to complete if the request is not B_ASYNC.
1889 	 */
1890 	if ((flags & B_ASYNC) == 0) {
1891 		err = biowait(bp);
1892 		pageio_done(bp);
1893 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
1894 	}
1895 
1896 	if (offp)
1897 		*offp = io_off;
1898 	if (lenp)
1899 		*lenp = io_len;
1900 	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTAPAGE,
1901 		"specfs putapage:vp %p offp %p snode %p err %d",
1902 		vp, offp, sp, err);
1903 	return (err);
1904 }
1905 
1906 /*
1907  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
1908  */
1909 static struct buf *
1910 spec_startio(
1911 	struct vnode *vp,
1912 	page_t		*pp,
1913 	u_offset_t	io_off,
1914 	size_t		io_len,
1915 	int		flags)
1916 {
1917 	struct buf *bp;
1918 
1919 	bp = pageio_setup(pp, io_len, vp, flags);
1920 
1921 	bp->b_edev = vp->v_rdev;
1922 	bp->b_dev = cmpdev(vp->v_rdev);
1923 	bp->b_blkno = btodt(io_off);
1924 	bp->b_un.b_addr = (caddr_t)0;
1925 
1926 	(void) bdev_strategy(bp);
1927 
1928 	if (flags & B_READ)
1929 		lwp_stat_update(LWP_STAT_INBLK, 1);
1930 	else
1931 		lwp_stat_update(LWP_STAT_OUBLK, 1);
1932 
1933 	return (bp);
1934 }
1935 
1936 static int
1937 spec_poll(
1938 	struct vnode	*vp,
1939 	short		events,
1940 	int		anyyet,
1941 	short		*reventsp,
1942 	struct pollhead **phpp)
1943 {
1944 	dev_t dev;
1945 	int error;
1946 
1947 	if (vp->v_type == VBLK)
1948 		error = fs_poll(vp, events, anyyet, reventsp, phpp);
1949 	else {
1950 		ASSERT(vp->v_type == VCHR);
1951 		dev = vp->v_rdev;
1952 		if (STREAMSTAB(getmajor(dev))) {
1953 			ASSERT(vp->v_stream != NULL);
1954 			error = strpoll(vp->v_stream, events, anyyet,
1955 			    reventsp, phpp);
1956 		} else if (devopsp[getmajor(dev)]->devo_cb_ops->cb_chpoll) {
1957 			error = cdev_poll(dev, events, anyyet, reventsp, phpp);
1958 		} else {
1959 			error = fs_poll(vp, events, anyyet, reventsp, phpp);
1960 		}
1961 	}
1962 	return (error);
1963 }
1964 
1965 /*
1966  * This routine is called through the cdevsw[] table to handle
1967  * traditional mmap'able devices that support a d_mmap function.
1968  */
1969 /*ARGSUSED8*/
1970 int
1971 spec_segmap(
1972 	dev_t dev,
1973 	off_t off,
1974 	struct as *as,
1975 	caddr_t *addrp,
1976 	off_t len,
1977 	uint_t prot,
1978 	uint_t maxprot,
1979 	uint_t flags,
1980 	struct cred *cred)
1981 {
1982 	struct segdev_crargs dev_a;
1983 	int (*mapfunc)(dev_t dev, off_t off, int prot);
1984 	size_t i;
1985 	int	error;
1986 
1987 	if ((mapfunc = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap) == nodev)
1988 		return (ENODEV);
1989 	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_SEGMAP,
1990 		"specfs segmap:dev %x as %p len %lx prot %x",
1991 		dev, as, len, prot);
1992 
1993 	/*
1994 	 * Character devices that support the d_mmap
1995 	 * interface can only be mmap'ed shared.
1996 	 */
1997 	if ((flags & MAP_TYPE) != MAP_SHARED)
1998 		return (EINVAL);
1999 
2000 	/*
2001 	 * Check to ensure that the entire range is
2002 	 * legal and we are not trying to map in
2003 	 * more than the device will let us.
2004 	 */
2005 	for (i = 0; i < len; i += PAGESIZE) {
2006 		if (cdev_mmap(mapfunc, dev, off + i, maxprot) == -1)
2007 			return (ENXIO);
2008 	}
2009 
2010 	as_rangelock(as);
2011 	if ((flags & MAP_FIXED) == 0) {
2012 		/*
2013 		 * Pick an address w/o worrying about
2014 		 * any vac alignment constraints.
2015 		 */
2016 		map_addr(addrp, len, (offset_t)off, 0, flags);
2017 		if (*addrp == NULL) {
2018 			as_rangeunlock(as);
2019 			return (ENOMEM);
2020 		}
2021 	} else {
2022 		/*
2023 		 * User-specified address; blow away any previous mappings.
2024 		 */
2025 		(void) as_unmap(as, *addrp, len);
2026 	}
2027 
2028 	dev_a.mapfunc = mapfunc;
2029 	dev_a.dev = dev;
2030 	dev_a.offset = off;
2031 	dev_a.prot = (uchar_t)prot;
2032 	dev_a.maxprot = (uchar_t)maxprot;
2033 	dev_a.hat_flags = 0;
2034 	dev_a.hat_attr = 0;
2035 	dev_a.devmap_data = NULL;
2036 
2037 	error = as_map(as, *addrp, len, segdev_create, &dev_a);
2038 	as_rangeunlock(as);
2039 	return (error);
2040 }
2041 
2042 int
2043 spec_char_map(
2044 	dev_t dev,
2045 	offset_t off,
2046 	struct as *as,
2047 	caddr_t *addrp,
2048 	size_t len,
2049 	uchar_t prot,
2050 	uchar_t maxprot,
2051 	uint_t flags,
2052 	struct cred *cred)
2053 {
2054 	int error = 0;
2055 	major_t maj = getmajor(dev);
2056 	int map_flag;
2057 	int (*segmap)(dev_t, off_t, struct as *,
2058 	    caddr_t *, off_t, uint_t, uint_t, uint_t, cred_t *);
2059 	int (*devmap)(dev_t, devmap_cookie_t, offset_t,
2060 		size_t, size_t *, uint_t);
2061 	int (*mmap)(dev_t dev, off_t off, int prot);
2062 
2063 	/*
2064 	 * Character device: let the device driver
2065 	 * pick the appropriate segment driver.
2066 	 *
2067 	 * 4.x compat.: allow 'NULL' cb_segmap => spec_segmap
2068 	 * Kindness: allow 'nulldev' cb_segmap => spec_segmap
2069 	 */
2070 	segmap = devopsp[maj]->devo_cb_ops->cb_segmap;
2071 	if (segmap == NULL || segmap == nulldev || segmap == nodev) {
2072 		mmap = devopsp[maj]->devo_cb_ops->cb_mmap;
2073 		map_flag = devopsp[maj]->devo_cb_ops->cb_flag;
2074 
2075 		/*
2076 		 * Use old mmap framework if the driver has both mmap
2077 		 * and devmap entry points.  This is to prevent the
2078 		 * system from calling invalid devmap entry point
2079 		 * for some drivers that might have put garbage in the
2080 		 * devmap entry point.
2081 		 */
2082 		if ((map_flag & D_DEVMAP) || mmap == NULL ||
2083 		    mmap == nulldev || mmap == nodev) {
2084 			devmap = devopsp[maj]->devo_cb_ops->cb_devmap;
2085 
2086 			/*
2087 			 * If driver provides devmap entry point in
2088 			 * cb_ops but not xx_segmap(9E), call
2089 			 * devmap_setup with default settings
2090 			 * (NULL) for callback_ops and driver
2091 			 * callback private data
2092 			 */
2093 			if (devmap == nodev || devmap == NULL ||
2094 			    devmap == nulldev)
2095 				return (ENODEV);
2096 
2097 			error = devmap_setup(dev, off, as, addrp,
2098 			    len, prot, maxprot, flags, cred);
2099 
2100 			return (error);
2101 		} else
2102 			segmap = spec_segmap;
2103 	} else
2104 		segmap = cdev_segmap;
2105 
2106 	return ((*segmap)(dev, (off_t)off, as, addrp, len, prot,
2107 	    maxprot, flags, cred));
2108 }
2109 
2110 static int
2111 spec_map(
2112 	struct vnode *vp,
2113 	offset_t off,
2114 	struct as *as,
2115 	caddr_t *addrp,
2116 	size_t len,
2117 	uchar_t prot,
2118 	uchar_t maxprot,
2119 	uint_t flags,
2120 	struct cred *cred)
2121 {
2122 	int error = 0;
2123 
2124 	if (vp->v_flag & VNOMAP)
2125 		return (ENOSYS);
2126 
2127 	/*
2128 	 * If file is locked, fail mapping attempt.
2129 	 */
2130 	if (vn_has_flocks(vp))
2131 		return (EAGAIN);
2132 
2133 	if (vp->v_type == VCHR) {
2134 		return (spec_char_map(vp->v_rdev, off, as, addrp, len, prot,
2135 		    maxprot, flags, cred));
2136 	} else if (vp->v_type == VBLK) {
2137 		struct segvn_crargs vn_a;
2138 		struct vnode *cvp;
2139 		struct snode *sp;
2140 
2141 		/*
2142 		 * Block device, use segvn mapping to the underlying commonvp
2143 		 * for pages.
2144 		 */
2145 		if (off > spec_maxoffset(vp))
2146 			return (ENXIO);
2147 
2148 		sp = VTOS(vp);
2149 		cvp = sp->s_commonvp;
2150 		ASSERT(cvp != NULL);
2151 
2152 		if (off < 0 || (off + len) < 0)
2153 			return (ENXIO);
2154 
2155 		as_rangelock(as);
2156 		if ((flags & MAP_FIXED) == 0) {
2157 			map_addr(addrp, len, off, 1, flags);
2158 			if (*addrp == NULL) {
2159 				as_rangeunlock(as);
2160 				return (ENOMEM);
2161 			}
2162 		} else {
2163 			/*
2164 			 * User-specified address; blow away any
2165 			 * previous mappings.
2166 			 */
2167 			(void) as_unmap(as, *addrp, len);
2168 		}
2169 
2170 		vn_a.vp = cvp;
2171 		vn_a.offset = off;
2172 		vn_a.type = flags & MAP_TYPE;
2173 		vn_a.prot = (uchar_t)prot;
2174 		vn_a.maxprot = (uchar_t)maxprot;
2175 		vn_a.flags = flags & ~MAP_TYPE;
2176 		vn_a.cred = cred;
2177 		vn_a.amp = NULL;
2178 		vn_a.szc = 0;
2179 		vn_a.lgrp_mem_policy_flags = 0;
2180 
2181 		error = as_map(as, *addrp, len, segvn_create, &vn_a);
2182 		as_rangeunlock(as);
2183 	} else
2184 		return (ENODEV);
2185 
2186 	return (error);
2187 }
2188 
2189 /*ARGSUSED1*/
2190 static int
2191 spec_addmap(
2192 	struct vnode *vp,	/* the common vnode */
2193 	offset_t off,
2194 	struct as *as,
2195 	caddr_t addr,
2196 	size_t len,		/* how many bytes to add */
2197 	uchar_t prot,
2198 	uchar_t maxprot,
2199 	uint_t flags,
2200 	struct cred *cred)
2201 {
2202 	int error = 0;
2203 	struct snode *csp = VTOS(vp);
2204 	ulong_t npages;
2205 
2206 	ASSERT(vp != NULL && VTOS(vp)->s_commonvp == vp);
2207 
2208 	/*
2209 	 * XXX	Given the above assertion, this might not
2210 	 *	be a particularly sensible thing to test.
2211 	 */
2212 	if (vp->v_flag & VNOMAP)
2213 		return (ENOSYS);
2214 
2215 	npages = btopr(len);
2216 	LOCK_CSP(csp);
2217 	csp->s_mapcnt += npages;
2218 
2219 	UNLOCK_CSP(csp);
2220 	return (error);
2221 }
2222 
2223 /*ARGSUSED1*/
2224 static int
2225 spec_delmap(
2226 	struct vnode *vp,	/* the common vnode */
2227 	offset_t off,
2228 	struct as *as,
2229 	caddr_t addr,
2230 	size_t len,		/* how many bytes to take away */
2231 	uint_t prot,
2232 	uint_t maxprot,
2233 	uint_t flags,
2234 	struct cred *cred)
2235 {
2236 	struct snode *csp = VTOS(vp);
2237 	ulong_t npages;
2238 	long mcnt;
2239 
2240 	/* segdev passes us the common vp */
2241 
2242 	ASSERT(vp != NULL && VTOS(vp)->s_commonvp == vp);
2243 
2244 	/*
2245 	 * XXX	Given the above assertion, this might not
2246 	 *	be a particularly sensible thing to test..
2247 	 */
2248 	if (vp->v_flag & VNOMAP)
2249 		return (ENOSYS);
2250 
2251 	npages = btopr(len);
2252 
2253 	LOCK_CSP(csp);
2254 	mutex_enter(&csp->s_lock);
2255 	mcnt = (csp->s_mapcnt -= npages);
2256 
2257 	if (mcnt == 0) {
2258 		/*
2259 		 * Call the close routine when the last reference of any
2260 		 * kind through any [s, v]node goes away.  The s_dip hold
2261 		 * on the devinfo node is released when the vnode is
2262 		 * destroyed.
2263 		 */
2264 		if (csp->s_count == 0) {
2265 			csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);
2266 
2267 			/* See comment in spec_close() */
2268 			if (csp->s_flag & (SCLONE | SSELFCLONE))
2269 				csp->s_flag &= ~SDIPSET;
2270 
2271 			mutex_exit(&csp->s_lock);
2272 
2273 			(void) device_close(vp, 0, cred);
2274 		} else
2275 			mutex_exit(&csp->s_lock);
2276 
2277 		mutex_enter(&csp->s_lock);
2278 	}
2279 	ASSERT(mcnt >= 0);
2280 
2281 	UNLOCK_CSP_LOCK_HELD(csp);
2282 	mutex_exit(&csp->s_lock);
2283 
2284 	return (0);
2285 }
2286 
2287 static int
2288 spec_dump(struct vnode *vp, caddr_t addr, int bn, int count)
2289 {
2290 	ASSERT(vp->v_type == VBLK);
2291 	return (bdev_dump(vp->v_rdev, addr, bn, count));
2292 }
2293 
2294 
2295 /*
2296  * Do i/o on the given page list from/to vp, io_off for io_len.
2297  * Flags are composed of:
2298  * 	{B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_READ, B_WRITE}
2299  * If B_ASYNC is not set i/o is waited for.
2300  */
2301 /*ARGSUSED5*/
2302 static int
2303 spec_pageio(
2304 	struct vnode *vp,
2305 	page_t	*pp,
2306 	u_offset_t io_off,
2307 	size_t	io_len,
2308 	int	flags,
2309 	cred_t	*cr)
2310 {
2311 	struct buf *bp = NULL;
2312 	int err = 0;
2313 
2314 	if (pp == NULL)
2315 		return (EINVAL);
2316 
2317 	bp = spec_startio(vp, pp, io_off, io_len, flags);
2318 
2319 	/*
2320 	 * Wait for i/o to complete if the request is not B_ASYNC.
2321 	 */
2322 	if ((flags & B_ASYNC) == 0) {
2323 		err = biowait(bp);
2324 		pageio_done(bp);
2325 	}
2326 	return (err);
2327 }
2328 
2329 /*
2330  * Set ACL on underlying vnode if one exists, or return ENOSYS otherwise.
2331  */
2332 int
2333 spec_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
2334 {
2335 	struct vnode *realvp;
2336 	struct snode *sp = VTOS(vp);
2337 	int error;
2338 
2339 	/*
2340 	 * The acl(2) system calls VOP_RWLOCK on the file before setting an
2341 	 * ACL, but since specfs does not serialize reads and writes, this
2342 	 * VOP does not do anything.  However, some backing file systems may
2343 	 * expect the lock to be held before setting an ACL, so it is taken
2344 	 * here privately to avoid serializing specfs reads and writes.
2345 	 */
2346 	if ((realvp = sp->s_realvp) != NULL) {
2347 		(void) VOP_RWLOCK(realvp, V_WRITELOCK_TRUE, NULL);
2348 		error = VOP_SETSECATTR(realvp, vsap, flag, cr);
2349 		(void) VOP_RWUNLOCK(realvp, V_WRITELOCK_TRUE, NULL);
2350 		return (error);
2351 	} else
2352 		return (fs_nosys());
2353 }
2354 
2355 /*
2356  * Get ACL from underlying vnode if one exists, or fabricate it from
2357  * the permissions returned by spec_getattr() otherwise.
2358  */
2359 int
2360 spec_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr)
2361 {
2362 	struct vnode *realvp;
2363 	struct snode *sp = VTOS(vp);
2364 
2365 	if ((realvp = sp->s_realvp) != NULL)
2366 		return (VOP_GETSECATTR(realvp, vsap, flag, cr));
2367 	else
2368 		return (fs_fab_acl(vp, vsap, flag, cr));
2369 }
2370 
2371 int
2372 spec_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
2373 {
2374 	vnode_t *realvp;
2375 	struct snode *sp = VTOS(vp);
2376 
2377 	if ((realvp = sp->s_realvp) != NULL)
2378 		return (VOP_PATHCONF(realvp, cmd, valp, cr));
2379 	else
2380 		return (fs_pathconf(vp, cmd, valp, cr));
2381 }
2382