/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/buf.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/dkio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/open.h>
#include <sys/swap.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/policy.h>
#include <sys/devpolicy.h>

#include <sys/proc.h>
#include <sys/user.h>
#include <sys/session.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/pathname.h>

#include <sys/fs/snode.h>

#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/esunddi.h>
#include <sys/autoconf.h>
#include <sys/sunndi.h>
#include <sys/contract/device_impl.h>


static int spec_open(struct vnode **, int, struct cred *, caller_context_t *);
static int spec_close(struct vnode *, int, int, offset_t, struct cred *,
	caller_context_t *);
static int spec_read(struct vnode *, struct uio *, int, struct cred *,
	caller_context_t *);
static int spec_write(struct vnode *, struct uio *, int, struct cred *,
	caller_context_t *);
static int spec_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int *,
	caller_context_t *);
static int spec_getattr(struct vnode *, struct vattr *, int, struct cred *,
	caller_context_t *);
static int spec_setattr(struct vnode *, struct vattr *, int, struct cred *,
	caller_context_t *);
static int spec_access(struct vnode *, int, int, struct cred *,
	caller_context_t *);
static int spec_create(struct vnode *, char *, vattr_t *, enum vcexcl, int,
	struct vnode **, struct cred *, int, caller_context_t *, vsecattr_t *);
static int spec_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static void spec_inactive(struct vnode *, struct cred *, caller_context_t *);
static int spec_fid(struct vnode *, struct fid *, caller_context_t *);
static int spec_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static int spec_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
	struct flk_callback *, struct cred *, caller_context_t *);
static int spec_realvp(struct vnode *, struct vnode **, caller_context_t *);

static int spec_getpage(struct vnode *, offset_t, size_t, uint_t *, page_t **,
	size_t, struct seg *, caddr_t, enum seg_rw, struct cred *,
	caller_context_t *);
static int spec_putapage(struct vnode *, page_t *, u_offset_t *, size_t *, int,
	struct cred *);
static struct buf *spec_startio(struct vnode *, page_t *, u_offset_t, size_t,
	int);
static int spec_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
	page_t **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *);
static int spec_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
	uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int spec_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
	uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static int spec_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
	uint_t, uint_t, uint_t, struct cred *, caller_context_t *);

static int spec_poll(struct vnode *, short, int, short *, struct pollhead **,
	caller_context_t *);
static int spec_dump(struct vnode *, caddr_t, offset_t, offset_t,
	caller_context_t *);
static int spec_pageio(struct vnode *, page_t *, u_offset_t, size_t, int,
	cred_t *, caller_context_t *);

static int spec_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
	caller_context_t *);
static int spec_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
	caller_context_t *);
static int spec_pathconf(struct vnode *, int, ulong_t *, struct cred *,
	caller_context_t *);

#define	SN_HOLD(csp)	{ \
	mutex_enter(&csp->s_lock); \
	csp->s_count++; \
	mutex_exit(&csp->s_lock); \
}

#define	SN_RELE(csp)	{ \
	mutex_enter(&csp->s_lock); \
	csp->s_count--; \
	ASSERT((csp->s_count > 0) || (csp->s_vnode->v_stream == NULL)); \
	mutex_exit(&csp->s_lock); \
}

#define	S_ISFENCED(sp)	((VTOS((sp)->s_commonvp))->s_flag & SFENCED)

struct vnodeops *spec_vnodeops;

/*
 * *PLEASE NOTE*: If you add new entry points to specfs, do
 * not forget to add support for fencing. A fenced snode
 * is indicated by the SFENCED flag in the common snode.
 * If a snode is fenced, determine if your entry point is
 * a configuration operation (Example: open), a detection
 * operation (Example: getattr), an I/O operation (Example: ioctl())
 * or an unconfiguration operation (Example: close). If it is
 * a configuration or detection operation, fail the operation
 * for a fenced snode with an ENXIO or EIO as appropriate. If
 * it is any other operation, let it through.
 */
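/*
 * Illustrative sketch only (not an existing entry point): a new
 * configuration or detection operation added to the template below would
 * typically begin with the same fencing check used by spec_getattr() and
 * spec_access(), e.g.
 *
 *	struct snode *sp = VTOS(vp);
 *
 *	if (S_ISFENCED(sp))
 *		return (ENXIO);
 */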

const fs_operation_def_t spec_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = spec_open },
	VOPNAME_CLOSE,		{ .vop_close = spec_close },
	VOPNAME_READ,		{ .vop_read = spec_read },
	VOPNAME_WRITE,		{ .vop_write = spec_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = spec_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = spec_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = spec_setattr },
	VOPNAME_ACCESS,		{ .vop_access = spec_access },
	VOPNAME_CREATE,		{ .vop_create = spec_create },
	VOPNAME_FSYNC,		{ .vop_fsync = spec_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = spec_inactive },
	VOPNAME_FID,		{ .vop_fid = spec_fid },
	VOPNAME_SEEK,		{ .vop_seek = spec_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = spec_pathconf },
	VOPNAME_FRLOCK,		{ .vop_frlock = spec_frlock },
	VOPNAME_REALVP,		{ .vop_realvp = spec_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = spec_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = spec_putpage },
	VOPNAME_MAP,		{ .vop_map = spec_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = spec_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = spec_delmap },
	VOPNAME_POLL,		{ .vop_poll = spec_poll },
	VOPNAME_DUMP,		{ .vop_dump = spec_dump },
	VOPNAME_PAGEIO,		{ .vop_pageio = spec_pageio },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = spec_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = spec_getsecattr },
	NULL,			NULL
};

/*
 * Return address of spec_vnodeops
 */
struct vnodeops *
spec_getvnodeops(void)
{
	return (spec_vnodeops);
}

extern vnode_t *rconsvp;

/*
 * Acquire the serial lock on the common snode.
 */
#define	LOCK_CSP(csp)			(void) spec_lockcsp(csp, 0, 1, 0)
#define	LOCKHOLD_CSP_SIG(csp)		spec_lockcsp(csp, 1, 1, 1)
#define	SYNCHOLD_CSP_SIG(csp, intr)	spec_lockcsp(csp, intr, 0, 1)

typedef enum {
	LOOP,
	INTR,
	SUCCESS
} slock_ret_t;
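/*
 * Illustrative sketch only: a caller serializing stream open/close on the
 * common snode pairs the serial-lock macros in the style of spec_clone(),
 * e.g.
 *
 *	LOCK_CSP(csp);
 *	... manipulate v_stream / the stream head ...
 *	UNLOCK_CSP(csp);
 */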

/*
 * Synchronize with an active SLOCKED snode, optionally checking for a signal
 * and optionally returning with SLOCKED set and SN_HOLD done.  The 'intr'
 * argument determines if the thread is interruptible by a signal while
 * waiting; the function returns INTR if interrupted while there is another
 * thread closing this snode and LOOP if interrupted otherwise.
 * When SUCCESS is returned the 'hold' argument determines if the open
 * count (SN_HOLD) has been incremented and the 'setlock' argument
 * determines if the function returns with SLOCKED set.
 */
static slock_ret_t
spec_lockcsp(struct snode *csp, int intr, int setlock, int hold)
{
	slock_ret_t ret = SUCCESS;

	mutex_enter(&csp->s_lock);
	while (csp->s_flag & SLOCKED) {
		csp->s_flag |= SWANT;
		if (intr) {
			if (!cv_wait_sig(&csp->s_cv, &csp->s_lock)) {
				if (csp->s_flag & SCLOSING)
					ret = INTR;
				else
					ret = LOOP;
				mutex_exit(&csp->s_lock);
				return (ret);	/* interrupted */
			}
		} else {
			cv_wait(&csp->s_cv, &csp->s_lock);
		}
	}
	if (setlock)
		csp->s_flag |= SLOCKED;
	if (hold)
		csp->s_count++;	/* one more open reference : SN_HOLD */
	mutex_exit(&csp->s_lock);
	return (ret);	/* serialized/locked */
}

/*
 * Unlock the serial lock on the common snode
 */
#define	UNLOCK_CSP_LOCK_HELD(csp)		\
	ASSERT(mutex_owned(&csp->s_lock));	\
	if (csp->s_flag & SWANT)		\
		cv_broadcast(&csp->s_cv);	\
	csp->s_flag &= ~(SWANT|SLOCKED);

#define	UNLOCK_CSP(csp)				\
	mutex_enter(&csp->s_lock);		\
	UNLOCK_CSP_LOCK_HELD(csp);		\
	mutex_exit(&csp->s_lock);

/*
 * compute/return the size of the device
 */
#define	SPEC_SIZE(csp)	\
	(((csp)->s_flag & SSIZEVALID) ? (csp)->s_size : spec_size(csp))

/*
 * Compute and return the size.  If the size in the common snode is valid then
 * return it.  If not valid then get the size from the driver and set size in
 * the common snode.  If the device has not been attached then we don't ask
 * for an update from the driver - for non-streams SSIZEVALID stays unset
 * until the device is attached.  A stat of a mknod outside /devices
 * (non-devfs) may report UNKNOWN_SIZE because the device may not be attached
 * yet (SDIPSET not established in mknod until open time).  A stat in /devices
 * will report the size correctly.  Specfs should always call SPEC_SIZE
 * instead of referring directly to s_size to initialize/retrieve the size of
 * a device.
 *
 * XXX There is an inconsistency between block and raw - "unknown" is
 * UNKNOWN_SIZE for VBLK and 0 for VCHR(raw).
 */
static u_offset_t
spec_size(struct snode *csp)
{
	struct vnode *cvp = STOV(csp);
	u_offset_t size;
	int plen;
	uint32_t size32;
	dev_t dev;
	dev_info_t *devi;
	major_t maj;
	uint_t blksize;
	int blkshift;

	ASSERT((csp)->s_commonvp == cvp);	/* must be common node */

	/* return cached value */
	mutex_enter(&csp->s_lock);
	if (csp->s_flag & SSIZEVALID) {
		mutex_exit(&csp->s_lock);
		return (csp->s_size);
	}

	/* VOP_GETATTR of mknod has not had devcnt restriction applied */
	dev = cvp->v_rdev;
	maj = getmajor(dev);
	if (maj >= devcnt) {
		/* return non-cached UNKNOWN_SIZE */
		mutex_exit(&csp->s_lock);
		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
	}

	/* establish cached zero size for streams */
	if (STREAMSTAB(maj)) {
		csp->s_size = 0;
		csp->s_flag |= SSIZEVALID;
		mutex_exit(&csp->s_lock);
		return (0);
	}

	/*
	 * Return non-cached UNKNOWN_SIZE if not open.
	 *
	 * NB: This check is bogus, calling prop_op(9E) should be gated by
	 * attach, not open. Not having this check however opens up a new
	 * context under which a driver's prop_op(9E) could be called. Calling
	 * prop_op(9E) in this new context has been shown to expose latent
	 * driver bugs (insufficient NULL pointer checks that lead to panic).
	 * We are keeping this open check for now to avoid these panics.
	 */
	if (csp->s_count == 0) {
		mutex_exit(&csp->s_lock);
		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
	}

	/* Return non-cached UNKNOWN_SIZE if not attached. */
	if (((csp->s_flag & SDIPSET) == 0) || (csp->s_dip == NULL) ||
	    !i_ddi_devi_attached(csp->s_dip)) {
		mutex_exit(&csp->s_lock);
		return ((cvp->v_type == VCHR) ? 0 : UNKNOWN_SIZE);
	}

	devi = csp->s_dip;

	/*
	 * Establish cached size obtained from the attached driver.  Since we
	 * know the devinfo node, for efficiency we use cdev_prop_op directly
	 * instead of [cb]dev_[Ss]size.
	 */
	if (cvp->v_type == VCHR) {
		size = 0;
		plen = sizeof (size);
		if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS |
		    DDI_PROP_CONSUMER_TYPED, "Size", (caddr_t)&size,
		    &plen) != DDI_PROP_SUCCESS) {
			plen = sizeof (size32);
			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
			    "size", (caddr_t)&size32, &plen) ==
			    DDI_PROP_SUCCESS)
				size = size32;
		}
	} else {
		size = UNKNOWN_SIZE;
		plen = sizeof (size);
		if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS |
		    DDI_PROP_CONSUMER_TYPED, "Nblocks", (caddr_t)&size,
		    &plen) != DDI_PROP_SUCCESS) {
			plen = sizeof (size32);
			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
			    "nblocks", (caddr_t)&size32, &plen) ==
			    DDI_PROP_SUCCESS)
				size = size32;
		}

		if (size != UNKNOWN_SIZE) {
			blksize = DEV_BSIZE;	/* default */
			plen = sizeof (blksize);

			/* try to get dev_t specific "blksize" */
			if (cdev_prop_op(dev, devi, PROP_LEN_AND_VAL_BUF,
			    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
			    "blksize", (caddr_t)&blksize, &plen) !=
			    DDI_PROP_SUCCESS) {
				/*
				 * Try for dev_info node "device-blksize".
				 * If this fails then blksize will still be
				 * DEV_BSIZE default value.
				 */
				(void) cdev_prop_op(DDI_DEV_T_ANY, devi,
				    PROP_LEN_AND_VAL_BUF,
				    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
				    "device-blksize", (caddr_t)&blksize, &plen);
			}

			/* blksize must be a power of two */
			ASSERT(BIT_ONLYONESET(blksize));
			blkshift = highbit(blksize) - 1;

			/* convert from block size to byte size */
			if (size < (MAXOFFSET_T >> blkshift))
				size = size << blkshift;
			else
				size = UNKNOWN_SIZE;
		}
	}

	csp->s_size = size;
	csp->s_flag |= SSIZEVALID;

	mutex_exit(&csp->s_lock);
	return (size);
}

/*
 * This function deals with vnode substitution in the case of
 * device cloning.
 */
static int
spec_clone(struct vnode **vpp, dev_t newdev, int vtype, struct stdata *stp)
{
	dev_t dev = (*vpp)->v_rdev;
	major_t maj = getmajor(dev);
	major_t newmaj = getmajor(newdev);
	int sysclone = (maj == clone_major);
	int qassociate_used = 0;
	struct snode *oldsp, *oldcsp;
	struct snode *newsp, *newcsp;
	struct vnode *newvp, *newcvp;
	dev_info_t *dip;
	queue_t *dq;

	ASSERT(dev != newdev);

	/*
	 * Check for cloning across different drivers.
	 * We only support this under the system provided clone driver
	 */
	if ((maj != newmaj) && !sysclone) {
		cmn_err(CE_NOTE,
		    "unsupported clone open maj = %u, newmaj = %u",
		    maj, newmaj);
		return (ENXIO);
	}

	/* old */
	oldsp = VTOS(*vpp);
	oldcsp = VTOS(oldsp->s_commonvp);

	/* new */
	newvp = makespecvp(newdev, vtype);
	ASSERT(newvp != NULL);
	newsp = VTOS(newvp);
	newcvp = newsp->s_commonvp;
	newcsp = VTOS(newcvp);

	/*
	 * Clones inherit fsid, realvp, and dip.
	 * XXX realvp inherit is not occurring, does fstat of clone work?
	 */
	newsp->s_fsid = oldsp->s_fsid;
	if (sysclone) {
		newsp->s_flag |= SCLONE;
		dip = NULL;
	} else {
		newsp->s_flag |= SSELFCLONE;
		dip = oldcsp->s_dip;
	}

	/*
	 * If we cloned to an opened newdev that already has called
	 * spec_assoc_vp_with_devi (SDIPSET set) then the association is
	 * already established.
	 */
	if (!(newcsp->s_flag & SDIPSET)) {
		/*
		 * Establish s_dip association for newdev.
		 *
		 * If we trusted the getinfo(9E) DDI_INFO_DEVT2INSTANCE
		 * implementation of all cloning drivers (SCLONE and SELFCLONE)
		 * we would always use e_ddi_hold_devi_by_dev().  We know that
		 * many drivers have had (still have?) problems with
		 * DDI_INFO_DEVT2INSTANCE, so we try to minimize reliance by
		 * detecting drivers that use QASSOCIATE (by looking down the
		 * stream) and setting their s_dip association to NULL.
		 */
		qassociate_used = 0;
		if (stp) {
			for (dq = stp->sd_wrq; dq; dq = dq->q_next) {
				if (_RD(dq)->q_flag & _QASSOCIATED) {
					qassociate_used = 1;
					dip = NULL;
					break;
				}
			}
		}

		if (dip || qassociate_used) {
			spec_assoc_vp_with_devi(newvp, dip);
		} else {
			/* derive association from newdev */
			dip = e_ddi_hold_devi_by_dev(newdev, 0);
			spec_assoc_vp_with_devi(newvp, dip);
			if (dip)
				ddi_release_devi(dip);
		}
	}

	SN_HOLD(newcsp);

	/* deal with stream stuff */
	if (stp != NULL) {
		LOCK_CSP(newcsp);	/* synchronize stream open/close */
		mutex_enter(&newcsp->s_lock);
		newcvp->v_stream = newvp->v_stream = stp;
		stp->sd_vnode = newcvp;
		stp->sd_strtab = STREAMSTAB(newmaj);
		mutex_exit(&newcsp->s_lock);
		UNLOCK_CSP(newcsp);
	}

	/* substitute the vnode */
	SN_RELE(oldcsp);
	VN_RELE(*vpp);
	*vpp = newvp;

	return (0);
}

static int
spec_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *cc)
{
	major_t maj;
	dev_t dev, newdev;
	struct vnode *vp, *cvp;
	struct snode *sp, *csp;
	struct stdata *stp;
	dev_info_t *dip;
	int error, type;
	contract_t *ct = NULL;
	int open_returns_eintr;
	slock_ret_t spec_locksp_ret;


	flag &= ~FCREAT;	/* paranoia */

	vp = *vpp;
	sp = VTOS(vp);
	ASSERT((vp->v_type == VCHR) || (vp->v_type == VBLK));
	if ((vp->v_type != VCHR) && (vp->v_type != VBLK))
		return (ENXIO);

	/*
	 * If the VFS_NODEVICES bit was set for the mount,
	 * do not allow opens of special devices.
	 */
	if (sp->s_realvp && (sp->s_realvp->v_vfsp->vfs_flag & VFS_NODEVICES))
		return (ENXIO);

	newdev = dev = vp->v_rdev;

	/*
	 * If we are opening a node that has not had spec_assoc_vp_with_devi
	 * called against it (mknod outside /devices or a non-dacf makespecvp
	 * node) then SDIPSET will not be set.
	 * In this case we call an
	 * interface which will reconstruct the path and lookup (drive attach)
	 * through devfs (e_ddi_hold_devi_by_dev -> e_ddi_hold_devi_by_path ->
	 * devfs_lookupname).  For support of broken drivers that don't call
	 * ddi_create_minor_node for all minor nodes in their instance space,
	 * we call interfaces that operate at the directory/devinfo
	 * (major/instance) level instead of to the leaf/minor node level.
	 * After finding and attaching the dip we associate it with the
	 * common specfs vnode (s_dip), which sets SDIPSET.  A DL_DETACH_REQ
	 * to style-2 stream driver may set s_dip to NULL with SDIPSET set.
	 *
	 * NOTE: Although e_ddi_hold_devi_by_dev takes a dev_t argument, its
	 * implementation operates at the major/instance level since it only
	 * needs to return a dip.
	 */
	cvp = sp->s_commonvp;
	csp = VTOS(cvp);
	if (!(csp->s_flag & SDIPSET)) {
		/* try to attach, return error if we fail */
		if ((dip = e_ddi_hold_devi_by_dev(dev, 0)) == NULL)
			return (ENXIO);

		/* associate dip with the common snode s_dip */
		spec_assoc_vp_with_devi(vp, dip);
		ddi_release_devi(dip);	/* from e_ddi_hold_devi_by_dev */
	}

	/* check if device fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

#ifdef DEBUG
	/* verify attach/open exclusion guarantee */
	dip = csp->s_dip;
	ASSERT((dip == NULL) || i_ddi_devi_attached(dip));
#endif /* DEBUG */

	if ((error = secpolicy_spec_open(cr, vp, flag)) != 0)
		return (error);

	/* Verify existence of open(9E) implementation. */
	maj = getmajor(dev);
	if ((maj >= devcnt) ||
	    (devopsp[maj]->devo_cb_ops == NULL) ||
	    (devopsp[maj]->devo_cb_ops->cb_open == NULL))
		return (ENXIO);

	/* split streams vs. non-streams */
	if (STREAMSTAB(maj))
		goto streams_open;

	/*
	 * Wait for in progress last close to complete.  This guarantees
	 * to the driver writer that we will never be in the driver's
	 * open and close on the same (dev_t, otype) at the same time.
	 * Open count already incremented (SN_HOLD) on non-zero return.
	 * The wait is interruptible by a signal if the driver sets the
	 * D_OPEN_RETURNS_EINTR cb_ops(9S) cb_flag or sets the
	 * ddi-open-returns-eintr(9P) property in its driver.conf.
	 */
	if ((devopsp[maj]->devo_cb_ops->cb_flag & D_OPEN_RETURNS_EINTR) ||
	    (devnamesp[maj].dn_flags & DN_OPEN_RETURNS_EINTR))
		open_returns_eintr = 1;
	else
		open_returns_eintr = 0;
	while ((spec_locksp_ret = SYNCHOLD_CSP_SIG(csp, open_returns_eintr)) !=
	    SUCCESS) {
		if (spec_locksp_ret == INTR)
			return (EINTR);
	}

	/* non streams open */
	type = (vp->v_type == VBLK ? OTYP_BLK : OTYP_CHR);
	error = dev_open(&newdev, flag, type, cr);

	/* deal with clone case */
	if (error == 0 && dev != newdev) {
		error = spec_clone(vpp, newdev, vp->v_type, NULL);
		/*
		 * bail on clone failure, further processing
		 * results in undefined behaviors.
		 */
		if (error != 0)
			return (error);
		sp = VTOS(*vpp);
		csp = VTOS(sp->s_commonvp);
	}

	/*
	 * create contracts only for userland opens
	 * Successful open and cloning is done at this point.
	 */
	if (error == 0 && !(flag & FKLYR)) {
		int spec_type;
		spec_type = (STOV(csp)->v_type == VCHR) ?
		    S_IFCHR : S_IFBLK;
		if (contract_device_open(newdev, spec_type, NULL) != 0) {
			error = EIO;
		}
	}

	if (error == 0) {
		sp->s_size = SPEC_SIZE(csp);

		if ((csp->s_flag & SNEEDCLOSE) == 0) {
			int nmaj = getmajor(newdev);
			mutex_enter(&csp->s_lock);
			/* successful open needs a close later */
			csp->s_flag |= SNEEDCLOSE;

			/*
			 * Invalidate possible cached "unknown" size
			 * established by a VOP_GETATTR while open was in
			 * progress, and the driver might fail prop_op(9E).
			 */
			if (((cvp->v_type == VCHR) && (csp->s_size == 0)) ||
			    ((cvp->v_type == VBLK) &&
			    (csp->s_size == UNKNOWN_SIZE)))
				csp->s_flag &= ~SSIZEVALID;

			if (devopsp[nmaj]->devo_cb_ops->cb_flag & D_64BIT)
				csp->s_flag |= SLOFFSET;
			if (devopsp[nmaj]->devo_cb_ops->cb_flag & D_U64BIT)
				csp->s_flag |= SLOFFSET | SANYOFFSET;
			mutex_exit(&csp->s_lock);
		}
		return (0);
	}

	/*
	 * Open failed. If we missed a close operation because
	 * we were trying to get the device open and it is the
	 * last in progress open that is failing then call close.
	 *
	 * NOTE: Only non-streams open has this race condition.
	 */
	mutex_enter(&csp->s_lock);
	csp->s_count--;			/* decrement open count : SN_RELE */
	if ((csp->s_count == 0) &&	/* no outstanding open */
	    (csp->s_mapcnt == 0) &&	/* no mapping */
	    (csp->s_flag & SNEEDCLOSE)) {	/* need a close */
		csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

		/* See comment in spec_close() */
		if (csp->s_flag & (SCLONE | SSELFCLONE))
			csp->s_flag &= ~SDIPSET;

		csp->s_flag |= SCLOSING;
		mutex_exit(&csp->s_lock);

		ASSERT(*vpp != NULL);
		(void) device_close(*vpp, flag, cr);

		mutex_enter(&csp->s_lock);
		csp->s_flag &= ~SCLOSING;
		mutex_exit(&csp->s_lock);
	} else {
		mutex_exit(&csp->s_lock);
	}
	return (error);

streams_open:
	if (vp->v_type != VCHR)
		return (ENXIO);

	/*
	 * Lock common snode to prevent any new clone opens on this
	 * stream while one is in progress. This is necessary since
	 * the stream currently associated with the clone device will
	 * not be part of it after the clone open completes. Unfortunately
	 * we don't know in advance if this is a clone
	 * device so we have to lock all opens.
	 *
	 * If we fail, it's because of an interrupt - EINTR return is an
	 * expected aspect of opening a stream so we don't need to check
	 * D_OPEN_RETURNS_EINTR. Open count already incremented (SN_HOLD)
	 * on non-zero return.
	 */
	if (LOCKHOLD_CSP_SIG(csp) != SUCCESS)
		return (EINTR);

	error = stropen(cvp, &newdev, flag, cr);
	stp = cvp->v_stream;

	/* deal with the clone case */
	if ((error == 0) && (dev != newdev)) {
		vp->v_stream = cvp->v_stream = NULL;
		UNLOCK_CSP(csp);
		error = spec_clone(vpp, newdev, vp->v_type, stp);
		/*
		 * bail on clone failure, further processing
		 * results in undefined behaviors.
		 */
		if (error != 0)
			return (error);
		sp = VTOS(*vpp);
		csp = VTOS(sp->s_commonvp);
	} else if (error == 0) {
		vp->v_stream = stp;
		UNLOCK_CSP(csp);
	}

	/*
	 * create contracts only for userland opens
	 * Successful open and cloning is done at this point.
	 */
	if (error == 0 && !(flag & FKLYR)) {
		/* STREAM is of type S_IFCHR */
		if (contract_device_open(newdev, S_IFCHR, &ct) != 0) {
			UNLOCK_CSP(csp);
			(void) spec_close(vp, flag, 1, 0, cr, cc);
			return (EIO);
		}
	}

	if (error == 0) {
		/* STREAMS devices don't have a size */
		sp->s_size = csp->s_size = 0;

		if (!(stp->sd_flag & STRISTTY) || (flag & FNOCTTY))
			return (0);

		/* try to allocate it as a controlling terminal */
		if (strctty(stp) != EINTR)
			return (0);

		/* strctty() was interrupted by a signal */
		if (ct) {
			/* we only create contracts for userland opens */
			ASSERT(ttoproc(curthread));
			(void) contract_abandon(ct, ttoproc(curthread), 0);
		}
		(void) spec_close(vp, flag, 1, 0, cr, cc);
		return (EINTR);
	}

	/*
	 * Deal with stropen failure.
	 *
	 * sd_flag in the stream head cannot change since the
	 * common snode is locked before the call to stropen().
	 */
	if ((stp != NULL) && (stp->sd_flag & STREOPENFAIL)) {
		/*
		 * Open failed part way through.
		 */
		mutex_enter(&stp->sd_lock);
		stp->sd_flag &= ~STREOPENFAIL;
		mutex_exit(&stp->sd_lock);

		UNLOCK_CSP(csp);
		(void) spec_close(vp, flag, 1, 0, cr, cc);
	} else {
		UNLOCK_CSP(csp);
		SN_RELE(csp);
	}

	return (error);
}

/*ARGSUSED2*/
static int
spec_close(
	struct vnode *vp,
	int flag,
	int count,
	offset_t offset,
	struct cred *cr,
	caller_context_t *ct)
{
	struct vnode *cvp;
	struct snode *sp, *csp;
	enum vtype type;
	dev_t dev;
	int error = 0;
	int sysclone;

	if (!(flag & FKLYR)) {
		/* this only applies to closes of devices from userland */
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
		if (vp->v_stream)
			strclean(vp);
	}
	if (count > 1)
		return (0);

	/* we allow close to succeed even if device is fenced off */
	sp = VTOS(vp);
	cvp = sp->s_commonvp;

	dev = sp->s_dev;
	type = vp->v_type;

	ASSERT(type == VCHR || type == VBLK);

	/*
	 * Prevent close/close and close/open races by serializing closes
	 * on this common snode. Clone opens are held up until after
	 * we have closed this device so the streams linkage is maintained
	 */
	csp = VTOS(cvp);

	LOCK_CSP(csp);
	mutex_enter(&csp->s_lock);

	csp->s_count--;		/* one fewer open reference : SN_RELE */
	sysclone = sp->s_flag & SCLONE;

	/*
	 * Invalidate size on each close.
	 *
	 * XXX We do this on each close because we don't have interfaces that
	 * allow a driver to invalidate the size.  Since clearing this on each
	 * close causes property overhead, we skip /dev/null and
	 * /dev/zero to avoid degrading kenbus performance.
	 */
	if (getmajor(dev) != mm_major)
		csp->s_flag &= ~SSIZEVALID;

	/*
	 * Only call the close routine when the last open reference through
	 * any [s, v]node goes away.  This can be checked by looking at
	 * s_count on the common vnode.
	 */
	if ((csp->s_count == 0) && (csp->s_mapcnt == 0)) {
		/* we don't need a close */
		csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

		/*
		 * A cloning driver may open-clone to the same dev_t that we
		 * are closing before spec_inactive destroys the common snode.
		 * If this occurs the s_dip association needs to be reevaluated.
		 * We clear SDIPSET to force reevaluation in this case.  When
		 * reevaluation occurs (by spec_clone after open), if the
		 * devinfo association has changed then the old association
		 * will be released as the new association is established by
		 * spec_assoc_vp_with_devi().
		 */
		if (csp->s_flag & (SCLONE | SSELFCLONE))
			csp->s_flag &= ~SDIPSET;

		csp->s_flag |= SCLOSING;
		mutex_exit(&csp->s_lock);
		error = device_close(vp, flag, cr);

		/*
		 * Decrement the devops held in clnopen()
		 */
		if (sysclone) {
			ddi_rele_driver(getmajor(dev));
		}
		mutex_enter(&csp->s_lock);
		csp->s_flag &= ~SCLOSING;
	}

	UNLOCK_CSP_LOCK_HELD(csp);
	mutex_exit(&csp->s_lock);

	return (error);
}

/*ARGSUSED2*/
static int
spec_read(
	struct vnode *vp,
	struct uio *uiop,
	int ioflag,
	struct cred *cr,
	caller_context_t *ct)
{
	int error;
	struct snode *sp = VTOS(vp);
	dev_t dev = sp->s_dev;
	size_t n;
	ulong_t on;
	u_offset_t bdevsize;
	offset_t maxoff;
	offset_t off;
	struct vnode *blkvp;

	ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

	if (STREAMSTAB(getmajor(dev))) {	/* stream */
		ASSERT(vp->v_type == VCHR);
		smark(sp, SACC);
		return (strread(vp, uiop, cr));
	}

	if (uiop->uio_resid == 0)
		return (0);

	/*
	 * Plain old character devices that set D_U64BIT can have
	 * unrestricted offsets.
	 */
	maxoff = spec_maxoffset(vp);
	ASSERT(maxoff != -1 || vp->v_type == VCHR);

	if (maxoff != -1 && (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > maxoff))
		return (EINVAL);

	if (vp->v_type == VCHR) {
		smark(sp, SACC);
		ASSERT(STREAMSTAB(getmajor(dev)) == 0);
		return (cdev_read(dev, uiop, cr));
	}

	/*
	 * Block device.
	 */
	error = 0;
	blkvp = sp->s_commonvp;
	bdevsize = SPEC_SIZE(VTOS(blkvp));

	do {
		caddr_t base;
		offset_t diff;

		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = (size_t)(uiop->uio_loffset & MAXBOFFSET);
		n = (size_t)MIN(MAXBSIZE - on, uiop->uio_resid);
		diff = bdevsize - uiop->uio_loffset;

		if (diff <= 0)
			break;
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			error = vpm_data_copy(blkvp, (u_offset_t)(off + on),
			    n, uiop, 1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, blkvp,
			    (u_offset_t)(off + on), n, 1, S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}
		if (!error) {
			int flags = 0;
			/*
			 * If we read a whole block, we won't need this
			 * buffer again soon.
			 */
			if (n + on == MAXBSIZE)
				flags = SM_DONTNEED | SM_FREE;
			if (vpm_enable) {
				error = vpm_sync_pages(blkvp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(blkvp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			if (bdevsize == UNKNOWN_SIZE) {
				error = 0;
				break;
			}
		}
	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}

/*ARGSUSED*/
static int
spec_write(
	struct vnode *vp,
	struct uio *uiop,
	int ioflag,
	struct cred *cr,
	caller_context_t *ct)
{
	int error;
	struct snode *sp = VTOS(vp);
	dev_t dev = sp->s_dev;
	size_t n;
	ulong_t on;
	u_offset_t bdevsize;
	offset_t maxoff;
	offset_t off;
	struct vnode *blkvp;

	ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);

	if (STREAMSTAB(getmajor(dev))) {
		ASSERT(vp->v_type == VCHR);
		smark(sp, SUPD);
		return (strwrite(vp, uiop, cr));
	}

	/*
	 * Plain old character devices that set D_U64BIT can have
	 * unrestricted offsets.
	 */
	maxoff = spec_maxoffset(vp);
	ASSERT(maxoff != -1 || vp->v_type == VCHR);

	if (maxoff != -1 && (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > maxoff))
		return (EINVAL);

	if (vp->v_type == VCHR) {
		smark(sp, SUPD);
		ASSERT(STREAMSTAB(getmajor(dev)) == 0);
		return (cdev_write(dev, uiop, cr));
	}

	if (uiop->uio_resid == 0)
		return (0);

	error = 0;
	blkvp = sp->s_commonvp;
	bdevsize = SPEC_SIZE(VTOS(blkvp));

	do {
		int pagecreate;
		int newpage;
		caddr_t base;
		offset_t diff;

		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = (ulong_t)(uiop->uio_loffset & MAXBOFFSET);
		n = (size_t)MIN(MAXBSIZE - on, uiop->uio_resid);
		pagecreate = 0;

		diff = bdevsize - uiop->uio_loffset;
		if (diff <= 0) {
			error = ENXIO;
			break;
		}
		if (diff < n)
			n = (size_t)diff;

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to end of the device
		 * from the beginning of the mapping.
		 */
		if (n == MAXBSIZE || (on == 0 && (off + n) == bdevsize))
			pagecreate = 1;

		newpage = 0;

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)n, uiop);

		if (vpm_enable) {
			error = vpm_data_copy(blkvp, (u_offset_t)(off + on),
			    n, uiop, !pagecreate, NULL, 0, S_WRITE);
		} else {
			base = segmap_getmapflt(segkmap, blkvp,
			    (u_offset_t)(off + on), n, !pagecreate, S_WRITE);

			/*
			 * segmap_pagecreate() returns 1 if it calls
			 * page_create_va() to allocate any pages.
			 */

			if (pagecreate)
				newpage = segmap_pagecreate(segkmap, base + on,
				    n, 0);

			error = uiomove(base + on, n, UIO_WRITE, uiop);
		}

		if (!vpm_enable && pagecreate &&
		    uiop->uio_loffset <
		    P2ROUNDUP_TYPED(off + on + n, PAGESIZE, offset_t)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This can happen if we write to the end of the device
			 * or if we had some sort of error during the uiomove.
			 */
			long nzero;
			offset_t nmoved;

			nmoved = (uiop->uio_loffset - (off + on));
			if (nmoved < 0 || nmoved > n) {
				panic("spec_write: nmoved bogus");
				/*NOTREACHED*/
			}
			nzero = (long)P2ROUNDUP(on + n, PAGESIZE) -
			    (on + nmoved);
			if (nzero < 0 || (on + nmoved + nzero > MAXBSIZE)) {
				panic("spec_write: nzero bogus");
				/*NOTREACHED*/
			}
			(void) kzero(base + on + nmoved, (size_t)nzero);
		}

		/*
		 * Unlock the pages which have been allocated by
		 * page_create_va() in segmap_pagecreate().
		 */
		if (!vpm_enable && newpage)
			segmap_pageunlock(segkmap, base + on,
			    (size_t)n, S_WRITE);

		if (error == 0) {
			int flags = 0;

			/*
			 * Force write back for synchronous write cases.
			 */
			if (ioflag & (FSYNC|FDSYNC))
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 * Push swap files here, since it
				 * won't happen anywhere else.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			}
			smark(sp, SUPD|SCHG);
			if (vpm_enable) {
				error = vpm_sync_pages(blkvp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(blkvp, off, n, SM_INVAL);
			} else {
				(void) segmap_release(segkmap, base, SM_INVAL);
			}
		}

	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}

/*ARGSUSED6*/
static int
spec_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, struct cred *cr,
    int *rvalp, caller_context_t *ct)
{
	struct snode *sp;
	dev_t dev;
	int error;

	if (vp->v_type != VCHR)
		return (ENOTTY);

	/*
	 * allow ioctls() to go through even for fenced snodes, as they
	 * may include unconfiguration operation - for example popping of
	 * streams modules.
	 */

	sp = VTOS(vp);
	dev = sp->s_dev;
	if (STREAMSTAB(getmajor(dev))) {
		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
	} else {
		error = cdev_ioctl(dev, cmd, arg, mode, cr, rvalp);
	}
	return (error);
}

static int
spec_getattr(
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	int error;
	struct snode *sp;
	struct vnode *realvp;

	/* With ATTR_COMM we will not get attributes from realvp */
	if (flags & ATTR_COMM) {
		sp = VTOS(vp);
		vp = sp->s_commonvp;
	}
	sp = VTOS(vp);

	/* we want stat() to fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	realvp = sp->s_realvp;

	if (realvp == NULL) {
		static int snode_shift = 0;

		/*
		 * Calculate the amount of bitshift to a snode pointer which
		 * will still keep it unique.  See below.
		 */
		if (snode_shift == 0)
			snode_shift = highbit(sizeof (struct snode));
		ASSERT(snode_shift > 0);

		/*
		 * No real vnode behind this one.  Fill in the fields
		 * from the snode.
		 *
		 * This code should be refined to return only the
		 * attributes asked for instead of all of them.
		 */
		vap->va_type = vp->v_type;
		vap->va_mode = 0;
		vap->va_uid = vap->va_gid = 0;
		vap->va_fsid = sp->s_fsid;

		/*
		 * If the va_nodeid is > MAX_USHORT, then i386 stats might
		 * fail. So we shift down the snode pointer to try and get
		 * the most uniqueness into 16-bits.
		 */
		vap->va_nodeid = ((ino64_t)(uintptr_t)sp >> snode_shift) &
		    0xFFFF;
		vap->va_nlink = 0;
		vap->va_rdev = sp->s_dev;

		/*
		 * va_nblocks is the number of 512 byte blocks used to store
		 * the mknod for the device, not the number of blocks on the
		 * device itself.  This is typically zero since the mknod is
		 * represented directly in the inode itself.
		 */
		vap->va_nblocks = 0;
	} else {
		error = VOP_GETATTR(realvp, vap, flags, cr, ct);
		if (error != 0)
			return (error);
	}

	/* set the size from the snode */
	vap->va_size = SPEC_SIZE(VTOS(sp->s_commonvp));
	vap->va_blksize = MAXBSIZE;

	mutex_enter(&sp->s_lock);
	vap->va_atime.tv_sec = sp->s_atime;
	vap->va_mtime.tv_sec = sp->s_mtime;
	vap->va_ctime.tv_sec = sp->s_ctime;
	mutex_exit(&sp->s_lock);

	vap->va_atime.tv_nsec = 0;
	vap->va_mtime.tv_nsec = 0;
	vap->va_ctime.tv_nsec = 0;
	vap->va_seq = 0;

	return (0);
}

static int
spec_setattr(
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	struct vnode *realvp;
	int error;

	/* fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	if (vp->v_type == VCHR && vp->v_stream && (vap->va_mask & AT_SIZE)) {
		/*
		 * 1135080: O_TRUNC should have no effect on
		 * named pipes and terminal devices.
		 */
		ASSERT(vap->va_mask == AT_SIZE);
		return (0);
	}

	if ((realvp = sp->s_realvp) == NULL)
		error = 0;	/* no real vnode to update */
	else
		error = VOP_SETATTR(realvp, vap, flags, cr, ct);
	if (error == 0) {
		/*
		 * If times were changed, update snode.
		 */
		mutex_enter(&sp->s_lock);
		if (vap->va_mask & AT_ATIME)
			sp->s_atime = vap->va_atime.tv_sec;
		if (vap->va_mask & AT_MTIME) {
			sp->s_mtime = vap->va_mtime.tv_sec;
			sp->s_ctime = gethrestime_sec();
		}
		mutex_exit(&sp->s_lock);
	}
	return (error);
}

static int
spec_access(
	struct vnode *vp,
	int mode,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct vnode *realvp;
	struct snode *sp = VTOS(vp);

	/* fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	if ((realvp = sp->s_realvp) != NULL)
		return (VOP_ACCESS(realvp, mode, flags, cr, ct));
	else
		return (0);	/* Allow all access. */
}

/*
 * This can be called if creat or an open with O_CREAT is done on the root
 * of a lofs mount where the mounted entity is a special file.
 */
/*ARGSUSED*/
static int
spec_create(
	struct vnode *dvp,
	char *name,
	vattr_t *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	struct cred *cr,
	int flag,
	caller_context_t *ct,
	vsecattr_t *vsecp)
{
	int error;
	struct snode *sp = VTOS(dvp);

	/* fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	ASSERT(dvp && (dvp->v_flag & VROOT) && *name == '\0');
	if (excl == NONEXCL) {
		if (mode && (error = spec_access(dvp, mode, 0, cr, ct)))
			return (error);
		VN_HOLD(dvp);
		return (0);
	}
	return (EEXIST);
}

/*
 * In order to sync out the snode times without multi-client problems,
 * make sure the times written out are never earlier than the times
 * already set in the vnode.
 */
static int
spec_fsync(
	struct vnode *vp,
	int syncflag,
	struct cred *cr,
	caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	struct vnode *realvp;
	struct vnode *cvp;
	struct vattr va, vatmp;

	/* allow syncing even if device is fenced off */

	/* If times didn't change, don't flush anything. */
	mutex_enter(&sp->s_lock);
	if ((sp->s_flag & (SACC|SUPD|SCHG)) == 0 && vp->v_type != VBLK) {
		mutex_exit(&sp->s_lock);
		return (0);
	}
	sp->s_flag &= ~(SACC|SUPD|SCHG);
	mutex_exit(&sp->s_lock);
	cvp = sp->s_commonvp;
	realvp = sp->s_realvp;

	if (vp->v_type == VBLK && cvp != vp && vn_has_cached_data(cvp) &&
	    (cvp->v_flag & VISSWAP) == 0)
		(void) VOP_PUTPAGE(cvp, (offset_t)0, 0, 0, cr, ct);

	/*
	 * For devices that support it, force write cache to stable storage.
	 * We don't need the lock to check s_flags since we can treat
	 * SNOFLUSH as a hint.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
	    !(sp->s_flag & SNOFLUSH)) {
		int rval, rc;
		struct dk_callback spec_callback;

		spec_callback.dkc_flag = FLUSH_VOLATILE;
		spec_callback.dkc_callback = NULL;

		/* synchronous flush on volatile cache */
		rc = cdev_ioctl(vp->v_rdev, DKIOCFLUSHWRITECACHE,
		    (intptr_t)&spec_callback, FNATIVE|FKIOCTL, cr, &rval);

		if (rc == ENOTSUP || rc == ENOTTY) {
			mutex_enter(&sp->s_lock);
			sp->s_flag |= SNOFLUSH;
			mutex_exit(&sp->s_lock);
		}
	}

	/*
	 * If no real vnode to update, don't flush anything.
	 */
	if (realvp == NULL)
		return (0);

	vatmp.va_mask = AT_ATIME|AT_MTIME;
	if (VOP_GETATTR(realvp, &vatmp, 0, cr, ct) == 0) {

		mutex_enter(&sp->s_lock);
		if (vatmp.va_atime.tv_sec > sp->s_atime)
			va.va_atime = vatmp.va_atime;
		else {
			va.va_atime.tv_sec = sp->s_atime;
			va.va_atime.tv_nsec = 0;
		}
		if (vatmp.va_mtime.tv_sec > sp->s_mtime)
			va.va_mtime = vatmp.va_mtime;
		else {
			va.va_mtime.tv_sec = sp->s_mtime;
			va.va_mtime.tv_nsec = 0;
		}
		mutex_exit(&sp->s_lock);

		va.va_mask = AT_ATIME|AT_MTIME;
		(void) VOP_SETATTR(realvp, &va, 0, cr, ct);
	}
	(void) VOP_FSYNC(realvp, syncflag, cr, ct);
	return (0);
}

/*ARGSUSED*/
static void
spec_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	struct vnode *cvp;
	struct vnode *rvp;

	/*
	 * If no one has reclaimed the vnode, remove from the
	 * cache now.
	 */
	if (vp->v_count < 1) {
		panic("spec_inactive: Bad v_count");
		/*NOTREACHED*/
	}
	mutex_enter(&stable_lock);

	mutex_enter(&vp->v_lock);
	/*
	 * Drop the temporary hold by vn_rele now
	 */
	if (--vp->v_count != 0) {
		mutex_exit(&vp->v_lock);
		mutex_exit(&stable_lock);
		return;
	}
	mutex_exit(&vp->v_lock);

	sdelete(sp);
	mutex_exit(&stable_lock);

	/* We are the sole owner of sp now */
	cvp = sp->s_commonvp;
	rvp = sp->s_realvp;

	if (rvp) {
		/*
		 * If the snode times changed, then update the times
		 * associated with the "realvp".
		 */
		if ((sp->s_flag & (SACC|SUPD|SCHG)) != 0) {

			struct vattr va, vatmp;

			mutex_enter(&sp->s_lock);
			sp->s_flag &= ~(SACC|SUPD|SCHG);
			mutex_exit(&sp->s_lock);
			vatmp.va_mask = AT_ATIME|AT_MTIME;
			/*
			 * The user may not own the device, but we
			 * want to update the attributes anyway.
			 */
			if (VOP_GETATTR(rvp, &vatmp, 0, kcred, ct) == 0) {
				if (vatmp.va_atime.tv_sec > sp->s_atime)
					va.va_atime = vatmp.va_atime;
				else {
					va.va_atime.tv_sec = sp->s_atime;
					va.va_atime.tv_nsec = 0;
				}
				if (vatmp.va_mtime.tv_sec > sp->s_mtime)
					va.va_mtime = vatmp.va_mtime;
				else {
					va.va_mtime.tv_sec = sp->s_mtime;
					va.va_mtime.tv_nsec = 0;
				}

				va.va_mask = AT_ATIME|AT_MTIME;
				(void) VOP_SETATTR(rvp, &va, 0, kcred, ct);
			}
		}
	}
	ASSERT(!vn_has_cached_data(vp));
	vn_invalid(vp);

	/* if we are sharing another file system's vfs, release it */
	if (vp->v_vfsp && (vp->v_vfsp != &spec_vfs))
		VFS_RELE(vp->v_vfsp);

	/* if we have a realvp, release the realvp */
	if (rvp)
		VN_RELE(rvp);

	/* if we have a common, release the common */
	if (cvp && (cvp != vp)) {
		VN_RELE(cvp);
#ifdef DEBUG
	} else if (cvp) {
		/*
		 * if this is the last reference to a common vnode, any
		 * associated stream had better have been closed
		 */
		ASSERT(cvp == vp);
		ASSERT(cvp->v_stream == NULL);
#endif /* DEBUG */
	}

	/*
	 * if we have a hold on a devinfo node (established by
	 * spec_assoc_vp_with_devi), release the hold
	 */
	if (sp->s_dip)
		ddi_release_devi(sp->s_dip);

	/*
	 * If we have an associated device policy, release it.
	 */
	if (sp->s_plcy != NULL)
		dpfree(sp->s_plcy);

	/*
	 * If all holds on the devinfo node are through specfs/devfs
	 * and we just destroyed the last specfs node associated with the
	 * device, then the devinfo node reference count should now be
	 * zero.  We can't check this because there may be other holds
	 * on the node from non file system sources: ddi_hold_devi_by_instance
	 * for example.
	 */
	kmem_cache_free(snode_cache, sp);
}

static int
spec_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct vnode *realvp;
	struct snode *sp = VTOS(vp);

	if ((realvp = sp->s_realvp) != NULL)
		return (VOP_FID(realvp, fidp, ct));
	else
		return (EINVAL);
}

/*ARGSUSED1*/
static int
spec_seek(
	struct vnode *vp,
	offset_t ooff,
	offset_t *noffp,
	caller_context_t *ct)
{
	offset_t maxoff = spec_maxoffset(vp);

	if (maxoff == -1 || *noffp <= maxoff)
		return (0);
	else
		return (EINVAL);
}

static int
spec_frlock(
	struct vnode *vp,
	int cmd,
	struct flock64 *bfp,
	int flag,
	offset_t offset,
	struct flk_callback *flk_cbp,
	struct cred *cr,
	caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	struct snode *csp;

	csp = VTOS(sp->s_commonvp);
	/*
	 * If file is being mapped, disallow frlock.
	 */
	if (csp->s_mapcnt > 0)
		return (EAGAIN);

	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}

static int
spec_realvp(struct vnode *vp, struct vnode **vpp, caller_context_t *ct)
{
	struct vnode *rvp;

	if ((rvp = VTOS(vp)->s_realvp) != NULL) {
		vp = rvp;
		if (VOP_REALVP(vp, &rvp, ct) == 0)
			vp = rvp;
	}

	*vpp = vp;
	return (0);
}

/*
 * Return all the pages from [off..off + len] in block
 * or character device.
 */
/*ARGSUSED*/
static int
spec_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	int err;

	ASSERT(sp->s_commonvp == vp);

	/*
	 * XXX Given the above assertion, this might not do
	 * what is wanted here.
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_GETPAGE,
	    "specfs getpage:vp %p off %llx len %ld snode %p",
	    vp, off, len, sp);

	switch (vp->v_type) {
	case VBLK:
		if (protp != NULL)
			*protp = PROT_ALL;

		if (((u_offset_t)off + len) > (SPEC_SIZE(sp) + PAGEOFFSET))
			return (EFAULT);	/* beyond EOF */

		if (len <= PAGESIZE)
			err = spec_getapage(vp, (u_offset_t)off, len, protp,
			    pl, plsz, seg, addr, rw, cr);
		else
			err = pvn_getpages(spec_getapage, vp, (u_offset_t)off,
			    len, protp, pl, plsz, seg, addr, rw, cr);
		break;

	case VCHR:
		cmn_err(CE_NOTE, "spec_getpage called for character device. "
		    "Check any non-ON consolidation drivers");
		err = 0;
		pl[0] = (page_t *)0;
		break;

	default:
		panic("spec_getpage: bad v_type 0x%x", vp->v_type);
		/*NOTREACHED*/
	}

	return (err);
}

extern int klustsize;	/* set in machdep.c */

int spec_ra = 1;
int spec_lostpage;	/* number of times we lost original page */

/*ARGSUSED2*/
static int
spec_getapage(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr)
{
	struct snode *sp;
	struct buf *bp;
	page_t *pp, *pp2;
	u_offset_t io_off1, io_off2;
	size_t io_len1;
	size_t io_len2;
	size_t blksz;
	u_offset_t blkoff;
	int dora, err;
	page_t *pagefound;
	uint_t xlen;
	size_t adj_klustsize;
	u_offset_t size;
	u_offset_t tmpoff;

	sp = VTOS(vp);
	TRACE_3(TR_FAC_SPECFS, TR_SPECFS_GETAPAGE,
	    "specfs getapage:vp %p off %llx snode %p", vp, off, sp);
reread:

	err = 0;
	bp = NULL;
	pp = NULL;
	pp2 = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	size = SPEC_SIZE(VTOS(sp->s_commonvp));

	if (spec_ra && sp->s_nextr == off)
		dora = 1;
	else
		dora = 0;

	if (size == UNKNOWN_SIZE) {
		dora = 0;
		adj_klustsize = PAGESIZE;
	} else {
		adj_klustsize = dora ? klustsize : PAGESIZE;
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (rw == S_CREATE) {
			/*
			 * We're allocating a swap slot and its
			 * associated page was not found, so allocate
			 * and return it.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
				panic("spec_getapage: page_create");
				/*NOTREACHED*/
			}
			io_len1 = PAGESIZE;
			sp->s_nextr = off + PAGESIZE;
		} else {
			/*
			 * Need to really do disk I/O to get the page(s).
			 */
			blkoff = (off / adj_klustsize) * adj_klustsize;
			if (size == UNKNOWN_SIZE) {
				blksz = PAGESIZE;
			} else {
				if (blkoff + adj_klustsize <= size)
					blksz = adj_klustsize;
				else
					blksz =
					    MIN(size - blkoff, adj_klustsize);
			}

			pp = pvn_read_kluster(vp, off, seg, addr, &tmpoff,
			    &io_len1, blkoff, blksz, 0);
			io_off1 = tmpoff;
			/*
			 * Make sure the page didn't sneak into the
			 * cache while we blocked in pvn_read_kluster.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Zero part of page which we are not
			 * going to be reading from disk now.
			 */
			xlen = (uint_t)(io_len1 & PAGEOFFSET);
			if (xlen != 0)
				pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

			bp = spec_startio(vp, pp, io_off1, io_len1,
			    pl == NULL ? (B_ASYNC | B_READ) : B_READ);
			sp->s_nextr = io_off1 + io_len1;
		}
	}

	if (dora && rw != S_CREATE) {
		u_offset_t off2;
		caddr_t addr2;

		off2 = ((off / adj_klustsize) + 1) * adj_klustsize;
		addr2 = addr + (off2 - off);

		pp2 = NULL;
		/*
		 * If we are past EOF then don't bother trying
		 * with read-ahead.
		 */
		if (off2 >= size)
			pp2 = NULL;
		else {
			if (off2 + adj_klustsize <= size)
				blksz = adj_klustsize;
			else
				blksz = MIN(size - off2, adj_klustsize);

			pp2 = pvn_read_kluster(vp, off2, seg, addr2, &tmpoff,
			    &io_len2, off2, blksz, 1);
			io_off2 = tmpoff;
		}

		if (pp2 != NULL) {
			/*
			 * Zero part of page which we are not
			 * going to be reading from disk now.
			 */
			xlen = (uint_t)(io_len2 & PAGEOFFSET);
			if (xlen != 0)
				pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);

			(void) spec_startio(vp, pp2, io_off2, io_len2,
			    B_READ | B_ASYNC);
		}
	}

	if (pl == NULL)
		return (err);

	if (bp != NULL) {
		err = biowait(bp);
		pageio_done(bp);

		if (err) {
			if (pp != NULL)
				pvn_read_done(pp, B_ERROR);
			return (err);
		}
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
		/*
		 * Page exists in the cache, acquire the appropriate
		 * lock.  If this fails, start all over again.
		 */

		if ((pp = page_lookup(vp, off, se)) == NULL) {
			spec_lostpage++;
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;

		sp->s_nextr = off + PAGESIZE;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len1, rw);
	return (0);
}

/*
 * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED, B_FORCE}.
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 & off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */
/*ARGSUSED5*/
int
spec_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	struct vnode *cvp;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len = 0;	/* for lint */
	int err = 0;
	u_offset_t size;
	u_offset_t tmpoff;

	ASSERT(vp->v_count != 0);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	cvp = sp->s_commonvp;
	size = SPEC_SIZE(VTOS(cvp));

	if (!vn_has_cached_data(vp) || off >= size)
		return (0);

	ASSERT(vp->v_type == VBLK && cvp == vp);
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTPAGE,
	    "specfs putpage:vp %p off %llx len %ld snode %p",
	    vp, off, len, sp);

	if (len == 0) {
		/*
		 * Search the entire vp list for pages >= off.
		 */
		err = pvn_vplist_dirty(vp, off, spec_putapage,
		    flags, cr);
	} else {
		u_offset_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.  We set limits so
		 * that we kluster to klustsize boundaries.
		 */
		eoff = off + len;
		for (io_off = off; io_off < eoff && io_off < size;
		    io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
			} else {
				pp = page_lookup_nowait(vp, io_off,
				    (flags & B_FREE) ?

/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED, B_FORCE}.
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 & off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */
/*ARGSUSED5*/
int
spec_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct snode *sp = VTOS(vp);
	struct vnode *cvp;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len = 0;	/* for lint */
	int err = 0;
	u_offset_t size;
	u_offset_t tmpoff;

	ASSERT(vp->v_count != 0);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	cvp = sp->s_commonvp;
	size = SPEC_SIZE(VTOS(cvp));

	if (!vn_has_cached_data(vp) || off >= size)
		return (0);

	ASSERT(vp->v_type == VBLK && cvp == vp);
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTPAGE,
	    "specfs putpage:vp %p off %llx len %ld snode %p",
	    vp, off, len, sp);

	if (len == 0) {
		/*
		 * Search the entire vp list for pages >= off.
		 */
		err = pvn_vplist_dirty(vp, off, spec_putapage,
		    flags, cr);
	} else {
		u_offset_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.  We set limits so
		 * that we kluster to klustsize boundaries.
		 */
		eoff = off + len;
		for (io_off = off; io_off < eoff && io_off < size;
		    io_off += io_len) {
			/*
			 * If we are not invalidating pages and are not
			 * synchronously freeing or writing them, use
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
			} else {
				pp = page_lookup_nowait(vp, io_off,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
			}

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				io_len = PAGESIZE;
			else {
				err = spec_putapage(vp, pp, &tmpoff, &io_len,
				    flags, cr);
				io_off = tmpoff;
				if (err != 0)
					break;
				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
		}
	}
	return (err);
}
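
/*
 * Note that the loop above advances by whatever spec_putapage() reports
 * back through io_len: if a single dirty page was klustered into, say, a
 * 56K write (an illustrative figure; see spec_putapage() below), the next
 * iteration resumes after the whole kluster instead of after one page.
 */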

/*
 * Write out a single page, possibly klustering adjacent
 * dirty pages.
 */
/*ARGSUSED5*/
static int
spec_putapage(
	struct vnode *vp,
	page_t *pp,
	u_offset_t *offp,		/* return value */
	size_t *lenp,			/* return value */
	int flags,
	struct cred *cr)
{
	struct snode *sp = VTOS(vp);
	u_offset_t io_off;
	size_t io_len;
	size_t blksz;
	u_offset_t blkoff;
	int err = 0;
	struct buf *bp;
	u_offset_t size;
	size_t adj_klustsize;
	u_offset_t tmpoff;

	/*
	 * Destroy read ahead value since we are really going to write.
	 */
	sp->s_nextr = 0;
	size = SPEC_SIZE(VTOS(sp->s_commonvp));

	adj_klustsize = klustsize;

	blkoff = (pp->p_offset / adj_klustsize) * adj_klustsize;

	if (blkoff + adj_klustsize <= size)
		blksz = adj_klustsize;
	else
		blksz = size - blkoff;

	/*
	 * Find a kluster that fits in one contiguous chunk.
	 */
	pp = pvn_write_kluster(vp, pp, &tmpoff, &io_len, blkoff,
	    blksz, flags);
	io_off = tmpoff;

	/*
	 * Check for page length rounding problems
	 * XXX - Is this necessary?
	 */
	if (io_off + io_len > size) {
		ASSERT((io_off + io_len) - size < PAGESIZE);
		io_len = size - io_off;
	}

	bp = spec_startio(vp, pp, io_off, io_len, B_WRITE | flags);

	/*
	 * Wait for i/o to complete if the request is not B_ASYNC.
	 */
	if ((flags & B_ASYNC) == 0) {
		err = biowait(bp);
		pageio_done(bp);
		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
	}

	if (offp)
		*offp = io_off;
	if (lenp)
		*lenp = io_len;
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_PUTAPAGE,
	    "specfs putapage:vp %p offp %p snode %p err %d",
	    vp, offp, sp, err);
	return (err);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static struct buf *
spec_startio(
	struct vnode *vp,
	page_t *pp,
	u_offset_t io_off,
	size_t io_len,
	int flags)
{
	struct buf *bp;

	bp = pageio_setup(pp, io_len, vp, flags);

	bp->b_edev = vp->v_rdev;
	bp->b_dev = cmpdev(vp->v_rdev);
	bp->b_blkno = btodt(io_off);
	bp->b_un.b_addr = (caddr_t)0;

	(void) bdev_strategy(bp);

	if (flags & B_READ)
		lwp_stat_update(LWP_STAT_INBLK, 1);
	else
		lwp_stat_update(LWP_STAT_OUBLK, 1);

	return (bp);
}
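
/*
 * spec_startio() drives the transfer directly through bdev_strategy();
 * b_blkno is io_off expressed in DEV_BSIZE disk blocks, so with the usual
 * 512-byte DEV_BSIZE a page-aligned io_off of 8192 becomes block 16.  Each
 * call also charges one block-input or block-output operation to the
 * issuing LWP via lwp_stat_update().
 */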

static int
spec_poll(
	struct vnode *vp,
	short events,
	int anyyet,
	short *reventsp,
	struct pollhead **phpp,
	caller_context_t *ct)
{
	dev_t dev;
	int error;

	if (vp->v_type == VBLK)
		error = fs_poll(vp, events, anyyet, reventsp, phpp, ct);
	else {
		ASSERT(vp->v_type == VCHR);
		dev = vp->v_rdev;
		if (STREAMSTAB(getmajor(dev))) {
			ASSERT(vp->v_stream != NULL);
			error = strpoll(vp->v_stream, events, anyyet,
			    reventsp, phpp);
		} else if (devopsp[getmajor(dev)]->devo_cb_ops->cb_chpoll) {
			error = cdev_poll(dev, events, anyyet, reventsp, phpp);
		} else {
			error = fs_poll(vp, events, anyyet, reventsp, phpp, ct);
		}
	}
	return (error);
}

/*
 * This routine is called through the cdevsw[] table to handle
 * traditional mmap'able devices that support a d_mmap function.
 */
/*ARGSUSED8*/
int
spec_segmap(
	dev_t dev,
	off_t off,
	struct as *as,
	caddr_t *addrp,
	off_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	struct cred *cred)
{
	struct segdev_crargs dev_a;
	int (*mapfunc)(dev_t dev, off_t off, int prot);
	size_t i;
	int error;

	if ((mapfunc = devopsp[getmajor(dev)]->devo_cb_ops->cb_mmap) == nodev)
		return (ENODEV);
	TRACE_4(TR_FAC_SPECFS, TR_SPECFS_SEGMAP,
	    "specfs segmap:dev %x as %p len %lx prot %x",
	    dev, as, len, prot);

	/*
	 * Character devices that support the d_mmap
	 * interface can only be mmap'ed shared.
	 */
	if ((flags & MAP_TYPE) != MAP_SHARED)
		return (EINVAL);

	/*
	 * Check to ensure that the entire range is
	 * legal and we are not trying to map in
	 * more than the device will let us.
	 */
	for (i = 0; i < len; i += PAGESIZE) {
		if (cdev_mmap(mapfunc, dev, off + i, maxprot) == -1)
			return (ENXIO);
	}

	as_rangelock(as);
	/* Pick an address w/o worrying about any vac alignment constraints. */
	error = choose_addr(as, addrp, len, off, ADDR_NOVACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		return (error);
	}

	dev_a.mapfunc = mapfunc;
	dev_a.dev = dev;
	dev_a.offset = off;
	dev_a.prot = (uchar_t)prot;
	dev_a.maxprot = (uchar_t)maxprot;
	dev_a.hat_flags = 0;
	dev_a.hat_attr = 0;
	dev_a.devmap_data = NULL;

	error = as_map(as, *addrp, len, segdev_create, &dev_a);
	as_rangeunlock(as);
	return (error);
}
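
/*
 * spec_char_map() below picks the mapping framework for a character
 * device roughly as follows:
 *
 *	cb_segmap provided		-> cdev_segmap (driver's segmap)
 *	no cb_segmap, and D_DEVMAP set
 *	or no usable cb_mmap		-> devmap_setup(), provided the
 *					   driver supplies cb_devmap
 *					   (ENODEV otherwise)
 *	no cb_segmap, cb_mmap present,
 *	D_DEVMAP not set		-> spec_segmap (old d_mmap path)
 */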

int
spec_char_map(
	dev_t dev,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	struct cred *cred)
{
	int error = 0;
	major_t maj = getmajor(dev);
	int map_flag;
	int (*segmap)(dev_t, off_t, struct as *,
	    caddr_t *, off_t, uint_t, uint_t, uint_t, cred_t *);
	int (*devmap)(dev_t, devmap_cookie_t, offset_t,
	    size_t, size_t *, uint_t);
	int (*mmap)(dev_t dev, off_t off, int prot);

	/*
	 * Character device: let the device driver
	 * pick the appropriate segment driver.
	 *
	 * 4.x compat.: allow 'NULL' cb_segmap => spec_segmap
	 * Kindness: allow 'nulldev' cb_segmap => spec_segmap
	 */
	segmap = devopsp[maj]->devo_cb_ops->cb_segmap;
	if (segmap == NULL || segmap == nulldev || segmap == nodev) {
		mmap = devopsp[maj]->devo_cb_ops->cb_mmap;
		map_flag = devopsp[maj]->devo_cb_ops->cb_flag;

		/*
		 * Use the old mmap framework if the driver has both mmap
		 * and devmap entry points.  This is to prevent the
		 * system from calling an invalid devmap entry point
		 * for some drivers that might have put garbage in the
		 * devmap entry point.
		 */
		if ((map_flag & D_DEVMAP) || mmap == NULL ||
		    mmap == nulldev || mmap == nodev) {
			devmap = devopsp[maj]->devo_cb_ops->cb_devmap;

			/*
			 * If the driver provides a devmap entry point in
			 * cb_ops but not xx_segmap(9E), call
			 * devmap_setup() with default settings
			 * (NULL) for callback_ops and the driver
			 * callback private data.
			 */
			if (devmap == nodev || devmap == NULL ||
			    devmap == nulldev)
				return (ENODEV);

			error = devmap_setup(dev, off, as, addrp,
			    len, prot, maxprot, flags, cred);

			return (error);
		} else
			segmap = spec_segmap;
	} else
		segmap = cdev_segmap;

	return ((*segmap)(dev, (off_t)off, as, addrp, len, prot,
	    maxprot, flags, cred));
}

/*ARGSUSED9*/
static int
spec_map(
	struct vnode *vp,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	struct cred *cred,
	caller_context_t *ct)
{
	int error = 0;
	struct snode *sp = VTOS(vp);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/* fail map with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	/*
	 * If the file is locked, fail the mapping attempt.
	 */
	if (vn_has_flocks(vp))
		return (EAGAIN);

	if (vp->v_type == VCHR) {
		return (spec_char_map(vp->v_rdev, off, as, addrp, len, prot,
		    maxprot, flags, cred));
	} else if (vp->v_type == VBLK) {
		struct segvn_crargs vn_a;
		struct vnode *cvp;
		struct snode *sp;

		/*
		 * Block device, use segvn mapping to the underlying commonvp
		 * for pages.
		 */
		if (off > spec_maxoffset(vp))
			return (ENXIO);

		sp = VTOS(vp);
		cvp = sp->s_commonvp;
		ASSERT(cvp != NULL);

		if (off < 0 || ((offset_t)(off + len) < 0))
			return (ENXIO);

		as_rangelock(as);
		error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
		if (error != 0) {
			as_rangeunlock(as);
			return (error);
		}

		vn_a.vp = cvp;
		vn_a.offset = off;
		vn_a.type = flags & MAP_TYPE;
		vn_a.prot = (uchar_t)prot;
		vn_a.maxprot = (uchar_t)maxprot;
		vn_a.flags = flags & ~MAP_TYPE;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		vn_a.szc = 0;
		vn_a.lgrp_mem_policy_flags = 0;

		error = as_map(as, *addrp, len, segvn_create, &vn_a);
		as_rangeunlock(as);
	} else
		return (ENODEV);

	return (error);
}

/*ARGSUSED1*/
static int
spec_addmap(
	struct vnode *vp,	/* the common vnode */
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,		/* how many bytes to add */
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	struct cred *cred,
	caller_context_t *ct)
{
	int error = 0;
	struct snode *csp = VTOS(vp);
	ulong_t npages;

	ASSERT(vp != NULL && VTOS(vp)->s_commonvp == vp);

	/*
	 * XXX Given the above assertion, this might not
	 * be a particularly sensible thing to test.
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/* fail with EIO if the device is fenced off */
	if (S_ISFENCED(csp))
		return (EIO);

	npages = btopr(len);
	LOCK_CSP(csp);
	csp->s_mapcnt += npages;

	UNLOCK_CSP(csp);
	return (error);
}
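
/*
 * spec_addmap() and spec_delmap() together maintain s_mapcnt, the number
 * of pages of the common snode currently mapped into some address space.
 * When the last mapping goes away and the snode's open count has also
 * dropped to zero (s_count == 0), spec_delmap() drives the device close,
 * which is why segdev hands both entry points the common vnode.
 */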

/*ARGSUSED1*/
static int
spec_delmap(
	struct vnode *vp,	/* the common vnode */
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,		/* how many bytes to take away */
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	struct cred *cred,
	caller_context_t *ct)
{
	struct snode *csp = VTOS(vp);
	ulong_t npages;
	long mcnt;

	/* segdev passes us the common vp */

	ASSERT(vp != NULL && VTOS(vp)->s_commonvp == vp);

	/* allow delmap to succeed even if device fenced off */

	/*
	 * XXX Given the above assertion, this might not
	 * be a particularly sensible thing to test.
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	npages = btopr(len);

	LOCK_CSP(csp);
	mutex_enter(&csp->s_lock);
	mcnt = (csp->s_mapcnt -= npages);

	if (mcnt == 0) {
		/*
		 * Call the close routine when the last reference of any
		 * kind through any [s, v]node goes away.  The s_dip hold
		 * on the devinfo node is released when the vnode is
		 * destroyed.
		 */
		if (csp->s_count == 0) {
			csp->s_flag &= ~(SNEEDCLOSE | SSIZEVALID);

			/* See comment in spec_close() */
			if (csp->s_flag & (SCLONE | SSELFCLONE))
				csp->s_flag &= ~SDIPSET;

			mutex_exit(&csp->s_lock);

			(void) device_close(vp, 0, cred);
		} else
			mutex_exit(&csp->s_lock);

		mutex_enter(&csp->s_lock);
	}
	ASSERT(mcnt >= 0);

	UNLOCK_CSP_LOCK_HELD(csp);
	mutex_exit(&csp->s_lock);

	return (0);
}

/*ARGSUSED4*/
static int
spec_dump(
	struct vnode *vp,
	caddr_t addr,
	offset_t bn,
	offset_t count,
	caller_context_t *ct)
{
	/* allow dump to succeed even if device fenced off */

	ASSERT(vp->v_type == VBLK);
	return (bdev_dump(vp->v_rdev, addr, (daddr_t)bn, (int)count));
}


/*
 * Do i/o on the given page list from/to vp, io_off for io_len.
 * Flags are composed of:
 *	{B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_READ, B_WRITE}
 * If B_ASYNC is not set, the i/o is waited for.
 */
/*ARGSUSED5*/
static int
spec_pageio(
	struct vnode *vp,
	page_t *pp,
	u_offset_t io_off,
	size_t io_len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	struct buf *bp = NULL;
	int err = 0;

	if (pp == NULL)
		return (EINVAL);

	bp = spec_startio(vp, pp, io_off, io_len, flags);

	/*
	 * Wait for i/o to complete if the request is not B_ASYNC.
	 */
	if ((flags & B_ASYNC) == 0) {
		err = biowait(bp);
		pageio_done(bp);
	}
	return (err);
}

/*
 * Set ACL on underlying vnode if one exists, or return ENOSYS otherwise.
 */
int
spec_setsecattr(
	struct vnode *vp,
	vsecattr_t *vsap,
	int flag,
	struct cred *cr,
	caller_context_t *ct)
{
	struct vnode *realvp;
	struct snode *sp = VTOS(vp);
	int error;

	/* fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	/*
	 * The acl(2) system call takes VOP_RWLOCK on the file before setting
	 * an ACL, but since specfs does not serialize reads and writes, this
	 * VOP does not do anything.  However, some backing file systems may
	 * expect the lock to be held before setting an ACL, so it is taken
	 * here privately to avoid serializing specfs reads and writes.
	 */
	if ((realvp = sp->s_realvp) != NULL) {
		(void) VOP_RWLOCK(realvp, V_WRITELOCK_TRUE, ct);
		error = VOP_SETSECATTR(realvp, vsap, flag, cr, ct);
		(void) VOP_RWUNLOCK(realvp, V_WRITELOCK_TRUE, ct);
		return (error);
	} else
		return (fs_nosys());
}

/*
 * Get ACL from underlying vnode if one exists, or fabricate it from
 * the permissions returned by spec_getattr() otherwise.
 */
int
spec_getsecattr(
	struct vnode *vp,
	vsecattr_t *vsap,
	int flag,
	struct cred *cr,
	caller_context_t *ct)
{
	struct vnode *realvp;
	struct snode *sp = VTOS(vp);

	/* fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	if ((realvp = sp->s_realvp) != NULL)
		return (VOP_GETSECATTR(realvp, vsap, flag, cr, ct));
	else
		return (fs_fab_acl(vp, vsap, flag, cr, ct));
}

int
spec_pathconf(
	vnode_t *vp,
	int cmd,
	ulong_t *valp,
	cred_t *cr,
	caller_context_t *ct)
{
	vnode_t *realvp;
	struct snode *sp = VTOS(vp);

	/* fail with ENXIO if the device is fenced off */
	if (S_ISFENCED(sp))
		return (ENXIO);

	if ((realvp = sp->s_realvp) != NULL)
		return (VOP_PATHCONF(realvp, cmd, valp, cr, ct));
	else
		return (fs_pathconf(vp, cmd, valp, cr, ct));
}