1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 27 /* 28 * lofi (loopback file) driver - allows you to attach a file to a device, 29 * which can then be accessed through that device. The simple model is that 30 * you tell lofi to open a file, and then use the block device you get as 31 * you would any block device. lofi translates access to the block device 32 * into I/O on the underlying file. This is mostly useful for 33 * mounting images of filesystems. 34 * 35 * lofi is controlled through /dev/lofictl - this is the only device exported 36 * during attach, and is minor number 0. lofiadm communicates with lofi through 37 * ioctls on this device. When a file is attached to lofi, block and character 38 * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices 39 * are identified by their minor number, and the minor number is also used 40 * as the name in /dev/lofi. If we ever decide to support virtual disks, 41 * we'll have to divide the minor number space to identify fdisk partitions 42 * and slices, and the name will then be the minor number shifted down a 43 * few bits. Minor devices are tracked with state structures handled with 44 * ddi_soft_state(9F) for simplicity. 45 * 46 * A file attached to lofi is opened when attached and not closed until 47 * explicitly detached from lofi. This seems more sensible than deferring 48 * the open until the /dev/lofi device is opened, for a number of reasons. 49 * One is that any failure is likely to be noticed by the person (or script) 50 * running lofiadm. Another is that it would be a security problem if the 51 * file was replaced by another one after being added but before being opened. 52 * 53 * The only hard part about lofi is the ioctls. In order to support things 54 * like 'newfs' on a lofi device, it needs to support certain disk ioctls. 55 * So it has to fake disk geometry and partition information. More may need 56 * to be faked if your favorite utility doesn't work and you think it should 57 * (fdformat doesn't work because it really wants to know the type of floppy 58 * controller to talk to, and that didn't seem easy to fake. Or possibly even 59 * necessary, since we have mkfs_pcfs now). 60 * 61 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To 62 * support simulation of hotplug events, an optional force flag is provided. 63 * If a lofi device is open when a force detach is requested, then the 64 * underlying file is closed and any subsequent operations return EIO. When the 65 * device is closed for the last time, it will be cleaned up at that time. In 66 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is 67 * detached but not removed. 68 * 69 * Known problems: 70 * 71 * UFS logging. Mounting a UFS filesystem image "logging" 72 * works for basic copy testing but wedges during a build of ON through 73 * that image. Some deadlock in lufs holding the log mutex and then 74 * getting stuck on a buf. So for now, don't do that. 75 * 76 * Direct I/O. Since the filesystem data is being cached in the buffer 77 * cache, _and_ again in the underlying filesystem, it's tempting to 78 * enable direct I/O on the underlying file. Don't, because that deadlocks. 79 * I think to fix the cache-twice problem we might need filesystem support. 80 * 81 * lofi on itself. The simple lock strategy (lofi_lock) precludes this 82 * because you'll be in lofi_ioctl, holding the lock when you open the 83 * file, which, if it's lofi, will grab lofi_lock. We prevent this for 84 * now, though not using ddi_soft_state(9F) would make it possible to 85 * do. Though it would still be silly. 86 * 87 * Interesting things to do: 88 * 89 * Allow multiple files for each device. A poor-man's metadisk, basically. 90 * 91 * Pass-through ioctls on block devices. You can (though it's not 92 * documented), give lofi a block device as a file name. Then we shouldn't 93 * need to fake a geometry. But this is also silly unless you're replacing 94 * metadisk. 95 * 96 * Encryption. tpm would like this. Apparently Windows 2000 has it, and 97 * so does Linux. 98 */ 99 100 #include <sys/types.h> 101 #include <netinet/in.h> 102 #include <sys/sysmacros.h> 103 #include <sys/uio.h> 104 #include <sys/kmem.h> 105 #include <sys/cred.h> 106 #include <sys/mman.h> 107 #include <sys/errno.h> 108 #include <sys/aio_req.h> 109 #include <sys/stat.h> 110 #include <sys/file.h> 111 #include <sys/modctl.h> 112 #include <sys/conf.h> 113 #include <sys/debug.h> 114 #include <sys/vnode.h> 115 #include <sys/lofi.h> 116 #include <sys/fcntl.h> 117 #include <sys/pathname.h> 118 #include <sys/filio.h> 119 #include <sys/fdio.h> 120 #include <sys/open.h> 121 #include <sys/disp.h> 122 #include <vm/seg_map.h> 123 #include <sys/ddi.h> 124 #include <sys/sunddi.h> 125 #include <sys/zmod.h> 126 127 #define NBLOCKS_PROP_NAME "Nblocks" 128 #define SIZE_PROP_NAME "Size" 129 130 static dev_info_t *lofi_dip; 131 static void *lofi_statep; 132 static kmutex_t lofi_lock; /* state lock */ 133 134 /* 135 * Because lofi_taskq_nthreads limits the actual swamping of the device, the 136 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively 137 * high. If we want to be assured that the underlying device is always busy, 138 * we must be sure that the number of bytes enqueued when the number of 139 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for 140 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should 141 * set maxalloc to be the maximum throughput (in bytes per second) of the 142 * underlying device divided by the minimum I/O size. We assume a realistic 143 * maximum throughput of one hundred megabytes per second; we set maxalloc on 144 * the lofi task queue to be 104857600 divided by DEV_BSIZE. 145 */ 146 static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE; 147 static int lofi_taskq_nthreads = 4; /* # of taskq threads per device */ 148 149 uint32_t lofi_max_files = LOFI_MAX_FILES; 150 151 static int gzip_decompress(void *src, size_t srclen, void *dst, 152 size_t *destlen, int level); 153 154 lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = { 155 {gzip_decompress, NULL, 6, "gzip"}, /* default */ 156 {gzip_decompress, NULL, 6, "gzip-6"}, 157 {gzip_decompress, NULL, 9, "gzip-9"} 158 }; 159 160 static int 161 lofi_busy(void) 162 { 163 minor_t minor; 164 165 /* 166 * We need to make sure no mappings exist - mod_remove won't 167 * help because the device isn't open. 168 */ 169 mutex_enter(&lofi_lock); 170 for (minor = 1; minor <= lofi_max_files; minor++) { 171 if (ddi_get_soft_state(lofi_statep, minor) != NULL) { 172 mutex_exit(&lofi_lock); 173 return (EBUSY); 174 } 175 } 176 mutex_exit(&lofi_lock); 177 return (0); 178 } 179 180 static int 181 is_opened(struct lofi_state *lsp) 182 { 183 ASSERT(mutex_owned(&lofi_lock)); 184 return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count); 185 } 186 187 static int 188 mark_opened(struct lofi_state *lsp, int otyp) 189 { 190 ASSERT(mutex_owned(&lofi_lock)); 191 switch (otyp) { 192 case OTYP_CHR: 193 lsp->ls_chr_open = 1; 194 break; 195 case OTYP_BLK: 196 lsp->ls_blk_open = 1; 197 break; 198 case OTYP_LYR: 199 lsp->ls_lyr_open_count++; 200 break; 201 default: 202 return (-1); 203 } 204 return (0); 205 } 206 207 static void 208 mark_closed(struct lofi_state *lsp, int otyp) 209 { 210 ASSERT(mutex_owned(&lofi_lock)); 211 switch (otyp) { 212 case OTYP_CHR: 213 lsp->ls_chr_open = 0; 214 break; 215 case OTYP_BLK: 216 lsp->ls_blk_open = 0; 217 break; 218 case OTYP_LYR: 219 lsp->ls_lyr_open_count--; 220 break; 221 default: 222 break; 223 } 224 } 225 226 static void 227 lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp, 228 cred_t *credp) 229 { 230 dev_t newdev; 231 char namebuf[50]; 232 233 if (lsp->ls_vp) { 234 (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 235 1, 0, credp, NULL); 236 VN_RELE(lsp->ls_vp); 237 lsp->ls_vp = NULL; 238 } 239 240 newdev = makedevice(getmajor(dev), minor); 241 (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME); 242 (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME); 243 244 (void) snprintf(namebuf, sizeof (namebuf), "%d", minor); 245 ddi_remove_minor_node(lofi_dip, namebuf); 246 (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor); 247 ddi_remove_minor_node(lofi_dip, namebuf); 248 249 kmem_free(lsp->ls_filename, lsp->ls_filename_sz); 250 taskq_destroy(lsp->ls_taskq); 251 if (lsp->ls_kstat) { 252 kstat_delete(lsp->ls_kstat); 253 mutex_destroy(&lsp->ls_kstat_lock); 254 } 255 256 if (lsp->ls_uncomp_seg_sz > 0) { 257 kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz); 258 lsp->ls_uncomp_seg_sz = 0; 259 } 260 ddi_soft_state_free(lofi_statep, minor); 261 } 262 263 /*ARGSUSED*/ 264 static int 265 lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp) 266 { 267 minor_t minor; 268 struct lofi_state *lsp; 269 270 mutex_enter(&lofi_lock); 271 minor = getminor(*devp); 272 if (minor == 0) { 273 /* master control device */ 274 /* must be opened exclusively */ 275 if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR)) { 276 mutex_exit(&lofi_lock); 277 return (EINVAL); 278 } 279 lsp = ddi_get_soft_state(lofi_statep, 0); 280 if (lsp == NULL) { 281 mutex_exit(&lofi_lock); 282 return (ENXIO); 283 } 284 if (is_opened(lsp)) { 285 mutex_exit(&lofi_lock); 286 return (EBUSY); 287 } 288 (void) mark_opened(lsp, OTYP_CHR); 289 mutex_exit(&lofi_lock); 290 return (0); 291 } 292 293 /* otherwise, the mapping should already exist */ 294 lsp = ddi_get_soft_state(lofi_statep, minor); 295 if (lsp == NULL) { 296 mutex_exit(&lofi_lock); 297 return (EINVAL); 298 } 299 300 if (lsp->ls_vp == NULL) { 301 mutex_exit(&lofi_lock); 302 return (ENXIO); 303 } 304 305 if (mark_opened(lsp, otyp) == -1) { 306 mutex_exit(&lofi_lock); 307 return (EINVAL); 308 } 309 310 mutex_exit(&lofi_lock); 311 return (0); 312 } 313 314 /*ARGSUSED*/ 315 static int 316 lofi_close(dev_t dev, int flag, int otyp, struct cred *credp) 317 { 318 minor_t minor; 319 struct lofi_state *lsp; 320 321 mutex_enter(&lofi_lock); 322 minor = getminor(dev); 323 lsp = ddi_get_soft_state(lofi_statep, minor); 324 if (lsp == NULL) { 325 mutex_exit(&lofi_lock); 326 return (EINVAL); 327 } 328 mark_closed(lsp, otyp); 329 330 /* 331 * If we forcibly closed the underlying device (li_force), or 332 * asked for cleanup (li_cleanup), finish up if we're the last 333 * out of the door. 334 */ 335 if (minor != 0 && !is_opened(lsp) && 336 (lsp->ls_cleanup || lsp->ls_vp == NULL)) 337 lofi_free_handle(dev, minor, lsp, credp); 338 339 mutex_exit(&lofi_lock); 340 return (0); 341 } 342 343 static int 344 lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp, 345 struct lofi_state *lsp) 346 { 347 int error; 348 offset_t alignedoffset, mapoffset; 349 size_t xfersize; 350 int isread; 351 int smflags; 352 caddr_t mapaddr; 353 size_t len; 354 enum seg_rw srw; 355 356 /* 357 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on 358 * an 8K boundary, but the buf transfer address may not be 359 * aligned on more than a 512-byte boundary (we don't enforce 360 * that even though we could). This matters since the initial 361 * part of the transfer may not start at offset 0 within the 362 * segmap'd chunk. So we have to compensate for that with 363 * 'mapoffset'. Subsequent chunks always start off at the 364 * beginning, and the last is capped by b_resid 365 */ 366 mapoffset = offset & MAXBOFFSET; 367 alignedoffset = offset - mapoffset; 368 bp->b_resid = bp->b_bcount; 369 isread = bp->b_flags & B_READ; 370 srw = isread ? S_READ : S_WRITE; 371 do { 372 xfersize = MIN(lsp->ls_vp_comp_size - offset, 373 MIN(MAXBSIZE - mapoffset, bp->b_resid)); 374 len = roundup(mapoffset + xfersize, PAGESIZE); 375 mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp, 376 alignedoffset, MAXBSIZE, 1, srw); 377 /* 378 * Now fault in the pages. This lets us check 379 * for errors before we reference mapaddr and 380 * try to resolve the fault in bcopy (which would 381 * panic instead). And this can easily happen, 382 * particularly if you've lofi'd a file over NFS 383 * and someone deletes the file on the server. 384 */ 385 error = segmap_fault(kas.a_hat, segkmap, mapaddr, 386 len, F_SOFTLOCK, srw); 387 if (error) { 388 (void) segmap_release(segkmap, mapaddr, 0); 389 if (FC_CODE(error) == FC_OBJERR) 390 error = FC_ERRNO(error); 391 else 392 error = EIO; 393 break; 394 } 395 smflags = 0; 396 if (isread) { 397 smflags |= SM_FREE; 398 /* 399 * If we're reading an entire page starting 400 * at a page boundary, there's a good chance 401 * we won't need it again. Put it on the 402 * head of the freelist. 403 */ 404 if (mapoffset == 0 && xfersize == MAXBSIZE) 405 smflags |= SM_DONTNEED; 406 bcopy(mapaddr + mapoffset, bufaddr, xfersize); 407 } else { 408 smflags |= SM_WRITE; 409 bcopy(bufaddr, mapaddr + mapoffset, xfersize); 410 } 411 bp->b_resid -= xfersize; 412 bufaddr += xfersize; 413 offset += xfersize; 414 (void) segmap_fault(kas.a_hat, segkmap, mapaddr, 415 len, F_SOFTUNLOCK, srw); 416 error = segmap_release(segkmap, mapaddr, smflags); 417 /* only the first map may start partial */ 418 mapoffset = 0; 419 alignedoffset += MAXBSIZE; 420 } while ((error == 0) && (bp->b_resid > 0) && 421 (offset < lsp->ls_vp_comp_size)); 422 423 return (error); 424 } 425 426 /*ARGSUSED*/ 427 static int gzip_decompress(void *src, size_t srclen, void *dst, 428 size_t *dstlen, int level) 429 { 430 ASSERT(*dstlen >= srclen); 431 432 if (z_uncompress(dst, dstlen, src, srclen) != Z_OK) 433 return (-1); 434 return (0); 435 } 436 437 /* 438 * This is basically what strategy used to be before we found we 439 * needed task queues. 440 */ 441 static void 442 lofi_strategy_task(void *arg) 443 { 444 struct buf *bp = (struct buf *)arg; 445 int error; 446 struct lofi_state *lsp; 447 uint64_t sblkno, eblkno, cmpbytes; 448 offset_t offset, sblkoff, eblkoff; 449 u_offset_t salign, ealign; 450 u_offset_t sdiff; 451 uint32_t comp_data_sz; 452 caddr_t bufaddr; 453 unsigned char *compressed_seg = NULL, *cmpbuf; 454 unsigned char *uncompressed_seg = NULL; 455 lofi_compress_info_t *li; 456 size_t oblkcount, xfersize; 457 unsigned long seglen; 458 459 lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev)); 460 if (lsp->ls_kstat) { 461 mutex_enter(lsp->ls_kstat->ks_lock); 462 kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat)); 463 mutex_exit(lsp->ls_kstat->ks_lock); 464 } 465 bp_mapin(bp); 466 bufaddr = bp->b_un.b_addr; 467 offset = bp->b_lblkno * DEV_BSIZE; /* offset within file */ 468 469 /* 470 * We used to always use vn_rdwr here, but we cannot do that because 471 * we might decide to read or write from the the underlying 472 * file during this call, which would be a deadlock because 473 * we have the rw_lock. So instead we page, unless it's not 474 * mapable or it's a character device. 475 */ 476 if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { 477 error = EIO; 478 } else if (((lsp->ls_vp->v_flag & VNOMAP) == 0) && 479 (lsp->ls_vp->v_type != VCHR)) { 480 uint64_t i; 481 482 /* 483 * Handle uncompressed files with a regular read 484 */ 485 if (lsp->ls_uncomp_seg_sz == 0) { 486 error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp); 487 goto done; 488 } 489 490 /* 491 * From here on we're dealing primarily with compressed files 492 */ 493 494 /* 495 * Compressed files can only be read from and 496 * not written to 497 */ 498 if (!(bp->b_flags & B_READ)) { 499 bp->b_resid = bp->b_bcount; 500 error = EROFS; 501 goto done; 502 } 503 504 ASSERT(lsp->ls_comp_algorithm_index >= 0); 505 li = &lofi_compress_table[lsp->ls_comp_algorithm_index]; 506 /* 507 * Compute starting and ending compressed segment numbers 508 * We use only bitwise operations avoiding division and 509 * modulus because we enforce the compression segment size 510 * to a power of 2 511 */ 512 sblkno = offset >> lsp->ls_comp_seg_shift; 513 sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1); 514 eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift; 515 eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1); 516 517 /* 518 * Align start offset to block boundary for segmap 519 */ 520 salign = lsp->ls_comp_seg_index[sblkno]; 521 sdiff = salign & (DEV_BSIZE - 1); 522 salign -= sdiff; 523 if (eblkno >= (lsp->ls_comp_index_sz - 1)) { 524 /* 525 * We're dealing with the last segment of 526 * the compressed file -- the size of this 527 * segment *may not* be the same as the 528 * segment size for the file 529 */ 530 eblkoff = (offset + bp->b_bcount) & 531 (lsp->ls_uncomp_last_seg_sz - 1); 532 ealign = lsp->ls_vp_comp_size; 533 } else { 534 ealign = lsp->ls_comp_seg_index[eblkno + 1]; 535 } 536 537 /* 538 * Preserve original request paramaters 539 */ 540 oblkcount = bp->b_bcount; 541 542 /* 543 * Assign the calculated parameters 544 */ 545 comp_data_sz = ealign - salign; 546 bp->b_bcount = comp_data_sz; 547 548 /* 549 * Allocate fixed size memory blocks to hold compressed 550 * segments and one uncompressed segment since we 551 * uncompress segments one at a time 552 */ 553 compressed_seg = kmem_alloc(bp->b_bcount, KM_SLEEP); 554 uncompressed_seg = kmem_alloc(lsp->ls_uncomp_seg_sz, KM_SLEEP); 555 /* 556 * Map in the calculated number of blocks 557 */ 558 error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign, 559 bp, lsp); 560 561 bp->b_bcount = oblkcount; 562 bp->b_resid = oblkcount; 563 if (error != 0) 564 goto done; 565 566 /* 567 * We have the compressed blocks, now uncompress them 568 */ 569 cmpbuf = compressed_seg + sdiff; 570 for (i = sblkno; i < (eblkno + 1) && i < lsp->ls_comp_index_sz; 571 i++) { 572 /* 573 * Each of the segment index entries contains 574 * the starting block number for that segment. 575 * The number of compressed bytes in a segment 576 * is thus the difference between the starting 577 * block number of this segment and the starting 578 * block number of the next segment. 579 */ 580 if ((i == eblkno) && 581 (i == lsp->ls_comp_index_sz - 1)) { 582 cmpbytes = lsp->ls_vp_comp_size - 583 lsp->ls_comp_seg_index[i]; 584 } else { 585 cmpbytes = lsp->ls_comp_seg_index[i + 1] - 586 lsp->ls_comp_seg_index[i]; 587 } 588 589 /* 590 * The first byte in a compressed segment is a flag 591 * that indicates whether this segment is compressed 592 * at all 593 */ 594 if (*cmpbuf == UNCOMPRESSED) { 595 bcopy((cmpbuf + SEGHDR), uncompressed_seg, 596 (cmpbytes - SEGHDR)); 597 } else { 598 seglen = lsp->ls_uncomp_seg_sz; 599 600 if (li->l_decompress((cmpbuf + SEGHDR), 601 (cmpbytes - SEGHDR), uncompressed_seg, 602 &seglen, li->l_level) != 0) { 603 error = EIO; 604 goto done; 605 } 606 } 607 608 /* 609 * Determine how much uncompressed data we 610 * have to copy and copy it 611 */ 612 xfersize = lsp->ls_uncomp_seg_sz - sblkoff; 613 if (i == eblkno) { 614 if (i == (lsp->ls_comp_index_sz - 1)) 615 xfersize -= (lsp->ls_uncomp_last_seg_sz 616 - eblkoff); 617 else 618 xfersize -= 619 (lsp->ls_uncomp_seg_sz - eblkoff); 620 } 621 622 bcopy((uncompressed_seg + sblkoff), bufaddr, xfersize); 623 624 cmpbuf += cmpbytes; 625 bufaddr += xfersize; 626 bp->b_resid -= xfersize; 627 sblkoff = 0; 628 629 if (bp->b_resid == 0) 630 break; 631 } 632 } else { 633 ssize_t resid; 634 enum uio_rw rw; 635 636 if (bp->b_flags & B_READ) 637 rw = UIO_READ; 638 else 639 rw = UIO_WRITE; 640 error = vn_rdwr(rw, lsp->ls_vp, bufaddr, bp->b_bcount, 641 offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 642 bp->b_resid = resid; 643 } 644 645 done: 646 if (compressed_seg != NULL) 647 kmem_free(compressed_seg, comp_data_sz); 648 if (uncompressed_seg != NULL) 649 kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz); 650 651 if (lsp->ls_kstat) { 652 size_t n_done = bp->b_bcount - bp->b_resid; 653 kstat_io_t *kioptr; 654 655 mutex_enter(lsp->ls_kstat->ks_lock); 656 kioptr = KSTAT_IO_PTR(lsp->ls_kstat); 657 if (bp->b_flags & B_READ) { 658 kioptr->nread += n_done; 659 kioptr->reads++; 660 } else { 661 kioptr->nwritten += n_done; 662 kioptr->writes++; 663 } 664 kstat_runq_exit(kioptr); 665 mutex_exit(lsp->ls_kstat->ks_lock); 666 } 667 668 mutex_enter(&lsp->ls_vp_lock); 669 if (--lsp->ls_vp_iocount == 0) 670 cv_broadcast(&lsp->ls_vp_cv); 671 mutex_exit(&lsp->ls_vp_lock); 672 673 bioerror(bp, error); 674 biodone(bp); 675 } 676 677 static int 678 lofi_strategy(struct buf *bp) 679 { 680 struct lofi_state *lsp; 681 offset_t offset; 682 683 /* 684 * We cannot just do I/O here, because the current thread 685 * _might_ end up back in here because the underlying filesystem 686 * wants a buffer, which eventually gets into bio_recycle and 687 * might call into lofi to write out a delayed-write buffer. 688 * This is bad if the filesystem above lofi is the same as below. 689 * 690 * We could come up with a complex strategy using threads to 691 * do the I/O asynchronously, or we could use task queues. task 692 * queues were incredibly easy so they win. 693 */ 694 lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev)); 695 mutex_enter(&lsp->ls_vp_lock); 696 if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) { 697 bioerror(bp, EIO); 698 biodone(bp); 699 mutex_exit(&lsp->ls_vp_lock); 700 return (0); 701 } 702 703 offset = bp->b_lblkno * DEV_BSIZE; /* offset within file */ 704 if (offset == lsp->ls_vp_size) { 705 /* EOF */ 706 if ((bp->b_flags & B_READ) != 0) { 707 bp->b_resid = bp->b_bcount; 708 bioerror(bp, 0); 709 } else { 710 /* writes should fail */ 711 bioerror(bp, ENXIO); 712 } 713 biodone(bp); 714 mutex_exit(&lsp->ls_vp_lock); 715 return (0); 716 } 717 if (offset > lsp->ls_vp_size) { 718 bioerror(bp, ENXIO); 719 biodone(bp); 720 mutex_exit(&lsp->ls_vp_lock); 721 return (0); 722 } 723 lsp->ls_vp_iocount++; 724 mutex_exit(&lsp->ls_vp_lock); 725 726 if (lsp->ls_kstat) { 727 mutex_enter(lsp->ls_kstat->ks_lock); 728 kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat)); 729 mutex_exit(lsp->ls_kstat->ks_lock); 730 } 731 (void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP); 732 return (0); 733 } 734 735 /*ARGSUSED2*/ 736 static int 737 lofi_read(dev_t dev, struct uio *uio, struct cred *credp) 738 { 739 if (getminor(dev) == 0) 740 return (EINVAL); 741 return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio)); 742 } 743 744 /*ARGSUSED2*/ 745 static int 746 lofi_write(dev_t dev, struct uio *uio, struct cred *credp) 747 { 748 if (getminor(dev) == 0) 749 return (EINVAL); 750 return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio)); 751 } 752 753 /*ARGSUSED2*/ 754 static int 755 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp) 756 { 757 if (getminor(dev) == 0) 758 return (EINVAL); 759 return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio)); 760 } 761 762 /*ARGSUSED2*/ 763 static int 764 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp) 765 { 766 if (getminor(dev) == 0) 767 return (EINVAL); 768 return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio)); 769 } 770 771 /*ARGSUSED*/ 772 static int 773 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 774 { 775 switch (infocmd) { 776 case DDI_INFO_DEVT2DEVINFO: 777 *result = lofi_dip; 778 return (DDI_SUCCESS); 779 case DDI_INFO_DEVT2INSTANCE: 780 *result = 0; 781 return (DDI_SUCCESS); 782 } 783 return (DDI_FAILURE); 784 } 785 786 static int 787 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 788 { 789 int error; 790 791 if (cmd != DDI_ATTACH) 792 return (DDI_FAILURE); 793 error = ddi_soft_state_zalloc(lofi_statep, 0); 794 if (error == DDI_FAILURE) { 795 return (DDI_FAILURE); 796 } 797 error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0, 798 DDI_PSEUDO, NULL); 799 if (error == DDI_FAILURE) { 800 ddi_soft_state_free(lofi_statep, 0); 801 return (DDI_FAILURE); 802 } 803 /* driver handles kernel-issued IOCTLs */ 804 if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, 805 DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { 806 ddi_remove_minor_node(dip, NULL); 807 ddi_soft_state_free(lofi_statep, 0); 808 return (DDI_FAILURE); 809 } 810 lofi_dip = dip; 811 ddi_report_dev(dip); 812 return (DDI_SUCCESS); 813 } 814 815 static int 816 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 817 { 818 if (cmd != DDI_DETACH) 819 return (DDI_FAILURE); 820 if (lofi_busy()) 821 return (DDI_FAILURE); 822 lofi_dip = NULL; 823 ddi_remove_minor_node(dip, NULL); 824 ddi_prop_remove_all(dip); 825 ddi_soft_state_free(lofi_statep, 0); 826 return (DDI_SUCCESS); 827 } 828 829 /* 830 * These two just simplify the rest of the ioctls that need to copyin/out 831 * the lofi_ioctl structure. 832 */ 833 struct lofi_ioctl * 834 copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, int flag) 835 { 836 struct lofi_ioctl *klip; 837 int error; 838 839 klip = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP); 840 error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag); 841 if (error) { 842 kmem_free(klip, sizeof (struct lofi_ioctl)); 843 return (NULL); 844 } 845 846 /* make sure filename is always null-terminated */ 847 klip->li_filename[MAXPATHLEN - 1] = '\0'; 848 849 /* validate minor number */ 850 if (klip->li_minor > lofi_max_files) { 851 kmem_free(klip, sizeof (struct lofi_ioctl)); 852 return (NULL); 853 } 854 return (klip); 855 } 856 857 int 858 copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip, 859 int flag) 860 { 861 int error; 862 863 error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag); 864 if (error) 865 return (EFAULT); 866 return (0); 867 } 868 869 void 870 free_lofi_ioctl(struct lofi_ioctl *klip) 871 { 872 kmem_free(klip, sizeof (struct lofi_ioctl)); 873 } 874 875 /* 876 * Return the minor number 'filename' is mapped to, if it is. 877 */ 878 static int 879 file_to_minor(char *filename) 880 { 881 minor_t minor; 882 struct lofi_state *lsp; 883 884 ASSERT(mutex_owned(&lofi_lock)); 885 for (minor = 1; minor <= lofi_max_files; minor++) { 886 lsp = ddi_get_soft_state(lofi_statep, minor); 887 if (lsp == NULL) 888 continue; 889 if (strcmp(lsp->ls_filename, filename) == 0) 890 return (minor); 891 } 892 return (0); 893 } 894 895 /* 896 * lofiadm does some validation, but since Joe Random (or crashme) could 897 * do our ioctls, we need to do some validation too. 898 */ 899 static int 900 valid_filename(const char *filename) 901 { 902 static char *blkprefix = "/dev/" LOFI_BLOCK_NAME "/"; 903 static char *charprefix = "/dev/" LOFI_CHAR_NAME "/"; 904 905 /* must be absolute path */ 906 if (filename[0] != '/') 907 return (0); 908 /* must not be lofi */ 909 if (strncmp(filename, blkprefix, strlen(blkprefix)) == 0) 910 return (0); 911 if (strncmp(filename, charprefix, strlen(charprefix)) == 0) 912 return (0); 913 return (1); 914 } 915 916 /* 917 * Fakes up a disk geometry, and one big partition, based on the size 918 * of the file. This is needed because we allow newfs'ing the device, 919 * and newfs will do several disk ioctls to figure out the geometry and 920 * partition information. It uses that information to determine the parameters 921 * to pass to mkfs. Geometry is pretty much irrelevant these days, but we 922 * have to support it. 923 */ 924 static void 925 fake_disk_geometry(struct lofi_state *lsp) 926 { 927 /* dk_geom - see dkio(7I) */ 928 /* 929 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs 930 * of sectors), but that breaks programs like fdisk which want to 931 * partition a disk by cylinder. With one cylinder, you can't create 932 * an fdisk partition and put pcfs on it for testing (hard to pick 933 * a number between one and one). 934 * 935 * The cheezy floppy test is an attempt to not have too few cylinders 936 * for a small file, or so many on a big file that you waste space 937 * for backup superblocks or cylinder group structures. 938 */ 939 if (lsp->ls_vp_size < (2 * 1024 * 1024)) /* floppy? */ 940 lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (100 * 1024); 941 else 942 lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (300 * 1024); 943 /* in case file file is < 100k */ 944 if (lsp->ls_dkg.dkg_ncyl == 0) 945 lsp->ls_dkg.dkg_ncyl = 1; 946 lsp->ls_dkg.dkg_acyl = 0; 947 lsp->ls_dkg.dkg_bcyl = 0; 948 lsp->ls_dkg.dkg_nhead = 1; 949 lsp->ls_dkg.dkg_obs1 = 0; 950 lsp->ls_dkg.dkg_intrlv = 0; 951 lsp->ls_dkg.dkg_obs2 = 0; 952 lsp->ls_dkg.dkg_obs3 = 0; 953 lsp->ls_dkg.dkg_apc = 0; 954 lsp->ls_dkg.dkg_rpm = 7200; 955 lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl; 956 lsp->ls_dkg.dkg_nsect = lsp->ls_vp_size / 957 (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl); 958 lsp->ls_dkg.dkg_write_reinstruct = 0; 959 lsp->ls_dkg.dkg_read_reinstruct = 0; 960 961 /* vtoc - see dkio(7I) */ 962 bzero(&lsp->ls_vtoc, sizeof (struct vtoc)); 963 lsp->ls_vtoc.v_sanity = VTOC_SANE; 964 lsp->ls_vtoc.v_version = V_VERSION; 965 bcopy(LOFI_DRIVER_NAME, lsp->ls_vtoc.v_volume, 7); 966 lsp->ls_vtoc.v_sectorsz = DEV_BSIZE; 967 lsp->ls_vtoc.v_nparts = 1; 968 lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED; 969 970 /* 971 * A compressed file is read-only, other files can 972 * be read-write 973 */ 974 if (lsp->ls_uncomp_seg_sz > 0) { 975 lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT | V_RONLY; 976 } else { 977 lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT; 978 } 979 lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0; 980 /* 981 * The partition size cannot just be the number of sectors, because 982 * that might not end on a cylinder boundary. And if that's the case, 983 * newfs/mkfs will print a scary warning. So just figure the size 984 * based on the number of cylinders and sectors/cylinder. 985 */ 986 lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl * 987 lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead; 988 989 /* dk_cinfo - see dkio(7I) */ 990 bzero(&lsp->ls_ci, sizeof (struct dk_cinfo)); 991 (void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME); 992 lsp->ls_ci.dki_ctype = DKC_MD; 993 lsp->ls_ci.dki_flags = 0; 994 lsp->ls_ci.dki_cnum = 0; 995 lsp->ls_ci.dki_addr = 0; 996 lsp->ls_ci.dki_space = 0; 997 lsp->ls_ci.dki_prio = 0; 998 lsp->ls_ci.dki_vec = 0; 999 (void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME); 1000 lsp->ls_ci.dki_unit = 0; 1001 lsp->ls_ci.dki_slave = 0; 1002 lsp->ls_ci.dki_partition = 0; 1003 /* 1004 * newfs uses this to set maxcontig. Must not be < 16, or it 1005 * will be 0 when newfs multiplies it by DEV_BSIZE and divides 1006 * it by the block size. Then tunefs doesn't work because 1007 * maxcontig is 0. 1008 */ 1009 lsp->ls_ci.dki_maxtransfer = 16; 1010 } 1011 1012 /* 1013 * map in a compressed file 1014 * 1015 * Read in the header and the index that follows. 1016 * 1017 * The header is as follows - 1018 * 1019 * Signature (name of the compression algorithm) 1020 * Compression segment size (a multiple of 512) 1021 * Number of index entries 1022 * Size of the last block 1023 * The array containing the index entries 1024 * 1025 * The header information is always stored in 1026 * network byte order on disk. 1027 */ 1028 static int 1029 lofi_map_compressed_file(struct lofi_state *lsp, char *buf) 1030 { 1031 uint32_t index_sz, header_len, i; 1032 ssize_t resid; 1033 enum uio_rw rw; 1034 char *tbuf = buf; 1035 int error; 1036 1037 /* The signature has already been read */ 1038 tbuf += sizeof (lsp->ls_comp_algorithm); 1039 bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz)); 1040 lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz); 1041 1042 /* 1043 * The compressed segment size must be a power of 2 1044 */ 1045 if (lsp->ls_uncomp_seg_sz % 2) 1046 return (EINVAL); 1047 1048 for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++) 1049 ; 1050 1051 lsp->ls_comp_seg_shift = i; 1052 1053 tbuf += sizeof (lsp->ls_uncomp_seg_sz); 1054 bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz)); 1055 lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz); 1056 1057 tbuf += sizeof (lsp->ls_comp_index_sz); 1058 bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz), 1059 sizeof (lsp->ls_uncomp_last_seg_sz)); 1060 lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz); 1061 1062 /* 1063 * Compute the total size of the uncompressed data 1064 * for use in fake_disk_geometry and other calculations. 1065 * Disk geometry has to be faked with respect to the 1066 * actual uncompressed data size rather than the 1067 * compressed file size. 1068 */ 1069 lsp->ls_vp_size = (lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz 1070 + lsp->ls_uncomp_last_seg_sz; 1071 1072 /* 1073 * Index size is rounded up to a 512 byte boundary for ease 1074 * of segmapping 1075 */ 1076 index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz; 1077 header_len = sizeof (lsp->ls_comp_algorithm) + 1078 sizeof (lsp->ls_uncomp_seg_sz) + 1079 sizeof (lsp->ls_comp_index_sz) + 1080 sizeof (lsp->ls_uncomp_last_seg_sz); 1081 lsp->ls_comp_offbase = header_len + index_sz; 1082 1083 index_sz += header_len; 1084 index_sz = roundup(index_sz, DEV_BSIZE); 1085 1086 lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP); 1087 lsp->ls_comp_index_data_sz = index_sz; 1088 1089 /* 1090 * Read in the index -- this has a side-effect 1091 * of reading in the header as well 1092 */ 1093 rw = UIO_READ; 1094 error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz, 1095 0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 1096 1097 if (error != 0) 1098 return (error); 1099 1100 /* Skip the header, this is where the index really begins */ 1101 lsp->ls_comp_seg_index = 1102 /*LINTED*/ 1103 (uint64_t *)(lsp->ls_comp_index_data + header_len); 1104 1105 /* 1106 * Now recompute offsets in the index to account for 1107 * the header length 1108 */ 1109 for (i = 0; i < lsp->ls_comp_index_sz; i++) { 1110 lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase + 1111 BE_64(lsp->ls_comp_seg_index[i]); 1112 } 1113 1114 return (error); 1115 } 1116 1117 /* 1118 * Check to see if the passed in signature is a valid 1119 * one. If it is valid, return the index into 1120 * lofi_compress_table. 1121 * 1122 * Return -1 if it is invalid 1123 */ 1124 static int lofi_compress_select(char *signature) 1125 { 1126 int i; 1127 1128 for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) { 1129 if (strcmp(lofi_compress_table[i].l_name, signature) == 0) 1130 return (i); 1131 } 1132 1133 return (-1); 1134 } 1135 1136 /* 1137 * map a file to a minor number. Return the minor number. 1138 */ 1139 static int 1140 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor, 1141 int *rvalp, struct cred *credp, int ioctl_flag) 1142 { 1143 minor_t newminor; 1144 struct lofi_state *lsp; 1145 struct lofi_ioctl *klip; 1146 int error; 1147 struct vnode *vp; 1148 int64_t Nblocks_prop_val; 1149 int64_t Size_prop_val; 1150 int compress_index; 1151 vattr_t vattr; 1152 int flag; 1153 enum vtype v_type; 1154 int zalloced = 0; 1155 dev_t newdev; 1156 char namebuf[50]; 1157 char buf[DEV_BSIZE]; 1158 char *tbuf; 1159 ssize_t resid; 1160 enum uio_rw rw; 1161 1162 klip = copy_in_lofi_ioctl(ulip, ioctl_flag); 1163 if (klip == NULL) 1164 return (EFAULT); 1165 1166 mutex_enter(&lofi_lock); 1167 1168 if (!valid_filename(klip->li_filename)) { 1169 error = EINVAL; 1170 goto out; 1171 } 1172 1173 if (file_to_minor(klip->li_filename) != 0) { 1174 error = EBUSY; 1175 goto out; 1176 } 1177 1178 if (pickminor) { 1179 /* Find a free one */ 1180 for (newminor = 1; newminor <= lofi_max_files; newminor++) 1181 if (ddi_get_soft_state(lofi_statep, newminor) == NULL) 1182 break; 1183 if (newminor >= lofi_max_files) { 1184 error = EAGAIN; 1185 goto out; 1186 } 1187 } else { 1188 newminor = klip->li_minor; 1189 if (ddi_get_soft_state(lofi_statep, newminor) != NULL) { 1190 error = EEXIST; 1191 goto out; 1192 } 1193 } 1194 1195 /* make sure it's valid */ 1196 error = lookupname(klip->li_filename, UIO_SYSSPACE, FOLLOW, 1197 NULLVPP, &vp); 1198 if (error) { 1199 goto out; 1200 } 1201 v_type = vp->v_type; 1202 VN_RELE(vp); 1203 if (!V_ISLOFIABLE(v_type)) { 1204 error = EINVAL; 1205 goto out; 1206 } 1207 flag = FREAD | FWRITE | FOFFMAX | FEXCL; 1208 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0); 1209 if (error) { 1210 /* try read-only */ 1211 flag &= ~FWRITE; 1212 error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, 1213 &vp, 0, 0); 1214 if (error) { 1215 goto out; 1216 } 1217 } 1218 vattr.va_mask = AT_SIZE; 1219 error = VOP_GETATTR(vp, &vattr, 0, credp, NULL); 1220 if (error) { 1221 goto closeout; 1222 } 1223 /* the file needs to be a multiple of the block size */ 1224 if ((vattr.va_size % DEV_BSIZE) != 0) { 1225 error = EINVAL; 1226 goto closeout; 1227 } 1228 newdev = makedevice(getmajor(dev), newminor); 1229 Size_prop_val = vattr.va_size; 1230 if ((ddi_prop_update_int64(newdev, lofi_dip, 1231 SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) { 1232 error = EINVAL; 1233 goto closeout; 1234 } 1235 Nblocks_prop_val = vattr.va_size / DEV_BSIZE; 1236 if ((ddi_prop_update_int64(newdev, lofi_dip, 1237 NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) { 1238 error = EINVAL; 1239 goto propout; 1240 } 1241 error = ddi_soft_state_zalloc(lofi_statep, newminor); 1242 if (error == DDI_FAILURE) { 1243 error = ENOMEM; 1244 goto propout; 1245 } 1246 zalloced = 1; 1247 (void) snprintf(namebuf, sizeof (namebuf), "%d", newminor); 1248 error = ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, newminor, 1249 DDI_PSEUDO, NULL); 1250 if (error != DDI_SUCCESS) { 1251 error = ENXIO; 1252 goto propout; 1253 } 1254 (void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor); 1255 error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, newminor, 1256 DDI_PSEUDO, NULL); 1257 if (error != DDI_SUCCESS) { 1258 /* remove block node */ 1259 (void) snprintf(namebuf, sizeof (namebuf), "%d", newminor); 1260 ddi_remove_minor_node(lofi_dip, namebuf); 1261 error = ENXIO; 1262 goto propout; 1263 } 1264 lsp = ddi_get_soft_state(lofi_statep, newminor); 1265 lsp->ls_filename_sz = strlen(klip->li_filename) + 1; 1266 lsp->ls_filename = kmem_alloc(lsp->ls_filename_sz, KM_SLEEP); 1267 (void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d", 1268 LOFI_DRIVER_NAME, newminor); 1269 lsp->ls_taskq = taskq_create(namebuf, lofi_taskq_nthreads, 1270 minclsyspri, 1, lofi_taskq_maxalloc, 0); 1271 lsp->ls_kstat = kstat_create(LOFI_DRIVER_NAME, newminor, 1272 NULL, "disk", KSTAT_TYPE_IO, 1, 0); 1273 if (lsp->ls_kstat) { 1274 mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL); 1275 lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock; 1276 kstat_install(lsp->ls_kstat); 1277 } 1278 cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL); 1279 mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL); 1280 1281 /* 1282 * save open mode so file can be closed properly and vnode counts 1283 * updated correctly. 1284 */ 1285 lsp->ls_openflag = flag; 1286 1287 /* 1288 * Try to handle stacked lofs vnodes. 1289 */ 1290 if (vp->v_type == VREG) { 1291 if (VOP_REALVP(vp, &lsp->ls_vp, NULL) != 0) { 1292 lsp->ls_vp = vp; 1293 } else { 1294 /* 1295 * Even though vp was obtained via vn_open(), we 1296 * can't call vn_close() on it, since lofs will 1297 * pass the VOP_CLOSE() on down to the realvp 1298 * (which we are about to use). Hence we merely 1299 * drop the reference to the lofs vnode and hold 1300 * the realvp so things behave as if we've 1301 * opened the realvp without any interaction 1302 * with lofs. 1303 */ 1304 VN_HOLD(lsp->ls_vp); 1305 VN_RELE(vp); 1306 } 1307 } else { 1308 lsp->ls_vp = vp; 1309 } 1310 lsp->ls_vp_size = vattr.va_size; 1311 (void) strcpy(lsp->ls_filename, klip->li_filename); 1312 if (rvalp) 1313 *rvalp = (int)newminor; 1314 klip->li_minor = newminor; 1315 1316 /* 1317 * Read the file signature to check if it is compressed. 1318 * 'rw' is set to read since only reads are allowed to 1319 * a compressed file. 1320 */ 1321 rw = UIO_READ; 1322 error = vn_rdwr(rw, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE, 1323 0, RLIM64_INFINITY, kcred, &resid); 1324 1325 if (error != 0) 1326 goto propout; 1327 1328 tbuf = buf; 1329 lsp->ls_uncomp_seg_sz = 0; 1330 lsp->ls_vp_comp_size = lsp->ls_vp_size; 1331 lsp->ls_comp_algorithm[0] = '\0'; 1332 1333 compress_index = lofi_compress_select(tbuf); 1334 if (compress_index != -1) { 1335 lsp->ls_comp_algorithm_index = compress_index; 1336 (void) strlcpy(lsp->ls_comp_algorithm, 1337 lofi_compress_table[compress_index].l_name, 1338 sizeof (lsp->ls_comp_algorithm)); 1339 error = lofi_map_compressed_file(lsp, buf); 1340 if (error != 0) 1341 goto propout; 1342 1343 /* update DDI properties */ 1344 Size_prop_val = lsp->ls_vp_size; 1345 if ((ddi_prop_update_int64(newdev, lofi_dip, SIZE_PROP_NAME, 1346 Size_prop_val)) != DDI_PROP_SUCCESS) { 1347 error = EINVAL; 1348 goto propout; 1349 } 1350 1351 Nblocks_prop_val = lsp->ls_vp_size / DEV_BSIZE; 1352 if ((ddi_prop_update_int64(newdev, lofi_dip, NBLOCKS_PROP_NAME, 1353 Nblocks_prop_val)) != DDI_PROP_SUCCESS) { 1354 error = EINVAL; 1355 goto propout; 1356 } 1357 } 1358 1359 fake_disk_geometry(lsp); 1360 mutex_exit(&lofi_lock); 1361 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1362 free_lofi_ioctl(klip); 1363 return (0); 1364 1365 propout: 1366 (void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME); 1367 (void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME); 1368 closeout: 1369 (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL); 1370 VN_RELE(vp); 1371 out: 1372 if (zalloced) 1373 ddi_soft_state_free(lofi_statep, newminor); 1374 mutex_exit(&lofi_lock); 1375 free_lofi_ioctl(klip); 1376 return (error); 1377 } 1378 1379 /* 1380 * unmap a file. 1381 */ 1382 static int 1383 lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, 1384 struct cred *credp, int ioctl_flag) 1385 { 1386 struct lofi_state *lsp; 1387 struct lofi_ioctl *klip; 1388 minor_t minor; 1389 1390 klip = copy_in_lofi_ioctl(ulip, ioctl_flag); 1391 if (klip == NULL) 1392 return (EFAULT); 1393 1394 mutex_enter(&lofi_lock); 1395 if (byfilename) { 1396 minor = file_to_minor(klip->li_filename); 1397 } else { 1398 minor = klip->li_minor; 1399 } 1400 if (minor == 0) { 1401 mutex_exit(&lofi_lock); 1402 free_lofi_ioctl(klip); 1403 return (ENXIO); 1404 } 1405 lsp = ddi_get_soft_state(lofi_statep, minor); 1406 if (lsp == NULL || lsp->ls_vp == NULL) { 1407 mutex_exit(&lofi_lock); 1408 free_lofi_ioctl(klip); 1409 return (ENXIO); 1410 } 1411 1412 /* 1413 * If it's still held open, we'll do one of three things: 1414 * 1415 * If no flag is set, just return EBUSY. 1416 * 1417 * If the 'cleanup' flag is set, unmap and remove the device when 1418 * the last user finishes. 1419 * 1420 * If the 'force' flag is set, then we forcibly close the underlying 1421 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl 1422 * will return DKIO_DEV_GONE. When the device is last closed, the 1423 * device will be cleaned up appropriately. 1424 * 1425 * This is complicated by the fact that we may have outstanding 1426 * dispatched I/Os. Rather than having a single mutex to serialize all 1427 * I/O, we keep a count of the number of outstanding I/O requests, as 1428 * well as a flag to indicate that no new I/Os should be dispatched. 1429 * We set the flag, wait for the number of outstanding I/Os to reach 0, 1430 * and then close the underlying vnode. 1431 */ 1432 1433 if (is_opened(lsp)) { 1434 if (klip->li_force) { 1435 mutex_enter(&lsp->ls_vp_lock); 1436 lsp->ls_vp_closereq = B_TRUE; 1437 while (lsp->ls_vp_iocount > 0) 1438 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); 1439 (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, 1440 credp, NULL); 1441 VN_RELE(lsp->ls_vp); 1442 lsp->ls_vp = NULL; 1443 cv_broadcast(&lsp->ls_vp_cv); 1444 mutex_exit(&lsp->ls_vp_lock); 1445 mutex_exit(&lofi_lock); 1446 klip->li_minor = minor; 1447 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1448 free_lofi_ioctl(klip); 1449 return (0); 1450 } else if (klip->li_cleanup) { 1451 lsp->ls_cleanup = 1; 1452 mutex_exit(&lofi_lock); 1453 free_lofi_ioctl(klip); 1454 return (0); 1455 } 1456 1457 mutex_exit(&lofi_lock); 1458 free_lofi_ioctl(klip); 1459 return (EBUSY); 1460 } 1461 1462 lofi_free_handle(dev, minor, lsp, credp); 1463 1464 klip->li_minor = minor; 1465 mutex_exit(&lofi_lock); 1466 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1467 free_lofi_ioctl(klip); 1468 return (0); 1469 } 1470 1471 /* 1472 * get the filename given the minor number, or the minor number given 1473 * the name. 1474 */ 1475 /*ARGSUSED*/ 1476 static int 1477 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, 1478 struct cred *credp, int ioctl_flag) 1479 { 1480 struct lofi_state *lsp; 1481 struct lofi_ioctl *klip; 1482 int error; 1483 minor_t minor; 1484 1485 klip = copy_in_lofi_ioctl(ulip, ioctl_flag); 1486 if (klip == NULL) 1487 return (EFAULT); 1488 1489 switch (which) { 1490 case LOFI_GET_FILENAME: 1491 minor = klip->li_minor; 1492 if (minor == 0) { 1493 free_lofi_ioctl(klip); 1494 return (EINVAL); 1495 } 1496 1497 mutex_enter(&lofi_lock); 1498 lsp = ddi_get_soft_state(lofi_statep, minor); 1499 if (lsp == NULL) { 1500 mutex_exit(&lofi_lock); 1501 free_lofi_ioctl(klip); 1502 return (ENXIO); 1503 } 1504 (void) strcpy(klip->li_filename, lsp->ls_filename); 1505 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 1506 sizeof (klip->li_algorithm)); 1507 mutex_exit(&lofi_lock); 1508 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1509 free_lofi_ioctl(klip); 1510 return (error); 1511 case LOFI_GET_MINOR: 1512 mutex_enter(&lofi_lock); 1513 klip->li_minor = file_to_minor(klip->li_filename); 1514 mutex_exit(&lofi_lock); 1515 if (klip->li_minor == 0) { 1516 free_lofi_ioctl(klip); 1517 return (ENOENT); 1518 } 1519 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1520 free_lofi_ioctl(klip); 1521 return (error); 1522 case LOFI_CHECK_COMPRESSED: 1523 mutex_enter(&lofi_lock); 1524 klip->li_minor = file_to_minor(klip->li_filename); 1525 mutex_exit(&lofi_lock); 1526 if (klip->li_minor == 0) { 1527 free_lofi_ioctl(klip); 1528 return (ENOENT); 1529 } 1530 mutex_enter(&lofi_lock); 1531 lsp = ddi_get_soft_state(lofi_statep, klip->li_minor); 1532 if (lsp == NULL) { 1533 mutex_exit(&lofi_lock); 1534 free_lofi_ioctl(klip); 1535 return (ENXIO); 1536 } 1537 ASSERT(strcmp(klip->li_filename, lsp->ls_filename) == 0); 1538 1539 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 1540 sizeof (klip->li_algorithm)); 1541 mutex_exit(&lofi_lock); 1542 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1543 free_lofi_ioctl(klip); 1544 return (error); 1545 default: 1546 free_lofi_ioctl(klip); 1547 return (EINVAL); 1548 } 1549 1550 } 1551 1552 static int 1553 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, 1554 int *rvalp) 1555 { 1556 int error; 1557 enum dkio_state dkstate; 1558 struct lofi_state *lsp; 1559 minor_t minor; 1560 1561 #ifdef lint 1562 credp = credp; 1563 #endif 1564 1565 minor = getminor(dev); 1566 /* lofi ioctls only apply to the master device */ 1567 if (minor == 0) { 1568 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg; 1569 1570 /* 1571 * the query command only need read-access - i.e., normal 1572 * users are allowed to do those on the ctl device as 1573 * long as they can open it read-only. 1574 */ 1575 switch (cmd) { 1576 case LOFI_MAP_FILE: 1577 if ((flag & FWRITE) == 0) 1578 return (EPERM); 1579 return (lofi_map_file(dev, lip, 1, rvalp, credp, flag)); 1580 case LOFI_MAP_FILE_MINOR: 1581 if ((flag & FWRITE) == 0) 1582 return (EPERM); 1583 return (lofi_map_file(dev, lip, 0, rvalp, credp, flag)); 1584 case LOFI_UNMAP_FILE: 1585 if ((flag & FWRITE) == 0) 1586 return (EPERM); 1587 return (lofi_unmap_file(dev, lip, 1, credp, flag)); 1588 case LOFI_UNMAP_FILE_MINOR: 1589 if ((flag & FWRITE) == 0) 1590 return (EPERM); 1591 return (lofi_unmap_file(dev, lip, 0, credp, flag)); 1592 case LOFI_GET_FILENAME: 1593 return (lofi_get_info(dev, lip, LOFI_GET_FILENAME, 1594 credp, flag)); 1595 case LOFI_GET_MINOR: 1596 return (lofi_get_info(dev, lip, LOFI_GET_MINOR, 1597 credp, flag)); 1598 case LOFI_GET_MAXMINOR: 1599 error = ddi_copyout(&lofi_max_files, &lip->li_minor, 1600 sizeof (lofi_max_files), flag); 1601 if (error) 1602 return (EFAULT); 1603 return (0); 1604 case LOFI_CHECK_COMPRESSED: 1605 return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED, 1606 credp, flag)); 1607 default: 1608 break; 1609 } 1610 } 1611 1612 lsp = ddi_get_soft_state(lofi_statep, minor); 1613 if (lsp == NULL) 1614 return (ENXIO); 1615 1616 /* 1617 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with 1618 * EIO as if the device was no longer present. 1619 */ 1620 if (lsp->ls_vp == NULL && cmd != DKIOCSTATE) 1621 return (EIO); 1622 1623 /* these are for faking out utilities like newfs */ 1624 switch (cmd) { 1625 case DKIOCGVTOC: 1626 switch (ddi_model_convert_from(flag & FMODELS)) { 1627 case DDI_MODEL_ILP32: { 1628 struct vtoc32 vtoc32; 1629 1630 vtoctovtoc32(lsp->ls_vtoc, vtoc32); 1631 if (ddi_copyout(&vtoc32, (void *)arg, 1632 sizeof (struct vtoc32), flag)) 1633 return (EFAULT); 1634 break; 1635 } 1636 1637 case DDI_MODEL_NONE: 1638 if (ddi_copyout(&lsp->ls_vtoc, (void *)arg, 1639 sizeof (struct vtoc), flag)) 1640 return (EFAULT); 1641 break; 1642 } 1643 return (0); 1644 case DKIOCINFO: 1645 error = ddi_copyout(&lsp->ls_ci, (void *)arg, 1646 sizeof (struct dk_cinfo), flag); 1647 if (error) 1648 return (EFAULT); 1649 return (0); 1650 case DKIOCG_VIRTGEOM: 1651 case DKIOCG_PHYGEOM: 1652 case DKIOCGGEOM: 1653 error = ddi_copyout(&lsp->ls_dkg, (void *)arg, 1654 sizeof (struct dk_geom), flag); 1655 if (error) 1656 return (EFAULT); 1657 return (0); 1658 case DKIOCSTATE: 1659 /* 1660 * Normally, lofi devices are always in the INSERTED state. If 1661 * a device is forcefully unmapped, then the device transitions 1662 * to the DKIO_DEV_GONE state. 1663 */ 1664 if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate), 1665 flag) != 0) 1666 return (EFAULT); 1667 1668 mutex_enter(&lsp->ls_vp_lock); 1669 while ((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) || 1670 (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) { 1671 /* 1672 * By virtue of having the device open, we know that 1673 * 'lsp' will remain valid when we return. 1674 */ 1675 if (!cv_wait_sig(&lsp->ls_vp_cv, 1676 &lsp->ls_vp_lock)) { 1677 mutex_exit(&lsp->ls_vp_lock); 1678 return (EINTR); 1679 } 1680 } 1681 1682 dkstate = (lsp->ls_vp != NULL ? DKIO_INSERTED : DKIO_DEV_GONE); 1683 mutex_exit(&lsp->ls_vp_lock); 1684 1685 if (ddi_copyout(&dkstate, (void *)arg, 1686 sizeof (dkstate), flag) != 0) 1687 return (EFAULT); 1688 return (0); 1689 default: 1690 return (ENOTTY); 1691 } 1692 } 1693 1694 static struct cb_ops lofi_cb_ops = { 1695 lofi_open, /* open */ 1696 lofi_close, /* close */ 1697 lofi_strategy, /* strategy */ 1698 nodev, /* print */ 1699 nodev, /* dump */ 1700 lofi_read, /* read */ 1701 lofi_write, /* write */ 1702 lofi_ioctl, /* ioctl */ 1703 nodev, /* devmap */ 1704 nodev, /* mmap */ 1705 nodev, /* segmap */ 1706 nochpoll, /* poll */ 1707 ddi_prop_op, /* prop_op */ 1708 0, /* streamtab */ 1709 D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */ 1710 CB_REV, 1711 lofi_aread, 1712 lofi_awrite 1713 }; 1714 1715 static struct dev_ops lofi_ops = { 1716 DEVO_REV, /* devo_rev, */ 1717 0, /* refcnt */ 1718 lofi_info, /* info */ 1719 nulldev, /* identify */ 1720 nulldev, /* probe */ 1721 lofi_attach, /* attach */ 1722 lofi_detach, /* detach */ 1723 nodev, /* reset */ 1724 &lofi_cb_ops, /* driver operations */ 1725 NULL, /* no bus operations */ 1726 NULL, /* power */ 1727 ddi_quiesce_not_needed, /* quiesce */ 1728 }; 1729 1730 static struct modldrv modldrv = { 1731 &mod_driverops, 1732 "loopback file driver", 1733 &lofi_ops, 1734 }; 1735 1736 static struct modlinkage modlinkage = { 1737 MODREV_1, 1738 &modldrv, 1739 NULL 1740 }; 1741 1742 int 1743 _init(void) 1744 { 1745 int error; 1746 1747 error = ddi_soft_state_init(&lofi_statep, 1748 sizeof (struct lofi_state), 0); 1749 if (error) 1750 return (error); 1751 1752 mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL); 1753 error = mod_install(&modlinkage); 1754 if (error) { 1755 mutex_destroy(&lofi_lock); 1756 ddi_soft_state_fini(&lofi_statep); 1757 } 1758 1759 return (error); 1760 } 1761 1762 int 1763 _fini(void) 1764 { 1765 int error; 1766 1767 if (lofi_busy()) 1768 return (EBUSY); 1769 1770 error = mod_remove(&modlinkage); 1771 if (error) 1772 return (error); 1773 1774 mutex_destroy(&lofi_lock); 1775 ddi_soft_state_fini(&lofi_statep); 1776 1777 return (error); 1778 } 1779 1780 int 1781 _info(struct modinfo *modinfop) 1782 { 1783 return (mod_info(&modlinkage, modinfop)); 1784 } 1785