1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * lofi (loopback file) driver - allows you to attach a file to a device, 30 * which can then be accessed through that device. The simple model is that 31 * you tell lofi to open a file, and then use the block device you get as 32 * you would any block device. lofi translates access to the block device 33 * into I/O on the underlying file. This is mostly useful for 34 * mounting images of filesystems. 35 * 36 * lofi is controlled through /dev/lofictl - this is the only device exported 37 * during attach, and is minor number 0. lofiadm communicates with lofi through 38 * ioctls on this device. When a file is attached to lofi, block and character 39 * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices 40 * are identified by their minor number, and the minor number is also used 41 * as the name in /dev/lofi. 
If we ever decide to support virtual disks, 42 * we'll have to divide the minor number space to identify fdisk partitions 43 * and slices, and the name will then be the minor number shifted down a 44 * few bits. Minor devices are tracked with state structures handled with 45 * ddi_soft_state(9F) for simplicity. 46 * 47 * A file attached to lofi is opened when attached and not closed until 48 * explicitly detached from lofi. This seems more sensible than deferring 49 * the open until the /dev/lofi device is opened, for a number of reasons. 50 * One is that any failure is likely to be noticed by the person (or script) 51 * running lofiadm. Another is that it would be a security problem if the 52 * file was replaced by another one after being added but before being opened. 53 * 54 * The only hard part about lofi is the ioctls. In order to support things 55 * like 'newfs' on a lofi device, it needs to support certain disk ioctls. 56 * So it has to fake disk geometry and partition information. More may need 57 * to be faked if your favorite utility doesn't work and you think it should 58 * (fdformat doesn't work because it really wants to know the type of floppy 59 * controller to talk to, and that didn't seem easy to fake. Or possibly even 60 * necessary, since we have mkfs_pcfs now). 61 * 62 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To 63 * support simulation of hotplug events, an optional force flag is provided. 64 * If a lofi device is open when a force detach is requested, then the 65 * underlying file is closed and any subsequent operations return EIO. When the 66 * device is closed for the last time, it will be cleaned up at that time. In 67 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is 68 * detached but not removed. 69 * 70 * Known problems: 71 * 72 * UFS logging. Mounting a UFS filesystem image "logging" 73 * works for basic copy testing but wedges during a build of ON through 74 * that image. 
Some deadlock in lufs holding the log mutex and then 75 * getting stuck on a buf. So for now, don't do that. 76 * 77 * Direct I/O. Since the filesystem data is being cached in the buffer 78 * cache, _and_ again in the underlying filesystem, it's tempting to 79 * enable direct I/O on the underlying file. Don't, because that deadlocks. 80 * I think to fix the cache-twice problem we might need filesystem support. 81 * 82 * lofi on itself. The simple lock strategy (lofi_lock) precludes this 83 * because you'll be in lofi_ioctl, holding the lock when you open the 84 * file, which, if it's lofi, will grab lofi_lock. We prevent this for 85 * now, though not using ddi_soft_state(9F) would make it possible to 86 * do. Though it would still be silly. 87 * 88 * Interesting things to do: 89 * 90 * Allow multiple files for each device. A poor-man's metadisk, basically. 91 * 92 * Pass-through ioctls on block devices. You can (though it's not 93 * documented), give lofi a block device as a file name. Then we shouldn't 94 * need to fake a geometry. But this is also silly unless you're replacing 95 * metadisk. 96 * 97 * Encryption. tpm would like this. Apparently Windows 2000 has it, and 98 * so does Linux. 
99 */ 100 101 #include <sys/types.h> 102 #include <netinet/in.h> 103 #include <sys/sysmacros.h> 104 #include <sys/uio.h> 105 #include <sys/kmem.h> 106 #include <sys/cred.h> 107 #include <sys/mman.h> 108 #include <sys/errno.h> 109 #include <sys/aio_req.h> 110 #include <sys/stat.h> 111 #include <sys/file.h> 112 #include <sys/modctl.h> 113 #include <sys/conf.h> 114 #include <sys/debug.h> 115 #include <sys/vnode.h> 116 #include <sys/lofi.h> 117 #include <sys/fcntl.h> 118 #include <sys/pathname.h> 119 #include <sys/filio.h> 120 #include <sys/fdio.h> 121 #include <sys/open.h> 122 #include <sys/disp.h> 123 #include <vm/seg_map.h> 124 #include <sys/ddi.h> 125 #include <sys/sunddi.h> 126 #include <sys/zmod.h> 127 128 #define NBLOCKS_PROP_NAME "Nblocks" 129 #define SIZE_PROP_NAME "Size" 130 131 static dev_info_t *lofi_dip; 132 static void *lofi_statep; 133 static kmutex_t lofi_lock; /* state lock */ 134 135 /* 136 * Because lofi_taskq_nthreads limits the actual swamping of the device, the 137 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively 138 * high. If we want to be assured that the underlying device is always busy, 139 * we must be sure that the number of bytes enqueued when the number of 140 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for 141 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should 142 * set maxalloc to be the maximum throughput (in bytes per second) of the 143 * underlying device divided by the minimum I/O size. We assume a realistic 144 * maximum throughput of one hundred megabytes per second; we set maxalloc on 145 * the lofi task queue to be 104857600 divided by DEV_BSIZE. 
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

/* upper bound on usable minor numbers; minor 0 is the control device */
uint32_t lofi_max_files = LOFI_MAX_FILES;

static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

/*
 * Table of known compression algorithms. Each entry names the
 * decompression routine, a second (unused here, NULL) routine slot,
 * the level handed to the routine, and the signature string that is
 * matched by name.
 */
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"}
};

/*
 * Report whether any file is currently mapped. Walks minors
 * 1..lofi_max_files under lofi_lock and returns EBUSY as soon as one
 * of them has soft state allocated; returns 0 when none does.
 */
static int
lofi_busy(void)
{
	minor_t	minor;

	/*
	 * We need to make sure no mappings exist - mod_remove won't
	 * help because the device isn't open.
	 */
	mutex_enter(&lofi_lock);
	for (minor = 1; minor <= lofi_max_files; minor++) {
		if (ddi_get_soft_state(lofi_statep, minor) != NULL) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
	}
	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * Return non-zero if the device is open through any of the character,
 * block or layered interfaces. The caller must hold lofi_lock.
 */
static int
is_opened(struct lofi_state *lsp)
{
	ASSERT(mutex_owned(&lofi_lock));
	return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count);
}

/*
 * Record an open of type 'otyp'. Character and block opens are plain
 * flags; layered opens are counted. Returns 0 on success and -1 for an
 * open type that is not handled. The caller must hold lofi_lock.
 */
static int
mark_opened(struct lofi_state *lsp, int otyp)
{
	ASSERT(mutex_owned(&lofi_lock));
	switch (otyp) {
	case OTYP_CHR:
		lsp->ls_chr_open = 1;
		break;
	case OTYP_BLK:
		lsp->ls_blk_open = 1;
		break;
	case OTYP_LYR:
		lsp->ls_lyr_open_count++;
		break;
	default:
		return (-1);
	}
	return (0);
}

/*
 * Undo mark_opened() for an open of type 'otyp'. Unknown open types
 * are silently ignored. The caller must hold lofi_lock.
 */
static void
mark_closed(struct lofi_state *lsp, int otyp)
{
	ASSERT(mutex_owned(&lofi_lock));
	switch (otyp) {
	case OTYP_CHR:
		lsp->ls_chr_open = 0;
		break;
	case OTYP_BLK:
		lsp->ls_blk_open = 0;
		break;
	case OTYP_LYR:
		lsp->ls_lyr_open_count--;
		break;
	default:
		break;
	}
}

/*
 * Tear down the mapping for 'minor': close and release the backing
 * vnode if it is still held, remove the Size/Nblocks properties and
 * the block and raw minor nodes, then free the filename, task queue,
 * kstat, compression index (if any) and finally the soft state itself.
 * 'lsp' must not be used after this returns.
 */
static void
lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp,
    cred_t *credp)
{
	dev_t	newdev;
	char	namebuf[50];

	/* ls_vp is already NULL when the file was forcibly detached */
	if (lsp->ls_vp) {
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
		VN_RELE(lsp->ls_vp);
		lsp->ls_vp = NULL;
	}

	newdev = makedevice(getmajor(dev), minor);
	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);

	/* the block node is named "<minor>", the raw node "<minor>,raw" */
	(void) snprintf(namebuf, sizeof (namebuf), "%d", minor);
	ddi_remove_minor_node(lofi_dip, namebuf);
	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor);
	ddi_remove_minor_node(lofi_dip, namebuf);

	kmem_free(lsp->ls_filename, lsp->ls_filename_sz);
	taskq_destroy(lsp->ls_taskq);
	if (lsp->ls_kstat) {
		kstat_delete(lsp->ls_kstat);
		mutex_destroy(&lsp->ls_kstat_lock);
	}

	/* a non-zero segment size marks a compressed file with an index */
	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}
	ddi_soft_state_free(lofi_statep, minor);
}

/*
 * open(9E) entry point.
 *
 * Minor 0 is the control device: it must be opened FEXCL as a
 * character device (EINVAL otherwise), fails with ENXIO if its soft
 * state is missing and with EBUSY if it is already open.
 *
 * Any other minor must already be mapped (EINVAL if not) and must
 * still have its backing vnode (ENXIO if not); an open type that
 * mark_opened() rejects yields EINVAL.
 */
/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	minor_t	minor;
	struct lofi_state *lsp;

	mutex_enter(&lofi_lock);
	minor = getminor(*devp);
	if (minor == 0) {
		/* master control device */
		/* must be opened exclusively */
		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR)) {
			mutex_exit(&lofi_lock);
			return (EINVAL);
		}
		lsp = ddi_get_soft_state(lofi_statep, 0);
		if (lsp == NULL) {
			mutex_exit(&lofi_lock);
			return (ENXIO);
		}
		if (is_opened(lsp)) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
		(void) mark_opened(lsp, OTYP_CHR);
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, minor);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (mark_opened(lsp, otyp) == -1) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * close(9E) entry point. Records the close and, for a mapped minor
 * that is no longer open in any way, frees the mapping if cleanup was
 * requested or the backing vnode is already gone. Returns EINVAL if
 * the minor has no soft state, 0 otherwise.
 */
/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t	minor;
	struct lofi_state *lsp;

	mutex_enter(&lofi_lock);
	minor = getminor(dev);
	lsp = ddi_get_soft_state(lofi_statep, minor);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}
	mark_closed(lsp, otyp);

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (minor != 0 && !is_opened(lsp) &&
	    (lsp->ls_cleanup || lsp->ls_vp == NULL))
		lofi_free_handle(dev, minor, lsp, credp);

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * Copy data between 'bufaddr' and the backing file through segmap,
 * starting at file offset 'offset'. The direction comes from B_READ in
 * bp->b_flags and the amount from bp->b_bcount; bp->b_resid is set to
 * b_bcount on entry and reduced by every chunk transferred. The loop
 * also stops once 'offset' reaches ls_vp_comp_size.
 *
 * Returns 0 on success, otherwise an errno: the object's own errno for
 * an FC_OBJERR fault, EIO for any other fault, or whatever
 * segmap_release() returned.
 */
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
	struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		/* never go past the file, the chunk, or the request */
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == PAGESIZE)
				smflags |= SM_DONTNEED;
			bcopy(mapaddr + mapoffset, bufaddr, xfersize);
		} else {
			smflags |= SM_WRITE;
			bcopy(bufaddr, mapaddr + mapoffset, xfersize);
		}
		bp->b_resid -= xfersize;
		bufaddr += xfersize;
		offset += xfersize;
		/* drop the softlock taken above before releasing the slot */
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		error = segmap_release(segkmap, mapaddr, smflags);
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}

/*
 * Decompression routine used by the gzip entries of
 * lofi_compress_table. Hands 'srclen' bytes at 'src' to z_uncompress()
 * with 'dst'/'dstlen' as the output buffer and its length pointer.
 * 'level' is not used. Returns 0 if z_uncompress() reports Z_OK and
 * -1 otherwise.
 */
/*ARGSUSED*/
static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level)
{
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}

/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error;
	struct lofi_state *lsp;
	uint64_t sblkno, eblkno, cmpbytes;
	offset_t offset, sblkoff, eblkoff;
	u_offset_t salign, ealign;
	u_offset_t sdiff;
	uint32_t comp_data_sz;
	caddr_t bufaddr;
	unsigned char *compressed_seg = NULL, *cmpbuf;
	unsigned char *uncompressed_seg = NULL;
	lofi_compress_info_t *li;
	size_t oblkcount, xfersize;
	unsigned long seglen;

	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
	/* the request leaves the wait queue and starts running */
	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mappable or it's a character device.
	 */
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		error = EIO;
	} else if (((lsp->ls_vp->v_flag & VNOMAP) == 0) &&
	    (lsp->ls_vp->v_type != VCHR)) {
		uint64_t i;

		/*
		 * Handle uncompressed files with a regular read
		 */
		if (lsp->ls_uncomp_seg_sz == 0) {
			error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
			goto done;
		}

		/*
		 * From here on we're dealing primarily with compressed files
		 */

		/*
		 * Compressed files can only be read from and
		 * not written to
		 */
		if (!(bp->b_flags & B_READ)) {
			bp->b_resid = bp->b_bcount;
			error = EROFS;
			goto done;
		}

		ASSERT(lsp->ls_comp_algorithm_index >= 0);
		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
		/*
		 * Compute starting and ending compressed segment numbers
		 * We use only bitwise operations avoiding division and
		 * modulus because we enforce the compression segment size
		 * to a power of 2
		 */
		sblkno = offset >> lsp->ls_comp_seg_shift;
		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);

		/*
		 * Align start offset to block boundary for segmap
		 */
		salign = lsp->ls_comp_seg_index[sblkno];
		sdiff = salign & (DEV_BSIZE - 1);
		salign -= sdiff;
		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
			/*
			 * We're dealing with the last segment of
			 * the compressed file -- the size of this
			 * segment *may not* be the same as the
			 * segment size for the file
			 */
			eblkoff = (offset + bp->b_bcount) &
			    (lsp->ls_uncomp_last_seg_sz - 1);
			ealign = lsp->ls_vp_comp_size;
		} else {
			ealign = lsp->ls_comp_seg_index[eblkno + 1];
		}

		/*
		 * Preserve original request parameters
		 */
		oblkcount = bp->b_bcount;

		/*
		 * Assign the calculated parameters
		 */
		comp_data_sz = ealign - salign;
		bp->b_bcount = comp_data_sz;

		/*
		 * Allocate fixed size memory blocks to hold compressed
		 * segments and one uncompressed segment since we
		 * uncompress segments one at a time
		 */
		compressed_seg = kmem_alloc(bp->b_bcount, KM_SLEEP);
		uncompressed_seg = kmem_alloc(lsp->ls_uncomp_seg_sz, KM_SLEEP);
		/*
		 * Map in the calculated number of blocks
		 */
		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
		    bp, lsp);

		/*
		 * b_bcount was borrowed to size the compressed read; put
		 * the caller's count back and start with nothing delivered.
		 */
		bp->b_bcount = oblkcount;
		bp->b_resid = oblkcount;
		if (error != 0)
			goto done;

		/*
		 * We have the compressed blocks, now uncompress them
		 */
		cmpbuf = compressed_seg + sdiff;
		for (i = sblkno; i < (eblkno + 1) && i < lsp->ls_comp_index_sz;
		    i++) {
			/*
			 * Each of the segment index entries contains
			 * the starting block number for that segment.
			 * The number of compressed bytes in a segment
			 * is thus the difference between the starting
			 * block number of this segment and the starting
			 * block number of the next segment.
			 */
			if ((i == eblkno) &&
			    (i == lsp->ls_comp_index_sz - 1)) {
				cmpbytes = lsp->ls_vp_comp_size -
				    lsp->ls_comp_seg_index[i];
			} else {
				cmpbytes = lsp->ls_comp_seg_index[i + 1] -
				    lsp->ls_comp_seg_index[i];
			}

			/*
			 * The first byte in a compressed segment is a flag
			 * that indicates whether this segment is compressed
			 * at all
			 */
			if (*cmpbuf == UNCOMPRESSED) {
				bcopy((cmpbuf + SEGHDR), uncompressed_seg,
				    (cmpbytes - SEGHDR));
			} else {
				seglen = lsp->ls_uncomp_seg_sz;

				if (li->l_decompress((cmpbuf + SEGHDR),
				    (cmpbytes - SEGHDR), uncompressed_seg,
				    &seglen, li->l_level) != 0) {
					error = EIO;
					goto done;
				}
			}

			/*
			 * Determine how much uncompressed data we
			 * have to copy and copy it
			 */
			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
			if (i == eblkno) {
				if (i == (lsp->ls_comp_index_sz - 1))
					xfersize -= (lsp->ls_uncomp_last_seg_sz
					    - eblkoff);
				else
					xfersize -=
					    (lsp->ls_uncomp_seg_sz - eblkoff);
			}

			bcopy((uncompressed_seg + sblkoff), bufaddr, xfersize);

			cmpbuf += cmpbytes;
			bufaddr += xfersize;
			bp->b_resid -= xfersize;
			/* only the first segment starts part-way in */
			sblkoff = 0;

			if (bp->b_resid == 0)
				break;
		}
	} else {
		ssize_t	resid;
		enum uio_rw rw;

		if (bp->b_flags & B_READ)
			rw = UIO_READ;
		else
			rw = UIO_WRITE;
		error = vn_rdwr(rw, lsp->ls_vp, bufaddr, bp->b_bcount,
		    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
		bp->b_resid = resid;
	}

done:
	/* both buffers are only ever allocated on the compressed path */
	if (compressed_seg != NULL)
		kmem_free(compressed_seg, comp_data_sz);
	if (uncompressed_seg != NULL)
		kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);

	if (lsp->ls_kstat) {
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	/* this I/O is finished; wake anyone waiting on ls_vp_cv at zero */
	mutex_enter(&lsp->ls_vp_lock);
	if (--lsp->ls_vp_iocount == 0)
		cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);

	bioerror(bp, error);
	biodone(bp);
}

/*
 * strategy(9E) entry point. Requests that cannot be serviced are
 * completed right here: EIO when the backing vnode is gone or a close
 * has been requested, ENXIO for anything beyond the end of the file
 * and for writes exactly at the end; a read exactly at the end
 * completes with nothing transferred. Everything else is counted in
 * ls_vp_iocount and handed to the per-device task queue. Always
 * returns 0; errors are reported through the buf.
 */
static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t	offset;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. task
	 * queues were incredibly easy so they win.
	 */
	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		bioerror(bp, EIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
	if (offset == lsp->ls_vp_size) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if (offset > lsp->ls_vp_size) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	/* counted while still holding ls_vp_lock; the task drops it */
	lsp->ls_vp_iocount++;
	mutex_exit(&lsp->ls_vp_lock);

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}

/*
 * read(9E) entry point: raw reads go through physio() to
 * lofi_strategy(). The control device (minor 0) gets EINVAL.
 */
/*ARGSUSED2*/
static int
lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * write(9E) entry point: raw writes go through physio() to
 * lofi_strategy(). The control device (minor 0) gets EINVAL.
 */
/*ARGSUSED2*/
static int
lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
}

/*
 * aread(9E) entry point: asynchronous reads go through aphysio() to
 * lofi_strategy(). The control device (minor 0) gets EINVAL.
 */
/*ARGSUSED2*/
static int
lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
}

/*
 * awrite(9E) entry point: asynchronous writes go through aphysio() to
 * lofi_strategy(). The control device (minor 0) gets EINVAL.
 */
/*ARGSUSED2*/
static int
lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
}

/*
 * getinfo(9E) entry point. Every dev_t maps to the single lofi
 * dev_info node and to instance 0; any other query fails.
 */
/*ARGSUSED*/
static int
lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = lofi_dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*result = 0;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}

/*
 * attach(9E) entry point. Only DDI_ATTACH is supported. Allocates the
 * soft state for minor 0, creates the control minor node and the
 * DDI_KERNEL_IOCTL property, undoing the earlier steps if a later one
 * fails, and remembers the dev_info node in lofi_dip.
 */
static int
lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	error;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);
	error = ddi_soft_state_zalloc(lofi_statep, 0);
	if (error == DDI_FAILURE) {
		return (DDI_FAILURE);
	}
	error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
	    DDI_PSEUDO, NULL);
	if (error == DDI_FAILURE) {
		ddi_soft_state_free(lofi_statep, 0);
		return (DDI_FAILURE);
	}
	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		ddi_remove_minor_node(dip, NULL);
		ddi_soft_state_free(lofi_statep, 0);
		return (DDI_FAILURE);
	}
	lofi_dip = dip;
	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

/*
 * detach(9E) entry point. Only DDI_DETACH is supported, and only
 * while no file is mapped (see lofi_busy()). Removes all minor nodes
 * and properties and frees the control device's soft state.
 */
static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);
	if (lofi_busy())
		return (DDI_FAILURE);
	lofi_dip = NULL;
	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);
	ddi_soft_state_free(lofi_statep, 0);
	return (DDI_SUCCESS);
}

/*
 * These two just simplify the rest of the ioctls that need to copyin/out
 * the lofi_ioctl structure.
 *
 * copy_in_lofi_ioctl() returns a kmem_alloc'ed copy of the caller's
 * structure, to be released with free_lofi_ioctl(). It returns NULL
 * both when the copyin fails and when li_minor is larger than
 * lofi_max_files; the two cases cannot be told apart by the caller.
 */
struct lofi_ioctl *
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, int flag)
{
	struct lofi_ioctl *klip;
	int	error;

	klip = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error) {
		kmem_free(klip, sizeof (struct lofi_ioctl));
		return (NULL);
	}

	/* make sure filename is always null-terminated */
	/* NOTE(review): assumes li_filename holds MAXPATHLEN + 1 bytes */
	klip->li_filename[MAXPATHLEN] = '\0';

	/* validate minor number */
	if (klip->li_minor > lofi_max_files) {
		kmem_free(klip, sizeof (struct lofi_ioctl));
		return (NULL);
	}
	return (klip);
}

/*
 * Copy the kernel's lofi_ioctl structure back out to the caller.
 * Returns 0 on success and EFAULT if the copyout fails.
 */
int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
	int flag)
{
	int	error;

	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}

/*
 * Release a structure obtained from copy_in_lofi_ioctl().
 */
void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	kmem_free(klip, sizeof (struct lofi_ioctl));
}

/*
 * Return the minor number 'filename' is mapped to, if it is.
878 */ 879 static int 880 file_to_minor(char *filename) 881 { 882 minor_t minor; 883 struct lofi_state *lsp; 884 885 ASSERT(mutex_owned(&lofi_lock)); 886 for (minor = 1; minor <= lofi_max_files; minor++) { 887 lsp = ddi_get_soft_state(lofi_statep, minor); 888 if (lsp == NULL) 889 continue; 890 if (strcmp(lsp->ls_filename, filename) == 0) 891 return (minor); 892 } 893 return (0); 894 } 895 896 /* 897 * lofiadm does some validation, but since Joe Random (or crashme) could 898 * do our ioctls, we need to do some validation too. 899 */ 900 static int 901 valid_filename(const char *filename) 902 { 903 static char *blkprefix = "/dev/" LOFI_BLOCK_NAME "/"; 904 static char *charprefix = "/dev/" LOFI_CHAR_NAME "/"; 905 906 /* must be absolute path */ 907 if (filename[0] != '/') 908 return (0); 909 /* must not be lofi */ 910 if (strncmp(filename, blkprefix, strlen(blkprefix)) == 0) 911 return (0); 912 if (strncmp(filename, charprefix, strlen(charprefix)) == 0) 913 return (0); 914 return (1); 915 } 916 917 /* 918 * Fakes up a disk geometry, and one big partition, based on the size 919 * of the file. This is needed because we allow newfs'ing the device, 920 * and newfs will do several disk ioctls to figure out the geometry and 921 * partition information. It uses that information to determine the parameters 922 * to pass to mkfs. Geometry is pretty much irrelevant these days, but we 923 * have to support it. 924 */ 925 static void 926 fake_disk_geometry(struct lofi_state *lsp) 927 { 928 /* dk_geom - see dkio(7I) */ 929 /* 930 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs 931 * of sectors), but that breaks programs like fdisk which want to 932 * partition a disk by cylinder. With one cylinder, you can't create 933 * an fdisk partition and put pcfs on it for testing (hard to pick 934 * a number between one and one). 
935 * 936 * The cheezy floppy test is an attempt to not have too few cylinders 937 * for a small file, or so many on a big file that you waste space 938 * for backup superblocks or cylinder group structures. 939 */ 940 if (lsp->ls_vp_size < (2 * 1024 * 1024)) /* floppy? */ 941 lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (100 * 1024); 942 else 943 lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (300 * 1024); 944 /* in case file file is < 100k */ 945 if (lsp->ls_dkg.dkg_ncyl == 0) 946 lsp->ls_dkg.dkg_ncyl = 1; 947 lsp->ls_dkg.dkg_acyl = 0; 948 lsp->ls_dkg.dkg_bcyl = 0; 949 lsp->ls_dkg.dkg_nhead = 1; 950 lsp->ls_dkg.dkg_obs1 = 0; 951 lsp->ls_dkg.dkg_intrlv = 0; 952 lsp->ls_dkg.dkg_obs2 = 0; 953 lsp->ls_dkg.dkg_obs3 = 0; 954 lsp->ls_dkg.dkg_apc = 0; 955 lsp->ls_dkg.dkg_rpm = 7200; 956 lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl; 957 lsp->ls_dkg.dkg_nsect = lsp->ls_vp_size / 958 (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl); 959 lsp->ls_dkg.dkg_write_reinstruct = 0; 960 lsp->ls_dkg.dkg_read_reinstruct = 0; 961 962 /* vtoc - see dkio(7I) */ 963 bzero(&lsp->ls_vtoc, sizeof (struct vtoc)); 964 lsp->ls_vtoc.v_sanity = VTOC_SANE; 965 lsp->ls_vtoc.v_version = V_VERSION; 966 bcopy(LOFI_DRIVER_NAME, lsp->ls_vtoc.v_volume, 7); 967 lsp->ls_vtoc.v_sectorsz = DEV_BSIZE; 968 lsp->ls_vtoc.v_nparts = 1; 969 lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED; 970 971 /* 972 * A compressed file is read-only, other files can 973 * be read-write 974 */ 975 if (lsp->ls_uncomp_seg_sz > 0) { 976 lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT | V_RONLY; 977 } else { 978 lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT; 979 } 980 lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0; 981 /* 982 * The partition size cannot just be the number of sectors, because 983 * that might not end on a cylinder boundary. And if that's the case, 984 * newfs/mkfs will print a scary warning. So just figure the size 985 * based on the number of cylinders and sectors/cylinder. 
986 */ 987 lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl * 988 lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead; 989 990 /* dk_cinfo - see dkio(7I) */ 991 bzero(&lsp->ls_ci, sizeof (struct dk_cinfo)); 992 (void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME); 993 lsp->ls_ci.dki_ctype = DKC_MD; 994 lsp->ls_ci.dki_flags = 0; 995 lsp->ls_ci.dki_cnum = 0; 996 lsp->ls_ci.dki_addr = 0; 997 lsp->ls_ci.dki_space = 0; 998 lsp->ls_ci.dki_prio = 0; 999 lsp->ls_ci.dki_vec = 0; 1000 (void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME); 1001 lsp->ls_ci.dki_unit = 0; 1002 lsp->ls_ci.dki_slave = 0; 1003 lsp->ls_ci.dki_partition = 0; 1004 /* 1005 * newfs uses this to set maxcontig. Must not be < 16, or it 1006 * will be 0 when newfs multiplies it by DEV_BSIZE and divides 1007 * it by the block size. Then tunefs doesn't work because 1008 * maxcontig is 0. 1009 */ 1010 lsp->ls_ci.dki_maxtransfer = 16; 1011 } 1012 1013 /* 1014 * map in a compressed file 1015 * 1016 * Read in the header and the index that follows. 1017 * 1018 * The header is as follows - 1019 * 1020 * Signature (name of the compression algorithm) 1021 * Compression segment size (a multiple of 512) 1022 * Number of index entries 1023 * Size of the last block 1024 * The array containing the index entries 1025 * 1026 * The header information is always stored in 1027 * network byte order on disk. 
1028 */ 1029 static int 1030 lofi_map_compressed_file(struct lofi_state *lsp, char *buf) 1031 { 1032 uint32_t index_sz, header_len, i; 1033 ssize_t resid; 1034 enum uio_rw rw; 1035 char *tbuf = buf; 1036 int error; 1037 1038 /* The signature has already been read */ 1039 tbuf += sizeof (lsp->ls_comp_algorithm); 1040 bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz)); 1041 lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz); 1042 1043 /* 1044 * The compressed segment size must be a power of 2 1045 */ 1046 if (lsp->ls_uncomp_seg_sz % 2) 1047 return (EINVAL); 1048 1049 for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++) 1050 ; 1051 1052 lsp->ls_comp_seg_shift = i; 1053 1054 tbuf += sizeof (lsp->ls_uncomp_seg_sz); 1055 bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz)); 1056 lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz); 1057 1058 tbuf += sizeof (lsp->ls_comp_index_sz); 1059 bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz), 1060 sizeof (lsp->ls_uncomp_last_seg_sz)); 1061 lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz); 1062 1063 /* 1064 * Compute the total size of the uncompressed data 1065 * for use in fake_disk_geometry and other calculations. 1066 * Disk geometry has to be faked with respect to the 1067 * actual uncompressed data size rather than the 1068 * compressed file size. 
1069 */ 1070 lsp->ls_vp_size = (lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz 1071 + lsp->ls_uncomp_last_seg_sz; 1072 1073 /* 1074 * Index size is rounded up to a 512 byte boundary for ease 1075 * of segmapping 1076 */ 1077 index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz; 1078 header_len = sizeof (lsp->ls_comp_algorithm) + 1079 sizeof (lsp->ls_uncomp_seg_sz) + 1080 sizeof (lsp->ls_comp_index_sz) + 1081 sizeof (lsp->ls_uncomp_last_seg_sz); 1082 lsp->ls_comp_offbase = header_len + index_sz; 1083 1084 index_sz += header_len; 1085 index_sz = roundup(index_sz, DEV_BSIZE); 1086 1087 lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP); 1088 lsp->ls_comp_index_data_sz = index_sz; 1089 1090 /* 1091 * Read in the index -- this has a side-effect 1092 * of reading in the header as well 1093 */ 1094 rw = UIO_READ; 1095 error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz, 1096 0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); 1097 1098 if (error != 0) 1099 return (error); 1100 1101 /* Skip the header, this is where the index really begins */ 1102 lsp->ls_comp_seg_index = 1103 /*LINTED*/ 1104 (uint64_t *)(lsp->ls_comp_index_data + header_len); 1105 1106 /* 1107 * Now recompute offsets in the index to account for 1108 * the header length 1109 */ 1110 for (i = 0; i < lsp->ls_comp_index_sz; i++) { 1111 lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase + 1112 BE_64(lsp->ls_comp_seg_index[i]); 1113 } 1114 1115 return (error); 1116 } 1117 1118 /* 1119 * Check to see if the passed in signature is a valid 1120 * one. If it is valid, return the index into 1121 * lofi_compress_table. 1122 * 1123 * Return -1 if it is invalid 1124 */ 1125 static int lofi_compress_select(char *signature) 1126 { 1127 int i; 1128 1129 for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) { 1130 if (strcmp(lofi_compress_table[i].l_name, signature) == 0) 1131 return (i); 1132 } 1133 1134 return (-1); 1135 } 1136 1137 /* 1138 * map a file to a minor number. 
Return the minor number.
 */
/*
 * dev        - device the ioctl arrived on; only its major number is used.
 * ulip       - caller's lofi_ioctl structure (copied in, and copied back
 *              out on success with li_minor filled in).
 * pickminor  - non-zero: use the first free minor; zero: use li_minor.
 * rvalp      - if non-NULL, receives the minor number.
 * credp      - credentials used for VOP_GETATTR/VOP_CLOSE.
 * ioctl_flag - ioctl mode flags passed to the copy-in/copy-out helpers.
 *
 * Returns 0 on success.  On failure returns EFAULT, EINVAL, EBUSY, EAGAIN,
 * EEXIST, ENOMEM, ENXIO or the error from the failing vnode operation, and
 * everything set up so far is undone.
 */
static int
lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
    int *rvalp, struct cred *credp, int ioctl_flag)
{
	minor_t	newminor;
	struct lofi_state *lsp = NULL;
	struct lofi_ioctl *klip;
	int	error;
	struct vnode *vp;
	int64_t	Nblocks_prop_val;
	int64_t	Size_prop_val;
	int	compress_index;
	vattr_t	vattr;
	int	flag;
	enum vtype v_type;
	int	zalloced = 0;
	dev_t	newdev;
	char	namebuf[50];
	char	buf[DEV_BSIZE];
	char	*tbuf;
	ssize_t	resid;
	enum uio_rw rw;

	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
	if (klip == NULL)
		return (EFAULT);

	mutex_enter(&lofi_lock);

	if (!valid_filename(klip->li_filename)) {
		error = EINVAL;
		goto out;
	}

	if (file_to_minor(klip->li_filename) != 0) {
		error = EBUSY;
		goto out;
	}

	if (pickminor) {
		/* Find a free one */
		for (newminor = 1; newminor <= lofi_max_files; newminor++)
			if (ddi_get_soft_state(lofi_statep, newminor) == NULL)
				break;
		/*
		 * NOTE(review): the search above runs up to and including
		 * lofi_max_files, but this test also rejects a free slot at
		 * exactly lofi_max_files.  Left as is; confirm which bound
		 * is intended against the rest of the driver.
		 */
		if (newminor >= lofi_max_files) {
			error = EAGAIN;
			goto out;
		}
	} else {
		newminor = klip->li_minor;
		if (ddi_get_soft_state(lofi_statep, newminor) != NULL) {
			error = EEXIST;
			goto out;
		}
	}

	/* make sure it's valid */
	error = lookupname(klip->li_filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp);
	if (error) {
		goto out;
	}
	v_type = vp->v_type;
	VN_RELE(vp);
	if (!V_ISLOFIABLE(v_type)) {
		error = EINVAL;
		goto out;
	}
	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
	if (error) {
		/* try read-only */
		flag &= ~FWRITE;
		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
		    &vp, 0, 0);
		if (error) {
			goto out;
		}
	}
	vattr.va_mask = AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
	if (error) {
		goto closeout;
	}
	/* the file needs to be a multiple of the block size */
	if ((vattr.va_size % DEV_BSIZE) != 0) {
		error = EINVAL;
		goto closeout;
	}
	newdev = makedevice(getmajor(dev), newminor);
	Size_prop_val = vattr.va_size;
	if ((ddi_prop_update_int64(newdev, lofi_dip,
	    SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) {
		error = EINVAL;
		goto closeout;
	}
	Nblocks_prop_val = vattr.va_size / DEV_BSIZE;
	if ((ddi_prop_update_int64(newdev, lofi_dip,
	    NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
		error = EINVAL;
		goto propout;
	}
	error = ddi_soft_state_zalloc(lofi_statep, newminor);
	if (error == DDI_FAILURE) {
		error = ENOMEM;
		goto propout;
	}
	zalloced = 1;
	(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
	/* The result must be captured, or the check below tests stale data */
	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, newminor,
	    DDI_PSEUDO, NULL);
	if (error != DDI_SUCCESS) {
		error = ENXIO;
		goto propout;
	}
	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor);
	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, newminor,
	    DDI_PSEUDO, NULL);
	if (error != DDI_SUCCESS) {
		/* remove block node */
		(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
		ddi_remove_minor_node(lofi_dip, namebuf);
		error = ENXIO;
		goto propout;
	}
	lsp = ddi_get_soft_state(lofi_statep, newminor);
	lsp->ls_filename_sz = strlen(klip->li_filename) + 1;
	lsp->ls_filename = kmem_alloc(lsp->ls_filename_sz, KM_SLEEP);
	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
	    LOFI_DRIVER_NAME, newminor);
	lsp->ls_taskq = taskq_create(namebuf, lofi_taskq_nthreads,
	    minclsyspri, 1, lofi_taskq_maxalloc, 0);
	lsp->ls_kstat = kstat_create(LOFI_DRIVER_NAME, newminor,
	    NULL, "disk", KSTAT_TYPE_IO, 1, 0);
	if (lsp->ls_kstat) {
		mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
		lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
		kstat_install(lsp->ls_kstat);
	}
	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

	/*
	 * save open mode so file can be closed properly and vnode counts
	 * updated correctly.
	 */
	lsp->ls_openflag = flag;

	/*
	 * Try to handle stacked lofs vnodes.
	 */
	if (vp->v_type == VREG) {
		if (VOP_REALVP(vp, &lsp->ls_vp, NULL) != 0) {
			lsp->ls_vp = vp;
		} else {
			/*
			 * Even though vp was obtained via vn_open(), we
			 * can't call vn_close() on it, since lofs will
			 * pass the VOP_CLOSE() on down to the realvp
			 * (which we are about to use). Hence we merely
			 * drop the reference to the lofs vnode and hold
			 * the realvp so things behave as if we've
			 * opened the realvp without any interaction
			 * with lofs.
			 */
			VN_HOLD(lsp->ls_vp);
			VN_RELE(vp);
		}
	} else {
		lsp->ls_vp = vp;
	}
	/*
	 * From here on the vnode we hold is lsp->ls_vp; point vp at it so
	 * the error path closes and releases that one rather than a lofs
	 * vnode whose reference was already dropped above.
	 */
	vp = lsp->ls_vp;
	lsp->ls_vp_size = vattr.va_size;
	(void) strcpy(lsp->ls_filename, klip->li_filename);
	if (rvalp)
		*rvalp = (int)newminor;
	klip->li_minor = newminor;

	/*
	 * Read the file signature to check if it is compressed.
	 * 'rw' is set to read since only reads are allowed to
	 * a compressed file.  The buffer is cleared first so that a
	 * short read (e.g. an empty file) cannot leave stack garbage
	 * to be compared against the signature table.
	 */
	bzero(buf, sizeof (buf));
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		goto stateout;

	tbuf = buf;
	lsp->ls_uncomp_seg_sz = 0;
	lsp->ls_vp_comp_size = lsp->ls_vp_size;
	lsp->ls_comp_algorithm[0] = '\0';

	compress_index = lofi_compress_select(tbuf);
	if (compress_index != -1) {
		lsp->ls_comp_algorithm_index = compress_index;
		(void) strlcpy(lsp->ls_comp_algorithm,
		    lofi_compress_table[compress_index].l_name,
		    sizeof (lsp->ls_comp_algorithm));
		error = lofi_map_compressed_file(lsp, buf);
		if (error != 0)
			goto stateout;

		/* update DDI properties */
		Size_prop_val = lsp->ls_vp_size;
		if ((ddi_prop_update_int64(newdev, lofi_dip, SIZE_PROP_NAME,
		    Size_prop_val)) != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto stateout;
		}

		Nblocks_prop_val = lsp->ls_vp_size / DEV_BSIZE;
		if ((ddi_prop_update_int64(newdev, lofi_dip, NBLOCKS_PROP_NAME,
		    Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto stateout;
		}
	}

	fake_disk_geometry(lsp);
	mutex_exit(&lofi_lock);
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);

stateout:
	/*
	 * Failure after the soft state was populated.  Everything hung off
	 * it has to go before the state itself is freed below; otherwise
	 * the taskq, the installed kstat (whose ks_lock points into the
	 * state), the filename, any compression index and both minor nodes
	 * would outlive it.
	 */
	if (lsp->ls_comp_index_data != NULL) {
		kmem_free(lsp->ls_comp_index_data,
		    lsp->ls_comp_index_data_sz);
	}
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
	if (lsp->ls_kstat) {
		kstat_delete(lsp->ls_kstat);
		mutex_destroy(&lsp->ls_kstat_lock);
	}
	taskq_destroy(lsp->ls_taskq);
	kmem_free(lsp->ls_filename, lsp->ls_filename_sz);
	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor);
	ddi_remove_minor_node(lofi_dip, namebuf);
	(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
	ddi_remove_minor_node(lofi_dip, namebuf);
propout:
	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
closeout:
	(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
	VN_RELE(vp);
out:
	if (zalloced)
		ddi_soft_state_free(lofi_statep, newminor);
	mutex_exit(&lofi_lock);
	free_lofi_ioctl(klip);
	return (error);
}

/*
 * unmap a file.
1382 */ 1383 static int 1384 lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename, 1385 struct cred *credp, int ioctl_flag) 1386 { 1387 struct lofi_state *lsp; 1388 struct lofi_ioctl *klip; 1389 minor_t minor; 1390 1391 klip = copy_in_lofi_ioctl(ulip, ioctl_flag); 1392 if (klip == NULL) 1393 return (EFAULT); 1394 1395 mutex_enter(&lofi_lock); 1396 if (byfilename) { 1397 minor = file_to_minor(klip->li_filename); 1398 } else { 1399 minor = klip->li_minor; 1400 } 1401 if (minor == 0) { 1402 mutex_exit(&lofi_lock); 1403 free_lofi_ioctl(klip); 1404 return (ENXIO); 1405 } 1406 lsp = ddi_get_soft_state(lofi_statep, minor); 1407 if (lsp == NULL || lsp->ls_vp == NULL) { 1408 mutex_exit(&lofi_lock); 1409 free_lofi_ioctl(klip); 1410 return (ENXIO); 1411 } 1412 1413 /* 1414 * If it's still held open, we'll do one of three things: 1415 * 1416 * If no flag is set, just return EBUSY. 1417 * 1418 * If the 'cleanup' flag is set, unmap and remove the device when 1419 * the last user finishes. 1420 * 1421 * If the 'force' flag is set, then we forcibly close the underlying 1422 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl 1423 * will return DKIO_DEV_GONE. When the device is last closed, the 1424 * device will be cleaned up appropriately. 1425 * 1426 * This is complicated by the fact that we may have outstanding 1427 * dispatched I/Os. Rather than having a single mutex to serialize all 1428 * I/O, we keep a count of the number of outstanding I/O requests, as 1429 * well as a flag to indicate that no new I/Os should be dispatched. 1430 * We set the flag, wait for the number of outstanding I/Os to reach 0, 1431 * and then close the underlying vnode. 
1432 */ 1433 1434 if (is_opened(lsp)) { 1435 if (klip->li_force) { 1436 mutex_enter(&lsp->ls_vp_lock); 1437 lsp->ls_vp_closereq = B_TRUE; 1438 while (lsp->ls_vp_iocount > 0) 1439 cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock); 1440 (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, 1441 credp, NULL); 1442 VN_RELE(lsp->ls_vp); 1443 lsp->ls_vp = NULL; 1444 cv_broadcast(&lsp->ls_vp_cv); 1445 mutex_exit(&lsp->ls_vp_lock); 1446 mutex_exit(&lofi_lock); 1447 klip->li_minor = minor; 1448 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1449 free_lofi_ioctl(klip); 1450 return (0); 1451 } else if (klip->li_cleanup) { 1452 lsp->ls_cleanup = 1; 1453 mutex_exit(&lofi_lock); 1454 free_lofi_ioctl(klip); 1455 return (0); 1456 } 1457 1458 mutex_exit(&lofi_lock); 1459 free_lofi_ioctl(klip); 1460 return (EBUSY); 1461 } 1462 1463 lofi_free_handle(dev, minor, lsp, credp); 1464 1465 klip->li_minor = minor; 1466 mutex_exit(&lofi_lock); 1467 (void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1468 free_lofi_ioctl(klip); 1469 return (0); 1470 } 1471 1472 /* 1473 * get the filename given the minor number, or the minor number given 1474 * the name. 
1475 */ 1476 /*ARGSUSED*/ 1477 static int 1478 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, 1479 struct cred *credp, int ioctl_flag) 1480 { 1481 struct lofi_state *lsp; 1482 struct lofi_ioctl *klip; 1483 int error; 1484 minor_t minor; 1485 1486 klip = copy_in_lofi_ioctl(ulip, ioctl_flag); 1487 if (klip == NULL) 1488 return (EFAULT); 1489 1490 switch (which) { 1491 case LOFI_GET_FILENAME: 1492 minor = klip->li_minor; 1493 if (minor == 0) { 1494 free_lofi_ioctl(klip); 1495 return (EINVAL); 1496 } 1497 1498 mutex_enter(&lofi_lock); 1499 lsp = ddi_get_soft_state(lofi_statep, minor); 1500 if (lsp == NULL) { 1501 mutex_exit(&lofi_lock); 1502 free_lofi_ioctl(klip); 1503 return (ENXIO); 1504 } 1505 (void) strcpy(klip->li_filename, lsp->ls_filename); 1506 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 1507 sizeof (klip->li_algorithm)); 1508 mutex_exit(&lofi_lock); 1509 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1510 free_lofi_ioctl(klip); 1511 return (error); 1512 case LOFI_GET_MINOR: 1513 mutex_enter(&lofi_lock); 1514 klip->li_minor = file_to_minor(klip->li_filename); 1515 mutex_exit(&lofi_lock); 1516 if (klip->li_minor == 0) { 1517 free_lofi_ioctl(klip); 1518 return (ENOENT); 1519 } 1520 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1521 free_lofi_ioctl(klip); 1522 return (error); 1523 case LOFI_CHECK_COMPRESSED: 1524 mutex_enter(&lofi_lock); 1525 klip->li_minor = file_to_minor(klip->li_filename); 1526 mutex_exit(&lofi_lock); 1527 if (klip->li_minor == 0) { 1528 free_lofi_ioctl(klip); 1529 return (ENOENT); 1530 } 1531 mutex_enter(&lofi_lock); 1532 lsp = ddi_get_soft_state(lofi_statep, klip->li_minor); 1533 if (lsp == NULL) { 1534 mutex_exit(&lofi_lock); 1535 free_lofi_ioctl(klip); 1536 return (ENXIO); 1537 } 1538 ASSERT(strcmp(klip->li_filename, lsp->ls_filename) == 0); 1539 1540 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 1541 sizeof (klip->li_algorithm)); 1542 mutex_exit(&lofi_lock); 1543 error = 
copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 1544 free_lofi_ioctl(klip); 1545 return (error); 1546 default: 1547 free_lofi_ioctl(klip); 1548 return (EINVAL); 1549 } 1550 1551 } 1552 1553 static int 1554 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, 1555 int *rvalp) 1556 { 1557 int error; 1558 enum dkio_state dkstate; 1559 struct lofi_state *lsp; 1560 minor_t minor; 1561 1562 #ifdef lint 1563 credp = credp; 1564 #endif 1565 1566 minor = getminor(dev); 1567 /* lofi ioctls only apply to the master device */ 1568 if (minor == 0) { 1569 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg; 1570 1571 /* 1572 * the query command only need read-access - i.e., normal 1573 * users are allowed to do those on the ctl device as 1574 * long as they can open it read-only. 1575 */ 1576 switch (cmd) { 1577 case LOFI_MAP_FILE: 1578 if ((flag & FWRITE) == 0) 1579 return (EPERM); 1580 return (lofi_map_file(dev, lip, 1, rvalp, credp, flag)); 1581 case LOFI_MAP_FILE_MINOR: 1582 if ((flag & FWRITE) == 0) 1583 return (EPERM); 1584 return (lofi_map_file(dev, lip, 0, rvalp, credp, flag)); 1585 case LOFI_UNMAP_FILE: 1586 if ((flag & FWRITE) == 0) 1587 return (EPERM); 1588 return (lofi_unmap_file(dev, lip, 1, credp, flag)); 1589 case LOFI_UNMAP_FILE_MINOR: 1590 if ((flag & FWRITE) == 0) 1591 return (EPERM); 1592 return (lofi_unmap_file(dev, lip, 0, credp, flag)); 1593 case LOFI_GET_FILENAME: 1594 return (lofi_get_info(dev, lip, LOFI_GET_FILENAME, 1595 credp, flag)); 1596 case LOFI_GET_MINOR: 1597 return (lofi_get_info(dev, lip, LOFI_GET_MINOR, 1598 credp, flag)); 1599 case LOFI_GET_MAXMINOR: 1600 error = ddi_copyout(&lofi_max_files, &lip->li_minor, 1601 sizeof (lofi_max_files), flag); 1602 if (error) 1603 return (EFAULT); 1604 return (0); 1605 case LOFI_CHECK_COMPRESSED: 1606 return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED, 1607 credp, flag)); 1608 default: 1609 break; 1610 } 1611 } 1612 1613 lsp = ddi_get_soft_state(lofi_statep, minor); 1614 if (lsp == NULL) 1615 
return (ENXIO); 1616 1617 /* 1618 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with 1619 * EIO as if the device was no longer present. 1620 */ 1621 if (lsp->ls_vp == NULL && cmd != DKIOCSTATE) 1622 return (EIO); 1623 1624 /* these are for faking out utilities like newfs */ 1625 switch (cmd) { 1626 case DKIOCGVTOC: 1627 switch (ddi_model_convert_from(flag & FMODELS)) { 1628 case DDI_MODEL_ILP32: { 1629 struct vtoc32 vtoc32; 1630 1631 vtoctovtoc32(lsp->ls_vtoc, vtoc32); 1632 if (ddi_copyout(&vtoc32, (void *)arg, 1633 sizeof (struct vtoc32), flag)) 1634 return (EFAULT); 1635 break; 1636 } 1637 1638 case DDI_MODEL_NONE: 1639 if (ddi_copyout(&lsp->ls_vtoc, (void *)arg, 1640 sizeof (struct vtoc), flag)) 1641 return (EFAULT); 1642 break; 1643 } 1644 return (0); 1645 case DKIOCINFO: 1646 error = ddi_copyout(&lsp->ls_ci, (void *)arg, 1647 sizeof (struct dk_cinfo), flag); 1648 if (error) 1649 return (EFAULT); 1650 return (0); 1651 case DKIOCG_VIRTGEOM: 1652 case DKIOCG_PHYGEOM: 1653 case DKIOCGGEOM: 1654 error = ddi_copyout(&lsp->ls_dkg, (void *)arg, 1655 sizeof (struct dk_geom), flag); 1656 if (error) 1657 return (EFAULT); 1658 return (0); 1659 case DKIOCSTATE: 1660 /* 1661 * Normally, lofi devices are always in the INSERTED state. If 1662 * a device is forcefully unmapped, then the device transitions 1663 * to the DKIO_DEV_GONE state. 1664 */ 1665 if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate), 1666 flag) != 0) 1667 return (EFAULT); 1668 1669 mutex_enter(&lsp->ls_vp_lock); 1670 while ((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) || 1671 (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) { 1672 /* 1673 * By virtue of having the device open, we know that 1674 * 'lsp' will remain valid when we return. 1675 */ 1676 if (!cv_wait_sig(&lsp->ls_vp_cv, 1677 &lsp->ls_vp_lock)) { 1678 mutex_exit(&lsp->ls_vp_lock); 1679 return (EINTR); 1680 } 1681 } 1682 1683 dkstate = (lsp->ls_vp != NULL ? 
DKIO_INSERTED : DKIO_DEV_GONE); 1684 mutex_exit(&lsp->ls_vp_lock); 1685 1686 if (ddi_copyout(&dkstate, (void *)arg, 1687 sizeof (dkstate), flag) != 0) 1688 return (EFAULT); 1689 return (0); 1690 default: 1691 return (ENOTTY); 1692 } 1693 } 1694 1695 static struct cb_ops lofi_cb_ops = { 1696 lofi_open, /* open */ 1697 lofi_close, /* close */ 1698 lofi_strategy, /* strategy */ 1699 nodev, /* print */ 1700 nodev, /* dump */ 1701 lofi_read, /* read */ 1702 lofi_write, /* write */ 1703 lofi_ioctl, /* ioctl */ 1704 nodev, /* devmap */ 1705 nodev, /* mmap */ 1706 nodev, /* segmap */ 1707 nochpoll, /* poll */ 1708 ddi_prop_op, /* prop_op */ 1709 0, /* streamtab */ 1710 D_64BIT | D_NEW | D_MP, /* Driver compatibility flag */ 1711 CB_REV, 1712 lofi_aread, 1713 lofi_awrite 1714 }; 1715 1716 static struct dev_ops lofi_ops = { 1717 DEVO_REV, /* devo_rev, */ 1718 0, /* refcnt */ 1719 lofi_info, /* info */ 1720 nulldev, /* identify */ 1721 nulldev, /* probe */ 1722 lofi_attach, /* attach */ 1723 lofi_detach, /* detach */ 1724 nodev, /* reset */ 1725 &lofi_cb_ops, /* driver operations */ 1726 NULL /* no bus operations */ 1727 }; 1728 1729 static struct modldrv modldrv = { 1730 &mod_driverops, 1731 "loopback file driver (%I%)", 1732 &lofi_ops, 1733 }; 1734 1735 static struct modlinkage modlinkage = { 1736 MODREV_1, 1737 &modldrv, 1738 NULL 1739 }; 1740 1741 int 1742 _init(void) 1743 { 1744 int error; 1745 1746 error = ddi_soft_state_init(&lofi_statep, 1747 sizeof (struct lofi_state), 0); 1748 if (error) 1749 return (error); 1750 1751 mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL); 1752 error = mod_install(&modlinkage); 1753 if (error) { 1754 mutex_destroy(&lofi_lock); 1755 ddi_soft_state_fini(&lofi_statep); 1756 } 1757 1758 return (error); 1759 } 1760 1761 int 1762 _fini(void) 1763 { 1764 int error; 1765 1766 if (lofi_busy()) 1767 return (EBUSY); 1768 1769 error = mod_remove(&modlinkage); 1770 if (error) 1771 return (error); 1772 1773 mutex_destroy(&lofi_lock); 1774 
ddi_soft_state_fini(&lofi_statep); 1775 1776 return (error); 1777 } 1778 1779 int 1780 _info(struct modinfo *modinfop) 1781 { 1782 return (mod_info(&modlinkage, modinfop)); 1783 } 1784