/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 Andrey Sokolov
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 */

/*
 * lofi (loopback file) driver - allows you to attach a file to a device,
 * which can then be accessed through that device. The simple model is that
 * you tell lofi to open a file, and then use the block device you get as
 * you would any block device. lofi translates access to the block device
 * into I/O on the underlying file. This is mostly useful for
 * mounting images of filesystems.
 *
 * lofi is controlled through /dev/lofictl - this is the only device exported
 * during attach, and is instance number 0. lofiadm communicates with lofi
 * through ioctls on this device. When a file is attached to lofi, block and
 * character devices are exported in /dev/lofi and /dev/rlofi. These devices
 * are identified by lofi instance number, and the instance number is also used
 * as the name in /dev/lofi.
 *
 * Virtual disks, or labeled lofi, implement virtual disk support to
 * support the partition table and related tools. Such mappings will cause
 * block and character devices to be exported in the /dev/dsk and /dev/rdsk
 * directories.
 *
 * To support virtual disks, the instance number space is divided into two
 * parts, the upper part for the instance number and the lower part for the
 * minor number space used to identify partitions and slices. The virtual
 * disk support is implemented by stacking the cmlb module. For virtual
 * disks, the partition related ioctl calls are routed to the cmlb module.
 * Compression and encryption are not supported for virtual disks.
 *
 * Mapped devices are tracked with state structures handled with
 * ddi_soft_state(9F) for simplicity.
 *
 * A file attached to lofi is opened when attached and not closed until
 * explicitly detached from lofi. This seems more sensible than deferring
 * the open until the /dev/lofi device is opened, for a number of reasons.
 * One is that any failure is likely to be noticed by the person (or script)
 * running lofiadm. Another is that it would be a security problem if the
 * file was replaced by another one after being added but before being opened.
 *
 * The only hard part about lofi is the ioctls. In order to support things
 * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
 * So it has to fake disk geometry and partition information. More may need
 * to be faked if your favorite utility doesn't work and you think it should
 * (fdformat doesn't work because it really wants to know the type of floppy
 * controller to talk to, and that didn't seem easy to fake. Or possibly even
 * necessary, since we have mkfs_pcfs now).
 *
 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To
 * support simulation of hotplug events, an optional force flag is provided.
 * If a lofi device is open when a force detach is requested, then the
 * underlying file is closed and any subsequent operations return EIO. When the
 * device is closed for the last time, it will be cleaned up at that time. In
 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
 * detached but not removed.
 *
 * Known problems:
 *
 *	UFS logging. Mounting a UFS filesystem image "logging"
 *	works for basic copy testing but wedges during a build of ON through
 *	that image. Some deadlock in lufs holding the log mutex and then
 *	getting stuck on a buf. So for now, don't do that.
 *
 *	Direct I/O. Since the filesystem data is being cached in the buffer
 *	cache, _and_ again in the underlying filesystem, it's tempting to
 *	enable direct I/O on the underlying file. Don't, because that
 *	deadlocks. I think to fix the cache-twice problem we might need
 *	filesystem support.
 *
 * Interesting things to do:
 *
 *	Allow multiple files for each device. A poor-man's metadisk,
 *	basically.
 *
 *	Pass-through ioctls on block devices. You can (though it's not
 *	documented) give lofi a block device as a file name. Then we
 *	shouldn't need to fake a geometry. However, it may be relevant if
 *	you're replacing metadisk, or using lofi to get crypto.
 *	It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
 *	and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
 *	In fact this even makes sense if you have lofi "above" metadisk.
 *
 * Encryption:
 *	Each lofi device can have its own symmetric key and cipher.
 *	They are passed to us by lofiadm(1M) in the correct format for use
 *	with the misc/kcf crypto_* routines.
 *
 *	Each block has its own IV, that is calculated in lofi_blk_mech(),
 *	based on the "master" key held in the lsp and the block number of
 *	the buffer.
 */
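
/*
 * A typical attach/use sequence, as a sketch (device numbering varies by
 * system; the commands run in userland, not in this driver):
 *
 *	# mkfile 128m /export/disk.img
 *	# lofiadm -a /export/disk.img		(prints e.g. /dev/lofi/1)
 *	# newfs /dev/rlofi/1
 *	# mount /dev/lofi/1 /mnt
 *
 * lofiadm(1M) drives the attach and detach through ioctls on /dev/lofictl;
 * everything after that is ordinary block/character device traffic handled
 * by the entry points below.
 */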

#include <sys/types.h>
#include <netinet/in.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/aio_req.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/lofi.h>
#include <sys/lofi_impl.h>	/* for cache structure */
#include <sys/fcntl.h>
#include <sys/pathname.h>
#include <sys/filio.h>
#include <sys/fdio.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <vm/seg_map.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/zmod.h>
#include <sys/id_space.h>
#include <sys/mkdev.h>
#include <sys/crypto/common.h>
#include <sys/crypto/api.h>
#include <sys/rctl.h>
#include <sys/vtoc.h>
#include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
#include <sys/scsi/impl/uscsi.h>
#include <sys/sysevent/dev.h>
#include <LzmaDec.h>

#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
#define	ZONE_PROP_NAME		"zone"

#define	SETUP_C_DATA(cd, buf, len)		\
	(cd).cd_format = CRYPTO_DATA_RAW;	\
	(cd).cd_offset = 0;			\
	(cd).cd_miscdata = NULL;		\
	(cd).cd_length = (len);			\
	(cd).cd_raw.iov_base = (buf);		\
	(cd).cd_raw.iov_len = (len);

#define	UIO_CHECK(uio)	\
	if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
	    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
		return (EINVAL); \
	}

#define	LOFI_TIMEOUT	30

static void *lofi_statep;
static kmutex_t lofi_lock;		/* state lock */
static id_space_t *lofi_id;		/* lofi ID values */
static list_t lofi_list;
static zone_key_t lofi_zone_key;

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high. If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size. We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;

/*
 * To avoid decompressing data in a compressed segment multiple times
 * when accessing small parts of a segment's data, we cache and reuse
 * the uncompressed segment's data.
 *
 * A single cached segment is sufficient to avoid lots of duplicate
 * segment decompress operations. A small cache size also reduces the
 * memory footprint.
 *
 * lofi_max_comp_cache is the maximum number of decompressed data segments
 * cached for each compressed lofi image. It can be set to 0 to disable
 * caching.
 */

uint32_t lofi_max_comp_cache = 1;
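
/*
 * lofi_taskq_nthreads and lofi_max_comp_cache above are tunables. As a
 * hypothetical example, assuming the usual /etc/system syntax for driver
 * variables, a line such as
 *
 *	set lofi:lofi_max_comp_cache = 4
 *
 * would allow up to four decompressed segments to be cached per compressed
 * image.
 */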

static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level);

lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};

static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
    size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);

struct cmlb_tg_ops lofi_tg_ops = {
	TG_DK_OPS_VERSION_1,
	lofi_tg_rdwr,
	lofi_tg_getinfo
};

/*ARGSUSED*/
static void
*SzAlloc(void *p, size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));
}

/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	kmem_free(address, size);
}

static ISzAlloc g_Alloc = { SzAlloc, SzFree };

/*
 * Free data referenced by the linked list of cached uncompressed
 * segments.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *lc;

	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}
	ASSERT(lsp->ls_comp_cache_count == 0);
}

static int
is_opened(struct lofi_state *lsp)
{
	int i;
	boolean_t last = B_TRUE;

	ASSERT(MUTEX_HELD(&lofi_lock));
	for (i = 0; i < LOFI_PART_MAX; i++) {
		if (lsp->ls_open_lyr[i]) {
			last = B_FALSE;
			break;
		}
	}

	for (i = 0; last && (i < OTYP_LYR); i++) {
		if (lsp->ls_open_reg[i]) {
			last = B_FALSE;
		}
	}

	return (!last);
}

static void
lofi_free_crypto(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_crypto_enabled) {
		/*
		 * Clean up the crypto state so that it doesn't hang around
		 * in memory after we are done with it.
		 */
		if (lsp->ls_key.ck_data != NULL) {
			bzero(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			kmem_free(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			lsp->ls_key.ck_data = NULL;
			lsp->ls_key.ck_length = 0;
		}

		if (lsp->ls_mech.cm_param != NULL) {
			kmem_free(lsp->ls_mech.cm_param,
			    lsp->ls_mech.cm_param_len);
			lsp->ls_mech.cm_param = NULL;
			lsp->ls_mech.cm_param_len = 0;
		}

		if (lsp->ls_iv_mech.cm_param != NULL) {
			kmem_free(lsp->ls_iv_mech.cm_param,
			    lsp->ls_iv_mech.cm_param_len);
			lsp->ls_iv_mech.cm_param = NULL;
			lsp->ls_iv_mech.cm_param_len = 0;
		}

		mutex_destroy(&lsp->ls_crypto_lock);
	}
}

/* ARGSUSED */
static int
lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	struct lofi_state *lsp;
	buf_t	*bp;
	int	instance;
	int	rv = 0;

	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control node does not have disk */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	if (cmd != TG_READ && cmd != TG_WRITE)
		return (EINVAL);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	bp = getrbuf(KM_SLEEP);

	if (cmd == TG_READ) {
		bp->b_flags = B_READ;
	} else {
		if (lsp->ls_readonly == B_TRUE) {
			freerbuf(bp);
			return (EROFS);
		}
		bp->b_flags = B_WRITE;
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	bp->b_lblkno = start;
	bp->b_private = NULL;
	bp->b_edev = lsp->ls_dev;

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	(void) biowait(bp);

	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}

/*
 * Get device geometry info for cmlb.
 *
 * We have mapped the disk image as a virtual block device and have to
 * report the physical/virtual geometry to cmlb.
 *
 * So we have two principal cases:
 * 1. Uninitialised image without any existing labels;
 *    in this case we fabricate the data based on the mapped image.
 * 2. Image with existing label information.
 *    Since we have no information how the image was created (it may be a
 *    dump from some physical device), we need to rely on the label
 *    information from the image, or we get "corrupted label" errors.
 *    NOTE: the label can be MBR, MBR+SMI, GPT
 */
static int
lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	struct lofi_state *lsp;
	int instance;
	int ashift;

	_NOTE(ARGUNUSED(tg_cookie));
	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control device has no storage */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 *
	 * When a mapping is created, a new lofi instance is created and
	 * lofi_attach() will call cmlb_attach() as part of the procedure
	 * to set the mapping up. This chain of events will happen in
	 * the same thread.
	 * Since cmlb_attach() will call lofi_tg_getinfo to get the
	 * capacity, we return an error on that call if the cookie is set,
	 * otherwise lofi_attach would be stuck, as the mapping is not yet
	 * finalized and lofi is not yet ready.
	 * Note, such an error is not fatal for cmlb, as the label setup
	 * will be finalized when cmlb_validate() is called.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
		mutex_exit(&lsp->ls_vp_lock);
		return (ENXIO);
	}
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	ashift = lsp->ls_lbshift;

	switch (cmd) {
	case TG_GETPHYGEOM: {
		cmlb_geom_t *geomp = arg;

		geomp->g_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		geomp->g_nsect = lsp->ls_dkg.dkg_nsect;
		geomp->g_nhead = lsp->ls_dkg.dkg_nhead;
		geomp->g_acyl = lsp->ls_dkg.dkg_acyl;
		geomp->g_ncyl = lsp->ls_dkg.dkg_ncyl;
		geomp->g_secsize = (1U << ashift);
		geomp->g_intrlv = lsp->ls_dkg.dkg_intrlv;
		geomp->g_rpm = lsp->ls_dkg.dkg_rpm;
		return (0);
	}

	case TG_GETCAPACITY:
		*(diskaddr_t *)arg =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << ashift);
		return (0);

	case TG_GETATTR: {
		tg_attribute_t *tgattr = arg;

		tgattr->media_is_writable = !lsp->ls_readonly;
		tgattr->media_is_solid_state = B_FALSE;
		return (0);
	}

	default:
		return (EINVAL);
	}
}

static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
	int i;

	ASSERT(MUTEX_HELD(&lofi_lock));

	list_remove(&lofi_list, lsp);

	lofi_free_crypto(lsp);

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
	}

	if (lsp->ls_vp != NULL) {
		(void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL);
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
		VN_RELE(lsp->ls_vp);
	}
	if (lsp->ls_stacked_vp != lsp->ls_vp)
		VN_RELE(lsp->ls_stacked_vp);

	if (lsp->ls_taskq != NULL)
		taskq_destroy(lsp->ls_taskq);

	if (lsp->ls_kstat != NULL)
		kstat_delete(lsp->ls_kstat);

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);

	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
	lsp->ls_vp_ready = B_FALSE;

	ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
	(void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE);
	id_free(lofi_id, id);
}

static void
lofi_free_dev(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_invalidate(lsp->ls_cmlbhandle, 0);
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
		lsp->ls_cmlbhandle = NULL;
	}
	(void) ddi_prop_remove_all(lsp->ls_dip);
	ddi_remove_minor_node(lsp->ls_dip, NULL);
}

/*ARGSUSED*/
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg)
{
	struct lofi_state *lsp;
	struct lofi_state *next;

	mutex_enter(&lofi_lock);

	for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {

		/* lofi_destroy() frees lsp */
		next = list_next(&lofi_list, lsp);

		if (lsp->ls_zone.zref_zone->zone_id != zoneid)
			continue;

		/*
		 * No in-zone processes are running, but something has this
		 * open. It's either a global zone process, or a lofi
		 * mount. In either case we set ls_cleanup so the last
		 * user destroys the device.
		 */
		if (is_opened(lsp)) {
			lsp->ls_cleanup = 1;
		} else {
			lofi_free_dev(lsp);
			lofi_destroy(lsp, kcred);
		}
	}

	mutex_exit(&lofi_lock);
}

/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	int	id;
	minor_t	part;
	uint64_t mask;
	diskaddr_t nblks;
	diskaddr_t lba;
	boolean_t ndelay;

	struct lofi_state *lsp;

	if (otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	id = LOFI_MINOR2ID(getminor(*devp));
	part = LOFI_PART(getminor(*devp));
	mask = (1U << part);

	/* master control device */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		mutex_exit(&lofi_lock);
		return (EROFS);
	}

	if ((lsp->ls_open_excl) & (mask)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	if (flag & FEXCL) {
		if (lsp->ls_open_lyr[part]) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (lsp->ls_open_reg[i] & mask) {
				mutex_exit(&lofi_lock);
				return (EBUSY);
			}
		}
	}

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
			/*
			 * non-blocking opens are allowed to succeed to
			 * support format and fdisk to create partitioning.
			 */
			if (!ndelay) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
		    NULL, NULL, 0) == 0) {
			if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (!ndelay) {
			mutex_exit(&lofi_lock);
			return (ENXIO);
		}
	}

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]++;
	} else {
		lsp->ls_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		lsp->ls_open_excl |= mask;
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t	part;
	int	id;
	uint64_t mask;
	struct lofi_state *lsp;

	id = LOFI_MINOR2ID(getminor(dev));
	part = LOFI_PART(getminor(dev));
	mask = (1U << part);

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	if (lsp->ls_open_excl & mask)
		lsp->ls_open_excl &= ~mask;

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]--;
	} else {
		lsp->ls_open_reg[otyp] &= ~mask;
	}

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (!is_opened(lsp) && (lsp->ls_cleanup || lsp->ls_vp == NULL)) {
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}
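
/*
 * Illustrative pseudocode for the IVM_ENC_BLKNO scheme implemented by
 * lofi_blk_mech() below (a sketch, not the authoritative code):
 *
 *	iv = zeros(iv_len);
 *	copy lblkno into iv, right-justified (zero-padded on the left);
 *	encrypt iv in place with ls_iv_mech and the master key;
 *
 * Encrypting the padded block number keeps per-block IVs unpredictable
 * even though the block numbers themselves are sequential.
 */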

/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number.  lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int	ret;
	crypto_data_t cdata;
	char	*iv;
	size_t	iv_len;
	size_t	min;
	void	*data;
	size_t	datasz;

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = 0;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}

/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time.  "len" is assumed to be a multiple of
 * DEV_BSIZE.
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt entire "len" chunk of data, we need
	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {	/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
	}

	do {
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			continue;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		}

		cdata.cd_offset += DEV_BSIZE;
		if (ciphertext != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;
	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}

#define	RDWR_RAW	1
#define	RDWR_BCOPY	2

static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t resid;
	int isread;
	int error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note: offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled && error == 0) {
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}

static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note: offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |      len
	 *    v    v       v
	 * ===|====X========|====...======|========X====|====
	 *        /-------------...---------------/
	 *         ^ bp->b_bcount/bp->b_resid at start
	 *        /----/--------/----...------/--------/
	 *         ^    ^        ^    ^        ^
	 *         |    |        |    |        nth xfersize (<= MAXBSIZE)
	 *         |    |        2nd thru n-1st xfersize (= MAXBSIZE)
	 *         |    1st xfersize (<= MAXBSIZE)
	 *         mapoffset (offset into 1st segmap, non-0 1st time,
	 *         0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}
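
/*
 * Worked example for the windowing above (illustrative numbers only,
 * assuming MAXBSIZE is 8K): a 12K transfer starting at file offset 10240
 * gives mapoffset = 10240 & 8191 = 2048 and alignedoffset = 8192, so the
 * first pass copies 8192 - 2048 = 6144 bytes; the remaining 6144 bytes
 * come from the next 8K window, starting at mapoffset 0.
 */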

/*
 * Check if segment seg_index is present in the decompressed segment
 * data cache.
 *
 * Returns a pointer to the decompressed segment data cache entry if
 * found, and NULL when decompressed data for this segment is not yet
 * cached.
 */
static struct lofi_comp_cache *
lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
	    lc = list_next(&lsp->ls_comp_cache, lc)) {
		if (lc->lc_index == seg_index) {
			/*
			 * Decompressed segment data was found in the
			 * cache.
			 *
			 * The cache uses an LRU replacement strategy;
			 * move the entry to head of list.
			 */
			list_remove(&lsp->ls_comp_cache, lc);
			list_insert_head(&lsp->ls_comp_cache, lc);
			return (lc);
		}
	}
	return (NULL);
}

/*
 * Add the data for a decompressed segment at segment index
 * seg_index to the cache of the decompressed segments.
 *
 * Returns a pointer to the cache element structure in case
 * the data was added to the cache; returns NULL when the data
 * wasn't cached.
 */
static struct lofi_comp_cache *
lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
    uchar_t *data)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}

	/*
	 * Do not cache when disabled by tunable variable
	 */
	if (lofi_max_comp_cache == 0)
		return (NULL);

	/*
	 * When the cache has not yet reached the maximum allowed
	 * number of segments, allocate a new cache element.
	 * Otherwise the cache is full; reuse the last list element
	 * (LRU) for caching the decompressed segment data.
	 *
	 * The cache element for the new decompressed segment data is
	 * added to the head of the list.
	 */
	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
		lc->lc_data = NULL;
		list_insert_head(&lsp->ls_comp_cache, lc);
		lsp->ls_comp_cache_count++;
	} else {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		if (lc == NULL)
			return (NULL);
		list_insert_head(&lsp->ls_comp_cache, lc);
	}

	/*
	 * Free old uncompressed segment data when reusing a cache
	 * entry.
	 */
	if (lc->lc_data != NULL)
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);

	lc->lc_data = data;
	lc->lc_index = seg_index;
	return (lc);
}


/*ARGSUSED*/
static int
gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}

#define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)
/*ARGSUSED*/
static int
lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	size_t insizepure;
	void *actual_src;
	ELzmaStatus status;

	insizepure = srclen - LZMA_HEADER_SIZE;
	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);

	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
	    (const Byte *)actual_src, &insizepure,
	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
	    &g_Alloc) != SZ_OK) {
		return (-1);
	}
	return (0);
}

/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error;
	int syncflag = 0;
	struct lofi_state *lsp;
	offset_t offset;
	caddr_t	bufaddr;
	size_t	len;
	size_t	xfersize;
	boolean_t bufinited = B_FALSE;

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));

	if (lsp == NULL) {
		error = ENXIO;
		goto errout;
	}
	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_iocount++;
	mutex_exit(&lsp->ls_vp_lock);

	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private)
	    << lsp->ls_lbshift;	/* offset within file */
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}
	len = bp->b_bcount;
	bufinited = B_TRUE;

	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		error = EIO;
		goto errout;
	}

	/*
	 * If we're writing and the buffer was not B_ASYNC
	 * we'll follow up with a VOP_FSYNC() to force any
	 * asynchronous I/O to stable storage.
	 */
	if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
		syncflag = FSYNC;

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mappable or it's a character device or it's an encrypted lofi.
	 */
	if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
	    lsp->ls_crypto_enabled) {
		error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
		    NULL);
	} else if (lsp->ls_uncomp_seg_sz == 0) {
		error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
	} else {
		uchar_t *compressed_seg = NULL, *cmpbuf;
		uchar_t *uncompressed_seg = NULL;
		lofi_compress_info_t *li;
		size_t oblkcount;
		ulong_t seglen;
		uint64_t sblkno, eblkno, cmpbytes;
		uint64_t uncompressed_seg_index;
		struct lofi_comp_cache *lc;
		offset_t sblkoff, eblkoff;
		u_offset_t salign, ealign;
		u_offset_t sdiff;
		uint32_t comp_data_sz;
		uint64_t i;
		int j;

		/*
		 * From here on we're dealing primarily with compressed files
		 */
		ASSERT(!lsp->ls_crypto_enabled);

		/*
		 * Compressed files can only be read from and
		 * not written to
		 */
		if (!(bp->b_flags & B_READ)) {
			bp->b_resid = bp->b_bcount;
			error = EROFS;
			goto done;
		}

		ASSERT(lsp->ls_comp_algorithm_index >= 0);
		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
		/*
		 * Compute starting and ending compressed segment numbers
		 * We use only bitwise operations avoiding division and
		 * modulus because we enforce the compression segment size
		 * to a power of 2
		 */
		sblkno = offset >> lsp->ls_comp_seg_shift;
		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);

		/*
		 * Check the decompressed segment cache.
		 *
		 * The cache is used only when the requested data
		 * is within a segment. Requests that cross
		 * segment boundaries bypass the cache.
		 */
		if (sblkno == eblkno ||
		    (sblkno + 1 == eblkno && eblkoff == 0)) {
			/*
			 * Request doesn't cross a segment boundary,
			 * now check the cache.
			 */
			mutex_enter(&lsp->ls_comp_cache_lock);
			lc = lofi_find_comp_data(lsp, sblkno);
			if (lc != NULL) {
				/*
				 * We've found the decompressed segment
				 * data in the cache; reuse it.
				 */
				bcopy(lc->lc_data + sblkoff, bufaddr,
				    bp->b_bcount);
				mutex_exit(&lsp->ls_comp_cache_lock);
				bp->b_resid = 0;
				error = 0;
				goto done;
			}
			mutex_exit(&lsp->ls_comp_cache_lock);
		}

		/*
		 * Align start offset to block boundary for segmap
		 */
		salign = lsp->ls_comp_seg_index[sblkno];
		sdiff = salign & (DEV_BSIZE - 1);
		salign -= sdiff;
		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
			/*
			 * We're dealing with the last segment of
			 * the compressed file -- the size of this
			 * segment *may not* be the same as the
			 * segment size for the file
			 */
			eblkoff = (offset + bp->b_bcount) &
			    (lsp->ls_uncomp_last_seg_sz - 1);
			ealign = lsp->ls_vp_comp_size;
		} else {
			ealign = lsp->ls_comp_seg_index[eblkno + 1];
		}

		/*
		 * Preserve original request parameters
		 */
		oblkcount = bp->b_bcount;

		/*
		 * Assign the calculated parameters
		 */
		comp_data_sz = ealign - salign;
		bp->b_bcount = comp_data_sz;

		/*
		 * Buffers to hold compressed segments are pre-allocated
		 * on a per-thread basis. Find a pre-allocated buffer
		 * that is not currently in use and mark it for use.
		 */
		mutex_enter(&lsp->ls_comp_bufs_lock);
		for (j = 0; j < lofi_taskq_nthreads; j++) {
			if (lsp->ls_comp_bufs[j].inuse == 0) {
				lsp->ls_comp_bufs[j].inuse = 1;
				break;
			}
		}

		mutex_exit(&lsp->ls_comp_bufs_lock);
		ASSERT(j < lofi_taskq_nthreads);

		/*
		 * If the pre-allocated buffer size does not match
		 * the size of the I/O request, re-allocate it with
		 * the appropriate size
		 */
		if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
			if (lsp->ls_comp_bufs[j].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[j].buf,
				    lsp->ls_comp_bufs[j].bufsize);
			lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
			    KM_SLEEP);
			lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
		}
		compressed_seg = lsp->ls_comp_bufs[j].buf;

		/*
		 * Map in the calculated number of blocks
		 */
		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
		    bp, lsp);

		bp->b_bcount = oblkcount;
		bp->b_resid = oblkcount;
		if (error != 0)
			goto done;

		/*
		 * decompress compressed blocks start
		 */
		cmpbuf = compressed_seg + sdiff;
		for (i = sblkno; i <= eblkno; i++) {
			ASSERT(i < lsp->ls_comp_index_sz - 1);
			uchar_t *useg;

			/*
			 * The last segment is special in that it is
			 * most likely not going to be the same
			 * (uncompressed) size as the other segments.
			 */
			if (i == (lsp->ls_comp_index_sz - 2)) {
				seglen = lsp->ls_uncomp_last_seg_sz;
			} else {
				seglen = lsp->ls_uncomp_seg_sz;
			}

			/*
			 * Each of the segment index entries contains
			 * the starting block number for that segment.
			 * The number of compressed bytes in a segment
			 * is thus the difference between the starting
			 * block number of this segment and the starting
			 * block number of the next segment.
			 */
			cmpbytes = lsp->ls_comp_seg_index[i + 1] -
			    lsp->ls_comp_seg_index[i];

			/*
			 * The first byte in a compressed segment is a flag
			 * that indicates whether this segment is compressed
			 * at all.
			 *
			 * The variable 'useg' is used (instead of
			 * uncompressed_seg) in this loop to keep a
			 * reference to the uncompressed segment.
			 *
			 * N.B. If 'useg' is replaced with uncompressed_seg,
			 * it leads to memory leaks and heap corruption in
			 * corner cases where compressed segments lie
			 * adjacent to uncompressed segments.
			 */
			if (*cmpbuf == UNCOMPRESSED) {
				useg = cmpbuf + SEGHDR;
			} else {
				if (uncompressed_seg == NULL)
					uncompressed_seg =
					    kmem_alloc(lsp->ls_uncomp_seg_sz,
					    KM_SLEEP);
				useg = uncompressed_seg;
				uncompressed_seg_index = i;

				if (li->l_decompress((cmpbuf + SEGHDR),
				    (cmpbytes - SEGHDR), uncompressed_seg,
				    &seglen, li->l_level) != 0) {
					error = EIO;
					goto done;
				}
			}

			/*
			 * Determine how much uncompressed data we
			 * have to copy and copy it
			 */
			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
			if (i == eblkno)
				xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);

			bcopy((useg + sblkoff), bufaddr, xfersize);

			cmpbuf += cmpbytes;
			bufaddr += xfersize;
			bp->b_resid -= xfersize;
			sblkoff = 0;

			if (bp->b_resid == 0)
				break;
		} /* decompress compressed blocks ends */

		/*
		 * Skip to done if there is no uncompressed data to cache
		 */
		if (uncompressed_seg == NULL)
			goto done;

		/*
		 * Add the data for the last decompressed segment to
		 * the cache.
		 *
		 * In case the uncompressed segment data was added to (and
		 * is referenced by) the cache, make sure we don't free it
		 * here.
		 */
		mutex_enter(&lsp->ls_comp_cache_lock);
		if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
		    uncompressed_seg)) != NULL) {
			uncompressed_seg = NULL;
		}
		mutex_exit(&lsp->ls_comp_cache_lock);

done:
		if (compressed_seg != NULL) {
			mutex_enter(&lsp->ls_comp_bufs_lock);
			lsp->ls_comp_bufs[j].inuse = 0;
			mutex_exit(&lsp->ls_comp_bufs_lock);
		}
		if (uncompressed_seg != NULL)
			kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
	} /* end of handling compressed files */

	if ((error == 0) && (syncflag != 0))
		error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);

errout:
	if (bufinited && lsp->ls_kstat) {
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	if (--lsp->ls_vp_iocount == 0)
		cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);

	bioerror(bp, error);
	biodone(bp);
}
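
/*
 * Worked example for the compressed-segment arithmetic above
 * (illustrative, assuming a 128K segment size, i.e. ls_comp_seg_shift
 * is 17 and ls_uncomp_seg_sz is 131072): a 64K read at offset 200000
 * yields
 *
 *	sblkno  = 200000 >> 17             = 1
 *	sblkoff = 200000 & 131071          = 68928
 *	eblkno  = (200000 + 65536) >> 17   = 2
 *	eblkoff = 265536 & 131071          = 3392
 *
 * Since sblkno != eblkno and eblkoff != 0, this request crosses a
 * segment boundary and bypasses the decompressed-segment cache.
 */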

static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t	offset;
	minor_t		part;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	int		shift;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. task
	 * queues were incredibly easy so they win.
	 */

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));
	part = LOFI_PART(getminor(bp->b_edev));

	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}
	shift = lsp->ls_lbshift;

	p_lba = 0;
	p_nblks = lsp->ls_vp_size >> shift;

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba,
		    NULL, NULL, 0)) {
			bioerror(bp, ENXIO);
			biodone(bp);
			return (0);
		}
	}

	/* start block past partition end? */
	if (bp->b_lblkno > p_nblks) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	offset = (bp->b_lblkno + p_lba) << shift;	/* offset within file */

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		bioerror(bp, EIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}

	/* make sure we will not pass the file or partition size */
	if (offset == lsp->ls_vp_size ||
	    offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if ((offset > lsp->ls_vp_size) ||
	    (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) ||
	    ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	mutex_exit(&lsp->ls_vp_lock);

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp->b_private = (void *)(uintptr_t)p_lba;	/* partition start */
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}

/*ARGSUSED2*/
static int
lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
}

/*ARGSUSED2*/
static int
lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
}

/*ARGSUSED2*/
static int
lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
}

/*ARGSUSED2*/
static int
lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
}

/*ARGSUSED*/
static int
lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	struct lofi_state *lsp;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = LOFI_MINOR2ID(getminor(dev));
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp == NULL)
			return (DDI_FAILURE);
		*result = lsp->ls_dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *) (intptr_t)instance;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}

static int
lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled)
{
	int error = 0;
	int instance = ddi_get_instance(lsp->ls_dip);

	if (labeled == B_TRUE) {
		cmlb_alloc_handle(&lsp->ls_cmlbhandle);
		error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT,
		    B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN,
		    CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1);

		if (error != DDI_SUCCESS) {
			cmlb_free_handle(&lsp->ls_cmlbhandle);
			lsp->ls_cmlbhandle = NULL;
			error = ENXIO;
		}
	} else {
		/* create minor nodes */
		error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE,
		    S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0);
		if (error == DDI_SUCCESS) {
			error = ddi_create_minor_node(lsp->ls_dip,
			    LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance),
			    DDI_PSEUDO, 0);
			if (error != DDI_SUCCESS) {
				ddi_remove_minor_node(lsp->ls_dip,
				    LOFI_BLOCK_NODE);
				error = ENXIO;
			}
		} else
			error = ENXIO;
	}
	return (error);
}

static int
lofi_zone_bind(struct lofi_state *lsp)
{
	int error = 0;

	mutex_enter(&curproc->p_lock);
	if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) {
		mutex_exit(&curproc->p_lock);
		return (error);
	}
	mutex_exit(&curproc->p_lock);

	if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME,
	    (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) {
		rctl_decr_lofi(curproc->p_zone, 1);
		error = EINVAL;
	} else {
		zone_init_ref(&lsp->ls_zone);
		zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI);
	}
	return (error);
}

static void
lofi_zone_unbind(struct lofi_state *lsp)
{
	(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME);
	rctl_decr_lofi(curproc->p_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
}

static int
lofi_online_dev(dev_info_t *dip)
{
	boolean_t labeled;
	int	error;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	labeled = B_FALSE;
	if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
		labeled = B_TRUE;

	/* lsp alloc+init, soft state is freed in lofi_detach */
	error = ddi_soft_state_zalloc(lofi_statep, instance);
	if (error == DDI_FAILURE) {
		return (ENOMEM);
	}

	lsp = ddi_get_soft_state(lofi_statep, instance);
	lsp->ls_dip = dip;

	if ((error = lofi_zone_bind(lsp)) != 0)
		goto err;

	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

	if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) {
		lofi_zone_unbind(lsp);
		goto lerr;
	}

	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		error = DDI_FAILURE;
		goto merr;
	}

	lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance,
	    NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid());
	if (lsp->ls_kstat == NULL) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip,
		    DDI_KERNEL_IOCTL);
		error = ENOMEM;
		goto merr;
	}

	lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
	kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID);
	kstat_install(lsp->ls_kstat);
	return (DDI_SUCCESS);
merr:
	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
	}
	ddi_remove_minor_node(dip, NULL);
	lofi_zone_unbind(lsp);
lerr:
	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
err:
	ddi_soft_state_free(lofi_statep, instance);
	return (error);
}

static int
lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	rv;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/*
	 * Instance 0 is the control instance; attaching the control
	 * instance will set lofi up and ready.
	 */
	if (instance == 0) {
		rv = ddi_soft_state_zalloc(lofi_statep, 0);
		if (rv == DDI_FAILURE) {
			return (DDI_FAILURE);
		}
		lsp = ddi_get_soft_state(lofi_statep, instance);
		rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
		    DDI_PSEUDO, 0);
		if (rv == DDI_FAILURE) {
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}
		/* driver handles kernel-issued IOCTLs */
		if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
			ddi_remove_minor_node(dip, NULL);
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}

		zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL);

		lsp->ls_dip = dip;
	} else {
		if (lofi_online_dev(dip) == DDI_FAILURE)
			return (DDI_FAILURE);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct lofi_state *lsp;
	int instance = ddi_get_instance(dip);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * If the instance is not 0, release its state.
	 * Instance 0 is the control device; we cannot detach it
	 * before the other instances are detached.

static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct lofi_state *lsp;
	int instance = ddi_get_instance(dip);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * If the instance is not 0, release state.
	 * Instance 0 is the control device; it cannot be detached
	 * before all other instances have been detached.
	 */
	if (instance != 0) {
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) {
			ddi_soft_state_free(lofi_statep, instance);
			return (DDI_SUCCESS);
		} else
			return (DDI_FAILURE);
	}
	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (DDI_FAILURE);
	}

	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);

	mutex_exit(&lofi_lock);

	if (zone_key_delete(lofi_zone_key) != 0)
		cmn_err(CE_WARN, "failed to delete zone key");

	ddi_soft_state_free(lofi_statep, 0);

	return (DDI_SUCCESS);
}

/*
 * With the addition of encryption, we must be careful that the encryption
 * key is wiped before the kernel's data structures are freed so it cannot
 * accidentally slip out to userland through uninitialized data elsewhere.
 */
static void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	/* Make sure this encryption key doesn't stick around */
	bzero(klip->li_key, sizeof (klip->li_key));
	kmem_free(klip, sizeof (struct lofi_ioctl));
}

/*
 * These two functions simplify the rest of the ioctls that need to copyin/out
 * the lofi_ioctl structure.
 */
int
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
    int flag)
{
	struct lofi_ioctl *klip;
	int error;

	klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error)
		goto err;

	/* ensure NUL termination */
	klip->li_filename[MAXPATHLEN-1] = '\0';
	klip->li_devpath[MAXPATHLEN-1] = '\0';
	klip->li_algorithm[MAXALGLEN-1] = '\0';
	klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
	klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';

	if (klip->li_id > L_MAXMIN32) {
		error = EINVAL;
		goto err;
	}

	return (0);

err:
	free_lofi_ioctl(klip);
	return (error);
}

int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
    int flag)
{
	int error;

	/*
	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
	 * This ensures that an attacker can't trivially find the
	 * key for a mapping just by issuing the ioctl.
	 *
	 * It can still be found by poking around in kmem with mdb(1),
	 * but there is no point in making it easy when the info isn't
	 * of any use in this direction anyway.
	 *
	 * Either way we don't actually have the raw key stored in
	 * a form that we can get it anyway, since we just used it
	 * to create a ctx template and didn't keep "the original".
	 */
	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}
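
/*
 * For illustration, the userland half of this exchange looks roughly
 * like the following sketch (how lofiadm(1M) might drive it; not a
 * copy of the lofiadm code):
 *
 *	struct lofi_ioctl li;
 *
 *	bzero(&li, sizeof (li));
 *	(void) strlcpy(li.li_filename, "/export/disk.img",
 *	    sizeof (li.li_filename));
 *	fd = open("/dev/lofictl", O_RDWR);
 *	error = ioctl(fd, LOFI_MAP_FILE, &li);
 *
 * On success the driver copies the structure back out with the chosen
 * id filled in to li.li_id.
 */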

static int
lofi_access(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
		return (0);
	return (EPERM);
}

/*
 * Find the lofi state for the given filename. We compare by vnode to
 * allow the global zone visibility into NGZ lofi nodes.
 */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t *vp = NULL;
	int err = 0;
	int rdfiles = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		goto out;

	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}

/*
 * Find the minor for the given filename, checking that the zone can
 * access it.
 */
static int
file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
{
	int err = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
		return (err);

	if ((err = lofi_access(*lspp)) != 0)
		return (err);

	return (0);
}

/*
 * Fakes up a disk geometry based on the size of the file. This is needed
 * to support newfs on a traditional lofi device, but also provides a
 * geometry hint for cmlb.
 */
static void
fake_disk_geometry(struct lofi_state *lsp)
{
	u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;

	/* dk_geom - see dkio(7I) */
	/*
	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
	 * of sectors), but that breaks programs like fdisk which want to
	 * partition a disk by cylinder. With one cylinder, you can't create
	 * an fdisk partition and put pcfs on it for testing (hard to pick
	 * a number between one and one).
	 *
	 * The cheezy floppy test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
	bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg));
	if (dsize < (2 * 1024 * 1024)) /* floppy? */
		lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
	else
		lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
	/* in case the file is < 100k */
	if (lsp->ls_dkg.dkg_ncyl == 0)
		lsp->ls_dkg.dkg_ncyl = 1;

	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl;
	lsp->ls_dkg.dkg_nhead = 1;
	lsp->ls_dkg.dkg_rpm = 7200;

	lsp->ls_dkg.dkg_nsect = dsize /
	    (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift);
}
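
/*
 * Worked example (with DEV_BSIZE of 512, so ls_pbshift is 9): for a
 * 10MB file, dsize / (300 * 1024) gives dkg_ncyl = 34, dkg_nhead is 1,
 * and dkg_nsect = dsize / (34 << 9) = 602; the vtoc below then sizes
 * the partition as 34 * 602 * 1 = 20468 sectors, slightly less than
 * the 20480 sectors of backing store due to integer truncation.
 */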

/*
 * build vtoc - see dkio(7I)
 *
 * Fakes one big partition based on the size of the file. This is needed
 * because we allow newfs'ing the traditional lofi device and newfs will
 * do several disk ioctls to figure out the geometry and partition information.
 * It uses that information to determine the parameters to pass to mkfs.
 */
static void
fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt)
{
	bzero(vt, sizeof (struct vtoc));
	vt->v_sanity = VTOC_SANE;
	vt->v_version = V_VERSION;
	(void) strncpy(vt->v_volume, LOFI_DRIVER_NAME,
	    sizeof (vt->v_volume));
	vt->v_sectorsz = 1 << lsp->ls_pbshift;
	vt->v_nparts = 1;
	vt->v_part[0].p_tag = V_UNASSIGNED;

	/*
	 * A compressed file is read-only, other files can
	 * be read-write
	 */
	if (lsp->ls_uncomp_seg_sz > 0) {
		vt->v_part[0].p_flag = V_UNMNT | V_RONLY;
	} else {
		vt->v_part[0].p_flag = V_UNMNT;
	}
	vt->v_part[0].p_start = (daddr_t)0;
	/*
	 * The partition size cannot just be the number of sectors, because
	 * that might not end on a cylinder boundary. And if that's the case,
	 * newfs/mkfs will print a scary warning. So just figure the size
	 * based on the number of cylinders and sectors/cylinder.
	 */
	vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
}

/*
 * build dk_cinfo - see dkio(7I)
 */
static void
fake_disk_info(dev_t dev, struct dk_cinfo *ci)
{
	bzero(ci, sizeof (struct dk_cinfo));
	(void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname));
	ci->dki_ctype = DKC_SCSI_CCS;
	(void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname));
	ci->dki_unit = LOFI_MINOR2ID(getminor(dev));
	ci->dki_partition = LOFI_PART(getminor(dev));
	/*
	 * newfs uses this to set maxcontig. Must not be < 16, or it
	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
	 * it by the block size. Then tunefs doesn't work because
	 * maxcontig is 0.
	 */
	ci->dki_maxtransfer = 16;
}

/*
 * map in a compressed file
 *
 * Read in the header and the index that follows.
 *
 * The header is as follows -
 *
 * Signature (name of the compression algorithm)
 * Compression segment size (a multiple of 512)
 * Number of index entries
 * Size of the last block
 * The array containing the index entries
 *
 * The header information is always stored in
 * network byte order on disk.
 */
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t	resid;
	enum uio_rw rw;
	char *tbuf = buf;
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The compressed segment size must be a power of 2
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    /*LINTED*/
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}
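
/*
 * Example of the size computation above (illustrative numbers only):
 * with a 128K (131072 byte) segment size, a 10-entry index and a 4K
 * final segment, the uncompressed size is
 * (10 - 2) * 131072 + 4096 = 1052672 bytes.
 */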

static int
lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
{
	struct crypto_meta chead;
	char buf[DEV_BSIZE];
	ssize_t	resid;
	char *marker;
	int error;
	int ret;
	int i;

	if (!klip->li_crypto_enabled)
		return (0);

	/*
	 * All current algorithms have a max of 448 bits.
	 */
	if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
		return (EINVAL);

	if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
		return (EINVAL);

	lsp->ls_crypto_enabled = klip->li_crypto_enabled;

	mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);

	lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
	if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid cipher %s requested for %s",
		    klip->li_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* this is just initialization here */
	lsp->ls_mech.cm_param = NULL;
	lsp->ls_mech.cm_param_len = 0;

	lsp->ls_iv_type = klip->li_iv_type;
	lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
	if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid iv cipher %s requested"
		    " for %s", klip->li_iv_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* iv mech must itself take a null iv */
	lsp->ls_iv_mech.cm_param = NULL;
	lsp->ls_iv_mech.cm_param_len = 0;
	lsp->ls_iv_len = klip->li_iv_len;

	/*
	 * Create ctx using li_cipher & the raw li_key after checking
	 * that it isn't a weak key.
	 */
	lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
	lsp->ls_key.ck_length = klip->li_key_len;
	lsp->ls_key.ck_data = kmem_alloc(
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
	bcopy(klip->li_key, lsp->ls_key.ck_data,
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));

	ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "weak key check failed for cipher "
		    "%s on file %s (0x%x)", klip->li_cipher,
		    klip->li_filename, ret);
		return (EINVAL);
	}

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/*
	 * This is the case where the header in the lofi image is already
	 * initialized to indicate it is encrypted.
	 */
	if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
		/*
		 * The encryption header information is laid out this way:
		 *	6 bytes:	hex "CFLOFI"
		 *	2 bytes:	version = 0 ... for now
		 *	96 bytes:	reserved1 (not implemented yet)
		 *	4 bytes:	data_sector = 2 ... for now
		 *	more...		not implemented yet
		 */

		marker = buf;

		/* copy the magic */
		bcopy(marker, lsp->ls_crypto.magic,
		    sizeof (lsp->ls_crypto.magic));
		marker += sizeof (lsp->ls_crypto.magic);

		/* read the encryption version number */
		bcopy(marker, &(lsp->ls_crypto.version),
		    sizeof (lsp->ls_crypto.version));
		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
		marker += sizeof (lsp->ls_crypto.version);

		/* read a chunk of reserved data */
		bcopy(marker, lsp->ls_crypto.reserved1,
		    sizeof (lsp->ls_crypto.reserved1));
		marker += sizeof (lsp->ls_crypto.reserved1);

		/* read block number where encrypted data begins */
		bcopy(marker, &(lsp->ls_crypto.data_sector),
		    sizeof (lsp->ls_crypto.data_sector));
		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
		marker += sizeof (lsp->ls_crypto.data_sector);

		/* and ignore the rest until it is implemented */

		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
		return (0);
	}

	/*
	 * We've requested encryption, but no magic was found, so it must be
	 * a new image.
	 */

	for (i = 0; i < sizeof (struct crypto_meta); i++) {
		if (buf[i] != '\0')
			return (EINVAL);
	}

	marker = buf;
	bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
	marker += sizeof (lofi_crypto_magic);
	chead.version = htons(LOFI_CRYPTO_VERSION);
	bcopy(&(chead.version), marker, sizeof (chead.version));
	marker += sizeof (chead.version);
	marker += sizeof (chead.reserved1);
	chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
	bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));

	/* write the header */
	error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* fix things up so it looks like we read this info */
	bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
	    sizeof (lofi_crypto_magic));
	lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
	lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
	lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
	return (0);
}
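
/*
 * On-disk layout of the header handled above, with byte offsets from
 * the start of the header (multi-byte fields in network byte order):
 *
 *	offset	length	field
 *	0	6	magic "CFLOFI"
 *	6	2	version (currently 0)
 *	8	96	reserved1
 *	104	4	data_sector (currently 2)
 *
 * With data_sector = 2 and DEV_BSIZE of 512, ls_crypto_offset comes
 * out to 1024, i.e. user data starts at the third 512-byte sector.
 */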

/*
 * Check to see if the passed in signature is a valid one. If it is
 * valid, return the index into lofi_compress_table.
 *
 * Return -1 if it is invalid
 */
static int
lofi_compress_select(const char *signature)
{
	int i;

	for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
		if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
			return (i);
	}

	return (-1);
}
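
/*
 * The signature is just the algorithm name that lofiadm(1M) recorded
 * when the image was compressed, e.g. (assuming gzip is one of the
 * entries in lofi_compress_table):
 *
 *	lofiadm -C gzip /export/disk.img
 *	lofiadm -a /export/disk.img
 *
 * The first command rewrites the image with a "gzip" signature block
 * up front; the second maps it, and lofi_init_compress() below finds
 * the matching table entry.
 */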

static int
lofi_init_compress(struct lofi_state *lsp)
{
	char buf[DEV_BSIZE];
	int compress_index;
	ssize_t	resid;
	int error;

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	if ((compress_index = lofi_compress_select(buf)) == -1)
		return (0);

	/* compression and encryption are mutually exclusive */
	if (lsp->ls_crypto_enabled)
		return (ENOTSUP);

	/* initialize compression info for compressed lofi */
	lsp->ls_comp_algorithm_index = compress_index;
	(void) strlcpy(lsp->ls_comp_algorithm,
	    lofi_compress_table[compress_index].l_name,
	    sizeof (lsp->ls_comp_algorithm));

	/* Finally setup per-thread pre-allocated buffers */
	lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
	    sizeof (struct compbuf), KM_SLEEP);

	return (lofi_map_compressed_file(lsp, buf));
}

/*
 * Allocate a new or proposed id from lofi_id.
 *
 * Special cases for the proposed id:
 * 0: not allowed, 0 is the id of the control device.
 * -1: allocate the first usable id from lofi_id.
 * Any other value is a proposed value from userland.
 *
 * Returns DDI_SUCCESS or errno.
 */
static int
lofi_alloc_id(int *idp)
{
	int id, error = DDI_SUCCESS;

	if (*idp == -1) {
		id = id_allocff_nosleep(lofi_id);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	} else if (*idp == 0) {
		error = EINVAL;
		goto err;
	} else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
		error = ERANGE;
		goto err;
	} else {
		if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
			error = EEXIST;
			goto err;
		}

		id = id_alloc_specific_nosleep(lofi_id, *idp);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	}
	*idp = id;
err:
	return (error);
}
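
/*
 * Example: a plain "lofiadm -a /export/disk.img" reaches this function
 * with a proposed id of -1 and is given the first free id, while
 * "lofiadm -a /export/disk.img /dev/lofi/5" proposes id 5 explicitly
 * and fails with EEXIST or EAGAIN if that id is already taken.
 */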

static int
lofi_create_dev(struct lofi_ioctl *klip)
{
	dev_info_t *parent, *child;
	struct lofi_state *lsp = NULL;
	char namebuf[MAXNAMELEN];
	int error, circ;

	/* get control device */
	lsp = ddi_get_soft_state(lofi_statep, 0);
	parent = ddi_get_parent(lsp->ls_dip);

	if ((error = lofi_alloc_id((int *)&klip->li_id)))
		return (error);

	(void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
	    klip->li_id);

	ndi_devi_enter(parent, &circ);
	child = ndi_devi_findchild(parent, namebuf);
	ndi_devi_exit(parent, circ);

	if (child == NULL) {
		child = ddi_add_child(parent, LOFI_DRIVER_NAME,
		    (pnode_t)DEVI_SID_NODEID, klip->li_id);
		if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
		    "instance", klip->li_id)) != DDI_PROP_SUCCESS)
			goto err;

		if (klip->li_labeled == B_TRUE) {
			if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
			    DDI_PROP_CANSLEEP, "labeled", 0, 0))
			    != DDI_PROP_SUCCESS)
				goto err;
		}

		if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
		    != NDI_SUCCESS)
			goto err;
	} else {
		id_free(lofi_id, klip->li_id);
		error = EEXIST;
		return (error);
	}

	goto done;

err:
	ddi_prop_remove_all(child);
	(void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
	id_free(lofi_id, klip->li_id);
done:

	return (error);
}

static void
lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
{
	char *p = NULL;

	(void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp != NULL)
		p = strrchr(lsp->ls_vp->v_path, '/');
	if (p != NULL)
		(void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
	mutex_exit(&lsp->ls_vp_lock);
	(void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
}

/*
 * copy devlink name from event cache
 */
static void
lofi_copy_devpath(struct lofi_ioctl *klip)
{
	int	error;
	char	namebuf[MAXNAMELEN], *str;
	clock_t ticks;
	nvlist_t *nvl = NULL;

	if (klip->li_labeled == B_TRUE)
		klip->li_devpath[0] = '\0';
	else {
		/* no need to wait for messages */
		(void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
		    "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
		return;
	}

	(void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);
	ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);

	mutex_enter(&lofi_devlink_cache.ln_lock);
	error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, namebuf, &nvl);
	while (error != 0) {
		error = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (error == -1)
			break;
		error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    namebuf, &nvl);
	}

	if (nvl != NULL) {
		if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) {
			(void) strlcpy(klip->li_devpath, str,
			    sizeof (klip->li_devpath));
		}
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);
}
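
/*
 * Example results (assuming LOFI_CHAR_NAME is "rlofi"): an unlabeled
 * mapping with id 1 gets its path composed directly as /dev/rlofi/1,
 * with no need to wait; a labeled mapping has its /dev/dsk and
 * /dev/rdsk links created asynchronously by devfsadm, so the name is
 * picked up from the devlink event cache once the link appears.
 */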

/*
 * map a file to a minor number. Return the minor number.
 */
static int
lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
    int *rvalp, struct cred *credp, int ioctl_flag)
{
	int	id = -1;
	struct lofi_state *lsp = NULL;
	struct lofi_ioctl *klip;
	int	error;
	struct vnode *vp = NULL;
	vattr_t	vattr;
	int	flag;
	char	namebuf[MAXNAMELEN];

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	mutex_enter(&lofi_lock);

	if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
	    NULL) == 0) {
		error = EBUSY;
		goto err;
	}

	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
	if (error) {
		/* try read-only */
		flag &= ~FWRITE;
		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
		    &vp, 0, 0);
		if (error)
			goto err;
	}

	if (!V_ISLOFIABLE(vp->v_type)) {
		error = EINVAL;
		goto err;
	}

	vattr.va_mask = AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
	if (error)
		goto err;

	/* the file needs to be a multiple of the block size */
	if ((vattr.va_size % DEV_BSIZE) != 0) {
		error = EINVAL;
		goto err;
	}

	if (pickminor) {
		klip->li_id = (uint32_t)-1;
	}
	if ((error = lofi_create_dev(klip)) != 0)
		goto err;

	id = klip->li_id;
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL)
		goto err;

	/*
	 * From this point on, lofi_destroy() is used to clean up on error,
	 * so make sure the basic data is set.
	 */
	lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id));

	list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
	    offsetof(struct lofi_comp_cache, lc_list));

	/*
	 * save open mode so file can be closed properly and vnode counts
	 * updated correctly.
	 */
	lsp->ls_openflag = flag;

	lsp->ls_vp = vp;
	lsp->ls_stacked_vp = vp;

	lsp->ls_vp_size = vattr.va_size;
	lsp->ls_vp_comp_size = lsp->ls_vp_size;

	/*
	 * Try to handle stacked lofs vnodes.
	 */
	if (vp->v_type == VREG) {
		vnode_t *realvp;

		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			/*
			 * We need to use the realvp for uniqueness
			 * checking, but keep the stacked vp for
			 * LOFI_GET_FILENAME display.
			 */
			VN_HOLD(realvp);
			lsp->ls_vp = realvp;
		}
	}

	lsp->ls_lbshift = highbit(DEV_BSIZE) - 1;
	lsp->ls_pbshift = lsp->ls_lbshift;

	lsp->ls_readonly = klip->li_readonly;
	lsp->ls_uncomp_seg_sz = 0;
	lsp->ls_comp_algorithm[0] = '\0';
	lsp->ls_crypto_offset = 0;

	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
	    LOFI_DRIVER_NAME, id);
	lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads,
	    minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0);

	if ((error = lofi_init_crypto(lsp, klip)) != 0)
		goto err;

	if ((error = lofi_init_compress(lsp)) != 0)
		goto err;

	fake_disk_geometry(lsp);

	/* For unlabeled lofi add Nblocks and Size */
	if (klip->li_labeled == B_FALSE) {
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    NBLOCKS_PROP_NAME,
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
	}

	list_insert_tail(&lofi_list, lsp);
	/*
	 * Notify we are ready to rock.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_ready = B_TRUE;
	cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);
	mutex_exit(&lofi_lock);

	lofi_copy_devpath(klip);

	if (rvalp)
		*rvalp = id;
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);

err:
	if (lsp != NULL) {
		lofi_destroy(lsp, credp);
	} else {
		if (vp != NULL) {
			(void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL);
			(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
			VN_RELE(vp);
		}
	}

	mutex_exit(&lofi_lock);
	free_lofi_ioctl(klip);
	return (error);
}
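
/*
 * Note: the pickminor argument distinguishes LOFI_MAP_FILE, where the
 * driver picks the id (li_id is forced to -1 above so lofi_alloc_id()
 * takes the first free one), from LOFI_MAP_FILE_MINOR, where userland
 * proposes a specific id that lofi_alloc_id() validates.
 */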

/*
 * unmap a file.
 */
static int
lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_state *lsp;
	struct lofi_ioctl *klip;
	nvlist_t *nvl = NULL;
	clock_t ticks;
	char name[MAXNAMELEN];
	int err;

	err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (err != 0)
		return (err);

	mutex_enter(&lofi_lock);
	if (byfilename) {
		if ((err = file_to_lofi(klip->li_filename, klip->li_readonly,
		    &lsp)) != 0) {
			mutex_exit(&lofi_lock);
			return (err);
		}
	} else if (klip->li_id == 0) {
		mutex_exit(&lofi_lock);
		free_lofi_ioctl(klip);
		return (ENXIO);
	} else {
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
	}

	if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) {
		mutex_exit(&lofi_lock);
		free_lofi_ioctl(klip);
		return (ENXIO);
	}

	klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

	/*
	 * If it's still held open, we'll do one of three things:
	 *
	 * If no flag is set, just return EBUSY.
	 *
	 * If the 'cleanup' flag is set, unmap and remove the device when
	 * the last user finishes.
	 *
	 * If the 'force' flag is set, then we forcibly close the underlying
	 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl
	 * will return DKIO_DEV_GONE. When the device is last closed, the
	 * device will be cleaned up appropriately.
	 *
	 * This is complicated by the fact that we may have outstanding
	 * dispatched I/Os. Rather than having a single mutex to serialize all
	 * I/O, we keep a count of the number of outstanding I/O requests
	 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
	 * should be dispatched (ls_vp_closereq).
	 *
	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
	 * and then close the underlying vnode.
	 */
	if (is_opened(lsp)) {
		if (klip->li_force) {
			mutex_enter(&lsp->ls_vp_lock);
			lsp->ls_vp_closereq = B_TRUE;
			/* wake up any threads waiting on dkiocstate */
			cv_broadcast(&lsp->ls_vp_cv);
			while (lsp->ls_vp_iocount > 0)
				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
			mutex_exit(&lsp->ls_vp_lock);

			goto out;
		} else if (klip->li_cleanup) {
			lsp->ls_cleanup = 1;
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (0);
		}

		mutex_exit(&lofi_lock);
		free_lofi_ioctl(klip);
		return (EBUSY);
	}

out:
	lofi_free_dev(lsp);
	lofi_destroy(lsp, credp);

	/*
	 * Check the lofi_devlink_cache to see if the device is really gone.
	 * Note: we just wait for the timeout here and do not return an
	 * error if the timer expires. This check tries to ensure the unmap
	 * is really done by the time lofiadm -d completes. Since lofi_lock
	 * is held, concurrent lofiadm -a calls should not be able to
	 * interfere with the unmap either.
	 */
	(void) snprintf(name, sizeof (name), "%d", klip->li_id);
	ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);
	mutex_enter(&lofi_devlink_cache.ln_lock);
	err = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, name, &nvl);
	while (err == 0) {
		err = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (err == -1)
			break;
		err = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    name, &nvl);
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);

	mutex_exit(&lofi_lock);
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);
}
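
/*
 * Example: a forced detach (lofiadm's force option sets li_force) on a
 * mapping that is still open takes the li_force path above: waiters on
 * DKIOCSTATE are woken, in-flight I/O is drained, the vnode is closed,
 * and subsequent I/O fails with EIO while a DKIOCSTATE caller observes
 * DKIO_DEV_GONE.
 */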

/*
 * get the filename given the minor number, or the minor number given
 * the name.
 */
/*ARGSUSED*/
static int
lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_ioctl *klip;
	struct lofi_state *lsp;
	int	error;

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	switch (which) {
	case LOFI_GET_FILENAME:
		if (klip->li_id == 0) {
			free_lofi_ioctl(klip);
			return (EINVAL);
		}

		mutex_enter(&lofi_lock);
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
		if (lsp == NULL || lofi_access(lsp) != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (ENXIO);
		}

		/*
		 * This may fail if, for example, we're trying to look
		 * up a zoned NFS path from the global zone.
		 */
		if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename,
		    sizeof (klip->li_filename), CRED()) != 0) {
			(void) strlcpy(klip->li_filename, "?",
			    sizeof (klip->li_filename));
		}

		klip->li_readonly = lsp->ls_readonly;
		klip->li_labeled = lsp->ls_cmlbhandle != NULL;

		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
		    sizeof (klip->li_algorithm));
		klip->li_crypto_enabled = lsp->ls_crypto_enabled;
		mutex_exit(&lofi_lock);

		lofi_copy_devpath(klip);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
		free_lofi_ioctl(klip);
		return (error);
	case LOFI_GET_MINOR:
		mutex_enter(&lofi_lock);
		error = file_to_lofi(klip->li_filename,
		    klip->li_readonly, &lsp);
		if (error != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (error);
		}
		klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

		klip->li_readonly = lsp->ls_readonly;
		klip->li_labeled = lsp->ls_cmlbhandle != NULL;
		mutex_exit(&lofi_lock);

		lofi_copy_devpath(klip);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);

		free_lofi_ioctl(klip);
		return (error);
	case LOFI_CHECK_COMPRESSED:
		mutex_enter(&lofi_lock);
		error = file_to_lofi(klip->li_filename,
		    klip->li_readonly, &lsp);
		if (error != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (error);
		}

		klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
		    sizeof (klip->li_algorithm));

		mutex_exit(&lofi_lock);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
		free_lofi_ioctl(klip);
		return (error);
	default:
		free_lofi_ioctl(klip);
		return (EINVAL);
	}
}

static int
uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb,
    struct uscsi_cmd *uscmd)
{
	/*
	 * Initialize rval so that the fall-through for non-INQUIRY
	 * commands below returns a defined (nonzero) value rather
	 * than garbage.
	 */
	int rval = EINVAL;

#ifdef	_MULTI_DATAMODEL
	switch (ddi_model_convert_from(flag & FMODELS)) {
	case DDI_MODEL_ILP32: {
		struct uscsi_cmd32 ucmd32;

		if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) {
			rval = EFAULT;
			goto err;
		}
		uscsi_cmd32touscsi_cmd((&ucmd32), uscmd);
		break;
	}
	case DDI_MODEL_NONE:
		if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
			rval = EFAULT;
			goto err;
		}
		break;
	default:
		rval = EFAULT;
		goto err;
	}
#else
	if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
		rval = EFAULT;
		goto err;
	}
#endif	/* _MULTI_DATAMODEL */
	if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) {
		rval = EFAULT;
		goto err;
	}
	if (cdb->scc_cmd == SCMD_INQUIRY) {
		return (0);
	}
err:
	return (rval);
}

static int
lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
    int *rvalp)
{
	int error;
	enum dkio_state dkstate;
	struct lofi_state *lsp;
	int id;

	id = LOFI_MINOR2ID(getminor(dev));

	/* lofi ioctls only apply to the master device */
	if (id == 0) {
		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;

		/*
		 * the query commands only need read access - i.e., normal
		 * users are allowed to do those on the ctl device as
		 * long as they can open it read-only.
		 */
		switch (cmd) {
		case LOFI_MAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
		case LOFI_MAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
		case LOFI_UNMAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 1, credp, flag));
		case LOFI_UNMAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 0, credp, flag));
		case LOFI_GET_FILENAME:
			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
			    credp, flag));
		case LOFI_GET_MINOR:
			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
			    credp, flag));

		/*
		 * This API made limited sense when this value was fixed
		 * at LOFI_MAX_FILES. However, its use to iterate
		 * across all possible devices in lofiadm means we don't
		 * want to return L_MAXMIN, but the highest
		 * *allocated* id.
		 */
		case LOFI_GET_MAXMINOR:
			id = 0;

			mutex_enter(&lofi_lock);

			for (lsp = list_head(&lofi_list); lsp != NULL;
			    lsp = list_next(&lofi_list, lsp)) {
				int i;
				if (lofi_access(lsp) != 0)
					continue;

				i = ddi_get_instance(lsp->ls_dip);
				if (i > id)
					id = i;
			}

			mutex_exit(&lofi_lock);

			error = ddi_copyout(&id, &lip->li_id,
			    sizeof (id), flag);
			if (error)
				return (EFAULT);
			return (0);

		case LOFI_CHECK_COMPRESSED:
			return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
			    credp, flag));
		default:
			return (EINVAL);
		}
	}

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL || lsp->ls_vp_closereq) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}
	mutex_exit(&lofi_lock);

	if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
	    "labeled") == 1) {
		error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
		    credp, rvalp, 0);
		if (error != ENOTTY)
			return (error);
	}

	/*
	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
	 * EIO as if the device was no longer present.
	 */
	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
		return (EIO);

	/* these are for faking out utilities like newfs */
	switch (cmd) {
	case DKIOCGMEDIAINFO:
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext media_info;
		int shift = lsp->ls_lbshift;
		int size;

		if (cmd == DKIOCGMEDIAINFOEXT) {
			media_info.dki_pbsize = 1U << lsp->ls_pbshift;
			size = sizeof (struct dk_minfo_ext);
		} else {
			size = sizeof (struct dk_minfo);
		}

		media_info.dki_media_type = DK_FIXED_DISK;
		media_info.dki_lbsize = 1U << shift;
		media_info.dki_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;

		if (ddi_copyout(&media_info, (void *)arg, size, flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i = 0;
		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
			return (EFAULT);
		return (0);
	}

	case DKIOCGVTOC: {
		struct vtoc vt;
		fake_disk_vtoc(lsp, &vt);

		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct vtoc32 vtoc32;

			vtoctovtoc32(vt, vtoc32);
			if (ddi_copyout(&vtoc32, (void *)arg,
			    sizeof (struct vtoc32), flag))
				return (EFAULT);
			break;
		}

		case DDI_MODEL_NONE:
			if (ddi_copyout(&vt, (void *)arg,
			    sizeof (struct vtoc), flag))
				return (EFAULT);
			break;
		}
		return (0);
	}
	case DKIOCINFO: {
		struct dk_cinfo ci;
		fake_disk_info(dev, &ci);
		if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCG_VIRTGEOM:
	case DKIOCG_PHYGEOM:
	case DKIOCGGEOM:
		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
		    sizeof (struct dk_geom), flag);
		if (error)
			return (EFAULT);
		return (0);
	case DKIOCSTATE:
		/*
		 * Normally, lofi devices are always in the INSERTED state. If
		 * a device is forcefully unmapped, then the device transitions
		 * to the DKIO_DEV_GONE state.
		 */
		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
		    flag) != 0)
			return (EFAULT);

		mutex_enter(&lsp->ls_vp_lock);
		lsp->ls_vp_iocount++;
		while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
		    !lsp->ls_vp_closereq) {
			/*
			 * By virtue of having the device open, we know that
			 * 'lsp' will remain valid when we return.
			 */
			if (!cv_wait_sig(&lsp->ls_vp_cv,
			    &lsp->ls_vp_lock)) {
				lsp->ls_vp_iocount--;
				cv_broadcast(&lsp->ls_vp_cv);
				mutex_exit(&lsp->ls_vp_lock);
				return (EINTR);
			}
		}

		dkstate = (!lsp->ls_vp_closereq && lsp->ls_vp != NULL ?
		    DKIO_INSERTED : DKIO_DEV_GONE);
		lsp->ls_vp_iocount--;
		cv_broadcast(&lsp->ls_vp_cv);
		mutex_exit(&lsp->ls_vp_lock);

		if (ddi_copyout(&dkstate, (void *)arg,
		    sizeof (dkstate), flag) != 0)
			return (EFAULT);
		return (0);
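
	/*
	 * Note the polling contract implemented above: the caller passes
	 * in the state it last saw and DKIOCSTATE blocks for as long as
	 * the device is still in that state. For example, a caller that
	 * last saw DKIO_INSERTED sleeps until a forced detach clears
	 * ls_vp, then returns DKIO_DEV_GONE; this is how utilities watch
	 * for simulated hotplug events on lofi.
	 */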
	case USCSICMD: {
		struct uscsi_cmd uscmd;
		union scsi_cdb cdb;

		if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
			struct scsi_inquiry inq = {0};

			lofi_create_inquiry(lsp, &inq);
			if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		} else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
			struct scsi_capacity capacity;

			capacity.capacity =
			    BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
			    lsp->ls_lbshift);
			capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
			if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		}

		uscmd.uscsi_rqstatus = 0xff;
#ifdef	_MULTI_DATAMODEL
		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct uscsi_cmd32 ucmd32;
			uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
			if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
			    flag) != 0)
				return (EFAULT);
			break;
		}
		case DDI_MODEL_NONE:
			if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
			    flag) != 0)
				return (EFAULT);
			break;
		default:
			return (EFAULT);
		}
#else
		if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
			return (EFAULT);
#endif	/* _MULTI_DATAMODEL */
		return (0);
	}
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
#endif	/* DEBUG */
		return (ENOTTY);
	}
}

static int
lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	struct lofi_state *lsp;
	int rc;

	lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
	if (lsp == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
	if (rc == DDI_PROP_SUCCESS)
		return (rc);

	return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
	    name, valuep, lengthp));
}

static struct cb_ops lofi_cb_ops = {
	lofi_open,		/* open */
	lofi_close,		/* close */
	lofi_strategy,		/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	lofi_read,		/* read */
	lofi_write,		/* write */
	lofi_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	lofi_prop_op,		/* prop_op */
	0,			/* streamtab */
	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,
	lofi_aread,
	lofi_awrite
};

static struct dev_ops lofi_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	lofi_info,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	lofi_attach,		/* attach */
	lofi_detach,		/* detach */
	nodev,			/* reset */
	&lofi_cb_ops,		/* driver operations */
	NULL,			/* no bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
"loopback file driver", 3491 &lofi_ops, 3492 }; 3493 3494 static struct modlinkage modlinkage = { 3495 MODREV_1, 3496 &modldrv, 3497 NULL 3498 }; 3499 3500 int 3501 _init(void) 3502 { 3503 int error; 3504 3505 list_create(&lofi_list, sizeof (struct lofi_state), 3506 offsetof(struct lofi_state, ls_list)); 3507 3508 error = ddi_soft_state_init((void **)&lofi_statep, 3509 sizeof (struct lofi_state), 0); 3510 if (error) { 3511 list_destroy(&lofi_list); 3512 return (error); 3513 } 3514 3515 /* 3516 * The minor number is stored as id << LOFI_CMLB_SHIFT as 3517 * we need to reserve space for cmlb minor numbers. 3518 * This will leave out 4096 id values on 32bit kernel, which should 3519 * still suffice. 3520 */ 3521 lofi_id = id_space_create("lofi_id", 1, 3522 (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT))); 3523 3524 if (lofi_id == NULL) { 3525 ddi_soft_state_fini((void **)&lofi_statep); 3526 list_destroy(&lofi_list); 3527 return (DDI_FAILURE); 3528 } 3529 3530 mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL); 3531 3532 error = mod_install(&modlinkage); 3533 3534 if (error) { 3535 id_space_destroy(lofi_id); 3536 mutex_destroy(&lofi_lock); 3537 ddi_soft_state_fini((void **)&lofi_statep); 3538 list_destroy(&lofi_list); 3539 } 3540 3541 return (error); 3542 } 3543 3544 int 3545 _fini(void) 3546 { 3547 int error; 3548 3549 mutex_enter(&lofi_lock); 3550 3551 if (!list_is_empty(&lofi_list)) { 3552 mutex_exit(&lofi_lock); 3553 return (EBUSY); 3554 } 3555 3556 mutex_exit(&lofi_lock); 3557 3558 error = mod_remove(&modlinkage); 3559 if (error) 3560 return (error); 3561 3562 mutex_destroy(&lofi_lock); 3563 id_space_destroy(lofi_id); 3564 ddi_soft_state_fini((void **)&lofi_statep); 3565 list_destroy(&lofi_list); 3566 3567 return (error); 3568 } 3569 3570 int 3571 _info(struct modinfo *modinfop) 3572 { 3573 return (mod_info(&modlinkage, modinfop)); 3574 } 3575