/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016 Andrey Sokolov
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2021 Toomas Soome <tsoome@me.com>
 */

/*
 * lofi (loopback file) driver - allows you to attach a file to a device,
 * which can then be accessed through that device. The simple model is that
 * you tell lofi to open a file, and then use the block device you get as
 * you would any block device. lofi translates access to the block device
 * into I/O on the underlying file. This is mostly useful for
 * mounting images of filesystems.
 *
 * lofi is controlled through /dev/lofictl - this is the only device exported
 * during attach, and is instance number 0. lofiadm communicates with lofi
 * through ioctls on this device. When a file is attached to lofi, block and
 * character devices are exported in /dev/lofi and /dev/rlofi. These devices
 * are identified by lofi instance number, and the instance number is also used
 * as the name in /dev/lofi.
 *
 * Virtual disks, or labeled lofi, implement virtual disk support for
 * partition tables and related tools. Such mappings cause block and
 * character devices to be exported in the /dev/dsk and /dev/rdsk
 * directories.
 *
 * To support virtual disks, the instance number space is divided into two
 * parts: the upper part for the instance number and the lower part for the
 * minor number space used to identify partitions and slices. The virtual
 * disk support is implemented by stacking the cmlb module. For virtual
 * disks, the partition related ioctl calls are routed to the cmlb module.
 * Compression and encryption are not supported for virtual disks.
 *
 * Mapped devices are tracked with state structures handled with
 * ddi_soft_state(9F) for simplicity.
 *
 * A file attached to lofi is opened when attached and not closed until
 * explicitly detached from lofi. This seems more sensible than deferring
 * the open until the /dev/lofi device is opened, for a number of reasons.
 * One is that any failure is likely to be noticed by the person (or script)
 * running lofiadm. Another is that it would be a security problem if the
 * file was replaced by another one after being added but before being opened.
 *
 * The only hard part about lofi is the ioctls. In order to support things
 * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
 * So it has to fake disk geometry and partition information.
 * More may need to be faked if your favorite utility doesn't work and
 * you think it should (fdformat doesn't work because it really wants to
 * know the type of floppy controller to talk to, and that didn't seem
 * easy to fake. Or possibly even necessary, since we have mkfs_pcfs now).
 *
 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To
 * support simulation of hotplug events, an optional force flag is provided.
 * If a lofi device is open when a force detach is requested, then the
 * underlying file is closed and any subsequent operations return EIO. When the
 * device is closed for the last time, it will be cleaned up at that time. In
 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
 * detached but not removed.
 *
 * If detach is requested and the lofi device is not open, we perform the
 * unmap and remove the lofi instance.
 *
 * If the lofi device is open and li_cleanup is set on the ioctl request,
 * we set the ls_cleanup flag to note that cleanup has been requested, and
 * the last lofi_close will perform the unmapping and this lofi instance
 * will be removed.
 *
 * If the lofi device is open and li_force is set on the ioctl request,
 * we set the ls_cleanup flag to note that cleanup has been requested.
 * We also set ls_vp_closereq to make IO tasks return EIO on new IO
 * requests, and we wait for the in-process IO count to become 0,
 * indicating that there are no more IO requests. Since ls_cleanup is set,
 * the last lofi_close will perform the unmap and this lofi instance will
 * be removed.
 * See also lofi_unmap_file() for details.
 *
 * Once ls_cleanup is set for the instance, we do not allow lofi_open()
 * calls to succeed, and we let the last lofi_close() remove the instance.
 *
 * Known problems:
 *
 *	UFS logging. Mounting a UFS filesystem image "logging"
 *	works for basic copy testing but wedges during a build of ON through
 *	that image. Some deadlock in lufs holding the log mutex and then
 *	getting stuck on a buf. So for now, don't do that.
 *
 *	Direct I/O. Since the filesystem data is being cached in the buffer
 *	cache, _and_ again in the underlying filesystem, it's tempting to
 *	enable direct I/O on the underlying file. Don't, because that
 *	deadlocks. I think to fix the cache-twice problem we might need
 *	filesystem support.
 *
 * Interesting things to do:
 *
 *	Allow multiple files for each device. A poor-man's metadisk,
 *	basically.
 *
 *	Pass-through ioctls on block devices. You can (though it's not
 *	documented) give lofi a block device as a file name. Then we
 *	shouldn't need to fake a geometry. However, it may be relevant if
 *	you're replacing metadisk, or using lofi to get crypto.
 *	It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
 *	and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
 *	In fact this even makes sense if you have lofi "above" metadisk.
 *
 * Encryption:
 *	Each lofi device can have its own symmetric key and cipher.
 *	They are passed to us by lofiadm(8) in the correct format for use
 *	with the misc/kcf crypto_* routines.
 *
 *	Each block has its own IV, which is calculated in lofi_blk_mech(),
 *	based on the "master" key held in the lsp and the block number of
 *	the buffer.
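 *
 *	As a sketch of the scheme with the IVM_ENC_BLKNO method: the block
 *	number is zero-padded on the left to the IV length and encrypted
 *	with the master key, and the resulting ciphertext is used as that
 *	block's IV (see lofi_blk_mech() below for the authoritative
 *	implementation).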
 */

#include <sys/types.h>
#include <netinet/in.h>
#include <sys/sysmacros.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/cred.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/aio_req.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/lofi.h>
#include <sys/lofi_impl.h>	/* for cache structure */
#include <sys/fcntl.h>
#include <sys/pathname.h>
#include <sys/filio.h>
#include <sys/fdio.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <vm/seg_map.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/zmod.h>
#include <sys/id_space.h>
#include <sys/mkdev.h>
#include <sys/crypto/common.h>
#include <sys/crypto/api.h>
#include <sys/rctl.h>
#include <sys/vtoc.h>
#include <sys/scsi/scsi.h>	/* for DTYPE_DIRECT */
#include <sys/scsi/impl/uscsi.h>
#include <sys/sysevent/dev.h>
#include <sys/efi_partition.h>
#include <sys/note.h>
#include <LzmaDec.h>

#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
#define	ZONE_PROP_NAME		"zone"

#define	SETUP_C_DATA(cd, buf, len)		\
	(cd).cd_format = CRYPTO_DATA_RAW;	\
	(cd).cd_offset = 0;			\
	(cd).cd_miscdata = NULL;		\
	(cd).cd_length = (len);			\
	(cd).cd_raw.iov_base = (buf);		\
	(cd).cd_raw.iov_len = (len);

#define	UIO_CHECK(uio)	\
	if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
	    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
		return (EINVAL); \
	}

#define	LOFI_TIMEOUT	120

int lofi_timeout = LOFI_TIMEOUT;
static void *lofi_statep;
static kmutex_t lofi_lock;		/* state lock */
static id_space_t *lofi_id;		/* lofi ID values */
static list_t lofi_list;
static zone_key_t lofi_zone_key;

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high. If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size. We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;

/*
 * To avoid decompressing data in a compressed segment multiple times
 * when accessing small parts of a segment's data, we cache and reuse
 * the uncompressed segment's data.
 *
 * A single cached segment is sufficient to avoid lots of duplicate
 * segment decompress operations. A small cache size also reduces the
 * memory footprint.
 *
 * lofi_max_comp_cache is the maximum number of decompressed data segments
 * cached for each compressed lofi image. It can be set to 0 to disable
 * caching.
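 *
 * As a sketch, assuming the stock module name "lofi", the tunable could
 * be set in /etc/system with:
 *
 *	set lofi:lofi_max_comp_cache = 4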
 */

uint32_t lofi_max_comp_cache = 1;

static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level);

lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};

static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
    size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);

struct cmlb_tg_ops lofi_tg_ops = {
	TG_DK_OPS_VERSION_1,
	lofi_tg_rdwr,
	lofi_tg_getinfo
};

/*ARGSUSED*/
static void
*SzAlloc(void *p, size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));
}

/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	kmem_free(address, size);
}

static ISzAlloc g_Alloc = { SzAlloc, SzFree };

/*
 * Free data referenced by the linked list of cached uncompressed
 * segments.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *lc;

	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}
	ASSERT(lsp->ls_comp_cache_count == 0);
}

static int
is_opened(struct lofi_state *lsp)
{
	int i;
	boolean_t last = B_TRUE;

	ASSERT(MUTEX_HELD(&lofi_lock));
	for (i = 0; i < LOFI_PART_MAX; i++) {
		if (lsp->ls_open_lyr[i]) {
			last = B_FALSE;
			break;
		}
	}

	for (i = 0; last && (i < OTYP_LYR); i++) {
		if (lsp->ls_open_reg[i]) {
			last = B_FALSE;
		}
	}

	return (!last);
}

static void
lofi_set_cleanup(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	lsp->ls_cleanup = B_TRUE;

	/* wake up any threads waiting on dkiocstate */
	cv_broadcast(&lsp->ls_vp_cv);
}

static void
lofi_free_crypto(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_crypto_enabled) {
		/*
		 * Clean up the crypto state so that it doesn't hang around
		 * in memory after we are done with it.
		 */
		if (lsp->ls_key.ck_data != NULL) {
			bzero(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			kmem_free(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			lsp->ls_key.ck_data = NULL;
			lsp->ls_key.ck_length = 0;
		}

		if (lsp->ls_mech.cm_param != NULL) {
			kmem_free(lsp->ls_mech.cm_param,
			    lsp->ls_mech.cm_param_len);
			lsp->ls_mech.cm_param = NULL;
			lsp->ls_mech.cm_param_len = 0;
		}

		if (lsp->ls_iv_mech.cm_param != NULL) {
			kmem_free(lsp->ls_iv_mech.cm_param,
			    lsp->ls_iv_mech.cm_param_len);
			lsp->ls_iv_mech.cm_param = NULL;
			lsp->ls_iv_mech.cm_param_len = 0;
		}

		mutex_destroy(&lsp->ls_crypto_lock);
	}
}

/* ARGSUSED */
static int
lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	struct lofi_state *lsp;
	buf_t	*bp;
	int	instance;
	int	rv = 0;

	instance = ddi_get_instance(dip);
	if (instance == 0)	/* control node does not have disk */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	if (cmd != TG_READ && cmd != TG_WRITE)
		return (EINVAL);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
		/* We can only transfer whole blocks at a time! */
		return (EINVAL);
	}

	bp = getrbuf(KM_SLEEP);

	if (cmd == TG_READ) {
		bp->b_flags = B_READ;
	} else {
		if (lsp->ls_readonly == B_TRUE) {
			freerbuf(bp);
			return (EROFS);
		}
		bp->b_flags = B_WRITE;
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	bp->b_lblkno = start;
	bp->b_private = NULL;
	bp->b_edev = lsp->ls_dev;

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	(void) biowait(bp);

	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}

/*
 * Get device geometry info for cmlb.
 *
 * We have mapped the disk image as a virtual block device and have to
 * report the physical/virtual geometry to cmlb.
 *
 * So we have two principal cases:
 * 1. Uninitialised image without any existing labels;
 *    for this case we fabricate the data based on the mapped image.
 * 2. Image with existing label information.
 *    Since we have no information on how the image was created (it may be
 *    a dump from some physical device), we need to rely on the label
 *    information from the image, or we get "corrupted label" errors.
 *    NOTE: the label can be MBR, MBR+SMI, GPT
 */
static int
lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	struct lofi_state *lsp;
	int instance;
	int ashift;

	_NOTE(ARGUNUSED(tg_cookie));
	instance = ddi_get_instance(dip);
	if (instance == 0)		/* control device has no storage */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 *
	 * When a mapping is created, a new lofi instance is created and
	 * lofi_attach() will call cmlb_attach() as part of the procedure
	 * to set the mapping up. This chain of events happens in
	 * the same thread.
	 * Since cmlb_attach() will call lofi_tg_getinfo to get the
	 * capacity, we return an error on that call if the cookie is set;
	 * otherwise lofi_attach would be stuck, as the mapping is not yet
	 * finalized and lofi is not yet ready.
	 * Note that such an error is not fatal for cmlb, as the label setup
	 * will be finalized when cmlb_validate() is called.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
		mutex_exit(&lsp->ls_vp_lock);
		return (ENXIO);
	}
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	ashift = lsp->ls_lbshift;

	switch (cmd) {
	case TG_GETPHYGEOM: {
		cmlb_geom_t *geomp = arg;

		geomp->g_capacity	=
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		geomp->g_nsect		= lsp->ls_dkg.dkg_nsect;
		geomp->g_nhead		= lsp->ls_dkg.dkg_nhead;
		geomp->g_acyl		= lsp->ls_dkg.dkg_acyl;
		geomp->g_ncyl		= lsp->ls_dkg.dkg_ncyl;
		geomp->g_secsize	= (1U << ashift);
		geomp->g_intrlv		= lsp->ls_dkg.dkg_intrlv;
		geomp->g_rpm		= lsp->ls_dkg.dkg_rpm;
		return (0);
	}

	case TG_GETCAPACITY:
		*(diskaddr_t *)arg =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << ashift);
		return (0);

	case TG_GETATTR: {
		tg_attribute_t *tgattr = arg;

		tgattr->media_is_writable = !lsp->ls_readonly;
		tgattr->media_is_solid_state = B_FALSE;
		tgattr->media_is_rotational = B_FALSE;
		return (0);
	}

	default:
		return (EINVAL);
	}
}

static void
lofi_teardown_task(void *arg)
{
	struct lofi_state *lsp = arg;
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

	mutex_enter(&lofi_lock);
	while (ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE) != NDI_SUCCESS) {
		mutex_exit(&lofi_lock);
		/* do a sleeping wait for one second */
		delay(drv_usectohz(MICROSEC));
		mutex_enter(&lofi_lock);
	}
	id_free(lofi_id, id);
	mutex_exit(&lofi_lock);
}

static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
	int i;

	ASSERT(MUTEX_HELD(&lofi_lock));

	/*
	 * Before we can start to release the other resources,
	 * make sure all tasks are completed and the taskq is removed.
	 */
	if (lsp->ls_taskq != NULL) {
		taskq_destroy(lsp->ls_taskq);
		lsp->ls_taskq = NULL;
	}

	list_remove(&lofi_list, lsp);

	lofi_free_crypto(lsp);

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
	}

	if (lsp->ls_vp != NULL) {
		(void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL);
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
		VN_RELE(lsp->ls_vp);
	}
	if (lsp->ls_stacked_vp != lsp->ls_vp)
		VN_RELE(lsp->ls_stacked_vp);
	lsp->ls_vp = lsp->ls_stacked_vp = NULL;

	if (lsp->ls_kstat != NULL) {
		kstat_delete(lsp->ls_kstat);
		lsp->ls_kstat = NULL;
	}

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);

	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
	lsp->ls_vp_ready = B_FALSE;
	lsp->ls_vp_closereq = B_FALSE;

	ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
	/*
	 * Instance state is allocated in lofi_attach() and freed in
	 * lofi_detach(). A new instance is created when we create a new
	 * mapping. Instance removal is performed by the unmap ioctl on the
	 * lofi control instance (0).
	 *
	 * If the unmap is performed on an instance which is still in use,
	 * we either cancel the unmap with an error, or we can perform a
	 * delayed unmap by blocking all IO, waiting for the consumers to
	 * close their access to this instance, and completing the unmap
	 * once there are no more consumers.
	 *
	 * A delayed unmap will trigger instance removal on the last
	 * lofi_close(), but we cannot remove the device instance while it
	 * is still in use because lofi_close() is running.
	 * Spawn a task to complete the device instance offlining in a
	 * separate thread.
	 */
	(void) taskq_dispatch(system_taskq, lofi_teardown_task, lsp, KM_SLEEP);
}

static void
lofi_free_dev(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_invalidate(lsp->ls_cmlbhandle, 0);
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
		lsp->ls_cmlbhandle = NULL;
	}
	(void) ddi_prop_remove_all(lsp->ls_dip);
	ddi_remove_minor_node(lsp->ls_dip, NULL);
}

/*ARGSUSED*/
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg)
{
	struct lofi_state *lsp;
	struct lofi_state *next;

	mutex_enter(&lofi_lock);

	for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {

		/* lofi_destroy() frees lsp */
		next = list_next(&lofi_list, lsp);

		if (lsp->ls_zone.zref_zone->zone_id != zoneid)
			continue;

		/*
		 * No in-zone processes are running, but something has this
		 * open. It's either a global zone process, or a lofi
		 * mount.
		 * In either case, we set ls_cleanup so the last user
		 * destroys the device.
		 */
		if (is_opened(lsp)) {
			lofi_set_cleanup(lsp);
		} else {
			lofi_free_dev(lsp);
			lofi_destroy(lsp, kcred);
		}
	}

	mutex_exit(&lofi_lock);
}

/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	int id;
	minor_t	part;
	uint64_t mask;
	diskaddr_t nblks;
	diskaddr_t lba;
	boolean_t ndelay;

	struct lofi_state *lsp;

	if (otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	id = LOFI_MINOR2ID(getminor(*devp));
	part = LOFI_PART(getminor(*devp));
	mask = (1U << part);

	/* master control device */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (lsp->ls_cleanup == B_TRUE) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		mutex_exit(&lofi_lock);
		return (EROFS);
	}

	if ((lsp->ls_open_excl) & (mask)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	if (flag & FEXCL) {
		if (lsp->ls_open_lyr[part]) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (lsp->ls_open_reg[i] & mask) {
				mutex_exit(&lofi_lock);
				return (EBUSY);
			}
		}
	}

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
			/*
			 * non-blocking opens are allowed to succeed to
			 * support format and fdisk to create partitioning.
			 */
			if (!ndelay) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
		    NULL, NULL, 0) == 0) {
			if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (!ndelay) {
			mutex_exit(&lofi_lock);
			return (ENXIO);
		}
	}

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]++;
	} else {
		lsp->ls_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		lsp->ls_open_excl |= mask;
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t	part;
	int id;
	uint64_t mask;
	struct lofi_state *lsp;

	id = LOFI_MINOR2ID(getminor(dev));
	part = LOFI_PART(getminor(dev));
	mask = (1U << part);

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	if (lsp->ls_open_excl & mask)
		lsp->ls_open_excl &= ~mask;

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]--;
	} else {
		lsp->ls_open_reg[otyp] &= ~mask;
	}

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
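	 *
	 * The actual device teardown is deferred by lofi_destroy() to
	 * lofi_teardown_task(), since the instance cannot be offlined
	 * from within its own close routine.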
	 */
	if (!is_opened(lsp) &&
	    (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) {
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number. lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int	ret;
	crypto_data_t cdata;
	char	*iv;
	size_t	iv_len;
	size_t	min;
	void	*data;
	size_t	datasz;

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = 0;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}

/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time. "len" is assumed to be a multiple of
 * DEV_BSIZE.
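 *
 * A fresh IV is derived via lofi_blk_mech() for each DEV_BSIZE block,
 * so identical plaintext blocks at different block numbers do not
 * produce identical ciphertext.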
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt entire "len" chunk of data, we need
	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {	/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
	}

	do {
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			continue;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		}

		cdata.cd_offset += DEV_BSIZE;
		if (ciphertext != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;
	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}

#define	RDWR_RAW	1
#define	RDWR_BCOPY	2

static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t	resid;
	int	isread;
	int	error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note: offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
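	 *
	 * With RDWR_RAW the transfer goes through vn_rdwr() on the
	 * backing vnode; with RDWR_BCOPY the data is simply copied
	 * to/from bcopy_locn, a segmap window set up by
	 * lofi_mapped_rdwr().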
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled && error == 0) {
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}

static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note: offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |   len
	 *    v    v    v
	 * ===|====X========|====...======|========X====|====
	 *    /-------------...---------------/
	 *       ^ bp->b_bcount/bp->b_resid at start
	 *    /----/--------/----...------/--------/
	 *   ^     ^        ^            ^         ^
	 *   |     |        |            |         nth xfersize (<= MAXBSIZE)
	 *   |     |        2nd thru n-1st xfersize (= MAXBSIZE)
	 *   |     1st xfersize (<= MAXBSIZE)
	 *  mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary. "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}

/*
 * Check if segment seg_index is present in the decompressed segment
 * data cache.
 *
 * Returns a pointer to the decompressed segment data cache entry if
 * found, and NULL when decompressed data for this segment is not yet
 * cached.
 */
static struct lofi_comp_cache *
lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
	    lc = list_next(&lsp->ls_comp_cache, lc)) {
		if (lc->lc_index == seg_index) {
			/*
			 * Decompressed segment data was found in the
			 * cache.
			 *
			 * The cache uses an LRU replacement strategy;
			 * move the entry to head of list.
			 */
			list_remove(&lsp->ls_comp_cache, lc);
			list_insert_head(&lsp->ls_comp_cache, lc);
			return (lc);
		}
	}
	return (NULL);
}

/*
 * Add the data for a decompressed segment at segment index
 * seg_index to the cache of the decompressed segments.
 *
 * Returns a pointer to the cache element structure in case
 * the data was added to the cache; returns NULL when the data
 * wasn't cached.
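 *
 * On success the cache takes ownership of "data" and will free it;
 * the caller must not free it afterwards (lofi_strategy_task() NULLs
 * its uncompressed_seg pointer after a successful add).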
 */
static struct lofi_comp_cache *
lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
    uchar_t *data)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}

	/*
	 * Do not cache when disabled by tunable variable
	 */
	if (lofi_max_comp_cache == 0)
		return (NULL);

	/*
	 * When the cache has not yet reached the maximum allowed
	 * number of segments, allocate a new cache element.
	 * Otherwise the cache is full; reuse the last list element
	 * (LRU) for caching the decompressed segment data.
	 *
	 * The cache element for the new decompressed segment data is
	 * added to the head of the list.
	 */
	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
		lc->lc_data = NULL;
		list_insert_head(&lsp->ls_comp_cache, lc);
		lsp->ls_comp_cache_count++;
	} else {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		if (lc == NULL)
			return (NULL);
		list_insert_head(&lsp->ls_comp_cache, lc);
	}

	/*
	 * Free old uncompressed segment data when reusing a cache
	 * entry.
	 */
	if (lc->lc_data != NULL)
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);

	lc->lc_data = data;
	lc->lc_index = seg_index;
	return (lc);
}


/*ARGSUSED*/
static int
gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}

#define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)
/*ARGSUSED*/
static int
lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	size_t insizepure;
	void *actual_src;
	ELzmaStatus status;

	insizepure = srclen - LZMA_HEADER_SIZE;
	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);

	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
	    (const Byte *)actual_src, &insizepure,
	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
	    &g_Alloc) != SZ_OK) {
		return (-1);
	}
	return (0);
}

/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error;
	int syncflag = 0;
	struct lofi_state *lsp;
	offset_t offset;
	caddr_t	bufaddr;
	size_t	len;
	size_t	xfersize;
	boolean_t bufinited = B_FALSE;

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));

	if (lsp == NULL) {
		error = ENXIO;
		goto errout;
	}
	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_iocount++;
	mutex_exit(&lsp->ls_vp_lock);

	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private)
	    << lsp->ls_lbshift;	/* offset within file */
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}
	len = bp->b_bcount;
	bufinited = B_TRUE;

	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		error = EIO;
		goto errout;
	}

	/*
	 * If we're writing and the buffer was not B_ASYNC
	 * we'll follow up with a VOP_FSYNC() to force any
	 * asynchronous I/O to stable storage.
	 */
	if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
		syncflag = FSYNC;

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mappable or it's a character device or it's an encrypted lofi.
	 */
	if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
	    lsp->ls_crypto_enabled) {
		error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
		    NULL);
	} else if (lsp->ls_uncomp_seg_sz == 0) {
		error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
	} else {
		uchar_t *compressed_seg = NULL, *cmpbuf;
		uchar_t *uncompressed_seg = NULL;
		lofi_compress_info_t *li;
		size_t oblkcount;
		ulong_t seglen;
		uint64_t sblkno, eblkno, cmpbytes;
		uint64_t uncompressed_seg_index;
		struct lofi_comp_cache *lc;
		offset_t sblkoff, eblkoff;
		u_offset_t salign, ealign;
		u_offset_t sdiff;
		uint32_t comp_data_sz;
		uint64_t i;
		int j;

		/*
		 * From here on we're dealing primarily with compressed files
		 */
		ASSERT(!lsp->ls_crypto_enabled);

		/*
		 * Compressed files can only be read from and
		 * not written to
		 */
		if (!(bp->b_flags & B_READ)) {
			bp->b_resid = bp->b_bcount;
			error = EROFS;
			goto done;
		}

		ASSERT(lsp->ls_comp_algorithm_index >= 0);
		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
		/*
		 * Compute starting and ending compressed segment numbers
		 * We use only bitwise operations avoiding division and
		 * modulus because we enforce the compression segment size
		 * to a power of 2
		 */
		sblkno = offset >> lsp->ls_comp_seg_shift;
		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);

		/*
		 * Check the decompressed segment cache.
		 *
		 * The cache is used only when the requested data
		 * is within a segment. Requests that cross
		 * segment boundaries bypass the cache.
		 */
		if (sblkno == eblkno ||
		    (sblkno + 1 == eblkno && eblkoff == 0)) {
			/*
			 * Request doesn't cross a segment boundary,
			 * now check the cache.
			 */
			mutex_enter(&lsp->ls_comp_cache_lock);
			lc = lofi_find_comp_data(lsp, sblkno);
			if (lc != NULL) {
				/*
				 * We've found the decompressed segment
				 * data in the cache; reuse it.
				 */
				bcopy(lc->lc_data + sblkoff, bufaddr,
				    bp->b_bcount);
				mutex_exit(&lsp->ls_comp_cache_lock);
				bp->b_resid = 0;
				error = 0;
				goto done;
			}
			mutex_exit(&lsp->ls_comp_cache_lock);
		}

		/*
		 * Align start offset to block boundary for segmap
		 */
		salign = lsp->ls_comp_seg_index[sblkno];
		sdiff = salign & (DEV_BSIZE - 1);
		salign -= sdiff;
		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
			/*
			 * We're dealing with the last segment of
			 * the compressed file -- the size of this
			 * segment *may not* be the same as the
			 * segment size for the file
			 */
			eblkoff = (offset + bp->b_bcount) &
			    (lsp->ls_uncomp_last_seg_sz - 1);
			ealign = lsp->ls_vp_comp_size;
		} else {
			ealign = lsp->ls_comp_seg_index[eblkno + 1];
		}

		/*
		 * Preserve original request parameters
		 */
		oblkcount = bp->b_bcount;

		/*
		 * Assign the calculated parameters
		 */
		comp_data_sz = ealign - salign;
		bp->b_bcount = comp_data_sz;

		/*
		 * Buffers to hold compressed segments are pre-allocated
		 * on a per-thread basis. Find a pre-allocated buffer
		 * that is not currently in use and mark it for use.
		 */
		mutex_enter(&lsp->ls_comp_bufs_lock);
		for (j = 0; j < lofi_taskq_nthreads; j++) {
			if (lsp->ls_comp_bufs[j].inuse == 0) {
				lsp->ls_comp_bufs[j].inuse = 1;
				break;
			}
		}

		mutex_exit(&lsp->ls_comp_bufs_lock);
		ASSERT(j < lofi_taskq_nthreads);

		/*
		 * If the pre-allocated buffer size does not match
		 * the size of the I/O request, re-allocate it with
		 * the appropriate size
		 */
		if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
			if (lsp->ls_comp_bufs[j].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[j].buf,
				    lsp->ls_comp_bufs[j].bufsize);
			lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
			    KM_SLEEP);
			lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
		}
		compressed_seg = lsp->ls_comp_bufs[j].buf;

		/*
		 * Map in the calculated number of blocks
		 */
		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
		    bp, lsp);

		bp->b_bcount = oblkcount;
		bp->b_resid = oblkcount;
		if (error != 0)
			goto done;

		/*
		 * decompress compressed blocks start
		 */
		cmpbuf = compressed_seg + sdiff;
		for (i = sblkno; i <= eblkno; i++) {
			ASSERT(i < lsp->ls_comp_index_sz - 1);
			uchar_t *useg;

			/*
			 * The last segment is special in that it is
			 * most likely not going to be the same
			 * (uncompressed) size as the other segments.
			 */
			if (i == (lsp->ls_comp_index_sz - 2)) {
				seglen = lsp->ls_uncomp_last_seg_sz;
			} else {
				seglen = lsp->ls_uncomp_seg_sz;
			}

			/*
			 * Each of the segment index entries contains
			 * the starting block number for that segment.
			 * The number of compressed bytes in a segment
			 * is thus the difference between the starting
			 * block number of this segment and the starting
			 * block number of the next segment.
			 */
			cmpbytes = lsp->ls_comp_seg_index[i + 1] -
			    lsp->ls_comp_seg_index[i];

			/*
			 * The first byte in a compressed segment is a flag
			 * that indicates whether this segment is compressed
			 * at all.
			 *
			 * The variable 'useg' is used (instead of
			 * uncompressed_seg) in this loop to keep a
			 * reference to the uncompressed segment.
			 *
			 * N.B. If 'useg' is replaced with uncompressed_seg,
			 * it leads to memory leaks and heap corruption in
			 * corner cases where compressed segments lie
			 * adjacent to uncompressed segments.
			 */
			if (*cmpbuf == UNCOMPRESSED) {
				useg = cmpbuf + SEGHDR;
			} else {
				if (uncompressed_seg == NULL)
					uncompressed_seg =
					    kmem_alloc(lsp->ls_uncomp_seg_sz,
					    KM_SLEEP);
				useg = uncompressed_seg;
				uncompressed_seg_index = i;

				if (li->l_decompress((cmpbuf + SEGHDR),
				    (cmpbytes - SEGHDR), uncompressed_seg,
				    &seglen, li->l_level) != 0) {
					error = EIO;
					goto done;
				}
			}

			/*
			 * Determine how much uncompressed data we
			 * have to copy and copy it
			 */
			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
			if (i == eblkno)
				xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);

			bcopy((useg + sblkoff), bufaddr, xfersize);

			cmpbuf += cmpbytes;
			bufaddr += xfersize;
			bp->b_resid -= xfersize;
			sblkoff = 0;

			if (bp->b_resid == 0)
				break;
		} /* decompress compressed blocks ends */

		/*
		 * Skip to done if there is no uncompressed data to cache
		 */
		if (uncompressed_seg == NULL)
			goto done;

		/*
		 * Add the data for the last decompressed segment to
		 * the cache.
		 *
		 * In case the uncompressed segment data was added to (and
		 * is referenced by) the cache, make sure we don't free it
		 * here.
		 */
		mutex_enter(&lsp->ls_comp_cache_lock);
		if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
		    uncompressed_seg)) != NULL) {
			uncompressed_seg = NULL;
		}
		mutex_exit(&lsp->ls_comp_cache_lock);

done:
		if (compressed_seg != NULL) {
			mutex_enter(&lsp->ls_comp_bufs_lock);
			lsp->ls_comp_bufs[j].inuse = 0;
			mutex_exit(&lsp->ls_comp_bufs_lock);
		}
		if (uncompressed_seg != NULL)
			kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
	} /* end of handling compressed files */

	if ((error == 0) && (syncflag != 0))
		error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);

errout:
	if (bufinited && lsp->ls_kstat) {
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	mutex_enter(&lsp->ls_vp_lock);
	if (--lsp->ls_vp_iocount == 0)
		cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);

	bioerror(bp, error);
	biodone(bp);
}

static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t offset;
	minor_t part;
	diskaddr_t p_lba;
	diskaddr_t p_nblks;
	int shift;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. Task
	 * queues were incredibly easy, so they win.
	 */

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));
	part = LOFI_PART(getminor(bp->b_edev));

	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	/* Check if we are closing. */
	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		mutex_exit(&lsp->ls_vp_lock);
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}
	mutex_exit(&lsp->ls_vp_lock);

	shift = lsp->ls_lbshift;
	p_lba = 0;
	p_nblks = lsp->ls_vp_size >> shift;

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba,
		    NULL, NULL, 0)) {
			bioerror(bp, ENXIO);
			biodone(bp);
			return (0);
		}
	}

	/* start block past partition end? */
	if (bp->b_lblkno > p_nblks) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	offset = (bp->b_lblkno + p_lba) << shift;	/* offset within file */

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}

	/* make sure we will not pass the file or partition size */
	if (offset == lsp->ls_vp_size ||
	    offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if ((offset > lsp->ls_vp_size) ||
	    (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) ||
	    ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	mutex_exit(&lsp->ls_vp_lock);

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp->b_private = (void *)(uintptr_t)p_lba;	/* partition start */
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}

static int
lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
{
	_NOTE(ARGUNUSED(credp));

	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
}

static int
lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
{
	_NOTE(ARGUNUSED(credp));

	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
}

static int
lofi_urw(struct lofi_state *lsp, uint16_t fmode, diskaddr_t off, size_t size,
    intptr_t arg, int flag, cred_t *credp)
{
	struct uio uio;
	iovec_t iov;

	/*
	 * 1024 * 1024 apes cmlb_tg_max_efi_xfer as a reasonable max.
	 */
	if (size == 0 || size > 1024 * 1024 ||
	    (size % (1 << lsp->ls_lbshift)) != 0)
		return (EINVAL);

	iov.iov_base = (void *)arg;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = off;
	uio.uio_segflg = (flag & FKIOCTL) ? UIO_SYSSPACE : UIO_USERSPACE;
	uio.uio_llimit = MAXOFFSET_T;
	uio.uio_resid = size;
	uio.uio_fmode = fmode;
	uio.uio_extflg = 0;

	return (fmode == FREAD ?
	    lofi_read(lsp->ls_dev, &uio, credp) :
	    lofi_write(lsp->ls_dev, &uio, credp));
}

/*ARGSUSED2*/
static int
lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
}

/*ARGSUSED2*/
static int
lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
}

/*ARGSUSED*/
static int
lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	struct lofi_state *lsp;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = LOFI_MINOR2ID(getminor(dev));
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp == NULL)
			return (DDI_FAILURE);
		*result = lsp->ls_dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *) (intptr_t)instance;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}

static int
lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled)
{
	int error = 0;
	int instance = ddi_get_instance(lsp->ls_dip);

	if (labeled == B_TRUE) {
		cmlb_alloc_handle(&lsp->ls_cmlbhandle);
		error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT,
		    B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN,
		    CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1);

		if (error != DDI_SUCCESS) {
			cmlb_free_handle(&lsp->ls_cmlbhandle);
			lsp->ls_cmlbhandle = NULL;
			error = ENXIO;
		}
	} else {
		/* create minor nodes */
		error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE,
		    S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0);
		if (error == DDI_SUCCESS) {
			error = ddi_create_minor_node(lsp->ls_dip,
			    LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance),
			    DDI_PSEUDO, 0);
			if (error != DDI_SUCCESS) {
				ddi_remove_minor_node(lsp->ls_dip,
				    LOFI_BLOCK_NODE);
				error = ENXIO;
			}
		} else
			error = ENXIO;
	}
	return (error);
}

static int
lofi_zone_bind(struct lofi_state *lsp)
{
	int error = 0;

	mutex_enter(&curproc->p_lock);
	if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) {
		mutex_exit(&curproc->p_lock);
		return (error);
	}
	mutex_exit(&curproc->p_lock);

	if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME,
	    (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) {
		rctl_decr_lofi(curproc->p_zone, 1);
		error = EINVAL;
	} else {
		zone_init_ref(&lsp->ls_zone);
		zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI);
	}
	return (error);
}

static void
lofi_zone_unbind(struct lofi_state *lsp)
{
	(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME);
	rctl_decr_lofi(curproc->p_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
}

static int
lofi_online_dev(dev_info_t *dip)
{
	boolean_t labeled;
	int	error;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	labeled = B_FALSE;
	if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
		labeled = B_TRUE;
static int
lofi_online_dev(dev_info_t *dip)
{
	boolean_t labeled;
	int	error;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	labeled = B_FALSE;
	if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
		labeled = B_TRUE;

	/* lsp alloc+init, soft state is freed in lofi_detach */
	error = ddi_soft_state_zalloc(lofi_statep, instance);
	if (error == DDI_FAILURE) {
		return (ENOMEM);
	}

	lsp = ddi_get_soft_state(lofi_statep, instance);
	lsp->ls_dip = dip;

	if ((error = lofi_zone_bind(lsp)) != 0)
		goto err;

	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

	if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) {
		lofi_zone_unbind(lsp);
		goto lerr;
	}

	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		error = DDI_FAILURE;
		goto merr;
	}

	lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance,
	    NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid());
	if (lsp->ls_kstat == NULL) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip,
		    DDI_KERNEL_IOCTL);
		error = ENOMEM;
		goto merr;
	}

	lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
	kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID);
	kstat_install(lsp->ls_kstat);
	return (DDI_SUCCESS);
merr:
	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
	}
	ddi_remove_minor_node(dip, NULL);
	lofi_zone_unbind(lsp);
lerr:
	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
err:
	ddi_soft_state_free(lofi_statep, instance);
	return (error);
}

static int
lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	rv;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/*
	 * Instance 0 is the control instance; attaching the control
	 * instance sets lofi up and makes it ready for use.
	 */
	if (instance == 0) {
		rv = ddi_soft_state_zalloc(lofi_statep, 0);
		if (rv == DDI_FAILURE) {
			return (DDI_FAILURE);
		}
		lsp = ddi_get_soft_state(lofi_statep, instance);
		rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
		    DDI_PSEUDO, 0);
		if (rv == DDI_FAILURE) {
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}
		/* driver handles kernel-issued IOCTLs */
		if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
			ddi_remove_minor_node(dip, NULL);
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}

		zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL);

		lsp->ls_dip = dip;
	} else {
		if (lofi_online_dev(dip) == DDI_FAILURE)
			return (DDI_FAILURE);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}
static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct lofi_state *lsp;
	int instance = ddi_get_instance(dip);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * If the instance is not 0, release state.
	 * Instance 0 is the control device; it cannot be detached
	 * before all other instances have been detached.
	 */
	if (instance != 0) {
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) {
			ddi_soft_state_free(lofi_statep, instance);
			return (DDI_SUCCESS);
		} else
			return (DDI_FAILURE);
	}
	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (DDI_FAILURE);
	}

	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);

	mutex_exit(&lofi_lock);

	if (zone_key_delete(lofi_zone_key) != 0)
		cmn_err(CE_WARN, "failed to delete zone key");

	ddi_soft_state_free(lofi_statep, 0);

	return (DDI_SUCCESS);
}

/*
 * With the addition of encryption, we must be careful that the encryption
 * key is wiped before the kernel's data structures are freed so it cannot
 * accidentally slip out to userland through uninitialized data elsewhere.
 */
static void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	/* Make sure this encryption key doesn't stick around */
	bzero(klip->li_key, sizeof (klip->li_key));
	kmem_free(klip, sizeof (struct lofi_ioctl));
}

/*
 * These two functions simplify the rest of the ioctls that need to copyin/out
 * the lofi_ioctl structure.
 */
int
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
    int flag)
{
	struct lofi_ioctl *klip;
	int	error;

	klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error)
		goto err;

	/* ensure NUL termination */
	klip->li_filename[MAXPATHLEN-1] = '\0';
	klip->li_devpath[MAXPATHLEN-1] = '\0';
	klip->li_algorithm[MAXALGLEN-1] = '\0';
	klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
	klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';

	if (klip->li_id > L_MAXMIN32) {
		error = EINVAL;
		goto err;
	}

	return (0);

err:
	free_lofi_ioctl(klip);
	return (error);
}

int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
    int flag)
{
	int	error;

	/*
	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
	 * This ensures that an attacker can't trivially find the
	 * key for a mapping just by issuing the ioctl.
	 *
	 * It can still be found by poking around in kmem with mdb(1),
	 * but there is no point in making it easy when the info isn't
	 * of any use in this direction anyway.
	 *
	 * Either way, we don't actually have the raw key stored in a
	 * recoverable form anyway, since we just used it to create a
	 * ctx template and didn't keep "the original".
	 */
	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}

static int
lofi_access(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
		return (0);
	return (EPERM);
}
/*
 * Find the lofi state for the given filename. We compare by vnode to
 * give the global zone visibility into NGZ lofi nodes.
 */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t *vp = NULL;
	int err = 0;
	int rdfiles = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		goto out;

	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}

/*
 * Find the minor for the given filename, checking that the zone can
 * access it.
 */
static int
file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
{
	int err = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
		return (err);

	if ((err = lofi_access(*lspp)) != 0)
		return (err);

	return (0);
}

/*
 * Fakes up a disk geometry based on the size of the file. This is needed
 * to support newfs on a traditional lofi device, and also provides a
 * geometry hint for cmlb.
 */
static void
fake_disk_geometry(struct lofi_state *lsp)
{
	u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;

	/* dk_geom - see dkio(4I) */
	/*
	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
	 * of sectors), but that breaks programs like fdisk which want to
	 * partition a disk by cylinder. With one cylinder, you can't create
	 * an fdisk partition and put pcfs on it for testing (hard to pick
	 * a number between one and one).
	 *
	 * The cheezy floppy test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
	bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg));
	if (dsize < (2 * 1024 * 1024)) /* floppy? */
		lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
	else
		lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
	/* in case the file is < 100k */
	if (lsp->ls_dkg.dkg_ncyl == 0)
		lsp->ls_dkg.dkg_ncyl = 1;

	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl;
	lsp->ls_dkg.dkg_nhead = 1;
	lsp->ls_dkg.dkg_rpm = 7200;

	lsp->ls_dkg.dkg_nsect = dsize /
	    (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift);
}
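/*
 * Worked example (sketch): for an unencrypted 100 MB image file with
 * 512-byte blocks (ls_pbshift == 9), the faked geometry comes out as:
 *
 *	dsize     = 104857600 bytes (>= 2 MB, so not the "floppy" case)
 *	dkg_ncyl  = 104857600 / (300 * 1024) = 341
 *	dkg_pcyl  = 341, dkg_nhead = 1, dkg_rpm = 7200
 *	dkg_nsect = 104857600 / (341 << 9) = 600
 *
 * The resulting disk presents 341 * 1 * 600 = 204600 sectors
 * (104755200 bytes), slightly less than the file so that the size is an
 * exact number of cylinders; see fake_disk_vtoc() below.
 */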
/*
 * build vtoc - see dkio(4I)
 *
 * Fakes one big partition based on the size of the file. This is needed
 * because we allow newfs'ing the traditional lofi device and newfs will
 * do several disk ioctls to figure out the geometry and partition information.
 * It uses that information to determine the parameters to pass to mkfs.
 */
static void
fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt)
{
	bzero(vt, sizeof (struct vtoc));
	vt->v_sanity = VTOC_SANE;
	vt->v_version = V_VERSION;
	(void) strncpy(vt->v_volume, LOFI_DRIVER_NAME,
	    sizeof (vt->v_volume));
	vt->v_sectorsz = 1 << lsp->ls_pbshift;
	vt->v_nparts = 1;
	vt->v_part[0].p_tag = V_UNASSIGNED;

	/*
	 * A compressed file is read-only; other files can
	 * be read-write.
	 */
	if (lsp->ls_uncomp_seg_sz > 0) {
		vt->v_part[0].p_flag = V_UNMNT | V_RONLY;
	} else {
		vt->v_part[0].p_flag = V_UNMNT;
	}
	vt->v_part[0].p_start = (daddr_t)0;
	/*
	 * The partition size cannot just be the number of sectors, because
	 * that might not end on a cylinder boundary. And if that's the case,
	 * newfs/mkfs will print a scary warning. So just figure the size
	 * based on the number of cylinders and sectors/cylinder.
	 */
	vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
}

/*
 * build dk_cinfo - see dkio(4I)
 */
static void
fake_disk_info(dev_t dev, struct dk_cinfo *ci)
{
	bzero(ci, sizeof (struct dk_cinfo));
	(void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname));
	ci->dki_ctype = DKC_SCSI_CCS;
	(void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname));
	ci->dki_unit = LOFI_MINOR2ID(getminor(dev));
	ci->dki_partition = LOFI_PART(getminor(dev));
	/*
	 * newfs uses this to set maxcontig. Must not be < 16, or it
	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
	 * it by the block size. Then tunefs doesn't work because
	 * maxcontig is 0.
	 */
	ci->dki_maxtransfer = 16;
}
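/*
 * Worked example (sketch) of the dki_maxtransfer constraint above:
 * newfs computes roughly maxcontig = dki_maxtransfer * DEV_BSIZE / bsize.
 * With an 8192-byte filesystem block size, 16 * 512 / 8192 = 1, the
 * smallest useful value; anything below 16 would truncate to 0 in the
 * integer division and leave tunefs with an unusable maxcontig.
 */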
/*
 * map in a compressed file
 *
 * Read in the header and the index that follows.
 *
 * The header is as follows -
 *
 * Signature (name of the compression algorithm)
 * Compression segment size (a multiple of 512)
 * Number of index entries
 * Size of the last block
 * The array containing the index entries
 *
 * The header information is always stored in
 * network byte order on disk.
 */
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t	resid;
	enum uio_rw rw;
	char *tbuf = buf;
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The uncompressed segment size must be at least DEV_BSIZE
	 * and a power of 2.
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    /*LINTED*/
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}
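/*
 * Illustrative sketch of the on-disk layout that
 * lofi_map_compressed_file() just parsed. Field widths follow the lsp
 * members used above; the exact signature width is sizeof
 * (ls_comp_algorithm) (MAXALGLEN, assumed here only for the diagram):
 *
 *	offset 0:	char     signature[MAXALGLEN];  // e.g. "gzip"
 *	next:		uint32_t uncomp_seg_sz;         // network order
 *	next:		uint32_t comp_index_sz;         // number of entries
 *	next:		uint32_t uncomp_last_seg_sz;    // tail segment size
 *	next:		uint64_t index[comp_index_sz];  // segment offsets,
 *							// network order
 *
 * Segment data begins at ls_comp_offbase, immediately after the header
 * and the (unrounded) index.
 */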
static int
lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
{
	struct crypto_meta chead;
	char buf[DEV_BSIZE];
	ssize_t	resid;
	char *marker;
	int error;
	int ret;
	int i;

	if (!klip->li_crypto_enabled)
		return (0);

	/*
	 * All current algorithms have a max of 448 bits.
	 */
	if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
		return (EINVAL);

	if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
		return (EINVAL);

	lsp->ls_crypto_enabled = klip->li_crypto_enabled;

	mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);

	lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
	if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid cipher %s requested for %s",
		    klip->li_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* this is just initialization here */
	lsp->ls_mech.cm_param = NULL;
	lsp->ls_mech.cm_param_len = 0;

	lsp->ls_iv_type = klip->li_iv_type;
	lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
	if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid iv cipher %s requested"
		    " for %s", klip->li_iv_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* iv mech must itself take a null iv */
	lsp->ls_iv_mech.cm_param = NULL;
	lsp->ls_iv_mech.cm_param_len = 0;
	lsp->ls_iv_len = klip->li_iv_len;

	/*
	 * Create ctx using li_cipher & the raw li_key after checking
	 * that it isn't a weak key.
	 */
	lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
	lsp->ls_key.ck_length = klip->li_key_len;
	lsp->ls_key.ck_data = kmem_alloc(
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
	bcopy(klip->li_key, lsp->ls_key.ck_data,
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));

	ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "weak key check failed for cipher "
		    "%s on file %s (0x%x)", klip->li_cipher,
		    klip->li_filename, ret);
		return (EINVAL);
	}

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/*
	 * This is the case where the header in the lofi image is already
	 * initialized to indicate it is encrypted.
	 */
	if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
		/*
		 * The encryption header information is laid out this way:
		 *	6 bytes:	hex "CFLOFI"
		 *	2 bytes:	version = 0 ... for now
		 *	96 bytes:	reserved1 (not implemented yet)
		 *	4 bytes:	data_sector = 2 ... for now
		 *	more...		not implemented yet
		 */
		marker = buf;

		/* copy the magic */
		bcopy(marker, lsp->ls_crypto.magic,
		    sizeof (lsp->ls_crypto.magic));
		marker += sizeof (lsp->ls_crypto.magic);

		/* read the encryption version number */
		bcopy(marker, &(lsp->ls_crypto.version),
		    sizeof (lsp->ls_crypto.version));
		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
		marker += sizeof (lsp->ls_crypto.version);

		/* read a chunk of reserved data */
		bcopy(marker, lsp->ls_crypto.reserved1,
		    sizeof (lsp->ls_crypto.reserved1));
		marker += sizeof (lsp->ls_crypto.reserved1);

		/* read block number where encrypted data begins */
		bcopy(marker, &(lsp->ls_crypto.data_sector),
		    sizeof (lsp->ls_crypto.data_sector));
		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
		marker += sizeof (lsp->ls_crypto.data_sector);

		/* and ignore the rest until it is implemented */

		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
		return (0);
	}

	/*
	 * We've requested encryption, but no magic was found, so it must be
	 * a new image.
	 */

	for (i = 0; i < sizeof (struct crypto_meta); i++) {
		if (buf[i] != '\0')
			return (EINVAL);
	}

	marker = buf;
	bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
	marker += sizeof (lofi_crypto_magic);
	chead.version = htons(LOFI_CRYPTO_VERSION);
	bcopy(&(chead.version), marker, sizeof (chead.version));
	marker += sizeof (chead.version);
	marker += sizeof (chead.reserved1);
	chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
	bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));

	/* write the header */
	error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* fix things up so it looks like we read this info */
	bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
	    sizeof (lofi_crypto_magic));
	lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
	lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
	lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
	return (0);
}
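/*
 * Illustrative summary (sketch) of the encryption header handled above,
 * as it sits on disk starting at offset CRYOFF:
 *
 *	bytes 0..5:	magic "CFLOFI"
 *	bytes 6..7:	version, big-endian (LOFI_CRYPTO_VERSION)
 *	bytes 8..103:	reserved1, currently all zero
 *	bytes 104..107:	data_sector, big-endian (LOFI_CRYPTO_DATA_SECTOR)
 *
 * With data_sector == 2, ls_crypto_offset is 2 * DEV_BSIZE == 1024, so
 * user-visible block 0 maps to file offset 1024 once encryption is on.
 */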
/*
 * Check to see if the passed in signature is a valid one. If it is
 * valid, return the index into lofi_compress_table.
 *
 * Return -1 if it is invalid
 */
static int
lofi_compress_select(const char *signature)
{
	int i;

	for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
		if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
			return (i);
	}

	return (-1);
}

static int
lofi_init_compress(struct lofi_state *lsp)
{
	char buf[DEV_BSIZE];
	int compress_index;
	ssize_t	resid;
	int error;

	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	if ((compress_index = lofi_compress_select(buf)) == -1)
		return (0);

	/* compression and encryption are mutually exclusive */
	if (lsp->ls_crypto_enabled)
		return (ENOTSUP);

	/* initialize compression info for compressed lofi */
	lsp->ls_comp_algorithm_index = compress_index;
	(void) strlcpy(lsp->ls_comp_algorithm,
	    lofi_compress_table[compress_index].l_name,
	    sizeof (lsp->ls_comp_algorithm));

	/* Finally setup per-thread pre-allocated buffers */
	lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
	    sizeof (struct compbuf), KM_SLEEP);

	return (lofi_map_compressed_file(lsp, buf));
}

/*
 * Allocate new or proposed id from lofi_id.
 *
 * Special cases for proposed id:
 *	0: not allowed, 0 is the id of the control device.
 *	-1: allocate the first usable id from lofi_id.
 *	any other value is a proposed value from userland.
 *
 * Returns DDI_SUCCESS (0) or an errno value.
 */
static int
lofi_alloc_id(int *idp)
{
	int id, error = DDI_SUCCESS;

	if (*idp == -1) {
		id = id_allocff_nosleep(lofi_id);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	} else if (*idp == 0) {
		error = EINVAL;
		goto err;
	} else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
		error = ERANGE;
		goto err;
	} else {
		if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
			error = EEXIST;
			goto err;
		}

		id = id_alloc_specific_nosleep(lofi_id, *idp);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	}
	*idp = id;
err:
	return (error);
}
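/*
 * Usage sketch for lofi_alloc_id() -- the ids named here are purely
 * hypothetical examples:
 *
 *	int id = -1;
 *	error = lofi_alloc_id(&id);	// first free id, e.g. id == 1
 *
 *	int id = 5;
 *	error = lofi_alloc_id(&id);	// exactly 5, or EEXIST/EAGAIN
 *
 * On success, *idp always holds the id actually reserved in lofi_id,
 * which lofi_create_dev() below then uses as the child node's instance.
 */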
static int
lofi_create_dev(struct lofi_ioctl *klip)
{
	dev_info_t *parent, *child;
	struct lofi_state *lsp = NULL;
	char namebuf[MAXNAMELEN];
	int error, circ;

	/* get control device */
	lsp = ddi_get_soft_state(lofi_statep, 0);
	parent = ddi_get_parent(lsp->ls_dip);

	if ((error = lofi_alloc_id((int *)&klip->li_id)))
		return (error);

	(void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
	    klip->li_id);

	ndi_devi_enter(parent, &circ);
	child = ndi_devi_findchild(parent, namebuf);
	ndi_devi_exit(parent, circ);

	if (child == NULL) {
		child = ddi_add_child(parent, LOFI_DRIVER_NAME,
		    (pnode_t)DEVI_SID_NODEID, klip->li_id);
		if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
		    "instance", klip->li_id)) != DDI_PROP_SUCCESS)
			goto err;

		if (klip->li_labeled == B_TRUE) {
			if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
			    DDI_PROP_CANSLEEP, "labeled", 0, 0))
			    != DDI_PROP_SUCCESS)
				goto err;
		}

		if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
		    != NDI_SUCCESS)
			goto err;
	} else {
		id_free(lofi_id, klip->li_id);
		error = EEXIST;
		return (error);
	}

	goto done;

err:
	ddi_prop_remove_all(child);
	(void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
	id_free(lofi_id, klip->li_id);
done:

	return (error);
}

static void
lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
{
	char *p = NULL;

	(void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp != NULL)
		p = strrchr(lsp->ls_vp->v_path, '/');
	if (p != NULL)
		(void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
	mutex_exit(&lsp->ls_vp_lock);
	(void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
}

/*
 * copy devlink name from event cache
 */
static void
lofi_copy_devpath(struct lofi_ioctl *klip)
{
	int	error;
	char	namebuf[MAXNAMELEN], *str;
	clock_t ticks;
	nvlist_t *nvl = NULL;

	if (klip->li_labeled == B_TRUE)
		klip->li_devpath[0] = '\0';
	else {
		/* no need to wait for messages */
		(void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
		    "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
		return;
	}

	(void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);

	mutex_enter(&lofi_devlink_cache.ln_lock);
	for (;;) {
		error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    namebuf, &nvl);

		if (error == 0 &&
		    nvlist_lookup_string(nvl, DEV_NAME, &str) == 0 &&
		    strncmp(str, "/dev/" LOFI_CHAR_NAME,
		    sizeof ("/dev/" LOFI_CHAR_NAME) - 1) != 0) {
			(void) strlcpy(klip->li_devpath, str,
			    sizeof (klip->li_devpath));
			break;
		}
		/*
		 * Either there is no data in the cache, or the
		 * cache entry still has the wrong device name.
		 */
		ticks = ddi_get_lbolt() + lofi_timeout * drv_usectohz(1000000);
		error = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (error == -1)
			break;	/* timeout */
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);
}
/*
 * map a file to a minor number. Return the minor id in *rvalp.
 */
static int
lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
    int *rvalp, struct cred *credp, int ioctl_flag)
{
	int	id = -1;
	struct lofi_state *lsp = NULL;
	struct lofi_ioctl *klip;
	int	error;
	struct vnode *vp = NULL;
	vattr_t	vattr;
	int	flag;
	char	namebuf[MAXNAMELEN];

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	mutex_enter(&lofi_lock);

	if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
	    NULL) == 0) {
		error = EBUSY;
		goto err;
	}

	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
	if (error) {
		/* try read-only */
		flag &= ~FWRITE;
		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
		    &vp, 0, 0);
		if (error)
			goto err;
	}

	if (!V_ISLOFIABLE(vp->v_type)) {
		error = EINVAL;
		goto err;
	}

	vattr.va_mask = AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
	if (error)
		goto err;

	/* the file needs to be a multiple of the block size */
	if ((vattr.va_size % DEV_BSIZE) != 0) {
		error = EINVAL;
		goto err;
	}

	if (pickminor) {
		klip->li_id = (uint32_t)-1;
	}
	if ((error = lofi_create_dev(klip)) != 0)
		goto err;

	id = klip->li_id;
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL)
		goto err;

	/*
	 * From this point lofi_destroy() is used to clean up on error,
	 * so make sure the basic data is set.
	 */
	list_insert_tail(&lofi_list, lsp);
	lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id));

	list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
	    offsetof(struct lofi_comp_cache, lc_list));

	/*
	 * save open mode so file can be closed properly and vnode counts
	 * updated correctly.
	 */
	lsp->ls_openflag = flag;

	lsp->ls_vp = vp;
	lsp->ls_stacked_vp = vp;

	lsp->ls_vp_size = vattr.va_size;
	lsp->ls_vp_comp_size = lsp->ls_vp_size;

	/*
	 * Try to handle stacked lofs vnodes.
	 */
	if (vp->v_type == VREG) {
		vnode_t *realvp;

		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			/*
			 * We need to use the realvp for uniqueness
			 * checking, but keep the stacked vp for
			 * LOFI_GET_FILENAME display.
			 */
			VN_HOLD(realvp);
			lsp->ls_vp = realvp;
		}
	}

	lsp->ls_lbshift = highbit(DEV_BSIZE) - 1;
	lsp->ls_pbshift = lsp->ls_lbshift;

	lsp->ls_readonly = klip->li_readonly;
	lsp->ls_uncomp_seg_sz = 0;
	lsp->ls_comp_algorithm[0] = '\0';
	lsp->ls_crypto_offset = 0;

	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
	    LOFI_DRIVER_NAME, id);
	lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads,
	    minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0);

	if ((error = lofi_init_crypto(lsp, klip)) != 0)
		goto err;

	if ((error = lofi_init_compress(lsp)) != 0)
		goto err;

	fake_disk_geometry(lsp);

	/* For unlabeled lofi add Nblocks and Size */
	if (klip->li_labeled == B_FALSE) {
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    NBLOCKS_PROP_NAME,
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
	}

	/*
	 * Notify we are ready to rock.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_ready = B_TRUE;
	cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);
	mutex_exit(&lofi_lock);

	lofi_copy_devpath(klip);

	if (rvalp)
		*rvalp = id;
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);

err:
	if (lsp != NULL) {
		lofi_destroy(lsp, credp);
	} else {
		if (vp != NULL) {
			(void) VOP_PUTPAGE(vp, 0, 0, B_FREE, credp, NULL);
			(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
			VN_RELE(vp);
		}
	}

	mutex_exit(&lofi_lock);
	free_lofi_ioctl(klip);
	return (error);
}
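/*
 * Userland usage sketch for the mapping path above -- this mimics what
 * lofiadm(8) does, but the snippet itself is illustrative and not taken
 * from the lofiadm source:
 *
 *	struct lofi_ioctl li;
 *	int fd = open("/dev/lofictl", O_RDWR | O_EXCL);
 *
 *	bzero(&li, sizeof (li));
 *	(void) strlcpy(li.li_filename, "/export/disk.img", MAXPATHLEN);
 *	if (ioctl(fd, LOFI_MAP_FILE, &li) != -1) {
 *		// li.li_id and li.li_devpath now identify the new
 *		// device, e.g. /dev/rlofi/1
 *	}
 *
 * LOFI_MAP_FILE lets the driver pick the id; LOFI_MAP_FILE_MINOR maps
 * the file at the id proposed in li_id instead.
 */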
/*
 * unmap a file.
 */
static int
lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_state *lsp;
	struct lofi_ioctl *klip;
	char namebuf[MAXNAMELEN];
	int err;

	err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (err != 0)
		return (err);

	mutex_enter(&lofi_lock);
	if (byfilename) {
		if ((err = file_to_lofi(klip->li_filename, klip->li_readonly,
		    &lsp)) != 0) {
			goto done;
		}
	} else if (klip->li_id == 0) {
		err = ENXIO;
		goto done;
	} else {
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
	}

	if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) {
		err = ENXIO;
		goto done;
	}

	klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
	(void) snprintf(namebuf, sizeof (namebuf), "%u", klip->li_id);

	/*
	 * If it's still held open, we'll do one of three things:
	 *
	 * If no flag is set, just return EBUSY.
	 *
	 * If the 'cleanup' flag is set, unmap and remove the device when
	 * the last user finishes.
	 *
	 * If the 'force' flag is set, then we forcibly close the underlying
	 * file. Subsequent operations will fail, and the DKIOCSTATE ioctl
	 * will return DKIO_DEV_GONE. When the device is last closed, the
	 * device will be cleaned up appropriately.
	 *
	 * This is complicated by the fact that we may have outstanding
	 * dispatched I/Os. Rather than having a single mutex to serialize all
	 * I/O, we keep a count of the number of outstanding I/O requests
	 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
	 * should be dispatched (ls_vp_closereq).
	 *
	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
	 * and then close the underlying vnode.
	 */
	if (is_opened(lsp)) {
		if (klip->li_force) {
			/* Mark the device for cleanup. */
			lofi_set_cleanup(lsp);
			mutex_enter(&lsp->ls_vp_lock);
			lsp->ls_vp_closereq = B_TRUE;
			/* Wake up any threads waiting on dkiocstate. */
			cv_broadcast(&lsp->ls_vp_cv);
			while (lsp->ls_vp_iocount > 0)
				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
			mutex_exit(&lsp->ls_vp_lock);
		} else if (klip->li_cleanup) {
			lofi_set_cleanup(lsp);
		} else {
			err = EBUSY;
		}
	} else {
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	/* Remove name from devlink cache */
	mutex_enter(&lofi_devlink_cache.ln_lock);
	(void) nvlist_remove_all(lofi_devlink_cache.ln_data, namebuf);
	cv_broadcast(&lofi_devlink_cache.ln_cv);
	mutex_exit(&lofi_devlink_cache.ln_lock);
done:
	mutex_exit(&lofi_lock);
	if (err == 0)
		(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (err);
}

/*
 * get the filename given the minor number, or the minor number given
 * the name.
 */
/*ARGSUSED*/
static int
lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_ioctl *klip;
	struct lofi_state *lsp;
	int	error;

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	switch (which) {
	case LOFI_GET_FILENAME:
		if (klip->li_id == 0) {
			free_lofi_ioctl(klip);
			return (EINVAL);
		}

		mutex_enter(&lofi_lock);
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
		if (lsp == NULL || lofi_access(lsp) != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (ENXIO);
		}

		/*
		 * This may fail if, for example, we're trying to look
		 * up a zoned NFS path from the global zone.
		 */
		if (lsp->ls_stacked_vp == NULL ||
		    vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename,
		    sizeof (klip->li_filename), CRED()) != 0) {
			(void) strlcpy(klip->li_filename, "?",
			    sizeof (klip->li_filename));
		}

		klip->li_readonly = lsp->ls_readonly;
		klip->li_labeled = lsp->ls_cmlbhandle != NULL;

		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
		    sizeof (klip->li_algorithm));
		klip->li_crypto_enabled = lsp->ls_crypto_enabled;
		mutex_exit(&lofi_lock);

		lofi_copy_devpath(klip);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
		free_lofi_ioctl(klip);
		return (error);
	case LOFI_GET_MINOR:
		mutex_enter(&lofi_lock);
		error = file_to_lofi(klip->li_filename,
		    klip->li_readonly, &lsp);
		if (error != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (error);
		}
		klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

		klip->li_readonly = lsp->ls_readonly;
		klip->li_labeled = lsp->ls_cmlbhandle != NULL;
		mutex_exit(&lofi_lock);

		lofi_copy_devpath(klip);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);

		free_lofi_ioctl(klip);
		return (error);
	case LOFI_CHECK_COMPRESSED:
		mutex_enter(&lofi_lock);
		error = file_to_lofi(klip->li_filename,
		    klip->li_readonly, &lsp);
		if (error != 0) {
			mutex_exit(&lofi_lock);
			free_lofi_ioctl(klip);
			return (error);
		}

		klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
		    sizeof (klip->li_algorithm));

		mutex_exit(&lofi_lock);
		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
		free_lofi_ioctl(klip);
		return (error);
	default:
		free_lofi_ioctl(klip);
		return (EINVAL);
	}
}

static int
uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb,
    struct uscsi_cmd *uscmd)
{
	int rval;

#ifdef	_MULTI_DATAMODEL
	switch (ddi_model_convert_from(flag & FMODELS)) {
	case DDI_MODEL_ILP32: {
		struct uscsi_cmd32 ucmd32;

		if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) {
			rval = EFAULT;
			goto err;
		}
		uscsi_cmd32touscsi_cmd((&ucmd32), uscmd);
		break;
	}
	case DDI_MODEL_NONE:
		if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
			rval = EFAULT;
			goto err;
		}
		break;
	default:
		rval = EFAULT;
		goto err;
	}
#else
	if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) {
		rval = EFAULT;
		goto err;
	}
#endif	/* _MULTI_DATAMODEL */
	if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) {
		rval = EFAULT;
		goto err;
	}
	if (cdb->scc_cmd == SCMD_INQUIRY) {
		return (0);
	}
	/*
	 * Not an INQUIRY; make sure we don't hand an uninitialized
	 * value back to the caller.
	 */
	rval = EINVAL;
err:
	return (rval);
}
static int
lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
    int *rvalp)
{
	int	error;
	enum dkio_state dkstate;
	struct lofi_state *lsp;
	dk_efi_t user_efi;
	int	id;

	id = LOFI_MINOR2ID(getminor(dev));

	/* lofi ioctls only apply to the master device */
	if (id == 0) {
		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;

		/*
		 * the query commands only need read access - i.e., normal
		 * users are allowed to do those on the ctl device as
		 * long as they can open it read-only.
		 */
		switch (cmd) {
		case LOFI_MAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
		case LOFI_MAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
		case LOFI_UNMAP_FILE:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 1, credp, flag));
		case LOFI_UNMAP_FILE_MINOR:
			if ((flag & FWRITE) == 0)
				return (EPERM);
			return (lofi_unmap_file(lip, 0, credp, flag));
		case LOFI_GET_FILENAME:
			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
			    credp, flag));
		case LOFI_GET_MINOR:
			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
			    credp, flag));

		/*
		 * This API made limited sense when this value was fixed
		 * at LOFI_MAX_FILES. However, its use to iterate
		 * across all possible devices in lofiadm means we don't
		 * want to return L_MAXMIN, but the highest
		 * *allocated* id.
		 */
		case LOFI_GET_MAXMINOR:
			id = 0;

			mutex_enter(&lofi_lock);

			for (lsp = list_head(&lofi_list); lsp != NULL;
			    lsp = list_next(&lofi_list, lsp)) {
				int i;
				if (lofi_access(lsp) != 0)
					continue;

				i = ddi_get_instance(lsp->ls_dip);
				if (i > id)
					id = i;
			}

			mutex_exit(&lofi_lock);

			error = ddi_copyout(&id, &lip->li_id,
			    sizeof (id), flag);
			if (error)
				return (EFAULT);
			return (0);

		case LOFI_CHECK_COMPRESSED:
			return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
			    credp, flag));
		default:
			return (EINVAL);
		}
	}

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL || lsp->ls_cleanup) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}
	mutex_exit(&lofi_lock);

	if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
	    "labeled") == 1) {
		error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
		    credp, rvalp, 0);
		if (error != ENOTTY)
			return (error);
	}
	/*
	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail
	 * with EIO as if the device were no longer present.
	 */
	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
		return (EIO);

	/* these are for faking out utilities like newfs */
	switch (cmd) {
	case DKIOCGMEDIAINFO:
	case DKIOCGMEDIAINFOEXT: {
		struct dk_minfo_ext media_info;
		int shift = lsp->ls_lbshift;
		int size;

		if (cmd == DKIOCGMEDIAINFOEXT) {
			media_info.dki_pbsize = 1U << lsp->ls_pbshift;
			switch (ddi_model_convert_from(flag & FMODELS)) {
			case DDI_MODEL_ILP32:
				size = sizeof (struct dk_minfo_ext32);
				break;
			default:
				size = sizeof (struct dk_minfo_ext);
				break;
			}
		} else {
			size = sizeof (struct dk_minfo);
		}

		media_info.dki_media_type = DK_FIXED_DISK;
		media_info.dki_lbsize = 1U << shift;
		media_info.dki_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;

		if (ddi_copyout(&media_info, (void *)arg, size, flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCREMOVABLE: {
		int i = 0;
		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
			return (EFAULT);
		return (0);
	}

	case DKIOCGVTOC: {
		struct vtoc vt;
		fake_disk_vtoc(lsp, &vt);

		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct vtoc32 vtoc32;

			vtoctovtoc32(vt, vtoc32);
			if (ddi_copyout(&vtoc32, (void *)arg,
			    sizeof (struct vtoc32), flag))
				return (EFAULT);
			break;
		}

		case DDI_MODEL_NONE:
			if (ddi_copyout(&vt, (void *)arg,
			    sizeof (struct vtoc), flag))
				return (EFAULT);
			break;
		}
		return (0);
	}
	case DKIOCINFO: {
		struct dk_cinfo ci;
		fake_disk_info(dev, &ci);
		if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCG_VIRTGEOM:
	case DKIOCG_PHYGEOM:
	case DKIOCGGEOM:
		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
		    sizeof (struct dk_geom), flag);
		if (error)
			return (EFAULT);
		return (0);
	case DKIOCSTATE:
		/*
		 * Normally, lofi devices are always in the INSERTED state. If
		 * a device is forcefully unmapped, then the device transitions
		 * to the DKIO_DEV_GONE state.
		 */
		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
		    flag) != 0)
			return (EFAULT);

		mutex_enter(&lsp->ls_vp_lock);
		while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
		    !lsp->ls_cleanup) {
			/*
			 * By virtue of having the device open, we know that
			 * 'lsp' will remain valid when we return.
			 */
			if (!cv_wait_sig(&lsp->ls_vp_cv, &lsp->ls_vp_lock)) {
				mutex_exit(&lsp->ls_vp_lock);
				return (EINTR);
			}
		}
		dkstate = (!lsp->ls_cleanup && lsp->ls_vp != NULL ?
		    DKIO_INSERTED : DKIO_DEV_GONE);
		mutex_exit(&lsp->ls_vp_lock);

		if (ddi_copyout(&dkstate, (void *)arg,
		    sizeof (dkstate), flag) != 0)
			return (EFAULT);
		return (0);
	case USCSICMD: {
		struct uscsi_cmd uscmd;
		union scsi_cdb cdb;

		if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
			struct scsi_inquiry inq = {0};

			lofi_create_inquiry(lsp, &inq);
			if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		} else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
			struct scsi_capacity capacity;

			capacity.capacity =
			    BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
			    lsp->ls_lbshift);
			capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
			if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		}

		uscmd.uscsi_rqstatus = 0xff;
#ifdef	_MULTI_DATAMODEL
		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct uscsi_cmd32 ucmd32;
			uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
			if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
			    flag) != 0)
				return (EFAULT);
			break;
		}
		case DDI_MODEL_NONE:
			if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
			    flag) != 0)
				return (EFAULT);
			break;
		default:
			return (EFAULT);
		}
#else
		if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
			return (EFAULT);
#endif	/* _MULTI_DATAMODEL */
		return (0);
	}

	case DKIOCGMBOOT:
		return (lofi_urw(lsp, FREAD, 0, 1 << lsp->ls_lbshift,
		    arg, flag, credp));

	case DKIOCSMBOOT:
		return (lofi_urw(lsp, FWRITE, 0, 1 << lsp->ls_lbshift,
		    arg, flag, credp));

	case DKIOCGETEFI:
		if (ddi_copyin((void *)arg, &user_efi,
		    sizeof (dk_efi_t), flag) != 0)
			return (EFAULT);

		return (lofi_urw(lsp, FREAD,
		    user_efi.dki_lba * (1 << lsp->ls_lbshift),
		    user_efi.dki_length, (intptr_t)user_efi.dki_data,
		    flag, credp));

	case DKIOCSETEFI:
		if (ddi_copyin((void *)arg, &user_efi,
		    sizeof (dk_efi_t), flag) != 0)
			return (EFAULT);

		return (lofi_urw(lsp, FWRITE,
		    user_efi.dki_lba * (1 << lsp->ls_lbshift),
		    user_efi.dki_length, (intptr_t)user_efi.dki_data,
		    flag, credp));

	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
#endif	/* DEBUG */
		return (ENOTTY);
	}
}

static int
lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	struct lofi_state *lsp;
	int rc;

	lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
	if (lsp == NULL) {
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
	if (rc == DDI_PROP_SUCCESS)
		return (rc);

	return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
	    name, valuep, lengthp));
}

static struct cb_ops lofi_cb_ops = {
	lofi_open,		/* open */
	lofi_close,		/* close */
	lofi_strategy,		/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	lofi_read,		/* read */
	lofi_write,		/* write */
	lofi_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	lofi_prop_op,		/* prop_op */
	0,			/* streamtab */
	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,
	lofi_aread,
	lofi_awrite
};

static struct dev_ops lofi_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	lofi_info,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	lofi_attach,		/* attach */
	lofi_detach,		/* detach */
	nodev,			/* reset */
	&lofi_cb_ops,		/* driver operations */
	NULL,			/* no bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"loopback file driver",
	&lofi_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	int error;

	list_create(&lofi_list, sizeof (struct lofi_state),
	    offsetof(struct lofi_state, ls_list));

	error = ddi_soft_state_init((void **)&lofi_statep,
	    sizeof (struct lofi_state), 0);
	if (error) {
		list_destroy(&lofi_list);
		return (error);
	}

	/*
	 * The minor number is stored as id << LOFI_CMLB_SHIFT as
	 * we need to reserve space for cmlb minor numbers.
	 * This leaves 4096 usable id values on a 32-bit kernel, which
	 * should still suffice.
	 */
	lofi_id = id_space_create("lofi_id", 1,
	    (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)));

	if (lofi_id == NULL) {
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
		return (DDI_FAILURE);
	}

	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);

	error = mod_install(&modlinkage);

	if (error) {
		id_space_destroy(lofi_id);
		mutex_destroy(&lofi_lock);
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
	}

	return (error);
}

int
_fini(void)
{
	int	error;

	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	mutex_exit(&lofi_lock);

	error = mod_remove(&modlinkage);
	if (error)
		return (error);

	mutex_destroy(&lofi_lock);
	id_space_destroy(lofi_id);
	ddi_soft_state_fini((void **)&lofi_statep);
	list_destroy(&lofi_list);

	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}