1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 * 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016 Andrey Sokolov 26 * Copyright 2016 Toomas Soome <tsoome@me.com> 27 */ 28 29 /* 30 * lofi (loopback file) driver - allows you to attach a file to a device, 31 * which can then be accessed through that device. The simple model is that 32 * you tell lofi to open a file, and then use the block device you get as 33 * you would any block device. lofi translates access to the block device 34 * into I/O on the underlying file. This is mostly useful for 35 * mounting images of filesystems. 36 * 37 * lofi is controlled through /dev/lofictl - this is the only device exported 38 * during attach, and is instance number 0. lofiadm communicates with lofi 39 * through ioctls on this device. When a file is attached to lofi, block and 40 * character devices are exported in /dev/lofi and /dev/rlofi. These devices 41 * are identified by lofi instance number, and the instance number is also used 42 * as the name in /dev/lofi. 
43 * 44 * Virtual disks, or, labeled lofi, implements virtual disk support to 45 * support partition table and related tools. Such mappings will cause 46 * block and character devices to be exported in /dev/dsk and /dev/rdsk 47 * directories. 48 * 49 * To support virtual disks, the instance number space is divided to two 50 * parts, upper part for instance number and lower part for minor number 51 * space to identify partitions and slices. The virtual disk support is 52 * implemented by stacking cmlb module. For virtual disks, the partition 53 * related ioctl calls are routed to cmlb module. Compression and encryption 54 * is not supported for virtual disks. 55 * 56 * Mapped devices are tracked with state structures handled with 57 * ddi_soft_state(9F) for simplicity. 58 * 59 * A file attached to lofi is opened when attached and not closed until 60 * explicitly detached from lofi. This seems more sensible than deferring 61 * the open until the /dev/lofi device is opened, for a number of reasons. 62 * One is that any failure is likely to be noticed by the person (or script) 63 * running lofiadm. Another is that it would be a security problem if the 64 * file was replaced by another one after being added but before being opened. 65 * 66 * The only hard part about lofi is the ioctls. In order to support things 67 * like 'newfs' on a lofi device, it needs to support certain disk ioctls. 68 * So it has to fake disk geometry and partition information. More may need 69 * to be faked if your favorite utility doesn't work and you think it should 70 * (fdformat doesn't work because it really wants to know the type of floppy 71 * controller to talk to, and that didn't seem easy to fake. Or possibly even 72 * necessary, since we have mkfs_pcfs now). 73 * 74 * Normally, a lofi device cannot be detached if it is open (i.e. busy). To 75 * support simulation of hotplug events, an optional force flag is provided. 
76 * If a lofi device is open when a force detach is requested, then the 77 * underlying file is closed and any subsequent operations return EIO. When the 78 * device is closed for the last time, it will be cleaned up at that time. In 79 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is 80 * detached but not removed. 81 * 82 * If detach was requested and lofi device is not open, we will perform 83 * unmap and remove the lofi instance. 84 * 85 * If the lofi device is open and the li_cleanup is set on ioctl request, 86 * we set ls_cleanup flag to notify the cleanup is requested, and the 87 * last lofi_close will perform the unmapping and this lofi instance will be 88 * removed. 89 * 90 * If the lofi device is open and the li_force is set on ioctl request, 91 * we set ls_cleanup flag to notify the cleanup is requested, 92 * we also set ls_vp_closereq to notify IO tasks to return EIO on new 93 * IO requests and wait in process IO count to become 0, indicating there 94 * are no more IO requests. Since ls_cleanup is set, the last lofi_close 95 * will perform unmap and this lofi instance will be removed. 96 * See also lofi_unmap_file() for details. 97 * 98 * Once ls_cleanup is set for the instance, we do not allow lofi_open() 99 * calls to succeed and can have last lofi_close() to remove the instance. 100 * 101 * Known problems: 102 * 103 * UFS logging. Mounting a UFS filesystem image "logging" 104 * works for basic copy testing but wedges during a build of ON through 105 * that image. Some deadlock in lufs holding the log mutex and then 106 * getting stuck on a buf. So for now, don't do that. 107 * 108 * Direct I/O. Since the filesystem data is being cached in the buffer 109 * cache, _and_ again in the underlying filesystem, it's tempting to 110 * enable direct I/O on the underlying file. Don't, because that deadlocks. 111 * I think to fix the cache-twice problem we might need filesystem support. 
112 * 113 * Interesting things to do: 114 * 115 * Allow multiple files for each device. A poor-man's metadisk, basically. 116 * 117 * Pass-through ioctls on block devices. You can (though it's not 118 * documented), give lofi a block device as a file name. Then we shouldn't 119 * need to fake a geometry, however, it may be relevant if you're replacing 120 * metadisk, or using lofi to get crypto. 121 * It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1 122 * and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home. 123 * In fact this even makes sense if you have lofi "above" metadisk. 124 * 125 * Encryption: 126 * Each lofi device can have its own symmetric key and cipher. 127 * They are passed to us by lofiadm(1m) in the correct format for use 128 * with the misc/kcf crypto_* routines. 129 * 130 * Each block has its own IV, that is calculated in lofi_blk_mech(), based 131 * on the "master" key held in the lsp and the block number of the buffer. 132 */ 133 134 #include <sys/types.h> 135 #include <netinet/in.h> 136 #include <sys/sysmacros.h> 137 #include <sys/uio.h> 138 #include <sys/kmem.h> 139 #include <sys/cred.h> 140 #include <sys/mman.h> 141 #include <sys/errno.h> 142 #include <sys/aio_req.h> 143 #include <sys/stat.h> 144 #include <sys/file.h> 145 #include <sys/modctl.h> 146 #include <sys/conf.h> 147 #include <sys/debug.h> 148 #include <sys/vnode.h> 149 #include <sys/lofi.h> 150 #include <sys/lofi_impl.h> /* for cache structure */ 151 #include <sys/fcntl.h> 152 #include <sys/pathname.h> 153 #include <sys/filio.h> 154 #include <sys/fdio.h> 155 #include <sys/open.h> 156 #include <sys/disp.h> 157 #include <vm/seg_map.h> 158 #include <sys/ddi.h> 159 #include <sys/sunddi.h> 160 #include <sys/zmod.h> 161 #include <sys/id_space.h> 162 #include <sys/mkdev.h> 163 #include <sys/crypto/common.h> 164 #include <sys/crypto/api.h> 165 #include <sys/rctl.h> 166 #include <sys/vtoc.h> 167 #include <sys/scsi/scsi.h> /* for DTYPE_DIRECT */ 168 
#include <sys/scsi/impl/uscsi.h>
#include <sys/sysevent/dev.h>
#include <LzmaDec.h>

/* Property names exported on each mapped device node. */
#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
#define	ZONE_PROP_NAME		"zone"

/* Describe the raw buffer (buf, len) as a crypto_data_t for KCF calls. */
#define	SETUP_C_DATA(cd, buf, len)	\
	(cd).cd_format = CRYPTO_DATA_RAW;	\
	(cd).cd_offset = 0;	\
	(cd).cd_miscdata = NULL;	\
	(cd).cd_length = (len);	\
	(cd).cd_raw.iov_base = (buf);	\
	(cd).cd_raw.iov_len = (len);

/*
 * Fail with EINVAL unless the uio offset and length are both whole
 * DEV_BSIZE blocks.  NOTE: this macro returns from the calling function.
 */
#define	UIO_CHECK(uio)	\
	if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
	    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
		return (EINVAL); \
	}

#define	LOFI_TIMEOUT	30

static void *lofi_statep;		/* ddi_soft_state(9F) handle */
static kmutex_t lofi_lock;		/* state lock */
static id_space_t *lofi_id;		/* lofi ID values */
static list_t lofi_list;		/* all mapped lofi instances */
static zone_key_t lofi_zone_key;	/* key for zone shutdown callback */

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high. If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc(). That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size. We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;

/*
 * To avoid decompressing data in a compressed segment multiple times
 * when accessing small parts of a segment's data, we cache and reuse
 * the uncompressed segment's data.
 *
 * A single cached segment is sufficient to avoid lots of duplicate
 * segment decompress operations. A small cache size also reduces the
 * memory footprint.
 *
 * lofi_max_comp_cache is the maximum number of decompressed data segments
 * cached for each compressed lofi image. It can be set to 0 to disable
 * caching.
 */

uint32_t lofi_max_comp_cache = 1;

static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level);

/*
 * Decompression handlers, indexed by the algorithm recorded in a
 * compressed image; the level field is passed through to the handler.
 */
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};

static void lofi_strategy_task(void *);
static int lofi_tg_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t,
    size_t, void *);
static int lofi_tg_getinfo(dev_info_t *, int, void *, void *);

/* Target-ops vector handed to cmlb for labeled (virtual disk) mappings. */
struct cmlb_tg_ops lofi_tg_ops = {
	TG_DK_OPS_VERSION_1,
	lofi_tg_rdwr,
	lofi_tg_getinfo
};

/*
 * Allocator callbacks used by the LZMA decoder (ISzAlloc interface).
 */
/*ARGSUSED*/
static void
*SzAlloc(void *p, size_t size)
{
	return (kmem_alloc(size, KM_SLEEP));
}

/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	kmem_free(address, size);
}

static ISzAlloc g_Alloc = { SzAlloc, SzFree };

/*
 * Free data referenced by the linked list of cached uncompressed
 * segments.
 */
static void
lofi_free_comp_cache(struct lofi_state *lsp)
{
	struct lofi_comp_cache *lc;

	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}
	ASSERT(lsp->ls_comp_cache_count == 0);
}

/*
 * Return non-zero if any regular or layered open is still outstanding
 * on this instance.  Caller must hold lofi_lock.
 */
static int
is_opened(struct lofi_state *lsp)
{
	int i;
	boolean_t last = B_TRUE;

	ASSERT(MUTEX_HELD(&lofi_lock));
	for (i = 0; i < LOFI_PART_MAX; i++) {
		if (lsp->ls_open_lyr[i]) {
			last = B_FALSE;
			break;
		}
	}

	for (i = 0; last && (i < OTYP_LYR); i++) {
		if (lsp->ls_open_reg[i]) {
			last = B_FALSE;
		}
	}

	return (!last);
}

/*
 * Mark the instance for destruction on last close (see the theory
 * statement at the top of this file).  Caller must hold lofi_lock.
 */
static void
lofi_set_cleanup(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	lsp->ls_cleanup = B_TRUE;

	/* wake up any threads waiting on dkiocstate */
	cv_broadcast(&lsp->ls_vp_cv);
}

/*
 * Release the key, mechanism parameters and lock of an encrypted
 * mapping.  Caller must hold lofi_lock.
 */
static void
lofi_free_crypto(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_crypto_enabled) {
		/*
		 * Clean up the crypto state so that it doesn't hang around
		 * in memory after we are done with it.
		 */
		if (lsp->ls_key.ck_data != NULL) {
			/* zeroize the key material before freeing it */
			bzero(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			kmem_free(lsp->ls_key.ck_data,
			    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
			lsp->ls_key.ck_data = NULL;
			lsp->ls_key.ck_length = 0;
		}

		if (lsp->ls_mech.cm_param != NULL) {
			kmem_free(lsp->ls_mech.cm_param,
			    lsp->ls_mech.cm_param_len);
			lsp->ls_mech.cm_param = NULL;
			lsp->ls_mech.cm_param_len = 0;
		}

		if (lsp->ls_iv_mech.cm_param != NULL) {
			kmem_free(lsp->ls_iv_mech.cm_param,
			    lsp->ls_iv_mech.cm_param_len);
			lsp->ls_iv_mech.cm_param = NULL;
			lsp->ls_iv_mech.cm_param_len = 0;
		}

		mutex_destroy(&lsp->ls_crypto_lock);
	}
}

/*
 * cmlb tg_rdwr entry point: synchronous whole-block read/write for the
 * labeled (virtual disk) path.  The transfer is dispatched to the
 * per-instance taskq and this thread waits for completion via biowait().
 */
/* ARGSUSED */
static int
lofi_tg_rdwr(dev_info_t *dip, uchar_t cmd, void *bufaddr, diskaddr_t start,
    size_t length, void *tg_cookie)
{
	struct lofi_state *lsp;
	buf_t	*bp;
	int	instance;
	int	rv = 0;

	instance = ddi_get_instance(dip);
	if (instance == 0) /* control node does not have disk */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	if (cmd != TG_READ && cmd != TG_WRITE)
		return (EINVAL);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	if (P2PHASE(length, (1U << lsp->ls_lbshift)) != 0) {
		/* We can only transfer whole blocks at a time!
		 */
		return (EINVAL);
	}

	bp = getrbuf(KM_SLEEP);

	if (cmd == TG_READ) {
		bp->b_flags = B_READ;
	} else {
		/* refuse writes on read-only mappings */
		if (lsp->ls_readonly == B_TRUE) {
			freerbuf(bp);
			return (EROFS);
		}
		bp->b_flags = B_WRITE;
	}

	bp->b_un.b_addr = bufaddr;
	bp->b_bcount = length;
	bp->b_lblkno = start;
	bp->b_private = NULL;
	bp->b_edev = lsp->ls_dev;

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	(void) biowait(bp);

	rv = geterror(bp);
	freerbuf(bp);
	return (rv);
}

/*
 * Get device geometry info for cmlb.
 *
 * We have mapped disk image as virtual block device and have to report
 * physical/virtual geometry to cmlb.
 *
 * So we have two principal cases:
 * 1. Uninitialised image without any existing labels,
 *    for this case we fabricate the data based on mapped image.
 * 2. Image with existing label information.
 *    Since we have no information how the image was created (it may be
 *    dump from some physical device), we need to rely on label information
 *    from image, or we get "corrupted label" errors.
 *    NOTE: label can be MBR, MBR+SMI, GPT
 */
static int
lofi_tg_getinfo(dev_info_t *dip, int cmd, void *arg, void *tg_cookie)
{
	struct lofi_state *lsp;
	int instance;
	int ashift;

	_NOTE(ARGUNUSED(tg_cookie));
	instance = ddi_get_instance(dip);
	if (instance == 0) /* control device has no storage */
		return (ENXIO);

	lsp = ddi_get_soft_state(lofi_statep, instance);

	if (lsp == NULL)
		return (ENXIO);

	/*
	 * Make sure the mapping is set up by checking lsp->ls_vp_ready.
	 *
	 * When mapping is created, new lofi instance is created and
	 * lofi_attach() will call cmlb_attach() as part of the procedure
	 * to set the mapping up. This chain of events will happen in
	 * the same thread.
	 * Since cmlb_attach() will call lofi_tg_getinfo to get
	 * capacity, we return error on that call if cookie is set,
	 * otherwise lofi_attach will be stuck as the mapping is not yet
	 * finalized and lofi is not yet ready.
	 * Note, such error is not fatal for cmlb, as the label setup
	 * will be finalized when cmlb_validate() is called.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (tg_cookie != NULL && lsp->ls_vp_ready == B_FALSE) {
		mutex_exit(&lsp->ls_vp_lock);
		return (ENXIO);
	}
	while (lsp->ls_vp_ready == B_FALSE)
		cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
	mutex_exit(&lsp->ls_vp_lock);

	ashift = lsp->ls_lbshift;

	switch (cmd) {
	case TG_GETPHYGEOM: {
		cmlb_geom_t *geomp = arg;

		/* capacity in (1 << ashift)-byte blocks, minus crypto header */
		geomp->g_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		geomp->g_nsect = lsp->ls_dkg.dkg_nsect;
		geomp->g_nhead = lsp->ls_dkg.dkg_nhead;
		geomp->g_acyl = lsp->ls_dkg.dkg_acyl;
		geomp->g_ncyl = lsp->ls_dkg.dkg_ncyl;
		geomp->g_secsize = (1U << ashift);
		geomp->g_intrlv = lsp->ls_dkg.dkg_intrlv;
		geomp->g_rpm = lsp->ls_dkg.dkg_rpm;
		return (0);
	}

	case TG_GETCAPACITY:
		*(diskaddr_t *)arg =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> ashift;
		return (0);

	case TG_GETBLOCKSIZE:
		*(uint32_t *)arg = (1U << ashift);
		return (0);

	case TG_GETATTR: {
		tg_attribute_t *tgattr = arg;

		tgattr->media_is_writable = !lsp->ls_readonly;
		tgattr->media_is_solid_state = B_FALSE;
		return (0);
	}

	default:
		return (EINVAL);
	}
}

/*
 * Tear down a lofi instance: drain the taskq, release crypto state,
 * compression buffers and caches, close the backing vnode, and free the
 * device node and id.  Caller must hold lofi_lock.
 */
static void
lofi_destroy(struct lofi_state *lsp, cred_t *credp)
{
	int id = LOFI_MINOR2ID(getminor(lsp->ls_dev));
	int i;

	ASSERT(MUTEX_HELD(&lofi_lock));

	/*
	 * Before we can start to release the other resources,
	 * make sure we have all tasks completed and taskq removed.
	 */
	if (lsp->ls_taskq != NULL) {
		taskq_destroy(lsp->ls_taskq);
		lsp->ls_taskq = NULL;
	}

	list_remove(&lofi_list, lsp);

	lofi_free_crypto(lsp);

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
	}

	if (lsp->ls_vp != NULL) {
		/* flush cached pages before closing the backing file */
		(void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL);
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
		VN_RELE(lsp->ls_vp);
	}
	if (lsp->ls_stacked_vp != lsp->ls_vp)
		VN_RELE(lsp->ls_stacked_vp);
	lsp->ls_vp = lsp->ls_stacked_vp = NULL;

	if (lsp->ls_kstat != NULL) {
		kstat_delete(lsp->ls_kstat);
		lsp->ls_kstat = NULL;
	}

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);

	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	rctl_decr_lofi(lsp->ls_zone.zref_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);

	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
	lsp->ls_vp_ready = B_FALSE;
	lsp->ls_vp_closereq = B_FALSE;

	ASSERT(ddi_get_soft_state(lofi_statep, id) == lsp);
	(void) ndi_devi_offline(lsp->ls_dip, NDI_DEVI_REMOVE);
	id_free(lofi_id, id);
}

/*
 * Remove the cmlb attachment, device properties and minor nodes of an
 * instance.  Caller must hold lofi_lock.
 */
static void
lofi_free_dev(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));

	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_invalidate(lsp->ls_cmlbhandle, 0);
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
		lsp->ls_cmlbhandle = NULL;
	}
	(void) ddi_prop_remove_all(lsp->ls_dip);
	ddi_remove_minor_node(lsp->ls_dip, NULL);
}

/*
 * Zone shutdown callback: destroy (or mark for cleanup-on-last-close)
 * every lofi instance belonging to the zone that is going away.
 */
/*ARGSUSED*/
static void
lofi_zone_shutdown(zoneid_t zoneid, void *arg)
{
	struct lofi_state *lsp;
	struct lofi_state *next;

	mutex_enter(&lofi_lock);

	for (lsp = list_head(&lofi_list); lsp != NULL; lsp = next) {

		/* lofi_destroy() frees lsp */
		next = list_next(&lofi_list, lsp);

		if (lsp->ls_zone.zref_zone->zone_id != zoneid)
			continue;

		/*
		 * No in-zone processes are running, but something has this
		 * open. It's either a global zone process, or a lofi
		 * mount. In either case we set ls_cleanup so the last
		 * user destroys the device.
		 */
		if (is_opened(lsp)) {
			lofi_set_cleanup(lsp);
		} else {
			lofi_free_dev(lsp);
			lofi_destroy(lsp, kcred);
		}
	}

	mutex_exit(&lofi_lock);
}

/*
 * open(9E) entry point.  Minor 0 is the lofictl control node; any other
 * minor must refer to an existing mapping that is not being torn down.
 */
/*ARGSUSED*/
static int
lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
{
	int	id;
	minor_t	part;
	uint64_t mask;
	diskaddr_t nblks;
	diskaddr_t lba;
	boolean_t ndelay;

	struct lofi_state *lsp;

	if (otyp >= OTYPCNT)
		return (EINVAL);

	ndelay = (flag & (FNDELAY | FNONBLOCK)) ? B_TRUE : B_FALSE;

	/*
	 * lofiadm -a /dev/lofi/1 gets us here.
	 */
	if (mutex_owner(&lofi_lock) == curthread)
		return (EINVAL);

	mutex_enter(&lofi_lock);

	id = LOFI_MINOR2ID(getminor(*devp));
	part = LOFI_PART(getminor(*devp));
	mask = (1U << part);

	/* master control device */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	/* otherwise, the mapping should already exist */
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	/* instance is on its way out; do not admit new opens */
	if (lsp->ls_cleanup == B_TRUE) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	/* backing vnode already force-closed */
	if (lsp->ls_vp == NULL) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}

	if (lsp->ls_readonly && (flag & FWRITE)) {
		mutex_exit(&lofi_lock);
		return (EROFS);
	}

	/* partition already held exclusively by someone else */
	if ((lsp->ls_open_excl) & (mask)) {
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	if (flag & FEXCL) {
		/* exclusive open requires no opens of any type on this part */
		if (lsp->ls_open_lyr[part]) {
			mutex_exit(&lofi_lock);
			return (EBUSY);
		}
		for (int i = 0; i < OTYP_LYR; i++) {
			if (lsp->ls_open_reg[i] & mask) {
				mutex_exit(&lofi_lock);
				return (EBUSY);
			}
		}
	}

	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_validate(lsp->ls_cmlbhandle, 0, 0) != 0) {
			/*
			 * non-blocking opens are allowed to succeed to
			 * support format and fdisk to create partitioning.
			 */
			if (!ndelay) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &nblks, &lba,
		    NULL, NULL, 0) == 0) {
			/* empty partition only opens O_NDELAY via char dev */
			if ((!nblks) && ((!ndelay) || (otyp != OTYP_CHR))) {
				mutex_exit(&lofi_lock);
				return (ENXIO);
			}
		} else if (!ndelay) {
			mutex_exit(&lofi_lock);
			return (ENXIO);
		}
	}

	/* record the open: per-partition count for layered, bitmask else */
	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]++;
	} else {
		lsp->ls_open_reg[otyp] |= mask;
	}
	if (flag & FEXCL) {
		lsp->ls_open_excl |= mask;
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * close(9E) entry point.  Drops the open bookkeeping for this minor and,
 * on the last close of an instance marked for cleanup (or whose backing
 * vnode was force-closed), destroys the instance.
 */
/*ARGSUSED*/
static int
lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
{
	minor_t	part;
	int	id;
	uint64_t mask;
	struct lofi_state *lsp;

	id = LOFI_MINOR2ID(getminor(dev));
	part = LOFI_PART(getminor(dev));
	mask = (1U << part);

	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL) {
		mutex_exit(&lofi_lock);
		return (EINVAL);
	}

	/* master control device needs no bookkeeping */
	if (id == 0) {
		mutex_exit(&lofi_lock);
		return (0);
	}

	if (lsp->ls_open_excl & mask)
		lsp->ls_open_excl &= ~mask;

	if (otyp == OTYP_LYR) {
		lsp->ls_open_lyr[part]--;
	} else {
		lsp->ls_open_reg[otyp] &= ~mask;
	}

	/*
	 * If we forcibly closed the underlying device (li_force), or
	 * asked for cleanup (li_cleanup), finish up if we're the last
	 * out of the door.
	 */
	if (!is_opened(lsp) &&
	    (lsp->ls_cleanup == B_TRUE || lsp->ls_vp == NULL)) {
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

	mutex_exit(&lofi_lock);
	return (0);
}

/*
 * Sets the mechanism's initialization vector (IV) if one is needed.
 * The IV is computed from the data block number.  lsp->ls_mech is
 * altered so that:
 *	lsp->ls_mech.cm_param_len is set to the IV len.
 *	lsp->ls_mech.cm_param is set to the IV.
 */
static int
lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
{
	int	ret;
	crypto_data_t cdata;
	char	*iv;
	size_t	iv_len;
	size_t	min;
	void	*data;
	size_t	datasz;

	ASSERT(MUTEX_HELD(&lsp->ls_crypto_lock));

	/*
	 * NOTE(review): this check appears unreachable -- the ASSERT above
	 * already implies a valid lsp; confirm before removing.
	 */
	if (lsp == NULL)
		return (CRYPTO_DEVICE_ERROR);

	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
	if (lsp->ls_iv_type == IVM_NONE) {
		return (CRYPTO_SUCCESS);
	}

	/*
	 * if kmem already alloced from previous call and it's the same size
	 * we need now, just recycle it; allocate new kmem only if we have to
	 */
	if (lsp->ls_mech.cm_param == NULL ||
	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
		iv_len = lsp->ls_iv_len;
		iv = kmem_zalloc(iv_len, KM_SLEEP);
	} else {
		iv_len = lsp->ls_mech.cm_param_len;
		iv = lsp->ls_mech.cm_param;
		bzero(iv, iv_len);
	}

	switch (lsp->ls_iv_type) {
	case IVM_ENC_BLKNO:
		/* iv is not static, lblkno changes each time */
		data = &lblkno;
		datasz = sizeof (lblkno);
		break;
	default:
		data = 0;
		datasz = 0;
		break;
	}

	/*
	 * write blkno into the iv buffer padded on the left in case
	 * blkno ever grows bigger than its current longlong_t size
	 * or a variation other than blkno is used for the iv data
	 */
	min = MIN(datasz, iv_len);
	bcopy(data, iv + (iv_len - min), min);

	/* encrypt the data in-place to get the IV */
	SETUP_C_DATA(cdata, iv, iv_len);

	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
	    NULL, NULL, NULL);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
		    lblkno, ret);
		if (lsp->ls_mech.cm_param != iv)
			kmem_free(iv, iv_len);

		return (ret);
	}

	/* clean up the iv from the last computation */
	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
		kmem_free(lsp->ls_mech.cm_param,
		    lsp->ls_mech.cm_param_len);

	lsp->ls_mech.cm_param_len = iv_len;
	lsp->ls_mech.cm_param = iv;

	return (CRYPTO_SUCCESS);
}

/*
 * Performs encryption and decryption of a chunk of data of size "len",
 * one DEV_BSIZE block at a time.  "len" is assumed to be a multiple of
 * DEV_BSIZE.
 */
static int
lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
    caddr_t ciphertext, size_t len, boolean_t op_encrypt)
{
	crypto_data_t cdata;
	crypto_data_t wdata;
	int ret;
	longlong_t lblkno = bp->b_lblkno;

	mutex_enter(&lsp->ls_crypto_lock);

	/*
	 * though we could encrypt/decrypt entire "len" chunk of data, we need
	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
	 */
	SETUP_C_DATA(cdata, plaintext, len);
	cdata.cd_length = DEV_BSIZE;
	if (ciphertext != NULL) {	/* not in-place crypto */
		SETUP_C_DATA(wdata, ciphertext, len);
		wdata.cd_length = DEV_BSIZE;
	}

	do {
		/* refresh the per-block IV before each block */
		ret = lofi_blk_mech(lsp, lblkno);
		if (ret != CRYPTO_SUCCESS)
			continue;

		if (op_encrypt) {
			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		} else {
			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
			    &lsp->ls_key, NULL,
			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
		}

		cdata.cd_offset += DEV_BSIZE;
		if (ciphertext != NULL)
			wdata.cd_offset += DEV_BSIZE;
		lblkno++;
	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);

	mutex_exit(&lsp->ls_crypto_lock);

	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "%s failed for block %lld: (0x%x)",
		    op_encrypt ?
		    "crypto_encrypt()" : "crypto_decrypt()",
		    lblkno, ret);
	}

	return (ret);
}

#define	RDWR_RAW	1
#define	RDWR_BCOPY	2

/*
 * Common read/write engine for both plain and encrypted mappings.
 * "method" selects vn_rdwr() on the backing vnode (RDWR_RAW) or a bcopy
 * to/from a segmap window (RDWR_BCOPY, see lofi_mapped_rdwr()).
 */
static int
lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
{
	ssize_t	resid;
	int	isread;
	int	error;

	/*
	 * Handles reads/writes for both plain and encrypted lofi
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */

	isread = bp->b_flags & B_READ;
	if (isread) {
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(bcopy_locn, bufaddr, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if (lsp->ls_crypto_enabled && error == 0) {
			/* decrypt in place after the read */
			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
			    B_FALSE) != CRYPTO_SUCCESS) {
				/*
				 * XXX: original code didn't set residual
				 * back to len because no error was expected
				 * from bcopy() if encryption is not enabled
				 */
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				error = EIO;
			}
		}
		return (error);
	} else {
		void *iobuf = bufaddr;

		if (lsp->ls_crypto_enabled) {
			/* don't do in-place crypto to keep bufaddr intact */
			iobuf = kmem_alloc(len, KM_SLEEP);
			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
			    B_TRUE) != CRYPTO_SUCCESS) {
				kmem_free(iobuf, len);
				if (method != RDWR_BCOPY)
					bp->b_resid = len;
				return (EIO);
			}
		}
		if (method == RDWR_BCOPY) {
			/* DO NOT update bp->b_resid for bcopy */
			bcopy(iobuf, bcopy_locn, len);
			error = 0;
		} else {		/* RDWR_RAW */
			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);
			bp->b_resid = resid;
		}
		if
		    (lsp->ls_crypto_enabled) {
			kmem_free(iobuf, len);
		}
		return (error);
	}
}

/*
 * Read/write through a segmap window of the backing vnode, up to
 * MAXBSIZE at a time; used where the file is accessed via the page
 * cache rather than direct vn_rdwr().
 */
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int	error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |       len
	 *    v    v        v
	 * ===|====X========|====...======|========X====|====
	 *	   /-------------...---------------/
	 *		^ bp->b_bcount/bp->b_resid at start
	 *	  /----/--------/----...------/--------/
	 *	  ^    ^        ^             ^        ^
	 *	  |    |        |             |        nth xfersize (<= MAXBSIZE)
	 *	  |    |        2nd thru n-1st xfersize (= MAXBSIZE)
	 *	  |    1st xfersize (<= MAXBSIZE)
	 *	  mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		/* cap each pass at file end, segmap window end, and b_resid */
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/* error may be non-zero for encrypted lofi */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}

/*
 * Check if segment seg_index is present in the decompressed segment
 * data cache.
 *
 * Returns a pointer to the decompressed segment data cache entry if
 * found, and NULL when decompressed data for this segment is not yet
 * cached.
 *
 * Caller must hold ls_comp_cache_lock.
 */
static struct lofi_comp_cache *
lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
	    lc = list_next(&lsp->ls_comp_cache, lc)) {
		if (lc->lc_index == seg_index) {
			/*
			 * Decompressed segment data was found in the
			 * cache.
			 *
			 * The cache uses an LRU replacement strategy;
			 * move the entry to head of list.
			 */
			list_remove(&lsp->ls_comp_cache, lc);
			list_insert_head(&lsp->ls_comp_cache, lc);
			return (lc);
		}
	}
	return (NULL);
}

/*
 * Add the data for a decompressed segment at segment index
 * seg_index to the cache of the decompressed segments.
 *
 * Returns a pointer to the cache element structure in case
 * the data was added to the cache; returns NULL when the data
 * wasn't cached.
 */
static struct lofi_comp_cache *
lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
    uchar_t *data)
{
	struct lofi_comp_cache *lc;

	ASSERT(MUTEX_HELD(&lsp->ls_comp_cache_lock));

	/* shrink the cache if the tunable was lowered at runtime */
	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		ASSERT(lc != NULL);
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
		kmem_free(lc, sizeof (struct lofi_comp_cache));
		lsp->ls_comp_cache_count--;
	}

	/*
	 * Do not cache when disabled by tunable variable
	 */
	if (lofi_max_comp_cache == 0)
		return (NULL);

	/*
	 * When the cache has not yet reached the maximum allowed
	 * number of segments, allocate a new cache element.
	 * Otherwise the cache is full; reuse the last list element
	 * (LRU) for caching the decompressed segment data.
	 *
	 * The cache element for the new decompressed segment data is
	 * added to the head of the list.
	 */
	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
		lc->lc_data = NULL;
		list_insert_head(&lsp->ls_comp_cache, lc);
		lsp->ls_comp_cache_count++;
	} else {
		lc = list_remove_tail(&lsp->ls_comp_cache);
		if (lc == NULL)
			return (NULL);
		list_insert_head(&lsp->ls_comp_cache, lc);
	}

	/*
	 * Free old uncompressed segment data when reusing a cache
	 * entry.
	 */
	if (lc->lc_data != NULL)
		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);

	/* the cache takes ownership of 'data'; the caller must not free it */
	lc->lc_data = data;
	lc->lc_index = seg_index;
	return (lc);
}


/*
 * Decompress a gzip'd segment via zlib.  Returns 0 on success and -1
 * on any decompression failure; 'level' is unused on this path.
 */
/*ARGSUSED*/
static int
gzip_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	ASSERT(*dstlen >= srclen);

	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
		return (-1);
	return (0);
}

/* LZMA properties block followed by an 8-byte field (.lzma header layout) */
#define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)

/*
 * Decompress an LZMA segment.  The payload follows an LZMA_HEADER_SIZE
 * header whose leading LZMA_PROPS_SIZE bytes carry the decoder
 * properties.  Returns 0 on success, -1 on failure; 'level' is unused.
 */
/*ARGSUSED*/
static int
lzma_decompress(void *src, size_t srclen, void *dst,
    size_t *dstlen, int level)
{
	size_t insizepure;
	void *actual_src;
	ELzmaStatus status;

	insizepure = srclen - LZMA_HEADER_SIZE;
	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);

	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
	    (const Byte *)actual_src, &insizepure,
	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
	    &g_Alloc) != SZ_OK) {
		return (-1);
	}
	return (0);
}

/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error;
	int syncflag = 0;
	struct lofi_state *lsp;
	offset_t offset;
	caddr_t	bufaddr;
	size_t	len;
	size_t	xfersize;
	boolean_t bufinited = B_FALSE;

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));

	if (lsp == NULL) {
		error = ENXIO;
		goto errout;
	}
	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	/*
	 * Count this I/O as in-flight; the matching decrement at the
	 * bottom signals ls_vp_cv when the count drains to zero.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_iocount++;
	mutex_exit(&lsp->ls_vp_lock);

	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = (bp->b_lblkno + (diskaddr_t)(uintptr_t)bp->b_private)
	    << lsp->ls_lbshift;	/* offset within file */
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}
	len = bp->b_bcount;
	bufinited = B_TRUE;

	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		error = EIO;
		goto errout;
	}

	/*
	 * If we're writing and the buffer was not B_ASYNC
	 * we'll follow up with a VOP_FSYNC() to force any
	 * asynchronous I/O to stable storage.
	 */
	if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
		syncflag = FSYNC;

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mappable or it's a character device or it's an encrypted lofi.
	 */
	if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
	    lsp->ls_crypto_enabled) {
		error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
		    NULL);
	} else if (lsp->ls_uncomp_seg_sz == 0) {
		error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
	} else {
		uchar_t *compressed_seg = NULL, *cmpbuf;
		uchar_t *uncompressed_seg = NULL;
		lofi_compress_info_t *li;
		size_t oblkcount;
		ulong_t seglen;
		uint64_t sblkno, eblkno, cmpbytes;
		uint64_t uncompressed_seg_index;
		struct lofi_comp_cache *lc;
		offset_t sblkoff, eblkoff;
		u_offset_t salign, ealign;
		u_offset_t sdiff;
		uint32_t comp_data_sz;
		uint64_t i;
		int j;

		/*
		 * From here on we're dealing primarily with compressed files
		 */
		ASSERT(!lsp->ls_crypto_enabled);

		/*
		 * Compressed files can only be read from and
		 * not written to
		 */
		if (!(bp->b_flags & B_READ)) {
			bp->b_resid = bp->b_bcount;
			error = EROFS;
			goto done;
		}

		ASSERT(lsp->ls_comp_algorithm_index >= 0);
		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
		/*
		 * Compute starting and ending compressed segment numbers
		 * We use only bitwise operations avoiding division and
		 * modulus because we enforce the compression segment size
		 * to a power of 2
		 */
		sblkno = offset >> lsp->ls_comp_seg_shift;
		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);

		/*
		 * Check the decompressed segment cache.
		 *
		 * The cache is used only when the requested data
		 * is within a segment. Requests that cross
		 * segment boundaries bypass the cache.
		 */
		if (sblkno == eblkno ||
		    (sblkno + 1 == eblkno && eblkoff == 0)) {
			/*
			 * Request doesn't cross a segment boundary,
			 * now check the cache.
			 */
			mutex_enter(&lsp->ls_comp_cache_lock);
			lc = lofi_find_comp_data(lsp, sblkno);
			if (lc != NULL) {
				/*
				 * We've found the decompressed segment
				 * data in the cache; reuse it.
				 */
				bcopy(lc->lc_data + sblkoff, bufaddr,
				    bp->b_bcount);
				mutex_exit(&lsp->ls_comp_cache_lock);
				bp->b_resid = 0;
				error = 0;
				goto done;
			}
			mutex_exit(&lsp->ls_comp_cache_lock);
		}

		/*
		 * Align start offset to block boundary for segmap
		 */
		salign = lsp->ls_comp_seg_index[sblkno];
		sdiff = salign & (DEV_BSIZE - 1);
		salign -= sdiff;
		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
			/*
			 * We're dealing with the last segment of
			 * the compressed file -- the size of this
			 * segment *may not* be the same as the
			 * segment size for the file
			 */
			eblkoff = (offset + bp->b_bcount) &
			    (lsp->ls_uncomp_last_seg_sz - 1);
			ealign = lsp->ls_vp_comp_size;
		} else {
			ealign = lsp->ls_comp_seg_index[eblkno + 1];
		}

		/*
		 * Preserve original request parameters
		 */
		oblkcount = bp->b_bcount;

		/*
		 * Assign the calculated parameters
		 */
		comp_data_sz = ealign - salign;
		bp->b_bcount = comp_data_sz;

		/*
		 * Buffers to hold compressed segments are pre-allocated
		 * on a per-thread basis. Find a pre-allocated buffer
		 * that is not currently in use and mark it for use.
		 */
		mutex_enter(&lsp->ls_comp_bufs_lock);
		for (j = 0; j < lofi_taskq_nthreads; j++) {
			if (lsp->ls_comp_bufs[j].inuse == 0) {
				lsp->ls_comp_bufs[j].inuse = 1;
				break;
			}
		}

		mutex_exit(&lsp->ls_comp_bufs_lock);
		/*
		 * NOTE(review): if every buffer were in use, j would equal
		 * lofi_taskq_nthreads here and the array accesses below
		 * would run off the end on non-DEBUG kernels.  This
		 * presumably relies on there being one buffer per taskq
		 * thread so a free one always exists — confirm.
		 */
		ASSERT(j < lofi_taskq_nthreads);

		/*
		 * If the pre-allocated buffer size does not match
		 * the size of the I/O request, re-allocate it with
		 * the appropriate size
		 */
		if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
			if (lsp->ls_comp_bufs[j].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[j].buf,
				    lsp->ls_comp_bufs[j].bufsize);
			lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
			    KM_SLEEP);
			lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
		}
		compressed_seg = lsp->ls_comp_bufs[j].buf;

		/*
		 * Map in the calculated number of blocks
		 */
		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
		    bp, lsp);

		/* restore the caller's request parameters */
		bp->b_bcount = oblkcount;
		bp->b_resid = oblkcount;
		if (error != 0)
			goto done;

		/*
		 * decompress compressed blocks start
		 */
		cmpbuf = compressed_seg + sdiff;
		for (i = sblkno; i <= eblkno; i++) {
			ASSERT(i < lsp->ls_comp_index_sz - 1);
			uchar_t *useg;

			/*
			 * The last segment is special in that it is
			 * most likely not going to be the same
			 * (uncompressed) size as the other segments.
			 */
			if (i == (lsp->ls_comp_index_sz - 2)) {
				seglen = lsp->ls_uncomp_last_seg_sz;
			} else {
				seglen = lsp->ls_uncomp_seg_sz;
			}

			/*
			 * Each of the segment index entries contains
			 * the starting block number for that segment.
			 * The number of compressed bytes in a segment
			 * is thus the difference between the starting
			 * block number of this segment and the starting
			 * block number of the next segment.
			 */
			cmpbytes = lsp->ls_comp_seg_index[i + 1] -
			    lsp->ls_comp_seg_index[i];

			/*
			 * The first byte in a compressed segment is a flag
			 * that indicates whether this segment is compressed
			 * at all.
			 *
			 * The variable 'useg' is used (instead of
			 * uncompressed_seg) in this loop to keep a
			 * reference to the uncompressed segment.
			 *
			 * N.B. If 'useg' is replaced with uncompressed_seg,
			 * it leads to memory leaks and heap corruption in
			 * corner cases where compressed segments lie
			 * adjacent to uncompressed segments.
			 */
			if (*cmpbuf == UNCOMPRESSED) {
				useg = cmpbuf + SEGHDR;
			} else {
				if (uncompressed_seg == NULL)
					uncompressed_seg =
					    kmem_alloc(lsp->ls_uncomp_seg_sz,
					    KM_SLEEP);
				useg = uncompressed_seg;
				uncompressed_seg_index = i;

				if (li->l_decompress((cmpbuf + SEGHDR),
				    (cmpbytes - SEGHDR), uncompressed_seg,
				    &seglen, li->l_level) != 0) {
					error = EIO;
					goto done;
				}
			}

			/*
			 * Determine how much uncompressed data we
			 * have to copy and copy it
			 */
			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
			if (i == eblkno)
				xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);

			bcopy((useg + sblkoff), bufaddr, xfersize);

			cmpbuf += cmpbytes;
			bufaddr += xfersize;
			bp->b_resid -= xfersize;
			sblkoff = 0;	/* only the first segment is partial */

			if (bp->b_resid == 0)
				break;
		} /* decompress compressed blocks ends */

		/*
		 * Skip to done if there is no uncompressed data to cache
		 */
		if (uncompressed_seg == NULL)
			goto done;

		/*
		 * Add the data for the last decompressed segment to
		 * the cache.
		 *
		 * In case the uncompressed segment data was added to (and
		 * is referenced by) the cache, make sure we don't free it
		 * here.
		 */
		mutex_enter(&lsp->ls_comp_cache_lock);
		if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
		    uncompressed_seg)) != NULL) {
			uncompressed_seg = NULL;
		}
		mutex_exit(&lsp->ls_comp_cache_lock);

done:
		if (compressed_seg != NULL) {
			/* return the per-thread buffer to the pool */
			mutex_enter(&lsp->ls_comp_bufs_lock);
			lsp->ls_comp_bufs[j].inuse = 0;
			mutex_exit(&lsp->ls_comp_bufs_lock);
		}
		if (uncompressed_seg != NULL)
			kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
	} /* end of handling compressed files */

	if ((error == 0) && (syncflag != 0))
		error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);

errout:
	if (bufinited && lsp->ls_kstat) {
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}

	/*
	 * NOTE(review): if lsp were NULL we would reach here via errout and
	 * dereference it below; this looks like it relies on lofi_strategy()
	 * having already validated the minor before dispatch — confirm.
	 *
	 * Let a pending close/detach know the in-flight I/O has drained.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	if (--lsp->ls_vp_iocount == 0)
		cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);

	bioerror(bp, error);
	biodone(bp);
}

/*
 * Block device strategy(9E) entry point: validate the request against
 * the file/partition bounds, then hand it to the per-device taskq.
 * Always returns 0; errors are reported through bioerror()/biodone().
 */
static int
lofi_strategy(struct buf *bp)
{
	struct lofi_state *lsp;
	offset_t	offset;
	minor_t		part;
	diskaddr_t	p_lba;
	diskaddr_t	p_nblks;
	int	shift;

	/*
	 * We cannot just do I/O here, because the current thread
	 * _might_ end up back in here because the underlying filesystem
	 * wants a buffer, which eventually gets into bio_recycle and
	 * might call into lofi to write out a delayed-write buffer.
	 * This is bad if the filesystem above lofi is the same as below.
	 *
	 * We could come up with a complex strategy using threads to
	 * do the I/O asynchronously, or we could use task queues. task
	 * queues were incredibly easy so they win.
	 */

	lsp = ddi_get_soft_state(lofi_statep,
	    LOFI_MINOR2ID(getminor(bp->b_edev)));
	part = LOFI_PART(getminor(bp->b_edev));

	if (lsp == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	/* Check if we are closing. */
	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
		mutex_exit(&lsp->ls_vp_lock);
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}
	mutex_exit(&lsp->ls_vp_lock);

	shift = lsp->ls_lbshift;
	p_lba = 0;
	p_nblks = lsp->ls_vp_size >> shift;

	/* labeled (virtual disk) lofi: get partition geometry from cmlb */
	if (lsp->ls_cmlbhandle != NULL) {
		if (cmlb_partinfo(lsp->ls_cmlbhandle, part, &p_nblks, &p_lba,
		    NULL, NULL, 0)) {
			bioerror(bp, ENXIO);
			biodone(bp);
			return (0);
		}
	}

	/* start block past partition end? */
	if (bp->b_lblkno > p_nblks) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	offset = (bp->b_lblkno+p_lba) << shift;	/* offset within file */

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_crypto_enabled) {
		/* encrypted data really begins after crypto header */
		offset += lsp->ls_crypto_offset;
	}

	/* make sure we will not pass the file or partition size */
	if (offset == lsp->ls_vp_size ||
	    offset == (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) {
		/* EOF */
		if ((bp->b_flags & B_READ) != 0) {
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else {
			/* writes should fail */
			bioerror(bp, ENXIO);
		}
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}
	if ((offset > lsp->ls_vp_size) ||
	    (offset > (((p_lba + p_nblks) << shift) + lsp->ls_crypto_offset)) ||
	    ((offset + bp->b_bcount) > ((p_lba + p_nblks) << shift))) {
		bioerror(bp, ENXIO);
		biodone(bp);
		mutex_exit(&lsp->ls_vp_lock);
		return (0);
	}

	mutex_exit(&lsp->ls_vp_lock);

	if (lsp->ls_kstat) {
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp->b_private = (void *)(uintptr_t)p_lba;	/* partition start */
	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
	return (0);
}

/*
 * Character device read(9E): routed through the block strategy via
 * physio().  Minor 0 is the control device and does no I/O.
 */
/*ARGSUSED2*/
static int
lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * Character device write(9E): routed through the block strategy via
 * physio().  Minor 0 is the control device and does no I/O.
 */
/*ARGSUSED2*/
static int
lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(uio);
	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
}

/* Asynchronous read(9E) counterpart of lofi_read(). */
/*ARGSUSED2*/
static int
lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
}

/* Asynchronous write(9E) counterpart of lofi_write(). */
/*ARGSUSED2*/
static int
lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
{
	if (getminor(dev) == 0)
		return (EINVAL);
	UIO_CHECK(aio->aio_uio);
	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
}

/*
 * getinfo(9E): translate a dev_t into its dev_info pointer or
 * instance number.
 */
/*ARGSUSED*/
static int
lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	struct lofi_state *lsp;
	dev_t	dev = (dev_t)arg;
	int	instance;

	instance = LOFI_MINOR2ID(getminor(dev));
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		lsp = ddi_get_soft_state(lofi_statep, instance);
		if (lsp == NULL)
			return (DDI_FAILURE);
		*result = lsp->ls_dip;
		return (DDI_SUCCESS);
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)(intptr_t)instance;
		return (DDI_SUCCESS);
	}
	return (DDI_FAILURE);
}

/*
 * Create the minor nodes for a mapping: labeled mappings are handed to
 * cmlb (which creates the /dev/(r)dsk nodes), unlabeled ones get the
 * traditional block and character nodes under /dev/(r)lofi.
 * Returns 0 or ENXIO.
 */
static int
lofi_create_minor_nodes(struct lofi_state *lsp, boolean_t labeled)
{
	int error = 0;
	int instance = ddi_get_instance(lsp->ls_dip);

	if (labeled == B_TRUE) {
		cmlb_alloc_handle(&lsp->ls_cmlbhandle);
		error = cmlb_attach(lsp->ls_dip, &lofi_tg_ops, DTYPE_DIRECT,
		    B_FALSE, B_FALSE, DDI_NT_BLOCK_CHAN,
		    CMLB_CREATE_P0_MINOR_NODE, lsp->ls_cmlbhandle, (void *)1);

		if (error != DDI_SUCCESS) {
			cmlb_free_handle(&lsp->ls_cmlbhandle);
			lsp->ls_cmlbhandle = NULL;
			error = ENXIO;
		}
	} else {
		/* create minor nodes */
		error = ddi_create_minor_node(lsp->ls_dip, LOFI_BLOCK_NODE,
		    S_IFBLK, LOFI_ID2MINOR(instance), DDI_PSEUDO, 0);
		if (error == DDI_SUCCESS) {
			error = ddi_create_minor_node(lsp->ls_dip,
			    LOFI_CHAR_NODE, S_IFCHR, LOFI_ID2MINOR(instance),
			    DDI_PSEUDO, 0);
			if (error != DDI_SUCCESS) {
				/* undo the block node on partial failure */
				ddi_remove_minor_node(lsp->ls_dip,
				    LOFI_BLOCK_NODE);
				error = ENXIO;
			}
		} else
			error = ENXIO;
	}
	return (error);
}

/*
 * Charge this mapping against the zone's lofi resource control, tag
 * the device node with the zone name and take a zone reference.
 * Returns 0 or an errno; on failure the rctl count is not left
 * incremented.
 */
static int
lofi_zone_bind(struct lofi_state *lsp)
{
	int error = 0;

	mutex_enter(&curproc->p_lock);
	if ((error = rctl_incr_lofi(curproc, curproc->p_zone, 1)) != 0) {
		mutex_exit(&curproc->p_lock);
		return (error);
	}
	mutex_exit(&curproc->p_lock);

	if (ddi_prop_update_string(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME,
	    (char *)curproc->p_zone->zone_name) != DDI_PROP_SUCCESS) {
		rctl_decr_lofi(curproc->p_zone, 1);
		error = EINVAL;
	} else {
		zone_init_ref(&lsp->ls_zone);
		zone_hold_ref(curzone, &lsp->ls_zone, ZONE_REF_LOFI);
	}
	return (error);
}

/* Undo lofi_zone_bind(): remove the property, rctl count and zone ref. */
static void
lofi_zone_unbind(struct lofi_state *lsp)
{
	(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip, ZONE_PROP_NAME);
	rctl_decr_lofi(curproc->p_zone, 1);
	zone_rele_ref(&lsp->ls_zone, ZONE_REF_LOFI);
}

/*
 * Bring a non-control lofi instance online: allocate its soft state,
 * bind it to the zone, initialize its locks/cv, create minor nodes and
 * install the I/O kstat.  Returns DDI_SUCCESS, or an error with all
 * partially-created state torn down.
 */
static int
lofi_online_dev(dev_info_t *dip)
{
	boolean_t labeled;
	int	error;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	labeled = B_FALSE;
	if (ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "labeled"))
		labeled = B_TRUE;

	/* lsp alloc+init, soft state is freed in lofi_detach */
	error = ddi_soft_state_zalloc(lofi_statep, instance);
	if (error == DDI_FAILURE) {
		return (ENOMEM);
	}

	lsp = ddi_get_soft_state(lofi_statep, instance);
	lsp->ls_dip = dip;

	if ((error = lofi_zone_bind(lsp)) != 0)
		goto err;

	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);

	if ((error = lofi_create_minor_nodes(lsp, labeled)) != 0) {
		lofi_zone_unbind(lsp);
		goto lerr;
	}

	/* driver handles kernel-issued IOCTLs */
	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
		/*
		 * NOTE(review): this function otherwise returns errno
		 * values; here it returns DDI_FAILURE.  See the matching
		 * note in lofi_attach() — confirm the intended contract.
		 */
		error = DDI_FAILURE;
		goto merr;
	}

	lsp->ls_kstat = kstat_create_zone(LOFI_DRIVER_NAME, instance,
	    NULL, "disk", KSTAT_TYPE_IO, 1, 0, getzoneid());
	if (lsp->ls_kstat == NULL) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, lsp->ls_dip,
		    DDI_KERNEL_IOCTL);
		error = ENOMEM;
		goto merr;
	}

	lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
	kstat_zone_add(lsp->ls_kstat, GLOBAL_ZONEID);
	kstat_install(lsp->ls_kstat);
	return (DDI_SUCCESS);
merr:
	if (lsp->ls_cmlbhandle != NULL) {
		cmlb_detach(lsp->ls_cmlbhandle, 0);
		cmlb_free_handle(&lsp->ls_cmlbhandle);
	}
	ddi_remove_minor_node(dip, NULL);
	lofi_zone_unbind(lsp);
lerr:
	mutex_destroy(&lsp->ls_comp_cache_lock);
	mutex_destroy(&lsp->ls_comp_bufs_lock);
	mutex_destroy(&lsp->ls_kstat_lock);
	mutex_destroy(&lsp->ls_vp_lock);
	cv_destroy(&lsp->ls_vp_cv);
err:
	ddi_soft_state_free(lofi_statep, instance);
	return (error);
}

/*
 * attach(9E): instance 0 is the control device (/dev/lofictl); any
 * other instance represents a mapping and is set up by
 * lofi_online_dev().
 */
static int
lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int	rv;
	int	instance = ddi_get_instance(dip);
	struct lofi_state *lsp;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/*
	 * Instance 0 is control instance, attaching control instance
	 * will set the lofi up and ready.
	 */
	if (instance == 0) {
		rv = ddi_soft_state_zalloc(lofi_statep, 0);
		if (rv == DDI_FAILURE) {
			return (DDI_FAILURE);
		}
		lsp = ddi_get_soft_state(lofi_statep, instance);
		rv = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
		    DDI_PSEUDO, 0);
		if (rv == DDI_FAILURE) {
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}
		/* driver handles kernel-issued IOCTLs */
		if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
		    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
			ddi_remove_minor_node(dip, NULL);
			ddi_soft_state_free(lofi_statep, 0);
			return (DDI_FAILURE);
		}

		zone_key_create(&lofi_zone_key, NULL, lofi_zone_shutdown, NULL);

		lsp->ls_dip = dip;
	} else {
		/*
		 * NOTE(review): lofi_online_dev() can also fail with errno
		 * values (e.g. ENOMEM) that are not equal to DDI_FAILURE,
		 * in which case this comparison treats the failure as
		 * success — confirm before relying on the return value.
		 */
		if (lofi_online_dev(dip) == DDI_FAILURE)
			return (DDI_FAILURE);
	}

	ddi_report_dev(dip);
	return (DDI_SUCCESS);
}

/*
 * detach(9E): tear down a mapping instance, or — for instance 0 —
 * the control device itself once no mappings remain.
 */
static int
lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct lofi_state *lsp;
	int instance = ddi_get_instance(dip);

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/*
	 * If the instance is not 0, release state.
	 * The instance 0 is control device, we can not detach it
	 * before other instances are detached.
	 */
	if (instance != 0) {
		lsp = ddi_get_soft_state(lofi_statep, instance);
		/* only free instances whose mapping never became ready */
		if (lsp != NULL && lsp->ls_vp_ready == B_FALSE) {
			ddi_soft_state_free(lofi_statep, instance);
			return (DDI_SUCCESS);
		} else
			return (DDI_FAILURE);
	}
	mutex_enter(&lofi_lock);

	/* control instance: refuse to detach while any mapping exists */
	if (!list_is_empty(&lofi_list)) {
		mutex_exit(&lofi_lock);
		return (DDI_FAILURE);
	}

	ddi_remove_minor_node(dip, NULL);
	ddi_prop_remove_all(dip);

	mutex_exit(&lofi_lock);

	if (zone_key_delete(lofi_zone_key) != 0)
		cmn_err(CE_WARN, "failed to delete zone key");

	ddi_soft_state_free(lofi_statep, 0);

	return (DDI_SUCCESS);
}

/*
 * With the addition of encryption, we must be careful that encryption key is
 * wiped before kernel's data structures are freed so it cannot accidentally
 * slip out to userland through uninitialized data elsewhere.
 */
static void
free_lofi_ioctl(struct lofi_ioctl *klip)
{
	/* Make sure this encryption key doesn't stick around */
	bzero(klip->li_key, sizeof (klip->li_key));
	kmem_free(klip, sizeof (struct lofi_ioctl));
}

/*
 * These two functions simplify the rest of the ioctls that need to copyin/out
 * the lofi_ioctl structure.
 */
int
copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, struct lofi_ioctl **klipp,
    int flag)
{
	struct lofi_ioctl *klip;
	int	error;

	klip = *klipp = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
	if (error)
		goto err;

	/* ensure NULL termination */
	klip->li_filename[MAXPATHLEN-1] = '\0';
	klip->li_devpath[MAXPATHLEN-1] = '\0';
	klip->li_algorithm[MAXALGLEN-1] = '\0';
	klip->li_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';
	klip->li_iv_cipher[CRYPTO_MAX_MECH_NAME-1] = '\0';

	/* reject IDs that cannot be represented in the minor space */
	if (klip->li_id > L_MAXMIN32) {
		error = EINVAL;
		goto err;
	}

	return (0);

err:
	/* free_lofi_ioctl() also wipes the key material */
	free_lofi_ioctl(klip);
	return (error);
}

int
copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
    int flag)
{
	int	error;

	/*
	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
	 * This ensures that an attacker can't trivially find the
	 * key for a mapping just by issuing the ioctl.
	 *
	 * It can still be found by poking around in kmem with mdb(1),
	 * but there is no point in making it easy when the info isn't
	 * of any use in this direction anyway.
	 *
	 * Either way we don't actually have the raw key stored in
	 * a form that we can get it anyway, since we just used it
	 * to create a ctx template and didn't keep "the original".
	 */
	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
	if (error)
		return (EFAULT);
	return (0);
}

/*
 * Return 0 when the calling zone may access this mapping (global zone
 * sees everything), EPERM otherwise.
 */
static int
lofi_access(struct lofi_state *lsp)
{
	ASSERT(MUTEX_HELD(&lofi_lock));
	if (INGLOBALZONE(curproc) || lsp->ls_zone.zref_zone == curzone)
		return (0);
	return (EPERM);
}

/*
 * Find the lofi state for the given filename.
 * We compare by vnode to
 * allow the global zone visibility into NGZ lofi nodes.
 */
static int
file_to_lofi_nocheck(char *filename, boolean_t readonly,
    struct lofi_state **lspp)
{
	struct lofi_state *lsp;
	vnode_t *vp = NULL;
	int err = 0;
	int rdfiles = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = lookupname(filename, UIO_SYSSPACE, FOLLOW,
	    NULLVPP, &vp)) != 0)
		goto out;

	/* compare against the real vnode underneath any stacked filesystem */
	if (vp->v_type == VREG) {
		vnode_t *realvp;
		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			VN_HOLD(realvp);
			VN_RELE(vp);
			vp = realvp;
		}
	}

	for (lsp = list_head(&lofi_list); lsp != NULL;
	    lsp = list_next(&lofi_list, lsp)) {
		if (lsp->ls_vp == vp) {
			if (lspp != NULL)
				*lspp = lsp;
			if (lsp->ls_readonly) {
				rdfiles++;
				/* Skip if '-r' is specified */
				if (readonly)
					continue;
			}
			goto out;
		}
	}

	err = ENOENT;

	/*
	 * If a filename is given as an argument for lofi_unmap, we shouldn't
	 * allow unmap if there are multiple read-only lofi devices associated
	 * with this file.
	 */
	if (lspp != NULL) {
		if (rdfiles == 1)
			err = 0;
		else if (rdfiles > 1)
			err = EBUSY;
	}

out:
	if (vp != NULL)
		VN_RELE(vp);
	return (err);
}

/*
 * Find the minor for the given filename, checking the zone can access
 * it.
 */
static int
file_to_lofi(char *filename, boolean_t readonly, struct lofi_state **lspp)
{
	int err = 0;

	ASSERT(MUTEX_HELD(&lofi_lock));

	if ((err = file_to_lofi_nocheck(filename, readonly, lspp)) != 0)
		return (err);

	if ((err = lofi_access(*lspp)) != 0)
		return (err);

	return (0);
}

/*
 * Fakes up a disk geometry based on the size of the file. This is needed
 * to support newfs on traditional lofi device, but also will provide
 * geometry hint for cmlb.
 */
static void
fake_disk_geometry(struct lofi_state *lsp)
{
	u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;

	/* dk_geom - see dkio(7I) */
	/*
	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
	 * of sectors), but that breaks programs like fdisk which want to
	 * partition a disk by cylinder. With one cylinder, you can't create
	 * an fdisk partition and put pcfs on it for testing (hard to pick
	 * a number between one and one).
	 *
	 * The cheezy floppy test is an attempt to not have too few cylinders
	 * for a small file, or so many on a big file that you waste space
	 * for backup superblocks or cylinder group structures.
	 */
	bzero(&lsp->ls_dkg, sizeof (lsp->ls_dkg));
	if (dsize < (2 * 1024 * 1024)) /* floppy? */
		lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
	else
		lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
	/* in case the file is < 100k */
	if (lsp->ls_dkg.dkg_ncyl == 0)
		lsp->ls_dkg.dkg_ncyl = 1;

	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl;
	lsp->ls_dkg.dkg_nhead = 1;
	lsp->ls_dkg.dkg_rpm = 7200;

	lsp->ls_dkg.dkg_nsect = dsize /
	    (lsp->ls_dkg.dkg_ncyl << lsp->ls_pbshift);
}

/*
 * build vtoc - see dkio(7I)
 *
 * Fakes one big partition based on the size of the file. This is needed
 * because we allow newfs'ing the traditional lofi device and newfs will
 * do several disk ioctls to figure out the geometry and partition information.
 * It uses that information to determine the parameters to pass to mkfs.
 */
static void
fake_disk_vtoc(struct lofi_state *lsp, struct vtoc *vt)
{
	bzero(vt, sizeof (struct vtoc));
	vt->v_sanity = VTOC_SANE;
	vt->v_version = V_VERSION;
	(void) strncpy(vt->v_volume, LOFI_DRIVER_NAME,
	    sizeof (vt->v_volume));
	vt->v_sectorsz = 1 << lsp->ls_pbshift;
	vt->v_nparts = 1;
	vt->v_part[0].p_tag = V_UNASSIGNED;

	/*
	 * A compressed file is read-only, other files can
	 * be read-write
	 */
	if (lsp->ls_uncomp_seg_sz > 0) {
		vt->v_part[0].p_flag = V_UNMNT | V_RONLY;
	} else {
		vt->v_part[0].p_flag = V_UNMNT;
	}
	vt->v_part[0].p_start = (daddr_t)0;
	/*
	 * The partition size cannot just be the number of sectors, because
	 * that might not end on a cylinder boundary. And if that's the case,
	 * newfs/mkfs will print a scary warning. So just figure the size
	 * based on the number of cylinders and sectors/cylinder.
	 */
	vt->v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
}

/*
 * build dk_cinfo - see dkio(7I)
 */
static void
fake_disk_info(dev_t dev, struct dk_cinfo *ci)
{
	bzero(ci, sizeof (struct dk_cinfo));
	(void) strlcpy(ci->dki_cname, LOFI_DRIVER_NAME, sizeof (ci->dki_cname));
	ci->dki_ctype = DKC_SCSI_CCS;
	(void) strlcpy(ci->dki_dname, LOFI_DRIVER_NAME, sizeof (ci->dki_dname));
	ci->dki_unit = LOFI_MINOR2ID(getminor(dev));
	ci->dki_partition = LOFI_PART(getminor(dev));
	/*
	 * newfs uses this to set maxcontig. Must not be < 16, or it
	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
	 * it by the block size. Then tunefs doesn't work because
	 * maxcontig is 0.
	 */
	ci->dki_maxtransfer = 16;
}

/*
 * map in a compressed file
 *
 * Read in the header and the index that follows.
 *
 * The header is as follows -
 *
 * Signature (name of the compression algorithm)
 * Compression segment size (a multiple of 512)
 * Number of index entries
 * Size of the last block
 * The array containing the index entries
 *
 * The header information is always stored in
 * network byte order on disk.
 */
static int
lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
{
	uint32_t index_sz, header_len, i;
	ssize_t resid;
	enum uio_rw rw;
	char *tbuf = buf;
	int error;

	/* The signature has already been read */
	tbuf += sizeof (lsp->ls_comp_algorithm);
	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);

	/*
	 * The compressed segment size must be a power of 2
	 */
	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
	    !ISP2(lsp->ls_uncomp_seg_sz))
		return (EINVAL);

	/* compute log2 of the (power of 2) segment size */
	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
		;

	lsp->ls_comp_seg_shift = i;

	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);

	tbuf += sizeof (lsp->ls_comp_index_sz);
	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
	    sizeof (lsp->ls_uncomp_last_seg_sz));
	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);

	/*
	 * Compute the total size of the uncompressed data
	 * for use in fake_disk_geometry and other calculations.
	 * Disk geometry has to be faked with respect to the
	 * actual uncompressed data size rather than the
	 * compressed file size.
	 *
	 * NOTE(review): ls_comp_index_sz is read straight from the file;
	 * a value < 2 would wrap the subtraction below — confirm the
	 * index count is validated elsewhere before relying on this.
	 */
	lsp->ls_vp_size =
	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
	    + lsp->ls_uncomp_last_seg_sz;

	/*
	 * Index size is rounded up to DEV_BSIZE for ease
	 * of segmapping
	 */
	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
	header_len = sizeof (lsp->ls_comp_algorithm) +
	    sizeof (lsp->ls_uncomp_seg_sz) +
	    sizeof (lsp->ls_comp_index_sz) +
	    sizeof (lsp->ls_uncomp_last_seg_sz);
	lsp->ls_comp_offbase = header_len + index_sz;

	index_sz += header_len;
	index_sz = roundup(index_sz, DEV_BSIZE);

	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
	lsp->ls_comp_index_data_sz = index_sz;

	/*
	 * Read in the index -- this has a side-effect
	 * of reading in the header as well
	 */
	rw = UIO_READ;
	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	if (error != 0)
		return (error);

	/* Skip the header, this is where the index really begins */
	lsp->ls_comp_seg_index =
	    /*LINTED*/
	    (uint64_t *)(lsp->ls_comp_index_data + header_len);

	/*
	 * Now recompute offsets in the index to account for
	 * the header length.  On-disk entries are big-endian
	 * offsets relative to the end of the header+index area.
	 */
	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
		    BE_64(lsp->ls_comp_seg_index[i]);
	}

	return (error);
}

/*
 * Set up encryption state for a mapping, if requested via the ioctl.
 * Validates the cipher/iv mechanisms and key, then either reads an
 * existing crypto header from the image or initializes a fresh one.
 * Returns 0 on success (including the not-encrypted case) or errno.
 */
static int
lofi_init_crypto(struct lofi_state *lsp, struct lofi_ioctl *klip)
{
	struct crypto_meta chead;
	char buf[DEV_BSIZE];
	ssize_t resid;
	char *marker;
	int error;
	int ret;
	int i;

	if (!klip->li_crypto_enabled)
		return (0);

	/*
	 * All current algorithms have a max of 448 bits.
	 * NOTE(review): the check below actually admits up to 512 bits
	 * (64 bytes) — the comment and the bound disagree; presumably the
	 * bound is an intentionally loose sanity limit, confirm.
	 */
	if (klip->li_iv_len > CRYPTO_BITS2BYTES(512))
		return (EINVAL);

	/* the key must fit in the fixed-size ioctl field */
	if (CRYPTO_BITS2BYTES(klip->li_key_len) > sizeof (klip->li_key))
		return (EINVAL);

	lsp->ls_crypto_enabled = klip->li_crypto_enabled;

	mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);

	lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
	if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid cipher %s requested for %s",
		    klip->li_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* this is just initialization here */
	lsp->ls_mech.cm_param = NULL;
	lsp->ls_mech.cm_param_len = 0;

	lsp->ls_iv_type = klip->li_iv_type;
	lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
	if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
		cmn_err(CE_WARN, "invalid iv cipher %s requested"
		    " for %s", klip->li_iv_cipher, klip->li_filename);
		return (EINVAL);
	}

	/* iv mech must itself take a null iv */
	lsp->ls_iv_mech.cm_param = NULL;
	lsp->ls_iv_mech.cm_param_len = 0;
	lsp->ls_iv_len = klip->li_iv_len;

	/*
	 * Create ctx using li_cipher & the raw li_key after checking
	 * that it isn't a weak key.
	 */
	lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
	lsp->ls_key.ck_length = klip->li_key_len;
	lsp->ls_key.ck_data = kmem_alloc(
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
	bcopy(klip->li_key, lsp->ls_key.ck_data,
	    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));

	ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
	if (ret != CRYPTO_SUCCESS) {
		cmn_err(CE_WARN, "weak key check failed for cipher "
		    "%s on file %s (0x%x)", klip->li_cipher,
		    klip->li_filename, ret);
		return (EINVAL);
	}

	/* read the first sector, which holds the crypto header (if any) */
	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/*
	 * This is the case where the header in the lofi image is already
	 * initialized to indicate it is encrypted.
	 */
	if (strncmp(buf, lofi_crypto_magic, sizeof (lofi_crypto_magic)) == 0) {
		/*
		 * The encryption header information is laid out this way:
		 *	6 bytes:	hex "CFLOFI"
		 *	2 bytes:	version = 0 ... for now
		 *	96 bytes:	reserved1 (not implemented yet)
		 *	4 bytes:	data_sector = 2 ... for now
		 *	more...		not implemented yet
		 */

		marker = buf;

		/* copy the magic */
		bcopy(marker, lsp->ls_crypto.magic,
		    sizeof (lsp->ls_crypto.magic));
		marker += sizeof (lsp->ls_crypto.magic);

		/* read the encryption version number */
		bcopy(marker, &(lsp->ls_crypto.version),
		    sizeof (lsp->ls_crypto.version));
		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
		marker += sizeof (lsp->ls_crypto.version);

		/* read a chunk of reserved data */
		bcopy(marker, lsp->ls_crypto.reserved1,
		    sizeof (lsp->ls_crypto.reserved1));
		marker += sizeof (lsp->ls_crypto.reserved1);

		/* read block number where encrypted data begins */
		bcopy(marker, &(lsp->ls_crypto.data_sector),
		    sizeof (lsp->ls_crypto.data_sector));
		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
		marker += sizeof (lsp->ls_crypto.data_sector);

		/* and ignore the rest until it is implemented */

		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
		return (0);
	}

	/*
	 * We've requested encryption, but no magic was found, so it must be
	 * a new image.
	 */

	/* refuse images whose header area already contains data */
	for (i = 0; i < sizeof (struct crypto_meta); i++) {
		if (buf[i] != '\0')
			return (EINVAL);
	}

	marker = buf;
	bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
	marker += sizeof (lofi_crypto_magic);
	chead.version = htons(LOFI_CRYPTO_VERSION);
	bcopy(&(chead.version), marker, sizeof (chead.version));
	marker += sizeof (chead.version);
	marker += sizeof (chead.reserved1);
	chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
	bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));

	/* write the header */
	error = vn_rdwr(UIO_WRITE, lsp->ls_vp, buf, DEV_BSIZE,
	    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
	if (error != 0)
		return (error);

	/* fix things up so it looks like we read this info */
	bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
	    sizeof (lofi_crypto_magic));
	lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
	lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
	lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
	return (0);
}

/*
 * Check to see if the passed in signature is a valid one.  If it is
 * valid, return the index into lofi_compress_table.
2562 * 2563 * Return -1 if it is invalid 2564 */ 2565 static int 2566 lofi_compress_select(const char *signature) 2567 { 2568 int i; 2569 2570 for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) { 2571 if (strcmp(lofi_compress_table[i].l_name, signature) == 0) 2572 return (i); 2573 } 2574 2575 return (-1); 2576 } 2577 2578 static int 2579 lofi_init_compress(struct lofi_state *lsp) 2580 { 2581 char buf[DEV_BSIZE]; 2582 int compress_index; 2583 ssize_t resid; 2584 int error; 2585 2586 error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE, 2587 0, RLIM64_INFINITY, kcred, &resid); 2588 2589 if (error != 0) 2590 return (error); 2591 2592 if ((compress_index = lofi_compress_select(buf)) == -1) 2593 return (0); 2594 2595 /* compression and encryption are mutually exclusive */ 2596 if (lsp->ls_crypto_enabled) 2597 return (ENOTSUP); 2598 2599 /* initialize compression info for compressed lofi */ 2600 lsp->ls_comp_algorithm_index = compress_index; 2601 (void) strlcpy(lsp->ls_comp_algorithm, 2602 lofi_compress_table[compress_index].l_name, 2603 sizeof (lsp->ls_comp_algorithm)); 2604 2605 /* Finally setup per-thread pre-allocated buffers */ 2606 lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads * 2607 sizeof (struct compbuf), KM_SLEEP); 2608 2609 return (lofi_map_compressed_file(lsp, buf)); 2610 } 2611 2612 /* 2613 * Allocate new or proposed id from lofi_id. 2614 * 2615 * Special cases for proposed id: 2616 * 0: not allowed, 0 is id for control device. 2617 * -1: allocate first usable id from lofi_id. 2618 * any other value is proposed value from userland 2619 * 2620 * returns DDI_SUCCESS or errno. 
 */
static int
lofi_alloc_id(int *idp)
{
	int id, error = DDI_SUCCESS;

	if (*idp == -1) {
		/* caller wants any id: first-fit from the id space */
		id = id_allocff_nosleep(lofi_id);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	} else if (*idp == 0) {
		/* id 0 is reserved for the lofi control device */
		error = EINVAL;
		goto err;
	} else if (*idp > ((1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)) - 1)) {
		/* proposed id does not fit in the instance part of a minor */
		error = ERANGE;
		goto err;
	} else {
		/* proposed id: fail if it is already in use */
		if (ddi_get_soft_state(lofi_statep, *idp) != NULL) {
			error = EEXIST;
			goto err;
		}

		id = id_alloc_specific_nosleep(lofi_id, *idp);
		if (id == -1) {
			error = EAGAIN;
			goto err;
		}
	}
	*idp = id;
err:
	return (error);
}

/*
 * Create the devinfo node for the mapping described by klip, allocating
 * the instance id (possibly the caller-proposed klip->li_id) and, for
 * labeled mappings, adding the "labeled" property so cmlb attaches.
 *
 * NOTE(review): some failure paths return DDI_PROP_*/NDI_* codes rather
 * than errnos (preserved historical behavior); callers only test != 0.
 */
static int
lofi_create_dev(struct lofi_ioctl *klip)
{
	dev_info_t *parent, *child;
	struct lofi_state *lsp = NULL;
	char namebuf[MAXNAMELEN];
	int error, circ;

	/* get control device */
	lsp = ddi_get_soft_state(lofi_statep, 0);
	parent = ddi_get_parent(lsp->ls_dip);

	if ((error = lofi_alloc_id((int *)&klip->li_id)))
		return (error);

	(void) snprintf(namebuf, sizeof (namebuf), LOFI_DRIVER_NAME "@%d",
	    klip->li_id);

	ndi_devi_enter(parent, &circ);
	child = ndi_devi_findchild(parent, namebuf);
	ndi_devi_exit(parent, circ);

	if (child == NULL) {
		child = ddi_add_child(parent, LOFI_DRIVER_NAME,
		    (pnode_t)DEVI_SID_NODEID, klip->li_id);
		if ((error = ddi_prop_update_int(DDI_DEV_T_NONE, child,
		    "instance", klip->li_id)) != DDI_PROP_SUCCESS)
			goto err;

		if (klip->li_labeled == B_TRUE) {
			if ((error = ddi_prop_create(DDI_DEV_T_NONE, child,
			    DDI_PROP_CANSLEEP, "labeled", 0, 0))
			    != DDI_PROP_SUCCESS)
				goto err;
		}

		if ((error = ndi_devi_online(child, NDI_ONLINE_ATTACH))
		    != NDI_SUCCESS)
			goto err;
	} else {
		/* node already exists: release the id we just allocated */
		id_free(lofi_id, klip->li_id);
		error = EEXIST;
		return (error);
	}

	goto done;

err:
	/* tear down the partially constructed node and free the id */
	ddi_prop_remove_all(child);
	(void) ndi_devi_offline(child, NDI_DEVI_REMOVE);
	id_free(lofi_id, klip->li_id);
done:

	return (error);
}

/*
 * Fill in a fake SCSI INQUIRY response for a mapping: vendor "lofi",
 * product is the last component of the backing file's path.
 */
static void
lofi_create_inquiry(struct lofi_state *lsp, struct scsi_inquiry *inq)
{
	char *p = NULL;

	(void) strlcpy(inq->inq_vid, LOFI_DRIVER_NAME, sizeof (inq->inq_vid));

	mutex_enter(&lsp->ls_vp_lock);
	if (lsp->ls_vp != NULL)
		p = strrchr(lsp->ls_vp->v_path, '/');
	if (p != NULL)
		/*
		 * inq_pid is a fixed-width SCSI field that need not be
		 * NUL-terminated, so strncpy's semantics are intended here.
		 */
		(void) strncpy(inq->inq_pid, p + 1, sizeof (inq->inq_pid));
	mutex_exit(&lsp->ls_vp_lock);
	(void) strlcpy(inq->inq_revision, "1.0", sizeof (inq->inq_revision));
}

/*
 * copy devlink name from event cache
 */
static void
lofi_copy_devpath(struct lofi_ioctl *klip)
{
	int error;
	char namebuf[MAXNAMELEN], *str;
	clock_t ticks;
	nvlist_t *nvl = NULL;

	if (klip->li_labeled == B_TRUE)
		klip->li_devpath[0] = '\0';
	else {
		/* no need to wait for messages */
		(void) snprintf(klip->li_devpath, sizeof (klip->li_devpath),
		    "/dev/" LOFI_CHAR_NAME "/%d", klip->li_id);
		return;
	}

	(void) snprintf(namebuf, sizeof (namebuf), "%d", klip->li_id);
	/* bounded wait for devfsadm to publish the devlink */
	ticks = ddi_get_lbolt() + LOFI_TIMEOUT * drv_usectohz(1000000);

	mutex_enter(&lofi_devlink_cache.ln_lock);
	error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data, namebuf, &nvl);
	while (error != 0) {
		/* cv_timedwait() returns -1 on timeout; give up then */
		error = cv_timedwait(&lofi_devlink_cache.ln_cv,
		    &lofi_devlink_cache.ln_lock, ticks);
		if (error == -1)
			break;
		error = nvlist_lookup_nvlist(lofi_devlink_cache.ln_data,
		    namebuf, &nvl);
	}

	/* on timeout li_devpath is simply left empty */
	if (nvl != NULL) {
		if (nvlist_lookup_string(nvl, DEV_NAME, &str) == 0) {
			(void) strlcpy(klip->li_devpath, str,
			    sizeof (klip->li_devpath));
		}
	}
	mutex_exit(&lofi_devlink_cache.ln_lock);
}

/*
 * map a file to a minor number.
 * Return the minor number.
 */
static int
lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
    int *rvalp, struct cred *credp, int ioctl_flag)
{
	int id = -1;
	struct lofi_state *lsp = NULL;
	struct lofi_ioctl *klip;
	int error;
	struct vnode *vp = NULL;
	vattr_t vattr;
	int flag;
	char namebuf[MAXNAMELEN];

	error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (error != 0)
		return (error);

	mutex_enter(&lofi_lock);

	/* refuse to map the same file twice */
	if (file_to_lofi_nocheck(klip->li_filename, klip->li_readonly,
	    NULL) == 0) {
		error = EBUSY;
		goto err;
	}

	/* open read-write first; fall back to read-only */
	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
	if (error) {
		/* try read-only */
		flag &= ~FWRITE;
		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
		    &vp, 0, 0);
		if (error)
			goto err;
	}

	if (!V_ISLOFIABLE(vp->v_type)) {
		error = EINVAL;
		goto err;
	}

	vattr.va_mask = AT_SIZE;
	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
	if (error)
		goto err;

	/* the file needs to be a multiple of the block size */
	if ((vattr.va_size % DEV_BSIZE) != 0) {
		error = EINVAL;
		goto err;
	}

	if (pickminor) {
		klip->li_id = (uint32_t)-1;
	}
	if ((error = lofi_create_dev(klip)) != 0)
		goto err;

	id = klip->li_id;
	lsp = ddi_get_soft_state(lofi_statep, id);
	/*
	 * NOTE(review): if this fails after lofi_create_dev() succeeded,
	 * we go to err with lsp == NULL and error == 0, and the device
	 * node/id are not torn down here — confirm this path is benign.
	 */
	if (lsp == NULL)
		goto err;

	/*
	 * from this point lofi_destroy() is used to clean up on error
	 * make sure the basic data is set
	 */
	lsp->ls_dev = makedevice(getmajor(dev), LOFI_ID2MINOR(id));

	list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
	    offsetof(struct lofi_comp_cache, lc_list));

	/*
	 * save open mode so file can be closed properly and vnode counts
	 * updated correctly.
	 */
	lsp->ls_openflag = flag;

	lsp->ls_vp = vp;
	lsp->ls_stacked_vp = vp;

	lsp->ls_vp_size = vattr.va_size;
	lsp->ls_vp_comp_size = lsp->ls_vp_size;

	/*
	 * Try to handle stacked lofs vnodes.
	 */
	if (vp->v_type == VREG) {
		vnode_t *realvp;

		if (VOP_REALVP(vp, &realvp, NULL) == 0) {
			/*
			 * We need to use the realvp for uniqueness
			 * checking, but keep the stacked vp for
			 * LOFI_GET_FILENAME display.
			 */
			VN_HOLD(realvp);
			lsp->ls_vp = realvp;
		}
	}

	/* logical and physical block shifts both start at DEV_BSIZE */
	lsp->ls_lbshift = highbit(DEV_BSIZE) - 1;
	lsp->ls_pbshift = lsp->ls_lbshift;

	lsp->ls_readonly = klip->li_readonly;
	lsp->ls_uncomp_seg_sz = 0;
	lsp->ls_comp_algorithm[0] = '\0';
	lsp->ls_crypto_offset = 0;

	/* per-mapping task queue for dispatched I/O */
	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
	    LOFI_DRIVER_NAME, id);
	lsp->ls_taskq = taskq_create_proc(namebuf, lofi_taskq_nthreads,
	    minclsyspri, 1, lofi_taskq_maxalloc, curzone->zone_zsched, 0);

	if ((error = lofi_init_crypto(lsp, klip)) != 0)
		goto err;

	if ((error = lofi_init_compress(lsp)) != 0)
		goto err;

	fake_disk_geometry(lsp);

	/* For unlabeled lofi add Nblocks and Size */
	if (klip->li_labeled == B_FALSE) {
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    SIZE_PROP_NAME, lsp->ls_vp_size - lsp->ls_crypto_offset);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
		error = ddi_prop_update_int64(lsp->ls_dev, lsp->ls_dip,
		    NBLOCKS_PROP_NAME,
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE);
		if (error != DDI_PROP_SUCCESS) {
			error = EINVAL;
			goto err;
		}
	}

	list_insert_tail(&lofi_list, lsp);
	/*
	 * Notify we are ready to rock.
	 */
	mutex_enter(&lsp->ls_vp_lock);
	lsp->ls_vp_ready = B_TRUE;
	cv_broadcast(&lsp->ls_vp_cv);
	mutex_exit(&lsp->ls_vp_lock);
	mutex_exit(&lofi_lock);

	lofi_copy_devpath(klip);

	if (rvalp)
		*rvalp = id;
	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (0);

err:
	if (lsp != NULL) {
		/* lsp owns vp once set above; lofi_destroy releases both */
		lofi_destroy(lsp, credp);
	} else {
		if (vp != NULL) {
			(void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL);
			(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
			VN_RELE(vp);
		}
	}

	mutex_exit(&lofi_lock);
	free_lofi_ioctl(klip);
	return (error);
}

/*
 * unmap a file.
 */
static int
lofi_unmap_file(struct lofi_ioctl *ulip, int byfilename,
    struct cred *credp, int ioctl_flag)
{
	struct lofi_state *lsp;
	struct lofi_ioctl *klip;
	int err;

	err = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag);
	if (err != 0)
		return (err);

	mutex_enter(&lofi_lock);
	if (byfilename) {
		if ((err = file_to_lofi(klip->li_filename, klip->li_readonly,
		    &lsp)) != 0) {
			goto done;
		}
	} else if (klip->li_id == 0) {
		/* id 0 is the control device, never a mapping */
		err = ENXIO;
		goto done;
	} else {
		lsp = ddi_get_soft_state(lofi_statep, klip->li_id);
	}

	if (lsp == NULL || lsp->ls_vp == NULL || lofi_access(lsp) != 0) {
		err = ENXIO;
		goto done;
	}

	klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev));

	/*
	 * If it's still held open, we'll do one of three things:
	 *
	 * If no flag is set, just return EBUSY.
	 *
	 * If the 'cleanup' flag is set, unmap and remove the device when
	 * the last user finishes.
	 *
	 * If the 'force' flag is set, then we forcibly close the underlying
	 * file.  Subsequent operations will fail, and the DKIOCSTATE ioctl
	 * will return DKIO_DEV_GONE.  When the device is last closed, the
	 * device will be cleaned up appropriately.
	 *
	 * This is complicated by the fact that we may have outstanding
	 * dispatched I/Os.  Rather than having a single mutex to serialize all
	 * I/O, we keep a count of the number of outstanding I/O requests
	 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
	 * should be dispatched (ls_vp_closereq).
	 *
	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
	 * and then close the underlying vnode.
	 */
	if (is_opened(lsp)) {
		if (klip->li_force) {
			/* Mark the device for cleanup. */
			lofi_set_cleanup(lsp);
			mutex_enter(&lsp->ls_vp_lock);
			lsp->ls_vp_closereq = B_TRUE;
			/* Wake up any threads waiting on dkiocstate. */
			cv_broadcast(&lsp->ls_vp_cv);
			while (lsp->ls_vp_iocount > 0)
				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
			mutex_exit(&lsp->ls_vp_lock);
		} else if (klip->li_cleanup) {
			lofi_set_cleanup(lsp);
		} else {
			err = EBUSY;
		}
	} else {
		/* not open: tear it down right away */
		lofi_free_dev(lsp);
		lofi_destroy(lsp, credp);
	}

done:
	mutex_exit(&lofi_lock);
	if (err == 0)
		(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
	free_lofi_ioctl(klip);
	return (err);
}

/*
 * get the filename given the minor number, or the minor number given
 * the name.
3036 */ 3037 /*ARGSUSED*/ 3038 static int 3039 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which, 3040 struct cred *credp, int ioctl_flag) 3041 { 3042 struct lofi_ioctl *klip; 3043 struct lofi_state *lsp; 3044 int error; 3045 3046 error = copy_in_lofi_ioctl(ulip, &klip, ioctl_flag); 3047 if (error != 0) 3048 return (error); 3049 3050 switch (which) { 3051 case LOFI_GET_FILENAME: 3052 if (klip->li_id == 0) { 3053 free_lofi_ioctl(klip); 3054 return (EINVAL); 3055 } 3056 3057 mutex_enter(&lofi_lock); 3058 lsp = ddi_get_soft_state(lofi_statep, klip->li_id); 3059 if (lsp == NULL || lofi_access(lsp) != 0) { 3060 mutex_exit(&lofi_lock); 3061 free_lofi_ioctl(klip); 3062 return (ENXIO); 3063 } 3064 3065 /* 3066 * This may fail if, for example, we're trying to look 3067 * up a zoned NFS path from the global zone. 3068 */ 3069 if (vnodetopath(NULL, lsp->ls_stacked_vp, klip->li_filename, 3070 sizeof (klip->li_filename), CRED()) != 0) { 3071 (void) strlcpy(klip->li_filename, "?", 3072 sizeof (klip->li_filename)); 3073 } 3074 3075 klip->li_readonly = lsp->ls_readonly; 3076 klip->li_labeled = lsp->ls_cmlbhandle != NULL; 3077 3078 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 3079 sizeof (klip->li_algorithm)); 3080 klip->li_crypto_enabled = lsp->ls_crypto_enabled; 3081 mutex_exit(&lofi_lock); 3082 3083 lofi_copy_devpath(klip); 3084 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3085 free_lofi_ioctl(klip); 3086 return (error); 3087 case LOFI_GET_MINOR: 3088 mutex_enter(&lofi_lock); 3089 error = file_to_lofi(klip->li_filename, 3090 klip->li_readonly, &lsp); 3091 if (error != 0) { 3092 mutex_exit(&lofi_lock); 3093 free_lofi_ioctl(klip); 3094 return (error); 3095 } 3096 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3097 3098 klip->li_readonly = lsp->ls_readonly; 3099 klip->li_labeled = lsp->ls_cmlbhandle != NULL; 3100 mutex_exit(&lofi_lock); 3101 3102 lofi_copy_devpath(klip); 3103 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3104 3105 
free_lofi_ioctl(klip); 3106 return (error); 3107 case LOFI_CHECK_COMPRESSED: 3108 mutex_enter(&lofi_lock); 3109 error = file_to_lofi(klip->li_filename, 3110 klip->li_readonly, &lsp); 3111 if (error != 0) { 3112 mutex_exit(&lofi_lock); 3113 free_lofi_ioctl(klip); 3114 return (error); 3115 } 3116 3117 klip->li_id = LOFI_MINOR2ID(getminor(lsp->ls_dev)); 3118 (void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm, 3119 sizeof (klip->li_algorithm)); 3120 3121 mutex_exit(&lofi_lock); 3122 error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag); 3123 free_lofi_ioctl(klip); 3124 return (error); 3125 default: 3126 free_lofi_ioctl(klip); 3127 return (EINVAL); 3128 } 3129 } 3130 3131 static int 3132 uscsi_is_inquiry(intptr_t arg, int flag, union scsi_cdb *cdb, 3133 struct uscsi_cmd *uscmd) 3134 { 3135 int rval; 3136 3137 #ifdef _MULTI_DATAMODEL 3138 switch (ddi_model_convert_from(flag & FMODELS)) { 3139 case DDI_MODEL_ILP32: { 3140 struct uscsi_cmd32 ucmd32; 3141 3142 if (ddi_copyin((void *)arg, &ucmd32, sizeof (ucmd32), flag)) { 3143 rval = EFAULT; 3144 goto err; 3145 } 3146 uscsi_cmd32touscsi_cmd((&ucmd32), uscmd); 3147 break; 3148 } 3149 case DDI_MODEL_NONE: 3150 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) { 3151 rval = EFAULT; 3152 goto err; 3153 } 3154 break; 3155 default: 3156 rval = EFAULT; 3157 goto err; 3158 } 3159 #else 3160 if (ddi_copyin((void *)arg, uscmd, sizeof (*uscmd), flag)) { 3161 rval = EFAULT; 3162 goto err; 3163 } 3164 #endif /* _MULTI_DATAMODEL */ 3165 if (ddi_copyin(uscmd->uscsi_cdb, cdb, uscmd->uscsi_cdblen, flag)) { 3166 rval = EFAULT; 3167 goto err; 3168 } 3169 if (cdb->scc_cmd == SCMD_INQUIRY) { 3170 return (0); 3171 } 3172 err: 3173 return (rval); 3174 } 3175 3176 static int 3177 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp, 3178 int *rvalp) 3179 { 3180 int error; 3181 enum dkio_state dkstate; 3182 struct lofi_state *lsp; 3183 int id; 3184 3185 id = LOFI_MINOR2ID(getminor(dev)); 3186 3187 /* lofi ioctls only 
apply to the master device */ 3188 if (id == 0) { 3189 struct lofi_ioctl *lip = (struct lofi_ioctl *)arg; 3190 3191 /* 3192 * the query command only need read-access - i.e., normal 3193 * users are allowed to do those on the ctl device as 3194 * long as they can open it read-only. 3195 */ 3196 switch (cmd) { 3197 case LOFI_MAP_FILE: 3198 if ((flag & FWRITE) == 0) 3199 return (EPERM); 3200 return (lofi_map_file(dev, lip, 1, rvalp, credp, flag)); 3201 case LOFI_MAP_FILE_MINOR: 3202 if ((flag & FWRITE) == 0) 3203 return (EPERM); 3204 return (lofi_map_file(dev, lip, 0, rvalp, credp, flag)); 3205 case LOFI_UNMAP_FILE: 3206 if ((flag & FWRITE) == 0) 3207 return (EPERM); 3208 return (lofi_unmap_file(lip, 1, credp, flag)); 3209 case LOFI_UNMAP_FILE_MINOR: 3210 if ((flag & FWRITE) == 0) 3211 return (EPERM); 3212 return (lofi_unmap_file(lip, 0, credp, flag)); 3213 case LOFI_GET_FILENAME: 3214 return (lofi_get_info(dev, lip, LOFI_GET_FILENAME, 3215 credp, flag)); 3216 case LOFI_GET_MINOR: 3217 return (lofi_get_info(dev, lip, LOFI_GET_MINOR, 3218 credp, flag)); 3219 3220 /* 3221 * This API made limited sense when this value was fixed 3222 * at LOFI_MAX_FILES. However, its use to iterate 3223 * across all possible devices in lofiadm means we don't 3224 * want to return L_MAXMIN, but the highest 3225 * *allocated* id. 
 */
	case LOFI_GET_MAXMINOR:
		/*
		 * Historical name: reports the highest lofi *instance*
		 * number among mappings the caller may access (instance
		 * and minor number spaces were split for labeled lofi;
		 * see the block comment at the top of the file).
		 * 'id' and 'lip' are declared earlier in this function.
		 */
		id = 0;

		mutex_enter(&lofi_lock);

		for (lsp = list_head(&lofi_list); lsp != NULL;
		    lsp = list_next(&lofi_list, lsp)) {
			int i;

			/* Skip mappings the caller is not allowed to see. */
			if (lofi_access(lsp) != 0)
				continue;

			i = ddi_get_instance(lsp->ls_dip);
			if (i > id)
				id = i;
		}

		mutex_exit(&lofi_lock);

		error = ddi_copyout(&id, &lip->li_id,
		    sizeof (id), flag);
		if (error)
			return (EFAULT);
		return (0);

	case LOFI_CHECK_COMPRESSED:
		return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
		    credp, flag));
	default:
		return (EINVAL);
	}
}

	/*
	 * Not the control device: look up the mapping's soft state by
	 * instance id and fail if it is gone or being torn down.
	 */
	mutex_enter(&lofi_lock);
	lsp = ddi_get_soft_state(lofi_statep, id);
	if (lsp == NULL || lsp->ls_cleanup) {
		mutex_exit(&lofi_lock);
		return (ENXIO);
	}
	mutex_exit(&lofi_lock);

	/*
	 * Labeled (virtual disk) mappings route partition-related ioctls
	 * to cmlb first; anything cmlb does not recognize (ENOTTY) falls
	 * through to the emulation below.
	 */
	if (ddi_prop_exists(DDI_DEV_T_ANY, lsp->ls_dip, DDI_PROP_DONTPASS,
	    "labeled") == 1) {
		error = cmlb_ioctl(lsp->ls_cmlbhandle, dev, cmd, arg, flag,
		    credp, rvalp, 0);
		if (error != ENOTTY)
			return (error);
	}

	/*
	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
	 * EIO as if the device was no longer present.
	 */
	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
		return (EIO);

	/* these are for faking out utilities like newfs */
	switch (cmd) {
	case DKIOCGMEDIAINFO:
	case DKIOCGMEDIAINFOEXT: {
		/*
		 * media_info is the extended structure for both commands;
		 * for plain DKIOCGMEDIAINFO only the first
		 * sizeof (struct dk_minfo) bytes are copied out, which
		 * relies on dk_minfo being a layout prefix of
		 * dk_minfo_ext (NOTE(review): prefix layout assumed from
		 * usage here — confirm against <sys/dkio.h>).
		 */
		struct dk_minfo_ext media_info;
		int shift = lsp->ls_lbshift;
		int size;

		if (cmd == DKIOCGMEDIAINFOEXT) {
			/* Physical block size only exists in the ext form. */
			media_info.dki_pbsize = 1U << lsp->ls_pbshift;
			size = sizeof (struct dk_minfo_ext);
		} else {
			size = sizeof (struct dk_minfo);
		}

		media_info.dki_media_type = DK_FIXED_DISK;
		media_info.dki_lbsize = 1U << shift;
		/* Capacity in logical blocks, excluding the crypto header. */
		media_info.dki_capacity =
		    (lsp->ls_vp_size - lsp->ls_crypto_offset) >> shift;

		if (ddi_copyout(&media_info, (void *)arg, size, flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCREMOVABLE: {
		/* lofi devices are never removable media. */
		int i = 0;
		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), flag))
			return (EFAULT);
		return (0);
	}

	case DKIOCGVTOC: {
		/* Fabricate a VTOC for the faked disk geometry. */
		struct vtoc vt;
		fake_disk_vtoc(lsp, &vt);

		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			/* 32-bit caller: convert to the ILP32 layout. */
			struct vtoc32 vtoc32;

			vtoctovtoc32(vt, vtoc32);
			if (ddi_copyout(&vtoc32, (void *)arg,
			    sizeof (struct vtoc32), flag))
				return (EFAULT);
			break;
		}

		case DDI_MODEL_NONE:
			if (ddi_copyout(&vt, (void *)arg,
			    sizeof (struct vtoc), flag))
				return (EFAULT);
			break;
		}
		return (0);
	}
	case DKIOCINFO: {
		/* Fabricated controller info for utilities like format(1M). */
		struct dk_cinfo ci;
		fake_disk_info(dev, &ci);
		if (ddi_copyout(&ci, (void *)arg, sizeof (ci), flag))
			return (EFAULT);
		return (0);
	}
	case DKIOCG_VIRTGEOM:
	case DKIOCG_PHYGEOM:
	case DKIOCGGEOM:
		/* All three geometry flavors return the same faked geometry. */
		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
		    sizeof (struct dk_geom), flag);
		if (error)
			return (EFAULT);
		return (0);
	case DKIOCSTATE:
		/*
		 * Normally, lofi devices are always in the INSERTED state.  If
		 * a device is forcefully unmapped, then the device transitions
		 * to the DKIO_DEV_GONE state.
		 */
		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
		    flag) != 0)
			return (EFAULT);

		/*
		 * Block until the state differs from the one the caller
		 * passed in (the standard DKIOCSTATE "wait for change"
		 * contract), or until teardown begins.
		 */
		mutex_enter(&lsp->ls_vp_lock);
		while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
		    !lsp->ls_cleanup) {
			/*
			 * By virtue of having the device open, we know that
			 * 'lsp' will remain valid when we return.
			 */
			if (!cv_wait_sig(&lsp->ls_vp_cv, &lsp->ls_vp_lock)) {
				/* Interrupted by a signal. */
				mutex_exit(&lsp->ls_vp_lock);
				return (EINTR);
			}
		}

		dkstate = (!lsp->ls_cleanup && lsp->ls_vp != NULL ?
		    DKIO_INSERTED : DKIO_DEV_GONE);
		mutex_exit(&lsp->ls_vp_lock);

		if (ddi_copyout(&dkstate, (void *)arg,
		    sizeof (dkstate), flag) != 0)
			return (EFAULT);
		return (0);
	case USCSICMD: {
		/*
		 * Minimal USCSI emulation: answer INQUIRY and READ CAPACITY
		 * with fabricated data; everything else gets a request-sense
		 * status so callers see a failed command, not an error.
		 */
		struct uscsi_cmd uscmd;
		union scsi_cdb cdb;

		if (uscsi_is_inquiry(arg, flag, &cdb, &uscmd) == 0) {
			struct scsi_inquiry inq = {0};

			lofi_create_inquiry(lsp, &inq);
			if (ddi_copyout(&inq, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		} else if (cdb.scc_cmd == SCMD_READ_CAPACITY) {
			struct scsi_capacity capacity;

			/* Capacity/lbasize are big-endian per SCSI. */
			capacity.capacity =
			    BE_32((lsp->ls_vp_size - lsp->ls_crypto_offset) >>
			    lsp->ls_lbshift);
			capacity.lbasize = BE_32(1 << lsp->ls_lbshift);
			if (ddi_copyout(&capacity, uscmd.uscsi_bufaddr,
			    uscmd.uscsi_buflen, flag) != 0)
				return (EFAULT);
			return (0);
		}

		/* Unsupported command: flag an invalid request-sense status. */
		uscmd.uscsi_rqstatus = 0xff;
#ifdef _MULTI_DATAMODEL
		switch (ddi_model_convert_from(flag & FMODELS)) {
		case DDI_MODEL_ILP32: {
			struct uscsi_cmd32 ucmd32;
			uscsi_cmdtouscsi_cmd32((&uscmd), (&ucmd32));
			if (ddi_copyout(&ucmd32, (void *)arg, sizeof (ucmd32),
			    flag) != 0)
				return (EFAULT);
			break;
		}
		case DDI_MODEL_NONE:
			if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd),
			    flag) != 0)
				return (EFAULT);
			break;
		default:
			return (EFAULT);
		}
#else
		if (ddi_copyout(&uscmd, (void *)arg, sizeof (uscmd), flag) != 0)
			return (EFAULT);
#endif /* _MULTI_DATAMODEL */
		return (0);
	}
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "lofi_ioctl: %d is not implemented\n", cmd);
#endif /* DEBUG */
		return (ENOTTY);
	}
}

/*
 * prop_op(9E) entry point: resolve device properties.  For labeled lofi
 * mappings, cmlb owns partition-related properties (size, nblocks, ...),
 * so cmlb_prop_op() gets first crack; anything it does not satisfy falls
 * back to the default ddi_prop_op() lookup.
 */
static int
lofi_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
    char *name, caddr_t valuep, int *lengthp)
{
	struct lofi_state *lsp;
	int rc;

	lsp = ddi_get_soft_state(lofi_statep, ddi_get_instance(dip));
	if (lsp == NULL) {
		/* No mapping (e.g. the control node): default behavior. */
		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
		    name, valuep, lengthp));
	}

	rc = cmlb_prop_op(lsp->ls_cmlbhandle, dev, dip, prop_op, mod_flags,
	    name, valuep, lengthp, LOFI_PART(getminor(dev)), NULL);
	if (rc == DDI_PROP_SUCCESS)
		return (rc);

	return (ddi_prop_op(DDI_DEV_T_ANY, dip, prop_op, mod_flags,
	    name, valuep, lengthp));
}

/* Character/block device entry points. */
static struct cb_ops lofi_cb_ops = {
	lofi_open,		/* open */
	lofi_close,		/* close */
	lofi_strategy,		/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	lofi_read,		/* read */
	lofi_write,		/* write */
	lofi_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	lofi_prop_op,		/* prop_op */
	0,			/* streamtab */
	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,
	lofi_aread,
	lofi_awrite
};

/* Driver operations vector. */
static struct dev_ops lofi_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	lofi_info,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	lofi_attach,		/* attach */
	lofi_detach,		/* detach */
	nodev,			/* reset */
	&lofi_cb_ops,		/* driver operations */
	NULL,			/* no bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"loopback file driver",
	&lofi_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

/*
 * Module load entry point.  Sets up the global mapping list, soft-state
 * anchor, instance id space, and global lock before installing the module;
 * on any failure the already-initialized pieces are torn down in reverse
 * order so a failed load leaves no residue.
 */
int
_init(void)
{
	int error;

	list_create(&lofi_list, sizeof (struct lofi_state),
	    offsetof(struct lofi_state, ls_list));

	error = ddi_soft_state_init((void **)&lofi_statep,
	    sizeof (struct lofi_state), 0);
	if (error) {
		list_destroy(&lofi_list);
		return (error);
	}

	/*
	 * The minor number is stored as id << LOFI_CMLB_SHIFT as
	 * we need to reserve space for cmlb minor numbers.
	 * This will leave out 4096 id values on 32bit kernel, which should
	 * still suffice.
	 */
	lofi_id = id_space_create("lofi_id", 1,
	    (1 << (L_BITSMINOR - LOFI_CMLB_SHIFT)));

	if (lofi_id == NULL) {
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
		return (DDI_FAILURE);
	}

	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);

	error = mod_install(&modlinkage);

	if (error) {
		/* Unwind everything initialized above, in reverse order. */
		id_space_destroy(lofi_id);
		mutex_destroy(&lofi_lock);
		ddi_soft_state_fini((void **)&lofi_statep);
		list_destroy(&lofi_list);
	}

	return (error);
}

/*
 * Module unload entry point.  Refuses to unload (EBUSY) while any file is
 * still mapped; global resources are released only after mod_remove()
 * succeeds, since a failed remove means the driver stays loaded.
 */
int
_fini(void)
{
	int error;

	mutex_enter(&lofi_lock);

	if (!list_is_empty(&lofi_list)) {
		/* Active mappings exist; cannot unload. */
		mutex_exit(&lofi_lock);
		return (EBUSY);
	}

	mutex_exit(&lofi_lock);

	error = mod_remove(&modlinkage);
	if (error)
		return (error);

	mutex_destroy(&lofi_lock);
	id_space_destroy(lofi_id);
	ddi_soft_state_fini((void **)&lofi_statep);
	list_destroy(&lofi_list);

	return (error);
}

/*
 * Module information entry point: report via the standard modlinkage.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}