xref: /titanic_50/usr/src/uts/common/io/lofi.c (revision f441771b0ce9f9d6122d318ff8290cb1a2848f9d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * lofi (loopback file) driver - allows you to attach a file to a device,
27  * which can then be accessed through that device. The simple model is that
28  * you tell lofi to open a file, and then use the block device you get as
29  * you would any block device. lofi translates access to the block device
30  * into I/O on the underlying file. This is mostly useful for
31  * mounting images of filesystems.
32  *
33  * lofi is controlled through /dev/lofictl - this is the only device exported
34  * during attach, and is minor number 0. lofiadm communicates with lofi through
35  * ioctls on this device. When a file is attached to lofi, block and character
36  * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices
37  * are identified by their minor number, and the minor number is also used
38  * as the name in /dev/lofi. If we ever decide to support virtual disks,
39  * we'll have to divide the minor number space to identify fdisk partitions
40  * and slices, and the name will then be the minor number shifted down a
41  * few bits. Minor devices are tracked with state structures handled with
42  * ddi_soft_state(9F) for simplicity.
43  *
44  * A file attached to lofi is opened when attached and not closed until
45  * explicitly detached from lofi. This seems more sensible than deferring
46  * the open until the /dev/lofi device is opened, for a number of reasons.
47  * One is that any failure is likely to be noticed by the person (or script)
48  * running lofiadm. Another is that it would be a security problem if the
49  * file was replaced by another one after being added but before being opened.
50  *
51  * The only hard part about lofi is the ioctls. In order to support things
52  * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
53  * So it has to fake disk geometry and partition information. More may need
54  * to be faked if your favorite utility doesn't work and you think it should
55  * (fdformat doesn't work because it really wants to know the type of floppy
56  * controller to talk to, and that didn't seem easy to fake. Or possibly even
57  * necessary, since we have mkfs_pcfs now).
58  *
59  * Normally, a lofi device cannot be detached if it is open (i.e. busy).  To
60  * support simulation of hotplug events, an optional force flag is provided.
61  * If a lofi device is open when a force detach is requested, then the
62  * underlying file is closed and any subsequent operations return EIO.  When the
63  * device is closed for the last time, it will be cleaned up at that time.  In
64  * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
65  * detached but not removed.
66  *
67  * Known problems:
68  *
69  *	UFS logging. Mounting a UFS filesystem image "logging"
70  *	works for basic copy testing but wedges during a build of ON through
71  *	that image. Some deadlock in lufs holding the log mutex and then
72  *	getting stuck on a buf. So for now, don't do that.
73  *
74  *	Direct I/O. Since the filesystem data is being cached in the buffer
75  *	cache, _and_ again in the underlying filesystem, it's tempting to
76  *	enable direct I/O on the underlying file. Don't, because that deadlocks.
77  *	I think to fix the cache-twice problem we might need filesystem support.
78  *
79  *	lofi on itself. The simple lock strategy (lofi_lock) precludes this
80  *	because you'll be in lofi_ioctl, holding the lock when you open the
81  *	file, which, if it's lofi, will grab lofi_lock. We prevent this for
82  *	now, though not using ddi_soft_state(9F) would make it possible to
83  *	do. Though it would still be silly.
84  *
85  * Interesting things to do:
86  *
87  *	Allow multiple files for each device. A poor-man's metadisk, basically.
88  *
89  *	Pass-through ioctls on block devices. You can (though it's not
90  *	documented), give lofi a block device as a file name. Then we shouldn't
91  *	need to fake a geometry, however, it may be relevant if you're replacing
92  *	metadisk, or using lofi to get crypto.
93  *	It makes sense to do lofiadm -c aes -a /dev/dsk/c0t0d0s4 /dev/lofi/1
94  *	and then in /etc/vfstab have an entry for /dev/lofi/1 as /export/home.
95  *	In fact this even makes sense if you have lofi "above" metadisk.
96  *
97  * Encryption:
98  *	Each lofi device can have its own symmetric key and cipher.
99  *	They are passed to us by lofiadm(1m) in the correct format for use
100  *	with the misc/kcf crypto_* routines.
101  *
102  *	Each block has its own IV, that is calculated in lofi_blk_mech(), based
103  *	on the "master" key held in the lsp and the block number of the buffer.
104  */
105 
106 #include <sys/types.h>
107 #include <netinet/in.h>
108 #include <sys/sysmacros.h>
109 #include <sys/uio.h>
110 #include <sys/kmem.h>
111 #include <sys/cred.h>
112 #include <sys/mman.h>
113 #include <sys/errno.h>
114 #include <sys/aio_req.h>
115 #include <sys/stat.h>
116 #include <sys/file.h>
117 #include <sys/modctl.h>
118 #include <sys/conf.h>
119 #include <sys/debug.h>
120 #include <sys/vnode.h>
121 #include <sys/lofi.h>
122 #include <sys/fcntl.h>
123 #include <sys/pathname.h>
124 #include <sys/filio.h>
125 #include <sys/fdio.h>
126 #include <sys/open.h>
127 #include <sys/disp.h>
128 #include <vm/seg_map.h>
129 #include <sys/ddi.h>
130 #include <sys/sunddi.h>
131 #include <sys/zmod.h>
132 #include <sys/crypto/common.h>
133 #include <sys/crypto/api.h>
134 #include <LzmaDec.h>
135 
136 /*
137  * The basis for CRYOFF is derived from usr/src/uts/common/sys/fs/ufs_fs.h.
138  * Crypto metadata, if it exists, is located at the end of the boot block
139  * (BBOFF + BBSIZE, which is SBOFF).  The super block and everything after
140  * is offset by the size of the crypto metadata which is handled by
141  * lsp->ls_crypto_offset.
142  */
143 #define	CRYOFF	((off_t)8192)
144 
/*
 * Names of the per-minor device properties; lofi_free_handle() removes
 * both with ddi_prop_remove() when a mapping is torn down.
 */
#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME		"Size"
147 
/*
 * Initialize the crypto_data_t "cd" so that it describes the single raw
 * buffer "buf" of "len" bytes, starting at offset 0.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can safely be used as the body of an unbraced
 * if/else.  Call sites supply the terminating semicolon.  Note that "cd"
 * and "len" are evaluated more than once; do not pass expressions with
 * side effects.
 */
#define	SETUP_C_DATA(cd, buf, len) 			\
	do {						\
		(cd).cd_format = CRYPTO_DATA_RAW;	\
		(cd).cd_offset = 0;			\
		(cd).cd_miscdata = NULL;		\
		(cd).cd_length = (len);			\
		(cd).cd_raw.iov_base = (buf);		\
		(cd).cd_raw.iov_len = (len);		\
	} while (0)
155 
/*
 * Make the enclosing function return EINVAL unless both the starting
 * offset and the residual byte count of "uio" are multiples of DEV_BSIZE.
 *
 * This macro contains a return statement, so it may only be used inside a
 * function that returns an int error code.  It is wrapped in
 * do { } while (0) so that it behaves as a single statement and cannot
 * capture a following "else".
 */
#define	UIO_CHECK(uio)	\
	do { \
		if (((uio)->uio_loffset % DEV_BSIZE) != 0 || \
		    ((uio)->uio_resid % DEV_BSIZE) != 0) { \
			return (EINVAL); \
		} \
	} while (0)
161 
/*
 * Driver-wide state.
 *
 * lofi_dip	- devinfo node used when removing minor nodes and properties.
 * lofi_statep	- ddi_soft_state(9F) anchor; one lofi_state per minor number.
 * lofi_lock	- held across open/close and handle teardown; the helpers
 *		  that touch per-minor open state ASSERT that it is owned.
 */
static dev_info_t *lofi_dip = NULL;
static void *lofi_statep = NULL;
static kmutex_t lofi_lock;		/* state lock */
165 
166 /*
167  * Because lofi_taskq_nthreads limits the actual swamping of the device, the
168  * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
169  * high.  If we want to be assured that the underlying device is always busy,
170  * we must be sure that the number of bytes enqueued when the number of
171  * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
172  * the duration of the sleep time in taskq_ent_alloc().  That is, lofi should
173  * set maxalloc to be the maximum throughput (in bytes per second) of the
174  * underlying device divided by the minimum I/O size.  We assume a realistic
175  * maximum throughput of one hundred megabytes per second; we set maxalloc on
176  * the lofi task queue to be 104857600 divided by DEV_BSIZE.
177  */
/* 100MB/s expressed in DEV_BSIZE-sized requests; see the rationale above */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
/*
 * lofi_taskq_nthreads is also the number of elements in each device's
 * ls_comp_bufs array (see lofi_free_handle()).
 */
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */
180 
/*
 * lofi_max_files is the highest minor number scanned by lofi_busy() when
 * looking for existing mappings.
 *
 * lofi_crypto_magic holds the LOFI_CRYPTO_MAGIC bytes; presumably it is
 * compared against the on-disk crypto header elsewhere in this file --
 * TODO confirm against the mapping code.
 */
uint32_t lofi_max_files = LOFI_MAX_FILES;
const char lofi_crypto_magic[6] = LOFI_CRYPTO_MAGIC;
183 
184 /*
185  * To avoid decompressing data in a compressed segment multiple times
186  * when accessing small parts of a segment's data, we cache and reuse
187  * the uncompressed segment's data.
188  *
189  * A single cached segment is sufficient to avoid lots of duplicate
190  * segment decompress operations. A small cache size also reduces the
191  * memory footprint.
192  *
193  * lofi_max_comp_cache is the maximum number of decompressed data segments
194  * cached for each compressed lofi image. It can be set to 0 to disable
195  * caching.
196  */
197 
/* consulted by lofi_add_comp_data() on every insertion; 0 disables caching */
uint32_t lofi_max_comp_cache = 1;
199 
/*
 * Decompression routines referenced by lofi_compress_table; both are
 * defined later in this file and return 0 on success, -1 on failure.
 */
static int gzip_decompress(void *src, size_t srclen, void *dst,
	size_t *destlen, int level);

static int lzma_decompress(void *src, size_t srclen, void *dst,
	size_t *dstlen, int level);

/*
 * Table of supported compression algorithms.  Each initializer supplies,
 * in order: the decompression routine, a second function pointer that is
 * NULL for every entry (presumably the compression routine, which the
 * driver does not need -- confirm against lofi_compress_info_t), a
 * compression level, and the algorithm name.
 */
lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
	{gzip_decompress,	NULL,	6,	"gzip-6"},
	{gzip_decompress,	NULL,	9,	"gzip-9"},
	{lzma_decompress,	NULL,	0,	"lzma"}
};
212 
213 /*ARGSUSED*/
214 static void
215 *SzAlloc(void *p, size_t size)
216 {
217 	return (kmem_alloc(size, KM_SLEEP));
218 }
219 
/*
 * Release callback installed in g_Alloc; the counterpart of SzAlloc().
 * The cookie "p" is unused.  "size" must be the size of the buffer at
 * "address", since it is passed straight through to kmem_free().
 */
/*ARGSUSED*/
static void
SzFree(void *p, void *address, size_t size)
{
	void	*mem = address;
	size_t	memsz = size;

	kmem_free(mem, memsz);
}
226 
/* kmem-backed allocator vector passed to LzmaDecode() in lzma_decompress() */
static ISzAlloc g_Alloc = { SzAlloc, SzFree };
228 
229 /*
230  * Free data referenced by the linked list of cached uncompressed
231  * segments.
232  */
233 static void
234 lofi_free_comp_cache(struct lofi_state *lsp)
235 {
236 	struct lofi_comp_cache *lc;
237 
238 	while ((lc = list_remove_head(&lsp->ls_comp_cache)) != NULL) {
239 		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
240 		kmem_free(lc, sizeof (struct lofi_comp_cache));
241 		lsp->ls_comp_cache_count--;
242 	}
243 	ASSERT(lsp->ls_comp_cache_count == 0);
244 }
245 
246 static int
247 lofi_busy(void)
248 {
249 	minor_t	minor;
250 
251 	/*
252 	 * We need to make sure no mappings exist - mod_remove won't
253 	 * help because the device isn't open.
254 	 */
255 	mutex_enter(&lofi_lock);
256 	for (minor = 1; minor <= lofi_max_files; minor++) {
257 		if (ddi_get_soft_state(lofi_statep, minor) != NULL) {
258 			mutex_exit(&lofi_lock);
259 			return (EBUSY);
260 		}
261 	}
262 	mutex_exit(&lofi_lock);
263 	return (0);
264 }
265 
266 static int
267 is_opened(struct lofi_state *lsp)
268 {
269 	ASSERT(mutex_owned(&lofi_lock));
270 	return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count);
271 }
272 
273 static int
274 mark_opened(struct lofi_state *lsp, int otyp)
275 {
276 	ASSERT(mutex_owned(&lofi_lock));
277 	switch (otyp) {
278 	case OTYP_CHR:
279 		lsp->ls_chr_open = 1;
280 		break;
281 	case OTYP_BLK:
282 		lsp->ls_blk_open = 1;
283 		break;
284 	case OTYP_LYR:
285 		lsp->ls_lyr_open_count++;
286 		break;
287 	default:
288 		return (-1);
289 	}
290 	return (0);
291 }
292 
293 static void
294 mark_closed(struct lofi_state *lsp, int otyp)
295 {
296 	ASSERT(mutex_owned(&lofi_lock));
297 	switch (otyp) {
298 	case OTYP_CHR:
299 		lsp->ls_chr_open = 0;
300 		break;
301 	case OTYP_BLK:
302 		lsp->ls_blk_open = 0;
303 		break;
304 	case OTYP_LYR:
305 		lsp->ls_lyr_open_count--;
306 		break;
307 	default:
308 		break;
309 	}
310 }
311 
312 static void
313 lofi_free_crypto(struct lofi_state *lsp)
314 {
315 	ASSERT(mutex_owned(&lofi_lock));
316 
317 	if (lsp->ls_crypto_enabled) {
318 		/*
319 		 * Clean up the crypto state so that it doesn't hang around
320 		 * in memory after we are done with it.
321 		 */
322 		bzero(lsp->ls_key.ck_data,
323 		    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
324 		kmem_free(lsp->ls_key.ck_data,
325 		    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
326 		lsp->ls_key.ck_data = NULL;
327 		lsp->ls_key.ck_length = 0;
328 
329 		if (lsp->ls_mech.cm_param != NULL) {
330 			kmem_free(lsp->ls_mech.cm_param,
331 			    lsp->ls_mech.cm_param_len);
332 			lsp->ls_mech.cm_param = NULL;
333 			lsp->ls_mech.cm_param_len = 0;
334 		}
335 
336 		if (lsp->ls_iv_mech.cm_param != NULL) {
337 			kmem_free(lsp->ls_iv_mech.cm_param,
338 			    lsp->ls_iv_mech.cm_param_len);
339 			lsp->ls_iv_mech.cm_param = NULL;
340 			lsp->ls_iv_mech.cm_param_len = 0;
341 		}
342 
343 		mutex_destroy(&lsp->ls_crypto_lock);
344 	}
345 }
346 
/*
 * Release everything associated with the mapping for "minor" and free its
 * soft state.
 *
 *	dev	- any dev_t of this driver; only its major number is used
 *	minor	- minor number of the mapping being destroyed
 *	lsp	- soft state of that minor; invalid once this returns
 *	credp	- credentials passed to VOP_CLOSE() on the backing vnode
 *
 * The caller must hold lofi_lock.
 */
static void
lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp,
    cred_t *credp)
{
	dev_t	newdev;
	char	namebuf[50];
	int	i;

	ASSERT(mutex_owned(&lofi_lock));

	lofi_free_crypto(lsp);

	/* ls_vp is NULL if the backing file has already been closed */
	if (lsp->ls_vp) {
		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
		    1, 0, credp, NULL);
		VN_RELE(lsp->ls_vp);
		lsp->ls_vp = NULL;
	}

	/* drop the size properties attached to this minor's dev_t */
	newdev = makedevice(getmajor(dev), minor);
	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);

	/* remove the "<minor>" and "<minor>,raw" minor nodes */
	(void) snprintf(namebuf, sizeof (namebuf), "%d", minor);
	ddi_remove_minor_node(lofi_dip, namebuf);
	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor);
	ddi_remove_minor_node(lofi_dip, namebuf);

	kmem_free(lsp->ls_filename, lsp->ls_filename_sz);
	taskq_destroy(lsp->ls_taskq);
	if (lsp->ls_kstat) {
		kstat_delete(lsp->ls_kstat);
		mutex_destroy(&lsp->ls_kstat_lock);
	}

	/*
	 * Free cached decompressed segment data
	 */
	lofi_free_comp_cache(lsp);
	list_destroy(&lsp->ls_comp_cache);
	mutex_destroy(&lsp->ls_comp_cache_lock);

	/* a non-zero segment size means a compression index was allocated */
	if (lsp->ls_uncomp_seg_sz > 0) {
		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
		lsp->ls_uncomp_seg_sz = 0;
	}

	/*
	 * Free pre-allocated compressed buffers
	 */
	if (lsp->ls_comp_bufs != NULL) {
		for (i = 0; i < lofi_taskq_nthreads; i++) {
			if (lsp->ls_comp_bufs[i].bufsize > 0)
				kmem_free(lsp->ls_comp_bufs[i].buf,
				    lsp->ls_comp_bufs[i].bufsize);
		}
		kmem_free(lsp->ls_comp_bufs,
		    sizeof (struct compbuf) * lofi_taskq_nthreads);
		mutex_destroy(&lsp->ls_comp_bufs_lock);
	}

	mutex_destroy(&lsp->ls_vp_lock);

	/* lsp must not be referenced after this point */
	ddi_soft_state_free(lofi_statep, minor);
}
412 
413 /*ARGSUSED*/
414 static int
415 lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
416 {
417 	minor_t	minor;
418 	struct lofi_state *lsp;
419 
420 	mutex_enter(&lofi_lock);
421 	minor = getminor(*devp);
422 	if (minor == 0) {
423 		/* master control device */
424 		/* must be opened exclusively */
425 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR)) {
426 			mutex_exit(&lofi_lock);
427 			return (EINVAL);
428 		}
429 		lsp = ddi_get_soft_state(lofi_statep, 0);
430 		if (lsp == NULL) {
431 			mutex_exit(&lofi_lock);
432 			return (ENXIO);
433 		}
434 		if (is_opened(lsp)) {
435 			mutex_exit(&lofi_lock);
436 			return (EBUSY);
437 		}
438 		(void) mark_opened(lsp, OTYP_CHR);
439 		mutex_exit(&lofi_lock);
440 		return (0);
441 	}
442 
443 	/* otherwise, the mapping should already exist */
444 	lsp = ddi_get_soft_state(lofi_statep, minor);
445 	if (lsp == NULL) {
446 		mutex_exit(&lofi_lock);
447 		return (EINVAL);
448 	}
449 
450 	if (lsp->ls_vp == NULL) {
451 		mutex_exit(&lofi_lock);
452 		return (ENXIO);
453 	}
454 
455 	if (mark_opened(lsp, otyp) == -1) {
456 		mutex_exit(&lofi_lock);
457 		return (EINVAL);
458 	}
459 
460 	mutex_exit(&lofi_lock);
461 	return (0);
462 }
463 
464 /*ARGSUSED*/
465 static int
466 lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
467 {
468 	minor_t	minor;
469 	struct lofi_state *lsp;
470 
471 	mutex_enter(&lofi_lock);
472 	minor = getminor(dev);
473 	lsp = ddi_get_soft_state(lofi_statep, minor);
474 	if (lsp == NULL) {
475 		mutex_exit(&lofi_lock);
476 		return (EINVAL);
477 	}
478 	mark_closed(lsp, otyp);
479 
480 	/*
481 	 * If we forcibly closed the underlying device (li_force), or
482 	 * asked for cleanup (li_cleanup), finish up if we're the last
483 	 * out of the door.
484 	 */
485 	if (minor != 0 && !is_opened(lsp) &&
486 	    (lsp->ls_cleanup || lsp->ls_vp == NULL))
487 		lofi_free_handle(dev, minor, lsp, credp);
488 
489 	mutex_exit(&lofi_lock);
490 	return (0);
491 }
492 
493 /*
494  * Sets the mechanism's initialization vector (IV) if one is needed.
495  * The IV is computed from the data block number.  lsp->ls_mech is
496  * altered so that:
497  *	lsp->ls_mech.cm_param_len is set to the IV len.
498  *	lsp->ls_mech.cm_param is set to the IV.
499  */
500 static int
501 lofi_blk_mech(struct lofi_state *lsp, longlong_t lblkno)
502 {
503 	int	ret;
504 	crypto_data_t cdata;
505 	char	*iv;
506 	size_t	iv_len;
507 	size_t	min;
508 	void	*data;
509 	size_t	datasz;
510 
511 	ASSERT(mutex_owned(&lsp->ls_crypto_lock));
512 
513 	if (lsp == NULL)
514 		return (CRYPTO_DEVICE_ERROR);
515 
516 	/* lsp->ls_mech.cm_param{_len} has already been set for static iv */
517 	if (lsp->ls_iv_type == IVM_NONE) {
518 		return (CRYPTO_SUCCESS);
519 	}
520 
521 	/*
522 	 * if kmem already alloced from previous call and it's the same size
523 	 * we need now, just recycle it; allocate new kmem only if we have to
524 	 */
525 	if (lsp->ls_mech.cm_param == NULL ||
526 	    lsp->ls_mech.cm_param_len != lsp->ls_iv_len) {
527 		iv_len = lsp->ls_iv_len;
528 		iv = kmem_zalloc(iv_len, KM_SLEEP);
529 	} else {
530 		iv_len = lsp->ls_mech.cm_param_len;
531 		iv = lsp->ls_mech.cm_param;
532 		bzero(iv, iv_len);
533 	}
534 
535 	switch (lsp->ls_iv_type) {
536 	case IVM_ENC_BLKNO:
537 		/* iv is not static, lblkno changes each time */
538 		data = &lblkno;
539 		datasz = sizeof (lblkno);
540 		break;
541 	default:
542 		data = 0;
543 		datasz = 0;
544 		break;
545 	}
546 
547 	/*
548 	 * write blkno into the iv buffer padded on the left in case
549 	 * blkno ever grows bigger than its current longlong_t size
550 	 * or a variation other than blkno is used for the iv data
551 	 */
552 	min = MIN(datasz, iv_len);
553 	bcopy(data, iv + (iv_len - min), min);
554 
555 	/* encrypt the data in-place to get the IV */
556 	SETUP_C_DATA(cdata, iv, iv_len);
557 
558 	ret = crypto_encrypt(&lsp->ls_iv_mech, &cdata, &lsp->ls_key,
559 	    NULL, NULL, NULL);
560 	if (ret != CRYPTO_SUCCESS) {
561 		cmn_err(CE_WARN, "failed to create iv for block %lld: (0x%x)",
562 		    lblkno, ret);
563 		if (lsp->ls_mech.cm_param != iv)
564 			kmem_free(iv, iv_len);
565 
566 		return (ret);
567 	}
568 
569 	/* clean up the iv from the last computation */
570 	if (lsp->ls_mech.cm_param != NULL && lsp->ls_mech.cm_param != iv)
571 		kmem_free(lsp->ls_mech.cm_param, lsp->ls_mech.cm_param_len);
572 
573 	lsp->ls_mech.cm_param_len = iv_len;
574 	lsp->ls_mech.cm_param = iv;
575 
576 	return (CRYPTO_SUCCESS);
577 }
578 
579 /*
580  * Performs encryption and decryption of a chunk of data of size "len",
581  * one DEV_BSIZE block at a time.  "len" is assumed to be a multiple of
582  * DEV_BSIZE.
583  */
584 static int
585 lofi_crypto(struct lofi_state *lsp, struct buf *bp, caddr_t plaintext,
586     caddr_t ciphertext, size_t len, boolean_t op_encrypt)
587 {
588 	crypto_data_t cdata;
589 	crypto_data_t wdata;
590 	int ret;
591 	longlong_t lblkno = bp->b_lblkno;
592 
593 	mutex_enter(&lsp->ls_crypto_lock);
594 
595 	/*
596 	 * though we could encrypt/decrypt entire "len" chunk of data, we need
597 	 * to break it into DEV_BSIZE pieces to capture blkno incrementing
598 	 */
599 	SETUP_C_DATA(cdata, plaintext, len);
600 	cdata.cd_length = DEV_BSIZE;
601 	if (ciphertext != NULL) {		/* not in-place crypto */
602 		SETUP_C_DATA(wdata, ciphertext, len);
603 		wdata.cd_length = DEV_BSIZE;
604 	}
605 
606 	do {
607 		ret = lofi_blk_mech(lsp, lblkno);
608 		if (ret != CRYPTO_SUCCESS)
609 			continue;
610 
611 		if (op_encrypt) {
612 			ret = crypto_encrypt(&lsp->ls_mech, &cdata,
613 			    &lsp->ls_key, NULL,
614 			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
615 		} else {
616 			ret = crypto_decrypt(&lsp->ls_mech, &cdata,
617 			    &lsp->ls_key, NULL,
618 			    ((ciphertext != NULL) ? &wdata : NULL), NULL);
619 		}
620 
621 		cdata.cd_offset += DEV_BSIZE;
622 		if (ciphertext != NULL)
623 			wdata.cd_offset += DEV_BSIZE;
624 		lblkno++;
625 	} while (ret == CRYPTO_SUCCESS && cdata.cd_offset < len);
626 
627 	mutex_exit(&lsp->ls_crypto_lock);
628 
629 	if (ret != CRYPTO_SUCCESS) {
630 		cmn_err(CE_WARN, "%s failed for block %lld:  (0x%x)",
631 		    op_encrypt ? "crypto_encrypt()" : "crypto_decrypt()",
632 		    lblkno, ret);
633 	}
634 
635 	return (ret);
636 }
637 
/*
 * Transfer methods for lofi_rdwr():
 *	RDWR_RAW	- move the data with vn_rdwr() on the backing vnode
 *	RDWR_BCOPY	- move the data with bcopy() to/from "bcopy_locn"
 */
#define	RDWR_RAW	1
#define	RDWR_BCOPY	2
640 
641 static int
642 lofi_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
643     struct lofi_state *lsp, size_t len, int method, caddr_t bcopy_locn)
644 {
645 	ssize_t resid;
646 	int isread;
647 	int error;
648 
649 	/*
650 	 * Handles reads/writes for both plain and encrypted lofi
651 	 * Note:  offset is already shifted by lsp->ls_crypto_offset
652 	 * when it gets here.
653 	 */
654 
655 	isread = bp->b_flags & B_READ;
656 	if (isread) {
657 		if (method == RDWR_BCOPY) {
658 			/* DO NOT update bp->b_resid for bcopy */
659 			bcopy(bcopy_locn, bufaddr, len);
660 			error = 0;
661 		} else {		/* RDWR_RAW */
662 			error = vn_rdwr(UIO_READ, lsp->ls_vp, bufaddr, len,
663 			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
664 			    &resid);
665 			bp->b_resid = resid;
666 		}
667 		if (lsp->ls_crypto_enabled && error == 0) {
668 			if (lofi_crypto(lsp, bp, bufaddr, NULL, len,
669 			    B_FALSE) != CRYPTO_SUCCESS) {
670 				/*
671 				 * XXX: original code didn't set residual
672 				 * back to len because no error was expected
673 				 * from bcopy() if encryption is not enabled
674 				 */
675 				if (method != RDWR_BCOPY)
676 					bp->b_resid = len;
677 				error = EIO;
678 			}
679 		}
680 		return (error);
681 	} else {
682 		void *iobuf = bufaddr;
683 
684 		if (lsp->ls_crypto_enabled) {
685 			/* don't do in-place crypto to keep bufaddr intact */
686 			iobuf = kmem_alloc(len, KM_SLEEP);
687 			if (lofi_crypto(lsp, bp, bufaddr, iobuf, len,
688 			    B_TRUE) != CRYPTO_SUCCESS) {
689 				kmem_free(iobuf, len);
690 				if (method != RDWR_BCOPY)
691 					bp->b_resid = len;
692 				return (EIO);
693 			}
694 		}
695 		if (method == RDWR_BCOPY) {
696 			/* DO NOT update bp->b_resid for bcopy */
697 			bcopy(iobuf, bcopy_locn, len);
698 			error = 0;
699 		} else {		/* RDWR_RAW */
700 			error = vn_rdwr(UIO_WRITE, lsp->ls_vp, iobuf, len,
701 			    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
702 			    &resid);
703 			bp->b_resid = resid;
704 		}
705 		if (lsp->ls_crypto_enabled) {
706 			kmem_free(iobuf, len);
707 		}
708 		return (error);
709 	}
710 }
711 
/*
 * Transfer bp->b_bcount bytes between the buf's data area at "bufaddr"
 * and the backing file, going through segmap mappings of the file rather
 * than vn_rdwr().  The transfer is done in chunks of at most MAXBSIZE.
 *
 * bp->b_resid starts at bp->b_bcount and is reduced by each chunk that is
 * copied successfully.  The loop ends when everything has been moved,
 * when an error occurs, or when "offset" reaches lsp->ls_vp_comp_size.
 *
 * Returns 0 on success, or an errno value taken from the segmap fault,
 * from lofi_rdwr(), or from segmap_release().
 */
static int
lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
    struct lofi_state *lsp)
{
	int error;
	offset_t alignedoffset, mapoffset;
	size_t	xfersize;
	int	isread;
	int	smflags;
	caddr_t	mapaddr;
	size_t	len;
	enum seg_rw srw;
	int	save_error;

	/*
	 * Note:  offset is already shifted by lsp->ls_crypto_offset
	 * when it gets here.
	 */
	if (lsp->ls_crypto_enabled)
		ASSERT(lsp->ls_vp_comp_size == lsp->ls_vp_size);

	/*
	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
	 * an 8K boundary, but the buf transfer address may not be
	 * aligned on more than a 512-byte boundary (we don't enforce
	 * that even though we could). This matters since the initial
	 * part of the transfer may not start at offset 0 within the
	 * segmap'd chunk. So we have to compensate for that with
	 * 'mapoffset'. Subsequent chunks always start off at the
	 * beginning, and the last is capped by b_resid
	 *
	 * Visually, where "|" represents page map boundaries:
	 *   alignedoffset (mapaddr begins at this segmap boundary)
	 *    |   offset (from beginning of file)
	 *    |    |	   len
	 *    v    v	    v
	 * ===|====X========|====...======|========X====|====
	 *	   /-------------...---------------/
	 *		^ bp->b_bcount/bp->b_resid at start
	 *    /----/--------/----...------/--------/
	 *	^	^	^   ^		^
	 *	|	|	|   |		nth xfersize (<= MAXBSIZE)
	 *	|	|	2nd thru n-1st xfersize (= MAXBSIZE)
	 *	|	1st xfersize (<= MAXBSIZE)
	 *    mapoffset (offset into 1st segmap, non-0 1st time, 0 thereafter)
	 *
	 * Notes: "alignedoffset" is "offset" rounded down to nearest
	 * MAXBSIZE boundary.  "len" is next page boundary of size
	 * PAGESIZE after "alignedoffset".
	 */
	mapoffset = offset & MAXBOFFSET;
	alignedoffset = offset - mapoffset;
	bp->b_resid = bp->b_bcount;
	isread = bp->b_flags & B_READ;
	srw = isread ? S_READ : S_WRITE;
	do {
		/* never go past the end of the file or of the request */
		xfersize = MIN(lsp->ls_vp_comp_size - offset,
		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
		len = roundup(mapoffset + xfersize, PAGESIZE);
		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
		    alignedoffset, MAXBSIZE, 1, srw);
		/*
		 * Now fault in the pages. This lets us check
		 * for errors before we reference mapaddr and
		 * try to resolve the fault in bcopy (which would
		 * panic instead). And this can easily happen,
		 * particularly if you've lofi'd a file over NFS
		 * and someone deletes the file on the server.
		 */
		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTLOCK, srw);
		if (error) {
			(void) segmap_release(segkmap, mapaddr, 0);
			/* translate the fault code into an errno value */
			if (FC_CODE(error) == FC_OBJERR)
				error = FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		/*
		 * error may be non-zero for encrypted lofi.  The offset
		 * argument is passed as 0; lofi_rdwr() only uses it for
		 * RDWR_RAW transfers.
		 */
		error = lofi_rdwr(bufaddr, 0, bp, lsp, xfersize,
		    RDWR_BCOPY, mapaddr + mapoffset);
		if (error == 0) {
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
		}
		smflags = 0;
		if (isread) {
			smflags |= SM_FREE;
			/*
			 * If we're reading an entire page starting
			 * at a page boundary, there's a good chance
			 * we won't need it again. Put it on the
			 * head of the freelist.
			 */
			if (mapoffset == 0 && xfersize == MAXBSIZE)
				smflags |= SM_DONTNEED;
		} else {
			/*
			 * Write back good pages, it is okay to
			 * always release asynchronous here as we'll
			 * follow with VOP_FSYNC for B_SYNC buffers.
			 */
			if (error == 0)
				smflags |= SM_WRITE | SM_ASYNC;
		}
		/* undo the F_SOFTLOCK taken above before releasing the map */
		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
		    len, F_SOFTUNLOCK, srw);
		save_error = segmap_release(segkmap, mapaddr, smflags);
		/* a copy error takes precedence over a release error */
		if (error == 0)
			error = save_error;
		/* only the first map may start partial */
		mapoffset = 0;
		alignedoffset += MAXBSIZE;
	} while ((error == 0) && (bp->b_resid > 0) &&
	    (offset < lsp->ls_vp_comp_size));

	return (error);
}
832 
833 /*
834  * Check if segment seg_index is present in the decompressed segment
835  * data cache.
836  *
837  * Returns a pointer to the decompressed segment data cache entry if
838  * found, and NULL when decompressed data for this segment is not yet
839  * cached.
840  */
841 static struct lofi_comp_cache *
842 lofi_find_comp_data(struct lofi_state *lsp, uint64_t seg_index)
843 {
844 	struct lofi_comp_cache *lc;
845 
846 	ASSERT(mutex_owned(&lsp->ls_comp_cache_lock));
847 
848 	for (lc = list_head(&lsp->ls_comp_cache); lc != NULL;
849 	    lc = list_next(&lsp->ls_comp_cache, lc)) {
850 		if (lc->lc_index == seg_index) {
851 			/*
852 			 * Decompressed segment data was found in the
853 			 * cache.
854 			 *
855 			 * The cache uses an LRU replacement strategy;
856 			 * move the entry to head of list.
857 			 */
858 			list_remove(&lsp->ls_comp_cache, lc);
859 			list_insert_head(&lsp->ls_comp_cache, lc);
860 			return (lc);
861 		}
862 	}
863 	return (NULL);
864 }
865 
866 /*
867  * Add the data for a decompressed segment at segment index
868  * seg_index to the cache of the decompressed segments.
869  *
870  * Returns a pointer to the cache element structure in case
871  * the data was added to the cache; returns NULL when the data
872  * wasn't cached.
873  */
874 static struct lofi_comp_cache *
875 lofi_add_comp_data(struct lofi_state *lsp, uint64_t seg_index,
876     uchar_t *data)
877 {
878 	struct lofi_comp_cache *lc;
879 
880 	ASSERT(mutex_owned(&lsp->ls_comp_cache_lock));
881 
882 	while (lsp->ls_comp_cache_count > lofi_max_comp_cache) {
883 		lc = list_remove_tail(&lsp->ls_comp_cache);
884 		ASSERT(lc != NULL);
885 		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
886 		kmem_free(lc, sizeof (struct lofi_comp_cache));
887 		lsp->ls_comp_cache_count--;
888 	}
889 
890 	/*
891 	 * Do not cache when disabled by tunable variable
892 	 */
893 	if (lofi_max_comp_cache == 0)
894 		return (NULL);
895 
896 	/*
897 	 * When the cache has not yet reached the maximum allowed
898 	 * number of segments, allocate a new cache element.
899 	 * Otherwise the cache is full; reuse the last list element
900 	 * (LRU) for caching the decompressed segment data.
901 	 *
902 	 * The cache element for the new decompressed segment data is
903 	 * added to the head of the list.
904 	 */
905 	if (lsp->ls_comp_cache_count < lofi_max_comp_cache) {
906 		lc = kmem_alloc(sizeof (struct lofi_comp_cache), KM_SLEEP);
907 		lc->lc_data = NULL;
908 		list_insert_head(&lsp->ls_comp_cache, lc);
909 		lsp->ls_comp_cache_count++;
910 	} else {
911 		lc = list_remove_tail(&lsp->ls_comp_cache);
912 		if (lc == NULL)
913 			return (NULL);
914 		list_insert_head(&lsp->ls_comp_cache, lc);
915 	}
916 
917 	/*
918 	 * Free old uncompressed segment data when reusing a cache
919 	 * entry.
920 	 */
921 	if (lc->lc_data != NULL)
922 		kmem_free(lc->lc_data, lsp->ls_uncomp_seg_sz);
923 
924 	lc->lc_data = data;
925 	lc->lc_index = seg_index;
926 	return (lc);
927 }
928 
929 
930 /*ARGSUSED*/
931 static int
932 gzip_decompress(void *src, size_t srclen, void *dst,
933     size_t *dstlen, int level)
934 {
935 	ASSERT(*dstlen >= srclen);
936 
937 	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
938 		return (-1);
939 	return (0);
940 }
941 
942 #define	LZMA_HEADER_SIZE	(LZMA_PROPS_SIZE + 8)
943 /*ARGSUSED*/
944 static int
945 lzma_decompress(void *src, size_t srclen, void *dst,
946 	size_t *dstlen, int level)
947 {
948 	size_t insizepure;
949 	void *actual_src;
950 	ELzmaStatus status;
951 
952 	insizepure = srclen - LZMA_HEADER_SIZE;
953 	actual_src = (void *)((Byte *)src + LZMA_HEADER_SIZE);
954 
955 	if (LzmaDecode((Byte *)dst, (size_t *)dstlen,
956 	    (const Byte *)actual_src, &insizepure,
957 	    (const Byte *)src, LZMA_PROPS_SIZE, LZMA_FINISH_ANY, &status,
958 	    &g_Alloc) != SZ_OK) {
959 		return (-1);
960 	}
961 	return (0);
962 }
963 
964 /*
965  * This is basically what strategy used to be before we found we
966  * needed task queues.
967  */
968 static void
969 lofi_strategy_task(void *arg)
970 {
971 	struct buf *bp = (struct buf *)arg;
972 	int error;
973 	int syncflag = 0;
974 	struct lofi_state *lsp;
975 	offset_t offset;
976 	caddr_t	bufaddr;
977 	size_t	len;
978 	size_t	xfersize;
979 	boolean_t bufinited = B_FALSE;
980 
981 	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
982 	if (lsp == NULL) {
983 		error = ENXIO;
984 		goto errout;
985 	}
986 	if (lsp->ls_kstat) {
987 		mutex_enter(lsp->ls_kstat->ks_lock);
988 		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
989 		mutex_exit(lsp->ls_kstat->ks_lock);
990 	}
991 	bp_mapin(bp);
992 	bufaddr = bp->b_un.b_addr;
993 	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
994 	if (lsp->ls_crypto_enabled) {
995 		/* encrypted data really begins after crypto header */
996 		offset += lsp->ls_crypto_offset;
997 	}
998 	len = bp->b_bcount;
999 	bufinited = B_TRUE;
1000 
1001 	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
1002 		error = EIO;
1003 		goto errout;
1004 	}
1005 
1006 	/*
1007 	 * If we're writing and the buffer was not B_ASYNC
1008 	 * we'll follow up with a VOP_FSYNC() to force any
1009 	 * asynchronous I/O to stable storage.
1010 	 */
1011 	if (!(bp->b_flags & B_READ) && !(bp->b_flags & B_ASYNC))
1012 		syncflag = FSYNC;
1013 
1014 	/*
1015 	 * We used to always use vn_rdwr here, but we cannot do that because
1016 	 * we might decide to read or write from the the underlying
1017 	 * file during this call, which would be a deadlock because
1018 	 * we have the rw_lock. So instead we page, unless it's not
1019 	 * mapable or it's a character device or it's an encrypted lofi.
1020 	 */
1021 	if ((lsp->ls_vp->v_flag & VNOMAP) || (lsp->ls_vp->v_type == VCHR) ||
1022 	    lsp->ls_crypto_enabled) {
1023 		error = lofi_rdwr(bufaddr, offset, bp, lsp, len, RDWR_RAW,
1024 		    NULL);
1025 	} else if (lsp->ls_uncomp_seg_sz == 0) {
1026 		error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
1027 	} else {
1028 		uchar_t *compressed_seg = NULL, *cmpbuf;
1029 		uchar_t *uncompressed_seg = NULL;
1030 		lofi_compress_info_t *li;
1031 		size_t oblkcount;
1032 		ulong_t seglen;
1033 		uint64_t sblkno, eblkno, cmpbytes;
1034 		uint64_t uncompressed_seg_index;
1035 		struct lofi_comp_cache *lc;
1036 		offset_t sblkoff, eblkoff;
1037 		u_offset_t salign, ealign;
1038 		u_offset_t sdiff;
1039 		uint32_t comp_data_sz;
1040 		uint64_t i;
1041 		int j;
1042 
1043 		/*
1044 		 * From here on we're dealing primarily with compressed files
1045 		 */
1046 		ASSERT(!lsp->ls_crypto_enabled);
1047 
1048 		/*
1049 		 * Compressed files can only be read from and
1050 		 * not written to
1051 		 */
1052 		if (!(bp->b_flags & B_READ)) {
1053 			bp->b_resid = bp->b_bcount;
1054 			error = EROFS;
1055 			goto done;
1056 		}
1057 
1058 		ASSERT(lsp->ls_comp_algorithm_index >= 0);
1059 		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
1060 		/*
1061 		 * Compute starting and ending compressed segment numbers
1062 		 * We use only bitwise operations avoiding division and
1063 		 * modulus because we enforce the compression segment size
1064 		 * to a power of 2
1065 		 */
1066 		sblkno = offset >> lsp->ls_comp_seg_shift;
1067 		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
1068 		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
1069 		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);
1070 
1071 		/*
1072 		 * Check the decompressed segment cache.
1073 		 *
1074 		 * The cache is used only when the requested data
1075 		 * is within a segment. Requests that cross
1076 		 * segment boundaries bypass the cache.
1077 		 */
1078 		if (sblkno == eblkno ||
1079 		    (sblkno + 1 == eblkno && eblkoff == 0)) {
1080 			/*
1081 			 * Request doesn't cross a segment boundary,
1082 			 * now check the cache.
1083 			 */
1084 			mutex_enter(&lsp->ls_comp_cache_lock);
1085 			lc = lofi_find_comp_data(lsp, sblkno);
1086 			if (lc != NULL) {
1087 				/*
1088 				 * We've found the decompressed segment
1089 				 * data in the cache; reuse it.
1090 				 */
1091 				bcopy(lc->lc_data + sblkoff, bufaddr,
1092 				    bp->b_bcount);
1093 				mutex_exit(&lsp->ls_comp_cache_lock);
1094 				bp->b_resid = 0;
1095 				error = 0;
1096 				goto done;
1097 			}
1098 			mutex_exit(&lsp->ls_comp_cache_lock);
1099 		}
1100 
1101 		/*
1102 		 * Align start offset to block boundary for segmap
1103 		 */
1104 		salign = lsp->ls_comp_seg_index[sblkno];
1105 		sdiff = salign & (DEV_BSIZE - 1);
1106 		salign -= sdiff;
1107 		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
1108 			/*
1109 			 * We're dealing with the last segment of
1110 			 * the compressed file -- the size of this
1111 			 * segment *may not* be the same as the
1112 			 * segment size for the file
1113 			 */
1114 			eblkoff = (offset + bp->b_bcount) &
1115 			    (lsp->ls_uncomp_last_seg_sz - 1);
1116 			ealign = lsp->ls_vp_comp_size;
1117 		} else {
1118 			ealign = lsp->ls_comp_seg_index[eblkno + 1];
1119 		}
1120 
1121 		/*
1122 		 * Preserve original request paramaters
1123 		 */
1124 		oblkcount = bp->b_bcount;
1125 
1126 		/*
1127 		 * Assign the calculated parameters
1128 		 */
1129 		comp_data_sz = ealign - salign;
1130 		bp->b_bcount = comp_data_sz;
1131 
1132 		/*
1133 		 * Buffers to hold compressed segments are pre-allocated
1134 		 * on a per-thread basis. Find a pre-allocated buffer
1135 		 * that is not currently in use and mark it for use.
1136 		 */
1137 		mutex_enter(&lsp->ls_comp_bufs_lock);
1138 		for (j = 0; j < lofi_taskq_nthreads; j++) {
1139 			if (lsp->ls_comp_bufs[j].inuse == 0) {
1140 				lsp->ls_comp_bufs[j].inuse = 1;
1141 				break;
1142 			}
1143 		}
1144 
1145 		mutex_exit(&lsp->ls_comp_bufs_lock);
1146 		ASSERT(j < lofi_taskq_nthreads);
1147 
1148 		/*
1149 		 * If the pre-allocated buffer size does not match
1150 		 * the size of the I/O request, re-allocate it with
1151 		 * the appropriate size
1152 		 */
1153 		if (lsp->ls_comp_bufs[j].bufsize < bp->b_bcount) {
1154 			if (lsp->ls_comp_bufs[j].bufsize > 0)
1155 				kmem_free(lsp->ls_comp_bufs[j].buf,
1156 				    lsp->ls_comp_bufs[j].bufsize);
1157 			lsp->ls_comp_bufs[j].buf = kmem_alloc(bp->b_bcount,
1158 			    KM_SLEEP);
1159 			lsp->ls_comp_bufs[j].bufsize = bp->b_bcount;
1160 		}
1161 		compressed_seg = lsp->ls_comp_bufs[j].buf;
1162 
1163 		/*
1164 		 * Map in the calculated number of blocks
1165 		 */
1166 		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
1167 		    bp, lsp);
1168 
1169 		bp->b_bcount = oblkcount;
1170 		bp->b_resid = oblkcount;
1171 		if (error != 0)
1172 			goto done;
1173 
1174 		/*
1175 		 * decompress compressed blocks start
1176 		 */
1177 		cmpbuf = compressed_seg + sdiff;
1178 		for (i = sblkno; i <= eblkno; i++) {
1179 			ASSERT(i < lsp->ls_comp_index_sz - 1);
1180 			uchar_t *useg;
1181 
1182 			/*
1183 			 * The last segment is special in that it is
1184 			 * most likely not going to be the same
1185 			 * (uncompressed) size as the other segments.
1186 			 */
1187 			if (i == (lsp->ls_comp_index_sz - 2)) {
1188 				seglen = lsp->ls_uncomp_last_seg_sz;
1189 			} else {
1190 				seglen = lsp->ls_uncomp_seg_sz;
1191 			}
1192 
1193 			/*
1194 			 * Each of the segment index entries contains
1195 			 * the starting block number for that segment.
1196 			 * The number of compressed bytes in a segment
1197 			 * is thus the difference between the starting
1198 			 * block number of this segment and the starting
1199 			 * block number of the next segment.
1200 			 */
1201 			cmpbytes = lsp->ls_comp_seg_index[i + 1] -
1202 			    lsp->ls_comp_seg_index[i];
1203 
1204 			/*
1205 			 * The first byte in a compressed segment is a flag
1206 			 * that indicates whether this segment is compressed
1207 			 * at all.
1208 			 *
1209 			 * The variable 'useg' is used (instead of
1210 			 * uncompressed_seg) in this loop to keep a
1211 			 * reference to the uncompressed segment.
1212 			 *
1213 			 * N.B. If 'useg' is replaced with uncompressed_seg,
1214 			 * it leads to memory leaks and heap corruption in
1215 			 * corner cases where compressed segments lie
1216 			 * adjacent to uncompressed segments.
1217 			 */
1218 			if (*cmpbuf == UNCOMPRESSED) {
1219 				useg = cmpbuf + SEGHDR;
1220 			} else {
1221 				if (uncompressed_seg == NULL)
1222 					uncompressed_seg =
1223 					    kmem_alloc(lsp->ls_uncomp_seg_sz,
1224 					    KM_SLEEP);
1225 				useg = uncompressed_seg;
1226 				uncompressed_seg_index = i;
1227 
1228 				if (li->l_decompress((cmpbuf + SEGHDR),
1229 				    (cmpbytes - SEGHDR), uncompressed_seg,
1230 				    &seglen, li->l_level) != 0) {
1231 					error = EIO;
1232 					goto done;
1233 				}
1234 			}
1235 
1236 			/*
1237 			 * Determine how much uncompressed data we
1238 			 * have to copy and copy it
1239 			 */
1240 			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
1241 			if (i == eblkno)
1242 				xfersize -= (lsp->ls_uncomp_seg_sz - eblkoff);
1243 
1244 			bcopy((useg + sblkoff), bufaddr, xfersize);
1245 
1246 			cmpbuf += cmpbytes;
1247 			bufaddr += xfersize;
1248 			bp->b_resid -= xfersize;
1249 			sblkoff = 0;
1250 
1251 			if (bp->b_resid == 0)
1252 				break;
1253 		} /* decompress compressed blocks ends */
1254 
1255 		/*
1256 		 * Skip to done if there is no uncompressed data to cache
1257 		 */
1258 		if (uncompressed_seg == NULL)
1259 			goto done;
1260 
1261 		/*
1262 		 * Add the data for the last decompressed segment to
1263 		 * the cache.
1264 		 *
1265 		 * In case the uncompressed segment data was added to (and
1266 		 * is referenced by) the cache, make sure we don't free it
1267 		 * here.
1268 		 */
1269 		mutex_enter(&lsp->ls_comp_cache_lock);
1270 		if ((lc = lofi_add_comp_data(lsp, uncompressed_seg_index,
1271 		    uncompressed_seg)) != NULL) {
1272 			uncompressed_seg = NULL;
1273 		}
1274 		mutex_exit(&lsp->ls_comp_cache_lock);
1275 
1276 done:
1277 		if (compressed_seg != NULL) {
1278 			mutex_enter(&lsp->ls_comp_bufs_lock);
1279 			lsp->ls_comp_bufs[j].inuse = 0;
1280 			mutex_exit(&lsp->ls_comp_bufs_lock);
1281 		}
1282 		if (uncompressed_seg != NULL)
1283 			kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
1284 	} /* end of handling compressed files */
1285 
1286 	if ((error == 0) && (syncflag != 0))
1287 		error = VOP_FSYNC(lsp->ls_vp, syncflag, kcred, NULL);
1288 
1289 errout:
1290 	if (bufinited && lsp->ls_kstat) {
1291 		size_t n_done = bp->b_bcount - bp->b_resid;
1292 		kstat_io_t *kioptr;
1293 
1294 		mutex_enter(lsp->ls_kstat->ks_lock);
1295 		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
1296 		if (bp->b_flags & B_READ) {
1297 			kioptr->nread += n_done;
1298 			kioptr->reads++;
1299 		} else {
1300 			kioptr->nwritten += n_done;
1301 			kioptr->writes++;
1302 		}
1303 		kstat_runq_exit(kioptr);
1304 		mutex_exit(lsp->ls_kstat->ks_lock);
1305 	}
1306 
1307 	mutex_enter(&lsp->ls_vp_lock);
1308 	if (--lsp->ls_vp_iocount == 0)
1309 		cv_broadcast(&lsp->ls_vp_cv);
1310 	mutex_exit(&lsp->ls_vp_lock);
1311 
1312 	bioerror(bp, error);
1313 	biodone(bp);
1314 }
1315 
1316 static int
1317 lofi_strategy(struct buf *bp)
1318 {
1319 	struct lofi_state *lsp;
1320 	offset_t	offset;
1321 
1322 	/*
1323 	 * We cannot just do I/O here, because the current thread
1324 	 * _might_ end up back in here because the underlying filesystem
1325 	 * wants a buffer, which eventually gets into bio_recycle and
1326 	 * might call into lofi to write out a delayed-write buffer.
1327 	 * This is bad if the filesystem above lofi is the same as below.
1328 	 *
1329 	 * We could come up with a complex strategy using threads to
1330 	 * do the I/O asynchronously, or we could use task queues. task
1331 	 * queues were incredibly easy so they win.
1332 	 */
1333 	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
1334 	if (lsp == NULL) {
1335 		bioerror(bp, ENXIO);
1336 		biodone(bp);
1337 		return (0);
1338 	}
1339 
1340 	mutex_enter(&lsp->ls_vp_lock);
1341 	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
1342 		bioerror(bp, EIO);
1343 		biodone(bp);
1344 		mutex_exit(&lsp->ls_vp_lock);
1345 		return (0);
1346 	}
1347 
1348 	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
1349 	if (lsp->ls_crypto_enabled) {
1350 		/* encrypted data really begins after crypto header */
1351 		offset += lsp->ls_crypto_offset;
1352 	}
1353 	if (offset == lsp->ls_vp_size) {
1354 		/* EOF */
1355 		if ((bp->b_flags & B_READ) != 0) {
1356 			bp->b_resid = bp->b_bcount;
1357 			bioerror(bp, 0);
1358 		} else {
1359 			/* writes should fail */
1360 			bioerror(bp, ENXIO);
1361 		}
1362 		biodone(bp);
1363 		mutex_exit(&lsp->ls_vp_lock);
1364 		return (0);
1365 	}
1366 	if (offset > lsp->ls_vp_size) {
1367 		bioerror(bp, ENXIO);
1368 		biodone(bp);
1369 		mutex_exit(&lsp->ls_vp_lock);
1370 		return (0);
1371 	}
1372 	lsp->ls_vp_iocount++;
1373 	mutex_exit(&lsp->ls_vp_lock);
1374 
1375 	if (lsp->ls_kstat) {
1376 		mutex_enter(lsp->ls_kstat->ks_lock);
1377 		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
1378 		mutex_exit(lsp->ls_kstat->ks_lock);
1379 	}
1380 	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
1381 	return (0);
1382 }
1383 
1384 /*ARGSUSED2*/
1385 static int
1386 lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
1387 {
1388 	if (getminor(dev) == 0)
1389 		return (EINVAL);
1390 	UIO_CHECK(uio);
1391 	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
1392 }
1393 
1394 /*ARGSUSED2*/
1395 static int
1396 lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
1397 {
1398 	if (getminor(dev) == 0)
1399 		return (EINVAL);
1400 	UIO_CHECK(uio);
1401 	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
1402 }
1403 
1404 /*ARGSUSED2*/
1405 static int
1406 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
1407 {
1408 	if (getminor(dev) == 0)
1409 		return (EINVAL);
1410 	UIO_CHECK(aio->aio_uio);
1411 	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
1412 }
1413 
1414 /*ARGSUSED2*/
1415 static int
1416 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
1417 {
1418 	if (getminor(dev) == 0)
1419 		return (EINVAL);
1420 	UIO_CHECK(aio->aio_uio);
1421 	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
1422 }
1423 
1424 /*ARGSUSED*/
1425 static int
1426 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1427 {
1428 	switch (infocmd) {
1429 	case DDI_INFO_DEVT2DEVINFO:
1430 		*result = lofi_dip;
1431 		return (DDI_SUCCESS);
1432 	case DDI_INFO_DEVT2INSTANCE:
1433 		*result = 0;
1434 		return (DDI_SUCCESS);
1435 	}
1436 	return (DDI_FAILURE);
1437 }
1438 
1439 static int
1440 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1441 {
1442 	int	error;
1443 
1444 	if (cmd != DDI_ATTACH)
1445 		return (DDI_FAILURE);
1446 	error = ddi_soft_state_zalloc(lofi_statep, 0);
1447 	if (error == DDI_FAILURE) {
1448 		return (DDI_FAILURE);
1449 	}
1450 	error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
1451 	    DDI_PSEUDO, NULL);
1452 	if (error == DDI_FAILURE) {
1453 		ddi_soft_state_free(lofi_statep, 0);
1454 		return (DDI_FAILURE);
1455 	}
1456 	/* driver handles kernel-issued IOCTLs */
1457 	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
1458 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
1459 		ddi_remove_minor_node(dip, NULL);
1460 		ddi_soft_state_free(lofi_statep, 0);
1461 		return (DDI_FAILURE);
1462 	}
1463 	lofi_dip = dip;
1464 	ddi_report_dev(dip);
1465 	return (DDI_SUCCESS);
1466 }
1467 
1468 static int
1469 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1470 {
1471 	if (cmd != DDI_DETACH)
1472 		return (DDI_FAILURE);
1473 	if (lofi_busy())
1474 		return (DDI_FAILURE);
1475 	lofi_dip = NULL;
1476 	ddi_remove_minor_node(dip, NULL);
1477 	ddi_prop_remove_all(dip);
1478 	ddi_soft_state_free(lofi_statep, 0);
1479 	return (DDI_SUCCESS);
1480 }
1481 
1482 /*
1483  * With addition of encryption, be careful that encryption key is wiped before
1484  * kernel memory structures are freed, and also that key is not accidentally
1485  * passed out into userland structures.
1486  */
1487 static void
1488 free_lofi_ioctl(struct lofi_ioctl *klip)
1489 {
1490 	/* Make sure this encryption key doesn't stick around */
1491 	bzero(klip->li_key, sizeof (klip->li_key));
1492 	kmem_free(klip, sizeof (struct lofi_ioctl));
1493 }
1494 
1495 /*
1496  * These two just simplify the rest of the ioctls that need to copyin/out
1497  * the lofi_ioctl structure.
1498  */
1499 struct lofi_ioctl *
1500 copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, int flag)
1501 {
1502 	struct lofi_ioctl *klip;
1503 	int	error;
1504 
1505 	klip = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
1506 	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
1507 	if (error) {
1508 		free_lofi_ioctl(klip);
1509 		return (NULL);
1510 	}
1511 
1512 	/* make sure filename is always null-terminated */
1513 	klip->li_filename[MAXPATHLEN-1] = '\0';
1514 
1515 	/* validate minor number */
1516 	if (klip->li_minor > lofi_max_files) {
1517 		free_lofi_ioctl(klip);
1518 		cmn_err(CE_WARN, "attempt to map more than lofi_max_files (%d)",
1519 		    lofi_max_files);
1520 		return (NULL);
1521 	}
1522 	return (klip);
1523 }
1524 
1525 int
1526 copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
1527 	int flag)
1528 {
1529 	int	error;
1530 
1531 	/*
1532 	 * NOTE: Do NOT copy the crypto_key_t "back" to userland.
1533 	 * This ensures that an attacker can't trivially find the
1534 	 * key for a mapping just by issuing the ioctl.
1535 	 *
1536 	 * It can still be found by poking around in kmem with mdb(1),
1537 	 * but there is no point in making it easy when the info isn't
1538 	 * of any use in this direction anyway.
1539 	 *
1540 	 * Either way we don't actually have the raw key stored in
1541 	 * a form that we can get it anyway, since we just used it
1542 	 * to create a ctx template and didn't keep "the original".
1543 	 */
1544 	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
1545 	if (error)
1546 		return (EFAULT);
1547 	return (0);
1548 }
1549 
1550 /*
1551  * Return the minor number 'filename' is mapped to, if it is.
1552  */
1553 static int
1554 file_to_minor(char *filename)
1555 {
1556 	minor_t	minor;
1557 	struct lofi_state *lsp;
1558 
1559 	ASSERT(mutex_owned(&lofi_lock));
1560 	for (minor = 1; minor <= lofi_max_files; minor++) {
1561 		lsp = ddi_get_soft_state(lofi_statep, minor);
1562 		if (lsp == NULL)
1563 			continue;
1564 		if (strcmp(lsp->ls_filename, filename) == 0)
1565 			return (minor);
1566 	}
1567 	return (0);
1568 }
1569 
1570 /*
1571  * lofiadm does some validation, but since Joe Random (or crashme) could
1572  * do our ioctls, we need to do some validation too.
1573  */
1574 static int
1575 valid_filename(const char *filename)
1576 {
1577 	static char *blkprefix = "/dev/" LOFI_BLOCK_NAME "/";
1578 	static char *charprefix = "/dev/" LOFI_CHAR_NAME "/";
1579 
1580 	/* must be absolute path */
1581 	if (filename[0] != '/')
1582 		return (0);
1583 	/* must not be lofi */
1584 	if (strncmp(filename, blkprefix, strlen(blkprefix)) == 0)
1585 		return (0);
1586 	if (strncmp(filename, charprefix, strlen(charprefix)) == 0)
1587 		return (0);
1588 	return (1);
1589 }
1590 
1591 /*
1592  * Fakes up a disk geometry, and one big partition, based on the size
1593  * of the file. This is needed because we allow newfs'ing the device,
1594  * and newfs will do several disk ioctls to figure out the geometry and
1595  * partition information. It uses that information to determine the parameters
1596  * to pass to mkfs. Geometry is pretty much irrelevant these days, but we
1597  * have to support it.
1598  */
1599 static void
1600 fake_disk_geometry(struct lofi_state *lsp)
1601 {
1602 	u_offset_t dsize = lsp->ls_vp_size - lsp->ls_crypto_offset;
1603 
1604 	/* dk_geom - see dkio(7I) */
1605 	/*
1606 	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
1607 	 * of sectors), but that breaks programs like fdisk which want to
1608 	 * partition a disk by cylinder. With one cylinder, you can't create
1609 	 * an fdisk partition and put pcfs on it for testing (hard to pick
1610 	 * a number between one and one).
1611 	 *
1612 	 * The cheezy floppy test is an attempt to not have too few cylinders
1613 	 * for a small file, or so many on a big file that you waste space
1614 	 * for backup superblocks or cylinder group structures.
1615 	 */
1616 	if (dsize < (2 * 1024 * 1024)) /* floppy? */
1617 		lsp->ls_dkg.dkg_ncyl = dsize / (100 * 1024);
1618 	else
1619 		lsp->ls_dkg.dkg_ncyl = dsize / (300 * 1024);
1620 	/* in case file file is < 100k */
1621 	if (lsp->ls_dkg.dkg_ncyl == 0)
1622 		lsp->ls_dkg.dkg_ncyl = 1;
1623 	lsp->ls_dkg.dkg_acyl = 0;
1624 	lsp->ls_dkg.dkg_bcyl = 0;
1625 	lsp->ls_dkg.dkg_nhead = 1;
1626 	lsp->ls_dkg.dkg_obs1 = 0;
1627 	lsp->ls_dkg.dkg_intrlv = 0;
1628 	lsp->ls_dkg.dkg_obs2 = 0;
1629 	lsp->ls_dkg.dkg_obs3 = 0;
1630 	lsp->ls_dkg.dkg_apc = 0;
1631 	lsp->ls_dkg.dkg_rpm = 7200;
1632 	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl;
1633 	lsp->ls_dkg.dkg_nsect = dsize / (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl);
1634 	lsp->ls_dkg.dkg_write_reinstruct = 0;
1635 	lsp->ls_dkg.dkg_read_reinstruct = 0;
1636 
1637 	/* vtoc - see dkio(7I) */
1638 	bzero(&lsp->ls_vtoc, sizeof (struct vtoc));
1639 	lsp->ls_vtoc.v_sanity = VTOC_SANE;
1640 	lsp->ls_vtoc.v_version = V_VERSION;
1641 	(void) strncpy(lsp->ls_vtoc.v_volume, LOFI_DRIVER_NAME,
1642 	    sizeof (lsp->ls_vtoc.v_volume));
1643 	lsp->ls_vtoc.v_sectorsz = DEV_BSIZE;
1644 	lsp->ls_vtoc.v_nparts = 1;
1645 	lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED;
1646 
1647 	/*
1648 	 * A compressed file is read-only, other files can
1649 	 * be read-write
1650 	 */
1651 	if (lsp->ls_uncomp_seg_sz > 0) {
1652 		lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT | V_RONLY;
1653 	} else {
1654 		lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT;
1655 	}
1656 	lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0;
1657 	/*
1658 	 * The partition size cannot just be the number of sectors, because
1659 	 * that might not end on a cylinder boundary. And if that's the case,
1660 	 * newfs/mkfs will print a scary warning. So just figure the size
1661 	 * based on the number of cylinders and sectors/cylinder.
1662 	 */
1663 	lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
1664 	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
1665 
1666 	/* dk_cinfo - see dkio(7I) */
1667 	bzero(&lsp->ls_ci, sizeof (struct dk_cinfo));
1668 	(void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME);
1669 	lsp->ls_ci.dki_ctype = DKC_MD;
1670 	lsp->ls_ci.dki_flags = 0;
1671 	lsp->ls_ci.dki_cnum = 0;
1672 	lsp->ls_ci.dki_addr = 0;
1673 	lsp->ls_ci.dki_space = 0;
1674 	lsp->ls_ci.dki_prio = 0;
1675 	lsp->ls_ci.dki_vec = 0;
1676 	(void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME);
1677 	lsp->ls_ci.dki_unit = 0;
1678 	lsp->ls_ci.dki_slave = 0;
1679 	lsp->ls_ci.dki_partition = 0;
1680 	/*
1681 	 * newfs uses this to set maxcontig. Must not be < 16, or it
1682 	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
1683 	 * it by the block size. Then tunefs doesn't work because
1684 	 * maxcontig is 0.
1685 	 */
1686 	lsp->ls_ci.dki_maxtransfer = 16;
1687 }
1688 
1689 /*
1690  * map in a compressed file
1691  *
1692  * Read in the header and the index that follows.
1693  *
1694  * The header is as follows -
1695  *
1696  * Signature (name of the compression algorithm)
1697  * Compression segment size (a multiple of 512)
1698  * Number of index entries
1699  * Size of the last block
1700  * The array containing the index entries
1701  *
1702  * The header information is always stored in
1703  * network byte order on disk.
1704  */
1705 static int
1706 lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
1707 {
1708 	uint32_t index_sz, header_len, i;
1709 	ssize_t	resid;
1710 	enum uio_rw rw;
1711 	char *tbuf = buf;
1712 	int error;
1713 
1714 	/* The signature has already been read */
1715 	tbuf += sizeof (lsp->ls_comp_algorithm);
1716 	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
1717 	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);
1718 
1719 	/*
1720 	 * The compressed segment size must be a power of 2
1721 	 */
1722 	if (lsp->ls_uncomp_seg_sz < DEV_BSIZE ||
1723 	    !ISP2(lsp->ls_uncomp_seg_sz))
1724 		return (EINVAL);
1725 
1726 	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
1727 		;
1728 
1729 	lsp->ls_comp_seg_shift = i;
1730 
1731 	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
1732 	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
1733 	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);
1734 
1735 	tbuf += sizeof (lsp->ls_comp_index_sz);
1736 	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
1737 	    sizeof (lsp->ls_uncomp_last_seg_sz));
1738 	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);
1739 
1740 	/*
1741 	 * Compute the total size of the uncompressed data
1742 	 * for use in fake_disk_geometry and other calculations.
1743 	 * Disk geometry has to be faked with respect to the
1744 	 * actual uncompressed data size rather than the
1745 	 * compressed file size.
1746 	 */
1747 	lsp->ls_vp_size =
1748 	    (u_offset_t)(lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
1749 	    + lsp->ls_uncomp_last_seg_sz;
1750 
1751 	/*
1752 	 * Index size is rounded up to DEV_BSIZE for ease
1753 	 * of segmapping
1754 	 */
1755 	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
1756 	header_len = sizeof (lsp->ls_comp_algorithm) +
1757 	    sizeof (lsp->ls_uncomp_seg_sz) +
1758 	    sizeof (lsp->ls_comp_index_sz) +
1759 	    sizeof (lsp->ls_uncomp_last_seg_sz);
1760 	lsp->ls_comp_offbase = header_len + index_sz;
1761 
1762 	index_sz += header_len;
1763 	index_sz = roundup(index_sz, DEV_BSIZE);
1764 
1765 	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
1766 	lsp->ls_comp_index_data_sz = index_sz;
1767 
1768 	/*
1769 	 * Read in the index -- this has a side-effect
1770 	 * of reading in the header as well
1771 	 */
1772 	rw = UIO_READ;
1773 	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
1774 	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
1775 
1776 	if (error != 0)
1777 		return (error);
1778 
1779 	/* Skip the header, this is where the index really begins */
1780 	lsp->ls_comp_seg_index =
1781 	    /*LINTED*/
1782 	    (uint64_t *)(lsp->ls_comp_index_data + header_len);
1783 
1784 	/*
1785 	 * Now recompute offsets in the index to account for
1786 	 * the header length
1787 	 */
1788 	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
1789 		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
1790 		    BE_64(lsp->ls_comp_seg_index[i]);
1791 	}
1792 
1793 	/*
1794 	 * Finally setup per-thread pre-allocated buffers
1795 	 */
1796 	lsp->ls_comp_bufs = kmem_zalloc(lofi_taskq_nthreads *
1797 	    sizeof (struct compbuf), KM_SLEEP);
1798 	mutex_init(&lsp->ls_comp_bufs_lock, NULL, MUTEX_DRIVER, NULL);
1799 
1800 	return (error);
1801 }
1802 
1803 /*
1804  * Check to see if the passed in signature is a valid
1805  * one.  If it is valid, return the index into
1806  * lofi_compress_table.
1807  *
1808  * Return -1 if it is invalid
1809  */
1810 static int lofi_compress_select(char *signature)
1811 {
1812 	int i;
1813 
1814 	for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
1815 		if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
1816 			return (i);
1817 	}
1818 
1819 	return (-1);
1820 }
1821 
1822 /*
1823  * map a file to a minor number. Return the minor number.
1824  */
1825 static int
1826 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
1827     int *rvalp, struct cred *credp, int ioctl_flag)
1828 {
1829 	minor_t	newminor;
1830 	struct lofi_state *lsp;
1831 	struct lofi_ioctl *klip;
1832 	int	error;
1833 	struct vnode *vp;
1834 	int64_t	Nblocks_prop_val;
1835 	int64_t	Size_prop_val;
1836 	int	compress_index;
1837 	vattr_t	vattr;
1838 	int	flag;
1839 	enum vtype v_type;
1840 	int zalloced = 0;
1841 	dev_t	newdev;
1842 	char	namebuf[50];
1843 	char	buf[DEV_BSIZE];
1844 	char	crybuf[DEV_BSIZE];
1845 	ssize_t	resid;
1846 	boolean_t need_vn_close = B_FALSE;
1847 	boolean_t keycopied = B_FALSE;
1848 	boolean_t need_size_update = B_FALSE;
1849 
1850 	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1851 	if (klip == NULL)
1852 		return (EFAULT);
1853 
1854 	mutex_enter(&lofi_lock);
1855 
1856 	if (!valid_filename(klip->li_filename)) {
1857 		error = EINVAL;
1858 		goto out;
1859 	}
1860 
1861 	if (file_to_minor(klip->li_filename) != 0) {
1862 		error = EBUSY;
1863 		goto out;
1864 	}
1865 
1866 	if (pickminor) {
1867 		/* Find a free one */
1868 		for (newminor = 1; newminor <= lofi_max_files; newminor++)
1869 			if (ddi_get_soft_state(lofi_statep, newminor) == NULL)
1870 				break;
1871 		if (newminor >= lofi_max_files) {
1872 			error = EAGAIN;
1873 			goto out;
1874 		}
1875 	} else {
1876 		newminor = klip->li_minor;
1877 		if (ddi_get_soft_state(lofi_statep, newminor) != NULL) {
1878 			error = EEXIST;
1879 			goto out;
1880 		}
1881 	}
1882 
1883 	/* make sure it's valid */
1884 	error = lookupname(klip->li_filename, UIO_SYSSPACE, FOLLOW,
1885 	    NULLVPP, &vp);
1886 	if (error) {
1887 		goto out;
1888 	}
1889 	v_type = vp->v_type;
1890 	VN_RELE(vp);
1891 	if (!V_ISLOFIABLE(v_type)) {
1892 		error = EINVAL;
1893 		goto out;
1894 	}
1895 	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
1896 	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
1897 	if (error) {
1898 		/* try read-only */
1899 		flag &= ~FWRITE;
1900 		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
1901 		    &vp, 0, 0);
1902 		if (error) {
1903 			goto out;
1904 		}
1905 	}
1906 	need_vn_close = B_TRUE;
1907 
1908 	vattr.va_mask = AT_SIZE;
1909 	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
1910 	if (error) {
1911 		goto out;
1912 	}
1913 	/* the file needs to be a multiple of the block size */
1914 	if ((vattr.va_size % DEV_BSIZE) != 0) {
1915 		error = EINVAL;
1916 		goto out;
1917 	}
1918 	newdev = makedevice(getmajor(dev), newminor);
1919 	Size_prop_val = vattr.va_size;
1920 	if ((ddi_prop_update_int64(newdev, lofi_dip,
1921 	    SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) {
1922 		error = EINVAL;
1923 		goto out;
1924 	}
1925 	Nblocks_prop_val = vattr.va_size / DEV_BSIZE;
1926 	if ((ddi_prop_update_int64(newdev, lofi_dip,
1927 	    NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
1928 		error = EINVAL;
1929 		goto propout;
1930 	}
1931 	error = ddi_soft_state_zalloc(lofi_statep, newminor);
1932 	if (error == DDI_FAILURE) {
1933 		error = ENOMEM;
1934 		goto propout;
1935 	}
1936 	zalloced = 1;
1937 	(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
1938 	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, newminor,
1939 	    DDI_PSEUDO, NULL);
1940 	if (error != DDI_SUCCESS) {
1941 		error = ENXIO;
1942 		goto propout;
1943 	}
1944 	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor);
1945 	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, newminor,
1946 	    DDI_PSEUDO, NULL);
1947 	if (error != DDI_SUCCESS) {
1948 		/* remove block node */
1949 		(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
1950 		ddi_remove_minor_node(lofi_dip, namebuf);
1951 		error = ENXIO;
1952 		goto propout;
1953 	}
1954 	lsp = ddi_get_soft_state(lofi_statep, newminor);
1955 	lsp->ls_filename_sz = strlen(klip->li_filename) + 1;
1956 	lsp->ls_filename = kmem_alloc(lsp->ls_filename_sz, KM_SLEEP);
1957 	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
1958 	    LOFI_DRIVER_NAME, newminor);
1959 	lsp->ls_taskq = taskq_create(namebuf, lofi_taskq_nthreads,
1960 	    minclsyspri, 1, lofi_taskq_maxalloc, 0);
1961 	lsp->ls_kstat = kstat_create(LOFI_DRIVER_NAME, newminor,
1962 	    NULL, "disk", KSTAT_TYPE_IO, 1, 0);
1963 	if (lsp->ls_kstat) {
1964 		mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
1965 		lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
1966 		kstat_install(lsp->ls_kstat);
1967 	}
1968 	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
1969 	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);
1970 
1971 	list_create(&lsp->ls_comp_cache, sizeof (struct lofi_comp_cache),
1972 	    offsetof(struct lofi_comp_cache, lc_list));
1973 	mutex_init(&lsp->ls_comp_cache_lock, NULL, MUTEX_DRIVER, NULL);
1974 
1975 	/*
1976 	 * save open mode so file can be closed properly and vnode counts
1977 	 * updated correctly.
1978 	 */
1979 	lsp->ls_openflag = flag;
1980 
1981 	/*
1982 	 * Try to handle stacked lofs vnodes.
1983 	 */
1984 	if (vp->v_type == VREG) {
1985 		if (VOP_REALVP(vp, &lsp->ls_vp, NULL) != 0) {
1986 			lsp->ls_vp = vp;
1987 		} else {
1988 			/*
1989 			 * Even though vp was obtained via vn_open(), we
1990 			 * can't call vn_close() on it, since lofs will
1991 			 * pass the VOP_CLOSE() on down to the realvp
1992 			 * (which we are about to use). Hence we merely
1993 			 * drop the reference to the lofs vnode and hold
1994 			 * the realvp so things behave as if we've
1995 			 * opened the realvp without any interaction
1996 			 * with lofs.
1997 			 */
1998 			VN_HOLD(lsp->ls_vp);
1999 			VN_RELE(vp);
2000 		}
2001 	} else {
2002 		lsp->ls_vp = vp;
2003 	}
2004 	lsp->ls_vp_size = vattr.va_size;
2005 	(void) strcpy(lsp->ls_filename, klip->li_filename);
2006 	if (rvalp)
2007 		*rvalp = (int)newminor;
2008 	klip->li_minor = newminor;
2009 
2010 	/*
2011 	 * Initialize crypto details for encrypted lofi
2012 	 */
2013 	if (klip->li_crypto_enabled) {
2014 		int ret;
2015 
2016 		mutex_init(&lsp->ls_crypto_lock, NULL, MUTEX_DRIVER, NULL);
2017 
2018 		lsp->ls_mech.cm_type = crypto_mech2id(klip->li_cipher);
2019 		if (lsp->ls_mech.cm_type == CRYPTO_MECH_INVALID) {
2020 			cmn_err(CE_WARN, "invalid cipher %s requested for %s",
2021 			    klip->li_cipher, lsp->ls_filename);
2022 			error = EINVAL;
2023 			goto propout;
2024 		}
2025 
2026 		/* this is just initialization here */
2027 		lsp->ls_mech.cm_param = NULL;
2028 		lsp->ls_mech.cm_param_len = 0;
2029 
2030 		lsp->ls_iv_type = klip->li_iv_type;
2031 		lsp->ls_iv_mech.cm_type = crypto_mech2id(klip->li_iv_cipher);
2032 		if (lsp->ls_iv_mech.cm_type == CRYPTO_MECH_INVALID) {
2033 			cmn_err(CE_WARN, "invalid iv cipher %s requested"
2034 			    " for %s", klip->li_iv_cipher, lsp->ls_filename);
2035 			error = EINVAL;
2036 			goto propout;
2037 		}
2038 
2039 		/* iv mech must itself take a null iv */
2040 		lsp->ls_iv_mech.cm_param = NULL;
2041 		lsp->ls_iv_mech.cm_param_len = 0;
2042 		lsp->ls_iv_len = klip->li_iv_len;
2043 
2044 		/*
2045 		 * Create ctx using li_cipher & the raw li_key after checking
2046 		 * that it isn't a weak key.
2047 		 */
2048 		lsp->ls_key.ck_format = CRYPTO_KEY_RAW;
2049 		lsp->ls_key.ck_length = klip->li_key_len;
2050 		lsp->ls_key.ck_data = kmem_alloc(
2051 		    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length), KM_SLEEP);
2052 		bcopy(klip->li_key, lsp->ls_key.ck_data,
2053 		    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
2054 		keycopied = B_TRUE;
2055 
2056 		ret = crypto_key_check(&lsp->ls_mech, &lsp->ls_key);
2057 		if (ret != CRYPTO_SUCCESS) {
2058 			error = EINVAL;
2059 			cmn_err(CE_WARN, "weak key check failed for cipher "
2060 			    "%s on file %s (0x%x)", klip->li_cipher,
2061 			    lsp->ls_filename, ret);
2062 			goto propout;
2063 		}
2064 	}
2065 	lsp->ls_crypto_enabled = klip->li_crypto_enabled;
2066 
2067 	/*
2068 	 * Read the file signature to check if it is compressed or encrypted.
	 * Crypto signature is in a different location; both areas should
	 * be read to keep compression and encryption mutually exclusive.
2071 	 */
2072 	if (lsp->ls_crypto_enabled) {
2073 		error = vn_rdwr(UIO_READ, lsp->ls_vp, crybuf, DEV_BSIZE,
2074 		    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
2075 		if (error != 0)
2076 			goto propout;
2077 	}
2078 	error = vn_rdwr(UIO_READ, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
2079 	    0, RLIM64_INFINITY, kcred, &resid);
2080 	if (error != 0)
2081 		goto propout;
2082 
2083 	/* initialize these variables for all lofi files */
2084 	lsp->ls_comp_bufs = NULL;
2085 	lsp->ls_uncomp_seg_sz = 0;
2086 	lsp->ls_vp_comp_size = lsp->ls_vp_size;
2087 	lsp->ls_comp_algorithm[0] = '\0';
2088 
2089 	/* encrypted lofi reads/writes shifted by crypto metadata size */
2090 	lsp->ls_crypto_offset = 0;
2091 
2092 	/* this is a compressed lofi */
2093 	if ((compress_index = lofi_compress_select(buf)) != -1) {
2094 
2095 		/* compression and encryption are mutually exclusive */
2096 		if (klip->li_crypto_enabled) {
2097 			error = ENOTSUP;
2098 			goto propout;
2099 		}
2100 
2101 		/* initialize compression info for compressed lofi */
2102 		lsp->ls_comp_algorithm_index = compress_index;
2103 		(void) strlcpy(lsp->ls_comp_algorithm,
2104 		    lofi_compress_table[compress_index].l_name,
2105 		    sizeof (lsp->ls_comp_algorithm));
2106 
2107 		error = lofi_map_compressed_file(lsp, buf);
2108 		if (error != 0)
2109 			goto propout;
2110 		need_size_update = B_TRUE;
2111 
2112 	/* this is an encrypted lofi */
2113 	} else if (strncmp(crybuf, lofi_crypto_magic,
2114 	    sizeof (lofi_crypto_magic)) == 0) {
2115 
2116 		char *marker = crybuf;
2117 
2118 		/*
2119 		 * This is the case where the header in the lofi image is
2120 		 * already initialized to indicate it is encrypted.
2121 		 * There is another case (see below) where encryption is
2122 		 * requested but the lofi image has never been used yet,
2123 		 * so the header needs to be written with encryption magic.
2124 		 */
2125 
2126 		/* indicate this must be an encrypted lofi due to magic */
2127 		klip->li_crypto_enabled = B_TRUE;
2128 
2129 		/*
2130 		 * The encryption header information is laid out this way:
2131 		 *	6 bytes:	hex "CFLOFI"
2132 		 *	2 bytes:	version = 0 ... for now
2133 		 *	96 bytes:	reserved1 (not implemented yet)
2134 		 *	4 bytes:	data_sector = 2 ... for now
2135 		 *	more...		not implemented yet
2136 		 */
2137 
2138 		/* copy the magic */
2139 		bcopy(marker, lsp->ls_crypto.magic,
2140 		    sizeof (lsp->ls_crypto.magic));
2141 		marker += sizeof (lsp->ls_crypto.magic);
2142 
2143 		/* read the encryption version number */
2144 		bcopy(marker, &(lsp->ls_crypto.version),
2145 		    sizeof (lsp->ls_crypto.version));
2146 		lsp->ls_crypto.version = ntohs(lsp->ls_crypto.version);
2147 		marker += sizeof (lsp->ls_crypto.version);
2148 
2149 		/* read a chunk of reserved data */
2150 		bcopy(marker, lsp->ls_crypto.reserved1,
2151 		    sizeof (lsp->ls_crypto.reserved1));
2152 		marker += sizeof (lsp->ls_crypto.reserved1);
2153 
2154 		/* read block number where encrypted data begins */
2155 		bcopy(marker, &(lsp->ls_crypto.data_sector),
2156 		    sizeof (lsp->ls_crypto.data_sector));
2157 		lsp->ls_crypto.data_sector = ntohl(lsp->ls_crypto.data_sector);
2158 		marker += sizeof (lsp->ls_crypto.data_sector);
2159 
2160 		/* and ignore the rest until it is implemented */
2161 
2162 		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
2163 		need_size_update = B_TRUE;
2164 
2165 	/* neither compressed nor encrypted, BUT could be new encrypted lofi */
2166 	} else if (klip->li_crypto_enabled) {
2167 
2168 		/*
		 * This is the case where encryption was requested but the
		 * file appears to be entirely blank where the encryption
		 * header would have been in the lofi image.  If it is blank,
2172 		 * assume it is a brand new lofi image and initialize the
2173 		 * header area with encryption magic and current version
2174 		 * header data.  If it is not blank, that's an error.
2175 		 */
2176 		int	i;
2177 		char	*marker;
2178 		struct crypto_meta	chead;
2179 
2180 		for (i = 0; i < sizeof (struct crypto_meta); i++)
2181 			if (crybuf[i] != '\0')
2182 				break;
2183 		if (i != sizeof (struct crypto_meta)) {
2184 			error = EINVAL;
2185 			goto propout;
2186 		}
2187 
2188 		/* nothing there, initialize as encrypted lofi */
2189 		marker = crybuf;
2190 		bcopy(lofi_crypto_magic, marker, sizeof (lofi_crypto_magic));
2191 		marker += sizeof (lofi_crypto_magic);
2192 		chead.version = htons(LOFI_CRYPTO_VERSION);
2193 		bcopy(&(chead.version), marker, sizeof (chead.version));
2194 		marker += sizeof (chead.version);
2195 		marker += sizeof (chead.reserved1);
2196 		chead.data_sector = htonl(LOFI_CRYPTO_DATA_SECTOR);
2197 		bcopy(&(chead.data_sector), marker, sizeof (chead.data_sector));
2198 
2199 		/* write the header */
2200 		error = vn_rdwr(UIO_WRITE, lsp->ls_vp, crybuf, DEV_BSIZE,
2201 		    CRYOFF, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
2202 		if (error != 0)
2203 			goto propout;
2204 
2205 		/* fix things up so it looks like we read this info */
2206 		bcopy(lofi_crypto_magic, lsp->ls_crypto.magic,
2207 		    sizeof (lofi_crypto_magic));
2208 		lsp->ls_crypto.version = LOFI_CRYPTO_VERSION;
2209 		lsp->ls_crypto.data_sector = LOFI_CRYPTO_DATA_SECTOR;
2210 
2211 		lsp->ls_crypto_offset = lsp->ls_crypto.data_sector * DEV_BSIZE;
2212 		need_size_update = B_TRUE;
2213 	}
2214 
2215 	/*
2216 	 * Either lsp->ls_vp_size or lsp->ls_crypto_offset changed;
2217 	 * for encrypted lofi, advertise that it is somewhat shorter
2218 	 * due to embedded crypto metadata section
2219 	 */
2220 	if (need_size_update) {
2221 		/* update DDI properties */
2222 		Size_prop_val = lsp->ls_vp_size - lsp->ls_crypto_offset;
2223 		if ((ddi_prop_update_int64(newdev, lofi_dip, SIZE_PROP_NAME,
2224 		    Size_prop_val)) != DDI_PROP_SUCCESS) {
2225 			error = EINVAL;
2226 			goto propout;
2227 		}
2228 		Nblocks_prop_val =
2229 		    (lsp->ls_vp_size - lsp->ls_crypto_offset) / DEV_BSIZE;
2230 		if ((ddi_prop_update_int64(newdev, lofi_dip, NBLOCKS_PROP_NAME,
2231 		    Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
2232 			error = EINVAL;
2233 			goto propout;
2234 		}
2235 	}
2236 
2237 	fake_disk_geometry(lsp);
2238 	mutex_exit(&lofi_lock);
2239 	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2240 	free_lofi_ioctl(klip);
2241 	return (0);
2242 
2243 propout:
2244 	if (keycopied) {
2245 		bzero(lsp->ls_key.ck_data,
2246 		    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
2247 		kmem_free(lsp->ls_key.ck_data,
2248 		    CRYPTO_BITS2BYTES(lsp->ls_key.ck_length));
2249 		lsp->ls_key.ck_data = NULL;
2250 		lsp->ls_key.ck_length = 0;
2251 	}
2252 
2253 	if (zalloced)
2254 		ddi_soft_state_free(lofi_statep, newminor);
2255 
2256 	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
2257 	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
2258 
2259 out:
2260 	if (need_vn_close) {
2261 		(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
2262 		VN_RELE(vp);
2263 	}
2264 
2265 	mutex_exit(&lofi_lock);
2266 	free_lofi_ioctl(klip);
2267 	return (error);
2268 }
2269 
2270 /*
2271  * unmap a file.
2272  */
2273 static int
2274 lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename,
2275     struct cred *credp, int ioctl_flag)
2276 {
2277 	struct lofi_state *lsp;
2278 	struct lofi_ioctl *klip;
2279 	minor_t	minor;
2280 
2281 	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
2282 	if (klip == NULL)
2283 		return (EFAULT);
2284 
2285 	mutex_enter(&lofi_lock);
2286 	if (byfilename) {
2287 		minor = file_to_minor(klip->li_filename);
2288 	} else {
2289 		minor = klip->li_minor;
2290 	}
2291 	if (minor == 0) {
2292 		mutex_exit(&lofi_lock);
2293 		free_lofi_ioctl(klip);
2294 		return (ENXIO);
2295 	}
2296 	lsp = ddi_get_soft_state(lofi_statep, minor);
2297 	if (lsp == NULL || lsp->ls_vp == NULL) {
2298 		mutex_exit(&lofi_lock);
2299 		free_lofi_ioctl(klip);
2300 		return (ENXIO);
2301 	}
2302 
2303 	/*
2304 	 * If it's still held open, we'll do one of three things:
2305 	 *
2306 	 * If no flag is set, just return EBUSY.
2307 	 *
2308 	 * If the 'cleanup' flag is set, unmap and remove the device when
2309 	 * the last user finishes.
2310 	 *
2311 	 * If the 'force' flag is set, then we forcibly close the underlying
2312 	 * file.  Subsequent operations will fail, and the DKIOCSTATE ioctl
2313 	 * will return DKIO_DEV_GONE.  When the device is last closed, the
2314 	 * device will be cleaned up appropriately.
2315 	 *
2316 	 * This is complicated by the fact that we may have outstanding
2317 	 * dispatched I/Os.  Rather than having a single mutex to serialize all
2318 	 * I/O, we keep a count of the number of outstanding I/O requests
2319 	 * (ls_vp_iocount), as well as a flag to indicate that no new I/Os
2320 	 * should be dispatched (ls_vp_closereq).
2321 	 *
2322 	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
2323 	 * and then close the underlying vnode.
2324 	 */
2325 	if (is_opened(lsp)) {
2326 		if (klip->li_force) {
2327 			mutex_enter(&lsp->ls_vp_lock);
2328 			lsp->ls_vp_closereq = B_TRUE;
2329 			/* wake up any threads waiting on dkiocstate */
2330 			cv_broadcast(&lsp->ls_vp_cv);
2331 			while (lsp->ls_vp_iocount > 0)
2332 				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
2333 			mutex_exit(&lsp->ls_vp_lock);
2334 			lofi_free_handle(dev, minor, lsp, credp);
2335 
2336 			klip->li_minor = minor;
2337 			mutex_exit(&lofi_lock);
2338 			(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2339 			free_lofi_ioctl(klip);
2340 			return (0);
2341 		} else if (klip->li_cleanup) {
2342 			lsp->ls_cleanup = 1;
2343 			mutex_exit(&lofi_lock);
2344 			free_lofi_ioctl(klip);
2345 			return (0);
2346 		}
2347 
2348 		mutex_exit(&lofi_lock);
2349 		free_lofi_ioctl(klip);
2350 		return (EBUSY);
2351 	}
2352 
2353 	lofi_free_handle(dev, minor, lsp, credp);
2354 
2355 	klip->li_minor = minor;
2356 	mutex_exit(&lofi_lock);
2357 	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2358 	free_lofi_ioctl(klip);
2359 	return (0);
2360 }
2361 
2362 /*
2363  * get the filename given the minor number, or the minor number given
2364  * the name.
2365  */
2366 /*ARGSUSED*/
2367 static int
2368 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
2369     struct cred *credp, int ioctl_flag)
2370 {
2371 	struct lofi_state *lsp;
2372 	struct lofi_ioctl *klip;
2373 	int	error;
2374 	minor_t	minor;
2375 
2376 	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
2377 	if (klip == NULL)
2378 		return (EFAULT);
2379 
2380 	switch (which) {
2381 	case LOFI_GET_FILENAME:
2382 		minor = klip->li_minor;
2383 		if (minor == 0) {
2384 			free_lofi_ioctl(klip);
2385 			return (EINVAL);
2386 		}
2387 
2388 		mutex_enter(&lofi_lock);
2389 		lsp = ddi_get_soft_state(lofi_statep, minor);
2390 		if (lsp == NULL) {
2391 			mutex_exit(&lofi_lock);
2392 			free_lofi_ioctl(klip);
2393 			return (ENXIO);
2394 		}
2395 		(void) strcpy(klip->li_filename, lsp->ls_filename);
2396 		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
2397 		    sizeof (klip->li_algorithm));
2398 		klip->li_crypto_enabled = lsp->ls_crypto_enabled;
2399 		mutex_exit(&lofi_lock);
2400 		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2401 		free_lofi_ioctl(klip);
2402 		return (error);
2403 	case LOFI_GET_MINOR:
2404 		mutex_enter(&lofi_lock);
2405 		klip->li_minor = file_to_minor(klip->li_filename);
2406 		/* caller should not depend on klip->li_crypto_enabled here */
2407 		mutex_exit(&lofi_lock);
2408 		if (klip->li_minor == 0) {
2409 			free_lofi_ioctl(klip);
2410 			return (ENOENT);
2411 		}
2412 		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2413 		free_lofi_ioctl(klip);
2414 		return (error);
2415 	case LOFI_CHECK_COMPRESSED:
2416 		mutex_enter(&lofi_lock);
2417 		klip->li_minor = file_to_minor(klip->li_filename);
2418 		mutex_exit(&lofi_lock);
2419 		if (klip->li_minor == 0) {
2420 			free_lofi_ioctl(klip);
2421 			return (ENOENT);
2422 		}
2423 		mutex_enter(&lofi_lock);
2424 		lsp = ddi_get_soft_state(lofi_statep, klip->li_minor);
2425 		if (lsp == NULL) {
2426 			mutex_exit(&lofi_lock);
2427 			free_lofi_ioctl(klip);
2428 			return (ENXIO);
2429 		}
2430 		ASSERT(strcmp(klip->li_filename, lsp->ls_filename) == 0);
2431 
2432 		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
2433 		    sizeof (klip->li_algorithm));
2434 		mutex_exit(&lofi_lock);
2435 		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
2436 		free_lofi_ioctl(klip);
2437 		return (error);
2438 	default:
2439 		free_lofi_ioctl(klip);
2440 		return (EINVAL);
2441 	}
2442 
2443 }
2444 
2445 static int
2446 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
2447     int *rvalp)
2448 {
2449 	int	error;
2450 	enum dkio_state dkstate;
2451 	struct lofi_state *lsp;
2452 	minor_t	minor;
2453 
2454 	minor = getminor(dev);
2455 	/* lofi ioctls only apply to the master device */
2456 	if (minor == 0) {
2457 		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;
2458 
2459 		/*
2460 		 * the query command only need read-access - i.e., normal
2461 		 * users are allowed to do those on the ctl device as
2462 		 * long as they can open it read-only.
2463 		 */
2464 		switch (cmd) {
2465 		case LOFI_MAP_FILE:
2466 			if ((flag & FWRITE) == 0)
2467 				return (EPERM);
2468 			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
2469 		case LOFI_MAP_FILE_MINOR:
2470 			if ((flag & FWRITE) == 0)
2471 				return (EPERM);
2472 			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
2473 		case LOFI_UNMAP_FILE:
2474 			if ((flag & FWRITE) == 0)
2475 				return (EPERM);
2476 			return (lofi_unmap_file(dev, lip, 1, credp, flag));
2477 		case LOFI_UNMAP_FILE_MINOR:
2478 			if ((flag & FWRITE) == 0)
2479 				return (EPERM);
2480 			return (lofi_unmap_file(dev, lip, 0, credp, flag));
2481 		case LOFI_GET_FILENAME:
2482 			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
2483 			    credp, flag));
2484 		case LOFI_GET_MINOR:
2485 			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
2486 			    credp, flag));
2487 		case LOFI_GET_MAXMINOR:
2488 			error = ddi_copyout(&lofi_max_files, &lip->li_minor,
2489 			    sizeof (lofi_max_files), flag);
2490 			if (error)
2491 				return (EFAULT);
2492 			return (0);
2493 		case LOFI_CHECK_COMPRESSED:
2494 			return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
2495 			    credp, flag));
2496 		default:
2497 			break;
2498 		}
2499 	}
2500 
2501 	mutex_enter(&lofi_lock);
2502 	lsp = ddi_get_soft_state(lofi_statep, minor);
2503 	if (lsp == NULL || lsp->ls_vp_closereq) {
2504 		mutex_exit(&lofi_lock);
2505 		return (ENXIO);
2506 	}
2507 	mutex_exit(&lofi_lock);
2508 
2509 	/*
2510 	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
2511 	 * EIO as if the device was no longer present.
2512 	 */
2513 	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
2514 		return (EIO);
2515 
2516 	/* these are for faking out utilities like newfs */
2517 	switch (cmd) {
2518 	case DKIOCGVTOC:
2519 		switch (ddi_model_convert_from(flag & FMODELS)) {
2520 		case DDI_MODEL_ILP32: {
2521 			struct vtoc32 vtoc32;
2522 
2523 			vtoctovtoc32(lsp->ls_vtoc, vtoc32);
2524 			if (ddi_copyout(&vtoc32, (void *)arg,
2525 			    sizeof (struct vtoc32), flag))
2526 				return (EFAULT);
2527 			break;
2528 			}
2529 
2530 		case DDI_MODEL_NONE:
2531 			if (ddi_copyout(&lsp->ls_vtoc, (void *)arg,
2532 			    sizeof (struct vtoc), flag))
2533 				return (EFAULT);
2534 			break;
2535 		}
2536 		return (0);
2537 	case DKIOCINFO:
2538 		error = ddi_copyout(&lsp->ls_ci, (void *)arg,
2539 		    sizeof (struct dk_cinfo), flag);
2540 		if (error)
2541 			return (EFAULT);
2542 		return (0);
2543 	case DKIOCG_VIRTGEOM:
2544 	case DKIOCG_PHYGEOM:
2545 	case DKIOCGGEOM:
2546 		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
2547 		    sizeof (struct dk_geom), flag);
2548 		if (error)
2549 			return (EFAULT);
2550 		return (0);
2551 	case DKIOCSTATE:
2552 		/*
2553 		 * Normally, lofi devices are always in the INSERTED state.  If
2554 		 * a device is forcefully unmapped, then the device transitions
2555 		 * to the DKIO_DEV_GONE state.
2556 		 */
2557 		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
2558 		    flag) != 0)
2559 			return (EFAULT);
2560 
2561 		mutex_enter(&lsp->ls_vp_lock);
2562 		lsp->ls_vp_iocount++;
2563 		while (((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
2564 		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) &&
2565 		    !lsp->ls_vp_closereq) {
2566 			/*
2567 			 * By virtue of having the device open, we know that
2568 			 * 'lsp' will remain valid when we return.
2569 			 */
2570 			if (!cv_wait_sig(&lsp->ls_vp_cv,
2571 			    &lsp->ls_vp_lock)) {
2572 				lsp->ls_vp_iocount--;
2573 				cv_broadcast(&lsp->ls_vp_cv);
2574 				mutex_exit(&lsp->ls_vp_lock);
2575 				return (EINTR);
2576 			}
2577 		}
2578 
2579 		dkstate = (!lsp->ls_vp_closereq && lsp->ls_vp != NULL ?
2580 		    DKIO_INSERTED : DKIO_DEV_GONE);
2581 		lsp->ls_vp_iocount--;
2582 		cv_broadcast(&lsp->ls_vp_cv);
2583 		mutex_exit(&lsp->ls_vp_lock);
2584 
2585 		if (ddi_copyout(&dkstate, (void *)arg,
2586 		    sizeof (dkstate), flag) != 0)
2587 			return (EFAULT);
2588 		return (0);
2589 	default:
2590 		return (ENOTTY);
2591 	}
2592 }
2593 
/*
 * Driver entry-point table (cb_ops).  The slots the driver does not
 * implement are filled with nodev, and poll with nochpoll.
 */
static struct cb_ops lofi_cb_ops = {
	lofi_open,		/* open */
	lofi_close,		/* close */
	lofi_strategy,		/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	lofi_read,		/* read */
	lofi_write,		/* write */
	lofi_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	0,			/* streamtab  */
	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
	CB_REV,			/* cb_ops revision */
	lofi_aread,		/* asynchronous read */
	lofi_awrite		/* asynchronous write */
};
2614 
/*
 * Device operations table (dev_ops): hooks up the attach/detach/info
 * routines and points at lofi_cb_ops for the entry points above.
 */
static struct dev_ops lofi_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	lofi_info,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	lofi_attach,		/* attach */
	lofi_detach,		/* detach */
	nodev,			/* reset */
	&lofi_cb_ops,		/* driver operations */
	NULL,			/* no bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};
2629 
/*
 * Module linkage information for this driver; referenced from
 * modlinkage below.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* module operations for a driver */
	"loopback file driver",	/* description string */
	&lofi_ops,		/* the dev_ops table defined above */
};
2635 
/*
 * Linkage structure handed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,		/* linkage revision */
	&modldrv,		/* the single linkage element: the driver */
	NULL			/* end of list */
};
2641 
2642 int
2643 _init(void)
2644 {
2645 	int error;
2646 
2647 	error = ddi_soft_state_init(&lofi_statep,
2648 	    sizeof (struct lofi_state), 0);
2649 	if (error)
2650 		return (error);
2651 
2652 	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);
2653 	error = mod_install(&modlinkage);
2654 	if (error) {
2655 		mutex_destroy(&lofi_lock);
2656 		ddi_soft_state_fini(&lofi_statep);
2657 	}
2658 
2659 	return (error);
2660 }
2661 
2662 int
2663 _fini(void)
2664 {
2665 	int	error;
2666 
2667 	if (lofi_busy())
2668 		return (EBUSY);
2669 
2670 	error = mod_remove(&modlinkage);
2671 	if (error)
2672 		return (error);
2673 
2674 	mutex_destroy(&lofi_lock);
2675 	ddi_soft_state_fini(&lofi_statep);
2676 
2677 	return (error);
2678 }
2679 
2680 int
2681 _info(struct modinfo *modinfop)
2682 {
2683 	return (mod_info(&modlinkage, modinfop));
2684 }
2685