xref: /illumos-gate/usr/src/uts/common/io/lofi.c (revision 91cfa10a8e55050a5103c4b2e83b0bf8d783a7cb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * lofi (loopback file) driver - allows you to attach a file to a device,
30  * which can then be accessed through that device. The simple model is that
31  * you tell lofi to open a file, and then use the block device you get as
32  * you would any block device. lofi translates access to the block device
33  * into I/O on the underlying file. This is mostly useful for
34  * mounting images of filesystems.
35  *
36  * lofi is controlled through /dev/lofictl - this is the only device exported
37  * during attach, and is minor number 0. lofiadm communicates with lofi through
38  * ioctls on this device. When a file is attached to lofi, block and character
39  * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices
40  * are identified by their minor number, and the minor number is also used
41  * as the name in /dev/lofi. If we ever decide to support virtual disks,
42  * we'll have to divide the minor number space to identify fdisk partitions
43  * and slices, and the name will then be the minor number shifted down a
44  * few bits. Minor devices are tracked with state structures handled with
45  * ddi_soft_state(9F) for simplicity.
46  *
47  * A file attached to lofi is opened when attached and not closed until
48  * explicitly detached from lofi. This seems more sensible than deferring
49  * the open until the /dev/lofi device is opened, for a number of reasons.
50  * One is that any failure is likely to be noticed by the person (or script)
51  * running lofiadm. Another is that it would be a security problem if the
52  * file was replaced by another one after being added but before being opened.
53  *
54  * The only hard part about lofi is the ioctls. In order to support things
55  * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
56  * So it has to fake disk geometry and partition information. More may need
57  * to be faked if your favorite utility doesn't work and you think it should
58  * (fdformat doesn't work because it really wants to know the type of floppy
59  * controller to talk to, and that didn't seem easy to fake. Or possibly even
60  * necessary, since we have mkfs_pcfs now).
61  *
62  * Known problems:
63  *
64  *	UFS logging. Mounting a UFS filesystem image "logging"
65  *	works for basic copy testing but wedges during a build of ON through
66  *	that image. Some deadlock in lufs holding the log mutex and then
67  *	getting stuck on a buf. So for now, don't do that.
68  *
69  *	Direct I/O. Since the filesystem data is being cached in the buffer
70  *	cache, _and_ again in the underlying filesystem, it's tempting to
71  *	enable direct I/O on the underlying file. Don't, because that deadlocks.
72  *	I think to fix the cache-twice problem we might need filesystem support.
73  *
74  *	lofi on itself. The simple lock strategy (lofi_lock) precludes this
75  *	because you'll be in lofi_ioctl, holding the lock when you open the
76  *	file, which, if it's lofi, will grab lofi_lock. We prevent this for
77  *	now, though not using ddi_soft_state(9F) would make it possible to
78  *	do. Though it would still be silly.
79  *
80  * Interesting things to do:
81  *
82  *	Allow multiple files for each device. A poor-man's metadisk, basically.
83  *
84  *	Pass-through ioctls on block devices. You can (though it's not
85  *	documented), give lofi a block device as a file name. Then we shouldn't
86  *	need to fake a geometry. But this is also silly unless you're replacing
87  *	metadisk.
88  *
89  *	Encryption. tpm would like this. Apparently Windows 2000 has it, and
90  *	so does Linux.
91  */
92 
93 #include <sys/types.h>
94 #include <sys/sysmacros.h>
95 #include <sys/cmn_err.h>
96 #include <sys/uio.h>
97 #include <sys/kmem.h>
98 #include <sys/cred.h>
99 #include <sys/mman.h>
100 #include <sys/errno.h>
101 #include <sys/aio_req.h>
102 #include <sys/stat.h>
103 #include <sys/file.h>
104 #include <sys/modctl.h>
105 #include <sys/conf.h>
106 #include <sys/debug.h>
107 #include <sys/vnode.h>
108 #include <sys/lofi.h>
109 #include <sys/fcntl.h>
110 #include <sys/pathname.h>
111 #include <sys/filio.h>
112 #include <sys/fdio.h>
113 #include <sys/open.h>
114 #include <sys/disp.h>
115 #include <vm/seg_map.h>
116 #include <sys/ddi.h>
117 #include <sys/sunddi.h>
118 
/* seems safer than having to get the string right many times */
#define	NBLOCKS_PROP_NAME	"Nblocks"
#define	SIZE_PROP_NAME	"Size"

/* devinfo node: recorded by lofi_attach(), cleared by lofi_detach() */
static dev_info_t *lofi_dip;
/* ddi_soft_state(9F) handle; per-device state is indexed by minor number */
static void	*lofi_statep;
static kmutex_t lofi_lock;		/* state lock */

/*
 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
 * high.  If we want to be assured that the underlying device is always busy,
 * we must be sure that the number of bytes enqueued when the number of
 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
 * the duration of the sleep time in taskq_ent_alloc().  That is, lofi should
 * set maxalloc to be the maximum throughput (in bytes per second) of the
 * underlying device divided by the minimum I/O size.  We assume a realistic
 * maximum throughput of one hundred megabytes per second; we set maxalloc on
 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
 */
static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */

/*
 * Highest minor number that the scans for mapped files will look at, and
 * the largest li_minor that copy_in_lofi_ioctl() accepts.
 */
uint32_t lofi_max_files = LOFI_MAX_FILES;
143 
144 static int
145 lofi_busy(void)
146 {
147 	minor_t	minor;
148 
149 	/*
150 	 * We need to make sure no mappings exist - mod_remove won't
151 	 * help because the device isn't open.
152 	 */
153 	mutex_enter(&lofi_lock);
154 	for (minor = 1; minor <= lofi_max_files; minor++) {
155 		if (ddi_get_soft_state(lofi_statep, minor) != NULL) {
156 			mutex_exit(&lofi_lock);
157 			return (EBUSY);
158 		}
159 	}
160 	mutex_exit(&lofi_lock);
161 	return (0);
162 }
163 
164 static int
165 is_opened(struct lofi_state *lsp)
166 {
167 	ASSERT(mutex_owned(&lofi_lock));
168 	return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count);
169 }
170 
171 static int
172 mark_opened(struct lofi_state *lsp, int otyp)
173 {
174 	ASSERT(mutex_owned(&lofi_lock));
175 	switch (otyp) {
176 	case OTYP_CHR:
177 		lsp->ls_chr_open = 1;
178 		break;
179 	case OTYP_BLK:
180 		lsp->ls_blk_open = 1;
181 		break;
182 	case OTYP_LYR:
183 		lsp->ls_lyr_open_count++;
184 		break;
185 	default:
186 		return (-1);
187 	}
188 	return (0);
189 }
190 
191 static void
192 mark_closed(struct lofi_state *lsp, int otyp)
193 {
194 	ASSERT(mutex_owned(&lofi_lock));
195 	switch (otyp) {
196 	case OTYP_CHR:
197 		lsp->ls_chr_open = 0;
198 		break;
199 	case OTYP_BLK:
200 		lsp->ls_blk_open = 0;
201 		break;
202 	case OTYP_LYR:
203 		lsp->ls_lyr_open_count--;
204 		break;
205 	default:
206 		break;
207 	}
208 }
209 
210 /*ARGSUSED3*/
211 static int
212 lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
213 {
214 	minor_t	minor;
215 	struct lofi_state *lsp;
216 
217 	mutex_enter(&lofi_lock);
218 	minor = getminor(*devp);
219 	if (minor == 0) {
220 		/* master control device */
221 		/* must be opened exclusively */
222 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR)) {
223 			mutex_exit(&lofi_lock);
224 			return (EINVAL);
225 		}
226 		lsp = ddi_get_soft_state(lofi_statep, 0);
227 		if (lsp == NULL) {
228 			mutex_exit(&lofi_lock);
229 			return (ENXIO);
230 		}
231 		if (is_opened(lsp)) {
232 			mutex_exit(&lofi_lock);
233 			return (EBUSY);
234 		}
235 		(void) mark_opened(lsp, OTYP_CHR);
236 		mutex_exit(&lofi_lock);
237 		return (0);
238 	}
239 
240 	/* otherwise, the mapping should already exist */
241 	lsp = ddi_get_soft_state(lofi_statep, minor);
242 	if (lsp == NULL) {
243 		mutex_exit(&lofi_lock);
244 		return (EINVAL);
245 	}
246 
247 	if (mark_opened(lsp, otyp) == -1) {
248 		mutex_exit(&lofi_lock);
249 		return (EINVAL);
250 	}
251 
252 	mutex_exit(&lofi_lock);
253 	return (0);
254 }
255 
256 /*ARGSUSED3*/
257 static int
258 lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
259 {
260 	minor_t	minor;
261 	struct lofi_state *lsp;
262 
263 #ifdef lint
264 	flag = flag;
265 #endif
266 	mutex_enter(&lofi_lock);
267 	minor = getminor(dev);
268 	lsp = ddi_get_soft_state(lofi_statep, minor);
269 	if (lsp == NULL) {
270 		mutex_exit(&lofi_lock);
271 		return (EINVAL);
272 	}
273 	mark_closed(lsp, otyp);
274 	mutex_exit(&lofi_lock);
275 	return (0);
276 }
277 
/*
 * This is basically what strategy used to be before we found we
 * needed task queues.
 *
 * Task-queue callback dispatched by lofi_strategy().  'arg' is the
 * struct buf to service.  The transfer is carried out against the
 * backing vnode (lsp->ls_vp), either by copying through segmap
 * mappings or with vn_rdwr(), after which the I/O kstat is updated and
 * the buf is completed with bioerror()/biodone().
 */
static void
lofi_strategy_task(void *arg)
{
	struct buf *bp = (struct buf *)arg;
	int error;
	struct lofi_state *lsp;
	offset_t	offset, alignedoffset;
	offset_t	mapoffset;
	caddr_t	bufaddr;
	caddr_t	mapaddr;
	size_t	xfersize;
	size_t	len;
	int	isread;
	int	smflags;
	enum seg_rw srw;

	/*
	 * NOTE(review): lsp is used without a NULL check; presumably a
	 * mapping cannot go away while I/O is outstanding - confirm.
	 */
	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
	if (lsp->ls_kstat) {
		/* the request entered the wait queue in lofi_strategy() */
		mutex_enter(lsp->ls_kstat->ks_lock);
		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bp_mapin(bp);
	bufaddr = bp->b_un.b_addr;
	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */

	/*
	 * We used to always use vn_rdwr here, but we cannot do that because
	 * we might decide to read or write from the underlying
	 * file during this call, which would be a deadlock because
	 * we have the rw_lock. So instead we page, unless it's not
	 * mappable or it's a character device.
	 */
	if (((lsp->ls_vp->v_flag & VNOMAP) == 0) &&
	    (lsp->ls_vp->v_type != VCHR)) {
		/*
		 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
		 * an 8K boundary, but the buf transfer address may not be
		 * aligned on more than a 512-byte boundary (we don't
		 * enforce that, though we could). This matters since the
		 * initial part of the transfer may not start at offset 0
		 * within the segmap'd chunk. So we have to compensate for
		 * that with 'mapoffset'. Subsequent chunks always start
		 * off at the beginning, and the last is capped by b_resid.
		 */
		mapoffset = offset & MAXBOFFSET;
		alignedoffset = offset - mapoffset;	/* now map-aligned */
		bp->b_resid = bp->b_bcount;
		isread = bp->b_flags & B_READ;
		srw = isread ? S_READ : S_WRITE;
		do {
			/*
			 * Transfer no more than what is left of the file,
			 * of this segmap chunk, and of the request.
			 */
			xfersize = MIN(lsp->ls_vp_size - offset,
			    MIN(MAXBSIZE - mapoffset, bp->b_resid));
			/* fault length covers the bytes touched, in pages */
			len = roundup(mapoffset + xfersize, PAGESIZE);
			mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
			    alignedoffset, MAXBSIZE, 1, srw);
			/*
			 * Now fault in the pages. This lets us check
			 * for errors before we reference mapaddr and
			 * try to resolve the fault in bcopy (which would
			 * panic instead). And this can easily happen,
			 * particularly if you've lofi'd a file over NFS
			 * and someone deletes the file on the server.
			 */
			error = segmap_fault(kas.a_hat, segkmap, mapaddr,
			    len, F_SOFTLOCK, srw);
			if (error) {
				(void) segmap_release(segkmap, mapaddr, 0);
				/* turn the fault code into an errno */
				if (FC_CODE(error) == FC_OBJERR)
					error = FC_ERRNO(error);
				else
					error = EIO;
				break;
			}
			smflags = 0;
			if (isread) {
				bcopy(mapaddr + mapoffset, bufaddr, xfersize);
			} else {
				smflags |= SM_WRITE;
				bcopy(bufaddr, mapaddr + mapoffset, xfersize);
			}
			bp->b_resid -= xfersize;
			bufaddr += xfersize;
			offset += xfersize;
			/* undo the F_SOFTLOCK taken above, then unmap */
			(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
			    len, F_SOFTUNLOCK, srw);
			error = segmap_release(segkmap, mapaddr, smflags);
			/* only the first map may start partial */
			mapoffset = 0;
			alignedoffset += MAXBSIZE;
		} while ((error == 0) && (bp->b_resid > 0) &&
		    (offset < lsp->ls_vp_size));
	} else {
		/* not mappable, or a character device: use vn_rdwr() */
		ssize_t	resid;
		enum uio_rw rw;

		if (bp->b_flags & B_READ)
			rw = UIO_READ;
		else
			rw = UIO_WRITE;
		error = vn_rdwr(rw, lsp->ls_vp, bufaddr, bp->b_bcount,
		    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
		bp->b_resid = resid;
	}

	if (lsp->ls_kstat) {
		/* account for the bytes actually moved and leave the runq */
		size_t n_done = bp->b_bcount - bp->b_resid;
		kstat_io_t *kioptr;

		mutex_enter(lsp->ls_kstat->ks_lock);
		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
		if (bp->b_flags & B_READ) {
			kioptr->nread += n_done;
			kioptr->reads++;
		} else {
			kioptr->nwritten += n_done;
			kioptr->writes++;
		}
		kstat_runq_exit(kioptr);
		mutex_exit(lsp->ls_kstat->ks_lock);
	}
	bioerror(bp, error);
	biodone(bp);
}
406 
407 static int
408 lofi_strategy(struct buf *bp)
409 {
410 	struct lofi_state *lsp;
411 	offset_t	offset;
412 
413 	/*
414 	 * We cannot just do I/O here, because the current thread
415 	 * _might_ end up back in here because the underlying filesystem
416 	 * wants a buffer, which eventually gets into bio_recycle and
417 	 * might call into lofi to write out a delayed-write buffer.
418 	 * This is bad if the filesystem above lofi is the same as below.
419 	 *
420 	 * We could come up with a complex strategy using threads to
421 	 * do the I/O asynchronously, or we could use task queues. task
422 	 * queues were incredibly easy so they win.
423 	 */
424 	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
425 	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
426 	if (offset == lsp->ls_vp_size) {
427 		/* EOF */
428 		if ((bp->b_flags & B_READ) != 0) {
429 			bp->b_resid = bp->b_bcount;
430 			bioerror(bp, 0);
431 		} else {
432 			/* writes should fail */
433 			bioerror(bp, ENXIO);
434 		}
435 		biodone(bp);
436 		return (0);
437 	}
438 	if (offset > lsp->ls_vp_size) {
439 		bioerror(bp, ENXIO);
440 		biodone(bp);
441 		return (0);
442 	}
443 	if (lsp->ls_kstat) {
444 		mutex_enter(lsp->ls_kstat->ks_lock);
445 		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
446 		mutex_exit(lsp->ls_kstat->ks_lock);
447 	}
448 	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
449 	return (0);
450 }
451 
452 /*ARGSUSED2*/
453 static int
454 lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
455 {
456 	if (getminor(dev) == 0)
457 		return (EINVAL);
458 	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
459 }
460 
461 /*ARGSUSED2*/
462 static int
463 lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
464 {
465 	if (getminor(dev) == 0)
466 		return (EINVAL);
467 	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
468 }
469 
470 /*ARGSUSED2*/
471 static int
472 lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
473 {
474 	if (getminor(dev) == 0)
475 		return (EINVAL);
476 	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
477 }
478 
479 /*ARGSUSED2*/
480 static int
481 lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
482 {
483 	if (getminor(dev) == 0)
484 		return (EINVAL);
485 	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
486 }
487 
488 /*ARGSUSED*/
489 static int
490 lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
491 {
492 	switch (infocmd) {
493 	case DDI_INFO_DEVT2DEVINFO:
494 		*result = lofi_dip;
495 		return (DDI_SUCCESS);
496 	case DDI_INFO_DEVT2INSTANCE:
497 		*result = 0;
498 		return (DDI_SUCCESS);
499 	}
500 	return (DDI_FAILURE);
501 }
502 
503 static int
504 lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
505 {
506 	int	error;
507 
508 	if (cmd != DDI_ATTACH)
509 		return (DDI_FAILURE);
510 	error = ddi_soft_state_zalloc(lofi_statep, 0);
511 	if (error == DDI_FAILURE) {
512 		return (DDI_FAILURE);
513 	}
514 	error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
515 	    DDI_PSEUDO, NULL);
516 	if (error == DDI_FAILURE) {
517 		ddi_soft_state_free(lofi_statep, 0);
518 		return (DDI_FAILURE);
519 	}
520 	lofi_dip = dip;
521 	ddi_report_dev(dip);
522 	return (DDI_SUCCESS);
523 }
524 
525 static int
526 lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
527 {
528 	if (cmd != DDI_DETACH)
529 		return (DDI_FAILURE);
530 	if (lofi_busy())
531 		return (DDI_FAILURE);
532 	lofi_dip = NULL;
533 	ddi_remove_minor_node(dip, NULL);
534 	ddi_soft_state_free(lofi_statep, 0);
535 	return (DDI_SUCCESS);
536 }
537 
538 /*
539  * These two just simplify the rest of the ioctls that need to copyin/out
540  * the lofi_ioctl structure.
541  */
542 struct lofi_ioctl *
543 copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, int flag)
544 {
545 	struct lofi_ioctl *klip;
546 	int	error;
547 
548 	klip = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
549 	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
550 	if (error) {
551 		kmem_free(klip, sizeof (struct lofi_ioctl));
552 		return (NULL);
553 	}
554 
555 	/* make sure filename is always null-terminated */
556 	klip->li_filename[MAXPATHLEN] = '\0';
557 
558 	/* validate minor number */
559 	if (klip->li_minor > lofi_max_files) {
560 		kmem_free(klip, sizeof (struct lofi_ioctl));
561 		return (NULL);
562 	}
563 	return (klip);
564 }
565 
566 int
567 copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
568 	int flag)
569 {
570 	int	error;
571 
572 	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
573 	if (error)
574 		return (EFAULT);
575 	return (0);
576 }
577 
578 void
579 free_lofi_ioctl(struct lofi_ioctl *klip)
580 {
581 	kmem_free(klip, sizeof (struct lofi_ioctl));
582 }
583 
584 /*
585  * Return the minor number 'filename' is mapped to, if it is.
586  */
587 static int
588 file_to_minor(char *filename)
589 {
590 	minor_t	minor;
591 	struct lofi_state *lsp;
592 
593 	ASSERT(mutex_owned(&lofi_lock));
594 	for (minor = 1; minor <= lofi_max_files; minor++) {
595 		lsp = ddi_get_soft_state(lofi_statep, minor);
596 		if (lsp == NULL)
597 			continue;
598 		if (strcmp(lsp->ls_filename, filename) == 0)
599 			return (minor);
600 	}
601 	return (0);
602 }
603 
604 /*
605  * lofiadm does some validation, but since Joe Random (or crashme) could
606  * do our ioctls, we need to do some validation too.
607  */
608 static int
609 valid_filename(const char *filename)
610 {
611 	static char *blkprefix = "/dev/" LOFI_BLOCK_NAME "/";
612 	static char *charprefix = "/dev/" LOFI_CHAR_NAME "/";
613 
614 	/* must be absolute path */
615 	if (filename[0] != '/')
616 		return (0);
617 	/* must not be lofi */
618 	if (strncmp(filename, blkprefix, strlen(blkprefix)) == 0)
619 		return (0);
620 	if (strncmp(filename, charprefix, strlen(charprefix)) == 0)
621 		return (0);
622 	return (1);
623 }
624 
625 /*
626  * Fakes up a disk geometry, and one big partition, based on the size
627  * of the file. This is needed because we allow newfs'ing the device,
628  * and newfs will do several disk ioctls to figure out the geometry and
629  * partition information. It uses that information to determine the parameters
630  * to pass to mkfs. Geometry is pretty much irrelevent these days, but we
631  * have to support it.
632  */
633 static void
634 fake_disk_geometry(struct lofi_state *lsp)
635 {
636 	/* dk_geom - see dkio(7I) */
637 	/*
638 	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
639 	 * of sectors), but that breaks programs like fdisk which want to
640 	 * partition a disk by cylinder. With one cylinder, you can't create
641 	 * an fdisk partition and put pcfs on it for testing (hard to pick
642 	 * a number between one and one).
643 	 *
644 	 * The cheezy floppy test is an attempt to not have too few cylinders
645 	 * for a small file, or so many on a big file that you waste space
646 	 * for backup superblocks or cylinder group structures.
647 	 */
648 	if (lsp->ls_vp_size < (2 * 1024 * 1024)) /* floppy? */
649 		lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (100 * 1024);
650 	else
651 		lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (300 * 1024);
652 	/* in case file file is < 100k */
653 	if (lsp->ls_dkg.dkg_ncyl == 0)
654 		lsp->ls_dkg.dkg_ncyl = 1;
655 	lsp->ls_dkg.dkg_acyl = 0;
656 	lsp->ls_dkg.dkg_bcyl = 0;
657 	lsp->ls_dkg.dkg_nhead = 1;
658 	lsp->ls_dkg.dkg_obs1 = 0;
659 	lsp->ls_dkg.dkg_intrlv = 0;
660 	lsp->ls_dkg.dkg_obs2 = 0;
661 	lsp->ls_dkg.dkg_obs3 = 0;
662 	lsp->ls_dkg.dkg_apc = 0;
663 	lsp->ls_dkg.dkg_rpm = 7200;
664 	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl;
665 	lsp->ls_dkg.dkg_nsect = lsp->ls_vp_size /
666 	    (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl);
667 	lsp->ls_dkg.dkg_write_reinstruct = 0;
668 	lsp->ls_dkg.dkg_read_reinstruct = 0;
669 
670 	/* vtoc - see dkio(7I) */
671 	bzero(&lsp->ls_vtoc, sizeof (struct vtoc));
672 	lsp->ls_vtoc.v_sanity = VTOC_SANE;
673 	lsp->ls_vtoc.v_version = V_VERSION;
674 	bcopy(LOFI_DRIVER_NAME, lsp->ls_vtoc.v_volume, 7);
675 	lsp->ls_vtoc.v_sectorsz = DEV_BSIZE;
676 	lsp->ls_vtoc.v_nparts = 1;
677 	lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED;
678 	lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT;
679 	lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0;
680 	/*
681 	 * The partition size cannot just be the number of sectors, because
682 	 * that might not end on a cylinder boundary. And if that's the case,
683 	 * newfs/mkfs will print a scary warning. So just figure the size
684 	 * based on the number of cylinders and sectors/cylinder.
685 	 */
686 	lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
687 	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
688 
689 	/* dk_cinfo - see dkio(7I) */
690 	bzero(&lsp->ls_ci, sizeof (struct dk_cinfo));
691 	(void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME);
692 	lsp->ls_ci.dki_ctype = DKC_MD;
693 	lsp->ls_ci.dki_flags = 0;
694 	lsp->ls_ci.dki_cnum = 0;
695 	lsp->ls_ci.dki_addr = 0;
696 	lsp->ls_ci.dki_space = 0;
697 	lsp->ls_ci.dki_prio = 0;
698 	lsp->ls_ci.dki_vec = 0;
699 	(void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME);
700 	lsp->ls_ci.dki_unit = 0;
701 	lsp->ls_ci.dki_slave = 0;
702 	lsp->ls_ci.dki_partition = 0;
703 	/*
704 	 * newfs uses this to set maxcontig. Must not be < 16, or it
705 	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
706 	 * it by the block size. Then tunefs doesn't work because
707 	 * maxcontig is 0.
708 	 */
709 	lsp->ls_ci.dki_maxtransfer = 16;
710 }
711 
712 /*
713  * map a file to a minor number. Return the minor number.
714  */
715 static int
716 lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
717     int *rvalp, struct cred *credp, int ioctl_flag)
718 {
719 	minor_t	newminor;
720 	struct lofi_state *lsp;
721 	struct lofi_ioctl *klip;
722 	int	error;
723 	char	namebuf[50];
724 	struct vnode *vp;
725 	int64_t	Nblocks_prop_val;
726 	int64_t	Size_prop_val;
727 	vattr_t	vattr;
728 	int	flag;
729 	enum vtype v_type;
730 	dev_t	newdev;
731 	int zalloced = 0;
732 
733 	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
734 	if (klip == NULL)
735 		return (EFAULT);
736 
737 	mutex_enter(&lofi_lock);
738 
739 	if (!valid_filename(klip->li_filename)) {
740 		error = EINVAL;
741 		goto out;
742 	}
743 
744 	if (file_to_minor(klip->li_filename) != 0) {
745 		error = EBUSY;
746 		goto out;
747 	}
748 
749 	if (pickminor) {
750 		/* Find a free one */
751 		for (newminor = 1; newminor <= lofi_max_files; newminor++)
752 			if (ddi_get_soft_state(lofi_statep, newminor) == NULL)
753 				break;
754 		if (newminor >= lofi_max_files) {
755 			error = EAGAIN;
756 			goto out;
757 		}
758 	} else {
759 		newminor = klip->li_minor;
760 		if (ddi_get_soft_state(lofi_statep, newminor) != NULL) {
761 			error = EEXIST;
762 			goto out;
763 		}
764 	}
765 
766 	/* make sure it's valid */
767 	error = lookupname(klip->li_filename, UIO_SYSSPACE, FOLLOW,
768 	    NULLVPP, &vp);
769 	if (error) {
770 		goto out;
771 	}
772 	v_type = vp->v_type;
773 	VN_RELE(vp);
774 	if (!V_ISLOFIABLE(v_type)) {
775 		error = EINVAL;
776 		goto out;
777 	}
778 	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
779 	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
780 	if (error) {
781 		/* try read-only */
782 		flag &= ~FWRITE;
783 		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
784 		    &vp, 0, 0);
785 		if (error) {
786 			goto out;
787 		}
788 	}
789 	vattr.va_mask = AT_SIZE;
790 	error = VOP_GETATTR(vp, &vattr, 0, credp);
791 	if (error) {
792 		goto closeout;
793 	}
794 	/* the file needs to be a multiple of the block size */
795 	if ((vattr.va_size % DEV_BSIZE) != 0) {
796 		error = EINVAL;
797 		goto closeout;
798 	}
799 	newdev = makedevice(getmajor(dev), newminor);
800 	Size_prop_val = vattr.va_size;
801 	if ((ddi_prop_update_int64(newdev, lofi_dip,
802 	    SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) {
803 		error = EINVAL;
804 		goto closeout;
805 	}
806 	Nblocks_prop_val = vattr.va_size / DEV_BSIZE;
807 	if ((ddi_prop_update_int64(newdev, lofi_dip,
808 	    NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
809 		error = EINVAL;
810 		goto propout;
811 	}
812 	error = ddi_soft_state_zalloc(lofi_statep, newminor);
813 	if (error == DDI_FAILURE) {
814 		error = ENOMEM;
815 		goto propout;
816 	}
817 	zalloced = 1;
818 	(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
819 	(void) ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, newminor,
820 	    DDI_PSEUDO, NULL);
821 	if (error != DDI_SUCCESS) {
822 		error = ENXIO;
823 		goto propout;
824 	}
825 	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor);
826 	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, newminor,
827 	    DDI_PSEUDO, NULL);
828 	if (error != DDI_SUCCESS) {
829 		/* remove block node */
830 		(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
831 		ddi_remove_minor_node(lofi_dip, namebuf);
832 		error = ENXIO;
833 		goto propout;
834 	}
835 	lsp = ddi_get_soft_state(lofi_statep, newminor);
836 	lsp->ls_filename_sz = strlen(klip->li_filename) + 1;
837 	lsp->ls_filename = kmem_alloc(lsp->ls_filename_sz, KM_SLEEP);
838 	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
839 	    LOFI_DRIVER_NAME, newminor);
840 	lsp->ls_taskq = taskq_create(namebuf, lofi_taskq_nthreads,
841 	    minclsyspri, 1, lofi_taskq_maxalloc, 0);
842 	lsp->ls_kstat = kstat_create(LOFI_DRIVER_NAME, newminor,
843 	    NULL, "disk", KSTAT_TYPE_IO, 1, 0);
844 	if (lsp->ls_kstat) {
845 		mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
846 		lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
847 		kstat_install(lsp->ls_kstat);
848 	}
849 	/*
850 	 * save open mode so file can be closed properly and vnode counts
851 	 * updated correctly.
852 	 */
853 	lsp->ls_openflag = flag;
854 
855 	/*
856 	 * Try to handle stacked lofs vnodes.
857 	 */
858 	if (vp->v_type == VREG) {
859 		if (VOP_REALVP(vp, &lsp->ls_vp) != 0) {
860 			lsp->ls_vp = vp;
861 		} else {
862 			/*
863 			 * Even though vp was obtained via vn_open(), we
864 			 * can't call vn_close() on it, since lofs will
865 			 * pass the VOP_CLOSE() on down to the realvp
866 			 * (which we are about to use). Hence we merely
867 			 * drop the reference to the lofs vnode and hold
868 			 * the realvp so things behave as if we've
869 			 * opened the realvp without any interaction
870 			 * with lofs.
871 			 */
872 			VN_HOLD(lsp->ls_vp);
873 			VN_RELE(vp);
874 		}
875 	} else {
876 		lsp->ls_vp = vp;
877 	}
878 	lsp->ls_vp_size = vattr.va_size;
879 	(void) strcpy(lsp->ls_filename, klip->li_filename);
880 	if (rvalp)
881 		*rvalp = (int)newminor;
882 	klip->li_minor = newminor;
883 
884 	fake_disk_geometry(lsp);
885 	mutex_exit(&lofi_lock);
886 	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
887 	free_lofi_ioctl(klip);
888 	return (0);
889 
890 propout:
891 	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
892 	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
893 closeout:
894 	(void) VOP_CLOSE(vp, flag, 1, 0, credp);
895 	VN_RELE(vp);
896 out:
897 	if (zalloced)
898 		ddi_soft_state_free(lofi_statep, newminor);
899 	mutex_exit(&lofi_lock);
900 	free_lofi_ioctl(klip);
901 	return (error);
902 }
903 
904 /*
905  * unmap a file.
906  */
907 static int
908 lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename,
909     struct cred *credp, int ioctl_flag)
910 {
911 	struct lofi_state *lsp;
912 	struct lofi_ioctl *klip;
913 	minor_t	minor;
914 	char	namebuf[20];
915 	dev_t	newdev;
916 
917 	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
918 	if (klip == NULL)
919 		return (EFAULT);
920 
921 	mutex_enter(&lofi_lock);
922 	if (byfilename) {
923 		minor = file_to_minor(klip->li_filename);
924 	} else {
925 		minor = klip->li_minor;
926 	}
927 	if (minor == 0) {
928 		mutex_exit(&lofi_lock);
929 		free_lofi_ioctl(klip);
930 		return (ENXIO);
931 	}
932 	lsp = ddi_get_soft_state(lofi_statep, minor);
933 	if (lsp == NULL) {
934 		mutex_exit(&lofi_lock);
935 		free_lofi_ioctl(klip);
936 		return (ENXIO);
937 	}
938 	if (is_opened(lsp)) {
939 		mutex_exit(&lofi_lock);
940 		free_lofi_ioctl(klip);
941 		return (EBUSY);
942 	}
943 	/*
944 	 * Use saved open mode to properly update vnode counts
945 	 */
946 	(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp);
947 	VN_RELE(lsp->ls_vp);
948 	lsp->ls_vp = NULL;
949 	newdev = makedevice(getmajor(dev), minor);
950 	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
951 	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
952 
953 	(void) snprintf(namebuf, sizeof (namebuf), "%d", minor);
954 	ddi_remove_minor_node(lofi_dip, namebuf);
955 	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor);
956 	ddi_remove_minor_node(lofi_dip, namebuf);
957 
958 	kmem_free(lsp->ls_filename, lsp->ls_filename_sz);
959 	taskq_destroy(lsp->ls_taskq);
960 	if (lsp->ls_kstat) {
961 		kstat_delete(lsp->ls_kstat);
962 		mutex_destroy(&lsp->ls_kstat_lock);
963 	}
964 	ddi_soft_state_free(lofi_statep, minor);
965 	klip->li_minor = minor;
966 	mutex_exit(&lofi_lock);
967 	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
968 	free_lofi_ioctl(klip);
969 	return (0);
970 }
971 
972 /*
973  * get the filename given the minor number, or the minor number given
974  * the name.
975  */
976 /*ARGSUSED3*/
977 static int
978 lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
979     struct cred *credp, int ioctl_flag)
980 {
981 	struct lofi_state *lsp;
982 	struct lofi_ioctl *klip;
983 	int	error;
984 	minor_t	minor;
985 
986 #ifdef lint
987 	dev = dev;
988 #endif
989 	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
990 	if (klip == NULL)
991 		return (EFAULT);
992 
993 	switch (which) {
994 	case LOFI_GET_FILENAME:
995 		minor = klip->li_minor;
996 		if (minor == 0) {
997 			free_lofi_ioctl(klip);
998 			return (EINVAL);
999 		}
1000 
1001 		mutex_enter(&lofi_lock);
1002 		lsp = ddi_get_soft_state(lofi_statep, minor);
1003 		if (lsp == NULL) {
1004 			mutex_exit(&lofi_lock);
1005 			free_lofi_ioctl(klip);
1006 			return (ENXIO);
1007 		}
1008 		(void) strcpy(klip->li_filename, lsp->ls_filename);
1009 		mutex_exit(&lofi_lock);
1010 		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1011 		free_lofi_ioctl(klip);
1012 		return (error);
1013 	case LOFI_GET_MINOR:
1014 		mutex_enter(&lofi_lock);
1015 		klip->li_minor = file_to_minor(klip->li_filename);
1016 		mutex_exit(&lofi_lock);
1017 		if (klip->li_minor == 0) {
1018 			free_lofi_ioctl(klip);
1019 			return (ENOENT);
1020 		}
1021 		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1022 		free_lofi_ioctl(klip);
1023 		return (error);
1024 	default:
1025 		free_lofi_ioctl(klip);
1026 		return (EINVAL);
1027 	}
1028 
1029 }
1030 
1031 static int
1032 lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
1033     int *rvalp)
1034 {
1035 	int	error;
1036 	enum dkio_state dkstate;
1037 	struct lofi_state *lsp;
1038 	minor_t	minor;
1039 
1040 #ifdef lint
1041 	credp = credp;
1042 #endif
1043 
1044 	minor = getminor(dev);
1045 	/* lofi ioctls only apply to the master device */
1046 	if (minor == 0) {
1047 		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;
1048 
1049 		/*
1050 		 * the query command only need read-access - i.e., normal
1051 		 * users are allowed to do those on the ctl device as
1052 		 * long as they can open it read-only.
1053 		 */
1054 		switch (cmd) {
1055 		case LOFI_MAP_FILE:
1056 			if ((flag & FWRITE) == 0)
1057 				return (EPERM);
1058 			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
1059 		case LOFI_MAP_FILE_MINOR:
1060 			if ((flag & FWRITE) == 0)
1061 				return (EPERM);
1062 			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
1063 		case LOFI_UNMAP_FILE:
1064 			if ((flag & FWRITE) == 0)
1065 				return (EPERM);
1066 			return (lofi_unmap_file(dev, lip, 1, credp, flag));
1067 		case LOFI_UNMAP_FILE_MINOR:
1068 			if ((flag & FWRITE) == 0)
1069 				return (EPERM);
1070 			return (lofi_unmap_file(dev, lip, 0, credp, flag));
1071 		case LOFI_GET_FILENAME:
1072 			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
1073 			    credp, flag));
1074 		case LOFI_GET_MINOR:
1075 			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
1076 			    credp, flag));
1077 		case LOFI_GET_MAXMINOR:
1078 			error = ddi_copyout(&lofi_max_files, &lip->li_minor,
1079 			    sizeof (lofi_max_files), flag);
1080 			if (error)
1081 				return (EFAULT);
1082 			return (0);
1083 		default:
1084 			break;
1085 		}
1086 	}
1087 
1088 	lsp = ddi_get_soft_state(lofi_statep, minor);
1089 	if (lsp == NULL)
1090 		return (ENXIO);
1091 
1092 	/* these are for faking out utilities like newfs */
1093 	switch (cmd) {
1094 	case DKIOCGVTOC:
1095 		switch (ddi_model_convert_from(flag & FMODELS)) {
1096 		case DDI_MODEL_ILP32: {
1097 			struct vtoc32 vtoc32;
1098 
1099 			vtoctovtoc32(lsp->ls_vtoc, vtoc32);
1100 			if (ddi_copyout(&vtoc32, (void *)arg,
1101 			    sizeof (struct vtoc32), flag))
1102 				return (EFAULT);
1103 				break;
1104 			}
1105 
1106 		case DDI_MODEL_NONE:
1107 			if (ddi_copyout(&lsp->ls_vtoc, (void *)arg,
1108 			    sizeof (struct vtoc), flag))
1109 				return (EFAULT);
1110 			break;
1111 		}
1112 		return (0);
1113 	case DKIOCINFO:
1114 		error = ddi_copyout(&lsp->ls_ci, (void *)arg,
1115 		    sizeof (struct dk_cinfo), flag);
1116 		if (error)
1117 			return (EFAULT);
1118 		return (0);
1119 	case DKIOCG_VIRTGEOM:
1120 	case DKIOCG_PHYGEOM:
1121 	case DKIOCGGEOM:
1122 		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
1123 		    sizeof (struct dk_geom), flag);
1124 		if (error)
1125 			return (EFAULT);
1126 		return (0);
1127 	case DKIOCSTATE:
1128 		/* the file is always there */
1129 		dkstate = DKIO_INSERTED;
1130 		error = ddi_copyout(&dkstate, (void *)arg,
1131 		    sizeof (enum dkio_state), flag);
1132 		if (error)
1133 			return (EFAULT);
1134 		return (0);
1135 	default:
1136 		return (ENOTTY);
1137 	}
1138 }
1139 
1140 static struct cb_ops lofi_cb_ops = {
1141 	lofi_open,		/* open */
1142 	lofi_close,		/* close */
1143 	lofi_strategy,		/* strategy */
1144 	nodev,			/* print */
1145 	nodev,			/* dump */
1146 	lofi_read,		/* read */
1147 	lofi_write,		/* write */
1148 	lofi_ioctl,		/* ioctl */
1149 	nodev,			/* devmap */
1150 	nodev,			/* mmap */
1151 	nodev,			/* segmap */
1152 	nochpoll,		/* poll */
1153 	ddi_prop_op,		/* prop_op */
1154 	0,			/* streamtab  */
1155 	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
1156 	CB_REV,
1157 	lofi_aread,
1158 	lofi_awrite
1159 };
1160 
1161 static struct dev_ops lofi_ops = {
1162 	DEVO_REV,		/* devo_rev, */
1163 	0,			/* refcnt  */
1164 	lofi_info,		/* info */
1165 	nulldev,		/* identify */
1166 	nulldev,		/* probe */
1167 	lofi_attach,		/* attach */
1168 	lofi_detach,		/* detach */
1169 	nodev,			/* reset */
1170 	&lofi_cb_ops,		/* driver operations */
1171 	NULL			/* no bus operations */
1172 };
1173 
1174 static struct modldrv modldrv = {
1175 	&mod_driverops,
1176 	"loopback file driver (%I%)",
1177 	&lofi_ops,
1178 };
1179 
1180 static struct modlinkage modlinkage = {
1181 	MODREV_1,
1182 	&modldrv,
1183 	NULL
1184 };
1185 
1186 int
1187 _init(void)
1188 {
1189 	int error;
1190 
1191 	error = ddi_soft_state_init(&lofi_statep,
1192 	    sizeof (struct lofi_state), 0);
1193 	if (error)
1194 		return (error);
1195 
1196 	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);
1197 	error = mod_install(&modlinkage);
1198 	if (error) {
1199 		mutex_destroy(&lofi_lock);
1200 		ddi_soft_state_fini(&lofi_statep);
1201 	}
1202 
1203 	return (error);
1204 }
1205 
1206 int
1207 _fini(void)
1208 {
1209 	int	error;
1210 
1211 	if (lofi_busy())
1212 		return (EBUSY);
1213 
1214 	error = mod_remove(&modlinkage);
1215 	if (error)
1216 		return (error);
1217 
1218 	mutex_destroy(&lofi_lock);
1219 	ddi_soft_state_fini(&lofi_statep);
1220 
1221 	return (error);
1222 }
1223 
1224 int
1225 _info(struct modinfo *modinfop)
1226 {
1227 	return (mod_info(&modlinkage, modinfop));
1228 }
1229