/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Copyright 2020 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <sys/limits.h>
#include <sys/uio.h>
#ifndef __FreeBSD__
#include <sys/dkio.h>
#endif

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#ifdef	__FreeBSD__
#include "mevent.h"
#endif
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#ifdef __FreeBSD__
#define BLOCKIF_NUMTHR	8
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR	16
#endif
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
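/*
 * A worked example of the sizing above: with a BLOCKIF_RING_MAX of 128
 * (the apparent value in block_if.h), BLOCKIF_MAXREQ comes to 144, so a
 * guest can keep an entire ring's worth of requests queued while each of
 * the 16 worker threads also holds one request in flight.
 */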

enum blockop {
	BOP_READ,
	BOP_WRITE,
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,
#endif
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;
	off_t		     be_block;
};

#ifndef __FreeBSD__
enum blockif_wce {
	WCE_NONE = 0,
	WCE_IOCTL,
	WCE_FCNTL
};
#endif

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
#ifndef __FreeBSD__
	enum blockif_wce	bc_wce;
#endif
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
#ifndef __FreeBSD__
	case BOP_WRITE_SYNC:
#endif
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}
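/*
 * An illustration of the blocking heuristic above (a reading of the code,
 * not original commentary): be_block records where a request *ends*, so a
 * 4 KiB write at offset 0 leaves be_block = 4096.  A later request that
 * *starts* at offset 4096 matches it and is queued as BST_BLOCK; it is
 * promoted back to BST_PEND by blockif_complete() once the earlier request
 * finishes, preserving ordering between back-to-back sequential I/Os.
 */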

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
#ifdef	__FreeBSD__
	off_t arg[2];
#endif
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
				   br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
				    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
#ifdef	__FreeBSD__
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
#else
		/*
		 * This fsync() should be adequate to flush the cache of a file
		 * or device.  In VFS, the VOP_SYNC operation is converted to
		 * the appropriate ioctl in both sdev (for real devices) and
		 * zfs (for zvols).
		 */
		if (fsync(bc->bc_fd))
			err = errno;
#endif
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
#ifdef	__FreeBSD__
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else
			err = EOPNOTSUPP;
#else
		else if (bc->bc_ischr) {
			dkioc_free_list_t dfl = {
				.dfl_num_exts = 1,
				.dfl_offset = 0,
				.dfl_flags = 0,
				.dfl_exts = {
					{
						.dfle_start = br->br_offset,
						.dfle_length = br->br_resid
					}
				}
			};

			if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			struct flock fl = {
				.l_whence = 0,
				.l_type = F_WRLCK,
				.l_start = br->br_offset,
				.l_len = br->br_resid
			};

			if (fcntl(bc->bc_fd, F_FREESP, &fl))
				err = errno;
			else
				br->br_resid = 0;
		}
#endif
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}
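/*
 * Worker thread body.  The bounce buffer below is worth noting (an
 * observation about the code, not original commentary): raw GEOM providers
 * on FreeBSD only accept I/O in multiples of the sector size, which an
 * arbitrary guest scatter/gather list cannot guarantee segment by segment,
 * so for GEOM backends blockif_proc() stages multi-segment requests
 * through a contiguous MAXPHYS-sized buffer instead of issuing them with
 * preadv()/pwritev().
 */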

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

#ifdef	__FreeBSD__
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
#else
static void
blockif_sigcont_handler(int signal)
#endif
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}
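/*
 * The handler above is one half of the cancellation handshake:
 * blockif_cancel() pushes a blockif_sig_elem onto the lock-free
 * blockif_bse_head stack and then sends SIGCONT to the busy worker thread.
 * Whichever thread ends up running the handler pops every queued element
 * and wakes its waiter, which is why the whole list is drained even when
 * the signal was aimed at a different thread.
 */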

static void
blockif_init(void)
{
#ifdef	__FreeBSD__
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
#else
	(void) sigset(SIGCONT, blockif_sigcont_handler);
#endif
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
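/*
 * For example (a sketch of the intended behavior; handling of the trailing
 * options is up to pci_parse_legacy_config()), a legacy string such as
 *
 *	/dev/zvol/rdsk/tank/vm0,ro,sectorsize=512
 *
 * stores path=/dev/zvol/rdsk/tank/vm0 in the nvlist and hands
 * "ro,sectorsize=512" to pci_parse_legacy_config(), yielding the "ro" and
 * "sectorsize" nodes that blockif_open() consumes below.
 */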

struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
#ifdef	__FreeBSD__
	char name[MAXPATHLEN];
#endif
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
#ifdef	__FreeBSD__
	struct diocgattr_arg arg;
#else
	enum blockif_wce wce = WCE_NONE;
#endif
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
#ifndef __FreeBSD__
	pssopt = 0;
#endif
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* If the r/w open failed, retry the open read-only */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
#ifdef	__FreeBSD__
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
	}
#else
	psectsz = sbuf.st_blksize;
	if (S_ISCHR(sbuf.st_mode)) {
		struct dk_minfo_ext dkmext;
		int wce_val;

		/* Look for a more accurate physical blocksize */
		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
			psectsz = dkmext.dki_pbsize;
		}
		/* See if a configurable write cache is present and working */
		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
			/*
			 * If WCE is already active, disable it until the
			 * specific device driver calls for its return.  If it
			 * is not active, toggle it on and off to verify that
			 * such actions are possible.
			 */
			if (wce_val != 0) {
				wce_val = 0;
				/*
				 * Inability to disable the cache is a threat
				 * to data durability.
				 */
				assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
				wce = WCE_IOCTL;
			} else {
				int r1, r2;

				wce_val = 1;
				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
				wce_val = 0;
				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);

				if (r1 == 0 && r2 == 0) {
					wce = WCE_IOCTL;
				} else {
					/*
					 * If the cache toggle was not
					 * successful, ensure that the cache
					 * was not left enabled.
					 */
					assert(r1 != 0);
				}
			}
		}

		if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
			candelete = 0;

	} else {
		int flags;

		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
			flags |= O_DSYNC;
			if (fcntl(fd, F_SETFL, flags) != -1) {
				wce = WCE_FCNTL;
			}
		}

		/*
		 * We don't have a way to discover if a file supports the
		 * FREESP fcntl cmd (other than trying it).  However,
		 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
		 * NFSv3 and NFSv4 also forward the FREESP request to the
		 * server, so we always enable it for file-based volumes.
		 * Anyone trying to run volumes on an unsupported
		 * configuration is on their own, and should be prepared
		 * for the requests to fail.
		 */
		if (nodelete == 0)
			candelete = 1;
	}
#endif

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
#ifndef __FreeBSD__
	bc->bc_wce = wce;
#endif
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}
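/*
 * An illustrative usage sketch, deliberately excluded from compilation:
 * how a device emulation might open a backing store and issue an
 * asynchronous read.  The blockif_req field names follow their uses
 * elsewhere in this file; the nvlist is assumed to have been populated
 * via blockif_legacy_config().
 */
#if 0
static void
example_done(struct blockif_req *br, int err)
{
	/* Invoked on a blockif worker thread when the I/O completes. */
	if (err != 0)
		EPRINTLN("example read failed: %s", strerror(err));
}

static void
example_read(nvlist_t *nvl, void *vbuf, size_t len)
{
	/* Static so the request outlives this call; completion is async. */
	static struct blockif_req br;
	struct blockif_ctxt *bc;

	if ((bc = blockif_open(nvl, "example")) == NULL)
		return;

	br.br_iov[0].iov_base = vbuf;
	br.br_iov[0].iov_len = len;
	br.br_iovcnt = 1;
	br.br_offset = 0;
	br.br_resid = len;
	br.br_callback = example_done;

	/* E2BIG indicates the request queue is already full. */
	if (blockif_read(bc, &br) == E2BIG)
		example_done(&br, E2BIG);
}
#endif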

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o threads
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}
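/*
 * A worked example of the algorithm above: a 4 GiB image with 512-byte
 * sectors yields 8388608 sectors.  The first pass (17 sectors/track)
 * produces 482 heads, which exceeds 16, so the geometry is recomputed at
 * 31 and then 63 sectors/track, settling on C/H/S = 8322/16/63
 * (8322 * 16 * 63 = 8388576 sectors, slightly under the true size).
 */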

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifndef __FreeBSD__
int
blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
{
	int res = 0, flags;
	int clean_val = (wc_enable != 0) ? 1 : 0;

	(void) pthread_mutex_lock(&bc->bc_mtx);
	switch (bc->bc_wce) {
	case WCE_IOCTL:
		res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
		break;
	case WCE_FCNTL:
		if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
			if (wc_enable == 0) {
				flags |= O_DSYNC;
			} else {
				flags &= ~O_DSYNC;
			}
			if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
				res = -1;
			}
		} else {
			res = -1;
		}
		break;
	default:
		break;
	}

	/*
	 * After a successful disable of the write cache, ensure that any
	 * lingering data in the cache is synced out.
	 */
	if (res == 0 && wc_enable == 0) {
		res = fsync(bc->bc_fd);
	}
	(void) pthread_mutex_unlock(&bc->bc_mtx);

	return (res);
}
#endif /* __FreeBSD__ */
1102