xref: /freebsd/usr.sbin/bhyve/block_if.c (revision 61b95bcb42993b24633b280791438266d78f2747)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

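/*
 * Worker threads per backing device, and the size of the shared request
 * element pool: presumably one element for every slot of a device ring
 * (BLOCKIF_RING_MAX) plus one in-flight element per worker thread.
 */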
#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;
	off_t		     be_block;
};

struct blockif_ctxt {
	unsigned int		bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

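/*
 * Elements of a lock-free singly-linked list, one per thread waiting in
 * blockif_cancel() for a worker to be interrupted by SIGCONT.  The
 * SIGCONT handler detaches the list and wakes each waiter.
 */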
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

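/*
 * Take an element from the free queue, fill it in and place it on the
 * pending queue.  A request is marked BST_BLOCK (not yet runnable) when
 * a pending or busy request ends exactly where the new request begins,
 * so that such in-order dependent requests are serialized.  Returns 1
 * if the request is immediately runnable (BST_PEND), telling the caller
 * to wake a worker thread.
 */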
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

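/*
 * Pull the first runnable (BST_PEND) request off the pending queue,
 * mark it busy and assign it to the calling worker thread.  Returns 1
 * on success, 0 if no runnable request is pending.
 */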
static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

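/*
 * Return a finished or cancelled element to the free queue, and unblock
 * any pending request that was serialized behind it, i.e. one whose
 * starting offset matches this request's recorded end offset.
 */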
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

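/*
 * Flush the backing store: a write-cache flush via the DIOCGFLUSH ioctl
 * for character devices, fsync(2) otherwise.  Returns 0 or an errno
 * value.
 */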
static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}

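/*
 * Execute one request.  When the backing store is a GEOM device and the
 * request has multiple iovecs, a per-thread bounce buffer of up to
 * MAXPHYS bytes is used to gather/scatter between the request's iovecs
 * and a single pread(2)/pwrite(2), so that each transfer can satisfy
 * the device's sector-size constraints; otherwise the iovecs are handed
 * directly to preadv(2)/pwritev(2).  The request's completion callback
 * is invoked in all cases.
 */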
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct spacectl_range range;
	struct blockif_req *br;
	off_t arg[2];
	ssize_t n;
	size_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	assert(br->br_resid >= 0);

	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			len = (size_t)n;
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    (uint8_t *)br->br_iov[i].iov_base + voff,
				    clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);

			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			off += n;
			br->br_resid -= n;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

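/*
 * Worker thread body: dequeue and process runnable requests until the
 * context is being closed.  The mutex is dropped around the actual I/O
 * in blockif_proc() so other workers can make progress.
 */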
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		/*
		 * If no work remains, notify any thread waiting for the
		 * queues to drain (e.g. in blockif_pause()).
		 */
		if (blockif_empty(bc))
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

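/*
 * SIGCONT handler, run from the mevent loop: atomically detach entries
 * from the global blockif_sig_elem list and wake the blockif_cancel()
 * callers waiting on them.
 */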
static void
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
    void *arg __unused)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

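/*
 * Convert a legacy device configuration string of the form
 * "path[,option...]" into a "path" node plus option nodes in the
 * nvlist.
 */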
int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

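/*
 * Open the backing store named by the "path" configuration node and
 * construct a blockif context around it.  The boolean nodes "nocache"
 * (O_DIRECT), "sync"/"direct" (O_SYNC), "ro" and "nodelete" adjust the
 * open, and "sectorsize" ("logical[/physical]") overrides the reported
 * sector sizes.  A read/write open is retried read-only on failure, and
 * the descriptor is confined to a Capsicum sandbox when support is
 * compiled in.
 */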
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Retry read-only if the read/write open failed */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}

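/*
 * EVF_VNODE attribute-change handler: re-query the size of the backing
 * store and, if it changed, invoke the registered resize callback.
 */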
static void
blockif_resized(int fd, enum ev_type type __unused, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

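/*
 * Register a callback to be invoked when the backing store changes
 * size.  Only one callback may be registered per context; EBUSY is
 * returned otherwise.
 */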
int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

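/*
 * Common entry point for request submission.  Fails with E2BIG when no
 * free request elements remain, i.e. the caller exceeded the queue
 * depth advertised by blockif_queuesz().
 */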
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	assert(!bc->bc_paused);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}
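
/*
 * A minimal consumer sketch (hypothetical; names such as "guest_buf",
 * "lba" and "done" are illustrative only, not part of this file).  An
 * emulation fills in a struct blockif_req and submits it; br_callback
 * runs on a worker thread once the I/O completes:
 *
 *	static void
 *	done(struct blockif_req *br, int err)
 *	{
 *		// complete the guest request using err / br->br_resid
 *	}
 *
 *	br->br_iov[0].iov_base = guest_buf;
 *	br->br_iov[0].iov_len = len;
 *	br->br_iovcnt = 1;
 *	br->br_offset = lba * blockif_sectsz(bc);
 *	br->br_resid = len;
 *	br->br_callback = done;
 *	if (blockif_read(bc, br) != 0)
 *		// queue limit exceeded (E2BIG); fail the request
 */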
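/*
 * Cancel a previously submitted request.  Returns 0 if the request was
 * still pending and has been removed (its callback will not run),
 * EINVAL if the request is not known, and EBUSY if a worker thread was
 * already processing it (the thread is interrupted with SIGCONT, but
 * the callback may still be invoked).
 */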
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

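/*
 * Tear down a blockif context: signal the worker threads to exit, join
 * them, and release the file descriptor and request pool.
 */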
int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535L * 16 * 255)
		sectors = 65535L * 16 * 255;

	if (sectors >= 65536L * 16 * 63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
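/*
 * Snapshot support: stop accepting new requests, wait for the worker
 * threads to drain the queues, then flush the backing store so its
 * state is stable while the snapshot is taken.
 */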
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (!blockif_empty(bc))
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (!bc->bc_rdonly && blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
			__func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	pthread_mutex_unlock(&bc->bc_mtx);
}
#endif	/* BHYVE_SNAPSHOT */
1028