xref: /freebsd/usr.sbin/bhyve/block_if.c (revision b23dbabb7f3edb3f323a64f03e37be2c9a8b2a45)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
5  * All rights reserved.
6  * Copyright 2020 Joyent, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #ifndef WITHOUT_CAPSICUM
37 #include <sys/capsicum.h>
38 #endif
39 #include <sys/queue.h>
40 #include <sys/errno.h>
41 #include <sys/stat.h>
42 #include <sys/ioctl.h>
43 #include <sys/disk.h>
44 
45 #include <assert.h>
46 #ifndef WITHOUT_CAPSICUM
47 #include <capsicum_helpers.h>
48 #endif
49 #include <err.h>
50 #include <fcntl.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <pthread.h>
55 #include <pthread_np.h>
56 #include <signal.h>
57 #include <sysexits.h>
58 #include <unistd.h>
59 
60 #include <machine/atomic.h>
61 #include <machine/vmm_snapshot.h>
62 
63 #include "bhyverun.h"
64 #include "config.h"
65 #include "debug.h"
66 #include "mevent.h"
67 #include "pci_emul.h"
68 #include "block_if.h"
69 
/* Magic value kept in bc_magic while a context is open; asserted by the API. */
#define BLOCKIF_SIG	0xb109b109

/* Number of worker threads started for every block interface context. */
#define BLOCKIF_NUMTHR	8
/* Number of request elements embedded in each context (size of bc_reqs). */
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
74 
/* Operation carried by a queued request element (be_op). */
enum blockop {
	BOP_READ,	/* read into the request's iovec */
	BOP_WRITE,	/* write from the request's iovec */
	BOP_FLUSH,	/* flush the backing store */
	BOP_DELETE	/* deallocate/discard a byte range */
};
81 
/* Life-cycle state of a request element (be_status). */
enum blockstat {
	BST_FREE,	/* on the free queue, not bound to a request */
	BST_BLOCK,	/* on the pending queue, held back behind another request */
	BST_PEND,	/* on the pending queue, ready for a worker to pick up */
	BST_BUSY,	/* on the busy queue, owned by a worker thread */
	BST_DONE	/* processed by the worker, not yet returned to the free queue */
};
89 
90 struct blockif_elem {
91 	TAILQ_ENTRY(blockif_elem) be_link;
92 	struct blockif_req  *be_req;
93 	enum blockop	     be_op;
94 	enum blockstat	     be_status;
95 	pthread_t            be_tid;
96 	off_t		     be_block;
97 };
98 
99 struct blockif_ctxt {
100 	unsigned int		bc_magic;
101 	int			bc_fd;
102 	int			bc_ischr;
103 	int			bc_isgeom;
104 	int			bc_candelete;
105 	int			bc_rdonly;
106 	off_t			bc_size;
107 	int			bc_sectsz;
108 	int			bc_psectsz;
109 	int			bc_psectoff;
110 	int			bc_closing;
111 	int			bc_paused;
112 	pthread_t		bc_btid[BLOCKIF_NUMTHR];
113 	pthread_mutex_t		bc_mtx;
114 	pthread_cond_t		bc_cond;
115 	pthread_cond_t		bc_work_done_cond;
116 	blockif_resize_cb	*bc_resize_cb;
117 	void			*bc_resize_cb_arg;
118 	struct mevent		*bc_resize_event;
119 
120 	/* Request elements and free/pending/busy queues */
121 	TAILQ_HEAD(, blockif_elem) bc_freeq;
122 	TAILQ_HEAD(, blockif_elem) bc_pendq;
123 	TAILQ_HEAD(, blockif_elem) bc_busyq;
124 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
125 	int			bc_bootindex;
126 };
127 
/* Guards the one-time call to blockif_init() made from blockif_open(). */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * One waiter in blockif_cancel().  Elements are pushed on a singly linked
 * list and released by the SIGCONT handler, which clears bse_pending and
 * signals bse_cond.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;	/* protects bse_pending */
	pthread_cond_t			bse_cond;	/* signalled by the handler */
	int				bse_pending;	/* non-zero until handled */
	struct blockif_sig_elem		*bse_next;	/* next waiter on the list */
};

/* Head of the waiter list; updated with atomic compare-and-set. */
static struct blockif_sig_elem *blockif_bse_head;
138 
139 static int
140 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
141 		enum blockop op)
142 {
143 	struct blockif_elem *be, *tbe;
144 	off_t off;
145 	int i;
146 
147 	be = TAILQ_FIRST(&bc->bc_freeq);
148 	assert(be != NULL);
149 	assert(be->be_status == BST_FREE);
150 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
151 	be->be_req = breq;
152 	be->be_op = op;
153 	switch (op) {
154 	case BOP_READ:
155 	case BOP_WRITE:
156 	case BOP_DELETE:
157 		off = breq->br_offset;
158 		for (i = 0; i < breq->br_iovcnt; i++)
159 			off += breq->br_iov[i].iov_len;
160 		break;
161 	default:
162 		off = OFF_MAX;
163 	}
164 	be->be_block = off;
165 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
166 		if (tbe->be_block == breq->br_offset)
167 			break;
168 	}
169 	if (tbe == NULL) {
170 		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
171 			if (tbe->be_block == breq->br_offset)
172 				break;
173 		}
174 	}
175 	if (tbe == NULL)
176 		be->be_status = BST_PEND;
177 	else
178 		be->be_status = BST_BLOCK;
179 	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
180 	return (be->be_status == BST_PEND);
181 }
182 
183 static int
184 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
185 {
186 	struct blockif_elem *be;
187 
188 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
189 		if (be->be_status == BST_PEND)
190 			break;
191 		assert(be->be_status == BST_BLOCK);
192 	}
193 	if (be == NULL)
194 		return (0);
195 	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
196 	be->be_status = BST_BUSY;
197 	be->be_tid = t;
198 	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
199 	*bep = be;
200 	return (1);
201 }
202 
203 static void
204 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
205 {
206 	struct blockif_elem *tbe;
207 
208 	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
209 		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
210 	else
211 		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
212 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
213 		if (tbe->be_req->br_offset == be->be_block)
214 			tbe->be_status = BST_PEND;
215 	}
216 	be->be_tid = 0;
217 	be->be_status = BST_FREE;
218 	be->be_req = NULL;
219 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
220 }
221 
222 static int
223 blockif_flush_bc(struct blockif_ctxt *bc)
224 {
225 	if (bc->bc_ischr) {
226 		if (ioctl(bc->bc_fd, DIOCGFLUSH))
227 			return (errno);
228 	} else if (fsync(bc->bc_fd))
229 		return (errno);
230 
231 	return (0);
232 }
233 
234 static void
235 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
236 {
237 	struct spacectl_range range;
238 	struct blockif_req *br;
239 	off_t arg[2];
240 	ssize_t n;
241 	size_t clen, len, off, boff, voff;
242 	int i, err;
243 
244 	br = be->be_req;
245 	assert(br->br_resid >= 0);
246 
247 	if (br->br_iovcnt <= 1)
248 		buf = NULL;
249 	err = 0;
250 	switch (be->be_op) {
251 	case BOP_READ:
252 		if (buf == NULL) {
253 			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
254 			    br->br_offset)) < 0)
255 				err = errno;
256 			else
257 				br->br_resid -= n;
258 			break;
259 		}
260 		i = 0;
261 		off = voff = 0;
262 		while (br->br_resid > 0) {
263 			len = MIN(br->br_resid, MAXPHYS);
264 			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
265 			if (n < 0) {
266 				err = errno;
267 				break;
268 			}
269 			len = (size_t)n;
270 			boff = 0;
271 			do {
272 				clen = MIN(len - boff, br->br_iov[i].iov_len -
273 				    voff);
274 				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
275 				    buf + boff, clen);
276 				if (clen < br->br_iov[i].iov_len - voff)
277 					voff += clen;
278 				else {
279 					i++;
280 					voff = 0;
281 				}
282 				boff += clen;
283 			} while (boff < len);
284 			off += len;
285 			br->br_resid -= len;
286 		}
287 		break;
288 	case BOP_WRITE:
289 		if (bc->bc_rdonly) {
290 			err = EROFS;
291 			break;
292 		}
293 		if (buf == NULL) {
294 			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
295 			    br->br_offset)) < 0)
296 				err = errno;
297 			else
298 				br->br_resid -= n;
299 			break;
300 		}
301 		i = 0;
302 		off = voff = 0;
303 		while (br->br_resid > 0) {
304 			len = MIN(br->br_resid, MAXPHYS);
305 			boff = 0;
306 			do {
307 				clen = MIN(len - boff, br->br_iov[i].iov_len -
308 				    voff);
309 				memcpy(buf + boff,
310 				    (uint8_t *)br->br_iov[i].iov_base + voff,
311 				    clen);
312 				if (clen < br->br_iov[i].iov_len - voff)
313 					voff += clen;
314 				else {
315 					i++;
316 					voff = 0;
317 				}
318 				boff += clen;
319 			} while (boff < len);
320 
321 			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
322 			if (n < 0) {
323 				err = errno;
324 				break;
325 			}
326 			off += n;
327 			br->br_resid -= n;
328 		}
329 		break;
330 	case BOP_FLUSH:
331 		err = blockif_flush_bc(bc);
332 		break;
333 	case BOP_DELETE:
334 		if (!bc->bc_candelete)
335 			err = EOPNOTSUPP;
336 		else if (bc->bc_rdonly)
337 			err = EROFS;
338 		else if (bc->bc_ischr) {
339 			arg[0] = br->br_offset;
340 			arg[1] = br->br_resid;
341 			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
342 				err = errno;
343 			else
344 				br->br_resid = 0;
345 		} else {
346 			range.r_offset = br->br_offset;
347 			range.r_len = br->br_resid;
348 
349 			while (range.r_len > 0) {
350 				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
351 				    &range, 0, &range) != 0) {
352 					err = errno;
353 					break;
354 				}
355 			}
356 			if (err == 0)
357 				br->br_resid = 0;
358 		}
359 		break;
360 	default:
361 		err = EINVAL;
362 		break;
363 	}
364 
365 	be->be_status = BST_DONE;
366 
367 	(*br->br_callback)(br, err);
368 }
369 
370 static inline bool
371 blockif_empty(const struct blockif_ctxt *bc)
372 {
373 	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
374 }
375 
376 static void *
377 blockif_thr(void *arg)
378 {
379 	struct blockif_ctxt *bc;
380 	struct blockif_elem *be;
381 	pthread_t t;
382 	uint8_t *buf;
383 
384 	bc = arg;
385 	if (bc->bc_isgeom)
386 		buf = malloc(MAXPHYS);
387 	else
388 		buf = NULL;
389 	t = pthread_self();
390 
391 	pthread_mutex_lock(&bc->bc_mtx);
392 	for (;;) {
393 		while (blockif_dequeue(bc, t, &be)) {
394 			pthread_mutex_unlock(&bc->bc_mtx);
395 			blockif_proc(bc, be, buf);
396 			pthread_mutex_lock(&bc->bc_mtx);
397 			blockif_complete(bc, be);
398 		}
399 
400 		/* If none to work, notify the main thread */
401 		if (blockif_empty(bc))
402 			pthread_cond_broadcast(&bc->bc_work_done_cond);
403 
404 		/* Check ctxt status here to see if exit requested */
405 		if (bc->bc_closing)
406 			break;
407 
408 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
409 	}
410 	pthread_mutex_unlock(&bc->bc_mtx);
411 
412 	if (buf)
413 		free(buf);
414 	pthread_exit(NULL);
415 	return (NULL);
416 }
417 
418 static void
419 blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
420     void *arg __unused)
421 {
422 	struct blockif_sig_elem *bse;
423 
424 	for (;;) {
425 		/*
426 		 * Process the entire list even if not intended for
427 		 * this thread.
428 		 */
429 		do {
430 			bse = blockif_bse_head;
431 			if (bse == NULL)
432 				return;
433 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
434 					    (uintptr_t)bse,
435 					    (uintptr_t)bse->bse_next));
436 
437 		pthread_mutex_lock(&bse->bse_mtx);
438 		bse->bse_pending = 0;
439 		pthread_cond_signal(&bse->bse_cond);
440 		pthread_mutex_unlock(&bse->bse_mtx);
441 	}
442 }
443 
444 static void
445 blockif_init(void)
446 {
447 	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
448 	(void) signal(SIGCONT, SIG_IGN);
449 }
450 
451 int
452 blockif_legacy_config(nvlist_t *nvl, const char *opts)
453 {
454 	char *cp, *path;
455 
456 	if (opts == NULL)
457 		return (0);
458 
459 	cp = strchr(opts, ',');
460 	if (cp == NULL) {
461 		set_config_value_node(nvl, "path", opts);
462 		return (0);
463 	}
464 	path = strndup(opts, cp - opts);
465 	set_config_value_node(nvl, "path", path);
466 	free(path);
467 	return (pci_parse_legacy_config(nvl, cp + 1));
468 }
469 
470 int
471 blockif_add_boot_device(struct pci_devinst *const pi,
472     struct blockif_ctxt *const bc)
473 {
474 	if (bc->bc_bootindex < 0)
475 		return (0);
476 
477 	return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
478 }
479 
480 struct blockif_ctxt *
481 blockif_open(nvlist_t *nvl, const char *ident)
482 {
483 	char tname[MAXCOMLEN + 1];
484 	char name[MAXPATHLEN];
485 	const char *path, *pssval, *ssval, *bootindex_val;
486 	char *cp;
487 	struct blockif_ctxt *bc;
488 	struct stat sbuf;
489 	struct diocgattr_arg arg;
490 	off_t size, psectsz, psectoff;
491 	int extra, fd, i, sectsz;
492 	int ro, candelete, geom, ssopt, pssopt;
493 	int nodelete;
494 	int bootindex;
495 
496 #ifndef WITHOUT_CAPSICUM
497 	cap_rights_t rights;
498 	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
499 #endif
500 
501 	pthread_once(&blockif_once, blockif_init);
502 
503 	fd = -1;
504 	extra = 0;
505 	ssopt = 0;
506 	ro = 0;
507 	nodelete = 0;
508 	bootindex = -1;
509 
510 	if (get_config_bool_node_default(nvl, "nocache", false))
511 		extra |= O_DIRECT;
512 	if (get_config_bool_node_default(nvl, "nodelete", false))
513 		nodelete = 1;
514 	if (get_config_bool_node_default(nvl, "sync", false) ||
515 	    get_config_bool_node_default(nvl, "direct", false))
516 		extra |= O_SYNC;
517 	if (get_config_bool_node_default(nvl, "ro", false))
518 		ro = 1;
519 	ssval = get_config_value_node(nvl, "sectorsize");
520 	if (ssval != NULL) {
521 		ssopt = strtol(ssval, &cp, 10);
522 		if (cp == ssval) {
523 			EPRINTLN("Invalid sector size \"%s\"", ssval);
524 			goto err;
525 		}
526 		if (*cp == '\0') {
527 			pssopt = ssopt;
528 		} else if (*cp == '/') {
529 			pssval = cp + 1;
530 			pssopt = strtol(pssval, &cp, 10);
531 			if (cp == pssval || *cp != '\0') {
532 				EPRINTLN("Invalid sector size \"%s\"", ssval);
533 				goto err;
534 			}
535 		} else {
536 			EPRINTLN("Invalid sector size \"%s\"", ssval);
537 			goto err;
538 		}
539 	}
540 
541 	bootindex_val = get_config_value_node(nvl, "bootindex");
542 	if (bootindex_val != NULL) {
543 		bootindex = atoi(bootindex_val);
544 	}
545 
546 	path = get_config_value_node(nvl, "path");
547 	if (path == NULL) {
548 		EPRINTLN("Missing \"path\" for block device.");
549 		goto err;
550 	}
551 
552 	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
553 	if (fd < 0 && !ro) {
554 		/* Attempt a r/w fail with a r/o open */
555 		fd = open(path, O_RDONLY | extra);
556 		ro = 1;
557 	}
558 
559 	if (fd < 0) {
560 		warn("Could not open backing file: %s", path);
561 		goto err;
562 	}
563 
564         if (fstat(fd, &sbuf) < 0) {
565 		warn("Could not stat backing file %s", path);
566 		goto err;
567         }
568 
569 #ifndef WITHOUT_CAPSICUM
570 	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
571 	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
572 	if (ro)
573 		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
574 
575 	if (caph_rights_limit(fd, &rights) == -1)
576 		errx(EX_OSERR, "Unable to apply rights for sandbox");
577 #endif
578 
579         /*
580 	 * Deal with raw devices
581 	 */
582         size = sbuf.st_size;
583 	sectsz = DEV_BSIZE;
584 	psectsz = psectoff = 0;
585 	candelete = geom = 0;
586 	if (S_ISCHR(sbuf.st_mode)) {
587 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
588 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
589 			perror("Could not fetch dev blk/sector size");
590 			goto err;
591 		}
592 		assert(size != 0);
593 		assert(sectsz != 0);
594 		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
595 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
596 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
597 		arg.len = sizeof(arg.value.i);
598 		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
599 			candelete = arg.value.i;
600 		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
601 			geom = 1;
602 	} else {
603 		psectsz = sbuf.st_blksize;
604 		/* Avoid fallback implementation */
605 		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
606 	}
607 
608 #ifndef WITHOUT_CAPSICUM
609 	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
610 		errx(EX_OSERR, "Unable to apply rights for sandbox");
611 #endif
612 
613 	if (ssopt != 0) {
614 		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
615 		    ssopt > pssopt) {
616 			EPRINTLN("Invalid sector size %d/%d",
617 			    ssopt, pssopt);
618 			goto err;
619 		}
620 
621 		/*
622 		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
623 		 * size be a multiple of the device's sector size.
624 		 *
625 		 * Validate that the emulated sector size complies with this
626 		 * requirement.
627 		 */
628 		if (S_ISCHR(sbuf.st_mode)) {
629 			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
630 				EPRINTLN("Sector size %d incompatible "
631 				    "with underlying device sector size %d",
632 				    ssopt, sectsz);
633 				goto err;
634 			}
635 		}
636 
637 		sectsz = ssopt;
638 		psectsz = pssopt;
639 		psectoff = 0;
640 	}
641 
642 	bc = calloc(1, sizeof(struct blockif_ctxt));
643 	if (bc == NULL) {
644 		perror("calloc");
645 		goto err;
646 	}
647 
648 	bc->bc_magic = BLOCKIF_SIG;
649 	bc->bc_fd = fd;
650 	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
651 	bc->bc_isgeom = geom;
652 	bc->bc_candelete = candelete;
653 	bc->bc_rdonly = ro;
654 	bc->bc_size = size;
655 	bc->bc_sectsz = sectsz;
656 	bc->bc_psectsz = psectsz;
657 	bc->bc_psectoff = psectoff;
658 	pthread_mutex_init(&bc->bc_mtx, NULL);
659 	pthread_cond_init(&bc->bc_cond, NULL);
660 	bc->bc_paused = 0;
661 	pthread_cond_init(&bc->bc_work_done_cond, NULL);
662 	TAILQ_INIT(&bc->bc_freeq);
663 	TAILQ_INIT(&bc->bc_pendq);
664 	TAILQ_INIT(&bc->bc_busyq);
665 	bc->bc_bootindex = bootindex;
666 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
667 		bc->bc_reqs[i].be_status = BST_FREE;
668 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
669 	}
670 
671 	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
672 		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
673 		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
674 		pthread_set_name_np(bc->bc_btid[i], tname);
675 	}
676 
677 	return (bc);
678 err:
679 	if (fd >= 0)
680 		close(fd);
681 	return (NULL);
682 }
683 
684 static void
685 blockif_resized(int fd, enum ev_type type __unused, void *arg)
686 {
687 	struct blockif_ctxt *bc;
688 	struct stat sb;
689 	off_t mediasize;
690 
691 	if (fstat(fd, &sb) != 0)
692 		return;
693 
694 	if (S_ISCHR(sb.st_mode)) {
695 		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
696 			EPRINTLN("blockif_resized: get mediasize failed: %s",
697 			    strerror(errno));
698 			return;
699 		}
700 	} else
701 		mediasize = sb.st_size;
702 
703 	bc = arg;
704 	pthread_mutex_lock(&bc->bc_mtx);
705 	if (mediasize != bc->bc_size) {
706 		bc->bc_size = mediasize;
707 		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
708 	}
709 	pthread_mutex_unlock(&bc->bc_mtx);
710 }
711 
712 int
713 blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
714     void *cb_arg)
715 {
716 	struct stat sb;
717 	int err;
718 
719 	if (cb == NULL)
720 		return (EINVAL);
721 
722 	err = 0;
723 
724 	pthread_mutex_lock(&bc->bc_mtx);
725 	if (bc->bc_resize_cb != NULL) {
726 		err = EBUSY;
727 		goto out;
728 	}
729 
730 	assert(bc->bc_closing == 0);
731 
732 	if (fstat(bc->bc_fd, &sb) != 0) {
733 		err = errno;
734 		goto out;
735 	}
736 
737 	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
738 	    EVFF_ATTRIB, blockif_resized, bc);
739 	if (bc->bc_resize_event == NULL) {
740 		err = ENXIO;
741 		goto out;
742 	}
743 
744 	bc->bc_resize_cb = cb;
745 	bc->bc_resize_cb_arg = cb_arg;
746 out:
747 	pthread_mutex_unlock(&bc->bc_mtx);
748 
749 	return (err);
750 }
751 
752 static int
753 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
754 		enum blockop op)
755 {
756 	int err;
757 
758 	err = 0;
759 
760 	pthread_mutex_lock(&bc->bc_mtx);
761 	assert(!bc->bc_paused);
762 	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
763 		/*
764 		 * Enqueue and inform the block i/o thread
765 		 * that there is work available
766 		 */
767 		if (blockif_enqueue(bc, breq, op))
768 			pthread_cond_signal(&bc->bc_cond);
769 	} else {
770 		/*
771 		 * Callers are not allowed to enqueue more than
772 		 * the specified blockif queue limit. Return an
773 		 * error to indicate that the queue length has been
774 		 * exceeded.
775 		 */
776 		err = E2BIG;
777 	}
778 	pthread_mutex_unlock(&bc->bc_mtx);
779 
780 	return (err);
781 }
782 
783 int
784 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
785 {
786 	assert(bc->bc_magic == BLOCKIF_SIG);
787 	return (blockif_request(bc, breq, BOP_READ));
788 }
789 
790 int
791 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
792 {
793 	assert(bc->bc_magic == BLOCKIF_SIG);
794 	return (blockif_request(bc, breq, BOP_WRITE));
795 }
796 
797 int
798 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
799 {
800 	assert(bc->bc_magic == BLOCKIF_SIG);
801 	return (blockif_request(bc, breq, BOP_FLUSH));
802 }
803 
804 int
805 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
806 {
807 	assert(bc->bc_magic == BLOCKIF_SIG);
808 	return (blockif_request(bc, breq, BOP_DELETE));
809 }
810 
811 int
812 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
813 {
814 	struct blockif_elem *be;
815 
816 	assert(bc->bc_magic == BLOCKIF_SIG);
817 
818 	pthread_mutex_lock(&bc->bc_mtx);
819 	/* XXX: not waiting while paused */
820 
821 	/*
822 	 * Check pending requests.
823 	 */
824 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
825 		if (be->be_req == breq)
826 			break;
827 	}
828 	if (be != NULL) {
829 		/*
830 		 * Found it.
831 		 */
832 		blockif_complete(bc, be);
833 		pthread_mutex_unlock(&bc->bc_mtx);
834 
835 		return (0);
836 	}
837 
838 	/*
839 	 * Check in-flight requests.
840 	 */
841 	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
842 		if (be->be_req == breq)
843 			break;
844 	}
845 	if (be == NULL) {
846 		/*
847 		 * Didn't find it.
848 		 */
849 		pthread_mutex_unlock(&bc->bc_mtx);
850 		return (EINVAL);
851 	}
852 
853 	/*
854 	 * Interrupt the processing thread to force it return
855 	 * prematurely via it's normal callback path.
856 	 */
857 	while (be->be_status == BST_BUSY) {
858 		struct blockif_sig_elem bse, *old_head;
859 
860 		pthread_mutex_init(&bse.bse_mtx, NULL);
861 		pthread_cond_init(&bse.bse_cond, NULL);
862 
863 		bse.bse_pending = 1;
864 
865 		do {
866 			old_head = blockif_bse_head;
867 			bse.bse_next = old_head;
868 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
869 					    (uintptr_t)old_head,
870 					    (uintptr_t)&bse));
871 
872 		pthread_kill(be->be_tid, SIGCONT);
873 
874 		pthread_mutex_lock(&bse.bse_mtx);
875 		while (bse.bse_pending)
876 			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
877 		pthread_mutex_unlock(&bse.bse_mtx);
878 	}
879 
880 	pthread_mutex_unlock(&bc->bc_mtx);
881 
882 	/*
883 	 * The processing thread has been interrupted.  Since it's not
884 	 * clear if the callback has been invoked yet, return EBUSY.
885 	 */
886 	return (EBUSY);
887 }
888 
889 int
890 blockif_close(struct blockif_ctxt *bc)
891 {
892 	void *jval;
893 	int i;
894 
895 	assert(bc->bc_magic == BLOCKIF_SIG);
896 
897 	/*
898 	 * Stop the block i/o thread
899 	 */
900 	pthread_mutex_lock(&bc->bc_mtx);
901 	bc->bc_closing = 1;
902 	if (bc->bc_resize_event != NULL)
903 		mevent_disable(bc->bc_resize_event);
904 	pthread_mutex_unlock(&bc->bc_mtx);
905 	pthread_cond_broadcast(&bc->bc_cond);
906 	for (i = 0; i < BLOCKIF_NUMTHR; i++)
907 		pthread_join(bc->bc_btid[i], &jval);
908 
909 	/* XXX Cancel queued i/o's ??? */
910 
911 	/*
912 	 * Release resources
913 	 */
914 	bc->bc_magic = 0;
915 	close(bc->bc_fd);
916 	free(bc);
917 
918 	return (0);
919 }
920 
921 /*
922  * Return virtual C/H/S values for a given block. Use the algorithm
923  * outlined in the VHD specification to calculate values.
924  */
925 void
926 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
927 {
928 	off_t sectors;		/* total sectors of the block dev */
929 	off_t hcyl;		/* cylinders times heads */
930 	uint16_t secpt;		/* sectors per track */
931 	uint8_t heads;
932 
933 	assert(bc->bc_magic == BLOCKIF_SIG);
934 
935 	sectors = bc->bc_size / bc->bc_sectsz;
936 
937 	/* Clamp the size to the largest possible with CHS */
938 	if (sectors > 65535L * 16 * 255)
939 		sectors = 65535L * 16 * 255;
940 
941 	if (sectors >= 65536L * 16 * 63) {
942 		secpt = 255;
943 		heads = 16;
944 		hcyl = sectors / secpt;
945 	} else {
946 		secpt = 17;
947 		hcyl = sectors / secpt;
948 		heads = (hcyl + 1023) / 1024;
949 
950 		if (heads < 4)
951 			heads = 4;
952 
953 		if (hcyl >= (heads * 1024) || heads > 16) {
954 			secpt = 31;
955 			heads = 16;
956 			hcyl = sectors / secpt;
957 		}
958 		if (hcyl >= (heads * 1024)) {
959 			secpt = 63;
960 			heads = 16;
961 			hcyl = sectors / secpt;
962 		}
963 	}
964 
965 	*c = hcyl / heads;
966 	*h = heads;
967 	*s = secpt;
968 }
969 
970 /*
971  * Accessors
972  */
973 off_t
974 blockif_size(struct blockif_ctxt *bc)
975 {
976 	assert(bc->bc_magic == BLOCKIF_SIG);
977 	return (bc->bc_size);
978 }
979 
980 int
981 blockif_sectsz(struct blockif_ctxt *bc)
982 {
983 	assert(bc->bc_magic == BLOCKIF_SIG);
984 	return (bc->bc_sectsz);
985 }
986 
987 void
988 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
989 {
990 	assert(bc->bc_magic == BLOCKIF_SIG);
991 	*size = bc->bc_psectsz;
992 	*off = bc->bc_psectoff;
993 }
994 
995 int
996 blockif_queuesz(struct blockif_ctxt *bc)
997 {
998 	assert(bc->bc_magic == BLOCKIF_SIG);
999 	return (BLOCKIF_MAXREQ - 1);
1000 }
1001 
1002 int
1003 blockif_is_ro(struct blockif_ctxt *bc)
1004 {
1005 	assert(bc->bc_magic == BLOCKIF_SIG);
1006 	return (bc->bc_rdonly);
1007 }
1008 
1009 int
1010 blockif_candelete(struct blockif_ctxt *bc)
1011 {
1012 	assert(bc->bc_magic == BLOCKIF_SIG);
1013 	return (bc->bc_candelete);
1014 }
1015 
1016 #ifdef BHYVE_SNAPSHOT
1017 void
1018 blockif_pause(struct blockif_ctxt *bc)
1019 {
1020 	assert(bc != NULL);
1021 	assert(bc->bc_magic == BLOCKIF_SIG);
1022 
1023 	pthread_mutex_lock(&bc->bc_mtx);
1024 	bc->bc_paused = 1;
1025 
1026 	/* The interface is paused. Wait for workers to finish their work */
1027 	while (!blockif_empty(bc))
1028 		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
1029 	pthread_mutex_unlock(&bc->bc_mtx);
1030 
1031 	if (!bc->bc_rdonly && blockif_flush_bc(bc))
1032 		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
1033 			__func__);
1034 }
1035 
1036 void
1037 blockif_resume(struct blockif_ctxt *bc)
1038 {
1039 	assert(bc != NULL);
1040 	assert(bc->bc_magic == BLOCKIF_SIG);
1041 
1042 	pthread_mutex_lock(&bc->bc_mtx);
1043 	bc->bc_paused = 0;
1044 	pthread_mutex_unlock(&bc->bc_mtx);
1045 }
1046 #endif	/* BHYVE_SNAPSHOT */
1047