/* xref: /freebsd/usr.sbin/bhyve/block_if.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
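
/*
 * Note on sizing: the request pool holds BLOCKIF_RING_MAX entries (the
 * largest ring a frontend may post) plus one extra slot per worker
 * thread, presumably so that requests claimed by workers never starve
 * the free list for a full ring's worth of submissions.
 */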

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};
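
/*
 * A request element moves through these states as follows (see
 * blockif_enqueue(), blockif_dequeue() and blockif_complete() below):
 *
 *   BST_FREE  -> BST_PEND  (enqueued, ready to run)
 *   BST_FREE  -> BST_BLOCK (enqueued, blocked behind an overlapping request)
 *   BST_BLOCK -> BST_PEND  (unblocked when the conflicting request completes)
 *   BST_PEND  -> BST_BUSY  (claimed by a worker thread)
 *   BST_BUSY  -> BST_DONE  (processed; callback invoked)
 *   BST_DONE  -> BST_FREE  (returned to the free queue)
 */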

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;
	off_t		     be_block;
};

struct blockif_ctxt {
	unsigned int		bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
	int			bc_bootindex;
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

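/*
 * Take a free element, record the request's end offset in be_block and
 * mark it BST_PEND, unless some pending or in-flight request ends at
 * this request's start offset, in which case it is queued as BST_BLOCK
 * and only released (by blockif_complete()) once that request finishes.
 * Returns non-zero if the new request is immediately runnable.
 */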
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}

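/*
 * Execute one request.  When the backing store is a GEOM device
 * (bc_isgeom), multi-iovec reads and writes are staged through the
 * worker's MAXPHYS-sized bounce buffer with pread()/pwrite(), since
 * such devices require that transfers be a multiple of the sector
 * size; otherwise preadv()/pwritev() operate on the iovecs directly.
 */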
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct spacectl_range range;
	struct blockif_req *br;
	off_t arg[2];
	ssize_t n;
	size_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	assert(br->br_resid >= 0);

	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			len = (size_t)n;
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    (uint8_t *)br->br_iov[i].iov_base + voff,
				    clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);

			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			off += n;
			br->br_resid -= n;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			/*
			 * fspacectl() stores the unprocessed remainder
			 * back into "range", so this loop retries until
			 * the whole extent has been deallocated.
			 */
			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

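/*
 * Worker thread body: repeatedly drain runnable requests, dropping
 * bc_mtx around the actual I/O in blockif_proc(), then sleep on
 * bc_cond until new work is signalled or the context is closing.
 */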
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		/* If there is no work left, notify the main thread */
		if (blockif_empty(bc))
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

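/*
 * SIGCONT handler used by blockif_cancel() to interrupt a worker that
 * is blocked in I/O.  Cancelling threads push a blockif_sig_elem onto
 * the lock-free blockif_bse_head stack and signal the worker; the
 * handler pops every entry (regardless of which thread it was meant
 * for) and wakes its waiter.
 */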
static void
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
    void *arg __unused)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

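/*
 * Translate a legacy comma-separated option string into nvlist config
 * nodes.  As an illustrative (hypothetical) example, an option string
 * such as "/vm/disk.img,nocache,sectorsize=4096" would set the "path"
 * node here and leave "nocache,sectorsize=4096" for
 * pci_parse_legacy_config() to handle.
 */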
int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

int
blockif_add_boot_device(struct pci_devinst *const pi,
    struct blockif_ctxt *const bc)
{
	if (bc->bc_bootindex < 0)
		return (0);

	return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
}

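/*
 * Open the backing store described by the "path", "nocache", "nodelete",
 * "sync"/"direct", "ro", "sectorsize" and "bootindex" config nodes and
 * start the worker threads.  A minimal caller-side sketch, assuming a
 * device model has already filled in a request "breq" whose br_callback
 * is invoked as (breq, err) on completion ("my_done_cb" and the ident
 * "4:0" are illustrative, not part of this file):
 *
 *	struct blockif_ctxt *bc;
 *
 *	bc = blockif_open(nvl, "4:0");
 *	if (bc == NULL)
 *		return (-1);
 *	breq->br_callback = my_done_cb;
 *	if (blockif_read(bc, breq) != 0)
 *		return (-1);	(E2BIG means the queue is full)
 *	...
 *	blockif_close(bc);
 */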
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval, *bootindex_val;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;
	int bootindex;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;
	bootindex = -1;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	bootindex_val = get_config_value_node(nvl, "bootindex");
	if (bootindex_val != NULL) {
		bootindex = atoi(bootindex_val);
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The r/w open failed: retry with a r/o open */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	bc->bc_bootindex = bootindex;
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}

static void
blockif_resized(int fd, enum ev_type type __unused, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

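/*
 * Register a callback to be invoked (with bc_mtx held) whenever the
 * backing file or device changes size.  Only one callback may be
 * registered per context; a second registration fails with EBUSY.
 */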
int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

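/*
 * Common entry point for the four request types below.  The caller
 * owns breq until its br_callback fires; a frontend that limits its
 * outstanding requests to blockif_queuesz() should never see E2BIG
 * here.
 */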
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	assert(!bc->bc_paused);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

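/*
 * Attempt to cancel an outstanding request.  Returns 0 if the request
 * had not yet been picked up by a worker, EINVAL if it is unknown, and
 * EBUSY if it was already in flight and the worker had to be
 * interrupted (in which case the callback may or may not have run).
 */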
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}


int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
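/*
 * As a worked example (illustrative only): a 64 GiB image with 512-byte
 * sectors gives sectors = 134217728, which is >= 65536*16*63, so the
 * first branch applies: S = 255, H = 16, and
 * C = (134217728 / 255) / 16 = 32896.
 */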
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535L * 16 * 255)
		sectors = 65535L * 16 * 255;

	if (sectors >= 65536L * 16 * 63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
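/*
 * Quiesce the interface for a snapshot: mark it paused so no new
 * requests may be submitted (blockif_request() asserts !bc_paused),
 * wait for the workers to drain both queues, then flush the backing
 * store so the on-disk state matches what the guest has been told.
 */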
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (!blockif_empty(bc))
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (!bc->bc_rdonly && blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
			__func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	pthread_mutex_unlock(&bc->bc_mtx);
}
#endif	/* BHYVE_SNAPSHOT */
1043