/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

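/*
 * Fixed pool of worker threads per backing store.  The request array
 * leaves room for the largest guest-visible ring (BLOCKIF_RING_MAX) to
 * be full while every worker thread still holds one in-flight element.
 */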
#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;
	off_t		     be_block;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

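/*
 * Pull an element off the free list and queue the request.  A request
 * that begins exactly where a pending or in-flight request ends is
 * marked BST_BLOCK and serialized behind it; otherwise it is marked
 * BST_PEND and may be dispatched immediately.  Returns nonzero if a
 * worker thread should be woken.
 */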
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

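/*
 * Hand the first dispatchable (BST_PEND) request on the pending queue
 * to worker thread 't' and move it to the busy queue.  Returns 0 if
 * nothing is currently runnable.
 */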
static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

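/*
 * Retire a finished element: unblock any queued requests that were
 * serialized behind it and return the element to the free list.
 */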
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

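/*
 * Flush the backing store: DIOCGFLUSH for character devices, fsync(2)
 * for plain files.
 */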
static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}

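/*
 * Execute one request on the calling worker thread.  'buf' is an
 * optional MAXPHYS-sized bounce buffer, allocated for GEOM-backed
 * devices so that multi-segment transfers are issued as single
 * sector-aligned reads and writes.
 */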
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
	off_t arg[2];
	ssize_t clen, len, off, boff, voff;
	int i, err;
	struct spacectl_range range;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
				   br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
				    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

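/*
 * Worker thread body: drain the pending queue, then sleep on bc_cond
 * until more work arrives or the context is closing.
 */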
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		/* If no work remains, notify the main thread */
		if (blockif_empty(bc))
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

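/*
 * Run from the mevent loop when a SIGCONT sent by blockif_cancel is
 * seen: pop every waiter off the global list and wake it, regardless
 * of which thread the signal was aimed at.
 */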
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

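/*
 * One-time setup: have mevent listen for the SIGCONT used by
 * blockif_cancel and ignore the signal's default disposition.
 */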
static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

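/*
 * Convert a legacy "path[,option=value,...]" option string into config
 * nodes: everything before the first comma becomes "path", the rest is
 * handed to pci_parse_legacy_config().
 */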
int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Fall back to a read-only open if the r/w open failed */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}

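/*
 * mevent EVFF_ATTRIB callback: re-read the backing store's media size
 * and run the registered resize callback if the size has changed.
 */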
static void
blockif_resized(int fd, enum ev_type type, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

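/*
 * Register a callback to be invoked when the backing store changes
 * size.  At most one callback may be registered per context; EBUSY is
 * returned for subsequent attempts.
 */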
int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	assert(!bc->bc_paused);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

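/*
 * Public request entry points.  Each validates the context magic and
 * queues one operation; E2BIG is returned once the caller exceeds the
 * advertised queue depth (blockif_queuesz()).
 *
 * Illustrative consumer sketch (the softc layout and the names
 * "my_done" and "sc" are hypothetical, not part of this API):
 *
 *	static void
 *	my_done(struct blockif_req *br, int err)
 *	{
 *		// err is 0 or an errno value; br->br_param returns the
 *		// cookie stashed by the submitter.
 *	}
 *
 *	struct blockif_req *br = &sc->sc_req;
 *	br->br_iov[0].iov_base = sc->sc_buf;
 *	br->br_iov[0].iov_len = blockif_sectsz(bc);
 *	br->br_iovcnt = 1;
 *	br->br_offset = 0;
 *	br->br_resid = br->br_iov[0].iov_len;
 *	br->br_callback = my_done;
 *	br->br_param = sc;
 *	if (blockif_read(bc, br) != 0) {
 *		// queue exhausted; retry after a completion
 *	}
 */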
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

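/*
 * Attempt to cancel a request.  Returns 0 if it was still queued and
 * has been removed without running, EINVAL if it is unknown to this
 * context, and EBUSY if it was already in flight and the worker had to
 * be interrupted (the callback may still be invoked).
 */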
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

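/*
 * Tear down the context: wake the worker threads so they can drain any
 * remaining work and exit, join them, then release the descriptor and
 * the context itself.
 */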
int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
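/*
 * Worked example: a 1 GiB image with 512-byte sectors holds 2097152
 * sectors.  The first pass (17 sectors/track) would need 121 heads, so
 * the geometry is recomputed at 31 and then 63 sectors/track, settling
 * on C/H/S = 2080/16/63.
 */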
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
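/*
 * Quiesce the device for snapshot: mark the context paused (callers
 * must not submit new requests while paused), wait for the worker
 * queues to drain, then flush the backing store.
 */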
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (!blockif_empty(bc))
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
			__func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	pthread_mutex_unlock(&bc->bc_mtx);
}
#endif	/* BHYVE_SNAPSHOT */
1019