xref: /freebsd/usr.sbin/bhyve/block_if.c (revision 4d65a7c6951cea0333f1a0c1b32c38489cdfa6c5)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
297cf5a7eeSPeter Grehan 
307cf5a7eeSPeter Grehan #include <sys/param.h>
3100ef17beSBartek Rutkowski #ifndef WITHOUT_CAPSICUM
3200ef17beSBartek Rutkowski #include <sys/capsicum.h>
3300ef17beSBartek Rutkowski #endif
347cf5a7eeSPeter Grehan #include <sys/queue.h>
357cf5a7eeSPeter Grehan #include <sys/errno.h>
367cf5a7eeSPeter Grehan #include <sys/stat.h>
377cf5a7eeSPeter Grehan #include <sys/ioctl.h>
387cf5a7eeSPeter Grehan #include <sys/disk.h>
397cf5a7eeSPeter Grehan 
407cf5a7eeSPeter Grehan #include <assert.h>
41abfa3c39SMarcelo Araujo #ifndef WITHOUT_CAPSICUM
42abfa3c39SMarcelo Araujo #include <capsicum_helpers.h>
43abfa3c39SMarcelo Araujo #endif
4486bdfe15SBaptiste Daroussin #include <err.h>
457cf5a7eeSPeter Grehan #include <fcntl.h>
467cf5a7eeSPeter Grehan #include <stdio.h>
477cf5a7eeSPeter Grehan #include <stdlib.h>
487cf5a7eeSPeter Grehan #include <string.h>
497cf5a7eeSPeter Grehan #include <pthread.h>
507cf5a7eeSPeter Grehan #include <pthread_np.h>
51ae45750dSTycho Nightingale #include <signal.h>
5200ef17beSBartek Rutkowski #include <sysexits.h>
537cf5a7eeSPeter Grehan #include <unistd.h>
547cf5a7eeSPeter Grehan 
55ae45750dSTycho Nightingale #include <machine/atomic.h>
56483d953aSJohn Baldwin #include <machine/vmm_snapshot.h>
57ae45750dSTycho Nightingale 
587cf5a7eeSPeter Grehan #include "bhyverun.h"
59621b5090SJohn Baldwin #include "config.h"
60332eff95SVincenzo Maffione #include "debug.h"
61ae45750dSTycho Nightingale #include "mevent.h"
62621b5090SJohn Baldwin #include "pci_emul.h"
637cf5a7eeSPeter Grehan #include "block_if.h"
647cf5a7eeSPeter Grehan 
/* Magic value stored in bc_magic when a context is created. */
#define BLOCKIF_SIG	0xb109b109

/* Number of worker threads started for each block interface context. */
#define BLOCKIF_NUMTHR	8
/* Number of request elements preallocated in each context (bc_reqs[]). */
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
697cf5a7eeSPeter Grehan 
/* Operation carried by a queued request element (see blockif_proc()). */
enum blockop {
	BOP_READ,	/* read into the request's iovec */
	BOP_WRITE,	/* write from the request's iovec */
	BOP_FLUSH,	/* flush the backing store */
	BOP_DELETE	/* deallocate (trim) a byte range */
};
767cf5a7eeSPeter Grehan 
/* Life-cycle state of a request element. */
enum blockstat {
	BST_FREE,	/* on the free queue, not bound to a request */
	BST_BLOCK,	/* on the pending queue, held back behind another request */
	BST_PEND,	/* on the pending queue, ready for a worker thread */
	BST_BUSY,	/* on the busy queue, claimed by a worker thread */
	BST_DONE	/* processed by blockif_proc(), awaiting blockif_complete() */
};
847cf5a7eeSPeter Grehan 
857cf5a7eeSPeter Grehan struct blockif_elem {
867cf5a7eeSPeter Grehan 	TAILQ_ENTRY(blockif_elem) be_link;
877cf5a7eeSPeter Grehan 	struct blockif_req  *be_req;
887cf5a7eeSPeter Grehan 	enum blockop	     be_op;
897cf5a7eeSPeter Grehan 	enum blockstat	     be_status;
90ae45750dSTycho Nightingale 	pthread_t            be_tid;
9179565afeSAlexander Motin 	off_t		     be_block;
927cf5a7eeSPeter Grehan };
937cf5a7eeSPeter Grehan 
947cf5a7eeSPeter Grehan struct blockif_ctxt {
953dddf73eSMark Johnston 	unsigned int		bc_magic;
967cf5a7eeSPeter Grehan 	int			bc_fd;
972d678f1fSAlexander Motin 	int			bc_ischr;
98bb1524afSAlexander Motin 	int			bc_isgeom;
990b9d25c9SAlexander Motin 	int			bc_candelete;
1007cf5a7eeSPeter Grehan 	int			bc_rdonly;
1017cf5a7eeSPeter Grehan 	off_t			bc_size;
1027cf5a7eeSPeter Grehan 	int			bc_sectsz;
10394682383SAlexander Motin 	int			bc_psectsz;
10494682383SAlexander Motin 	int			bc_psectoff;
10579565afeSAlexander Motin 	int			bc_closing;
106483d953aSJohn Baldwin 	int			bc_paused;
10779565afeSAlexander Motin 	pthread_t		bc_btid[BLOCKIF_NUMTHR];
1087cf5a7eeSPeter Grehan 	pthread_mutex_t		bc_mtx;
1097cf5a7eeSPeter Grehan 	pthread_cond_t		bc_cond;
110483d953aSJohn Baldwin 	pthread_cond_t		bc_work_done_cond;
1118794846aSJohn Baldwin 	blockif_resize_cb	*bc_resize_cb;
1128794846aSJohn Baldwin 	void			*bc_resize_cb_arg;
1138794846aSJohn Baldwin 	struct mevent		*bc_resize_event;
1147cf5a7eeSPeter Grehan 
115ae45750dSTycho Nightingale 	/* Request elements and free/pending/busy queues */
1167cf5a7eeSPeter Grehan 	TAILQ_HEAD(, blockif_elem) bc_freeq;
117ae45750dSTycho Nightingale 	TAILQ_HEAD(, blockif_elem) bc_pendq;
118ae45750dSTycho Nightingale 	TAILQ_HEAD(, blockif_elem) bc_busyq;
1197cf5a7eeSPeter Grehan 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
120480bef94SCorvin Köhne 	int			bc_bootindex;
1217cf5a7eeSPeter Grehan };
1227cf5a7eeSPeter Grehan 
/* Ensures blockif_init() runs once, on the first blockif_open(). */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Node of the singly linked list drained by blockif_sigcont_handler().
 * The handler clears bse_pending and signals bse_cond under bse_mtx.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

/* Head of the list above; popped with an atomic compare-and-set. */
static struct blockif_sig_elem *blockif_bse_head;
133ae45750dSTycho Nightingale 
1347cf5a7eeSPeter Grehan static int
blockif_enqueue(struct blockif_ctxt * bc,struct blockif_req * breq,enum blockop op)1357cf5a7eeSPeter Grehan blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
1367cf5a7eeSPeter Grehan 		enum blockop op)
1377cf5a7eeSPeter Grehan {
13879565afeSAlexander Motin 	struct blockif_elem *be, *tbe;
13979565afeSAlexander Motin 	off_t off;
14079565afeSAlexander Motin 	int i;
1417cf5a7eeSPeter Grehan 
1427cf5a7eeSPeter Grehan 	be = TAILQ_FIRST(&bc->bc_freeq);
1437cf5a7eeSPeter Grehan 	assert(be != NULL);
1447cf5a7eeSPeter Grehan 	assert(be->be_status == BST_FREE);
1457cf5a7eeSPeter Grehan 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
1467cf5a7eeSPeter Grehan 	be->be_req = breq;
1477cf5a7eeSPeter Grehan 	be->be_op = op;
14879565afeSAlexander Motin 	switch (op) {
14979565afeSAlexander Motin 	case BOP_READ:
15079565afeSAlexander Motin 	case BOP_WRITE:
15179565afeSAlexander Motin 	case BOP_DELETE:
15279565afeSAlexander Motin 		off = breq->br_offset;
15379565afeSAlexander Motin 		for (i = 0; i < breq->br_iovcnt; i++)
15479565afeSAlexander Motin 			off += breq->br_iov[i].iov_len;
15579565afeSAlexander Motin 		break;
15679565afeSAlexander Motin 	default:
15779565afeSAlexander Motin 		off = OFF_MAX;
15879565afeSAlexander Motin 	}
15979565afeSAlexander Motin 	be->be_block = off;
16079565afeSAlexander Motin 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
16179565afeSAlexander Motin 		if (tbe->be_block == breq->br_offset)
16279565afeSAlexander Motin 			break;
16379565afeSAlexander Motin 	}
16479565afeSAlexander Motin 	if (tbe == NULL) {
16579565afeSAlexander Motin 		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
16679565afeSAlexander Motin 			if (tbe->be_block == breq->br_offset)
16779565afeSAlexander Motin 				break;
16879565afeSAlexander Motin 		}
16979565afeSAlexander Motin 	}
17079565afeSAlexander Motin 	if (tbe == NULL)
17179565afeSAlexander Motin 		be->be_status = BST_PEND;
17279565afeSAlexander Motin 	else
17379565afeSAlexander Motin 		be->be_status = BST_BLOCK;
174ae45750dSTycho Nightingale 	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
17579565afeSAlexander Motin 	return (be->be_status == BST_PEND);
1767cf5a7eeSPeter Grehan }
1777cf5a7eeSPeter Grehan 
1787cf5a7eeSPeter Grehan static int
blockif_dequeue(struct blockif_ctxt * bc,pthread_t t,struct blockif_elem ** bep)17979565afeSAlexander Motin blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
1807cf5a7eeSPeter Grehan {
1817cf5a7eeSPeter Grehan 	struct blockif_elem *be;
1827cf5a7eeSPeter Grehan 
18379565afeSAlexander Motin 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
18479565afeSAlexander Motin 		if (be->be_status == BST_PEND)
18579565afeSAlexander Motin 			break;
18679565afeSAlexander Motin 		assert(be->be_status == BST_BLOCK);
18779565afeSAlexander Motin 	}
18879565afeSAlexander Motin 	if (be == NULL)
18979565afeSAlexander Motin 		return (0);
190ae45750dSTycho Nightingale 	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
191ae45750dSTycho Nightingale 	be->be_status = BST_BUSY;
19279565afeSAlexander Motin 	be->be_tid = t;
193ae45750dSTycho Nightingale 	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
194ae45750dSTycho Nightingale 	*bep = be;
19579565afeSAlexander Motin 	return (1);
196ae45750dSTycho Nightingale }
197ae45750dSTycho Nightingale 
198ae45750dSTycho Nightingale static void
blockif_complete(struct blockif_ctxt * bc,struct blockif_elem * be)199ae45750dSTycho Nightingale blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
200ae45750dSTycho Nightingale {
20179565afeSAlexander Motin 	struct blockif_elem *tbe;
202ae45750dSTycho Nightingale 
20379565afeSAlexander Motin 	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
204ae45750dSTycho Nightingale 		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
20579565afeSAlexander Motin 	else
20679565afeSAlexander Motin 		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
20779565afeSAlexander Motin 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
20879565afeSAlexander Motin 		if (tbe->be_req->br_offset == be->be_block)
20979565afeSAlexander Motin 			tbe->be_status = BST_PEND;
21079565afeSAlexander Motin 	}
211ae45750dSTycho Nightingale 	be->be_tid = 0;
2127cf5a7eeSPeter Grehan 	be->be_status = BST_FREE;
2137cf5a7eeSPeter Grehan 	be->be_req = NULL;
2147cf5a7eeSPeter Grehan 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
2157cf5a7eeSPeter Grehan }
2167cf5a7eeSPeter Grehan 
217483d953aSJohn Baldwin static int
blockif_flush_bc(struct blockif_ctxt * bc)218483d953aSJohn Baldwin blockif_flush_bc(struct blockif_ctxt *bc)
219483d953aSJohn Baldwin {
220483d953aSJohn Baldwin 	if (bc->bc_ischr) {
221483d953aSJohn Baldwin 		if (ioctl(bc->bc_fd, DIOCGFLUSH))
222483d953aSJohn Baldwin 			return (errno);
223483d953aSJohn Baldwin 	} else if (fsync(bc->bc_fd))
224483d953aSJohn Baldwin 		return (errno);
225483d953aSJohn Baldwin 
226483d953aSJohn Baldwin 	return (0);
227483d953aSJohn Baldwin }
228483d953aSJohn Baldwin 
2297cf5a7eeSPeter Grehan static void
blockif_proc(struct blockif_ctxt * bc,struct blockif_elem * be,uint8_t * buf)230bb1524afSAlexander Motin blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
2317cf5a7eeSPeter Grehan {
23246f5c828SMark Johnston 	struct spacectl_range range;
2337cf5a7eeSPeter Grehan 	struct blockif_req *br;
2340b9d25c9SAlexander Motin 	off_t arg[2];
23546f5c828SMark Johnston 	ssize_t n;
23646f5c828SMark Johnston 	size_t clen, len, off, boff, voff;
237bb1524afSAlexander Motin 	int i, err;
2387cf5a7eeSPeter Grehan 
2397cf5a7eeSPeter Grehan 	br = be->be_req;
24046f5c828SMark Johnston 	assert(br->br_resid >= 0);
24146f5c828SMark Johnston 
242bb1524afSAlexander Motin 	if (br->br_iovcnt <= 1)
243bb1524afSAlexander Motin 		buf = NULL;
2447cf5a7eeSPeter Grehan 	err = 0;
2457cf5a7eeSPeter Grehan 	switch (be->be_op) {
2467cf5a7eeSPeter Grehan 	case BOP_READ:
247bb1524afSAlexander Motin 		if (buf == NULL) {
24846f5c828SMark Johnston 			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
249bb1524afSAlexander Motin 			    br->br_offset)) < 0)
250bb1524afSAlexander Motin 				err = errno;
251bb1524afSAlexander Motin 			else
25246f5c828SMark Johnston 				br->br_resid -= n;
253bb1524afSAlexander Motin 			break;
254bb1524afSAlexander Motin 		}
255bb1524afSAlexander Motin 		i = 0;
256bb1524afSAlexander Motin 		off = voff = 0;
257bb1524afSAlexander Motin 		while (br->br_resid > 0) {
258bb1524afSAlexander Motin 			len = MIN(br->br_resid, MAXPHYS);
25946f5c828SMark Johnston 			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
26046f5c828SMark Johnston 			if (n < 0) {
2617cf5a7eeSPeter Grehan 				err = errno;
2627cf5a7eeSPeter Grehan 				break;
263bb1524afSAlexander Motin 			}
26446f5c828SMark Johnston 			len = (size_t)n;
265bb1524afSAlexander Motin 			boff = 0;
266bb1524afSAlexander Motin 			do {
267bb1524afSAlexander Motin 				clen = MIN(len - boff, br->br_iov[i].iov_len -
268bb1524afSAlexander Motin 				    voff);
26903f7ccabSMark Johnston 				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
270bb1524afSAlexander Motin 				    buf + boff, clen);
271bb1524afSAlexander Motin 				if (clen < br->br_iov[i].iov_len - voff)
272bb1524afSAlexander Motin 					voff += clen;
273bb1524afSAlexander Motin 				else {
274bb1524afSAlexander Motin 					i++;
275bb1524afSAlexander Motin 					voff = 0;
276bb1524afSAlexander Motin 				}
277bb1524afSAlexander Motin 				boff += clen;
278bb1524afSAlexander Motin 			} while (boff < len);
279bb1524afSAlexander Motin 			off += len;
280bb1524afSAlexander Motin 			br->br_resid -= len;
281bb1524afSAlexander Motin 		}
282bb1524afSAlexander Motin 		break;
2837cf5a7eeSPeter Grehan 	case BOP_WRITE:
284bb1524afSAlexander Motin 		if (bc->bc_rdonly) {
2857cf5a7eeSPeter Grehan 			err = EROFS;
286bb1524afSAlexander Motin 			break;
287bb1524afSAlexander Motin 		}
288bb1524afSAlexander Motin 		if (buf == NULL) {
28946f5c828SMark Johnston 			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
290bb1524afSAlexander Motin 			    br->br_offset)) < 0)
2917cf5a7eeSPeter Grehan 				err = errno;
292bb1524afSAlexander Motin 			else
29346f5c828SMark Johnston 				br->br_resid -= n;
294bb1524afSAlexander Motin 			break;
295bb1524afSAlexander Motin 		}
296bb1524afSAlexander Motin 		i = 0;
297bb1524afSAlexander Motin 		off = voff = 0;
298bb1524afSAlexander Motin 		while (br->br_resid > 0) {
299bb1524afSAlexander Motin 			len = MIN(br->br_resid, MAXPHYS);
300bb1524afSAlexander Motin 			boff = 0;
301bb1524afSAlexander Motin 			do {
302bb1524afSAlexander Motin 				clen = MIN(len - boff, br->br_iov[i].iov_len -
303bb1524afSAlexander Motin 				    voff);
304bb1524afSAlexander Motin 				memcpy(buf + boff,
30503f7ccabSMark Johnston 				    (uint8_t *)br->br_iov[i].iov_base + voff,
30603f7ccabSMark Johnston 				    clen);
307bb1524afSAlexander Motin 				if (clen < br->br_iov[i].iov_len - voff)
308bb1524afSAlexander Motin 					voff += clen;
309bb1524afSAlexander Motin 				else {
310bb1524afSAlexander Motin 					i++;
311bb1524afSAlexander Motin 					voff = 0;
312bb1524afSAlexander Motin 				}
313bb1524afSAlexander Motin 				boff += clen;
314bb1524afSAlexander Motin 			} while (boff < len);
31546f5c828SMark Johnston 
31646f5c828SMark Johnston 			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
31746f5c828SMark Johnston 			if (n < 0) {
318bb1524afSAlexander Motin 				err = errno;
319bb1524afSAlexander Motin 				break;
320bb1524afSAlexander Motin 			}
32146f5c828SMark Johnston 			off += n;
32246f5c828SMark Johnston 			br->br_resid -= n;
323bb1524afSAlexander Motin 		}
3247cf5a7eeSPeter Grehan 		break;
3257cf5a7eeSPeter Grehan 	case BOP_FLUSH:
326483d953aSJohn Baldwin 		err = blockif_flush_bc(bc);
3277cf5a7eeSPeter Grehan 		break;
3280b9d25c9SAlexander Motin 	case BOP_DELETE:
3290b9d25c9SAlexander Motin 		if (!bc->bc_candelete)
3300b9d25c9SAlexander Motin 			err = EOPNOTSUPP;
3317e8e5539SAlexander Motin 		else if (bc->bc_rdonly)
3327e8e5539SAlexander Motin 			err = EROFS;
3330b9d25c9SAlexander Motin 		else if (bc->bc_ischr) {
3340b9d25c9SAlexander Motin 			arg[0] = br->br_offset;
335bb1524afSAlexander Motin 			arg[1] = br->br_resid;
3360b9d25c9SAlexander Motin 			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
3370b9d25c9SAlexander Motin 				err = errno;
338bb1524afSAlexander Motin 			else
339bb1524afSAlexander Motin 				br->br_resid = 0;
3403676512bSKa Ho Ng 		} else {
3413676512bSKa Ho Ng 			range.r_offset = br->br_offset;
3423676512bSKa Ho Ng 			range.r_len = br->br_resid;
3433676512bSKa Ho Ng 
3443676512bSKa Ho Ng 			while (range.r_len > 0) {
3453676512bSKa Ho Ng 				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
3463676512bSKa Ho Ng 				    &range, 0, &range) != 0) {
3473676512bSKa Ho Ng 					err = errno;
3483676512bSKa Ho Ng 					break;
3493676512bSKa Ho Ng 				}
3503676512bSKa Ho Ng 			}
3513676512bSKa Ho Ng 			if (err == 0)
3523676512bSKa Ho Ng 				br->br_resid = 0;
3533676512bSKa Ho Ng 		}
3540b9d25c9SAlexander Motin 		break;
3557cf5a7eeSPeter Grehan 	default:
3567cf5a7eeSPeter Grehan 		err = EINVAL;
3577cf5a7eeSPeter Grehan 		break;
3587cf5a7eeSPeter Grehan 	}
3597cf5a7eeSPeter Grehan 
360ae45750dSTycho Nightingale 	be->be_status = BST_DONE;
361ae45750dSTycho Nightingale 
3627cf5a7eeSPeter Grehan 	(*br->br_callback)(br, err);
3637cf5a7eeSPeter Grehan }
3647cf5a7eeSPeter Grehan 
365cd9618bdSVitaliy Gusev static inline bool
blockif_empty(const struct blockif_ctxt * bc)366cd9618bdSVitaliy Gusev blockif_empty(const struct blockif_ctxt *bc)
367cd9618bdSVitaliy Gusev {
368cd9618bdSVitaliy Gusev 	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
369cd9618bdSVitaliy Gusev }
370cd9618bdSVitaliy Gusev 
3717cf5a7eeSPeter Grehan static void *
blockif_thr(void * arg)3727cf5a7eeSPeter Grehan blockif_thr(void *arg)
3737cf5a7eeSPeter Grehan {
3747cf5a7eeSPeter Grehan 	struct blockif_ctxt *bc;
375ae45750dSTycho Nightingale 	struct blockif_elem *be;
37679565afeSAlexander Motin 	pthread_t t;
377bb1524afSAlexander Motin 	uint8_t *buf;
3787cf5a7eeSPeter Grehan 
3797cf5a7eeSPeter Grehan 	bc = arg;
380bb1524afSAlexander Motin 	if (bc->bc_isgeom)
381bb1524afSAlexander Motin 		buf = malloc(MAXPHYS);
382bb1524afSAlexander Motin 	else
383bb1524afSAlexander Motin 		buf = NULL;
38479565afeSAlexander Motin 	t = pthread_self();
3857cf5a7eeSPeter Grehan 
3867cf5a7eeSPeter Grehan 	pthread_mutex_lock(&bc->bc_mtx);
38779565afeSAlexander Motin 	for (;;) {
388cd9618bdSVitaliy Gusev 		while (blockif_dequeue(bc, t, &be)) {
3897cf5a7eeSPeter Grehan 			pthread_mutex_unlock(&bc->bc_mtx);
390bb1524afSAlexander Motin 			blockif_proc(bc, be, buf);
3917cf5a7eeSPeter Grehan 			pthread_mutex_lock(&bc->bc_mtx);
392ae45750dSTycho Nightingale 			blockif_complete(bc, be);
3937cf5a7eeSPeter Grehan 		}
394483d953aSJohn Baldwin 
395cd9618bdSVitaliy Gusev 		/* If none to work, notify the main thread */
396cd9618bdSVitaliy Gusev 		if (blockif_empty(bc))
397483d953aSJohn Baldwin 			pthread_cond_broadcast(&bc->bc_work_done_cond);
398483d953aSJohn Baldwin 
39979565afeSAlexander Motin 		/* Check ctxt status here to see if exit requested */
40079565afeSAlexander Motin 		if (bc->bc_closing)
40179565afeSAlexander Motin 			break;
402483d953aSJohn Baldwin 
4037cf5a7eeSPeter Grehan 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
40479565afeSAlexander Motin 	}
4057cf5a7eeSPeter Grehan 	pthread_mutex_unlock(&bc->bc_mtx);
4067cf5a7eeSPeter Grehan 
407bb1524afSAlexander Motin 	if (buf)
408bb1524afSAlexander Motin 		free(buf);
4097cf5a7eeSPeter Grehan 	pthread_exit(NULL);
4107cf5a7eeSPeter Grehan 	return (NULL);
4117cf5a7eeSPeter Grehan }
4127cf5a7eeSPeter Grehan 
413ae45750dSTycho Nightingale static void
blockif_sigcont_handler(int signal __unused,enum ev_type type __unused,void * arg __unused)41498d920d9SMark Johnston blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
41598d920d9SMark Johnston     void *arg __unused)
416ae45750dSTycho Nightingale {
417ae45750dSTycho Nightingale 	struct blockif_sig_elem *bse;
418ae45750dSTycho Nightingale 
419ae45750dSTycho Nightingale 	for (;;) {
420ae45750dSTycho Nightingale 		/*
421ae45750dSTycho Nightingale 		 * Process the entire list even if not intended for
422ae45750dSTycho Nightingale 		 * this thread.
423ae45750dSTycho Nightingale 		 */
424ae45750dSTycho Nightingale 		do {
425ae45750dSTycho Nightingale 			bse = blockif_bse_head;
426ae45750dSTycho Nightingale 			if (bse == NULL)
427ae45750dSTycho Nightingale 				return;
428ae45750dSTycho Nightingale 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
429ae45750dSTycho Nightingale 					    (uintptr_t)bse,
430ae45750dSTycho Nightingale 					    (uintptr_t)bse->bse_next));
431ae45750dSTycho Nightingale 
432ae45750dSTycho Nightingale 		pthread_mutex_lock(&bse->bse_mtx);
433ae45750dSTycho Nightingale 		bse->bse_pending = 0;
434ae45750dSTycho Nightingale 		pthread_cond_signal(&bse->bse_cond);
435ae45750dSTycho Nightingale 		pthread_mutex_unlock(&bse->bse_mtx);
436ae45750dSTycho Nightingale 	}
437ae45750dSTycho Nightingale }
438ae45750dSTycho Nightingale 
439ae45750dSTycho Nightingale static void
blockif_init(void)440ae45750dSTycho Nightingale blockif_init(void)
441ae45750dSTycho Nightingale {
442ae45750dSTycho Nightingale 	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
443ae45750dSTycho Nightingale 	(void) signal(SIGCONT, SIG_IGN);
444ae45750dSTycho Nightingale }
445ae45750dSTycho Nightingale 
446621b5090SJohn Baldwin int
blockif_legacy_config(nvlist_t * nvl,const char * opts)447621b5090SJohn Baldwin blockif_legacy_config(nvlist_t *nvl, const char *opts)
448621b5090SJohn Baldwin {
449621b5090SJohn Baldwin 	char *cp, *path;
450621b5090SJohn Baldwin 
451621b5090SJohn Baldwin 	if (opts == NULL)
452621b5090SJohn Baldwin 		return (0);
453621b5090SJohn Baldwin 
454621b5090SJohn Baldwin 	cp = strchr(opts, ',');
455621b5090SJohn Baldwin 	if (cp == NULL) {
456621b5090SJohn Baldwin 		set_config_value_node(nvl, "path", opts);
457621b5090SJohn Baldwin 		return (0);
458621b5090SJohn Baldwin 	}
459621b5090SJohn Baldwin 	path = strndup(opts, cp - opts);
460621b5090SJohn Baldwin 	set_config_value_node(nvl, "path", path);
461621b5090SJohn Baldwin 	free(path);
462621b5090SJohn Baldwin 	return (pci_parse_legacy_config(nvl, cp + 1));
463621b5090SJohn Baldwin }
464621b5090SJohn Baldwin 
465480bef94SCorvin Köhne int
blockif_add_boot_device(struct pci_devinst * const pi,struct blockif_ctxt * const bc)466480bef94SCorvin Köhne blockif_add_boot_device(struct pci_devinst *const pi,
467480bef94SCorvin Köhne     struct blockif_ctxt *const bc)
468480bef94SCorvin Köhne {
469480bef94SCorvin Köhne 	if (bc->bc_bootindex < 0)
470480bef94SCorvin Köhne 		return (0);
471480bef94SCorvin Köhne 
472480bef94SCorvin Köhne 	return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
473480bef94SCorvin Köhne }
474480bef94SCorvin Köhne 
4757cf5a7eeSPeter Grehan struct blockif_ctxt *
blockif_open(nvlist_t * nvl,const char * ident)476621b5090SJohn Baldwin blockif_open(nvlist_t *nvl, const char *ident)
4777cf5a7eeSPeter Grehan {
4787cf5a7eeSPeter Grehan 	char tname[MAXCOMLEN + 1];
479bb1524afSAlexander Motin 	char name[MAXPATHLEN];
480480bef94SCorvin Köhne 	const char *path, *pssval, *ssval, *bootindex_val;
481621b5090SJohn Baldwin 	char *cp;
4827cf5a7eeSPeter Grehan 	struct blockif_ctxt *bc;
4837cf5a7eeSPeter Grehan 	struct stat sbuf;
4840b9d25c9SAlexander Motin 	struct diocgattr_arg arg;
48594682383SAlexander Motin 	off_t size, psectsz, psectoff;
4867cf5a7eeSPeter Grehan 	int extra, fd, i, sectsz;
487621b5090SJohn Baldwin 	int ro, candelete, geom, ssopt, pssopt;
48822769bbeSAllan Jude 	int nodelete;
489480bef94SCorvin Köhne 	int bootindex;
49022769bbeSAllan Jude 
49100ef17beSBartek Rutkowski #ifndef WITHOUT_CAPSICUM
49200ef17beSBartek Rutkowski 	cap_rights_t rights;
49308cb63a1SRobert Wing 	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
49400ef17beSBartek Rutkowski #endif
4957cf5a7eeSPeter Grehan 
496ae45750dSTycho Nightingale 	pthread_once(&blockif_once, blockif_init);
497ae45750dSTycho Nightingale 
4984e43c1e8SNeel Natu 	fd = -1;
499621b5090SJohn Baldwin 	extra = 0;
5004e43c1e8SNeel Natu 	ssopt = 0;
5017cf5a7eeSPeter Grehan 	ro = 0;
50222769bbeSAllan Jude 	nodelete = 0;
503480bef94SCorvin Köhne 	bootindex = -1;
5047cf5a7eeSPeter Grehan 
505621b5090SJohn Baldwin 	if (get_config_bool_node_default(nvl, "nocache", false))
506621b5090SJohn Baldwin 		extra |= O_DIRECT;
507621b5090SJohn Baldwin 	if (get_config_bool_node_default(nvl, "nodelete", false))
50822769bbeSAllan Jude 		nodelete = 1;
509621b5090SJohn Baldwin 	if (get_config_bool_node_default(nvl, "sync", false) ||
510621b5090SJohn Baldwin 	    get_config_bool_node_default(nvl, "direct", false))
511621b5090SJohn Baldwin 		extra |= O_SYNC;
512621b5090SJohn Baldwin 	if (get_config_bool_node_default(nvl, "ro", false))
5137cf5a7eeSPeter Grehan 		ro = 1;
514621b5090SJohn Baldwin 	ssval = get_config_value_node(nvl, "sectorsize");
515621b5090SJohn Baldwin 	if (ssval != NULL) {
516621b5090SJohn Baldwin 		ssopt = strtol(ssval, &cp, 10);
517621b5090SJohn Baldwin 		if (cp == ssval) {
518621b5090SJohn Baldwin 			EPRINTLN("Invalid sector size \"%s\"", ssval);
519621b5090SJohn Baldwin 			goto err;
520621b5090SJohn Baldwin 		}
521621b5090SJohn Baldwin 		if (*cp == '\0') {
5224e43c1e8SNeel Natu 			pssopt = ssopt;
523621b5090SJohn Baldwin 		} else if (*cp == '/') {
524621b5090SJohn Baldwin 			pssval = cp + 1;
525621b5090SJohn Baldwin 			pssopt = strtol(pssval, &cp, 10);
526621b5090SJohn Baldwin 			if (cp == pssval || *cp != '\0') {
527621b5090SJohn Baldwin 				EPRINTLN("Invalid sector size \"%s\"", ssval);
528621b5090SJohn Baldwin 				goto err;
529621b5090SJohn Baldwin 			}
530621b5090SJohn Baldwin 		} else {
531621b5090SJohn Baldwin 			EPRINTLN("Invalid sector size \"%s\"", ssval);
5324e43c1e8SNeel Natu 			goto err;
5334e43c1e8SNeel Natu 		}
5347cf5a7eeSPeter Grehan 	}
5357cf5a7eeSPeter Grehan 
536480bef94SCorvin Köhne 	bootindex_val = get_config_value_node(nvl, "bootindex");
537480bef94SCorvin Köhne 	if (bootindex_val != NULL) {
538480bef94SCorvin Köhne 		bootindex = atoi(bootindex_val);
539480bef94SCorvin Köhne 	}
540480bef94SCorvin Köhne 
541621b5090SJohn Baldwin 	path = get_config_value_node(nvl, "path");
542621b5090SJohn Baldwin 	if (path == NULL) {
543621b5090SJohn Baldwin 		EPRINTLN("Missing \"path\" for block device.");
544621b5090SJohn Baldwin 		goto err;
545621b5090SJohn Baldwin 	}
5467cf5a7eeSPeter Grehan 
547621b5090SJohn Baldwin 	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
5487cf5a7eeSPeter Grehan 	if (fd < 0 && !ro) {
5497cf5a7eeSPeter Grehan 		/* Attempt a r/w fail with a r/o open */
550621b5090SJohn Baldwin 		fd = open(path, O_RDONLY | extra);
5517cf5a7eeSPeter Grehan 		ro = 1;
5527cf5a7eeSPeter Grehan 	}
5537cf5a7eeSPeter Grehan 
5547cf5a7eeSPeter Grehan 	if (fd < 0) {
555621b5090SJohn Baldwin 		warn("Could not open backing file: %s", path);
5564e43c1e8SNeel Natu 		goto err;
5577cf5a7eeSPeter Grehan 	}
5587cf5a7eeSPeter Grehan 
5597cf5a7eeSPeter Grehan         if (fstat(fd, &sbuf) < 0) {
560621b5090SJohn Baldwin 		warn("Could not stat backing file %s", path);
5614e43c1e8SNeel Natu 		goto err;
5627cf5a7eeSPeter Grehan         }
5637cf5a7eeSPeter Grehan 
56400ef17beSBartek Rutkowski #ifndef WITHOUT_CAPSICUM
56500ef17beSBartek Rutkowski 	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
566d8c1d7b6SChuck Tuffli 	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
56700ef17beSBartek Rutkowski 	if (ro)
56800ef17beSBartek Rutkowski 		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
56900ef17beSBartek Rutkowski 
570abfa3c39SMarcelo Araujo 	if (caph_rights_limit(fd, &rights) == -1)
57100ef17beSBartek Rutkowski 		errx(EX_OSERR, "Unable to apply rights for sandbox");
57200ef17beSBartek Rutkowski #endif
57300ef17beSBartek Rutkowski 
5747cf5a7eeSPeter Grehan         /*
5757cf5a7eeSPeter Grehan 	 * Deal with raw devices
5767cf5a7eeSPeter Grehan 	 */
5777cf5a7eeSPeter Grehan         size = sbuf.st_size;
5787cf5a7eeSPeter Grehan 	sectsz = DEV_BSIZE;
57994682383SAlexander Motin 	psectsz = psectoff = 0;
580bb1524afSAlexander Motin 	candelete = geom = 0;
5817cf5a7eeSPeter Grehan 	if (S_ISCHR(sbuf.st_mode)) {
5827cf5a7eeSPeter Grehan 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
5837cf5a7eeSPeter Grehan 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
5847cf5a7eeSPeter Grehan 			perror("Could not fetch dev blk/sector size");
5854e43c1e8SNeel Natu 			goto err;
5867cf5a7eeSPeter Grehan 		}
5877cf5a7eeSPeter Grehan 		assert(size != 0);
5887cf5a7eeSPeter Grehan 		assert(sectsz != 0);
58994682383SAlexander Motin 		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
59094682383SAlexander Motin 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
5910b9d25c9SAlexander Motin 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
5920b9d25c9SAlexander Motin 		arg.len = sizeof(arg.value.i);
59322769bbeSAllan Jude 		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
5940b9d25c9SAlexander Motin 			candelete = arg.value.i;
595bb1524afSAlexander Motin 		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
596bb1524afSAlexander Motin 			geom = 1;
5973676512bSKa Ho Ng 	} else {
59894682383SAlexander Motin 		psectsz = sbuf.st_blksize;
5993676512bSKa Ho Ng 		/* Avoid fallback implementation */
6003676512bSKa Ho Ng 		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
6013676512bSKa Ho Ng 	}
6027cf5a7eeSPeter Grehan 
60300ef17beSBartek Rutkowski #ifndef WITHOUT_CAPSICUM
604abfa3c39SMarcelo Araujo 	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
60500ef17beSBartek Rutkowski 		errx(EX_OSERR, "Unable to apply rights for sandbox");
60600ef17beSBartek Rutkowski #endif
60700ef17beSBartek Rutkowski 
6084e43c1e8SNeel Natu 	if (ssopt != 0) {
6094e43c1e8SNeel Natu 		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
6104e43c1e8SNeel Natu 		    ssopt > pssopt) {
611332eff95SVincenzo Maffione 			EPRINTLN("Invalid sector size %d/%d",
6124e43c1e8SNeel Natu 			    ssopt, pssopt);
6134e43c1e8SNeel Natu 			goto err;
6144e43c1e8SNeel Natu 		}
6154e43c1e8SNeel Natu 
6164e43c1e8SNeel Natu 		/*
6174e43c1e8SNeel Natu 		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
6184e43c1e8SNeel Natu 		 * size be a multiple of the device's sector size.
6194e43c1e8SNeel Natu 		 *
6204e43c1e8SNeel Natu 		 * Validate that the emulated sector size complies with this
6214e43c1e8SNeel Natu 		 * requirement.
6224e43c1e8SNeel Natu 		 */
6234e43c1e8SNeel Natu 		if (S_ISCHR(sbuf.st_mode)) {
6244e43c1e8SNeel Natu 			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
625332eff95SVincenzo Maffione 				EPRINTLN("Sector size %d incompatible "
626332eff95SVincenzo Maffione 				    "with underlying device sector size %d",
6274e43c1e8SNeel Natu 				    ssopt, sectsz);
6284e43c1e8SNeel Natu 				goto err;
6294e43c1e8SNeel Natu 			}
6304e43c1e8SNeel Natu 		}
6314e43c1e8SNeel Natu 
6324e43c1e8SNeel Natu 		sectsz = ssopt;
6334e43c1e8SNeel Natu 		psectsz = pssopt;
6344e43c1e8SNeel Natu 		psectoff = 0;
6354e43c1e8SNeel Natu 	}
6364e43c1e8SNeel Natu 
637994f858aSXin LI 	bc = calloc(1, sizeof(struct blockif_ctxt));
6387cf5a7eeSPeter Grehan 	if (bc == NULL) {
6394e43c1e8SNeel Natu 		perror("calloc");
6404e43c1e8SNeel Natu 		goto err;
6417cf5a7eeSPeter Grehan 	}
6427cf5a7eeSPeter Grehan 
6437cf5a7eeSPeter Grehan 	bc->bc_magic = BLOCKIF_SIG;
6447cf5a7eeSPeter Grehan 	bc->bc_fd = fd;
6452d678f1fSAlexander Motin 	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
646bb1524afSAlexander Motin 	bc->bc_isgeom = geom;
6470b9d25c9SAlexander Motin 	bc->bc_candelete = candelete;
6481aba8e7fSNeel Natu 	bc->bc_rdonly = ro;
6497cf5a7eeSPeter Grehan 	bc->bc_size = size;
6507cf5a7eeSPeter Grehan 	bc->bc_sectsz = sectsz;
65194682383SAlexander Motin 	bc->bc_psectsz = psectsz;
65294682383SAlexander Motin 	bc->bc_psectoff = psectoff;
6537cf5a7eeSPeter Grehan 	pthread_mutex_init(&bc->bc_mtx, NULL);
6547cf5a7eeSPeter Grehan 	pthread_cond_init(&bc->bc_cond, NULL);
655483d953aSJohn Baldwin 	bc->bc_paused = 0;
656483d953aSJohn Baldwin 	pthread_cond_init(&bc->bc_work_done_cond, NULL);
6577cf5a7eeSPeter Grehan 	TAILQ_INIT(&bc->bc_freeq);
658ae45750dSTycho Nightingale 	TAILQ_INIT(&bc->bc_pendq);
659ae45750dSTycho Nightingale 	TAILQ_INIT(&bc->bc_busyq);
660480bef94SCorvin Köhne 	bc->bc_bootindex = bootindex;
6617cf5a7eeSPeter Grehan 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
6627cf5a7eeSPeter Grehan 		bc->bc_reqs[i].be_status = BST_FREE;
6637cf5a7eeSPeter Grehan 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
6647cf5a7eeSPeter Grehan 	}
6657cf5a7eeSPeter Grehan 
66679565afeSAlexander Motin 	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
66779565afeSAlexander Motin 		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
66879565afeSAlexander Motin 		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
66979565afeSAlexander Motin 		pthread_set_name_np(bc->bc_btid[i], tname);
67079565afeSAlexander Motin 	}
6717cf5a7eeSPeter Grehan 
6727cf5a7eeSPeter Grehan 	return (bc);
6734e43c1e8SNeel Natu err:
6744e43c1e8SNeel Natu 	if (fd >= 0)
6754e43c1e8SNeel Natu 		close(fd);
6764e43c1e8SNeel Natu 	return (NULL);
6777cf5a7eeSPeter Grehan }
6787cf5a7eeSPeter Grehan 
6798794846aSJohn Baldwin static void
blockif_resized(int fd,enum ev_type type __unused,void * arg)68098d920d9SMark Johnston blockif_resized(int fd, enum ev_type type __unused, void *arg)
6818794846aSJohn Baldwin {
6828794846aSJohn Baldwin 	struct blockif_ctxt *bc;
6838794846aSJohn Baldwin 	struct stat sb;
684ae9ea22eSRobert Wing 	off_t mediasize;
6858794846aSJohn Baldwin 
6868794846aSJohn Baldwin 	if (fstat(fd, &sb) != 0)
6878794846aSJohn Baldwin 		return;
6888794846aSJohn Baldwin 
689ae9ea22eSRobert Wing 	if (S_ISCHR(sb.st_mode)) {
690ae9ea22eSRobert Wing 		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
691ae9ea22eSRobert Wing 			EPRINTLN("blockif_resized: get mediasize failed: %s",
692ae9ea22eSRobert Wing 			    strerror(errno));
693ae9ea22eSRobert Wing 			return;
694ae9ea22eSRobert Wing 		}
695ae9ea22eSRobert Wing 	} else
696ae9ea22eSRobert Wing 		mediasize = sb.st_size;
697ae9ea22eSRobert Wing 
6988794846aSJohn Baldwin 	bc = arg;
6998794846aSJohn Baldwin 	pthread_mutex_lock(&bc->bc_mtx);
700ae9ea22eSRobert Wing 	if (mediasize != bc->bc_size) {
701ae9ea22eSRobert Wing 		bc->bc_size = mediasize;
7028794846aSJohn Baldwin 		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
7038794846aSJohn Baldwin 	}
7048794846aSJohn Baldwin 	pthread_mutex_unlock(&bc->bc_mtx);
7058794846aSJohn Baldwin }
7068794846aSJohn Baldwin 
7078794846aSJohn Baldwin int
blockif_register_resize_callback(struct blockif_ctxt * bc,blockif_resize_cb * cb,void * cb_arg)7088794846aSJohn Baldwin blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
7098794846aSJohn Baldwin     void *cb_arg)
7108794846aSJohn Baldwin {
7118794846aSJohn Baldwin 	struct stat sb;
7128794846aSJohn Baldwin 	int err;
7138794846aSJohn Baldwin 
7148794846aSJohn Baldwin 	if (cb == NULL)
7158794846aSJohn Baldwin 		return (EINVAL);
7168794846aSJohn Baldwin 
7175b966d78SMark Johnston 	err = 0;
7185b966d78SMark Johnston 
7198794846aSJohn Baldwin 	pthread_mutex_lock(&bc->bc_mtx);
7208794846aSJohn Baldwin 	if (bc->bc_resize_cb != NULL) {
7218794846aSJohn Baldwin 		err = EBUSY;
7228794846aSJohn Baldwin 		goto out;
7238794846aSJohn Baldwin 	}
7248794846aSJohn Baldwin 
7258794846aSJohn Baldwin 	assert(bc->bc_closing == 0);
7268794846aSJohn Baldwin 
7278794846aSJohn Baldwin 	if (fstat(bc->bc_fd, &sb) != 0) {
7288794846aSJohn Baldwin 		err = errno;
7298794846aSJohn Baldwin 		goto out;
7308794846aSJohn Baldwin 	}
7318794846aSJohn Baldwin 
7328794846aSJohn Baldwin 	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
7338794846aSJohn Baldwin 	    EVFF_ATTRIB, blockif_resized, bc);
7348794846aSJohn Baldwin 	if (bc->bc_resize_event == NULL) {
7358794846aSJohn Baldwin 		err = ENXIO;
7368794846aSJohn Baldwin 		goto out;
7378794846aSJohn Baldwin 	}
7388794846aSJohn Baldwin 
7398794846aSJohn Baldwin 	bc->bc_resize_cb = cb;
7408794846aSJohn Baldwin 	bc->bc_resize_cb_arg = cb_arg;
7418794846aSJohn Baldwin out:
7428794846aSJohn Baldwin 	pthread_mutex_unlock(&bc->bc_mtx);
7438794846aSJohn Baldwin 
7448794846aSJohn Baldwin 	return (err);
7458794846aSJohn Baldwin }
7468794846aSJohn Baldwin 
7477cf5a7eeSPeter Grehan static int
blockif_request(struct blockif_ctxt * bc,struct blockif_req * breq,enum blockop op)7487cf5a7eeSPeter Grehan blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
7497cf5a7eeSPeter Grehan 		enum blockop op)
7507cf5a7eeSPeter Grehan {
7517cf5a7eeSPeter Grehan 	int err;
7527cf5a7eeSPeter Grehan 
7537cf5a7eeSPeter Grehan 	err = 0;
7547cf5a7eeSPeter Grehan 
7557cf5a7eeSPeter Grehan 	pthread_mutex_lock(&bc->bc_mtx);
756cd9618bdSVitaliy Gusev 	assert(!bc->bc_paused);
75779565afeSAlexander Motin 	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
7587cf5a7eeSPeter Grehan 		/*
7597cf5a7eeSPeter Grehan 		 * Enqueue and inform the block i/o thread
7607cf5a7eeSPeter Grehan 		 * that there is work available
7617cf5a7eeSPeter Grehan 		 */
76279565afeSAlexander Motin 		if (blockif_enqueue(bc, breq, op))
7637cf5a7eeSPeter Grehan 			pthread_cond_signal(&bc->bc_cond);
7647cf5a7eeSPeter Grehan 	} else {
7657cf5a7eeSPeter Grehan 		/*
7667cf5a7eeSPeter Grehan 		 * Callers are not allowed to enqueue more than
7677cf5a7eeSPeter Grehan 		 * the specified blockif queue limit. Return an
7687cf5a7eeSPeter Grehan 		 * error to indicate that the queue length has been
7697cf5a7eeSPeter Grehan 		 * exceeded.
7707cf5a7eeSPeter Grehan 		 */
7717cf5a7eeSPeter Grehan 		err = E2BIG;
7727cf5a7eeSPeter Grehan 	}
7737cf5a7eeSPeter Grehan 	pthread_mutex_unlock(&bc->bc_mtx);
7747cf5a7eeSPeter Grehan 
7757cf5a7eeSPeter Grehan 	return (err);
7767cf5a7eeSPeter Grehan }
7777cf5a7eeSPeter Grehan 
7787cf5a7eeSPeter Grehan int
blockif_read(struct blockif_ctxt * bc,struct blockif_req * breq)7797cf5a7eeSPeter Grehan blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
7807cf5a7eeSPeter Grehan {
7817cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
7827cf5a7eeSPeter Grehan 	return (blockif_request(bc, breq, BOP_READ));
7837cf5a7eeSPeter Grehan }
7847cf5a7eeSPeter Grehan 
7857cf5a7eeSPeter Grehan int
blockif_write(struct blockif_ctxt * bc,struct blockif_req * breq)7867cf5a7eeSPeter Grehan blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
7877cf5a7eeSPeter Grehan {
7887cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
7897cf5a7eeSPeter Grehan 	return (blockif_request(bc, breq, BOP_WRITE));
7907cf5a7eeSPeter Grehan }
7917cf5a7eeSPeter Grehan 
7927cf5a7eeSPeter Grehan int
blockif_flush(struct blockif_ctxt * bc,struct blockif_req * breq)7937cf5a7eeSPeter Grehan blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
7947cf5a7eeSPeter Grehan {
7957cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
7967cf5a7eeSPeter Grehan 	return (blockif_request(bc, breq, BOP_FLUSH));
7977cf5a7eeSPeter Grehan }
7987cf5a7eeSPeter Grehan 
7997cf5a7eeSPeter Grehan int
blockif_delete(struct blockif_ctxt * bc,struct blockif_req * breq)8000b9d25c9SAlexander Motin blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
8010b9d25c9SAlexander Motin {
8020b9d25c9SAlexander Motin 	assert(bc->bc_magic == BLOCKIF_SIG);
8030b9d25c9SAlexander Motin 	return (blockif_request(bc, breq, BOP_DELETE));
8040b9d25c9SAlexander Motin }
8050b9d25c9SAlexander Motin 
8060b9d25c9SAlexander Motin int
blockif_cancel(struct blockif_ctxt * bc,struct blockif_req * breq)8077cf5a7eeSPeter Grehan blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
8087cf5a7eeSPeter Grehan {
8093ef05c46STycho Nightingale 	struct blockif_elem *be;
8107cf5a7eeSPeter Grehan 
8117cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
8123ef05c46STycho Nightingale 
8133ef05c46STycho Nightingale 	pthread_mutex_lock(&bc->bc_mtx);
814483d953aSJohn Baldwin 	/* XXX: not waiting while paused */
815483d953aSJohn Baldwin 
816ae45750dSTycho Nightingale 	/*
817ae45750dSTycho Nightingale 	 * Check pending requests.
818ae45750dSTycho Nightingale 	 */
819ae45750dSTycho Nightingale 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
8203ef05c46STycho Nightingale 		if (be->be_req == breq)
8213ef05c46STycho Nightingale 			break;
8223ef05c46STycho Nightingale 	}
823ae45750dSTycho Nightingale 	if (be != NULL) {
824ae45750dSTycho Nightingale 		/*
825ae45750dSTycho Nightingale 		 * Found it.
826ae45750dSTycho Nightingale 		 */
82779565afeSAlexander Motin 		blockif_complete(bc, be);
8283ef05c46STycho Nightingale 		pthread_mutex_unlock(&bc->bc_mtx);
8293ef05c46STycho Nightingale 
8303ef05c46STycho Nightingale 		return (0);
8317cf5a7eeSPeter Grehan 	}
8327cf5a7eeSPeter Grehan 
833ae45750dSTycho Nightingale 	/*
834ae45750dSTycho Nightingale 	 * Check in-flight requests.
835ae45750dSTycho Nightingale 	 */
836ae45750dSTycho Nightingale 	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
837ae45750dSTycho Nightingale 		if (be->be_req == breq)
838ae45750dSTycho Nightingale 			break;
839ae45750dSTycho Nightingale 	}
840ae45750dSTycho Nightingale 	if (be == NULL) {
841ae45750dSTycho Nightingale 		/*
842ae45750dSTycho Nightingale 		 * Didn't find it.
843ae45750dSTycho Nightingale 		 */
844ae45750dSTycho Nightingale 		pthread_mutex_unlock(&bc->bc_mtx);
845ae45750dSTycho Nightingale 		return (EINVAL);
846ae45750dSTycho Nightingale 	}
847ae45750dSTycho Nightingale 
848ae45750dSTycho Nightingale 	/*
849ae45750dSTycho Nightingale 	 * Interrupt the processing thread to force it return
850ae45750dSTycho Nightingale 	 * prematurely via it's normal callback path.
851ae45750dSTycho Nightingale 	 */
852ae45750dSTycho Nightingale 	while (be->be_status == BST_BUSY) {
853ae45750dSTycho Nightingale 		struct blockif_sig_elem bse, *old_head;
854ae45750dSTycho Nightingale 
855ae45750dSTycho Nightingale 		pthread_mutex_init(&bse.bse_mtx, NULL);
856ae45750dSTycho Nightingale 		pthread_cond_init(&bse.bse_cond, NULL);
857ae45750dSTycho Nightingale 
858ae45750dSTycho Nightingale 		bse.bse_pending = 1;
859ae45750dSTycho Nightingale 
860ae45750dSTycho Nightingale 		do {
861ae45750dSTycho Nightingale 			old_head = blockif_bse_head;
862ae45750dSTycho Nightingale 			bse.bse_next = old_head;
863ae45750dSTycho Nightingale 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
864ae45750dSTycho Nightingale 					    (uintptr_t)old_head,
865ae45750dSTycho Nightingale 					    (uintptr_t)&bse));
866ae45750dSTycho Nightingale 
867ae45750dSTycho Nightingale 		pthread_kill(be->be_tid, SIGCONT);
868ae45750dSTycho Nightingale 
869ae45750dSTycho Nightingale 		pthread_mutex_lock(&bse.bse_mtx);
870ae45750dSTycho Nightingale 		while (bse.bse_pending)
871ae45750dSTycho Nightingale 			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
872ae45750dSTycho Nightingale 		pthread_mutex_unlock(&bse.bse_mtx);
873ae45750dSTycho Nightingale 	}
874ae45750dSTycho Nightingale 
875ae45750dSTycho Nightingale 	pthread_mutex_unlock(&bc->bc_mtx);
876ae45750dSTycho Nightingale 
877ae45750dSTycho Nightingale 	/*
878ae45750dSTycho Nightingale 	 * The processing thread has been interrupted.  Since it's not
879ae45750dSTycho Nightingale 	 * clear if the callback has been invoked yet, return EBUSY.
880ae45750dSTycho Nightingale 	 */
881ae45750dSTycho Nightingale 	return (EBUSY);
882ae45750dSTycho Nightingale }
883ae45750dSTycho Nightingale 
8847cf5a7eeSPeter Grehan int
blockif_close(struct blockif_ctxt * bc)8857cf5a7eeSPeter Grehan blockif_close(struct blockif_ctxt *bc)
8867cf5a7eeSPeter Grehan {
8877cf5a7eeSPeter Grehan 	void *jval;
888305b5a14SMarcelo Araujo 	int i;
8897cf5a7eeSPeter Grehan 
8907cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
8917cf5a7eeSPeter Grehan 
8927cf5a7eeSPeter Grehan 	/*
8937cf5a7eeSPeter Grehan 	 * Stop the block i/o thread
8947cf5a7eeSPeter Grehan 	 */
895f2e62de7SAlexander Motin 	pthread_mutex_lock(&bc->bc_mtx);
8967cf5a7eeSPeter Grehan 	bc->bc_closing = 1;
8978794846aSJohn Baldwin 	if (bc->bc_resize_event != NULL)
8988794846aSJohn Baldwin 		mevent_disable(bc->bc_resize_event);
899f2e62de7SAlexander Motin 	pthread_mutex_unlock(&bc->bc_mtx);
90079565afeSAlexander Motin 	pthread_cond_broadcast(&bc->bc_cond);
90179565afeSAlexander Motin 	for (i = 0; i < BLOCKIF_NUMTHR; i++)
90279565afeSAlexander Motin 		pthread_join(bc->bc_btid[i], &jval);
9037cf5a7eeSPeter Grehan 
9047cf5a7eeSPeter Grehan 	/* XXX Cancel queued i/o's ??? */
9057cf5a7eeSPeter Grehan 
9067cf5a7eeSPeter Grehan 	/*
9077cf5a7eeSPeter Grehan 	 * Release resources
9087cf5a7eeSPeter Grehan 	 */
9097cf5a7eeSPeter Grehan 	bc->bc_magic = 0;
9107cf5a7eeSPeter Grehan 	close(bc->bc_fd);
9117cf5a7eeSPeter Grehan 	free(bc);
9127cf5a7eeSPeter Grehan 
9137cf5a7eeSPeter Grehan 	return (0);
9147cf5a7eeSPeter Grehan }
9157cf5a7eeSPeter Grehan 
9167cf5a7eeSPeter Grehan /*
917c4813fadSPeter Grehan  * Return virtual C/H/S values for a given block. Use the algorithm
918c4813fadSPeter Grehan  * outlined in the VHD specification to calculate values.
919c4813fadSPeter Grehan  */
920c4813fadSPeter Grehan void
blockif_chs(struct blockif_ctxt * bc,uint16_t * c,uint8_t * h,uint8_t * s)921c4813fadSPeter Grehan blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
922c4813fadSPeter Grehan {
923c4813fadSPeter Grehan 	off_t sectors;		/* total sectors of the block dev */
924c4813fadSPeter Grehan 	off_t hcyl;		/* cylinders times heads */
925c4813fadSPeter Grehan 	uint16_t secpt;		/* sectors per track */
926c4813fadSPeter Grehan 	uint8_t heads;
927c4813fadSPeter Grehan 
928c4813fadSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
929c4813fadSPeter Grehan 
930c4813fadSPeter Grehan 	sectors = bc->bc_size / bc->bc_sectsz;
931c4813fadSPeter Grehan 
932c4813fadSPeter Grehan 	/* Clamp the size to the largest possible with CHS */
933ed721684SMark Johnston 	if (sectors > 65535L * 16 * 255)
934ed721684SMark Johnston 		sectors = 65535L * 16 * 255;
935c4813fadSPeter Grehan 
936ed721684SMark Johnston 	if (sectors >= 65536L * 16 * 63) {
937c4813fadSPeter Grehan 		secpt = 255;
938c4813fadSPeter Grehan 		heads = 16;
939c4813fadSPeter Grehan 		hcyl = sectors / secpt;
940c4813fadSPeter Grehan 	} else {
941c4813fadSPeter Grehan 		secpt = 17;
942c4813fadSPeter Grehan 		hcyl = sectors / secpt;
943c4813fadSPeter Grehan 		heads = (hcyl + 1023) / 1024;
944c4813fadSPeter Grehan 
945c4813fadSPeter Grehan 		if (heads < 4)
946c4813fadSPeter Grehan 			heads = 4;
947c4813fadSPeter Grehan 
948c4813fadSPeter Grehan 		if (hcyl >= (heads * 1024) || heads > 16) {
949c4813fadSPeter Grehan 			secpt = 31;
950c4813fadSPeter Grehan 			heads = 16;
951c4813fadSPeter Grehan 			hcyl = sectors / secpt;
952c4813fadSPeter Grehan 		}
953c4813fadSPeter Grehan 		if (hcyl >= (heads * 1024)) {
954c4813fadSPeter Grehan 			secpt = 63;
955c4813fadSPeter Grehan 			heads = 16;
956c4813fadSPeter Grehan 			hcyl = sectors / secpt;
957c4813fadSPeter Grehan 		}
958c4813fadSPeter Grehan 	}
959c4813fadSPeter Grehan 
960c4813fadSPeter Grehan 	*c = hcyl / heads;
961c4813fadSPeter Grehan 	*h = heads;
962c4813fadSPeter Grehan 	*s = secpt;
963c4813fadSPeter Grehan }
964c4813fadSPeter Grehan 
965c4813fadSPeter Grehan /*
9667cf5a7eeSPeter Grehan  * Accessors
9677cf5a7eeSPeter Grehan  */
9687cf5a7eeSPeter Grehan off_t
blockif_size(struct blockif_ctxt * bc)9697cf5a7eeSPeter Grehan blockif_size(struct blockif_ctxt *bc)
9707cf5a7eeSPeter Grehan {
9717cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
9727cf5a7eeSPeter Grehan 	return (bc->bc_size);
9737cf5a7eeSPeter Grehan }
9747cf5a7eeSPeter Grehan 
9757cf5a7eeSPeter Grehan int
blockif_sectsz(struct blockif_ctxt * bc)9767cf5a7eeSPeter Grehan blockif_sectsz(struct blockif_ctxt *bc)
9777cf5a7eeSPeter Grehan {
9787cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
9797cf5a7eeSPeter Grehan 	return (bc->bc_sectsz);
9807cf5a7eeSPeter Grehan }
9817cf5a7eeSPeter Grehan 
98294682383SAlexander Motin void
blockif_psectsz(struct blockif_ctxt * bc,int * size,int * off)98394682383SAlexander Motin blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
98494682383SAlexander Motin {
98594682383SAlexander Motin 	assert(bc->bc_magic == BLOCKIF_SIG);
98694682383SAlexander Motin 	*size = bc->bc_psectsz;
98794682383SAlexander Motin 	*off = bc->bc_psectoff;
98894682383SAlexander Motin }
98994682383SAlexander Motin 
9907cf5a7eeSPeter Grehan int
blockif_queuesz(struct blockif_ctxt * bc)9917cf5a7eeSPeter Grehan blockif_queuesz(struct blockif_ctxt *bc)
9927cf5a7eeSPeter Grehan {
9937cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
99448a9d8f2STycho Nightingale 	return (BLOCKIF_MAXREQ - 1);
9957cf5a7eeSPeter Grehan }
9967cf5a7eeSPeter Grehan 
9977cf5a7eeSPeter Grehan int
blockif_is_ro(struct blockif_ctxt * bc)9987cf5a7eeSPeter Grehan blockif_is_ro(struct blockif_ctxt *bc)
9997cf5a7eeSPeter Grehan {
10007cf5a7eeSPeter Grehan 	assert(bc->bc_magic == BLOCKIF_SIG);
10017cf5a7eeSPeter Grehan 	return (bc->bc_rdonly);
10027cf5a7eeSPeter Grehan }
10030b9d25c9SAlexander Motin 
10040b9d25c9SAlexander Motin int
blockif_candelete(struct blockif_ctxt * bc)10050b9d25c9SAlexander Motin blockif_candelete(struct blockif_ctxt *bc)
10060b9d25c9SAlexander Motin {
10070b9d25c9SAlexander Motin 	assert(bc->bc_magic == BLOCKIF_SIG);
10080b9d25c9SAlexander Motin 	return (bc->bc_candelete);
10090b9d25c9SAlexander Motin }
1010483d953aSJohn Baldwin 
1011483d953aSJohn Baldwin #ifdef BHYVE_SNAPSHOT
1012483d953aSJohn Baldwin void
blockif_pause(struct blockif_ctxt * bc)1013483d953aSJohn Baldwin blockif_pause(struct blockif_ctxt *bc)
1014483d953aSJohn Baldwin {
1015483d953aSJohn Baldwin 	assert(bc != NULL);
1016483d953aSJohn Baldwin 	assert(bc->bc_magic == BLOCKIF_SIG);
1017483d953aSJohn Baldwin 
1018483d953aSJohn Baldwin 	pthread_mutex_lock(&bc->bc_mtx);
1019483d953aSJohn Baldwin 	bc->bc_paused = 1;
1020483d953aSJohn Baldwin 
1021483d953aSJohn Baldwin 	/* The interface is paused. Wait for workers to finish their work */
1022cd9618bdSVitaliy Gusev 	while (!blockif_empty(bc))
1023483d953aSJohn Baldwin 		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
1024483d953aSJohn Baldwin 	pthread_mutex_unlock(&bc->bc_mtx);
1025483d953aSJohn Baldwin 
10265c0a0312SVitaliy Gusev 	if (!bc->bc_rdonly && blockif_flush_bc(bc))
1027*b0936440SJohn Baldwin 		EPRINTLN("%s: [WARN] failed to flush backing file.",
1028483d953aSJohn Baldwin 			__func__);
1029483d953aSJohn Baldwin }
1030483d953aSJohn Baldwin 
1031483d953aSJohn Baldwin void
blockif_resume(struct blockif_ctxt * bc)1032483d953aSJohn Baldwin blockif_resume(struct blockif_ctxt *bc)
1033483d953aSJohn Baldwin {
1034483d953aSJohn Baldwin 	assert(bc != NULL);
1035483d953aSJohn Baldwin 	assert(bc->bc_magic == BLOCKIF_SIG);
1036483d953aSJohn Baldwin 
1037483d953aSJohn Baldwin 	pthread_mutex_lock(&bc->bc_mtx);
1038483d953aSJohn Baldwin 	bc->bc_paused = 0;
1039483d953aSJohn Baldwin 	pthread_mutex_unlock(&bc->bc_mtx);
1040483d953aSJohn Baldwin }
1041cd9618bdSVitaliy Gusev #endif	/* BHYVE_SNAPSHOT */
1042