xref: /illumos-gate/usr/src/cmd/bhyve/block_if.c (revision 4c87aefe8930bd07275b8dd2e96ea5f24d93a52e)
1bf21cd93STycho Nightingale /*-
2*4c87aefeSPatrick Mooney  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3*4c87aefeSPatrick Mooney  *
4bf21cd93STycho Nightingale  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
5bf21cd93STycho Nightingale  * All rights reserved.
6bf21cd93STycho Nightingale  *
7bf21cd93STycho Nightingale  * Redistribution and use in source and binary forms, with or without
8bf21cd93STycho Nightingale  * modification, are permitted provided that the following conditions
9bf21cd93STycho Nightingale  * are met:
10bf21cd93STycho Nightingale  * 1. Redistributions of source code must retain the above copyright
11bf21cd93STycho Nightingale  *    notice, this list of conditions and the following disclaimer.
12bf21cd93STycho Nightingale  * 2. Redistributions in binary form must reproduce the above copyright
13bf21cd93STycho Nightingale  *    notice, this list of conditions and the following disclaimer in the
14bf21cd93STycho Nightingale  *    documentation and/or other materials provided with the distribution.
15bf21cd93STycho Nightingale  *
16bf21cd93STycho Nightingale  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17bf21cd93STycho Nightingale  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18bf21cd93STycho Nightingale  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19bf21cd93STycho Nightingale  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20bf21cd93STycho Nightingale  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21bf21cd93STycho Nightingale  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22bf21cd93STycho Nightingale  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23bf21cd93STycho Nightingale  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24bf21cd93STycho Nightingale  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25bf21cd93STycho Nightingale  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26bf21cd93STycho Nightingale  * SUCH DAMAGE.
27bf21cd93STycho Nightingale  *
28*4c87aefeSPatrick Mooney  * $FreeBSD$
29*4c87aefeSPatrick Mooney  */
30*4c87aefeSPatrick Mooney 
31*4c87aefeSPatrick Mooney /*
32*4c87aefeSPatrick Mooney  * Copyright 2018 Joyent, Inc.
33bf21cd93STycho Nightingale  */
34bf21cd93STycho Nightingale 
35bf21cd93STycho Nightingale #include <sys/cdefs.h>
36*4c87aefeSPatrick Mooney __FBSDID("$FreeBSD$");
37bf21cd93STycho Nightingale 
38bf21cd93STycho Nightingale #include <sys/param.h>
39*4c87aefeSPatrick Mooney #ifndef WITHOUT_CAPSICUM
40*4c87aefeSPatrick Mooney #include <sys/capsicum.h>
41*4c87aefeSPatrick Mooney #endif
42bf21cd93STycho Nightingale #include <sys/queue.h>
43bf21cd93STycho Nightingale #include <sys/errno.h>
44bf21cd93STycho Nightingale #include <sys/stat.h>
45bf21cd93STycho Nightingale #include <sys/ioctl.h>
46bf21cd93STycho Nightingale #include <sys/disk.h>
47*4c87aefeSPatrick Mooney #include <sys/limits.h>
48*4c87aefeSPatrick Mooney #include <sys/uio.h>
49*4c87aefeSPatrick Mooney #ifndef __FreeBSD__
50*4c87aefeSPatrick Mooney #include <sys/dkio.h>
51*4c87aefeSPatrick Mooney #endif
52bf21cd93STycho Nightingale 
53bf21cd93STycho Nightingale #include <assert.h>
54*4c87aefeSPatrick Mooney #ifndef WITHOUT_CAPSICUM
55*4c87aefeSPatrick Mooney #include <capsicum_helpers.h>
56*4c87aefeSPatrick Mooney #endif
57*4c87aefeSPatrick Mooney #include <err.h>
58bf21cd93STycho Nightingale #include <fcntl.h>
59bf21cd93STycho Nightingale #include <stdio.h>
60bf21cd93STycho Nightingale #include <stdlib.h>
61bf21cd93STycho Nightingale #include <string.h>
62bf21cd93STycho Nightingale #include <pthread.h>
63bf21cd93STycho Nightingale #include <pthread_np.h>
64bf21cd93STycho Nightingale #include <signal.h>
65*4c87aefeSPatrick Mooney #include <sysexits.h>
66bf21cd93STycho Nightingale #include <unistd.h>
67bf21cd93STycho Nightingale 
68bf21cd93STycho Nightingale #include <machine/atomic.h>
69bf21cd93STycho Nightingale 
70bf21cd93STycho Nightingale #include "bhyverun.h"
71bf21cd93STycho Nightingale #ifdef	__FreeBSD__
72bf21cd93STycho Nightingale #include "mevent.h"
73bf21cd93STycho Nightingale #endif
74bf21cd93STycho Nightingale #include "block_if.h"
75bf21cd93STycho Nightingale 
/*
 * Signature constant for a block interface context.
 * NOTE(review): presumably stored in bc_magic and checked by the public
 * entry points; the assignment is outside this view - confirm.
 */
#define BLOCKIF_SIG	0xb109b109

/* Number of worker threads per context; this sizes the bc_btid[] array. */
#ifndef __FreeBSD__
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR	16
#else
#define BLOCKIF_NUMTHR	8
#endif
/* Size of the per-context request element pool (bc_reqs[]). */
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
85bf21cd93STycho Nightingale 
/*
 * Operation carried by a queued request element (be_op).  The worker
 * dispatches on this value when it processes the element.
 */
enum blockop {
	BOP_READ,		/* read into the request's iovecs */
	BOP_WRITE,		/* write from the request's iovecs */
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,		/* only defined when not built for FreeBSD */
#endif
	BOP_FLUSH,		/* flush the backing descriptor */
	BOP_DELETE		/* discard a byte range */
};
95bf21cd93STycho Nightingale 
/*
 * Life-cycle state of a request element (be_status).
 */
enum blockstat {
	BST_FREE,	/* on the free queue, not associated with a request */
	BST_BLOCK,	/* on the pending queue, held back behind another request */
	BST_PEND,	/* on the pending queue, eligible to be dequeued */
	BST_BUSY,	/* on the busy queue, claimed by a worker thread */
	BST_DONE	/* processing finished, not yet returned to the free queue */
};
103bf21cd93STycho Nightingale 
104bf21cd93STycho Nightingale struct blockif_elem {
105bf21cd93STycho Nightingale 	TAILQ_ENTRY(blockif_elem) be_link;
106bf21cd93STycho Nightingale 	struct blockif_req  *be_req;
107bf21cd93STycho Nightingale 	enum blockop	     be_op;
108bf21cd93STycho Nightingale 	enum blockstat	     be_status;
109bf21cd93STycho Nightingale 	pthread_t            be_tid;
110*4c87aefeSPatrick Mooney 	off_t		     be_block;
111bf21cd93STycho Nightingale };
112bf21cd93STycho Nightingale 
#ifndef __FreeBSD__
/*
 * Mechanism found usable for controlling write caching on the backing
 * store; recorded in the context as bc_wce.
 */
enum blockif_wce {
	WCE_NONE = 0,	/* no controllable write cache detected */
	WCE_IOCTL,	/* cache is toggled through the DKIOCSETWCE ioctl */
	WCE_FCNTL	/* O_DSYNC can be set on the descriptor via fcntl() */
};
#endif
120*4c87aefeSPatrick Mooney 
121bf21cd93STycho Nightingale struct blockif_ctxt {
122bf21cd93STycho Nightingale 	int			bc_magic;
123bf21cd93STycho Nightingale 	int			bc_fd;
124*4c87aefeSPatrick Mooney 	int			bc_ischr;
125*4c87aefeSPatrick Mooney 	int			bc_isgeom;
126*4c87aefeSPatrick Mooney 	int			bc_candelete;
127*4c87aefeSPatrick Mooney #ifndef __FreeBSD__
128*4c87aefeSPatrick Mooney 	enum blockif_wce	bc_wce;
129*4c87aefeSPatrick Mooney #endif
130bf21cd93STycho Nightingale 	int			bc_rdonly;
131bf21cd93STycho Nightingale 	off_t			bc_size;
132bf21cd93STycho Nightingale 	int			bc_sectsz;
133*4c87aefeSPatrick Mooney 	int			bc_psectsz;
134*4c87aefeSPatrick Mooney 	int			bc_psectoff;
135*4c87aefeSPatrick Mooney 	int			bc_closing;
136*4c87aefeSPatrick Mooney 	pthread_t		bc_btid[BLOCKIF_NUMTHR];
137bf21cd93STycho Nightingale 	pthread_mutex_t		bc_mtx;
138bf21cd93STycho Nightingale 	pthread_cond_t		bc_cond;
139bf21cd93STycho Nightingale 
140bf21cd93STycho Nightingale 	/* Request elements and free/pending/busy queues */
141bf21cd93STycho Nightingale 	TAILQ_HEAD(, blockif_elem) bc_freeq;
142bf21cd93STycho Nightingale 	TAILQ_HEAD(, blockif_elem) bc_pendq;
143bf21cd93STycho Nightingale 	TAILQ_HEAD(, blockif_elem) bc_busyq;
144bf21cd93STycho Nightingale 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
145bf21cd93STycho Nightingale };
146bf21cd93STycho Nightingale 
/* Guards the one-time blockif_init() call made through pthread_once(). */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * A waiter parked until the SIGCONT handler runs.  The handler pops each
 * element off the list, clears bse_pending under bse_mtx and signals
 * bse_cond.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;	/* protects bse_pending */
	pthread_cond_t			bse_cond;	/* signalled once bse_pending is cleared */
	int				bse_pending;	/* reset to 0 by the SIGCONT handler */
	struct blockif_sig_elem		*bse_next;	/* next element on the list */
};

/* Head of the waiter list; the handler updates it with atomic_cmpset_ptr(). */
static struct blockif_sig_elem *blockif_bse_head;
157bf21cd93STycho Nightingale 
158bf21cd93STycho Nightingale static int
159bf21cd93STycho Nightingale blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
160bf21cd93STycho Nightingale 		enum blockop op)
161bf21cd93STycho Nightingale {
162*4c87aefeSPatrick Mooney 	struct blockif_elem *be, *tbe;
163*4c87aefeSPatrick Mooney 	off_t off;
164*4c87aefeSPatrick Mooney 	int i;
165bf21cd93STycho Nightingale 
166bf21cd93STycho Nightingale 	be = TAILQ_FIRST(&bc->bc_freeq);
167bf21cd93STycho Nightingale 	assert(be != NULL);
168bf21cd93STycho Nightingale 	assert(be->be_status == BST_FREE);
169bf21cd93STycho Nightingale 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
170bf21cd93STycho Nightingale 	be->be_req = breq;
171bf21cd93STycho Nightingale 	be->be_op = op;
172*4c87aefeSPatrick Mooney 	switch (op) {
173*4c87aefeSPatrick Mooney 	case BOP_READ:
174*4c87aefeSPatrick Mooney 	case BOP_WRITE:
175*4c87aefeSPatrick Mooney #ifndef __FreeBSD__
176*4c87aefeSPatrick Mooney 	case BOP_WRITE_SYNC:
177*4c87aefeSPatrick Mooney #endif
178*4c87aefeSPatrick Mooney 	case BOP_DELETE:
179*4c87aefeSPatrick Mooney 		off = breq->br_offset;
180*4c87aefeSPatrick Mooney 		for (i = 0; i < breq->br_iovcnt; i++)
181*4c87aefeSPatrick Mooney 			off += breq->br_iov[i].iov_len;
182*4c87aefeSPatrick Mooney 		break;
183*4c87aefeSPatrick Mooney 	default:
184*4c87aefeSPatrick Mooney 		off = OFF_MAX;
185*4c87aefeSPatrick Mooney 	}
186*4c87aefeSPatrick Mooney 	be->be_block = off;
187*4c87aefeSPatrick Mooney 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
188*4c87aefeSPatrick Mooney 		if (tbe->be_block == breq->br_offset)
189*4c87aefeSPatrick Mooney 			break;
190*4c87aefeSPatrick Mooney 	}
191*4c87aefeSPatrick Mooney 	if (tbe == NULL) {
192*4c87aefeSPatrick Mooney 		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
193*4c87aefeSPatrick Mooney 			if (tbe->be_block == breq->br_offset)
194*4c87aefeSPatrick Mooney 				break;
195*4c87aefeSPatrick Mooney 		}
196*4c87aefeSPatrick Mooney 	}
197*4c87aefeSPatrick Mooney 	if (tbe == NULL)
198*4c87aefeSPatrick Mooney 		be->be_status = BST_PEND;
199*4c87aefeSPatrick Mooney 	else
200*4c87aefeSPatrick Mooney 		be->be_status = BST_BLOCK;
201bf21cd93STycho Nightingale 	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
202*4c87aefeSPatrick Mooney 	return (be->be_status == BST_PEND);
203bf21cd93STycho Nightingale }
204bf21cd93STycho Nightingale 
205bf21cd93STycho Nightingale static int
206*4c87aefeSPatrick Mooney blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
207bf21cd93STycho Nightingale {
208bf21cd93STycho Nightingale 	struct blockif_elem *be;
209bf21cd93STycho Nightingale 
210*4c87aefeSPatrick Mooney 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
211*4c87aefeSPatrick Mooney 		if (be->be_status == BST_PEND)
212*4c87aefeSPatrick Mooney 			break;
213*4c87aefeSPatrick Mooney 		assert(be->be_status == BST_BLOCK);
214*4c87aefeSPatrick Mooney 	}
215*4c87aefeSPatrick Mooney 	if (be == NULL)
216*4c87aefeSPatrick Mooney 		return (0);
217bf21cd93STycho Nightingale 	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
218bf21cd93STycho Nightingale 	be->be_status = BST_BUSY;
219*4c87aefeSPatrick Mooney 	be->be_tid = t;
220bf21cd93STycho Nightingale 	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
221bf21cd93STycho Nightingale 	*bep = be;
222*4c87aefeSPatrick Mooney 	return (1);
223bf21cd93STycho Nightingale }
224bf21cd93STycho Nightingale 
225bf21cd93STycho Nightingale static void
226bf21cd93STycho Nightingale blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
227bf21cd93STycho Nightingale {
228*4c87aefeSPatrick Mooney 	struct blockif_elem *tbe;
229bf21cd93STycho Nightingale 
230*4c87aefeSPatrick Mooney 	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
231bf21cd93STycho Nightingale 		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
232*4c87aefeSPatrick Mooney 	else
233*4c87aefeSPatrick Mooney 		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
234*4c87aefeSPatrick Mooney 	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
235*4c87aefeSPatrick Mooney 		if (tbe->be_req->br_offset == be->be_block)
236*4c87aefeSPatrick Mooney 			tbe->be_status = BST_PEND;
237*4c87aefeSPatrick Mooney 	}
238bf21cd93STycho Nightingale 	be->be_tid = 0;
239bf21cd93STycho Nightingale 	be->be_status = BST_FREE;
240bf21cd93STycho Nightingale 	be->be_req = NULL;
241bf21cd93STycho Nightingale 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
242bf21cd93STycho Nightingale }
243bf21cd93STycho Nightingale 
244bf21cd93STycho Nightingale static void
245*4c87aefeSPatrick Mooney blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
246bf21cd93STycho Nightingale {
247bf21cd93STycho Nightingale 	struct blockif_req *br;
248*4c87aefeSPatrick Mooney #ifdef	__FreeBSD__
249*4c87aefeSPatrick Mooney 	off_t arg[2];
250*4c87aefeSPatrick Mooney #endif
251*4c87aefeSPatrick Mooney 	ssize_t clen, len, off, boff, voff;
252*4c87aefeSPatrick Mooney 	int i, err;
253bf21cd93STycho Nightingale 
254bf21cd93STycho Nightingale 	br = be->be_req;
255*4c87aefeSPatrick Mooney 	if (br->br_iovcnt <= 1)
256*4c87aefeSPatrick Mooney 		buf = NULL;
257bf21cd93STycho Nightingale 	err = 0;
258bf21cd93STycho Nightingale 	switch (be->be_op) {
259bf21cd93STycho Nightingale 	case BOP_READ:
260*4c87aefeSPatrick Mooney 		if (buf == NULL) {
261*4c87aefeSPatrick Mooney 			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
262*4c87aefeSPatrick Mooney 				   br->br_offset)) < 0)
263bf21cd93STycho Nightingale 				err = errno;
264*4c87aefeSPatrick Mooney 			else
265*4c87aefeSPatrick Mooney 				br->br_resid -= len;
266*4c87aefeSPatrick Mooney 			break;
267*4c87aefeSPatrick Mooney 		}
268*4c87aefeSPatrick Mooney 		i = 0;
269*4c87aefeSPatrick Mooney 		off = voff = 0;
270*4c87aefeSPatrick Mooney 		while (br->br_resid > 0) {
271*4c87aefeSPatrick Mooney 			len = MIN(br->br_resid, MAXPHYS);
272*4c87aefeSPatrick Mooney 			if (pread(bc->bc_fd, buf, len, br->br_offset +
273*4c87aefeSPatrick Mooney 			    off) < 0) {
274*4c87aefeSPatrick Mooney 				err = errno;
275*4c87aefeSPatrick Mooney 				break;
276*4c87aefeSPatrick Mooney 			}
277*4c87aefeSPatrick Mooney 			boff = 0;
278*4c87aefeSPatrick Mooney 			do {
279*4c87aefeSPatrick Mooney 				clen = MIN(len - boff, br->br_iov[i].iov_len -
280*4c87aefeSPatrick Mooney 				    voff);
281*4c87aefeSPatrick Mooney 				memcpy(br->br_iov[i].iov_base + voff,
282*4c87aefeSPatrick Mooney 				    buf + boff, clen);
283*4c87aefeSPatrick Mooney 				if (clen < br->br_iov[i].iov_len - voff)
284*4c87aefeSPatrick Mooney 					voff += clen;
285*4c87aefeSPatrick Mooney 				else {
286*4c87aefeSPatrick Mooney 					i++;
287*4c87aefeSPatrick Mooney 					voff = 0;
288*4c87aefeSPatrick Mooney 				}
289*4c87aefeSPatrick Mooney 				boff += clen;
290*4c87aefeSPatrick Mooney 			} while (boff < len);
291*4c87aefeSPatrick Mooney 			off += len;
292*4c87aefeSPatrick Mooney 			br->br_resid -= len;
293*4c87aefeSPatrick Mooney 		}
294bf21cd93STycho Nightingale 		break;
295bf21cd93STycho Nightingale 	case BOP_WRITE:
296*4c87aefeSPatrick Mooney 		if (bc->bc_rdonly) {
297bf21cd93STycho Nightingale 			err = EROFS;
298*4c87aefeSPatrick Mooney 			break;
299*4c87aefeSPatrick Mooney 		}
300*4c87aefeSPatrick Mooney 		if (buf == NULL) {
301*4c87aefeSPatrick Mooney 			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
302*4c87aefeSPatrick Mooney 				    br->br_offset)) < 0)
303*4c87aefeSPatrick Mooney 				err = errno;
304*4c87aefeSPatrick Mooney 			else
305*4c87aefeSPatrick Mooney 				br->br_resid -= len;
306*4c87aefeSPatrick Mooney 			break;
307*4c87aefeSPatrick Mooney 		}
308*4c87aefeSPatrick Mooney 		i = 0;
309*4c87aefeSPatrick Mooney 		off = voff = 0;
310*4c87aefeSPatrick Mooney 		while (br->br_resid > 0) {
311*4c87aefeSPatrick Mooney 			len = MIN(br->br_resid, MAXPHYS);
312*4c87aefeSPatrick Mooney 			boff = 0;
313*4c87aefeSPatrick Mooney 			do {
314*4c87aefeSPatrick Mooney 				clen = MIN(len - boff, br->br_iov[i].iov_len -
315*4c87aefeSPatrick Mooney 				    voff);
316*4c87aefeSPatrick Mooney 				memcpy(buf + boff,
317*4c87aefeSPatrick Mooney 				    br->br_iov[i].iov_base + voff, clen);
318*4c87aefeSPatrick Mooney 				if (clen < br->br_iov[i].iov_len - voff)
319*4c87aefeSPatrick Mooney 					voff += clen;
320*4c87aefeSPatrick Mooney 				else {
321*4c87aefeSPatrick Mooney 					i++;
322*4c87aefeSPatrick Mooney 					voff = 0;
323*4c87aefeSPatrick Mooney 				}
324*4c87aefeSPatrick Mooney 				boff += clen;
325*4c87aefeSPatrick Mooney 			} while (boff < len);
326*4c87aefeSPatrick Mooney 			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
327*4c87aefeSPatrick Mooney 			    off) < 0) {
328bf21cd93STycho Nightingale 				err = errno;
329bf21cd93STycho Nightingale 				break;
330*4c87aefeSPatrick Mooney 			}
331*4c87aefeSPatrick Mooney 			off += len;
332*4c87aefeSPatrick Mooney 			br->br_resid -= len;
333*4c87aefeSPatrick Mooney 		}
334*4c87aefeSPatrick Mooney 		break;
335bf21cd93STycho Nightingale 	case BOP_FLUSH:
336*4c87aefeSPatrick Mooney #ifdef	__FreeBSD__
337*4c87aefeSPatrick Mooney 		if (bc->bc_ischr) {
338*4c87aefeSPatrick Mooney 			if (ioctl(bc->bc_fd, DIOCGFLUSH))
339*4c87aefeSPatrick Mooney 				err = errno;
340*4c87aefeSPatrick Mooney 		} else if (fsync(bc->bc_fd))
341*4c87aefeSPatrick Mooney 			err = errno;
342*4c87aefeSPatrick Mooney #else
343*4c87aefeSPatrick Mooney 		/*
344*4c87aefeSPatrick Mooney 		 * This fsync() should be adequate to flush the cache of a file
345*4c87aefeSPatrick Mooney 		 * or device.  In VFS, the VOP_SYNC operation is converted to
346*4c87aefeSPatrick Mooney 		 * the appropriate ioctl in both sdev (for real devices) and
347*4c87aefeSPatrick Mooney 		 * zfs (for zvols).
348*4c87aefeSPatrick Mooney 		 */
349*4c87aefeSPatrick Mooney 		if (fsync(bc->bc_fd))
350*4c87aefeSPatrick Mooney 			err = errno;
351*4c87aefeSPatrick Mooney #endif
352*4c87aefeSPatrick Mooney 		break;
353*4c87aefeSPatrick Mooney 	case BOP_DELETE:
354*4c87aefeSPatrick Mooney 		if (!bc->bc_candelete)
355*4c87aefeSPatrick Mooney 			err = EOPNOTSUPP;
356*4c87aefeSPatrick Mooney 		else if (bc->bc_rdonly)
357*4c87aefeSPatrick Mooney 			err = EROFS;
358*4c87aefeSPatrick Mooney #ifdef	__FreeBSD__
359*4c87aefeSPatrick Mooney 		else if (bc->bc_ischr) {
360*4c87aefeSPatrick Mooney 			arg[0] = br->br_offset;
361*4c87aefeSPatrick Mooney 			arg[1] = br->br_resid;
362*4c87aefeSPatrick Mooney 			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
363*4c87aefeSPatrick Mooney 				err = errno;
364*4c87aefeSPatrick Mooney 			else
365*4c87aefeSPatrick Mooney 				br->br_resid = 0;
366*4c87aefeSPatrick Mooney 		}
367*4c87aefeSPatrick Mooney #endif
368*4c87aefeSPatrick Mooney 		else
369*4c87aefeSPatrick Mooney 			 err = EOPNOTSUPP;
370bf21cd93STycho Nightingale 		break;
371bf21cd93STycho Nightingale 	default:
372bf21cd93STycho Nightingale 		err = EINVAL;
373bf21cd93STycho Nightingale 		break;
374bf21cd93STycho Nightingale 	}
375bf21cd93STycho Nightingale 
376bf21cd93STycho Nightingale 	be->be_status = BST_DONE;
377bf21cd93STycho Nightingale 
378bf21cd93STycho Nightingale 	(*br->br_callback)(br, err);
379bf21cd93STycho Nightingale }
380bf21cd93STycho Nightingale 
381bf21cd93STycho Nightingale static void *
382bf21cd93STycho Nightingale blockif_thr(void *arg)
383bf21cd93STycho Nightingale {
384bf21cd93STycho Nightingale 	struct blockif_ctxt *bc;
385bf21cd93STycho Nightingale 	struct blockif_elem *be;
386*4c87aefeSPatrick Mooney 	pthread_t t;
387*4c87aefeSPatrick Mooney 	uint8_t *buf;
388bf21cd93STycho Nightingale 
389bf21cd93STycho Nightingale 	bc = arg;
390*4c87aefeSPatrick Mooney 	if (bc->bc_isgeom)
391*4c87aefeSPatrick Mooney 		buf = malloc(MAXPHYS);
392*4c87aefeSPatrick Mooney 	else
393*4c87aefeSPatrick Mooney 		buf = NULL;
394*4c87aefeSPatrick Mooney 	t = pthread_self();
395bf21cd93STycho Nightingale 
396bf21cd93STycho Nightingale 	pthread_mutex_lock(&bc->bc_mtx);
397*4c87aefeSPatrick Mooney 	for (;;) {
398*4c87aefeSPatrick Mooney 		while (blockif_dequeue(bc, t, &be)) {
399bf21cd93STycho Nightingale 			pthread_mutex_unlock(&bc->bc_mtx);
400*4c87aefeSPatrick Mooney 			blockif_proc(bc, be, buf);
401bf21cd93STycho Nightingale 			pthread_mutex_lock(&bc->bc_mtx);
402bf21cd93STycho Nightingale 			blockif_complete(bc, be);
403bf21cd93STycho Nightingale 		}
404*4c87aefeSPatrick Mooney 		/* Check ctxt status here to see if exit requested */
405*4c87aefeSPatrick Mooney 		if (bc->bc_closing)
406*4c87aefeSPatrick Mooney 			break;
407bf21cd93STycho Nightingale 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
408*4c87aefeSPatrick Mooney 	}
409bf21cd93STycho Nightingale 	pthread_mutex_unlock(&bc->bc_mtx);
410bf21cd93STycho Nightingale 
411*4c87aefeSPatrick Mooney 	if (buf)
412*4c87aefeSPatrick Mooney 		free(buf);
413bf21cd93STycho Nightingale 	pthread_exit(NULL);
414bf21cd93STycho Nightingale 	return (NULL);
415bf21cd93STycho Nightingale }
416bf21cd93STycho Nightingale 
417bf21cd93STycho Nightingale #ifdef	__FreeBSD__
418bf21cd93STycho Nightingale static void
419bf21cd93STycho Nightingale blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
420bf21cd93STycho Nightingale #else
421bf21cd93STycho Nightingale static void
422bf21cd93STycho Nightingale blockif_sigcont_handler(int signal)
423bf21cd93STycho Nightingale #endif
424bf21cd93STycho Nightingale {
425bf21cd93STycho Nightingale 	struct blockif_sig_elem *bse;
426bf21cd93STycho Nightingale 
427bf21cd93STycho Nightingale 	for (;;) {
428bf21cd93STycho Nightingale 		/*
429bf21cd93STycho Nightingale 		 * Process the entire list even if not intended for
430bf21cd93STycho Nightingale 		 * this thread.
431bf21cd93STycho Nightingale 		 */
432bf21cd93STycho Nightingale 		do {
433bf21cd93STycho Nightingale 			bse = blockif_bse_head;
434bf21cd93STycho Nightingale 			if (bse == NULL)
435bf21cd93STycho Nightingale 				return;
436bf21cd93STycho Nightingale 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
437bf21cd93STycho Nightingale 					    (uintptr_t)bse,
438bf21cd93STycho Nightingale 					    (uintptr_t)bse->bse_next));
439bf21cd93STycho Nightingale 
440bf21cd93STycho Nightingale 		pthread_mutex_lock(&bse->bse_mtx);
441bf21cd93STycho Nightingale 		bse->bse_pending = 0;
442bf21cd93STycho Nightingale 		pthread_cond_signal(&bse->bse_cond);
443bf21cd93STycho Nightingale 		pthread_mutex_unlock(&bse->bse_mtx);
444bf21cd93STycho Nightingale 	}
445bf21cd93STycho Nightingale }
446bf21cd93STycho Nightingale 
447bf21cd93STycho Nightingale static void
448bf21cd93STycho Nightingale blockif_init(void)
449bf21cd93STycho Nightingale {
450bf21cd93STycho Nightingale #ifdef	__FreeBSD__
451bf21cd93STycho Nightingale 	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
452bf21cd93STycho Nightingale 	(void) signal(SIGCONT, SIG_IGN);
453bf21cd93STycho Nightingale #else
454bf21cd93STycho Nightingale 	(void) sigset(SIGCONT, blockif_sigcont_handler);
455bf21cd93STycho Nightingale #endif
456bf21cd93STycho Nightingale }
457bf21cd93STycho Nightingale 
458bf21cd93STycho Nightingale struct blockif_ctxt *
459bf21cd93STycho Nightingale blockif_open(const char *optstr, const char *ident)
460bf21cd93STycho Nightingale {
461bf21cd93STycho Nightingale 	char tname[MAXCOMLEN + 1];
462*4c87aefeSPatrick Mooney #ifdef	__FreeBSD__
463*4c87aefeSPatrick Mooney 	char name[MAXPATHLEN];
464*4c87aefeSPatrick Mooney 	char *nopt, *xopts, *cp;
465*4c87aefeSPatrick Mooney #else
466*4c87aefeSPatrick Mooney 	char *nopt, *xopts, *cp = NULL;
467*4c87aefeSPatrick Mooney #endif
468bf21cd93STycho Nightingale 	struct blockif_ctxt *bc;
469bf21cd93STycho Nightingale 	struct stat sbuf;
470*4c87aefeSPatrick Mooney #ifdef	__FreeBSD__
471*4c87aefeSPatrick Mooney 	struct diocgattr_arg arg;
472*4c87aefeSPatrick Mooney #else
473*4c87aefeSPatrick Mooney 	enum blockif_wce wce = WCE_NONE;
474*4c87aefeSPatrick Mooney #endif
475*4c87aefeSPatrick Mooney 	off_t size, psectsz, psectoff;
476bf21cd93STycho Nightingale 	int extra, fd, i, sectsz;
477*4c87aefeSPatrick Mooney 	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
478*4c87aefeSPatrick Mooney #ifndef WITHOUT_CAPSICUM
479*4c87aefeSPatrick Mooney 	cap_rights_t rights;
480*4c87aefeSPatrick Mooney 	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
481*4c87aefeSPatrick Mooney #endif
482bf21cd93STycho Nightingale 
483bf21cd93STycho Nightingale 	pthread_once(&blockif_once, blockif_init);
484bf21cd93STycho Nightingale 
485*4c87aefeSPatrick Mooney 	fd = -1;
486*4c87aefeSPatrick Mooney 	ssopt = 0;
487bf21cd93STycho Nightingale 	nocache = 0;
488bf21cd93STycho Nightingale 	sync = 0;
489bf21cd93STycho Nightingale 	ro = 0;
490bf21cd93STycho Nightingale 
491bf21cd93STycho Nightingale 	/*
492bf21cd93STycho Nightingale 	 * The first element in the optstring is always a pathname.
493bf21cd93STycho Nightingale 	 * Optional elements follow
494bf21cd93STycho Nightingale 	 */
495*4c87aefeSPatrick Mooney 	nopt = xopts = strdup(optstr);
496*4c87aefeSPatrick Mooney 	while (xopts != NULL) {
497*4c87aefeSPatrick Mooney 		cp = strsep(&xopts, ",");
498*4c87aefeSPatrick Mooney 		if (cp == nopt)		/* file or device pathname */
499*4c87aefeSPatrick Mooney 			continue;
500*4c87aefeSPatrick Mooney 		else if (!strcmp(cp, "nocache"))
501bf21cd93STycho Nightingale 			nocache = 1;
502*4c87aefeSPatrick Mooney 		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
503bf21cd93STycho Nightingale 			sync = 1;
504*4c87aefeSPatrick Mooney 		else if (!strcmp(cp, "ro"))
505bf21cd93STycho Nightingale 			ro = 1;
506*4c87aefeSPatrick Mooney 		else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
507*4c87aefeSPatrick Mooney 			;
508*4c87aefeSPatrick Mooney 		else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
509*4c87aefeSPatrick Mooney 			pssopt = ssopt;
510*4c87aefeSPatrick Mooney 		else {
511*4c87aefeSPatrick Mooney 			fprintf(stderr, "Invalid device option \"%s\"\n", cp);
512*4c87aefeSPatrick Mooney 			goto err;
513*4c87aefeSPatrick Mooney 		}
514bf21cd93STycho Nightingale 	}
515bf21cd93STycho Nightingale 
516bf21cd93STycho Nightingale 	extra = 0;
517bf21cd93STycho Nightingale 	if (nocache)
518bf21cd93STycho Nightingale 		extra |= O_DIRECT;
519bf21cd93STycho Nightingale 	if (sync)
520bf21cd93STycho Nightingale 		extra |= O_SYNC;
521bf21cd93STycho Nightingale 
522bf21cd93STycho Nightingale 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
523bf21cd93STycho Nightingale 	if (fd < 0 && !ro) {
524bf21cd93STycho Nightingale 		/* Attempt a r/w fail with a r/o open */
525bf21cd93STycho Nightingale 		fd = open(nopt, O_RDONLY | extra);
526bf21cd93STycho Nightingale 		ro = 1;
527bf21cd93STycho Nightingale 	}
528bf21cd93STycho Nightingale 
529bf21cd93STycho Nightingale 	if (fd < 0) {
530*4c87aefeSPatrick Mooney 		warn("Could not open backing file: %s", nopt);
531*4c87aefeSPatrick Mooney 		goto err;
532bf21cd93STycho Nightingale 	}
533bf21cd93STycho Nightingale 
534bf21cd93STycho Nightingale         if (fstat(fd, &sbuf) < 0) {
535*4c87aefeSPatrick Mooney 		warn("Could not stat backing file %s", nopt);
536*4c87aefeSPatrick Mooney 		goto err;
537bf21cd93STycho Nightingale         }
538bf21cd93STycho Nightingale 
539*4c87aefeSPatrick Mooney #ifndef WITHOUT_CAPSICUM
540*4c87aefeSPatrick Mooney 	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
541*4c87aefeSPatrick Mooney 	    CAP_WRITE);
542*4c87aefeSPatrick Mooney 	if (ro)
543*4c87aefeSPatrick Mooney 		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
544*4c87aefeSPatrick Mooney 
545*4c87aefeSPatrick Mooney 	if (caph_rights_limit(fd, &rights) == -1)
546*4c87aefeSPatrick Mooney 		errx(EX_OSERR, "Unable to apply rights for sandbox");
547*4c87aefeSPatrick Mooney #endif
548*4c87aefeSPatrick Mooney 
549bf21cd93STycho Nightingale         /*
550bf21cd93STycho Nightingale 	 * Deal with raw devices
551bf21cd93STycho Nightingale 	 */
552bf21cd93STycho Nightingale         size = sbuf.st_size;
553bf21cd93STycho Nightingale 	sectsz = DEV_BSIZE;
554*4c87aefeSPatrick Mooney 	psectsz = psectoff = 0;
555*4c87aefeSPatrick Mooney 	candelete = geom = 0;
556bf21cd93STycho Nightingale #ifdef	__FreeBSD__
557bf21cd93STycho Nightingale 	if (S_ISCHR(sbuf.st_mode)) {
558bf21cd93STycho Nightingale 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
559bf21cd93STycho Nightingale 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
560bf21cd93STycho Nightingale 			perror("Could not fetch dev blk/sector size");
561*4c87aefeSPatrick Mooney 			goto err;
562bf21cd93STycho Nightingale 		}
563bf21cd93STycho Nightingale 		assert(size != 0);
564bf21cd93STycho Nightingale 		assert(sectsz != 0);
565*4c87aefeSPatrick Mooney 		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
566*4c87aefeSPatrick Mooney 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
567*4c87aefeSPatrick Mooney 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
568*4c87aefeSPatrick Mooney 		arg.len = sizeof(arg.value.i);
569*4c87aefeSPatrick Mooney 		if (ioctl(fd, DIOCGATTR, &arg) == 0)
570*4c87aefeSPatrick Mooney 			candelete = arg.value.i;
571*4c87aefeSPatrick Mooney 		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
572*4c87aefeSPatrick Mooney 			geom = 1;
573*4c87aefeSPatrick Mooney 	} else {
574*4c87aefeSPatrick Mooney 		psectsz = sbuf.st_blksize;
575*4c87aefeSPatrick Mooney 	}
576*4c87aefeSPatrick Mooney #else
577*4c87aefeSPatrick Mooney 	psectsz = sbuf.st_blksize;
578*4c87aefeSPatrick Mooney 	if (S_ISCHR(sbuf.st_mode)) {
579*4c87aefeSPatrick Mooney 		struct dk_minfo_ext dkmext;
580*4c87aefeSPatrick Mooney 		int wce_val;
581*4c87aefeSPatrick Mooney 
582*4c87aefeSPatrick Mooney 		/* Look for a more accurate physical blocksize */
583*4c87aefeSPatrick Mooney 		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
584*4c87aefeSPatrick Mooney 			psectsz = dkmext.dki_pbsize;
585*4c87aefeSPatrick Mooney 		}
586*4c87aefeSPatrick Mooney 		/* See if a configurable write cache is present and working */
587*4c87aefeSPatrick Mooney 		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
588*4c87aefeSPatrick Mooney 			/*
589*4c87aefeSPatrick Mooney 			 * If WCE is already active, disable it until the
590*4c87aefeSPatrick Mooney 			 * specific device driver calls for its return.  If it
591*4c87aefeSPatrick Mooney 			 * is not active, toggle it on and off to verify that
592*4c87aefeSPatrick Mooney 			 * such actions are possible.
593*4c87aefeSPatrick Mooney 			 */
594*4c87aefeSPatrick Mooney 			if (wce_val != 0) {
595*4c87aefeSPatrick Mooney 				wce_val = 0;
596*4c87aefeSPatrick Mooney 				/*
597*4c87aefeSPatrick Mooney 				 * Inability to disable the cache is a threat
598*4c87aefeSPatrick Mooney 				 * to data durability.
599*4c87aefeSPatrick Mooney 				 */
600*4c87aefeSPatrick Mooney 				assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
601*4c87aefeSPatrick Mooney 				wce = WCE_IOCTL;
602*4c87aefeSPatrick Mooney 			} else {
603*4c87aefeSPatrick Mooney 				int r1, r2;
604*4c87aefeSPatrick Mooney 
605*4c87aefeSPatrick Mooney 				wce_val = 1;
606*4c87aefeSPatrick Mooney 				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
607*4c87aefeSPatrick Mooney 				wce_val = 0;
608*4c87aefeSPatrick Mooney 				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);
609*4c87aefeSPatrick Mooney 
610*4c87aefeSPatrick Mooney 				if (r1 == 0 && r2 == 0) {
611*4c87aefeSPatrick Mooney 					wce = WCE_IOCTL;
612*4c87aefeSPatrick Mooney 				} else {
613*4c87aefeSPatrick Mooney 					/*
614*4c87aefeSPatrick Mooney 					 * If the cache toggle was not
615*4c87aefeSPatrick Mooney 					 * successful, ensure that the cache
616*4c87aefeSPatrick Mooney 					 * was not left enabled.
617*4c87aefeSPatrick Mooney 					 */
618*4c87aefeSPatrick Mooney 					assert(r1 != 0);
619*4c87aefeSPatrick Mooney 				}
620*4c87aefeSPatrick Mooney 			}
621*4c87aefeSPatrick Mooney 		}
622*4c87aefeSPatrick Mooney 	} else {
623*4c87aefeSPatrick Mooney 		int flags;
624*4c87aefeSPatrick Mooney 
625*4c87aefeSPatrick Mooney 		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
626*4c87aefeSPatrick Mooney 			flags |= O_DSYNC;
627*4c87aefeSPatrick Mooney 			if (fcntl(fd, F_SETFL, flags) != -1) {
628*4c87aefeSPatrick Mooney 				wce = WCE_FCNTL;
629*4c87aefeSPatrick Mooney 			}
630*4c87aefeSPatrick Mooney 		}
631bf21cd93STycho Nightingale 	}
632bf21cd93STycho Nightingale #endif
633bf21cd93STycho Nightingale 
634*4c87aefeSPatrick Mooney #ifndef WITHOUT_CAPSICUM
635*4c87aefeSPatrick Mooney 	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
636*4c87aefeSPatrick Mooney 		errx(EX_OSERR, "Unable to apply rights for sandbox");
637*4c87aefeSPatrick Mooney #endif
638*4c87aefeSPatrick Mooney 
639*4c87aefeSPatrick Mooney 	if (ssopt != 0) {
640*4c87aefeSPatrick Mooney 		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
641*4c87aefeSPatrick Mooney 		    ssopt > pssopt) {
642*4c87aefeSPatrick Mooney 			fprintf(stderr, "Invalid sector size %d/%d\n",
643*4c87aefeSPatrick Mooney 			    ssopt, pssopt);
644*4c87aefeSPatrick Mooney 			goto err;
645*4c87aefeSPatrick Mooney 		}
646*4c87aefeSPatrick Mooney 
647*4c87aefeSPatrick Mooney 		/*
648*4c87aefeSPatrick Mooney 		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
649*4c87aefeSPatrick Mooney 		 * size be a multiple of the device's sector size.
650*4c87aefeSPatrick Mooney 		 *
651*4c87aefeSPatrick Mooney 		 * Validate that the emulated sector size complies with this
652*4c87aefeSPatrick Mooney 		 * requirement.
653*4c87aefeSPatrick Mooney 		 */
654*4c87aefeSPatrick Mooney 		if (S_ISCHR(sbuf.st_mode)) {
655*4c87aefeSPatrick Mooney 			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
656*4c87aefeSPatrick Mooney 				fprintf(stderr, "Sector size %d incompatible "
657*4c87aefeSPatrick Mooney 				    "with underlying device sector size %d\n",
658*4c87aefeSPatrick Mooney 				    ssopt, sectsz);
659*4c87aefeSPatrick Mooney 				goto err;
660*4c87aefeSPatrick Mooney 			}
661*4c87aefeSPatrick Mooney 		}
662*4c87aefeSPatrick Mooney 
663*4c87aefeSPatrick Mooney 		sectsz = ssopt;
664*4c87aefeSPatrick Mooney 		psectsz = pssopt;
665*4c87aefeSPatrick Mooney 		psectoff = 0;
666*4c87aefeSPatrick Mooney 	}
667*4c87aefeSPatrick Mooney 
668bf21cd93STycho Nightingale 	bc = calloc(1, sizeof(struct blockif_ctxt));
669bf21cd93STycho Nightingale 	if (bc == NULL) {
670*4c87aefeSPatrick Mooney 		perror("calloc");
671*4c87aefeSPatrick Mooney 		goto err;
672bf21cd93STycho Nightingale 	}
673bf21cd93STycho Nightingale 
674bf21cd93STycho Nightingale 	bc->bc_magic = BLOCKIF_SIG;
675bf21cd93STycho Nightingale 	bc->bc_fd = fd;
676*4c87aefeSPatrick Mooney 	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
677*4c87aefeSPatrick Mooney 	bc->bc_isgeom = geom;
678*4c87aefeSPatrick Mooney 	bc->bc_candelete = candelete;
679*4c87aefeSPatrick Mooney #ifndef __FreeBSD__
680*4c87aefeSPatrick Mooney 	bc->bc_wce = wce;
681*4c87aefeSPatrick Mooney #endif
682bf21cd93STycho Nightingale 	bc->bc_rdonly = ro;
683bf21cd93STycho Nightingale 	bc->bc_size = size;
684bf21cd93STycho Nightingale 	bc->bc_sectsz = sectsz;
685*4c87aefeSPatrick Mooney 	bc->bc_psectsz = psectsz;
686*4c87aefeSPatrick Mooney 	bc->bc_psectoff = psectoff;
687bf21cd93STycho Nightingale 	pthread_mutex_init(&bc->bc_mtx, NULL);
688bf21cd93STycho Nightingale 	pthread_cond_init(&bc->bc_cond, NULL);
689bf21cd93STycho Nightingale 	TAILQ_INIT(&bc->bc_freeq);
690bf21cd93STycho Nightingale 	TAILQ_INIT(&bc->bc_pendq);
691bf21cd93STycho Nightingale 	TAILQ_INIT(&bc->bc_busyq);
692bf21cd93STycho Nightingale 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
693bf21cd93STycho Nightingale 		bc->bc_reqs[i].be_status = BST_FREE;
694bf21cd93STycho Nightingale 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
695bf21cd93STycho Nightingale 	}
696bf21cd93STycho Nightingale 
697*4c87aefeSPatrick Mooney 	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
698*4c87aefeSPatrick Mooney 		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
699*4c87aefeSPatrick Mooney 		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
700*4c87aefeSPatrick Mooney 		pthread_set_name_np(bc->bc_btid[i], tname);
701*4c87aefeSPatrick Mooney 	}
702bf21cd93STycho Nightingale 
703bf21cd93STycho Nightingale 	return (bc);
704*4c87aefeSPatrick Mooney err:
705*4c87aefeSPatrick Mooney 	if (fd >= 0)
706*4c87aefeSPatrick Mooney 		close(fd);
707*4c87aefeSPatrick Mooney 	free(nopt);
708*4c87aefeSPatrick Mooney 	return (NULL);
709bf21cd93STycho Nightingale }
710bf21cd93STycho Nightingale 
711bf21cd93STycho Nightingale static int
712bf21cd93STycho Nightingale blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
713bf21cd93STycho Nightingale 		enum blockop op)
714bf21cd93STycho Nightingale {
715bf21cd93STycho Nightingale 	int err;
716bf21cd93STycho Nightingale 
717bf21cd93STycho Nightingale 	err = 0;
718bf21cd93STycho Nightingale 
719bf21cd93STycho Nightingale 	pthread_mutex_lock(&bc->bc_mtx);
720*4c87aefeSPatrick Mooney 	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
721bf21cd93STycho Nightingale 		/*
722bf21cd93STycho Nightingale 		 * Enqueue and inform the block i/o thread
723bf21cd93STycho Nightingale 		 * that there is work available
724bf21cd93STycho Nightingale 		 */
725*4c87aefeSPatrick Mooney 		if (blockif_enqueue(bc, breq, op))
726bf21cd93STycho Nightingale 			pthread_cond_signal(&bc->bc_cond);
727bf21cd93STycho Nightingale 	} else {
728bf21cd93STycho Nightingale 		/*
729bf21cd93STycho Nightingale 		 * Callers are not allowed to enqueue more than
730bf21cd93STycho Nightingale 		 * the specified blockif queue limit. Return an
731bf21cd93STycho Nightingale 		 * error to indicate that the queue length has been
732bf21cd93STycho Nightingale 		 * exceeded.
733bf21cd93STycho Nightingale 		 */
734bf21cd93STycho Nightingale 		err = E2BIG;
735bf21cd93STycho Nightingale 	}
736bf21cd93STycho Nightingale 	pthread_mutex_unlock(&bc->bc_mtx);
737bf21cd93STycho Nightingale 
738bf21cd93STycho Nightingale 	return (err);
739bf21cd93STycho Nightingale }
740bf21cd93STycho Nightingale 
741bf21cd93STycho Nightingale int
742bf21cd93STycho Nightingale blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
743bf21cd93STycho Nightingale {
744bf21cd93STycho Nightingale 
745bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
746bf21cd93STycho Nightingale 	return (blockif_request(bc, breq, BOP_READ));
747bf21cd93STycho Nightingale }
748bf21cd93STycho Nightingale 
749bf21cd93STycho Nightingale int
750bf21cd93STycho Nightingale blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
751bf21cd93STycho Nightingale {
752bf21cd93STycho Nightingale 
753bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
754bf21cd93STycho Nightingale 	return (blockif_request(bc, breq, BOP_WRITE));
755bf21cd93STycho Nightingale }
756bf21cd93STycho Nightingale 
757bf21cd93STycho Nightingale int
758bf21cd93STycho Nightingale blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
759bf21cd93STycho Nightingale {
760bf21cd93STycho Nightingale 
761bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
762bf21cd93STycho Nightingale 	return (blockif_request(bc, breq, BOP_FLUSH));
763bf21cd93STycho Nightingale }
764bf21cd93STycho Nightingale 
765bf21cd93STycho Nightingale int
766*4c87aefeSPatrick Mooney blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
767*4c87aefeSPatrick Mooney {
768*4c87aefeSPatrick Mooney 
769*4c87aefeSPatrick Mooney 	assert(bc->bc_magic == BLOCKIF_SIG);
770*4c87aefeSPatrick Mooney 	return (blockif_request(bc, breq, BOP_DELETE));
771*4c87aefeSPatrick Mooney }
772*4c87aefeSPatrick Mooney 
773*4c87aefeSPatrick Mooney int
774bf21cd93STycho Nightingale blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
775bf21cd93STycho Nightingale {
776bf21cd93STycho Nightingale 	struct blockif_elem *be;
777bf21cd93STycho Nightingale 
778bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
779bf21cd93STycho Nightingale 
780bf21cd93STycho Nightingale 	pthread_mutex_lock(&bc->bc_mtx);
781bf21cd93STycho Nightingale 	/*
782bf21cd93STycho Nightingale 	 * Check pending requests.
783bf21cd93STycho Nightingale 	 */
784bf21cd93STycho Nightingale 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
785bf21cd93STycho Nightingale 		if (be->be_req == breq)
786bf21cd93STycho Nightingale 			break;
787bf21cd93STycho Nightingale 	}
788bf21cd93STycho Nightingale 	if (be != NULL) {
789bf21cd93STycho Nightingale 		/*
790bf21cd93STycho Nightingale 		 * Found it.
791bf21cd93STycho Nightingale 		 */
792*4c87aefeSPatrick Mooney 		blockif_complete(bc, be);
793bf21cd93STycho Nightingale 		pthread_mutex_unlock(&bc->bc_mtx);
794bf21cd93STycho Nightingale 
795bf21cd93STycho Nightingale 		return (0);
796bf21cd93STycho Nightingale 	}
797bf21cd93STycho Nightingale 
798bf21cd93STycho Nightingale 	/*
799bf21cd93STycho Nightingale 	 * Check in-flight requests.
800bf21cd93STycho Nightingale 	 */
801bf21cd93STycho Nightingale 	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
802bf21cd93STycho Nightingale 		if (be->be_req == breq)
803bf21cd93STycho Nightingale 			break;
804bf21cd93STycho Nightingale 	}
805bf21cd93STycho Nightingale 	if (be == NULL) {
806bf21cd93STycho Nightingale 		/*
807bf21cd93STycho Nightingale 		 * Didn't find it.
808bf21cd93STycho Nightingale 		 */
809bf21cd93STycho Nightingale 		pthread_mutex_unlock(&bc->bc_mtx);
810bf21cd93STycho Nightingale 		return (EINVAL);
811bf21cd93STycho Nightingale 	}
812bf21cd93STycho Nightingale 
813bf21cd93STycho Nightingale 	/*
814bf21cd93STycho Nightingale 	 * Interrupt the processing thread to force it return
815bf21cd93STycho Nightingale 	 * prematurely via it's normal callback path.
816bf21cd93STycho Nightingale 	 */
817bf21cd93STycho Nightingale 	while (be->be_status == BST_BUSY) {
818bf21cd93STycho Nightingale 		struct blockif_sig_elem bse, *old_head;
819bf21cd93STycho Nightingale 
820bf21cd93STycho Nightingale 		pthread_mutex_init(&bse.bse_mtx, NULL);
821bf21cd93STycho Nightingale 		pthread_cond_init(&bse.bse_cond, NULL);
822bf21cd93STycho Nightingale 
823bf21cd93STycho Nightingale 		bse.bse_pending = 1;
824bf21cd93STycho Nightingale 
825bf21cd93STycho Nightingale 		do {
826bf21cd93STycho Nightingale 			old_head = blockif_bse_head;
827bf21cd93STycho Nightingale 			bse.bse_next = old_head;
828bf21cd93STycho Nightingale 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
829bf21cd93STycho Nightingale 					    (uintptr_t)old_head,
830bf21cd93STycho Nightingale 					    (uintptr_t)&bse));
831bf21cd93STycho Nightingale 
832bf21cd93STycho Nightingale 		pthread_kill(be->be_tid, SIGCONT);
833bf21cd93STycho Nightingale 
834bf21cd93STycho Nightingale 		pthread_mutex_lock(&bse.bse_mtx);
835bf21cd93STycho Nightingale 		while (bse.bse_pending)
836bf21cd93STycho Nightingale 			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
837bf21cd93STycho Nightingale 		pthread_mutex_unlock(&bse.bse_mtx);
838bf21cd93STycho Nightingale 	}
839bf21cd93STycho Nightingale 
840bf21cd93STycho Nightingale 	pthread_mutex_unlock(&bc->bc_mtx);
841bf21cd93STycho Nightingale 
842bf21cd93STycho Nightingale 	/*
843bf21cd93STycho Nightingale 	 * The processing thread has been interrupted.  Since it's not
844bf21cd93STycho Nightingale 	 * clear if the callback has been invoked yet, return EBUSY.
845bf21cd93STycho Nightingale 	 */
846bf21cd93STycho Nightingale 	return (EBUSY);
847bf21cd93STycho Nightingale }
848bf21cd93STycho Nightingale 
849bf21cd93STycho Nightingale int
850bf21cd93STycho Nightingale blockif_close(struct blockif_ctxt *bc)
851bf21cd93STycho Nightingale {
852bf21cd93STycho Nightingale 	void *jval;
853*4c87aefeSPatrick Mooney 	int i;
854bf21cd93STycho Nightingale 
855bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
856bf21cd93STycho Nightingale 
857bf21cd93STycho Nightingale 	/*
858bf21cd93STycho Nightingale 	 * Stop the block i/o thread
859bf21cd93STycho Nightingale 	 */
860*4c87aefeSPatrick Mooney 	pthread_mutex_lock(&bc->bc_mtx);
861bf21cd93STycho Nightingale 	bc->bc_closing = 1;
862*4c87aefeSPatrick Mooney 	pthread_mutex_unlock(&bc->bc_mtx);
863*4c87aefeSPatrick Mooney 	pthread_cond_broadcast(&bc->bc_cond);
864*4c87aefeSPatrick Mooney 	for (i = 0; i < BLOCKIF_NUMTHR; i++)
865*4c87aefeSPatrick Mooney 		pthread_join(bc->bc_btid[i], &jval);
866bf21cd93STycho Nightingale 
867bf21cd93STycho Nightingale 	/* XXX Cancel queued i/o's ??? */
868bf21cd93STycho Nightingale 
869bf21cd93STycho Nightingale 	/*
870bf21cd93STycho Nightingale 	 * Release resources
871bf21cd93STycho Nightingale 	 */
872bf21cd93STycho Nightingale 	bc->bc_magic = 0;
873bf21cd93STycho Nightingale 	close(bc->bc_fd);
874bf21cd93STycho Nightingale 	free(bc);
875bf21cd93STycho Nightingale 
876bf21cd93STycho Nightingale 	return (0);
877bf21cd93STycho Nightingale }
878bf21cd93STycho Nightingale 
879bf21cd93STycho Nightingale /*
880bf21cd93STycho Nightingale  * Return virtual C/H/S values for a given block. Use the algorithm
881bf21cd93STycho Nightingale  * outlined in the VHD specification to calculate values.
882bf21cd93STycho Nightingale  */
883bf21cd93STycho Nightingale void
884bf21cd93STycho Nightingale blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
885bf21cd93STycho Nightingale {
886bf21cd93STycho Nightingale 	off_t sectors;		/* total sectors of the block dev */
887bf21cd93STycho Nightingale 	off_t hcyl;		/* cylinders times heads */
888bf21cd93STycho Nightingale 	uint16_t secpt;		/* sectors per track */
889bf21cd93STycho Nightingale 	uint8_t heads;
890bf21cd93STycho Nightingale 
891bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
892bf21cd93STycho Nightingale 
893bf21cd93STycho Nightingale 	sectors = bc->bc_size / bc->bc_sectsz;
894bf21cd93STycho Nightingale 
895bf21cd93STycho Nightingale 	/* Clamp the size to the largest possible with CHS */
896bf21cd93STycho Nightingale 	if (sectors > 65535UL*16*255)
897bf21cd93STycho Nightingale 		sectors = 65535UL*16*255;
898bf21cd93STycho Nightingale 
899bf21cd93STycho Nightingale 	if (sectors >= 65536UL*16*63) {
900bf21cd93STycho Nightingale 		secpt = 255;
901bf21cd93STycho Nightingale 		heads = 16;
902bf21cd93STycho Nightingale 		hcyl = sectors / secpt;
903bf21cd93STycho Nightingale 	} else {
904bf21cd93STycho Nightingale 		secpt = 17;
905bf21cd93STycho Nightingale 		hcyl = sectors / secpt;
906bf21cd93STycho Nightingale 		heads = (hcyl + 1023) / 1024;
907bf21cd93STycho Nightingale 
908bf21cd93STycho Nightingale 		if (heads < 4)
909bf21cd93STycho Nightingale 			heads = 4;
910bf21cd93STycho Nightingale 
911bf21cd93STycho Nightingale 		if (hcyl >= (heads * 1024) || heads > 16) {
912bf21cd93STycho Nightingale 			secpt = 31;
913bf21cd93STycho Nightingale 			heads = 16;
914bf21cd93STycho Nightingale 			hcyl = sectors / secpt;
915bf21cd93STycho Nightingale 		}
916bf21cd93STycho Nightingale 		if (hcyl >= (heads * 1024)) {
917bf21cd93STycho Nightingale 			secpt = 63;
918bf21cd93STycho Nightingale 			heads = 16;
919bf21cd93STycho Nightingale 			hcyl = sectors / secpt;
920bf21cd93STycho Nightingale 		}
921bf21cd93STycho Nightingale 	}
922bf21cd93STycho Nightingale 
923bf21cd93STycho Nightingale 	*c = hcyl / heads;
924bf21cd93STycho Nightingale 	*h = heads;
925bf21cd93STycho Nightingale 	*s = secpt;
926bf21cd93STycho Nightingale }
927bf21cd93STycho Nightingale 
928bf21cd93STycho Nightingale /*
929bf21cd93STycho Nightingale  * Accessors
930bf21cd93STycho Nightingale  */
931bf21cd93STycho Nightingale off_t
932bf21cd93STycho Nightingale blockif_size(struct blockif_ctxt *bc)
933bf21cd93STycho Nightingale {
934bf21cd93STycho Nightingale 
935bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
936bf21cd93STycho Nightingale 	return (bc->bc_size);
937bf21cd93STycho Nightingale }
938bf21cd93STycho Nightingale 
939bf21cd93STycho Nightingale int
940bf21cd93STycho Nightingale blockif_sectsz(struct blockif_ctxt *bc)
941bf21cd93STycho Nightingale {
942bf21cd93STycho Nightingale 
943bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
944bf21cd93STycho Nightingale 	return (bc->bc_sectsz);
945bf21cd93STycho Nightingale }
946bf21cd93STycho Nightingale 
947*4c87aefeSPatrick Mooney void
948*4c87aefeSPatrick Mooney blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
949*4c87aefeSPatrick Mooney {
950*4c87aefeSPatrick Mooney 
951*4c87aefeSPatrick Mooney 	assert(bc->bc_magic == BLOCKIF_SIG);
952*4c87aefeSPatrick Mooney 	*size = bc->bc_psectsz;
953*4c87aefeSPatrick Mooney 	*off = bc->bc_psectoff;
954*4c87aefeSPatrick Mooney }
955*4c87aefeSPatrick Mooney 
956bf21cd93STycho Nightingale int
957bf21cd93STycho Nightingale blockif_queuesz(struct blockif_ctxt *bc)
958bf21cd93STycho Nightingale {
959bf21cd93STycho Nightingale 
960bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
961bf21cd93STycho Nightingale 	return (BLOCKIF_MAXREQ - 1);
962bf21cd93STycho Nightingale }
963bf21cd93STycho Nightingale 
964bf21cd93STycho Nightingale int
965bf21cd93STycho Nightingale blockif_is_ro(struct blockif_ctxt *bc)
966bf21cd93STycho Nightingale {
967bf21cd93STycho Nightingale 
968bf21cd93STycho Nightingale 	assert(bc->bc_magic == BLOCKIF_SIG);
969bf21cd93STycho Nightingale 	return (bc->bc_rdonly);
970bf21cd93STycho Nightingale }
971*4c87aefeSPatrick Mooney 
972*4c87aefeSPatrick Mooney int
973*4c87aefeSPatrick Mooney blockif_candelete(struct blockif_ctxt *bc)
974*4c87aefeSPatrick Mooney {
975*4c87aefeSPatrick Mooney 
976*4c87aefeSPatrick Mooney 	assert(bc->bc_magic == BLOCKIF_SIG);
977*4c87aefeSPatrick Mooney 	return (bc->bc_candelete);
978*4c87aefeSPatrick Mooney }
979*4c87aefeSPatrick Mooney 
980*4c87aefeSPatrick Mooney #ifndef __FreeBSD__
981*4c87aefeSPatrick Mooney int
982*4c87aefeSPatrick Mooney blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
983*4c87aefeSPatrick Mooney {
984*4c87aefeSPatrick Mooney 	int res = 0, flags;
985*4c87aefeSPatrick Mooney 	int clean_val = (wc_enable != 0) ? 1 : 0;
986*4c87aefeSPatrick Mooney 
987*4c87aefeSPatrick Mooney 	(void) pthread_mutex_lock(&bc->bc_mtx);
988*4c87aefeSPatrick Mooney 	switch (bc->bc_wce) {
989*4c87aefeSPatrick Mooney 	case WCE_IOCTL:
990*4c87aefeSPatrick Mooney 		res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
991*4c87aefeSPatrick Mooney 		break;
992*4c87aefeSPatrick Mooney 	case WCE_FCNTL:
993*4c87aefeSPatrick Mooney 		if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
994*4c87aefeSPatrick Mooney 			if (wc_enable == 0) {
995*4c87aefeSPatrick Mooney 				flags |= O_DSYNC;
996*4c87aefeSPatrick Mooney 			} else {
997*4c87aefeSPatrick Mooney 				flags &= ~O_DSYNC;
998*4c87aefeSPatrick Mooney 			}
999*4c87aefeSPatrick Mooney 			if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
1000*4c87aefeSPatrick Mooney 				res = -1;
1001*4c87aefeSPatrick Mooney 			}
1002*4c87aefeSPatrick Mooney 		} else {
1003*4c87aefeSPatrick Mooney 			res = -1;
1004*4c87aefeSPatrick Mooney 		}
1005*4c87aefeSPatrick Mooney 		break;
1006*4c87aefeSPatrick Mooney 	default:
1007*4c87aefeSPatrick Mooney 		break;
1008*4c87aefeSPatrick Mooney 	}
1009*4c87aefeSPatrick Mooney 
1010*4c87aefeSPatrick Mooney 	/*
1011*4c87aefeSPatrick Mooney 	 * After a successful disable of the write cache, ensure that any
1012*4c87aefeSPatrick Mooney 	 * lingering data in the cache is synced out.
1013*4c87aefeSPatrick Mooney 	 */
1014*4c87aefeSPatrick Mooney 	if (res == 0 && wc_enable == 0) {
1015*4c87aefeSPatrick Mooney 		res = fsync(bc->bc_fd);
1016*4c87aefeSPatrick Mooney 	}
1017*4c87aefeSPatrick Mooney 	(void) pthread_mutex_unlock(&bc->bc_mtx);
1018*4c87aefeSPatrick Mooney 
1019*4c87aefeSPatrick Mooney 	return (res);
1020*4c87aefeSPatrick Mooney }
1021*4c87aefeSPatrick Mooney #endif /* __FreeBSD__ */
1022