xref: /illumos-gate/usr/src/cmd/bhyve/block_if.c (revision 282a8ecb1f4aca0718d89ef1299b5928e5405bca)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 /*
32  * Copyright 2020 Joyent, Inc.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #ifndef WITHOUT_CAPSICUM
40 #include <sys/capsicum.h>
41 #endif
42 #include <sys/queue.h>
43 #include <sys/errno.h>
44 #include <sys/stat.h>
45 #include <sys/ioctl.h>
46 #include <sys/disk.h>
47 #include <sys/limits.h>
48 #include <sys/uio.h>
49 #ifndef __FreeBSD__
50 #include <sys/dkio.h>
51 #endif
52 
53 #include <assert.h>
54 #ifndef WITHOUT_CAPSICUM
55 #include <capsicum_helpers.h>
56 #endif
57 #include <err.h>
58 #include <fcntl.h>
59 #include <stdio.h>
60 #include <stdlib.h>
61 #include <string.h>
62 #include <pthread.h>
63 #include <pthread_np.h>
64 #include <signal.h>
65 #include <sysexits.h>
66 #include <unistd.h>
67 
68 #include <machine/atomic.h>
69 
70 #include "bhyverun.h"
71 #ifdef	__FreeBSD__
72 #include "mevent.h"
73 #endif
74 #include "block_if.h"
75 
76 #define BLOCKIF_SIG	0xb109b109
77 
78 #ifdef __FreeBSD__
79 #define BLOCKIF_NUMTHR	8
80 #else
81 /* Enlarge to keep pace with the virtio-block ring size */
82 #define BLOCKIF_NUMTHR	16
83 #endif
84 #define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
85 
/* Operations that can be queued against a blockif context. */
enum blockop {
	BOP_READ,
	BOP_WRITE,
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,
#endif
	BOP_FLUSH,
	BOP_DELETE
};

/* Life cycle states of a queued request element. */
enum blockstat {
	BST_FREE,	/* on the free queue, available for reuse */
	BST_BLOCK,	/* pending, blocked behind a conflicting request */
	BST_PEND,	/* pending and runnable by a worker thread */
	BST_BUSY,	/* currently being processed by a worker thread */
	BST_DONE	/* processing finished */
};

/*
 * One request slot: ties a caller's blockif_req to its operation, its
 * life cycle state, the worker thread processing it, and the ending
 * offset used for request-ordering in blockif_enqueue().
 */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;	/* caller-supplied request */
	enum blockop	     be_op;	/* operation to perform */
	enum blockstat	     be_status;	/* current life cycle state */
	pthread_t            be_tid;	/* worker owning this element */
	off_t		     be_block;	/* end offset (OFF_MAX for flush) */
};

#ifndef __FreeBSD__
/* Mechanism available for toggling the backing store's write cache. */
enum blockif_wce {
	WCE_NONE = 0,	/* no way to control the write cache */
	WCE_IOCTL,	/* DKIOCSETWCE ioctl (character devices) */
	WCE_FCNTL	/* O_DSYNC toggled via fcntl (plain files) */
};
#endif
120 
/*
 * State for one open backing store: the file descriptor and its
 * properties, the pool of request elements, and the worker threads.
 * bc_mtx protects the three queues and bc_closing; bc_cond is
 * signalled when newly runnable work is enqueued.
 */
struct blockif_ctxt {
	int			bc_magic;	/* BLOCKIF_SIG while valid */
	int			bc_fd;		/* backing file/device fd */
	int			bc_ischr;	/* backed by a char device */
	int			bc_isgeom;	/* FreeBSD: GEOM provider */
	int			bc_candelete;	/* delete (TRIM) supported */
#ifndef __FreeBSD__
	enum blockif_wce	bc_wce;		/* write-cache control method */
#endif
	int			bc_rdonly;	/* opened read-only */
	off_t			bc_size;	/* size in bytes */
	int			bc_sectsz;	/* logical sector size */
	int			bc_psectsz;	/* physical sector size */
	int			bc_psectoff;	/* physical sector offset */
	int			bc_closing;	/* workers should exit */
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
146 
/* Ensures blockif_init() runs exactly once across all opens. */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Per-cancellation wait record.  blockif_cancel() pushes one of these
 * onto a global lock-free list and the SIGCONT handler pops entries,
 * clearing bse_pending and signalling the waiting thread.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;	/* 1 until handler runs */
	struct blockif_sig_elem		*bse_next;	/* next list entry */
};

/* Head of the lock-free list of pending cancellation waiters. */
static struct blockif_sig_elem *blockif_bse_head;
157 
/*
 * Move a free request element onto the pending queue, recording the
 * operation to be performed.  Returns non-zero if the request is
 * immediately runnable (BST_PEND) and a worker should be woken, zero
 * if it must wait (BST_BLOCK) behind a conflicting request.
 *
 * Caller must hold bc_mtx and must have verified bc_freeq is non-empty
 * (see blockif_request()).
 */
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
#ifndef __FreeBSD__
	case BOP_WRITE_SYNC:
#endif
	case BOP_DELETE:
		/* be_block records the ending offset of the transfer. */
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		/* Flushes carry no range; use a value that never matches. */
		off = OFF_MAX;
	}
	be->be_block = off;
	/*
	 * Block this request if any pending or in-flight request ends
	 * exactly where this one begins, so that such adjacent dependent
	 * requests complete in submission order.
	 */
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}
204 
205 static int
206 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
207 {
208 	struct blockif_elem *be;
209 
210 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
211 		if (be->be_status == BST_PEND)
212 			break;
213 		assert(be->be_status == BST_BLOCK);
214 	}
215 	if (be == NULL)
216 		return (0);
217 	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
218 	be->be_status = BST_BUSY;
219 	be->be_tid = t;
220 	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
221 	*bep = be;
222 	return (1);
223 }
224 
/*
 * Retire an element: remove it from whichever queue it is on, unblock
 * any pending requests that were ordered behind it (those whose start
 * offset equals this element's end offset), and return it to the free
 * queue.  Caller must hold bc_mtx.
 */
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	/* DONE/BUSY elements live on the busy queue, others on pending. */
	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	/* Promote requests that were blocked waiting on this one. */
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}
243 
/*
 * Execute a single queued request against the backing store and report
 * the result through the request's br_callback with an errno value
 * (0 on success).  When 'buf' is non-NULL (GEOM-backed devices) and
 * the request spans multiple iovecs, the transfer is staged through it
 * in MAXPHYS-sized chunks so each pread()/pwrite() covers a single
 * contiguous range.  Runs without bc_mtx held (see blockif_thr()).
 */
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
#ifdef	__FreeBSD__
	off_t arg[2];
#endif
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	/* A single iovec needs no coalescing; do the I/O directly. */
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
				   br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		/*
		 * Bounce-buffered read: pull MAXPHYS-sized chunks from the
		 * device, then scatter each chunk across the request's
		 * iovecs.  'i'/'voff' track the current iovec and the
		 * offset within it; 'boff' the offset within the chunk.
		 *
		 * NOTE(review): a short (but successful) pread() is not
		 * detected here; the full 'len' bytes are assumed to have
		 * been transferred -- confirm against upstream behavior.
		 */
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					/* Current iovec exhausted; advance. */
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
				    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		/*
		 * Bounce-buffered write: gather the request's iovecs into
		 * the buffer one MAXPHYS chunk at a time, then push each
		 * chunk to the device with a single pwrite().
		 */
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					/* Current iovec exhausted; advance. */
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
#ifdef	__FreeBSD__
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
#else
		/*
		 * This fsync() should be adequate to flush the cache of a file
		 * or device.  In VFS, the VOP_SYNC operation is converted to
		 * the appropriate ioctl in both sdev (for real devices) and
		 * zfs (for zvols).
		 */
		if (fsync(bc->bc_fd))
			err = errno;
#endif
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
#ifdef	__FreeBSD__
		else if (bc->bc_ischr) {
			/* Character devices: DIOCGDELETE (TRIM/UNMAP). */
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		}
		else
			 err = EOPNOTSUPP;
#else
		else if (bc->bc_ischr) {
			/* Raw devices: free the range via DKIOCFREE. */
			dkioc_free_list_t dfl = {
				.dfl_num_exts = 1,
				.dfl_offset = 0,
				.dfl_flags = 0,
				.dfl_exts = {
					{
						.dfle_start = br->br_offset,
						.dfle_length = br->br_resid
					}
				}
			};

			if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			/* Plain files: punch a hole with F_FREESP. */
			struct flock fl = {
				.l_whence = 0,
				.l_type = F_WRLCK,
				.l_start = br->br_offset,
				.l_len = br->br_resid
			};

			if (fcntl(bc->bc_fd, F_FREESP, &fl))
				err = errno;
			else
				br->br_resid = 0;
		}
#endif
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}
411 
412 static void *
413 blockif_thr(void *arg)
414 {
415 	struct blockif_ctxt *bc;
416 	struct blockif_elem *be;
417 	pthread_t t;
418 	uint8_t *buf;
419 
420 	bc = arg;
421 	if (bc->bc_isgeom)
422 		buf = malloc(MAXPHYS);
423 	else
424 		buf = NULL;
425 	t = pthread_self();
426 
427 	pthread_mutex_lock(&bc->bc_mtx);
428 	for (;;) {
429 		while (blockif_dequeue(bc, t, &be)) {
430 			pthread_mutex_unlock(&bc->bc_mtx);
431 			blockif_proc(bc, be, buf);
432 			pthread_mutex_lock(&bc->bc_mtx);
433 			blockif_complete(bc, be);
434 		}
435 		/* Check ctxt status here to see if exit requested */
436 		if (bc->bc_closing)
437 			break;
438 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
439 	}
440 	pthread_mutex_unlock(&bc->bc_mtx);
441 
442 	if (buf)
443 		free(buf);
444 	pthread_exit(NULL);
445 	return (NULL);
446 }
447 
#ifdef	__FreeBSD__
/*
 * SIGCONT handler (dispatched through the mevent loop on FreeBSD) used
 * by blockif_cancel() to interrupt a worker thread blocked in an I/O
 * system call.
 */
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
#else
/*
 * SIGCONT handler used by blockif_cancel() to interrupt a worker
 * thread blocked in an I/O system call.
 */
static void
blockif_sigcont_handler(int signal)
#endif
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			/* Lock-free pop of the head of the waiter list. */
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		/* Wake the cancelling thread waiting on this entry. */
		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}
477 
/*
 * One-time initialization, run via pthread_once() from blockif_open():
 * arrange for SIGCONT delivery so blockif_cancel() can interrupt
 * worker threads blocked in I/O.  On FreeBSD the signal is routed
 * through the mevent loop; otherwise a conventional handler is
 * installed with sigset().
 */
static void
blockif_init(void)
{
#ifdef	__FreeBSD__
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
#else
	(void) sigset(SIGCONT, blockif_sigcont_handler);
#endif
}
488 
489 struct blockif_ctxt *
490 blockif_open(const char *optstr, const char *ident)
491 {
492 	char tname[MAXCOMLEN + 1];
493 #ifdef	__FreeBSD__
494 	char name[MAXPATHLEN];
495 	char *nopt, *xopts, *cp;
496 #else
497 	char *nopt, *xopts, *cp = NULL;
498 #endif
499 	struct blockif_ctxt *bc;
500 	struct stat sbuf;
501 #ifdef	__FreeBSD__
502 	struct diocgattr_arg arg;
503 #else
504 	enum blockif_wce wce = WCE_NONE;
505 #endif
506 	off_t size, psectsz, psectoff;
507 	int extra, fd, i, sectsz;
508 	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
509 	int nodelete;
510 
511 #ifndef WITHOUT_CAPSICUM
512 	cap_rights_t rights;
513 	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
514 #endif
515 
516 	pthread_once(&blockif_once, blockif_init);
517 
518 	fd = -1;
519 	ssopt = 0;
520 	nocache = 0;
521 	sync = 0;
522 	ro = 0;
523 	nodelete = 0;
524 
525 	/*
526 	 * The first element in the optstring is always a pathname.
527 	 * Optional elements follow
528 	 */
529 	nopt = xopts = strdup(optstr);
530 	while (xopts != NULL) {
531 		cp = strsep(&xopts, ",");
532 		if (cp == nopt)		/* file or device pathname */
533 			continue;
534 		else if (!strcmp(cp, "nocache"))
535 			nocache = 1;
536 		else if (!strcmp(cp, "nodelete"))
537 			nodelete = 1;
538 		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
539 			sync = 1;
540 		else if (!strcmp(cp, "ro"))
541 			ro = 1;
542 		else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
543 			;
544 		else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
545 			pssopt = ssopt;
546 		else {
547 			fprintf(stderr, "Invalid device option \"%s\"\n", cp);
548 			goto err;
549 		}
550 	}
551 
552 	extra = 0;
553 	if (nocache)
554 		extra |= O_DIRECT;
555 	if (sync)
556 		extra |= O_SYNC;
557 
558 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
559 	if (fd < 0 && !ro) {
560 		/* Attempt a r/w fail with a r/o open */
561 		fd = open(nopt, O_RDONLY | extra);
562 		ro = 1;
563 	}
564 
565 	if (fd < 0) {
566 		warn("Could not open backing file: %s", nopt);
567 		goto err;
568 	}
569 
570         if (fstat(fd, &sbuf) < 0) {
571 		warn("Could not stat backing file %s", nopt);
572 		goto err;
573         }
574 
575 #ifndef WITHOUT_CAPSICUM
576 	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
577 	    CAP_WRITE);
578 	if (ro)
579 		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
580 
581 	if (caph_rights_limit(fd, &rights) == -1)
582 		errx(EX_OSERR, "Unable to apply rights for sandbox");
583 #endif
584 
585         /*
586 	 * Deal with raw devices
587 	 */
588         size = sbuf.st_size;
589 	sectsz = DEV_BSIZE;
590 	psectsz = psectoff = 0;
591 	candelete = geom = 0;
592 #ifdef	__FreeBSD__
593 	if (S_ISCHR(sbuf.st_mode)) {
594 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
595 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
596 			perror("Could not fetch dev blk/sector size");
597 			goto err;
598 		}
599 		assert(size != 0);
600 		assert(sectsz != 0);
601 		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
602 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
603 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
604 		arg.len = sizeof(arg.value.i);
605 		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
606 			candelete = arg.value.i;
607 		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
608 			geom = 1;
609 	} else {
610 		psectsz = sbuf.st_blksize;
611 	}
612 #else
613 	psectsz = sbuf.st_blksize;
614 	if (S_ISCHR(sbuf.st_mode)) {
615 		struct dk_minfo_ext dkmext;
616 		int wce_val;
617 
618 		/* Look for a more accurate physical blocksize */
619 		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
620 			psectsz = dkmext.dki_pbsize;
621 		}
622 		/* See if a configurable write cache is present and working */
623 		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
624 			/*
625 			 * If WCE is already active, disable it until the
626 			 * specific device driver calls for its return.  If it
627 			 * is not active, toggle it on and off to verify that
628 			 * such actions are possible.
629 			 */
630 			if (wce_val != 0) {
631 				wce_val = 0;
632 				/*
633 				 * Inability to disable the cache is a threat
634 				 * to data durability.
635 				 */
636 				assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
637 				wce = WCE_IOCTL;
638 			} else {
639 				int r1, r2;
640 
641 				wce_val = 1;
642 				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
643 				wce_val = 0;
644 				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);
645 
646 				if (r1 == 0 && r2 == 0) {
647 					wce = WCE_IOCTL;
648 				} else {
649 					/*
650 					 * If the cache cache toggle was not
651 					 * successful, ensure that the cache
652 					 * was not left enabled.
653 					 */
654 					assert(r1 != 0);
655 				}
656 			}
657 		}
658 
659 		if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
660 			candelete = 0;
661 
662 	} else {
663 		int flags;
664 
665 		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
666 			flags |= O_DSYNC;
667 			if (fcntl(fd, F_SETFL, flags) != -1) {
668 				wce = WCE_FCNTL;
669 			}
670 		}
671 
672 		/*
673 		 * We don't have a way to discover if a file supports the
674 		 * FREESP fcntl cmd (other than trying it).  However,
675 		 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
676 		 * Nfsv4 and nfsv4 also forward the FREESP request
677 		 * to the server, so we always enable it for file based
678 		 * volumes. Anyone trying to run volumes on an unsupported
679 		 * configuration is on their own, and should be prepared
680 		 * for the requests to fail.
681 		 */
682 		if (nodelete == 0)
683 			candelete = 1;
684 	}
685 #endif
686 
687 #ifndef WITHOUT_CAPSICUM
688 	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
689 		errx(EX_OSERR, "Unable to apply rights for sandbox");
690 #endif
691 
692 	if (ssopt != 0) {
693 		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
694 		    ssopt > pssopt) {
695 			fprintf(stderr, "Invalid sector size %d/%d\n",
696 			    ssopt, pssopt);
697 			goto err;
698 		}
699 
700 		/*
701 		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
702 		 * size be a multiple of the device's sector size.
703 		 *
704 		 * Validate that the emulated sector size complies with this
705 		 * requirement.
706 		 */
707 		if (S_ISCHR(sbuf.st_mode)) {
708 			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
709 				fprintf(stderr, "Sector size %d incompatible "
710 				    "with underlying device sector size %d\n",
711 				    ssopt, sectsz);
712 				goto err;
713 			}
714 		}
715 
716 		sectsz = ssopt;
717 		psectsz = pssopt;
718 		psectoff = 0;
719 	}
720 
721 	bc = calloc(1, sizeof(struct blockif_ctxt));
722 	if (bc == NULL) {
723 		perror("calloc");
724 		goto err;
725 	}
726 
727 	bc->bc_magic = BLOCKIF_SIG;
728 	bc->bc_fd = fd;
729 	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
730 	bc->bc_isgeom = geom;
731 	bc->bc_candelete = candelete;
732 #ifndef __FreeBSD__
733 	bc->bc_wce = wce;
734 #endif
735 	bc->bc_rdonly = ro;
736 	bc->bc_size = size;
737 	bc->bc_sectsz = sectsz;
738 	bc->bc_psectsz = psectsz;
739 	bc->bc_psectoff = psectoff;
740 	pthread_mutex_init(&bc->bc_mtx, NULL);
741 	pthread_cond_init(&bc->bc_cond, NULL);
742 	TAILQ_INIT(&bc->bc_freeq);
743 	TAILQ_INIT(&bc->bc_pendq);
744 	TAILQ_INIT(&bc->bc_busyq);
745 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
746 		bc->bc_reqs[i].be_status = BST_FREE;
747 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
748 	}
749 
750 	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
751 		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
752 		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
753 		pthread_set_name_np(bc->bc_btid[i], tname);
754 	}
755 
756 	return (bc);
757 err:
758 	if (fd >= 0)
759 		close(fd);
760 	free(nopt);
761 	return (NULL);
762 }
763 
764 static int
765 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
766 		enum blockop op)
767 {
768 	int err;
769 
770 	err = 0;
771 
772 	pthread_mutex_lock(&bc->bc_mtx);
773 	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
774 		/*
775 		 * Enqueue and inform the block i/o thread
776 		 * that there is work available
777 		 */
778 		if (blockif_enqueue(bc, breq, op))
779 			pthread_cond_signal(&bc->bc_cond);
780 	} else {
781 		/*
782 		 * Callers are not allowed to enqueue more than
783 		 * the specified blockif queue limit. Return an
784 		 * error to indicate that the queue length has been
785 		 * exceeded.
786 		 */
787 		err = E2BIG;
788 	}
789 	pthread_mutex_unlock(&bc->bc_mtx);
790 
791 	return (err);
792 }
793 
/* Queue an asynchronous read; completion is reported via br_callback. */
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

/* Queue an asynchronous write; completion is reported via br_callback. */
int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

/* Queue a cache flush of the backing store. */
int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

/* Queue a delete (TRIM/UNMAP) of the range described by 'breq'. */
int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}
825 
/*
 * Attempt to cancel an outstanding request.
 *
 * Returns:
 *	0	the request was still pending; it has been retired
 *		without its callback being invoked.
 *	EINVAL	the request was not found on the pending or busy queues.
 *	EBUSY	the request was in flight; its worker thread was
 *		signalled with SIGCONT until the request left the
 *		BST_BUSY state.  The callback may or may not have run
 *		by the time this returns.
 */
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		/*
		 * Lock-free push of this waiter onto the global list
		 * consumed by blockif_sigcont_handler().
		 */
		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		/* Wait until the signal handler has processed our entry. */
		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}
901 
902 int
903 blockif_close(struct blockif_ctxt *bc)
904 {
905 	void *jval;
906 	int i;
907 
908 	assert(bc->bc_magic == BLOCKIF_SIG);
909 
910 	/*
911 	 * Stop the block i/o thread
912 	 */
913 	pthread_mutex_lock(&bc->bc_mtx);
914 	bc->bc_closing = 1;
915 	pthread_mutex_unlock(&bc->bc_mtx);
916 	pthread_cond_broadcast(&bc->bc_cond);
917 	for (i = 0; i < BLOCKIF_NUMTHR; i++)
918 		pthread_join(bc->bc_btid[i], &jval);
919 
920 	/* XXX Cancel queued i/o's ??? */
921 
922 	/*
923 	 * Release resources
924 	 */
925 	bc->bc_magic = 0;
926 	close(bc->bc_fd);
927 	free(bc);
928 
929 	return (0);
930 }
931 
932 /*
933  * Return virtual C/H/S values for a given block. Use the algorithm
934  * outlined in the VHD specification to calculate values.
935  */
936 void
937 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
938 {
939 	off_t sectors;		/* total sectors of the block dev */
940 	off_t hcyl;		/* cylinders times heads */
941 	uint16_t secpt;		/* sectors per track */
942 	uint8_t heads;
943 
944 	assert(bc->bc_magic == BLOCKIF_SIG);
945 
946 	sectors = bc->bc_size / bc->bc_sectsz;
947 
948 	/* Clamp the size to the largest possible with CHS */
949 	if (sectors > 65535UL*16*255)
950 		sectors = 65535UL*16*255;
951 
952 	if (sectors >= 65536UL*16*63) {
953 		secpt = 255;
954 		heads = 16;
955 		hcyl = sectors / secpt;
956 	} else {
957 		secpt = 17;
958 		hcyl = sectors / secpt;
959 		heads = (hcyl + 1023) / 1024;
960 
961 		if (heads < 4)
962 			heads = 4;
963 
964 		if (hcyl >= (heads * 1024) || heads > 16) {
965 			secpt = 31;
966 			heads = 16;
967 			hcyl = sectors / secpt;
968 		}
969 		if (hcyl >= (heads * 1024)) {
970 			secpt = 63;
971 			heads = 16;
972 			hcyl = sectors / secpt;
973 		}
974 	}
975 
976 	*c = hcyl / heads;
977 	*h = heads;
978 	*s = secpt;
979 }
980 
981 /*
982  * Accessors
983  */
984 off_t
985 blockif_size(struct blockif_ctxt *bc)
986 {
987 
988 	assert(bc->bc_magic == BLOCKIF_SIG);
989 	return (bc->bc_size);
990 }
991 
992 int
993 blockif_sectsz(struct blockif_ctxt *bc)
994 {
995 
996 	assert(bc->bc_magic == BLOCKIF_SIG);
997 	return (bc->bc_sectsz);
998 }
999 
1000 void
1001 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
1002 {
1003 
1004 	assert(bc->bc_magic == BLOCKIF_SIG);
1005 	*size = bc->bc_psectsz;
1006 	*off = bc->bc_psectoff;
1007 }
1008 
1009 int
1010 blockif_queuesz(struct blockif_ctxt *bc)
1011 {
1012 
1013 	assert(bc->bc_magic == BLOCKIF_SIG);
1014 	return (BLOCKIF_MAXREQ - 1);
1015 }
1016 
1017 int
1018 blockif_is_ro(struct blockif_ctxt *bc)
1019 {
1020 
1021 	assert(bc->bc_magic == BLOCKIF_SIG);
1022 	return (bc->bc_rdonly);
1023 }
1024 
1025 int
1026 blockif_candelete(struct blockif_ctxt *bc)
1027 {
1028 
1029 	assert(bc->bc_magic == BLOCKIF_SIG);
1030 	return (bc->bc_candelete);
1031 }
1032 
1033 #ifndef __FreeBSD__
1034 int
1035 blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
1036 {
1037 	int res = 0, flags;
1038 	int clean_val = (wc_enable != 0) ? 1 : 0;
1039 
1040 	(void) pthread_mutex_lock(&bc->bc_mtx);
1041 	switch (bc->bc_wce) {
1042 	case WCE_IOCTL:
1043 		res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
1044 		break;
1045 	case WCE_FCNTL:
1046 		if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
1047 			if (wc_enable == 0) {
1048 				flags |= O_DSYNC;
1049 			} else {
1050 				flags &= ~O_DSYNC;
1051 			}
1052 			if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
1053 				res = -1;
1054 			}
1055 		} else {
1056 			res = -1;
1057 		}
1058 		break;
1059 	default:
1060 		break;
1061 	}
1062 
1063 	/*
1064 	 * After a successful disable of the write cache, ensure that any
1065 	 * lingering data in the cache is synced out.
1066 	 */
1067 	if (res == 0 && wc_enable == 0) {
1068 		res = fsync(bc->bc_fd);
1069 	}
1070 	(void) pthread_mutex_unlock(&bc->bc_mtx);
1071 
1072 	return (res);
1073 }
1074 #endif /* __FreeBSD__ */
1075