xref: /freebsd/usr.sbin/bhyve/block_if.c (revision 26a222dc0c048fc071b548eadad7b80405a1b126)
1 /*-
2  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/queue.h>
34 #include <sys/errno.h>
35 #include <sys/stat.h>
36 #include <sys/ioctl.h>
37 #include <sys/disk.h>
38 
39 #include <assert.h>
40 #include <fcntl.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <pthread.h>
45 #include <pthread_np.h>
46 #include <signal.h>
47 #include <unistd.h>
48 
49 #include <machine/atomic.h>
50 
51 #include "bhyverun.h"
52 #include "mevent.h"
53 #include "block_if.h"
54 
55 #define BLOCKIF_SIG	0xb109b109
56 
57 #define BLOCKIF_MAXREQ	33
58 
/* Operations a blockif request can carry. */
enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH
};
64 
/* Lifecycle of a request element: free -> pending -> busy -> done -> free. */
enum blockstat {
	BST_FREE,	/* on bc_freeq, not associated with any request */
	BST_PEND,	/* on bc_pendq, waiting for the worker thread */
	BST_BUSY,	/* on bc_busyq, being processed by the worker */
	BST_DONE	/* i/o finished; about to be recycled to the free queue */
};
71 
/*
 * Per-request tracking element.  Each element lives on exactly one of
 * the free/pending/busy queues of its owning blockif_ctxt.
 */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;	/* caller's request; NULL while free */
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;	/* worker thread id while BST_BUSY, else 0 */
};
79 
struct blockif_ctxt {
	int			bc_magic;	/* BLOCKIF_SIG while valid; zeroed on close */
	int			bc_fd;		/* fd of the backing file or char device */
	int			bc_ischr;	/* non-zero if backed by a character device */
	int			bc_rdonly;	/* non-zero if opened (or fell back to) r/o */
	off_t			bc_size;	/* media size in bytes */
	int			bc_sectsz;	/* logical sector size in bytes */
	int			bc_psectsz;	/* physical (stripe) sector size, 0 if unknown */
	int			bc_psectoff;	/* offset of physical sectors (stripe offset) */
	pthread_t		bc_btid;	/* block i/o worker thread */
	pthread_mutex_t		bc_mtx;		/* protects queues, bc_req_count, condvar */
	pthread_cond_t		bc_cond;	/* signalled when work is queued / on close */
	int			bc_closing;	/* set by blockif_close() to stop the worker */

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	u_int			bc_req_count;	/* elements on bc_pendq + bc_busyq */
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
101 
102 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
103 
/*
 * Node in the lock-free singly-linked list (headed by blockif_bse_head)
 * that blockif_cancel() uses to wait for its SIGCONT interruption to be
 * acknowledged by the handler running on the mevent thread.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;	/* cleared by the SIGCONT handler */
	struct blockif_sig_elem		*bse_next;
};
110 
111 static struct blockif_sig_elem *blockif_bse_head;
112 
113 static int
114 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
115 		enum blockop op)
116 {
117 	struct blockif_elem *be;
118 
119 	assert(bc->bc_req_count < BLOCKIF_MAXREQ);
120 
121 	be = TAILQ_FIRST(&bc->bc_freeq);
122 	assert(be != NULL);
123 	assert(be->be_status == BST_FREE);
124 
125 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
126 	be->be_status = BST_PEND;
127 	be->be_req = breq;
128 	be->be_op = op;
129 	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
130 
131 	bc->bc_req_count++;
132 
133 	return (0);
134 }
135 
136 static int
137 blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
138 {
139 	struct blockif_elem *be;
140 
141 	if (bc->bc_req_count == 0)
142 		return (ENOENT);
143 
144 	be = TAILQ_FIRST(&bc->bc_pendq);
145 	assert(be != NULL);
146 	assert(be->be_status == BST_PEND);
147 	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
148 	be->be_status = BST_BUSY;
149 	be->be_tid = bc->bc_btid;
150 	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
151 
152 	*bep = be;
153 
154 	return (0);
155 }
156 
157 static void
158 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
159 {
160 	assert(be->be_status == BST_DONE);
161 
162 	TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
163 	be->be_tid = 0;
164 	be->be_status = BST_FREE;
165 	be->be_req = NULL;
166 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
167 
168 	bc->bc_req_count--;
169 }
170 
/*
 * Perform the i/o described by 'be' synchronously on the calling
 * (worker) thread, then mark the element BST_DONE and invoke the
 * request's completion callback with 0 or an errno value.
 *
 * be_status is set to BST_DONE *before* the callback runs, so that
 * blockif_cancel(), which spins while the element is BST_BUSY, can
 * observe completion.
 */
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_req *br;
	int err;

	br = be->be_req;
	err = 0;

	switch (be->be_op) {
	case BOP_READ:
		/* NOTE(review): a short preadv() is not detected here;
		 * only an outright failure (< 0) is reported. */
		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			   br->br_offset) < 0)
			err = errno;
		break;
	case BOP_WRITE:
		/* Reject writes to media opened read-only. */
		if (bc->bc_rdonly)
			err = EROFS;
		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			     br->br_offset) < 0)
			err = errno;
		break;
	case BOP_FLUSH:
		/* Character devices flush via ioctl; regular files via fsync. */
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}
209 
/*
 * Block i/o worker thread.  Drains the pending queue — dropping bc_mtx
 * around each (blocking) i/o — then sleeps on bc_cond until more work
 * arrives or blockif_close() requests an exit.
 */
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;

	bc = arg;

	for (;;) {
		pthread_mutex_lock(&bc->bc_mtx);
		/* blockif_dequeue() returns 0 while work remains queued. */
		while (!blockif_dequeue(bc, &be)) {
			/* Release the lock for the i/o itself. */
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
		pthread_mutex_unlock(&bc->bc_mtx);

		/*
		 * Check ctxt status here to see if exit requested
		 */
		/* NOTE(review): bc_closing is read without bc_mtx held;
		 * correctness depends on the close path's wakeup ending
		 * the cond_wait above — verify close-path locking. */
		if (bc->bc_closing)
			pthread_exit(NULL);
	}

	/* Not reached */
	return (NULL);
}
239 
/*
 * SIGCONT event handler, run on the mevent thread.  Atomically pops
 * every waiter off the global blockif_bse_head list (lock-free CAS
 * loop) and wakes each one; waiters re-check their own condition, so
 * waking entries destined for other cancellations is harmless.
 */
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		/* Wake the waiter parked in blockif_cancel(). */
		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}
264 
/*
 * One-time module initialization: deliver SIGCONT through the mevent
 * loop (used by blockif_cancel() to interrupt in-flight i/o) and
 * ignore its default disposition.
 */
static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}
271 
272 struct blockif_ctxt *
273 blockif_open(const char *optstr, const char *ident)
274 {
275 	char tname[MAXCOMLEN + 1];
276 	char *nopt, *xopts;
277 	struct blockif_ctxt *bc;
278 	struct stat sbuf;
279 	off_t size, psectsz, psectoff;
280 	int extra, fd, i, sectsz;
281 	int nocache, sync, ro;
282 
283 	pthread_once(&blockif_once, blockif_init);
284 
285 	nocache = 0;
286 	sync = 0;
287 	ro = 0;
288 
289 	/*
290 	 * The first element in the optstring is always a pathname.
291 	 * Optional elements follow
292 	 */
293 	nopt = strdup(optstr);
294 	for (xopts = strtok(nopt, ",");
295 	     xopts != NULL;
296 	     xopts = strtok(NULL, ",")) {
297 		if (!strcmp(xopts, "nocache"))
298 			nocache = 1;
299 		else if (!strcmp(xopts, "sync"))
300 			sync = 1;
301 		else if (!strcmp(xopts, "ro"))
302 			ro = 1;
303 	}
304 
305 	extra = 0;
306 	if (nocache)
307 		extra |= O_DIRECT;
308 	if (sync)
309 		extra |= O_SYNC;
310 
311 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
312 	if (fd < 0 && !ro) {
313 		/* Attempt a r/w fail with a r/o open */
314 		fd = open(nopt, O_RDONLY | extra);
315 		ro = 1;
316 	}
317 
318 	if (fd < 0) {
319 		perror("Could not open backing file");
320 		return (NULL);
321 	}
322 
323         if (fstat(fd, &sbuf) < 0) {
324                 perror("Could not stat backing file");
325                 close(fd);
326                 return (NULL);
327         }
328 
329         /*
330 	 * Deal with raw devices
331 	 */
332         size = sbuf.st_size;
333 	sectsz = DEV_BSIZE;
334 	psectsz = psectoff = 0;
335 	if (S_ISCHR(sbuf.st_mode)) {
336 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
337 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
338 			perror("Could not fetch dev blk/sector size");
339 			close(fd);
340 			return (NULL);
341 		}
342 		assert(size != 0);
343 		assert(sectsz != 0);
344 		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
345 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
346 	} else
347 		psectsz = sbuf.st_blksize;
348 
349 	bc = calloc(1, sizeof(struct blockif_ctxt));
350 	if (bc == NULL) {
351 		close(fd);
352 		return (NULL);
353 	}
354 
355 	bc->bc_magic = BLOCKIF_SIG;
356 	bc->bc_fd = fd;
357 	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
358 	bc->bc_rdonly = ro;
359 	bc->bc_size = size;
360 	bc->bc_sectsz = sectsz;
361 	bc->bc_psectsz = psectsz;
362 	bc->bc_psectoff = psectoff;
363 	pthread_mutex_init(&bc->bc_mtx, NULL);
364 	pthread_cond_init(&bc->bc_cond, NULL);
365 	TAILQ_INIT(&bc->bc_freeq);
366 	TAILQ_INIT(&bc->bc_pendq);
367 	TAILQ_INIT(&bc->bc_busyq);
368 	bc->bc_req_count = 0;
369 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
370 		bc->bc_reqs[i].be_status = BST_FREE;
371 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
372 	}
373 
374 	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);
375 
376 	snprintf(tname, sizeof(tname), "blk-%s", ident);
377 	pthread_set_name_np(bc->bc_btid, tname);
378 
379 	return (bc);
380 }
381 
382 static int
383 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
384 		enum blockop op)
385 {
386 	int err;
387 
388 	err = 0;
389 
390 	pthread_mutex_lock(&bc->bc_mtx);
391 	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
392 		/*
393 		 * Enqueue and inform the block i/o thread
394 		 * that there is work available
395 		 */
396 		blockif_enqueue(bc, breq, op);
397 		pthread_cond_signal(&bc->bc_cond);
398 	} else {
399 		/*
400 		 * Callers are not allowed to enqueue more than
401 		 * the specified blockif queue limit. Return an
402 		 * error to indicate that the queue length has been
403 		 * exceeded.
404 		 */
405 		err = E2BIG;
406 	}
407 	pthread_mutex_unlock(&bc->bc_mtx);
408 
409 	return (err);
410 }
411 
412 int
413 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
414 {
415 
416 	assert(bc->bc_magic == BLOCKIF_SIG);
417 	return (blockif_request(bc, breq, BOP_READ));
418 }
419 
420 int
421 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
422 {
423 
424 	assert(bc->bc_magic == BLOCKIF_SIG);
425 	return (blockif_request(bc, breq, BOP_WRITE));
426 }
427 
428 int
429 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
430 {
431 
432 	assert(bc->bc_magic == BLOCKIF_SIG);
433 	return (blockif_request(bc, breq, BOP_FLUSH));
434 }
435 
/*
 * Attempt to cancel an outstanding request.
 *
 * Returns:
 *   0      - the request was still pending; it has been removed and
 *            its callback will never run.
 *   EINVAL - the request is unknown (e.g. already completed).
 *   EBUSY  - the request was in flight; the worker thread has been
 *            interrupted with SIGCONT and waited on until the element
 *            left BST_BUSY, but the completion callback may or may
 *            not have been invoked.
 */
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		/* Recycle the element directly; the worker never saw it. */
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
		be->be_status = BST_FREE;
		be->be_req = NULL;
		TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
		bc->bc_req_count--;
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it return
	 * prematurely via it's normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		/* Lock-free push of this stack-local waiter onto the
		 * global list consumed by blockif_sigcont_handler(). */
		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		/* Wait until the handler acknowledges this waiter. */
		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}
515 
516 int
517 blockif_close(struct blockif_ctxt *bc)
518 {
519 	void *jval;
520 	int err;
521 
522 	err = 0;
523 
524 	assert(bc->bc_magic == BLOCKIF_SIG);
525 
526 	/*
527 	 * Stop the block i/o thread
528 	 */
529 	bc->bc_closing = 1;
530 	pthread_cond_signal(&bc->bc_cond);
531 	pthread_join(bc->bc_btid, &jval);
532 
533 	/* XXX Cancel queued i/o's ??? */
534 
535 	/*
536 	 * Release resources
537 	 */
538 	bc->bc_magic = 0;
539 	close(bc->bc_fd);
540 	free(bc);
541 
542 	return (0);
543 }
544 
545 /*
546  * Return virtual C/H/S values for a given block. Use the algorithm
547  * outlined in the VHD specification to calculate values.
548  */
549 void
550 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
551 {
552 	off_t sectors;		/* total sectors of the block dev */
553 	off_t hcyl;		/* cylinders times heads */
554 	uint16_t secpt;		/* sectors per track */
555 	uint8_t heads;
556 
557 	assert(bc->bc_magic == BLOCKIF_SIG);
558 
559 	sectors = bc->bc_size / bc->bc_sectsz;
560 
561 	/* Clamp the size to the largest possible with CHS */
562 	if (sectors > 65535UL*16*255)
563 		sectors = 65535UL*16*255;
564 
565 	if (sectors >= 65536UL*16*63) {
566 		secpt = 255;
567 		heads = 16;
568 		hcyl = sectors / secpt;
569 	} else {
570 		secpt = 17;
571 		hcyl = sectors / secpt;
572 		heads = (hcyl + 1023) / 1024;
573 
574 		if (heads < 4)
575 			heads = 4;
576 
577 		if (hcyl >= (heads * 1024) || heads > 16) {
578 			secpt = 31;
579 			heads = 16;
580 			hcyl = sectors / secpt;
581 		}
582 		if (hcyl >= (heads * 1024)) {
583 			secpt = 63;
584 			heads = 16;
585 			hcyl = sectors / secpt;
586 		}
587 	}
588 
589 	*c = hcyl / heads;
590 	*h = heads;
591 	*s = secpt;
592 }
593 
594 /*
595  * Accessors
596  */
597 off_t
598 blockif_size(struct blockif_ctxt *bc)
599 {
600 
601 	assert(bc->bc_magic == BLOCKIF_SIG);
602 	return (bc->bc_size);
603 }
604 
605 int
606 blockif_sectsz(struct blockif_ctxt *bc)
607 {
608 
609 	assert(bc->bc_magic == BLOCKIF_SIG);
610 	return (bc->bc_sectsz);
611 }
612 
613 void
614 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
615 {
616 
617 	assert(bc->bc_magic == BLOCKIF_SIG);
618 	*size = bc->bc_psectsz;
619 	*off = bc->bc_psectoff;
620 }
621 
622 int
623 blockif_queuesz(struct blockif_ctxt *bc)
624 {
625 
626 	assert(bc->bc_magic == BLOCKIF_SIG);
627 	return (BLOCKIF_MAXREQ - 1);
628 }
629 
630 int
631 blockif_is_ro(struct blockif_ctxt *bc)
632 {
633 
634 	assert(bc->bc_magic == BLOCKIF_SIG);
635 	return (bc->bc_rdonly);
636 }
637