xref: /freebsd/usr.sbin/bhyve/block_if.c (revision ff0ba87247820afbdfdc1b307c803f7923d0e4d3)
1 /*-
2  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/queue.h>
34 #include <sys/errno.h>
35 #include <sys/stat.h>
36 #include <sys/ioctl.h>
37 #include <sys/disk.h>
38 
39 #include <assert.h>
40 #include <fcntl.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <pthread.h>
45 #include <pthread_np.h>
46 #include <signal.h>
47 #include <unistd.h>
48 
49 #include <machine/atomic.h>
50 
51 #include "bhyverun.h"
52 #include "mevent.h"
53 #include "block_if.h"
54 
/*
 * Magic value stored in bc_magic while a context is open; the public
 * entry points assert on it before touching the context.
 */
#define BLOCKIF_SIG	0xb109b109

/* Number of request elements preallocated in every context. */
#define BLOCKIF_MAXREQ	33
58 
/* Operation carried by a queued request element. */
enum blockop {
	BOP_READ = 0,	/* vectored read from the backing fd */
	BOP_WRITE = 1,	/* vectored write to the backing fd */
	BOP_FLUSH = 2	/* flush request */
};
64 
/* Life-cycle state of a request element. */
enum blockstat {
	BST_FREE = 0,	/* on the free queue, available for a new request */
	BST_PEND = 1,	/* on the pending queue, not yet picked up */
	BST_BUSY = 2,	/* on the busy queue, being processed */
	BST_DONE = 3	/* processing finished, not yet returned to free */
};
71 
72 struct blockif_elem {
73 	TAILQ_ENTRY(blockif_elem) be_link;
74 	struct blockif_req  *be_req;
75 	enum blockop	     be_op;
76 	enum blockstat	     be_status;
77 	pthread_t            be_tid;
78 };
79 
80 struct blockif_ctxt {
81 	int			bc_magic;
82 	int			bc_fd;
83 	int			bc_rdonly;
84 	off_t			bc_size;
85 	int			bc_sectsz;
86 	pthread_t		bc_btid;
87         pthread_mutex_t		bc_mtx;
88         pthread_cond_t		bc_cond;
89 	int			bc_closing;
90 
91 	/* Request elements and free/pending/busy queues */
92 	TAILQ_HEAD(, blockif_elem) bc_freeq;
93 	TAILQ_HEAD(, blockif_elem) bc_pendq;
94 	TAILQ_HEAD(, blockif_elem) bc_busyq;
95 	u_int			bc_req_count;
96 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
97 };
98 
/* Guards the one-time registration of the SIGCONT handler. */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Record pushed on a global list by a thread that is waiting for the
 * SIGCONT handler to run.  The handler clears bse_pending and signals
 * bse_cond on every record it pops.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;	/* protects bse_pending */
	pthread_cond_t			bse_cond;	/* waiter sleeps here */
	int				bse_pending;	/* 1 until handler ran */
	struct blockif_sig_elem		*bse_next;	/* next record on list */
};

/* Head of the singly linked list of waiting records; updated with CAS. */
static struct blockif_sig_elem *blockif_bse_head;
109 
110 static int
111 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
112 		enum blockop op)
113 {
114 	struct blockif_elem *be;
115 
116 	assert(bc->bc_req_count < BLOCKIF_MAXREQ);
117 
118 	be = TAILQ_FIRST(&bc->bc_freeq);
119 	assert(be != NULL);
120 	assert(be->be_status == BST_FREE);
121 
122 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
123 	be->be_status = BST_PEND;
124 	be->be_req = breq;
125 	be->be_op = op;
126 	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
127 
128 	bc->bc_req_count++;
129 
130 	return (0);
131 }
132 
133 static int
134 blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
135 {
136 	struct blockif_elem *be;
137 
138 	if (bc->bc_req_count == 0)
139 		return (ENOENT);
140 
141 	be = TAILQ_FIRST(&bc->bc_pendq);
142 	assert(be != NULL);
143 	assert(be->be_status == BST_PEND);
144 	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
145 	be->be_status = BST_BUSY;
146 	be->be_tid = bc->bc_btid;
147 	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
148 
149 	*bep = be;
150 
151 	return (0);
152 }
153 
154 static void
155 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
156 {
157 	assert(be->be_status == BST_DONE);
158 
159 	TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
160 	be->be_tid = 0;
161 	be->be_status = BST_FREE;
162 	be->be_req = NULL;
163 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
164 
165 	bc->bc_req_count--;
166 }
167 
168 static void
169 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
170 {
171 	struct blockif_req *br;
172 	int err;
173 
174 	br = be->be_req;
175 	err = 0;
176 
177 	switch (be->be_op) {
178 	case BOP_READ:
179 		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
180 			   br->br_offset) < 0)
181 			err = errno;
182 		break;
183 	case BOP_WRITE:
184 		if (bc->bc_rdonly)
185 			err = EROFS;
186 		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
187 			     br->br_offset) < 0)
188 			err = errno;
189 		break;
190 	case BOP_FLUSH:
191 		break;
192 	default:
193 		err = EINVAL;
194 		break;
195 	}
196 
197 	be->be_status = BST_DONE;
198 
199 	(*br->br_callback)(br, err);
200 }
201 
202 static void *
203 blockif_thr(void *arg)
204 {
205 	struct blockif_ctxt *bc;
206 	struct blockif_elem *be;
207 
208 	bc = arg;
209 
210 	for (;;) {
211 		pthread_mutex_lock(&bc->bc_mtx);
212 		while (!blockif_dequeue(bc, &be)) {
213 			pthread_mutex_unlock(&bc->bc_mtx);
214 			blockif_proc(bc, be);
215 			pthread_mutex_lock(&bc->bc_mtx);
216 			blockif_complete(bc, be);
217 		}
218 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
219 		pthread_mutex_unlock(&bc->bc_mtx);
220 
221 		/*
222 		 * Check ctxt status here to see if exit requested
223 		 */
224 		if (bc->bc_closing)
225 			pthread_exit(NULL);
226 	}
227 
228 	/* Not reached */
229 	return (NULL);
230 }
231 
232 static void
233 blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
234 {
235 	struct blockif_sig_elem *bse;
236 
237 	for (;;) {
238 		/*
239 		 * Process the entire list even if not intended for
240 		 * this thread.
241 		 */
242 		do {
243 			bse = blockif_bse_head;
244 			if (bse == NULL)
245 				return;
246 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
247 					    (uintptr_t)bse,
248 					    (uintptr_t)bse->bse_next));
249 
250 		pthread_mutex_lock(&bse->bse_mtx);
251 		bse->bse_pending = 0;
252 		pthread_cond_signal(&bse->bse_cond);
253 		pthread_mutex_unlock(&bse->bse_mtx);
254 	}
255 }
256 
257 static void
258 blockif_init(void)
259 {
260 	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
261 	(void) signal(SIGCONT, SIG_IGN);
262 }
263 
264 struct blockif_ctxt *
265 blockif_open(const char *optstr, const char *ident)
266 {
267 	char tname[MAXCOMLEN + 1];
268 	char *nopt, *xopts;
269 	struct blockif_ctxt *bc;
270 	struct stat sbuf;
271 	off_t size;
272 	int extra, fd, i, sectsz;
273 	int nocache, sync, ro;
274 
275 	pthread_once(&blockif_once, blockif_init);
276 
277 	nocache = 0;
278 	sync = 0;
279 	ro = 0;
280 
281 	/*
282 	 * The first element in the optstring is always a pathname.
283 	 * Optional elements follow
284 	 */
285 	nopt = strdup(optstr);
286 	for (xopts = strtok(nopt, ",");
287 	     xopts != NULL;
288 	     xopts = strtok(NULL, ",")) {
289 		if (!strcmp(xopts, "nocache"))
290 			nocache = 1;
291 		else if (!strcmp(xopts, "sync"))
292 			sync = 1;
293 		else if (!strcmp(xopts, "ro"))
294 			ro = 1;
295 	}
296 
297 	extra = 0;
298 	if (nocache)
299 		extra |= O_DIRECT;
300 	if (sync)
301 		extra |= O_SYNC;
302 
303 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
304 	if (fd < 0 && !ro) {
305 		/* Attempt a r/w fail with a r/o open */
306 		fd = open(nopt, O_RDONLY | extra);
307 		ro = 1;
308 	}
309 
310 	if (fd < 0) {
311 		perror("Could not open backing file");
312 		return (NULL);
313 	}
314 
315         if (fstat(fd, &sbuf) < 0) {
316                 perror("Could not stat backing file");
317                 close(fd);
318                 return (NULL);
319         }
320 
321         /*
322 	 * Deal with raw devices
323 	 */
324         size = sbuf.st_size;
325 	sectsz = DEV_BSIZE;
326 	if (S_ISCHR(sbuf.st_mode)) {
327 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
328 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
329 			perror("Could not fetch dev blk/sector size");
330 			close(fd);
331 			return (NULL);
332 		}
333 		assert(size != 0);
334 		assert(sectsz != 0);
335 	}
336 
337 	bc = calloc(1, sizeof(struct blockif_ctxt));
338 	if (bc == NULL) {
339 		close(fd);
340 		return (NULL);
341 	}
342 
343 	bc->bc_magic = BLOCKIF_SIG;
344 	bc->bc_fd = fd;
345 	bc->bc_rdonly = ro;
346 	bc->bc_size = size;
347 	bc->bc_sectsz = sectsz;
348 	pthread_mutex_init(&bc->bc_mtx, NULL);
349 	pthread_cond_init(&bc->bc_cond, NULL);
350 	TAILQ_INIT(&bc->bc_freeq);
351 	TAILQ_INIT(&bc->bc_pendq);
352 	TAILQ_INIT(&bc->bc_busyq);
353 	bc->bc_req_count = 0;
354 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
355 		bc->bc_reqs[i].be_status = BST_FREE;
356 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
357 	}
358 
359 	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);
360 
361 	snprintf(tname, sizeof(tname), "blk-%s", ident);
362 	pthread_set_name_np(bc->bc_btid, tname);
363 
364 	return (bc);
365 }
366 
367 static int
368 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
369 		enum blockop op)
370 {
371 	int err;
372 
373 	err = 0;
374 
375 	pthread_mutex_lock(&bc->bc_mtx);
376 	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
377 		/*
378 		 * Enqueue and inform the block i/o thread
379 		 * that there is work available
380 		 */
381 		blockif_enqueue(bc, breq, op);
382 		pthread_cond_signal(&bc->bc_cond);
383 	} else {
384 		/*
385 		 * Callers are not allowed to enqueue more than
386 		 * the specified blockif queue limit. Return an
387 		 * error to indicate that the queue length has been
388 		 * exceeded.
389 		 */
390 		err = E2BIG;
391 	}
392 	pthread_mutex_unlock(&bc->bc_mtx);
393 
394 	return (err);
395 }
396 
397 int
398 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
399 {
400 
401 	assert(bc->bc_magic == BLOCKIF_SIG);
402 	return (blockif_request(bc, breq, BOP_READ));
403 }
404 
405 int
406 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
407 {
408 
409 	assert(bc->bc_magic == BLOCKIF_SIG);
410 	return (blockif_request(bc, breq, BOP_WRITE));
411 }
412 
413 int
414 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
415 {
416 
417 	assert(bc->bc_magic == BLOCKIF_SIG);
418 	return (blockif_request(bc, breq, BOP_FLUSH));
419 }
420 
421 int
422 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
423 {
424 	struct blockif_elem *be;
425 
426 	assert(bc->bc_magic == BLOCKIF_SIG);
427 
428 	pthread_mutex_lock(&bc->bc_mtx);
429 	/*
430 	 * Check pending requests.
431 	 */
432 	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
433 		if (be->be_req == breq)
434 			break;
435 	}
436 	if (be != NULL) {
437 		/*
438 		 * Found it.
439 		 */
440 		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
441 		be->be_status = BST_FREE;
442 		be->be_req = NULL;
443 		TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
444 		bc->bc_req_count--;
445 		pthread_mutex_unlock(&bc->bc_mtx);
446 
447 		return (0);
448 	}
449 
450 	/*
451 	 * Check in-flight requests.
452 	 */
453 	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
454 		if (be->be_req == breq)
455 			break;
456 	}
457 	if (be == NULL) {
458 		/*
459 		 * Didn't find it.
460 		 */
461 		pthread_mutex_unlock(&bc->bc_mtx);
462 		return (EINVAL);
463 	}
464 
465 	/*
466 	 * Interrupt the processing thread to force it return
467 	 * prematurely via it's normal callback path.
468 	 */
469 	while (be->be_status == BST_BUSY) {
470 		struct blockif_sig_elem bse, *old_head;
471 
472 		pthread_mutex_init(&bse.bse_mtx, NULL);
473 		pthread_cond_init(&bse.bse_cond, NULL);
474 
475 		bse.bse_pending = 1;
476 
477 		do {
478 			old_head = blockif_bse_head;
479 			bse.bse_next = old_head;
480 		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
481 					    (uintptr_t)old_head,
482 					    (uintptr_t)&bse));
483 
484 		pthread_kill(be->be_tid, SIGCONT);
485 
486 		pthread_mutex_lock(&bse.bse_mtx);
487 		while (bse.bse_pending)
488 			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
489 		pthread_mutex_unlock(&bse.bse_mtx);
490 	}
491 
492 	pthread_mutex_unlock(&bc->bc_mtx);
493 
494 	/*
495 	 * The processing thread has been interrupted.  Since it's not
496 	 * clear if the callback has been invoked yet, return EBUSY.
497 	 */
498 	return (EBUSY);
499 }
500 
501 int
502 blockif_close(struct blockif_ctxt *bc)
503 {
504 	void *jval;
505 	int err;
506 
507 	err = 0;
508 
509 	assert(bc->bc_magic == BLOCKIF_SIG);
510 
511 	/*
512 	 * Stop the block i/o thread
513 	 */
514 	bc->bc_closing = 1;
515 	pthread_cond_signal(&bc->bc_cond);
516 	pthread_join(bc->bc_btid, &jval);
517 
518 	/* XXX Cancel queued i/o's ??? */
519 
520 	/*
521 	 * Release resources
522 	 */
523 	bc->bc_magic = 0;
524 	close(bc->bc_fd);
525 	free(bc);
526 
527 	return (0);
528 }
529 
530 /*
531  * Return virtual C/H/S values for a given block. Use the algorithm
532  * outlined in the VHD specification to calculate values.
533  */
534 void
535 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
536 {
537 	off_t sectors;		/* total sectors of the block dev */
538 	off_t hcyl;		/* cylinders times heads */
539 	uint16_t secpt;		/* sectors per track */
540 	uint8_t heads;
541 
542 	assert(bc->bc_magic == BLOCKIF_SIG);
543 
544 	sectors = bc->bc_size / bc->bc_sectsz;
545 
546 	/* Clamp the size to the largest possible with CHS */
547 	if (sectors > 65535UL*16*255)
548 		sectors = 65535UL*16*255;
549 
550 	if (sectors >= 65536UL*16*63) {
551 		secpt = 255;
552 		heads = 16;
553 		hcyl = sectors / secpt;
554 	} else {
555 		secpt = 17;
556 		hcyl = sectors / secpt;
557 		heads = (hcyl + 1023) / 1024;
558 
559 		if (heads < 4)
560 			heads = 4;
561 
562 		if (hcyl >= (heads * 1024) || heads > 16) {
563 			secpt = 31;
564 			heads = 16;
565 			hcyl = sectors / secpt;
566 		}
567 		if (hcyl >= (heads * 1024)) {
568 			secpt = 63;
569 			heads = 16;
570 			hcyl = sectors / secpt;
571 		}
572 	}
573 
574 	*c = hcyl / heads;
575 	*h = heads;
576 	*s = secpt;
577 }
578 
579 /*
580  * Accessors
581  */
582 off_t
583 blockif_size(struct blockif_ctxt *bc)
584 {
585 
586 	assert(bc->bc_magic == BLOCKIF_SIG);
587 	return (bc->bc_size);
588 }
589 
590 int
591 blockif_sectsz(struct blockif_ctxt *bc)
592 {
593 
594 	assert(bc->bc_magic == BLOCKIF_SIG);
595 	return (bc->bc_sectsz);
596 }
597 
598 int
599 blockif_queuesz(struct blockif_ctxt *bc)
600 {
601 
602 	assert(bc->bc_magic == BLOCKIF_SIG);
603 	return (BLOCKIF_MAXREQ - 1);
604 }
605 
606 int
607 blockif_is_ro(struct blockif_ctxt *bc)
608 {
609 
610 	assert(bc->bc_magic == BLOCKIF_SIG);
611 	return (bc->bc_rdonly);
612 }
613