xref: /freebsd/usr.sbin/bhyve/block_if.c (revision 3823d5e198425b4f5e5a80267d195769d1063773)
1 /*-
2  * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/queue.h>
34 #include <sys/errno.h>
35 #include <sys/stat.h>
36 #include <sys/ioctl.h>
37 #include <sys/disk.h>
38 
39 #include <assert.h>
40 #include <fcntl.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <pthread.h>
45 #include <pthread_np.h>
46 #include <unistd.h>
47 
48 #include "bhyverun.h"
49 #include "block_if.h"
50 
/* Magic number kept in bc_magic while a block context is valid */
#define BLOCKIF_SIG	0xb109b109

/* Number of request elements (and thus queued requests) per context */
#define BLOCKIF_MAXREQ	32
54 
/* Kind of operation attached to a queued request element */
enum blockop {
	BOP_READ,	/* read from the backing store */
	BOP_WRITE,	/* write to the backing store */
	BOP_FLUSH	/* flush request */
};
60 
/* Which list a request element currently belongs to */
enum blockstat {
	BST_FREE,	/* element sits on the free list */
	BST_INUSE	/* element carries a queued request */
};
65 
66 struct blockif_elem {
67 	TAILQ_ENTRY(blockif_elem) be_link;
68 	struct blockif_req  *be_req;
69 	enum blockop	     be_op;
70 	enum blockstat	     be_status;
71 };
72 
73 struct blockif_ctxt {
74 	int			bc_magic;
75 	int			bc_fd;
76 	int			bc_rdonly;
77 	off_t			bc_size;
78 	int			bc_sectsz;
79 	pthread_t		bc_btid;
80         pthread_mutex_t		bc_mtx;
81         pthread_cond_t		bc_cond;
82 	int			bc_closing;
83 
84 	/* Request elements and free/inuse queues */
85 	TAILQ_HEAD(, blockif_elem) bc_freeq;
86 	TAILQ_HEAD(, blockif_elem) bc_inuseq;
87 	u_int			bc_req_count;
88 	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
89 };
90 
91 static int
92 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
93 		enum blockop op)
94 {
95 	struct blockif_elem *be;
96 
97 	assert(bc->bc_req_count < BLOCKIF_MAXREQ);
98 
99 	be = TAILQ_FIRST(&bc->bc_freeq);
100 	assert(be != NULL);
101 	assert(be->be_status == BST_FREE);
102 
103 	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
104 	be->be_status = BST_INUSE;
105 	be->be_req = breq;
106 	be->be_op = op;
107 	TAILQ_INSERT_TAIL(&bc->bc_inuseq, be, be_link);
108 
109 	bc->bc_req_count++;
110 
111 	return (0);
112 }
113 
114 static int
115 blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem *el)
116 {
117 	struct blockif_elem *be;
118 
119 	if (bc->bc_req_count == 0)
120 		return (ENOENT);
121 
122 	be = TAILQ_FIRST(&bc->bc_inuseq);
123 	assert(be != NULL);
124 	assert(be->be_status == BST_INUSE);
125 	*el = *be;
126 
127 	TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
128 	be->be_status = BST_FREE;
129 	be->be_req = NULL;
130 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
131 
132 	bc->bc_req_count--;
133 
134 	return (0);
135 }
136 
137 static void
138 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
139 {
140 	struct blockif_req *br;
141 	int err;
142 
143 	br = be->be_req;
144 	err = 0;
145 
146 	switch (be->be_op) {
147 	case BOP_READ:
148 		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
149 			   br->br_offset) < 0)
150 			err = errno;
151 		break;
152 	case BOP_WRITE:
153 		if (bc->bc_rdonly)
154 			err = EROFS;
155 		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
156 			     br->br_offset) < 0)
157 			err = errno;
158 		break;
159 	case BOP_FLUSH:
160 		break;
161 	default:
162 		err = EINVAL;
163 		break;
164 	}
165 
166 	(*br->br_callback)(br, err);
167 }
168 
169 static void *
170 blockif_thr(void *arg)
171 {
172 	struct blockif_ctxt *bc;
173 	struct blockif_elem req;
174 
175 	bc = arg;
176 
177 	for (;;) {
178 		pthread_mutex_lock(&bc->bc_mtx);
179 		while (!blockif_dequeue(bc, &req)) {
180 			pthread_mutex_unlock(&bc->bc_mtx);
181 			blockif_proc(bc, &req);
182 			pthread_mutex_lock(&bc->bc_mtx);
183 		}
184 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
185 		pthread_mutex_unlock(&bc->bc_mtx);
186 
187 		/*
188 		 * Check ctxt status here to see if exit requested
189 		 */
190 		if (bc->bc_closing)
191 			pthread_exit(NULL);
192 	}
193 
194 	/* Not reached */
195 	return (NULL);
196 }
197 
198 struct blockif_ctxt *
199 blockif_open(const char *optstr, const char *ident)
200 {
201 	char tname[MAXCOMLEN + 1];
202 	char *nopt, *xopts;
203 	struct blockif_ctxt *bc;
204 	struct stat sbuf;
205 	off_t size;
206 	int extra, fd, i, sectsz;
207 	int nocache, sync, ro;
208 
209 	nocache = 0;
210 	sync = 0;
211 	ro = 0;
212 
213 	/*
214 	 * The first element in the optstring is always a pathname.
215 	 * Optional elements follow
216 	 */
217 	nopt = strdup(optstr);
218 	for (xopts = strtok(nopt, ",");
219 	     xopts != NULL;
220 	     xopts = strtok(NULL, ",")) {
221 		if (!strcmp(xopts, "nocache"))
222 			nocache = 1;
223 		else if (!strcmp(xopts, "sync"))
224 			sync = 1;
225 		else if (!strcmp(xopts, "ro"))
226 			ro = 1;
227 	}
228 
229 	extra = 0;
230 	if (nocache)
231 		extra |= O_DIRECT;
232 	if (sync)
233 		extra |= O_SYNC;
234 
235 	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
236 	if (fd < 0 && !ro) {
237 		/* Attempt a r/w fail with a r/o open */
238 		fd = open(nopt, O_RDONLY | extra);
239 		ro = 1;
240 	}
241 
242 	if (fd < 0) {
243 		perror("Could not open backing file");
244 		return (NULL);
245 	}
246 
247         if (fstat(fd, &sbuf) < 0) {
248                 perror("Could not stat backing file");
249                 close(fd);
250                 return (NULL);
251         }
252 
253         /*
254 	 * Deal with raw devices
255 	 */
256         size = sbuf.st_size;
257 	sectsz = DEV_BSIZE;
258 	if (S_ISCHR(sbuf.st_mode)) {
259 		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
260 		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
261 			perror("Could not fetch dev blk/sector size");
262 			close(fd);
263 			return (NULL);
264 		}
265 		assert(size != 0);
266 		assert(sectsz != 0);
267 	}
268 
269 	bc = calloc(1, sizeof(struct blockif_ctxt));
270 	if (bc == NULL) {
271 		close(fd);
272 		return (NULL);
273 	}
274 
275 	bc->bc_magic = BLOCKIF_SIG;
276 	bc->bc_fd = fd;
277 	bc->bc_rdonly = ro;
278 	bc->bc_size = size;
279 	bc->bc_sectsz = sectsz;
280 	pthread_mutex_init(&bc->bc_mtx, NULL);
281 	pthread_cond_init(&bc->bc_cond, NULL);
282 	TAILQ_INIT(&bc->bc_freeq);
283 	TAILQ_INIT(&bc->bc_inuseq);
284 	bc->bc_req_count = 0;
285 	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
286 		bc->bc_reqs[i].be_status = BST_FREE;
287 		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
288 	}
289 
290 	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);
291 
292 	snprintf(tname, sizeof(tname), "blk-%s", ident);
293 	pthread_set_name_np(bc->bc_btid, tname);
294 
295 	return (bc);
296 }
297 
298 static int
299 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
300 		enum blockop op)
301 {
302 	int err;
303 
304 	err = 0;
305 
306 	pthread_mutex_lock(&bc->bc_mtx);
307 	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
308 		/*
309 		 * Enqueue and inform the block i/o thread
310 		 * that there is work available
311 		 */
312 		blockif_enqueue(bc, breq, op);
313 		pthread_cond_signal(&bc->bc_cond);
314 	} else {
315 		/*
316 		 * Callers are not allowed to enqueue more than
317 		 * the specified blockif queue limit. Return an
318 		 * error to indicate that the queue length has been
319 		 * exceeded.
320 		 */
321 		err = E2BIG;
322 	}
323 	pthread_mutex_unlock(&bc->bc_mtx);
324 
325 	return (err);
326 }
327 
328 int
329 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
330 {
331 
332 	assert(bc->bc_magic == BLOCKIF_SIG);
333 	return (blockif_request(bc, breq, BOP_READ));
334 }
335 
336 int
337 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
338 {
339 
340 	assert(bc->bc_magic == BLOCKIF_SIG);
341 	return (blockif_request(bc, breq, BOP_WRITE));
342 }
343 
344 int
345 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
346 {
347 
348 	assert(bc->bc_magic == BLOCKIF_SIG);
349 	return (blockif_request(bc, breq, BOP_FLUSH));
350 }
351 
352 int
353 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
354 {
355 	struct blockif_elem *be;
356 
357 	assert(bc->bc_magic == BLOCKIF_SIG);
358 
359 	pthread_mutex_lock(&bc->bc_mtx);
360 	TAILQ_FOREACH(be, &bc->bc_inuseq, be_link) {
361 		if (be->be_req == breq)
362 			break;
363 	}
364 	if (be == NULL) {
365 		pthread_mutex_unlock(&bc->bc_mtx);
366 		return (EINVAL);
367 	}
368 
369 	TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
370 	be->be_status = BST_FREE;
371 	be->be_req = NULL;
372 	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
373 	bc->bc_req_count--;
374 	pthread_mutex_unlock(&bc->bc_mtx);
375 
376 	return (0);
377 }
378 
379 int
380 blockif_close(struct blockif_ctxt *bc)
381 {
382 	void *jval;
383 	int err;
384 
385 	err = 0;
386 
387 	assert(bc->bc_magic == BLOCKIF_SIG);
388 
389 	/*
390 	 * Stop the block i/o thread
391 	 */
392 	bc->bc_closing = 1;
393 	pthread_cond_signal(&bc->bc_cond);
394 	pthread_join(bc->bc_btid, &jval);
395 
396 	/* XXX Cancel queued i/o's ??? */
397 
398 	/*
399 	 * Release resources
400 	 */
401 	bc->bc_magic = 0;
402 	close(bc->bc_fd);
403 	free(bc);
404 
405 	return (0);
406 }
407 
408 /*
409  * Return virtual C/H/S values for a given block. Use the algorithm
410  * outlined in the VHD specification to calculate values.
411  */
412 void
413 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
414 {
415 	off_t sectors;		/* total sectors of the block dev */
416 	off_t hcyl;		/* cylinders times heads */
417 	uint16_t secpt;		/* sectors per track */
418 	uint8_t heads;
419 
420 	assert(bc->bc_magic == BLOCKIF_SIG);
421 
422 	sectors = bc->bc_size / bc->bc_sectsz;
423 
424 	/* Clamp the size to the largest possible with CHS */
425 	if (sectors > 65535UL*16*255)
426 		sectors = 65535UL*16*255;
427 
428 	if (sectors >= 65536UL*16*63) {
429 		secpt = 255;
430 		heads = 16;
431 		hcyl = sectors / secpt;
432 	} else {
433 		secpt = 17;
434 		hcyl = sectors / secpt;
435 		heads = (hcyl + 1023) / 1024;
436 
437 		if (heads < 4)
438 			heads = 4;
439 
440 		if (hcyl >= (heads * 1024) || heads > 16) {
441 			secpt = 31;
442 			heads = 16;
443 			hcyl = sectors / secpt;
444 		}
445 		if (hcyl >= (heads * 1024)) {
446 			secpt = 63;
447 			heads = 16;
448 			hcyl = sectors / secpt;
449 		}
450 	}
451 
452 	*c = hcyl / heads;
453 	*h = heads;
454 	*s = secpt;
455 }
456 
457 /*
458  * Accessors
459  */
460 off_t
461 blockif_size(struct blockif_ctxt *bc)
462 {
463 
464 	assert(bc->bc_magic == BLOCKIF_SIG);
465 	return (bc->bc_size);
466 }
467 
468 int
469 blockif_sectsz(struct blockif_ctxt *bc)
470 {
471 
472 	assert(bc->bc_magic == BLOCKIF_SIG);
473 	return (bc->bc_sectsz);
474 }
475 
476 int
477 blockif_queuesz(struct blockif_ctxt *bc)
478 {
479 
480 	assert(bc->bc_magic == BLOCKIF_SIG);
481 	return (BLOCKIF_MAXREQ);
482 }
483 
484 int
485 blockif_is_ro(struct blockif_ctxt *bc)
486 {
487 
488 	assert(bc->bc_magic == BLOCKIF_SIG);
489 	return (bc->bc_rdonly);
490 }
491