1 /*- 2 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "mevent.h"
#include "block_if.h"

/* Magic value stored in bc_magic; asserted by every public entry point. */
#define BLOCKIF_SIG	0xb109b109

/* Number of request elements per context (queue depth exposed is one less). */
#define BLOCKIF_MAXREQ	33

/* I/O operation carried by a request element. */
enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH
};

/* Request element lifecycle: FREE -> PEND -> BUSY -> DONE -> FREE. */
enum blockstat {
	BST_FREE,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

/*
 * Per-request bookkeeping.  Wraps the caller's blockif_req so it can be
 * linked onto the context's free/pending/busy queues, and records which
 * thread is servicing it (used by blockif_cancel to signal that thread).
 */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req *be_req;
	enum blockop be_op;
	enum blockstat be_status;
	pthread_t be_tid;
};

/*
 * Per-open-backing-file context.  All queue state is protected by bc_mtx;
 * bc_cond is signalled when work is enqueued or a close is requested.
 */
struct blockif_ctxt {
	int bc_magic;		/* BLOCKIF_SIG while the context is live */
	int bc_fd;		/* backing file/device descriptor */
	int bc_ischr;		/* non-zero if backing store is a char dev */
	int bc_rdonly;		/* opened read-only (or "ro" option given) */
	off_t bc_size;		/* size of backing store in bytes */
	int bc_sectsz;		/* logical sector size */
	int bc_psectsz;		/* physical sector (stripe) size, 0 if none */
	int bc_psectoff;	/* physical sector (stripe) offset */
	pthread_t bc_btid;	/* service thread id */
	pthread_mutex_t bc_mtx;
	pthread_cond_t bc_cond;
	int bc_closing;		/* set by blockif_close to stop the thread */

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	u_int bc_req_count;	/* elements currently pending or busy */
	struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
};

/* One-time registration of the SIGCONT mevent handler. */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Node in the lock-free singly-linked list of cancellation waiters.
 * blockif_cancel pushes one of these (stack-allocated) and waits on
 * bse_cond until the SIGCONT handler clears bse_pending.
 */
struct blockif_sig_elem {
	pthread_mutex_t bse_mtx;
	pthread_cond_t bse_cond;
	int bse_pending;
	struct blockif_sig_elem *bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

/*
 * Move a free element to the pending queue, recording the caller's
 * request and operation.  Caller must hold bc_mtx and must have checked
 * that the queue is not full.  Always returns 0.
 */
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be;

	assert(bc->bc_req_count < BLOCKIF_MAXREQ);

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);

	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_status = BST_PEND;
	be->be_req = breq;
	be->be_op = op;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);

	bc->bc_req_count++;

	return (0);
}

/*
 * Take the oldest pending element, mark it busy and tag it with the
 * service thread's id.  Caller must hold bc_mtx.  Returns ENOENT when
 * nothing is queued, 0 otherwise with *bep set.
 */
static int
blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	if (bc->bc_req_count == 0)
		return (ENOENT);

	be = TAILQ_FIRST(&bc->bc_pendq);
	assert(be != NULL);
	assert(be->be_status == BST_PEND);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = bc->bc_btid;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);

	*bep = be;

	return (0);
}

/*
 * Return a completed element from the busy queue to the free queue.
 * Caller must hold bc_mtx.
 */
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	assert(be->be_status == BST_DONE);

	TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);

	bc->bc_req_count--;
}

/*
 * Perform the actual I/O for one element, then invoke the caller's
 * completion callback with 0 or an errno value.  Runs on the service
 * thread with bc_mtx NOT held.  Flush uses DIOCGFLUSH for character
 * devices and fsync(2) for plain files; writes fail with EROFS on a
 * read-only backing store.
 */
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_req *br;
	int err;

	br = be->be_req;
	err = 0;

	switch (be->be_op) {
	case BOP_READ:
		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			   br->br_offset) < 0)
			err = errno;
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly)
			err = EROFS;
		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			     br->br_offset) < 0)
			err = errno;
		break;
	case BOP_FLUSH:
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

/*
 * Per-context service thread: drains the pending queue, processing and
 * completing each request, sleeps on bc_cond when idle, and exits once
 * bc_closing has been set by blockif_close().
 */
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct
blockif_elem *be; 215 216 bc = arg; 217 218 for (;;) { 219 pthread_mutex_lock(&bc->bc_mtx); 220 while (!blockif_dequeue(bc, &be)) { 221 pthread_mutex_unlock(&bc->bc_mtx); 222 blockif_proc(bc, be); 223 pthread_mutex_lock(&bc->bc_mtx); 224 blockif_complete(bc, be); 225 } 226 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 227 pthread_mutex_unlock(&bc->bc_mtx); 228 229 /* 230 * Check ctxt status here to see if exit requested 231 */ 232 if (bc->bc_closing) 233 pthread_exit(NULL); 234 } 235 236 /* Not reached */ 237 return (NULL); 238 } 239 240 static void 241 blockif_sigcont_handler(int signal, enum ev_type type, void *arg) 242 { 243 struct blockif_sig_elem *bse; 244 245 for (;;) { 246 /* 247 * Process the entire list even if not intended for 248 * this thread. 249 */ 250 do { 251 bse = blockif_bse_head; 252 if (bse == NULL) 253 return; 254 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 255 (uintptr_t)bse, 256 (uintptr_t)bse->bse_next)); 257 258 pthread_mutex_lock(&bse->bse_mtx); 259 bse->bse_pending = 0; 260 pthread_cond_signal(&bse->bse_cond); 261 pthread_mutex_unlock(&bse->bse_mtx); 262 } 263 } 264 265 static void 266 blockif_init(void) 267 { 268 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 269 (void) signal(SIGCONT, SIG_IGN); 270 } 271 272 struct blockif_ctxt * 273 blockif_open(const char *optstr, const char *ident) 274 { 275 char tname[MAXCOMLEN + 1]; 276 char *nopt, *xopts; 277 struct blockif_ctxt *bc; 278 struct stat sbuf; 279 off_t size, psectsz, psectoff; 280 int extra, fd, i, sectsz; 281 int nocache, sync, ro; 282 283 pthread_once(&blockif_once, blockif_init); 284 285 nocache = 0; 286 sync = 0; 287 ro = 0; 288 289 /* 290 * The first element in the optstring is always a pathname. 
291 * Optional elements follow 292 */ 293 nopt = strdup(optstr); 294 for (xopts = strtok(nopt, ","); 295 xopts != NULL; 296 xopts = strtok(NULL, ",")) { 297 if (!strcmp(xopts, "nocache")) 298 nocache = 1; 299 else if (!strcmp(xopts, "sync")) 300 sync = 1; 301 else if (!strcmp(xopts, "ro")) 302 ro = 1; 303 } 304 305 extra = 0; 306 if (nocache) 307 extra |= O_DIRECT; 308 if (sync) 309 extra |= O_SYNC; 310 311 fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); 312 if (fd < 0 && !ro) { 313 /* Attempt a r/w fail with a r/o open */ 314 fd = open(nopt, O_RDONLY | extra); 315 ro = 1; 316 } 317 318 if (fd < 0) { 319 perror("Could not open backing file"); 320 return (NULL); 321 } 322 323 if (fstat(fd, &sbuf) < 0) { 324 perror("Could not stat backing file"); 325 close(fd); 326 return (NULL); 327 } 328 329 /* 330 * Deal with raw devices 331 */ 332 size = sbuf.st_size; 333 sectsz = DEV_BSIZE; 334 psectsz = psectoff = 0; 335 if (S_ISCHR(sbuf.st_mode)) { 336 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 337 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 338 perror("Could not fetch dev blk/sector size"); 339 close(fd); 340 return (NULL); 341 } 342 assert(size != 0); 343 assert(sectsz != 0); 344 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 345 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 346 } else 347 psectsz = sbuf.st_blksize; 348 349 bc = calloc(1, sizeof(struct blockif_ctxt)); 350 if (bc == NULL) { 351 close(fd); 352 return (NULL); 353 } 354 355 bc->bc_magic = BLOCKIF_SIG; 356 bc->bc_fd = fd; 357 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 358 bc->bc_rdonly = ro; 359 bc->bc_size = size; 360 bc->bc_sectsz = sectsz; 361 bc->bc_psectsz = psectsz; 362 bc->bc_psectoff = psectoff; 363 pthread_mutex_init(&bc->bc_mtx, NULL); 364 pthread_cond_init(&bc->bc_cond, NULL); 365 TAILQ_INIT(&bc->bc_freeq); 366 TAILQ_INIT(&bc->bc_pendq); 367 TAILQ_INIT(&bc->bc_busyq); 368 bc->bc_req_count = 0; 369 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 370 bc->bc_reqs[i].be_status = BST_FREE; 371 
TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 372 } 373 374 pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); 375 376 snprintf(tname, sizeof(tname), "blk-%s", ident); 377 pthread_set_name_np(bc->bc_btid, tname); 378 379 return (bc); 380 } 381 382 static int 383 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 384 enum blockop op) 385 { 386 int err; 387 388 err = 0; 389 390 pthread_mutex_lock(&bc->bc_mtx); 391 if (bc->bc_req_count < BLOCKIF_MAXREQ) { 392 /* 393 * Enqueue and inform the block i/o thread 394 * that there is work available 395 */ 396 blockif_enqueue(bc, breq, op); 397 pthread_cond_signal(&bc->bc_cond); 398 } else { 399 /* 400 * Callers are not allowed to enqueue more than 401 * the specified blockif queue limit. Return an 402 * error to indicate that the queue length has been 403 * exceeded. 404 */ 405 err = E2BIG; 406 } 407 pthread_mutex_unlock(&bc->bc_mtx); 408 409 return (err); 410 } 411 412 int 413 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 414 { 415 416 assert(bc->bc_magic == BLOCKIF_SIG); 417 return (blockif_request(bc, breq, BOP_READ)); 418 } 419 420 int 421 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 422 { 423 424 assert(bc->bc_magic == BLOCKIF_SIG); 425 return (blockif_request(bc, breq, BOP_WRITE)); 426 } 427 428 int 429 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 430 { 431 432 assert(bc->bc_magic == BLOCKIF_SIG); 433 return (blockif_request(bc, breq, BOP_FLUSH)); 434 } 435 436 int 437 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 438 { 439 struct blockif_elem *be; 440 441 assert(bc->bc_magic == BLOCKIF_SIG); 442 443 pthread_mutex_lock(&bc->bc_mtx); 444 /* 445 * Check pending requests. 446 */ 447 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 448 if (be->be_req == breq) 449 break; 450 } 451 if (be != NULL) { 452 /* 453 * Found it. 
454 */ 455 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 456 be->be_status = BST_FREE; 457 be->be_req = NULL; 458 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 459 bc->bc_req_count--; 460 pthread_mutex_unlock(&bc->bc_mtx); 461 462 return (0); 463 } 464 465 /* 466 * Check in-flight requests. 467 */ 468 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 469 if (be->be_req == breq) 470 break; 471 } 472 if (be == NULL) { 473 /* 474 * Didn't find it. 475 */ 476 pthread_mutex_unlock(&bc->bc_mtx); 477 return (EINVAL); 478 } 479 480 /* 481 * Interrupt the processing thread to force it return 482 * prematurely via it's normal callback path. 483 */ 484 while (be->be_status == BST_BUSY) { 485 struct blockif_sig_elem bse, *old_head; 486 487 pthread_mutex_init(&bse.bse_mtx, NULL); 488 pthread_cond_init(&bse.bse_cond, NULL); 489 490 bse.bse_pending = 1; 491 492 do { 493 old_head = blockif_bse_head; 494 bse.bse_next = old_head; 495 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 496 (uintptr_t)old_head, 497 (uintptr_t)&bse)); 498 499 pthread_kill(be->be_tid, SIGCONT); 500 501 pthread_mutex_lock(&bse.bse_mtx); 502 while (bse.bse_pending) 503 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 504 pthread_mutex_unlock(&bse.bse_mtx); 505 } 506 507 pthread_mutex_unlock(&bc->bc_mtx); 508 509 /* 510 * The processing thread has been interrupted. Since it's not 511 * clear if the callback has been invoked yet, return EBUSY. 512 */ 513 return (EBUSY); 514 } 515 516 int 517 blockif_close(struct blockif_ctxt *bc) 518 { 519 void *jval; 520 int err; 521 522 err = 0; 523 524 assert(bc->bc_magic == BLOCKIF_SIG); 525 526 /* 527 * Stop the block i/o thread 528 */ 529 bc->bc_closing = 1; 530 pthread_cond_signal(&bc->bc_cond); 531 pthread_join(bc->bc_btid, &jval); 532 533 /* XXX Cancel queued i/o's ??? */ 534 535 /* 536 * Release resources 537 */ 538 bc->bc_magic = 0; 539 close(bc->bc_fd); 540 free(bc); 541 542 return (0); 543 } 544 545 /* 546 * Return virtual C/H/S values for a given block. 
Use the algorithm 547 * outlined in the VHD specification to calculate values. 548 */ 549 void 550 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 551 { 552 off_t sectors; /* total sectors of the block dev */ 553 off_t hcyl; /* cylinders times heads */ 554 uint16_t secpt; /* sectors per track */ 555 uint8_t heads; 556 557 assert(bc->bc_magic == BLOCKIF_SIG); 558 559 sectors = bc->bc_size / bc->bc_sectsz; 560 561 /* Clamp the size to the largest possible with CHS */ 562 if (sectors > 65535UL*16*255) 563 sectors = 65535UL*16*255; 564 565 if (sectors >= 65536UL*16*63) { 566 secpt = 255; 567 heads = 16; 568 hcyl = sectors / secpt; 569 } else { 570 secpt = 17; 571 hcyl = sectors / secpt; 572 heads = (hcyl + 1023) / 1024; 573 574 if (heads < 4) 575 heads = 4; 576 577 if (hcyl >= (heads * 1024) || heads > 16) { 578 secpt = 31; 579 heads = 16; 580 hcyl = sectors / secpt; 581 } 582 if (hcyl >= (heads * 1024)) { 583 secpt = 63; 584 heads = 16; 585 hcyl = sectors / secpt; 586 } 587 } 588 589 *c = hcyl / heads; 590 *h = heads; 591 *s = secpt; 592 } 593 594 /* 595 * Accessors 596 */ 597 off_t 598 blockif_size(struct blockif_ctxt *bc) 599 { 600 601 assert(bc->bc_magic == BLOCKIF_SIG); 602 return (bc->bc_size); 603 } 604 605 int 606 blockif_sectsz(struct blockif_ctxt *bc) 607 { 608 609 assert(bc->bc_magic == BLOCKIF_SIG); 610 return (bc->bc_sectsz); 611 } 612 613 void 614 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 615 { 616 617 assert(bc->bc_magic == BLOCKIF_SIG); 618 *size = bc->bc_psectsz; 619 *off = bc->bc_psectoff; 620 } 621 622 int 623 blockif_queuesz(struct blockif_ctxt *bc) 624 { 625 626 assert(bc->bc_magic == BLOCKIF_SIG); 627 return (BLOCKIF_MAXREQ - 1); 628 } 629 630 int 631 blockif_is_ro(struct blockif_ctxt *bc) 632 { 633 634 assert(bc->bc_magic == BLOCKIF_SIG); 635 return (bc->bc_rdonly); 636 } 637