/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG     0xb109b109

#define BLOCKIF_NUMTHR  8
#define BLOCKIF_MAXREQ  (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
        BOP_READ,
        BOP_WRITE,
        BOP_FLUSH,
        BOP_DELETE
};

enum blockstat {
        BST_FREE,
        BST_BLOCK,
        BST_PEND,
        BST_BUSY,
        BST_DONE
};

struct blockif_elem {
        TAILQ_ENTRY(blockif_elem) be_link;
        struct blockif_req      *be_req;
        enum blockop            be_op;
        enum blockstat          be_status;
        pthread_t               be_tid;
        off_t                   be_block;
};

struct blockif_ctxt {
        int                     bc_magic;
        int                     bc_fd;
        int                     bc_ischr;
        int                     bc_isgeom;
        int                     bc_candelete;
        int                     bc_rdonly;
        off_t                   bc_size;
        int                     bc_sectsz;
        int                     bc_psectsz;
        int                     bc_psectoff;
        int                     bc_closing;
        int                     bc_paused;
        int                     bc_work_count;
        pthread_t               bc_btid[BLOCKIF_NUMTHR];
        pthread_mutex_t         bc_mtx;
        pthread_cond_t          bc_cond;
        pthread_cond_t          bc_paused_cond;
        pthread_cond_t          bc_work_done_cond;

        /* Request elements and free/pending/busy queues */
        TAILQ_HEAD(, blockif_elem) bc_freeq;
        TAILQ_HEAD(, blockif_elem) bc_pendq;
        TAILQ_HEAD(, blockif_elem) bc_busyq;
        struct blockif_elem     bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
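
/*
 * Wait record used when cancelling an in-flight request: blockif_cancel()
 * pushes one of these onto a lock-free singly-linked list headed by
 * blockif_bse_head before interrupting the worker thread with SIGCONT,
 * and blockif_sigcont_handler() pops every record and wakes its owner.
 */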
struct blockif_sig_elem {
        pthread_mutex_t         bse_mtx;
        pthread_cond_t          bse_cond;
        int                     bse_pending;
        struct blockif_sig_elem *bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
        struct blockif_elem *be, *tbe;
        off_t off;
        int i;

        be = TAILQ_FIRST(&bc->bc_freeq);
        assert(be != NULL);
        assert(be->be_status == BST_FREE);
        TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
        be->be_req = breq;
        be->be_op = op;
        switch (op) {
        case BOP_READ:
        case BOP_WRITE:
        case BOP_DELETE:
                off = breq->br_offset;
                for (i = 0; i < breq->br_iovcnt; i++)
                        off += breq->br_iov[i].iov_len;
                break;
        default:
                off = OFF_MAX;
        }
        be->be_block = off;
        TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
                if (tbe->be_block == breq->br_offset)
                        break;
        }
        if (tbe == NULL) {
                TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
                        if (tbe->be_block == breq->br_offset)
                                break;
                }
        }
        if (tbe == NULL)
                be->be_status = BST_PEND;
        else
                be->be_status = BST_BLOCK;
        TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
        return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
        struct blockif_elem *be;

        TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
                if (be->be_status == BST_PEND)
                        break;
                assert(be->be_status == BST_BLOCK);
        }
        if (be == NULL)
                return (0);
        TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
        be->be_status = BST_BUSY;
        be->be_tid = t;
        TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
        *bep = be;
        return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
        struct blockif_elem *tbe;

        if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
                TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
        else
                TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
        TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
                if (tbe->be_req->br_offset == be->be_block)
                        tbe->be_status = BST_PEND;
        }
        be->be_tid = 0;
        be->be_status = BST_FREE;
        be->be_req = NULL;
        TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
        if (bc->bc_ischr) {
                if (ioctl(bc->bc_fd, DIOCGFLUSH))
                        return (errno);
        } else if (fsync(bc->bc_fd))
                return (errno);

        return (0);
}
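
/*
 * Process one request on the calling worker thread.  "buf" is the
 * worker's MAXPHYS-sized bounce buffer, non-NULL only for GEOM backends;
 * when it is available and the request carries more than one iovec,
 * reads and writes are staged through it in MAXPHYS-sized chunks instead
 * of being issued directly with preadv()/pwritev().
 */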
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
        struct blockif_req *br;
        off_t arg[2];
        ssize_t clen, len, off, boff, voff;
        int i, err;

        br = be->be_req;
        if (br->br_iovcnt <= 1)
                buf = NULL;
        err = 0;
        switch (be->be_op) {
        case BOP_READ:
                if (buf == NULL) {
                        if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
                            br->br_offset)) < 0)
                                err = errno;
                        else
                                br->br_resid -= len;
                        break;
                }
                i = 0;
                off = voff = 0;
                while (br->br_resid > 0) {
                        len = MIN(br->br_resid, MAXPHYS);
                        if (pread(bc->bc_fd, buf, len, br->br_offset +
                            off) < 0) {
                                err = errno;
                                break;
                        }
                        boff = 0;
                        do {
                                clen = MIN(len - boff, br->br_iov[i].iov_len -
                                    voff);
                                memcpy(br->br_iov[i].iov_base + voff,
                                    buf + boff, clen);
                                if (clen < br->br_iov[i].iov_len - voff)
                                        voff += clen;
                                else {
                                        i++;
                                        voff = 0;
                                }
                                boff += clen;
                        } while (boff < len);
                        off += len;
                        br->br_resid -= len;
                }
                break;
        case BOP_WRITE:
                if (bc->bc_rdonly) {
                        err = EROFS;
                        break;
                }
                if (buf == NULL) {
                        if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
                            br->br_offset)) < 0)
                                err = errno;
                        else
                                br->br_resid -= len;
                        break;
                }
                i = 0;
                off = voff = 0;
                while (br->br_resid > 0) {
                        len = MIN(br->br_resid, MAXPHYS);
                        boff = 0;
                        do {
                                clen = MIN(len - boff, br->br_iov[i].iov_len -
                                    voff);
                                memcpy(buf + boff,
                                    br->br_iov[i].iov_base + voff, clen);
                                if (clen < br->br_iov[i].iov_len - voff)
                                        voff += clen;
                                else {
                                        i++;
                                        voff = 0;
                                }
                                boff += clen;
                        } while (boff < len);
                        if (pwrite(bc->bc_fd, buf, len, br->br_offset +
                            off) < 0) {
                                err = errno;
                                break;
                        }
                        off += len;
                        br->br_resid -= len;
                }
                break;
        case BOP_FLUSH:
                err = blockif_flush_bc(bc);
                break;
        case BOP_DELETE:
                if (!bc->bc_candelete)
                        err = EOPNOTSUPP;
                else if (bc->bc_rdonly)
                        err = EROFS;
                else if (bc->bc_ischr) {
                        arg[0] = br->br_offset;
                        arg[1] = br->br_resid;
                        if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
                                err = errno;
                        else
                                br->br_resid = 0;
                } else
                        err = EOPNOTSUPP;
                break;
        default:
                err = EINVAL;
                break;
        }

        be->be_status = BST_DONE;

        (*br->br_callback)(br, err);
}

static void *
blockif_thr(void *arg)
{
        struct blockif_ctxt *bc;
        struct blockif_elem *be;
        pthread_t t;
        uint8_t *buf;

        bc = arg;
        if (bc->bc_isgeom)
                buf = malloc(MAXPHYS);
        else
                buf = NULL;
        t = pthread_self();

        pthread_mutex_lock(&bc->bc_mtx);
        for (;;) {
                bc->bc_work_count++;

                /* We cannot process work if the interface is paused */
                while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
                        pthread_mutex_unlock(&bc->bc_mtx);
                        blockif_proc(bc, be, buf);
                        pthread_mutex_lock(&bc->bc_mtx);
                        blockif_complete(bc, be);
                }

                bc->bc_work_count--;

                /* If none of the workers are busy, notify the main thread */
                if (bc->bc_work_count == 0)
                        pthread_cond_broadcast(&bc->bc_work_done_cond);

                /* Check ctxt status here to see if exit requested */
                if (bc->bc_closing)
                        break;

                /* Make all worker threads wait here if the device is paused */
                while (bc->bc_paused)
                        pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);

                pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
        }
        pthread_mutex_unlock(&bc->bc_mtx);

        if (buf)
                free(buf);
        pthread_exit(NULL);
        return (NULL);
}
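
/*
 * SIGCONT handler, dispatched from the mevent loop.  Drains the list of
 * wait records pushed by blockif_cancel() and signals each waiter; see
 * the comment above struct blockif_sig_elem.
 */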
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
        struct blockif_sig_elem *bse;

        for (;;) {
                /*
                 * Process the entire list even if not intended for
                 * this thread.
                 */
                do {
                        bse = blockif_bse_head;
                        if (bse == NULL)
                                return;
                } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
                    (uintptr_t)bse,
                    (uintptr_t)bse->bse_next));

                pthread_mutex_lock(&bse->bse_mtx);
                bse->bse_pending = 0;
                pthread_cond_signal(&bse->bse_cond);
                pthread_mutex_unlock(&bse->bse_mtx);
        }
}

static void
blockif_init(void)
{
        mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
        (void) signal(SIGCONT, SIG_IGN);
}
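
/*
 * Convert a legacy option string into config nodes: everything up to the
 * first comma becomes the "path" value and the remainder is handed to
 * pci_parse_legacy_config().  As an illustration, an opts string of
 * "/dev/md0,nocache,sectorsize=512" sets path=/dev/md0 along with the
 * nocache and sectorsize nodes.
 */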
int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
        char *cp, *path;

        if (opts == NULL)
                return (0);

        cp = strchr(opts, ',');
        if (cp == NULL) {
                set_config_value_node(nvl, "path", opts);
                return (0);
        }
        path = strndup(opts, cp - opts);
        set_config_value_node(nvl, "path", path);
        free(path);
        return (pci_parse_legacy_config(nvl, cp + 1));
}

struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
        char tname[MAXCOMLEN + 1];
        char name[MAXPATHLEN];
        const char *path, *pssval, *ssval;
        char *cp;
        struct blockif_ctxt *bc;
        struct stat sbuf;
        struct diocgattr_arg arg;
        off_t size, psectsz, psectoff;
        int extra, fd, i, sectsz;
        int ro, candelete, geom, ssopt, pssopt;
        int nodelete;

#ifndef WITHOUT_CAPSICUM
        cap_rights_t rights;
        cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

        pthread_once(&blockif_once, blockif_init);

        fd = -1;
        extra = 0;
        ssopt = 0;
        ro = 0;
        nodelete = 0;

        if (get_config_bool_node_default(nvl, "nocache", false))
                extra |= O_DIRECT;
        if (get_config_bool_node_default(nvl, "nodelete", false))
                nodelete = 1;
        if (get_config_bool_node_default(nvl, "sync", false) ||
            get_config_bool_node_default(nvl, "direct", false))
                extra |= O_SYNC;
        if (get_config_bool_node_default(nvl, "ro", false))
                ro = 1;
        ssval = get_config_value_node(nvl, "sectorsize");
        if (ssval != NULL) {
                ssopt = strtol(ssval, &cp, 10);
                if (cp == ssval) {
                        EPRINTLN("Invalid sector size \"%s\"", ssval);
                        goto err;
                }
                if (*cp == '\0') {
                        pssopt = ssopt;
                } else if (*cp == '/') {
                        pssval = cp + 1;
                        pssopt = strtol(pssval, &cp, 10);
                        if (cp == pssval || *cp != '\0') {
                                EPRINTLN("Invalid sector size \"%s\"", ssval);
                                goto err;
                        }
                } else {
                        EPRINTLN("Invalid sector size \"%s\"", ssval);
                        goto err;
                }
        }

        path = get_config_value_node(nvl, "path");
        if (path == NULL) {
                EPRINTLN("Missing \"path\" for block device.");
                goto err;
        }

        fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
        if (fd < 0 && !ro) {
                /* The r/w open failed; retry with a r/o open. */
                fd = open(path, O_RDONLY | extra);
                ro = 1;
        }

        if (fd < 0) {
                warn("Could not open backing file: %s", path);
                goto err;
        }

        if (fstat(fd, &sbuf) < 0) {
                warn("Could not stat backing file %s", path);
                goto err;
        }

#ifndef WITHOUT_CAPSICUM
        cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
            CAP_WRITE);
        if (ro)
                cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

        if (caph_rights_limit(fd, &rights) == -1)
                errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

        /*
         * Deal with raw devices
         */
        size = sbuf.st_size;
        sectsz = DEV_BSIZE;
        psectsz = psectoff = 0;
        candelete = geom = 0;
        if (S_ISCHR(sbuf.st_mode)) {
                if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
                    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
                        perror("Could not fetch dev blk/sector size");
                        goto err;
                }
                assert(size != 0);
                assert(sectsz != 0);
                if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
                        ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
                strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
                arg.len = sizeof(arg.value.i);
                if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
                        candelete = arg.value.i;
                if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
                        geom = 1;
        } else
                psectsz = sbuf.st_blksize;

#ifndef WITHOUT_CAPSICUM
        if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
                errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif
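
        /*
         * Apply a sectorsize=logical[/physical] override.  Both values
         * must be powers of two, the logical size must be at least 512
         * bytes, and it may not exceed the physical size.
         */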
        if (ssopt != 0) {
                if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
                    ssopt > pssopt) {
                        EPRINTLN("Invalid sector size %d/%d",
                            ssopt, pssopt);
                        goto err;
                }

                /*
                 * Some backend drivers (e.g. cd0, ada0) require that the I/O
                 * size be a multiple of the device's sector size.
                 *
                 * Validate that the emulated sector size complies with this
                 * requirement.
                 */
                if (S_ISCHR(sbuf.st_mode)) {
                        if (ssopt < sectsz || (ssopt % sectsz) != 0) {
                                EPRINTLN("Sector size %d incompatible "
                                    "with underlying device sector size %d",
                                    ssopt, sectsz);
                                goto err;
                        }
                }

                sectsz = ssopt;
                psectsz = pssopt;
                psectoff = 0;
        }

        bc = calloc(1, sizeof(struct blockif_ctxt));
        if (bc == NULL) {
                perror("calloc");
                goto err;
        }

        bc->bc_magic = BLOCKIF_SIG;
        bc->bc_fd = fd;
        bc->bc_ischr = S_ISCHR(sbuf.st_mode);
        bc->bc_isgeom = geom;
        bc->bc_candelete = candelete;
        bc->bc_rdonly = ro;
        bc->bc_size = size;
        bc->bc_sectsz = sectsz;
        bc->bc_psectsz = psectsz;
        bc->bc_psectoff = psectoff;
        pthread_mutex_init(&bc->bc_mtx, NULL);
        pthread_cond_init(&bc->bc_cond, NULL);
        bc->bc_paused = 0;
        bc->bc_work_count = 0;
        pthread_cond_init(&bc->bc_paused_cond, NULL);
        pthread_cond_init(&bc->bc_work_done_cond, NULL);
        TAILQ_INIT(&bc->bc_freeq);
        TAILQ_INIT(&bc->bc_pendq);
        TAILQ_INIT(&bc->bc_busyq);
        for (i = 0; i < BLOCKIF_MAXREQ; i++) {
                bc->bc_reqs[i].be_status = BST_FREE;
                TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
        }

        for (i = 0; i < BLOCKIF_NUMTHR; i++) {
                pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
                snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
                pthread_set_name_np(bc->bc_btid[i], tname);
        }

        return (bc);
err:
        if (fd >= 0)
                close(fd);
        return (NULL);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
        int err;

        err = 0;

        pthread_mutex_lock(&bc->bc_mtx);
        if (!TAILQ_EMPTY(&bc->bc_freeq)) {
                /*
                 * Enqueue and inform the block i/o thread
                 * that there is work available
                 */
                if (blockif_enqueue(bc, breq, op))
                        pthread_cond_signal(&bc->bc_cond);
        } else {
                /*
                 * Callers are not allowed to enqueue more than
                 * the specified blockif queue limit. Return an
                 * error to indicate that the queue length has been
                 * exceeded.
                 */
                err = E2BIG;
        }
        pthread_mutex_unlock(&bc->bc_mtx);

        return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_DELETE));
}
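
/*
 * Cancel a queued or in-flight request.  Returns 0 if the request was
 * still pending and has been completed here, EINVAL if it is unknown to
 * this blockif, and EBUSY if a worker had already picked it up and was
 * interrupted instead.
 */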
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
        struct blockif_elem *be;

        assert(bc->bc_magic == BLOCKIF_SIG);

        pthread_mutex_lock(&bc->bc_mtx);
        /* XXX: not waiting while paused */

        /*
         * Check pending requests.
         */
        TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
                if (be->be_req == breq)
                        break;
        }
        if (be != NULL) {
                /*
                 * Found it.
                 */
                blockif_complete(bc, be);
                pthread_mutex_unlock(&bc->bc_mtx);

                return (0);
        }

        /*
         * Check in-flight requests.
         */
        TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
                if (be->be_req == breq)
                        break;
        }
        if (be == NULL) {
                /*
                 * Didn't find it.
                 */
                pthread_mutex_unlock(&bc->bc_mtx);
                return (EINVAL);
        }

        /*
         * Interrupt the processing thread to force it to return
         * prematurely via its normal callback path.
         */
        while (be->be_status == BST_BUSY) {
                struct blockif_sig_elem bse, *old_head;

                pthread_mutex_init(&bse.bse_mtx, NULL);
                pthread_cond_init(&bse.bse_cond, NULL);

                bse.bse_pending = 1;

                do {
                        old_head = blockif_bse_head;
                        bse.bse_next = old_head;
                } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
                    (uintptr_t)old_head,
                    (uintptr_t)&bse));

                pthread_kill(be->be_tid, SIGCONT);

                pthread_mutex_lock(&bse.bse_mtx);
                while (bse.bse_pending)
                        pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
                pthread_mutex_unlock(&bse.bse_mtx);
        }

        pthread_mutex_unlock(&bc->bc_mtx);

        /*
         * The processing thread has been interrupted.  Since it's not
         * clear if the callback has been invoked yet, return EBUSY.
         */
        return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
        void *jval;
        int i;

        assert(bc->bc_magic == BLOCKIF_SIG);

        /*
         * Stop the block i/o thread
         */
        pthread_mutex_lock(&bc->bc_mtx);
        bc->bc_closing = 1;
        pthread_mutex_unlock(&bc->bc_mtx);
        pthread_cond_broadcast(&bc->bc_cond);
        for (i = 0; i < BLOCKIF_NUMTHR; i++)
                pthread_join(bc->bc_btid[i], &jval);

        /* XXX Cancel queued i/o's ??? */

        /*
         * Release resources
         */
        bc->bc_magic = 0;
        close(bc->bc_fd);
        free(bc);

        return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
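/*
 * For example, an 8 GiB image with 512-byte sectors yields 16777216
 * sectors; the heuristic below settles on 63 sectors per track and 16
 * heads, for 16777216 / 63 = 266305 head-cylinders and a final C/H/S of
 * 16644/16/63.
 */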
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
        off_t sectors;          /* total sectors of the block dev */
        off_t hcyl;             /* cylinders times heads */
        uint16_t secpt;         /* sectors per track */
        uint8_t heads;

        assert(bc->bc_magic == BLOCKIF_SIG);

        sectors = bc->bc_size / bc->bc_sectsz;

        /* Clamp the size to the largest possible with CHS */
        if (sectors > 65535UL*16*255)
                sectors = 65535UL*16*255;

        if (sectors >= 65536UL*16*63) {
                secpt = 255;
                heads = 16;
                hcyl = sectors / secpt;
        } else {
                secpt = 17;
                hcyl = sectors / secpt;
                heads = (hcyl + 1023) / 1024;

                if (heads < 4)
                        heads = 4;

                if (hcyl >= (heads * 1024) || heads > 16) {
                        secpt = 31;
                        heads = 16;
                        hcyl = sectors / secpt;
                }
                if (hcyl >= (heads * 1024)) {
                        secpt = 63;
                        heads = 16;
                        hcyl = sectors / secpt;
                }
        }

        *c = hcyl / heads;
        *h = heads;
        *s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        *size = bc->bc_psectsz;
        *off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
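/*
 * Snapshot protocol: the caller is expected to blockif_pause() the
 * interface, which waits for the workers to drain and flushes the
 * backing store, before calling blockif_snapshot(), and to
 * blockif_resume() it afterwards.
 */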
void
blockif_pause(struct blockif_ctxt *bc)
{
        assert(bc != NULL);
        assert(bc->bc_magic == BLOCKIF_SIG);

        pthread_mutex_lock(&bc->bc_mtx);
        bc->bc_paused = 1;

        /* The interface is paused. Wait for workers to finish their work */
        while (bc->bc_work_count)
                pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
        pthread_mutex_unlock(&bc->bc_mtx);

        if (blockif_flush_bc(bc))
                fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
                    __func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
        assert(bc != NULL);
        assert(bc->bc_magic == BLOCKIF_SIG);

        pthread_mutex_lock(&bc->bc_mtx);
        bc->bc_paused = 0;
        /* resume the threads waiting for paused */
        pthread_cond_broadcast(&bc->bc_paused_cond);
        /* kick the threads after restore */
        pthread_cond_broadcast(&bc->bc_cond);
        pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
        int i;
        struct iovec *iov;
        int ret;

        SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);

        /*
         * XXX: The callback and parameter must be filled by the virtualized
         * device that uses the interface, during its init; we're not touching
         * them here.
         */

        /* Snapshot the iovecs. */
        for (i = 0; i < br->br_iovcnt; i++) {
                iov = &br->br_iov[i];

                SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);

                /* We assume the iov is a guest-mapped address. */
                SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
                    false, meta, ret, done);
        }

done:
        return (ret);
}

int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
        int ret;

        if (bc->bc_paused == 0) {
                fprintf(stderr, "%s: Snapshot failed: "
                    "interface not paused.\r\n", __func__);
                return (ENXIO);
        }

        pthread_mutex_lock(&bc->bc_mtx);

        SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
        SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);

done:
        pthread_mutex_unlock(&bc->bc_mtx);
        return (ret);
}
#endif