/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	int			bc_work_count;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_paused_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
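
/*
 * Request elements cycle through the context's queues: an element is
 * taken from bc_freeq when a request is enqueued, sits on bc_pendq as
 * BST_PEND (or BST_BLOCK while an earlier request for the same offset
 * is outstanding), moves to bc_busyq as BST_BUSY while a worker thread
 * processes it, and returns to bc_freeq on completion.
 */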

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}
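
/*
 * Carry out one request on a worker thread.  For GEOM-backed devices a
 * MAXPHYS-sized bounce buffer ("buf") is supplied so that multi-iovec
 * requests can be issued to the device as single contiguous transfers.
 */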
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
	off_t arg[2];
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else
			err = EOPNOTSUPP;
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}
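
/*
 * Worker thread body: dequeue and process pending requests until the
 * context is closed.  Idle workers sleep on bc_cond; paused workers
 * park on bc_paused_cond until blockif_resume() broadcasts.
 */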
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		bc->bc_work_count++;

		/* We cannot process work if the interface is paused */
		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		bc->bc_work_count--;

		/* If none of the workers are busy, notify the main thread */
		if (bc->bc_work_count == 0)
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		/* Make all worker threads wait here if the device is paused */
		while (bc->bc_paused)
			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}
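
/*
 * Translate a legacy option string into config nodes.  Everything up
 * to the first comma is the backing path; the rest is handed to
 * pci_parse_legacy_config() as comma-separated options.  For example
 * (illustrative values), "/dev/md0,nocache,ro" sets path, nocache and
 * ro on the device's config node.
 */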
int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
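
	/*
	 * The "sectorsize" option takes the form "logical" or
	 * "logical/physical", e.g. "512" or "512/4096"
	 * (illustrative values).
	 */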
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The r/w open failed; retry as read-only */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else
		psectsz = sbuf.st_blksize;

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	bc->bc_work_count = 0;
	pthread_cond_init(&bc->bc_paused_cond, NULL);
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}
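
/*
 * mevent callback, fired when the backing file's attributes change.
 * If the size differs from the cached value, notify the device model
 * through the registered resize callback.
 */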
static void
blockif_resized(int fd, enum ev_type type, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;

	if (fstat(fd, &sb) != 0)
		return;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (sb.st_size != bc->bc_size) {
		bc->bc_size = sb.st_size;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;
	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
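/*
 * Worked example (illustrative): an 8 GiB image with 512-byte sectors
 * has 16,777,216 sectors, which the algorithm below resolves to
 * C/H/S = 16644/16/63.
 */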
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}
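
/*
 * Snapshot support.  The device model must call blockif_pause() (which
 * drains all in-flight work and flushes the backing store) before
 * snapshotting, and blockif_resume() afterwards; blockif_snapshot()
 * refuses to run on an unpaused interface.
 */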
#ifdef BHYVE_SNAPSHOT
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (bc->bc_work_count)
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
			__func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	/* resume the threads waiting for paused */
	pthread_cond_broadcast(&bc->bc_paused_cond);
	/* kick the threads after restore */
	pthread_cond_broadcast(&bc->bc_cond);
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
	int i;
	struct iovec *iov;
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);

	/*
	 * XXX: The callback and parameter must be filled by the virtualized
	 * device that uses the interface, during its init; we're not touching
	 * them here.
	 */

	/* Snapshot the iovecs. */
	for (i = 0; i < br->br_iovcnt; i++) {
		iov = &br->br_iov[i];

		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);

		/* We assume the iov is a guest-mapped address. */
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
			false, meta, ret, done);
	}

done:
	return (ret);
}

int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int ret;

	if (bc->bc_paused == 0) {
		fprintf(stderr, "%s: Snapshot failed: "
		    "interface not paused.\r\n", __func__);
		return (ENXIO);
	}

	pthread_mutex_lock(&bc->bc_mtx);

	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);

done:
	pthread_mutex_unlock(&bc->bc_mtx);
	return (ret);
}
#endif