/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		 be_op;
	enum blockstat		 be_status;
	pthread_t		 be_tid;
	off_t			 be_block;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	int			bc_work_count;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_paused_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
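/*
 * Overview of the request lifecycle implemented below (summary added for
 * clarity; derived from the queue/state code in this file):
 *
 *	BST_FREE  -- blockif_enqueue()  --> BST_PEND or BST_BLOCK  (bc_pendq)
 *	BST_PEND  -- blockif_dequeue()  --> BST_BUSY               (bc_busyq)
 *	BST_BUSY  -- blockif_proc()     --> BST_DONE
 *	BST_DONE  -- blockif_complete() --> BST_FREE               (bc_freeq)
 *
 * BST_BLOCK marks a request whose starting offset equals the ending
 * offset of an earlier pending or in-flight request; it is held back
 * until that request completes so that such back-to-back I/O stays
 * ordered.
 */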
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}
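/*
 * Worked example of the ordering logic above (illustrative values): let
 * request A write 4096 bytes at offset 0, so blockif_enqueue() computes
 * be_block == 4096 for it.  If request B then arrives with br_offset ==
 * 4096, the pendq/busyq scan finds A and B is queued as BST_BLOCK, so
 * blockif_dequeue() skips it.  When A finishes, blockif_complete() sees
 * that B's br_offset matches A's be_block and flips B back to BST_PEND,
 * making it eligible for the next idle worker.
 */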
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
	off_t arg[2];
	ssize_t clen, len, off, boff, voff;
	int i, err;
	struct spacectl_range range;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}
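/*
 * Note on the bounce buffer used above: when the backing store is a GEOM
 * provider (bc_isgeom), each worker passes a MAXPHYS-sized buffer so that
 * reads and writes are issued as single contiguous transfers and then
 * copied to or from the guest's iovecs.  For example (hypothetical
 * numbers), a read of br_resid == 8192 into two 4096-byte iovecs becomes
 * one pread() into buf, after which the inner do/while loop copies 4096
 * bytes into iov[0] (advancing i and resetting voff once it fills) and
 * the remaining 4096 into iov[1].  The likely motivation is that raw
 * GEOM devices require transfers to be multiples of the sector size,
 * which arbitrary guest iovec lengths do not guarantee.
 */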
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		bc->bc_work_count++;

		/* We cannot process work if the interface is paused */
		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		bc->bc_work_count--;

		/* If none of the workers are busy, notify the main thread */
		if (bc->bc_work_count == 0)
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		/* Make all worker threads wait here if the device is paused */
		while (bc->bc_paused)
			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
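/*
 * Example of the legacy-option translation above (hypothetical option
 * string): given opts == "/dev/md0,nocache,ro", this stores
 * "path" = "/dev/md0" in the device's config node and passes
 * "nocache,ro" to pci_parse_legacy_config(), which is expected to set
 * the corresponding boolean nodes that blockif_open() consumes.
 */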
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Attempt a r/w fail with a r/o open */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	bc->bc_work_count = 0;
	pthread_cond_init(&bc->bc_paused_cond, NULL);
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}
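/*
 * Minimal usage sketch for blockif_open() (hypothetical caller, using
 * the config.h helpers already used in this file):
 *
 *	set_config_value_node(nvl, "path", "/tmp/disk.img");
 *	set_config_value_node(nvl, "sectorsize", "512/4096");
 *	bc = blockif_open(nvl, "4:0");
 *	if (bc == NULL)
 *		... the failure was already reported on stderr ...
 *
 * Here "512/4096" requests 512-byte logical sectors on 4096-byte
 * physical sectors; a bare "sectorsize=4096" sets both to 4096.
 */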
static void
blockif_resized(int fd, enum ev_type type, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	/* Initialize err so the success path below returns 0. */
	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}
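/*
 * Sketch of resize-callback registration (hypothetical device model; the
 * exact blockif_resize_cb prototype is declared in block_if.h, and
 * blockif_resized() above invokes it with bc_mtx held, so the callback
 * must not call back into blockif):
 *
 *	static void
 *	my_resize(struct blockif_ctxt *bc, void *arg, off_t new_size)
 *	{
 *		struct my_softc *sc = arg;
 *		... update the emulated capacity and notify the guest ...
 *	}
 *
 *	error = blockif_register_resize_callback(bc, my_resize, sc);
 *
 * A zero return means future size changes of the backing store will be
 * reported; EBUSY means a callback was already registered.
 */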
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted. Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}
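/*
 * Usage sketch for the request API above (hypothetical caller; struct
 * blockif_req is declared in block_if.h):
 *
 *	static void
 *	my_done(struct blockif_req *br, int err)
 *	{
 *		... complete the guest command; err == 0 on success ...
 *	}
 *
 *	br->br_iov[0].iov_base = databuf;
 *	br->br_iov[0].iov_len = 4096;
 *	br->br_iovcnt = 1;
 *	br->br_offset = lba * blockif_sectsz(bc);
 *	br->br_resid = 4096;
 *	br->br_callback = my_done;
 *	error = blockif_read(bc, br);
 *
 * E2BIG indicates the queue is full and the caller must retry later;
 * blockif_queuesz() reports how many requests may be outstanding.
 * blockif_cancel() returns 0 only if the request had not yet started
 * (its callback will never run); EBUSY means it was in flight and the
 * callback may still be invoked.
 */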
/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (bc->bc_work_count)
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
		    __func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	/* resume the threads waiting for paused */
	pthread_cond_broadcast(&bc->bc_paused_cond);
	/* kick the threads after restore */
	pthread_cond_broadcast(&bc->bc_cond);
	pthread_mutex_unlock(&bc->bc_mtx);
}
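/*
 * Expected snapshot sequence (illustrative; the orchestration lives in
 * the snapshot code, not in this file):
 *
 *	blockif_pause(bc);		quiesce workers, flush the backend
 *	blockif_snapshot(bc, meta);	fails with ENXIO unless paused
 *	... blockif_snapshot_req() for each outstanding request ...
 *	blockif_resume(bc);		wake the workers back up
 */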
int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
	int i;
	struct iovec *iov;
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);

	/*
	 * XXX: The callback and parameter must be filled by the virtualized
	 * device that uses the interface, during its init; we're not touching
	 * them here.
	 */

	/* Snapshot the iovecs. */
	for (i = 0; i < br->br_iovcnt; i++) {
		iov = &br->br_iov[i];

		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);

		/* We assume the iov is a guest-mapped address. */
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
		    false, meta, ret, done);
	}

done:
	return (ret);
}

int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int ret;

	if (bc->bc_paused == 0) {
		fprintf(stderr, "%s: Snapshot failed: "
		    "interface not paused.\r\n", __func__);
		return (ENXIO);
	}

	pthread_mutex_lock(&bc->bc_mtx);

	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);

done:
	pthread_mutex_unlock(&bc->bc_mtx);
	return (ret);
}
#endif	/* BHYVE_SNAPSHOT */