/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	int			bc_work_count;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_paused_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
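
/*
 * Request scheduling note (derived from the queue logic below): each
 * request borrows a blockif_elem, which moves from bc_freeq to bc_pendq
 * and then to bc_busyq while a worker thread services it.  A request
 * whose starting offset equals the ending offset (be_block) of a pending
 * or in-flight request is queued as BST_BLOCK and only becomes runnable
 * (BST_PEND) once that earlier request completes, so a large operation
 * split into consecutive requests finishes in issue order.
 */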
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}
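
/*
 * A note on the bounce buffer handed to blockif_proc(): worker threads
 * for GEOM-backed (character) devices allocate a MAXPHYS-sized buffer
 * (see blockif_thr()) and stage multi-segment requests through it, so
 * each pread()/pwrite() is one large transfer instead of an arbitrary
 * guest iovec list; see the sector-size comment in blockif_open() for
 * why raw devices are strict about transfer sizes.  Single-segment
 * requests and plain files use preadv()/pwritev() directly.
 */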
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
	off_t arg[2];
	ssize_t clen, len, off, boff, voff;
	int i, err;
	struct spacectl_range range;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		bc->bc_work_count++;

		/* We cannot process work if the interface is paused */
		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		bc->bc_work_count--;

		/* If none of the workers are busy, notify the main thread */
		if (bc->bc_work_count == 0)
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		/* Make all worker threads wait here if the device is paused */
		while (bc->bc_paused)
			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}
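
/*
 * Cancellation handshake: blockif_cancel() publishes a blockif_sig_elem
 * on the lock-free blockif_bse_head stack and sends SIGCONT to the
 * worker that owns the in-flight request.  The mevent loop receives the
 * signal (registered in blockif_init()) and runs the handler below,
 * which pops every queued element and wakes its waiter; processing the
 * whole list means a wakeup is never lost, even when an element was
 * armed for a different cancellation.
 */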
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
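
/*
 * Illustrative legacy option string (hypothetical values): given
 *
 *	/somepath/disk.img,nocache,sectorsize=512/4096
 *
 * the text before the first comma becomes the "path" config node and
 * the remainder is handed to pci_parse_legacy_config(), which stores
 * the "nocache" and "sectorsize" nodes consumed by blockif_open().
 */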
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Retry a failed r/w open as a r/o open */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif
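
	/*
	 * If a "sectorsize=logical[/physical]" option was supplied, it
	 * overrides the sizes probed above; e.g. "sectorsize=512/4096"
	 * (an illustrative value) emulates an advanced-format disk with
	 * 512-byte logical sectors on 4096-byte physical sectors.
	 */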
	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	bc->bc_work_count = 0;
	pthread_cond_init(&bc->bc_paused_cond, NULL);
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}

static void
blockif_resized(int fd, enum ev_type type, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;

	if (fstat(fd, &sb) != 0)
		return;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (sb.st_size != bc->bc_size) {
		bc->bc_size = sb.st_size;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;	/* initialize so the success path returns 0 */

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}
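
/*
 * Sketch of a resize registration (illustrative; "my_resize_cb" and
 * "sc" are hypothetical and not part of this file).  The callback must
 * match the blockif_resize_cb typedef in block_if.h and is invoked with
 * bc_mtx held, so it should not block:
 *
 *	static void
 *	my_resize_cb(struct blockif_ctxt *bc, void *arg, size_t new_size)
 *	{
 *		struct my_softc *sc = arg;
 *		// propagate the new capacity to the guest-visible config
 *	}
 *	...
 *	error = blockif_register_resize_callback(bc, my_resize_cb, sc);
 */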
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted. Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}
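
/*
 * Typical consumer flow (illustrative only; the request setup is the
 * caller's responsibility and "sc" is hypothetical device state):
 *
 *	struct blockif_ctxt *bc;
 *
 *	bc = blockif_open(nvl, "1:0");
 *	// fill br_iov/br_iovcnt/br_offset/br_resid, set br_callback
 *	error = blockif_read(bc, &sc->sc_req);
 *	// E2BIG: queue full; otherwise br_callback fires on a worker
 *	// thread once the I/O completes (or blockif_cancel() stops it)
 *	blockif_close(bc);
 */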
/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}
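
/*
 * Worked example of the geometry calculation above, assuming a 4 GiB
 * backing store with 512-byte sectors (8388608 sectors): the total is
 * below 65536*16*63, so the fallback sequence applies; secpt=17 would
 * need more than 16 heads, secpt=31 still overflows 16 heads * 1024
 * cylinders, and the final values are secpt=63, heads=16,
 * hcyl = 8388608 / 63 = 133152, giving C/H/S = 8322/16/63.
 */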
/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (bc->bc_work_count)
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
		    __func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	/* Wake the threads that were blocked while paused */
	pthread_cond_broadcast(&bc->bc_paused_cond);
	/* Kick the worker threads after restore */
	pthread_cond_broadcast(&bc->bc_cond);
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
	int i;
	struct iovec *iov;
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);

	/*
	 * XXX: The callback and parameter must be filled by the virtualized
	 * device that uses the interface, during its init; we're not touching
	 * them here.
	 */

	/* Snapshot the iovecs. */
	for (i = 0; i < br->br_iovcnt; i++) {
		iov = &br->br_iov[i];

		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);

		/* We assume the iov is a guest-mapped address. */
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
		    false, meta, ret, done);
	}

done:
	return (ret);
}

int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int ret;

	if (bc->bc_paused == 0) {
		fprintf(stderr, "%s: Snapshot failed: "
		    "interface not paused.\r\n", __func__);
		return (ENXIO);
	}

	pthread_mutex_lock(&bc->bc_mtx);

	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);

done:
	pthread_mutex_unlock(&bc->bc_mtx);
	return (ret);
}
#endif