/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG     0xb109b109

#define BLOCKIF_NUMTHR  8
#define BLOCKIF_MAXREQ  (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
        BOP_READ,
        BOP_WRITE,
        BOP_FLUSH,
        BOP_DELETE
};

enum blockstat {
        BST_FREE,
        BST_BLOCK,
        BST_PEND,
        BST_BUSY,
        BST_DONE
};

struct blockif_elem {
        TAILQ_ENTRY(blockif_elem) be_link;
        struct blockif_req      *be_req;
        enum blockop            be_op;
        enum blockstat          be_status;
        pthread_t               be_tid;
        off_t                   be_block;
};

struct blockif_ctxt {
        unsigned int            bc_magic;
        int                     bc_fd;
        int                     bc_ischr;
        int                     bc_isgeom;
        int                     bc_candelete;
        int                     bc_rdonly;
        off_t                   bc_size;
        int                     bc_sectsz;
        int                     bc_psectsz;
        int                     bc_psectoff;
        int                     bc_closing;
        int                     bc_paused;
        pthread_t               bc_btid[BLOCKIF_NUMTHR];
        pthread_mutex_t         bc_mtx;
        pthread_cond_t          bc_cond;
        pthread_cond_t          bc_work_done_cond;
        blockif_resize_cb       *bc_resize_cb;
        void                    *bc_resize_cb_arg;
        struct mevent           *bc_resize_event;

        /* Request elements and free/pending/busy queues */
        TAILQ_HEAD(, blockif_elem) bc_freeq;
        TAILQ_HEAD(, blockif_elem) bc_pendq;
        TAILQ_HEAD(, blockif_elem) bc_busyq;
        struct blockif_elem     bc_reqs[BLOCKIF_MAXREQ];
        int                     bc_bootindex;
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
        pthread_mutex_t         bse_mtx;
        pthread_cond_t          bse_cond;
        int                     bse_pending;
        struct blockif_sig_elem *bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;
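
/*
 * Request ordering: be_block holds the offset just past the end of a
 * read/write/delete request (OFF_MAX for flushes).  A new request that
 * starts exactly where a pending or in-flight request ends is queued
 * as BST_BLOCK rather than BST_PEND, so chains of contiguous requests
 * are dispatched in submission order; blockif_complete() flips such
 * blocked followers back to BST_PEND when the earlier request retires.
 */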
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
        struct blockif_elem *be, *tbe;
        off_t off;
        int i;

        be = TAILQ_FIRST(&bc->bc_freeq);
        assert(be != NULL);
        assert(be->be_status == BST_FREE);
        TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
        be->be_req = breq;
        be->be_op = op;
        switch (op) {
        case BOP_READ:
        case BOP_WRITE:
        case BOP_DELETE:
                off = breq->br_offset;
                for (i = 0; i < breq->br_iovcnt; i++)
                        off += breq->br_iov[i].iov_len;
                break;
        default:
                off = OFF_MAX;
        }
        be->be_block = off;
        TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
                if (tbe->be_block == breq->br_offset)
                        break;
        }
        if (tbe == NULL) {
                TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
                        if (tbe->be_block == breq->br_offset)
                                break;
                }
        }
        if (tbe == NULL)
                be->be_status = BST_PEND;
        else
                be->be_status = BST_BLOCK;
        TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
        return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
        struct blockif_elem *be;

        TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
                if (be->be_status == BST_PEND)
                        break;
                assert(be->be_status == BST_BLOCK);
        }
        if (be == NULL)
                return (0);
        TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
        be->be_status = BST_BUSY;
        be->be_tid = t;
        TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
        *bep = be;
        return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
        struct blockif_elem *tbe;

        if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
                TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
        else
                TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
        TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
                if (tbe->be_req->br_offset == be->be_block)
                        tbe->be_status = BST_PEND;
        }
        be->be_tid = 0;
        be->be_status = BST_FREE;
        be->be_req = NULL;
        TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
        if (bc->bc_ischr) {
                if (ioctl(bc->bc_fd, DIOCGFLUSH))
                        return (errno);
        } else if (fsync(bc->bc_fd))
                return (errno);

        return (0);
}
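
/*
 * Process a single request.  "buf" is a MAXPHYS-sized bounce buffer
 * allocated per worker thread for GEOM-backed devices, whose sector-
 * granular transfer requirements individual iovec segments may not
 * satisfy: multi-segment requests are staged through it in MAXPHYS
 * chunks.  Plain files and single-segment requests use
 * preadv()/pwritev() directly.
 */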
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
        struct spacectl_range range;
        struct blockif_req *br;
        off_t arg[2];
        ssize_t n;
        size_t clen, len, off, boff, voff;
        int i, err;

        br = be->be_req;
        assert(br->br_resid >= 0);

        if (br->br_iovcnt <= 1)
                buf = NULL;
        err = 0;
        switch (be->be_op) {
        case BOP_READ:
                if (buf == NULL) {
                        if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
                            br->br_offset)) < 0)
                                err = errno;
                        else
                                br->br_resid -= n;
                        break;
                }
                i = 0;
                off = voff = 0;
                while (br->br_resid > 0) {
                        len = MIN(br->br_resid, MAXPHYS);
                        n = pread(bc->bc_fd, buf, len, br->br_offset + off);
                        if (n < 0) {
                                err = errno;
                                break;
                        }
                        len = (size_t)n;
                        boff = 0;
                        do {
                                clen = MIN(len - boff, br->br_iov[i].iov_len -
                                    voff);
                                memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
                                    buf + boff, clen);
                                if (clen < br->br_iov[i].iov_len - voff)
                                        voff += clen;
                                else {
                                        i++;
                                        voff = 0;
                                }
                                boff += clen;
                        } while (boff < len);
                        off += len;
                        br->br_resid -= len;
                }
                break;
        case BOP_WRITE:
                if (bc->bc_rdonly) {
                        err = EROFS;
                        break;
                }
                if (buf == NULL) {
                        if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
                            br->br_offset)) < 0)
                                err = errno;
                        else
                                br->br_resid -= n;
                        break;
                }
                i = 0;
                off = voff = 0;
                while (br->br_resid > 0) {
                        len = MIN(br->br_resid, MAXPHYS);
                        boff = 0;
                        do {
                                clen = MIN(len - boff, br->br_iov[i].iov_len -
                                    voff);
                                memcpy(buf + boff,
                                    (uint8_t *)br->br_iov[i].iov_base + voff,
                                    clen);
                                if (clen < br->br_iov[i].iov_len - voff)
                                        voff += clen;
                                else {
                                        i++;
                                        voff = 0;
                                }
                                boff += clen;
                        } while (boff < len);

                        n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
                        if (n < 0) {
                                err = errno;
                                break;
                        }
                        off += n;
                        br->br_resid -= n;
                }
                break;
        case BOP_FLUSH:
                err = blockif_flush_bc(bc);
                break;
        case BOP_DELETE:
                if (!bc->bc_candelete)
                        err = EOPNOTSUPP;
                else if (bc->bc_rdonly)
                        err = EROFS;
                else if (bc->bc_ischr) {
                        arg[0] = br->br_offset;
                        arg[1] = br->br_resid;
                        if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
                                err = errno;
                        else
                                br->br_resid = 0;
                } else {
                        range.r_offset = br->br_offset;
                        range.r_len = br->br_resid;

                        while (range.r_len > 0) {
                                if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
                                    &range, 0, &range) != 0) {
                                        err = errno;
                                        break;
                                }
                        }
                        if (err == 0)
                                br->br_resid = 0;
                }
                break;
        default:
                err = EINVAL;
                break;
        }

        be->be_status = BST_DONE;

        (*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
        return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

static void *
blockif_thr(void *arg)
{
        struct blockif_ctxt *bc;
        struct blockif_elem *be;
        pthread_t t;
        uint8_t *buf;

        bc = arg;
        if (bc->bc_isgeom)
                buf = malloc(MAXPHYS);
        else
                buf = NULL;
        t = pthread_self();

        pthread_mutex_lock(&bc->bc_mtx);
        for (;;) {
                while (blockif_dequeue(bc, t, &be)) {
                        pthread_mutex_unlock(&bc->bc_mtx);
                        blockif_proc(bc, be, buf);
                        pthread_mutex_lock(&bc->bc_mtx);
                        blockif_complete(bc, be);
                }

                /* If no work is pending, notify the main thread */
                if (blockif_empty(bc))
                        pthread_cond_broadcast(&bc->bc_work_done_cond);

                /* Check ctxt status here to see if exit requested */
                if (bc->bc_closing)
                        break;

                pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
        }
        pthread_mutex_unlock(&bc->bc_mtx);

        if (buf)
                free(buf);
        pthread_exit(NULL);
        return (NULL);
}
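
/*
 * Cancellation plumbing: blockif_cancel() pushes a blockif_sig_elem
 * onto the lock-free blockif_bse_head stack and sends SIGCONT to the
 * worker thread that owns the in-flight request.  This handler, run
 * from the mevent loop, drains the entire stack and wakes each waiter,
 * whichever request the entry belongs to.
 */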
static void
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
    void *arg __unused)
{
        struct blockif_sig_elem *bse;

        for (;;) {
                /*
                 * Process the entire list even if not intended for
                 * this thread.
                 */
                do {
                        bse = blockif_bse_head;
                        if (bse == NULL)
                                return;
                } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
                    (uintptr_t)bse,
                    (uintptr_t)bse->bse_next));

                pthread_mutex_lock(&bse->bse_mtx);
                bse->bse_pending = 0;
                pthread_cond_signal(&bse->bse_cond);
                pthread_mutex_unlock(&bse->bse_mtx);
        }
}

static void
blockif_init(void)
{
        mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
        (void) signal(SIGCONT, SIG_IGN);
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
        char *cp, *path;

        if (opts == NULL)
                return (0);

        cp = strchr(opts, ',');
        if (cp == NULL) {
                set_config_value_node(nvl, "path", opts);
                return (0);
        }
        path = strndup(opts, cp - opts);
        set_config_value_node(nvl, "path", path);
        free(path);
        return (pci_parse_legacy_config(nvl, cp + 1));
}

int
blockif_add_boot_device(struct pci_devinst *const pi,
    struct blockif_ctxt *const bc)
{
        if (bc->bc_bootindex < 0)
                return (0);

        return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
}
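
/*
 * Open the backing store for a block device.  Recognized configuration
 * keys: "path" (required); the booleans "nocache" (O_DIRECT),
 * "sync"/"direct" (O_SYNC), "ro" and "nodelete"; "sectorsize", given
 * as "logical" or "logical/physical"; and "bootindex".
 */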
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
        char tname[MAXCOMLEN + 1];
        char name[MAXPATHLEN];
        const char *path, *pssval, *ssval, *bootindex_val;
        char *cp;
        struct blockif_ctxt *bc;
        struct stat sbuf;
        struct diocgattr_arg arg;
        off_t size, psectsz, psectoff;
        int extra, fd, i, sectsz;
        int ro, candelete, geom, ssopt, pssopt;
        int nodelete;
        int bootindex;

#ifndef WITHOUT_CAPSICUM
        cap_rights_t rights;
        cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

        pthread_once(&blockif_once, blockif_init);

        fd = -1;
        extra = 0;
        ssopt = 0;
        ro = 0;
        nodelete = 0;
        bootindex = -1;

        if (get_config_bool_node_default(nvl, "nocache", false))
                extra |= O_DIRECT;
        if (get_config_bool_node_default(nvl, "nodelete", false))
                nodelete = 1;
        if (get_config_bool_node_default(nvl, "sync", false) ||
            get_config_bool_node_default(nvl, "direct", false))
                extra |= O_SYNC;
        if (get_config_bool_node_default(nvl, "ro", false))
                ro = 1;
        ssval = get_config_value_node(nvl, "sectorsize");
        if (ssval != NULL) {
                ssopt = strtol(ssval, &cp, 10);
                if (cp == ssval) {
                        EPRINTLN("Invalid sector size \"%s\"", ssval);
                        goto err;
                }
                if (*cp == '\0') {
                        pssopt = ssopt;
                } else if (*cp == '/') {
                        pssval = cp + 1;
                        pssopt = strtol(pssval, &cp, 10);
                        if (cp == pssval || *cp != '\0') {
                                EPRINTLN("Invalid sector size \"%s\"", ssval);
                                goto err;
                        }
                } else {
                        EPRINTLN("Invalid sector size \"%s\"", ssval);
                        goto err;
                }
        }

        bootindex_val = get_config_value_node(nvl, "bootindex");
        if (bootindex_val != NULL) {
                bootindex = atoi(bootindex_val);
        }

        path = get_config_value_node(nvl, "path");
        if (path == NULL) {
                EPRINTLN("Missing \"path\" for block device.");
                goto err;
        }

        fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
        if (fd < 0 && !ro) {
                /* The r/w open failed; fall back to a r/o open */
                fd = open(path, O_RDONLY | extra);
                ro = 1;
        }

        if (fd < 0) {
                warn("Could not open backing file: %s", path);
                goto err;
        }

        if (fstat(fd, &sbuf) < 0) {
                warn("Could not stat backing file %s", path);
                goto err;
        }

#ifndef WITHOUT_CAPSICUM
        cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
            CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
        if (ro)
                cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

        if (caph_rights_limit(fd, &rights) == -1)
                errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

        /*
         * Deal with raw devices
         */
        size = sbuf.st_size;
        sectsz = DEV_BSIZE;
        psectsz = psectoff = 0;
        candelete = geom = 0;
        if (S_ISCHR(sbuf.st_mode)) {
                if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
                    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
                        perror("Could not fetch dev blk/sector size");
                        goto err;
                }
                assert(size != 0);
                assert(sectsz != 0);
                if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
                        ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
                strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
                arg.len = sizeof(arg.value.i);
                if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
                        candelete = arg.value.i;
                if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
                        geom = 1;
        } else {
                psectsz = sbuf.st_blksize;
                /* Avoid fallback implementation */
                candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
        }

#ifndef WITHOUT_CAPSICUM
        if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
                errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif
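
        /*
         * An explicit sector size must be a power of two, at least 512
         * bytes, and no larger than the physical sector size, which
         * must itself be a power of two.
         */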
        if (ssopt != 0) {
                if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
                    ssopt > pssopt) {
                        EPRINTLN("Invalid sector size %d/%d",
                            ssopt, pssopt);
                        goto err;
                }

                /*
                 * Some backend drivers (e.g. cd0, ada0) require that the I/O
                 * size be a multiple of the device's sector size.
                 *
                 * Validate that the emulated sector size complies with this
                 * requirement.
                 */
                if (S_ISCHR(sbuf.st_mode)) {
                        if (ssopt < sectsz || (ssopt % sectsz) != 0) {
                                EPRINTLN("Sector size %d incompatible "
                                    "with underlying device sector size %d",
                                    ssopt, sectsz);
                                goto err;
                        }
                }

                sectsz = ssopt;
                psectsz = pssopt;
                psectoff = 0;
        }

        bc = calloc(1, sizeof(struct blockif_ctxt));
        if (bc == NULL) {
                perror("calloc");
                goto err;
        }

        bc->bc_magic = BLOCKIF_SIG;
        bc->bc_fd = fd;
        bc->bc_ischr = S_ISCHR(sbuf.st_mode);
        bc->bc_isgeom = geom;
        bc->bc_candelete = candelete;
        bc->bc_rdonly = ro;
        bc->bc_size = size;
        bc->bc_sectsz = sectsz;
        bc->bc_psectsz = psectsz;
        bc->bc_psectoff = psectoff;
        pthread_mutex_init(&bc->bc_mtx, NULL);
        pthread_cond_init(&bc->bc_cond, NULL);
        bc->bc_paused = 0;
        pthread_cond_init(&bc->bc_work_done_cond, NULL);
        TAILQ_INIT(&bc->bc_freeq);
        TAILQ_INIT(&bc->bc_pendq);
        TAILQ_INIT(&bc->bc_busyq);
        bc->bc_bootindex = bootindex;
        for (i = 0; i < BLOCKIF_MAXREQ; i++) {
                bc->bc_reqs[i].be_status = BST_FREE;
                TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
        }

        for (i = 0; i < BLOCKIF_NUMTHR; i++) {
                pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
                snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
                pthread_set_name_np(bc->bc_btid[i], tname);
        }

        return (bc);
err:
        if (fd >= 0)
                close(fd);
        return (NULL);
}

static void
blockif_resized(int fd, enum ev_type type __unused, void *arg)
{
        struct blockif_ctxt *bc;
        struct stat sb;
        off_t mediasize;

        if (fstat(fd, &sb) != 0)
                return;

        if (S_ISCHR(sb.st_mode)) {
                if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
                        EPRINTLN("blockif_resized: get mediasize failed: %s",
                            strerror(errno));
                        return;
                }
        } else
                mediasize = sb.st_size;

        bc = arg;
        pthread_mutex_lock(&bc->bc_mtx);
        if (mediasize != bc->bc_size) {
                bc->bc_size = mediasize;
                bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
        }
        pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
        struct stat sb;
        int err;

        if (cb == NULL)
                return (EINVAL);

        err = 0;

        pthread_mutex_lock(&bc->bc_mtx);
        if (bc->bc_resize_cb != NULL) {
                err = EBUSY;
                goto out;
        }

        assert(bc->bc_closing == 0);

        if (fstat(bc->bc_fd, &sb) != 0) {
                err = errno;
                goto out;
        }

        bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
            EVFF_ATTRIB, blockif_resized, bc);
        if (bc->bc_resize_event == NULL) {
                err = ENXIO;
                goto out;
        }

        bc->bc_resize_cb = cb;
        bc->bc_resize_cb_arg = cb_arg;
out:
        pthread_mutex_unlock(&bc->bc_mtx);

        return (err);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
        int err;

        err = 0;

        pthread_mutex_lock(&bc->bc_mtx);
        assert(!bc->bc_paused);
        if (!TAILQ_EMPTY(&bc->bc_freeq)) {
                /*
                 * Enqueue and inform the block i/o thread
                 * that there is work available
                 */
                if (blockif_enqueue(bc, breq, op))
                        pthread_cond_signal(&bc->bc_cond);
        } else {
                /*
                 * Callers are not allowed to enqueue more than
                 * the specified blockif queue limit. Return an
                 * error to indicate that the queue length has been
                 * exceeded.
                 */
                err = E2BIG;
        }
        pthread_mutex_unlock(&bc->bc_mtx);

        return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (blockif_request(bc, breq, BOP_DELETE));
}
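
/*
 * Cancel a queued or in-flight request.  Returns 0 if the request was
 * still pending and has been completed unexecuted, EINVAL if it is not
 * known to this blockif, and EBUSY if a worker had already picked it
 * up and was interrupted (the callback may or may not have run).
 */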
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
        struct blockif_elem *be;

        assert(bc->bc_magic == BLOCKIF_SIG);

        pthread_mutex_lock(&bc->bc_mtx);
        /* XXX: not waiting while paused */

        /*
         * Check pending requests.
         */
        TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
                if (be->be_req == breq)
                        break;
        }
        if (be != NULL) {
                /*
                 * Found it.
                 */
                blockif_complete(bc, be);
                pthread_mutex_unlock(&bc->bc_mtx);

                return (0);
        }

        /*
         * Check in-flight requests.
         */
        TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
                if (be->be_req == breq)
                        break;
        }
        if (be == NULL) {
                /*
                 * Didn't find it.
                 */
                pthread_mutex_unlock(&bc->bc_mtx);
                return (EINVAL);
        }

        /*
         * Interrupt the processing thread to force it to return
         * prematurely via its normal callback path.
         */
        while (be->be_status == BST_BUSY) {
                struct blockif_sig_elem bse, *old_head;

                pthread_mutex_init(&bse.bse_mtx, NULL);
                pthread_cond_init(&bse.bse_cond, NULL);

                bse.bse_pending = 1;

                do {
                        old_head = blockif_bse_head;
                        bse.bse_next = old_head;
                } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
                    (uintptr_t)old_head,
                    (uintptr_t)&bse));

                pthread_kill(be->be_tid, SIGCONT);

                pthread_mutex_lock(&bse.bse_mtx);
                while (bse.bse_pending)
                        pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
                pthread_mutex_unlock(&bse.bse_mtx);
        }

        pthread_mutex_unlock(&bc->bc_mtx);

        /*
         * The processing thread has been interrupted.  Since it's not
         * clear if the callback has been invoked yet, return EBUSY.
         */
        return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
        void *jval;
        int i;

        assert(bc->bc_magic == BLOCKIF_SIG);

        /*
         * Stop the block i/o thread
         */
        pthread_mutex_lock(&bc->bc_mtx);
        bc->bc_closing = 1;
        if (bc->bc_resize_event != NULL)
                mevent_disable(bc->bc_resize_event);
        pthread_mutex_unlock(&bc->bc_mtx);
        pthread_cond_broadcast(&bc->bc_cond);
        for (i = 0; i < BLOCKIF_NUMTHR; i++)
                pthread_join(bc->bc_btid[i], &jval);

        /* XXX Cancel queued i/o's ??? */

        /*
         * Release resources
         */
        bc->bc_magic = 0;
        close(bc->bc_fd);
        free(bc);

        return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
        off_t sectors;          /* total sectors of the block dev */
        off_t hcyl;             /* cylinders times heads */
        uint16_t secpt;         /* sectors per track */
        uint8_t heads;

        assert(bc->bc_magic == BLOCKIF_SIG);

        sectors = bc->bc_size / bc->bc_sectsz;

        /* Clamp the size to the largest possible with CHS */
        if (sectors > 65535L * 16 * 255)
                sectors = 65535L * 16 * 255;

        if (sectors >= 65536L * 16 * 63) {
                secpt = 255;
                heads = 16;
                hcyl = sectors / secpt;
        } else {
                secpt = 17;
                hcyl = sectors / secpt;
                heads = (hcyl + 1023) / 1024;

                if (heads < 4)
                        heads = 4;

                if (hcyl >= (heads * 1024) || heads > 16) {
                        secpt = 31;
                        heads = 16;
                        hcyl = sectors / secpt;
                }
                if (hcyl >= (heads * 1024)) {
                        secpt = 63;
                        heads = 16;
                        hcyl = sectors / secpt;
                }
        }

        *c = hcyl / heads;
        *h = heads;
        *s = secpt;
}
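
/*
 * Worked example for blockif_chs(): a 64 GiB disk with 512-byte
 * sectors spans 134217728 sectors, above the 65536*16*63 threshold,
 * so the maximal geometry applies: secpt = 255, heads = 16, and
 * cylinders = (134217728 / 255) / 16 = 32896.
 */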

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        *size = bc->bc_psectsz;
        *off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
        assert(bc->bc_magic == BLOCKIF_SIG);
        return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
void
blockif_pause(struct blockif_ctxt *bc)
{
        assert(bc != NULL);
        assert(bc->bc_magic == BLOCKIF_SIG);

        pthread_mutex_lock(&bc->bc_mtx);
        bc->bc_paused = 1;

        /* The interface is paused. Wait for workers to finish their work */
        while (!blockif_empty(bc))
                pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
        pthread_mutex_unlock(&bc->bc_mtx);

        if (!bc->bc_rdonly && blockif_flush_bc(bc))
                EPRINTLN("%s: [WARN] failed to flush backing file.",
                    __func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
        assert(bc != NULL);
        assert(bc->bc_magic == BLOCKIF_SIG);

        pthread_mutex_lock(&bc->bc_mtx);
        bc->bc_paused = 0;
        pthread_mutex_unlock(&bc->bc_mtx);
}
#endif /* BHYVE_SNAPSHOT */