/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

struct blockif_ctxt {
	unsigned int		bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_work_done_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
	int			bc_bootindex;
};
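/*
 * Design note (summary of the queue discipline implemented below):
 * each context owns BLOCKIF_NUMTHR worker threads that pull requests
 * off bc_pendq under bc_mtx.  blockif_enqueue() records each request's
 * end offset in be_block; a new request whose starting offset matches
 * the end offset of one already pending or in flight is parked in
 * BST_BLOCK state and serialized behind it, while unrelated requests
 * run concurrently across the workers.
 */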
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}
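/*
 * blockif_proc() executes a single request on a worker thread.  When
 * the backing store is a GEOM device, the caller supplies a
 * MAXPHYS-sized bounce buffer so that multi-entry scatter/gather
 * lists can be serviced with single pread()/pwrite() calls; plain
 * files and single-entry requests go through preadv()/pwritev()
 * directly.
 */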
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct spacectl_range range;
	struct blockif_req *br;
	off_t arg[2];
	ssize_t n;
	size_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	assert(br->br_resid >= 0);

	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			len = (size_t)n;
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    (uint8_t *)br->br_iov[i].iov_base + voff,
				    clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);

			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			off += n;
			br->br_resid -= n;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		/* If no work remains, notify the main thread */
		if (blockif_empty(bc))
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}
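/*
 * Request cancellation handshake: blockif_cancel() pushes a
 * blockif_sig_elem onto a lock-free list and interrupts the busy
 * worker with SIGCONT.  The handler below drains the entire list and
 * wakes each waiter, so it is harmless for the signal to be delivered
 * to a thread other than the one being cancelled.
 */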
static void
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
    void *arg __unused)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

int
blockif_add_boot_device(struct pci_devinst *const pi,
    struct blockif_ctxt *const bc)
{
	if (bc->bc_bootindex < 0)
		return (0);

	return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
}
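/*
 * Configuration keys honored by blockif_open(): "path" (required),
 * "nocache" (O_DIRECT), "sync"/"direct" (O_SYNC), "ro", "nodelete",
 * "sectorsize" (logical or logical/physical) and "bootindex".
 */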
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	const char *path, *pssval, *ssval, *bootindex_val;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;
	int bootindex;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
	ro = 0;
	nodelete = 0;
	bootindex = -1;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	bootindex_val = get_config_value_node(nvl, "bootindex");
	if (bootindex_val != NULL) {
		bootindex = atoi(bootindex_val);
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Retry a failed read/write open as read-only */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	bc->bc_bootindex = bootindex;
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}
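/*
 * A minimal open sketch (hypothetical caller; the nvlist node and the
 * "disk0" ident are illustrative only):
 *
 *	set_config_value_node(nvl, "path", "/dev/md0");
 *	bc = blockif_open(nvl, "disk0");
 *	if (bc == NULL)
 *		errx(EX_OSERR, "failed to open backing store");
 */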
static void
blockif_resized(int fd, enum ev_type type __unused, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	assert(!bc->bc_paused);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit.  Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}
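/*
 * Example submission path (sketch; "my_done" is a hypothetical
 * completion callback, invoked later on a worker thread with the
 * request and an errno value):
 *
 *	br->br_offset = lba * blockif_sectsz(bc);
 *	br->br_iovcnt = 1;
 *	br->br_resid = br->br_iov[0].iov_len;
 *	br->br_callback = my_done;
 *	err = blockif_read(bc, br);
 *
 * A return of E2BIG means all BLOCKIF_MAXREQ request slots are in use.
 */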
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}
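/*
 * Worked example for the C/H/S computation below (assuming a 1 GiB
 * backing store with 512-byte sectors): sectors = 2097152, which is
 * below 65536*16*63, so the search starts at secpt = 17, giving
 * hcyl = 123361 and heads = 121.  heads > 16 forces the 31
 * sectors/track fallback (hcyl = 67650), which still exceeds
 * 16 * 1024 cylinders, so the final geometry is secpt = 63,
 * heads = 16, hcyl = 33288: C = 2080, H = 16, S = 63.
 */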
/*
 * Return virtual C/H/S values for a given block.  Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535L * 16 * 255)
		sectors = 65535L * 16 * 255;

	if (sectors >= 65536L * 16 * 63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused.  Wait for workers to finish their work */
	while (!blockif_empty(bc))
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (!bc->bc_rdonly && blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
		    __func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	pthread_mutex_unlock(&bc->bc_mtx);
}
#endif	/* BHYVE_SNAPSHOT */