/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>
#include <machine/vmm_snapshot.h>

#include "bhyverun.h"
#include "debug.h"
#include "mevent.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_NUMTHR	8
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	int			bc_paused;
	int			bc_work_count;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	pthread_cond_t		bc_paused_cond;
	pthread_cond_t		bc_work_done_cond;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

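/*
 * Cancellation handshake: to interrupt a worker that is blocked in I/O,
 * blockif_cancel() pushes one of these elements onto the lock-free list
 * headed by blockif_bse_head and sends the worker SIGCONT.  The handler
 * registered in blockif_init() pops every pending element and wakes its
 * waiter.
 */
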
struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);

	return (0);
}

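/*
 * blockif_proc() below services one dequeued request.  When the backing
 * store is a GEOM device (bc_isgeom) and the request has more than one
 * iovec segment, the data is staged through the worker's MAXPHYS-sized
 * bounce buffer in MAXPHYS chunks rather than handed to preadv()/pwritev()
 * directly; presumably this keeps each transfer to the raw device a single
 * contiguous I/O.  Single-segment requests and plain files take the
 * vectored path.
 */
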
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
	off_t arg[2];
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else
			err = EOPNOTSUPP;
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		bc->bc_work_count++;

		/* We cannot process work if the interface is paused */
		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}

		bc->bc_work_count--;

		/* If none of the workers are busy, notify the main thread */
		if (bc->bc_work_count == 0)
			pthread_cond_broadcast(&bc->bc_work_done_cond);

		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		/* Make all worker threads wait here if the device is paused */
		while (bc->bc_paused)
			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

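/*
 * Pause/resume protocol used by the worker loop above: bc_work_count
 * counts workers currently inside the dequeue/process loop.
 * blockif_pause() sets bc_paused and sleeps on bc_work_done_cond until
 * the count drains to zero; the workers then park on bc_paused_cond
 * until blockif_resume() broadcasts it.  A paused context therefore has
 * no request in flight, which the snapshot code at the bottom of this
 * file depends on.
 */
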
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char name[MAXPATHLEN];
	char *nopt, *xopts, *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	ssopt = 0;
	nocache = 0;
	sync = 0;
	ro = 0;
	nodelete = 0;

	/*
	 * The first element in the optstring is always a pathname.
	 * Optional elements follow.
	 */
	nopt = xopts = strdup(optstr);
	while (xopts != NULL) {
		cp = strsep(&xopts, ",");
		if (cp == nopt)		/* file or device pathname */
			continue;
		else if (!strcmp(cp, "nocache"))
			nocache = 1;
		else if (!strcmp(cp, "nodelete"))
			nodelete = 1;
		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
			sync = 1;
		else if (!strcmp(cp, "ro"))
			ro = 1;
		else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2)
			;
		else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1)
			pssopt = ssopt;
		else {
			EPRINTLN("Invalid device option \"%s\"", cp);
			goto err;
		}
	}

	extra = 0;
	if (nocache)
		extra |= O_DIRECT;
	if (sync)
		extra |= O_SYNC;

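	/*
	 * Example optstrings accepted by the loop above (paths are
	 * illustrative):
	 *
	 *	/vms/disk0.img,nocache
	 *	/dev/ada0,nodelete,sectorsize=512/4096
	 *	/vms/install.iso,ro,direct
	 *
	 * "direct" is an alias for "sync", and a bare "sectorsize=N"
	 * implies a physical sector size equal to N.
	 */
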
	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The r/w open failed; retry with a r/o open */
		fd = open(nopt, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", nopt);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", nopt);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else
		psectsz = sbuf.st_blksize;

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

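	/*
	 * Worked example for the validation below: on a character device
	 * that reports 512-byte sectors, "sectorsize=4096" passes (a power
	 * of two, >= 512, and a multiple of the device's 512), while
	 * "sectorsize=512" on a 4096-byte-sector device fails because the
	 * emulated sector size may not be smaller than, or a non-multiple
	 * of, the device's.
	 */
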
	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	bc->bc_paused = 0;
	bc->bc_work_count = 0;
	pthread_cond_init(&bc->bc_paused_cond, NULL);
	pthread_cond_init(&bc->bc_work_done_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	free(nopt);
	return (NULL);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

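/*
 * A minimal caller sketch (hypothetical: my_done, buf, len and off are
 * illustrative, not part of this interface).  The callback runs on a
 * blockif worker thread with err set to 0 or an errno value, and
 * blockif_read() itself returns E2BIG when the request ring is full:
 *
 *	static void
 *	my_done(struct blockif_req *br, int err) { ... }
 *
 *	br->br_iov[0].iov_base = buf;
 *	br->br_iov[0].iov_len = len;
 *	br->br_iovcnt = 1;
 *	br->br_offset = off;
 *	br->br_resid = len;
 *	br->br_callback = my_done;
 *	error = blockif_read(bc, br);
 */
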
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/* XXX: not waiting while paused */

	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

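/*
 * Worked example for blockif_chs() below, following the code: a 4 GiB
 * backing store with 512-byte sectors yields 8388608 sectors, below the
 * 65536*16*63 threshold.  The first pass (secpt = 17) produces far more
 * than 16 heads, the second (secpt = 31) still overflows 16 heads * 1024
 * cylinders, and the final pass settles on secpt = 63, heads = 16,
 * giving C/H/S = 8322/16/63.
 */
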
/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifdef BHYVE_SNAPSHOT
void
blockif_pause(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 1;

	/* The interface is paused. Wait for workers to finish their work */
	while (bc->bc_work_count)
		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
	pthread_mutex_unlock(&bc->bc_mtx);

	if (blockif_flush_bc(bc))
		fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n",
		    __func__);
}

void
blockif_resume(struct blockif_ctxt *bc)
{
	assert(bc != NULL);
	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_paused = 0;
	/* resume the threads waiting for paused */
	pthread_cond_broadcast(&bc->bc_paused_cond);
	/* kick the threads after restore */
	pthread_cond_broadcast(&bc->bc_cond);
	pthread_mutex_unlock(&bc->bc_mtx);
}

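/*
 * Snapshot ordering, as enforced by the code in this block: callers are
 * expected to blockif_pause() the context first, draining all in-flight
 * work and flushing the backing store, and only then snapshot;
 * blockif_snapshot() below fails with ENXIO if bc_paused is not set.
 */
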
int
blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
{
	int i;
	struct iovec *iov;
	int ret;

	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);

	/*
	 * XXX: The callback and parameter must be filled by the virtualized
	 * device that uses the interface, during its init; we're not touching
	 * them here.
	 */

	/* Snapshot the iovecs. */
	for (i = 0; i < br->br_iovcnt; i++) {
		iov = &br->br_iov[i];

		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);

		/* We assume the iov is a guest-mapped address. */
		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
		    false, meta, ret, done);
	}

done:
	return (ret);
}

int
blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
{
	int ret;

	if (bc->bc_paused == 0) {
		fprintf(stderr, "%s: Snapshot failed: "
		    "interface not paused.\r\n", __func__);
		return (ENXIO);
	}

	pthread_mutex_lock(&bc->bc_mtx);

	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);

done:
	pthread_mutex_unlock(&bc->bc_mtx);
	return (ret);
}
#endif