/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <sys/limits.h>
#include <sys/uio.h>
#ifndef __FreeBSD__
#include <sys/dkio.h>
#endif

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#ifdef __FreeBSD__
#define BLOCKIF_NUMTHR	8
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR	16
#endif
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,
#endif
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

#ifndef __FreeBSD__
enum blockif_wce {
	WCE_NONE = 0,
	WCE_IOCTL,
	WCE_FCNTL
};
#endif
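
/*
 * A sketch of the request lifecycle implemented below: each blockif_elem
 * cycles through the blockstat states as
 *
 *	BST_FREE --> BST_PEND ----> BST_BUSY --> BST_DONE --> BST_FREE
 *	        \--> BST_BLOCK --/
 *
 * blockif_enqueue() marks a request BST_BLOCK instead of BST_PEND when
 * another pending or busy request ends at its starting offset.  Worker
 * threads dequeue only BST_PEND entries, and blockif_complete() promotes
 * matching BST_BLOCK entries back to BST_PEND.
 */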

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
#ifndef __FreeBSD__
	enum blockif_wce	bc_wce;
#endif
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
#ifndef __FreeBSD__
	case BOP_WRITE_SYNC:
#endif
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}
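
/*
 * A worked example of the ordering rule above: if a busy write covers
 * bytes [0, 8192) (so its be_block is 8192) and a new request arrives
 * with br_offset == 8192, the new request is queued as BST_BLOCK.  When
 * the first write finishes, blockif_complete() sees that its be_block
 * matches the blocked request's br_offset and promotes it to BST_PEND,
 * preserving the order of dependent back-to-back requests.
 */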

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
#ifdef __FreeBSD__
	struct spacectl_range range;
	off_t arg[2];
#endif
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
#ifdef __FreeBSD__
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
#else
		/*
		 * This fsync() should be adequate to flush the cache of a file
		 * or device.  In VFS, the VOP_SYNC operation is converted to
		 * the appropriate ioctl in both sdev (for real devices) and
		 * zfs (for zvols).
		 */
		if (fsync(bc->bc_fd))
			err = errno;
#endif
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
#ifdef __FreeBSD__
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
#else
		else if (bc->bc_ischr) {
			dkioc_free_list_t dfl = {
				.dfl_num_exts = 1,
				.dfl_offset = 0,
				.dfl_flags = 0,
				.dfl_exts = {
					{
						.dfle_start = br->br_offset,
						.dfle_length = br->br_resid
					}
				}
			};

			if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			struct flock fl = {
				.l_whence = 0,
				.l_type = F_WRLCK,
				.l_start = br->br_offset,
				.l_len = br->br_resid
			};

			if (fcntl(bc->bc_fd, F_FREESP, &fl))
				err = errno;
			else
				br->br_resid = 0;
		}
#endif
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}
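
/*
 * Note on the bounce buffer: blockif_proc() receives a MAXPHYS-sized
 * buffer only when the backing store is a GEOM provider (bc_isgeom), and
 * uses it only for multi-segment requests.  A plausible reading,
 * consistent with the sector-size note in blockif_open(): raw providers
 * require each transfer to be a multiple of the sector size, which
 * individual iovec segments do not guarantee, so scattered segments are
 * staged through one contiguous buffer in MAXPHYS-sized chunks.
 * Single-segment requests and non-GEOM backends go straight to
 * preadv()/pwritev().
 */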

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

#ifdef __FreeBSD__
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
#else
static void
blockif_sigcont_handler(int signal)
#endif
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
#ifdef __FreeBSD__
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
#else
	(void) sigset(SIGCONT, blockif_sigcont_handler);
#endif
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
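
/*
 * Illustrative example (hypothetical values): a legacy option string such
 * as
 *
 *	/dev/zvol/rdsk/tank/vol0,nocache,sectorsize=512/4096
 *
 * is split at the first comma; "/dev/zvol/rdsk/tank/vol0" is stored under
 * the "path" key and "nocache,sectorsize=512/4096" is handed to
 * pci_parse_legacy_config().  Those keys are read back in blockif_open()
 * via get_config_value_node() and get_config_bool_node_default().
 */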

struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
	char name[MAXPATHLEN];
#endif
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
#ifdef __FreeBSD__
	struct diocgattr_arg arg;
#else
	enum blockif_wce wce = WCE_NONE;
#endif
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
#ifndef __FreeBSD__
	pssopt = 0;
#endif
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Attempt a r/o open if the r/w open failed */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
#ifdef __FreeBSD__
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}
#else
	psectsz = sbuf.st_blksize;
	if (S_ISCHR(sbuf.st_mode)) {
		struct dk_minfo_ext dkmext;
		int wce_val;

		/* Look for a more accurate physical block size */
		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
			psectsz = dkmext.dki_pbsize;
		}
		/* See if a configurable write cache is present and working */
		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
			/*
			 * If WCE is already active, disable it until the
			 * specific device driver calls for its return.  If it
			 * is not active, toggle it on and off to verify that
			 * such actions are possible.
			 */
			if (wce_val != 0) {
				wce_val = 0;
				/*
				 * Inability to disable the cache is a threat
				 * to data durability.
				 */
				assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
				wce = WCE_IOCTL;
			} else {
				int r1, r2;

				wce_val = 1;
				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
				wce_val = 0;
				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);

				if (r1 == 0 && r2 == 0) {
					wce = WCE_IOCTL;
				} else {
					/*
					 * If the cache toggle was not
					 * successful, ensure that the cache
					 * was not left enabled.
					 */
					assert(r1 != 0);
				}
			}
		}

		if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
			candelete = 0;

	} else {
		int flags;

		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
			flags |= O_DSYNC;
			if (fcntl(fd, F_SETFL, flags) != -1) {
				wce = WCE_FCNTL;
			}
		}

		/*
		 * We don't have a way to discover if a file supports the
		 * FREESP fcntl cmd (other than trying it).  However,
		 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
		 * NFSv4 and NFSv4.1 also forward the FREESP request
		 * to the server, so we always enable it for file based
		 * volumes.  Anyone trying to run volumes on an unsupported
		 * configuration is on their own, and should be prepared
		 * for the requests to fail.
		 */
		if (nodelete == 0)
			candelete = 1;
	}
#endif

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
#ifndef __FreeBSD__
	bc->bc_wce = wce;
#endif
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}
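
/*
 * Illustrative usage sketch (hypothetical names; allocation of the
 * request structure is up to the caller).  A device model might issue an
 * asynchronous 4 KiB read like this:
 *
 *	static void
 *	vdev_read_done(struct blockif_req *br, int err)
 *	{
 *		// err is 0 or an errno value; br->br_resid holds the
 *		// bytes left untransferred.
 *	}
 *
 *	br->br_iov[0].iov_base = buf;
 *	br->br_iov[0].iov_len = 4096;
 *	br->br_iovcnt = 1;
 *	br->br_offset = 0;
 *	br->br_resid = 4096;
 *	br->br_callback = vdev_read_done;
 *	error = blockif_read(bc, br);	// E2BIG once blockif_queuesz()
 *					// requests are outstanding
 *
 * The callback runs on a blockif worker thread, not on the submitting
 * thread.
 */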

static void
blockif_resized(int fd, enum ev_type type, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;

	if (fstat(fd, &sb) != 0)
		return;

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (sb.st_size != bc->bc_size) {
		bc->bc_size = sb.st_size;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;
#ifndef __FreeBSD__
	err = 0;
#endif

	if (cb == NULL)
		return (EINVAL);

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}
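
/*
 * Illustrative note: the EVFF_ATTRIB event fires when attributes of the
 * open vnode change, e.g. after truncate(1) grows a file-backed image.
 * blockif_resized() then updates bc_size under bc_mtx and invokes the
 * registered callback with the new size so the device model can notify
 * the guest.
 */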

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit.  Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}
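
/*
 * Worked example for the C/H/S calculation below (added for
 * illustration): a 16 GiB image with 512-byte sectors has 33,554,432
 * sectors.  That is below the 65536*16*63 threshold, so the fallback
 * search runs and settles on S=63, H=16, with
 * hcyl = 33554432 / 63 = 532610, giving C = 532610 / 16 = 33288.
 */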
*/ 1023 1024 /* 1025 * Release resources 1026 */ 1027 bc->bc_magic = 0; 1028 close(bc->bc_fd); 1029 free(bc); 1030 1031 return (0); 1032 } 1033 1034 /* 1035 * Return virtual C/H/S values for a given block. Use the algorithm 1036 * outlined in the VHD specification to calculate values. 1037 */ 1038 void 1039 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 1040 { 1041 off_t sectors; /* total sectors of the block dev */ 1042 off_t hcyl; /* cylinders times heads */ 1043 uint16_t secpt; /* sectors per track */ 1044 uint8_t heads; 1045 1046 assert(bc->bc_magic == BLOCKIF_SIG); 1047 1048 sectors = bc->bc_size / bc->bc_sectsz; 1049 1050 /* Clamp the size to the largest possible with CHS */ 1051 if (sectors > 65535UL*16*255) 1052 sectors = 65535UL*16*255; 1053 1054 if (sectors >= 65536UL*16*63) { 1055 secpt = 255; 1056 heads = 16; 1057 hcyl = sectors / secpt; 1058 } else { 1059 secpt = 17; 1060 hcyl = sectors / secpt; 1061 heads = (hcyl + 1023) / 1024; 1062 1063 if (heads < 4) 1064 heads = 4; 1065 1066 if (hcyl >= (heads * 1024) || heads > 16) { 1067 secpt = 31; 1068 heads = 16; 1069 hcyl = sectors / secpt; 1070 } 1071 if (hcyl >= (heads * 1024)) { 1072 secpt = 63; 1073 heads = 16; 1074 hcyl = sectors / secpt; 1075 } 1076 } 1077 1078 *c = hcyl / heads; 1079 *h = heads; 1080 *s = secpt; 1081 } 1082 1083 /* 1084 * Accessors 1085 */ 1086 off_t 1087 blockif_size(struct blockif_ctxt *bc) 1088 { 1089 1090 assert(bc->bc_magic == BLOCKIF_SIG); 1091 return (bc->bc_size); 1092 } 1093 1094 int 1095 blockif_sectsz(struct blockif_ctxt *bc) 1096 { 1097 1098 assert(bc->bc_magic == BLOCKIF_SIG); 1099 return (bc->bc_sectsz); 1100 } 1101 1102 void 1103 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 1104 { 1105 1106 assert(bc->bc_magic == BLOCKIF_SIG); 1107 *size = bc->bc_psectsz; 1108 *off = bc->bc_psectoff; 1109 } 1110 1111 int 1112 blockif_queuesz(struct blockif_ctxt *bc) 1113 { 1114 1115 assert(bc->bc_magic == BLOCKIF_SIG); 1116 return (BLOCKIF_MAXREQ - 1); 1117 } 1118 1119 int 1120 blockif_is_ro(struct blockif_ctxt *bc) 1121 { 1122 1123 assert(bc->bc_magic == BLOCKIF_SIG); 1124 return (bc->bc_rdonly); 1125 } 1126 1127 int 1128 blockif_candelete(struct blockif_ctxt *bc) 1129 { 1130 1131 assert(bc->bc_magic == BLOCKIF_SIG); 1132 return (bc->bc_candelete); 1133 } 1134 1135 #ifndef __FreeBSD__ 1136 int 1137 blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) 1138 { 1139 int res = 0, flags; 1140 int clean_val = (wc_enable != 0) ? 1 : 0; 1141 1142 (void) pthread_mutex_lock(&bc->bc_mtx); 1143 switch (bc->bc_wce) { 1144 case WCE_IOCTL: 1145 res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); 1146 break; 1147 case WCE_FCNTL: 1148 if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { 1149 if (wc_enable == 0) { 1150 flags |= O_DSYNC; 1151 } else { 1152 flags &= ~O_DSYNC; 1153 } 1154 if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { 1155 res = -1; 1156 } 1157 } else { 1158 res = -1; 1159 } 1160 break; 1161 default: 1162 break; 1163 } 1164 1165 /* 1166 * After a successful disable of the write cache, ensure that any 1167 * lingering data in the cache is synced out. 1168 */ 1169 if (res == 0 && wc_enable == 0) { 1170 res = fsync(bc->bc_fd); 1171 } 1172 (void) pthread_mutex_unlock(&bc->bc_mtx); 1173 1174 return (res); 1175 } 1176 #endif /* __FreeBSD__ */ 1177