1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> 5 * All rights reserved. 6 * Copyright 2020 Joyent, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 /* 31 * Copyright 2020 Joyent, Inc. 32 */ 33 34 35 #include <sys/param.h> 36 #ifndef WITHOUT_CAPSICUM 37 #include <sys/capsicum.h> 38 #endif 39 #include <sys/queue.h> 40 #include <sys/errno.h> 41 #include <sys/stat.h> 42 #include <sys/ioctl.h> 43 #include <sys/disk.h> 44 #ifndef __FreeBSD__ 45 #include <sys/limits.h> 46 #include <sys/uio.h> 47 #include <sys/dkio.h> 48 #endif 49 50 #include <assert.h> 51 #ifndef WITHOUT_CAPSICUM 52 #include <capsicum_helpers.h> 53 #endif 54 #include <err.h> 55 #include <fcntl.h> 56 #include <stdio.h> 57 #include <stdlib.h> 58 #include <string.h> 59 #include <pthread.h> 60 #include <pthread_np.h> 61 #include <signal.h> 62 #include <sysexits.h> 63 #include <unistd.h> 64 65 #include <machine/atomic.h> 66 67 #include "bhyverun.h" 68 #include "config.h" 69 #include "debug.h" 70 #include "mevent.h" 71 #include "pci_emul.h" 72 #include "block_if.h" 73 74 #define BLOCKIF_SIG 0xb109b109 75 76 #ifdef __FreeBSD__ 77 #define BLOCKIF_NUMTHR 8 78 #else 79 /* Enlarge to keep pace with the virtio-block ring size */ 80 #define BLOCKIF_NUMTHR 16 81 #endif 82 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) 83 84 enum blockop { 85 BOP_READ, 86 BOP_WRITE, 87 #ifndef __FreeBSD__ 88 BOP_WRITE_SYNC, 89 #endif 90 BOP_FLUSH, 91 BOP_DELETE 92 }; 93 94 enum blockstat { 95 BST_FREE, 96 BST_BLOCK, 97 BST_PEND, 98 BST_BUSY, 99 BST_DONE 100 }; 101 102 struct blockif_elem { 103 TAILQ_ENTRY(blockif_elem) be_link; 104 struct blockif_req *be_req; 105 enum blockop be_op; 106 enum blockstat be_status; 107 pthread_t be_tid; 108 off_t be_block; 109 }; 110 111 #ifndef __FreeBSD__ 112 enum blockif_wce { 113 WCE_NONE = 0, 114 WCE_IOCTL, 115 WCE_FCNTL 116 }; 117 #endif 118 119 struct blockif_ctxt { 120 unsigned int bc_magic; 121 int bc_fd; 122 int bc_ischr; 123 int bc_isgeom; 124 int bc_candelete; 125 #ifndef __FreeBSD__ 126 enum blockif_wce bc_wce; 127 #endif 128 int bc_rdonly; 129 off_t bc_size; 130 int bc_sectsz; 131 int bc_psectsz; 132 int bc_psectoff; 133 int bc_closing; 134 pthread_t bc_btid[BLOCKIF_NUMTHR]; 135 pthread_mutex_t bc_mtx; 136 pthread_cond_t bc_cond; 137 blockif_resize_cb *bc_resize_cb; 138 void *bc_resize_cb_arg; 139 struct mevent *bc_resize_event; 140 141 /* Request elements and free/pending/busy queues */ 142 TAILQ_HEAD(, blockif_elem) bc_freeq; 143 TAILQ_HEAD(, blockif_elem) bc_pendq; 144 TAILQ_HEAD(, blockif_elem) bc_busyq; 145 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 146 int bc_bootindex; 147 }; 148 149 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 150 151 struct blockif_sig_elem { 152 pthread_mutex_t bse_mtx; 153 pthread_cond_t bse_cond; 154 int bse_pending; 155 struct blockif_sig_elem *bse_next; 156 }; 157 158 static struct blockif_sig_elem *blockif_bse_head; 159 160 static int 161 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, 162 enum blockop op) 163 { 164 struct blockif_elem *be, *tbe; 165 off_t off; 166 int i; 167 168 be = TAILQ_FIRST(&bc->bc_freeq); 169 assert(be != NULL); 170 assert(be->be_status == BST_FREE); 171 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 172 be->be_req = breq; 173 be->be_op = op; 174 switch (op) { 175 case BOP_READ: 176 case BOP_WRITE: 177 #ifndef __FreeBSD__ 178 case BOP_WRITE_SYNC: 179 #endif 180 case BOP_DELETE: 181 off = breq->br_offset; 182 for (i = 0; i < breq->br_iovcnt; i++) 183 off += breq->br_iov[i].iov_len; 184 break; 185 default: 186 off = OFF_MAX; 187 } 188 be->be_block = off; 189 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 190 if (tbe->be_block == breq->br_offset) 191 break; 192 } 193 if (tbe == NULL) { 194 TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { 195 if (tbe->be_block == breq->br_offset) 196 break; 197 } 198 } 199 if (tbe == NULL) 200 be->be_status = BST_PEND; 201 else 202 be->be_status = BST_BLOCK; 203 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 204 return (be->be_status == BST_PEND); 205 } 206 207 static int 208 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) 209 { 210 struct blockif_elem *be; 211 212 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 213 if (be->be_status == BST_PEND) 214 break; 215 assert(be->be_status == BST_BLOCK); 216 } 217 if (be == NULL) 218 return (0); 219 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 220 be->be_status = BST_BUSY; 221 be->be_tid = t; 222 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 223 *bep = be; 224 return (1); 225 } 226 227 static void 228 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 229 { 230 struct blockif_elem *tbe; 231 232 if (be->be_status == BST_DONE || be->be_status == BST_BUSY) 233 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 234 else 235 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 236 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 237 if (tbe->be_req->br_offset == be->be_block) 238 tbe->be_status = BST_PEND; 239 } 240 be->be_tid = 0; 241 be->be_status = BST_FREE; 242 be->be_req = NULL; 243 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 244 } 245 246 static int 247 blockif_flush_bc(struct blockif_ctxt *bc) 248 { 249 #ifdef __FreeBSD__ 250 if (bc->bc_ischr) { 251 if (ioctl(bc->bc_fd, DIOCGFLUSH)) 252 return (errno); 253 } else if (fsync(bc->bc_fd)) 254 return (errno); 255 #else 256 /* 257 * This fsync() should be adequate to flush the cache of a file 258 * or device. In VFS, the VOP_SYNC operation is converted to 259 * the appropriate ioctl in both sdev (for real devices) and 260 * zfs (for zvols). 261 */ 262 if (fsync(bc->bc_fd)) 263 return (errno); 264 #endif 265 266 return (0); 267 } 268 269 static void 270 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) 271 { 272 #ifdef __FreeBSD__ 273 struct spacectl_range range; 274 #endif 275 struct blockif_req *br; 276 #ifdef __FreeBSD__ 277 off_t arg[2]; 278 #endif 279 ssize_t n; 280 size_t clen, len, off, boff, voff; 281 int i, err; 282 283 br = be->be_req; 284 assert(br->br_resid >= 0); 285 286 if (br->br_iovcnt <= 1) 287 buf = NULL; 288 err = 0; 289 switch (be->be_op) { 290 case BOP_READ: 291 if (buf == NULL) { 292 if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 293 br->br_offset)) < 0) 294 err = errno; 295 else 296 br->br_resid -= n; 297 break; 298 } 299 i = 0; 300 off = voff = 0; 301 while (br->br_resid > 0) { 302 len = MIN(br->br_resid, MAXPHYS); 303 n = pread(bc->bc_fd, buf, len, br->br_offset + off); 304 if (n < 0) { 305 err = errno; 306 break; 307 } 308 len = (size_t)n; 309 boff = 0; 310 do { 311 clen = MIN(len - boff, br->br_iov[i].iov_len - 312 voff); 313 memcpy((uint8_t *)br->br_iov[i].iov_base + voff, 314 buf + boff, clen); 315 if (clen < br->br_iov[i].iov_len - voff) 316 voff += clen; 317 else { 318 i++; 319 voff = 0; 320 } 321 boff += clen; 322 } while (boff < len); 323 off += len; 324 br->br_resid -= len; 325 } 326 break; 327 case BOP_WRITE: 328 if (bc->bc_rdonly) { 329 err = EROFS; 330 break; 331 } 332 if (buf == NULL) { 333 if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 334 br->br_offset)) < 0) 335 err = errno; 336 else 337 br->br_resid -= n; 338 break; 339 } 340 i = 0; 341 off = voff = 0; 342 while (br->br_resid > 0) { 343 len = MIN(br->br_resid, MAXPHYS); 344 boff = 0; 345 do { 346 clen = MIN(len - boff, br->br_iov[i].iov_len - 347 voff); 348 memcpy(buf + boff, 349 (uint8_t *)br->br_iov[i].iov_base + voff, 350 clen); 351 if (clen < br->br_iov[i].iov_len - voff) 352 voff += clen; 353 else { 354 i++; 355 voff = 0; 356 } 357 boff += clen; 358 } while (boff < len); 359 360 n = pwrite(bc->bc_fd, buf, len, br->br_offset + off); 361 if (n < 0) { 362 err = errno; 363 break; 364 } 365 off += n; 366 br->br_resid -= n; 367 } 368 break; 369 case BOP_FLUSH: 370 err = blockif_flush_bc(bc); 371 break; 372 case BOP_DELETE: 373 if (!bc->bc_candelete) 374 err = EOPNOTSUPP; 375 else if (bc->bc_rdonly) 376 err = EROFS; 377 #ifdef __FreeBSD__ 378 else if (bc->bc_ischr) { 379 arg[0] = br->br_offset; 380 arg[1] = br->br_resid; 381 if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) 382 err = errno; 383 else 384 br->br_resid = 0; 385 } else { 386 range.r_offset = br->br_offset; 387 range.r_len = br->br_resid; 388 389 while (range.r_len > 0) { 390 if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC, 391 &range, 0, &range) != 0) { 392 err = errno; 393 break; 394 } 395 } 396 if (err == 0) 397 br->br_resid = 0; 398 } 399 #else 400 else if (bc->bc_ischr) { 401 dkioc_free_list_t dfl = { 402 .dfl_num_exts = 1, 403 .dfl_offset = 0, 404 .dfl_flags = 0, 405 .dfl_exts = { 406 { 407 .dfle_start = br->br_offset, 408 .dfle_length = br->br_resid 409 } 410 } 411 }; 412 413 if (ioctl(bc->bc_fd, DKIOCFREE, &dfl)) 414 err = errno; 415 else 416 br->br_resid = 0; 417 } else { 418 struct flock fl = { 419 .l_whence = 0, 420 .l_type = F_WRLCK, 421 .l_start = br->br_offset, 422 .l_len = br->br_resid 423 }; 424 425 if (fcntl(bc->bc_fd, F_FREESP, &fl)) 426 err = errno; 427 else 428 br->br_resid = 0; 429 } 430 #endif 431 break; 432 default: 433 err = EINVAL; 434 break; 435 } 436 437 be->be_status = BST_DONE; 438 439 (*br->br_callback)(br, err); 440 } 441 442 static inline bool 443 blockif_empty(const struct blockif_ctxt *bc) 444 { 445 return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq)); 446 } 447 448 static void * 449 blockif_thr(void *arg) 450 { 451 struct blockif_ctxt *bc; 452 struct blockif_elem *be; 453 pthread_t t; 454 uint8_t *buf; 455 456 bc = arg; 457 if (bc->bc_isgeom) 458 buf = malloc(MAXPHYS); 459 else 460 buf = NULL; 461 t = pthread_self(); 462 463 pthread_mutex_lock(&bc->bc_mtx); 464 for (;;) { 465 while (blockif_dequeue(bc, t, &be)) { 466 pthread_mutex_unlock(&bc->bc_mtx); 467 blockif_proc(bc, be, buf); 468 pthread_mutex_lock(&bc->bc_mtx); 469 blockif_complete(bc, be); 470 } 471 /* Check ctxt status here to see if exit requested */ 472 if (bc->bc_closing) 473 break; 474 475 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 476 } 477 pthread_mutex_unlock(&bc->bc_mtx); 478 479 if (buf) 480 free(buf); 481 pthread_exit(NULL); 482 return (NULL); 483 } 484 485 #ifdef __FreeBSD__ 486 static void 487 blockif_sigcont_handler(int signal __unused, enum ev_type type __unused, 488 void *arg __unused) 489 #else 490 static void 491 blockif_sigcont_handler(int signal __unused) 492 #endif 493 { 494 struct blockif_sig_elem *bse; 495 496 for (;;) { 497 /* 498 * Process the entire list even if not intended for 499 * this thread. 500 */ 501 do { 502 bse = blockif_bse_head; 503 if (bse == NULL) 504 return; 505 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 506 (uintptr_t)bse, 507 (uintptr_t)bse->bse_next)); 508 509 pthread_mutex_lock(&bse->bse_mtx); 510 bse->bse_pending = 0; 511 pthread_cond_signal(&bse->bse_cond); 512 pthread_mutex_unlock(&bse->bse_mtx); 513 } 514 } 515 516 static void 517 blockif_init(void) 518 { 519 #ifdef __FreeBSD__ 520 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 521 (void) signal(SIGCONT, SIG_IGN); 522 #else 523 (void) sigset(SIGCONT, blockif_sigcont_handler); 524 #endif 525 } 526 527 int 528 blockif_legacy_config(nvlist_t *nvl, const char *opts) 529 { 530 char *cp, *path; 531 532 if (opts == NULL) 533 return (0); 534 535 cp = strchr(opts, ','); 536 if (cp == NULL) { 537 set_config_value_node(nvl, "path", opts); 538 return (0); 539 } 540 path = strndup(opts, cp - opts); 541 set_config_value_node(nvl, "path", path); 542 free(path); 543 return (pci_parse_legacy_config(nvl, cp + 1)); 544 } 545 546 int 547 blockif_add_boot_device(struct pci_devinst *const pi, 548 struct blockif_ctxt *const bc) 549 { 550 if (bc->bc_bootindex < 0) 551 return (0); 552 553 return (pci_emul_add_boot_device(pi, bc->bc_bootindex)); 554 } 555 556 struct blockif_ctxt * 557 blockif_open(nvlist_t *nvl, const char *ident) 558 { 559 char tname[MAXCOMLEN + 1]; 560 #ifdef __FreeBSD__ 561 char name[MAXPATHLEN]; 562 #endif 563 const char *path, *pssval, *ssval, *bootindex_val; 564 char *cp; 565 struct blockif_ctxt *bc; 566 struct stat sbuf; 567 #ifdef __FreeBSD__ 568 struct diocgattr_arg arg; 569 #else 570 enum blockif_wce wce = WCE_NONE; 571 #endif 572 off_t size, psectsz, psectoff; 573 int extra, fd, i, sectsz; 574 int ro, candelete, geom, ssopt, pssopt; 575 int nodelete; 576 int bootindex; 577 578 #ifndef WITHOUT_CAPSICUM 579 cap_rights_t rights; 580 cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE }; 581 #endif 582 583 pthread_once(&blockif_once, blockif_init); 584 585 fd = -1; 586 extra = 0; 587 ssopt = 0; 588 #ifndef __FreeBSD__ 589 pssopt = 0; 590 #endif 591 ro = 0; 592 nodelete = 0; 593 bootindex = -1; 594 595 if (get_config_bool_node_default(nvl, "nocache", false)) 596 extra |= O_DIRECT; 597 if (get_config_bool_node_default(nvl, "nodelete", false)) 598 nodelete = 1; 599 if (get_config_bool_node_default(nvl, "sync", false) || 600 get_config_bool_node_default(nvl, "direct", false)) 601 extra |= O_SYNC; 602 if (get_config_bool_node_default(nvl, "ro", false)) 603 ro = 1; 604 ssval = get_config_value_node(nvl, "sectorsize"); 605 if (ssval != NULL) { 606 ssopt = strtol(ssval, &cp, 10); 607 if (cp == ssval) { 608 EPRINTLN("Invalid sector size \"%s\"", ssval); 609 goto err; 610 } 611 if (*cp == '\0') { 612 pssopt = ssopt; 613 } else if (*cp == '/') { 614 pssval = cp + 1; 615 pssopt = strtol(pssval, &cp, 10); 616 if (cp == pssval || *cp != '\0') { 617 EPRINTLN("Invalid sector size \"%s\"", ssval); 618 goto err; 619 } 620 } else { 621 EPRINTLN("Invalid sector size \"%s\"", ssval); 622 goto err; 623 } 624 } 625 626 bootindex_val = get_config_value_node(nvl, "bootindex"); 627 if (bootindex_val != NULL) { 628 bootindex = atoi(bootindex_val); 629 } 630 631 path = get_config_value_node(nvl, "path"); 632 if (path == NULL) { 633 EPRINTLN("Missing \"path\" for block device."); 634 goto err; 635 } 636 637 fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra); 638 if (fd < 0 && !ro) { 639 /* Attempt a r/w fail with a r/o open */ 640 fd = open(path, O_RDONLY | extra); 641 ro = 1; 642 } 643 644 if (fd < 0) { 645 warn("Could not open backing file: %s", path); 646 goto err; 647 } 648 649 if (fstat(fd, &sbuf) < 0) { 650 warn("Could not stat backing file %s", path); 651 goto err; 652 } 653 654 #ifndef WITHOUT_CAPSICUM 655 cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, 656 CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF); 657 if (ro) 658 cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); 659 660 if (caph_rights_limit(fd, &rights) == -1) 661 errx(EX_OSERR, "Unable to apply rights for sandbox"); 662 #endif 663 664 /* 665 * Deal with raw devices 666 */ 667 size = sbuf.st_size; 668 sectsz = DEV_BSIZE; 669 psectsz = psectoff = 0; 670 candelete = geom = 0; 671 #ifdef __FreeBSD__ 672 if (S_ISCHR(sbuf.st_mode)) { 673 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 674 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 675 perror("Could not fetch dev blk/sector size"); 676 goto err; 677 } 678 assert(size != 0); 679 assert(sectsz != 0); 680 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 681 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 682 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); 683 arg.len = sizeof(arg.value.i); 684 if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) 685 candelete = arg.value.i; 686 if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) 687 geom = 1; 688 } else { 689 psectsz = sbuf.st_blksize; 690 /* Avoid fallback implementation */ 691 candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1; 692 } 693 #else 694 psectsz = sbuf.st_blksize; 695 if (S_ISCHR(sbuf.st_mode)) { 696 struct dk_minfo_ext dkmext; 697 int wce_val; 698 699 /* Look for a more accurate physical block/media size */ 700 if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { 701 psectsz = dkmext.dki_pbsize; 702 size = dkmext.dki_lbsize * dkmext.dki_capacity; 703 } 704 /* See if a configurable write cache is present and working */ 705 if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { 706 /* 707 * If WCE is already active, disable it until the 708 * specific device driver calls for its return. If it 709 * is not active, toggle it on and off to verify that 710 * such actions are possible. 711 */ 712 if (wce_val != 0) { 713 wce_val = 0; 714 /* 715 * Inability to disable the cache is a threat 716 * to data durability. 717 */ 718 assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); 719 wce = WCE_IOCTL; 720 } else { 721 int r1, r2; 722 723 wce_val = 1; 724 r1 = ioctl(fd, DKIOCSETWCE, &wce_val); 725 wce_val = 0; 726 r2 = ioctl(fd, DKIOCSETWCE, &wce_val); 727 728 if (r1 == 0 && r2 == 0) { 729 wce = WCE_IOCTL; 730 } else { 731 /* 732 * If the cache cache toggle was not 733 * successful, ensure that the cache 734 * was not left enabled. 735 */ 736 assert(r1 != 0); 737 } 738 } 739 } 740 741 if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete)) 742 candelete = 0; 743 744 } else { 745 int flags; 746 747 if ((flags = fcntl(fd, F_GETFL)) >= 0) { 748 flags |= O_DSYNC; 749 if (fcntl(fd, F_SETFL, flags) != -1) { 750 wce = WCE_FCNTL; 751 } 752 } 753 754 /* 755 * We don't have a way to discover if a file supports the 756 * FREESP fcntl cmd (other than trying it). However, 757 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd. 758 * Nfsv4 and nfsv4 also forward the FREESP request 759 * to the server, so we always enable it for file based 760 * volumes. Anyone trying to run volumes on an unsupported 761 * configuration is on their own, and should be prepared 762 * for the requests to fail. 763 */ 764 if (nodelete == 0) 765 candelete = 1; 766 } 767 #endif 768 769 #ifndef WITHOUT_CAPSICUM 770 if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) 771 errx(EX_OSERR, "Unable to apply rights for sandbox"); 772 #endif 773 774 if (ssopt != 0) { 775 if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || 776 ssopt > pssopt) { 777 EPRINTLN("Invalid sector size %d/%d", 778 ssopt, pssopt); 779 goto err; 780 } 781 782 /* 783 * Some backend drivers (e.g. cd0, ada0) require that the I/O 784 * size be a multiple of the device's sector size. 785 * 786 * Validate that the emulated sector size complies with this 787 * requirement. 788 */ 789 if (S_ISCHR(sbuf.st_mode)) { 790 if (ssopt < sectsz || (ssopt % sectsz) != 0) { 791 EPRINTLN("Sector size %d incompatible " 792 "with underlying device sector size %d", 793 ssopt, sectsz); 794 goto err; 795 } 796 } 797 798 sectsz = ssopt; 799 psectsz = pssopt; 800 psectoff = 0; 801 } 802 803 bc = calloc(1, sizeof(struct blockif_ctxt)); 804 if (bc == NULL) { 805 perror("calloc"); 806 goto err; 807 } 808 809 bc->bc_magic = BLOCKIF_SIG; 810 bc->bc_fd = fd; 811 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 812 bc->bc_isgeom = geom; 813 bc->bc_candelete = candelete; 814 #ifndef __FreeBSD__ 815 bc->bc_wce = wce; 816 #endif 817 bc->bc_rdonly = ro; 818 bc->bc_size = size; 819 bc->bc_sectsz = sectsz; 820 bc->bc_psectsz = psectsz; 821 bc->bc_psectoff = psectoff; 822 pthread_mutex_init(&bc->bc_mtx, NULL); 823 pthread_cond_init(&bc->bc_cond, NULL); 824 TAILQ_INIT(&bc->bc_freeq); 825 TAILQ_INIT(&bc->bc_pendq); 826 TAILQ_INIT(&bc->bc_busyq); 827 bc->bc_bootindex = bootindex; 828 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 829 bc->bc_reqs[i].be_status = BST_FREE; 830 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 831 } 832 833 for (i = 0; i < BLOCKIF_NUMTHR; i++) { 834 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); 835 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); 836 pthread_set_name_np(bc->bc_btid[i], tname); 837 } 838 839 return (bc); 840 err: 841 if (fd >= 0) 842 close(fd); 843 return (NULL); 844 } 845 846 static void 847 blockif_resized(int fd, enum ev_type type __unused, void *arg) 848 { 849 struct blockif_ctxt *bc; 850 struct stat sb; 851 off_t mediasize; 852 853 if (fstat(fd, &sb) != 0) 854 return; 855 856 #ifdef __FreeBSD__ 857 if (S_ISCHR(sb.st_mode)) { 858 if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) { 859 EPRINTLN("blockif_resized: get mediasize failed: %s", 860 strerror(errno)); 861 return; 862 } 863 } else 864 mediasize = sb.st_size; 865 #else 866 mediasize = sb.st_size; 867 if (S_ISCHR(sb.st_mode)) { 868 struct dk_minfo dkm; 869 870 if (ioctl(fd, DKIOCGMEDIAINFO, &dkm) == 0) 871 mediasize = dkm.dki_lbsize * dkm.dki_capacity; 872 } 873 #endif 874 875 bc = arg; 876 pthread_mutex_lock(&bc->bc_mtx); 877 if (mediasize != bc->bc_size) { 878 bc->bc_size = mediasize; 879 bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size); 880 } 881 pthread_mutex_unlock(&bc->bc_mtx); 882 } 883 884 int 885 blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb, 886 void *cb_arg) 887 { 888 struct stat sb; 889 int err; 890 891 if (cb == NULL) 892 return (EINVAL); 893 894 err = 0; 895 896 pthread_mutex_lock(&bc->bc_mtx); 897 if (bc->bc_resize_cb != NULL) { 898 err = EBUSY; 899 goto out; 900 } 901 902 assert(bc->bc_closing == 0); 903 904 if (fstat(bc->bc_fd, &sb) != 0) { 905 err = errno; 906 goto out; 907 } 908 909 bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE, 910 EVFF_ATTRIB, blockif_resized, bc); 911 if (bc->bc_resize_event == NULL) { 912 err = ENXIO; 913 goto out; 914 } 915 916 bc->bc_resize_cb = cb; 917 bc->bc_resize_cb_arg = cb_arg; 918 out: 919 pthread_mutex_unlock(&bc->bc_mtx); 920 921 return (err); 922 } 923 924 static int 925 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 926 enum blockop op) 927 { 928 int err; 929 930 err = 0; 931 932 pthread_mutex_lock(&bc->bc_mtx); 933 if (!TAILQ_EMPTY(&bc->bc_freeq)) { 934 /* 935 * Enqueue and inform the block i/o thread 936 * that there is work available 937 */ 938 if (blockif_enqueue(bc, breq, op)) 939 pthread_cond_signal(&bc->bc_cond); 940 } else { 941 /* 942 * Callers are not allowed to enqueue more than 943 * the specified blockif queue limit. Return an 944 * error to indicate that the queue length has been 945 * exceeded. 946 */ 947 err = E2BIG; 948 } 949 pthread_mutex_unlock(&bc->bc_mtx); 950 951 return (err); 952 } 953 954 int 955 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 956 { 957 assert(bc->bc_magic == BLOCKIF_SIG); 958 return (blockif_request(bc, breq, BOP_READ)); 959 } 960 961 int 962 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 963 { 964 assert(bc->bc_magic == BLOCKIF_SIG); 965 return (blockif_request(bc, breq, BOP_WRITE)); 966 } 967 968 int 969 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 970 { 971 assert(bc->bc_magic == BLOCKIF_SIG); 972 return (blockif_request(bc, breq, BOP_FLUSH)); 973 } 974 975 int 976 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) 977 { 978 assert(bc->bc_magic == BLOCKIF_SIG); 979 return (blockif_request(bc, breq, BOP_DELETE)); 980 } 981 982 int 983 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 984 { 985 struct blockif_elem *be; 986 987 assert(bc->bc_magic == BLOCKIF_SIG); 988 989 pthread_mutex_lock(&bc->bc_mtx); 990 /* 991 * Check pending requests. 992 */ 993 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 994 if (be->be_req == breq) 995 break; 996 } 997 if (be != NULL) { 998 /* 999 * Found it. 1000 */ 1001 blockif_complete(bc, be); 1002 pthread_mutex_unlock(&bc->bc_mtx); 1003 1004 return (0); 1005 } 1006 1007 /* 1008 * Check in-flight requests. 1009 */ 1010 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 1011 if (be->be_req == breq) 1012 break; 1013 } 1014 if (be == NULL) { 1015 /* 1016 * Didn't find it. 1017 */ 1018 pthread_mutex_unlock(&bc->bc_mtx); 1019 return (EINVAL); 1020 } 1021 1022 /* 1023 * Interrupt the processing thread to force it return 1024 * prematurely via it's normal callback path. 1025 */ 1026 while (be->be_status == BST_BUSY) { 1027 struct blockif_sig_elem bse, *old_head; 1028 1029 pthread_mutex_init(&bse.bse_mtx, NULL); 1030 pthread_cond_init(&bse.bse_cond, NULL); 1031 1032 bse.bse_pending = 1; 1033 1034 do { 1035 old_head = blockif_bse_head; 1036 bse.bse_next = old_head; 1037 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 1038 (uintptr_t)old_head, 1039 (uintptr_t)&bse)); 1040 1041 pthread_kill(be->be_tid, SIGCONT); 1042 1043 pthread_mutex_lock(&bse.bse_mtx); 1044 while (bse.bse_pending) 1045 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 1046 pthread_mutex_unlock(&bse.bse_mtx); 1047 } 1048 1049 pthread_mutex_unlock(&bc->bc_mtx); 1050 1051 /* 1052 * The processing thread has been interrupted. Since it's not 1053 * clear if the callback has been invoked yet, return EBUSY. 1054 */ 1055 return (EBUSY); 1056 } 1057 1058 int 1059 blockif_close(struct blockif_ctxt *bc) 1060 { 1061 void *jval; 1062 int i; 1063 1064 assert(bc->bc_magic == BLOCKIF_SIG); 1065 1066 /* 1067 * Stop the block i/o thread 1068 */ 1069 pthread_mutex_lock(&bc->bc_mtx); 1070 bc->bc_closing = 1; 1071 if (bc->bc_resize_event != NULL) 1072 mevent_disable(bc->bc_resize_event); 1073 pthread_mutex_unlock(&bc->bc_mtx); 1074 pthread_cond_broadcast(&bc->bc_cond); 1075 for (i = 0; i < BLOCKIF_NUMTHR; i++) 1076 pthread_join(bc->bc_btid[i], &jval); 1077 1078 /* XXX Cancel queued i/o's ??? */ 1079 1080 /* 1081 * Release resources 1082 */ 1083 bc->bc_magic = 0; 1084 close(bc->bc_fd); 1085 free(bc); 1086 1087 return (0); 1088 } 1089 1090 /* 1091 * Return virtual C/H/S values for a given block. Use the algorithm 1092 * outlined in the VHD specification to calculate values. 1093 */ 1094 void 1095 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 1096 { 1097 off_t sectors; /* total sectors of the block dev */ 1098 off_t hcyl; /* cylinders times heads */ 1099 uint16_t secpt; /* sectors per track */ 1100 uint8_t heads; 1101 1102 assert(bc->bc_magic == BLOCKIF_SIG); 1103 1104 sectors = bc->bc_size / bc->bc_sectsz; 1105 1106 /* Clamp the size to the largest possible with CHS */ 1107 if (sectors > 65535L * 16 * 255) 1108 sectors = 65535L * 16 * 255; 1109 1110 if (sectors >= 65536L * 16 * 63) { 1111 secpt = 255; 1112 heads = 16; 1113 hcyl = sectors / secpt; 1114 } else { 1115 secpt = 17; 1116 hcyl = sectors / secpt; 1117 heads = (hcyl + 1023) / 1024; 1118 1119 if (heads < 4) 1120 heads = 4; 1121 1122 if (hcyl >= (heads * 1024) || heads > 16) { 1123 secpt = 31; 1124 heads = 16; 1125 hcyl = sectors / secpt; 1126 } 1127 if (hcyl >= (heads * 1024)) { 1128 secpt = 63; 1129 heads = 16; 1130 hcyl = sectors / secpt; 1131 } 1132 } 1133 1134 *c = hcyl / heads; 1135 *h = heads; 1136 *s = secpt; 1137 } 1138 1139 /* 1140 * Accessors 1141 */ 1142 off_t 1143 blockif_size(struct blockif_ctxt *bc) 1144 { 1145 assert(bc->bc_magic == BLOCKIF_SIG); 1146 return (bc->bc_size); 1147 } 1148 1149 int 1150 blockif_sectsz(struct blockif_ctxt *bc) 1151 { 1152 assert(bc->bc_magic == BLOCKIF_SIG); 1153 return (bc->bc_sectsz); 1154 } 1155 1156 void 1157 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 1158 { 1159 assert(bc->bc_magic == BLOCKIF_SIG); 1160 *size = bc->bc_psectsz; 1161 *off = bc->bc_psectoff; 1162 } 1163 1164 int 1165 blockif_queuesz(struct blockif_ctxt *bc) 1166 { 1167 assert(bc->bc_magic == BLOCKIF_SIG); 1168 return (BLOCKIF_MAXREQ - 1); 1169 } 1170 1171 int 1172 blockif_is_ro(struct blockif_ctxt *bc) 1173 { 1174 assert(bc->bc_magic == BLOCKIF_SIG); 1175 return (bc->bc_rdonly); 1176 } 1177 1178 int 1179 blockif_candelete(struct blockif_ctxt *bc) 1180 { 1181 assert(bc->bc_magic == BLOCKIF_SIG); 1182 return (bc->bc_candelete); 1183 } 1184 1185 #ifndef __FreeBSD__ 1186 int 1187 blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) 1188 { 1189 int res = 0, flags; 1190 int clean_val = (wc_enable != 0) ? 1 : 0; 1191 1192 (void) pthread_mutex_lock(&bc->bc_mtx); 1193 switch (bc->bc_wce) { 1194 case WCE_IOCTL: 1195 res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); 1196 break; 1197 case WCE_FCNTL: 1198 if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { 1199 if (wc_enable == 0) { 1200 flags |= O_DSYNC; 1201 } else { 1202 flags &= ~O_DSYNC; 1203 } 1204 if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { 1205 res = -1; 1206 } 1207 } else { 1208 res = -1; 1209 } 1210 break; 1211 default: 1212 break; 1213 } 1214 1215 /* 1216 * After a successful disable of the write cache, ensure that any 1217 * lingering data in the cache is synced out. 1218 */ 1219 if (res == 0 && wc_enable == 0) { 1220 res = fsync(bc->bc_fd); 1221 } 1222 (void) pthread_mutex_unlock(&bc->bc_mtx); 1223 1224 return (res); 1225 } 1226 #endif /* __FreeBSD__ */ 1227