/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Copyright 2020 Joyent, Inc.
 */

#include <sys/cdefs.h>

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#ifndef __FreeBSD__
#include <sys/limits.h>
#include <sys/uio.h>
#include <sys/dkio.h>
#endif

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

/* Magic value kept in bc_magic to detect use of a stale or freed context. */
#define BLOCKIF_SIG	0xb109b109

#ifdef __FreeBSD__
#define BLOCKIF_NUMTHR	8
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR	16
#endif
/* Enough request elements that every worker can be busy with a full ring. */
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

/* Kind of I/O operation carried by a queued request element. */
enum blockop {
	BOP_READ,
	BOP_WRITE,
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,
#endif
	BOP_FLUSH,
	BOP_DELETE
};

/* Life-cycle states of a request element. */
enum blockstat {
	BST_FREE,	/* on the free queue, available for a new request */
	BST_PEND,	/* on the pending queue, eligible for dispatch */
	BST_BLOCK,	/* on the pending queue, blocked behind another req */
	BST_BUSY,	/* on the busy queue, being processed by a worker */
	BST_DONE	/* processing complete */
};

/* One queued I/O request, binding the caller's blockif_req to a queue. */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t	     be_tid;	/* worker processing this element */
	off_t		     be_block;	/* end offset of the transfer */
};

#ifndef __FreeBSD__
/* How the backing store's write cache is controlled (illumos only). */
enum blockif_wce {
	WCE_NONE = 0,	/* no working write-cache control discovered */
	WCE_IOCTL,	/* DKIOCSETWCE ioctl (character devices) */
	WCE_FCNTL	/* O_DSYNC toggle via fcntl (plain files) */
};
#endif

struct blockif_ctxt {
	unsigned int		bc_magic;	/* BLOCKIF_SIG while valid */
	int			bc_fd;		/* backing file/device fd */
	int			bc_ischr;	/* backing is a char device */
	int			bc_isgeom;	/* backing is a GEOM provider */
	int			bc_candelete;	/* TRIM/DISCARD supported */
#ifndef __FreeBSD__
	enum blockif_wce	bc_wce;		/* write-cache control mode */
#endif
	int			bc_rdonly;
	off_t			bc_size;	/* media size in bytes */
	int			bc_sectsz;	/* logical sector size */
	int			bc_psectsz;	/* physical sector size */
	int			bc_psectoff;	/* physical sector offset */
	int
 bc_closing;	/* set under bc_mtx to ask workers to exit */
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
	int			bc_bootindex;
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Per-cancellation wait element, pushed onto a lock-free global list and
 * drained by the SIGCONT handler (see blockif_cancel).
 */
struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

/*
 * Pull a free element, attach the caller's request to it and place it on
 * the pending queue.  Returns non-zero when the element is immediately
 * runnable (BST_PEND) and a worker thread should be woken.
 * Caller must hold bc_mtx and must have checked bc_freeq is non-empty.
 */
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
#ifndef __FreeBSD__
	case BOP_WRITE_SYNC:
#endif
	case BOP_DELETE:
		/* be_block is the end offset of the transfer. */
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	/*
	 * Block behind any queued or in-flight request whose transfer ends
	 * exactly where this one starts, preserving ordering of adjacent
	 * sequential operations.
	 */
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

/*
 * Take the first dispatchable pending element, mark it busy for thread 't'
 * and move it to the busy queue.  Returns 0 when nothing is runnable.
 * Caller must hold bc_mtx.
 */
static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

/*
 * Return a completed (or cancelled pending) element to the free queue,
 * unblocking any pending requests that were ordered behind it.
 * Caller must hold bc_mtx.
 */
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

/* Flush the backing store's cache.  Returns 0 or an errno value. */
static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
#ifdef __FreeBSD__
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);
#else
	/*
	 * This fsync() should be adequate to flush the cache of a file
	 * or device. In VFS, the VOP_SYNC operation is converted to
	 * the appropriate ioctl in both sdev (for real devices) and
	 * zfs (for zvols).
 */
	if (fsync(bc->bc_fd))
		return (errno);
#endif

	return (0);
}

/*
 * Execute one request on the calling worker thread and invoke its
 * completion callback.  'buf' is an optional MAXPHYS-byte bounce buffer
 * used to coalesce multi-iov transfers on GEOM backends; it is ignored
 * for single-iov requests.
 */
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
#ifdef __FreeBSD__
	struct spacectl_range range;
#endif
	struct blockif_req *br;
#ifdef __FreeBSD__
	off_t arg[2];
#endif
	ssize_t n;
	size_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	assert(br->br_resid >= 0);

	/* A single iov needs no bounce buffer. */
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		/*
		 * Bounce path: read MAXPHYS-sized chunks into 'buf' and
		 * scatter them across the request's iovecs.  'i'/'voff'
		 * track the current iovec and offset within it.
		 */
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			len = (size_t)n;
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		/* Bounce path: gather the iovecs into 'buf', then write. */
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    (uint8_t *)br->br_iov[i].iov_base + voff,
				    clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);

			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			off += n;
			br->br_resid -= n;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
#ifdef __FreeBSD__
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
#else
		else if (bc->bc_ischr) {
			dkioc_free_list_t dfl = {
				.dfl_num_exts = 1,
				.dfl_offset = 0,
				.dfl_flags = 0,
				.dfl_exts = {
					{
						.dfle_start = br->br_offset,
						.dfle_length = br->br_resid
					}
				}
			};

			if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			struct flock fl = {
				.l_whence = 0,
				.l_type = F_WRLCK,
				.l_start = br->br_offset,
				.l_len = br->br_resid
			};

			if (fcntl(bc->bc_fd, F_FREESP, &fl))
				err = errno;
			else
				br->br_resid = 0;
		}
#endif
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

/* Worker thread body: dispatch queued requests until asked to exit. */
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	/* GEOM backends get a bounce buffer for multi-iov transfers. */
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

pthread_mutex_lock(&bc->bc_mtx); 465 for (;;) { 466 while (blockif_dequeue(bc, t, &be)) { 467 pthread_mutex_unlock(&bc->bc_mtx); 468 blockif_proc(bc, be, buf); 469 pthread_mutex_lock(&bc->bc_mtx); 470 blockif_complete(bc, be); 471 } 472 /* Check ctxt status here to see if exit requested */ 473 if (bc->bc_closing) 474 break; 475 476 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 477 } 478 pthread_mutex_unlock(&bc->bc_mtx); 479 480 if (buf) 481 free(buf); 482 pthread_exit(NULL); 483 return (NULL); 484 } 485 486 #ifdef __FreeBSD__ 487 static void 488 blockif_sigcont_handler(int signal __unused, enum ev_type type __unused, 489 void *arg __unused) 490 #else 491 static void 492 blockif_sigcont_handler(int signal __unused) 493 #endif 494 { 495 struct blockif_sig_elem *bse; 496 497 for (;;) { 498 /* 499 * Process the entire list even if not intended for 500 * this thread. 501 */ 502 do { 503 bse = blockif_bse_head; 504 if (bse == NULL) 505 return; 506 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 507 (uintptr_t)bse, 508 (uintptr_t)bse->bse_next)); 509 510 pthread_mutex_lock(&bse->bse_mtx); 511 bse->bse_pending = 0; 512 pthread_cond_signal(&bse->bse_cond); 513 pthread_mutex_unlock(&bse->bse_mtx); 514 } 515 } 516 517 static void 518 blockif_init(void) 519 { 520 #ifdef __FreeBSD__ 521 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 522 (void) signal(SIGCONT, SIG_IGN); 523 #else 524 (void) sigset(SIGCONT, blockif_sigcont_handler); 525 #endif 526 } 527 528 int 529 blockif_legacy_config(nvlist_t *nvl, const char *opts) 530 { 531 char *cp, *path; 532 533 if (opts == NULL) 534 return (0); 535 536 cp = strchr(opts, ','); 537 if (cp == NULL) { 538 set_config_value_node(nvl, "path", opts); 539 return (0); 540 } 541 path = strndup(opts, cp - opts); 542 set_config_value_node(nvl, "path", path); 543 free(path); 544 return (pci_parse_legacy_config(nvl, cp + 1)); 545 } 546 547 int 548 blockif_add_boot_device(struct pci_devinst *const pi, 549 struct 
blockif_ctxt *const bc) 550 { 551 if (bc->bc_bootindex < 0) 552 return (0); 553 554 return (pci_emul_add_boot_device(pi, bc->bc_bootindex)); 555 } 556 557 struct blockif_ctxt * 558 blockif_open(nvlist_t *nvl, const char *ident) 559 { 560 char tname[MAXCOMLEN + 1]; 561 #ifdef __FreeBSD__ 562 char name[MAXPATHLEN]; 563 #endif 564 const char *path, *pssval, *ssval, *bootindex_val; 565 char *cp; 566 struct blockif_ctxt *bc; 567 struct stat sbuf; 568 #ifdef __FreeBSD__ 569 struct diocgattr_arg arg; 570 #else 571 enum blockif_wce wce = WCE_NONE; 572 #endif 573 off_t size, psectsz, psectoff; 574 int extra, fd, i, sectsz; 575 int ro, candelete, geom, ssopt, pssopt; 576 int nodelete; 577 int bootindex; 578 579 #ifndef WITHOUT_CAPSICUM 580 cap_rights_t rights; 581 cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE }; 582 #endif 583 584 pthread_once(&blockif_once, blockif_init); 585 586 fd = -1; 587 extra = 0; 588 ssopt = 0; 589 #ifndef __FreeBSD__ 590 pssopt = 0; 591 #endif 592 ro = 0; 593 nodelete = 0; 594 bootindex = -1; 595 596 if (get_config_bool_node_default(nvl, "nocache", false)) 597 extra |= O_DIRECT; 598 if (get_config_bool_node_default(nvl, "nodelete", false)) 599 nodelete = 1; 600 if (get_config_bool_node_default(nvl, "sync", false) || 601 get_config_bool_node_default(nvl, "direct", false)) 602 extra |= O_SYNC; 603 if (get_config_bool_node_default(nvl, "ro", false)) 604 ro = 1; 605 ssval = get_config_value_node(nvl, "sectorsize"); 606 if (ssval != NULL) { 607 ssopt = strtol(ssval, &cp, 10); 608 if (cp == ssval) { 609 EPRINTLN("Invalid sector size \"%s\"", ssval); 610 goto err; 611 } 612 if (*cp == '\0') { 613 pssopt = ssopt; 614 } else if (*cp == '/') { 615 pssval = cp + 1; 616 pssopt = strtol(pssval, &cp, 10); 617 if (cp == pssval || *cp != '\0') { 618 EPRINTLN("Invalid sector size \"%s\"", ssval); 619 goto err; 620 } 621 } else { 622 EPRINTLN("Invalid sector size \"%s\"", ssval); 623 goto err; 624 } 625 } 626 627 bootindex_val = 
get_config_value_node(nvl, "bootindex"); 628 if (bootindex_val != NULL) { 629 bootindex = atoi(bootindex_val); 630 } 631 632 path = get_config_value_node(nvl, "path"); 633 if (path == NULL) { 634 EPRINTLN("Missing \"path\" for block device."); 635 goto err; 636 } 637 638 fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra); 639 if (fd < 0 && !ro) { 640 /* Attempt a r/w fail with a r/o open */ 641 fd = open(path, O_RDONLY | extra); 642 ro = 1; 643 } 644 645 if (fd < 0) { 646 warn("Could not open backing file: %s", path); 647 goto err; 648 } 649 650 if (fstat(fd, &sbuf) < 0) { 651 warn("Could not stat backing file %s", path); 652 goto err; 653 } 654 655 #ifndef WITHOUT_CAPSICUM 656 cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, 657 CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF); 658 if (ro) 659 cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); 660 661 if (caph_rights_limit(fd, &rights) == -1) 662 errx(EX_OSERR, "Unable to apply rights for sandbox"); 663 #endif 664 665 /* 666 * Deal with raw devices 667 */ 668 size = sbuf.st_size; 669 sectsz = DEV_BSIZE; 670 psectsz = psectoff = 0; 671 candelete = geom = 0; 672 #ifdef __FreeBSD__ 673 if (S_ISCHR(sbuf.st_mode)) { 674 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 675 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 676 perror("Could not fetch dev blk/sector size"); 677 goto err; 678 } 679 assert(size != 0); 680 assert(sectsz != 0); 681 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 682 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 683 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); 684 arg.len = sizeof(arg.value.i); 685 if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) 686 candelete = arg.value.i; 687 if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) 688 geom = 1; 689 } else { 690 psectsz = sbuf.st_blksize; 691 /* Avoid fallback implementation */ 692 candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1; 693 } 694 #else 695 psectsz = sbuf.st_blksize; 696 if (S_ISCHR(sbuf.st_mode)) { 697 struct 
dk_minfo_ext dkmext; 698 int wce_val; 699 700 /* Look for a more accurate physical block/media size */ 701 if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { 702 psectsz = dkmext.dki_pbsize; 703 size = dkmext.dki_lbsize * dkmext.dki_capacity; 704 } 705 /* See if a configurable write cache is present and working */ 706 if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { 707 /* 708 * If WCE is already active, disable it until the 709 * specific device driver calls for its return. If it 710 * is not active, toggle it on and off to verify that 711 * such actions are possible. 712 */ 713 if (wce_val != 0) { 714 wce_val = 0; 715 /* 716 * Inability to disable the cache is a threat 717 * to data durability. 718 */ 719 assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); 720 wce = WCE_IOCTL; 721 } else { 722 int r1, r2; 723 724 wce_val = 1; 725 r1 = ioctl(fd, DKIOCSETWCE, &wce_val); 726 wce_val = 0; 727 r2 = ioctl(fd, DKIOCSETWCE, &wce_val); 728 729 if (r1 == 0 && r2 == 0) { 730 wce = WCE_IOCTL; 731 } else { 732 /* 733 * If the cache cache toggle was not 734 * successful, ensure that the cache 735 * was not left enabled. 736 */ 737 assert(r1 != 0); 738 } 739 } 740 } 741 742 if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete)) 743 candelete = 0; 744 745 } else { 746 int flags; 747 748 if ((flags = fcntl(fd, F_GETFL)) >= 0) { 749 flags |= O_DSYNC; 750 if (fcntl(fd, F_SETFL, flags) != -1) { 751 wce = WCE_FCNTL; 752 } 753 } 754 755 /* 756 * We don't have a way to discover if a file supports the 757 * FREESP fcntl cmd (other than trying it). However, 758 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd. 759 * Nfsv4 and nfsv4 also forward the FREESP request 760 * to the server, so we always enable it for file based 761 * volumes. Anyone trying to run volumes on an unsupported 762 * configuration is on their own, and should be prepared 763 * for the requests to fail. 
764 */ 765 if (nodelete == 0) 766 candelete = 1; 767 } 768 #endif 769 770 #ifndef WITHOUT_CAPSICUM 771 if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) 772 errx(EX_OSERR, "Unable to apply rights for sandbox"); 773 #endif 774 775 if (ssopt != 0) { 776 if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || 777 ssopt > pssopt) { 778 EPRINTLN("Invalid sector size %d/%d", 779 ssopt, pssopt); 780 goto err; 781 } 782 783 /* 784 * Some backend drivers (e.g. cd0, ada0) require that the I/O 785 * size be a multiple of the device's sector size. 786 * 787 * Validate that the emulated sector size complies with this 788 * requirement. 789 */ 790 if (S_ISCHR(sbuf.st_mode)) { 791 if (ssopt < sectsz || (ssopt % sectsz) != 0) { 792 EPRINTLN("Sector size %d incompatible " 793 "with underlying device sector size %d", 794 ssopt, sectsz); 795 goto err; 796 } 797 } 798 799 sectsz = ssopt; 800 psectsz = pssopt; 801 psectoff = 0; 802 } 803 804 bc = calloc(1, sizeof(struct blockif_ctxt)); 805 if (bc == NULL) { 806 perror("calloc"); 807 goto err; 808 } 809 810 bc->bc_magic = BLOCKIF_SIG; 811 bc->bc_fd = fd; 812 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 813 bc->bc_isgeom = geom; 814 bc->bc_candelete = candelete; 815 #ifndef __FreeBSD__ 816 bc->bc_wce = wce; 817 #endif 818 bc->bc_rdonly = ro; 819 bc->bc_size = size; 820 bc->bc_sectsz = sectsz; 821 bc->bc_psectsz = psectsz; 822 bc->bc_psectoff = psectoff; 823 pthread_mutex_init(&bc->bc_mtx, NULL); 824 pthread_cond_init(&bc->bc_cond, NULL); 825 TAILQ_INIT(&bc->bc_freeq); 826 TAILQ_INIT(&bc->bc_pendq); 827 TAILQ_INIT(&bc->bc_busyq); 828 bc->bc_bootindex = bootindex; 829 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 830 bc->bc_reqs[i].be_status = BST_FREE; 831 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 832 } 833 834 for (i = 0; i < BLOCKIF_NUMTHR; i++) { 835 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); 836 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); 837 pthread_set_name_np(bc->bc_btid[i], tname); 838 } 839 840 
	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}

/* mevent callback fired when the backing file/device changes size. */
static void
blockif_resized(int fd, enum ev_type type __unused, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

#ifdef __FreeBSD__
	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;
#else
	mediasize = sb.st_size;
	if (S_ISCHR(sb.st_mode)) {
		struct dk_minfo dkm;

		if (ioctl(fd, DKIOCGMEDIAINFO, &dkm) == 0)
			mediasize = dkm.dki_lbsize * dkm.dki_capacity;
	}
#endif

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}

/*
 * Arrange for 'cb' to be invoked when the backing store is resized.
 * Only one callback may be registered per context; returns an errno
 * value on failure.
 */
int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

/* Common entry point for the blockif_read/write/flush/delete wrappers. */
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

/*
 * Attempt to cancel a queued or in-flight request.  Returns 0 if the
 * request was removed before processing began, EINVAL if it is unknown,
 * and EBUSY if a worker had already picked it up (in which case the
 * worker is interrupted with SIGCONT and the callback may still run).
 */
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it return
	 * prematurely via it's normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		/* Lock-free push onto the global waiter list. */
		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		/* Wait for the signal handler to acknowledge this entry. */
		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted. Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535L * 16 * 255)
		sectors = 65535L * 16 * 255;

	if (sectors >= 65536L * 16 * 63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifndef __FreeBSD__
/*
 * Enable or disable the backing store's write cache using whichever
 * control mechanism was discovered at open time.  After disabling the
 * cache, any lingering dirty data is synced out.  Returns 0 on success.
 */
int
blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
{
	int res = 0, flags;
	int clean_val = (wc_enable != 0) ? 1 : 0;

	(void) pthread_mutex_lock(&bc->bc_mtx);
	switch (bc->bc_wce) {
	case WCE_IOCTL:
		res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
		break;
	case WCE_FCNTL:
		if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
			if (wc_enable == 0) {
				flags |= O_DSYNC;
			} else {
				flags &= ~O_DSYNC;
			}
			if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
				res = -1;
			}
		} else {
			res = -1;
		}
		break;
	default:
		break;
	}

	/*
	 * After a successful disable of the write cache, ensure that any
	 * lingering data in the cache is synced out.
	 */
	if (res == 0 && wc_enable == 0) {
		res = fsync(bc->bc_fd);
	}
	(void) pthread_mutex_unlock(&bc->bc_mtx);

	return (res);
}
#endif /* __FreeBSD__ */