/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Copyright 2020 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <sys/limits.h>
#include <sys/uio.h>
#ifndef __FreeBSD__
#include <sys/dkio.h>
#endif

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#ifdef __FreeBSD__
#include "mevent.h"
#endif
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#ifdef __FreeBSD__
#define BLOCKIF_NUMTHR	8
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR	16
#endif
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
	BOP_READ,
	BOP_WRITE,
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,
#endif
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

#ifndef __FreeBSD__
enum blockif_wce {
	WCE_NONE = 0,
	WCE_IOCTL,
	WCE_FCNTL
};
#endif

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
#ifndef __FreeBSD__
	enum blockif_wce	bc_wce;
#endif
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;
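/*
 * Overview of the request lifecycle implemented below (derived from the
 * enqueue/dequeue/complete routines): each blockif_req submitted by a
 * device model is wrapped in a blockif_elem taken from bc_freeq.  The
 * element is placed on bc_pendq as BST_PEND, or as BST_BLOCK when an
 * earlier pending/busy request ends at this request's starting offset
 * (an ordering dependency).  A worker thread moves it to bc_busyq
 * (BST_BUSY), processes it and marks it BST_DONE; blockif_complete()
 * then returns it to bc_freeq and re-marks as BST_PEND any blocked
 * requests that were queued behind it.
 */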
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
#ifndef __FreeBSD__
	case BOP_WRITE_SYNC:
#endif
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}
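/*
 * blockif_proc() is handed an optional bounce buffer of MAXPHYS bytes,
 * allocated per worker thread in blockif_thr() when the backing store is
 * a GEOM device.  Multi-segment requests on such devices are staged
 * through this buffer in MAXPHYS-sized chunks so that each pread()/
 * pwrite() is a single contiguous transfer; single-segment requests and
 * non-GEOM backends go through preadv()/pwritev() directly.
 */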
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
	struct blockif_req *br;
#ifdef __FreeBSD__
	off_t arg[2];
#endif
	ssize_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			if (pread(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= len;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    br->br_iov[i].iov_base + voff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			if (pwrite(bc->bc_fd, buf, len, br->br_offset +
			    off) < 0) {
				err = errno;
				break;
			}
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_FLUSH:
#ifdef __FreeBSD__
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
#else
		/*
		 * This fsync() should be adequate to flush the cache of a file
		 * or device.  In VFS, the VOP_SYNC operation is converted to
		 * the appropriate ioctl in both sdev (for real devices) and
		 * zfs (for zvols).
		 */
		if (fsync(bc->bc_fd))
			err = errno;
#endif
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
#ifdef __FreeBSD__
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else
			err = EOPNOTSUPP;
#else
		else if (bc->bc_ischr) {
			dkioc_free_list_t dfl = {
				.dfl_num_exts = 1,
				.dfl_offset = 0,
				.dfl_flags = 0,
				.dfl_exts = {
					{
						.dfle_start = br->br_offset,
						.dfle_length = br->br_resid
					}
				}
			};

			if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			struct flock fl = {
				.l_whence = 0,
				.l_type = F_WRLCK,
				.l_start = br->br_offset,
				.l_len = br->br_resid
			};

			if (fcntl(bc->bc_fd, F_FREESP, &fl))
				err = errno;
			else
				br->br_resid = 0;
		}
#endif
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}
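/*
 * Cancellation support: blockif_cancel() interrupts a worker that is
 * busy on the request being cancelled by pushing a blockif_sig_elem onto
 * the lock-free blockif_bse_head list and sending SIGCONT to the worker
 * with pthread_kill().  The handler below drains the entire list and
 * wakes every waiter, regardless of which thread the signal was aimed
 * at; the cancelling thread then re-checks the request state and retries
 * until the worker has moved on.
 */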
#ifdef __FreeBSD__
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
#else
static void
blockif_sigcont_handler(int signal)
#endif
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
#ifdef __FreeBSD__
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
#else
	(void) sigset(SIGCONT, blockif_sigcont_handler);
#endif
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
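/*
 * As an illustration (the device path and option values below are only
 * examples), a legacy option string such as
 *
 *	"/dev/zvol/rdsk/tank/bhyve0,nocache=true,sectorsize=512/4096"
 *
 * is split by blockif_legacy_config(): everything before the first comma
 * is stored under "path" and the remainder is handed to
 * pci_parse_legacy_config() as further key/value pairs.  blockif_open()
 * below then consumes "path", "nocache", "nodelete", "sync"/"direct",
 * "ro" and "sectorsize" from the resulting nvlist.
 */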
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
	char name[MAXPATHLEN];
#endif
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
#ifdef __FreeBSD__
	struct diocgattr_arg arg;
#else
	enum blockif_wce wce = WCE_NONE;
#endif
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
#ifndef __FreeBSD__
	pssopt = 0;
#endif
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Retry a failed r/w open with a r/o open */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
#ifdef __FreeBSD__
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
	}
#else
	psectsz = sbuf.st_blksize;
	if (S_ISCHR(sbuf.st_mode)) {
		struct dk_minfo_ext dkmext;
		int wce_val;

		/* Look for a more accurate physical blocksize */
		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
			psectsz = dkmext.dki_pbsize;
		}
		/* See if a configurable write cache is present and working */
		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
			/*
			 * If WCE is already active, disable it until the
			 * specific device driver calls for its return.  If it
			 * is not active, toggle it on and off to verify that
			 * such actions are possible.
			 */
			if (wce_val != 0) {
				wce_val = 0;
				/*
				 * Inability to disable the cache is a threat
				 * to data durability.
				 */
				assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
				wce = WCE_IOCTL;
			} else {
				int r1, r2;

				wce_val = 1;
				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
				wce_val = 0;
				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);

				if (r1 == 0 && r2 == 0) {
					wce = WCE_IOCTL;
				} else {
					/*
					 * If the cache toggle was not
					 * successful, ensure that the cache
					 * was not left enabled.
					 */
					assert(r1 != 0);
				}
			}
		}

		if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
			candelete = 0;

	} else {
		int flags;

		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
			flags |= O_DSYNC;
			if (fcntl(fd, F_SETFL, flags) != -1) {
				wce = WCE_FCNTL;
			}
		}

		/*
		 * We don't have a way to discover if a file supports the
		 * FREESP fcntl cmd (other than trying it).  However,
		 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
		 * NFSv3 and NFSv4 also forward the FREESP request
		 * to the server, so we always enable it for file based
		 * volumes.  Anyone trying to run volumes on an unsupported
		 * configuration is on their own, and should be prepared
		 * for the requests to fail.
		 */
		if (nodelete == 0)
			candelete = 1;
	}
#endif

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
#ifndef __FreeBSD__
	bc->bc_wce = wce;
#endif
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}
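/*
 * Request submission below is asynchronous: a successful
 * blockif_read/write/flush/delete() call only queues the blockif_req,
 * and completion is reported later by invoking br_callback from one of
 * the worker threads, with br_resid updated to reflect any
 * untransferred residual.  Device models are expected to keep no more
 * than blockif_queuesz() requests outstanding; once the element pool is
 * exhausted, blockif_request() fails with E2BIG.
 */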
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit.  Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}
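/*
 * A rough usage sketch (names such as my_done, guest_buf, lba and sc are
 * illustrative only; the blockif_req fields are declared in block_if.h).
 * A device model typically submits I/O like this, with the completion
 * callback running later on a blockif worker thread:
 *
 *	static void
 *	my_done(struct blockif_req *br, int err)
 *	{
 *		// err is 0 or an errno value; br->br_param carries the
 *		// caller's cookie and br->br_resid any untransferred bytes.
 *	}
 *
 *	br->br_iov[0].iov_base = guest_buf;
 *	br->br_iov[0].iov_len = 4096;
 *	br->br_iovcnt = 1;
 *	br->br_offset = lba * blockif_sectsz(bc);
 *	br->br_resid = 4096;
 *	br->br_callback = my_done;
 *	br->br_param = sc;
 *	if (blockif_read(bc, br) != 0)
 *		// queue full (E2BIG); retry later or report an error
 */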
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block.  Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}
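/*
 * Worked example for blockif_chs(): a 1 GiB backing store with 512-byte
 * sectors has 2097152 sectors.  That is below the 65536*16*63 threshold,
 * so the code starts at 17 sectors/track and computes 121 heads, then
 * falls through both adjustment steps to end up with 63 sectors/track
 * and 16 heads, giving C/H/S = 2080/16/63 (2080 * 16 * 63 * 512 bytes,
 * slightly under the true capacity).
 */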
/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifndef __FreeBSD__
int
blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
{
	int res = 0, flags;
	int clean_val = (wc_enable != 0) ? 1 : 0;

	(void) pthread_mutex_lock(&bc->bc_mtx);
	switch (bc->bc_wce) {
	case WCE_IOCTL:
		res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
		break;
	case WCE_FCNTL:
		if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
			if (wc_enable == 0) {
				flags |= O_DSYNC;
			} else {
				flags &= ~O_DSYNC;
			}
			if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
				res = -1;
			}
		} else {
			res = -1;
		}
		break;
	default:
		break;
	}

	/*
	 * After a successful disable of the write cache, ensure that any
	 * lingering data in the cache is synced out.
	 */
	if (res == 0 && wc_enable == 0) {
		res = fsync(bc->bc_fd);
	}
	(void) pthread_mutex_unlock(&bc->bc_mtx);

	return (res);
}
#endif /* __FreeBSD__ */