/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Copyright 2020 Joyent, Inc.
33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include <sys/param.h> 39 #ifndef WITHOUT_CAPSICUM 40 #include <sys/capsicum.h> 41 #endif 42 #include <sys/queue.h> 43 #include <sys/errno.h> 44 #include <sys/stat.h> 45 #include <sys/ioctl.h> 46 #include <sys/disk.h> 47 #include <sys/limits.h> 48 #include <sys/uio.h> 49 #ifndef __FreeBSD__ 50 #include <sys/dkio.h> 51 #endif 52 53 #include <assert.h> 54 #ifndef WITHOUT_CAPSICUM 55 #include <capsicum_helpers.h> 56 #endif 57 #include <err.h> 58 #include <fcntl.h> 59 #include <stdio.h> 60 #include <stdlib.h> 61 #include <string.h> 62 #include <pthread.h> 63 #include <pthread_np.h> 64 #include <signal.h> 65 #include <sysexits.h> 66 #include <unistd.h> 67 68 #include <machine/atomic.h> 69 70 #include "bhyverun.h" 71 #ifdef __FreeBSD__ 72 #include "mevent.h" 73 #endif 74 #include "block_if.h" 75 76 #define BLOCKIF_SIG 0xb109b109 77 78 #ifdef __FreeBSD__ 79 #define BLOCKIF_NUMTHR 8 80 #else 81 /* Enlarge to keep pace with the virtio-block ring size */ 82 #define BLOCKIF_NUMTHR 16 83 #endif 84 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) 85 86 enum blockop { 87 BOP_READ, 88 BOP_WRITE, 89 #ifndef __FreeBSD__ 90 BOP_WRITE_SYNC, 91 #endif 92 BOP_FLUSH, 93 BOP_DELETE 94 }; 95 96 enum blockstat { 97 BST_FREE, 98 BST_BLOCK, 99 BST_PEND, 100 BST_BUSY, 101 BST_DONE 102 }; 103 104 struct blockif_elem { 105 TAILQ_ENTRY(blockif_elem) be_link; 106 struct blockif_req *be_req; 107 enum blockop be_op; 108 enum blockstat be_status; 109 pthread_t be_tid; 110 off_t be_block; 111 }; 112 113 #ifndef __FreeBSD__ 114 enum blockif_wce { 115 WCE_NONE = 0, 116 WCE_IOCTL, 117 WCE_FCNTL 118 }; 119 #endif 120 121 struct blockif_ctxt { 122 int bc_magic; 123 int bc_fd; 124 int bc_ischr; 125 int bc_isgeom; 126 int bc_candelete; 127 #ifndef __FreeBSD__ 128 enum blockif_wce bc_wce; 129 #endif 130 int bc_rdonly; 131 off_t bc_size; 132 int bc_sectsz; 133 int bc_psectsz; 134 int bc_psectoff; 135 int bc_closing; 136 pthread_t 
bc_btid[BLOCKIF_NUMTHR]; 137 pthread_mutex_t bc_mtx; 138 pthread_cond_t bc_cond; 139 140 /* Request elements and free/pending/busy queues */ 141 TAILQ_HEAD(, blockif_elem) bc_freeq; 142 TAILQ_HEAD(, blockif_elem) bc_pendq; 143 TAILQ_HEAD(, blockif_elem) bc_busyq; 144 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 145 }; 146 147 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 148 149 struct blockif_sig_elem { 150 pthread_mutex_t bse_mtx; 151 pthread_cond_t bse_cond; 152 int bse_pending; 153 struct blockif_sig_elem *bse_next; 154 }; 155 156 static struct blockif_sig_elem *blockif_bse_head; 157 158 static int 159 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, 160 enum blockop op) 161 { 162 struct blockif_elem *be, *tbe; 163 off_t off; 164 int i; 165 166 be = TAILQ_FIRST(&bc->bc_freeq); 167 assert(be != NULL); 168 assert(be->be_status == BST_FREE); 169 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 170 be->be_req = breq; 171 be->be_op = op; 172 switch (op) { 173 case BOP_READ: 174 case BOP_WRITE: 175 #ifndef __FreeBSD__ 176 case BOP_WRITE_SYNC: 177 #endif 178 case BOP_DELETE: 179 off = breq->br_offset; 180 for (i = 0; i < breq->br_iovcnt; i++) 181 off += breq->br_iov[i].iov_len; 182 break; 183 default: 184 off = OFF_MAX; 185 } 186 be->be_block = off; 187 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 188 if (tbe->be_block == breq->br_offset) 189 break; 190 } 191 if (tbe == NULL) { 192 TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { 193 if (tbe->be_block == breq->br_offset) 194 break; 195 } 196 } 197 if (tbe == NULL) 198 be->be_status = BST_PEND; 199 else 200 be->be_status = BST_BLOCK; 201 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 202 return (be->be_status == BST_PEND); 203 } 204 205 static int 206 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) 207 { 208 struct blockif_elem *be; 209 210 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 211 if (be->be_status == BST_PEND) 212 break; 213 assert(be->be_status == BST_BLOCK); 
214 } 215 if (be == NULL) 216 return (0); 217 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 218 be->be_status = BST_BUSY; 219 be->be_tid = t; 220 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 221 *bep = be; 222 return (1); 223 } 224 225 static void 226 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 227 { 228 struct blockif_elem *tbe; 229 230 if (be->be_status == BST_DONE || be->be_status == BST_BUSY) 231 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 232 else 233 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 234 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 235 if (tbe->be_req->br_offset == be->be_block) 236 tbe->be_status = BST_PEND; 237 } 238 be->be_tid = 0; 239 be->be_status = BST_FREE; 240 be->be_req = NULL; 241 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 242 } 243 244 static void 245 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) 246 { 247 struct blockif_req *br; 248 #ifdef __FreeBSD__ 249 off_t arg[2]; 250 #endif 251 ssize_t clen, len, off, boff, voff; 252 int i, err; 253 254 br = be->be_req; 255 if (br->br_iovcnt <= 1) 256 buf = NULL; 257 err = 0; 258 switch (be->be_op) { 259 case BOP_READ: 260 if (buf == NULL) { 261 if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 262 br->br_offset)) < 0) 263 err = errno; 264 else 265 br->br_resid -= len; 266 break; 267 } 268 i = 0; 269 off = voff = 0; 270 while (br->br_resid > 0) { 271 len = MIN(br->br_resid, MAXPHYS); 272 if (pread(bc->bc_fd, buf, len, br->br_offset + 273 off) < 0) { 274 err = errno; 275 break; 276 } 277 boff = 0; 278 do { 279 clen = MIN(len - boff, br->br_iov[i].iov_len - 280 voff); 281 memcpy(br->br_iov[i].iov_base + voff, 282 buf + boff, clen); 283 if (clen < br->br_iov[i].iov_len - voff) 284 voff += clen; 285 else { 286 i++; 287 voff = 0; 288 } 289 boff += clen; 290 } while (boff < len); 291 off += len; 292 br->br_resid -= len; 293 } 294 break; 295 case BOP_WRITE: 296 if (bc->bc_rdonly) { 297 err = EROFS; 298 break; 299 } 300 if (buf == NULL) { 301 if 
((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 302 br->br_offset)) < 0) 303 err = errno; 304 else 305 br->br_resid -= len; 306 break; 307 } 308 i = 0; 309 off = voff = 0; 310 while (br->br_resid > 0) { 311 len = MIN(br->br_resid, MAXPHYS); 312 boff = 0; 313 do { 314 clen = MIN(len - boff, br->br_iov[i].iov_len - 315 voff); 316 memcpy(buf + boff, 317 br->br_iov[i].iov_base + voff, clen); 318 if (clen < br->br_iov[i].iov_len - voff) 319 voff += clen; 320 else { 321 i++; 322 voff = 0; 323 } 324 boff += clen; 325 } while (boff < len); 326 if (pwrite(bc->bc_fd, buf, len, br->br_offset + 327 off) < 0) { 328 err = errno; 329 break; 330 } 331 off += len; 332 br->br_resid -= len; 333 } 334 break; 335 case BOP_FLUSH: 336 #ifdef __FreeBSD__ 337 if (bc->bc_ischr) { 338 if (ioctl(bc->bc_fd, DIOCGFLUSH)) 339 err = errno; 340 } else if (fsync(bc->bc_fd)) 341 err = errno; 342 #else 343 /* 344 * This fsync() should be adequate to flush the cache of a file 345 * or device. In VFS, the VOP_SYNC operation is converted to 346 * the appropriate ioctl in both sdev (for real devices) and 347 * zfs (for zvols). 
348 */ 349 if (fsync(bc->bc_fd)) 350 err = errno; 351 #endif 352 break; 353 case BOP_DELETE: 354 if (!bc->bc_candelete) 355 err = EOPNOTSUPP; 356 else if (bc->bc_rdonly) 357 err = EROFS; 358 #ifdef __FreeBSD__ 359 else if (bc->bc_ischr) { 360 arg[0] = br->br_offset; 361 arg[1] = br->br_resid; 362 if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) 363 err = errno; 364 else 365 br->br_resid = 0; 366 } 367 else 368 err = EOPNOTSUPP; 369 #else 370 else if (bc->bc_ischr) { 371 dkioc_free_list_t dfl = { 372 .dfl_num_exts = 1, 373 .dfl_offset = 0, 374 .dfl_flags = 0, 375 .dfl_exts = { 376 { 377 .dfle_start = br->br_offset, 378 .dfle_length = br->br_resid 379 } 380 } 381 }; 382 383 if (ioctl(bc->bc_fd, DKIOCFREE, &dfl)) 384 err = errno; 385 else 386 br->br_resid = 0; 387 } else { 388 struct flock fl = { 389 .l_whence = 0, 390 .l_type = F_WRLCK, 391 .l_start = br->br_offset, 392 .l_len = br->br_resid 393 }; 394 395 if (fcntl(bc->bc_fd, F_FREESP, &fl)) 396 err = errno; 397 else 398 br->br_resid = 0; 399 } 400 #endif 401 break; 402 default: 403 err = EINVAL; 404 break; 405 } 406 407 be->be_status = BST_DONE; 408 409 (*br->br_callback)(br, err); 410 } 411 412 static void * 413 blockif_thr(void *arg) 414 { 415 struct blockif_ctxt *bc; 416 struct blockif_elem *be; 417 pthread_t t; 418 uint8_t *buf; 419 420 bc = arg; 421 if (bc->bc_isgeom) 422 buf = malloc(MAXPHYS); 423 else 424 buf = NULL; 425 t = pthread_self(); 426 427 pthread_mutex_lock(&bc->bc_mtx); 428 for (;;) { 429 while (blockif_dequeue(bc, t, &be)) { 430 pthread_mutex_unlock(&bc->bc_mtx); 431 blockif_proc(bc, be, buf); 432 pthread_mutex_lock(&bc->bc_mtx); 433 blockif_complete(bc, be); 434 } 435 /* Check ctxt status here to see if exit requested */ 436 if (bc->bc_closing) 437 break; 438 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 439 } 440 pthread_mutex_unlock(&bc->bc_mtx); 441 442 if (buf) 443 free(buf); 444 pthread_exit(NULL); 445 return (NULL); 446 } 447 448 #ifdef __FreeBSD__ 449 static void 450 blockif_sigcont_handler(int 
signal, enum ev_type type, void *arg) 451 #else 452 static void 453 blockif_sigcont_handler(int signal) 454 #endif 455 { 456 struct blockif_sig_elem *bse; 457 458 for (;;) { 459 /* 460 * Process the entire list even if not intended for 461 * this thread. 462 */ 463 do { 464 bse = blockif_bse_head; 465 if (bse == NULL) 466 return; 467 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 468 (uintptr_t)bse, 469 (uintptr_t)bse->bse_next)); 470 471 pthread_mutex_lock(&bse->bse_mtx); 472 bse->bse_pending = 0; 473 pthread_cond_signal(&bse->bse_cond); 474 pthread_mutex_unlock(&bse->bse_mtx); 475 } 476 } 477 478 static void 479 blockif_init(void) 480 { 481 #ifdef __FreeBSD__ 482 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 483 (void) signal(SIGCONT, SIG_IGN); 484 #else 485 (void) sigset(SIGCONT, blockif_sigcont_handler); 486 #endif 487 } 488 489 struct blockif_ctxt * 490 blockif_open(const char *optstr, const char *ident) 491 { 492 char tname[MAXCOMLEN + 1]; 493 #ifdef __FreeBSD__ 494 char name[MAXPATHLEN]; 495 char *nopt, *xopts, *cp; 496 #else 497 char *nopt, *xopts, *cp = NULL; 498 #endif 499 struct blockif_ctxt *bc; 500 struct stat sbuf; 501 #ifdef __FreeBSD__ 502 struct diocgattr_arg arg; 503 #else 504 enum blockif_wce wce = WCE_NONE; 505 #endif 506 off_t size, psectsz, psectoff; 507 int extra, fd, i, sectsz; 508 int nocache, sync, ro, candelete, geom, ssopt, pssopt; 509 int nodelete; 510 511 #ifndef WITHOUT_CAPSICUM 512 cap_rights_t rights; 513 cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; 514 #endif 515 516 pthread_once(&blockif_once, blockif_init); 517 518 fd = -1; 519 ssopt = 0; 520 nocache = 0; 521 sync = 0; 522 ro = 0; 523 nodelete = 0; 524 525 /* 526 * The first element in the optstring is always a pathname. 
527 * Optional elements follow 528 */ 529 nopt = xopts = strdup(optstr); 530 while (xopts != NULL) { 531 cp = strsep(&xopts, ","); 532 if (cp == nopt) /* file or device pathname */ 533 continue; 534 else if (!strcmp(cp, "nocache")) 535 nocache = 1; 536 else if (!strcmp(cp, "nodelete")) 537 nodelete = 1; 538 else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) 539 sync = 1; 540 else if (!strcmp(cp, "ro")) 541 ro = 1; 542 else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) 543 ; 544 else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) 545 pssopt = ssopt; 546 else { 547 fprintf(stderr, "Invalid device option \"%s\"\n", cp); 548 goto err; 549 } 550 } 551 552 extra = 0; 553 if (nocache) 554 extra |= O_DIRECT; 555 if (sync) 556 extra |= O_SYNC; 557 558 fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); 559 if (fd < 0 && !ro) { 560 /* Attempt a r/w fail with a r/o open */ 561 fd = open(nopt, O_RDONLY | extra); 562 ro = 1; 563 } 564 565 if (fd < 0) { 566 warn("Could not open backing file: %s", nopt); 567 goto err; 568 } 569 570 if (fstat(fd, &sbuf) < 0) { 571 warn("Could not stat backing file %s", nopt); 572 goto err; 573 } 574 575 #ifndef WITHOUT_CAPSICUM 576 cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, 577 CAP_WRITE); 578 if (ro) 579 cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); 580 581 if (caph_rights_limit(fd, &rights) == -1) 582 errx(EX_OSERR, "Unable to apply rights for sandbox"); 583 #endif 584 585 /* 586 * Deal with raw devices 587 */ 588 size = sbuf.st_size; 589 sectsz = DEV_BSIZE; 590 psectsz = psectoff = 0; 591 candelete = geom = 0; 592 #ifdef __FreeBSD__ 593 if (S_ISCHR(sbuf.st_mode)) { 594 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 595 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 596 perror("Could not fetch dev blk/sector size"); 597 goto err; 598 } 599 assert(size != 0); 600 assert(sectsz != 0); 601 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 602 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 603 strlcpy(arg.name, 
"GEOM::candelete", sizeof(arg.name)); 604 arg.len = sizeof(arg.value.i); 605 if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) 606 candelete = arg.value.i; 607 if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) 608 geom = 1; 609 } else { 610 psectsz = sbuf.st_blksize; 611 } 612 #else 613 psectsz = sbuf.st_blksize; 614 if (S_ISCHR(sbuf.st_mode)) { 615 struct dk_minfo_ext dkmext; 616 int wce_val; 617 618 /* Look for a more accurate physical blocksize */ 619 if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { 620 psectsz = dkmext.dki_pbsize; 621 } 622 /* See if a configurable write cache is present and working */ 623 if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { 624 /* 625 * If WCE is already active, disable it until the 626 * specific device driver calls for its return. If it 627 * is not active, toggle it on and off to verify that 628 * such actions are possible. 629 */ 630 if (wce_val != 0) { 631 wce_val = 0; 632 /* 633 * Inability to disable the cache is a threat 634 * to data durability. 635 */ 636 assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); 637 wce = WCE_IOCTL; 638 } else { 639 int r1, r2; 640 641 wce_val = 1; 642 r1 = ioctl(fd, DKIOCSETWCE, &wce_val); 643 wce_val = 0; 644 r2 = ioctl(fd, DKIOCSETWCE, &wce_val); 645 646 if (r1 == 0 && r2 == 0) { 647 wce = WCE_IOCTL; 648 } else { 649 /* 650 * If the cache cache toggle was not 651 * successful, ensure that the cache 652 * was not left enabled. 653 */ 654 assert(r1 != 0); 655 } 656 } 657 } 658 659 if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete)) 660 candelete = 0; 661 662 } else { 663 int flags; 664 665 if ((flags = fcntl(fd, F_GETFL)) >= 0) { 666 flags |= O_DSYNC; 667 if (fcntl(fd, F_SETFL, flags) != -1) { 668 wce = WCE_FCNTL; 669 } 670 } 671 672 /* 673 * We don't have a way to discover if a file supports the 674 * FREESP fcntl cmd (other than trying it). However, 675 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd. 
676 * Nfsv4 and nfsv4 also forward the FREESP request 677 * to the server, so we always enable it for file based 678 * volumes. Anyone trying to run volumes on an unsupported 679 * configuration is on their own, and should be prepared 680 * for the requests to fail. 681 */ 682 if (nodelete == 0) 683 candelete = 1; 684 } 685 #endif 686 687 #ifndef WITHOUT_CAPSICUM 688 if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) 689 errx(EX_OSERR, "Unable to apply rights for sandbox"); 690 #endif 691 692 if (ssopt != 0) { 693 if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || 694 ssopt > pssopt) { 695 fprintf(stderr, "Invalid sector size %d/%d\n", 696 ssopt, pssopt); 697 goto err; 698 } 699 700 /* 701 * Some backend drivers (e.g. cd0, ada0) require that the I/O 702 * size be a multiple of the device's sector size. 703 * 704 * Validate that the emulated sector size complies with this 705 * requirement. 706 */ 707 if (S_ISCHR(sbuf.st_mode)) { 708 if (ssopt < sectsz || (ssopt % sectsz) != 0) { 709 fprintf(stderr, "Sector size %d incompatible " 710 "with underlying device sector size %d\n", 711 ssopt, sectsz); 712 goto err; 713 } 714 } 715 716 sectsz = ssopt; 717 psectsz = pssopt; 718 psectoff = 0; 719 } 720 721 bc = calloc(1, sizeof(struct blockif_ctxt)); 722 if (bc == NULL) { 723 perror("calloc"); 724 goto err; 725 } 726 727 bc->bc_magic = BLOCKIF_SIG; 728 bc->bc_fd = fd; 729 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 730 bc->bc_isgeom = geom; 731 bc->bc_candelete = candelete; 732 #ifndef __FreeBSD__ 733 bc->bc_wce = wce; 734 #endif 735 bc->bc_rdonly = ro; 736 bc->bc_size = size; 737 bc->bc_sectsz = sectsz; 738 bc->bc_psectsz = psectsz; 739 bc->bc_psectoff = psectoff; 740 pthread_mutex_init(&bc->bc_mtx, NULL); 741 pthread_cond_init(&bc->bc_cond, NULL); 742 TAILQ_INIT(&bc->bc_freeq); 743 TAILQ_INIT(&bc->bc_pendq); 744 TAILQ_INIT(&bc->bc_busyq); 745 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 746 bc->bc_reqs[i].be_status = BST_FREE; 747 TAILQ_INSERT_HEAD(&bc->bc_freeq, 
&bc->bc_reqs[i], be_link); 748 } 749 750 for (i = 0; i < BLOCKIF_NUMTHR; i++) { 751 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); 752 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); 753 pthread_set_name_np(bc->bc_btid[i], tname); 754 } 755 756 return (bc); 757 err: 758 if (fd >= 0) 759 close(fd); 760 free(nopt); 761 return (NULL); 762 } 763 764 static int 765 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 766 enum blockop op) 767 { 768 int err; 769 770 err = 0; 771 772 pthread_mutex_lock(&bc->bc_mtx); 773 if (!TAILQ_EMPTY(&bc->bc_freeq)) { 774 /* 775 * Enqueue and inform the block i/o thread 776 * that there is work available 777 */ 778 if (blockif_enqueue(bc, breq, op)) 779 pthread_cond_signal(&bc->bc_cond); 780 } else { 781 /* 782 * Callers are not allowed to enqueue more than 783 * the specified blockif queue limit. Return an 784 * error to indicate that the queue length has been 785 * exceeded. 786 */ 787 err = E2BIG; 788 } 789 pthread_mutex_unlock(&bc->bc_mtx); 790 791 return (err); 792 } 793 794 int 795 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 796 { 797 798 assert(bc->bc_magic == BLOCKIF_SIG); 799 return (blockif_request(bc, breq, BOP_READ)); 800 } 801 802 int 803 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 804 { 805 806 assert(bc->bc_magic == BLOCKIF_SIG); 807 return (blockif_request(bc, breq, BOP_WRITE)); 808 } 809 810 int 811 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 812 { 813 814 assert(bc->bc_magic == BLOCKIF_SIG); 815 return (blockif_request(bc, breq, BOP_FLUSH)); 816 } 817 818 int 819 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) 820 { 821 822 assert(bc->bc_magic == BLOCKIF_SIG); 823 return (blockif_request(bc, breq, BOP_DELETE)); 824 } 825 826 int 827 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 828 { 829 struct blockif_elem *be; 830 831 assert(bc->bc_magic == BLOCKIF_SIG); 832 833 
pthread_mutex_lock(&bc->bc_mtx); 834 /* 835 * Check pending requests. 836 */ 837 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 838 if (be->be_req == breq) 839 break; 840 } 841 if (be != NULL) { 842 /* 843 * Found it. 844 */ 845 blockif_complete(bc, be); 846 pthread_mutex_unlock(&bc->bc_mtx); 847 848 return (0); 849 } 850 851 /* 852 * Check in-flight requests. 853 */ 854 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 855 if (be->be_req == breq) 856 break; 857 } 858 if (be == NULL) { 859 /* 860 * Didn't find it. 861 */ 862 pthread_mutex_unlock(&bc->bc_mtx); 863 return (EINVAL); 864 } 865 866 /* 867 * Interrupt the processing thread to force it return 868 * prematurely via it's normal callback path. 869 */ 870 while (be->be_status == BST_BUSY) { 871 struct blockif_sig_elem bse, *old_head; 872 873 pthread_mutex_init(&bse.bse_mtx, NULL); 874 pthread_cond_init(&bse.bse_cond, NULL); 875 876 bse.bse_pending = 1; 877 878 do { 879 old_head = blockif_bse_head; 880 bse.bse_next = old_head; 881 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 882 (uintptr_t)old_head, 883 (uintptr_t)&bse)); 884 885 pthread_kill(be->be_tid, SIGCONT); 886 887 pthread_mutex_lock(&bse.bse_mtx); 888 while (bse.bse_pending) 889 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 890 pthread_mutex_unlock(&bse.bse_mtx); 891 } 892 893 pthread_mutex_unlock(&bc->bc_mtx); 894 895 /* 896 * The processing thread has been interrupted. Since it's not 897 * clear if the callback has been invoked yet, return EBUSY. 898 */ 899 return (EBUSY); 900 } 901 902 int 903 blockif_close(struct blockif_ctxt *bc) 904 { 905 void *jval; 906 int i; 907 908 assert(bc->bc_magic == BLOCKIF_SIG); 909 910 /* 911 * Stop the block i/o thread 912 */ 913 pthread_mutex_lock(&bc->bc_mtx); 914 bc->bc_closing = 1; 915 pthread_mutex_unlock(&bc->bc_mtx); 916 pthread_cond_broadcast(&bc->bc_cond); 917 for (i = 0; i < BLOCKIF_NUMTHR; i++) 918 pthread_join(bc->bc_btid[i], &jval); 919 920 /* XXX Cancel queued i/o's ??? 
*/ 921 922 /* 923 * Release resources 924 */ 925 bc->bc_magic = 0; 926 close(bc->bc_fd); 927 free(bc); 928 929 return (0); 930 } 931 932 /* 933 * Return virtual C/H/S values for a given block. Use the algorithm 934 * outlined in the VHD specification to calculate values. 935 */ 936 void 937 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 938 { 939 off_t sectors; /* total sectors of the block dev */ 940 off_t hcyl; /* cylinders times heads */ 941 uint16_t secpt; /* sectors per track */ 942 uint8_t heads; 943 944 assert(bc->bc_magic == BLOCKIF_SIG); 945 946 sectors = bc->bc_size / bc->bc_sectsz; 947 948 /* Clamp the size to the largest possible with CHS */ 949 if (sectors > 65535UL*16*255) 950 sectors = 65535UL*16*255; 951 952 if (sectors >= 65536UL*16*63) { 953 secpt = 255; 954 heads = 16; 955 hcyl = sectors / secpt; 956 } else { 957 secpt = 17; 958 hcyl = sectors / secpt; 959 heads = (hcyl + 1023) / 1024; 960 961 if (heads < 4) 962 heads = 4; 963 964 if (hcyl >= (heads * 1024) || heads > 16) { 965 secpt = 31; 966 heads = 16; 967 hcyl = sectors / secpt; 968 } 969 if (hcyl >= (heads * 1024)) { 970 secpt = 63; 971 heads = 16; 972 hcyl = sectors / secpt; 973 } 974 } 975 976 *c = hcyl / heads; 977 *h = heads; 978 *s = secpt; 979 } 980 981 /* 982 * Accessors 983 */ 984 off_t 985 blockif_size(struct blockif_ctxt *bc) 986 { 987 988 assert(bc->bc_magic == BLOCKIF_SIG); 989 return (bc->bc_size); 990 } 991 992 int 993 blockif_sectsz(struct blockif_ctxt *bc) 994 { 995 996 assert(bc->bc_magic == BLOCKIF_SIG); 997 return (bc->bc_sectsz); 998 } 999 1000 void 1001 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 1002 { 1003 1004 assert(bc->bc_magic == BLOCKIF_SIG); 1005 *size = bc->bc_psectsz; 1006 *off = bc->bc_psectoff; 1007 } 1008 1009 int 1010 blockif_queuesz(struct blockif_ctxt *bc) 1011 { 1012 1013 assert(bc->bc_magic == BLOCKIF_SIG); 1014 return (BLOCKIF_MAXREQ - 1); 1015 } 1016 1017 int 1018 blockif_is_ro(struct blockif_ctxt *bc) 1019 
{ 1020 1021 assert(bc->bc_magic == BLOCKIF_SIG); 1022 return (bc->bc_rdonly); 1023 } 1024 1025 int 1026 blockif_candelete(struct blockif_ctxt *bc) 1027 { 1028 1029 assert(bc->bc_magic == BLOCKIF_SIG); 1030 return (bc->bc_candelete); 1031 } 1032 1033 #ifndef __FreeBSD__ 1034 int 1035 blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) 1036 { 1037 int res = 0, flags; 1038 int clean_val = (wc_enable != 0) ? 1 : 0; 1039 1040 (void) pthread_mutex_lock(&bc->bc_mtx); 1041 switch (bc->bc_wce) { 1042 case WCE_IOCTL: 1043 res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); 1044 break; 1045 case WCE_FCNTL: 1046 if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { 1047 if (wc_enable == 0) { 1048 flags |= O_DSYNC; 1049 } else { 1050 flags &= ~O_DSYNC; 1051 } 1052 if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { 1053 res = -1; 1054 } 1055 } else { 1056 res = -1; 1057 } 1058 break; 1059 default: 1060 break; 1061 } 1062 1063 /* 1064 * After a successful disable of the write cache, ensure that any 1065 * lingering data in the cache is synced out. 1066 */ 1067 if (res == 0 && wc_enable == 0) { 1068 res = fsync(bc->bc_fd); 1069 } 1070 (void) pthread_mutex_unlock(&bc->bc_mtx); 1071 1072 return (res); 1073 } 1074 #endif /* __FreeBSD__ */ 1075