1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> 5 * All rights reserved. 6 * Copyright 2020 Joyent, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * $FreeBSD$ 30 */ 31 32 /* 33 * Copyright 2020 Joyent, Inc. 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include <sys/param.h> 40 #ifndef WITHOUT_CAPSICUM 41 #include <sys/capsicum.h> 42 #endif 43 #include <sys/queue.h> 44 #include <sys/errno.h> 45 #include <sys/stat.h> 46 #include <sys/ioctl.h> 47 #include <sys/disk.h> 48 #include <sys/limits.h> 49 #include <sys/uio.h> 50 #ifndef __FreeBSD__ 51 #include <sys/dkio.h> 52 #endif 53 54 #include <assert.h> 55 #ifndef WITHOUT_CAPSICUM 56 #include <capsicum_helpers.h> 57 #endif 58 #include <err.h> 59 #include <fcntl.h> 60 #include <stdio.h> 61 #include <stdlib.h> 62 #include <string.h> 63 #include <pthread.h> 64 #include <pthread_np.h> 65 #include <signal.h> 66 #include <sysexits.h> 67 #include <unistd.h> 68 69 #include <machine/atomic.h> 70 71 #include "bhyverun.h" 72 #include "debug.h" 73 #ifdef __FreeBSD__ 74 #include "mevent.h" 75 #endif 76 #include "block_if.h" 77 78 #define BLOCKIF_SIG 0xb109b109 79 80 #ifdef __FreeBSD__ 81 #define BLOCKIF_NUMTHR 8 82 #else 83 /* Enlarge to keep pace with the virtio-block ring size */ 84 #define BLOCKIF_NUMTHR 16 85 #endif 86 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) 87 88 enum blockop { 89 BOP_READ, 90 BOP_WRITE, 91 #ifndef __FreeBSD__ 92 BOP_WRITE_SYNC, 93 #endif 94 BOP_FLUSH, 95 BOP_DELETE 96 }; 97 98 enum blockstat { 99 BST_FREE, 100 BST_BLOCK, 101 BST_PEND, 102 BST_BUSY, 103 BST_DONE 104 }; 105 106 struct blockif_elem { 107 TAILQ_ENTRY(blockif_elem) be_link; 108 struct blockif_req *be_req; 109 enum blockop be_op; 110 enum blockstat be_status; 111 pthread_t be_tid; 112 off_t be_block; 113 }; 114 115 #ifndef __FreeBSD__ 116 enum blockif_wce { 117 WCE_NONE = 0, 118 WCE_IOCTL, 119 WCE_FCNTL 120 }; 121 #endif 122 123 struct blockif_ctxt { 124 int bc_magic; 125 int bc_fd; 126 int bc_ischr; 127 int bc_isgeom; 128 int bc_candelete; 129 #ifndef __FreeBSD__ 130 enum blockif_wce bc_wce; 131 #endif 132 int bc_rdonly; 133 off_t bc_size; 134 int bc_sectsz; 135 int bc_psectsz; 136 int bc_psectoff; 137 int bc_closing; 138 pthread_t bc_btid[BLOCKIF_NUMTHR]; 139 pthread_mutex_t bc_mtx; 140 pthread_cond_t bc_cond; 141 142 /* Request elements and free/pending/busy queues */ 143 TAILQ_HEAD(, blockif_elem) bc_freeq; 144 TAILQ_HEAD(, blockif_elem) bc_pendq; 145 TAILQ_HEAD(, blockif_elem) bc_busyq; 146 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 147 }; 148 149 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 150 151 struct blockif_sig_elem { 152 pthread_mutex_t bse_mtx; 153 pthread_cond_t bse_cond; 154 int bse_pending; 155 struct blockif_sig_elem *bse_next; 156 }; 157 158 static struct blockif_sig_elem *blockif_bse_head; 159 160 static int 161 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, 162 enum blockop op) 163 { 164 struct blockif_elem *be, *tbe; 165 off_t off; 166 int i; 167 168 be = TAILQ_FIRST(&bc->bc_freeq); 169 assert(be != NULL); 170 assert(be->be_status == BST_FREE); 171 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 172 be->be_req = breq; 173 be->be_op = op; 174 switch (op) { 175 case BOP_READ: 176 case BOP_WRITE: 177 #ifndef __FreeBSD__ 178 case BOP_WRITE_SYNC: 179 #endif 180 case BOP_DELETE: 181 off = breq->br_offset; 182 for (i = 0; i < breq->br_iovcnt; i++) 183 off += breq->br_iov[i].iov_len; 184 break; 185 default: 186 off = OFF_MAX; 187 } 188 be->be_block = off; 189 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 190 if (tbe->be_block == breq->br_offset) 191 break; 192 } 193 if (tbe == NULL) { 194 TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { 195 if (tbe->be_block == breq->br_offset) 196 break; 197 } 198 } 199 if (tbe == NULL) 200 be->be_status = BST_PEND; 201 else 202 be->be_status = BST_BLOCK; 203 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 204 return (be->be_status == BST_PEND); 205 } 206 207 static int 208 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) 209 { 210 struct blockif_elem *be; 211 212 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 213 if (be->be_status == BST_PEND) 214 break; 215 assert(be->be_status == BST_BLOCK); 216 } 217 if (be == NULL) 218 return (0); 219 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 220 be->be_status = BST_BUSY; 221 be->be_tid = t; 222 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 223 *bep = be; 224 return (1); 225 } 226 227 static void 228 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 229 { 230 struct blockif_elem *tbe; 231 232 if (be->be_status == BST_DONE || be->be_status == BST_BUSY) 233 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 234 else 235 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 236 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 237 if (tbe->be_req->br_offset == be->be_block) 238 tbe->be_status = BST_PEND; 239 } 240 be->be_tid = 0; 241 be->be_status = BST_FREE; 242 be->be_req = NULL; 243 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 244 } 245 246 static void 247 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) 248 { 249 struct blockif_req *br; 250 #ifdef __FreeBSD__ 251 off_t arg[2]; 252 #endif 253 ssize_t clen, len, off, boff, voff; 254 int i, err; 255 256 br = be->be_req; 257 if (br->br_iovcnt <= 1) 258 buf = NULL; 259 err = 0; 260 switch (be->be_op) { 261 case BOP_READ: 262 if (buf == NULL) { 263 if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 264 br->br_offset)) < 0) 265 err = errno; 266 else 267 br->br_resid -= len; 268 break; 269 } 270 i = 0; 271 off = voff = 0; 272 while (br->br_resid > 0) { 273 len = MIN(br->br_resid, MAXPHYS); 274 if (pread(bc->bc_fd, buf, len, br->br_offset + 275 off) < 0) { 276 err = errno; 277 break; 278 } 279 boff = 0; 280 do { 281 clen = MIN(len - boff, br->br_iov[i].iov_len - 282 voff); 283 memcpy(br->br_iov[i].iov_base + voff, 284 buf + boff, clen); 285 if (clen < br->br_iov[i].iov_len - voff) 286 voff += clen; 287 else { 288 i++; 289 voff = 0; 290 } 291 boff += clen; 292 } while (boff < len); 293 off += len; 294 br->br_resid -= len; 295 } 296 break; 297 case BOP_WRITE: 298 if (bc->bc_rdonly) { 299 err = EROFS; 300 break; 301 } 302 if (buf == NULL) { 303 if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 304 br->br_offset)) < 0) 305 err = errno; 306 else 307 br->br_resid -= len; 308 break; 309 } 310 i = 0; 311 off = voff = 0; 312 while (br->br_resid > 0) { 313 len = MIN(br->br_resid, MAXPHYS); 314 boff = 0; 315 do { 316 clen = MIN(len - boff, br->br_iov[i].iov_len - 317 voff); 318 memcpy(buf + boff, 319 br->br_iov[i].iov_base + voff, clen); 320 if (clen < br->br_iov[i].iov_len - voff) 321 voff += clen; 322 else { 323 i++; 324 voff = 0; 325 } 326 boff += clen; 327 } while (boff < len); 328 if (pwrite(bc->bc_fd, buf, len, br->br_offset + 329 off) < 0) { 330 err = errno; 331 break; 332 } 333 off += len; 334 br->br_resid -= len; 335 } 336 break; 337 case BOP_FLUSH: 338 #ifdef __FreeBSD__ 339 if (bc->bc_ischr) { 340 if (ioctl(bc->bc_fd, DIOCGFLUSH)) 341 err = errno; 342 } else if (fsync(bc->bc_fd)) 343 err = errno; 344 #else 345 /* 346 * This fsync() should be adequate to flush the cache of a file 347 * or device. In VFS, the VOP_SYNC operation is converted to 348 * the appropriate ioctl in both sdev (for real devices) and 349 * zfs (for zvols). 350 */ 351 if (fsync(bc->bc_fd)) 352 err = errno; 353 #endif 354 break; 355 case BOP_DELETE: 356 if (!bc->bc_candelete) 357 err = EOPNOTSUPP; 358 else if (bc->bc_rdonly) 359 err = EROFS; 360 #ifdef __FreeBSD__ 361 else if (bc->bc_ischr) { 362 arg[0] = br->br_offset; 363 arg[1] = br->br_resid; 364 if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) 365 err = errno; 366 else 367 br->br_resid = 0; 368 } 369 else 370 err = EOPNOTSUPP; 371 #else 372 else if (bc->bc_ischr) { 373 dkioc_free_list_t dfl = { 374 .dfl_num_exts = 1, 375 .dfl_offset = 0, 376 .dfl_flags = 0, 377 .dfl_exts = { 378 { 379 .dfle_start = br->br_offset, 380 .dfle_length = br->br_resid 381 } 382 } 383 }; 384 385 if (ioctl(bc->bc_fd, DKIOCFREE, &dfl)) 386 err = errno; 387 else 388 br->br_resid = 0; 389 } else { 390 struct flock fl = { 391 .l_whence = 0, 392 .l_type = F_WRLCK, 393 .l_start = br->br_offset, 394 .l_len = br->br_resid 395 }; 396 397 if (fcntl(bc->bc_fd, F_FREESP, &fl)) 398 err = errno; 399 else 400 br->br_resid = 0; 401 } 402 #endif 403 break; 404 default: 405 err = EINVAL; 406 break; 407 } 408 409 be->be_status = BST_DONE; 410 411 (*br->br_callback)(br, err); 412 } 413 414 static void * 415 blockif_thr(void *arg) 416 { 417 struct blockif_ctxt *bc; 418 struct blockif_elem *be; 419 pthread_t t; 420 uint8_t *buf; 421 422 bc = arg; 423 if (bc->bc_isgeom) 424 buf = malloc(MAXPHYS); 425 else 426 buf = NULL; 427 t = pthread_self(); 428 429 pthread_mutex_lock(&bc->bc_mtx); 430 for (;;) { 431 while (blockif_dequeue(bc, t, &be)) { 432 pthread_mutex_unlock(&bc->bc_mtx); 433 blockif_proc(bc, be, buf); 434 pthread_mutex_lock(&bc->bc_mtx); 435 blockif_complete(bc, be); 436 } 437 /* Check ctxt status here to see if exit requested */ 438 if (bc->bc_closing) 439 break; 440 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 441 } 442 pthread_mutex_unlock(&bc->bc_mtx); 443 444 if (buf) 445 free(buf); 446 pthread_exit(NULL); 447 return (NULL); 448 } 449 450 #ifdef __FreeBSD__ 451 static void 452 blockif_sigcont_handler(int signal, enum ev_type type, void *arg) 453 #else 454 static void 455 blockif_sigcont_handler(int signal) 456 #endif 457 { 458 struct blockif_sig_elem *bse; 459 460 for (;;) { 461 /* 462 * Process the entire list even if not intended for 463 * this thread. 464 */ 465 do { 466 bse = blockif_bse_head; 467 if (bse == NULL) 468 return; 469 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 470 (uintptr_t)bse, 471 (uintptr_t)bse->bse_next)); 472 473 pthread_mutex_lock(&bse->bse_mtx); 474 bse->bse_pending = 0; 475 pthread_cond_signal(&bse->bse_cond); 476 pthread_mutex_unlock(&bse->bse_mtx); 477 } 478 } 479 480 static void 481 blockif_init(void) 482 { 483 #ifdef __FreeBSD__ 484 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 485 (void) signal(SIGCONT, SIG_IGN); 486 #else 487 (void) sigset(SIGCONT, blockif_sigcont_handler); 488 #endif 489 } 490 491 struct blockif_ctxt * 492 blockif_open(const char *optstr, const char *ident) 493 { 494 char tname[MAXCOMLEN + 1]; 495 #ifdef __FreeBSD__ 496 char name[MAXPATHLEN]; 497 char *nopt, *xopts, *cp; 498 #else 499 char *nopt, *xopts, *cp = NULL; 500 #endif 501 struct blockif_ctxt *bc; 502 struct stat sbuf; 503 #ifdef __FreeBSD__ 504 struct diocgattr_arg arg; 505 #else 506 enum blockif_wce wce = WCE_NONE; 507 #endif 508 off_t size, psectsz, psectoff; 509 int extra, fd, i, sectsz; 510 int nocache, sync, ro, candelete, geom, ssopt, pssopt; 511 int nodelete; 512 513 #ifndef WITHOUT_CAPSICUM 514 cap_rights_t rights; 515 cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; 516 #endif 517 518 pthread_once(&blockif_once, blockif_init); 519 520 fd = -1; 521 ssopt = 0; 522 nocache = 0; 523 sync = 0; 524 ro = 0; 525 nodelete = 0; 526 527 /* 528 * The first element in the optstring is always a pathname. 529 * Optional elements follow 530 */ 531 nopt = xopts = strdup(optstr); 532 while (xopts != NULL) { 533 cp = strsep(&xopts, ","); 534 if (cp == nopt) /* file or device pathname */ 535 continue; 536 else if (!strcmp(cp, "nocache")) 537 nocache = 1; 538 else if (!strcmp(cp, "nodelete")) 539 nodelete = 1; 540 else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) 541 sync = 1; 542 else if (!strcmp(cp, "ro")) 543 ro = 1; 544 else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) 545 ; 546 else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) 547 pssopt = ssopt; 548 else { 549 EPRINTLN("Invalid device option \"%s\"", cp); 550 goto err; 551 } 552 } 553 554 extra = 0; 555 if (nocache) 556 extra |= O_DIRECT; 557 if (sync) 558 extra |= O_SYNC; 559 560 fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); 561 if (fd < 0 && !ro) { 562 /* Attempt a r/w fail with a r/o open */ 563 fd = open(nopt, O_RDONLY | extra); 564 ro = 1; 565 } 566 567 if (fd < 0) { 568 warn("Could not open backing file: %s", nopt); 569 goto err; 570 } 571 572 if (fstat(fd, &sbuf) < 0) { 573 warn("Could not stat backing file %s", nopt); 574 goto err; 575 } 576 577 #ifndef WITHOUT_CAPSICUM 578 cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, 579 CAP_WRITE); 580 if (ro) 581 cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); 582 583 if (caph_rights_limit(fd, &rights) == -1) 584 errx(EX_OSERR, "Unable to apply rights for sandbox"); 585 #endif 586 587 /* 588 * Deal with raw devices 589 */ 590 size = sbuf.st_size; 591 sectsz = DEV_BSIZE; 592 psectsz = psectoff = 0; 593 candelete = geom = 0; 594 #ifdef __FreeBSD__ 595 if (S_ISCHR(sbuf.st_mode)) { 596 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 597 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 598 perror("Could not fetch dev blk/sector size"); 599 goto err; 600 } 601 assert(size != 0); 602 assert(sectsz != 0); 603 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 604 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 605 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); 606 arg.len = sizeof(arg.value.i); 607 if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) 608 candelete = arg.value.i; 609 if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) 610 geom = 1; 611 } else { 612 psectsz = sbuf.st_blksize; 613 } 614 #else 615 psectsz = sbuf.st_blksize; 616 if (S_ISCHR(sbuf.st_mode)) { 617 struct dk_minfo_ext dkmext; 618 int wce_val; 619 620 /* Look for a more accurate physical blocksize */ 621 if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { 622 psectsz = dkmext.dki_pbsize; 623 } 624 /* See if a configurable write cache is present and working */ 625 if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { 626 /* 627 * If WCE is already active, disable it until the 628 * specific device driver calls for its return. If it 629 * is not active, toggle it on and off to verify that 630 * such actions are possible. 631 */ 632 if (wce_val != 0) { 633 wce_val = 0; 634 /* 635 * Inability to disable the cache is a threat 636 * to data durability. 637 */ 638 assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); 639 wce = WCE_IOCTL; 640 } else { 641 int r1, r2; 642 643 wce_val = 1; 644 r1 = ioctl(fd, DKIOCSETWCE, &wce_val); 645 wce_val = 0; 646 r2 = ioctl(fd, DKIOCSETWCE, &wce_val); 647 648 if (r1 == 0 && r2 == 0) { 649 wce = WCE_IOCTL; 650 } else { 651 /* 652 * If the cache cache toggle was not 653 * successful, ensure that the cache 654 * was not left enabled. 655 */ 656 assert(r1 != 0); 657 } 658 } 659 } 660 661 if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete)) 662 candelete = 0; 663 664 } else { 665 int flags; 666 667 if ((flags = fcntl(fd, F_GETFL)) >= 0) { 668 flags |= O_DSYNC; 669 if (fcntl(fd, F_SETFL, flags) != -1) { 670 wce = WCE_FCNTL; 671 } 672 } 673 674 /* 675 * We don't have a way to discover if a file supports the 676 * FREESP fcntl cmd (other than trying it). However, 677 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd. 678 * Nfsv4 and nfsv4 also forward the FREESP request 679 * to the server, so we always enable it for file based 680 * volumes. Anyone trying to run volumes on an unsupported 681 * configuration is on their own, and should be prepared 682 * for the requests to fail. 683 */ 684 if (nodelete == 0) 685 candelete = 1; 686 } 687 #endif 688 689 #ifndef WITHOUT_CAPSICUM 690 if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) 691 errx(EX_OSERR, "Unable to apply rights for sandbox"); 692 #endif 693 694 if (ssopt != 0) { 695 if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || 696 ssopt > pssopt) { 697 EPRINTLN("Invalid sector size %d/%d", 698 ssopt, pssopt); 699 goto err; 700 } 701 702 /* 703 * Some backend drivers (e.g. cd0, ada0) require that the I/O 704 * size be a multiple of the device's sector size. 705 * 706 * Validate that the emulated sector size complies with this 707 * requirement. 708 */ 709 if (S_ISCHR(sbuf.st_mode)) { 710 if (ssopt < sectsz || (ssopt % sectsz) != 0) { 711 EPRINTLN("Sector size %d incompatible " 712 "with underlying device sector size %d", 713 ssopt, sectsz); 714 goto err; 715 } 716 } 717 718 sectsz = ssopt; 719 psectsz = pssopt; 720 psectoff = 0; 721 } 722 723 bc = calloc(1, sizeof(struct blockif_ctxt)); 724 if (bc == NULL) { 725 perror("calloc"); 726 goto err; 727 } 728 729 bc->bc_magic = BLOCKIF_SIG; 730 bc->bc_fd = fd; 731 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 732 bc->bc_isgeom = geom; 733 bc->bc_candelete = candelete; 734 #ifndef __FreeBSD__ 735 bc->bc_wce = wce; 736 #endif 737 bc->bc_rdonly = ro; 738 bc->bc_size = size; 739 bc->bc_sectsz = sectsz; 740 bc->bc_psectsz = psectsz; 741 bc->bc_psectoff = psectoff; 742 pthread_mutex_init(&bc->bc_mtx, NULL); 743 pthread_cond_init(&bc->bc_cond, NULL); 744 TAILQ_INIT(&bc->bc_freeq); 745 TAILQ_INIT(&bc->bc_pendq); 746 TAILQ_INIT(&bc->bc_busyq); 747 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 748 bc->bc_reqs[i].be_status = BST_FREE; 749 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 750 } 751 752 for (i = 0; i < BLOCKIF_NUMTHR; i++) { 753 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); 754 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); 755 pthread_set_name_np(bc->bc_btid[i], tname); 756 } 757 758 return (bc); 759 err: 760 if (fd >= 0) 761 close(fd); 762 free(nopt); 763 return (NULL); 764 } 765 766 static int 767 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 768 enum blockop op) 769 { 770 int err; 771 772 err = 0; 773 774 pthread_mutex_lock(&bc->bc_mtx); 775 if (!TAILQ_EMPTY(&bc->bc_freeq)) { 776 /* 777 * Enqueue and inform the block i/o thread 778 * that there is work available 779 */ 780 if (blockif_enqueue(bc, breq, op)) 781 pthread_cond_signal(&bc->bc_cond); 782 } else { 783 /* 784 * Callers are not allowed to enqueue more than 785 * the specified blockif queue limit. Return an 786 * error to indicate that the queue length has been 787 * exceeded. 788 */ 789 err = E2BIG; 790 } 791 pthread_mutex_unlock(&bc->bc_mtx); 792 793 return (err); 794 } 795 796 int 797 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 798 { 799 800 assert(bc->bc_magic == BLOCKIF_SIG); 801 return (blockif_request(bc, breq, BOP_READ)); 802 } 803 804 int 805 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 806 { 807 808 assert(bc->bc_magic == BLOCKIF_SIG); 809 return (blockif_request(bc, breq, BOP_WRITE)); 810 } 811 812 int 813 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 814 { 815 816 assert(bc->bc_magic == BLOCKIF_SIG); 817 return (blockif_request(bc, breq, BOP_FLUSH)); 818 } 819 820 int 821 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) 822 { 823 824 assert(bc->bc_magic == BLOCKIF_SIG); 825 return (blockif_request(bc, breq, BOP_DELETE)); 826 } 827 828 int 829 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 830 { 831 struct blockif_elem *be; 832 833 assert(bc->bc_magic == BLOCKIF_SIG); 834 835 pthread_mutex_lock(&bc->bc_mtx); 836 /* 837 * Check pending requests. 838 */ 839 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 840 if (be->be_req == breq) 841 break; 842 } 843 if (be != NULL) { 844 /* 845 * Found it. 846 */ 847 blockif_complete(bc, be); 848 pthread_mutex_unlock(&bc->bc_mtx); 849 850 return (0); 851 } 852 853 /* 854 * Check in-flight requests. 855 */ 856 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 857 if (be->be_req == breq) 858 break; 859 } 860 if (be == NULL) { 861 /* 862 * Didn't find it. 863 */ 864 pthread_mutex_unlock(&bc->bc_mtx); 865 return (EINVAL); 866 } 867 868 /* 869 * Interrupt the processing thread to force it return 870 * prematurely via it's normal callback path. 871 */ 872 while (be->be_status == BST_BUSY) { 873 struct blockif_sig_elem bse, *old_head; 874 875 pthread_mutex_init(&bse.bse_mtx, NULL); 876 pthread_cond_init(&bse.bse_cond, NULL); 877 878 bse.bse_pending = 1; 879 880 do { 881 old_head = blockif_bse_head; 882 bse.bse_next = old_head; 883 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 884 (uintptr_t)old_head, 885 (uintptr_t)&bse)); 886 887 pthread_kill(be->be_tid, SIGCONT); 888 889 pthread_mutex_lock(&bse.bse_mtx); 890 while (bse.bse_pending) 891 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 892 pthread_mutex_unlock(&bse.bse_mtx); 893 } 894 895 pthread_mutex_unlock(&bc->bc_mtx); 896 897 /* 898 * The processing thread has been interrupted. Since it's not 899 * clear if the callback has been invoked yet, return EBUSY. 900 */ 901 return (EBUSY); 902 } 903 904 int 905 blockif_close(struct blockif_ctxt *bc) 906 { 907 void *jval; 908 int i; 909 910 assert(bc->bc_magic == BLOCKIF_SIG); 911 912 /* 913 * Stop the block i/o thread 914 */ 915 pthread_mutex_lock(&bc->bc_mtx); 916 bc->bc_closing = 1; 917 pthread_mutex_unlock(&bc->bc_mtx); 918 pthread_cond_broadcast(&bc->bc_cond); 919 for (i = 0; i < BLOCKIF_NUMTHR; i++) 920 pthread_join(bc->bc_btid[i], &jval); 921 922 /* XXX Cancel queued i/o's ??? */ 923 924 /* 925 * Release resources 926 */ 927 bc->bc_magic = 0; 928 close(bc->bc_fd); 929 free(bc); 930 931 return (0); 932 } 933 934 /* 935 * Return virtual C/H/S values for a given block. Use the algorithm 936 * outlined in the VHD specification to calculate values. 937 */ 938 void 939 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 940 { 941 off_t sectors; /* total sectors of the block dev */ 942 off_t hcyl; /* cylinders times heads */ 943 uint16_t secpt; /* sectors per track */ 944 uint8_t heads; 945 946 assert(bc->bc_magic == BLOCKIF_SIG); 947 948 sectors = bc->bc_size / bc->bc_sectsz; 949 950 /* Clamp the size to the largest possible with CHS */ 951 if (sectors > 65535UL*16*255) 952 sectors = 65535UL*16*255; 953 954 if (sectors >= 65536UL*16*63) { 955 secpt = 255; 956 heads = 16; 957 hcyl = sectors / secpt; 958 } else { 959 secpt = 17; 960 hcyl = sectors / secpt; 961 heads = (hcyl + 1023) / 1024; 962 963 if (heads < 4) 964 heads = 4; 965 966 if (hcyl >= (heads * 1024) || heads > 16) { 967 secpt = 31; 968 heads = 16; 969 hcyl = sectors / secpt; 970 } 971 if (hcyl >= (heads * 1024)) { 972 secpt = 63; 973 heads = 16; 974 hcyl = sectors / secpt; 975 } 976 } 977 978 *c = hcyl / heads; 979 *h = heads; 980 *s = secpt; 981 } 982 983 /* 984 * Accessors 985 */ 986 off_t 987 blockif_size(struct blockif_ctxt *bc) 988 { 989 990 assert(bc->bc_magic == BLOCKIF_SIG); 991 return (bc->bc_size); 992 } 993 994 int 995 blockif_sectsz(struct blockif_ctxt *bc) 996 { 997 998 assert(bc->bc_magic == BLOCKIF_SIG); 999 return (bc->bc_sectsz); 1000 } 1001 1002 void 1003 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 1004 { 1005 1006 assert(bc->bc_magic == BLOCKIF_SIG); 1007 *size = bc->bc_psectsz; 1008 *off = bc->bc_psectoff; 1009 } 1010 1011 int 1012 blockif_queuesz(struct blockif_ctxt *bc) 1013 { 1014 1015 assert(bc->bc_magic == BLOCKIF_SIG); 1016 return (BLOCKIF_MAXREQ - 1); 1017 } 1018 1019 int 1020 blockif_is_ro(struct blockif_ctxt *bc) 1021 { 1022 1023 assert(bc->bc_magic == BLOCKIF_SIG); 1024 return (bc->bc_rdonly); 1025 } 1026 1027 int 1028 blockif_candelete(struct blockif_ctxt *bc) 1029 { 1030 1031 assert(bc->bc_magic == BLOCKIF_SIG); 1032 return (bc->bc_candelete); 1033 } 1034 1035 #ifndef __FreeBSD__ 1036 int 1037 blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) 1038 { 1039 int res = 0, flags; 1040 int clean_val = (wc_enable != 0) ? 1 : 0; 1041 1042 (void) pthread_mutex_lock(&bc->bc_mtx); 1043 switch (bc->bc_wce) { 1044 case WCE_IOCTL: 1045 res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); 1046 break; 1047 case WCE_FCNTL: 1048 if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { 1049 if (wc_enable == 0) { 1050 flags |= O_DSYNC; 1051 } else { 1052 flags &= ~O_DSYNC; 1053 } 1054 if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { 1055 res = -1; 1056 } 1057 } else { 1058 res = -1; 1059 } 1060 break; 1061 default: 1062 break; 1063 } 1064 1065 /* 1066 * After a successful disable of the write cache, ensure that any 1067 * lingering data in the cache is synced out. 1068 */ 1069 if (res == 0 && wc_enable == 0) { 1070 res = fsync(bc->bc_fd); 1071 } 1072 (void) pthread_mutex_unlock(&bc->bc_mtx); 1073 1074 return (res); 1075 } 1076 #endif /* __FreeBSD__ */ 1077