1 /*- 2 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/queue.h> 34 #include <sys/errno.h> 35 #include <sys/stat.h> 36 #include <sys/ioctl.h> 37 #include <sys/disk.h> 38 39 #include <assert.h> 40 #include <fcntl.h> 41 #include <stdio.h> 42 #include <stdlib.h> 43 #include <string.h> 44 #include <pthread.h> 45 #include <pthread_np.h> 46 #include <signal.h> 47 #include <unistd.h> 48 49 #include <machine/atomic.h> 50 51 #include "bhyverun.h" 52 #include "mevent.h" 53 #include "block_if.h" 54 55 #define BLOCKIF_SIG 0xb109b109 56 57 #define BLOCKIF_NUMTHR 8 58 #define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) 59 60 enum blockop { 61 BOP_READ, 62 BOP_WRITE, 63 BOP_FLUSH, 64 BOP_DELETE 65 }; 66 67 enum blockstat { 68 BST_FREE, 69 BST_BLOCK, 70 BST_PEND, 71 BST_BUSY, 72 BST_DONE 73 }; 74 75 struct blockif_elem { 76 TAILQ_ENTRY(blockif_elem) be_link; 77 struct blockif_req *be_req; 78 enum blockop be_op; 79 enum blockstat be_status; 80 pthread_t be_tid; 81 off_t be_block; 82 }; 83 84 struct blockif_ctxt { 85 int bc_magic; 86 int bc_fd; 87 int bc_ischr; 88 int bc_candelete; 89 int bc_rdonly; 90 off_t bc_size; 91 int bc_sectsz; 92 int bc_psectsz; 93 int bc_psectoff; 94 int bc_closing; 95 pthread_t bc_btid[BLOCKIF_NUMTHR]; 96 pthread_mutex_t bc_mtx; 97 pthread_cond_t bc_cond; 98 99 /* Request elements and free/pending/busy queues */ 100 TAILQ_HEAD(, blockif_elem) bc_freeq; 101 TAILQ_HEAD(, blockif_elem) bc_pendq; 102 TAILQ_HEAD(, blockif_elem) bc_busyq; 103 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 104 }; 105 106 static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 107 108 struct blockif_sig_elem { 109 pthread_mutex_t bse_mtx; 110 pthread_cond_t bse_cond; 111 int bse_pending; 112 struct blockif_sig_elem *bse_next; 113 }; 114 115 static struct blockif_sig_elem *blockif_bse_head; 116 117 static int 118 blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, 119 enum blockop op) 120 { 121 struct blockif_elem *be, *tbe; 122 off_t off; 123 int i; 124 125 be = TAILQ_FIRST(&bc->bc_freeq); 126 assert(be != NULL); 127 assert(be->be_status == BST_FREE); 128 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 129 be->be_req = breq; 130 be->be_op = op; 131 switch (op) { 132 case BOP_READ: 133 case BOP_WRITE: 134 case BOP_DELETE: 135 off = breq->br_offset; 136 for (i = 0; i < breq->br_iovcnt; i++) 137 off += breq->br_iov[i].iov_len; 138 break; 139 default: 140 off = OFF_MAX; 141 } 142 be->be_block = off; 143 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 144 if (tbe->be_block == breq->br_offset) 145 break; 146 } 147 if (tbe == NULL) { 148 TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { 149 if (tbe->be_block == breq->br_offset) 150 break; 151 } 152 } 153 if (tbe == NULL) 154 be->be_status = BST_PEND; 155 else 156 be->be_status = BST_BLOCK; 157 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 158 return (be->be_status == BST_PEND); 159 } 160 161 static int 162 blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) 163 { 164 struct blockif_elem *be; 165 166 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 167 if (be->be_status == BST_PEND) 168 break; 169 assert(be->be_status == BST_BLOCK); 170 } 171 if (be == NULL) 172 return (0); 173 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 174 be->be_status = BST_BUSY; 175 be->be_tid = t; 176 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 177 *bep = be; 178 return (1); 179 } 180 181 static void 182 blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 183 { 184 struct blockif_elem *tbe; 185 186 if (be->be_status == BST_DONE || be->be_status == BST_BUSY) 187 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 188 else 189 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 190 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 191 if (tbe->be_req->br_offset == be->be_block) 192 tbe->be_status = BST_PEND; 193 } 194 be->be_tid = 0; 195 be->be_status = BST_FREE; 196 be->be_req = NULL; 197 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 198 } 199 200 static void 201 blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) 202 { 203 struct blockif_req *br; 204 off_t arg[2]; 205 int err; 206 207 br = be->be_req; 208 err = 0; 209 210 switch (be->be_op) { 211 case BOP_READ: 212 if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 213 br->br_offset) < 0) 214 err = errno; 215 break; 216 case BOP_WRITE: 217 if (bc->bc_rdonly) 218 err = EROFS; 219 else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 220 br->br_offset) < 0) 221 err = errno; 222 break; 223 case BOP_FLUSH: 224 if (bc->bc_ischr) { 225 if (ioctl(bc->bc_fd, DIOCGFLUSH)) 226 err = errno; 227 } else if (fsync(bc->bc_fd)) 228 err = errno; 229 break; 230 case BOP_DELETE: 231 if (!bc->bc_candelete) 232 err = EOPNOTSUPP; 233 else if (bc->bc_rdonly) 234 err = EROFS; 235 else if (bc->bc_ischr) { 236 arg[0] = br->br_offset; 237 arg[1] = br->br_iov[0].iov_len; 238 if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) 239 err = errno; 240 } else 241 err = EOPNOTSUPP; 242 break; 243 default: 244 err = EINVAL; 245 break; 246 } 247 248 be->be_status = BST_DONE; 249 250 (*br->br_callback)(br, err); 251 } 252 253 static void * 254 blockif_thr(void *arg) 255 { 256 struct blockif_ctxt *bc; 257 struct blockif_elem *be; 258 pthread_t t; 259 260 bc = arg; 261 t = pthread_self(); 262 263 pthread_mutex_lock(&bc->bc_mtx); 264 for (;;) { 265 while (blockif_dequeue(bc, t, &be)) { 266 pthread_mutex_unlock(&bc->bc_mtx); 267 blockif_proc(bc, be); 268 pthread_mutex_lock(&bc->bc_mtx); 269 blockif_complete(bc, be); 270 } 271 /* Check ctxt status here to see if exit requested */ 272 if (bc->bc_closing) 273 break; 274 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 275 } 276 pthread_mutex_unlock(&bc->bc_mtx); 277 278 pthread_exit(NULL); 279 return (NULL); 280 } 281 282 static void 283 blockif_sigcont_handler(int signal, enum ev_type type, void *arg) 284 { 285 struct blockif_sig_elem *bse; 286 287 for (;;) { 288 /* 289 * Process the entire list even if not intended for 290 * this thread. 291 */ 292 do { 293 bse = blockif_bse_head; 294 if (bse == NULL) 295 return; 296 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 297 (uintptr_t)bse, 298 (uintptr_t)bse->bse_next)); 299 300 pthread_mutex_lock(&bse->bse_mtx); 301 bse->bse_pending = 0; 302 pthread_cond_signal(&bse->bse_cond); 303 pthread_mutex_unlock(&bse->bse_mtx); 304 } 305 } 306 307 static void 308 blockif_init(void) 309 { 310 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 311 (void) signal(SIGCONT, SIG_IGN); 312 } 313 314 struct blockif_ctxt * 315 blockif_open(const char *optstr, const char *ident) 316 { 317 char tname[MAXCOMLEN + 1]; 318 char *nopt, *xopts; 319 struct blockif_ctxt *bc; 320 struct stat sbuf; 321 struct diocgattr_arg arg; 322 off_t size, psectsz, psectoff; 323 int extra, fd, i, sectsz; 324 int nocache, sync, ro, candelete; 325 326 pthread_once(&blockif_once, blockif_init); 327 328 nocache = 0; 329 sync = 0; 330 ro = 0; 331 332 /* 333 * The first element in the optstring is always a pathname. 334 * Optional elements follow 335 */ 336 nopt = strdup(optstr); 337 for (xopts = strtok(nopt, ","); 338 xopts != NULL; 339 xopts = strtok(NULL, ",")) { 340 if (!strcmp(xopts, "nocache")) 341 nocache = 1; 342 else if (!strcmp(xopts, "sync")) 343 sync = 1; 344 else if (!strcmp(xopts, "ro")) 345 ro = 1; 346 } 347 348 extra = 0; 349 if (nocache) 350 extra |= O_DIRECT; 351 if (sync) 352 extra |= O_SYNC; 353 354 fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); 355 if (fd < 0 && !ro) { 356 /* Attempt a r/w fail with a r/o open */ 357 fd = open(nopt, O_RDONLY | extra); 358 ro = 1; 359 } 360 361 if (fd < 0) { 362 perror("Could not open backing file"); 363 return (NULL); 364 } 365 366 if (fstat(fd, &sbuf) < 0) { 367 perror("Could not stat backing file"); 368 close(fd); 369 return (NULL); 370 } 371 372 /* 373 * Deal with raw devices 374 */ 375 size = sbuf.st_size; 376 sectsz = DEV_BSIZE; 377 psectsz = psectoff = 0; 378 candelete = 0; 379 if (S_ISCHR(sbuf.st_mode)) { 380 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 381 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 382 perror("Could not fetch dev blk/sector size"); 383 close(fd); 384 return (NULL); 385 } 386 assert(size != 0); 387 assert(sectsz != 0); 388 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 389 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 390 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); 391 arg.len = sizeof(arg.value.i); 392 if (ioctl(fd, DIOCGATTR, &arg) == 0) 393 candelete = arg.value.i; 394 } else 395 psectsz = sbuf.st_blksize; 396 397 bc = calloc(1, sizeof(struct blockif_ctxt)); 398 if (bc == NULL) { 399 close(fd); 400 return (NULL); 401 } 402 403 bc->bc_magic = BLOCKIF_SIG; 404 bc->bc_fd = fd; 405 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 406 bc->bc_candelete = candelete; 407 bc->bc_rdonly = ro; 408 bc->bc_size = size; 409 bc->bc_sectsz = sectsz; 410 bc->bc_psectsz = psectsz; 411 bc->bc_psectoff = psectoff; 412 pthread_mutex_init(&bc->bc_mtx, NULL); 413 pthread_cond_init(&bc->bc_cond, NULL); 414 TAILQ_INIT(&bc->bc_freeq); 415 TAILQ_INIT(&bc->bc_pendq); 416 TAILQ_INIT(&bc->bc_busyq); 417 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 418 bc->bc_reqs[i].be_status = BST_FREE; 419 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 420 } 421 422 for (i = 0; i < BLOCKIF_NUMTHR; i++) { 423 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); 424 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); 425 pthread_set_name_np(bc->bc_btid[i], tname); 426 } 427 428 return (bc); 429 } 430 431 static int 432 blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 433 enum blockop op) 434 { 435 int err; 436 437 err = 0; 438 439 pthread_mutex_lock(&bc->bc_mtx); 440 if (!TAILQ_EMPTY(&bc->bc_freeq)) { 441 /* 442 * Enqueue and inform the block i/o thread 443 * that there is work available 444 */ 445 if (blockif_enqueue(bc, breq, op)) 446 pthread_cond_signal(&bc->bc_cond); 447 } else { 448 /* 449 * Callers are not allowed to enqueue more than 450 * the specified blockif queue limit. Return an 451 * error to indicate that the queue length has been 452 * exceeded. 453 */ 454 err = E2BIG; 455 } 456 pthread_mutex_unlock(&bc->bc_mtx); 457 458 return (err); 459 } 460 461 int 462 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 463 { 464 465 assert(bc->bc_magic == BLOCKIF_SIG); 466 return (blockif_request(bc, breq, BOP_READ)); 467 } 468 469 int 470 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 471 { 472 473 assert(bc->bc_magic == BLOCKIF_SIG); 474 return (blockif_request(bc, breq, BOP_WRITE)); 475 } 476 477 int 478 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 479 { 480 481 assert(bc->bc_magic == BLOCKIF_SIG); 482 return (blockif_request(bc, breq, BOP_FLUSH)); 483 } 484 485 int 486 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) 487 { 488 489 assert(bc->bc_magic == BLOCKIF_SIG); 490 return (blockif_request(bc, breq, BOP_DELETE)); 491 } 492 493 int 494 blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 495 { 496 struct blockif_elem *be; 497 498 assert(bc->bc_magic == BLOCKIF_SIG); 499 500 pthread_mutex_lock(&bc->bc_mtx); 501 /* 502 * Check pending requests. 503 */ 504 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 505 if (be->be_req == breq) 506 break; 507 } 508 if (be != NULL) { 509 /* 510 * Found it. 511 */ 512 blockif_complete(bc, be); 513 pthread_mutex_unlock(&bc->bc_mtx); 514 515 return (0); 516 } 517 518 /* 519 * Check in-flight requests. 520 */ 521 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 522 if (be->be_req == breq) 523 break; 524 } 525 if (be == NULL) { 526 /* 527 * Didn't find it. 528 */ 529 pthread_mutex_unlock(&bc->bc_mtx); 530 return (EINVAL); 531 } 532 533 /* 534 * Interrupt the processing thread to force it return 535 * prematurely via it's normal callback path. 536 */ 537 while (be->be_status == BST_BUSY) { 538 struct blockif_sig_elem bse, *old_head; 539 540 pthread_mutex_init(&bse.bse_mtx, NULL); 541 pthread_cond_init(&bse.bse_cond, NULL); 542 543 bse.bse_pending = 1; 544 545 do { 546 old_head = blockif_bse_head; 547 bse.bse_next = old_head; 548 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 549 (uintptr_t)old_head, 550 (uintptr_t)&bse)); 551 552 pthread_kill(be->be_tid, SIGCONT); 553 554 pthread_mutex_lock(&bse.bse_mtx); 555 while (bse.bse_pending) 556 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 557 pthread_mutex_unlock(&bse.bse_mtx); 558 } 559 560 pthread_mutex_unlock(&bc->bc_mtx); 561 562 /* 563 * The processing thread has been interrupted. Since it's not 564 * clear if the callback has been invoked yet, return EBUSY. 565 */ 566 return (EBUSY); 567 } 568 569 int 570 blockif_close(struct blockif_ctxt *bc) 571 { 572 void *jval; 573 int err, i; 574 575 err = 0; 576 577 assert(bc->bc_magic == BLOCKIF_SIG); 578 579 /* 580 * Stop the block i/o thread 581 */ 582 pthread_mutex_lock(&bc->bc_mtx); 583 bc->bc_closing = 1; 584 pthread_mutex_unlock(&bc->bc_mtx); 585 pthread_cond_broadcast(&bc->bc_cond); 586 for (i = 0; i < BLOCKIF_NUMTHR; i++) 587 pthread_join(bc->bc_btid[i], &jval); 588 589 /* XXX Cancel queued i/o's ??? */ 590 591 /* 592 * Release resources 593 */ 594 bc->bc_magic = 0; 595 close(bc->bc_fd); 596 free(bc); 597 598 return (0); 599 } 600 601 /* 602 * Return virtual C/H/S values for a given block. Use the algorithm 603 * outlined in the VHD specification to calculate values. 604 */ 605 void 606 blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 607 { 608 off_t sectors; /* total sectors of the block dev */ 609 off_t hcyl; /* cylinders times heads */ 610 uint16_t secpt; /* sectors per track */ 611 uint8_t heads; 612 613 assert(bc->bc_magic == BLOCKIF_SIG); 614 615 sectors = bc->bc_size / bc->bc_sectsz; 616 617 /* Clamp the size to the largest possible with CHS */ 618 if (sectors > 65535UL*16*255) 619 sectors = 65535UL*16*255; 620 621 if (sectors >= 65536UL*16*63) { 622 secpt = 255; 623 heads = 16; 624 hcyl = sectors / secpt; 625 } else { 626 secpt = 17; 627 hcyl = sectors / secpt; 628 heads = (hcyl + 1023) / 1024; 629 630 if (heads < 4) 631 heads = 4; 632 633 if (hcyl >= (heads * 1024) || heads > 16) { 634 secpt = 31; 635 heads = 16; 636 hcyl = sectors / secpt; 637 } 638 if (hcyl >= (heads * 1024)) { 639 secpt = 63; 640 heads = 16; 641 hcyl = sectors / secpt; 642 } 643 } 644 645 *c = hcyl / heads; 646 *h = heads; 647 *s = secpt; 648 } 649 650 /* 651 * Accessors 652 */ 653 off_t 654 blockif_size(struct blockif_ctxt *bc) 655 { 656 657 assert(bc->bc_magic == BLOCKIF_SIG); 658 return (bc->bc_size); 659 } 660 661 int 662 blockif_sectsz(struct blockif_ctxt *bc) 663 { 664 665 assert(bc->bc_magic == BLOCKIF_SIG); 666 return (bc->bc_sectsz); 667 } 668 669 void 670 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 671 { 672 673 assert(bc->bc_magic == BLOCKIF_SIG); 674 *size = bc->bc_psectsz; 675 *off = bc->bc_psectoff; 676 } 677 678 int 679 blockif_queuesz(struct blockif_ctxt *bc) 680 { 681 682 assert(bc->bc_magic == BLOCKIF_SIG); 683 return (BLOCKIF_MAXREQ - 1); 684 } 685 686 int 687 blockif_is_ro(struct blockif_ctxt *bc) 688 { 689 690 assert(bc->bc_magic == BLOCKIF_SIG); 691 return (bc->bc_rdonly); 692 } 693 694 int 695 blockif_candelete(struct blockif_ctxt *bc) 696 { 697 698 assert(bc->bc_magic == BLOCKIF_SIG); 699 return (bc->bc_candelete); 700 } 701