/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
 * Copyright 2020 Joyent Inc.
 * Copyright 2019 Western Digital Corporation.
 */

/*
 * VIRTIO BLOCK DRIVER
 *
 * This driver provides support for Virtio Block devices. Each driver instance
 * attaches to a single underlying block device.
 *
 * REQUEST CHAIN LAYOUT
 *
 * Every request chain sent to the I/O queue has the following structure. Each
 * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
 * the chain:
 *
 *	+-0-----------------------------------------+
 *	| struct virtio_blk_hdr                     |-----------------------\
 *	|   (written by driver, read by device)     |                       |
 *	+-1-----------------------------------------+                       |
 *	| optional data payload                     |--\                    |
 *	|   (written by driver for write requests,  |  |                    |
 *	|    or by device for read requests)        |  |                    |
 *	+-2-----------------------------------------+  |                    |
 *	| ,~`              :                        |-cookies loaned        |
 *	|/                 :                    ,~` |  | from blkdev        |
 *	:                 /                         |  |                    |
 *	+-(N - 1)------------------------------------+  |                   |
 *	| ... end of data payload.                  |  |                    |
 *	|                                           |  |                    |
 *	|                                           |--/                    |
 *	+-N-----------------------------------------+                       |
 *	| status byte                               |                       |
 *	|   (written by device, read by driver)     |--------------------\  |
 *	+-------------------------------------------+                    |  |
 *                                                                        |  |
 * The memory for the header and status bytes (i.e., 0 and N above)      |  |
 * is allocated as a single chunk by vioblk_alloc_reqs():                |  |
 *                                                                        |  |
 *	+-------------------------------------------+                    |  |
 *	| struct virtio_blk_hdr                     |<----------------------/
 *	+-------------------------------------------+                    |
 *	| status byte                               |<-------------------/
 *	+-------------------------------------------+
 */

#include <sys/modctl.h>
#include <sys/blkdev.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/containerof.h>
#include <sys/ctype.h>
#include <sys/sysmacros.h>
#include <sys/dkioc_free_util.h>

#include "virtio.h"
#include "vioblk.h"

static void vioblk_get_id(vioblk_t *);
uint_t vioblk_int_handler(caddr_t, caddr_t);
static uint_t vioblk_poll(vioblk_t *);
static int vioblk_quiesce(dev_info_t *);
static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);


static struct dev_ops vioblk_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,

	.devo_attach = vioblk_attach,
	.devo_detach = vioblk_detach,
	.devo_quiesce = vioblk_quiesce,

	.devo_getinfo = ddi_no_info,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_reset = nodev,
	.devo_cb_ops = NULL,
	.devo_bus_ops = NULL,
	.devo_power = NULL,
};

static struct modldrv vioblk_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "VIRTIO block driver",
	.drv_dev_ops = &vioblk_dev_ops
};

static struct modlinkage vioblk_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &vioblk_modldrv, NULL }
};

/*
 * DMA attribute template for header and status blocks. We also make a
 * per-instance copy of this template with negotiated sizes from the device for
 * blkdev.
 */
static const ddi_dma_attr_t vioblk_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0x0000000000000000,
	.dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max = 0x00000000FFFFFFFF,
	.dma_attr_align = 1,
	.dma_attr_burstsizes = 1,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0x00000000FFFFFFFF,
	.dma_attr_seg = 0x00000000FFFFFFFF,
	.dma_attr_sgllen = 1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};

static vioblk_req_t *
vioblk_req_alloc(vioblk_t *vib)
{
	vioblk_req_t *vbr;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
		return (NULL);
	}
	vib->vib_nreqs_alloc++;

	VERIFY0(vbr->vbr_status);
	vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;

	VERIFY3P(vbr->vbr_xfer, ==, NULL);
	VERIFY3S(vbr->vbr_error, ==, 0);

	return (vbr);
}

static void
vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Check that this request was allocated, then zero the status field to
	 * clear all status bits.
	 */
	VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
	vbr->vbr_status = 0;

	vbr->vbr_xfer = NULL;
	vbr->vbr_error = 0;
	vbr->vbr_type = 0;

	list_insert_head(&vib->vib_reqs, vbr);

	VERIFY3U(vib->vib_nreqs_alloc, >, 0);
	vib->vib_nreqs_alloc--;
}

static void
vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
	vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;

	if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
		vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
	}

	if (vbr->vbr_xfer != NULL) {
		/*
		 * This is a blkdev framework request.
		 */
		mutex_exit(&vib->vib_mutex);
		bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
		mutex_enter(&vib->vib_mutex);
		vbr->vbr_xfer = NULL;
	}
}

static virtio_chain_t *
vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
    boolean_t polled)
{
	vioblk_req_t *vbr = NULL;
	virtio_chain_t *vic = NULL;

	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		return (NULL);
	}
	vbr->vbr_type = type;

	if (polled) {
		/*
		 * Mark this command as polled so that we can wait on it
		 * ourselves.
		 */
		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
	}

	if ((vic = virtio_chain_alloc(vib->vib_vq, KM_NOSLEEP)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		goto fail;
	}

	struct vioblk_req_hdr vbh;
	vbh.vbh_type = type;
	vbh.vbh_ioprio = 0;
	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));

	virtio_chain_data_set(vic, vbr);

	/*
	 * Put the header in the first descriptor. See the block comment at
	 * the top of the file for more details on the chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0),
	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
	    DDI_SUCCESS) {
		goto fail;
	}

	return (vic);

fail:
	vbr->vbr_xfer = NULL;
	vioblk_req_free(vib, vbr);
	if (vic != NULL) {
		virtio_chain_free(vic);
	}
	return (NULL);
}

static int
vioblk_common_submit(vioblk_t *vib, virtio_chain_t *vic)
{
	int r;
	vioblk_req_t *vbr = virtio_chain_data(vic);

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * The device will write the status byte into this last descriptor.
	 * See the block comment at the top of the file for more details on the
	 * chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
		r = ENOMEM;
		goto out;
	}

	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(vic, B_TRUE);

	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
		/*
		 * This is not a polled request. Our request will be freed and
		 * the caller notified later in vioblk_poll().
		 */
		return (0);
	}

	/*
	 * This is a polled request. We need to block here and wait for the
	 * device to complete request processing.
	 */
	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
		if (ddi_in_panic()) {
			/*
			 * When panicking, interrupts are disabled. We must
			 * poll the queue manually.
			 */
			drv_usecwait(10);
			(void) vioblk_poll(vib);
			continue;
		}

		/*
		 * When not panicking, the device will interrupt on command
		 * completion and vioblk_poll() will be called to wake us up.
		 */
		cv_wait(&vib->vib_cv, &vib->vib_mutex);
	}

	vioblk_complete(vib, vbr);
	r = vbr->vbr_error;

out:
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

static int
vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
    uint64_t sector, virtio_direction_t dir)
{
	virtio_chain_t *vic;
	vioblk_req_t *vbr;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Allocate a polled request.
	 */
	if ((vic = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
		return (ENOMEM);
	}
	vbr = virtio_chain_data(vic);

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte. See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if (dma != NULL) {
		for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
			if (virtio_chain_append(vic,
			    virtio_dma_cookie_pa(dma, n),
			    virtio_dma_cookie_size(dma, n), dir) !=
			    DDI_SUCCESS) {
				r = ENOMEM;
				goto out;
			}
		}
	}

	return (vioblk_common_submit(vib, vic));

out:
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

static int
vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
{
	const dkioc_free_list_t *dfl = xfer->x_dfl;
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	virtio_dma_t *dma = NULL;
	struct vioblk_discard_write_zeroes *wzp = NULL;

	dma = virtio_dma_alloc(vib->vib_virtio,
	    dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
	    DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
	if (dma == NULL)
		return (ENOMEM);

	wzp = virtio_dma_va(dma, 0);

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
		uint64_t start = dfl->dfl_offset + exts->dfle_start;

		const struct vioblk_discard_write_zeroes vdwz = {
			.vdwz_sector = start >> DEV_BSHIFT,
			.vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
			.vdwz_flags = 0
		};

		bcopy(&vdwz, wzp, sizeof (*wzp));
	}

	if (virtio_chain_append(vic,
	    virtio_dma_cookie_pa(dma, 0),
	    virtio_dma_cookie_size(dma, 0),
	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
		virtio_dma_free(dma);
		return (ENOMEM);
	}

	return (0);
}

static int
vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
	virtio_chain_t *vic = NULL;
	vioblk_req_t *vbr = NULL;
	uint_t total_cookies = 2;
	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Ensure that this request falls within the advertised size of the
	 * block device. Be careful to avoid overflow.
	 */
	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
		return (EINVAL);
	}

	if ((vic = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
	    NULL) {
		return (ENOMEM);
	}
	vbr = virtio_chain_data(vic);
	vbr->vbr_xfer = xfer;

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte. See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
	    xfer->x_nblks > 0) {
		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;

		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
			ddi_dma_cookie_t dmac;

			if (n == 0) {
				/*
				 * The first cookie is in the blkdev request.
				 */
				dmac = xfer->x_dmac;
			} else {
				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
			}

			if (virtio_chain_append(vic, dmac.dmac_laddress,
			    dmac.dmac_size, dir) != DDI_SUCCESS) {
				r = ENOMEM;
				goto fail;
			}
		}

		total_cookies += xfer->x_ndmac;

	} else if (xfer->x_nblks > 0) {
		dev_err(vib->vib_dip, CE_PANIC,
		    "request of type %d had payload length of %lu blocks",
		    type, xfer->x_nblks);
	} else if (type == VIRTIO_BLK_T_DISCARD) {
		r = vioblk_map_discard(vib, vic, xfer);
		if (r != 0) {
			goto fail;
		}
	}

	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
	}

	return (vioblk_common_submit(vib, vic));

fail:
	vbr->vbr_xfer = NULL;
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

static int
vioblk_bd_read(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_write(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
		/*
		 * We don't really expect to get here, because if we did not
		 * negotiate the flush feature we would not have installed this
		 * function in the blkdev ops vector.
		 */
		mutex_exit(&vib->vib_mutex);
		return (ENOTSUP);
	}

	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static void
vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	vioblk_t *vib = arg;

	drive->d_qsize = vib->vib_reqs_capacity;
	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_TRUE;
	drive->d_target = 0;
	drive->d_lun = 0;

	drive->d_vendor = "Virtio";
	drive->d_vendor_len = strlen(drive->d_vendor);

	drive->d_product = "Block Device";
	drive->d_product_len = strlen(drive->d_product);

	drive->d_serial = vib->vib_devid;
	drive->d_serial_len = strlen(drive->d_serial);

	drive->d_revision = "0000";
	drive->d_revision_len = strlen(drive->d_revision);

	if (vib->vib_can_discard) {
		drive->d_free_align = vib->vib_discard_sector_align;
		drive->d_max_free_seg = vib->vib_max_discard_seg;
		drive->d_max_free_blks = vib->vib_max_discard_sectors;
		/*
		 * The virtio 1.1 spec doesn't specify a per-segment sector
		 * limit for discards -- only a limit on the total sectors in
		 * a discard request. Therefore, we assume a vioblk device must
		 * be able to accept a single segment of
		 * vib_max_discard_sectors (when it supports discard requests)
		 * and use vib_max_discard_sectors both as the overall limit
		 * for a discard request and as the limit for a single
		 * segment. blkdev will ensure we are never called with
		 * a dkioc_free_list_t that violates either limit.
		 */
		drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
	}
}

static int
vioblk_bd_mediainfo(void *arg, bd_media_t *media)
{
	vioblk_t *vib = (void *)arg;

	/*
	 * The device protocol is specified in terms of 512 byte logical
	 * blocks, regardless of the recommended I/O size which might be
	 * larger.
	 */
	media->m_nblks = vib->vib_nblks;
	media->m_blksize = vib->vib_blk_size;

	media->m_readonly = vib->vib_readonly;
	media->m_pblksize = vib->vib_pblk_size;
	return (0);
}

static void
vioblk_get_id(vioblk_t *vib)
{
	virtio_dma_t *dma;
	int r;

	if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
	    KM_SLEEP)) == NULL) {
		return;
	}

	mutex_enter(&vib->vib_mutex);
	if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
	    VIRTIO_DIR_DEVICE_WRITES)) == 0) {
		const char *b = virtio_dma_va(dma, 0);
		uint_t pos = 0;

		/*
		 * Save the entire response for debugging purposes.
		 */
		bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
		    VIRTIO_BLK_ID_BYTES);

		/*
		 * Process the returned ID.
		 */
		bzero(vib->vib_devid, sizeof (vib->vib_devid));
		for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
			if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
				/*
				 * Accept a subset of printable ASCII
				 * characters.
				 */
				vib->vib_devid[pos++] = b[n];
			} else {
				/*
				 * Stop processing at the first sign of
				 * trouble.
				 */
				break;
			}
		}

		vib->vib_devid_fetched = B_TRUE;
	}
	mutex_exit(&vib->vib_mutex);

	virtio_dma_free(dma);
}

static int
vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
{
	vioblk_t *vib = arg;
	size_t len;

	if ((len = strlen(vib->vib_devid)) == 0) {
		/*
		 * The device has no ID.
		 */
		return (DDI_FAILURE);
	}

	return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
	    devid));
}

static int
vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r = 0;

	/*
	 * Since vib_can_discard is write once (and set during attach),
	 * we can check if it's enabled without taking the mutex.
	 */
	if (!vib->vib_can_discard) {
		return (ENOTSUP);
	}

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

/*
 * As the device completes processing of a request, it returns the chain for
 * that request to our I/O queue. This routine is called in two contexts:
 *   - from the interrupt handler, in response to notification from the device
 *   - synchronously in line with request processing when panicking
 */
static uint_t
vioblk_poll(vioblk_t *vib)
{
	virtio_chain_t *vic;
	uint_t count = 0;
	boolean_t wakeup = B_FALSE;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
		vioblk_req_t *vbr = virtio_chain_data(vic);
		uint8_t status;

		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);

		bcopy(virtio_dma_va(vbr->vbr_dma,
		    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));

		switch (status) {
		case VIRTIO_BLK_S_OK:
			vbr->vbr_error = 0;
			break;
		case VIRTIO_BLK_S_IOERR:
			vbr->vbr_error = EIO;
			vib->vib_stats->vbs_io_errors.value.ui64++;
			break;
		case VIRTIO_BLK_S_UNSUPP:
			vbr->vbr_error = ENOTTY;
			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
			break;
		default:
			vbr->vbr_error = ENXIO;
			vib->vib_stats->vbs_nxio_errors.value.ui64++;
			break;
		}

		count++;

		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
			/*
			 * This request must not be freed as it is being held
			 * by a call to vioblk_common_submit().
			 */
			VERIFY(!(vbr->vbr_status &
			    VIOBLK_REQSTAT_POLL_COMPLETE));
			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
			wakeup = B_TRUE;
			continue;
		}

		vioblk_complete(vib, vbr);

		vioblk_req_free(vib, vbr);
		virtio_chain_free(vic);
	}

	if (wakeup) {
		/*
		 * Signal anybody waiting for polled command completion.
		 */
		cv_broadcast(&vib->vib_cv);
	}

	return (count);
}

uint_t
vioblk_int_handler(caddr_t arg0, caddr_t arg1)
{
	vioblk_t *vib = (vioblk_t *)arg0;
	uint_t count;

	mutex_enter(&vib->vib_mutex);
	if ((count = vioblk_poll(vib)) >
	    vib->vib_stats->vbs_intr_queuemax.value.ui32) {
		vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
	}

	vib->vib_stats->vbs_intr_total.value.ui64++;
	mutex_exit(&vib->vib_mutex);

	return (DDI_INTR_CLAIMED);
}

static void
vioblk_free_reqs(vioblk_t *vib)
{
	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];

		VERIFY(list_link_active(&vbr->vbr_link));
		list_remove(&vib->vib_reqs, vbr);

		VERIFY0(vbr->vbr_status);

		if (vbr->vbr_dma != NULL) {
			virtio_dma_free(vbr->vbr_dma);
			vbr->vbr_dma = NULL;
		}
	}
	VERIFY(list_is_empty(&vib->vib_reqs));

	if (vib->vib_reqs_mem != NULL) {
		kmem_free(vib->vib_reqs_mem,
		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
		vib->vib_reqs_mem = NULL;
		vib->vib_reqs_capacity = 0;
	}
}

static int
vioblk_alloc_reqs(vioblk_t *vib)
{
	vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
	    VIRTIO_BLK_REQ_BUFS);
	vib->vib_reqs_mem = kmem_zalloc(
	    sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
	vib->vib_nreqs_alloc = 0;

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
	}

	for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
	    vbr = list_next(&vib->vib_reqs, vbr)) {
		if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
		    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
		    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}
	}

	return (0);

fail:
	vioblk_free_reqs(vib);
	return (ENOMEM);
}

static int
vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int instance = ddi_get_instance(dip);
	vioblk_t *vib;
	virtio_t *vio;
	boolean_t did_mutex = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
	    NULL) {
		dev_err(dip, CE_WARN, "failed to start Virtio init");
		return (DDI_FAILURE);
	}

	vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
	vib->vib_dip = dip;
	vib->vib_virtio = vio;
	ddi_set_driver_private(dip, vib);
	list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
	    offsetof(vioblk_req_t, vbr_link));

	/*
	 * Determine how many scatter-gather entries we can use in a single
	 * request.
	 */
	vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
		vib->vib_seg_max = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_SEG_MAX);

		if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
			/*
			 * We need to be able to use at least one data segment,
			 * so we'll assume that this device is just poorly
			 * implemented and try for one.
			 */
			vib->vib_seg_max = 1;
		}
	}

	if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
		vib->vib_max_discard_sectors = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
		vib->vib_max_discard_seg = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
		vib->vib_discard_sector_align = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_DISCARD_ALIGN);

		if (vib->vib_max_discard_sectors == 0 ||
		    vib->vib_max_discard_seg == 0 ||
		    vib->vib_discard_sector_align == 0) {
			vib->vib_can_discard = B_FALSE;

			/*
			 * The hypervisor shouldn't be giving us bad values.
			 * If it is, it's probably worth notifying the
			 * operator.
			 */
			dev_err(dip, CE_NOTE,
			    "Host is advertising DISCARD support but with bad "
			    "parameters: max_discard_sectors=%u, "
			    "max_discard_segments=%u, discard_sector_align=%u",
			    vib->vib_max_discard_sectors,
			    vib->vib_max_discard_seg,
			    vib->vib_discard_sector_align);
		} else {
			vib->vib_can_discard = B_TRUE;
		}
	}

	/*
	 * When allocating the request queue, we include two additional
	 * descriptors (beyond those required for request data) to account for
	 * the header and the status byte.
	 */
	if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
	    vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
		goto fail;
	}

	if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to complete Virtio init");
		goto fail;
	}

	cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	did_mutex = B_TRUE;

	if ((vib->vib_kstat = kstat_create("vioblk", instance,
	    "statistics", "controller", KSTAT_TYPE_NAMED,
	    sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_PERSISTENT)) == NULL) {
		dev_err(dip, CE_WARN, "kstat_create failed");
		goto fail;
	}
	vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
	kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
	    "total_rw_outofmemory", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
	    "total_rw_badoffset", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_intr_total,
	    "total_intr", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_io_errors,
	    "total_io_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
	    "total_unsupp_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
	    "total_nxio_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
	    "total_rw_cacheflush", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
	    "max_rw_cookies", KSTAT_DATA_UINT32);
	kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
	    "max_intr_queue", KSTAT_DATA_UINT32);
	kstat_install(vib->vib_kstat);

	vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
	if ((vib->vib_nblks = virtio_dev_get64(vio,
	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
		dev_err(dip, CE_WARN, "invalid capacity");
		goto fail;
	}

	/*
	 * Determine the optimal logical block size recommended by the device.
	 * This size is advisory; the protocol always deals in 512 byte blocks.
	 */
	vib->vib_blk_size = DEV_BSIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_blk_size = v;
		}
	}

	/*
	 * Device capacity is always in 512-byte units, convert to
	 * native blocks.
	 */
	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;

	/*
	 * The device may also provide an advisory physical block size.
	 */
	vib->vib_pblk_size = vib->vib_blk_size;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);

		if (v != PCI_EINVAL8) {
			vib->vib_pblk_size <<= v;
		}
	}

	/*
	 * The maximum size for a cookie in a request.
	 */
	vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_seg_size_max = v;
		}
	}

	/*
	 * Set up the DMA attributes for blkdev to use for request data. The
	 * specification is not extremely clear about whether DMA-related
	 * parameters include or exclude the header and status descriptors.
	 * For now, we assume they cover only the request data and not the
	 * headers.
	 */
	vib->vib_bd_dma_attr = vioblk_dma_attr;
	vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
	vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
	vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
	    vib->vib_seg_size_max;

	if (vioblk_alloc_reqs(vib) != 0) {
		goto fail;
	}

	/*
	 * The blkdev framework does not provide a way to specify that the
	 * device does not support write cache flushing, except by omitting the
	 * "o_sync_cache" member from the ops vector. As "bd_alloc_handle()"
	 * makes a copy of the ops vector, we can safely assemble one on the
	 * stack based on negotiated features.
	 *
	 * Similarly, the blkdev framework does not provide a way to indicate
	 * if a device supports a TRIM/UNMAP/DISCARD type operation except
	 * by omitting the "o_free_space" member from the ops vector.
	 */
	bd_ops_t vioblk_bd_ops = {
		.o_version = BD_OPS_CURRENT_VERSION,
		.o_drive_info = vioblk_bd_driveinfo,
		.o_media_info = vioblk_bd_mediainfo,
		.o_devid_init = vioblk_bd_devid,
		.o_sync_cache = vioblk_bd_flush,
		.o_read = vioblk_bd_read,
		.o_write = vioblk_bd_write,
		.o_free_space = vioblk_bd_free_space,
	};
	if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
		vioblk_bd_ops.o_sync_cache = NULL;
	}
	if (!vib->vib_can_discard) {
		vioblk_bd_ops.o_free_space = NULL;
	}

	vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
	    &vib->vib_bd_dma_attr, KM_SLEEP);

	/*
	 * Enable interrupts now so that we can request the device identity.
	 */
	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
		goto fail;
	}

	vioblk_get_id(vib);

	if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "Failed to attach blkdev");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (vib->vib_bd_h != NULL) {
		(void) bd_detach_handle(vib->vib_bd_h);
		bd_free_handle(vib->vib_bd_h);
	}
	if (vio != NULL) {
		(void) virtio_fini(vio, B_TRUE);
	}
	if (did_mutex) {
		mutex_destroy(&vib->vib_mutex);
		cv_destroy(&vib->vib_cv);
	}
	if (vib->vib_kstat != NULL) {
		kstat_delete(vib->vib_kstat);
	}
	vioblk_free_reqs(vib);
	kmem_free(vib, sizeof (*vib));
	return (DDI_FAILURE);
}

static int
vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vioblk_t *vib = ddi_get_driver_private(dip);

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vib->vib_mutex);
	if (vib->vib_nreqs_alloc > 0) {
		/*
		 * Cannot detach while there are still outstanding requests.
		 */
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	/*
	 * Tear down the Virtio framework before freeing the rest of the
	 * resources. This will ensure the interrupt handlers are no longer
	 * running.
	 */
	virtio_fini(vib->vib_virtio, B_FALSE);

	vioblk_free_reqs(vib);
	kstat_delete(vib->vib_kstat);

	mutex_exit(&vib->vib_mutex);
	mutex_destroy(&vib->vib_mutex);

	kmem_free(vib, sizeof (*vib));

	return (DDI_SUCCESS);
}

static int
vioblk_quiesce(dev_info_t *dip)
{
	vioblk_t *vib;

	if ((vib = ddi_get_driver_private(dip)) == NULL) {
		return (DDI_FAILURE);
	}

	return (virtio_quiesce(vib->vib_virtio));
}

int
_init(void)
{
	int rv;

	bd_mod_init(&vioblk_dev_ops);

	if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_fini(void)
{
	int rv;

	if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&vioblk_modlinkage, modinfop));
}