/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
 * Copyright 2020 Joyent Inc.
 * Copyright 2019 Western Digital Corporation.
 * Copyright 2020 Oxide Computer Company
 */

/*
 * VIRTIO BLOCK DRIVER
 *
 * This driver provides support for Virtio Block devices.  Each driver instance
 * attaches to a single underlying block device.
 *
 * REQUEST CHAIN LAYOUT
 *
 * Every request chain sent to the I/O queue has the following structure.  Each
 * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
 * the chain:
 *
 *    +-0-----------------------------------------+
 *    | struct virtio_blk_hdr                     |-----------------------\
 *    |   (written by driver, read by device)     |                       |
 *    +-1-----------------------------------------+                       |
 *    | optional data payload                     |--\                    |
 *    |   (written by driver for write requests,  |  |                    |
 *    |    or by device for read requests)        |  |                    |
 *    +-2-----------------------------------------+  |                    |
 *    | ,~`           :                              |-cookies loaned     |
 *    |/              :                        ,~`|  | from blkdev        |
 *                    :                       /   |  |                    |
 *    +-(N - 1)-----------------------------------+  |                    |
 *    | ... end of data payload.                  |  |                    |
 *    |                                           |  |                    |
 *    |                                           |--/                    |
 *    +-N-----------------------------------------+                       |
 *    | status byte                               |                       |
 *    |   (written by device, read by driver)     |--------------------\  |
 *    +-------------------------------------------+                    |  |
 *                                                                     |  |
 *    The memory for the header and status bytes (i.e., 0 and N above) |  |
 *    is allocated as a single chunk by vioblk_alloc_reqs():           |  |
 *                                                                     |  |
 *    +-------------------------------------------+                    |  |
 *    | struct virtio_blk_hdr                     |<----------------------/
 *    +-------------------------------------------+                    |
 *    | status byte                               |<-------------------/
 *    +-------------------------------------------+
 */

#include <sys/modctl.h>
#include <sys/blkdev.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/containerof.h>
#include <sys/ctype.h>
#include <sys/sysmacros.h>
#include <sys/dkioc_free_util.h>

#include "virtio.h"
#include "vioblk.h"

static void vioblk_get_id(vioblk_t *);
uint_t vioblk_int_handler(caddr_t, caddr_t);
static uint_t vioblk_poll(vioblk_t *);
static int vioblk_quiesce(dev_info_t *);
static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);


static struct dev_ops vioblk_dev_ops = {
        .devo_rev = DEVO_REV,
        .devo_refcnt = 0,

        .devo_attach = vioblk_attach,
        .devo_detach = vioblk_detach,
        .devo_quiesce = vioblk_quiesce,

        .devo_getinfo = ddi_no_info,
        .devo_identify = nulldev,
        .devo_probe = nulldev,
        .devo_reset = nodev,
        .devo_cb_ops = NULL,
        .devo_bus_ops = NULL,
        .devo_power = NULL,
};

static struct modldrv vioblk_modldrv = {
        .drv_modops = &mod_driverops,
        .drv_linkinfo = "VIRTIO block driver",
        .drv_dev_ops = &vioblk_dev_ops
};

static struct modlinkage vioblk_modlinkage = {
        .ml_rev = MODREV_1,
        .ml_linkage = { &vioblk_modldrv, NULL }
};

/*
 * DMA attribute template for header and status blocks.  We also make a
 * per-instance copy of this template with negotiated sizes from the device for
 * blkdev.
 */
static const ddi_dma_attr_t vioblk_dma_attr = {
        .dma_attr_version = DMA_ATTR_V0,
        .dma_attr_addr_lo = 0x0000000000000000,
        .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
        .dma_attr_count_max = 0x00000000FFFFFFFF,
        .dma_attr_align = 1,
        .dma_attr_burstsizes = 1,
        .dma_attr_minxfer = 1,
        .dma_attr_maxxfer = 0x00000000FFFFFFFF,
        .dma_attr_seg = 0x00000000FFFFFFFF,
        .dma_attr_sgllen = 1,
        .dma_attr_granular = 1,
        .dma_attr_flags = 0
};

static vioblk_req_t *
vioblk_req_alloc(vioblk_t *vib)
{
        vioblk_req_t *vbr;

        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
                return (NULL);
        }
        vib->vib_nreqs_alloc++;

        VERIFY0(vbr->vbr_status);
        vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;

        VERIFY3P(vbr->vbr_chain, !=, NULL);
        VERIFY3P(vbr->vbr_xfer, ==, NULL);
        VERIFY3S(vbr->vbr_error, ==, 0);

        return (vbr);
}

static void
vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
{
        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        /*
         * Check that this request was allocated, then zero the status field to
         * clear all status bits.
         */
        VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
        vbr->vbr_status = 0;

        vbr->vbr_xfer = NULL;
        vbr->vbr_error = 0;
        vbr->vbr_type = 0;
        virtio_chain_clear(vbr->vbr_chain);

        list_insert_head(&vib->vib_reqs, vbr);

        VERIFY3U(vib->vib_nreqs_alloc, >, 0);
        vib->vib_nreqs_alloc--;
}

static void
vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
{
        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
        vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;

        if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
                vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
        }

        if (vbr->vbr_xfer != NULL) {
                /*
                 * This is a blkdev framework request.
                 */
                mutex_exit(&vib->vib_mutex);
                bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
                mutex_enter(&vib->vib_mutex);
                vbr->vbr_xfer = NULL;
        }
}

static vioblk_req_t *
vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
    boolean_t polled)
{
        vioblk_req_t *vbr = NULL;

        if ((vbr = vioblk_req_alloc(vib)) == NULL) {
                vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
                return (NULL);
        }
        vbr->vbr_type = type;

        if (polled) {
                /*
                 * Mark this command as polled so that we can wait on it
                 * ourselves.
                 */
                vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
        }

        struct vioblk_req_hdr vbh;
        vbh.vbh_type = type;
        vbh.vbh_ioprio = 0;
        vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
        bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));

        /*
         * Put the header in the first descriptor.  See the block comment at
         * the top of the file for more details on the chain layout.
         */
        if (virtio_chain_append(vbr->vbr_chain,
            virtio_dma_cookie_pa(vbr->vbr_dma, 0),
            sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
            DDI_SUCCESS) {
                vioblk_req_free(vib, vbr);
                return (NULL);
        }

        return (vbr);
}

static int
vioblk_common_submit(vioblk_t *vib, vioblk_req_t *vbr)
{
        virtio_chain_t *vic = vbr->vbr_chain;
        int r;

        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        /*
         * The device will write the status byte into this last descriptor.
         * See the block comment at the top of the file for more details on the
         * chain layout.
         */
        if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
            sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
            VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
                vioblk_req_free(vib, vbr);
                return (ENOMEM);
        }

        virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
        virtio_chain_submit(vic, B_TRUE);

        if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
                /*
                 * This is not a polled request.  Our request will be freed and
                 * the caller notified later in vioblk_poll().
                 */
                return (0);
        }

        /*
         * This is a polled request.  We need to block here and wait for the
         * device to complete request processing.
         */
        while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
                if (ddi_in_panic()) {
                        /*
                         * When panicking, interrupts are disabled.  We must
                         * poll the queue manually.
                         */
                        drv_usecwait(10);
                        (void) vioblk_poll(vib);
                        continue;
                }

                /*
                 * When not panicking, the device will interrupt on command
                 * completion and vioblk_poll() will be called to wake us up.
                 */
                cv_wait(&vib->vib_cv, &vib->vib_mutex);
        }

        vioblk_complete(vib, vbr);
        r = vbr->vbr_error;
        vioblk_req_free(vib, vbr);
        return (r);
}

static int
vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
    uint64_t sector, virtio_direction_t dir)
{
        vioblk_req_t *vbr;

        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        /*
         * Allocate a polled request.
         */
        if ((vbr = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
                return (ENOMEM);
        }

        /*
         * If there is a request payload, it goes between the header and the
         * status byte.  See the block comment at the top of the file for more
         * detail on the chain layout.
         */
        if (dma != NULL) {
                virtio_chain_t *vic = vbr->vbr_chain;
                for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
                        if (virtio_chain_append(vic,
                            virtio_dma_cookie_pa(dma, n),
                            virtio_dma_cookie_size(dma, n), dir) !=
                            DDI_SUCCESS) {
                                vioblk_req_free(vib, vbr);
                                return (ENOMEM);
                        }
                }
        }

        return (vioblk_common_submit(vib, vbr));
}

static int
vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
{
        const dkioc_free_list_t *dfl = xfer->x_dfl;
        const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
        virtio_dma_t *dma = NULL;
        struct vioblk_discard_write_zeroes *wzp = NULL;

        dma = virtio_dma_alloc(vib->vib_virtio,
            dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
            DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
        if (dma == NULL)
                return (ENOMEM);

        wzp = virtio_dma_va(dma, 0);

        for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
                uint64_t start = dfl->dfl_offset + exts->dfle_start;

                const struct vioblk_discard_write_zeroes vdwz = {
                        .vdwz_sector = start >> DEV_BSHIFT,
                        .vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
                        .vdwz_flags = 0
                };

                bcopy(&vdwz, wzp, sizeof (*wzp));
        }

        if (virtio_chain_append(vic,
            virtio_dma_cookie_pa(dma, 0),
            virtio_dma_cookie_size(dma, 0),
            VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
                virtio_dma_free(dma);
                return (ENOMEM);
        }

        return (0);
}

static int
vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
        vioblk_req_t *vbr = NULL;
        uint_t total_cookies = 2;
        boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;

        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        /*
         * Ensure that this request falls within the advertised size of the
         * block device.  Be careful to avoid overflow.
         */
        if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
            (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
                vib->vib_stats->vbs_rw_badoffset.value.ui64++;
                return (EINVAL);
        }

        if ((vbr = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
            NULL) {
                return (ENOMEM);
        }
        vbr->vbr_xfer = xfer;

        /*
         * If there is a request payload, it goes between the header and the
         * status byte.  See the block comment at the top of the file for more
         * detail on the chain layout.
         */
        if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
            xfer->x_nblks > 0) {
                virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
                    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;
                virtio_chain_t *vic = vbr->vbr_chain;

                for (uint_t n = 0; n < xfer->x_ndmac; n++) {
                        ddi_dma_cookie_t dmac;

                        if (n == 0) {
                                /*
                                 * The first cookie is in the blkdev request.
                                 */
                                dmac = xfer->x_dmac;
                        } else {
                                ddi_dma_nextcookie(xfer->x_dmah, &dmac);
                        }

                        if (virtio_chain_append(vic, dmac.dmac_laddress,
                            dmac.dmac_size, dir) != DDI_SUCCESS) {
                                vioblk_req_free(vib, vbr);
                                return (ENOMEM);
                        }
                }

                total_cookies += xfer->x_ndmac;

        } else if (xfer->x_nblks > 0) {
                dev_err(vib->vib_dip, CE_PANIC,
                    "request of type %d had payload length of %lu blocks",
                    type, xfer->x_nblks);
        } else if (type == VIRTIO_BLK_T_DISCARD) {
                int r = vioblk_map_discard(vib, vbr->vbr_chain, xfer);
                if (r != 0) {
                        vioblk_req_free(vib, vbr);
                        return (r);
                }
        }

        if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
                vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
        }

        return (vioblk_common_submit(vib, vbr));
}

static int
vioblk_bd_read(void *arg, bd_xfer_t *xfer)
{
        vioblk_t *vib = arg;
        int r;

        mutex_enter(&vib->vib_mutex);
        r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
        mutex_exit(&vib->vib_mutex);

        return (r);
}

static int
vioblk_bd_write(void *arg, bd_xfer_t *xfer)
{
        vioblk_t *vib = arg;
        int r;

        mutex_enter(&vib->vib_mutex);
        r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
        mutex_exit(&vib->vib_mutex);

        return (r);
}

static int
vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
{
        vioblk_t *vib = arg;
        int r;

        mutex_enter(&vib->vib_mutex);
        if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
                /*
                 * We don't really expect to get here, because if we did not
                 * negotiate the flush feature we would not have installed this
                 * function in the blkdev ops vector.
                 */
                mutex_exit(&vib->vib_mutex);
                return (ENOTSUP);
        }

        r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
        mutex_exit(&vib->vib_mutex);

        return (r);
}

static void
vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
        vioblk_t *vib = arg;

        drive->d_qsize = vib->vib_reqs_capacity;
        drive->d_removable = B_FALSE;
        drive->d_hotpluggable = B_TRUE;
        drive->d_target = 0;
        drive->d_lun = 0;

        drive->d_vendor = "Virtio";
        drive->d_vendor_len = strlen(drive->d_vendor);

        drive->d_product = "Block Device";
        drive->d_product_len = strlen(drive->d_product);

        drive->d_serial = vib->vib_devid;
        drive->d_serial_len = strlen(drive->d_serial);

        drive->d_revision = "0000";
        drive->d_revision_len = strlen(drive->d_revision);

        if (vib->vib_can_discard) {
                drive->d_free_align = vib->vib_discard_sector_align;
                drive->d_max_free_seg = vib->vib_max_discard_seg;
                drive->d_max_free_blks = vib->vib_max_discard_sectors;
                /*
                 * The virtio 1.1 spec doesn't specify a per segment sector
                 * limit for discards -- only a limit on the total sectors in
                 * a discard request.  Therefore, we assume a vioblk device
                 * must be able to accept a single segment of
                 * vib_max_discard_sectors (when it supports discard requests)
                 * and use vib_max_discard_sectors both as the overall limit
                 * for a discard request and as the limit for a single
                 * segment.  blkdev will ensure we are never called with
                 * a dkioc_free_list_t that violates either limit.
                 */
                drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
        }
}

static int
vioblk_bd_mediainfo(void *arg, bd_media_t *media)
{
        vioblk_t *vib = (void *)arg;

        /*
         * The device protocol is specified in terms of 512 byte logical
         * blocks, regardless of the recommended I/O size which might be
         * larger.
         */
        media->m_nblks = vib->vib_nblks;
        media->m_blksize = vib->vib_blk_size;

        media->m_readonly = vib->vib_readonly;
        media->m_pblksize = vib->vib_pblk_size;
        return (0);
}

static void
vioblk_get_id(vioblk_t *vib)
{
        virtio_dma_t *dma;
        int r;

        if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
            &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
            KM_SLEEP)) == NULL) {
                return;
        }

        mutex_enter(&vib->vib_mutex);
        if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
            VIRTIO_DIR_DEVICE_WRITES)) == 0) {
                const char *b = virtio_dma_va(dma, 0);
                uint_t pos = 0;

                /*
                 * Save the entire response for debugging purposes.
                 */
                bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
                    VIRTIO_BLK_ID_BYTES);

                /*
                 * Process the returned ID.
                 */
                bzero(vib->vib_devid, sizeof (vib->vib_devid));
                for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
                        if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
                                /*
                                 * Accept a subset of printable ASCII
                                 * characters.
                                 */
                                vib->vib_devid[pos++] = b[n];
                        } else {
                                /*
                                 * Stop processing at the first sign of
                                 * trouble.
                                 */
                                break;
                        }
                }

                vib->vib_devid_fetched = B_TRUE;
        }
        mutex_exit(&vib->vib_mutex);

        virtio_dma_free(dma);
}

static int
vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
{
        vioblk_t *vib = arg;
        size_t len;

        if ((len = strlen(vib->vib_devid)) == 0) {
                /*
                 * The device has no ID.
                 */
                return (DDI_FAILURE);
        }

        return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
            devid));
}

static int
vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
{
        vioblk_t *vib = arg;
        int r = 0;

        /*
         * Since vib_can_discard is write once (and set during attach),
         * we can check if it's enabled without taking the mutex.
         */
        if (!vib->vib_can_discard) {
                return (ENOTSUP);
        }

        mutex_enter(&vib->vib_mutex);
        r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
        mutex_exit(&vib->vib_mutex);

        return (r);
}

/*
 * As the device completes processing of a request, it returns the chain for
 * that request to our I/O queue.  This routine is called in two contexts:
 *   - from the interrupt handler, in response to notification from the device
 *   - synchronously in line with request processing when panicking
 */
static uint_t
vioblk_poll(vioblk_t *vib)
{
        virtio_chain_t *vic;
        uint_t count = 0;
        boolean_t wakeup = B_FALSE;

        VERIFY(MUTEX_HELD(&vib->vib_mutex));

        while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
                vioblk_req_t *vbr = virtio_chain_data(vic);
                uint8_t status;

                virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);

                bcopy(virtio_dma_va(vbr->vbr_dma,
                    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));

                switch (status) {
                case VIRTIO_BLK_S_OK:
                        vbr->vbr_error = 0;
                        break;
                case VIRTIO_BLK_S_IOERR:
                        vbr->vbr_error = EIO;
                        vib->vib_stats->vbs_io_errors.value.ui64++;
                        break;
                case VIRTIO_BLK_S_UNSUPP:
                        vbr->vbr_error = ENOTTY;
                        vib->vib_stats->vbs_unsupp_errors.value.ui64++;
                        break;
                default:
                        vbr->vbr_error = ENXIO;
                        vib->vib_stats->vbs_nxio_errors.value.ui64++;
                        break;
                }

                count++;

                if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
                        /*
                         * This request must not be freed as it is being held
                         * by a call to vioblk_common_submit().
                         */
                        VERIFY(!(vbr->vbr_status &
                            VIOBLK_REQSTAT_POLL_COMPLETE));
                        vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
                        wakeup = B_TRUE;
                        continue;
                }

                vioblk_complete(vib, vbr);

                vioblk_req_free(vib, vbr);
        }

        if (wakeup) {
                /*
                 * Signal anybody waiting for polled command completion.
                 */
                cv_broadcast(&vib->vib_cv);
        }

        return (count);
}

uint_t
vioblk_int_handler(caddr_t arg0, caddr_t arg1)
{
        vioblk_t *vib = (vioblk_t *)arg0;
        uint_t count;

        mutex_enter(&vib->vib_mutex);
        if ((count = vioblk_poll(vib)) >
            vib->vib_stats->vbs_intr_queuemax.value.ui32) {
                vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
        }

        vib->vib_stats->vbs_intr_total.value.ui64++;
        mutex_exit(&vib->vib_mutex);

        return (DDI_INTR_CLAIMED);
}

static void
vioblk_free_reqs(vioblk_t *vib)
{
        VERIFY3U(vib->vib_nreqs_alloc, ==, 0);

        for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
                struct vioblk_req *vbr = &vib->vib_reqs_mem[i];

                VERIFY(list_link_active(&vbr->vbr_link));
                list_remove(&vib->vib_reqs, vbr);

                VERIFY0(vbr->vbr_status);

                if (vbr->vbr_chain != NULL) {
                        virtio_chain_free(vbr->vbr_chain);
                        vbr->vbr_chain = NULL;
                }
                if (vbr->vbr_dma != NULL) {
                        virtio_dma_free(vbr->vbr_dma);
                        vbr->vbr_dma = NULL;
                }
        }
        VERIFY(list_is_empty(&vib->vib_reqs));

        if (vib->vib_reqs_mem != NULL) {
                kmem_free(vib->vib_reqs_mem,
                    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
                vib->vib_reqs_mem = NULL;
                vib->vib_reqs_capacity = 0;
        }
}

static int
vioblk_alloc_reqs(vioblk_t *vib)
{
        vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
            VIRTIO_BLK_REQ_BUFS);
        vib->vib_reqs_mem = kmem_zalloc(
            sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
        vib->vib_nreqs_alloc = 0;

        for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
                list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
        }

        for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
            vbr = list_next(&vib->vib_reqs, vbr)) {
                if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
                    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
                    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
                    KM_SLEEP)) == NULL) {
                        goto fail;
                }
                vbr->vbr_chain = virtio_chain_alloc(vib->vib_vq, KM_SLEEP);
                if (vbr->vbr_chain == NULL) {
                        goto fail;
                }
                virtio_chain_data_set(vbr->vbr_chain, vbr);
        }

        return (0);

fail:
        vioblk_free_reqs(vib);
        return (ENOMEM);
}

static int
vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int instance = ddi_get_instance(dip);
        vioblk_t *vib;
        virtio_t *vio;
        boolean_t did_mutex = B_FALSE;

        if (cmd != DDI_ATTACH) {
                return (DDI_FAILURE);
        }

        if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
            NULL) {
                dev_err(dip, CE_WARN, "failed to start Virtio init");
                return (DDI_FAILURE);
        }

        vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
        vib->vib_dip = dip;
        vib->vib_virtio = vio;
        ddi_set_driver_private(dip, vib);
        list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
            offsetof(vioblk_req_t, vbr_link));

        /*
         * Determine how many scatter-gather entries we can use in a single
         * request.
         */
        vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
        if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
                vib->vib_seg_max = virtio_dev_get32(vio,
                    VIRTIO_BLK_CONFIG_SEG_MAX);

                if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
                        /*
                         * We need to be able to use at least one data segment,
                         * so we'll assume that this device is just poorly
                         * implemented and try for one.
                         */
                        vib->vib_seg_max = 1;
                }
        }

        if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
                vib->vib_max_discard_sectors = virtio_dev_get32(vio,
                    VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
                vib->vib_max_discard_seg = virtio_dev_get32(vio,
                    VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
                vib->vib_discard_sector_align = virtio_dev_get32(vio,
                    VIRTIO_BLK_CONFIG_DISCARD_ALIGN);

                if (vib->vib_max_discard_sectors == 0 ||
                    vib->vib_max_discard_seg == 0 ||
                    vib->vib_discard_sector_align == 0) {
                        vib->vib_can_discard = B_FALSE;

                        /*
                         * The hypervisor shouldn't be giving us bad values.
                         * If it is, it's probably worth notifying the
                         * operator.
                         */
                        dev_err(dip, CE_NOTE,
                            "Host is advertising DISCARD support but with bad "
                            "parameters: max_discard_sectors=%u, "
                            "max_discard_segments=%u, discard_sector_align=%u",
                            vib->vib_max_discard_sectors,
                            vib->vib_max_discard_seg,
                            vib->vib_discard_sector_align);
                } else {
                        vib->vib_can_discard = B_TRUE;
                }
        }

        /*
         * When allocating the request queue, we include two additional
         * descriptors (beyond those required for request data) to account for
         * the header and the status byte.
         */
        if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
            vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
                goto fail;
        }

        if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
                dev_err(dip, CE_WARN, "failed to complete Virtio init");
                goto fail;
        }

        cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
        mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
        did_mutex = B_TRUE;

        if ((vib->vib_kstat = kstat_create("vioblk", instance,
            "statistics", "controller", KSTAT_TYPE_NAMED,
            sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
            KSTAT_FLAG_PERSISTENT)) == NULL) {
                dev_err(dip, CE_WARN, "kstat_create failed");
                goto fail;
        }
        vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
        kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
            "total_rw_outofmemory", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
            "total_rw_badoffset", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_intr_total,
            "total_intr", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_io_errors,
            "total_io_errors", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
            "total_unsupp_errors", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
            "total_nxio_errors", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
            "total_rw_cacheflush", KSTAT_DATA_UINT64);
        kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
            "max_rw_cookies", KSTAT_DATA_UINT32);
        kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
            "max_intr_queue", KSTAT_DATA_UINT32);
        kstat_install(vib->vib_kstat);

        vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
        if ((vib->vib_nblks = virtio_dev_get64(vio,
            VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
                dev_err(dip, CE_WARN, "invalid capacity");
                goto fail;
        }

        /*
         * Determine the optimal logical block size recommended by the device.
         * This size is advisory; the protocol always deals in 512 byte blocks.
         */
        vib->vib_blk_size = DEV_BSIZE;
        if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
                uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);

                if (v != 0 && v != PCI_EINVAL32) {
                        vib->vib_blk_size = v;
                }
        }

        /*
         * Device capacity is always in 512-byte units, convert to
         * native blocks.
         */
        vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;

        /*
         * The device may also provide an advisory physical block size.
         */
        vib->vib_pblk_size = vib->vib_blk_size;
        if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
                uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);

                if (v != PCI_EINVAL8) {
                        vib->vib_pblk_size <<= v;
                }
        }

        /*
         * The maximum size for a cookie in a request.
         */
        vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
        if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
                uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);

                if (v != 0 && v != PCI_EINVAL32) {
                        vib->vib_seg_size_max = v;
                }
        }

        /*
         * Set up the DMA attributes for blkdev to use for request data.  The
         * specification is not extremely clear about whether DMA-related
         * parameters include or exclude the header and status descriptors.
         * For now, we assume they cover only the request data and not the
         * headers.
         */
        vib->vib_bd_dma_attr = vioblk_dma_attr;
        vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
        vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
        vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
            vib->vib_seg_size_max;

        if (vioblk_alloc_reqs(vib) != 0) {
                goto fail;
        }

        /*
         * The blkdev framework does not provide a way to specify that the
         * device does not support write cache flushing, except by omitting the
         * "o_sync_cache" member from the ops vector.  As "bd_alloc_handle()"
         * makes a copy of the ops vector, we can safely assemble one on the
         * stack based on negotiated features.
         *
         * Similarly, the blkdev framework does not provide a way to indicate
         * that a device supports a TRIM/UNMAP/DISCARD type operation except
         * by omitting the "o_free_space" member from the ops vector.
         */
        bd_ops_t vioblk_bd_ops = {
                .o_version = BD_OPS_CURRENT_VERSION,
                .o_drive_info = vioblk_bd_driveinfo,
                .o_media_info = vioblk_bd_mediainfo,
                .o_devid_init = vioblk_bd_devid,
                .o_sync_cache = vioblk_bd_flush,
                .o_read = vioblk_bd_read,
                .o_write = vioblk_bd_write,
                .o_free_space = vioblk_bd_free_space,
        };
        if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
                vioblk_bd_ops.o_sync_cache = NULL;
        }
        if (!vib->vib_can_discard) {
                vioblk_bd_ops.o_free_space = NULL;
        }

        vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
            &vib->vib_bd_dma_attr, KM_SLEEP);

        /*
         * Enable interrupts now so that we can request the device identity.
         */
        if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
                goto fail;
        }

        vioblk_get_id(vib);

        if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
                dev_err(dip, CE_WARN, "Failed to attach blkdev");
                goto fail;
        }

        return (DDI_SUCCESS);

fail:
        if (vib->vib_bd_h != NULL) {
                (void) bd_detach_handle(vib->vib_bd_h);
                bd_free_handle(vib->vib_bd_h);
        }
        if (vio != NULL) {
                (void) virtio_fini(vio, B_TRUE);
        }
        if (did_mutex) {
                mutex_destroy(&vib->vib_mutex);
                cv_destroy(&vib->vib_cv);
        }
        if (vib->vib_kstat != NULL) {
                kstat_delete(vib->vib_kstat);
        }
        vioblk_free_reqs(vib);
        kmem_free(vib, sizeof (*vib));
        return (DDI_FAILURE);
}

static int
vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        vioblk_t *vib = ddi_get_driver_private(dip);

        if (cmd != DDI_DETACH) {
                return (DDI_FAILURE);
        }

        mutex_enter(&vib->vib_mutex);
        if (vib->vib_nreqs_alloc > 0) {
                /*
                 * Cannot detach while there are still outstanding requests.
                 */
                mutex_exit(&vib->vib_mutex);
                return (DDI_FAILURE);
        }

        if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
                mutex_exit(&vib->vib_mutex);
                return (DDI_FAILURE);
        }

        /*
         * Tear down the Virtio framework before freeing the rest of the
         * resources.  This will ensure the interrupt handlers are no longer
         * running.
         */
        virtio_fini(vib->vib_virtio, B_FALSE);

        vioblk_free_reqs(vib);
        kstat_delete(vib->vib_kstat);

        mutex_exit(&vib->vib_mutex);
        mutex_destroy(&vib->vib_mutex);

        kmem_free(vib, sizeof (*vib));

        return (DDI_SUCCESS);
}

static int
vioblk_quiesce(dev_info_t *dip)
{
        vioblk_t *vib;

        if ((vib = ddi_get_driver_private(dip)) == NULL) {
                return (DDI_FAILURE);
        }

        return (virtio_quiesce(vib->vib_virtio));
}

int
_init(void)
{
        int rv;

        bd_mod_init(&vioblk_dev_ops);

        if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
                bd_mod_fini(&vioblk_dev_ops);
        }

        return (rv);
}

int
_fini(void)
{
        int rv;

        if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
                bd_mod_fini(&vioblk_dev_ops);
        }

        return (rv);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&vioblk_modlinkage, modinfop));
}