/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
 * Copyright 2020 Joyent Inc.
 * Copyright 2019 Western Digital Corporation.
 * Copyright 2020 Oxide Computer Company
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * VIRTIO BLOCK DRIVER
 *
 * This driver provides support for Virtio Block devices.  Each driver
 * instance attaches to a single underlying block device.
 *
 * REQUEST CHAIN LAYOUT
 *
 * Every request chain sent to the I/O queue has the following structure.
 * Each box in the diagram represents a descriptor entry (i.e., a DMA cookie)
 * within the chain:
 *
 *   +-0-----------------------------------------+
 *   | struct virtio_blk_hdr                     |-----------------------\
 *   | (written by driver, read by device)       |                       |
 *   +-1-----------------------------------------+                       |
 *   | optional data payload                     |--\                    |
 *   | (written by driver for write requests,    |  |                    |
 *   |  or by device for read requests)          |  |                    |
 *   +-2-----------------------------------------+  |                    |
 *   | ,~`              :                        |-cookies loaned        |
 *   |/                 :                     ,~`|  | from blkdev        |
 *                      :                     /  |  |                    |
 *   +-(N - 1)-----------------------------------+  |                    |
 *   | ... end of data payload.                  |  |                    |
 *   |                                           |  |                    |
 *   |                                           |--/                    |
 *   +-N-----------------------------------------+                       |
 *   | status byte                               |                       |
 *   | (written by device, read by driver)       |--------------------\  |
 *   +-------------------------------------------+                    |  |
 *                                                                    |  |
 * The memory for the header and status bytes (i.e., 0 and N above)  |  |
 * is allocated as a single chunk by vioblk_alloc_reqs():            |  |
 *                                                                    |  |
 *   +-------------------------------------------+                    |  |
 *   | struct virtio_blk_hdr                     |<----------------------/
 *   +-------------------------------------------+                    |
 *   | status byte                               |<-------------------/
 *   +-------------------------------------------+
 */
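
/*
 * As an illustrative sketch only (hypothetical placeholder names, not
 * compiled code), a single-cookie read request is assembled roughly as
 * follows; vioblk_common_start(), vioblk_request() and
 * vioblk_common_submit() below contain the real logic:
 *
 *	(void) virtio_chain_append(vic, header_pa,
 *	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS);
 *	(void) virtio_chain_append(vic, payload_pa, payload_size,
 *	    VIRTIO_DIR_DEVICE_WRITES);
 *	(void) virtio_chain_append(vic, status_pa, sizeof (uint8_t),
 *	    VIRTIO_DIR_DEVICE_WRITES);
 *	virtio_chain_submit(vic, B_TRUE);
 *
 * For a write request the payload direction is reversed
 * (VIRTIO_DIR_DEVICE_READS), since the device then reads the data from
 * memory rather than writing it.
 */
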

#include <sys/modctl.h>
#include <sys/blkdev.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/containerof.h>
#include <sys/ctype.h>
#include <sys/sysmacros.h>
#include <sys/dkioc_free_util.h>

#include "virtio.h"
#include "vioblk.h"

static void vioblk_get_id(vioblk_t *);
static uint_t vioblk_int_handler(caddr_t, caddr_t);
static uint_t vioblk_poll(vioblk_t *);
static int vioblk_quiesce(dev_info_t *);
static int vioblk_read_capacity(vioblk_t *);
static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);


static struct dev_ops vioblk_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,

	.devo_attach = vioblk_attach,
	.devo_detach = vioblk_detach,
	.devo_quiesce = vioblk_quiesce,

	.devo_getinfo = ddi_no_info,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_reset = nodev,
	.devo_cb_ops = NULL,
	.devo_bus_ops = NULL,
	.devo_power = NULL,
};

static struct modldrv vioblk_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "VIRTIO block driver",
	.drv_dev_ops = &vioblk_dev_ops
};

static struct modlinkage vioblk_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &vioblk_modldrv, NULL }
};

/*
 * DMA attribute template for header and status blocks.  We also make a
 * per-instance copy of this template with negotiated sizes from the device
 * for blkdev.
 */
static const ddi_dma_attr_t vioblk_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0x0000000000000000,
	.dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max = 0x00000000FFFFFFFF,
	.dma_attr_align = 1,
	.dma_attr_burstsizes = 1,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0x00000000FFFFFFFF,
	.dma_attr_seg = 0x00000000FFFFFFFF,
	.dma_attr_sgllen = 1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};

static vioblk_req_t *
vioblk_req_alloc(vioblk_t *vib)
{
	vioblk_req_t *vbr;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
		return (NULL);
	}
	vib->vib_nreqs_alloc++;

	VERIFY0(vbr->vbr_status);
	vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;

	VERIFY3P(vbr->vbr_chain, !=, NULL);
	VERIFY3P(vbr->vbr_xfer, ==, NULL);
	VERIFY3S(vbr->vbr_error, ==, 0);

	return (vbr);
}

static void
vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Check that this request was allocated, then zero the status field
	 * to clear all status bits.
	 */
	VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
	vbr->vbr_status = 0;

	vbr->vbr_xfer = NULL;
	vbr->vbr_error = 0;
	vbr->vbr_type = 0;
	virtio_chain_clear(vbr->vbr_chain);

	list_insert_head(&vib->vib_reqs, vbr);

	VERIFY3U(vib->vib_nreqs_alloc, >, 0);
	vib->vib_nreqs_alloc--;
}

static void
vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
	vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;

	if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
		vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
	}

	if (vbr->vbr_xfer != NULL) {
		/*
		 * This is a blkdev framework request.
		 */
		mutex_exit(&vib->vib_mutex);
		bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
		mutex_enter(&vib->vib_mutex);
		vbr->vbr_xfer = NULL;
	}
}

static vioblk_req_t *
vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
    boolean_t polled)
{
	vioblk_req_t *vbr = NULL;

	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		return (NULL);
	}
	vbr->vbr_type = type;

	if (polled) {
		/*
		 * Mark this command as polled so that we can wait on it
		 * ourselves.
		 */
		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
	}

	struct vioblk_req_hdr vbh;
	vbh.vbh_type = type;
	vbh.vbh_ioprio = 0;
	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));

	/*
	 * Put the header in the first descriptor.  See the block comment at
	 * the top of the file for more details on the chain layout.
	 */
	if (virtio_chain_append(vbr->vbr_chain,
	    virtio_dma_cookie_pa(vbr->vbr_dma, 0),
	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
	    DDI_SUCCESS) {
		vioblk_req_free(vib, vbr);
		return (NULL);
	}

	return (vbr);
}

static int
vioblk_common_submit(vioblk_t *vib, vioblk_req_t *vbr)
{
	virtio_chain_t *vic = vbr->vbr_chain;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * The device will write the status byte into this last descriptor.
	 * See the block comment at the top of the file for more details on
	 * the chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
		vioblk_req_free(vib, vbr);
		return (ENOMEM);
	}

	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(vic, B_TRUE);

	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
		/*
		 * This is not a polled request.  Our request will be freed
		 * and the caller notified later in vioblk_poll().
		 */
		return (0);
	}

	/*
	 * This is a polled request.  We need to block here and wait for the
	 * device to complete request processing.
	 */
	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
		if (ddi_in_panic()) {
			/*
			 * When panicking, interrupts are disabled.  We must
			 * poll the queue manually.
			 */
			drv_usecwait(10);
			(void) vioblk_poll(vib);
			continue;
		}

		/*
		 * When not panicking, the device will interrupt on command
		 * completion and vioblk_poll() will be called to wake us up.
		 */
		cv_wait(&vib->vib_cv, &vib->vib_mutex);
	}

	vioblk_complete(vib, vbr);
	r = vbr->vbr_error;
	vioblk_req_free(vib, vbr);
	return (r);
}
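
/*
 * Descriptive note on the polled-request handshake used above: a request
 * marked VIOBLK_REQSTAT_POLLED is not freed by vioblk_poll().  Instead,
 * vioblk_poll() sets VIOBLK_REQSTAT_POLL_COMPLETE and broadcasts on vib_cv;
 * the thread blocked in vioblk_common_submit() then observes the flag,
 * completes the request, and frees it itself.
 */
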

static int
vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
    uint64_t sector, virtio_direction_t dir)
{
	vioblk_req_t *vbr;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Allocate a polled request.
	 */
	if ((vbr = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
		return (ENOMEM);
	}

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for
	 * more detail on the chain layout.
	 */
	if (dma != NULL) {
		virtio_chain_t *vic = vbr->vbr_chain;
		for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
			if (virtio_chain_append(vic,
			    virtio_dma_cookie_pa(dma, n),
			    virtio_dma_cookie_size(dma, n), dir) !=
			    DDI_SUCCESS) {
				vioblk_req_free(vib, vbr);
				return (ENOMEM);
			}
		}
	}

	return (vioblk_common_submit(vib, vbr));
}

static int
vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
{
	const dkioc_free_list_t *dfl = xfer->x_dfl;
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	virtio_dma_t *dma = NULL;
	struct vioblk_discard_write_zeroes *wzp = NULL;

	dma = virtio_dma_alloc(vib->vib_virtio,
	    dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
	    DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
	if (dma == NULL)
		return (ENOMEM);

	wzp = virtio_dma_va(dma, 0);

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
		uint64_t start = dfl->dfl_offset + exts->dfle_start;

		const struct vioblk_discard_write_zeroes vdwz = {
			.vdwz_sector = start >> DEV_BSHIFT,
			.vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
			.vdwz_flags = 0
		};

		bcopy(&vdwz, wzp, sizeof (*wzp));
	}

	if (virtio_chain_append(vic,
	    virtio_dma_cookie_pa(dma, 0),
	    virtio_dma_cookie_size(dma, 0),
	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
		virtio_dma_free(dma);
		return (ENOMEM);
	}

	return (0);
}

static int
vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
	vioblk_req_t *vbr = NULL;
	uint_t total_cookies = 2;
	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Ensure that this request falls within the advertised size of the
	 * block device.  Be careful to avoid overflow.
	 */
	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
		return (EINVAL);
	}

	if ((vbr = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
	    NULL) {
		return (ENOMEM);
	}
	vbr->vbr_xfer = xfer;

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for
	 * more detail on the chain layout.
	 */
	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
	    xfer->x_nblks > 0) {
		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;
		virtio_chain_t *vic = vbr->vbr_chain;

		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
			ddi_dma_cookie_t dmac;

			if (n == 0) {
				/*
				 * The first cookie is in the blkdev request.
				 */
				dmac = xfer->x_dmac;
			} else {
				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
			}

			if (virtio_chain_append(vic, dmac.dmac_laddress,
			    dmac.dmac_size, dir) != DDI_SUCCESS) {
				vioblk_req_free(vib, vbr);
				return (ENOMEM);
			}
		}

		total_cookies += xfer->x_ndmac;

	} else if (xfer->x_nblks > 0) {
		dev_err(vib->vib_dip, CE_PANIC,
		    "request of type %d had payload length of %lu blocks",
		    type, xfer->x_nblks);
	} else if (type == VIRTIO_BLK_T_DISCARD) {
		int r = vioblk_map_discard(vib, vbr->vbr_chain, xfer);
		if (r != 0) {
			vioblk_req_free(vib, vbr);
			return (r);
		}
	}

	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
	}

	return (vioblk_common_submit(vib, vbr));
}

static int
vioblk_bd_read(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_write(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
		/*
		 * We don't really expect to get here, because if we did not
		 * negotiate the flush feature we would not have installed
		 * this function in the blkdev ops vector.
		 */
		mutex_exit(&vib->vib_mutex);
		return (ENOTSUP);
	}

	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static void
vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	vioblk_t *vib = arg;

	drive->d_qsize = vib->vib_reqs_capacity;
	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_TRUE;
	drive->d_target = 0;
	drive->d_lun = 0;

	drive->d_vendor = "Virtio";
	drive->d_vendor_len = strlen(drive->d_vendor);

	drive->d_product = "Block Device";
	drive->d_product_len = strlen(drive->d_product);

	drive->d_serial = vib->vib_devid;
	drive->d_serial_len = strlen(drive->d_serial);

	drive->d_revision = "0000";
	drive->d_revision_len = strlen(drive->d_revision);

	if (vib->vib_can_discard) {
		drive->d_free_align = vib->vib_discard_sector_align;
		drive->d_max_free_seg = vib->vib_max_discard_seg;
		drive->d_max_free_blks = vib->vib_max_discard_sectors;
		/*
		 * The virtio 1.1 spec doesn't specify a per-segment sector
		 * limit for discards -- only a limit on the total sectors in
		 * a discard request.  Therefore, we assume a vioblk device
		 * must be able to accept a single segment of
		 * vib_max_discard_sectors (when it supports discard
		 * requests), and we use vib_max_discard_sectors both as the
		 * overall limit for a discard request and as the limit for a
		 * single segment.  blkdev will ensure we are never called
		 * with a dkioc_free_list_t that violates either limit.
		 */
		drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
	}
}

static int
vioblk_bd_mediainfo(void *arg, bd_media_t *media)
{
	vioblk_t *vib = (void *)arg;

	/*
	 * The device protocol is specified in terms of 512 byte logical
	 * blocks, regardless of the recommended I/O size which might be
	 * larger.
	 */
	media->m_nblks = vib->vib_nblks;
	media->m_blksize = vib->vib_blk_size;

	media->m_readonly = vib->vib_readonly;
	media->m_pblksize = vib->vib_pblk_size;
	return (0);
}

static void
vioblk_get_id(vioblk_t *vib)
{
	virtio_dma_t *dma;
	int r;

	if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
	    KM_SLEEP)) == NULL) {
		return;
	}

	mutex_enter(&vib->vib_mutex);
	if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
	    VIRTIO_DIR_DEVICE_WRITES)) == 0) {
		const char *b = virtio_dma_va(dma, 0);
		uint_t pos = 0;

		/*
		 * Save the entire response for debugging purposes.
		 */
		bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
		    VIRTIO_BLK_ID_BYTES);

		/*
		 * Process the returned ID.
		 */
		bzero(vib->vib_devid, sizeof (vib->vib_devid));
		for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
			if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
				/*
				 * Accept a subset of printable ASCII
				 * characters.
				 */
				vib->vib_devid[pos++] = b[n];
			} else {
				/*
				 * Stop processing at the first sign of
				 * trouble.
				 */
				break;
			}
		}

		vib->vib_devid_fetched = B_TRUE;
	}
	mutex_exit(&vib->vib_mutex);

	virtio_dma_free(dma);
}

static int
vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
{
	vioblk_t *vib = arg;
	size_t len;

	if ((len = strlen(vib->vib_devid)) == 0) {
		/*
		 * The device has no ID.
		 */
		return (DDI_FAILURE);
	}

	return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
	    devid));
}

static int
vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r = 0;

	/*
	 * Since vib_can_discard is write once (and set during attach),
	 * we can check if it's enabled without taking the mutex.
	 */
	if (!vib->vib_can_discard) {
		return (ENOTSUP);
	}

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

/*
 * As the device completes processing of a request, it returns the chain for
 * that request to our I/O queue.  This routine is called in two contexts:
 *   - from the interrupt handler, in response to notification from the device
 *   - synchronously in line with request processing when panicking
 */
static uint_t
vioblk_poll(vioblk_t *vib)
{
	virtio_chain_t *vic;
	uint_t count = 0;
	boolean_t wakeup = B_FALSE;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
		vioblk_req_t *vbr = virtio_chain_data(vic);
		uint8_t status;

		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);

		bcopy(virtio_dma_va(vbr->vbr_dma,
		    sizeof (struct vioblk_req_hdr)), &status,
		    sizeof (status));

		switch (status) {
		case VIRTIO_BLK_S_OK:
			vbr->vbr_error = 0;
			break;
		case VIRTIO_BLK_S_IOERR:
			vbr->vbr_error = EIO;
			vib->vib_stats->vbs_io_errors.value.ui64++;
			break;
		case VIRTIO_BLK_S_UNSUPP:
			vbr->vbr_error = ENOTTY;
			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
			break;
		default:
			vbr->vbr_error = ENXIO;
			vib->vib_stats->vbs_nxio_errors.value.ui64++;
			break;
		}

		count++;

		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
			/*
			 * This request must not be freed as it is being held
			 * by a call to vioblk_common_submit().
			 */
			VERIFY(!(vbr->vbr_status &
			    VIOBLK_REQSTAT_POLL_COMPLETE));
			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
			wakeup = B_TRUE;
			continue;
		}

		vioblk_complete(vib, vbr);

		vioblk_req_free(vib, vbr);
	}

	if (wakeup) {
		/*
		 * Signal anybody waiting for polled command completion.
		 */
		cv_broadcast(&vib->vib_cv);
	}

	return (count);
}

static uint_t
vioblk_int_handler(caddr_t arg0, caddr_t arg1 __unused)
{
	vioblk_t *vib = (vioblk_t *)arg0;
	uint_t count;

	mutex_enter(&vib->vib_mutex);
	if ((count = vioblk_poll(vib)) >
	    vib->vib_stats->vbs_intr_queuemax.value.ui32) {
		vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
	}

	vib->vib_stats->vbs_intr_total.value.ui64++;
	mutex_exit(&vib->vib_mutex);

	return (DDI_INTR_CLAIMED);
}

static uint_t
vioblk_cfgchange(caddr_t arg0, caddr_t arg1 __unused)
{
	vioblk_t *vib = (vioblk_t *)arg0;

	dev_err(vib->vib_dip, CE_NOTE, "!Configuration changed");

	mutex_enter(&vib->vib_mutex);

	/*
	 * The configuration space of the device has changed in some way.
	 * At present, we only re-read the device capacity and trigger
	 * blkdev to check the device state.
	 */

	if (vioblk_read_capacity(vib) == DDI_FAILURE) {
		mutex_exit(&vib->vib_mutex);
		return (DDI_INTR_CLAIMED);
	}

	mutex_exit(&vib->vib_mutex);

	bd_state_change(vib->vib_bd_h);

	return (DDI_INTR_CLAIMED);
}

static void
vioblk_free_reqs(vioblk_t *vib)
{
	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];

		VERIFY(list_link_active(&vbr->vbr_link));
		list_remove(&vib->vib_reqs, vbr);

		VERIFY0(vbr->vbr_status);

		if (vbr->vbr_chain != NULL) {
			virtio_chain_free(vbr->vbr_chain);
			vbr->vbr_chain = NULL;
		}
		if (vbr->vbr_dma != NULL) {
			virtio_dma_free(vbr->vbr_dma);
			vbr->vbr_dma = NULL;
		}
	}
	VERIFY(list_is_empty(&vib->vib_reqs));

	if (vib->vib_reqs_mem != NULL) {
		kmem_free(vib->vib_reqs_mem,
		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
		vib->vib_reqs_mem = NULL;
		vib->vib_reqs_capacity = 0;
	}
}

static int
vioblk_alloc_reqs(vioblk_t *vib)
{
	vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
	    VIRTIO_BLK_REQ_BUFS);
	vib->vib_reqs_mem = kmem_zalloc(
	    sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
	vib->vib_nreqs_alloc = 0;

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
	}

	for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
	    vbr = list_next(&vib->vib_reqs, vbr)) {
		if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
		    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
		    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}
		vbr->vbr_chain = virtio_chain_alloc(vib->vib_vq, KM_SLEEP);
		if (vbr->vbr_chain == NULL) {
			goto fail;
		}
		virtio_chain_data_set(vbr->vbr_chain, vbr);
	}

	return (0);

fail:
	vioblk_free_reqs(vib);
	return (ENOMEM);
}

static int
vioblk_read_capacity(vioblk_t *vib)
{
	virtio_t *vio = vib->vib_virtio;

	/* The capacity is always available */
	if ((vib->vib_nblks = virtio_dev_get64(vio,
	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
		dev_err(vib->vib_dip, CE_WARN, "invalid capacity");
		return (DDI_FAILURE);
	}

	/*
	 * Determine the optimal logical block size recommended by the device.
	 * This size is advisory; the protocol always deals in 512 byte
	 * blocks.
	 */
	vib->vib_blk_size = DEV_BSIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);

		if (v != 0 && v != PCI_EINVAL32)
			vib->vib_blk_size = v;
	}

	/*
	 * Device capacity is always in 512-byte units, convert to
	 * native blocks.
	 */
	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;
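
	/*
	 * For example (hypothetical numbers): a device advertising a
	 * capacity of 8388608 512-byte sectors with a 4096-byte logical
	 * block size is exposed to blkdev as (8388608 * 512) / 4096 =
	 * 1048576 native blocks.  vioblk_common_start() applies the inverse
	 * conversion when translating a blkdev sector back into protocol
	 * units.
	 */
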

	/*
	 * The device may also provide an advisory physical block size.
	 */
	vib->vib_pblk_size = vib->vib_blk_size;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);

		if (v != PCI_EINVAL8)
			vib->vib_pblk_size <<= v;
	}

	return (DDI_SUCCESS);
}

static int
vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int instance = ddi_get_instance(dip);
	vioblk_t *vib;
	virtio_t *vio;
	boolean_t did_mutex = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
	    NULL) {
		dev_err(dip, CE_WARN, "failed to start Virtio init");
		return (DDI_FAILURE);
	}

	vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
	vib->vib_dip = dip;
	vib->vib_virtio = vio;
	ddi_set_driver_private(dip, vib);
	list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
	    offsetof(vioblk_req_t, vbr_link));

	/*
	 * Determine how many scatter-gather entries we can use in a single
	 * request.
	 */
	vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
		vib->vib_seg_max = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_SEG_MAX);

		if (vib->vib_seg_max == 0 ||
		    vib->vib_seg_max == PCI_EINVAL32) {
			/*
			 * We need to be able to use at least one data
			 * segment, so we'll assume that this device is just
			 * poorly implemented and try for one.
			 */
			vib->vib_seg_max = 1;
		}
	}

	if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
		vib->vib_max_discard_sectors = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
		vib->vib_max_discard_seg = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
		vib->vib_discard_sector_align = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_DISCARD_ALIGN);

		if (vib->vib_max_discard_sectors == 0 ||
		    vib->vib_max_discard_seg == 0 ||
		    vib->vib_discard_sector_align == 0) {
			vib->vib_can_discard = B_FALSE;

			/*
			 * The hypervisor shouldn't be giving us bad values.
			 * If it is, it's probably worth notifying the
			 * operator.
			 */
			dev_err(dip, CE_NOTE,
			    "Host is advertising DISCARD support but with "
			    "bad parameters: max_discard_sectors=%u, "
			    "max_discard_segments=%u, discard_sector_align=%u",
			    vib->vib_max_discard_sectors,
			    vib->vib_max_discard_seg,
			    vib->vib_discard_sector_align);
		} else {
			vib->vib_can_discard = B_TRUE;
		}
	}

	/*
	 * When allocating the request queue, we include two additional
	 * descriptors (beyond those required for request data) to account
	 * for the header and the status byte.
	 */
	if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
	    vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
		goto fail;
	}

	virtio_register_cfgchange_handler(vio, vioblk_cfgchange, vib);

	if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to complete Virtio init");
		goto fail;
	}

	cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	did_mutex = B_TRUE;

	if ((vib->vib_kstat = kstat_create("vioblk", instance,
	    "statistics", "controller", KSTAT_TYPE_NAMED,
	    sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_PERSISTENT)) == NULL) {
		dev_err(dip, CE_WARN, "kstat_create failed");
		goto fail;
	}
	vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
	kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
	    "total_rw_outofmemory", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
	    "total_rw_badoffset", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_intr_total,
	    "total_intr", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_io_errors,
	    "total_io_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
	    "total_unsupp_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
	    "total_nxio_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
	    "total_rw_cacheflush", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
	    "max_rw_cookies", KSTAT_DATA_UINT32);
	kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
	    "max_intr_queue", KSTAT_DATA_UINT32);
	kstat_install(vib->vib_kstat);

	vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);

	if (vioblk_read_capacity(vib) == DDI_FAILURE)
		goto fail;

	/*
	 * The maximum size for a cookie in a request.
	 */
	vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_seg_size_max = v;
		}
	}

	/*
	 * Set up the DMA attributes for blkdev to use for request data.  The
	 * specification is not extremely clear about whether DMA-related
	 * parameters include or exclude the header and status descriptors.
	 * For now, we assume they cover only the request data and not the
	 * headers.
	 */
	vib->vib_bd_dma_attr = vioblk_dma_attr;
	vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
	vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
	vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
	    vib->vib_seg_size_max;
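
	/*
	 * As a hypothetical example: with vib_seg_max = 64 and
	 * vib_seg_size_max = 65536, blkdev may hand us up to 64 cookies of
	 * at most 64 KiB each, for a maximum transfer of 4 MiB per request.
	 */
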

	if (vioblk_alloc_reqs(vib) != 0) {
		goto fail;
	}

	/*
	 * The blkdev framework does not provide a way to specify that the
	 * device does not support write cache flushing, except by omitting
	 * the "o_sync_cache" member from the ops vector.  As
	 * "bd_alloc_handle()" makes a copy of the ops vector, we can safely
	 * assemble one on the stack based on negotiated features.
	 *
	 * Similarly, the blkdev framework does not provide a way to indicate
	 * if a device supports a TRIM/UNMAP/DISCARD type operation except
	 * by omitting the "o_free_space" member from the ops vector.
	 */
	bd_ops_t vioblk_bd_ops = {
		.o_version = BD_OPS_CURRENT_VERSION,
		.o_drive_info = vioblk_bd_driveinfo,
		.o_media_info = vioblk_bd_mediainfo,
		.o_devid_init = vioblk_bd_devid,
		.o_sync_cache = vioblk_bd_flush,
		.o_read = vioblk_bd_read,
		.o_write = vioblk_bd_write,
		.o_free_space = vioblk_bd_free_space,
	};
	if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
		vioblk_bd_ops.o_sync_cache = NULL;
	}
	if (!vib->vib_can_discard) {
		vioblk_bd_ops.o_free_space = NULL;
	}

	vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
	    &vib->vib_bd_dma_attr, KM_SLEEP);

	/*
	 * Enable interrupts now so that we can request the device identity.
	 */
	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
		goto fail;
	}

	vioblk_get_id(vib);

	if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "Failed to attach blkdev");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (vib->vib_bd_h != NULL) {
		(void) bd_detach_handle(vib->vib_bd_h);
		bd_free_handle(vib->vib_bd_h);
	}
	if (vio != NULL) {
		(void) virtio_fini(vio, B_TRUE);
	}
	if (did_mutex) {
		mutex_destroy(&vib->vib_mutex);
		cv_destroy(&vib->vib_cv);
	}
	if (vib->vib_kstat != NULL) {
		kstat_delete(vib->vib_kstat);
	}
	vioblk_free_reqs(vib);
	kmem_free(vib, sizeof (*vib));
	return (DDI_FAILURE);
}

static int
vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vioblk_t *vib = ddi_get_driver_private(dip);

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vib->vib_mutex);
	if (vib->vib_nreqs_alloc > 0) {
		/*
		 * Cannot detach while there are still outstanding requests.
		 */
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	/*
	 * Tear down the Virtio framework before freeing the rest of the
	 * resources.  This will ensure the interrupt handlers are no longer
	 * running.
	 */
	virtio_fini(vib->vib_virtio, B_FALSE);

	vioblk_free_reqs(vib);
	kstat_delete(vib->vib_kstat);

	mutex_exit(&vib->vib_mutex);
	mutex_destroy(&vib->vib_mutex);

	kmem_free(vib, sizeof (*vib));

	return (DDI_SUCCESS);
}

static int
vioblk_quiesce(dev_info_t *dip)
{
	vioblk_t *vib;

	if ((vib = ddi_get_driver_private(dip)) == NULL) {
		return (DDI_FAILURE);
	}

	return (virtio_quiesce(vib->vib_virtio));
}

int
_init(void)
{
	int rv;

	bd_mod_init(&vioblk_dev_ops);

	if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_fini(void)
{
	int rv;

	if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&vioblk_modlinkage, modinfop));
}