1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
25 * Copyright 2020 Joyent Inc.
26 * Copyright 2019 Western Digital Corporation.
27 * Copyright 2020 Oxide Computer Company
28 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
29 */
30
31 /*
32 * VIRTIO BLOCK DRIVER
33 *
34 * This driver provides support for Virtio Block devices. Each driver instance
35 * attaches to a single underlying block device.
36 *
37 * REQUEST CHAIN LAYOUT
38 *
39 * Every request chain sent to the I/O queue has the following structure. Each
40 * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
41 * the chain:
42 *
43 * +-0-----------------------------------------+
44 * | struct virtio_blk_hdr |-----------------------\
45 * | (written by driver, read by device) | |
46 * +-1-----------------------------------------+ |
47 * | optional data payload |--\ |
48 * | (written by driver for write requests, | | |
49 * | or by device for read requests) | | |
50 * +-2-----------------------------------------+ | |
51 * | ,~` : |-cookies loaned |
52 * |/ : ,~`| | from blkdev |
53 * : / | | |
54 * +-(N - 1)-----------------------------------+ | |
55 * | ... end of data payload. | | |
56 * | | | |
57 * | |--/ |
58 * +-N-----------------------------------------+ |
59 * | status byte | |
60 * | (written by device, read by driver) |--------------------\ |
61 * +-------------------------------------------+ | |
62 * | |
63 * The memory for the header and status bytes (i.e., 0 and N above) | |
64 * is allocated as a single chunk by vioblk_alloc_reqs(): | |
65 * | |
66 * +-------------------------------------------+ | |
67 * | struct virtio_blk_hdr |<----------------------/
68 * +-------------------------------------------+ |
69 * | status byte |<-------------------/
70 * +-------------------------------------------+
71 */
72
73 #include <sys/modctl.h>
74 #include <sys/blkdev.h>
75 #include <sys/types.h>
76 #include <sys/errno.h>
77 #include <sys/param.h>
78 #include <sys/stropts.h>
79 #include <sys/stream.h>
80 #include <sys/strsubr.h>
81 #include <sys/kmem.h>
82 #include <sys/conf.h>
83 #include <sys/devops.h>
84 #include <sys/ksynch.h>
85 #include <sys/stat.h>
86 #include <sys/modctl.h>
87 #include <sys/debug.h>
88 #include <sys/pci.h>
89 #include <sys/containerof.h>
90 #include <sys/ctype.h>
91 #include <sys/sysmacros.h>
92 #include <sys/dkioc_free_util.h>
93
94 #include "virtio.h"
95 #include "vioblk.h"
96
97 static void vioblk_get_id(vioblk_t *);
98 static uint_t vioblk_int_handler(caddr_t, caddr_t);
99 static uint_t vioblk_poll(vioblk_t *);
100 static int vioblk_quiesce(dev_info_t *);
101 static int vioblk_read_capacity(vioblk_t *);
102 static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
103 static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);
104
105
/*
 * Autoconfiguration entry points.  All character/block device access is
 * mediated by the blkdev framework (see bd_attach_handle() in
 * vioblk_attach()), so no cb_ops vector is supplied here; bd_mod_init()
 * fills in the shared blkdev entry points at module load time.
 */
static struct dev_ops vioblk_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,

	.devo_attach = vioblk_attach,
	.devo_detach = vioblk_detach,
	.devo_quiesce = vioblk_quiesce,

	/* Unsupported or blkdev-provided operations: */
	.devo_getinfo = ddi_no_info,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_reset = nodev,
	.devo_cb_ops = NULL,
	.devo_bus_ops = NULL,
	.devo_power = NULL,
};
122
/*
 * Loadable module linkage: this is a leaf driver module containing a single
 * driver, registered through the dev_ops vector above.
 */
static struct modldrv vioblk_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "VIRTIO block driver",
	.drv_dev_ops = &vioblk_dev_ops
};

static struct modlinkage vioblk_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &vioblk_modldrv, NULL }
};
133
/*
 * DMA attribute template for header and status blocks.  We also make a
 * per-instance copy of this template with negotiated sizes from the device for
 * blkdev (see vib_bd_dma_attr in vioblk_attach(), where dma_attr_sgllen,
 * dma_attr_count_max, and dma_attr_maxxfer are overridden).
 */
static const ddi_dma_attr_t vioblk_dma_attr = {
	.dma_attr_version = DMA_ATTR_V0,
	/* Any 64-bit address is acceptable: */
	.dma_attr_addr_lo = 0x0000000000000000,
	.dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max = 0x00000000FFFFFFFF,
	/* No special alignment or burst requirements: */
	.dma_attr_align = 1,
	.dma_attr_burstsizes = 1,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0x00000000FFFFFFFF,
	.dma_attr_seg = 0x00000000FFFFFFFF,
	/* A single cookie suffices for the small header/status region: */
	.dma_attr_sgllen = 1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};
153
154 static vioblk_req_t *
vioblk_req_alloc(vioblk_t * vib)155 vioblk_req_alloc(vioblk_t *vib)
156 {
157 vioblk_req_t *vbr;
158
159 VERIFY(MUTEX_HELD(&vib->vib_mutex));
160
161 if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
162 return (NULL);
163 }
164 vib->vib_nreqs_alloc++;
165
166 VERIFY0(vbr->vbr_status);
167 vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;
168
169 VERIFY3P(vbr->vbr_chain, !=, NULL);
170 VERIFY3P(vbr->vbr_xfer, ==, NULL);
171 VERIFY3S(vbr->vbr_error, ==, 0);
172
173 return (vbr);
174 }
175
176 static void
vioblk_req_free(vioblk_t * vib,vioblk_req_t * vbr)177 vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
178 {
179 VERIFY(MUTEX_HELD(&vib->vib_mutex));
180
181 /*
182 * Check that this request was allocated, then zero the status field to
183 * clear all status bits.
184 */
185 VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
186 vbr->vbr_status = 0;
187
188 vbr->vbr_xfer = NULL;
189 vbr->vbr_error = 0;
190 vbr->vbr_type = 0;
191 virtio_chain_clear(vbr->vbr_chain);
192
193 list_insert_head(&vib->vib_reqs, vbr);
194
195 VERIFY3U(vib->vib_nreqs_alloc, >, 0);
196 vib->vib_nreqs_alloc--;
197 }
198
199 static void
vioblk_complete(vioblk_t * vib,vioblk_req_t * vbr)200 vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
201 {
202 VERIFY(MUTEX_HELD(&vib->vib_mutex));
203
204 VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
205 vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;
206
207 if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
208 vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
209 }
210
211 if (vbr->vbr_xfer != NULL) {
212 /*
213 * This is a blkdev framework request.
214 */
215 mutex_exit(&vib->vib_mutex);
216 bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
217 mutex_enter(&vib->vib_mutex);
218 vbr->vbr_xfer = NULL;
219 }
220 }
221
/*
 * Begin assembly of a request of the given virtio block type (e.g.,
 * VIRTIO_BLK_T_IN/OUT/FLUSH/GET_ID/DISCARD) targeting the given logical
 * sector: allocate a request, optionally mark it for polled completion,
 * fill in the protocol header, and append the header as the first
 * descriptor in the chain.  Returns NULL (after updating the out-of-memory
 * statistic where relevant) on failure.  vib_mutex must be held by the
 * caller (enforced inside vioblk_req_alloc()).
 */
static vioblk_req_t *
vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
    boolean_t polled)
{
	vioblk_req_t *vbr = NULL;

	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		return (NULL);
	}
	vbr->vbr_type = type;

	if (polled) {
		/*
		 * Mark this command as polled so that we can wait on it
		 * ourselves.
		 */
		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
	}

	struct vioblk_req_hdr vbh;
	vbh.vbh_type = type;
	vbh.vbh_ioprio = 0;
	/*
	 * The protocol always addresses 512-byte (DEV_BSIZE) sectors;
	 * convert from our possibly larger logical block size.
	 */
	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));

	/*
	 * Put the header in the first descriptor.  See the block comment at
	 * the top of the file for more details on the chain layout.
	 */
	if (virtio_chain_append(vbr->vbr_chain,
	    virtio_dma_cookie_pa(vbr->vbr_dma, 0),
	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
	    DDI_SUCCESS) {
		vioblk_req_free(vib, vbr);
		return (NULL);
	}

	return (vbr);
}
262
/*
 * Finish and submit a chain started by vioblk_common_start(): append the
 * trailing status descriptor, sync the header/status DMA region for the
 * device, and push the chain onto the I/O queue.  For non-polled requests
 * this returns 0 immediately; completion and request teardown happen later
 * in vioblk_poll().  For polled requests this blocks until the device
 * signals completion, then frees the request and returns its error code.
 * vib_mutex must be held and is dropped/reacquired by cv_wait().
 */
static int
vioblk_common_submit(vioblk_t *vib, vioblk_req_t *vbr)
{
	virtio_chain_t *vic = vbr->vbr_chain;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * The device will write the status byte into this last descriptor.
	 * See the block comment at the top of the file for more details on the
	 * chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
		vioblk_req_free(vib, vbr);
		return (ENOMEM);
	}

	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(vic, B_TRUE);

	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
		/*
		 * This is not a polled request.  Our request will be freed and
		 * the caller notified later in vioblk_poll().
		 */
		return (0);
	}

	/*
	 * This is a polled request.  We need to block here and wait for the
	 * device to complete request processing.
	 */
	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
		if (ddi_in_panic()) {
			/*
			 * When panicking, interrupts are disabled.  We must
			 * poll the queue manually.
			 */
			drv_usecwait(10);
			(void) vioblk_poll(vib);
			continue;
		}

		/*
		 * When not panicking, the device will interrupt on command
		 * completion and vioblk_poll() will be called to wake us up.
		 */
		cv_wait(&vib->vib_cv, &vib->vib_mutex);
	}

	vioblk_complete(vib, vbr);
	r = vbr->vbr_error;
	vioblk_req_free(vib, vbr);
	return (r);
}
321
322 static int
vioblk_internal(vioblk_t * vib,int type,virtio_dma_t * dma,uint64_t sector,virtio_direction_t dir)323 vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
324 uint64_t sector, virtio_direction_t dir)
325 {
326 vioblk_req_t *vbr;
327
328 VERIFY(MUTEX_HELD(&vib->vib_mutex));
329
330 /*
331 * Allocate a polled request.
332 */
333 if ((vbr = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
334 return (ENOMEM);
335 }
336
337 /*
338 * If there is a request payload, it goes between the header and the
339 * status byte. See the block comment at the top of the file for more
340 * detail on the chain layout.
341 */
342 if (dma != NULL) {
343 virtio_chain_t *vic = vbr->vbr_chain;
344 for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
345 if (virtio_chain_append(vic,
346 virtio_dma_cookie_pa(dma, n),
347 virtio_dma_cookie_size(dma, n), dir) !=
348 DDI_SUCCESS) {
349 vioblk_req_free(vib, vbr);
350 return (ENOMEM);
351 }
352 }
353 }
354
355 return (vioblk_common_submit(vib, vbr));
356 }
357
/*
 * Build the payload for a DISCARD request: an array of
 * vioblk_discard_write_zeroes extent descriptors, one per entry in the
 * caller's dkioc_free_list_t, placed in a freshly allocated DMA region
 * that the device reads.  Returns 0 on success or ENOMEM.
 *
 * NOTE(review): the virtio_dma_t allocated here is referenced only by the
 * descriptor chain; no code visible in this file frees it when the request
 * completes.  Confirm whether chain teardown reclaims it, or whether this
 * allocation is leaked once per discard request.
 */
static int
vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
{
	const dkioc_free_list_t *dfl = xfer->x_dfl;
	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	virtio_dma_t *dma = NULL;
	struct vioblk_discard_write_zeroes *wzp = NULL;

	dma = virtio_dma_alloc(vib->vib_virtio,
	    dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
	    DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
	if (dma == NULL)
		return (ENOMEM);

	wzp = virtio_dma_va(dma, 0);

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
		uint64_t start = dfl->dfl_offset + exts->dfle_start;

		/*
		 * Extents are expressed to the device in 512-byte sectors;
		 * blkdev guarantees both start and length are suitably
		 * aligned (see d_free_align in vioblk_bd_driveinfo()).
		 */
		const struct vioblk_discard_write_zeroes vdwz = {
			.vdwz_sector = start >> DEV_BSHIFT,
			.vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
			.vdwz_flags = 0
		};

		bcopy(&vdwz, wzp, sizeof (*wzp));
	}

	/*
	 * The extent array occupies a single cookie (dma_attr_sgllen == 1
	 * in vioblk_dma_attr), appended as a device-readable descriptor.
	 */
	if (virtio_chain_append(vic,
	    virtio_dma_cookie_pa(dma, 0),
	    virtio_dma_cookie_size(dma, 0),
	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
		virtio_dma_free(dma);
		return (ENOMEM);
	}

	return (0);
}
396
/*
 * Translate a blkdev transfer (read, write, flush, or discard) into a
 * virtio request chain and submit it.  Range-checks the request against
 * the advertised device size, builds the header/payload/status chain, and
 * hands off to vioblk_common_submit().  Returns 0 on success or an errno
 * value.  The caller must hold vib_mutex.
 */
static int
vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
	vioblk_req_t *vbr = NULL;
	/* Every chain carries at least the header and status descriptors. */
	uint_t total_cookies = 2;
	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Ensure that this request falls within the advertised size of the
	 * block device.  Be careful to avoid overflow.
	 */
	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
		return (EINVAL);
	}

	if ((vbr = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
	    NULL) {
		return (ENOMEM);
	}
	vbr->vbr_xfer = xfer;

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
	    xfer->x_nblks > 0) {
		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;
		virtio_chain_t *vic = vbr->vbr_chain;

		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
			ddi_dma_cookie_t dmac;

			if (n == 0) {
				/*
				 * The first cookie is in the blkdev request.
				 */
				dmac = xfer->x_dmac;
			} else {
				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
			}

			if (virtio_chain_append(vic, dmac.dmac_laddress,
			    dmac.dmac_size, dir) != DDI_SUCCESS) {
				vioblk_req_free(vib, vbr);
				return (ENOMEM);
			}
		}

		total_cookies += xfer->x_ndmac;

	} else if (xfer->x_nblks > 0) {
		/* Only reads and writes may carry a block payload. */
		dev_err(vib->vib_dip, CE_PANIC,
		    "request of type %d had payload length of %lu blocks", type,
		    xfer->x_nblks);
	} else if (type == VIRTIO_BLK_T_DISCARD) {
		int r = vioblk_map_discard(vib, vbr->vbr_chain, xfer);
		if (r != 0) {
			vioblk_req_free(vib, vbr);
			return (r);
		}
	}

	/* Track the largest chain we have built, for observability. */
	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
	}

	return (vioblk_common_submit(vib, vbr));
}
472
473 static int
vioblk_bd_read(void * arg,bd_xfer_t * xfer)474 vioblk_bd_read(void *arg, bd_xfer_t *xfer)
475 {
476 vioblk_t *vib = arg;
477 int r;
478
479 mutex_enter(&vib->vib_mutex);
480 r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
481 mutex_exit(&vib->vib_mutex);
482
483 return (r);
484 }
485
486 static int
vioblk_bd_write(void * arg,bd_xfer_t * xfer)487 vioblk_bd_write(void *arg, bd_xfer_t *xfer)
488 {
489 vioblk_t *vib = arg;
490 int r;
491
492 mutex_enter(&vib->vib_mutex);
493 r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
494 mutex_exit(&vib->vib_mutex);
495
496 return (r);
497 }
498
499 static int
vioblk_bd_flush(void * arg,bd_xfer_t * xfer)500 vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
501 {
502 vioblk_t *vib = arg;
503 int r;
504
505 mutex_enter(&vib->vib_mutex);
506 if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
507 /*
508 * We don't really expect to get here, because if we did not
509 * negotiate the flush feature we would not have installed this
510 * function in the blkdev ops vector.
511 */
512 mutex_exit(&vib->vib_mutex);
513 return (ENOTSUP);
514 }
515
516 r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
517 mutex_exit(&vib->vib_mutex);
518
519 return (r);
520 }
521
/*
 * blkdev o_drive_info entry point: describe fixed properties of the drive.
 * Called after attach; all values reported here were established during
 * vioblk_attach() and do not change afterwards.
 */
static void
vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	vioblk_t *vib = arg;

	/* Bound the number of outstanding blkdev requests to our free list. */
	drive->d_qsize = vib->vib_reqs_capacity;
	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_TRUE;
	drive->d_target = 0;
	drive->d_lun = 0;

	drive->d_vendor = "Virtio";
	drive->d_vendor_len = strlen(drive->d_vendor);

	drive->d_product = "Block Device";
	drive->d_product_len = strlen(drive->d_product);

	/* Device identity as fetched (and sanitized) by vioblk_get_id(). */
	drive->d_serial = vib->vib_devid;
	drive->d_serial_len = strlen(drive->d_serial);

	drive->d_revision = "0000";
	drive->d_revision_len = strlen(drive->d_revision);

	if (vib->vib_can_discard) {
		drive->d_free_align = vib->vib_discard_sector_align;
		drive->d_max_free_seg = vib->vib_max_discard_seg;
		drive->d_max_free_blks = vib->vib_max_discard_sectors;
		/*
		 * The virtio 1.1 spec doesn't specify a per segment sector
		 * limit for discards -- only a limit on the total sectors in
		 * a discard request. Therefore, we assume a vioblk device must
		 * be able to accept a single segment of vib_max_discard_sectors
		 * (when it supports discard requests) and use
		 * vib_max_discard_sectors both for the overall limit for
		 * a discard request, but also as the limit for a single
		 * segment. blkdev will ensure we are never called with
		 * a dkioc_free_list_t that violates either limit.
		 */
		drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
	}
}
563
564 static int
vioblk_bd_mediainfo(void * arg,bd_media_t * media)565 vioblk_bd_mediainfo(void *arg, bd_media_t *media)
566 {
567 vioblk_t *vib = (void *)arg;
568
569 /*
570 * The device protocol is specified in terms of 512 byte logical
571 * blocks, regardless of the recommended I/O size which might be
572 * larger.
573 */
574 media->m_nblks = vib->vib_nblks;
575 media->m_blksize = vib->vib_blk_size;
576
577 media->m_readonly = vib->vib_readonly;
578 media->m_pblksize = vib->vib_pblk_size;
579 return (0);
580 }
581
582 static void
vioblk_get_id(vioblk_t * vib)583 vioblk_get_id(vioblk_t *vib)
584 {
585 virtio_dma_t *dma;
586 int r;
587
588 if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
589 &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
590 KM_SLEEP)) == NULL) {
591 return;
592 }
593
594 mutex_enter(&vib->vib_mutex);
595 if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
596 VIRTIO_DIR_DEVICE_WRITES)) == 0) {
597 const char *b = virtio_dma_va(dma, 0);
598 uint_t pos = 0;
599
600 /*
601 * Save the entire response for debugging purposes.
602 */
603 bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
604 VIRTIO_BLK_ID_BYTES);
605
606 /*
607 * Process the returned ID.
608 */
609 bzero(vib->vib_devid, sizeof (vib->vib_devid));
610 for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
611 if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
612 /*
613 * Accept a subset of printable ASCII
614 * characters.
615 */
616 vib->vib_devid[pos++] = b[n];
617 } else {
618 /*
619 * Stop processing at the first sign of
620 * trouble.
621 */
622 break;
623 }
624 }
625
626 vib->vib_devid_fetched = B_TRUE;
627 }
628 mutex_exit(&vib->vib_mutex);
629
630 virtio_dma_free(dma);
631 }
632
633 static int
vioblk_bd_devid(void * arg,dev_info_t * dip,ddi_devid_t * devid)634 vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
635 {
636 vioblk_t *vib = arg;
637 size_t len;
638
639 if ((len = strlen(vib->vib_devid)) == 0) {
640 /*
641 * The device has no ID.
642 */
643 return (DDI_FAILURE);
644 }
645
646 return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
647 devid));
648 }
649
650 static int
vioblk_bd_free_space(void * arg,bd_xfer_t * xfer)651 vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
652 {
653 vioblk_t *vib = arg;
654 int r = 0;
655
656 /*
657 * Since vib_can_discard is write once (and set during attach),
658 * we can check if it's enabled without taking the mutex.
659 */
660 if (!vib->vib_can_discard) {
661 return (ENOTSUP);
662 }
663
664 mutex_enter(&vib->vib_mutex);
665 r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
666 mutex_exit(&vib->vib_mutex);
667
668 return (r);
669 }
670
/*
 * As the device completes processing of a request, it returns the chain for
 * that request to our I/O queue. This routine is called in two contexts:
 *   - from the interrupt handler, in response to notification from the device
 *   - synchronously in line with request processing when panicking
 *
 * Drains every completed chain from the queue, translating the device's
 * status byte into an errno on the request, completing (and freeing)
 * non-polled requests, and flagging polled requests for the waiter in
 * vioblk_common_submit().  Returns the number of chains processed.  The
 * caller must hold vib_mutex.
 */
static uint_t
vioblk_poll(vioblk_t *vib)
{
	virtio_chain_t *vic;
	uint_t count = 0;
	boolean_t wakeup = B_FALSE;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
		vioblk_req_t *vbr = virtio_chain_data(vic);
		uint8_t status;

		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);

		/*
		 * The status byte immediately follows the request header in
		 * the per-request DMA region (see chain layout comment at
		 * the top of the file).
		 */
		bcopy(virtio_dma_va(vbr->vbr_dma,
		    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));

		/* Map the protocol status to an errno for the consumer. */
		switch (status) {
		case VIRTIO_BLK_S_OK:
			vbr->vbr_error = 0;
			break;
		case VIRTIO_BLK_S_IOERR:
			vbr->vbr_error = EIO;
			vib->vib_stats->vbs_io_errors.value.ui64++;
			break;
		case VIRTIO_BLK_S_UNSUPP:
			vbr->vbr_error = ENOTTY;
			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
			break;
		default:
			vbr->vbr_error = ENXIO;
			vib->vib_stats->vbs_nxio_errors.value.ui64++;
			break;
		}

		count++;

		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
			/*
			 * This request must not be freed as it is being held
			 * by a call to vioblk_common_submit().
			 */
			VERIFY(!(vbr->vbr_status &
			    VIOBLK_REQSTAT_POLL_COMPLETE));
			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
			wakeup = B_TRUE;
			continue;
		}

		vioblk_complete(vib, vbr);

		vioblk_req_free(vib, vbr);
	}

	if (wakeup) {
		/*
		 * Signal anybody waiting for polled command completion.
		 */
		cv_broadcast(&vib->vib_cv);
	}

	return (count);
}
741
742 static uint_t
vioblk_int_handler(caddr_t arg0,caddr_t arg1 __unused)743 vioblk_int_handler(caddr_t arg0, caddr_t arg1 __unused)
744 {
745 vioblk_t *vib = (vioblk_t *)arg0;
746 uint_t count;
747
748 mutex_enter(&vib->vib_mutex);
749 if ((count = vioblk_poll(vib)) >
750 vib->vib_stats->vbs_intr_queuemax.value.ui32) {
751 vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
752 }
753
754 vib->vib_stats->vbs_intr_total.value.ui64++;
755 mutex_exit(&vib->vib_mutex);
756
757 return (DDI_INTR_CLAIMED);
758 }
759
760 static uint_t
vioblk_cfgchange(caddr_t arg0,caddr_t arg1 __unused)761 vioblk_cfgchange(caddr_t arg0, caddr_t arg1 __unused)
762 {
763 vioblk_t *vib = (vioblk_t *)arg0;
764
765 dev_err(vib->vib_dip, CE_NOTE, "!Configuration changed");
766
767 mutex_enter(&vib->vib_mutex);
768
769 /*
770 * The configuration space of the device has changed in some way.
771 * At present, we only re-read the device capacity and trigger
772 * blkdev to check the device state.
773 */
774
775 if (vioblk_read_capacity(vib) == DDI_FAILURE) {
776 mutex_exit(&vib->vib_mutex);
777 return (DDI_INTR_CLAIMED);
778 }
779
780 mutex_exit(&vib->vib_mutex);
781
782 bd_state_change(vib->vib_bd_h);
783
784 return (DDI_INTR_CLAIMED);
785 }
786
/*
 * Tear down the request array built by vioblk_alloc_reqs(): unlink every
 * request from the free list, release its chain and DMA resources, and
 * free the backing array.  Also called on the vioblk_alloc_reqs() failure
 * path, where some requests may not yet have a chain or DMA allocation
 * (hence the NULL checks).  No requests may be outstanding.
 */
static void
vioblk_free_reqs(vioblk_t *vib)
{
	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];

		/*
		 * Every request must be on the free list; vioblk_alloc_reqs()
		 * links them all before attempting any allocation.
		 */
		VERIFY(list_link_active(&vbr->vbr_link));
		list_remove(&vib->vib_reqs, vbr);

		VERIFY0(vbr->vbr_status);

		if (vbr->vbr_chain != NULL) {
			virtio_chain_free(vbr->vbr_chain);
			vbr->vbr_chain = NULL;
		}
		if (vbr->vbr_dma != NULL) {
			virtio_dma_free(vbr->vbr_dma);
			vbr->vbr_dma = NULL;
		}
	}
	VERIFY(list_is_empty(&vib->vib_reqs));

	if (vib->vib_reqs_mem != NULL) {
		kmem_free(vib->vib_reqs_mem,
		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
		vib->vib_reqs_mem = NULL;
		vib->vib_reqs_capacity = 0;
	}
}
818
819 static int
vioblk_alloc_reqs(vioblk_t * vib)820 vioblk_alloc_reqs(vioblk_t *vib)
821 {
822 vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
823 VIRTIO_BLK_REQ_BUFS);
824 vib->vib_reqs_mem = kmem_zalloc(
825 sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
826 vib->vib_nreqs_alloc = 0;
827
828 for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
829 list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
830 }
831
832 for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
833 vbr = list_next(&vib->vib_reqs, vbr)) {
834 if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
835 sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
836 &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
837 KM_SLEEP)) == NULL) {
838 goto fail;
839 }
840 vbr->vbr_chain = virtio_chain_alloc(vib->vib_vq, KM_SLEEP);
841 if (vbr->vbr_chain == NULL) {
842 goto fail;
843 }
844 virtio_chain_data_set(vbr->vbr_chain, vbr);
845 }
846
847 return (0);
848
849 fail:
850 vioblk_free_reqs(vib);
851 return (ENOMEM);
852 }
853
/*
 * Read the device's capacity and block-size configuration, populating
 * vib_nblks, vib_blk_size, and vib_pblk_size.  Called both at attach and
 * from the configuration-change interrupt.  Returns DDI_SUCCESS or
 * DDI_FAILURE.  Callers hold vib_mutex.
 */
static int
vioblk_read_capacity(vioblk_t *vib)
{
	virtio_t *vio = vib->vib_virtio;

	/* The capacity is always available */
	if ((vib->vib_nblks = virtio_dev_get64(vio,
	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
		/* UINT64_MAX doubles as the invalid-read sentinel here. */
		dev_err(vib->vib_dip, CE_WARN, "invalid capacity");
		return (DDI_FAILURE);
	}

	/*
	 * Determine the optimal logical block size recommended by the device.
	 * This size is advisory; the protocol always deals in 512 byte blocks.
	 */
	vib->vib_blk_size = DEV_BSIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);

		if (v != 0 && v != PCI_EINVAL32)
			vib->vib_blk_size = v;
	}

	/*
	 * Device capacity is always in 512-byte units, convert to
	 * native blocks.
	 */
	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;

	/*
	 * The device may also provide an advisory physical block size,
	 * expressed as a power-of-two multiple of the logical size.
	 */
	vib->vib_pblk_size = vib->vib_blk_size;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);

		if (v != PCI_EINVAL8)
			vib->vib_pblk_size <<= v;
	}

	return (DDI_SUCCESS);
}
897
898 static int
vioblk_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)899 vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
900 {
901 int instance = ddi_get_instance(dip);
902 vioblk_t *vib;
903 virtio_t *vio;
904 boolean_t did_mutex = B_FALSE;
905
906 if (cmd != DDI_ATTACH) {
907 return (DDI_FAILURE);
908 }
909
910 if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
911 NULL) {
912 dev_err(dip, CE_WARN, "failed to start Virtio init");
913 return (DDI_FAILURE);
914 }
915
916 vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
917 vib->vib_dip = dip;
918 vib->vib_virtio = vio;
919 ddi_set_driver_private(dip, vib);
920 list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
921 offsetof(vioblk_req_t, vbr_link));
922
923 /*
924 * Determine how many scatter-gather entries we can use in a single
925 * request.
926 */
927 vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
928 if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
929 vib->vib_seg_max = virtio_dev_get32(vio,
930 VIRTIO_BLK_CONFIG_SEG_MAX);
931
932 if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
933 /*
934 * We need to be able to use at least one data segment,
935 * so we'll assume that this device is just poorly
936 * implemented and try for one.
937 */
938 vib->vib_seg_max = 1;
939 }
940 }
941
942 if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
943 vib->vib_max_discard_sectors = virtio_dev_get32(vio,
944 VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
945 vib->vib_max_discard_seg = virtio_dev_get32(vio,
946 VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
947 vib->vib_discard_sector_align = virtio_dev_get32(vio,
948 VIRTIO_BLK_CONFIG_DISCARD_ALIGN);
949
950 if (vib->vib_max_discard_sectors == 0 ||
951 vib->vib_max_discard_seg == 0 ||
952 vib->vib_discard_sector_align == 0) {
953 vib->vib_can_discard = B_FALSE;
954
955 /*
956 * The hypervisor shouldn't be giving us bad values.
957 * If it is, it's probably worth notifying the
958 * operator.
959 */
960 dev_err(dip, CE_NOTE,
961 "Host is advertising DISCARD support but with bad"
962 "parameters: max_discard_sectors=%u, "
963 "max_discard_segments=%u, discard_sector_align=%u",
964 vib->vib_max_discard_sectors,
965 vib->vib_max_discard_seg,
966 vib->vib_discard_sector_align);
967 } else {
968 vib->vib_can_discard = B_TRUE;
969 }
970 }
971
972 /*
973 * When allocating the request queue, we include two additional
974 * descriptors (beyond those required for request data) to account for
975 * the header and the status byte.
976 */
977 if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
978 vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
979 goto fail;
980 }
981
982 virtio_register_cfgchange_handler(vio, vioblk_cfgchange, vib);
983
984 if (virtio_init_complete(vio, VIRTIO_ANY_INTR_TYPE) != DDI_SUCCESS) {
985 dev_err(dip, CE_WARN, "failed to complete Virtio init");
986 goto fail;
987 }
988
989 cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
990 mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
991 did_mutex = B_TRUE;
992
993 if ((vib->vib_kstat = kstat_create("vioblk", instance,
994 "statistics", "controller", KSTAT_TYPE_NAMED,
995 sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
996 KSTAT_FLAG_PERSISTENT)) == NULL) {
997 dev_err(dip, CE_WARN, "kstat_create failed");
998 goto fail;
999 }
1000 vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
1001 kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
1002 "total_rw_outofmemory", KSTAT_DATA_UINT64);
1003 kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
1004 "total_rw_badoffset", KSTAT_DATA_UINT64);
1005 kstat_named_init(&vib->vib_stats->vbs_intr_total,
1006 "total_intr", KSTAT_DATA_UINT64);
1007 kstat_named_init(&vib->vib_stats->vbs_io_errors,
1008 "total_io_errors", KSTAT_DATA_UINT64);
1009 kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
1010 "total_unsupp_errors", KSTAT_DATA_UINT64);
1011 kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
1012 "total_nxio_errors", KSTAT_DATA_UINT64);
1013 kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
1014 "total_rw_cacheflush", KSTAT_DATA_UINT64);
1015 kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
1016 "max_rw_cookies", KSTAT_DATA_UINT32);
1017 kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
1018 "max_intr_queue", KSTAT_DATA_UINT32);
1019 kstat_install(vib->vib_kstat);
1020
1021 vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
1022
1023 if (vioblk_read_capacity(vib) == DDI_FAILURE)
1024 goto fail;
1025
1026 /*
1027 * The maximum size for a cookie in a request.
1028 */
1029 vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
1030 if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
1031 uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);
1032
1033 if (v != 0 && v != PCI_EINVAL32) {
1034 vib->vib_seg_size_max = v;
1035 }
1036 }
1037
1038 /*
1039 * Set up the DMA attributes for blkdev to use for request data. The
1040 * specification is not extremely clear about whether DMA-related
1041 * parameters include or exclude the header and status descriptors.
1042 * For now, we assume they cover only the request data and not the
1043 * headers.
1044 */
1045 vib->vib_bd_dma_attr = vioblk_dma_attr;
1046 vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
1047 vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
1048 vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
1049 vib->vib_seg_size_max;
1050
1051 if (vioblk_alloc_reqs(vib) != 0) {
1052 goto fail;
1053 }
1054
1055 /*
1056 * The blkdev framework does not provide a way to specify that the
1057 * device does not support write cache flushing, except by omitting the
1058 * "o_sync_cache" member from the ops vector. As "bd_alloc_handle()"
1059 * makes a copy of the ops vector, we can safely assemble one on the
1060 * stack based on negotiated features.
1061 *
1062 * Similarly, the blkdev framework does not provide a way to indicate
1063 * if a device supports an TRIM/UNMAP/DISCARD type operation except
1064 * by omitting the "o_free_space" member from the ops vector.
1065 */
1066 bd_ops_t vioblk_bd_ops = {
1067 .o_version = BD_OPS_CURRENT_VERSION,
1068 .o_drive_info = vioblk_bd_driveinfo,
1069 .o_media_info = vioblk_bd_mediainfo,
1070 .o_devid_init = vioblk_bd_devid,
1071 .o_sync_cache = vioblk_bd_flush,
1072 .o_read = vioblk_bd_read,
1073 .o_write = vioblk_bd_write,
1074 .o_free_space = vioblk_bd_free_space,
1075 };
1076 if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
1077 vioblk_bd_ops.o_sync_cache = NULL;
1078 }
1079 if (!vib->vib_can_discard) {
1080 vioblk_bd_ops.o_free_space = NULL;
1081 }
1082
1083 vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
1084 &vib->vib_bd_dma_attr, KM_SLEEP);
1085
1086 /*
1087 * Enable interrupts now so that we can request the device identity.
1088 */
1089 if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
1090 goto fail;
1091 }
1092
1093 vioblk_get_id(vib);
1094
1095 if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
1096 dev_err(dip, CE_WARN, "Failed to attach blkdev");
1097 goto fail;
1098 }
1099
1100 return (DDI_SUCCESS);
1101
1102 fail:
1103 if (vib->vib_bd_h != NULL) {
1104 (void) bd_detach_handle(vib->vib_bd_h);
1105 bd_free_handle(vib->vib_bd_h);
1106 }
1107 if (vio != NULL) {
1108 (void) virtio_fini(vio, B_TRUE);
1109 }
1110 if (did_mutex) {
1111 mutex_destroy(&vib->vib_mutex);
1112 cv_destroy(&vib->vib_cv);
1113 }
1114 if (vib->vib_kstat != NULL) {
1115 kstat_delete(vib->vib_kstat);
1116 }
1117 vioblk_free_reqs(vib);
1118 kmem_free(vib, sizeof (*vib));
1119 return (DDI_FAILURE);
1120 }
1121
1122 static int
vioblk_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)1123 vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1124 {
1125 vioblk_t *vib = ddi_get_driver_private(dip);
1126
1127 if (cmd != DDI_DETACH) {
1128 return (DDI_FAILURE);
1129 }
1130
1131 mutex_enter(&vib->vib_mutex);
1132 if (vib->vib_nreqs_alloc > 0) {
1133 /*
1134 * Cannot detach while there are still outstanding requests.
1135 */
1136 mutex_exit(&vib->vib_mutex);
1137 return (DDI_FAILURE);
1138 }
1139
1140 if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
1141 mutex_exit(&vib->vib_mutex);
1142 return (DDI_FAILURE);
1143 }
1144
1145 /*
1146 * Tear down the Virtio framework before freeing the rest of the
1147 * resources. This will ensure the interrupt handlers are no longer
1148 * running.
1149 */
1150 virtio_fini(vib->vib_virtio, B_FALSE);
1151
1152 vioblk_free_reqs(vib);
1153 kstat_delete(vib->vib_kstat);
1154
1155 mutex_exit(&vib->vib_mutex);
1156 mutex_destroy(&vib->vib_mutex);
1157
1158 kmem_free(vib, sizeof (*vib));
1159
1160 return (DDI_SUCCESS);
1161 }
1162
1163 static int
vioblk_quiesce(dev_info_t * dip)1164 vioblk_quiesce(dev_info_t *dip)
1165 {
1166 vioblk_t *vib;
1167
1168 if ((vib = ddi_get_driver_private(dip)) == NULL) {
1169 return (DDI_FAILURE);
1170 }
1171
1172 return (virtio_quiesce(vib->vib_virtio));
1173 }
1174
1175 int
_init(void)1176 _init(void)
1177 {
1178 int rv;
1179
1180 bd_mod_init(&vioblk_dev_ops);
1181
1182 if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
1183 bd_mod_fini(&vioblk_dev_ops);
1184 }
1185
1186 return (rv);
1187 }
1188
1189 int
_fini(void)1190 _fini(void)
1191 {
1192 int rv;
1193
1194 if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
1195 bd_mod_fini(&vioblk_dev_ops);
1196 }
1197
1198 return (rv);
1199 }
1200
1201 int
_info(struct modinfo * modinfop)1202 _info(struct modinfo *modinfop)
1203 {
1204 return (mod_info(&vioblk_modlinkage, modinfop));
1205 }
1206