1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2013 Nexenta Inc. All rights reserved.
14 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15 * Copyright 2021 Joyent, Inc.
16 * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
17 */
18
19 /* Based on the NetBSD virtio driver by Minoura Makoto. */
20 /*
21 * Copyright (c) 2010 Minoura Makoto.
22 * All rights reserved.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
34 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
35 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
36 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
38 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
39 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
40 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
42 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 /*
46 * VIRTIO NETWORK DRIVER
47 */
48
49 #include <sys/types.h>
50 #include <sys/errno.h>
51 #include <sys/param.h>
52 #include <sys/stropts.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/kmem.h>
56 #include <sys/conf.h>
57 #include <sys/devops.h>
58 #include <sys/ksynch.h>
59 #include <sys/stat.h>
60 #include <sys/modctl.h>
61 #include <sys/debug.h>
62 #include <sys/pci.h>
63 #include <sys/ethernet.h>
64 #include <sys/vlan.h>
65 #include <sys/sysmacros.h>
66 #include <sys/smbios.h>
67
68 #include <sys/dlpi.h>
69 #include <sys/taskq.h>
70
71 #include <sys/pattr.h>
72 #include <sys/strsun.h>
73
74 #include <sys/random.h>
75 #include <sys/containerof.h>
76 #include <sys/stream.h>
77 #include <inet/tcp.h>
78
79 #include <sys/mac.h>
80 #include <sys/mac_provider.h>
81 #include <sys/mac_ether.h>
82
83 #include "virtio.h"
84 #include "vioif.h"
85
/*
 * The control queue is available on most hypervisors, but older versions of
 * bhyve on illumos did not provide it.  When this tuneable is non-zero and
 * the virtual device lacks the required support, promiscuous mode requests
 * are reported as successful anyway, preserving the historic behaviour of
 * the illumos vioif driver.
 */
int vioif_fake_promisc_success = 1;
93
94 static int vioif_quiesce(dev_info_t *);
95 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
96 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
97 static boolean_t vioif_has_feature(vioif_t *, uint32_t);
98 static void vioif_reclaim_restart(vioif_t *);
99 static int vioif_m_stat(void *, uint_t, uint64_t *);
100 static void vioif_m_stop(void *);
101 static int vioif_m_start(void *);
102 static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
103 static int vioif_m_setpromisc(void *, boolean_t);
104 static int vioif_m_unicst(void *, const uint8_t *);
105 static mblk_t *vioif_m_tx(void *, mblk_t *);
106 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
107 const void *);
108 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
109 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
110 mac_prop_info_handle_t);
111 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
112 static uint_t vioif_add_rx(vioif_t *);
113
114
115 static struct cb_ops vioif_cb_ops = {
116 .cb_rev = CB_REV,
117 .cb_flag = D_MP | D_NEW,
118
119 .cb_open = nulldev,
120 .cb_close = nulldev,
121 .cb_strategy = nodev,
122 .cb_print = nodev,
123 .cb_dump = nodev,
124 .cb_read = nodev,
125 .cb_write = nodev,
126 .cb_ioctl = nodev,
127 .cb_devmap = nodev,
128 .cb_mmap = nodev,
129 .cb_segmap = nodev,
130 .cb_chpoll = nochpoll,
131 .cb_prop_op = ddi_prop_op,
132 .cb_str = NULL,
133 .cb_aread = nodev,
134 .cb_awrite = nodev,
135 };
136
137 static struct dev_ops vioif_dev_ops = {
138 .devo_rev = DEVO_REV,
139 .devo_refcnt = 0,
140
141 .devo_attach = vioif_attach,
142 .devo_detach = vioif_detach,
143 .devo_quiesce = vioif_quiesce,
144
145 .devo_cb_ops = &vioif_cb_ops,
146
147 .devo_getinfo = NULL,
148 .devo_identify = nulldev,
149 .devo_probe = nulldev,
150 .devo_reset = nodev,
151 .devo_bus_ops = NULL,
152 .devo_power = NULL,
153 };
154
155 static struct modldrv vioif_modldrv = {
156 .drv_modops = &mod_driverops,
157 .drv_linkinfo = "VIRTIO network driver",
158 .drv_dev_ops = &vioif_dev_ops
159 };
160
161 static struct modlinkage vioif_modlinkage = {
162 .ml_rev = MODREV_1,
163 .ml_linkage = { &vioif_modldrv, NULL }
164 };
165
166 static mac_callbacks_t vioif_mac_callbacks = {
167 .mc_getstat = vioif_m_stat,
168 .mc_start = vioif_m_start,
169 .mc_stop = vioif_m_stop,
170 .mc_setpromisc = vioif_m_setpromisc,
171 .mc_multicst = vioif_m_multicst,
172 .mc_unicst = vioif_m_unicst,
173 .mc_tx = vioif_m_tx,
174
175 .mc_callbacks = (MC_GETCAPAB | MC_SETPROP |
176 MC_GETPROP | MC_PROPINFO),
177 .mc_getcapab = vioif_m_getcapab,
178 .mc_setprop = vioif_m_setprop,
179 .mc_getprop = vioif_m_getprop,
180 .mc_propinfo = vioif_m_propinfo,
181 };
182
183 static const uchar_t vioif_broadcast[ETHERADDRL] = {
184 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
185 };
186
187 /*
188 * Interval for the periodic TX reclaim.
189 */
190 uint_t vioif_reclaim_ms = 200;
191
/*
 * Operator override for the kinds of interrupts vioif may use.  The default
 * is -1 (rather than 0) so that an explicit override to 0 in /etc/system can
 * be told apart from the value not having been set.
 */
int vioif_allowed_int_types = -1;
198
199 /*
200 * DMA attribute template for transmit and receive buffers. The SGL entry
201 * count will be modified before using the template. Note that these
202 * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
203 * received frames at the correct offset for the networking stack.
204 */
205 ddi_dma_attr_t vioif_dma_attr_bufs = {
206 .dma_attr_version = DMA_ATTR_V0,
207 .dma_attr_addr_lo = 0x0000000000000000,
208 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
209 .dma_attr_count_max = 0x00000000FFFFFFFF,
210 .dma_attr_align = VIOIF_HEADER_ALIGN,
211 .dma_attr_burstsizes = 1,
212 .dma_attr_minxfer = 1,
213 .dma_attr_maxxfer = 0x00000000FFFFFFFF,
214 .dma_attr_seg = 0x00000000FFFFFFFF,
215 .dma_attr_sgllen = 0,
216 .dma_attr_granular = 1,
217 .dma_attr_flags = 0
218 };
219
220 /*
221 * DMA attributes for mapping larger transmit buffers from the networking
222 * stack. The requirements are quite loose, but note that the SGL entry length
223 * field is 32-bit.
224 */
225 ddi_dma_attr_t vioif_dma_attr_external = {
226 .dma_attr_version = DMA_ATTR_V0,
227 .dma_attr_addr_lo = 0x0000000000000000,
228 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
229 .dma_attr_count_max = 0x00000000FFFFFFFF,
230 .dma_attr_align = 1,
231 .dma_attr_burstsizes = 1,
232 .dma_attr_minxfer = 1,
233 .dma_attr_maxxfer = 0x00000000FFFFFFFF,
234 .dma_attr_seg = 0x00000000FFFFFFFF,
235 .dma_attr_sgllen = VIOIF_MAX_SEGS - 1,
236 .dma_attr_granular = 1,
237 .dma_attr_flags = 0
238 };
239
240
/*
 * VIRTIO NET MAC PROPERTIES
 *
 * Names, default values and upper bounds for the private copy threshold
 * properties.
 */
#define	VIOIF_MACPROP_TXCOPY_THRESH	"_txcopy_thresh"
#define	VIOIF_MACPROP_RXCOPY_THRESH	"_rxcopy_thresh"

#define	VIOIF_MACPROP_TXCOPY_THRESH_DEF	300
#define	VIOIF_MACPROP_RXCOPY_THRESH_DEF	300

#define	VIOIF_MACPROP_TXCOPY_THRESH_MAX	640
#define	VIOIF_MACPROP_RXCOPY_THRESH_MAX	640
251
252 static char *vioif_priv_props[] = {
253 VIOIF_MACPROP_TXCOPY_THRESH,
254 VIOIF_MACPROP_RXCOPY_THRESH,
255 NULL
256 };
257
258
259 static vioif_txbuf_t *
vioif_txbuf_alloc(vioif_t * vif)260 vioif_txbuf_alloc(vioif_t *vif)
261 {
262 vioif_txbuf_t *tb;
263
264 VERIFY(MUTEX_HELD(&vif->vif_mutex));
265
266 if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
267 vif->vif_ntxbufs_alloc++;
268 }
269
270 return (tb);
271 }
272
273 static void
vioif_txbuf_free(vioif_t * vif,vioif_txbuf_t * tb)274 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
275 {
276 VERIFY(MUTEX_HELD(&vif->vif_mutex));
277
278 VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
279 vif->vif_ntxbufs_alloc--;
280
281 virtio_chain_clear(tb->tb_chain);
282 list_insert_head(&vif->vif_txbufs, tb);
283 }
284
285 static vioif_rxbuf_t *
vioif_rxbuf_alloc(vioif_t * vif)286 vioif_rxbuf_alloc(vioif_t *vif)
287 {
288 vioif_rxbuf_t *rb;
289
290 VERIFY(MUTEX_HELD(&vif->vif_mutex));
291
292 if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
293 vif->vif_nrxbufs_alloc++;
294 }
295
296 return (rb);
297 }
298
299 static void
vioif_rxbuf_free(vioif_t * vif,vioif_rxbuf_t * rb)300 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
301 {
302 VERIFY(MUTEX_HELD(&vif->vif_mutex));
303
304 VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
305 vif->vif_nrxbufs_alloc--;
306
307 virtio_chain_clear(rb->rb_chain);
308 list_insert_head(&vif->vif_rxbufs, rb);
309 }
310
311 static void
vioif_rx_free_callback(caddr_t free_arg)312 vioif_rx_free_callback(caddr_t free_arg)
313 {
314 vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
315 vioif_t *vif = rb->rb_vioif;
316
317 mutex_enter(&vif->vif_mutex);
318
319 /*
320 * Return this receive buffer to the free list.
321 */
322 vioif_rxbuf_free(vif, rb);
323
324 VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
325 vif->vif_nrxbufs_onloan--;
326
327 /*
328 * Attempt to replenish the receive queue with at least the buffer we
329 * just freed. There isn't a great way to deal with failure here,
330 * though because we'll only loan at most half of the buffers there
331 * should always be at least some available even if this fails.
332 */
333 (void) vioif_add_rx(vif);
334
335 mutex_exit(&vif->vif_mutex);
336 }
337
338 static vioif_ctrlbuf_t *
vioif_ctrlbuf_alloc(vioif_t * vif)339 vioif_ctrlbuf_alloc(vioif_t *vif)
340 {
341 vioif_ctrlbuf_t *cb;
342
343 VERIFY(MUTEX_HELD(&vif->vif_mutex));
344
345 if ((cb = list_remove_head(&vif->vif_ctrlbufs)) != NULL) {
346 vif->vif_nctrlbufs_alloc++;
347 }
348
349 return (cb);
350 }
351
352 static void
vioif_ctrlbuf_free(vioif_t * vif,vioif_ctrlbuf_t * cb)353 vioif_ctrlbuf_free(vioif_t *vif, vioif_ctrlbuf_t *cb)
354 {
355 VERIFY(MUTEX_HELD(&vif->vif_mutex));
356
357 VERIFY3U(vif->vif_nctrlbufs_alloc, >, 0);
358 vif->vif_nctrlbufs_alloc--;
359
360 virtio_chain_clear(cb->cb_chain);
361 list_insert_head(&vif->vif_ctrlbufs, cb);
362 }
363
364 static void
vioif_free_bufs(vioif_t * vif)365 vioif_free_bufs(vioif_t *vif)
366 {
367 VERIFY(MUTEX_HELD(&vif->vif_mutex));
368
369 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
370 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
371 vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
372
373 /*
374 * Ensure that this txbuf is now in the free list:
375 */
376 VERIFY(list_link_active(&tb->tb_link));
377 list_remove(&vif->vif_txbufs, tb);
378
379 /*
380 * We should not have an mblk chain at this point.
381 */
382 VERIFY3P(tb->tb_mp, ==, NULL);
383
384 if (tb->tb_dma != NULL) {
385 virtio_dma_free(tb->tb_dma);
386 tb->tb_dma = NULL;
387 }
388
389 if (tb->tb_chain != NULL) {
390 virtio_chain_free(tb->tb_chain);
391 tb->tb_chain = NULL;
392 }
393
394 if (tb->tb_dmaext != NULL) {
395 for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
396 if (tb->tb_dmaext[j] != NULL) {
397 virtio_dma_free(
398 tb->tb_dmaext[j]);
399 tb->tb_dmaext[j] = NULL;
400 }
401 }
402
403 kmem_free(tb->tb_dmaext,
404 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
405 tb->tb_dmaext = NULL;
406 tb->tb_dmaext_capacity = 0;
407 }
408 }
409 VERIFY(list_is_empty(&vif->vif_txbufs));
410 if (vif->vif_txbufs_mem != NULL) {
411 kmem_free(vif->vif_txbufs_mem,
412 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
413 vif->vif_txbufs_mem = NULL;
414 vif->vif_txbufs_capacity = 0;
415 }
416
417 VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
418 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
419 vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
420
421 /*
422 * Ensure that this rxbuf is now in the free list:
423 */
424 VERIFY(list_link_active(&rb->rb_link));
425 list_remove(&vif->vif_rxbufs, rb);
426
427 if (rb->rb_dma != NULL) {
428 virtio_dma_free(rb->rb_dma);
429 rb->rb_dma = NULL;
430 }
431
432 if (rb->rb_chain != NULL) {
433 virtio_chain_free(rb->rb_chain);
434 rb->rb_chain = NULL;
435 }
436 }
437 VERIFY(list_is_empty(&vif->vif_rxbufs));
438 if (vif->vif_rxbufs_mem != NULL) {
439 kmem_free(vif->vif_rxbufs_mem,
440 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
441 vif->vif_rxbufs_mem = NULL;
442 vif->vif_rxbufs_capacity = 0;
443 }
444
445 if (vif->vif_has_ctrlq) {
446 VERIFY3U(vif->vif_nctrlbufs_alloc, ==, 0);
447 for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
448 vioif_ctrlbuf_t *cb = &vif->vif_ctrlbufs_mem[i];
449
450 /*
451 * Ensure that this ctrlbuf is now in the free list
452 */
453 VERIFY(list_link_active(&cb->cb_link));
454 list_remove(&vif->vif_ctrlbufs, cb);
455
456 if (cb->cb_dma != NULL) {
457 virtio_dma_free(cb->cb_dma);
458 cb->cb_dma = NULL;
459 }
460
461 if (cb->cb_chain != NULL) {
462 virtio_chain_free(cb->cb_chain);
463 cb->cb_chain = NULL;
464 }
465 }
466 VERIFY(list_is_empty(&vif->vif_ctrlbufs));
467 if (vif->vif_ctrlbufs_mem != NULL) {
468 kmem_free(vif->vif_ctrlbufs_mem,
469 sizeof (vioif_ctrlbuf_t) *
470 vif->vif_ctrlbufs_capacity);
471 vif->vif_ctrlbufs_mem = NULL;
472 vif->vif_ctrlbufs_capacity = 0;
473 }
474 }
475 }
476
477 static int
vioif_alloc_bufs(vioif_t * vif)478 vioif_alloc_bufs(vioif_t *vif)
479 {
480 VERIFY(MUTEX_HELD(&vif->vif_mutex));
481
482 /*
483 * Allocate one contiguous chunk of memory for the transmit and receive
484 * buffer tracking objects. If the ring is unusually small, we'll
485 * reduce our target buffer count accordingly.
486 */
487 vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
488 virtio_queue_size(vif->vif_tx_vq));
489 vif->vif_txbufs_mem = kmem_zalloc(
490 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
491 list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
492 offsetof(vioif_txbuf_t, tb_link));
493
494 vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
495 virtio_queue_size(vif->vif_rx_vq));
496 vif->vif_rxbufs_mem = kmem_zalloc(
497 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
498 list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
499 offsetof(vioif_rxbuf_t, rb_link));
500
501 if (vif->vif_has_ctrlq) {
502 vif->vif_ctrlbufs_capacity = MIN(VIRTIO_NET_CTRL_BUFS,
503 virtio_queue_size(vif->vif_ctrl_vq));
504 vif->vif_ctrlbufs_mem = kmem_zalloc(
505 sizeof (vioif_ctrlbuf_t) * vif->vif_ctrlbufs_capacity,
506 KM_SLEEP);
507 }
508 list_create(&vif->vif_ctrlbufs, sizeof (vioif_ctrlbuf_t),
509 offsetof(vioif_ctrlbuf_t, cb_link));
510
511 /*
512 * Do not loan more than half of our allocated receive buffers into
513 * the networking stack.
514 */
515 vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
516
517 /*
518 * Put everything in the free list straight away in order to simplify
519 * the use of vioif_free_bufs() for cleanup on allocation failure.
520 */
521 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
522 list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
523 }
524 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
525 list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
526 }
527 for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
528 list_insert_tail(&vif->vif_ctrlbufs, &vif->vif_ctrlbufs_mem[i]);
529 }
530
531 /*
532 * Start from the DMA attribute template common to both transmit and
533 * receive buffers. The SGL entry count will be modified for each
534 * buffer type.
535 */
536 ddi_dma_attr_t attr = vioif_dma_attr_bufs;
537
538 /*
539 * The transmit inline buffer is small (less than a page), so it's
540 * reasonable to request a single cookie.
541 */
542 attr.dma_attr_sgllen = 1;
543
544 for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
545 tb = list_next(&vif->vif_txbufs, tb)) {
546 if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
547 VIOIF_TX_INLINE_SIZE, &attr,
548 DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
549 goto fail;
550 }
551 VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
552
553 if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
554 KM_SLEEP)) == NULL) {
555 goto fail;
556 }
557 virtio_chain_data_set(tb->tb_chain, tb);
558
559 tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
560 tb->tb_dmaext = kmem_zalloc(
561 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
562 KM_SLEEP);
563 }
564
565 /*
566 * Control queue buffers are also small (less than a page), so we'll
567 * also request a single cookie for them.
568 */
569 for (vioif_ctrlbuf_t *cb = list_head(&vif->vif_ctrlbufs); cb != NULL;
570 cb = list_next(&vif->vif_ctrlbufs, cb)) {
571 if ((cb->cb_dma = virtio_dma_alloc(vif->vif_virtio,
572 VIOIF_CTRL_SIZE, &attr,
573 DDI_DMA_STREAMING | DDI_DMA_RDWR, KM_SLEEP)) == NULL) {
574 goto fail;
575 }
576 VERIFY3U(virtio_dma_ncookies(cb->cb_dma), ==, 1);
577
578 if ((cb->cb_chain = virtio_chain_alloc(vif->vif_ctrl_vq,
579 KM_SLEEP)) == NULL) {
580 goto fail;
581 }
582 virtio_chain_data_set(cb->cb_chain, cb);
583 }
584
585 /*
586 * The receive buffers are larger, and we can tolerate a large number
587 * of segments. Adjust the SGL entry count, setting aside one segment
588 * for the virtio net header.
589 */
590 attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
591
592 for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
593 rb = list_next(&vif->vif_rxbufs, rb)) {
594 if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
595 VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
596 KM_SLEEP)) == NULL) {
597 goto fail;
598 }
599
600 if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
601 KM_SLEEP)) == NULL) {
602 goto fail;
603 }
604 virtio_chain_data_set(rb->rb_chain, rb);
605
606 /*
607 * Ensure that the first cookie is sufficient to cover the
608 * header skip region plus one byte.
609 */
610 VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
611 VIOIF_HEADER_SKIP + 1);
612
613 /*
614 * Ensure that the frame data begins at a location with a
615 * correctly aligned IP header.
616 */
617 VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
618 VIOIF_HEADER_SKIP) % 4, ==, 2);
619
620 rb->rb_vioif = vif;
621 rb->rb_frtn.free_func = vioif_rx_free_callback;
622 rb->rb_frtn.free_arg = (caddr_t)rb;
623 }
624
625 return (0);
626
627 fail:
628 vioif_free_bufs(vif);
629 return (ENOMEM);
630 }
631
632 static int
vioif_ctrlq_req(vioif_t * vif,uint8_t class,uint8_t cmd,void * data,size_t datalen)633 vioif_ctrlq_req(vioif_t *vif, uint8_t class, uint8_t cmd, void *data,
634 size_t datalen)
635 {
636 vioif_ctrlbuf_t *cb = NULL;
637 virtio_chain_t *vic = NULL;
638 uint8_t *p = NULL;
639 uint64_t pa = 0;
640 uint8_t *ackp = NULL;
641 struct virtio_net_ctrlq_hdr hdr = {
642 .vnch_class = class,
643 .vnch_command = cmd,
644 };
645 const size_t hdrlen = sizeof (hdr);
646 const size_t acklen = 1; /* the ack is always 1 byte */
647 size_t totlen = hdrlen + datalen + acklen;
648 int r = DDI_SUCCESS;
649
650 /*
651 * We shouldn't be called unless the ctrlq feature has been
652 * negotiated with the host
653 */
654 VERIFY(vif->vif_has_ctrlq);
655
656 mutex_enter(&vif->vif_mutex);
657 cb = vioif_ctrlbuf_alloc(vif);
658 if (cb == NULL) {
659 vif->vif_noctrlbuf++;
660 mutex_exit(&vif->vif_mutex);
661 r = DDI_FAILURE;
662 goto done;
663 }
664 mutex_exit(&vif->vif_mutex);
665
666 if (totlen > virtio_dma_size(cb->cb_dma)) {
667 vif->vif_ctrlbuf_toosmall++;
668 r = DDI_FAILURE;
669 goto done;
670 }
671
672 /*
673 * Clear the entire buffer. Technically not necessary, but useful
674 * if trying to troubleshoot an issue, and probably not a bad idea
675 * to not let any old data linger.
676 */
677 p = virtio_dma_va(cb->cb_dma, 0);
678 bzero(p, virtio_dma_size(cb->cb_dma));
679
680 /*
681 * We currently do not support VIRTIO_F_ANY_LAYOUT. That means,
682 * that we must put the header, the data, and the ack in their
683 * own respective descriptors. Since all the currently supported
684 * control queue commands take _very_ small amounts of data, we
685 * use a single DMA buffer for all of it, but use 3 descriptors to
686 * reference (respectively) the header, the data, and the ack byte
687 * within that memory to adhere to the virtio spec.
688 *
689 * If we add support for control queue features such as custom
690 * MAC filtering tables, which might require larger amounts of
691 * memory, we likely will want to add more sophistication here
692 * and optionally use additional allocated memory to hold that
693 * data instead of a fixed size buffer.
694 *
695 * Copy the header.
696 */
697 bcopy(&hdr, p, sizeof (hdr));
698 pa = virtio_dma_cookie_pa(cb->cb_dma, 0);
699 if ((r = virtio_chain_append(cb->cb_chain,
700 pa, hdrlen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
701 goto done;
702 }
703
704 /*
705 * Copy the request data
706 */
707 p = virtio_dma_va(cb->cb_dma, hdrlen);
708 bcopy(data, p, datalen);
709 if ((r = virtio_chain_append(cb->cb_chain,
710 pa + hdrlen, datalen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
711 goto done;
712 }
713
714 /*
715 * We already cleared the buffer, so don't need to copy out a 0 for
716 * the ack byte. Just add a descriptor for that spot.
717 */
718 ackp = virtio_dma_va(cb->cb_dma, hdrlen + datalen);
719 if ((r = virtio_chain_append(cb->cb_chain,
720 pa + hdrlen + datalen, acklen,
721 VIRTIO_DIR_DEVICE_WRITES)) != DDI_SUCCESS) {
722 goto done;
723 }
724
725 virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORDEV);
726 virtio_chain_submit(cb->cb_chain, B_TRUE);
727
728 /*
729 * Spin waiting for response.
730 */
731 mutex_enter(&vif->vif_mutex);
732 while ((vic = virtio_queue_poll(vif->vif_ctrl_vq)) == NULL) {
733 mutex_exit(&vif->vif_mutex);
734 delay(drv_usectohz(1000));
735 mutex_enter(&vif->vif_mutex);
736 }
737
738 virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORCPU);
739 VERIFY3P(virtio_chain_data(vic), ==, cb);
740 mutex_exit(&vif->vif_mutex);
741
742 if (*ackp != VIRTIO_NET_CQ_OK) {
743 r = DDI_FAILURE;
744 }
745
746 done:
747 mutex_enter(&vif->vif_mutex);
748 vioif_ctrlbuf_free(vif, cb);
749 mutex_exit(&vif->vif_mutex);
750
751 return (r);
752 }
753
754 static int
vioif_m_multicst(void * arg,boolean_t add,const uint8_t * mcst_addr)755 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
756 {
757 /*
758 * Even though we currently do not have support for programming
759 * multicast filters, or even enabling promiscuous mode, we return
760 * success here to avoid the networking stack falling back to link
761 * layer broadcast for multicast traffic. Some hypervisors already
762 * pass received multicast frames onto the guest, so at least on those
763 * systems multicast will work as expected anyway.
764 */
765 return (0);
766 }
767
768 static int
vioif_m_setpromisc(void * arg,boolean_t on)769 vioif_m_setpromisc(void *arg, boolean_t on)
770 {
771 vioif_t *vif = arg;
772 uint8_t val = on ? 1 : 0;
773
774 if (!vif->vif_has_ctrlq_rx) {
775 if (vioif_fake_promisc_success)
776 return (0);
777
778 return (ENOTSUP);
779 }
780
781 return (vioif_ctrlq_req(vif, VIRTIO_NET_CTRL_RX,
782 VIRTIO_NET_CTRL_RX_PROMISC, &val, sizeof (val)));
783 }
784
785 static int
vioif_m_unicst(void * arg,const uint8_t * mac)786 vioif_m_unicst(void *arg, const uint8_t *mac)
787 {
788 return (ENOTSUP);
789 }
790
791 static uint_t
vioif_add_rx(vioif_t * vif)792 vioif_add_rx(vioif_t *vif)
793 {
794 VERIFY(MUTEX_HELD(&vif->vif_mutex));
795
796 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
797 /*
798 * If the NIC is not running, do not give the device any
799 * receive buffers.
800 */
801 return (0);
802 }
803
804 uint_t num_added = 0;
805
806 vioif_rxbuf_t *rb;
807 while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
808 /*
809 * For legacy devices, and those that have not negotiated
810 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
811 * separate descriptor entry to the rest of the buffer.
812 */
813 if (virtio_chain_append(rb->rb_chain,
814 virtio_dma_cookie_pa(rb->rb_dma, 0),
815 sizeof (struct virtio_net_hdr),
816 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
817 goto fail;
818 }
819
820 for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
821 uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
822 size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
823
824 if (n == 0) {
825 pa += VIOIF_HEADER_SKIP;
826 VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
827 sz -= VIOIF_HEADER_SKIP;
828 }
829
830 if (virtio_chain_append(rb->rb_chain, pa, sz,
831 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
832 goto fail;
833 }
834 }
835
836 virtio_chain_submit(rb->rb_chain, B_FALSE);
837 num_added++;
838 continue;
839
840 fail:
841 vioif_rxbuf_free(vif, rb);
842 vif->vif_norecvbuf++;
843 break;
844 }
845
846 if (num_added > 0) {
847 virtio_queue_flush(vif->vif_rx_vq);
848 }
849
850 return (num_added);
851 }
852
853 static uint_t
vioif_process_rx(vioif_t * vif)854 vioif_process_rx(vioif_t *vif)
855 {
856 virtio_chain_t *vic;
857 mblk_t *mphead = NULL, *lastmp = NULL, *mp;
858 uint_t num_processed = 0;
859
860 VERIFY(MUTEX_HELD(&vif->vif_mutex));
861
862 while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
863 /*
864 * We have to use the chain received length here, as the device
865 * does not tell us the received frame length any other way.
866 * In a limited survey of hypervisors, virtio network devices
867 * appear to provide the right value here.
868 */
869 size_t len = virtio_chain_received_length(vic);
870 vioif_rxbuf_t *rb = virtio_chain_data(vic);
871
872 virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
873
874 /*
875 * If the NIC is not running, discard any received frames.
876 */
877 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
878 vioif_rxbuf_free(vif, rb);
879 continue;
880 }
881
882 if (len < sizeof (struct virtio_net_hdr)) {
883 vif->vif_rxfail_chain_undersize++;
884 vif->vif_ierrors++;
885 vioif_rxbuf_free(vif, rb);
886 continue;
887 }
888 len -= sizeof (struct virtio_net_hdr);
889
890 /*
891 * We copy small packets that happen to fit into a single
892 * cookie and reuse the buffers. For bigger ones, we loan
893 * the buffers upstream.
894 */
895 if (len < vif->vif_rxcopy_thresh ||
896 vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
897 mutex_exit(&vif->vif_mutex);
898 if ((mp = allocb(len, 0)) == NULL) {
899 mutex_enter(&vif->vif_mutex);
900 vif->vif_norecvbuf++;
901 vif->vif_ierrors++;
902
903 vioif_rxbuf_free(vif, rb);
904 continue;
905 }
906
907 bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
908 mp->b_rptr, len);
909 mp->b_wptr = mp->b_rptr + len;
910
911 /*
912 * As the packet contents was copied rather than
913 * loaned, we can return the receive buffer resources
914 * to the free list.
915 */
916 mutex_enter(&vif->vif_mutex);
917 vioif_rxbuf_free(vif, rb);
918
919 } else {
920 mutex_exit(&vif->vif_mutex);
921 if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
922 VIOIF_HEADER_SKIP), len, 0,
923 &rb->rb_frtn)) == NULL) {
924 mutex_enter(&vif->vif_mutex);
925 vif->vif_norecvbuf++;
926 vif->vif_ierrors++;
927
928 vioif_rxbuf_free(vif, rb);
929 continue;
930 }
931 mp->b_wptr = mp->b_rptr + len;
932
933 mutex_enter(&vif->vif_mutex);
934 vif->vif_nrxbufs_onloan++;
935 }
936
937 /*
938 * virtio-net does not tell us if this packet is multicast
939 * or broadcast, so we have to check it.
940 */
941 if (mp->b_rptr[0] & 0x1) {
942 if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
943 vif->vif_multircv++;
944 else
945 vif->vif_brdcstrcv++;
946 }
947
948 vif->vif_rbytes += len;
949 vif->vif_ipackets++;
950
951 if (lastmp == NULL) {
952 mphead = mp;
953 } else {
954 lastmp->b_next = mp;
955 }
956 lastmp = mp;
957 num_processed++;
958 }
959
960 if (mphead != NULL) {
961 if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
962 mutex_exit(&vif->vif_mutex);
963 mac_rx(vif->vif_mac_handle, NULL, mphead);
964 mutex_enter(&vif->vif_mutex);
965 } else {
966 /*
967 * The NIC was disabled part way through our execution,
968 * so free the messages we allocated.
969 */
970 freemsgchain(mphead);
971 }
972 }
973
974 return (num_processed);
975 }
976
977 static uint_t
vioif_reclaim_used_tx(vioif_t * vif)978 vioif_reclaim_used_tx(vioif_t *vif)
979 {
980 virtio_chain_t *vic;
981 uint_t num_reclaimed = 0;
982
983 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
984
985 while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
986 vioif_txbuf_t *tb = virtio_chain_data(vic);
987
988 if (tb->tb_mp != NULL) {
989 /*
990 * Unbind the external mapping.
991 */
992 for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
993 if (tb->tb_dmaext[i] == NULL) {
994 continue;
995 }
996
997 virtio_dma_unbind(tb->tb_dmaext[i]);
998 }
999
1000 freemsg(tb->tb_mp);
1001 tb->tb_mp = NULL;
1002 }
1003
1004 /*
1005 * Return this transmit buffer to the free list for reuse.
1006 */
1007 mutex_enter(&vif->vif_mutex);
1008 vioif_txbuf_free(vif, tb);
1009 mutex_exit(&vif->vif_mutex);
1010
1011 num_reclaimed++;
1012 }
1013
1014 /* Return ring to transmitting state if descriptors were reclaimed. */
1015 if (num_reclaimed > 0) {
1016 boolean_t do_update = B_FALSE;
1017
1018 mutex_enter(&vif->vif_mutex);
1019 vif->vif_stat_tx_reclaim += num_reclaimed;
1020 if (vif->vif_tx_corked) {
1021 /*
1022 * TX was corked on a lack of available descriptors.
1023 * That dire state has passed so the TX interrupt can
1024 * be disabled and MAC can be notified that
1025 * transmission is possible again.
1026 */
1027 vif->vif_tx_corked = B_FALSE;
1028 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1029 do_update = B_TRUE;
1030 }
1031
1032 mutex_exit(&vif->vif_mutex);
1033 if (do_update) {
1034 mac_tx_update(vif->vif_mac_handle);
1035 }
1036 }
1037
1038 return (num_reclaimed);
1039 }
1040
1041 static void
vioif_reclaim_periodic(void * arg)1042 vioif_reclaim_periodic(void *arg)
1043 {
1044 vioif_t *vif = arg;
1045 uint_t num_reclaimed;
1046
1047 num_reclaimed = vioif_reclaim_used_tx(vif);
1048
1049 mutex_enter(&vif->vif_mutex);
1050 vif->vif_tx_reclaim_tid = 0;
1051 /*
1052 * If used descriptors were reclaimed or TX descriptors appear to be
1053 * outstanding, the ring is considered active and periodic reclamation
1054 * is necessary for now.
1055 */
1056 if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1057 /* Do not reschedule if the ring is being drained. */
1058 if (!vif->vif_tx_drain) {
1059 vioif_reclaim_restart(vif);
1060 }
1061 }
1062 mutex_exit(&vif->vif_mutex);
1063 }
1064
1065 static void
vioif_reclaim_restart(vioif_t * vif)1066 vioif_reclaim_restart(vioif_t *vif)
1067 {
1068 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1069 VERIFY(!vif->vif_tx_drain);
1070
1071 if (vif->vif_tx_reclaim_tid == 0) {
1072 vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
1073 MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
1074 }
1075 }
1076
1077 static void
vioif_tx_drain(vioif_t * vif)1078 vioif_tx_drain(vioif_t *vif)
1079 {
1080 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1081 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);
1082
1083 vif->vif_tx_drain = B_TRUE;
1084 /* Put a stop to the periodic reclaim if it is running */
1085 if (vif->vif_tx_reclaim_tid != 0) {
1086 timeout_id_t tid = vif->vif_tx_reclaim_tid;
1087
1088 /*
1089 * With vif_tx_drain set, there is no risk that a racing
1090 * vioif_reclaim_periodic() call will reschedule itself.
1091 *
1092 * Being part of the mc_stop hook also guarantees that
1093 * vioif_m_tx() will not be called to restart it.
1094 */
1095 vif->vif_tx_reclaim_tid = 0;
1096 mutex_exit(&vif->vif_mutex);
1097 (void) untimeout(tid);
1098 mutex_enter(&vif->vif_mutex);
1099 }
1100 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1101
1102 /*
1103 * Wait for all of the TX descriptors to be processed by the host so
1104 * they can be reclaimed.
1105 */
1106 while (vif->vif_ntxbufs_alloc > 0) {
1107 mutex_exit(&vif->vif_mutex);
1108 (void) vioif_reclaim_used_tx(vif);
1109 delay(5);
1110 mutex_enter(&vif->vif_mutex);
1111 }
1112 VERIFY(!vif->vif_tx_corked);
1113 VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
1114 VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
1115 }
1116
1117 static int
vioif_tx_inline(vioif_t * vif,vioif_txbuf_t * tb,mblk_t * mp,size_t msg_size)1118 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1119 {
1120 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1121
1122 VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
1123
1124 /*
1125 * Copy the message into the inline buffer and then free the message.
1126 */
1127 mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
1128
1129 if (virtio_chain_append(tb->tb_chain,
1130 virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
1131 msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1132 return (DDI_FAILURE);
1133 }
1134
1135 return (DDI_SUCCESS);
1136 }
1137
1138 static int
vioif_tx_external(vioif_t * vif,vioif_txbuf_t * tb,mblk_t * mp,size_t msg_size)1139 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1140 {
1141 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1142
1143 mblk_t *nmp = mp;
1144 tb->tb_ndmaext = 0;
1145
1146 while (nmp != NULL) {
1147 size_t len;
1148
1149 if ((len = MBLKL(nmp)) == 0) {
1150 /*
1151 * Skip any zero-length entries in the chain.
1152 */
1153 nmp = nmp->b_cont;
1154 continue;
1155 }
1156
1157 if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
1158 mutex_enter(&vif->vif_mutex);
1159 vif->vif_txfail_indirect_limit++;
1160 vif->vif_notxbuf++;
1161 mutex_exit(&vif->vif_mutex);
1162 goto fail;
1163 }
1164
1165 if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
1166 /*
1167 * Allocate a DMA handle for this slot.
1168 */
1169 if ((tb->tb_dmaext[tb->tb_ndmaext] =
1170 virtio_dma_alloc_nomem(vif->vif_virtio,
1171 &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
1172 mutex_enter(&vif->vif_mutex);
1173 vif->vif_notxbuf++;
1174 mutex_exit(&vif->vif_mutex);
1175 goto fail;
1176 }
1177 }
1178 virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];
1179
1180 if (virtio_dma_bind(extdma, nmp->b_rptr, len,
1181 DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
1182 DDI_SUCCESS) {
1183 mutex_enter(&vif->vif_mutex);
1184 vif->vif_txfail_dma_bind++;
1185 mutex_exit(&vif->vif_mutex);
1186 goto fail;
1187 }
1188
1189 for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
1190 uint64_t pa = virtio_dma_cookie_pa(extdma, n);
1191 size_t sz = virtio_dma_cookie_size(extdma, n);
1192
1193 if (virtio_chain_append(tb->tb_chain, pa, sz,
1194 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1195 mutex_enter(&vif->vif_mutex);
1196 vif->vif_txfail_indirect_limit++;
1197 vif->vif_notxbuf++;
1198 mutex_exit(&vif->vif_mutex);
1199 goto fail;
1200 }
1201 }
1202
1203 nmp = nmp->b_cont;
1204 }
1205
1206 /*
1207 * We need to keep the message around until we reclaim the buffer from
1208 * the device before freeing it.
1209 */
1210 tb->tb_mp = mp;
1211
1212 return (DDI_SUCCESS);
1213
1214 fail:
1215 for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
1216 if (tb->tb_dmaext[n] != NULL) {
1217 virtio_dma_unbind(tb->tb_dmaext[n]);
1218 }
1219 }
1220 tb->tb_ndmaext = 0;
1221
1222 freemsg(mp);
1223
1224 return (DDI_FAILURE);
1225 }
1226
1227 static boolean_t
vioif_send(vioif_t * vif,mblk_t * mp)1228 vioif_send(vioif_t *vif, mblk_t *mp)
1229 {
1230 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1231
1232 vioif_txbuf_t *tb = NULL;
1233 struct virtio_net_hdr *vnh = NULL;
1234 size_t msg_size = 0;
1235 uint32_t csum_start;
1236 uint32_t csum_stuff;
1237 uint32_t csum_flags;
1238 uint32_t lso_flags;
1239 uint32_t lso_mss;
1240 mblk_t *nmp;
1241 int ret;
1242 boolean_t lso_required = B_FALSE;
1243 struct ether_header *ether = (void *)mp->b_rptr;
1244
1245 for (nmp = mp; nmp; nmp = nmp->b_cont)
1246 msg_size += MBLKL(nmp);
1247
1248 if (vif->vif_tx_tso4 || vif->vif_tx_tso6) {
1249 mac_lso_get(mp, &lso_mss, &lso_flags);
1250 lso_required = (lso_flags & HW_LSO) != 0;
1251 }
1252
1253 mutex_enter(&vif->vif_mutex);
1254 if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
1255 vif->vif_notxbuf++;
1256 goto fail;
1257 }
1258 mutex_exit(&vif->vif_mutex);
1259
1260 /*
1261 * Use the inline buffer for the virtio net header. Zero the portion
1262 * of our DMA allocation prior to the packet data.
1263 */
1264 vnh = virtio_dma_va(tb->tb_dma, 0);
1265 bzero(vnh, VIOIF_HEADER_SKIP);
1266
1267 /*
1268 * For legacy devices, and those that have not negotiated
1269 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
1270 * descriptor entry to the rest of the buffer.
1271 */
1272 if (virtio_chain_append(tb->tb_chain,
1273 virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
1274 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1275 mutex_enter(&vif->vif_mutex);
1276 vif->vif_notxbuf++;
1277 goto fail;
1278 }
1279
1280 mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
1281
1282 /*
1283 * They want us to do the TCP/UDP csum calculation.
1284 */
1285 if (csum_flags & HCK_PARTIALCKSUM) {
1286 int eth_hsize;
1287
1288 /*
1289 * Did we ask for it?
1290 */
1291 ASSERT(vif->vif_tx_csum);
1292
1293 /*
1294 * We only asked for partial csum packets.
1295 */
1296 ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1297 ASSERT(!(csum_flags & HCK_FULLCKSUM));
1298
1299 if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
1300 eth_hsize = sizeof (struct ether_vlan_header);
1301 } else {
1302 eth_hsize = sizeof (struct ether_header);
1303 }
1304
1305 vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1306 vnh->vnh_csum_start = eth_hsize + csum_start;
1307 vnh->vnh_csum_offset = csum_stuff - csum_start;
1308 }
1309
1310 /*
1311 * Setup LSO fields if required.
1312 */
1313 if (lso_required) {
1314 mac_ether_offload_flags_t needed;
1315 mac_ether_offload_info_t meo;
1316 uint32_t cksum;
1317 size_t len;
1318 mblk_t *pullmp = NULL;
1319 tcpha_t *tcpha;
1320
1321 if (mac_ether_offload_info(mp, &meo) != 0) {
1322 goto fail;
1323 }
1324
1325 needed = MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET;
1326 if ((meo.meoi_flags & needed) != needed) {
1327 goto fail;
1328 }
1329
1330 if (meo.meoi_l4proto != IPPROTO_TCP) {
1331 goto fail;
1332 }
1333
1334 if (meo.meoi_l3proto == ETHERTYPE_IP && vif->vif_tx_tso4) {
1335 vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1336 } else if (meo.meoi_l3proto == ETHERTYPE_IPV6 &&
1337 vif->vif_tx_tso6) {
1338 vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1339 } else {
1340 goto fail;
1341 }
1342
1343 /*
1344 * The TCP stack does not include the length in the TCP
1345 * pseudo-header when it is performing LSO since hardware
1346 * generally asks for it to be removed (as it'll change).
1347 * Unfortunately, for virtio, we actually need it. This means we
1348 * need to go through and calculate the actual length and fix
1349 * things up. Because the virtio spec cares about the ECN flag
1350 * and indicating that, at least this means we'll have that
1351 * available as well.
1352 */
1353 if (MBLKL(mp) < vnh->vnh_hdr_len) {
1354 pullmp = msgpullup(mp, vnh->vnh_hdr_len);
1355 if (pullmp == NULL)
1356 goto fail;
1357 tcpha = (tcpha_t *)(pullmp->b_rptr + meo.meoi_l2hlen +
1358 meo.meoi_l3hlen);
1359 } else {
1360 tcpha = (tcpha_t *)(mp->b_rptr + meo.meoi_l2hlen +
1361 meo.meoi_l3hlen);
1362 }
1363
1364 len = meo.meoi_len - meo.meoi_l2hlen - meo.meoi_l3hlen;
1365 cksum = ntohs(tcpha->tha_sum) + len;
1366 cksum = (cksum >> 16) + (cksum & 0xffff);
1367 cksum = (cksum >> 16) + (cksum & 0xffff);
1368 tcpha->tha_sum = htons(cksum);
1369
1370 if (tcpha->tha_flags & TH_CWR) {
1371 vnh->vnh_gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1372 }
1373 vnh->vnh_gso_size = (uint16_t)lso_mss;
1374 vnh->vnh_hdr_len = meo.meoi_l2hlen + meo.meoi_l3hlen +
1375 meo.meoi_l4hlen;
1376
1377 freemsg(pullmp);
1378 }
1379
1380 /*
1381 * The device does not maintain its own statistics about broadcast or
1382 * multicast packets, so we have to check the destination address
1383 * ourselves.
1384 */
1385 if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
1386 mutex_enter(&vif->vif_mutex);
1387 if (ether_cmp(ðer->ether_dhost, vioif_broadcast) == 0) {
1388 vif->vif_brdcstxmt++;
1389 } else {
1390 vif->vif_multixmt++;
1391 }
1392 mutex_exit(&vif->vif_mutex);
1393 }
1394
1395 /*
1396 * For small packets, copy into the preallocated inline buffer rather
1397 * than incur the overhead of mapping. Note that both of these
1398 * functions ensure that "mp" is freed before returning.
1399 */
1400 if (msg_size < vif->vif_txcopy_thresh) {
1401 ret = vioif_tx_inline(vif, tb, mp, msg_size);
1402 } else {
1403 ret = vioif_tx_external(vif, tb, mp, msg_size);
1404 }
1405 mp = NULL;
1406
1407 mutex_enter(&vif->vif_mutex);
1408
1409 if (ret != DDI_SUCCESS) {
1410 goto fail;
1411 }
1412
1413 vif->vif_opackets++;
1414 vif->vif_obytes += msg_size;
1415 mutex_exit(&vif->vif_mutex);
1416
1417 virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
1418 virtio_chain_submit(tb->tb_chain, B_TRUE);
1419
1420 return (B_TRUE);
1421
1422 fail:
1423 vif->vif_oerrors++;
1424 if (tb != NULL) {
1425 vioif_txbuf_free(vif, tb);
1426 }
1427 mutex_exit(&vif->vif_mutex);
1428
1429 return (mp == NULL);
1430 }
1431
1432 static mblk_t *
vioif_m_tx(void * arg,mblk_t * mp)1433 vioif_m_tx(void *arg, mblk_t *mp)
1434 {
1435 vioif_t *vif = arg;
1436 mblk_t *nmp;
1437
1438 /*
1439 * Prior to attempting to send any more frames, do a reclaim to pick up
1440 * any descriptors which have been processed by the host.
1441 */
1442 if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1443 (void) vioif_reclaim_used_tx(vif);
1444 }
1445
1446 while (mp != NULL) {
1447 nmp = mp->b_next;
1448 mp->b_next = NULL;
1449
1450 if (!vioif_send(vif, mp)) {
1451 /*
1452 * If there are no descriptors available, try to
1453 * reclaim some, allowing a retry of the send if some
1454 * are found.
1455 */
1456 mp->b_next = nmp;
1457 if (vioif_reclaim_used_tx(vif) != 0) {
1458 continue;
1459 }
1460
1461 /*
1462 * Otherwise, enable the TX ring interrupt so that as
1463 * soon as a descriptor becomes available, transmission
1464 * can begin again. For safety, make sure the periodic
1465 * reclaim is running as well.
1466 */
1467 mutex_enter(&vif->vif_mutex);
1468 vif->vif_tx_corked = B_TRUE;
1469 virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
1470 vioif_reclaim_restart(vif);
1471 mutex_exit(&vif->vif_mutex);
1472 return (mp);
1473 }
1474 mp = nmp;
1475 }
1476
1477 /* Ensure the periodic reclaim has been started. */
1478 mutex_enter(&vif->vif_mutex);
1479 vioif_reclaim_restart(vif);
1480 mutex_exit(&vif->vif_mutex);
1481
1482 return (NULL);
1483 }
1484
1485 static int
vioif_m_start(void * arg)1486 vioif_m_start(void *arg)
1487 {
1488 vioif_t *vif = arg;
1489
1490 mutex_enter(&vif->vif_mutex);
1491
1492 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
1493 vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;
1494
1495 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1496
1497 virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);
1498
1499 /*
1500 * Starting interrupts on the TX virtqueue is unnecessary at this time.
1501 * Descriptor reclamation is handling during transmit, via a periodic
1502 * timer, and when resources are tight, via the then-enabled interrupt.
1503 */
1504 vif->vif_tx_drain = B_FALSE;
1505
1506 /*
1507 * Add as many receive buffers as we can to the receive queue. If we
1508 * cannot add any, it may be because we have stopped and started again
1509 * and the descriptors are all in the queue already.
1510 */
1511 (void) vioif_add_rx(vif);
1512
1513 mutex_exit(&vif->vif_mutex);
1514 return (DDI_SUCCESS);
1515 }
1516
1517 static void
vioif_m_stop(void * arg)1518 vioif_m_stop(void *arg)
1519 {
1520 vioif_t *vif = arg;
1521
1522 mutex_enter(&vif->vif_mutex);
1523
1524 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
1525 vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;
1526
1527 /* Ensure all TX descriptors have been processed and reclaimed */
1528 vioif_tx_drain(vif);
1529
1530 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1531
1532 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1533 mutex_exit(&vif->vif_mutex);
1534 }
1535
1536 static int
vioif_m_stat(void * arg,uint_t stat,uint64_t * val)1537 vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
1538 {
1539 vioif_t *vif = arg;
1540
1541 switch (stat) {
1542 case MAC_STAT_IERRORS:
1543 *val = vif->vif_ierrors;
1544 break;
1545 case MAC_STAT_OERRORS:
1546 *val = vif->vif_oerrors;
1547 break;
1548 case MAC_STAT_MULTIRCV:
1549 *val = vif->vif_multircv;
1550 break;
1551 case MAC_STAT_BRDCSTRCV:
1552 *val = vif->vif_brdcstrcv;
1553 break;
1554 case MAC_STAT_MULTIXMT:
1555 *val = vif->vif_multixmt;
1556 break;
1557 case MAC_STAT_BRDCSTXMT:
1558 *val = vif->vif_brdcstxmt;
1559 break;
1560 case MAC_STAT_IPACKETS:
1561 *val = vif->vif_ipackets;
1562 break;
1563 case MAC_STAT_RBYTES:
1564 *val = vif->vif_rbytes;
1565 break;
1566 case MAC_STAT_OPACKETS:
1567 *val = vif->vif_opackets;
1568 break;
1569 case MAC_STAT_OBYTES:
1570 *val = vif->vif_obytes;
1571 break;
1572 case MAC_STAT_NORCVBUF:
1573 *val = vif->vif_norecvbuf;
1574 break;
1575 case MAC_STAT_NOXMTBUF:
1576 *val = vif->vif_notxbuf;
1577 break;
1578 case MAC_STAT_IFSPEED:
1579 /* always 1 Gbit */
1580 *val = 1000000000ULL;
1581 break;
1582 case ETHER_STAT_LINK_DUPLEX:
1583 /* virtual device, always full-duplex */
1584 *val = LINK_DUPLEX_FULL;
1585 break;
1586
1587 default:
1588 return (ENOTSUP);
1589 }
1590
1591 return (DDI_SUCCESS);
1592 }
1593
1594 static int
vioif_m_setprop(void * arg,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)1595 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1596 uint_t pr_valsize, const void *pr_val)
1597 {
1598 vioif_t *vif = arg;
1599
1600 switch (pr_num) {
1601 case MAC_PROP_MTU: {
1602 int r;
1603 uint32_t mtu;
1604 if (pr_valsize < sizeof (mtu)) {
1605 return (EOVERFLOW);
1606 }
1607 bcopy(pr_val, &mtu, sizeof (mtu));
1608
1609 if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
1610 return (EINVAL);
1611 }
1612
1613 mutex_enter(&vif->vif_mutex);
1614 if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
1615 vif->vif_mtu = mtu;
1616 }
1617 mutex_exit(&vif->vif_mutex);
1618
1619 return (r);
1620 }
1621
1622 case MAC_PROP_PRIVATE: {
1623 long max, result;
1624 uint_t *resp;
1625 char *endptr;
1626
1627 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1628 max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
1629 resp = &vif->vif_txcopy_thresh;
1630 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1631 max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
1632 resp = &vif->vif_rxcopy_thresh;
1633 } else {
1634 return (ENOTSUP);
1635 }
1636
1637 if (pr_val == NULL) {
1638 return (EINVAL);
1639 }
1640
1641 if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
1642 *endptr != '\0' || result < 0 || result > max) {
1643 return (EINVAL);
1644 }
1645
1646 mutex_enter(&vif->vif_mutex);
1647 *resp = result;
1648 mutex_exit(&vif->vif_mutex);
1649
1650 return (0);
1651 }
1652
1653 default:
1654 return (ENOTSUP);
1655 }
1656 }
1657
1658 static int
vioif_m_getprop(void * arg,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,void * pr_val)1659 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1660 uint_t pr_valsize, void *pr_val)
1661 {
1662 vioif_t *vif = arg;
1663
1664 switch (pr_num) {
1665 case MAC_PROP_PRIVATE: {
1666 uint_t value;
1667
1668 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1669 value = vif->vif_txcopy_thresh;
1670 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1671 value = vif->vif_rxcopy_thresh;
1672 } else {
1673 return (ENOTSUP);
1674 }
1675
1676 if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
1677 return (EOVERFLOW);
1678 }
1679
1680 return (0);
1681 }
1682
1683 default:
1684 return (ENOTSUP);
1685 }
1686 }
1687
1688 static void
vioif_m_propinfo(void * arg,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)1689 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1690 mac_prop_info_handle_t prh)
1691 {
1692 vioif_t *vif = arg;
1693 char valstr[64];
1694 int value;
1695
1696 switch (pr_num) {
1697 case MAC_PROP_MTU:
1698 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1699 mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
1700 return;
1701
1702 case MAC_PROP_PRIVATE:
1703 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1704 value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1705 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1706 value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1707 } else {
1708 /*
1709 * We do not recognise this private property name.
1710 */
1711 return;
1712 }
1713 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1714 (void) snprintf(valstr, sizeof (valstr), "%d", value);
1715 mac_prop_info_set_default_str(prh, valstr);
1716 return;
1717
1718 default:
1719 return;
1720 }
1721 }
1722
1723 static boolean_t
vioif_m_getcapab(void * arg,mac_capab_t cap,void * cap_data)1724 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1725 {
1726 vioif_t *vif = arg;
1727
1728 switch (cap) {
1729 case MAC_CAPAB_HCKSUM: {
1730 if (!vif->vif_tx_csum) {
1731 return (B_FALSE);
1732 }
1733
1734 *(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
1735
1736 return (B_TRUE);
1737 }
1738
1739 case MAC_CAPAB_LSO: {
1740 if (!vif->vif_tx_tso4) {
1741 return (B_FALSE);
1742 }
1743
1744 mac_capab_lso_t *lso = cap_data;
1745 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | LSO_TX_BASIC_TCP_IPV6;
1746 lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
1747 lso->lso_basic_tcp_ipv6.lso_max = VIOIF_RX_DATA_SIZE;
1748
1749 return (B_TRUE);
1750 }
1751
1752 default:
1753 return (B_FALSE);
1754 }
1755 }
1756
1757 static boolean_t
vioif_has_feature(vioif_t * vif,uint32_t feature)1758 vioif_has_feature(vioif_t *vif, uint32_t feature)
1759 {
1760 return (virtio_feature_present(vif->vif_virtio, feature));
1761 }
1762
1763 /*
1764 * Read the primary MAC address from the device if one is provided. If not,
1765 * generate a random locally administered MAC address and write it back to the
1766 * device.
1767 */
1768 static void
vioif_get_mac(vioif_t * vif)1769 vioif_get_mac(vioif_t *vif)
1770 {
1771 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1772
1773 if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
1774 for (uint_t i = 0; i < ETHERADDRL; i++) {
1775 vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
1776 VIRTIO_NET_CONFIG_MAC + i);
1777 }
1778 vif->vif_mac_from_host = 1;
1779
1780 return;
1781 }
1782
1783 /* Get a few random bytes */
1784 (void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
1785 /* Make sure it's a unicast MAC */
1786 vif->vif_mac[0] &= ~1;
1787 /* Set the "locally administered" bit */
1788 vif->vif_mac[1] |= 2;
1789
1790 /*
1791 * Write the random MAC address back to the device.
1792 */
1793 for (uint_t i = 0; i < ETHERADDRL; i++) {
1794 virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
1795 vif->vif_mac[i]);
1796 }
1797 vif->vif_mac_from_host = 0;
1798
1799 dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
1800 "%02x:%02x:%02x:%02x:%02x:%02x",
1801 (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
1802 (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
1803 (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
1804 }
1805
1806 /*
1807 * Virtqueue interrupt handlers
1808 */
1809 static uint_t
vioif_rx_handler(caddr_t arg0,caddr_t arg1)1810 vioif_rx_handler(caddr_t arg0, caddr_t arg1)
1811 {
1812 vioif_t *vif = (vioif_t *)arg0;
1813
1814 mutex_enter(&vif->vif_mutex);
1815 (void) vioif_process_rx(vif);
1816
1817 /*
1818 * Attempt to replenish the receive queue. If we cannot add any
1819 * descriptors here, it may be because all of the recently received
1820 * packets were loaned up to the networking stack.
1821 */
1822 (void) vioif_add_rx(vif);
1823 mutex_exit(&vif->vif_mutex);
1824
1825 return (DDI_INTR_CLAIMED);
1826 }
1827
1828 static uint_t
vioif_tx_handler(caddr_t arg0,caddr_t arg1)1829 vioif_tx_handler(caddr_t arg0, caddr_t arg1)
1830 {
1831 vioif_t *vif = (vioif_t *)arg0;
1832
1833 /*
1834 * The TX interrupt could race with other reclamation activity, so
1835 * interpreting the return value is unimportant.
1836 */
1837 (void) vioif_reclaim_used_tx(vif);
1838
1839 return (DDI_INTR_CLAIMED);
1840 }
1841
1842 static void
vioif_check_features(vioif_t * vif)1843 vioif_check_features(vioif_t *vif)
1844 {
1845 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1846
1847 vif->vif_tx_csum = 0;
1848 vif->vif_tx_tso4 = 0;
1849 vif->vif_tx_tso6 = 0;
1850
1851 if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
1852 /*
1853 * The host will accept packets with partial checksums from us.
1854 */
1855 vif->vif_tx_csum = 1;
1856
1857 /*
1858 * The legacy GSO feature represents the combination of
1859 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
1860 */
1861 boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
1862 boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
1863 boolean_t tso6 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO6);
1864 boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
1865
1866 /*
1867 * Explicit congestion notification (ECN) is configured
1868 * globally; see "tcp_ecn_permitted". As we cannot currently
1869 * request that the stack disable ECN on a per interface basis,
1870 * we require the device to support the combination of
1871 * segmentation offload and ECN support.
1872 */
1873 if (gso) {
1874 vif->vif_tx_tso4 = 1;
1875 vif->vif_tx_tso6 = 1;
1876 }
1877 if (tso4 && ecn) {
1878 vif->vif_tx_tso4 = 1;
1879 }
1880 if (tso6 && ecn) {
1881 vif->vif_tx_tso6 = 1;
1882 }
1883 }
1884
1885 if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ)) {
1886 vif->vif_has_ctrlq = 1;
1887
1888 /*
1889 * The VIRTIO_NET_F_CTRL_VQ feature must be enabled if there's
1890 * any chance of the VIRTIO_NET_F_CTRL_RX being enabled.
1891 */
1892 if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_RX))
1893 vif->vif_has_ctrlq_rx = 1;
1894 }
1895 }
1896
1897 static int
vioif_select_interrupt_types(void)1898 vioif_select_interrupt_types(void)
1899 {
1900 id_t id;
1901 smbios_system_t sys;
1902 smbios_info_t info;
1903
1904 if (vioif_allowed_int_types != -1) {
1905 /*
1906 * If this value was tuned via /etc/system or the debugger,
1907 * use the provided value directly.
1908 */
1909 return (vioif_allowed_int_types);
1910 }
1911
1912 if (ksmbios == NULL ||
1913 (id = smbios_info_system(ksmbios, &sys)) == SMB_ERR ||
1914 smbios_info_common(ksmbios, id, &info) == SMB_ERR) {
1915 /*
1916 * The system may not have valid SMBIOS data, so ignore a
1917 * failure here.
1918 */
1919 return (VIRTIO_ANY_INTR_TYPE);
1920 }
1921
1922 if (strcmp(info.smbi_manufacturer, "Google") == 0 &&
1923 strcmp(info.smbi_product, "Google Compute Engine") == 0) {
1924 /*
1925 * An undiagnosed issue with the Google Compute Engine (GCE)
1926 * hypervisor exists. In this environment, no RX interrupts
1927 * are received if MSI-X handlers are installed. This does not
1928 * appear to be true for the Virtio SCSI driver. Fixed
1929 * interrupts do appear to work, so we fall back for now:
1930 */
1931 return (DDI_INTR_TYPE_FIXED);
1932 }
1933
1934 return (VIRTIO_ANY_INTR_TYPE);
1935 }
1936
1937 static int
vioif_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)1938 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1939 {
1940 int ret;
1941 vioif_t *vif;
1942 virtio_t *vio;
1943 mac_register_t *macp = NULL;
1944
1945 if (cmd != DDI_ATTACH) {
1946 return (DDI_FAILURE);
1947 }
1948
1949 if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
1950 NULL) {
1951 return (DDI_FAILURE);
1952 }
1953
1954 vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
1955 vif->vif_dip = dip;
1956 vif->vif_virtio = vio;
1957 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1958 ddi_set_driver_private(dip, vif);
1959
1960 if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
1961 "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
1962 (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
1963 "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1964 goto fail;
1965 }
1966
1967 if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ) &&
1968 (vif->vif_ctrl_vq = virtio_queue_alloc(vio,
1969 VIRTIO_NET_VIRTQ_CONTROL, "ctrlq", NULL, vif,
1970 B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1971 goto fail;
1972 }
1973
1974 if (virtio_init_complete(vio, vioif_select_interrupt_types()) !=
1975 DDI_SUCCESS) {
1976 dev_err(dip, CE_WARN, "failed to complete Virtio init");
1977 goto fail;
1978 }
1979
1980 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1981 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1982 if (vif->vif_ctrl_vq != NULL)
1983 virtio_queue_no_interrupt(vif->vif_ctrl_vq, B_TRUE);
1984
1985 mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
1986 mutex_enter(&vif->vif_mutex);
1987
1988 vioif_get_mac(vif);
1989
1990 vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1991 vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1992
1993 if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
1994 vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
1995 } else {
1996 vif->vif_mtu_max = ETHERMTU;
1997 }
1998
1999 vif->vif_mtu = ETHERMTU;
2000 if (vif->vif_mtu > vif->vif_mtu_max) {
2001 vif->vif_mtu = vif->vif_mtu_max;
2002 }
2003
2004 vioif_check_features(vif);
2005
2006 if (vioif_alloc_bufs(vif) != 0) {
2007 mutex_exit(&vif->vif_mutex);
2008 dev_err(dip, CE_WARN, "failed to allocate memory");
2009 goto fail;
2010 }
2011
2012 mutex_exit(&vif->vif_mutex);
2013
2014 if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
2015 dev_err(dip, CE_WARN, "failed to enable interrupts");
2016 goto fail;
2017 }
2018
2019 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2020 dev_err(dip, CE_WARN, "failed to allocate a mac_register");
2021 goto fail;
2022 }
2023
2024 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2025 macp->m_driver = vif;
2026 macp->m_dip = dip;
2027 macp->m_src_addr = vif->vif_mac;
2028 macp->m_callbacks = &vioif_mac_callbacks;
2029 macp->m_min_sdu = 0;
2030 macp->m_max_sdu = vif->vif_mtu;
2031 macp->m_margin = VLAN_TAGSZ;
2032 macp->m_priv_props = vioif_priv_props;
2033
2034 if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
2035 dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
2036 goto fail;
2037 }
2038 mac_free(macp);
2039
2040 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
2041
2042 return (DDI_SUCCESS);
2043
2044 fail:
2045 vioif_free_bufs(vif);
2046 if (macp != NULL) {
2047 mac_free(macp);
2048 }
2049 (void) virtio_fini(vio, B_TRUE);
2050 kmem_free(vif, sizeof (*vif));
2051 return (DDI_FAILURE);
2052 }
2053
2054 static int
vioif_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)2055 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2056 {
2057 int r;
2058 vioif_t *vif;
2059
2060 if (cmd != DDI_DETACH) {
2061 return (DDI_FAILURE);
2062 }
2063
2064 if ((vif = ddi_get_driver_private(dip)) == NULL) {
2065 return (DDI_FAILURE);
2066 }
2067
2068 mutex_enter(&vif->vif_mutex);
2069 if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
2070 dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
2071 mutex_exit(&vif->vif_mutex);
2072 return (DDI_FAILURE);
2073 }
2074
2075 /*
2076 * There should be no outstanding transmit buffers once the NIC is
2077 * completely stopped.
2078 */
2079 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
2080
2081 /*
2082 * Though we cannot claw back all of the receive buffers until we reset
2083 * the device, we must ensure all those loaned to MAC have been
2084 * returned before calling mac_unregister().
2085 */
2086 if (vif->vif_nrxbufs_onloan > 0) {
2087 dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
2088 "cannot detach", vif->vif_nrxbufs_onloan);
2089 mutex_exit(&vif->vif_mutex);
2090 return (DDI_FAILURE);
2091 }
2092
2093 if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
2094 dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
2095 return (DDI_FAILURE);
2096 }
2097
2098 /*
2099 * Shut down the device so that we can recover any previously
2100 * submitted receive buffers.
2101 */
2102 virtio_shutdown(vif->vif_virtio);
2103 for (;;) {
2104 virtio_chain_t *vic;
2105
2106 if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
2107 break;
2108 }
2109
2110 vioif_rxbuf_t *rb = virtio_chain_data(vic);
2111 vioif_rxbuf_free(vif, rb);
2112 }
2113
2114 /*
2115 * vioif_free_bufs() must be called before virtio_fini()
2116 * as it uses virtio_chain_free() which itself depends on some
2117 * virtio data structures still being around.
2118 */
2119 vioif_free_bufs(vif);
2120 (void) virtio_fini(vif->vif_virtio, B_FALSE);
2121
2122 mutex_exit(&vif->vif_mutex);
2123 mutex_destroy(&vif->vif_mutex);
2124
2125 kmem_free(vif, sizeof (*vif));
2126
2127 return (DDI_SUCCESS);
2128 }
2129
2130 static int
vioif_quiesce(dev_info_t * dip)2131 vioif_quiesce(dev_info_t *dip)
2132 {
2133 vioif_t *vif;
2134
2135 if ((vif = ddi_get_driver_private(dip)) == NULL)
2136 return (DDI_FAILURE);
2137
2138 return (virtio_quiesce(vif->vif_virtio));
2139 }
2140
2141 int
_init(void)2142 _init(void)
2143 {
2144 int ret;
2145
2146 mac_init_ops(&vioif_dev_ops, "vioif");
2147
2148 if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
2149 mac_fini_ops(&vioif_dev_ops);
2150 }
2151
2152 return (ret);
2153 }
2154
2155 int
_fini(void)2156 _fini(void)
2157 {
2158 int ret;
2159
2160 if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
2161 mac_fini_ops(&vioif_dev_ops);
2162 }
2163
2164 return (ret);
2165 }
2166
2167 int
_info(struct modinfo * modinfop)2168 _info(struct modinfo *modinfop)
2169 {
2170 return (mod_info(&vioif_modlinkage, modinfop));
2171 }
2172