1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2013 Nexenta Inc. All rights reserved.
14 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
15 * Copyright 2021 Joyent, Inc.
16 * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
17 */
18
19 /* Based on the NetBSD virtio driver by Minoura Makoto. */
20 /*
21 * Copyright (c) 2010 Minoura Makoto.
22 * All rights reserved.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
34 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
35 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
36 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
38 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
39 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
40 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
42 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 */
44
45 /*
46 * VIRTIO NETWORK DRIVER
47 */
48
49 #include <sys/types.h>
50 #include <sys/errno.h>
51 #include <sys/param.h>
52 #include <sys/stropts.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/kmem.h>
56 #include <sys/conf.h>
57 #include <sys/devops.h>
58 #include <sys/ksynch.h>
59 #include <sys/stat.h>
60 #include <sys/modctl.h>
61 #include <sys/debug.h>
62 #include <sys/pci.h>
63 #include <sys/ethernet.h>
64 #include <sys/vlan.h>
65 #include <sys/sysmacros.h>
66 #include <sys/smbios.h>
67
68 #include <sys/dlpi.h>
69 #include <sys/taskq.h>
70
71 #include <sys/pattr.h>
72 #include <sys/strsun.h>
73
74 #include <sys/random.h>
75 #include <sys/containerof.h>
76 #include <sys/stream.h>
77 #include <inet/tcp.h>
78
79 #include <sys/mac.h>
80 #include <sys/mac_provider.h>
81 #include <sys/mac_ether.h>
82
83 #include "virtio.h"
84 #include "vioif.h"
85
86 /*
87 * While most hypervisors support the control queue, older versions of bhyve
88 * on illumos did not. To allow the historic behaviour of the illumos vioif
89 * driver, the following tuneable causes us to pretend that the request always
90 * succeeds if the underlying virtual device does not have support.
91 */
92 int vioif_fake_promisc_success = 1;
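/*
 * For example, an operator who would rather see a hard failure in that case
 * could disable this behaviour with a line such as the following in
 * /etc/system:
 *
 *	set vioif:vioif_fake_promisc_success = 0
 */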
93
94 static int vioif_quiesce(dev_info_t *);
95 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
96 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
97 static boolean_t vioif_has_feature(vioif_t *, uint32_t);
98 static void vioif_reclaim_restart(vioif_t *);
99 static int vioif_m_stat(void *, uint_t, uint64_t *);
100 static void vioif_m_stop(void *);
101 static int vioif_m_start(void *);
102 static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
103 static int vioif_m_setpromisc(void *, boolean_t);
104 static int vioif_m_unicst(void *, const uint8_t *);
105 static mblk_t *vioif_m_tx(void *, mblk_t *);
106 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
107 const void *);
108 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
109 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
110 mac_prop_info_handle_t);
111 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
112 static uint_t vioif_add_rx(vioif_t *);
113
114
115 static struct cb_ops vioif_cb_ops = {
116 .cb_rev = CB_REV,
117 .cb_flag = D_MP | D_NEW,
118
119 .cb_open = nulldev,
120 .cb_close = nulldev,
121 .cb_strategy = nodev,
122 .cb_print = nodev,
123 .cb_dump = nodev,
124 .cb_read = nodev,
125 .cb_write = nodev,
126 .cb_ioctl = nodev,
127 .cb_devmap = nodev,
128 .cb_mmap = nodev,
129 .cb_segmap = nodev,
130 .cb_chpoll = nochpoll,
131 .cb_prop_op = ddi_prop_op,
132 .cb_str = NULL,
133 .cb_aread = nodev,
134 .cb_awrite = nodev,
135 };
136
137 static struct dev_ops vioif_dev_ops = {
138 .devo_rev = DEVO_REV,
139 .devo_refcnt = 0,
140
141 .devo_attach = vioif_attach,
142 .devo_detach = vioif_detach,
143 .devo_quiesce = vioif_quiesce,
144
145 .devo_cb_ops = &vioif_cb_ops,
146
147 .devo_getinfo = NULL,
148 .devo_identify = nulldev,
149 .devo_probe = nulldev,
150 .devo_reset = nodev,
151 .devo_bus_ops = NULL,
152 .devo_power = NULL,
153 };
154
155 static struct modldrv vioif_modldrv = {
156 .drv_modops = &mod_driverops,
157 .drv_linkinfo = "VIRTIO network driver",
158 .drv_dev_ops = &vioif_dev_ops
159 };
160
161 static struct modlinkage vioif_modlinkage = {
162 .ml_rev = MODREV_1,
163 .ml_linkage = { &vioif_modldrv, NULL }
164 };
165
166 static mac_callbacks_t vioif_mac_callbacks = {
167 .mc_getstat = vioif_m_stat,
168 .mc_start = vioif_m_start,
169 .mc_stop = vioif_m_stop,
170 .mc_setpromisc = vioif_m_setpromisc,
171 .mc_multicst = vioif_m_multicst,
172 .mc_unicst = vioif_m_unicst,
173 .mc_tx = vioif_m_tx,
174
175 .mc_callbacks = (MC_GETCAPAB | MC_SETPROP |
176 MC_GETPROP | MC_PROPINFO),
177 .mc_getcapab = vioif_m_getcapab,
178 .mc_setprop = vioif_m_setprop,
179 .mc_getprop = vioif_m_getprop,
180 .mc_propinfo = vioif_m_propinfo,
181 };
182
183 static const uchar_t vioif_broadcast[ETHERADDRL] = {
184 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
185 };
186
187 /*
188 * Interval for the periodic TX reclaim.
189 */
190 uint_t vioif_reclaim_ms = 200;
191
192 /*
193 * Allow the operator to override the kinds of interrupts we'll use for
194 * vioif. This value defaults to -1 so that it can be overridden to 0 in
195 * /etc/system.
196 */
197 int vioif_allowed_int_types = -1;
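/*
 * For example, restricting the driver to fixed interrupts (DDI_INTR_TYPE_FIXED
 * has the value 1) could be done with a line such as the following in
 * /etc/system:
 *
 *	set vioif:vioif_allowed_int_types = 1
 */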
198
199 /*
200 * DMA attribute template for transmit and receive buffers. The SGL entry
201 * count will be modified before using the template. Note that these
202 * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
203 * received frames at the correct offset for the networking stack.
204 */
205 ddi_dma_attr_t vioif_dma_attr_bufs = {
206 .dma_attr_version = DMA_ATTR_V0,
207 .dma_attr_addr_lo = 0x0000000000000000,
208 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
209 .dma_attr_count_max = 0x00000000FFFFFFFF,
210 .dma_attr_align = VIOIF_HEADER_ALIGN,
211 .dma_attr_burstsizes = 1,
212 .dma_attr_minxfer = 1,
213 .dma_attr_maxxfer = 0x00000000FFFFFFFF,
214 .dma_attr_seg = 0x00000000FFFFFFFF,
215 .dma_attr_sgllen = 0,
216 .dma_attr_granular = 1,
217 .dma_attr_flags = 0
218 };
219
220 /*
221 * DMA attributes for mapping larger transmit buffers from the networking
222 * stack. The requirements are quite loose, but note that the SGL entry length
223 * field is 32-bit.
224 */
225 ddi_dma_attr_t vioif_dma_attr_external = {
226 .dma_attr_version = DMA_ATTR_V0,
227 .dma_attr_addr_lo = 0x0000000000000000,
228 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
229 .dma_attr_count_max = 0x00000000FFFFFFFF,
230 .dma_attr_align = 1,
231 .dma_attr_burstsizes = 1,
232 .dma_attr_minxfer = 1,
233 .dma_attr_maxxfer = 0x00000000FFFFFFFF,
234 .dma_attr_seg = 0x00000000FFFFFFFF,
235 .dma_attr_sgllen = VIOIF_MAX_SEGS - 1,
236 .dma_attr_granular = 1,
237 .dma_attr_flags = 0
238 };
239
240
241 /*
242 * VIRTIO NET MAC PROPERTIES
243 */
244 #define VIOIF_MACPROP_TXCOPY_THRESH "_txcopy_thresh"
245 #define VIOIF_MACPROP_TXCOPY_THRESH_DEF 300
246 #define VIOIF_MACPROP_TXCOPY_THRESH_MAX 640
247
248 #define VIOIF_MACPROP_RXCOPY_THRESH "_rxcopy_thresh"
249 #define VIOIF_MACPROP_RXCOPY_THRESH_DEF 300
250 #define VIOIF_MACPROP_RXCOPY_THRESH_MAX 640
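/*
 * These thresholds are exposed as private MAC properties, so they can also be
 * adjusted at runtime; e.g. (the link name here is hypothetical):
 *
 *	dladm set-linkprop -p _txcopy_thresh=256 vioif0
 */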
251
252 static char *vioif_priv_props[] = {
253 VIOIF_MACPROP_TXCOPY_THRESH,
254 VIOIF_MACPROP_RXCOPY_THRESH,
255 NULL
256 };
257
258
259 static vioif_txbuf_t *
260 vioif_txbuf_alloc(vioif_t *vif)
261 {
262 vioif_txbuf_t *tb;
263
264 VERIFY(MUTEX_HELD(&vif->vif_mutex));
265
266 if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
267 vif->vif_ntxbufs_alloc++;
268 }
269
270 return (tb);
271 }
272
273 static void
274 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
275 {
276 VERIFY(MUTEX_HELD(&vif->vif_mutex));
277
278 VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
279 vif->vif_ntxbufs_alloc--;
280
281 virtio_chain_clear(tb->tb_chain);
282 list_insert_head(&vif->vif_txbufs, tb);
283 }
284
285 static vioif_rxbuf_t *
286 vioif_rxbuf_alloc(vioif_t *vif)
287 {
288 vioif_rxbuf_t *rb;
289
290 VERIFY(MUTEX_HELD(&vif->vif_mutex));
291
292 if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
293 vif->vif_nrxbufs_alloc++;
294 }
295
296 return (rb);
297 }
298
299 static void
300 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
301 {
302 VERIFY(MUTEX_HELD(&vif->vif_mutex));
303
304 VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
305 vif->vif_nrxbufs_alloc--;
306
307 virtio_chain_clear(rb->rb_chain);
308 list_insert_head(&vif->vif_rxbufs, rb);
309 }
310
311 static void
312 vioif_rx_free_callback(caddr_t free_arg)
313 {
314 vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
315 vioif_t *vif = rb->rb_vioif;
316
317 mutex_enter(&vif->vif_mutex);
318
319 /*
320 * Return this receive buffer to the free list.
321 */
322 vioif_rxbuf_free(vif, rb);
323
324 VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
325 vif->vif_nrxbufs_onloan--;
326
327 /*
328 * Attempt to replenish the receive queue with at least the buffer we
329 * just freed. There isn't a great way to deal with failure here, but
330 * because we loan out at most half of the buffers, some should always
331 * be available even if this fails.
332 */
333 (void) vioif_add_rx(vif);
334
335 mutex_exit(&vif->vif_mutex);
336 }
337
338 static vioif_ctrlbuf_t *
339 vioif_ctrlbuf_alloc(vioif_t *vif)
340 {
341 vioif_ctrlbuf_t *cb;
342
343 VERIFY(MUTEX_HELD(&vif->vif_mutex));
344
345 if ((cb = list_remove_head(&vif->vif_ctrlbufs)) != NULL) {
346 vif->vif_nctrlbufs_alloc++;
347 }
348
349 return (cb);
350 }
351
352 static void
353 vioif_ctrlbuf_free(vioif_t *vif, vioif_ctrlbuf_t *cb)
354 {
355 VERIFY(MUTEX_HELD(&vif->vif_mutex));
356
357 VERIFY3U(vif->vif_nctrlbufs_alloc, >, 0);
358 vif->vif_nctrlbufs_alloc--;
359
360 virtio_chain_clear(cb->cb_chain);
361 list_insert_head(&vif->vif_ctrlbufs, cb);
362 }
363
364 static void
365 vioif_free_bufs(vioif_t *vif)
366 {
367 VERIFY(MUTEX_HELD(&vif->vif_mutex));
368
369 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
370 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
371 vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];
372
373 /*
374 * Ensure that this txbuf is now in the free list:
375 */
376 VERIFY(list_link_active(&tb->tb_link));
377 list_remove(&vif->vif_txbufs, tb);
378
379 /*
380 * We should not have an mblk chain at this point.
381 */
382 VERIFY3P(tb->tb_mp, ==, NULL);
383
384 if (tb->tb_dma != NULL) {
385 virtio_dma_free(tb->tb_dma);
386 tb->tb_dma = NULL;
387 }
388
389 if (tb->tb_chain != NULL) {
390 virtio_chain_free(tb->tb_chain);
391 tb->tb_chain = NULL;
392 }
393
394 if (tb->tb_dmaext != NULL) {
395 for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
396 if (tb->tb_dmaext[j] != NULL) {
397 virtio_dma_free(
398 tb->tb_dmaext[j]);
399 tb->tb_dmaext[j] = NULL;
400 }
401 }
402
403 kmem_free(tb->tb_dmaext,
404 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
405 tb->tb_dmaext = NULL;
406 tb->tb_dmaext_capacity = 0;
407 }
408 }
409 VERIFY(list_is_empty(&vif->vif_txbufs));
410 if (vif->vif_txbufs_mem != NULL) {
411 kmem_free(vif->vif_txbufs_mem,
412 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
413 vif->vif_txbufs_mem = NULL;
414 vif->vif_txbufs_capacity = 0;
415 }
416
417 VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
418 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
419 vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];
420
421 /*
422 * Ensure that this rxbuf is now in the free list:
423 */
424 VERIFY(list_link_active(&rb->rb_link));
425 list_remove(&vif->vif_rxbufs, rb);
426
427 if (rb->rb_dma != NULL) {
428 virtio_dma_free(rb->rb_dma);
429 rb->rb_dma = NULL;
430 }
431
432 if (rb->rb_chain != NULL) {
433 virtio_chain_free(rb->rb_chain);
434 rb->rb_chain = NULL;
435 }
436 }
437 VERIFY(list_is_empty(&vif->vif_rxbufs));
438 if (vif->vif_rxbufs_mem != NULL) {
439 kmem_free(vif->vif_rxbufs_mem,
440 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
441 vif->vif_rxbufs_mem = NULL;
442 vif->vif_rxbufs_capacity = 0;
443 }
444
445 if (vif->vif_has_ctrlq) {
446 VERIFY3U(vif->vif_nctrlbufs_alloc, ==, 0);
447 for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
448 vioif_ctrlbuf_t *cb = &vif->vif_ctrlbufs_mem[i];
449
450 /*
451 * Ensure that this ctrlbuf is now in the free list
452 */
453 VERIFY(list_link_active(&cb->cb_link));
454 list_remove(&vif->vif_ctrlbufs, cb);
455
456 if (cb->cb_dma != NULL) {
457 virtio_dma_free(cb->cb_dma);
458 cb->cb_dma = NULL;
459 }
460
461 if (cb->cb_chain != NULL) {
462 virtio_chain_free(cb->cb_chain);
463 cb->cb_chain = NULL;
464 }
465 }
466 VERIFY(list_is_empty(&vif->vif_ctrlbufs));
467 if (vif->vif_ctrlbufs_mem != NULL) {
468 kmem_free(vif->vif_ctrlbufs_mem,
469 sizeof (vioif_ctrlbuf_t) *
470 vif->vif_ctrlbufs_capacity);
471 vif->vif_ctrlbufs_mem = NULL;
472 vif->vif_ctrlbufs_capacity = 0;
473 }
474 }
475 }
476
477 static int
478 vioif_alloc_bufs(vioif_t *vif)
479 {
480 VERIFY(MUTEX_HELD(&vif->vif_mutex));
481
482 /*
483 * Allocate one contiguous chunk of memory for the transmit and receive
484 * buffer tracking objects. If the ring is unusually small, we'll
485 * reduce our target buffer count accordingly.
486 */
487 vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
488 virtio_queue_size(vif->vif_tx_vq));
489 vif->vif_txbufs_mem = kmem_zalloc(
490 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
491 list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
492 offsetof(vioif_txbuf_t, tb_link));
493
494 vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
495 virtio_queue_size(vif->vif_rx_vq));
496 vif->vif_rxbufs_mem = kmem_zalloc(
497 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
498 list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
499 offsetof(vioif_rxbuf_t, rb_link));
500
501 if (vif->vif_has_ctrlq) {
502 vif->vif_ctrlbufs_capacity = MIN(VIRTIO_NET_CTRL_BUFS,
503 virtio_queue_size(vif->vif_ctrl_vq));
504 vif->vif_ctrlbufs_mem = kmem_zalloc(
505 sizeof (vioif_ctrlbuf_t) * vif->vif_ctrlbufs_capacity,
506 KM_SLEEP);
507 }
508 list_create(&vif->vif_ctrlbufs, sizeof (vioif_ctrlbuf_t),
509 offsetof(vioif_ctrlbuf_t, cb_link));
510
511 /*
512 * Do not loan more than half of our allocated receive buffers into
513 * the networking stack.
514 */
515 vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;
516
517 /*
518 * Put everything in the free list straight away in order to simplify
519 * the use of vioif_free_bufs() for cleanup on allocation failure.
520 */
521 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
522 list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
523 }
524 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
525 list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
526 }
527 for (uint_t i = 0; i < vif->vif_ctrlbufs_capacity; i++) {
528 list_insert_tail(&vif->vif_ctrlbufs, &vif->vif_ctrlbufs_mem[i]);
529 }
530
531 /*
532 * Start from the DMA attribute template common to both transmit and
533 * receive buffers. The SGL entry count will be modified for each
534 * buffer type.
535 */
536 ddi_dma_attr_t attr = vioif_dma_attr_bufs;
537
538 /*
539 * The transmit inline buffer is small (less than a page), so it's
540 * reasonable to request a single cookie.
541 */
542 attr.dma_attr_sgllen = 1;
543
544 for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
545 tb = list_next(&vif->vif_txbufs, tb)) {
546 if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
547 VIOIF_TX_INLINE_SIZE, &attr,
548 DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
549 goto fail;
550 }
551 VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);
552
553 if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
554 KM_SLEEP)) == NULL) {
555 goto fail;
556 }
557 virtio_chain_data_set(tb->tb_chain, tb);
558
559 tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
560 tb->tb_dmaext = kmem_zalloc(
561 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
562 KM_SLEEP);
563 }
564
565 /*
566 * Control queue buffers are also small (less than a page), so we'll
567 * also request a single cookie for them.
568 */
569 for (vioif_ctrlbuf_t *cb = list_head(&vif->vif_ctrlbufs); cb != NULL;
570 cb = list_next(&vif->vif_ctrlbufs, cb)) {
571 if ((cb->cb_dma = virtio_dma_alloc(vif->vif_virtio,
572 VIOIF_CTRL_SIZE, &attr,
573 DDI_DMA_STREAMING | DDI_DMA_RDWR, KM_SLEEP)) == NULL) {
574 goto fail;
575 }
576 VERIFY3U(virtio_dma_ncookies(cb->cb_dma), ==, 1);
577
578 if ((cb->cb_chain = virtio_chain_alloc(vif->vif_ctrl_vq,
579 KM_SLEEP)) == NULL) {
580 goto fail;
581 }
582 virtio_chain_data_set(cb->cb_chain, cb);
583 }
584
585 /*
586 * The receive buffers are larger, and we can tolerate a large number
587 * of segments. Adjust the SGL entry count, setting aside one segment
588 * for the virtio net header.
589 */
590 attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;
591
592 for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
593 rb = list_next(&vif->vif_rxbufs, rb)) {
594 if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
595 VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
596 KM_SLEEP)) == NULL) {
597 goto fail;
598 }
599
600 if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
601 KM_SLEEP)) == NULL) {
602 goto fail;
603 }
604 virtio_chain_data_set(rb->rb_chain, rb);
605
606 /*
607 * Ensure that the first cookie is sufficient to cover the
608 * header skip region plus one byte.
609 */
610 VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
611 VIOIF_HEADER_SKIP + 1);
612
613 /*
614 * Ensure that the frame data begins at a location with a
615 * correctly aligned IP header.
616 */
617 VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
618 VIOIF_HEADER_SKIP) % 4, ==, 2);
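/*
 * (An offset of 2 modulo 4 is expected above because the 14-byte
 * Ethernet header then ends on a 4-byte boundary, leaving the IP
 * header that follows it naturally aligned.)
 */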
619
620 rb->rb_vioif = vif;
621 rb->rb_frtn.free_func = vioif_rx_free_callback;
622 rb->rb_frtn.free_arg = (caddr_t)rb;
623 }
624
625 return (0);
626
627 fail:
628 vioif_free_bufs(vif);
629 return (ENOMEM);
630 }
631
632 static int
633 vioif_ctrlq_req(vioif_t *vif, uint8_t class, uint8_t cmd, void *data,
634 size_t datalen)
635 {
636 vioif_ctrlbuf_t *cb = NULL;
637 virtio_chain_t *vic = NULL;
638 uint8_t *p = NULL;
639 uint64_t pa = 0;
640 uint8_t *ackp = NULL;
641 struct virtio_net_ctrlq_hdr hdr = {
642 .vnch_class = class,
643 .vnch_command = cmd,
644 };
645 const size_t hdrlen = sizeof (hdr);
646 const size_t acklen = 1; /* the ack is always 1 byte */
647 size_t totlen = hdrlen + datalen + acklen;
648 int r = DDI_SUCCESS;
649
650 /*
651 * We shouldn't be called unless the ctrlq feature has been
652 * negotiated with the host
653 */
654 VERIFY(vif->vif_has_ctrlq);
655
656 mutex_enter(&vif->vif_mutex);
657 cb = vioif_ctrlbuf_alloc(vif);
658 if (cb == NULL) {
659 vif->vif_noctrlbuf++;
660 mutex_exit(&vif->vif_mutex);
661 r = DDI_FAILURE;
662 goto done;
663 }
664 mutex_exit(&vif->vif_mutex);
665
666 if (totlen > virtio_dma_size(cb->cb_dma)) {
667 vif->vif_ctrlbuf_toosmall++;
668 r = DDI_FAILURE;
669 goto done;
670 }
671
672 /*
673 * Clear the entire buffer. Technically not necessary, but useful
674 * if trying to troubleshoot an issue, and probably not a bad idea
675 * to not let any old data linger.
676 */
677 p = virtio_dma_va(cb->cb_dma, 0);
678 bzero(p, virtio_dma_size(cb->cb_dma));
679
680 /*
681 * We currently do not support VIRTIO_F_ANY_LAYOUT. That means
682 * that we must put the header, the data, and the ack in their
683 * own respective descriptors. Since all the currently supported
684 * control queue commands take _very_ small amounts of data, we
685 * use a single DMA buffer for all of it, but use 3 descriptors to
686 * reference (respectively) the header, the data, and the ack byte
687 * within that memory to adhere to the virtio spec.
688 *
689 * If we add support for control queue features such as custom
690 * MAC filtering tables, which might require larger amounts of
691 * memory, we likely will want to add more sophistication here
692 * and optionally use additional allocated memory to hold that
693 * data instead of a fixed size buffer.
694 *
695 * Copy the header.
696 */
697 bcopy(&hdr, p, sizeof (hdr));
698 pa = virtio_dma_cookie_pa(cb->cb_dma, 0);
699 if ((r = virtio_chain_append(cb->cb_chain,
700 pa, hdrlen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
701 goto done;
702 }
703
704 /*
705 * Copy the request data
706 */
707 p = virtio_dma_va(cb->cb_dma, hdrlen);
708 bcopy(data, p, datalen);
709 if ((r = virtio_chain_append(cb->cb_chain,
710 pa + hdrlen, datalen, VIRTIO_DIR_DEVICE_READS)) != DDI_SUCCESS) {
711 goto done;
712 }
713
714 /*
715 * We already cleared the buffer, so don't need to copy out a 0 for
716 * the ack byte. Just add a descriptor for that spot.
717 */
718 ackp = virtio_dma_va(cb->cb_dma, hdrlen + datalen);
719 if ((r = virtio_chain_append(cb->cb_chain,
720 pa + hdrlen + datalen, acklen,
721 VIRTIO_DIR_DEVICE_WRITES)) != DDI_SUCCESS) {
722 goto done;
723 }
724
725 virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORDEV);
726 virtio_chain_submit(cb->cb_chain, B_TRUE);
727
728 /*
729 * Spin waiting for response.
730 */
731 mutex_enter(&vif->vif_mutex);
732 while ((vic = virtio_queue_poll(vif->vif_ctrl_vq)) == NULL) {
733 mutex_exit(&vif->vif_mutex);
734 delay(drv_usectohz(1000));
735 mutex_enter(&vif->vif_mutex);
736 }
737
738 virtio_dma_sync(cb->cb_dma, DDI_DMA_SYNC_FORCPU);
739 VERIFY3P(virtio_chain_data(vic), ==, cb);
740 mutex_exit(&vif->vif_mutex);
741
742 if (*ackp != VIRTIO_NET_CQ_OK) {
743 r = DDI_FAILURE;
744 }
745
746 done:
747 mutex_enter(&vif->vif_mutex);
748 vioif_ctrlbuf_free(vif, cb);
749 mutex_exit(&vif->vif_mutex);
750
751 return (r);
752 }
753
754 static int
755 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
756 {
757 /*
758 * Even though we currently do not have support for programming
759 * multicast filters, we return success here to avoid the networking
760 * stack falling back to link layer broadcast for multicast traffic.
761 * Some hypervisors already pass received multicast frames onto the
762 * guest, so at least on those systems multicast will work as
763 * expected anyway.
764 */
765 return (0);
766 }
767
768 static int
769 vioif_m_setpromisc(void *arg, boolean_t on)
770 {
771 vioif_t *vif = arg;
772 uint8_t val = on ? 1 : 0;
773
774 if (!vif->vif_has_ctrlq_rx) {
775 if (vioif_fake_promisc_success)
776 return (0);
777
778 return (ENOTSUP);
779 }
780
781 return (vioif_ctrlq_req(vif, VIRTIO_NET_CTRL_RX,
782 VIRTIO_NET_CTRL_RX_PROMISC, &val, sizeof (val)));
783 }
784
785 static int
786 vioif_m_unicst(void *arg, const uint8_t *mac)
787 {
788 return (ENOTSUP);
789 }
790
791 static uint_t
792 vioif_add_rx(vioif_t *vif)
793 {
794 VERIFY(MUTEX_HELD(&vif->vif_mutex));
795
796 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
797 /*
798 * If the NIC is not running, do not give the device any
799 * receive buffers.
800 */
801 return (0);
802 }
803
804 uint_t num_added = 0;
805
806 vioif_rxbuf_t *rb;
807 while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
808 /*
809 * For legacy devices, and those that have not negotiated
810 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
811 * separate descriptor entry to the rest of the buffer.
812 */
813 if (virtio_chain_append(rb->rb_chain,
814 virtio_dma_cookie_pa(rb->rb_dma, 0),
815 sizeof (struct virtio_net_hdr),
816 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
817 goto fail;
818 }
819
820 for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
821 uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
822 size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);
823
824 if (n == 0) {
825 pa += VIOIF_HEADER_SKIP;
826 VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
827 sz -= VIOIF_HEADER_SKIP;
828 }
829
830 if (virtio_chain_append(rb->rb_chain, pa, sz,
831 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
832 goto fail;
833 }
834 }
835
836 virtio_chain_submit(rb->rb_chain, B_FALSE);
837 num_added++;
838 continue;
839
840 fail:
841 vioif_rxbuf_free(vif, rb);
842 vif->vif_norecvbuf++;
843 break;
844 }
845
846 if (num_added > 0) {
847 virtio_queue_flush(vif->vif_rx_vq);
848 }
849
850 return (num_added);
851 }
852
853 static uint_t
854 vioif_process_rx(vioif_t *vif)
855 {
856 virtio_chain_t *vic;
857 mblk_t *mphead = NULL, *lastmp = NULL, *mp;
858 uint_t num_processed = 0;
859
860 VERIFY(MUTEX_HELD(&vif->vif_mutex));
861
862 while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
863 /*
864 * We have to use the chain received length here, as the device
865 * does not tell us the received frame length any other way.
866 * In a limited survey of hypervisors, virtio network devices
867 * appear to provide the right value here.
868 */
869 size_t len = virtio_chain_received_length(vic);
870 vioif_rxbuf_t *rb = virtio_chain_data(vic);
871
872 virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);
873
874 /*
875 * If the NIC is not running, discard any received frames.
876 */
877 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
878 vioif_rxbuf_free(vif, rb);
879 continue;
880 }
881
882 if (len < sizeof (struct virtio_net_hdr)) {
883 vif->vif_rxfail_chain_undersize++;
884 vif->vif_ierrors++;
885 vioif_rxbuf_free(vif, rb);
886 continue;
887 }
888 len -= sizeof (struct virtio_net_hdr);
889
890 /*
891 * We copy small packets that happen to fit into a single
892 * cookie and reuse the buffers. For bigger ones, we loan
893 * the buffers upstream.
894 */
895 if (len < vif->vif_rxcopy_thresh ||
896 vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
897 mutex_exit(&vif->vif_mutex);
898 if ((mp = allocb(len, 0)) == NULL) {
899 mutex_enter(&vif->vif_mutex);
900 vif->vif_norecvbuf++;
901 vif->vif_ierrors++;
902
903 vioif_rxbuf_free(vif, rb);
904 continue;
905 }
906
907 bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
908 mp->b_rptr, len);
909 mp->b_wptr = mp->b_rptr + len;
910
911 /*
912 * As the packet contents were copied rather than
913 * loaned, we can return the receive buffer resources
914 * to the free list.
915 */
916 mutex_enter(&vif->vif_mutex);
917 vioif_rxbuf_free(vif, rb);
918
919 } else {
920 mutex_exit(&vif->vif_mutex);
921 if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
922 VIOIF_HEADER_SKIP), len, 0,
923 &rb->rb_frtn)) == NULL) {
924 mutex_enter(&vif->vif_mutex);
925 vif->vif_norecvbuf++;
926 vif->vif_ierrors++;
927
928 vioif_rxbuf_free(vif, rb);
929 continue;
930 }
931 mp->b_wptr = mp->b_rptr + len;
932
933 mutex_enter(&vif->vif_mutex);
934 vif->vif_nrxbufs_onloan++;
935 }
936
937 /*
938 * virtio-net does not tell us if this packet is multicast
939 * or broadcast, so we have to check it.
940 */
941 if (mp->b_rptr[0] & 0x1) {
942 if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
943 vif->vif_multircv++;
944 else
945 vif->vif_brdcstrcv++;
946 }
947
948 vif->vif_rbytes += len;
949 vif->vif_ipackets++;
950
951 if (lastmp == NULL) {
952 mphead = mp;
953 } else {
954 lastmp->b_next = mp;
955 }
956 lastmp = mp;
957 num_processed++;
958 }
959
960 if (mphead != NULL) {
961 if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
962 mutex_exit(&vif->vif_mutex);
963 mac_rx(vif->vif_mac_handle, NULL, mphead);
964 mutex_enter(&vif->vif_mutex);
965 } else {
966 /*
967 * The NIC was disabled part way through our execution,
968 * so free the messages we allocated.
969 */
970 freemsgchain(mphead);
971 }
972 }
973
974 return (num_processed);
975 }
976
977 static uint_t
978 vioif_reclaim_used_tx(vioif_t *vif)
979 {
980 virtio_chain_t *vic;
981 uint_t num_reclaimed = 0;
982
983 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
984
985 while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
986 vioif_txbuf_t *tb = virtio_chain_data(vic);
987
988 if (tb->tb_mp != NULL) {
989 /*
990 * Unbind the external mapping.
991 */
992 for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
993 if (tb->tb_dmaext[i] == NULL) {
994 continue;
995 }
996
997 virtio_dma_unbind(tb->tb_dmaext[i]);
998 }
999
1000 freemsg(tb->tb_mp);
1001 tb->tb_mp = NULL;
1002 }
1003
1004 /*
1005 * Return this transmit buffer to the free list for reuse.
1006 */
1007 mutex_enter(&vif->vif_mutex);
1008 vioif_txbuf_free(vif, tb);
1009 mutex_exit(&vif->vif_mutex);
1010
1011 num_reclaimed++;
1012 }
1013
1014 /* Return ring to transmitting state if descriptors were reclaimed. */
1015 if (num_reclaimed > 0) {
1016 boolean_t do_update = B_FALSE;
1017
1018 mutex_enter(&vif->vif_mutex);
1019 vif->vif_stat_tx_reclaim += num_reclaimed;
1020 if (vif->vif_tx_corked) {
1021 /*
1022 * TX was corked on a lack of available descriptors.
1023 * That dire state has passed so the TX interrupt can
1024 * be disabled and MAC can be notified that
1025 * transmission is possible again.
1026 */
1027 vif->vif_tx_corked = B_FALSE;
1028 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1029 do_update = B_TRUE;
1030 }
1031
1032 mutex_exit(&vif->vif_mutex);
1033 if (do_update) {
1034 mac_tx_update(vif->vif_mac_handle);
1035 }
1036 }
1037
1038 return (num_reclaimed);
1039 }
1040
1041 static void
1042 vioif_reclaim_periodic(void *arg)
1043 {
1044 vioif_t *vif = arg;
1045 uint_t num_reclaimed;
1046
1047 num_reclaimed = vioif_reclaim_used_tx(vif);
1048
1049 mutex_enter(&vif->vif_mutex);
1050 vif->vif_tx_reclaim_tid = 0;
1051 /*
1052 * If used descriptors were reclaimed or TX descriptors appear to be
1053 * outstanding, the ring is considered active and periodic reclamation
1054 * is necessary for now.
1055 */
1056 if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1057 /* Do not reschedule if the ring is being drained. */
1058 if (!vif->vif_tx_drain) {
1059 vioif_reclaim_restart(vif);
1060 }
1061 }
1062 mutex_exit(&vif->vif_mutex);
1063 }
1064
1065 static void
1066 vioif_reclaim_restart(vioif_t *vif)
1067 {
1068 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1069 VERIFY(!vif->vif_tx_drain);
1070
1071 if (vif->vif_tx_reclaim_tid == 0) {
1072 vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
1073 MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
1074 }
1075 }
1076
1077 static void
1078 vioif_tx_drain(vioif_t *vif)
1079 {
1080 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1081 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);
1082
1083 vif->vif_tx_drain = B_TRUE;
1084 /* Put a stop to the periodic reclaim if it is running */
1085 if (vif->vif_tx_reclaim_tid != 0) {
1086 timeout_id_t tid = vif->vif_tx_reclaim_tid;
1087
1088 /*
1089 * With vif_tx_drain set, there is no risk that a racing
1090 * vioif_reclaim_periodic() call will reschedule itself.
1091 *
1092 * Being part of the mc_stop hook also guarantees that
1093 * vioif_m_tx() will not be called to restart it.
1094 */
1095 vif->vif_tx_reclaim_tid = 0;
1096 mutex_exit(&vif->vif_mutex);
1097 (void) untimeout(tid);
1098 mutex_enter(&vif->vif_mutex);
1099 }
1100 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1101
1102 /*
1103 * Wait for all of the TX descriptors to be processed by the host so
1104 * they can be reclaimed.
1105 */
1106 while (vif->vif_ntxbufs_alloc > 0) {
1107 mutex_exit(&vif->vif_mutex);
1108 (void) vioif_reclaim_used_tx(vif);
1109 delay(5);
1110 mutex_enter(&vif->vif_mutex);
1111 }
1112 VERIFY(!vif->vif_tx_corked);
1113 VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
1114 VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
1115 }
1116
1117 static int
1118 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1119 {
1120 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1121
1122 VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);
1123
1124 /*
1125 * Copy the message into the inline buffer and then free the message.
1126 */
1127 mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));
1128
1129 if (virtio_chain_append(tb->tb_chain,
1130 virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
1131 msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1132 return (DDI_FAILURE);
1133 }
1134
1135 return (DDI_SUCCESS);
1136 }
1137
1138 static int
1139 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
1140 {
1141 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1142
1143 mblk_t *nmp = mp;
1144 tb->tb_ndmaext = 0;
1145
1146 while (nmp != NULL) {
1147 size_t len;
1148
1149 if ((len = MBLKL(nmp)) == 0) {
1150 /*
1151 * Skip any zero-length entries in the chain.
1152 */
1153 nmp = nmp->b_cont;
1154 continue;
1155 }
1156
1157 if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
1158 mutex_enter(&vif->vif_mutex);
1159 vif->vif_txfail_indirect_limit++;
1160 vif->vif_notxbuf++;
1161 mutex_exit(&vif->vif_mutex);
1162 goto fail;
1163 }
1164
1165 if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
1166 /*
1167 * Allocate a DMA handle for this slot.
1168 */
1169 if ((tb->tb_dmaext[tb->tb_ndmaext] =
1170 virtio_dma_alloc_nomem(vif->vif_virtio,
1171 &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
1172 mutex_enter(&vif->vif_mutex);
1173 vif->vif_notxbuf++;
1174 mutex_exit(&vif->vif_mutex);
1175 goto fail;
1176 }
1177 }
1178 virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];
1179
1180 if (virtio_dma_bind(extdma, nmp->b_rptr, len,
1181 DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
1182 DDI_SUCCESS) {
1183 mutex_enter(&vif->vif_mutex);
1184 vif->vif_txfail_dma_bind++;
1185 mutex_exit(&vif->vif_mutex);
1186 goto fail;
1187 }
1188
1189 for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
1190 uint64_t pa = virtio_dma_cookie_pa(extdma, n);
1191 size_t sz = virtio_dma_cookie_size(extdma, n);
1192
1193 if (virtio_chain_append(tb->tb_chain, pa, sz,
1194 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1195 mutex_enter(&vif->vif_mutex);
1196 vif->vif_txfail_indirect_limit++;
1197 vif->vif_notxbuf++;
1198 mutex_exit(&vif->vif_mutex);
1199 goto fail;
1200 }
1201 }
1202
1203 nmp = nmp->b_cont;
1204 }
1205
1206 /*
1207 * We need to keep the message around until we reclaim the buffer from
1208 * the device before freeing it.
1209 */
1210 tb->tb_mp = mp;
1211
1212 return (DDI_SUCCESS);
1213
1214 fail:
1215 for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
1216 if (tb->tb_dmaext[n] != NULL) {
1217 virtio_dma_unbind(tb->tb_dmaext[n]);
1218 }
1219 }
1220 tb->tb_ndmaext = 0;
1221
1222 freemsg(mp);
1223
1224 return (DDI_FAILURE);
1225 }
1226
1227 static boolean_t
1228 vioif_send(vioif_t *vif, mblk_t *mp)
1229 {
1230 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));
1231
1232 vioif_txbuf_t *tb = NULL;
1233 struct virtio_net_hdr *vnh = NULL;
1234 size_t msg_size = 0;
1235 uint32_t csum_start;
1236 uint32_t csum_stuff;
1237 uint32_t csum_flags;
1238 uint32_t lso_flags;
1239 uint32_t lso_mss;
1240 mblk_t *nmp;
1241 int ret;
1242 boolean_t lso_required = B_FALSE;
1243 struct ether_header *ether = (void *)mp->b_rptr;
1244
1245 for (nmp = mp; nmp; nmp = nmp->b_cont)
1246 msg_size += MBLKL(nmp);
1247
1248 if (vif->vif_tx_tso4 || vif->vif_tx_tso6) {
1249 mac_lso_get(mp, &lso_mss, &lso_flags);
1250 lso_required = (lso_flags & HW_LSO) != 0;
1251 }
1252
1253 mutex_enter(&vif->vif_mutex);
1254 if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
1255 vif->vif_notxbuf++;
1256 goto fail;
1257 }
1258 mutex_exit(&vif->vif_mutex);
1259
1260 /*
1261 * Use the inline buffer for the virtio net header. Zero the portion
1262 * of our DMA allocation prior to the packet data.
1263 */
1264 vnh = virtio_dma_va(tb->tb_dma, 0);
1265 bzero(vnh, VIOIF_HEADER_SKIP);
1266
1267 /*
1268 * For legacy devices, and those that have not negotiated
1269 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
1270 * descriptor entry to the rest of the buffer.
1271 */
1272 if (virtio_chain_append(tb->tb_chain,
1273 virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
1274 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
1275 mutex_enter(&vif->vif_mutex);
1276 vif->vif_notxbuf++;
1277 goto fail;
1278 }
1279
1280 mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);
1281
1282 /*
1283 * They want us to do the TCP/UDP csum calculation.
1284 */
1285 if (csum_flags & HCK_PARTIALCKSUM) {
1286 int eth_hsize;
1287
1288 /*
1289 * Did we ask for it?
1290 */
1291 ASSERT(vif->vif_tx_csum);
1292
1293 /*
1294 * We only asked for partial csum packets.
1295 */
1296 ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
1297 ASSERT(!(csum_flags & HCK_FULLCKSUM));
1298
1299 if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
1300 eth_hsize = sizeof (struct ether_vlan_header);
1301 } else {
1302 eth_hsize = sizeof (struct ether_header);
1303 }
1304
1305 vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1306 vnh->vnh_csum_start = eth_hsize + csum_start;
1307 vnh->vnh_csum_offset = csum_stuff - csum_start;
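/*
 * As a worked example: for a TCP frame with a 20-byte IPv4 header,
 * this arithmetic yields vnh_csum_start = 34 (14 + 20) and
 * vnh_csum_offset = 16, the position of the checksum field within
 * the TCP header.
 */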
1308 }
1309
1310 /*
1311 * Setup LSO fields if required.
1312 */
1313 if (lso_required) {
1314 mac_ether_offload_flags_t needed;
1315 mac_ether_offload_info_t meo;
1316 uint32_t cksum;
1317 size_t len;
1318 mblk_t *pullmp = NULL;
1319 tcpha_t *tcpha;
1320
1321 mac_ether_offload_info(mp, &meo);
1322 needed = MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET;
1323 if ((meo.meoi_flags & needed) != needed) {
1324 goto fail;
1325 }
1326
1327 if (meo.meoi_l4proto != IPPROTO_TCP) {
1328 goto fail;
1329 }
1330
1331 if (meo.meoi_l3proto == ETHERTYPE_IP && vif->vif_tx_tso4) {
1332 vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1333 } else if (meo.meoi_l3proto == ETHERTYPE_IPV6 &&
1334 vif->vif_tx_tso6) {
1335 vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1336 } else {
1337 goto fail;
1338 }
1339
1340 /*
1341 * The TCP stack does not include the length in the TCP
1342 * pseudo-header when it is performing LSO since hardware
1343 * generally asks for it to be removed (as it'll change).
1344 * Unfortunately, for virtio, we actually need it. This means we
1345 * need to go through and calculate the actual length and fix
1346 * things up. Since the virtio spec also cares about having the ECN
1347 * flag indicated, walking the headers here at least means we have
1348 * that information available as well.
1349 */
1350 if (MBLKL(mp) < vnh->vnh_hdr_len) {
1351 pullmp = msgpullup(mp, vnh->vnh_hdr_len);
1352 if (pullmp == NULL)
1353 goto fail;
1354 tcpha = (tcpha_t *)(pullmp->b_rptr + meo.meoi_l2hlen +
1355 meo.meoi_l3hlen);
1356 } else {
1357 tcpha = (tcpha_t *)(mp->b_rptr + meo.meoi_l2hlen +
1358 meo.meoi_l3hlen);
1359 }
1360
1361 len = meo.meoi_len - meo.meoi_l2hlen - meo.meoi_l3hlen;
1362 cksum = ntohs(tcpha->tha_sum) + len;
1363 cksum = (cksum >> 16) + (cksum & 0xffff);
1364 cksum = (cksum >> 16) + (cksum & 0xffff);
1365 tcpha->tha_sum = htons(cksum);
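/*
 * The folds above carry any overflow back into the low 16 bits;
 * i.e., this is a ones'-complement update of the pseudo-header
 * checksum to include the TCP length.
 */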
1366
1367 if (tcpha->tha_flags & TH_CWR) {
1368 vnh->vnh_gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1369 }
1370 vnh->vnh_gso_size = (uint16_t)lso_mss;
1371 vnh->vnh_hdr_len = meo.meoi_l2hlen + meo.meoi_l3hlen +
1372 meo.meoi_l4hlen;
1373
1374 freemsg(pullmp);
1375 }
1376
1377 /*
1378 * The device does not maintain its own statistics about broadcast or
1379 * multicast packets, so we have to check the destination address
1380 * ourselves.
1381 */
1382 if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
1383 mutex_enter(&vif->vif_mutex);
1384 if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
1385 vif->vif_brdcstxmt++;
1386 } else {
1387 vif->vif_multixmt++;
1388 }
1389 mutex_exit(&vif->vif_mutex);
1390 }
1391
1392 /*
1393 * For small packets, copy into the preallocated inline buffer rather
1394 * than incur the overhead of mapping. Note that both of these
1395 * functions ensure that "mp" is freed before returning.
1396 */
1397 if (msg_size < vif->vif_txcopy_thresh) {
1398 ret = vioif_tx_inline(vif, tb, mp, msg_size);
1399 } else {
1400 ret = vioif_tx_external(vif, tb, mp, msg_size);
1401 }
1402 mp = NULL;
1403
1404 mutex_enter(&vif->vif_mutex);
1405
1406 if (ret != DDI_SUCCESS) {
1407 goto fail;
1408 }
1409
1410 vif->vif_opackets++;
1411 vif->vif_obytes += msg_size;
1412 mutex_exit(&vif->vif_mutex);
1413
1414 virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
1415 virtio_chain_submit(tb->tb_chain, B_TRUE);
1416
1417 return (B_TRUE);
1418
1419 fail:
1420 vif->vif_oerrors++;
1421 if (tb != NULL) {
1422 vioif_txbuf_free(vif, tb);
1423 }
1424 mutex_exit(&vif->vif_mutex);
1425
1426 return (mp == NULL);
1427 }
1428
1429 static mblk_t *
1430 vioif_m_tx(void *arg, mblk_t *mp)
1431 {
1432 vioif_t *vif = arg;
1433 mblk_t *nmp;
1434
1435 /*
1436 * Prior to attempting to send any more frames, do a reclaim to pick up
1437 * any descriptors which have been processed by the host.
1438 */
1439 if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
1440 (void) vioif_reclaim_used_tx(vif);
1441 }
1442
1443 while (mp != NULL) {
1444 nmp = mp->b_next;
1445 mp->b_next = NULL;
1446
1447 if (!vioif_send(vif, mp)) {
1448 /*
1449 * If there are no descriptors available, try to
1450 * reclaim some, allowing a retry of the send if some
1451 * are found.
1452 */
1453 mp->b_next = nmp;
1454 if (vioif_reclaim_used_tx(vif) != 0) {
1455 continue;
1456 }
1457
1458 /*
1459 * Otherwise, enable the TX ring interrupt so that as
1460 * soon as a descriptor becomes available, transmission
1461 * can begin again. For safety, make sure the periodic
1462 * reclaim is running as well.
1463 */
1464 mutex_enter(&vif->vif_mutex);
1465 vif->vif_tx_corked = B_TRUE;
1466 virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
1467 vioif_reclaim_restart(vif);
1468 mutex_exit(&vif->vif_mutex);
1469 return (mp);
1470 }
1471 mp = nmp;
1472 }
1473
1474 /* Ensure the periodic reclaim has been started. */
1475 mutex_enter(&vif->vif_mutex);
1476 vioif_reclaim_restart(vif);
1477 mutex_exit(&vif->vif_mutex);
1478
1479 return (NULL);
1480 }
1481
1482 static int
1483 vioif_m_start(void *arg)
1484 {
1485 vioif_t *vif = arg;
1486
1487 mutex_enter(&vif->vif_mutex);
1488
1489 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
1490 vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;
1491
1492 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
1493
1494 virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);
1495
1496 /*
1497 * Starting interrupts on the TX virtqueue is unnecessary at this time.
1498 * Descriptor reclamation is handled during transmit, via a periodic
1499 * timer, and when resources are tight, via the then-enabled interrupt.
1500 */
1501 vif->vif_tx_drain = B_FALSE;
1502
1503 /*
1504 * Add as many receive buffers as we can to the receive queue. If we
1505 * cannot add any, it may be because we have stopped and started again
1506 * and the descriptors are all in the queue already.
1507 */
1508 (void) vioif_add_rx(vif);
1509
1510 mutex_exit(&vif->vif_mutex);
1511 return (DDI_SUCCESS);
1512 }
1513
1514 static void
1515 vioif_m_stop(void *arg)
1516 {
1517 vioif_t *vif = arg;
1518
1519 mutex_enter(&vif->vif_mutex);
1520
1521 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
1522 vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;
1523
1524 /* Ensure all TX descriptors have been processed and reclaimed */
1525 vioif_tx_drain(vif);
1526
1527 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1528
1529 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1530 mutex_exit(&vif->vif_mutex);
1531 }
1532
1533 static int
1534 vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
1535 {
1536 vioif_t *vif = arg;
1537
1538 switch (stat) {
1539 case MAC_STAT_IERRORS:
1540 *val = vif->vif_ierrors;
1541 break;
1542 case MAC_STAT_OERRORS:
1543 *val = vif->vif_oerrors;
1544 break;
1545 case MAC_STAT_MULTIRCV:
1546 *val = vif->vif_multircv;
1547 break;
1548 case MAC_STAT_BRDCSTRCV:
1549 *val = vif->vif_brdcstrcv;
1550 break;
1551 case MAC_STAT_MULTIXMT:
1552 *val = vif->vif_multixmt;
1553 break;
1554 case MAC_STAT_BRDCSTXMT:
1555 *val = vif->vif_brdcstxmt;
1556 break;
1557 case MAC_STAT_IPACKETS:
1558 *val = vif->vif_ipackets;
1559 break;
1560 case MAC_STAT_RBYTES:
1561 *val = vif->vif_rbytes;
1562 break;
1563 case MAC_STAT_OPACKETS:
1564 *val = vif->vif_opackets;
1565 break;
1566 case MAC_STAT_OBYTES:
1567 *val = vif->vif_obytes;
1568 break;
1569 case MAC_STAT_NORCVBUF:
1570 *val = vif->vif_norecvbuf;
1571 break;
1572 case MAC_STAT_NOXMTBUF:
1573 *val = vif->vif_notxbuf;
1574 break;
1575 case MAC_STAT_IFSPEED:
1576 /* always 1 Gbit */
1577 *val = 1000000000ULL;
1578 break;
1579 case ETHER_STAT_LINK_DUPLEX:
1580 /* virtual device, always full-duplex */
1581 *val = LINK_DUPLEX_FULL;
1582 break;
1583
1584 default:
1585 return (ENOTSUP);
1586 }
1587
1588 return (DDI_SUCCESS);
1589 }
1590
1591 static int
1592 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1593 uint_t pr_valsize, const void *pr_val)
1594 {
1595 vioif_t *vif = arg;
1596
1597 switch (pr_num) {
1598 case MAC_PROP_MTU: {
1599 int r;
1600 uint32_t mtu;
1601 if (pr_valsize < sizeof (mtu)) {
1602 return (EOVERFLOW);
1603 }
1604 bcopy(pr_val, &mtu, sizeof (mtu));
1605
1606 if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
1607 return (EINVAL);
1608 }
1609
1610 mutex_enter(&vif->vif_mutex);
1611 if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
1612 vif->vif_mtu = mtu;
1613 }
1614 mutex_exit(&vif->vif_mutex);
1615
1616 return (r);
1617 }
1618
1619 case MAC_PROP_PRIVATE: {
1620 long max, result;
1621 uint_t *resp;
1622 char *endptr;
1623
1624 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1625 max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
1626 resp = &vif->vif_txcopy_thresh;
1627 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1628 max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
1629 resp = &vif->vif_rxcopy_thresh;
1630 } else {
1631 return (ENOTSUP);
1632 }
1633
1634 if (pr_val == NULL) {
1635 return (EINVAL);
1636 }
1637
1638 if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
1639 *endptr != '\0' || result < 0 || result > max) {
1640 return (EINVAL);
1641 }
1642
1643 mutex_enter(&vif->vif_mutex);
1644 *resp = result;
1645 mutex_exit(&vif->vif_mutex);
1646
1647 return (0);
1648 }
1649
1650 default:
1651 return (ENOTSUP);
1652 }
1653 }
1654
1655 static int
1656 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1657 uint_t pr_valsize, void *pr_val)
1658 {
1659 vioif_t *vif = arg;
1660
1661 switch (pr_num) {
1662 case MAC_PROP_PRIVATE: {
1663 uint_t value;
1664
1665 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1666 value = vif->vif_txcopy_thresh;
1667 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1668 value = vif->vif_rxcopy_thresh;
1669 } else {
1670 return (ENOTSUP);
1671 }
1672
1673 if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
1674 return (EOVERFLOW);
1675 }
1676
1677 return (0);
1678 }
1679
1680 default:
1681 return (ENOTSUP);
1682 }
1683 }
1684
1685 static void
1686 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1687 mac_prop_info_handle_t prh)
1688 {
1689 vioif_t *vif = arg;
1690 char valstr[64];
1691 int value;
1692
1693 switch (pr_num) {
1694 case MAC_PROP_MTU:
1695 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1696 mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
1697 return;
1698
1699 case MAC_PROP_PRIVATE:
1700 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
1701 value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1702 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
1703 value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1704 } else {
1705 /*
1706 * We do not recognise this private property name.
1707 */
1708 return;
1709 }
1710 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
1711 (void) snprintf(valstr, sizeof (valstr), "%d", value);
1712 mac_prop_info_set_default_str(prh, valstr);
1713 return;
1714
1715 default:
1716 return;
1717 }
1718 }
1719
1720 static boolean_t
1721 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1722 {
1723 vioif_t *vif = arg;
1724
1725 switch (cap) {
1726 case MAC_CAPAB_HCKSUM: {
1727 if (!vif->vif_tx_csum) {
1728 return (B_FALSE);
1729 }
1730
1731 *(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;
1732
1733 return (B_TRUE);
1734 }
1735
1736 case MAC_CAPAB_LSO: {
1737 if (!vif->vif_tx_tso4) {
1738 return (B_FALSE);
1739 }
1740
1741 mac_capab_lso_t *lso = cap_data;
1742 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | LSO_TX_BASIC_TCP_IPV6;
1743 lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
1744 lso->lso_basic_tcp_ipv6.lso_max = VIOIF_RX_DATA_SIZE;
1745
1746 return (B_TRUE);
1747 }
1748
1749 default:
1750 return (B_FALSE);
1751 }
1752 }
1753
1754 static boolean_t
1755 vioif_has_feature(vioif_t *vif, uint32_t feature)
1756 {
1757 return (virtio_feature_present(vif->vif_virtio, feature));
1758 }
1759
1760 /*
1761 * Read the primary MAC address from the device if one is provided. If not,
1762 * generate a random locally administered MAC address and write it back to the
1763 * device.
1764 */
1765 static void
1766 vioif_get_mac(vioif_t *vif)
1767 {
1768 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1769
1770 if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
1771 for (uint_t i = 0; i < ETHERADDRL; i++) {
1772 vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
1773 VIRTIO_NET_CONFIG_MAC + i);
1774 }
1775 vif->vif_mac_from_host = 1;
1776
1777 return;
1778 }
1779
1780 /* Get a few random bytes */
1781 (void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
1782 /* Make sure it's a unicast MAC */
1783 vif->vif_mac[0] &= ~1;
1784 /* Set the "locally administered" bit */
1785 vif->vif_mac[0] |= 2;
1786
1787 /*
1788 * Write the random MAC address back to the device.
1789 */
1790 for (uint_t i = 0; i < ETHERADDRL; i++) {
1791 virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
1792 vif->vif_mac[i]);
1793 }
1794 vif->vif_mac_from_host = 0;
1795
1796 dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
1797 "%02x:%02x:%02x:%02x:%02x:%02x",
1798 (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
1799 (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
1800 (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
1801 }
1802
1803 /*
1804 * Virtqueue interrupt handlers
1805 */
1806 static uint_t
1807 vioif_rx_handler(caddr_t arg0, caddr_t arg1)
1808 {
1809 vioif_t *vif = (vioif_t *)arg0;
1810
1811 mutex_enter(&vif->vif_mutex);
1812 (void) vioif_process_rx(vif);
1813
1814 /*
1815 * Attempt to replenish the receive queue. If we cannot add any
1816 * descriptors here, it may be because all of the recently received
1817 * packets were loaned up to the networking stack.
1818 */
1819 (void) vioif_add_rx(vif);
1820 mutex_exit(&vif->vif_mutex);
1821
1822 return (DDI_INTR_CLAIMED);
1823 }
1824
1825 static uint_t
1826 vioif_tx_handler(caddr_t arg0, caddr_t arg1)
1827 {
1828 vioif_t *vif = (vioif_t *)arg0;
1829
1830 /*
1831 * The TX interrupt could race with other reclamation activity, so
1832 * interpreting the return value is unimportant.
1833 */
1834 (void) vioif_reclaim_used_tx(vif);
1835
1836 return (DDI_INTR_CLAIMED);
1837 }
1838
1839 static void
1840 vioif_check_features(vioif_t *vif)
1841 {
1842 VERIFY(MUTEX_HELD(&vif->vif_mutex));
1843
1844 vif->vif_tx_csum = 0;
1845 vif->vif_tx_tso4 = 0;
1846 vif->vif_tx_tso6 = 0;
1847
1848 if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
1849 /*
1850 * The host will accept packets with partial checksums from us.
1851 */
1852 vif->vif_tx_csum = 1;
1853
1854 /*
1855 * The legacy GSO feature represents the combination of
1856 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
1857 */
1858 boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
1859 boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
1860 boolean_t tso6 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO6);
1861 boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);
1862
1863 /*
1864 * Explicit congestion notification (ECN) is configured
1865 * globally; see "tcp_ecn_permitted". As we cannot currently
1866 * request that the stack disable ECN on a per interface basis,
1867 * we require the device to support the combination of
1868 * segmentation offload and ECN support.
1869 */
1870 if (gso) {
1871 vif->vif_tx_tso4 = 1;
1872 vif->vif_tx_tso6 = 1;
1873 }
1874 if (tso4 && ecn) {
1875 vif->vif_tx_tso4 = 1;
1876 }
1877 if (tso6 && ecn) {
1878 vif->vif_tx_tso6 = 1;
1879 }
1880 }
1881
1882 if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ)) {
1883 vif->vif_has_ctrlq = 1;
1884
1885 /*
1886 * The VIRTIO_NET_F_CTRL_VQ feature must be enabled if there's
1887 * any chance of the VIRTIO_NET_F_CTRL_RX being enabled.
1888 */
1889 if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_RX))
1890 vif->vif_has_ctrlq_rx = 1;
1891 }
1892 }
1893
1894 static int
1895 vioif_select_interrupt_types(void)
1896 {
1897 id_t id;
1898 smbios_system_t sys;
1899 smbios_info_t info;
1900
1901 if (vioif_allowed_int_types != -1) {
1902 /*
1903 * If this value was tuned via /etc/system or the debugger,
1904 * use the provided value directly.
1905 */
1906 return (vioif_allowed_int_types);
1907 }
1908
1909 if (ksmbios == NULL ||
1910 (id = smbios_info_system(ksmbios, &sys)) == SMB_ERR ||
1911 smbios_info_common(ksmbios, id, &info) == SMB_ERR) {
1912 /*
1913 * The system may not have valid SMBIOS data, so ignore a
1914 * failure here.
1915 */
1916 return (VIRTIO_ANY_INTR_TYPE);
1917 }
1918
1919 if (strcmp(info.smbi_manufacturer, "Google") == 0 &&
1920 strcmp(info.smbi_product, "Google Compute Engine") == 0) {
1921 /*
1922 * An undiagnosed issue with the Google Compute Engine (GCE)
1923 * hypervisor exists. In this environment, no RX interrupts
1924 * are received if MSI-X handlers are installed. This does not
1925 * appear to be true for the Virtio SCSI driver. Fixed
1926 * interrupts do appear to work, so we fall back for now:
1927 */
1928 return (DDI_INTR_TYPE_FIXED);
1929 }
1930
1931 return (VIRTIO_ANY_INTR_TYPE);
1932 }
1933
1934 static int
1935 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1936 {
1937 int ret;
1938 vioif_t *vif;
1939 virtio_t *vio;
1940 mac_register_t *macp = NULL;
1941
1942 if (cmd != DDI_ATTACH) {
1943 return (DDI_FAILURE);
1944 }
1945
1946 if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
1947 NULL) {
1948 return (DDI_FAILURE);
1949 }
1950
1951 vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
1952 vif->vif_dip = dip;
1953 vif->vif_virtio = vio;
1954 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
1955 ddi_set_driver_private(dip, vif);
1956
1957 if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
1958 "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
1959 (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
1960 "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1961 goto fail;
1962 }
1963
1964 if (vioif_has_feature(vif, VIRTIO_NET_F_CTRL_VQ) &&
1965 (vif->vif_ctrl_vq = virtio_queue_alloc(vio,
1966 VIRTIO_NET_VIRTQ_CONTROL, "ctrlq", NULL, vif,
1967 B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
1968 goto fail;
1969 }
1970
1971 if (virtio_init_complete(vio, vioif_select_interrupt_types()) !=
1972 DDI_SUCCESS) {
1973 dev_err(dip, CE_WARN, "failed to complete Virtio init");
1974 goto fail;
1975 }
1976
1977 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
1978 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
1979 if (vif->vif_ctrl_vq != NULL)
1980 virtio_queue_no_interrupt(vif->vif_ctrl_vq, B_TRUE);
1981
1982 mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
1983 mutex_enter(&vif->vif_mutex);
1984
1985 vioif_get_mac(vif);
1986
1987 vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
1988 vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
1989
1990 if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
1991 vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
1992 } else {
1993 vif->vif_mtu_max = ETHERMTU;
1994 }
1995
1996 vif->vif_mtu = ETHERMTU;
1997 if (vif->vif_mtu > vif->vif_mtu_max) {
1998 vif->vif_mtu = vif->vif_mtu_max;
1999 }
2000
2001 vioif_check_features(vif);
2002
2003 if (vioif_alloc_bufs(vif) != 0) {
2004 mutex_exit(&vif->vif_mutex);
2005 dev_err(dip, CE_WARN, "failed to allocate memory");
2006 goto fail;
2007 }
2008
2009 mutex_exit(&vif->vif_mutex);
2010
2011 if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
2012 dev_err(dip, CE_WARN, "failed to enable interrupts");
2013 goto fail;
2014 }
2015
2016 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2017 dev_err(dip, CE_WARN, "failed to allocate a mac_register");
2018 goto fail;
2019 }
2020
2021 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2022 macp->m_driver = vif;
2023 macp->m_dip = dip;
2024 macp->m_src_addr = vif->vif_mac;
2025 macp->m_callbacks = &vioif_mac_callbacks;
2026 macp->m_min_sdu = 0;
2027 macp->m_max_sdu = vif->vif_mtu;
2028 macp->m_margin = VLAN_TAGSZ;
2029 macp->m_priv_props = vioif_priv_props;
2030
2031 if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
2032 dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
2033 goto fail;
2034 }
2035 mac_free(macp);
2036
2037 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);
2038
2039 return (DDI_SUCCESS);
2040
2041 fail:
2042 vioif_free_bufs(vif);
2043 if (macp != NULL) {
2044 mac_free(macp);
2045 }
2046 (void) virtio_fini(vio, B_TRUE);
2047 kmem_free(vif, sizeof (*vif));
2048 return (DDI_FAILURE);
2049 }
2050
2051 static int
2052 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2053 {
2054 int r;
2055 vioif_t *vif;
2056
2057 if (cmd != DDI_DETACH) {
2058 return (DDI_FAILURE);
2059 }
2060
2061 if ((vif = ddi_get_driver_private(dip)) == NULL) {
2062 return (DDI_FAILURE);
2063 }
2064
2065 mutex_enter(&vif->vif_mutex);
2066 if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
2067 dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
2068 mutex_exit(&vif->vif_mutex);
2069 return (DDI_FAILURE);
2070 }
2071
2072 /*
2073 * There should be no outstanding transmit buffers once the NIC is
2074 * completely stopped.
2075 */
2076 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
2077
2078 /*
2079 * Though we cannot claw back all of the receive buffers until we reset
2080 * the device, we must ensure all those loaned to MAC have been
2081 * returned before calling mac_unregister().
2082 */
2083 if (vif->vif_nrxbufs_onloan > 0) {
2084 dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
2085 "cannot detach", vif->vif_nrxbufs_onloan);
2086 mutex_exit(&vif->vif_mutex);
2087 return (DDI_FAILURE);
2088 }
2089
2090 if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
2091 dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
2092 return (DDI_FAILURE);
2093 }
2094
2095 /*
2096 * Shut down the device so that we can recover any previously
2097 * submitted receive buffers.
2098 */
2099 virtio_shutdown(vif->vif_virtio);
2100 for (;;) {
2101 virtio_chain_t *vic;
2102
2103 if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
2104 break;
2105 }
2106
2107 vioif_rxbuf_t *rb = virtio_chain_data(vic);
2108 vioif_rxbuf_free(vif, rb);
2109 }
2110
2111 /*
2112 * vioif_free_bufs() must be called before virtio_fini()
2113 * as it uses virtio_chain_free() which itself depends on some
2114 * virtio data structures still being around.
2115 */
2116 vioif_free_bufs(vif);
2117 (void) virtio_fini(vif->vif_virtio, B_FALSE);
2118
2119 mutex_exit(&vif->vif_mutex);
2120 mutex_destroy(&vif->vif_mutex);
2121
2122 kmem_free(vif, sizeof (*vif));
2123
2124 return (DDI_SUCCESS);
2125 }
2126
2127 static int
2128 vioif_quiesce(dev_info_t *dip)
2129 {
2130 vioif_t *vif;
2131
2132 if ((vif = ddi_get_driver_private(dip)) == NULL)
2133 return (DDI_FAILURE);
2134
2135 return (virtio_quiesce(vif->vif_virtio));
2136 }
2137
2138 int
2139 _init(void)
2140 {
2141 int ret;
2142
2143 mac_init_ops(&vioif_dev_ops, "vioif");
2144
2145 if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
2146 mac_fini_ops(&vioif_dev_ops);
2147 }
2148
2149 return (ret);
2150 }
2151
2152 int
2153 _fini(void)
2154 {
2155 int ret;
2156
2157 if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
2158 mac_fini_ops(&vioif_dev_ops);
2159 }
2160
2161 return (ret);
2162 }
2163
2164 int
2165 _info(struct modinfo *modinfop)
2166 {
2167 return (mod_info(&vioif_modlinkage, modinfop));
2168 }
2169