1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29 * Copyright 2020 RackTop Systems, Inc.
30 */
31
32 /*
33 *
34 * Copyright (c) 2004 Christian Limpach.
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. This section intentionally left blank.
46 * 4. The name of the author may not be used to endorse or promote products
47 * derived from this software without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
50 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
53 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
54 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
55 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
56 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
57 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
58 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59 */
60 /*
61 * Section 3 of the above license was updated in response to bug 6379571.
62 */
63
64 /*
65 * xnf.c - GLDv3 network driver for domU.
66 */
67
68 /*
69 * This driver uses four per-instance locks:
70 *
71 * xnf_gref_lock:
72 *
73 * Protects access to the grant reference list stored in
74 * xnf_gref_head. Grant references should be acquired and released
75 * using gref_get() and gref_put() respectively.
76 *
77 * xnf_schedlock:
78 *
79 * Protects:
80 * xnf_need_sched - used to record that a previous transmit attempt
81 * failed (and consequently it will be necessary to call
82 * mac_tx_update() when transmit resources are available).
83 * xnf_pending_multicast - the number of multicast requests that
84 * have been submitted to the backend for which we have not
85 * processed responses.
86 *
87 * xnf_txlock:
88 *
89 * Protects the transmit ring (xnf_tx_ring) and associated
90 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
91 *
92 * xnf_rxlock:
93 *
94 * Protects the receive ring (xnf_rx_ring) and associated
95 * structures (notably xnf_rx_pkt_info).
96 *
97 * If driver-global state that affects both the transmit and receive
98  * rings is manipulated, both xnf_rxlock and xnf_txlock should be
99  * held, in that order.
100 *
101 * xnf_schedlock is acquired both whilst holding xnf_txlock and
102 * without. It should always be acquired after xnf_txlock if both are
103 * held.
104 *
105 * Notes:
106 * - atomic_add_64() is used to manipulate counters where we require
107 * accuracy. For counters intended only for observation by humans,
108 * post increment/decrement are used instead.
109 */
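
/*
 * Illustrative sketch of the ordering rules above (not part of the driver;
 * the helper name is hypothetical):
 *
 *	static void
 *	xnf_example_quiesce(xnf_t *xnfp)
 *	{
 *		mutex_enter(&xnfp->xnf_rxlock);
 *		mutex_enter(&xnfp->xnf_txlock);
 *		mutex_enter(&xnfp->xnf_schedlock);
 *		xnfp->xnf_need_sched = B_FALSE;
 *		mutex_exit(&xnfp->xnf_schedlock);
 *		mutex_exit(&xnfp->xnf_txlock);
 *		mutex_exit(&xnfp->xnf_rxlock);
 *	}
 */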
110
111 #include <sys/types.h>
112 #include <sys/errno.h>
113 #include <sys/param.h>
114 #include <sys/sysmacros.h>
115 #include <sys/systm.h>
116 #include <sys/stream.h>
117 #include <sys/strsubr.h>
118 #include <sys/strsun.h>
119 #include <sys/conf.h>
120 #include <sys/ddi.h>
121 #include <sys/devops.h>
122 #include <sys/sunddi.h>
123 #include <sys/sunndi.h>
124 #include <sys/dlpi.h>
125 #include <sys/ethernet.h>
126 #include <sys/strsun.h>
127 #include <sys/pattr.h>
128 #include <inet/ip.h>
129 #include <inet/ip_impl.h>
130 #include <inet/tcp.h>
131 #include <netinet/udp.h>
132 #include <sys/gld.h>
133 #include <sys/modctl.h>
134 #include <sys/mac_provider.h>
135 #include <sys/mac_ether.h>
136 #include <sys/bootinfo.h>
137 #include <sys/mach_mmu.h>
138 #ifdef XPV_HVM_DRIVER
139 #include <sys/xpv_support.h>
140 #include <sys/hypervisor.h>
141 #else
142 #include <sys/hypervisor.h>
143 #include <sys/evtchn_impl.h>
144 #include <sys/balloon_impl.h>
145 #endif
146 #include <xen/public/io/netif.h>
147 #include <sys/gnttab.h>
148 #include <xen/sys/xendev.h>
149 #include <sys/sdt.h>
150 #include <sys/note.h>
151 #include <sys/debug.h>
152
153 #include <io/xnf.h>
154
155 /*
156 * On a 32 bit PAE system physical and machine addresses are larger
157  * than 32 bits. ddi_btop() on such systems takes an unsigned long
158 * argument, and so addresses above 4G are truncated before ddi_btop()
159 * gets to see them. To avoid this, code the shift operation here.
160 */
161 #define xnf_btop(addr) ((addr) >> PAGESHIFT)
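
/*
 * For example (illustrative only, hypothetical address): with 4K pages a
 * machine address of 0x127654000 lies above 4G. On a 32-bit PAE kernel
 * ddi_btop() would only see the truncated low 32 bits (0x27654000) and
 * return frame 0x27654, whereas xnf_btop(0x127654000ULL) shifts the full
 * 64-bit value and yields the correct frame 0x127654.
 */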
162
163 /*
164 * The parameters below should only be changed in /etc/system, never in mdb.
165 */
166
167 /*
168 * Should we use the multicast control feature if the backend provides
169 * it?
170 */
171 boolean_t xnf_multicast_control = B_TRUE;
172
173 /*
174 * Should we allow scatter-gather for tx if backend allows it?
175 */
176 boolean_t xnf_enable_tx_sg = B_TRUE;
177
178 /*
179 * Should we allow scatter-gather for rx if backend allows it?
180 */
181 boolean_t xnf_enable_rx_sg = B_TRUE;
182
183 /*
184 * Should we allow lso for tx sends if backend allows it?
185  * Requires xnf_enable_tx_sg to also be set to TRUE.
186 */
187 boolean_t xnf_enable_lso = B_TRUE;
188
189 /*
190 * Should we allow lro on rx if backend supports it?
191  * Requires xnf_enable_rx_sg to also be set to TRUE.
192 *
193 * !! WARNING !!
194 * LRO is not yet supported in the OS so this should be left as FALSE.
195 * !! WARNING !!
196 */
197 boolean_t xnf_enable_lro = B_FALSE;
198
199 /*
200 * Received packets below this size are copied to a new streams buffer
201 * rather than being desballoc'ed.
202 *
203 * This value is chosen to accommodate traffic where there are a large
204 * number of small packets. For data showing a typical distribution,
205 * see:
206 *
207 * Sinha07a:
208 * Rishi Sinha, Christos Papadopoulos, and John
209 * Heidemann. Internet Packet Size Distributions: Some
210 * Observations. Technical Report ISI-TR-2007-643,
211  * USC/Information Sciences Institute, May, 2007. Originally
212 * released October 2005 as web page
213 * http://netweb.usc.edu/~sinha/pkt-sizes/.
214 * <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
215 */
216 size_t xnf_rx_copy_limit = 64;
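
/*
 * A minimal sketch of how the receive path applies this threshold
 * (illustrative only; `bdesc', `off' and `len' are placeholders for values
 * computed in the rx collection code later in this file):
 *
 *	if (len > xnf_rx_copy_limit) {
 *		mp = desballoc(...);		(loan the rx buffer upstream)
 *	} else if ((mp = allocb(len, 0)) != NULL) {
 *		bcopy(bdesc->buf + off, mp->b_wptr, len);
 *	}
 */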
217
218 #define INVALID_GRANT_HANDLE ((grant_handle_t)-1)
219 #define INVALID_GRANT_REF ((grant_ref_t)-1)
220 #define INVALID_TX_ID ((uint16_t)-1)
221
222 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
223 #define TX_ID_VALID(i) \
224 (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
225
226 /*
227 * calculate how many pages are spanned by an mblk fragment
228 */
229 #define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \
230 xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
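
/*
 * For example (illustrative only): with 4K pages, a 512 byte fragment whose
 * b_rptr sits at offset 0xf00 within a page ends in the next page, so the
 * macro yields 2; the same fragment starting at offset 0 yields 1, and a
 * zero-length mblk yields 0.
 */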
231
232 /* Required system entry points */
233 static int xnf_attach(dev_info_t *, ddi_attach_cmd_t);
234 static int xnf_detach(dev_info_t *, ddi_detach_cmd_t);
235
236 /* Required driver entry points for Nemo */
237 static int xnf_start(void *);
238 static void xnf_stop(void *);
239 static int xnf_set_mac_addr(void *, const uint8_t *);
240 static int xnf_set_multicast(void *, boolean_t, const uint8_t *);
241 static int xnf_set_promiscuous(void *, boolean_t);
242 static mblk_t *xnf_send(void *, mblk_t *);
243 static uint_t xnf_intr(caddr_t);
244 static int xnf_stat(void *, uint_t, uint64_t *);
245 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
246 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
247 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
248 const void *);
249 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
250 mac_prop_info_handle_t);
251
252 /* Driver private functions */
253 static int xnf_alloc_dma_resources(xnf_t *);
254 static void xnf_release_dma_resources(xnf_t *);
255 static void xnf_release_mblks(xnf_t *);
256
257 static int xnf_buf_constructor(void *, void *, int);
258 static void xnf_buf_destructor(void *, void *);
259 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
260 #pragma inline(xnf_buf_get)
261 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
262 #pragma inline(xnf_buf_put)
263 static void xnf_buf_refresh(xnf_buf_t *);
264 #pragma inline(xnf_buf_refresh)
265 static void xnf_buf_recycle(xnf_buf_t *);
266
267 static int xnf_tx_buf_constructor(void *, void *, int);
268 static void xnf_tx_buf_destructor(void *, void *);
269
270 static grant_ref_t xnf_gref_get(xnf_t *);
271 #pragma inline(xnf_gref_get)
272 static void xnf_gref_put(xnf_t *, grant_ref_t);
273 #pragma inline(xnf_gref_put)
274
275 static xnf_txid_t *xnf_txid_get(xnf_t *);
276 #pragma inline(xnf_txid_get)
277 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
278 #pragma inline(xnf_txid_put)
279
280 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
281 static int xnf_tx_clean_ring(xnf_t *);
282 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
283 void *, void *);
284 static boolean_t xnf_kstat_init(xnf_t *);
285 static void xnf_rx_collect(xnf_t *);
286
287 #define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES)
288
289 static mac_callbacks_t xnf_callbacks = {
290 .mc_callbacks = XNF_CALLBACK_FLAGS,
291 .mc_getstat = xnf_stat,
292 .mc_start = xnf_start,
293 .mc_stop = xnf_stop,
294 .mc_setpromisc = xnf_set_promiscuous,
295 .mc_multicst = xnf_set_multicast,
296 .mc_unicst = xnf_set_mac_addr,
297 .mc_tx = xnf_send,
298 .mc_getcapab = xnf_getcapab,
299 .mc_setprop = xnf_setprop,
300 .mc_getprop = xnf_getprop,
301 .mc_propinfo = xnf_propinfo,
302 };
303
304 /* DMA attributes for network ring buffer */
305 static ddi_dma_attr_t ringbuf_dma_attr = {
306 .dma_attr_version = DMA_ATTR_V0,
307 .dma_attr_addr_lo = 0,
308 .dma_attr_addr_hi = 0xffffffffffffffffULL,
309 .dma_attr_count_max = 0x7fffffff,
310 .dma_attr_align = MMU_PAGESIZE,
311 .dma_attr_burstsizes = 0x7ff,
312 .dma_attr_minxfer = 1,
313 .dma_attr_maxxfer = 0xffffffffU,
314 .dma_attr_seg = 0xffffffffffffffffULL,
315 .dma_attr_sgllen = 1,
316 .dma_attr_granular = 1,
317 .dma_attr_flags = 0
318 };
319
320 /* DMA attributes for receive data */
321 static ddi_dma_attr_t rx_buf_dma_attr = {
322 .dma_attr_version = DMA_ATTR_V0,
323 .dma_attr_addr_lo = 0,
324 .dma_attr_addr_hi = 0xffffffffffffffffULL,
325 .dma_attr_count_max = MMU_PAGEOFFSET,
326 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
327 .dma_attr_burstsizes = 0x7ff,
328 .dma_attr_minxfer = 1,
329 .dma_attr_maxxfer = 0xffffffffU,
330 .dma_attr_seg = 0xffffffffffffffffULL,
331 .dma_attr_sgllen = 1,
332 .dma_attr_granular = 1,
333 .dma_attr_flags = 0
334 };
335
336 /* DMA attributes for transmit data */
337 static ddi_dma_attr_t tx_buf_dma_attr = {
338 .dma_attr_version = DMA_ATTR_V0,
339 .dma_attr_addr_lo = 0,
340 .dma_attr_addr_hi = 0xffffffffffffffffULL,
341 .dma_attr_count_max = MMU_PAGEOFFSET,
342 .dma_attr_align = 1,
343 .dma_attr_burstsizes = 0x7ff,
344 .dma_attr_minxfer = 1,
345 .dma_attr_maxxfer = 0xffffffffU,
346 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
347 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
348 .dma_attr_granular = 1,
349 .dma_attr_flags = 0
350 };
351
352 /* DMA access attributes for registers and descriptors */
353 static ddi_device_acc_attr_t accattr = {
354 DDI_DEVICE_ATTR_V0,
355 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */
356 DDI_STRICTORDER_ACC
357 };
358
359 /* DMA access attributes for data: NOT to be byte swapped. */
360 static ddi_device_acc_attr_t data_accattr = {
361 DDI_DEVICE_ATTR_V0,
362 DDI_NEVERSWAP_ACC,
363 DDI_STRICTORDER_ACC
364 };
365
366 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
367 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
368
369 static struct modldrv xnf_modldrv = {
370 &mod_driverops,
371 "Virtual Ethernet driver",
372 &xnf_dev_ops
373 };
374
375 static struct modlinkage modlinkage = {
376 MODREV_1, &xnf_modldrv, NULL
377 };
378
379 int
380 _init(void)
381 {
382 int r;
383
384 mac_init_ops(&xnf_dev_ops, "xnf");
385 r = mod_install(&modlinkage);
386 if (r != DDI_SUCCESS)
387 mac_fini_ops(&xnf_dev_ops);
388
389 return (r);
390 }
391
392 int
393 _fini(void)
394 {
395 return (EBUSY); /* XXPV should be removable */
396 }
397
398 int
399 _info(struct modinfo *modinfop)
400 {
401 return (mod_info(&modlinkage, modinfop));
402 }
403
404 /*
405 * Acquire a grant reference.
406 */
407 static grant_ref_t
408 xnf_gref_get(xnf_t *xnfp)
409 {
410 grant_ref_t gref;
411
412 mutex_enter(&xnfp->xnf_gref_lock);
413
414 do {
415 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
416
417 } while ((gref == INVALID_GRANT_REF) &&
418 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
419
420 mutex_exit(&xnfp->xnf_gref_lock);
421
422 if (gref == INVALID_GRANT_REF) {
423 xnfp->xnf_stat_gref_failure++;
424 } else {
425 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
426 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
427 xnfp->xnf_stat_gref_peak =
428 xnfp->xnf_stat_gref_outstanding;
429 }
430
431 return (gref);
432 }
433
434 /*
435 * Release a grant reference.
436 */
437 static void
438 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
439 {
440 ASSERT(gref != INVALID_GRANT_REF);
441
442 mutex_enter(&xnfp->xnf_gref_lock);
443 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
444 mutex_exit(&xnfp->xnf_gref_lock);
445
446 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
447 }
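
/*
 * Typical usage of the pair above (illustrative sketch only; `oeid' and
 * `mfn' stand in for values computed by the caller):
 *
 *	grant_ref_t gref = xnf_gref_get(xnfp);
 *	if (gref == INVALID_GRANT_REF)
 *		goto fail;
 *	gnttab_grant_foreign_access_ref(gref, oeid, mfn, 1);
 *	...
 *	(void) gnttab_end_foreign_access_ref(gref, 1);
 *	xnf_gref_put(xnfp, gref);
 */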
448
449 /*
450 * Acquire a transmit id.
451 */
452 static xnf_txid_t *
453 xnf_txid_get(xnf_t *xnfp)
454 {
455 xnf_txid_t *tidp;
456
457 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
458
459 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
460 return (NULL);
461
462 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
463
464 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
465 xnfp->xnf_tx_pkt_id_head = tidp->next;
466 tidp->next = INVALID_TX_ID;
467
468 ASSERT(tidp->txbuf == NULL);
469
470 return (tidp);
471 }
472
473 /*
474 * Release a transmit id.
475 */
476 static void
477 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
478 {
479 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
480 ASSERT(TX_ID_VALID(tidp->id));
481 ASSERT(tidp->next == INVALID_TX_ID);
482
483 tidp->txbuf = NULL;
484 tidp->next = xnfp->xnf_tx_pkt_id_head;
485 xnfp->xnf_tx_pkt_id_head = tidp->id;
486 }
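
/*
 * Both helpers above require xnf_txlock to be held. A caller claims an id
 * for the lifetime of a request and returns it once the response has been
 * processed (illustrative sketch only; `txrp' and `txp' are placeholders):
 *
 *	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
 *	xnf_txid_t *tidp = xnf_txid_get(xnfp);
 *	if (tidp != NULL) {
 *		txrp->id = tidp->id;
 *		tidp->txbuf = txp;
 *		...
 *		xnf_txid_put(xnfp, tidp);	(this also clears tidp->txbuf)
 *	}
 */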
487
488 static void
489 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
490 {
491 ASSERT3U(txp->tx_type, ==, TX_DATA);
492
493 /*
494 * We are either using a lookaside buffer or we are mapping existing
495 * buffers.
496 */
497 if (txp->tx_bdesc != NULL) {
498 ASSERT(!txp->tx_handle_bound);
499 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
500 } else {
501 if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
502 if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
503 0) {
504 cmn_err(CE_PANIC, "tx grant %d still in use by "
505 "backend domain", txp->tx_txreq.gref);
506 }
507 (void) gnttab_end_foreign_access_ref(
508 txp->tx_txreq.gref, 1);
509 xnf_gref_put(xnfp, txp->tx_txreq.gref);
510 }
511
512 if (txp->tx_handle_bound)
513 (void) ddi_dma_unbind_handle(txp->tx_dma_handle);
514 }
515
516 if (txp->tx_mp != NULL)
517 freemsg(txp->tx_mp);
518
519 if (txp->tx_prev != NULL) {
520 ASSERT3P(txp->tx_prev->tx_next, ==, txp);
521 txp->tx_prev->tx_next = NULL;
522 }
523
524 if (txp->tx_txreq.id != INVALID_TX_ID) {
525 /*
526 		 * This should only be possible when resuming from a suspend.
527 */
528 ASSERT(!xnfp->xnf_connected);
529 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
530 txp->tx_txreq.id = INVALID_TX_ID;
531 }
532
533 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
534 }
535
536 static void
537 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
538 {
539 if (txp == NULL)
540 return;
541
542 while (txp->tx_next != NULL)
543 txp = txp->tx_next;
544
545 /*
546 * We free the chain in reverse order so that grants can be released
547 * for all dma chunks before unbinding the dma handles. The mblk is
548 * freed last, after all its fragments' dma handles are unbound.
549 */
550 xnf_txbuf_t *prev;
551 for (; txp != NULL; txp = prev) {
552 prev = txp->tx_prev;
553 xnf_data_txbuf_free(xnfp, txp);
554 }
555 }
556
557 static xnf_txbuf_t *
558 xnf_data_txbuf_alloc(xnf_t *xnfp, int flag)
559 {
560 xnf_txbuf_t *txp;
561
562 if ((txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, flag)) == NULL) {
563 return (NULL);
564 }
565
566 txp->tx_type = TX_DATA;
567 txp->tx_next = NULL;
568 txp->tx_prev = NULL;
569 txp->tx_head = txp;
570 txp->tx_frags_to_ack = 0;
571 txp->tx_mp = NULL;
572 txp->tx_bdesc = NULL;
573 txp->tx_handle_bound = B_FALSE;
574 txp->tx_txreq.gref = INVALID_GRANT_REF;
575 txp->tx_txreq.id = INVALID_TX_ID;
576
577 return (txp);
578 }
579
580 /*
581 * Get `wanted' slots in the transmit ring, waiting for at least that
582 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
583 * `wanted' to zero.
584 *
585 * Return the number of slots available.
586 */
587 static int
588 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
589 {
590 int slotsfree;
591 boolean_t forced_clean = (wanted == 0);
592
593 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
594
595 /* LINTED: constant in conditional context */
596 while (B_TRUE) {
597 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
598
599 if ((slotsfree < wanted) || forced_clean)
600 slotsfree = xnf_tx_clean_ring(xnfp);
601
602 /*
603 * If there are more than we need free, tell other
604 * people to come looking again. We hold txlock, so we
605 * are able to take our slots before anyone else runs.
606 */
607 if (slotsfree > wanted)
608 cv_broadcast(&xnfp->xnf_cv_tx_slots);
609
610 if (slotsfree >= wanted)
611 break;
612
613 if (!wait)
614 break;
615
616 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
617 }
618
619 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
620
621 return (slotsfree);
622 }
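
/*
 * The two call sites later in this file illustrate the `wanted'/`wait'
 * semantics (condensed sketch):
 *
 *	xnf_set_multicast(): block until the two needed slots are available
 *		n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
 *
 *	xnf_send(): poll for the worst case and defer if the ring is full
 *		slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX,
 *		    B_FALSE);
 *		if (slots_free < XEN_MAX_SLOTS_PER_TX)
 *			... set xnf_need_sched and hand the mblk back ...
 */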
623
624 static int
625 xnf_setup_rings(xnf_t *xnfp)
626 {
627 domid_t oeid;
628 struct xenbus_device *xsd;
629 RING_IDX i;
630 int err;
631 xnf_txid_t *tidp;
632 xnf_buf_t **bdescp;
633
634 oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
635 xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
636
637 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
638 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
639
640 err = gnttab_grant_foreign_access(oeid,
641 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
642 if (err <= 0) {
643 err = -err;
644 xenbus_dev_error(xsd, err, "granting access to tx ring page");
645 goto out;
646 }
647 xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
648
649 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
650 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
651
652 err = gnttab_grant_foreign_access(oeid,
653 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
654 if (err <= 0) {
655 err = -err;
656 xenbus_dev_error(xsd, err, "granting access to rx ring page");
657 goto out;
658 }
659 xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
660
661 mutex_enter(&xnfp->xnf_txlock);
662
663 /*
664 	 * We first clean up the TX ring in case we are doing a resume.
665 * Note that this can lose packets, but we expect to stagger on.
666 */
667 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
668 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
669 i < NET_TX_RING_SIZE;
670 i++, tidp++) {
671 xnf_txbuf_t *txp = tidp->txbuf;
672 if (txp == NULL)
673 continue;
674
675 switch (txp->tx_type) {
676 case TX_DATA:
677 /*
678 * txid_put() will be called for each txbuf's txid in
679 * the chain which will result in clearing tidp->txbuf.
680 */
681 xnf_data_txbuf_free_chain(xnfp, txp);
682
683 break;
684
685 case TX_MCAST_REQ:
686 txp->tx_type = TX_MCAST_RSP;
687 txp->tx_status = NETIF_RSP_DROPPED;
688 cv_broadcast(&xnfp->xnf_cv_multicast);
689
690 /*
691 * The request consumed two slots in the ring,
692 * yet only a single xnf_txid_t is used. Step
693 * over the empty slot.
694 */
695 i++;
696 ASSERT3U(i, <, NET_TX_RING_SIZE);
697 break;
698
699 case TX_MCAST_RSP:
700 break;
701 }
702 }
703
704 /*
705 * Now purge old list and add each txid to the new free list.
706 */
707 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
708 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
709 i < NET_TX_RING_SIZE;
710 i++, tidp++) {
711 tidp->id = i;
712 ASSERT3P(tidp->txbuf, ==, NULL);
713 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
714 xnf_txid_put(xnfp, tidp);
715 }
716
717 /* LINTED: constant in conditional context */
718 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
719 /* LINTED: constant in conditional context */
720 FRONT_RING_INIT(&xnfp->xnf_tx_ring,
721 xnfp->xnf_tx_ring.sring, PAGESIZE);
722
723 mutex_exit(&xnfp->xnf_txlock);
724
725 mutex_enter(&xnfp->xnf_rxlock);
726
727 /*
728 * Clean out any buffers currently posted to the receive ring
729 * before we reset it.
730 */
731 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
732 i < NET_RX_RING_SIZE;
733 i++, bdescp++) {
734 if (*bdescp != NULL) {
735 xnf_buf_put(xnfp, *bdescp, B_FALSE);
736 *bdescp = NULL;
737 }
738 }
739
740 /* LINTED: constant in conditional context */
741 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
742 /* LINTED: constant in conditional context */
743 FRONT_RING_INIT(&xnfp->xnf_rx_ring,
744 xnfp->xnf_rx_ring.sring, PAGESIZE);
745
746 /*
747 * Fill the ring with buffers.
748 */
749 for (i = 0; i < NET_RX_RING_SIZE; i++) {
750 xnf_buf_t *bdesc;
751
752 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
753 VERIFY(bdesc != NULL);
754 xnf_rxbuf_hang(xnfp, bdesc);
755 }
756
757 /* LINTED: constant in conditional context */
758 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
759
760 mutex_exit(&xnfp->xnf_rxlock);
761
762 return (0);
763
764 out:
765 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
766 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
767 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
768
769 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
770 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
771 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
772
773 return (err);
774 }
775
776 /*
777 * Connect driver to back end, called to set up communication with
778 * back end driver both initially and on resume after restore/migrate.
779 */
780 void
781 xnf_be_connect(xnf_t *xnfp)
782 {
783 const char *message;
784 xenbus_transaction_t xbt;
785 struct xenbus_device *xsd;
786 char *xsname;
787 int err;
788
789 ASSERT(!xnfp->xnf_connected);
790
791 xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
792 xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
793
794 err = xnf_setup_rings(xnfp);
795 if (err != 0) {
796 cmn_err(CE_WARN, "failed to set up tx/rx rings");
797 xenbus_dev_error(xsd, err, "setting up ring");
798 return;
799 }
800
801 again:
802 err = xenbus_transaction_start(&xbt);
803 if (err != 0) {
804 xenbus_dev_error(xsd, EIO, "starting transaction");
805 return;
806 }
807
808 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
809 xnfp->xnf_tx_ring_ref);
810 if (err != 0) {
811 message = "writing tx ring-ref";
812 goto abort_transaction;
813 }
814
815 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
816 xnfp->xnf_rx_ring_ref);
817 if (err != 0) {
818 message = "writing rx ring-ref";
819 goto abort_transaction;
820 }
821
822 err = xenbus_printf(xbt, xsname, "event-channel", "%u",
823 xnfp->xnf_evtchn);
824 if (err != 0) {
825 message = "writing event-channel";
826 goto abort_transaction;
827 }
828
829 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
830 if (err != 0) {
831 message = "writing feature-rx-notify";
832 goto abort_transaction;
833 }
834
835 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
836 if (err != 0) {
837 message = "writing request-rx-copy";
838 goto abort_transaction;
839 }
840
841 if (xnfp->xnf_be_mcast_control) {
842 err = xenbus_printf(xbt, xsname, "request-multicast-control",
843 "%d", 1);
844 if (err != 0) {
845 message = "writing request-multicast-control";
846 goto abort_transaction;
847 }
848 }
849
850 /*
851 * Tell backend if we support scatter-gather lists on the rx side.
852 */
853 err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
854 xnf_enable_rx_sg ? 1 : 0);
855 if (err != 0) {
856 message = "writing feature-sg";
857 goto abort_transaction;
858 }
859
860 /*
861 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
862 * a prerequisite.
863 */
864 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
865 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
866 if (err != 0) {
867 message = "writing feature-gso-tcpv4";
868 goto abort_transaction;
869 }
870
871 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
872 if (err != 0) {
873 message = "switching state to XenbusStateConnected";
874 goto abort_transaction;
875 }
876
877 err = xenbus_transaction_end(xbt, 0);
878 if (err != 0) {
879 if (err == EAGAIN)
880 goto again;
881 xenbus_dev_error(xsd, err, "completing transaction");
882 }
883
884 return;
885
886 abort_transaction:
887 (void) xenbus_transaction_end(xbt, 1);
888 xenbus_dev_error(xsd, err, "%s", message);
889 }
890
891 /*
892 * Read configuration information from xenstore.
893 */
894 void
895 xnf_read_config(xnf_t *xnfp)
896 {
897 int err, be_cap;
898 char mac[ETHERADDRL * 3];
899 char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
900
901 err = xenbus_scanf(XBT_NULL, oename, "mac",
902 "%s", (char *)&mac[0]);
903 if (err != 0) {
904 /*
905 		 * Bad: we're supposed to be set up with a proper mac
906 		 * address at this point.
907 */
908 cmn_err(CE_WARN, "%s%d: no mac address",
909 ddi_driver_name(xnfp->xnf_devinfo),
910 ddi_get_instance(xnfp->xnf_devinfo));
911 return;
912 }
913 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
914 err = ENOENT;
915 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
916 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
917 return;
918 }
919
920 err = xenbus_scanf(XBT_NULL, oename,
921 "feature-rx-copy", "%d", &be_cap);
922 /*
923 * If we fail to read the store we assume that the key is
924 * absent, implying an older domain at the far end. Older
925 * domains cannot do HV copy.
926 */
927 if (err != 0)
928 be_cap = 0;
929 xnfp->xnf_be_rx_copy = (be_cap != 0);
930
931 err = xenbus_scanf(XBT_NULL, oename,
932 "feature-multicast-control", "%d", &be_cap);
933 /*
934 * If we fail to read the store we assume that the key is
935 * absent, implying an older domain at the far end. Older
936 * domains do not support multicast control.
937 */
938 if (err != 0)
939 be_cap = 0;
940 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
941
942 /*
943 * See if back-end supports scatter-gather for transmits. If not,
944 * we will not support LSO and limit the mtu to 1500.
945 */
946 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
947 if (err != 0) {
948 be_cap = 0;
949 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
950 "'feature-sg' from backend driver");
951 }
952 if (be_cap == 0) {
953 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
954 "supported for transmits in the backend driver. LSO is "
955 "disabled and MTU is restricted to 1500 bytes.");
956 }
957 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
958
959 if (xnfp->xnf_be_tx_sg) {
960 /*
961 * Check if LSO is supported. Currently we only check for
962 * IPv4 as Illumos doesn't support LSO for IPv6.
963 */
964 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
965 &be_cap);
966 if (err != 0) {
967 be_cap = 0;
968 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
969 "'feature-gso-tcpv4' from backend driver");
970 }
971 if (be_cap == 0) {
972 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
973 "supported by the backend driver. Performance "
974 "will be affected.");
975 }
976 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
977 }
978 }
979
980 /*
981 * attach(9E) -- Attach a device to the system
982 */
983 static int
984 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
985 {
986 mac_register_t *macp;
987 xnf_t *xnfp;
988 int err;
989 char cachename[32];
990
991 switch (cmd) {
992 case DDI_RESUME:
993 xnfp = ddi_get_driver_private(devinfo);
994 xnfp->xnf_gen++;
995
996 (void) xvdi_resume(devinfo);
997 (void) xvdi_alloc_evtchn(devinfo);
998 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
999 #ifdef XPV_HVM_DRIVER
1000 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
1001 xnfp);
1002 #else
1003 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
1004 (caddr_t)xnfp);
1005 #endif
1006 return (DDI_SUCCESS);
1007
1008 case DDI_ATTACH:
1009 break;
1010
1011 default:
1012 return (DDI_FAILURE);
1013 }
1014
1015 /*
1016 * Allocate gld_mac_info_t and xnf_instance structures
1017 */
1018 macp = mac_alloc(MAC_VERSION);
1019 if (macp == NULL)
1020 return (DDI_FAILURE);
1021 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1022
1023 xnfp->xnf_tx_pkt_id =
1024 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1025
1026 xnfp->xnf_rx_pkt_info =
1027 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1028
1029 macp->m_dip = devinfo;
1030 macp->m_driver = xnfp;
1031 xnfp->xnf_devinfo = devinfo;
1032
1033 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1034 macp->m_src_addr = xnfp->xnf_mac_addr;
1035 macp->m_callbacks = &xnf_callbacks;
1036 macp->m_min_sdu = 0;
1037 xnfp->xnf_mtu = ETHERMTU;
1038 macp->m_max_sdu = xnfp->xnf_mtu;
1039
1040 xnfp->xnf_running = B_FALSE;
1041 xnfp->xnf_connected = B_FALSE;
1042 xnfp->xnf_be_rx_copy = B_FALSE;
1043 xnfp->xnf_be_mcast_control = B_FALSE;
1044 xnfp->xnf_need_sched = B_FALSE;
1045
1046 xnfp->xnf_rx_head = NULL;
1047 xnfp->xnf_rx_tail = NULL;
1048 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1049
1050 #ifdef XPV_HVM_DRIVER
1051 /* Report our version to dom0 */
1052 (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1053 HVMPV_XNF_VERS);
1054 #endif
1055
1056 /*
1057 * Get the iblock cookie with which to initialize the mutexes.
1058 */
1059 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1060 != DDI_SUCCESS)
1061 goto failure;
1062
1063 mutex_init(&xnfp->xnf_txlock,
1064 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1065 mutex_init(&xnfp->xnf_rxlock,
1066 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1067 mutex_init(&xnfp->xnf_schedlock,
1068 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1069 mutex_init(&xnfp->xnf_gref_lock,
1070 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1071
1072 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1073 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1074 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1075
1076 (void) sprintf(cachename, "xnf_buf_cache_%d",
1077 ddi_get_instance(devinfo));
1078 xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1079 sizeof (xnf_buf_t), 0,
1080 xnf_buf_constructor, xnf_buf_destructor,
1081 NULL, xnfp, NULL, 0);
1082 if (xnfp->xnf_buf_cache == NULL)
1083 goto failure_0;
1084
1085 (void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1086 ddi_get_instance(devinfo));
1087 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1088 sizeof (xnf_txbuf_t), 0,
1089 xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1090 NULL, xnfp, NULL, 0);
1091 if (xnfp->xnf_tx_buf_cache == NULL)
1092 goto failure_1;
1093
1094 xnfp->xnf_gref_head = INVALID_GRANT_REF;
1095
1096 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1097 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1098 "driver data structures",
1099 ddi_get_instance(xnfp->xnf_devinfo));
1100 goto failure_2;
1101 }
1102
1103 xnfp->xnf_rx_ring.sring->rsp_event =
1104 xnfp->xnf_tx_ring.sring->rsp_event = 1;
1105
1106 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1107 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1108
1109 /* set driver private pointer now */
1110 ddi_set_driver_private(devinfo, xnfp);
1111
1112 if (!xnf_kstat_init(xnfp))
1113 goto failure_3;
1114
1115 /*
1116 * Allocate an event channel, add the interrupt handler and
1117 * bind it to the event channel.
1118 */
1119 (void) xvdi_alloc_evtchn(devinfo);
1120 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1121 #ifdef XPV_HVM_DRIVER
1122 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1123 #else
1124 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1125 #endif
1126
1127 err = mac_register(macp, &xnfp->xnf_mh);
1128 mac_free(macp);
1129 macp = NULL;
1130 if (err != 0)
1131 goto failure_4;
1132
1133 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1134 != DDI_SUCCESS)
1135 goto failure_5;
1136
1137 #ifdef XPV_HVM_DRIVER
1138 /*
1139 * In the HVM case, this driver essentially replaces a driver for
1140 * a 'real' PCI NIC. Without the "model" property set to
1141 * "Ethernet controller", like the PCI code does, netbooting does
1142 * not work correctly, as strplumb_get_netdev_path() will not find
1143 * this interface.
1144 */
1145 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1146 "Ethernet controller");
1147 #endif
1148
1149 return (DDI_SUCCESS);
1150
1151 failure_5:
1152 (void) mac_unregister(xnfp->xnf_mh);
1153
1154 failure_4:
1155 #ifdef XPV_HVM_DRIVER
1156 ec_unbind_evtchn(xnfp->xnf_evtchn);
1157 xvdi_free_evtchn(devinfo);
1158 #else
1159 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1160 #endif
1161 xnfp->xnf_evtchn = INVALID_EVTCHN;
1162 kstat_delete(xnfp->xnf_kstat_aux);
1163
1164 failure_3:
1165 xnf_release_dma_resources(xnfp);
1166
1167 failure_2:
1168 kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1169
1170 failure_1:
1171 kmem_cache_destroy(xnfp->xnf_buf_cache);
1172
1173 failure_0:
1174 cv_destroy(&xnfp->xnf_cv_tx_slots);
1175 cv_destroy(&xnfp->xnf_cv_multicast);
1176 cv_destroy(&xnfp->xnf_cv_state);
1177
1178 mutex_destroy(&xnfp->xnf_gref_lock);
1179 mutex_destroy(&xnfp->xnf_schedlock);
1180 mutex_destroy(&xnfp->xnf_rxlock);
1181 mutex_destroy(&xnfp->xnf_txlock);
1182
1183 failure:
1184 kmem_free(xnfp, sizeof (*xnfp));
1185 if (macp != NULL)
1186 mac_free(macp);
1187
1188 return (DDI_FAILURE);
1189 }
1190
1191 /* detach(9E) -- Detach a device from the system */
1192 static int
1193 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1194 {
1195 xnf_t *xnfp; /* Our private device info */
1196
1197 xnfp = ddi_get_driver_private(devinfo);
1198
1199 switch (cmd) {
1200 case DDI_SUSPEND:
1201 #ifdef XPV_HVM_DRIVER
1202 ec_unbind_evtchn(xnfp->xnf_evtchn);
1203 xvdi_free_evtchn(devinfo);
1204 #else
1205 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1206 #endif
1207
1208 xvdi_suspend(devinfo);
1209
1210 mutex_enter(&xnfp->xnf_rxlock);
1211 mutex_enter(&xnfp->xnf_txlock);
1212
1213 xnfp->xnf_evtchn = INVALID_EVTCHN;
1214 xnfp->xnf_connected = B_FALSE;
1215 mutex_exit(&xnfp->xnf_txlock);
1216 mutex_exit(&xnfp->xnf_rxlock);
1217
1218 /* claim link to be down after disconnect */
1219 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1220 return (DDI_SUCCESS);
1221
1222 case DDI_DETACH:
1223 break;
1224
1225 default:
1226 return (DDI_FAILURE);
1227 }
1228
1229 if (xnfp->xnf_connected)
1230 return (DDI_FAILURE);
1231
1232 /*
1233 * Cannot detach if we have xnf_buf_t outstanding.
1234 */
1235 if (xnfp->xnf_stat_buf_allocated > 0)
1236 return (DDI_FAILURE);
1237
1238 if (mac_unregister(xnfp->xnf_mh) != 0)
1239 return (DDI_FAILURE);
1240
1241 kstat_delete(xnfp->xnf_kstat_aux);
1242
1243 /* Stop the receiver */
1244 xnf_stop(xnfp);
1245
1246 xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1247
1248 /* Remove the interrupt */
1249 #ifdef XPV_HVM_DRIVER
1250 ec_unbind_evtchn(xnfp->xnf_evtchn);
1251 xvdi_free_evtchn(devinfo);
1252 #else
1253 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1254 #endif
1255
1256 /* Release any pending xmit mblks */
1257 xnf_release_mblks(xnfp);
1258
1259 /* Release all DMA resources */
1260 xnf_release_dma_resources(xnfp);
1261
1262 cv_destroy(&xnfp->xnf_cv_tx_slots);
1263 cv_destroy(&xnfp->xnf_cv_multicast);
1264 cv_destroy(&xnfp->xnf_cv_state);
1265
1266 kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1267 kmem_cache_destroy(xnfp->xnf_buf_cache);
1268
1269 mutex_destroy(&xnfp->xnf_gref_lock);
1270 mutex_destroy(&xnfp->xnf_schedlock);
1271 mutex_destroy(&xnfp->xnf_rxlock);
1272 mutex_destroy(&xnfp->xnf_txlock);
1273
1274 kmem_free(xnfp, sizeof (*xnfp));
1275
1276 return (DDI_SUCCESS);
1277 }
1278
1279 /*
1280 * xnf_set_mac_addr() -- set the physical network address on the board.
1281 */
1282 static int
1283 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1284 {
1285 _NOTE(ARGUNUSED(arg, macaddr));
1286
1287 /*
1288 * We can't set our macaddr.
1289 */
1290 return (ENOTSUP);
1291 }
1292
1293 /*
1294 * xnf_set_multicast() -- set (enable) or disable a multicast address.
1295 *
1296 * Program the hardware to enable/disable the multicast address
1297 * in "mca". Enable if "add" is true, disable if false.
1298 */
1299 static int
1300 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1301 {
1302 xnf_t *xnfp = arg;
1303 xnf_txbuf_t *txp;
1304 int n_slots;
1305 RING_IDX slot;
1306 xnf_txid_t *tidp;
1307 netif_tx_request_t *txrp;
1308 struct netif_extra_info *erp;
1309 boolean_t notify, result;
1310
1311 /*
1312 * If the backend does not support multicast control then we
1313 * must assume that the right packets will just arrive.
1314 */
1315 if (!xnfp->xnf_be_mcast_control)
1316 return (0);
1317
1318 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1319
1320 mutex_enter(&xnfp->xnf_txlock);
1321
1322 /*
1323 * If we're not yet connected then claim success. This is
1324 * acceptable because we refresh the entire set of multicast
1325 * addresses when we get connected.
1326 *
1327 * We can't wait around here because the MAC layer expects
1328 * this to be a non-blocking operation - waiting ends up
1329 * causing a deadlock during resume.
1330 */
1331 if (!xnfp->xnf_connected) {
1332 mutex_exit(&xnfp->xnf_txlock);
1333 return (0);
1334 }
1335
1336 /*
1337 * 1. Acquire two slots in the ring.
1338 * 2. Fill in the slots.
1339 * 3. Request notification when the operation is done.
1340 * 4. Kick the peer.
1341 * 5. Wait for the response via xnf_tx_clean_ring().
1342 */
1343
1344 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1345 ASSERT(n_slots >= 2);
1346
1347 slot = xnfp->xnf_tx_ring.req_prod_pvt;
1348 tidp = xnf_txid_get(xnfp);
1349 VERIFY(tidp != NULL);
1350
1351 txp->tx_type = TX_MCAST_REQ;
1352 txp->tx_slot = slot;
1353
1354 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1355 erp = (struct netif_extra_info *)
1356 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1357
1358 txrp->gref = 0;
1359 txrp->size = 0;
1360 txrp->offset = 0;
1361 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1362 txrp->id = txp->tx_txreq.id = tidp->id;
1363 txrp->flags = NETTXF_extra_info;
1364
1365 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1366 XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1367 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1368
1369 tidp->txbuf = txp;
1370
1371 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1372
1373 mutex_enter(&xnfp->xnf_schedlock);
1374 xnfp->xnf_pending_multicast++;
1375 mutex_exit(&xnfp->xnf_schedlock);
1376
1377 /* LINTED: constant in conditional context */
1378 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1379 notify);
1380 if (notify)
1381 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1382
1383 while (txp->tx_type == TX_MCAST_REQ)
1384 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1385
1386 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1387
1388 mutex_enter(&xnfp->xnf_schedlock);
1389 xnfp->xnf_pending_multicast--;
1390 mutex_exit(&xnfp->xnf_schedlock);
1391
1392 result = (txp->tx_status == NETIF_RSP_OKAY);
1393
1394 xnf_txid_put(xnfp, tidp);
1395
1396 mutex_exit(&xnfp->xnf_txlock);
1397
1398 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1399
1400 return (result ? 0 : 1);
1401 }
1402
1403 /*
1404 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1405 *
1406 * Program the hardware to enable/disable promiscuous mode.
1407 */
1408 static int
1409 xnf_set_promiscuous(void *arg, boolean_t on)
1410 {
1411 _NOTE(ARGUNUSED(arg, on));
1412
1413 /*
1414 * We can't really do this, but we pretend that we can in
1415 * order that snoop will work.
1416 */
1417 return (0);
1418 }
1419
1420 /*
1421 * Clean buffers that we have responses for from the transmit ring.
1422 */
1423 static int
1424 xnf_tx_clean_ring(xnf_t *xnfp)
1425 {
1426 boolean_t work_to_do;
1427
1428 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1429
1430 loop:
1431 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1432 RING_IDX cons, prod, i;
1433
1434 cons = xnfp->xnf_tx_ring.rsp_cons;
1435 prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1436 membar_consumer();
1437 /*
1438 * Clean tx requests from ring that we have responses
1439 * for.
1440 */
1441 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1442 for (i = cons; i != prod; i++) {
1443 netif_tx_response_t *trp;
1444 xnf_txid_t *tidp;
1445 xnf_txbuf_t *txp;
1446
1447 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1448 /*
1449 * if this slot was occupied by netif_extra_info_t,
1450 * then the response will be NETIF_RSP_NULL. In this
1451 * case there are no resources to clean up.
1452 */
1453 if (trp->status == NETIF_RSP_NULL)
1454 continue;
1455
1456 ASSERT(TX_ID_VALID(trp->id));
1457
1458 tidp = TX_ID_TO_TXID(xnfp, trp->id);
1459 ASSERT3U(tidp->id, ==, trp->id);
1460 ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1461
1462 txp = tidp->txbuf;
1463 ASSERT(txp != NULL);
1464 ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1465
1466 switch (txp->tx_type) {
1467 case TX_DATA:
1468 /*
1469 * We must put the txid for each response we
1470 * acknowledge to make sure that we never have
1471 * more free slots than txids. Because of this
1472 * we do it here instead of waiting for it to
1473 * be done in xnf_data_txbuf_free_chain().
1474 */
1475 xnf_txid_put(xnfp, tidp);
1476 txp->tx_txreq.id = INVALID_TX_ID;
1477 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1478 txp->tx_head->tx_frags_to_ack--;
1479
1480 /*
1481 			 * We clean the whole chain once we have received a
1482 			 * response for each fragment.
1483 */
1484 if (txp->tx_head->tx_frags_to_ack == 0)
1485 xnf_data_txbuf_free_chain(xnfp, txp);
1486
1487 break;
1488
1489 case TX_MCAST_REQ:
1490 txp->tx_type = TX_MCAST_RSP;
1491 txp->tx_status = trp->status;
1492 cv_broadcast(&xnfp->xnf_cv_multicast);
1493
1494 break;
1495
1496 default:
1497 cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1498 "invalid xnf_txbuf_t type: %d",
1499 txp->tx_type);
1500 break;
1501 }
1502 }
1503 /*
1504 * Record the last response we dealt with so that we
1505 * know where to start next time around.
1506 */
1507 xnfp->xnf_tx_ring.rsp_cons = prod;
1508 membar_enter();
1509 }
1510
1511 /* LINTED: constant in conditional context */
1512 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1513 if (work_to_do)
1514 goto loop;
1515
1516 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1517 }
1518
1519 /*
1520 * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1521 * to ensure that the packet is physically contiguous and contained
1522 * within a single page.
1523 */
1524 static xnf_buf_t *
1525 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1526 {
1527 xnf_buf_t *bd;
1528 caddr_t bp;
1529
1530 if ((bd = xnf_buf_get(xnfp, KM_NOSLEEP, B_TRUE)) == NULL) {
1531 return (NULL);
1532 }
1533
1534 bp = bd->buf;
1535 while (mp != NULL) {
1536 size_t len = MBLKL(mp);
1537
1538 bcopy(mp->b_rptr, bp, len);
1539 bp += len;
1540
1541 mp = mp->b_cont;
1542 }
1543
1544 *plen = bp - bd->buf;
1545 ASSERT3U(*plen, <=, PAGESIZE);
1546
1547 xnfp->xnf_stat_tx_lookaside++;
1548
1549 return (bd);
1550 }
1551
1552 /*
1553 * Insert the pseudo-header checksum into the packet.
1554 * Assumes packet is IPv4, TCP/UDP since we only advertised support for
1555 * HCKSUM_INET_FULL_V4.
1556 */
1557 int
1558 xnf_pseudo_cksum(mblk_t *mp)
1559 {
1560 struct ether_header *ehp;
1561 uint16_t sap, iplen, *stuff;
1562 uint32_t cksum;
1563 size_t len;
1564 ipha_t *ipha;
1565 ipaddr_t src, dst;
1566 uchar_t *ptr;
1567
1568 ptr = mp->b_rptr;
1569 len = MBLKL(mp);
1570
1571 /* Each header must fit completely in an mblk. */
1572 ASSERT3U(len, >=, sizeof (*ehp));
1573
1574 ehp = (struct ether_header *)ptr;
1575
1576 if (ntohs(ehp->ether_type) == VLAN_TPID) {
1577 struct ether_vlan_header *evhp;
1578 ASSERT3U(len, >=, sizeof (*evhp));
1579 evhp = (struct ether_vlan_header *)ptr;
1580 sap = ntohs(evhp->ether_type);
1581 ptr += sizeof (*evhp);
1582 len -= sizeof (*evhp);
1583 } else {
1584 sap = ntohs(ehp->ether_type);
1585 ptr += sizeof (*ehp);
1586 len -= sizeof (*ehp);
1587 }
1588
1589 ASSERT3U(sap, ==, ETHERTYPE_IP);
1590
1591 /*
1592 * Ethernet and IP headers may be in different mblks.
1593 */
1594 ASSERT3P(ptr, <=, mp->b_wptr);
1595 if (ptr == mp->b_wptr) {
1596 mp = mp->b_cont;
1597 ptr = mp->b_rptr;
1598 len = MBLKL(mp);
1599 }
1600
1601 ASSERT3U(len, >=, sizeof (ipha_t));
1602 ipha = (ipha_t *)ptr;
1603
1604 /*
1605 * We assume the IP header has no options. (This is enforced in
1606 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1607 */
1608 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1609 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1610
1611 ptr += IP_SIMPLE_HDR_LENGTH;
1612 len -= IP_SIMPLE_HDR_LENGTH;
1613
1614 /*
1615 * IP and L4 headers may be in different mblks.
1616 */
1617 ASSERT3P(ptr, <=, mp->b_wptr);
1618 if (ptr == mp->b_wptr) {
1619 mp = mp->b_cont;
1620 ptr = mp->b_rptr;
1621 len = MBLKL(mp);
1622 }
1623
1624 switch (ipha->ipha_protocol) {
1625 case IPPROTO_TCP:
1626 ASSERT3U(len, >=, sizeof (tcph_t));
1627 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1628 cksum = IP_TCP_CSUM_COMP;
1629 break;
1630 case IPPROTO_UDP:
1631 ASSERT3U(len, >=, sizeof (struct udphdr));
1632 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1633 cksum = IP_UDP_CSUM_COMP;
1634 break;
1635 default:
1636 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1637 ipha->ipha_protocol);
1638 return (EINVAL);
1639 }
1640
1641 src = ipha->ipha_src;
1642 dst = ipha->ipha_dst;
1643
1644 cksum += (dst >> 16) + (dst & 0xFFFF);
1645 cksum += (src >> 16) + (src & 0xFFFF);
1646 cksum += htons(iplen);
1647
1648 cksum = (cksum >> 16) + (cksum & 0xFFFF);
1649 cksum = (cksum >> 16) + (cksum & 0xFFFF);
1650
1651 ASSERT(cksum <= 0xFFFF);
1652
1653 *stuff = (uint16_t)(cksum ? cksum : ~cksum);
1654
1655 return (0);
1656 }
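
/*
 * In other words (informal summary): the value stored above is just the
 * folded ones-complement sum of the pseudo-header,
 *
 *	fold(src + dst + protocol constant (IP_TCP_CSUM_COMP or
 *	    IP_UDP_CSUM_COMP) + L4 length)
 *
 * with the L4 header and payload words left for the backend (or its
 * hardware) to add when it completes the checksum on our behalf.
 */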
1657
1658 /*
1659 * Push a packet into the transmit ring.
1660 *
1661 * Note: the format of a tx packet that spans multiple slots is similar to
1662 * what is described in xnf_rx_one_packet().
1663 */
1664 static void
1665 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1666 {
1667 int nslots = 0;
1668 int extras = 0;
1669 RING_IDX slot;
1670 boolean_t notify;
1671
1672 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1673 ASSERT(xnfp->xnf_running);
1674
1675 slot = xnfp->xnf_tx_ring.req_prod_pvt;
1676
1677 /*
1678 * The caller has already checked that we have enough slots to proceed.
1679 */
1680 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1681 xnf_txid_t *tidp;
1682 netif_tx_request_t *txrp;
1683
1684 tidp = xnf_txid_get(xnfp);
1685 VERIFY(tidp != NULL);
1686 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1687
1688 txp->tx_slot = slot;
1689 txp->tx_txreq.id = tidp->id;
1690 *txrp = txp->tx_txreq;
1691
1692 tidp->txbuf = txp;
1693 slot++;
1694 nslots++;
1695
1696 /*
1697 * When present, LSO info is placed in a slot after the first
1698 * data segment, and doesn't require a txid.
1699 */
1700 if (txp->tx_txreq.flags & NETTXF_extra_info) {
1701 netif_extra_info_t *extra;
1702 ASSERT3U(nslots, ==, 1);
1703
1704 extra = (netif_extra_info_t *)
1705 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1706 *extra = txp->tx_extra;
1707 slot++;
1708 nslots++;
1709 extras = 1;
1710 }
1711 }
1712
1713 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1714
1715 /*
1716 * Store the number of data fragments.
1717 */
1718 head->tx_frags_to_ack = nslots - extras;
1719
1720 xnfp->xnf_tx_ring.req_prod_pvt = slot;
1721
1722 /*
1723 * Tell the peer that we sent something, if it cares.
1724 */
1725 /* LINTED: constant in conditional context */
1726 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1727 if (notify)
1728 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1729 }
1730
1731 static xnf_txbuf_t *
1732 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1733 {
1734 xnf_txbuf_t *txp;
1735 size_t length;
1736
1737 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1738 return (NULL);
1739 }
1740
1741 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1742 if (txp->tx_bdesc == NULL) {
1743 xnf_data_txbuf_free(xnfp, txp);
1744 return (NULL);
1745 }
1746 txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1747 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1748 txp->tx_txreq.size = length;
1749 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1750 txp->tx_txreq.flags = 0;
1751
1752 return (txp);
1753 }
1754
1755 static xnf_txbuf_t *
1756 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1757 {
1758 xnf_txbuf_t *head = NULL;
1759 xnf_txbuf_t *tail = NULL;
1760 domid_t oeid;
1761 int nsegs = 0;
1762
1763 oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1764
1765 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1766 ddi_dma_handle_t dma_handle;
1767 const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev;
1768 xnf_txbuf_t *txp;
1769
1770 if (MBLKL(ml) == 0)
1771 continue;
1772
1773 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1774 goto error;
1775 }
1776
1777 if (head == NULL) {
1778 head = txp;
1779 } else {
1780 ASSERT(tail != NULL);
1781 TXBUF_SETNEXT(tail, txp);
1782 txp->tx_head = head;
1783 }
1784
1785 /*
1786 * The necessary segmentation rules (e.g. not crossing a page
1787 * boundary) are enforced by the dma attributes of the handle.
1788 */
1789 dma_handle = txp->tx_dma_handle;
1790 int ret = ddi_dma_addr_bind_handle(dma_handle,
1791 NULL, (char *)ml->b_rptr, MBLKL(ml),
1792 DDI_DMA_WRITE | DDI_DMA_STREAMING,
1793 DDI_DMA_DONTWAIT, 0, NULL, NULL);
1794 if (ret != DDI_DMA_MAPPED) {
1795 if (ret != DDI_DMA_NORESOURCES) {
1796 dev_err(xnfp->xnf_devinfo, CE_WARN,
1797 "ddi_dma_addr_bind_handle() failed "
1798 "[dma_error=%d]", ret);
1799 }
1800 goto error;
1801 }
1802 txp->tx_handle_bound = B_TRUE;
1803
1804 dma_cookie_prev = NULL;
1805 while ((dma_cookie = ddi_dma_cookie_iter(dma_handle,
1806 dma_cookie_prev)) != NULL) {
1807 if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1808 dev_err(xnfp->xnf_devinfo, CE_WARN,
1809 "xnf_dmamap_alloc() failed: "
1810 "too many segments");
1811 goto error;
1812 }
1813 if (dma_cookie_prev != NULL) {
1814 if ((txp = xnf_data_txbuf_alloc(xnfp,
1815 KM_NOSLEEP)) == NULL) {
1816 goto error;
1817 }
1818 ASSERT(tail != NULL);
1819 TXBUF_SETNEXT(tail, txp);
1820 txp->tx_head = head;
1821 }
1822
1823 txp->tx_mfn =
1824 xnf_btop(pa_to_ma(dma_cookie->dmac_laddress));
1825 txp->tx_txreq.gref = xnf_gref_get(xnfp);
1826 if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1827 dev_err(xnfp->xnf_devinfo, CE_WARN,
1828 "xnf_dmamap_alloc() failed: "
1829 "invalid grant ref");
1830 goto error;
1831 }
1832 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1833 oeid, txp->tx_mfn, 1);
1834 txp->tx_txreq.offset =
1835 dma_cookie->dmac_laddress & PAGEOFFSET;
1836 txp->tx_txreq.size = dma_cookie->dmac_size;
1837 txp->tx_txreq.flags = 0;
1838
1839 nsegs++;
1840
1841 if (tail != NULL)
1842 tail->tx_txreq.flags = NETTXF_more_data;
1843 tail = txp;
1844
1845 dma_cookie_prev = dma_cookie;
1846 }
1847 }
1848
1849 *countp = nsegs;
1850 return (head);
1851
1852 error:
1853 xnf_data_txbuf_free_chain(xnfp, head);
1854 return (NULL);
1855 }
1856
1857 static void
1858 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1859 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1860 {
1861 if (lso_flags != 0) {
1862 ASSERT3U(lso_flags, ==, HW_LSO);
1863 ASSERT3P(head->tx_bdesc, ==, NULL);
1864
1865 head->tx_txreq.flags |= NETTXF_extra_info;
1866 netif_extra_info_t *extra = &head->tx_extra;
1867 extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1868 extra->flags = 0;
1869 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1870 extra->u.gso.size = mss;
1871 extra->u.gso.features = 0;
1872 extra->u.gso.pad = 0;
1873 } else if (cksum_flags != 0) {
1874 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1875 /*
1876 * If the local protocol stack requests checksum
1877 * offload we set the 'checksum blank' flag,
1878 * indicating to the peer that we need the checksum
1879 * calculated for us.
1880 *
1881 * We _don't_ set the validated flag, because we haven't
1882 * validated that the data and the checksum match.
1883 *
1884 * Note: we already called xnf_pseudo_cksum() in
1885 * xnf_send(), so we just set the txreq flag here.
1886 */
1887 head->tx_txreq.flags |= NETTXF_csum_blank;
1888 xnfp->xnf_stat_tx_cksum_deferred++;
1889 }
1890 }
1891
1892 /*
1893 * Send packet mp. Called by the MAC framework.
1894 */
1895 static mblk_t *
1896 xnf_send(void *arg, mblk_t *mp)
1897 {
1898 xnf_t *xnfp = arg;
1899 xnf_txbuf_t *head;
1900 mblk_t *ml;
1901 int length;
1902 int pages, chunks, slots, slots_free;
1903 uint32_t cksum_flags, lso_flags, mss;
1904 boolean_t pulledup = B_FALSE;
1905 boolean_t force_copy = B_FALSE;
1906
1907 ASSERT3P(mp->b_next, ==, NULL);
1908
1909 mutex_enter(&xnfp->xnf_txlock);
1910
1911 /*
1912 * Wait until we are connected to the backend.
1913 */
1914 while (!xnfp->xnf_connected)
1915 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1916
1917 /*
1918 * To simplify logic and be in sync with the rescheduling mechanism,
1919 * we require the maximum amount of slots that could be used by a
1920 * transaction to be free before proceeding. The only downside of doing
1921 * this is that it slightly reduces the effective size of the ring.
1922 */
1923 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1924 if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1925 /*
1926 * We need to ask for a re-schedule later as the ring is full.
1927 */
1928 mutex_enter(&xnfp->xnf_schedlock);
1929 xnfp->xnf_need_sched = B_TRUE;
1930 mutex_exit(&xnfp->xnf_schedlock);
1931
1932 xnfp->xnf_stat_tx_defer++;
1933 mutex_exit(&xnfp->xnf_txlock);
1934 return (mp);
1935 }
1936
1937 /*
1938 * Get hw offload parameters.
1939 * This must be done before pulling up the mp as those parameters
1940 * are not copied over.
1941 */
1942 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1943 mac_lso_get(mp, &mss, &lso_flags);
1944
1945 /*
1946 * XXX: fix MAC framework so that we can advertise support for
1947 * partial checksum for IPv4 only. This way we won't need to calculate
1948 * the pseudo header checksum ourselves.
1949 */
1950 if (cksum_flags != 0) {
1951 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1952 (void) xnf_pseudo_cksum(mp);
1953 }
1954
1955 pulledup:
1956 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1957 ml = ml->b_cont, chunks++) {
1958 pages += xnf_mblk_pages(ml);
1959 length += MBLKL(ml);
1960 }
1961 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1962 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1963
1964 /*
1965 * If the ethernet header crosses a page boundary the packet
1966 * will be dropped by the backend. In practice it seems like
1967 * this happens fairly rarely so we'll do nothing unless the
1968 * packet is small enough to fit in a look-aside buffer.
1969 */
1970 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1971 sizeof (struct ether_header) > PAGESIZE) {
1972 xnfp->xnf_stat_tx_eth_hdr_split++;
1973 if (length <= PAGESIZE)
1974 force_copy = B_TRUE;
1975 }
1976
1977 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1978 /*
1979 * If the packet spans several pages and scatter-gather is not
1980 * supported then use a look-aside buffer.
1981 */
1982 ASSERT3U(length, <=, PAGESIZE);
1983 head = xnf_mblk_copy(xnfp, mp);
1984 if (head == NULL) {
1985 dev_err(xnfp->xnf_devinfo, CE_WARN,
1986 "xnf_mblk_copy() failed");
1987 goto drop;
1988 }
1989 } else {
1990 /*
1991 * There's a limit on how many pages can be passed to the
1992 * backend. If we exceed that limit, the packet will be dropped
1993 * and some backend implementations (e.g. Linux) could even
1994 * offline the interface.
1995 */
1996 if (pages > XEN_MAX_TX_DATA_PAGES) {
1997 if (pulledup) {
1998 dev_err(xnfp->xnf_devinfo, CE_WARN,
1999 "too many pages, even after pullup: %d.",
2000 pages);
2001 goto drop;
2002 }
2003
2004 /*
2005 * Defragment packet if it spans too many pages.
2006 */
2007 mblk_t *newmp = msgpullup(mp, -1);
2008 if (newmp == NULL) {
2009 dev_err(xnfp->xnf_devinfo, CE_WARN,
2010 "msgpullup() failed");
2011 goto drop;
2012 }
2013
2014 freemsg(mp);
2015 mp = newmp;
2016 xnfp->xnf_stat_tx_pullup++;
2017 pulledup = B_TRUE;
2018 goto pulledup;
2019 }
2020
2021 head = xnf_mblk_map(xnfp, mp, &slots);
2022 if (head == NULL)
2023 goto drop;
2024
2025 IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2026 }
2027
2028 /*
2029 * Set tx_mp so that mblk is freed when the txbuf chain is freed.
2030 */
2031 head->tx_mp = mp;
2032
2033 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2034
2035 /*
2036 * The first request must store the total length of the packet.
2037 */
2038 head->tx_txreq.size = length;
2039
2040 /*
2041 * Push the packet we have prepared into the ring.
2042 */
2043 xnf_tx_push_packet(xnfp, head);
2044 xnfp->xnf_stat_opackets++;
2045 xnfp->xnf_stat_obytes += length;
2046
2047 mutex_exit(&xnfp->xnf_txlock);
2048 return (NULL);
2049
2050 drop:
2051 freemsg(mp);
2052 xnfp->xnf_stat_tx_drop++;
2053 mutex_exit(&xnfp->xnf_txlock);
2054 return (NULL);
2055 }
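/*
 * Illustrative sketch of the pseudo-header checksum fix-up mentioned in
 * xnf_send() above. This is NOT the driver's xnf_pseudo_cksum(); it is a
 * minimal example, assuming a TCP-over-IPv4 packet with aligned headers
 * and a known TCP length, of the kind of value that ends up in the TCP
 * checksum field when NETTXF_csum_blank is used:
 *
 *	static uint16_t
 *	example_pseudo_cksum(const struct ip *iph, uint16_t tcp_len)
 *	{
 *		uint32_t sum;
 *
 *		sum = (ntohl(iph->ip_src.s_addr) >> 16) +
 *		    (ntohl(iph->ip_src.s_addr) & 0xffff) +
 *		    (ntohl(iph->ip_dst.s_addr) >> 16) +
 *		    (ntohl(iph->ip_dst.s_addr) & 0xffff) +
 *		    IPPROTO_TCP + tcp_len;
 *		while ((sum >> 16) != 0)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return (htons((uint16_t)sum));
 *	}
 *
 * The folded sum (not its complement) is stored so that the peer can
 * continue the checksum over the TCP header and payload.
 */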
2056
2057 /*
2058 * Notification of RX packets. Currently no TX-complete interrupt is
2059 * used, as we clean the TX ring lazily.
2060 */
2061 static uint_t
2062 xnf_intr(caddr_t arg)
2063 {
2064 xnf_t *xnfp = (xnf_t *)arg;
2065 mblk_t *mp;
2066 boolean_t need_sched, clean_ring;
2067
2068 mutex_enter(&xnfp->xnf_rxlock);
2069
2070 /*
2071 * Interrupts before we are connected are spurious.
2072 */
2073 if (!xnfp->xnf_connected) {
2074 mutex_exit(&xnfp->xnf_rxlock);
2075 xnfp->xnf_stat_unclaimed_interrupts++;
2076 return (DDI_INTR_UNCLAIMED);
2077 }
2078
2079 /*
2080 * Receive side processing.
2081 */
2082 do {
2083 /*
2084 * Collect buffers from the ring.
2085 */
2086 xnf_rx_collect(xnfp);
2087
2088 /*
2089 * Interrupt me when the next receive buffer is consumed.
2090 */
2091 xnfp->xnf_rx_ring.sring->rsp_event =
2092 xnfp->xnf_rx_ring.rsp_cons + 1;
2093 xen_mb();
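/*
 * Re-check (in the loop condition below) for responses that
 * arrived between collecting the ring and updating rsp_event
 * above; this closes the race with the backend posting new
 * responses after we last looked.
 */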
2094
2095 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2096
2097 if (xnfp->xnf_rx_new_buffers_posted) {
2098 boolean_t notify;
2099
2100 /*
2101 * Indicate to the peer that we have re-filled the
2102 * receive ring, if it cares.
2103 */
2104 /* LINTED: constant in conditional context */
2105 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2106 if (notify)
2107 ec_notify_via_evtchn(xnfp->xnf_evtchn);
2108 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2109 }
2110
2111 mp = xnfp->xnf_rx_head;
2112 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2113
2114 xnfp->xnf_stat_interrupts++;
2115 mutex_exit(&xnfp->xnf_rxlock);
2116
2117 if (mp != NULL)
2118 mac_rx(xnfp->xnf_mh, NULL, mp);
2119
2120 /*
2121 * Transmit side processing.
2122 *
2123 * If a previous transmit attempt failed or we have pending
2124 * multicast requests, clean the ring.
2125 *
2126 * If we previously stalled transmission and cleaning produces
2127 * some free slots, tell upstream to attempt sending again.
2128 *
2129 * The odd style is to avoid acquiring xnf_txlock unless we
2130 * will actually look inside the tx machinery.
2131 */
2132 mutex_enter(&xnfp->xnf_schedlock);
2133 need_sched = xnfp->xnf_need_sched;
2134 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2135 mutex_exit(&xnfp->xnf_schedlock);
2136
2137 if (clean_ring) {
2138 int free_slots;
2139
2140 mutex_enter(&xnfp->xnf_txlock);
2141 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2142
2143 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2144 mutex_enter(&xnfp->xnf_schedlock);
2145 xnfp->xnf_need_sched = B_FALSE;
2146 mutex_exit(&xnfp->xnf_schedlock);
2147
2148 mac_tx_update(xnfp->xnf_mh);
2149 }
2150 mutex_exit(&xnfp->xnf_txlock);
2151 }
2152
2153 return (DDI_INTR_CLAIMED);
2154 }
2155
2156 /*
2157 * xnf_start() -- start the device: begin accepting packets from above.
2158 */
2159 static int
2160 xnf_start(void *arg)
2161 {
2162 xnf_t *xnfp = arg;
2163
2164 mutex_enter(&xnfp->xnf_rxlock);
2165 mutex_enter(&xnfp->xnf_txlock);
2166
2167 /* Accept packets from above. */
2168 xnfp->xnf_running = B_TRUE;
2169
2170 mutex_exit(&xnfp->xnf_txlock);
2171 mutex_exit(&xnfp->xnf_rxlock);
2172
2173 return (0);
2174 }
2175
2176 /* xnf_stop() -- stop accepting packets from above. */
2177 static void
2178 xnf_stop(void *arg)
2179 {
2180 xnf_t *xnfp = arg;
2181
2182 mutex_enter(&xnfp->xnf_rxlock);
2183 mutex_enter(&xnfp->xnf_txlock);
2184
2185 xnfp->xnf_running = B_FALSE;
2186
2187 mutex_exit(&xnfp->xnf_txlock);
2188 mutex_exit(&xnfp->xnf_rxlock);
2189 }
2190
2191 /*
2192 * Hang buffer `bdesc' on the RX ring.
2193 */
2194 static void
2195 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2196 {
2197 netif_rx_request_t *reqp;
2198 RING_IDX hang_ix;
2199
2200 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2201
2202 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2203 xnfp->xnf_rx_ring.req_prod_pvt);
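/*
 * reqp points into the shared ring, so its offset from slot 0 is
 * req_prod_pvt masked to the ring size; this index is stored in
 * both the request and `bdesc' so that the response can later be
 * matched back to the buffer in xnf_rx_one_packet().
 */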
2204 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2205 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2206
2207 reqp->id = bdesc->id = hang_ix;
2208 reqp->gref = bdesc->grant_ref;
2209
2210 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2211 xnfp->xnf_rx_ring.req_prod_pvt++;
2212
2213 xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2214 }
2215
2216 /*
2217 * Receive an entire packet from the ring, starting from slot *consp.
2218 * prod indicates the slot of the latest response.
2219 * On return, *consp will point to the head of the next packet.
2220 *
2221 * Note: If slot prod was reached before we could gather a full packet, we will
2222 * drop the partial packet; this would most likely indicate a bug in either
2223 * the front-end or the back-end driver.
2224 *
2225 * An rx packet can consist of several fragments and thus span multiple slots.
2226 * Each fragment can contain up to 4k of data.
2227 *
2228 * A typical 9000 MTU packet will look like this:
2229 * +------+---------------------+-------------------+-----------------------+
2230 * | SLOT | TYPE | CONTENTS | FLAGS |
2231 * +------+---------------------+-------------------+-----------------------+
2232 * | 1 | netif_rx_response_t | 1st data fragment | more_data |
2233 * +------+---------------------+-------------------+-----------------------+
2234 * | 2 | netif_rx_response_t | 2nd data fragment | more_data |
2235 * +------+---------------------+-------------------+-----------------------+
2236 * | 3 | netif_rx_response_t | 3rd data fragment | [none] |
2237 * +------+---------------------+-------------------+-----------------------+
2238 *
2239 * Fragments are chained by setting NETRXF_more_data in the previous
2240 * response's flags. If there are additional flags, such as
2241 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2242 * first fragment.
2243 *
2244 * Sometimes extra info can be present. If so, it will follow the first
2245 * fragment, and NETRXF_extra_info flag will be set on the first response.
2246 * If LRO is set on a packet, it will be stored in the extra info. According
2247 * to the spec, extra info can also be chained, but must all be present right
2248 * after the first fragment.
2249 *
2250 * Example of a packet with 2 extra infos:
2251 * +------+---------------------+-------------------+-----------------------+
2252 * | SLOT | TYPE | CONTENTS | FLAGS |
2253 * +------+---------------------+-------------------+-----------------------+
2254 * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2255 * +------+---------------------+-------------------+-----------------------+
2256 * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE |
2257 * +------+---------------------+-------------------+-----------------------+
2258 * | 3 | netif_extra_info_t | 2nd extra info | [none] |
2259 * +------+---------------------+-------------------+-----------------------+
2260 * | 4 | netif_rx_response_t | 2nd data fragment | more_data |
2261 * +------+---------------------+-------------------+-----------------------+
2262 * | 5 | netif_rx_response_t | 3rd data fragment | more_data |
2263 * +------+---------------------+-------------------+-----------------------+
2264 * | 6 | netif_rx_response_t | 4th data fragment | [none] |
2265 * +------+---------------------+-------------------+-----------------------+
2266 *
2267 * In practice, the only extra we expect is for LRO, but only if we advertise
2268 * that we support it to the backend (xnf_enable_lro == TRUE).
2269 */
2270 static int
2271 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2272 {
2273 mblk_t *head = NULL;
2274 mblk_t *tail = NULL;
2275 mblk_t *mp;
2276 int error = 0;
2277 RING_IDX cons = *consp;
2278 netif_extra_info_t lro;
2279 boolean_t is_lro = B_FALSE;
2280 boolean_t is_extra = B_FALSE;
2281
2282 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2283
2284 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2285 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2286 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2287
2288 IMPLY(more_data, xnf_enable_rx_sg);
2289
2290 while (cons != prod) {
2291 xnf_buf_t *bdesc;
2292 int len, off;
2293 int rxidx = cons & (NET_RX_RING_SIZE - 1);
2294
2295 bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2296 xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2297
2298 if (is_extra) {
2299 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2300 /*
2301 * The only extra we expect is for LRO, and it should
2302 * only be present once.
2303 */
2304 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2305 !is_lro) {
2306 ASSERT(xnf_enable_lro);
2307 lro = *extra;
2308 is_lro = B_TRUE;
2309 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2310 } else {
2311 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2312 "contains unexpected extra info of type %d",
2313 extra->type);
2314 error = EINVAL;
2315 }
2316 more_extra =
2317 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2318
2319 goto hang_buf;
2320 }
2321
2322 ASSERT3U(bdesc->id, ==, rsp.id);
2323
2324 /*
2325 * status stores packet length when >= 0, or errors when < 0.
2326 */
2327 len = rsp.status;
2328 off = rsp.offset;
2329 more_data = (rsp.flags & NETRXF_more_data) != 0;
2330
2331 /*
2332 * sanity checks.
2333 */
2334 if (!xnfp->xnf_running) {
2335 error = EBUSY;
2336 } else if (len <= 0) {
2337 xnfp->xnf_stat_errrx++;
2338
2339 switch (len) {
2340 case 0:
2341 xnfp->xnf_stat_runt++;
2342 break;
2343 case NETIF_RSP_ERROR:
2344 xnfp->xnf_stat_mac_rcv_error++;
2345 break;
2346 case NETIF_RSP_DROPPED:
2347 xnfp->xnf_stat_norxbuf++;
2348 break;
2349 }
2350 error = EINVAL;
2351 } else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2352 dev_err(xnfp->xnf_devinfo, CE_WARN,
2353 "Bad rx grant reference, rsp id %d", rsp.id);
2354 error = EINVAL;
2355 } else if ((off + len) > PAGESIZE) {
2356 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2357 "page boundary (offset %d, length %d)", off, len);
2358 error = EINVAL;
2359 }
2360
2361 if (error != 0) {
2362 /*
2363 * If an error has been detected, we do not attempt
2364 * to read the data but we still need to replace
2365 * the rx bufs.
2366 */
2367 goto hang_buf;
2368 }
2369
2370 xnf_buf_t *nbuf = NULL;
2371
2372 /*
2373 * If the packet is below a pre-determined size we will
2374 * copy data out of the buf rather than replace it.
2375 */
2376 if (len > xnf_rx_copy_limit)
2377 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2378
2379 if (nbuf != NULL) {
2380 mp = desballoc((unsigned char *)bdesc->buf,
2381 bdesc->len, 0, &bdesc->free_rtn);
2382
2383 if (mp == NULL) {
2384 xnfp->xnf_stat_rx_desballoc_fail++;
2385 xnfp->xnf_stat_norxbuf++;
2386 error = ENOMEM;
2387 /*
2388 * we free the buf we just allocated as we
2389 * will re-hang the old buf.
2390 */
2391 xnf_buf_put(xnfp, nbuf, B_FALSE);
2392 goto hang_buf;
2393 }
2394
2395 mp->b_rptr = mp->b_rptr + off;
2396 mp->b_wptr = mp->b_rptr + len;
2397
2398 /*
2399 * Release the grant as the backend doesn't need to
2400 * access this buffer anymore and grants are scarce.
2401 */
2402 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2403 0);
2404 xnf_gref_put(xnfp, bdesc->grant_ref);
2405 bdesc->grant_ref = INVALID_GRANT_REF;
2406
2407 bdesc = nbuf;
2408 } else {
2409 /*
2410 * We failed to allocate a new buf or decided to reuse
2411 * the old one. In either case we copy the data off it
2412 * and put it back into the ring.
2413 */
2414 mp = allocb(len, 0);
2415 if (mp == NULL) {
2416 xnfp->xnf_stat_rx_allocb_fail++;
2417 xnfp->xnf_stat_norxbuf++;
2418 error = ENOMEM;
2419 goto hang_buf;
2420 }
2421 bcopy(bdesc->buf + off, mp->b_wptr, len);
2422 mp->b_wptr += len;
2423 }
2424
2425 if (head == NULL)
2426 head = mp;
2427 else
2428 tail->b_cont = mp;
2429 tail = mp;
2430
2431 hang_buf:
2432 /*
2433 * No matter what happens, for each response we need to hang
2434 * a new buf on the rx ring. Put either the old one, or a new
2435 * one if the old one is borrowed by the kernel via desballoc().
2436 */
2437 xnf_rxbuf_hang(xnfp, bdesc);
2438 cons++;
2439
2440 /* next response is an extra */
2441 is_extra = more_extra;
2442
2443 if (!more_data && !more_extra)
2444 break;
2445
2446 /*
2447 * Note that since requests and responses are union'd on the
2448 * same ring, we copy the response to a local variable instead
2449 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2450 * overwritten contents of rsp.
2451 */
2452 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2453 }
2454
2455 /*
2456 * Check that we do not get stuck in a loop.
2457 */
2458 ASSERT3U(*consp, !=, cons);
2459 *consp = cons;
2460
2461 /*
2462 * We ran out of responses but the flags indicate there is more data.
2463 */
2464 if (more_data) {
2465 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2466 error = EINVAL;
2467 }
2468 if (more_extra) {
2469 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2470 "(extras).");
2471 error = EINVAL;
2472 }
2473
2474 /*
2475 * An error means the packet must be dropped. If we have already formed
2476 * a partial packet, then discard it.
2477 */
2478 if (error != 0) {
2479 if (head != NULL)
2480 freemsg(head);
2481 xnfp->xnf_stat_rx_drop++;
2482 return (error);
2483 }
2484
2485 ASSERT(head != NULL);
2486
2487 if (hwcsum) {
2488 /*
2489 * If the peer says that the data has been validated then we
2490 * declare that the full checksum has been verified.
2491 *
2492 * We don't look at the "checksum blank" flag, and hence could
2493 * have a packet here that we are asserting is good with
2494 * a blank checksum.
2495 */
2496 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2497 xnfp->xnf_stat_rx_cksum_no_need++;
2498 }
2499
2500 /* XXX: set lro info for packet once LRO is supported in OS. */
2501
2502 *mpp = head;
2503
2504 return (0);
2505 }
2506
2507 /*
2508 * Collect packets from the RX ring, storing them in `xnfp' for later use.
2509 */
2510 static void
2511 xnf_rx_collect(xnf_t *xnfp)
2512 {
2513 RING_IDX prod;
2514
2515 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2516
2517 prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2518 /*
2519 * Ensure we see queued responses up to 'prod'.
2520 */
2521 membar_consumer();
2522
2523 while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2524 mblk_t *mp;
2525
2526 /*
2527 * Collect a packet.
2528 * rsp_cons is updated inside xnf_rx_one_packet().
2529 */
2530 int error = xnf_rx_one_packet(xnfp, prod,
2531 &xnfp->xnf_rx_ring.rsp_cons, &mp);
2532 if (error == 0) {
2533 xnfp->xnf_stat_ipackets++;
2534 xnfp->xnf_stat_rbytes += xmsgsize(mp);
2535
2536 /*
2537 * Append the mblk to the rx list.
2538 */
2539 if (xnfp->xnf_rx_head == NULL) {
2540 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2541 xnfp->xnf_rx_head = mp;
2542 } else {
2543 ASSERT(xnfp->xnf_rx_tail != NULL);
2544 xnfp->xnf_rx_tail->b_next = mp;
2545 }
2546 xnfp->xnf_rx_tail = mp;
2547 }
2548 }
2549 }
2550
2551 /*
2552 * xnf_alloc_dma_resources() -- initialize the driver's structures
2553 */
2554 static int
2555 xnf_alloc_dma_resources(xnf_t *xnfp)
2556 {
2557 dev_info_t *devinfo = xnfp->xnf_devinfo;
2558 size_t len;
2559 ddi_dma_cookie_t dma_cookie;
2560 uint_t ncookies;
2561 int rc;
2562 caddr_t rptr;
2563
2564 /*
2565 * The code below allocates all the DMA data structures that
2566 * need to be released when the driver is detached.
2567 *
2568 * Allocate page for the transmit descriptor ring.
2569 */
2570 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2571 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2572 goto alloc_error;
2573
2574 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2575 PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2576 DDI_DMA_SLEEP, 0, &rptr, &len,
2577 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2578 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2579 xnfp->xnf_tx_ring_dma_handle = NULL;
2580 goto alloc_error;
2581 }
2582
2583 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2584 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2585 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2586 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2587 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2588 xnfp->xnf_tx_ring_dma_handle = NULL;
2589 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2590 if (rc == DDI_DMA_NORESOURCES)
2591 goto alloc_error;
2592 else
2593 goto error;
2594 }
2595
2596 ASSERT(ncookies == 1);
2597 bzero(rptr, PAGESIZE);
2598 /* LINTED: constant in conditional context */
2599 SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2600 /* LINTED: constant in conditional context */
2601 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2602 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2603
2604 /*
2605 * Allocate page for the receive descriptor ring.
2606 */
2607 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2608 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2609 goto alloc_error;
2610
2611 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2612 PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2613 DDI_DMA_SLEEP, 0, &rptr, &len,
2614 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2615 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2616 xnfp->xnf_rx_ring_dma_handle = NULL;
2617 goto alloc_error;
2618 }
2619
2620 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2621 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2622 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2623 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2624 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2625 xnfp->xnf_rx_ring_dma_handle = NULL;
2626 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2627 if (rc == DDI_DMA_NORESOURCES)
2628 goto alloc_error;
2629 else
2630 goto error;
2631 }
2632
2633 ASSERT(ncookies == 1);
2634 bzero(rptr, PAGESIZE);
2635 /* LINTED: constant in conditional context */
2636 SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2637 /* LINTED: constant in conditional context */
2638 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2639 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2640
2641 return (DDI_SUCCESS);
2642
2643 alloc_error:
2644 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2645 ddi_get_instance(xnfp->xnf_devinfo));
2646 error:
2647 xnf_release_dma_resources(xnfp);
2648 return (DDI_FAILURE);
2649 }
2650
2651 /*
2652 * Release all DMA resources in the opposite order from acquisition
2653 */
2654 static void
2655 xnf_release_dma_resources(xnf_t *xnfp)
2656 {
2657 int i;
2658
2659 /*
2660 * Free receive buffers which are currently associated with
2661 * descriptors.
2662 */
2663 mutex_enter(&xnfp->xnf_rxlock);
2664 for (i = 0; i < NET_RX_RING_SIZE; i++) {
2665 xnf_buf_t *bp;
2666
2667 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2668 continue;
2669 xnfp->xnf_rx_pkt_info[i] = NULL;
2670 xnf_buf_put(xnfp, bp, B_FALSE);
2671 }
2672 mutex_exit(&xnfp->xnf_rxlock);
2673
2674 /* Free the receive ring buffer. */
2675 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2676 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2677 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2678 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2679 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2680 }
2681 /* Free the transmit ring buffer. */
2682 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2683 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2684 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2685 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2686 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2687 }
2688
2689 }
2690
2691 /*
2692 * Release any packets and associated structures used by the TX ring.
2693 */
2694 static void
2695 xnf_release_mblks(xnf_t *xnfp)
2696 {
2697 RING_IDX i;
2698 xnf_txid_t *tidp;
2699
2700 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2701 i < NET_TX_RING_SIZE;
2702 i++, tidp++) {
2703 xnf_txbuf_t *txp = tidp->txbuf;
2704
2705 if (txp != NULL) {
2706 ASSERT(txp->tx_mp != NULL);
2707 freemsg(txp->tx_mp);
2708
2709 xnf_txid_put(xnfp, tidp);
2710 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2711 }
2712 }
2713 }
2714
2715 static int
2716 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2717 {
2718 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2719 xnf_buf_t *bdesc = buf;
2720 xnf_t *xnfp = arg;
2721 ddi_dma_cookie_t dma_cookie;
2722 uint_t ncookies;
2723 size_t len;
2724
2725 if (kmflag & KM_NOSLEEP)
2726 ddiflags = DDI_DMA_DONTWAIT;
2727
2728 /* Allocate a DMA access handle for the buffer. */
2729 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2730 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2731 goto failure;
2732
2733 /* Allocate DMA-able memory for buffer. */
2734 if (ddi_dma_mem_alloc(bdesc->dma_handle,
2735 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2736 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2737 goto failure_1;
2738
2739 /* Bind to virtual address of buffer to get physical address. */
2740 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2741 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2742 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2743 goto failure_2;
2744 ASSERT(ncookies == 1);
2745
2746 bdesc->free_rtn.free_func = xnf_buf_recycle;
2747 bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2748 bdesc->xnfp = xnfp;
2749 bdesc->buf_phys = dma_cookie.dmac_laddress;
2750 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2751 bdesc->len = dma_cookie.dmac_size;
2752 bdesc->grant_ref = INVALID_GRANT_REF;
2753 bdesc->gen = xnfp->xnf_gen;
2754
2755 atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2756
2757 return (0);
2758
2759 failure_2:
2760 ddi_dma_mem_free(&bdesc->acc_handle);
2761
2762 failure_1:
2763 ddi_dma_free_handle(&bdesc->dma_handle);
2764
2765 failure:
2766
2767 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2768 return (-1);
2769 }
2770
2771 static void
2772 xnf_buf_destructor(void *buf, void *arg)
2773 {
2774 xnf_buf_t *bdesc = buf;
2775 xnf_t *xnfp = arg;
2776
2777 (void) ddi_dma_unbind_handle(bdesc->dma_handle);
2778 ddi_dma_mem_free(&bdesc->acc_handle);
2779 ddi_dma_free_handle(&bdesc->dma_handle);
2780
2781 atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2782 }
2783
2784 static xnf_buf_t *
2785 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2786 {
2787 grant_ref_t gref;
2788 xnf_buf_t *bufp;
2789
2790 /*
2791 * Usually grant references are more scarce than memory, so we
2792 * attempt to acquire a grant reference first.
2793 */
2794 gref = xnf_gref_get(xnfp);
2795 if (gref == INVALID_GRANT_REF)
2796 return (NULL);
2797
2798 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2799 if (bufp == NULL) {
2800 xnf_gref_put(xnfp, gref);
2801 return (NULL);
2802 }
2803
2804 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2805
2806 bufp->grant_ref = gref;
2807
2808 if (bufp->gen != xnfp->xnf_gen)
2809 xnf_buf_refresh(bufp);
2810
2811 gnttab_grant_foreign_access_ref(bufp->grant_ref,
2812 xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2813 bufp->buf_mfn, readonly ? 1 : 0);
2814
2815 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2816
2817 return (bufp);
2818 }
2819
2820 static void
2821 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2822 {
2823 if (bufp->grant_ref != INVALID_GRANT_REF) {
2824 (void) gnttab_end_foreign_access_ref(
2825 bufp->grant_ref, readonly ? 1 : 0);
2826 xnf_gref_put(xnfp, bufp->grant_ref);
2827 bufp->grant_ref = INVALID_GRANT_REF;
2828 }
2829
2830 kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2831
2832 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2833 }
2834
2835 /*
2836 * Refresh any cached data about a buffer after resume.
2837 */
2838 static void
2839 xnf_buf_refresh(xnf_buf_t *bdesc)
2840 {
2841 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2842 bdesc->gen = bdesc->xnfp->xnf_gen;
2843 }
2844
2845 /*
2846 * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2847 * look-aside buffers.
2848 */
2849 static void
2850 xnf_buf_recycle(xnf_buf_t *bdesc)
2851 {
2852 xnf_t *xnfp = bdesc->xnfp;
2853
2854 xnf_buf_put(xnfp, bdesc, B_TRUE);
2855 }
2856
2857 static int
2858 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2859 {
2860 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2861 xnf_txbuf_t *txp = buf;
2862 xnf_t *xnfp = arg;
2863
2864 if (kmflag & KM_NOSLEEP)
2865 ddiflags = DDI_DMA_DONTWAIT;
2866
2867 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2868 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2869 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2870 return (-1);
2871 }
2872
2873 return (0);
2874 }
2875
2876 static void
2877 xnf_tx_buf_destructor(void *buf, void *arg)
2878 {
2879 _NOTE(ARGUNUSED(arg));
2880 xnf_txbuf_t *txp = buf;
2881
2882 ddi_dma_free_handle(&txp->tx_dma_handle);
2883 }
2884
2885 /*
2886 * Statistics.
2887 */
2888 static char *xnf_aux_statistics[] = {
2889 "tx_cksum_deferred",
2890 "rx_cksum_no_need",
2891 "interrupts",
2892 "unclaimed_interrupts",
2893 "tx_pullup",
2894 "tx_lookaside",
2895 "tx_drop",
2896 "tx_eth_hdr_split",
2897 "buf_allocated",
2898 "buf_outstanding",
2899 "gref_outstanding",
2900 "gref_failure",
2901 "gref_peak",
2902 "rx_allocb_fail",
2903 "rx_desballoc_fail",
2904 };
2905
2906 static int
2907 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2908 {
2909 xnf_t *xnfp;
2910 kstat_named_t *knp;
2911
2912 if (flag != KSTAT_READ)
2913 return (EACCES);
2914
2915 xnfp = ksp->ks_private;
2916 knp = ksp->ks_data;
2917
2918 /*
2919 * Assignment order must match that of the names in
2920 * xnf_aux_statistics.
2921 */
2922 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2923 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2924
2925 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2926 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2927 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2928 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2929 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2930 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2931
2932 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2933 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2934 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2935 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2936 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2937 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2938 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2939
2940 return (0);
2941 }
2942
2943 static boolean_t
2944 xnf_kstat_init(xnf_t *xnfp)
2945 {
2946 int nstat = sizeof (xnf_aux_statistics) /
2947 sizeof (xnf_aux_statistics[0]);
2948 char **cp = xnf_aux_statistics;
2949 kstat_named_t *knp;
2950
2951 /*
2952 * Create and initialise kstats.
2953 */
2954 if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2955 ddi_get_instance(xnfp->xnf_devinfo),
2956 "aux_statistics", "net", KSTAT_TYPE_NAMED,
2957 nstat, 0)) == NULL)
2958 return (B_FALSE);
2959
2960 xnfp->xnf_kstat_aux->ks_private = xnfp;
2961 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2962
2963 knp = xnfp->xnf_kstat_aux->ks_data;
2964 while (nstat > 0) {
2965 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2966
2967 knp++;
2968 cp++;
2969 nstat--;
2970 }
2971
2972 kstat_install(xnfp->xnf_kstat_aux);
2973
2974 return (B_TRUE);
2975 }
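/*
 * Example (illustrative): the auxiliary statistics created above can be
 * read from a shell in the domU with kstat(1M), e.g.
 *
 *	# kstat -m xnf -n aux_statistics
 *
 * The instance number matches the driver instance passed to kstat_create()
 * above.
 */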
2976
2977 static int
2978 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2979 {
2980 xnf_t *xnfp = arg;
2981
2982 mutex_enter(&xnfp->xnf_rxlock);
2983 mutex_enter(&xnfp->xnf_txlock);
2984
2985 #define mac_stat(q, r) \
2986 case (MAC_STAT_##q): \
2987 *val = xnfp->xnf_stat_##r; \
2988 break
2989
2990 #define ether_stat(q, r) \
2991 case (ETHER_STAT_##q): \
2992 *val = xnfp->xnf_stat_##r; \
2993 break
2994
2995 switch (stat) {
2996
2997 mac_stat(IPACKETS, ipackets);
2998 mac_stat(OPACKETS, opackets);
2999 mac_stat(RBYTES, rbytes);
3000 mac_stat(OBYTES, obytes);
3001 mac_stat(NORCVBUF, norxbuf);
3002 mac_stat(IERRORS, errrx);
3003 mac_stat(NOXMTBUF, tx_defer);
3004
3005 ether_stat(MACRCV_ERRORS, mac_rcv_error);
3006 ether_stat(TOOSHORT_ERRORS, runt);
3007
3008 /* always claim to be in full duplex mode */
3009 case ETHER_STAT_LINK_DUPLEX:
3010 *val = LINK_DUPLEX_FULL;
3011 break;
3012
3013 /* always claim to be at 1Gb/s link speed */
3014 case MAC_STAT_IFSPEED:
3015 *val = 1000000000ull;
3016 break;
3017
3018 default:
3019 mutex_exit(&xnfp->xnf_txlock);
3020 mutex_exit(&xnfp->xnf_rxlock);
3021
3022 return (ENOTSUP);
3023 }
3024
3025 #undef mac_stat
3026 #undef ether_stat
3027
3028 mutex_exit(&xnfp->xnf_txlock);
3029 mutex_exit(&xnfp->xnf_rxlock);
3030
3031 return (0);
3032 }
3033
3034 static int
3035 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3036 {
3037 if (mtu > ETHERMTU) {
3038 if (!xnf_enable_tx_sg) {
3039 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3040 "because scatter-gather is disabled for transmit "
3041 "in driver settings", ETHERMTU);
3042 return (EINVAL);
3043 } else if (!xnf_enable_rx_sg) {
3044 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3045 "because scatter-gather is disabled for receive "
3046 "in driver settings", ETHERMTU);
3047 return (EINVAL);
3048 } else if (!xnfp->xnf_be_tx_sg) {
3049 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3050 "because backend doesn't support scatter-gather",
3051 ETHERMTU);
3052 return (EINVAL);
3053 }
3054 if (mtu > XNF_MAXPKT)
3055 return (EINVAL);
3056 }
3057 int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3058 if (error == 0)
3059 xnfp->xnf_mtu = mtu;
3060
3061 return (error);
3062 }
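/*
 * Example (illustrative, assuming a link named xnf0): the MTU validated
 * above is normally changed from userland through the MAC property
 * callbacks (xnf_setprop() below), e.g. with dladm(1M):
 *
 *	# dladm set-linkprop -p mtu 9000 xnf0
 *
 * which succeeds only when scatter-gather is enabled in the driver and
 * supported by the backend, as checked above.
 */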
3063
3064 /*ARGSUSED*/
3065 static int
3066 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3067 uint_t prop_val_size, void *prop_val)
3068 {
3069 xnf_t *xnfp = data;
3070
3071 switch (prop_id) {
3072 case MAC_PROP_MTU:
3073 ASSERT(prop_val_size >= sizeof (uint32_t));
3074 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3075 break;
3076 default:
3077 return (ENOTSUP);
3078 }
3079 return (0);
3080 }
3081
3082 /*ARGSUSED*/
3083 static int
3084 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3085 uint_t prop_val_size, const void *prop_val)
3086 {
3087 xnf_t *xnfp = data;
3088 uint32_t new_mtu;
3089 int error;
3090
3091 switch (prop_id) {
3092 case MAC_PROP_MTU:
3093 ASSERT(prop_val_size >= sizeof (uint32_t));
3094 bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3095 error = xnf_change_mtu(xnfp, new_mtu);
3096 break;
3097 default:
3098 return (ENOTSUP);
3099 }
3100
3101 return (error);
3102 }
3103
3104 /*ARGSUSED*/
3105 static void
3106 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3107 mac_prop_info_handle_t prop_handle)
3108 {
3109 switch (prop_id) {
3110 case MAC_PROP_MTU:
3111 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3112 break;
3113 default:
3114 break;
3115 }
3116 }
3117
3118 static boolean_t
3119 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3120 {
3121 xnf_t *xnfp = arg;
3122
3123 switch (cap) {
3124 case MAC_CAPAB_HCKSUM: {
3125 uint32_t *capab = cap_data;
3126
3127 /*
3128 * Whilst the flag used to communicate with the IO
3129 * domain is called "NETTXF_csum_blank", the checksum
3130 * in the packet must contain the pseudo-header
3131 * checksum and not zero.
3132 *
3133 * To help out the IO domain, we might use
3134 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3135 * then use checksum offload for IPv6 packets, which
3136 * the IO domain can't handle.
3137 *
3138 * As a result, we declare ourselves capable of
3139 * HCKSUM_INET_FULL_V4. This means that we receive
3140 * IPv4 packets from the stack with a blank checksum
3141 * field and must insert the pseudo-header checksum
3142 * before passing the packet to the IO domain.
3143 */
3144 *capab = HCKSUM_INET_FULL_V4;
3145
3146 /*
3147 * TODO: query the "feature-ipv6-csum-offload" capability.
3148 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3149 */
3150
3151 break;
3152 }
3153 case MAC_CAPAB_LSO: {
3154 if (!xnfp->xnf_be_lso)
3155 return (B_FALSE);
3156
3157 mac_capab_lso_t *lso = cap_data;
3158 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3159 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3160 break;
3161 }
3162 default:
3163 return (B_FALSE);
3164 }
3165
3166 return (B_TRUE);
3167 }
3168
3169 /*
3170 * The state of the peer has changed - react accordingly.
3171 */
3172 static void
3173 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3174 void *arg, void *impl_data)
3175 {
3176 _NOTE(ARGUNUSED(id, arg));
3177 xnf_t *xnfp = ddi_get_driver_private(dip);
3178 XenbusState new_state = *(XenbusState *)impl_data;
3179
3180 ASSERT(xnfp != NULL);
3181
3182 switch (new_state) {
3183 case XenbusStateUnknown:
3184 case XenbusStateInitialising:
3185 case XenbusStateInitialised:
3186 case XenbusStateClosing:
3187 case XenbusStateClosed:
3188 case XenbusStateReconfiguring:
3189 case XenbusStateReconfigured:
3190 break;
3191
3192 case XenbusStateInitWait:
3193 xnf_read_config(xnfp);
3194
3195 if (!xnfp->xnf_be_rx_copy) {
3196 cmn_err(CE_WARN,
3197 "The xnf driver requires a dom0 that "
3198 "supports 'feature-rx-copy'.");
3199 (void) xvdi_switch_state(xnfp->xnf_devinfo,
3200 XBT_NULL, XenbusStateClosed);
3201 break;
3202 }
3203
3204 /*
3205 * Connect to the backend.
3206 */
3207 xnf_be_connect(xnfp);
3208
3209 /*
3210 * Our MAC address as discovered by xnf_read_config().
3211 */
3212 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3213
3214 /*
3215 * We do not know if some features such as LSO are supported
3216 * until we connect to the backend. We request the MAC layer
3217 * to poll our capabilities again.
3218 */
3219 mac_capab_update(xnfp->xnf_mh);
3220
3221 break;
3222
3223 case XenbusStateConnected:
3224 mutex_enter(&xnfp->xnf_rxlock);
3225 mutex_enter(&xnfp->xnf_txlock);
3226
3227 xnfp->xnf_connected = B_TRUE;
3228 /*
3229 * Wake up any threads waiting to send data to
3230 * backend.
3231 */
3232 cv_broadcast(&xnfp->xnf_cv_state);
3233
3234 mutex_exit(&xnfp->xnf_txlock);
3235 mutex_exit(&xnfp->xnf_rxlock);
3236
3237 /*
3238 * Kick the peer in case it missed any transmit
3239 * requests in the TX ring.
3240 */
3241 ec_notify_via_evtchn(xnfp->xnf_evtchn);
3242
3243 /*
3244 * There may already be completed receive requests in
3245 * the ring, sent by the backend after it gets connected
3246 * but before we see its state change here, so we call
3247 * xnf_intr() to handle them, if any.
3248 */
3249 (void) xnf_intr((caddr_t)xnfp);
3250
3251 /*
3252 * Mark the link up now that we are connected.
3253 */
3254 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3255
3256 /*
3257 * Tell the backend about the multicast addresses in
3258 * which we are interested.
3259 */
3260 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3261
3262 break;
3263
3264 default:
3265 break;
3266 }
3267 }
3268