1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29 * Copyright 2020 RackTop Systems, Inc.
30 */
31
32 /*
33 *
34 * Copyright (c) 2004 Christian Limpach.
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. This section intentionally left blank.
46 * 4. The name of the author may not be used to endorse or promote products
47 * derived from this software without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
50 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
53 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
54 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
55 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
56 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
57 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
58 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59 */
60 /*
61 * Section 3 of the above license was updated in response to bug 6379571.
62 */
63
64 /*
65 * xnf.c - GLDv3 network driver for domU.
66 */
67
68 /*
69 * This driver uses four per-instance locks:
70 *
71 * xnf_gref_lock:
72 *
73 * Protects access to the grant reference list stored in
74 * xnf_gref_head. Grant references should be acquired and released
75 * using gref_get() and gref_put() respectively.
76 *
77 * xnf_schedlock:
78 *
79 * Protects:
80 * xnf_need_sched - used to record that a previous transmit attempt
81 * failed (and consequently it will be necessary to call
82 * mac_tx_update() when transmit resources are available).
83 * xnf_pending_multicast - the number of multicast requests that
84 * have been submitted to the backend for which we have not
85 * processed responses.
86 *
87 * xnf_txlock:
88 *
89 * Protects the transmit ring (xnf_tx_ring) and associated
90 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
91 *
92 * xnf_rxlock:
93 *
94 * Protects the receive ring (xnf_rx_ring) and associated
95 * structures (notably xnf_rx_pkt_info).
96 *
97 * If driver-global state that affects both the transmit and receive
98  * rings is manipulated, both xnf_rxlock and xnf_txlock should be
99 * held, in that order.
100 *
101 * xnf_schedlock is acquired both whilst holding xnf_txlock and
102 * without. It should always be acquired after xnf_txlock if both are
103 * held.
104 *
105 * Notes:
106 * - atomic_add_64() is used to manipulate counters where we require
107 * accuracy. For counters intended only for observation by humans,
108 * post increment/decrement are used instead.
109 */
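
/*
 * Minimal ordering sketch (illustration only, not driver code) for a
 * caller that must quiesce both rings and then record that a
 * reschedule is needed:
 *
 *	mutex_enter(&xnfp->xnf_rxlock);
 *	mutex_enter(&xnfp->xnf_txlock);
 *	... manipulate state shared by both rings ...
 *	mutex_enter(&xnfp->xnf_schedlock);
 *	xnfp->xnf_need_sched = B_TRUE;
 *	mutex_exit(&xnfp->xnf_schedlock);
 *	mutex_exit(&xnfp->xnf_txlock);
 *	mutex_exit(&xnfp->xnf_rxlock);
 */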
110
111 #include <sys/types.h>
112 #include <sys/errno.h>
113 #include <sys/param.h>
114 #include <sys/sysmacros.h>
115 #include <sys/systm.h>
116 #include <sys/stream.h>
117 #include <sys/strsubr.h>
118 #include <sys/strsun.h>
119 #include <sys/conf.h>
120 #include <sys/ddi.h>
121 #include <sys/devops.h>
122 #include <sys/sunddi.h>
123 #include <sys/sunndi.h>
124 #include <sys/dlpi.h>
125 #include <sys/ethernet.h>
126 #include <sys/strsun.h>
127 #include <sys/pattr.h>
128 #include <inet/ip.h>
129 #include <inet/ip_impl.h>
130 #include <inet/tcp.h>
131 #include <netinet/udp.h>
132 #include <sys/gld.h>
133 #include <sys/modctl.h>
134 #include <sys/mac_provider.h>
135 #include <sys/mac_ether.h>
136 #include <sys/bootinfo.h>
137 #include <sys/mach_mmu.h>
138 #ifdef XPV_HVM_DRIVER
139 #include <sys/xpv_support.h>
140 #include <sys/hypervisor.h>
141 #else
142 #include <sys/hypervisor.h>
143 #include <sys/evtchn_impl.h>
144 #include <sys/balloon_impl.h>
145 #endif
146 #include <xen/public/io/netif.h>
147 #include <sys/gnttab.h>
148 #include <xen/sys/xendev.h>
149 #include <sys/sdt.h>
150 #include <sys/note.h>
151 #include <sys/debug.h>
152
153 #include <io/xnf.h>
154
155 /*
156 * On a 32 bit PAE system physical and machine addresses are larger
157  * than 32 bits. ddi_btop() on such systems takes an unsigned long
158 * argument, and so addresses above 4G are truncated before ddi_btop()
159 * gets to see them. To avoid this, code the shift operation here.
160 */
161 #define xnf_btop(addr) ((addr) >> PAGESHIFT)
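
/*
 * Illustration (hypothetical value, assuming PAGESHIFT == 12): for a
 * machine address of 0x123456000ULL, ddi_btop() on a 32 bit PAE kernel
 * sees only the truncated 0x23456000 and returns 0x23456, whereas
 *
 *	xnf_btop(0x123456000ULL) == 0x123456
 *
 * because the shift is applied to the full 64-bit value.
 */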
162
163 /*
164 * The parameters below should only be changed in /etc/system, never in mdb.
165 */
166
167 /*
168 * Should we use the multicast control feature if the backend provides
169 * it?
170 */
171 boolean_t xnf_multicast_control = B_TRUE;
172
173 /*
174 * Should we allow scatter-gather for tx if backend allows it?
175 */
176 boolean_t xnf_enable_tx_sg = B_TRUE;
177
178 /*
179 * Should we allow scatter-gather for rx if backend allows it?
180 */
181 boolean_t xnf_enable_rx_sg = B_TRUE;
182
183 /*
184 * Should we allow lso for tx sends if backend allows it?
185 * Requires xnf_enable_tx_sg to be also set to TRUE.
186 */
187 boolean_t xnf_enable_lso = B_TRUE;
188
189 /*
190 * Should we allow lro on rx if backend supports it?
191 * Requires xnf_enable_rx_sg to be also set to TRUE.
192 *
193 * !! WARNING !!
194 * LRO is not yet supported in the OS so this should be left as FALSE.
195 * !! WARNING !!
196 */
197 boolean_t xnf_enable_lro = B_FALSE;
198
199 /*
200 * Received packets below this size are copied to a new streams buffer
201 * rather than being desballoc'ed.
202 *
203 * This value is chosen to accommodate traffic where there are a large
204 * number of small packets. For data showing a typical distribution,
205 * see:
206 *
207 * Sinha07a:
208 * Rishi Sinha, Christos Papadopoulos, and John
209 * Heidemann. Internet Packet Size Distributions: Some
210 * Observations. Technical Report ISI-TR-2007-643,
211  *	Observations. Technical Report ISI-TR-2007-643,
212  *	USC/Information Sciences Institute, May, 2007. Originally
212 * released October 2005 as web page
213 * http://netweb.usc.edu/~sinha/pkt-sizes/.
214 * <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
215 */
216 size_t xnf_rx_copy_limit = 64;
217
218 #define INVALID_GRANT_HANDLE ((grant_handle_t)-1)
219 #define INVALID_GRANT_REF ((grant_ref_t)-1)
220 #define INVALID_TX_ID ((uint16_t)-1)
221
222 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
223 #define TX_ID_VALID(i) \
224 (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
225
226 /*
227 * calculate how many pages are spanned by an mblk fragment
228 */
229 #define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \
230 xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
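
/*
 * Worked example (hypothetical addresses, assuming PAGESIZE == 4096):
 * a fragment with b_rptr == (uchar_t *)0x2ff0 and b_wptr ==
 * (uchar_t *)0x3010 holds only 0x20 bytes but straddles a page
 * boundary, so
 *
 *	xnf_mblk_pages(mp) == xnf_btop(0x300f) - xnf_btop(0x2ff0) + 1 == 2
 *
 * while a zero-length fragment contributes 0 pages.
 */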
231
232 /* Required system entry points */
233 static int xnf_attach(dev_info_t *, ddi_attach_cmd_t);
234 static int xnf_detach(dev_info_t *, ddi_detach_cmd_t);
235
236 /* Required driver entry points for Nemo */
237 static int xnf_start(void *);
238 static void xnf_stop(void *);
239 static int xnf_set_mac_addr(void *, const uint8_t *);
240 static int xnf_set_multicast(void *, boolean_t, const uint8_t *);
241 static int xnf_set_promiscuous(void *, boolean_t);
242 static mblk_t *xnf_send(void *, mblk_t *);
243 static uint_t xnf_intr(caddr_t);
244 static int xnf_stat(void *, uint_t, uint64_t *);
245 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
246 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
247 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
248 const void *);
249 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
250 mac_prop_info_handle_t);
251
252 /* Driver private functions */
253 static int xnf_alloc_dma_resources(xnf_t *);
254 static void xnf_release_dma_resources(xnf_t *);
255 static void xnf_release_mblks(xnf_t *);
256
257 static int xnf_buf_constructor(void *, void *, int);
258 static void xnf_buf_destructor(void *, void *);
259 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
260 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
261 static void xnf_buf_refresh(xnf_buf_t *);
262 static void xnf_buf_recycle(xnf_buf_t *);
263
264 static int xnf_tx_buf_constructor(void *, void *, int);
265 static void xnf_tx_buf_destructor(void *, void *);
266
267 static grant_ref_t xnf_gref_get(xnf_t *);
268 static void xnf_gref_put(xnf_t *, grant_ref_t);
269
270 static xnf_txid_t *xnf_txid_get(xnf_t *);
271 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
272
273 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
274 static int xnf_tx_clean_ring(xnf_t *);
275 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
276 void *, void *);
277 static boolean_t xnf_kstat_init(xnf_t *);
278 static void xnf_rx_collect(xnf_t *);
279
280 #define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES)
281
282 static mac_callbacks_t xnf_callbacks = {
283 .mc_callbacks = XNF_CALLBACK_FLAGS,
284 .mc_getstat = xnf_stat,
285 .mc_start = xnf_start,
286 .mc_stop = xnf_stop,
287 .mc_setpromisc = xnf_set_promiscuous,
288 .mc_multicst = xnf_set_multicast,
289 .mc_unicst = xnf_set_mac_addr,
290 .mc_tx = xnf_send,
291 .mc_getcapab = xnf_getcapab,
292 .mc_setprop = xnf_setprop,
293 .mc_getprop = xnf_getprop,
294 .mc_propinfo = xnf_propinfo,
295 };
296
297 /* DMA attributes for network ring buffer */
298 static ddi_dma_attr_t ringbuf_dma_attr = {
299 .dma_attr_version = DMA_ATTR_V0,
300 .dma_attr_addr_lo = 0,
301 .dma_attr_addr_hi = 0xffffffffffffffffULL,
302 .dma_attr_count_max = 0x7fffffff,
303 .dma_attr_align = MMU_PAGESIZE,
304 .dma_attr_burstsizes = 0x7ff,
305 .dma_attr_minxfer = 1,
306 .dma_attr_maxxfer = 0xffffffffU,
307 .dma_attr_seg = 0xffffffffffffffffULL,
308 .dma_attr_sgllen = 1,
309 .dma_attr_granular = 1,
310 .dma_attr_flags = 0
311 };
312
313 /* DMA attributes for receive data */
314 static ddi_dma_attr_t rx_buf_dma_attr = {
315 .dma_attr_version = DMA_ATTR_V0,
316 .dma_attr_addr_lo = 0,
317 .dma_attr_addr_hi = 0xffffffffffffffffULL,
318 .dma_attr_count_max = MMU_PAGEOFFSET,
319 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
320 .dma_attr_burstsizes = 0x7ff,
321 .dma_attr_minxfer = 1,
322 .dma_attr_maxxfer = 0xffffffffU,
323 .dma_attr_seg = 0xffffffffffffffffULL,
324 .dma_attr_sgllen = 1,
325 .dma_attr_granular = 1,
326 .dma_attr_flags = 0
327 };
328
329 /* DMA attributes for transmit data */
330 static ddi_dma_attr_t tx_buf_dma_attr = {
331 .dma_attr_version = DMA_ATTR_V0,
332 .dma_attr_addr_lo = 0,
333 .dma_attr_addr_hi = 0xffffffffffffffffULL,
334 .dma_attr_count_max = MMU_PAGEOFFSET,
335 .dma_attr_align = 1,
336 .dma_attr_burstsizes = 0x7ff,
337 .dma_attr_minxfer = 1,
338 .dma_attr_maxxfer = 0xffffffffU,
339 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
340 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
341 .dma_attr_granular = 1,
342 .dma_attr_flags = 0
343 };
344
345 /* DMA access attributes for registers and descriptors */
346 static ddi_device_acc_attr_t accattr = {
347 DDI_DEVICE_ATTR_V0,
348 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */
349 DDI_STRICTORDER_ACC
350 };
351
352 /* DMA access attributes for data: NOT to be byte swapped. */
353 static ddi_device_acc_attr_t data_accattr = {
354 DDI_DEVICE_ATTR_V0,
355 DDI_NEVERSWAP_ACC,
356 DDI_STRICTORDER_ACC
357 };
358
359 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
360 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
361
362 static struct modldrv xnf_modldrv = {
363 &mod_driverops,
364 "Virtual Ethernet driver",
365 &xnf_dev_ops
366 };
367
368 static struct modlinkage modlinkage = {
369 MODREV_1, &xnf_modldrv, NULL
370 };
371
372 int
373 _init(void)
374 {
375 int r;
376
377 mac_init_ops(&xnf_dev_ops, "xnf");
378 r = mod_install(&modlinkage);
379 if (r != DDI_SUCCESS)
380 mac_fini_ops(&xnf_dev_ops);
381
382 return (r);
383 }
384
385 int
386 _fini(void)
387 {
388 return (EBUSY); /* XXPV should be removable */
389 }
390
391 int
392 _info(struct modinfo *modinfop)
393 {
394 return (mod_info(&modlinkage, modinfop));
395 }
396
397 /*
398 * Acquire a grant reference.
399 */
400 static grant_ref_t
401 xnf_gref_get(xnf_t *xnfp)
402 {
403 grant_ref_t gref;
404
405 mutex_enter(&xnfp->xnf_gref_lock);
406
407 do {
408 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
409
410 } while ((gref == INVALID_GRANT_REF) &&
411 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
412
413 mutex_exit(&xnfp->xnf_gref_lock);
414
415 if (gref == INVALID_GRANT_REF) {
416 xnfp->xnf_stat_gref_failure++;
417 } else {
418 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
419 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
420 xnfp->xnf_stat_gref_peak =
421 xnfp->xnf_stat_gref_outstanding;
422 }
423
424 return (gref);
425 }
426
427 /*
428 * Release a grant reference.
429 */
430 static void
431 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
432 {
433 ASSERT(gref != INVALID_GRANT_REF);
434
435 mutex_enter(&xnfp->xnf_gref_lock);
436 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
437 mutex_exit(&xnfp->xnf_gref_lock);
438
439 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
440 }
441
442 /*
443 * Acquire a transmit id.
444 */
445 static xnf_txid_t *
446 xnf_txid_get(xnf_t *xnfp)
447 {
448 xnf_txid_t *tidp;
449
450 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
451
452 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
453 return (NULL);
454
455 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
456
457 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
458 xnfp->xnf_tx_pkt_id_head = tidp->next;
459 tidp->next = INVALID_TX_ID;
460
461 ASSERT(tidp->txbuf == NULL);
462
463 return (tidp);
464 }
465
466 /*
467 * Release a transmit id.
468 */
469 static void
470 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
471 {
472 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
473 ASSERT(TX_ID_VALID(tidp->id));
474 ASSERT(tidp->next == INVALID_TX_ID);
475
476 tidp->txbuf = NULL;
477 tidp->next = xnfp->xnf_tx_pkt_id_head;
478 xnfp->xnf_tx_pkt_id_head = tidp->id;
479 }
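
/*
 * Usage sketch (simplified, error handling elided): with xnf_txlock
 * held, a txid ties a ring request to its xnf_txbuf_t and is returned
 * to the free list once the response for that slot has been processed:
 *
 *	tidp = xnf_txid_get(xnfp);
 *	if (tidp != NULL) {
 *		txp->tx_txreq.id = tidp->id;
 *		tidp->txbuf = txp;
 *	}
 *	... later, when the response arrives ...
 *	xnf_txid_put(xnfp, tidp);
 */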
480
481 static void
482 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
483 {
484 ASSERT3U(txp->tx_type, ==, TX_DATA);
485
486 /*
487 * We are either using a lookaside buffer or we are mapping existing
488 * buffers.
489 */
490 if (txp->tx_bdesc != NULL) {
491 ASSERT(!txp->tx_handle_bound);
492 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
493 } else {
494 if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
495 if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
496 0) {
497 cmn_err(CE_PANIC, "tx grant %d still in use by "
498 "backend domain", txp->tx_txreq.gref);
499 }
500 (void) gnttab_end_foreign_access_ref(
501 txp->tx_txreq.gref, 1);
502 xnf_gref_put(xnfp, txp->tx_txreq.gref);
503 }
504
505 if (txp->tx_handle_bound)
506 (void) ddi_dma_unbind_handle(txp->tx_dma_handle);
507 }
508
509 if (txp->tx_mp != NULL)
510 freemsg(txp->tx_mp);
511
512 if (txp->tx_prev != NULL) {
513 ASSERT3P(txp->tx_prev->tx_next, ==, txp);
514 txp->tx_prev->tx_next = NULL;
515 }
516
517 if (txp->tx_txreq.id != INVALID_TX_ID) {
518 /*
519 * This should be only possible when resuming from a suspend.
520 */
521 ASSERT(!xnfp->xnf_connected);
522 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
523 txp->tx_txreq.id = INVALID_TX_ID;
524 }
525
526 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
527 }
528
529 static void
530 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
531 {
532 if (txp == NULL)
533 return;
534
535 while (txp->tx_next != NULL)
536 txp = txp->tx_next;
537
538 /*
539 * We free the chain in reverse order so that grants can be released
540 * for all dma chunks before unbinding the dma handles. The mblk is
541 * freed last, after all its fragments' dma handles are unbound.
542 */
543 xnf_txbuf_t *prev;
544 for (; txp != NULL; txp = prev) {
545 prev = txp->tx_prev;
546 xnf_data_txbuf_free(xnfp, txp);
547 }
548 }
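
/*
 * Chain layout, for illustration: a packet mapped across three
 * fragments is
 *
 *	head <-> txp1 <-> txp2		(tx_next forward, tx_prev back)
 *
 * where every fragment's tx_head points at head and only head carries
 * tx_mp, so freeing tail-first releases each fragment's grant and DMA
 * binding before the mblk itself is freed along with the head.
 */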
549
550 static xnf_txbuf_t *
551 xnf_data_txbuf_alloc(xnf_t *xnfp, int flag)
552 {
553 xnf_txbuf_t *txp;
554
555 if ((txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, flag)) == NULL) {
556 return (NULL);
557 }
558
559 txp->tx_type = TX_DATA;
560 txp->tx_next = NULL;
561 txp->tx_prev = NULL;
562 txp->tx_head = txp;
563 txp->tx_frags_to_ack = 0;
564 txp->tx_mp = NULL;
565 txp->tx_bdesc = NULL;
566 txp->tx_handle_bound = B_FALSE;
567 txp->tx_txreq.gref = INVALID_GRANT_REF;
568 txp->tx_txreq.id = INVALID_TX_ID;
569
570 return (txp);
571 }
572
573 /*
574 * Get `wanted' slots in the transmit ring, waiting for at least that
575 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
576 * `wanted' to zero.
577 *
578 * Return the number of slots available.
579 */
580 static int
581 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
582 {
583 int slotsfree;
584 boolean_t forced_clean = (wanted == 0);
585
586 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
587
588 /* LINTED: constant in conditional context */
589 while (B_TRUE) {
590 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
591
592 if ((slotsfree < wanted) || forced_clean)
593 slotsfree = xnf_tx_clean_ring(xnfp);
594
595 /*
596 * If there are more than we need free, tell other
597 * people to come looking again. We hold txlock, so we
598 * are able to take our slots before anyone else runs.
599 */
600 if (slotsfree > wanted)
601 cv_broadcast(&xnfp->xnf_cv_tx_slots);
602
603 if (slotsfree >= wanted)
604 break;
605
606 if (!wait)
607 break;
608
609 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
610 }
611
612 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
613
614 return (slotsfree);
615 }
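
/*
 * Usage sketch, mirroring the two callers below: the multicast path
 * asks for the two slots its request and extra-info entry occupy and
 * is prepared to wait, while the transmit path asks for
 * XEN_MAX_SLOTS_PER_TX without waiting and defers the packet if the
 * ring is too full:
 *
 *	n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
 *
 *	slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
 *	if (slots_free < XEN_MAX_SLOTS_PER_TX)
 *		... set xnf_need_sched and hand the mblk back to MAC ...
 */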
616
617 static int
618 xnf_setup_rings(xnf_t *xnfp)
619 {
620 domid_t oeid;
621 struct xenbus_device *xsd;
622 RING_IDX i;
623 int err;
624 xnf_txid_t *tidp;
625 xnf_buf_t **bdescp;
626
627 oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
628 xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
629
630 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
631 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
632
633 err = gnttab_grant_foreign_access(oeid,
634 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
635 if (err <= 0) {
636 err = -err;
637 xenbus_dev_error(xsd, err, "granting access to tx ring page");
638 goto out;
639 }
640 xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
641
642 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
643 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
644
645 err = gnttab_grant_foreign_access(oeid,
646 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
647 if (err <= 0) {
648 err = -err;
649 xenbus_dev_error(xsd, err, "granting access to rx ring page");
650 goto out;
651 }
652 xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
653
654 mutex_enter(&xnfp->xnf_txlock);
655
656 /*
657 * We first cleanup the TX ring in case we are doing a resume.
658 * Note that this can lose packets, but we expect to stagger on.
659 */
660 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
661 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
662 i < NET_TX_RING_SIZE;
663 i++, tidp++) {
664 xnf_txbuf_t *txp = tidp->txbuf;
665 if (txp == NULL)
666 continue;
667
668 switch (txp->tx_type) {
669 case TX_DATA:
670 /*
671 * txid_put() will be called for each txbuf's txid in
672 * the chain which will result in clearing tidp->txbuf.
673 */
674 xnf_data_txbuf_free_chain(xnfp, txp);
675
676 break;
677
678 case TX_MCAST_REQ:
679 txp->tx_type = TX_MCAST_RSP;
680 txp->tx_status = NETIF_RSP_DROPPED;
681 cv_broadcast(&xnfp->xnf_cv_multicast);
682
683 /*
684 * The request consumed two slots in the ring,
685 * yet only a single xnf_txid_t is used. Step
686 * over the empty slot.
687 */
688 i++;
689 ASSERT3U(i, <, NET_TX_RING_SIZE);
690 break;
691
692 case TX_MCAST_RSP:
693 break;
694 }
695 }
696
697 /*
698 * Now purge old list and add each txid to the new free list.
699 */
700 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
701 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
702 i < NET_TX_RING_SIZE;
703 i++, tidp++) {
704 tidp->id = i;
705 ASSERT3P(tidp->txbuf, ==, NULL);
706 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
707 xnf_txid_put(xnfp, tidp);
708 }
709
710 /* LINTED: constant in conditional context */
711 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
712 /* LINTED: constant in conditional context */
713 FRONT_RING_INIT(&xnfp->xnf_tx_ring,
714 xnfp->xnf_tx_ring.sring, PAGESIZE);
715
716 mutex_exit(&xnfp->xnf_txlock);
717
718 mutex_enter(&xnfp->xnf_rxlock);
719
720 /*
721 * Clean out any buffers currently posted to the receive ring
722 * before we reset it.
723 */
724 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
725 i < NET_RX_RING_SIZE;
726 i++, bdescp++) {
727 if (*bdescp != NULL) {
728 xnf_buf_put(xnfp, *bdescp, B_FALSE);
729 *bdescp = NULL;
730 }
731 }
732
733 /* LINTED: constant in conditional context */
734 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
735 /* LINTED: constant in conditional context */
736 FRONT_RING_INIT(&xnfp->xnf_rx_ring,
737 xnfp->xnf_rx_ring.sring, PAGESIZE);
738
739 /*
740 * Fill the ring with buffers.
741 */
742 for (i = 0; i < NET_RX_RING_SIZE; i++) {
743 xnf_buf_t *bdesc;
744
745 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
746 VERIFY(bdesc != NULL);
747 xnf_rxbuf_hang(xnfp, bdesc);
748 }
749
750 /* LINTED: constant in conditional context */
751 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
752
753 mutex_exit(&xnfp->xnf_rxlock);
754
755 return (0);
756
757 out:
758 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
759 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
760 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
761
762 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
763 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
764 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
765
766 return (err);
767 }
768
769 /*
770 * Connect driver to back end, called to set up communication with
771 * back end driver both initially and on resume after restore/migrate.
772 */
773 void
774 xnf_be_connect(xnf_t *xnfp)
775 {
776 const char *message;
777 xenbus_transaction_t xbt;
778 struct xenbus_device *xsd;
779 char *xsname;
780 int err;
781
782 ASSERT(!xnfp->xnf_connected);
783
784 xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
785 xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
786
787 err = xnf_setup_rings(xnfp);
788 if (err != 0) {
789 cmn_err(CE_WARN, "failed to set up tx/rx rings");
790 xenbus_dev_error(xsd, err, "setting up ring");
791 return;
792 }
793
794 again:
795 err = xenbus_transaction_start(&xbt);
796 if (err != 0) {
797 xenbus_dev_error(xsd, EIO, "starting transaction");
798 return;
799 }
800
801 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
802 xnfp->xnf_tx_ring_ref);
803 if (err != 0) {
804 message = "writing tx ring-ref";
805 goto abort_transaction;
806 }
807
808 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
809 xnfp->xnf_rx_ring_ref);
810 if (err != 0) {
811 message = "writing rx ring-ref";
812 goto abort_transaction;
813 }
814
815 err = xenbus_printf(xbt, xsname, "event-channel", "%u",
816 xnfp->xnf_evtchn);
817 if (err != 0) {
818 message = "writing event-channel";
819 goto abort_transaction;
820 }
821
822 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
823 if (err != 0) {
824 message = "writing feature-rx-notify";
825 goto abort_transaction;
826 }
827
828 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
829 if (err != 0) {
830 message = "writing request-rx-copy";
831 goto abort_transaction;
832 }
833
834 if (xnfp->xnf_be_mcast_control) {
835 err = xenbus_printf(xbt, xsname, "request-multicast-control",
836 "%d", 1);
837 if (err != 0) {
838 message = "writing request-multicast-control";
839 goto abort_transaction;
840 }
841 }
842
843 /*
844 * Tell backend if we support scatter-gather lists on the rx side.
845 */
846 err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
847 xnf_enable_rx_sg ? 1 : 0);
848 if (err != 0) {
849 message = "writing feature-sg";
850 goto abort_transaction;
851 }
852
853 /*
854 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
855 * a prerequisite.
856 */
857 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
858 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
859 if (err != 0) {
860 message = "writing feature-gso-tcpv4";
861 goto abort_transaction;
862 }
863
864 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
865 if (err != 0) {
866 message = "switching state to XenbusStateConnected";
867 goto abort_transaction;
868 }
869
870 err = xenbus_transaction_end(xbt, 0);
871 if (err != 0) {
872 if (err == EAGAIN)
873 goto again;
874 xenbus_dev_error(xsd, err, "completing transaction");
875 }
876
877 return;
878
879 abort_transaction:
880 (void) xenbus_transaction_end(xbt, 1);
881 xenbus_dev_error(xsd, err, "%s", message);
882 }
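
/*
 * For illustration only (example values): after a successful
 * transaction the frontend's xenstore directory contains entries such
 * as
 *
 *	tx-ring-ref = "8"
 *	rx-ring-ref = "9"
 *	event-channel = "12"
 *	feature-rx-notify = "1"
 *	request-rx-copy = "1"
 *	request-multicast-control = "1"	(only if the backend offers it)
 *	feature-sg = "1"
 *	feature-gso-tcpv4 = "0"
 *
 * and the device state is switched to XenbusStateConnected.
 */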
883
884 /*
885 * Read configuration information from xenstore.
886 */
887 void
888 xnf_read_config(xnf_t *xnfp)
889 {
890 int err, be_cap;
891 char mac[ETHERADDRL * 3];
892 char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
893
894 err = xenbus_scanf(XBT_NULL, oename, "mac",
895 "%s", (char *)&mac[0]);
896 if (err != 0) {
897 /*
898 * bad: we're supposed to be set up with a proper mac
899 * addr. at this point
900 */
901 cmn_err(CE_WARN, "%s%d: no mac address",
902 ddi_driver_name(xnfp->xnf_devinfo),
903 ddi_get_instance(xnfp->xnf_devinfo));
904 return;
905 }
906 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
907 err = ENOENT;
908 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
909 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
910 return;
911 }
912
913 err = xenbus_scanf(XBT_NULL, oename,
914 "feature-rx-copy", "%d", &be_cap);
915 /*
916 * If we fail to read the store we assume that the key is
917 * absent, implying an older domain at the far end. Older
918 * domains cannot do HV copy.
919 */
920 if (err != 0)
921 be_cap = 0;
922 xnfp->xnf_be_rx_copy = (be_cap != 0);
923
924 err = xenbus_scanf(XBT_NULL, oename,
925 "feature-multicast-control", "%d", &be_cap);
926 /*
927 * If we fail to read the store we assume that the key is
928 * absent, implying an older domain at the far end. Older
929 * domains do not support multicast control.
930 */
931 if (err != 0)
932 be_cap = 0;
933 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
934
935 /*
936 * See if back-end supports scatter-gather for transmits. If not,
937 * we will not support LSO and limit the mtu to 1500.
938 */
939 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
940 if (err != 0) {
941 be_cap = 0;
942 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
943 "'feature-sg' from backend driver");
944 }
945 if (be_cap == 0) {
946 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
947 "supported for transmits in the backend driver. LSO is "
948 "disabled and MTU is restricted to 1500 bytes.");
949 }
950 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
951
952 if (xnfp->xnf_be_tx_sg) {
953 /*
954 * Check if LSO is supported. Currently we only check for
955 * IPv4 as Illumos doesn't support LSO for IPv6.
956 */
957 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
958 &be_cap);
959 if (err != 0) {
960 be_cap = 0;
961 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
962 "'feature-gso-tcpv4' from backend driver");
963 }
964 if (be_cap == 0) {
965 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
966 "supported by the backend driver. Performance "
967 "will be affected.");
968 }
969 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
970 }
971 }
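
/*
 * For illustration only (example values): the backend ("otherend")
 * directory consulted above might contain
 *
 *	mac = "00:16:3e:12:34:56"
 *	feature-rx-copy = "1"
 *	feature-multicast-control = "1"
 *	feature-sg = "1"
 *	feature-gso-tcpv4 = "1"
 *
 * Any key that cannot be read is treated as if the backend lacks the
 * corresponding capability.
 */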
972
973 /*
974 * attach(9E) -- Attach a device to the system
975 */
976 static int
977 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
978 {
979 mac_register_t *macp;
980 xnf_t *xnfp;
981 int err;
982 char cachename[32];
983
984 switch (cmd) {
985 case DDI_RESUME:
986 xnfp = ddi_get_driver_private(devinfo);
987 xnfp->xnf_gen++;
988
989 (void) xvdi_resume(devinfo);
990 (void) xvdi_alloc_evtchn(devinfo);
991 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
992 #ifdef XPV_HVM_DRIVER
993 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
994 xnfp);
995 #else
996 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
997 (caddr_t)xnfp);
998 #endif
999 return (DDI_SUCCESS);
1000
1001 case DDI_ATTACH:
1002 break;
1003
1004 default:
1005 return (DDI_FAILURE);
1006 }
1007
1008 /*
1009 * Allocate gld_mac_info_t and xnf_instance structures
1010 */
1011 macp = mac_alloc(MAC_VERSION);
1012 if (macp == NULL)
1013 return (DDI_FAILURE);
1014 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1015
1016 xnfp->xnf_tx_pkt_id =
1017 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1018
1019 xnfp->xnf_rx_pkt_info =
1020 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1021
1022 macp->m_dip = devinfo;
1023 macp->m_driver = xnfp;
1024 xnfp->xnf_devinfo = devinfo;
1025
1026 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1027 macp->m_src_addr = xnfp->xnf_mac_addr;
1028 macp->m_callbacks = &xnf_callbacks;
1029 macp->m_min_sdu = 0;
1030 xnfp->xnf_mtu = ETHERMTU;
1031 macp->m_max_sdu = xnfp->xnf_mtu;
1032
1033 xnfp->xnf_running = B_FALSE;
1034 xnfp->xnf_connected = B_FALSE;
1035 xnfp->xnf_be_rx_copy = B_FALSE;
1036 xnfp->xnf_be_mcast_control = B_FALSE;
1037 xnfp->xnf_need_sched = B_FALSE;
1038
1039 xnfp->xnf_rx_head = NULL;
1040 xnfp->xnf_rx_tail = NULL;
1041 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1042
1043 #ifdef XPV_HVM_DRIVER
1044 /* Report our version to dom0 */
1045 (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1046 HVMPV_XNF_VERS);
1047 #endif
1048
1049 /*
1050 * Get the iblock cookie with which to initialize the mutexes.
1051 */
1052 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1053 != DDI_SUCCESS)
1054 goto failure;
1055
1056 mutex_init(&xnfp->xnf_txlock,
1057 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1058 mutex_init(&xnfp->xnf_rxlock,
1059 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1060 mutex_init(&xnfp->xnf_schedlock,
1061 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1062 mutex_init(&xnfp->xnf_gref_lock,
1063 NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1064
1065 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1066 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1067 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1068
1069 (void) sprintf(cachename, "xnf_buf_cache_%d",
1070 ddi_get_instance(devinfo));
1071 xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1072 sizeof (xnf_buf_t), 0,
1073 xnf_buf_constructor, xnf_buf_destructor,
1074 NULL, xnfp, NULL, 0);
1075 if (xnfp->xnf_buf_cache == NULL)
1076 goto failure_0;
1077
1078 (void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1079 ddi_get_instance(devinfo));
1080 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1081 sizeof (xnf_txbuf_t), 0,
1082 xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1083 NULL, xnfp, NULL, 0);
1084 if (xnfp->xnf_tx_buf_cache == NULL)
1085 goto failure_1;
1086
1087 xnfp->xnf_gref_head = INVALID_GRANT_REF;
1088
1089 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1090 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1091 "driver data structures",
1092 ddi_get_instance(xnfp->xnf_devinfo));
1093 goto failure_2;
1094 }
1095
1096 xnfp->xnf_rx_ring.sring->rsp_event =
1097 xnfp->xnf_tx_ring.sring->rsp_event = 1;
1098
1099 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1100 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1101
1102 /* set driver private pointer now */
1103 ddi_set_driver_private(devinfo, xnfp);
1104
1105 if (!xnf_kstat_init(xnfp))
1106 goto failure_3;
1107
1108 /*
1109 * Allocate an event channel, add the interrupt handler and
1110 * bind it to the event channel.
1111 */
1112 (void) xvdi_alloc_evtchn(devinfo);
1113 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1114 #ifdef XPV_HVM_DRIVER
1115 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1116 #else
1117 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1118 #endif
1119
1120 err = mac_register(macp, &xnfp->xnf_mh);
1121 mac_free(macp);
1122 macp = NULL;
1123 if (err != 0)
1124 goto failure_4;
1125
1126 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1127 != DDI_SUCCESS)
1128 goto failure_5;
1129
1130 #ifdef XPV_HVM_DRIVER
1131 /*
1132 * In the HVM case, this driver essentially replaces a driver for
1133 * a 'real' PCI NIC. Without the "model" property set to
1134 * "Ethernet controller", like the PCI code does, netbooting does
1135 * not work correctly, as strplumb_get_netdev_path() will not find
1136 * this interface.
1137 */
1138 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1139 "Ethernet controller");
1140 #endif
1141
1142 return (DDI_SUCCESS);
1143
1144 failure_5:
1145 (void) mac_unregister(xnfp->xnf_mh);
1146
1147 failure_4:
1148 #ifdef XPV_HVM_DRIVER
1149 ec_unbind_evtchn(xnfp->xnf_evtchn);
1150 xvdi_free_evtchn(devinfo);
1151 #else
1152 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1153 #endif
1154 xnfp->xnf_evtchn = INVALID_EVTCHN;
1155 kstat_delete(xnfp->xnf_kstat_aux);
1156
1157 failure_3:
1158 xnf_release_dma_resources(xnfp);
1159
1160 failure_2:
1161 kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1162
1163 failure_1:
1164 kmem_cache_destroy(xnfp->xnf_buf_cache);
1165
1166 failure_0:
1167 cv_destroy(&xnfp->xnf_cv_tx_slots);
1168 cv_destroy(&xnfp->xnf_cv_multicast);
1169 cv_destroy(&xnfp->xnf_cv_state);
1170
1171 mutex_destroy(&xnfp->xnf_gref_lock);
1172 mutex_destroy(&xnfp->xnf_schedlock);
1173 mutex_destroy(&xnfp->xnf_rxlock);
1174 mutex_destroy(&xnfp->xnf_txlock);
1175
1176 failure:
1177 kmem_free(xnfp, sizeof (*xnfp));
1178 if (macp != NULL)
1179 mac_free(macp);
1180
1181 return (DDI_FAILURE);
1182 }
1183
1184 /* detach(9E) -- Detach a device from the system */
1185 static int
1186 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1187 {
1188 xnf_t *xnfp; /* Our private device info */
1189
1190 xnfp = ddi_get_driver_private(devinfo);
1191
1192 switch (cmd) {
1193 case DDI_SUSPEND:
1194 #ifdef XPV_HVM_DRIVER
1195 ec_unbind_evtchn(xnfp->xnf_evtchn);
1196 xvdi_free_evtchn(devinfo);
1197 #else
1198 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1199 #endif
1200
1201 xvdi_suspend(devinfo);
1202
1203 mutex_enter(&xnfp->xnf_rxlock);
1204 mutex_enter(&xnfp->xnf_txlock);
1205
1206 xnfp->xnf_evtchn = INVALID_EVTCHN;
1207 xnfp->xnf_connected = B_FALSE;
1208 mutex_exit(&xnfp->xnf_txlock);
1209 mutex_exit(&xnfp->xnf_rxlock);
1210
1211 /* claim link to be down after disconnect */
1212 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1213 return (DDI_SUCCESS);
1214
1215 case DDI_DETACH:
1216 break;
1217
1218 default:
1219 return (DDI_FAILURE);
1220 }
1221
1222 if (xnfp->xnf_connected)
1223 return (DDI_FAILURE);
1224
1225 /*
1226 * Cannot detach if we have xnf_buf_t outstanding.
1227 */
1228 if (xnfp->xnf_stat_buf_allocated > 0)
1229 return (DDI_FAILURE);
1230
1231 if (mac_unregister(xnfp->xnf_mh) != 0)
1232 return (DDI_FAILURE);
1233
1234 kstat_delete(xnfp->xnf_kstat_aux);
1235
1236 /* Stop the receiver */
1237 xnf_stop(xnfp);
1238
1239 xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1240
1241 /* Remove the interrupt */
1242 #ifdef XPV_HVM_DRIVER
1243 ec_unbind_evtchn(xnfp->xnf_evtchn);
1244 xvdi_free_evtchn(devinfo);
1245 #else
1246 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1247 #endif
1248
1249 /* Release any pending xmit mblks */
1250 xnf_release_mblks(xnfp);
1251
1252 /* Release all DMA resources */
1253 xnf_release_dma_resources(xnfp);
1254
1255 cv_destroy(&xnfp->xnf_cv_tx_slots);
1256 cv_destroy(&xnfp->xnf_cv_multicast);
1257 cv_destroy(&xnfp->xnf_cv_state);
1258
1259 kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1260 kmem_cache_destroy(xnfp->xnf_buf_cache);
1261
1262 mutex_destroy(&xnfp->xnf_gref_lock);
1263 mutex_destroy(&xnfp->xnf_schedlock);
1264 mutex_destroy(&xnfp->xnf_rxlock);
1265 mutex_destroy(&xnfp->xnf_txlock);
1266
1267 kmem_free(xnfp, sizeof (*xnfp));
1268
1269 return (DDI_SUCCESS);
1270 }
1271
1272 /*
1273 * xnf_set_mac_addr() -- set the physical network address on the board.
1274 */
1275 static int
1276 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1277 {
1278 _NOTE(ARGUNUSED(arg, macaddr));
1279
1280 /*
1281 * We can't set our macaddr.
1282 */
1283 return (ENOTSUP);
1284 }
1285
1286 /*
1287 * xnf_set_multicast() -- set (enable) or disable a multicast address.
1288 *
1289 * Program the hardware to enable/disable the multicast address
1290 * in "mca". Enable if "add" is true, disable if false.
1291 */
1292 static int
1293 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1294 {
1295 xnf_t *xnfp = arg;
1296 xnf_txbuf_t *txp;
1297 int n_slots;
1298 RING_IDX slot;
1299 xnf_txid_t *tidp;
1300 netif_tx_request_t *txrp;
1301 struct netif_extra_info *erp;
1302 boolean_t notify, result;
1303
1304 /*
1305 * If the backend does not support multicast control then we
1306 * must assume that the right packets will just arrive.
1307 */
1308 if (!xnfp->xnf_be_mcast_control)
1309 return (0);
1310
1311 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1312
1313 mutex_enter(&xnfp->xnf_txlock);
1314
1315 /*
1316 * If we're not yet connected then claim success. This is
1317 * acceptable because we refresh the entire set of multicast
1318 * addresses when we get connected.
1319 *
1320 * We can't wait around here because the MAC layer expects
1321 * this to be a non-blocking operation - waiting ends up
1322 * causing a deadlock during resume.
1323 */
1324 if (!xnfp->xnf_connected) {
1325 mutex_exit(&xnfp->xnf_txlock);
1326 return (0);
1327 }
1328
1329 /*
1330 * 1. Acquire two slots in the ring.
1331 * 2. Fill in the slots.
1332 * 3. Request notification when the operation is done.
1333 * 4. Kick the peer.
1334 * 5. Wait for the response via xnf_tx_clean_ring().
1335 */
1336
1337 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1338 ASSERT(n_slots >= 2);
1339
1340 slot = xnfp->xnf_tx_ring.req_prod_pvt;
1341 tidp = xnf_txid_get(xnfp);
1342 VERIFY(tidp != NULL);
1343
1344 txp->tx_type = TX_MCAST_REQ;
1345 txp->tx_slot = slot;
1346
1347 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1348 erp = (struct netif_extra_info *)
1349 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1350
1351 txrp->gref = 0;
1352 txrp->size = 0;
1353 txrp->offset = 0;
1354 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1355 txrp->id = txp->tx_txreq.id = tidp->id;
1356 txrp->flags = NETTXF_extra_info;
1357
1358 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1359 XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1360 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1361
1362 tidp->txbuf = txp;
1363
1364 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1365
1366 mutex_enter(&xnfp->xnf_schedlock);
1367 xnfp->xnf_pending_multicast++;
1368 mutex_exit(&xnfp->xnf_schedlock);
1369
1370 /* LINTED: constant in conditional context */
1371 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1372 notify);
1373 if (notify)
1374 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1375
1376 while (txp->tx_type == TX_MCAST_REQ)
1377 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1378
1379 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1380
1381 mutex_enter(&xnfp->xnf_schedlock);
1382 xnfp->xnf_pending_multicast--;
1383 mutex_exit(&xnfp->xnf_schedlock);
1384
1385 result = (txp->tx_status == NETIF_RSP_OKAY);
1386
1387 xnf_txid_put(xnfp, tidp);
1388
1389 mutex_exit(&xnfp->xnf_txlock);
1390
1391 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1392
1393 return (result ? 0 : 1);
1394 }
1395
1396 /*
1397 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1398 *
1399 * Program the hardware to enable/disable promiscuous mode.
1400 */
1401 static int
1402 xnf_set_promiscuous(void *arg, boolean_t on)
1403 {
1404 _NOTE(ARGUNUSED(arg, on));
1405
1406 /*
1407 * We can't really do this, but we pretend that we can in
1408 * order that snoop will work.
1409 */
1410 return (0);
1411 }
1412
1413 /*
1414 * Clean buffers that we have responses for from the transmit ring.
1415 */
1416 static int
1417 xnf_tx_clean_ring(xnf_t *xnfp)
1418 {
1419 boolean_t work_to_do;
1420
1421 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1422
1423 loop:
1424 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1425 RING_IDX cons, prod, i;
1426
1427 cons = xnfp->xnf_tx_ring.rsp_cons;
1428 prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1429 membar_consumer();
1430 /*
1431 * Clean tx requests from ring that we have responses
1432 * for.
1433 */
1434 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1435 for (i = cons; i != prod; i++) {
1436 netif_tx_response_t *trp;
1437 xnf_txid_t *tidp;
1438 xnf_txbuf_t *txp;
1439
1440 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1441 /*
1442 * if this slot was occupied by netif_extra_info_t,
1443 * then the response will be NETIF_RSP_NULL. In this
1444 * case there are no resources to clean up.
1445 */
1446 if (trp->status == NETIF_RSP_NULL)
1447 continue;
1448
1449 ASSERT(TX_ID_VALID(trp->id));
1450
1451 tidp = TX_ID_TO_TXID(xnfp, trp->id);
1452 ASSERT3U(tidp->id, ==, trp->id);
1453 ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1454
1455 txp = tidp->txbuf;
1456 ASSERT(txp != NULL);
1457 ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1458
1459 switch (txp->tx_type) {
1460 case TX_DATA:
1461 /*
1462 * We must put the txid for each response we
1463 * acknowledge to make sure that we never have
1464 * more free slots than txids. Because of this
1465 * we do it here instead of waiting for it to
1466 * be done in xnf_data_txbuf_free_chain().
1467 */
1468 xnf_txid_put(xnfp, tidp);
1469 txp->tx_txreq.id = INVALID_TX_ID;
1470 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1471 txp->tx_head->tx_frags_to_ack--;
1472
1473 /*
1474 * We clean the whole chain once we got a
1475 * response for each fragment.
1476 */
1477 if (txp->tx_head->tx_frags_to_ack == 0)
1478 xnf_data_txbuf_free_chain(xnfp, txp);
1479
1480 break;
1481
1482 case TX_MCAST_REQ:
1483 txp->tx_type = TX_MCAST_RSP;
1484 txp->tx_status = trp->status;
1485 cv_broadcast(&xnfp->xnf_cv_multicast);
1486
1487 break;
1488
1489 default:
1490 cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1491 "invalid xnf_txbuf_t type: %d",
1492 txp->tx_type);
1493 break;
1494 }
1495 }
1496 /*
1497 * Record the last response we dealt with so that we
1498 * know where to start next time around.
1499 */
1500 xnfp->xnf_tx_ring.rsp_cons = prod;
1501 membar_enter();
1502 }
1503
1504 /* LINTED: constant in conditional context */
1505 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1506 if (work_to_do)
1507 goto loop;
1508
1509 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1510 }
1511
1512 /*
1513 * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1514 * to ensure that the packet is physically contiguous and contained
1515 * within a single page.
1516 */
1517 static xnf_buf_t *
1518 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1519 {
1520 xnf_buf_t *bd;
1521 caddr_t bp;
1522
1523 if ((bd = xnf_buf_get(xnfp, KM_NOSLEEP, B_TRUE)) == NULL) {
1524 return (NULL);
1525 }
1526
1527 bp = bd->buf;
1528 while (mp != NULL) {
1529 size_t len = MBLKL(mp);
1530
1531 bcopy(mp->b_rptr, bp, len);
1532 bp += len;
1533
1534 mp = mp->b_cont;
1535 }
1536
1537 *plen = bp - bd->buf;
1538 ASSERT3U(*plen, <=, PAGESIZE);
1539
1540 xnfp->xnf_stat_tx_lookaside++;
1541
1542 return (bd);
1543 }
1544
1545 /*
1546 * Insert the pseudo-header checksum into the packet.
1547 * Assumes packet is IPv4, TCP/UDP since we only advertised support for
1548 * HCKSUM_INET_FULL_V4.
1549 */
1550 int
1551 xnf_pseudo_cksum(mblk_t *mp)
1552 {
1553 struct ether_header *ehp;
1554 uint16_t sap, iplen, *stuff;
1555 uint32_t cksum;
1556 size_t len;
1557 ipha_t *ipha;
1558 ipaddr_t src, dst;
1559 uchar_t *ptr;
1560
1561 ptr = mp->b_rptr;
1562 len = MBLKL(mp);
1563
1564 /* Each header must fit completely in an mblk. */
1565 ASSERT3U(len, >=, sizeof (*ehp));
1566
1567 ehp = (struct ether_header *)ptr;
1568
1569 if (ntohs(ehp->ether_type) == VLAN_TPID) {
1570 struct ether_vlan_header *evhp;
1571 ASSERT3U(len, >=, sizeof (*evhp));
1572 evhp = (struct ether_vlan_header *)ptr;
1573 sap = ntohs(evhp->ether_type);
1574 ptr += sizeof (*evhp);
1575 len -= sizeof (*evhp);
1576 } else {
1577 sap = ntohs(ehp->ether_type);
1578 ptr += sizeof (*ehp);
1579 len -= sizeof (*ehp);
1580 }
1581
1582 ASSERT3U(sap, ==, ETHERTYPE_IP);
1583
1584 /*
1585 * Ethernet and IP headers may be in different mblks.
1586 */
1587 ASSERT3P(ptr, <=, mp->b_wptr);
1588 if (ptr == mp->b_wptr) {
1589 mp = mp->b_cont;
1590 ptr = mp->b_rptr;
1591 len = MBLKL(mp);
1592 }
1593
1594 ASSERT3U(len, >=, sizeof (ipha_t));
1595 ipha = (ipha_t *)ptr;
1596
1597 /*
1598 * We assume the IP header has no options. (This is enforced in
1599 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1600 */
1601 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1602 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1603
1604 ptr += IP_SIMPLE_HDR_LENGTH;
1605 len -= IP_SIMPLE_HDR_LENGTH;
1606
1607 /*
1608 * IP and L4 headers may be in different mblks.
1609 */
1610 ASSERT3P(ptr, <=, mp->b_wptr);
1611 if (ptr == mp->b_wptr) {
1612 mp = mp->b_cont;
1613 ptr = mp->b_rptr;
1614 len = MBLKL(mp);
1615 }
1616
1617 switch (ipha->ipha_protocol) {
1618 case IPPROTO_TCP:
1619 ASSERT3U(len, >=, sizeof (tcph_t));
1620 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1621 cksum = IP_TCP_CSUM_COMP;
1622 break;
1623 case IPPROTO_UDP:
1624 ASSERT3U(len, >=, sizeof (struct udphdr));
1625 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1626 cksum = IP_UDP_CSUM_COMP;
1627 break;
1628 default:
1629 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1630 ipha->ipha_protocol);
1631 return (EINVAL);
1632 }
1633
1634 src = ipha->ipha_src;
1635 dst = ipha->ipha_dst;
1636
1637 cksum += (dst >> 16) + (dst & 0xFFFF);
1638 cksum += (src >> 16) + (src & 0xFFFF);
1639 cksum += htons(iplen);
1640
1641 cksum = (cksum >> 16) + (cksum & 0xFFFF);
1642 cksum = (cksum >> 16) + (cksum & 0xFFFF);
1643
1644 ASSERT(cksum <= 0xFFFF);
1645
1646 *stuff = (uint16_t)(cksum ? cksum : ~cksum);
1647
1648 return (0);
1649 }
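
/*
 * In effect the value stored above is the folded 16-bit sum of just
 * the pseudo-header (sketch, not code):
 *
 *	fold16(src_hi + src_lo + dst_hi + dst_lo + proto + htons(iplen))
 *
 * with the protocol contribution supplied by IP_TCP_CSUM_COMP or
 * IP_UDP_CSUM_COMP. Because the transmit path also sets
 * NETTXF_csum_blank (see xnf_tx_setup_offload()), the peer later folds
 * in the L4 header and payload and writes the final complemented
 * checksum.
 */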
1650
1651 /*
1652 * Push a packet into the transmit ring.
1653 *
1654 * Note: the format of a tx packet that spans multiple slots is similar to
1655 * what is described in xnf_rx_one_packet().
1656 */
1657 static void
1658 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1659 {
1660 int nslots = 0;
1661 int extras = 0;
1662 RING_IDX slot;
1663 boolean_t notify;
1664
1665 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1666 ASSERT(xnfp->xnf_running);
1667
1668 slot = xnfp->xnf_tx_ring.req_prod_pvt;
1669
1670 /*
1671 * The caller has already checked that we have enough slots to proceed.
1672 */
1673 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1674 xnf_txid_t *tidp;
1675 netif_tx_request_t *txrp;
1676
1677 tidp = xnf_txid_get(xnfp);
1678 VERIFY(tidp != NULL);
1679 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1680
1681 txp->tx_slot = slot;
1682 txp->tx_txreq.id = tidp->id;
1683 *txrp = txp->tx_txreq;
1684
1685 tidp->txbuf = txp;
1686 slot++;
1687 nslots++;
1688
1689 /*
1690 * When present, LSO info is placed in a slot after the first
1691 * data segment, and doesn't require a txid.
1692 */
1693 if (txp->tx_txreq.flags & NETTXF_extra_info) {
1694 netif_extra_info_t *extra;
1695 ASSERT3U(nslots, ==, 1);
1696
1697 extra = (netif_extra_info_t *)
1698 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1699 *extra = txp->tx_extra;
1700 slot++;
1701 nslots++;
1702 extras = 1;
1703 }
1704 }
1705
1706 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1707
1708 /*
1709 * Store the number of data fragments.
1710 */
1711 head->tx_frags_to_ack = nslots - extras;
1712
1713 xnfp->xnf_tx_ring.req_prod_pvt = slot;
1714
1715 /*
1716 * Tell the peer that we sent something, if it cares.
1717 */
1718 /* LINTED: constant in conditional context */
1719 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1720 if (notify)
1721 ec_notify_via_evtchn(xnfp->xnf_evtchn);
1722 }
1723
1724 static xnf_txbuf_t *
1725 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1726 {
1727 xnf_txbuf_t *txp;
1728 size_t length;
1729
1730 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1731 return (NULL);
1732 }
1733
1734 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1735 if (txp->tx_bdesc == NULL) {
1736 xnf_data_txbuf_free(xnfp, txp);
1737 return (NULL);
1738 }
1739 txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1740 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1741 txp->tx_txreq.size = length;
1742 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1743 txp->tx_txreq.flags = 0;
1744
1745 return (txp);
1746 }
1747
1748 static xnf_txbuf_t *
1749 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1750 {
1751 xnf_txbuf_t *head = NULL;
1752 xnf_txbuf_t *tail = NULL;
1753 domid_t oeid;
1754 int nsegs = 0;
1755
1756 oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1757
1758 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1759 ddi_dma_handle_t dma_handle;
1760 const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev;
1761 xnf_txbuf_t *txp;
1762
1763 if (MBLKL(ml) == 0)
1764 continue;
1765
1766 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1767 goto error;
1768 }
1769
1770 if (head == NULL) {
1771 head = txp;
1772 } else {
1773 ASSERT(tail != NULL);
1774 TXBUF_SETNEXT(tail, txp);
1775 txp->tx_head = head;
1776 }
1777
1778 /*
1779 * The necessary segmentation rules (e.g. not crossing a page
1780 * boundary) are enforced by the dma attributes of the handle.
1781 */
1782 dma_handle = txp->tx_dma_handle;
1783 int ret = ddi_dma_addr_bind_handle(dma_handle,
1784 NULL, (char *)ml->b_rptr, MBLKL(ml),
1785 DDI_DMA_WRITE | DDI_DMA_STREAMING,
1786 DDI_DMA_DONTWAIT, 0, NULL, NULL);
1787 if (ret != DDI_DMA_MAPPED) {
1788 if (ret != DDI_DMA_NORESOURCES) {
1789 dev_err(xnfp->xnf_devinfo, CE_WARN,
1790 "ddi_dma_addr_bind_handle() failed "
1791 "[dma_error=%d]", ret);
1792 }
1793 goto error;
1794 }
1795 txp->tx_handle_bound = B_TRUE;
1796
1797 dma_cookie_prev = NULL;
1798 while ((dma_cookie = ddi_dma_cookie_iter(dma_handle,
1799 dma_cookie_prev)) != NULL) {
1800 if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1801 dev_err(xnfp->xnf_devinfo, CE_WARN,
1802 "xnf_dmamap_alloc() failed: "
1803 "too many segments");
1804 goto error;
1805 }
1806 if (dma_cookie_prev != NULL) {
1807 if ((txp = xnf_data_txbuf_alloc(xnfp,
1808 KM_NOSLEEP)) == NULL) {
1809 goto error;
1810 }
1811 ASSERT(tail != NULL);
1812 TXBUF_SETNEXT(tail, txp);
1813 txp->tx_head = head;
1814 }
1815
1816 txp->tx_mfn =
1817 xnf_btop(pa_to_ma(dma_cookie->dmac_laddress));
1818 txp->tx_txreq.gref = xnf_gref_get(xnfp);
1819 if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1820 dev_err(xnfp->xnf_devinfo, CE_WARN,
1821 "xnf_dmamap_alloc() failed: "
1822 "invalid grant ref");
1823 goto error;
1824 }
1825 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1826 oeid, txp->tx_mfn, 1);
1827 txp->tx_txreq.offset =
1828 dma_cookie->dmac_laddress & PAGEOFFSET;
1829 txp->tx_txreq.size = dma_cookie->dmac_size;
1830 txp->tx_txreq.flags = 0;
1831
1832 nsegs++;
1833
1834 if (tail != NULL)
1835 tail->tx_txreq.flags = NETTXF_more_data;
1836 tail = txp;
1837
1838 dma_cookie_prev = dma_cookie;
1839 }
1840 }
1841
1842 *countp = nsegs;
1843 return (head);
1844
1845 error:
1846 xnf_data_txbuf_free_chain(xnfp, head);
1847 return (NULL);
1848 }
1849
1850 static void
1851 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1852 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1853 {
1854 if (lso_flags != 0) {
1855 ASSERT3U(lso_flags, ==, HW_LSO);
1856 ASSERT3P(head->tx_bdesc, ==, NULL);
1857
1858 head->tx_txreq.flags |= NETTXF_extra_info;
1859 netif_extra_info_t *extra = &head->tx_extra;
1860 extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1861 extra->flags = 0;
1862 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1863 extra->u.gso.size = mss;
1864 extra->u.gso.features = 0;
1865 extra->u.gso.pad = 0;
1866 } else if (cksum_flags != 0) {
1867 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1868 /*
1869 * If the local protocol stack requests checksum
1870 * offload we set the 'checksum blank' flag,
1871 * indicating to the peer that we need the checksum
1872 * calculated for us.
1873 *
1874 * We _don't_ set the validated flag, because we haven't
1875 * validated that the data and the checksum match.
1876 *
1877 * Note: we already called xnf_pseudo_cksum() in
1878 * xnf_send(), so we just set the txreq flag here.
1879 */
1880 head->tx_txreq.flags |= NETTXF_csum_blank;
1881 xnfp->xnf_stat_tx_cksum_deferred++;
1882 }
1883 }
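
/*
 * Example (hypothetical LSO send with an MSS of 1460): the head
 * request ends up with NETTXF_extra_info set and the extra slot that
 * xnf_tx_push_packet() places directly after it carries
 * XEN_NETIF_EXTRA_TYPE_GSO with u.gso.size == 1460. A packet needing
 * only checksum offload instead gets just NETTXF_csum_blank on the
 * head request.
 */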
1884
1885 /*
1886 * Send packet mp. Called by the MAC framework.
1887 */
1888 static mblk_t *
1889 xnf_send(void *arg, mblk_t *mp)
1890 {
1891 xnf_t *xnfp = arg;
1892 xnf_txbuf_t *head;
1893 mblk_t *ml;
1894 int length;
1895 int pages, chunks, slots, slots_free;
1896 uint32_t cksum_flags, lso_flags, mss;
1897 boolean_t pulledup = B_FALSE;
1898 boolean_t force_copy = B_FALSE;
1899
1900 ASSERT3P(mp->b_next, ==, NULL);
1901
1902 mutex_enter(&xnfp->xnf_txlock);
1903
1904 /*
1905 * Wait until we are connected to the backend.
1906 */
1907 while (!xnfp->xnf_connected)
1908 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1909
1910 /*
1911 * To simplify logic and be in sync with the rescheduling mechanism,
1912 * we require the maximum amount of slots that could be used by a
1913 * transaction to be free before proceeding. The only downside of doing
1914 * this is that it slightly reduces the effective size of the ring.
1915 */
1916 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1917 if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1918 /*
1919 * We need to ask for a re-schedule later as the ring is full.
1920 */
1921 mutex_enter(&xnfp->xnf_schedlock);
1922 xnfp->xnf_need_sched = B_TRUE;
1923 mutex_exit(&xnfp->xnf_schedlock);
1924
1925 xnfp->xnf_stat_tx_defer++;
1926 mutex_exit(&xnfp->xnf_txlock);
1927 return (mp);
1928 }
1929
1930 /*
1931 * Get hw offload parameters.
1932 * This must be done before pulling up the mp as those parameters
1933 * are not copied over.
1934 */
1935 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1936 mac_lso_get(mp, &mss, &lso_flags);
1937
1938 /*
1939 * XXX: fix MAC framework so that we can advertise support for
1940 * partial checksum for IPv4 only. This way we won't need to calculate
1941 * the pseudo header checksum ourselves.
1942 */
1943 if (cksum_flags != 0) {
1944 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1945 (void) xnf_pseudo_cksum(mp);
1946 }
1947
1948 pulledup:
1949 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1950 ml = ml->b_cont, chunks++) {
1951 pages += xnf_mblk_pages(ml);
1952 length += MBLKL(ml);
1953 }
1954 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1955 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1956
1957 /*
1958 * If the ethernet header crosses a page boundary the packet
1959 * will be dropped by the backend. In practice it seems like
1960 * this happens fairly rarely so we'll do nothing unless the
1961 * packet is small enough to fit in a look-aside buffer.
1962 */
1963 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1964 sizeof (struct ether_header) > PAGESIZE) {
1965 xnfp->xnf_stat_tx_eth_hdr_split++;
1966 if (length <= PAGESIZE)
1967 force_copy = B_TRUE;
1968 }
1969
1970 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1971 /*
1972 * If the packet spans several pages and scatter-gather is not
1973 * supported then use a look-aside buffer.
1974 */
1975 ASSERT3U(length, <=, PAGESIZE);
1976 head = xnf_mblk_copy(xnfp, mp);
1977 if (head == NULL) {
1978 dev_err(xnfp->xnf_devinfo, CE_WARN,
1979 "xnf_mblk_copy() failed");
1980 goto drop;
1981 }
1982 } else {
1983 /*
1984 * There's a limit for how many pages can be passed to the
1985 * backend. If we pass that limit, the packet will be dropped
1986 * and some backend implementations (e.g. Linux) could even
1987 * offline the interface.
1988 */
1989 if (pages > XEN_MAX_TX_DATA_PAGES) {
1990 if (pulledup) {
1991 dev_err(xnfp->xnf_devinfo, CE_WARN,
1992 "too many pages, even after pullup: %d.",
1993 pages);
1994 goto drop;
1995 }
1996
1997 /*
1998 * Defragment packet if it spans too many pages.
1999 */
2000 mblk_t *newmp = msgpullup(mp, -1);
2001 if (newmp == NULL) {
2002 dev_err(xnfp->xnf_devinfo, CE_WARN,
2003 "msgpullup() failed");
2004 goto drop;
2005 }
2006
2007 freemsg(mp);
2008 mp = newmp;
2009 xnfp->xnf_stat_tx_pullup++;
2010 pulledup = B_TRUE;
2011 goto pulledup;
2012 }
2013
2014 head = xnf_mblk_map(xnfp, mp, &slots);
2015 if (head == NULL)
2016 goto drop;
2017
2018 IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2019 }
2020
2021 /*
2022 * Set tx_mp so that mblk is freed when the txbuf chain is freed.
2023 */
2024 head->tx_mp = mp;
2025
2026 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2027
2028 /*
2029 * The first request must store the total length of the packet.
2030 */
2031 head->tx_txreq.size = length;
2032
2033 /*
2034 * Push the packet we have prepared into the ring.
2035 */
2036 xnf_tx_push_packet(xnfp, head);
2037 xnfp->xnf_stat_opackets++;
2038 xnfp->xnf_stat_obytes += length;
2039
2040 mutex_exit(&xnfp->xnf_txlock);
2041 return (NULL);
2042
2043 drop:
2044 freemsg(mp);
2045 xnfp->xnf_stat_tx_drop++;
2046 mutex_exit(&xnfp->xnf_txlock);
2047 return (NULL);
2048 }
2049
2050 /*
2051 * Notification of RX packets. Currently no TX-complete interrupt is
2052 * used, as we clean the TX ring lazily.
2053 */
2054 static uint_t
2055 xnf_intr(caddr_t arg)
2056 {
2057 xnf_t *xnfp = (xnf_t *)arg;
2058 mblk_t *mp;
2059 boolean_t need_sched, clean_ring;
2060
2061 mutex_enter(&xnfp->xnf_rxlock);
2062
2063 /*
2064 * Interrupts before we are connected are spurious.
2065 */
2066 if (!xnfp->xnf_connected) {
2067 mutex_exit(&xnfp->xnf_rxlock);
2068 xnfp->xnf_stat_unclaimed_interrupts++;
2069 return (DDI_INTR_UNCLAIMED);
2070 }
2071
2072 /*
2073 * Receive side processing.
2074 */
2075 do {
2076 /*
2077 * Collect buffers from the ring.
2078 */
2079 xnf_rx_collect(xnfp);
2080
2081 /*
2082 * Interrupt me when the next receive buffer is consumed.
2083 */
2084 xnfp->xnf_rx_ring.sring->rsp_event =
2085 xnfp->xnf_rx_ring.rsp_cons + 1;
2086 xen_mb();
2087
2088 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2089
2090 if (xnfp->xnf_rx_new_buffers_posted) {
2091 boolean_t notify;
2092
2093 /*
2094 * Indicate to the peer that we have re-filled the
2095 * receive ring, if it cares.
2096 */
2097 /* LINTED: constant in conditional context */
2098 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2099 if (notify)
2100 ec_notify_via_evtchn(xnfp->xnf_evtchn);
2101 xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2102 }
2103
2104 mp = xnfp->xnf_rx_head;
2105 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2106
2107 xnfp->xnf_stat_interrupts++;
2108 mutex_exit(&xnfp->xnf_rxlock);
2109
2110 if (mp != NULL)
2111 mac_rx(xnfp->xnf_mh, NULL, mp);
2112
2113 /*
2114 * Transmit side processing.
2115 *
2116 * If a previous transmit attempt failed or we have pending
2117 * multicast requests, clean the ring.
2118 *
2119 * If we previously stalled transmission and cleaning produces
2120 * some free slots, tell upstream to attempt sending again.
2121 *
2122 * The odd style is to avoid acquiring xnf_txlock unless we
2123 * will actually look inside the tx machinery.
2124 */
2125 mutex_enter(&xnfp->xnf_schedlock);
2126 need_sched = xnfp->xnf_need_sched;
2127 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2128 mutex_exit(&xnfp->xnf_schedlock);
2129
2130 if (clean_ring) {
2131 int free_slots;
2132
2133 mutex_enter(&xnfp->xnf_txlock);
2134 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2135
2136 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2137 mutex_enter(&xnfp->xnf_schedlock);
2138 xnfp->xnf_need_sched = B_FALSE;
2139 mutex_exit(&xnfp->xnf_schedlock);
2140
2141 mac_tx_update(xnfp->xnf_mh);
2142 }
2143 mutex_exit(&xnfp->xnf_txlock);
2144 }
2145
2146 return (DDI_INTR_CLAIMED);
2147 }
2148
2149 /*
2150 * xnf_start() -- start the device; packets are now accepted from and
 * delivered to the MAC layer.
2151 */
2152 static int
2153 xnf_start(void *arg)
2154 {
2155 xnf_t *xnfp = arg;
2156
2157 mutex_enter(&xnfp->xnf_rxlock);
2158 mutex_enter(&xnfp->xnf_txlock);
2159
2160 /* Accept packets from above. */
2161 xnfp->xnf_running = B_TRUE;
2162
2163 mutex_exit(&xnfp->xnf_txlock);
2164 mutex_exit(&xnfp->xnf_rxlock);
2165
2166 return (0);
2167 }
2168
2169 /* xnf_stop() - stop the device; no further packets are accepted or delivered */
2170 static void
2171 xnf_stop(void *arg)
2172 {
2173 xnf_t *xnfp = arg;
2174
2175 mutex_enter(&xnfp->xnf_rxlock);
2176 mutex_enter(&xnfp->xnf_txlock);
2177
2178 xnfp->xnf_running = B_FALSE;
2179
2180 mutex_exit(&xnfp->xnf_txlock);
2181 mutex_exit(&xnfp->xnf_rxlock);
2182 }
2183
2184 /*
2185 * Hang buffer `bdesc' on the RX ring.
2186 */
2187 static void
2188 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2189 {
2190 netif_rx_request_t *reqp;
2191 RING_IDX hang_ix;
2192
2193 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2194
2195 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2196 xnfp->xnf_rx_ring.req_prod_pvt);
2197 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2198 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2199
2200 reqp->id = bdesc->id = hang_ix;
2201 reqp->gref = bdesc->grant_ref;
2202
2203 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2204 xnfp->xnf_rx_ring.req_prod_pvt++;
2205
2206 xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2207 }
2208
2209 /*
2210 * Receive an entire packet from the ring, starting from slot *consp.
2211 * prod indicates the slot of the latest response.
2212 * On return, *consp will point to the head of the next packet.
2213 *
2214 * Note: If slot prod was reached before we could gather a full packet, we will
2215 * drop the partial packet; this would most likely indicate a bug in either
2216 * the front-end or the back-end driver.
2217 *
2218 * An rx packet can consist of several fragments and thus span multiple slots.
2219 * Each fragment can contain up to 4k of data.
2220 *
2221 * A typical 9000 MTU packet will look like this:
2222 * +------+---------------------+-------------------+-----------------------+
2223 * | SLOT | TYPE | CONTENTS | FLAGS |
2224 * +------+---------------------+-------------------+-----------------------+
2225 * | 1 | netif_rx_response_t | 1st data fragment | more_data |
2226 * +------+---------------------+-------------------+-----------------------+
2227 * | 2 | netif_rx_response_t | 2nd data fragment | more_data |
2228 * +------+---------------------+-------------------+-----------------------+
2229 * | 3 | netif_rx_response_t | 3rd data fragment | [none] |
2230 * +------+---------------------+-------------------+-----------------------+
2231 *
2232 * Fragments are chained by setting NETRXF_more_data in the previous
2233 * response's flags. If there are additional flags, such as
2234 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2235 * first fragment.
2236 *
2237 * Sometimes extra info can be present. If so, it will follow the first
2238 * fragment, and NETRXF_extra_info flag will be set on the first response.
2239 * If LRO is set on a packet, it will be stored in the extra info. Conforming
2240 * to the spec, extra info can also be chained, but must all be present right
2241 * after the first fragment.
2242 *
2243 * Example of a packet with 2 extra infos:
2244 * +------+---------------------+-------------------+-----------------------+
2245 * | SLOT | TYPE | CONTENTS | FLAGS |
2246 * +------+---------------------+-------------------+-----------------------+
2247 * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2248 * +------+---------------------+-------------------+-----------------------+
2249 * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE |
2250 * +------+---------------------+-------------------+-----------------------+
2251 * | 3 | netif_extra_info_t | 2nd extra info | [none] |
2252 * +------+---------------------+-------------------+-----------------------+
2253 * | 4 | netif_rx_response_t | 2nd data fragment | more_data |
2254 * +------+---------------------+-------------------+-----------------------+
2255 * | 5 | netif_rx_response_t | 3rd data fragment | more_data |
2256 * +------+---------------------+-------------------+-----------------------+
2257 * | 6 | netif_rx_response_t | 4th data fragment | [none] |
2258 * +------+---------------------+-------------------+-----------------------+
2259 *
2260 * In practice, the only extra we expect is for LRO, but only if we advertise
2261 * that we support it to the backend (xnf_enable_lro == TRUE).
2262 */
2263 static int
2264 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2265 {
2266 mblk_t *head = NULL;
2267 mblk_t *tail = NULL;
2268 mblk_t *mp;
2269 int error = 0;
2270 RING_IDX cons = *consp;
2271 netif_extra_info_t lro;
2272 boolean_t is_lro = B_FALSE;
2273 boolean_t is_extra = B_FALSE;
2274
2275 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2276
2277 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2278 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2279 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2280
2281 IMPLY(more_data, xnf_enable_rx_sg);
2282
2283 while (cons != prod) {
2284 xnf_buf_t *bdesc;
2285 int len, off;
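		/*
		 * The ring size is a power of two, so masking the
		 * free-running consumer index yields the corresponding
		 * slot in xnf_rx_pkt_info[].
		 */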
2286 int rxidx = cons & (NET_RX_RING_SIZE - 1);
2287
2288 bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2289 xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2290
2291 if (is_extra) {
2292 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2293 /*
2294 * The only extra we expect is for LRO, and it should
2295 * only be present once.
2296 */
2297 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2298 !is_lro) {
2299 ASSERT(xnf_enable_lro);
2300 lro = *extra;
2301 is_lro = B_TRUE;
2302 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2303 } else {
2304 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2305 "contains unexpected extra info of type %d",
2306 extra->type);
2307 error = EINVAL;
2308 }
2309 more_extra =
2310 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2311
2312 goto hang_buf;
2313 }
2314
2315 ASSERT3U(bdesc->id, ==, rsp.id);
2316
2317 /*
2318 * status stores packet length when >= 0, or errors when < 0.
2319 */
2320 len = rsp.status;
2321 off = rsp.offset;
2322 more_data = (rsp.flags & NETRXF_more_data) != 0;
2323
2324 /*
2325 * sanity checks.
2326 */
2327 if (!xnfp->xnf_running) {
2328 error = EBUSY;
2329 } else if (len <= 0) {
2330 xnfp->xnf_stat_errrx++;
2331
2332 switch (len) {
2333 case 0:
2334 xnfp->xnf_stat_runt++;
2335 break;
2336 case NETIF_RSP_ERROR:
2337 xnfp->xnf_stat_mac_rcv_error++;
2338 break;
2339 case NETIF_RSP_DROPPED:
2340 xnfp->xnf_stat_norxbuf++;
2341 break;
2342 }
2343 error = EINVAL;
2344 } else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2345 dev_err(xnfp->xnf_devinfo, CE_WARN,
2346 "Bad rx grant reference, rsp id %d", rsp.id);
2347 error = EINVAL;
2348 } else if ((off + len) > PAGESIZE) {
2349 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2350 "page boundary (offset %d, length %d)", off, len);
2351 error = EINVAL;
2352 }
2353
2354 if (error != 0) {
2355 /*
2356 * If an error has been detected, we do not attempt
2357 * to read the data but we still need to replace
2358 * the rx bufs.
2359 */
2360 goto hang_buf;
2361 }
2362
2363 xnf_buf_t *nbuf = NULL;
2364
2365 /*
2366 * If the packet is below a pre-determined size we will
2367 * copy data out of the buf rather than replace it.
2368 */
2369 if (len > xnf_rx_copy_limit)
2370 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2371
2372 if (nbuf != NULL) {
2373 mp = desballoc((unsigned char *)bdesc->buf,
2374 bdesc->len, 0, &bdesc->free_rtn);
2375
2376 if (mp == NULL) {
2377 xnfp->xnf_stat_rx_desballoc_fail++;
2378 xnfp->xnf_stat_norxbuf++;
2379 error = ENOMEM;
2380 /*
2381 * we free the buf we just allocated as we
2382 * will re-hang the old buf.
2383 */
2384 xnf_buf_put(xnfp, nbuf, B_FALSE);
2385 goto hang_buf;
2386 }
2387
2388 mp->b_rptr = mp->b_rptr + off;
2389 mp->b_wptr = mp->b_rptr + len;
2390
2391 /*
2392 * Release the grant as the backend doesn't need to
2393 * access this buffer anymore and grants are scarce.
2394 */
2395 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2396 0);
2397 xnf_gref_put(xnfp, bdesc->grant_ref);
2398 bdesc->grant_ref = INVALID_GRANT_REF;
2399
2400 bdesc = nbuf;
2401 } else {
2402 /*
2403 * We failed to allocate a new buf or decided to reuse
2404 * the old one. In either case we copy the data off it
2405 * and put it back into the ring.
2406 */
2407 mp = allocb(len, 0);
2408 if (mp == NULL) {
2409 xnfp->xnf_stat_rx_allocb_fail++;
2410 xnfp->xnf_stat_norxbuf++;
2411 error = ENOMEM;
2412 goto hang_buf;
2413 }
2414 bcopy(bdesc->buf + off, mp->b_wptr, len);
2415 mp->b_wptr += len;
2416 }
2417
2418 if (head == NULL)
2419 head = mp;
2420 else
2421 tail->b_cont = mp;
2422 tail = mp;
2423
2424 hang_buf:
2425 /*
2426 * No matter what happens, for each response we need to hang
2427 * a new buf on the rx ring. Put either the old one, or a new
2428 * one if the old one is borrowed by the kernel via desballoc().
2429 */
2430 xnf_rxbuf_hang(xnfp, bdesc);
2431 cons++;
2432
2433 /* next response is an extra */
2434 is_extra = more_extra;
2435
2436 if (!more_data && !more_extra)
2437 break;
2438
2439 /*
2440 * Note that since requests and responses are union'd on the
2441 * same ring, we copy the response to a local variable instead
2442 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2443 * overwritten contents of rsp.
2444 */
2445 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2446 }
2447
2448 /*
2449 * Check that we do not get stuck in a loop.
2450 */
2451 ASSERT3U(*consp, !=, cons);
2452 *consp = cons;
2453
2454 /*
2455 * We ran out of responses but the flags indicate there is more data.
2456 */
2457 if (more_data) {
2458 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2459 error = EINVAL;
2460 }
2461 if (more_extra) {
2462 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2463 "(extras).");
2464 error = EINVAL;
2465 }
2466
2467 /*
2468 * An error means the packet must be dropped. If we have already formed
2469 * a partial packet, then discard it.
2470 */
2471 if (error != 0) {
2472 if (head != NULL)
2473 freemsg(head);
2474 xnfp->xnf_stat_rx_drop++;
2475 return (error);
2476 }
2477
2478 ASSERT(head != NULL);
2479
2480 if (hwcsum) {
2481 /*
2482 * If the peer says that the data has been validated then we
2483 * declare that the full checksum has been verified.
2484 *
2485 * We don't look at the "checksum blank" flag, and hence could
2486 * have a packet here that we are asserting is good with
2487 * a blank checksum.
2488 */
2489 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2490 xnfp->xnf_stat_rx_cksum_no_need++;
2491 }
2492
2493 /* XXX: set lro info for packet once LRO is supported in OS. */
2494
2495 *mpp = head;
2496
2497 return (0);
2498 }
2499
2500 /*
2501 * Collect packets from the RX ring, storing them in `xnfp' for later use.
2502 */
2503 static void
2504 xnf_rx_collect(xnf_t *xnfp)
2505 {
2506 RING_IDX prod;
2507
2508 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2509
2510 prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2511 /*
2512 * Ensure we see queued responses up to 'prod'.
2513 */
2514 membar_consumer();
2515
2516 while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2517 mblk_t *mp;
2518
2519 /*
2520 * Collect a packet.
2521 * rsp_cons is updated inside xnf_rx_one_packet().
2522 */
2523 int error = xnf_rx_one_packet(xnfp, prod,
2524 &xnfp->xnf_rx_ring.rsp_cons, &mp);
2525 if (error == 0) {
2526 xnfp->xnf_stat_ipackets++;
2527 xnfp->xnf_stat_rbytes += xmsgsize(mp);
2528
2529 /*
2530 * Append the mblk to the rx list.
2531 */
2532 if (xnfp->xnf_rx_head == NULL) {
2533 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2534 xnfp->xnf_rx_head = mp;
2535 } else {
2536 ASSERT(xnfp->xnf_rx_tail != NULL);
2537 xnfp->xnf_rx_tail->b_next = mp;
2538 }
2539 xnfp->xnf_rx_tail = mp;
2540 }
2541 }
2542 }
2543
2544 /*
2545 * xnf_alloc_dma_resources() -- allocate the TX and RX descriptor rings.
2546 */
2547 static int
2548 xnf_alloc_dma_resources(xnf_t *xnfp)
2549 {
2550 dev_info_t *devinfo = xnfp->xnf_devinfo;
2551 size_t len;
2552 ddi_dma_cookie_t dma_cookie;
2553 uint_t ncookies;
2554 int rc;
2555 caddr_t rptr;
2556
2557 /*
2558 * The code below allocates all the DMA data structures that
2559 * need to be released when the driver is detached.
2560 *
2561 * Allocate page for the transmit descriptor ring.
2562 */
2563 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2564 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2565 goto alloc_error;
2566
2567 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2568 PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2569 DDI_DMA_SLEEP, 0, &rptr, &len,
2570 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2571 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2572 xnfp->xnf_tx_ring_dma_handle = NULL;
2573 goto alloc_error;
2574 }
2575
2576 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2577 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2578 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2579 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2580 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2581 xnfp->xnf_tx_ring_dma_handle = NULL;
2582 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2583 if (rc == DDI_DMA_NORESOURCES)
2584 goto alloc_error;
2585 else
2586 goto error;
2587 }
2588
2589 ASSERT(ncookies == 1);
2590 bzero(rptr, PAGESIZE);
2591 /* LINTED: constant in conditional context */
2592 SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2593 /* LINTED: constant in conditional context */
2594 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2595 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2596
2597 /*
2598 * Allocate page for the receive descriptor ring.
2599 */
2600 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2601 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2602 goto alloc_error;
2603
2604 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2605 PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2606 DDI_DMA_SLEEP, 0, &rptr, &len,
2607 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2608 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2609 xnfp->xnf_rx_ring_dma_handle = NULL;
2610 goto alloc_error;
2611 }
2612
2613 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2614 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2615 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2616 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2617 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2618 xnfp->xnf_rx_ring_dma_handle = NULL;
2619 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2620 if (rc == DDI_DMA_NORESOURCES)
2621 goto alloc_error;
2622 else
2623 goto error;
2624 }
2625
2626 ASSERT(ncookies == 1);
2627 bzero(rptr, PAGESIZE);
2628 /* LINTED: constant in conditional context */
2629 SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2630 /* LINTED: constant in conditional context */
2631 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2632 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2633
2634 return (DDI_SUCCESS);
2635
2636 alloc_error:
2637 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2638 ddi_get_instance(xnfp->xnf_devinfo));
2639 error:
2640 xnf_release_dma_resources(xnfp);
2641 return (DDI_FAILURE);
2642 }
2643
2644 /*
2645 * Release all DMA resources in the opposite order from acquisition
2646 */
2647 static void
2648 xnf_release_dma_resources(xnf_t *xnfp)
2649 {
2650 int i;
2651
2652 /*
2653 * Free receive buffers which are currently associated with
2654 * descriptors.
2655 */
2656 mutex_enter(&xnfp->xnf_rxlock);
2657 for (i = 0; i < NET_RX_RING_SIZE; i++) {
2658 xnf_buf_t *bp;
2659
2660 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2661 continue;
2662 xnfp->xnf_rx_pkt_info[i] = NULL;
2663 xnf_buf_put(xnfp, bp, B_FALSE);
2664 }
2665 mutex_exit(&xnfp->xnf_rxlock);
2666
2667 /* Free the receive ring buffer. */
2668 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2669 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2670 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2671 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2672 xnfp->xnf_rx_ring_dma_acchandle = NULL;
2673 }
2674 /* Free the transmit ring buffer. */
2675 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2676 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2677 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2678 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2679 xnfp->xnf_tx_ring_dma_acchandle = NULL;
2680 }
2681
2682 }
2683
2684 /*
2685 * Release any packets and associated structures used by the TX ring.
2686 */
2687 static void
2688 xnf_release_mblks(xnf_t *xnfp)
2689 {
2690 RING_IDX i;
2691 xnf_txid_t *tidp;
2692
2693 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2694 i < NET_TX_RING_SIZE;
2695 i++, tidp++) {
2696 xnf_txbuf_t *txp = tidp->txbuf;
2697
2698 if (txp != NULL) {
2699 ASSERT(txp->tx_mp != NULL);
2700 freemsg(txp->tx_mp);
2701
2702 xnf_txid_put(xnfp, tidp);
2703 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2704 }
2705 }
2706 }
2707
2708 static int
2709 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2710 {
2711 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2712 xnf_buf_t *bdesc = buf;
2713 xnf_t *xnfp = arg;
2714 ddi_dma_cookie_t dma_cookie;
2715 uint_t ncookies;
2716 size_t len;
2717
2718 if (kmflag & KM_NOSLEEP)
2719 ddiflags = DDI_DMA_DONTWAIT;
2720
2721 /* Allocate a DMA access handle for the buffer. */
2722 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2723 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2724 goto failure;
2725
2726 /* Allocate DMA-able memory for buffer. */
2727 if (ddi_dma_mem_alloc(bdesc->dma_handle,
2728 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2729 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2730 goto failure_1;
2731
2732 /* Bind to virtual address of buffer to get physical address. */
2733 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2734 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2735 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2736 goto failure_2;
2737 ASSERT(ncookies == 1);
2738
2739 bdesc->free_rtn.free_func = xnf_buf_recycle;
2740 bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2741 bdesc->xnfp = xnfp;
2742 bdesc->buf_phys = dma_cookie.dmac_laddress;
2743 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2744 bdesc->len = dma_cookie.dmac_size;
2745 bdesc->grant_ref = INVALID_GRANT_REF;
2746 bdesc->gen = xnfp->xnf_gen;
2747
2748 atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2749
2750 return (0);
2751
2752 failure_2:
2753 ddi_dma_mem_free(&bdesc->acc_handle);
2754
2755 failure_1:
2756 ddi_dma_free_handle(&bdesc->dma_handle);
2757
2758 failure:
2759
2760 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2761 return (-1);
2762 }
2763
2764 static void
2765 xnf_buf_destructor(void *buf, void *arg)
2766 {
2767 xnf_buf_t *bdesc = buf;
2768 xnf_t *xnfp = arg;
2769
2770 (void) ddi_dma_unbind_handle(bdesc->dma_handle);
2771 ddi_dma_mem_free(&bdesc->acc_handle);
2772 ddi_dma_free_handle(&bdesc->dma_handle);
2773
2774 atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2775 }
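/*
 * A rough sketch of how the constructor/destructor pair above would be
 * registered with the kmem allocator during attach (the exact cache name
 * and flags used by the driver's attach path are assumptions here):
 *
 *	xnfp->xnf_buf_cache = kmem_cache_create("xnf_buf_cache",
 *	    sizeof (xnf_buf_t), 0, xnf_buf_constructor, xnf_buf_destructor,
 *	    NULL, xnfp, NULL, 0);
 */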
2776
2777 static xnf_buf_t *
2778 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2779 {
2780 grant_ref_t gref;
2781 xnf_buf_t *bufp;
2782
2783 /*
2784 * Usually grant references are more scarce than memory, so we
2785 * attempt to acquire a grant reference first.
2786 */
2787 gref = xnf_gref_get(xnfp);
2788 if (gref == INVALID_GRANT_REF)
2789 return (NULL);
2790
2791 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2792 if (bufp == NULL) {
2793 xnf_gref_put(xnfp, gref);
2794 return (NULL);
2795 }
2796
2797 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2798
2799 bufp->grant_ref = gref;
2800
2801 if (bufp->gen != xnfp->xnf_gen)
2802 xnf_buf_refresh(bufp);
2803
2804 gnttab_grant_foreign_access_ref(bufp->grant_ref,
2805 xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2806 bufp->buf_mfn, readonly ? 1 : 0);
2807
2808 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2809
2810 return (bufp);
2811 }
2812
2813 static void
2814 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2815 {
2816 if (bufp->grant_ref != INVALID_GRANT_REF) {
2817 (void) gnttab_end_foreign_access_ref(
2818 bufp->grant_ref, readonly ? 1 : 0);
2819 xnf_gref_put(xnfp, bufp->grant_ref);
2820 bufp->grant_ref = INVALID_GRANT_REF;
2821 }
2822
2823 kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2824
2825 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2826 }
2827
2828 /*
2829 * Refresh any cached data about a buffer after resume.
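 * In particular, the MFN backing the buffer can change across a
 * save/restore or live migration, so it must be recomputed.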
2830 */
2831 static void
2832 xnf_buf_refresh(xnf_buf_t *bdesc)
2833 {
2834 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2835 bdesc->gen = bdesc->xnfp->xnf_gen;
2836 }
2837
2838 /*
2839 * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2840 * look-aside buffers.
2841 */
2842 static void
2843 xnf_buf_recycle(xnf_buf_t *bdesc)
2844 {
2845 xnf_t *xnfp = bdesc->xnfp;
2846
2847 xnf_buf_put(xnfp, bdesc, B_TRUE);
2848 }
2849
2850 static int
2851 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2852 {
2853 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2854 xnf_txbuf_t *txp = buf;
2855 xnf_t *xnfp = arg;
2856
2857 if (kmflag & KM_NOSLEEP)
2858 ddiflags = DDI_DMA_DONTWAIT;
2859
2860 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2861 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2862 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2863 return (-1);
2864 }
2865
2866 return (0);
2867 }
2868
2869 static void
2870 xnf_tx_buf_destructor(void *buf, void *arg)
2871 {
2872 _NOTE(ARGUNUSED(arg));
2873 xnf_txbuf_t *txp = buf;
2874
2875 ddi_dma_free_handle(&txp->tx_dma_handle);
2876 }
2877
2878 /*
2879 * Statistics.
2880 */
2881 static char *xnf_aux_statistics[] = {
2882 "tx_cksum_deferred",
2883 "rx_cksum_no_need",
2884 "interrupts",
2885 "unclaimed_interrupts",
2886 "tx_pullup",
2887 "tx_lookaside",
2888 "tx_drop",
2889 "tx_eth_hdr_split",
2890 "buf_allocated",
2891 "buf_outstanding",
2892 "gref_outstanding",
2893 "gref_failure",
2894 "gref_peak",
2895 "rx_allocb_fail",
2896 "rx_desballoc_fail",
2897 };
2898
2899 static int
2900 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2901 {
2902 xnf_t *xnfp;
2903 kstat_named_t *knp;
2904
2905 if (flag != KSTAT_READ)
2906 return (EACCES);
2907
2908 xnfp = ksp->ks_private;
2909 knp = ksp->ks_data;
2910
2911 /*
2912 * Assignment order must match that of the names in
2913 * xnf_aux_statistics.
2914 */
2915 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2916 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2917
2918 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2919 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2920 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2921 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2922 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2923 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2924
2925 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2926 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2927 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2928 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2929 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2930 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2931 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2932
2933 return (0);
2934 }
2935
2936 static boolean_t
2937 xnf_kstat_init(xnf_t *xnfp)
2938 {
2939 int nstat = sizeof (xnf_aux_statistics) /
2940 sizeof (xnf_aux_statistics[0]);
2941 char **cp = xnf_aux_statistics;
2942 kstat_named_t *knp;
2943
2944 /*
2945 * Create and initialise kstats.
2946 */
2947 if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2948 ddi_get_instance(xnfp->xnf_devinfo),
2949 "aux_statistics", "net", KSTAT_TYPE_NAMED,
2950 nstat, 0)) == NULL)
2951 return (B_FALSE);
2952
2953 xnfp->xnf_kstat_aux->ks_private = xnfp;
2954 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2955
2956 knp = xnfp->xnf_kstat_aux->ks_data;
2957 while (nstat > 0) {
2958 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2959
2960 knp++;
2961 cp++;
2962 nstat--;
2963 }
2964
2965 kstat_install(xnfp->xnf_kstat_aux);
2966
2967 return (B_TRUE);
2968 }
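/*
 * Once installed, these counters can be inspected from userland with
 * kstat(8), e.g. (instance number illustrative):
 *
 *	kstat -m xnf -i 0 -n aux_statistics
 */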
2969
2970 static int
2971 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2972 {
2973 xnf_t *xnfp = arg;
2974
2975 mutex_enter(&xnfp->xnf_rxlock);
2976 mutex_enter(&xnfp->xnf_txlock);
2977
2978 #define mac_stat(q, r) \
2979 case (MAC_STAT_##q): \
2980 *val = xnfp->xnf_stat_##r; \
2981 break
2982
2983 #define ether_stat(q, r) \
2984 case (ETHER_STAT_##q): \
2985 *val = xnfp->xnf_stat_##r; \
2986 break
2987
2988 switch (stat) {
2989
2990 mac_stat(IPACKETS, ipackets);
2991 mac_stat(OPACKETS, opackets);
2992 mac_stat(RBYTES, rbytes);
2993 mac_stat(OBYTES, obytes);
2994 mac_stat(NORCVBUF, norxbuf);
2995 mac_stat(IERRORS, errrx);
2996 mac_stat(NOXMTBUF, tx_defer);
2997
2998 ether_stat(MACRCV_ERRORS, mac_rcv_error);
2999 ether_stat(TOOSHORT_ERRORS, runt);
3000
3001 /* always claim to be in full duplex mode */
3002 case ETHER_STAT_LINK_DUPLEX:
3003 *val = LINK_DUPLEX_FULL;
3004 break;
3005
3006 /* always claim to be at 1Gb/s link speed */
3007 case MAC_STAT_IFSPEED:
3008 *val = 1000000000ull;
3009 break;
3010
3011 default:
3012 mutex_exit(&xnfp->xnf_txlock);
3013 mutex_exit(&xnfp->xnf_rxlock);
3014
3015 return (ENOTSUP);
3016 }
3017
3018 #undef mac_stat
3019 #undef ether_stat
3020
3021 mutex_exit(&xnfp->xnf_txlock);
3022 mutex_exit(&xnfp->xnf_rxlock);
3023
3024 return (0);
3025 }
3026
3027 static int
3028 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3029 {
3030 if (mtu > ETHERMTU) {
3031 if (!xnf_enable_tx_sg) {
3032 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3033 "because scatter-gather is disabled for transmit "
3034 "in driver settings", ETHERMTU);
3035 return (EINVAL);
3036 } else if (!xnf_enable_rx_sg) {
3037 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3038 "because scatter-gather is disabled for receive "
3039 "in driver settings", ETHERMTU);
3040 return (EINVAL);
3041 } else if (!xnfp->xnf_be_tx_sg) {
3042 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3043 "because backend doesn't support scatter-gather",
3044 ETHERMTU);
3045 return (EINVAL);
3046 }
3047 if (mtu > XNF_MAXPKT)
3048 return (EINVAL);
3049 }
3050 int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3051 if (error == 0)
3052 xnfp->xnf_mtu = mtu;
3053
3054 return (error);
3055 }
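/*
 * From userland the MTU is normally changed via dladm(8), which reaches this
 * function through xnf_setprop(); e.g. (link name illustrative):
 *
 *	dladm set-linkprop -p mtu=9000 xnf0
 */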
3056
3057 /*ARGSUSED*/
3058 static int
3059 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3060 uint_t prop_val_size, void *prop_val)
3061 {
3062 xnf_t *xnfp = data;
3063
3064 switch (prop_id) {
3065 case MAC_PROP_MTU:
3066 ASSERT(prop_val_size >= sizeof (uint32_t));
3067 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3068 break;
3069 default:
3070 return (ENOTSUP);
3071 }
3072 return (0);
3073 }
3074
3075 /*ARGSUSED*/
3076 static int
3077 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3078 uint_t prop_val_size, const void *prop_val)
3079 {
3080 xnf_t *xnfp = data;
3081 uint32_t new_mtu;
3082 int error;
3083
3084 switch (prop_id) {
3085 case MAC_PROP_MTU:
3086 ASSERT(prop_val_size >= sizeof (uint32_t));
3087 bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3088 error = xnf_change_mtu(xnfp, new_mtu);
3089 break;
3090 default:
3091 return (ENOTSUP);
3092 }
3093
3094 return (error);
3095 }
3096
3097 /*ARGSUSED*/
3098 static void
3099 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3100 mac_prop_info_handle_t prop_handle)
3101 {
3102 switch (prop_id) {
3103 case MAC_PROP_MTU:
3104 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3105 break;
3106 default:
3107 break;
3108 }
3109 }
3110
3111 static boolean_t
3112 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3113 {
3114 xnf_t *xnfp = arg;
3115
3116 switch (cap) {
3117 case MAC_CAPAB_HCKSUM: {
3118 uint32_t *capab = cap_data;
3119
3120 /*
3121 * Whilst the flag used to communicate with the IO
3122 * domain is called "NETTXF_csum_blank", the checksum
3123 * in the packet must contain the pseudo-header
3124 * checksum and not zero.
3125 *
3126 * To help out the IO domain, we might use
3127 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3128 * then use checksum offload for IPv6 packets, which
3129 * the IO domain can't handle.
3130 *
3131 * As a result, we declare ourselves capable of
3132 * HCKSUM_INET_FULL_V4. This means that we receive
3133 * IPv4 packets from the stack with a blank checksum
3134 * field and must insert the pseudo-header checksum
3135 * before passing the packet to the IO domain.
3136 */
3137 *capab = HCKSUM_INET_FULL_V4;
3138
3139 /*
3140 * TODO: query the "feature-ipv6-csum-offload" capability.
3141 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3142 */
3143
3144 break;
3145 }
3146 case MAC_CAPAB_LSO: {
3147 if (!xnfp->xnf_be_lso)
3148 return (B_FALSE);
3149
3150 mac_capab_lso_t *lso = cap_data;
3151 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3152 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3153 break;
3154 }
3155 default:
3156 return (B_FALSE);
3157 }
3158
3159 return (B_TRUE);
3160 }
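/*
 * xnf_getcapab() is presumably wired into the driver's mac_callbacks_t as the
 * mc_getcapab entry; the mac_capab_update() call in oe_state_change() below
 * causes the framework to invoke it again once the backend is known.
 */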
3161
3162 /*
3163 * The state of the peer has changed - react accordingly.
3164 */
3165 static void
3166 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3167 void *arg, void *impl_data)
3168 {
3169 _NOTE(ARGUNUSED(id, arg));
3170 xnf_t *xnfp = ddi_get_driver_private(dip);
3171 XenbusState new_state = *(XenbusState *)impl_data;
3172
3173 ASSERT(xnfp != NULL);
3174
3175 switch (new_state) {
3176 case XenbusStateUnknown:
3177 case XenbusStateInitialising:
3178 case XenbusStateInitialised:
3179 case XenbusStateClosing:
3180 case XenbusStateClosed:
3181 case XenbusStateReconfiguring:
3182 case XenbusStateReconfigured:
3183 break;
3184
3185 case XenbusStateInitWait:
3186 xnf_read_config(xnfp);
3187
3188 if (!xnfp->xnf_be_rx_copy) {
3189 cmn_err(CE_WARN,
3190 "The xnf driver requires a dom0 that "
3191 "supports 'feature-rx-copy'.");
3192 (void) xvdi_switch_state(xnfp->xnf_devinfo,
3193 XBT_NULL, XenbusStateClosed);
3194 break;
3195 }
3196
3197 /*
3198 * Connect to the backend.
3199 */
3200 xnf_be_connect(xnfp);
3201
3202 /*
3203 * Our MAC address as discovered by xnf_read_config().
3204 */
3205 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3206
3207 /*
3208 * We do not know if some features such as LSO are supported
3209 * until we connect to the backend. We request the MAC layer
3210 * to poll our capabilities again.
3211 */
3212 mac_capab_update(xnfp->xnf_mh);
3213
3214 break;
3215
3216 case XenbusStateConnected:
3217 mutex_enter(&xnfp->xnf_rxlock);
3218 mutex_enter(&xnfp->xnf_txlock);
3219
3220 xnfp->xnf_connected = B_TRUE;
3221 /*
3222 * Wake up any threads waiting to send data to
3223 * backend.
3224 */
3225 cv_broadcast(&xnfp->xnf_cv_state);
3226
3227 mutex_exit(&xnfp->xnf_txlock);
3228 mutex_exit(&xnfp->xnf_rxlock);
3229
3230 /*
3231 * Kick the peer in case it missed any transmit
3232 * requests in the TX ring.
3233 */
3234 ec_notify_via_evtchn(xnfp->xnf_evtchn);
3235
3236 /*
3237 * There may already be completed receive requests in
3238 * the ring sent by backend after it gets connected
3239 * but before we see its state change here, so we call
3240 * xnf_intr() to handle them, if any.
3241 */
3242 (void) xnf_intr((caddr_t)xnfp);
3243
3244 /*
3245 * Mark the link up now that we are connected.
3246 */
3247 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3248
3249 /*
3250 * Tell the backend about the multicast addresses in
3251 * which we are interested.
3252 */
3253 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3254
3255 break;
3256
3257 default:
3258 break;
3259 }
3260 }
3261