xref: /illumos-gate/usr/src/uts/common/xen/io/xnf.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  * Copyright 2020 RackTop Systems, Inc.
30  */
31 
32 /*
33  *
34  * Copyright (c) 2004 Christian Limpach.
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. This section intentionally left blank.
46  * 4. The name of the author may not be used to endorse or promote products
47  *    derived from this software without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
50  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
53  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
54  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
55  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
56  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
57  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
58  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59  */
60 /*
61  * Section 3 of the above license was updated in response to bug 6379571.
62  */
63 
64 /*
65  * xnf.c - GLDv3 network driver for domU.
66  */
67 
68 /*
69  * This driver uses four per-instance locks:
70  *
71  * xnf_gref_lock:
72  *
73  *    Protects access to the grant reference list stored in
74  *    xnf_gref_head. Grant references should be acquired and released
75  *    using gref_get() and gref_put() respectively.
76  *
77  * xnf_schedlock:
78  *
79  *    Protects:
80  *    xnf_need_sched - used to record that a previous transmit attempt
81  *       failed (and consequently it will be necessary to call
82  *       mac_tx_update() when transmit resources are available).
83  *    xnf_pending_multicast - the number of multicast requests that
84  *       have been submitted to the backend for which we have not
85  *       processed responses.
86  *
87  * xnf_txlock:
88  *
89  *    Protects the transmit ring (xnf_tx_ring) and associated
90  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
91  *
92  * xnf_rxlock:
93  *
94  *    Protects the receive ring (xnf_rx_ring) and associated
95  *    structures (notably xnf_rx_pkt_info).
96  *
97  * If driver-global state that affects both the transmit and receive
98  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
99  * held, in that order.
100  *
101  * xnf_schedlock is acquired both whilst holding xnf_txlock and
102  * without. It should always be acquired after xnf_txlock if both are
103  * held.
104  *
105  * Notes:
106  * - atomic_add_64() is used to manipulate counters where we require
107  *   accuracy. For counters intended only for observation by humans,
108  *   post increment/decrement are used instead.
109  */
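/*
 * Illustrative sketch only (not part of the original source): per the
 * ordering described above, a path that manipulates the scheduling
 * state while transmitting would nest the locks like this:
 *
 *	mutex_enter(&xnfp->xnf_txlock);
 *	mutex_enter(&xnfp->xnf_schedlock);
 *	xnfp->xnf_need_sched = B_TRUE;
 *	mutex_exit(&xnfp->xnf_schedlock);
 *	...
 *	mutex_exit(&xnfp->xnf_txlock);
 */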
110 
111 #include <sys/types.h>
112 #include <sys/errno.h>
113 #include <sys/param.h>
114 #include <sys/sysmacros.h>
115 #include <sys/systm.h>
116 #include <sys/stream.h>
117 #include <sys/strsubr.h>
118 #include <sys/strsun.h>
119 #include <sys/conf.h>
120 #include <sys/ddi.h>
121 #include <sys/devops.h>
122 #include <sys/sunddi.h>
123 #include <sys/sunndi.h>
124 #include <sys/dlpi.h>
125 #include <sys/ethernet.h>
126 #include <sys/strsun.h>
127 #include <sys/pattr.h>
128 #include <inet/ip.h>
129 #include <inet/ip_impl.h>
130 #include <inet/tcp.h>
131 #include <netinet/udp.h>
132 #include <sys/gld.h>
133 #include <sys/modctl.h>
134 #include <sys/mac_provider.h>
135 #include <sys/mac_ether.h>
136 #include <sys/bootinfo.h>
137 #include <sys/mach_mmu.h>
138 #ifdef	XPV_HVM_DRIVER
139 #include <sys/xpv_support.h>
140 #include <sys/hypervisor.h>
141 #else
142 #include <sys/hypervisor.h>
143 #include <sys/evtchn_impl.h>
144 #include <sys/balloon_impl.h>
145 #endif
146 #include <xen/public/io/netif.h>
147 #include <sys/gnttab.h>
148 #include <xen/sys/xendev.h>
149 #include <sys/sdt.h>
150 #include <sys/note.h>
151 #include <sys/debug.h>
152 
153 #include <io/xnf.h>
154 
155 /*
156  * On a 32 bit PAE system physical and machine addresses are larger
157  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
158  * argument, and so addresses above 4G are truncated before ddi_btop()
159  * gets to see them.  To avoid this, code the shift operation here.
160  */
161 #define	xnf_btop(addr)	((addr) >> PAGESHIFT)
162 
163 /*
164  * The parameters below should only be changed in /etc/system, never in mdb.
165  */
166 
167 /*
168  * Should we use the multicast control feature if the backend provides
169  * it?
170  */
171 boolean_t xnf_multicast_control = B_TRUE;
172 
173 /*
174  * Should we allow scatter-gather for tx if backend allows it?
175  */
176 boolean_t xnf_enable_tx_sg = B_TRUE;
177 
178 /*
179  * Should we allow scatter-gather for rx if backend allows it?
180  */
181 boolean_t xnf_enable_rx_sg = B_TRUE;
182 
183 /*
184  * Should we allow lso for tx sends if backend allows it?
185  * Requires xnf_enable_tx_sg to be also set to TRUE.
186  */
187 boolean_t xnf_enable_lso = B_TRUE;
188 
189 /*
190  * Should we allow lro on rx if backend supports it?
191  * Requires xnf_enable_rx_sg to be also set to TRUE.
192  *
193  * !! WARNING !!
194  * LRO is not yet supported in the OS so this should be left as FALSE.
195  * !! WARNING !!
196  */
197 boolean_t xnf_enable_lro = B_FALSE;
198 
199 /*
200  * Received packets below this size are copied to a new streams buffer
201  * rather than being desballoc'ed.
202  *
203  * This value is chosen to accommodate traffic where there are a large
204  * number of small packets. For data showing a typical distribution,
205  * see:
206  *
207  * Sinha07a:
208  *	Rishi Sinha, Christos Papadopoulos, and John
209  *	Heidemann. Internet Packet Size Distributions: Some
210  *	Observations. Technical Report ISI-TR-2007-643,
211  *	USC/Information Sciences Institute, May, 2007. Originally
212  *	released October 2005 as web page
213  *	http://netweb.usc.edu/~sinha/pkt-sizes/.
214  *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
215  */
216 size_t xnf_rx_copy_limit = 64;
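/*
 * Illustrative note (not part of the original source): as stated above,
 * these tunables are intended to be set from /etc/system, e.g. a line
 * such as the following (hypothetical value) disables LSO:
 *
 *	set xnf:xnf_enable_lso = 0
 */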
217 
218 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
219 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
220 #define	INVALID_TX_ID		((uint16_t)-1)
221 
222 #define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
223 #define	TX_ID_VALID(i) \
224 	(((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
225 
226 /*
227  * calculate how many pages are spanned by an mblk fragment
228  */
229 #define	xnf_mblk_pages(mp)	(MBLKL(mp) == 0 ? 0 : \
230     xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
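/*
 * Worked example (illustrative, not part of the original source): if an
 * mblk's b_rptr sits at offset 0xf00 within a page and MBLKL(mp) is
 * 0x300, then b_wptr - 1 falls in the following page, so
 * xnf_mblk_pages() yields (1 - 0 + 1) = 2 pages even though the
 * fragment is much smaller than PAGESIZE.
 */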
231 
232 /* Required system entry points */
233 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
234 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
235 
236 /* Required driver entry points for Nemo */
237 static int	xnf_start(void *);
238 static void	xnf_stop(void *);
239 static int	xnf_set_mac_addr(void *, const uint8_t *);
240 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
241 static int	xnf_set_promiscuous(void *, boolean_t);
242 static mblk_t	*xnf_send(void *, mblk_t *);
243 static uint_t	xnf_intr(caddr_t);
244 static int	xnf_stat(void *, uint_t, uint64_t *);
245 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
246 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
247 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
248     const void *);
249 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
250     mac_prop_info_handle_t);
251 
252 /* Driver private functions */
253 static int xnf_alloc_dma_resources(xnf_t *);
254 static void xnf_release_dma_resources(xnf_t *);
255 static void xnf_release_mblks(xnf_t *);
256 
257 static int xnf_buf_constructor(void *, void *, int);
258 static void xnf_buf_destructor(void *, void *);
259 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
260 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
261 static void xnf_buf_refresh(xnf_buf_t *);
262 static void xnf_buf_recycle(xnf_buf_t *);
263 
264 static int xnf_tx_buf_constructor(void *, void *, int);
265 static void xnf_tx_buf_destructor(void *, void *);
266 
267 static grant_ref_t xnf_gref_get(xnf_t *);
268 static void xnf_gref_put(xnf_t *, grant_ref_t);
269 
270 static xnf_txid_t *xnf_txid_get(xnf_t *);
271 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
272 
273 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
274 static int xnf_tx_clean_ring(xnf_t  *);
275 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
276     void *, void *);
277 static boolean_t xnf_kstat_init(xnf_t *);
278 static void xnf_rx_collect(xnf_t *);
279 
280 #define	XNF_CALLBACK_FLAGS	(MC_GETCAPAB | MC_PROPERTIES)
281 
282 static mac_callbacks_t xnf_callbacks = {
283 	.mc_callbacks = XNF_CALLBACK_FLAGS,
284 	.mc_getstat = xnf_stat,
285 	.mc_start = xnf_start,
286 	.mc_stop = xnf_stop,
287 	.mc_setpromisc = xnf_set_promiscuous,
288 	.mc_multicst = xnf_set_multicast,
289 	.mc_unicst = xnf_set_mac_addr,
290 	.mc_tx = xnf_send,
291 	.mc_getcapab = xnf_getcapab,
292 	.mc_setprop = xnf_setprop,
293 	.mc_getprop = xnf_getprop,
294 	.mc_propinfo = xnf_propinfo,
295 };
296 
297 /* DMA attributes for network ring buffer */
298 static ddi_dma_attr_t ringbuf_dma_attr = {
299 	.dma_attr_version = DMA_ATTR_V0,
300 	.dma_attr_addr_lo = 0,
301 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
302 	.dma_attr_count_max = 0x7fffffff,
303 	.dma_attr_align = MMU_PAGESIZE,
304 	.dma_attr_burstsizes = 0x7ff,
305 	.dma_attr_minxfer = 1,
306 	.dma_attr_maxxfer = 0xffffffffU,
307 	.dma_attr_seg = 0xffffffffffffffffULL,
308 	.dma_attr_sgllen = 1,
309 	.dma_attr_granular = 1,
310 	.dma_attr_flags = 0
311 };
312 
313 /* DMA attributes for receive data */
314 static ddi_dma_attr_t rx_buf_dma_attr = {
315 	.dma_attr_version = DMA_ATTR_V0,
316 	.dma_attr_addr_lo = 0,
317 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
318 	.dma_attr_count_max = MMU_PAGEOFFSET,
319 	.dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
320 	.dma_attr_burstsizes = 0x7ff,
321 	.dma_attr_minxfer = 1,
322 	.dma_attr_maxxfer = 0xffffffffU,
323 	.dma_attr_seg = 0xffffffffffffffffULL,
324 	.dma_attr_sgllen = 1,
325 	.dma_attr_granular = 1,
326 	.dma_attr_flags = 0
327 };
328 
329 /* DMA attributes for transmit data */
330 static ddi_dma_attr_t tx_buf_dma_attr = {
331 	.dma_attr_version = DMA_ATTR_V0,
332 	.dma_attr_addr_lo = 0,
333 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
334 	.dma_attr_count_max = MMU_PAGEOFFSET,
335 	.dma_attr_align = 1,
336 	.dma_attr_burstsizes = 0x7ff,
337 	.dma_attr_minxfer = 1,
338 	.dma_attr_maxxfer = 0xffffffffU,
339 	.dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
340 	.dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
341 	.dma_attr_granular = 1,
342 	.dma_attr_flags = 0
343 };
344 
345 /* DMA access attributes for registers and descriptors */
346 static ddi_device_acc_attr_t accattr = {
347 	DDI_DEVICE_ATTR_V0,
348 	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
349 	DDI_STRICTORDER_ACC
350 };
351 
352 /* DMA access attributes for data: NOT to be byte swapped. */
353 static ddi_device_acc_attr_t data_accattr = {
354 	DDI_DEVICE_ATTR_V0,
355 	DDI_NEVERSWAP_ACC,
356 	DDI_STRICTORDER_ACC
357 };
358 
359 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
360     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
361 
362 static struct modldrv xnf_modldrv = {
363 	&mod_driverops,
364 	"Virtual Ethernet driver",
365 	&xnf_dev_ops
366 };
367 
368 static struct modlinkage modlinkage = {
369 	MODREV_1, &xnf_modldrv, NULL
370 };
371 
372 int
373 _init(void)
374 {
375 	int r;
376 
377 	mac_init_ops(&xnf_dev_ops, "xnf");
378 	r = mod_install(&modlinkage);
379 	if (r != DDI_SUCCESS)
380 		mac_fini_ops(&xnf_dev_ops);
381 
382 	return (r);
383 }
384 
385 int
386 _fini(void)
387 {
388 	return (EBUSY); /* XXPV should be removable */
389 }
390 
391 int
392 _info(struct modinfo *modinfop)
393 {
394 	return (mod_info(&modlinkage, modinfop));
395 }
396 
397 /*
398  * Acquire a grant reference.
399  */
400 static grant_ref_t
401 xnf_gref_get(xnf_t *xnfp)
402 {
403 	grant_ref_t gref;
404 
405 	mutex_enter(&xnfp->xnf_gref_lock);
406 
407 	do {
408 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
409 
410 	} while ((gref == INVALID_GRANT_REF) &&
411 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
412 
413 	mutex_exit(&xnfp->xnf_gref_lock);
414 
415 	if (gref == INVALID_GRANT_REF) {
416 		xnfp->xnf_stat_gref_failure++;
417 	} else {
418 		atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
419 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
420 			xnfp->xnf_stat_gref_peak =
421 			    xnfp->xnf_stat_gref_outstanding;
422 	}
423 
424 	return (gref);
425 }
426 
427 /*
428  * Release a grant reference.
429  */
430 static void
431 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
432 {
433 	ASSERT(gref != INVALID_GRANT_REF);
434 
435 	mutex_enter(&xnfp->xnf_gref_lock);
436 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
437 	mutex_exit(&xnfp->xnf_gref_lock);
438 
439 	atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
440 }
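/*
 * Illustrative sketch (not part of the original source) of the grant
 * reference life cycle as used by the transmit path further below:
 *
 *	grant_ref_t gref = xnf_gref_get(xnfp);
 *	if (gref == INVALID_GRANT_REF)
 *		...				(back off and retry later)
 *	gnttab_grant_foreign_access_ref(gref, oeid, mfn, 1);
 *	...					(backend reads the page)
 *	(void) gnttab_end_foreign_access_ref(gref, 1);
 *	xnf_gref_put(xnfp, gref);
 */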
441 
442 /*
443  * Acquire a transmit id.
444  */
445 static xnf_txid_t *
446 xnf_txid_get(xnf_t *xnfp)
447 {
448 	xnf_txid_t *tidp;
449 
450 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
451 
452 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
453 		return (NULL);
454 
455 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
456 
457 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
458 	xnfp->xnf_tx_pkt_id_head = tidp->next;
459 	tidp->next = INVALID_TX_ID;
460 
461 	ASSERT(tidp->txbuf == NULL);
462 
463 	return (tidp);
464 }
465 
466 /*
467  * Release a transmit id.
468  */
469 static void
470 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
471 {
472 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
473 	ASSERT(TX_ID_VALID(tidp->id));
474 	ASSERT(tidp->next == INVALID_TX_ID);
475 
476 	tidp->txbuf = NULL;
477 	tidp->next = xnfp->xnf_tx_pkt_id_head;
478 	xnfp->xnf_tx_pkt_id_head = tidp->id;
479 }
480 
481 static void
482 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
483 {
484 	ASSERT3U(txp->tx_type, ==, TX_DATA);
485 
486 	/*
487 	 * We are either using a lookaside buffer or we are mapping existing
488 	 * buffers.
489 	 */
490 	if (txp->tx_bdesc != NULL) {
491 		ASSERT(!txp->tx_handle_bound);
492 		xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
493 	} else {
494 		if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
495 			if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
496 			    0) {
497 				cmn_err(CE_PANIC, "tx grant %d still in use by "
498 				    "backend domain", txp->tx_txreq.gref);
499 			}
500 			(void) gnttab_end_foreign_access_ref(
501 			    txp->tx_txreq.gref, 1);
502 			xnf_gref_put(xnfp, txp->tx_txreq.gref);
503 		}
504 
505 		if (txp->tx_handle_bound)
506 			(void) ddi_dma_unbind_handle(txp->tx_dma_handle);
507 	}
508 
509 	if (txp->tx_mp != NULL)
510 		freemsg(txp->tx_mp);
511 
512 	if (txp->tx_prev != NULL) {
513 		ASSERT3P(txp->tx_prev->tx_next, ==, txp);
514 		txp->tx_prev->tx_next = NULL;
515 	}
516 
517 	if (txp->tx_txreq.id != INVALID_TX_ID) {
518 		/*
519 		 * This should be only possible when resuming from a suspend.
520 		 */
521 		ASSERT(!xnfp->xnf_connected);
522 		xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
523 		txp->tx_txreq.id = INVALID_TX_ID;
524 	}
525 
526 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
527 }
528 
529 static void
530 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
531 {
532 	if (txp == NULL)
533 		return;
534 
535 	while (txp->tx_next != NULL)
536 		txp = txp->tx_next;
537 
538 	/*
539 	 * We free the chain in reverse order so that grants can be released
540 	 * for all dma chunks before unbinding the dma handles. The mblk is
541 	 * freed last, after all its fragments' dma handles are unbound.
542 	 */
543 	xnf_txbuf_t *prev;
544 	for (; txp != NULL; txp = prev) {
545 		prev = txp->tx_prev;
546 		xnf_data_txbuf_free(xnfp, txp);
547 	}
548 }
549 
550 static xnf_txbuf_t *
551 xnf_data_txbuf_alloc(xnf_t *xnfp, int flag)
552 {
553 	xnf_txbuf_t *txp;
554 
555 	if ((txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, flag)) == NULL) {
556 		return (NULL);
557 	}
558 
559 	txp->tx_type = TX_DATA;
560 	txp->tx_next = NULL;
561 	txp->tx_prev = NULL;
562 	txp->tx_head = txp;
563 	txp->tx_frags_to_ack = 0;
564 	txp->tx_mp = NULL;
565 	txp->tx_bdesc = NULL;
566 	txp->tx_handle_bound = B_FALSE;
567 	txp->tx_txreq.gref = INVALID_GRANT_REF;
568 	txp->tx_txreq.id = INVALID_TX_ID;
569 
570 	return (txp);
571 }
572 
573 /*
574  * Get `wanted' slots in the transmit ring, waiting for at least that
575  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
576  * `wanted' to zero.
577  *
578  * Return the number of slots available.
579  */
580 static int
581 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
582 {
583 	int slotsfree;
584 	boolean_t forced_clean = (wanted == 0);
585 
586 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
587 
588 	/* LINTED: constant in conditional context */
589 	while (B_TRUE) {
590 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
591 
592 		if ((slotsfree < wanted) || forced_clean)
593 			slotsfree = xnf_tx_clean_ring(xnfp);
594 
595 		/*
596 		 * If there are more than we need free, tell other
597 		 * people to come looking again. We hold txlock, so we
598 		 * are able to take our slots before anyone else runs.
599 		 */
600 		if (slotsfree > wanted)
601 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
602 
603 		if (slotsfree >= wanted)
604 			break;
605 
606 		if (!wait)
607 			break;
608 
609 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
610 	}
611 
612 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
613 
614 	return (slotsfree);
615 }
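/*
 * Illustrative calls (not part of the original source) matching the
 * contract described above xnf_tx_slots_get():
 *
 *	n = xnf_tx_slots_get(xnfp, 2, B_TRUE);		(wait for 2 slots)
 *	(void) xnf_tx_slots_get(xnfp, 0, B_FALSE);	(just clean the ring)
 */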
616 
617 static int
618 xnf_setup_rings(xnf_t *xnfp)
619 {
620 	domid_t			oeid;
621 	struct xenbus_device	*xsd;
622 	RING_IDX		i;
623 	int			err;
624 	xnf_txid_t		*tidp;
625 	xnf_buf_t **bdescp;
626 
627 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
628 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
629 
630 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
631 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
632 
633 	err = gnttab_grant_foreign_access(oeid,
634 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
635 	if (err <= 0) {
636 		err = -err;
637 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
638 		goto out;
639 	}
640 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
641 
642 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
643 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
644 
645 	err = gnttab_grant_foreign_access(oeid,
646 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
647 	if (err <= 0) {
648 		err = -err;
649 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
650 		goto out;
651 	}
652 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
653 
654 	mutex_enter(&xnfp->xnf_txlock);
655 
656 	/*
657 	 * We first cleanup the TX ring in case we are doing a resume.
658 	 * Note that this can lose packets, but we expect to stagger on.
659 	 */
660 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
661 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
662 	    i < NET_TX_RING_SIZE;
663 	    i++, tidp++) {
664 		xnf_txbuf_t *txp = tidp->txbuf;
665 		if (txp == NULL)
666 			continue;
667 
668 		switch (txp->tx_type) {
669 		case TX_DATA:
670 			/*
671 			 * txid_put() will be called for each txbuf's txid in
672 			 * the chain which will result in clearing tidp->txbuf.
673 			 */
674 			xnf_data_txbuf_free_chain(xnfp, txp);
675 
676 			break;
677 
678 		case TX_MCAST_REQ:
679 			txp->tx_type = TX_MCAST_RSP;
680 			txp->tx_status = NETIF_RSP_DROPPED;
681 			cv_broadcast(&xnfp->xnf_cv_multicast);
682 
683 			/*
684 			 * The request consumed two slots in the ring,
685 			 * yet only a single xnf_txid_t is used. Step
686 			 * over the empty slot.
687 			 */
688 			i++;
689 			ASSERT3U(i, <, NET_TX_RING_SIZE);
690 			break;
691 
692 		case TX_MCAST_RSP:
693 			break;
694 		}
695 	}
696 
697 	/*
698 	 * Now purge old list and add each txid to the new free list.
699 	 */
700 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
701 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
702 	    i < NET_TX_RING_SIZE;
703 	    i++, tidp++) {
704 		tidp->id = i;
705 		ASSERT3P(tidp->txbuf, ==, NULL);
706 		tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
707 		xnf_txid_put(xnfp, tidp);
708 	}
709 
710 	/* LINTED: constant in conditional context */
711 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
712 	/* LINTED: constant in conditional context */
713 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
714 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
715 
716 	mutex_exit(&xnfp->xnf_txlock);
717 
718 	mutex_enter(&xnfp->xnf_rxlock);
719 
720 	/*
721 	 * Clean out any buffers currently posted to the receive ring
722 	 * before we reset it.
723 	 */
724 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
725 	    i < NET_RX_RING_SIZE;
726 	    i++, bdescp++) {
727 		if (*bdescp != NULL) {
728 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
729 			*bdescp = NULL;
730 		}
731 	}
732 
733 	/* LINTED: constant in conditional context */
734 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
735 	/* LINTED: constant in conditional context */
736 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
737 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
738 
739 	/*
740 	 * Fill the ring with buffers.
741 	 */
742 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
743 		xnf_buf_t *bdesc;
744 
745 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
746 		VERIFY(bdesc != NULL);
747 		xnf_rxbuf_hang(xnfp, bdesc);
748 	}
749 
750 	/* LINTED: constant in conditional context */
751 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
752 
753 	mutex_exit(&xnfp->xnf_rxlock);
754 
755 	return (0);
756 
757 out:
758 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
759 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
760 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
761 
762 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
763 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
764 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
765 
766 	return (err);
767 }
768 
769 /*
770  * Connect driver to back end, called to set up communication with
771  * back end driver both initially and on resume after restore/migrate.
772  */
773 void
774 xnf_be_connect(xnf_t *xnfp)
775 {
776 	const char	*message;
777 	xenbus_transaction_t xbt;
778 	struct		xenbus_device *xsd;
779 	char		*xsname;
780 	int		err;
781 
782 	ASSERT(!xnfp->xnf_connected);
783 
784 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
785 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
786 
787 	err = xnf_setup_rings(xnfp);
788 	if (err != 0) {
789 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
790 		xenbus_dev_error(xsd, err, "setting up ring");
791 		return;
792 	}
793 
794 again:
795 	err = xenbus_transaction_start(&xbt);
796 	if (err != 0) {
797 		xenbus_dev_error(xsd, EIO, "starting transaction");
798 		return;
799 	}
800 
801 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
802 	    xnfp->xnf_tx_ring_ref);
803 	if (err != 0) {
804 		message = "writing tx ring-ref";
805 		goto abort_transaction;
806 	}
807 
808 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
809 	    xnfp->xnf_rx_ring_ref);
810 	if (err != 0) {
811 		message = "writing rx ring-ref";
812 		goto abort_transaction;
813 	}
814 
815 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
816 	    xnfp->xnf_evtchn);
817 	if (err != 0) {
818 		message = "writing event-channel";
819 		goto abort_transaction;
820 	}
821 
822 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
823 	if (err != 0) {
824 		message = "writing feature-rx-notify";
825 		goto abort_transaction;
826 	}
827 
828 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
829 	if (err != 0) {
830 		message = "writing request-rx-copy";
831 		goto abort_transaction;
832 	}
833 
834 	if (xnfp->xnf_be_mcast_control) {
835 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
836 		    "%d", 1);
837 		if (err != 0) {
838 			message = "writing request-multicast-control";
839 			goto abort_transaction;
840 		}
841 	}
842 
843 	/*
844 	 * Tell backend if we support scatter-gather lists on the rx side.
845 	 */
846 	err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
847 	    xnf_enable_rx_sg ? 1 : 0);
848 	if (err != 0) {
849 		message = "writing feature-sg";
850 		goto abort_transaction;
851 	}
852 
853 	/*
854 	 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
855 	 * a prerequisite.
856 	 */
857 	err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
858 	    (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
859 	if (err != 0) {
860 		message = "writing feature-gso-tcpv4";
861 		goto abort_transaction;
862 	}
863 
864 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
865 	if (err != 0) {
866 		message = "switching state to XenbusStateConnected";
867 		goto abort_transaction;
868 	}
869 
870 	err = xenbus_transaction_end(xbt, 0);
871 	if (err != 0) {
872 		if (err == EAGAIN)
873 			goto again;
874 		xenbus_dev_error(xsd, err, "completing transaction");
875 	}
876 
877 	return;
878 
879 abort_transaction:
880 	(void) xenbus_transaction_end(xbt, 1);
881 	xenbus_dev_error(xsd, err, "%s", message);
882 }
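/*
 * Illustrative sketch (not part of the original source): after the
 * transaction above commits, the frontend's xenstore directory carries
 * entries along these lines (values hypothetical):
 *
 *	tx-ring-ref = "8"
 *	rx-ring-ref = "9"
 *	event-channel = "15"
 *	feature-rx-notify = "1"
 *	request-rx-copy = "1"
 *	feature-sg = "1"
 *	feature-gso-tcpv4 = "0"
 */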
883 
884 /*
885  * Read configuration information from xenstore.
886  */
887 void
888 xnf_read_config(xnf_t *xnfp)
889 {
890 	int err, be_cap;
891 	char mac[ETHERADDRL * 3];
892 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
893 
894 	err = xenbus_scanf(XBT_NULL, oename, "mac",
895 	    "%s", (char *)&mac[0]);
896 	if (err != 0) {
897 		/*
898 		 * Bad: we're supposed to have been set up with a proper
899 		 * MAC address at this point.
900 		 */
901 		cmn_err(CE_WARN, "%s%d: no mac address",
902 		    ddi_driver_name(xnfp->xnf_devinfo),
903 		    ddi_get_instance(xnfp->xnf_devinfo));
904 		return;
905 	}
906 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
907 		err = ENOENT;
908 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
909 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
910 		return;
911 	}
912 
913 	err = xenbus_scanf(XBT_NULL, oename,
914 	    "feature-rx-copy", "%d", &be_cap);
915 	/*
916 	 * If we fail to read the store we assume that the key is
917 	 * absent, implying an older domain at the far end.  Older
918 	 * domains cannot do HV copy.
919 	 */
920 	if (err != 0)
921 		be_cap = 0;
922 	xnfp->xnf_be_rx_copy = (be_cap != 0);
923 
924 	err = xenbus_scanf(XBT_NULL, oename,
925 	    "feature-multicast-control", "%d", &be_cap);
926 	/*
927 	 * If we fail to read the store we assume that the key is
928 	 * absent, implying an older domain at the far end.  Older
929 	 * domains do not support multicast control.
930 	 */
931 	if (err != 0)
932 		be_cap = 0;
933 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
934 
935 	/*
936 	 * See if back-end supports scatter-gather for transmits. If not,
937 	 * we will not support LSO and limit the mtu to 1500.
938 	 */
939 	err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
940 	if (err != 0) {
941 		be_cap = 0;
942 		dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
943 		    "'feature-sg' from backend driver");
944 	}
945 	if (be_cap == 0) {
946 		dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
947 		    "supported for transmits in the backend driver. LSO is "
948 		    "disabled and MTU is restricted to 1500 bytes.");
949 	}
950 	xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
951 
952 	if (xnfp->xnf_be_tx_sg) {
953 		/*
954 		 * Check if LSO is supported. Currently we only check for
955 		 * IPv4 as Illumos doesn't support LSO for IPv6.
956 		 */
957 		err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
958 		    &be_cap);
959 		if (err != 0) {
960 			be_cap = 0;
961 			dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
962 			    "'feature-gso-tcpv4' from backend driver");
963 		}
964 		if (be_cap == 0) {
965 			dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
966 			    "supported by the backend driver. Performance "
967 			    "will be affected.");
968 		}
969 		xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
970 	}
971 }
972 
973 /*
974  *  attach(9E) -- Attach a device to the system
975  */
976 static int
977 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
978 {
979 	mac_register_t *macp;
980 	xnf_t *xnfp;
981 	int err;
982 	char cachename[32];
983 
984 	switch (cmd) {
985 	case DDI_RESUME:
986 		xnfp = ddi_get_driver_private(devinfo);
987 		xnfp->xnf_gen++;
988 
989 		(void) xvdi_resume(devinfo);
990 		(void) xvdi_alloc_evtchn(devinfo);
991 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
992 #ifdef XPV_HVM_DRIVER
993 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
994 		    xnfp);
995 #else
996 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
997 		    (caddr_t)xnfp);
998 #endif
999 		return (DDI_SUCCESS);
1000 
1001 	case DDI_ATTACH:
1002 		break;
1003 
1004 	default:
1005 		return (DDI_FAILURE);
1006 	}
1007 
1008 	/*
1009 	 *  Allocate gld_mac_info_t and xnf_instance structures
1010 	 */
1011 	macp = mac_alloc(MAC_VERSION);
1012 	if (macp == NULL)
1013 		return (DDI_FAILURE);
1014 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1015 
1016 	xnfp->xnf_tx_pkt_id =
1017 	    kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1018 
1019 	xnfp->xnf_rx_pkt_info =
1020 	    kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1021 
1022 	macp->m_dip = devinfo;
1023 	macp->m_driver = xnfp;
1024 	xnfp->xnf_devinfo = devinfo;
1025 
1026 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1027 	macp->m_src_addr = xnfp->xnf_mac_addr;
1028 	macp->m_callbacks = &xnf_callbacks;
1029 	macp->m_min_sdu = 0;
1030 	xnfp->xnf_mtu = ETHERMTU;
1031 	macp->m_max_sdu = xnfp->xnf_mtu;
1032 
1033 	xnfp->xnf_running = B_FALSE;
1034 	xnfp->xnf_connected = B_FALSE;
1035 	xnfp->xnf_be_rx_copy = B_FALSE;
1036 	xnfp->xnf_be_mcast_control = B_FALSE;
1037 	xnfp->xnf_need_sched = B_FALSE;
1038 
1039 	xnfp->xnf_rx_head = NULL;
1040 	xnfp->xnf_rx_tail = NULL;
1041 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1042 
1043 #ifdef XPV_HVM_DRIVER
1044 	/* Report our version to dom0 */
1045 	(void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1046 	    HVMPV_XNF_VERS);
1047 #endif
1048 
1049 	/*
1050 	 * Get the iblock cookie with which to initialize the mutexes.
1051 	 */
1052 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1053 	    != DDI_SUCCESS)
1054 		goto failure;
1055 
1056 	mutex_init(&xnfp->xnf_txlock,
1057 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1058 	mutex_init(&xnfp->xnf_rxlock,
1059 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1060 	mutex_init(&xnfp->xnf_schedlock,
1061 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1062 	mutex_init(&xnfp->xnf_gref_lock,
1063 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1064 
1065 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1066 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1067 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1068 
1069 	(void) sprintf(cachename, "xnf_buf_cache_%d",
1070 	    ddi_get_instance(devinfo));
1071 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1072 	    sizeof (xnf_buf_t), 0,
1073 	    xnf_buf_constructor, xnf_buf_destructor,
1074 	    NULL, xnfp, NULL, 0);
1075 	if (xnfp->xnf_buf_cache == NULL)
1076 		goto failure_0;
1077 
1078 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1079 	    ddi_get_instance(devinfo));
1080 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1081 	    sizeof (xnf_txbuf_t), 0,
1082 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1083 	    NULL, xnfp, NULL, 0);
1084 	if (xnfp->xnf_tx_buf_cache == NULL)
1085 		goto failure_1;
1086 
1087 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
1088 
1089 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1090 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1091 		    "driver data structures",
1092 		    ddi_get_instance(xnfp->xnf_devinfo));
1093 		goto failure_2;
1094 	}
1095 
1096 	xnfp->xnf_rx_ring.sring->rsp_event =
1097 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
1098 
1099 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1100 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1101 
1102 	/* set driver private pointer now */
1103 	ddi_set_driver_private(devinfo, xnfp);
1104 
1105 	if (!xnf_kstat_init(xnfp))
1106 		goto failure_3;
1107 
1108 	/*
1109 	 * Allocate an event channel, add the interrupt handler and
1110 	 * bind it to the event channel.
1111 	 */
1112 	(void) xvdi_alloc_evtchn(devinfo);
1113 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1114 #ifdef XPV_HVM_DRIVER
1115 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1116 #else
1117 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1118 #endif
1119 
1120 	err = mac_register(macp, &xnfp->xnf_mh);
1121 	mac_free(macp);
1122 	macp = NULL;
1123 	if (err != 0)
1124 		goto failure_4;
1125 
1126 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1127 	    != DDI_SUCCESS)
1128 		goto failure_5;
1129 
1130 #ifdef XPV_HVM_DRIVER
1131 	/*
1132 	 * In the HVM case, this driver essentially replaces a driver for
1133 	 * a 'real' PCI NIC. Without the "model" property set to
1134 	 * "Ethernet controller", like the PCI code does, netbooting does
1135 	 * not work correctly, as strplumb_get_netdev_path() will not find
1136 	 * this interface.
1137 	 */
1138 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1139 	    "Ethernet controller");
1140 #endif
1141 
1142 	return (DDI_SUCCESS);
1143 
1144 failure_5:
1145 	(void) mac_unregister(xnfp->xnf_mh);
1146 
1147 failure_4:
1148 #ifdef XPV_HVM_DRIVER
1149 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1150 	xvdi_free_evtchn(devinfo);
1151 #else
1152 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1153 #endif
1154 	xnfp->xnf_evtchn = INVALID_EVTCHN;
1155 	kstat_delete(xnfp->xnf_kstat_aux);
1156 
1157 failure_3:
1158 	xnf_release_dma_resources(xnfp);
1159 
1160 failure_2:
1161 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1162 
1163 failure_1:
1164 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1165 
1166 failure_0:
1167 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1168 	cv_destroy(&xnfp->xnf_cv_multicast);
1169 	cv_destroy(&xnfp->xnf_cv_state);
1170 
1171 	mutex_destroy(&xnfp->xnf_gref_lock);
1172 	mutex_destroy(&xnfp->xnf_schedlock);
1173 	mutex_destroy(&xnfp->xnf_rxlock);
1174 	mutex_destroy(&xnfp->xnf_txlock);
1175 
1176 failure:
1177 	kmem_free(xnfp, sizeof (*xnfp));
1178 	if (macp != NULL)
1179 		mac_free(macp);
1180 
1181 	return (DDI_FAILURE);
1182 }
1183 
1184 /*  detach(9E) -- Detach a device from the system */
1185 static int
1186 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1187 {
1188 	xnf_t *xnfp;		/* Our private device info */
1189 
1190 	xnfp = ddi_get_driver_private(devinfo);
1191 
1192 	switch (cmd) {
1193 	case DDI_SUSPEND:
1194 #ifdef XPV_HVM_DRIVER
1195 		ec_unbind_evtchn(xnfp->xnf_evtchn);
1196 		xvdi_free_evtchn(devinfo);
1197 #else
1198 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1199 #endif
1200 
1201 		xvdi_suspend(devinfo);
1202 
1203 		mutex_enter(&xnfp->xnf_rxlock);
1204 		mutex_enter(&xnfp->xnf_txlock);
1205 
1206 		xnfp->xnf_evtchn = INVALID_EVTCHN;
1207 		xnfp->xnf_connected = B_FALSE;
1208 		mutex_exit(&xnfp->xnf_txlock);
1209 		mutex_exit(&xnfp->xnf_rxlock);
1210 
1211 		/* claim link to be down after disconnect */
1212 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1213 		return (DDI_SUCCESS);
1214 
1215 	case DDI_DETACH:
1216 		break;
1217 
1218 	default:
1219 		return (DDI_FAILURE);
1220 	}
1221 
1222 	if (xnfp->xnf_connected)
1223 		return (DDI_FAILURE);
1224 
1225 	/*
1226 	 * Cannot detach if we have xnf_buf_t outstanding.
1227 	 */
1228 	if (xnfp->xnf_stat_buf_allocated > 0)
1229 		return (DDI_FAILURE);
1230 
1231 	if (mac_unregister(xnfp->xnf_mh) != 0)
1232 		return (DDI_FAILURE);
1233 
1234 	kstat_delete(xnfp->xnf_kstat_aux);
1235 
1236 	/* Stop the receiver */
1237 	xnf_stop(xnfp);
1238 
1239 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1240 
1241 	/* Remove the interrupt */
1242 #ifdef XPV_HVM_DRIVER
1243 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1244 	xvdi_free_evtchn(devinfo);
1245 #else
1246 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1247 #endif
1248 
1249 	/* Release any pending xmit mblks */
1250 	xnf_release_mblks(xnfp);
1251 
1252 	/* Release all DMA resources */
1253 	xnf_release_dma_resources(xnfp);
1254 
1255 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1256 	cv_destroy(&xnfp->xnf_cv_multicast);
1257 	cv_destroy(&xnfp->xnf_cv_state);
1258 
1259 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1260 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1261 
1262 	mutex_destroy(&xnfp->xnf_gref_lock);
1263 	mutex_destroy(&xnfp->xnf_schedlock);
1264 	mutex_destroy(&xnfp->xnf_rxlock);
1265 	mutex_destroy(&xnfp->xnf_txlock);
1266 
1267 	kmem_free(xnfp, sizeof (*xnfp));
1268 
1269 	return (DDI_SUCCESS);
1270 }
1271 
1272 /*
1273  *  xnf_set_mac_addr() -- set the physical network address on the board.
1274  */
1275 static int
1276 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1277 {
1278 	_NOTE(ARGUNUSED(arg, macaddr));
1279 
1280 	/*
1281 	 * We can't set our macaddr.
1282 	 */
1283 	return (ENOTSUP);
1284 }
1285 
1286 /*
1287  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1288  *
1289  *  Program the hardware to enable/disable the multicast address
1290  *  in "mca".  Enable if "add" is true, disable if false.
1291  */
1292 static int
1293 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1294 {
1295 	xnf_t *xnfp = arg;
1296 	xnf_txbuf_t *txp;
1297 	int n_slots;
1298 	RING_IDX slot;
1299 	xnf_txid_t *tidp;
1300 	netif_tx_request_t *txrp;
1301 	struct netif_extra_info *erp;
1302 	boolean_t notify, result;
1303 
1304 	/*
1305 	 * If the backend does not support multicast control then we
1306 	 * must assume that the right packets will just arrive.
1307 	 */
1308 	if (!xnfp->xnf_be_mcast_control)
1309 		return (0);
1310 
1311 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1312 
1313 	mutex_enter(&xnfp->xnf_txlock);
1314 
1315 	/*
1316 	 * If we're not yet connected then claim success. This is
1317 	 * acceptable because we refresh the entire set of multicast
1318 	 * addresses when we get connected.
1319 	 *
1320 	 * We can't wait around here because the MAC layer expects
1321 	 * this to be a non-blocking operation - waiting ends up
1322 	 * causing a deadlock during resume.
1323 	 */
1324 	if (!xnfp->xnf_connected) {
1325 		mutex_exit(&xnfp->xnf_txlock);
1326 		return (0);
1327 	}
1328 
1329 	/*
1330 	 * 1. Acquire two slots in the ring.
1331 	 * 2. Fill in the slots.
1332 	 * 3. Request notification when the operation is done.
1333 	 * 4. Kick the peer.
1334 	 * 5. Wait for the response via xnf_tx_clean_ring().
1335 	 */
1336 
1337 	n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1338 	ASSERT(n_slots >= 2);
1339 
1340 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1341 	tidp = xnf_txid_get(xnfp);
1342 	VERIFY(tidp != NULL);
1343 
1344 	txp->tx_type = TX_MCAST_REQ;
1345 	txp->tx_slot = slot;
1346 
1347 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1348 	erp = (struct netif_extra_info *)
1349 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1350 
1351 	txrp->gref = 0;
1352 	txrp->size = 0;
1353 	txrp->offset = 0;
1354 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1355 	txrp->id = txp->tx_txreq.id = tidp->id;
1356 	txrp->flags = NETTXF_extra_info;
1357 
1358 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1359 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1360 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1361 
1362 	tidp->txbuf = txp;
1363 
1364 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1365 
1366 	mutex_enter(&xnfp->xnf_schedlock);
1367 	xnfp->xnf_pending_multicast++;
1368 	mutex_exit(&xnfp->xnf_schedlock);
1369 
1370 	/* LINTED: constant in conditional context */
1371 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1372 	    notify);
1373 	if (notify)
1374 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1375 
1376 	while (txp->tx_type == TX_MCAST_REQ)
1377 		cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1378 
1379 	ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1380 
1381 	mutex_enter(&xnfp->xnf_schedlock);
1382 	xnfp->xnf_pending_multicast--;
1383 	mutex_exit(&xnfp->xnf_schedlock);
1384 
1385 	result = (txp->tx_status == NETIF_RSP_OKAY);
1386 
1387 	xnf_txid_put(xnfp, tidp);
1388 
1389 	mutex_exit(&xnfp->xnf_txlock);
1390 
1391 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1392 
1393 	return (result ? 0 : 1);
1394 }
1395 
1396 /*
1397  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1398  *
1399  *  Program the hardware to enable/disable promiscuous mode.
1400  */
1401 static int
1402 xnf_set_promiscuous(void *arg, boolean_t on)
1403 {
1404 	_NOTE(ARGUNUSED(arg, on));
1405 
1406 	/*
1407 	 * We can't really do this, but we pretend that we can in
1408 	 * order that snoop will work.
1409 	 */
1410 	return (0);
1411 }
1412 
1413 /*
1414  * Clean buffers that we have responses for from the transmit ring.
1415  */
1416 static int
1417 xnf_tx_clean_ring(xnf_t *xnfp)
1418 {
1419 	boolean_t work_to_do;
1420 
1421 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1422 
1423 loop:
1424 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1425 		RING_IDX cons, prod, i;
1426 
1427 		cons = xnfp->xnf_tx_ring.rsp_cons;
1428 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1429 		membar_consumer();
1430 		/*
1431 		 * Clean tx requests from ring that we have responses
1432 		 * for.
1433 		 */
1434 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1435 		for (i = cons; i != prod; i++) {
1436 			netif_tx_response_t *trp;
1437 			xnf_txid_t *tidp;
1438 			xnf_txbuf_t *txp;
1439 
1440 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1441 			/*
1442 			 * if this slot was occupied by netif_extra_info_t,
1443 			 * then the response will be NETIF_RSP_NULL. In this
1444 			 * case there are no resources to clean up.
1445 			 */
1446 			if (trp->status == NETIF_RSP_NULL)
1447 				continue;
1448 
1449 			ASSERT(TX_ID_VALID(trp->id));
1450 
1451 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
1452 			ASSERT3U(tidp->id, ==, trp->id);
1453 			ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1454 
1455 			txp = tidp->txbuf;
1456 			ASSERT(txp != NULL);
1457 			ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1458 
1459 			switch (txp->tx_type) {
1460 			case TX_DATA:
1461 				/*
1462 				 * We must put the txid for each response we
1463 				 * acknowledge to make sure that we never have
1464 				 * more free slots than txids. Because of this
1465 				 * we do it here instead of waiting for it to
1466 				 * be done in xnf_data_txbuf_free_chain().
1467 				 */
1468 				xnf_txid_put(xnfp, tidp);
1469 				txp->tx_txreq.id = INVALID_TX_ID;
1470 				ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1471 				txp->tx_head->tx_frags_to_ack--;
1472 
1473 				/*
1474 				 * We clean the whole chain once we got a
1475 				 * response for each fragment.
1476 				 */
1477 				if (txp->tx_head->tx_frags_to_ack == 0)
1478 					xnf_data_txbuf_free_chain(xnfp, txp);
1479 
1480 				break;
1481 
1482 			case TX_MCAST_REQ:
1483 				txp->tx_type = TX_MCAST_RSP;
1484 				txp->tx_status = trp->status;
1485 				cv_broadcast(&xnfp->xnf_cv_multicast);
1486 
1487 				break;
1488 
1489 			default:
1490 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1491 				    "invalid xnf_txbuf_t type: %d",
1492 				    txp->tx_type);
1493 				break;
1494 			}
1495 		}
1496 		/*
1497 		 * Record the last response we dealt with so that we
1498 		 * know where to start next time around.
1499 		 */
1500 		xnfp->xnf_tx_ring.rsp_cons = prod;
1501 		membar_enter();
1502 	}
1503 
1504 	/* LINTED: constant in conditional context */
1505 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1506 	if (work_to_do)
1507 		goto loop;
1508 
1509 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1510 }
1511 
1512 /*
1513  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1514  * to ensure that the packet is physically contiguous and contained
1515  * within a single page.
1516  */
1517 static xnf_buf_t *
1518 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1519 {
1520 	xnf_buf_t *bd;
1521 	caddr_t bp;
1522 
1523 	if ((bd = xnf_buf_get(xnfp, KM_NOSLEEP, B_TRUE)) == NULL) {
1524 		return (NULL);
1525 	}
1526 
1527 	bp = bd->buf;
1528 	while (mp != NULL) {
1529 		size_t len = MBLKL(mp);
1530 
1531 		bcopy(mp->b_rptr, bp, len);
1532 		bp += len;
1533 
1534 		mp = mp->b_cont;
1535 	}
1536 
1537 	*plen = bp - bd->buf;
1538 	ASSERT3U(*plen, <=, PAGESIZE);
1539 
1540 	xnfp->xnf_stat_tx_lookaside++;
1541 
1542 	return (bd);
1543 }
1544 
1545 /*
1546  * Insert the pseudo-header checksum into the packet.
1547  * Assumes packet is IPv4, TCP/UDP since we only advertised support for
1548  * HCKSUM_INET_FULL_V4.
1549  */
1550 int
1551 xnf_pseudo_cksum(mblk_t *mp)
1552 {
1553 	struct ether_header *ehp;
1554 	uint16_t sap, iplen, *stuff;
1555 	uint32_t cksum;
1556 	size_t len;
1557 	ipha_t *ipha;
1558 	ipaddr_t src, dst;
1559 	uchar_t *ptr;
1560 
1561 	ptr = mp->b_rptr;
1562 	len = MBLKL(mp);
1563 
1564 	/* Each header must fit completely in an mblk. */
1565 	ASSERT3U(len, >=, sizeof (*ehp));
1566 
1567 	ehp = (struct ether_header *)ptr;
1568 
1569 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1570 		struct ether_vlan_header *evhp;
1571 		ASSERT3U(len, >=, sizeof (*evhp));
1572 		evhp = (struct ether_vlan_header *)ptr;
1573 		sap = ntohs(evhp->ether_type);
1574 		ptr += sizeof (*evhp);
1575 		len -= sizeof (*evhp);
1576 	} else {
1577 		sap = ntohs(ehp->ether_type);
1578 		ptr += sizeof (*ehp);
1579 		len -= sizeof (*ehp);
1580 	}
1581 
1582 	ASSERT3U(sap, ==, ETHERTYPE_IP);
1583 
1584 	/*
1585 	 * Ethernet and IP headers may be in different mblks.
1586 	 */
1587 	ASSERT3P(ptr, <=, mp->b_wptr);
1588 	if (ptr == mp->b_wptr) {
1589 		mp = mp->b_cont;
1590 		ptr = mp->b_rptr;
1591 		len = MBLKL(mp);
1592 	}
1593 
1594 	ASSERT3U(len, >=, sizeof (ipha_t));
1595 	ipha = (ipha_t *)ptr;
1596 
1597 	/*
1598 	 * We assume the IP header has no options. (This is enforced in
1599 	 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1600 	 */
1601 	ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1602 	iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1603 
1604 	ptr += IP_SIMPLE_HDR_LENGTH;
1605 	len -= IP_SIMPLE_HDR_LENGTH;
1606 
1607 	/*
1608 	 * IP and L4 headers may be in different mblks.
1609 	 */
1610 	ASSERT3P(ptr, <=, mp->b_wptr);
1611 	if (ptr == mp->b_wptr) {
1612 		mp = mp->b_cont;
1613 		ptr = mp->b_rptr;
1614 		len = MBLKL(mp);
1615 	}
1616 
1617 	switch (ipha->ipha_protocol) {
1618 	case IPPROTO_TCP:
1619 		ASSERT3U(len, >=, sizeof (tcph_t));
1620 		stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1621 		cksum = IP_TCP_CSUM_COMP;
1622 		break;
1623 	case IPPROTO_UDP:
1624 		ASSERT3U(len, >=, sizeof (struct udphdr));
1625 		stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1626 		cksum = IP_UDP_CSUM_COMP;
1627 		break;
1628 	default:
1629 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1630 		    ipha->ipha_protocol);
1631 		return (EINVAL);
1632 	}
1633 
1634 	src = ipha->ipha_src;
1635 	dst = ipha->ipha_dst;
1636 
1637 	cksum += (dst >> 16) + (dst & 0xFFFF);
1638 	cksum += (src >> 16) + (src & 0xFFFF);
1639 	cksum += htons(iplen);
1640 
1641 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1642 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1643 
1644 	ASSERT(cksum <= 0xFFFF);
1645 
1646 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1647 
1648 	return (0);
1649 }
1650 
1651 /*
1652  * Push a packet into the transmit ring.
1653  *
1654  * Note: the format of a tx packet that spans multiple slots is similar to
1655  * what is described in xnf_rx_one_packet().
1656  */
1657 static void
1658 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1659 {
1660 	int nslots = 0;
1661 	int extras = 0;
1662 	RING_IDX slot;
1663 	boolean_t notify;
1664 
1665 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1666 	ASSERT(xnfp->xnf_running);
1667 
1668 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1669 
1670 	/*
1671 	 * The caller has already checked that we have enough slots to proceed.
1672 	 */
1673 	for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1674 		xnf_txid_t *tidp;
1675 		netif_tx_request_t *txrp;
1676 
1677 		tidp = xnf_txid_get(xnfp);
1678 		VERIFY(tidp != NULL);
1679 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1680 
1681 		txp->tx_slot = slot;
1682 		txp->tx_txreq.id = tidp->id;
1683 		*txrp = txp->tx_txreq;
1684 
1685 		tidp->txbuf = txp;
1686 		slot++;
1687 		nslots++;
1688 
1689 		/*
1690 		 * When present, LSO info is placed in a slot after the first
1691 		 * data segment, and doesn't require a txid.
1692 		 */
1693 		if (txp->tx_txreq.flags & NETTXF_extra_info) {
1694 			netif_extra_info_t *extra;
1695 			ASSERT3U(nslots, ==, 1);
1696 
1697 			extra = (netif_extra_info_t *)
1698 			    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1699 			*extra = txp->tx_extra;
1700 			slot++;
1701 			nslots++;
1702 			extras = 1;
1703 		}
1704 	}
1705 
1706 	ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1707 
1708 	/*
1709 	 * Store the number of data fragments.
1710 	 */
1711 	head->tx_frags_to_ack = nslots - extras;
1712 
1713 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
1714 
1715 	/*
1716 	 * Tell the peer that we sent something, if it cares.
1717 	 */
1718 	/* LINTED: constant in conditional context */
1719 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1720 	if (notify)
1721 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1722 }
1723 
1724 static xnf_txbuf_t *
1725 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1726 {
1727 	xnf_txbuf_t *txp;
1728 	size_t length;
1729 
1730 	if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1731 		return (NULL);
1732 	}
1733 
1734 	txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1735 	if (txp->tx_bdesc == NULL) {
1736 		xnf_data_txbuf_free(xnfp, txp);
1737 		return (NULL);
1738 	}
1739 	txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1740 	txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1741 	txp->tx_txreq.size = length;
1742 	txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1743 	txp->tx_txreq.flags = 0;
1744 
1745 	return (txp);
1746 }
1747 
1748 static xnf_txbuf_t *
1749 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1750 {
1751 	xnf_txbuf_t *head = NULL;
1752 	xnf_txbuf_t *tail = NULL;
1753 	domid_t oeid;
1754 	int nsegs = 0;
1755 
1756 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1757 
1758 	for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1759 		ddi_dma_handle_t dma_handle;
1760 		const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev;
1761 		xnf_txbuf_t *txp;
1762 
1763 		if (MBLKL(ml) == 0)
1764 			continue;
1765 
1766 		if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) {
1767 			goto error;
1768 		}
1769 
1770 		if (head == NULL) {
1771 			head = txp;
1772 		} else {
1773 			ASSERT(tail != NULL);
1774 			TXBUF_SETNEXT(tail, txp);
1775 			txp->tx_head = head;
1776 		}
1777 
1778 		/*
1779 		 * The necessary segmentation rules (e.g. not crossing a page
1780 		 * boundary) are enforced by the dma attributes of the handle.
1781 		 */
1782 		dma_handle = txp->tx_dma_handle;
1783 		int ret = ddi_dma_addr_bind_handle(dma_handle,
1784 		    NULL, (char *)ml->b_rptr, MBLKL(ml),
1785 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1786 		    DDI_DMA_DONTWAIT, 0, NULL, NULL);
1787 		if (ret != DDI_DMA_MAPPED) {
1788 			if (ret != DDI_DMA_NORESOURCES) {
1789 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1790 				    "ddi_dma_addr_bind_handle() failed "
1791 				    "[dma_error=%d]", ret);
1792 			}
1793 			goto error;
1794 		}
1795 		txp->tx_handle_bound = B_TRUE;
1796 
1797 		dma_cookie_prev = NULL;
1798 		while ((dma_cookie = ddi_dma_cookie_iter(dma_handle,
1799 		    dma_cookie_prev)) != NULL) {
1800 			if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1801 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1802 				    "xnf_dmamap_alloc() failed: "
1803 				    "too many segments");
1804 				goto error;
1805 			}
1806 			if (dma_cookie_prev != NULL) {
1807 				if ((txp = xnf_data_txbuf_alloc(xnfp,
1808 				    KM_NOSLEEP)) == NULL) {
1809 					goto error;
1810 				}
1811 				ASSERT(tail != NULL);
1812 				TXBUF_SETNEXT(tail, txp);
1813 				txp->tx_head = head;
1814 			}
1815 
1816 			txp->tx_mfn =
1817 			    xnf_btop(pa_to_ma(dma_cookie->dmac_laddress));
1818 			txp->tx_txreq.gref = xnf_gref_get(xnfp);
1819 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1820 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1821 				    "xnf_dmamap_alloc() failed: "
1822 				    "invalid grant ref");
1823 				goto error;
1824 			}
1825 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1826 			    oeid, txp->tx_mfn, 1);
1827 			txp->tx_txreq.offset =
1828 			    dma_cookie->dmac_laddress & PAGEOFFSET;
1829 			txp->tx_txreq.size = dma_cookie->dmac_size;
1830 			txp->tx_txreq.flags = 0;
1831 
1832 			nsegs++;
1833 
1834 			if (tail != NULL)
1835 				tail->tx_txreq.flags = NETTXF_more_data;
1836 			tail = txp;
1837 
1838 			dma_cookie_prev = dma_cookie;
1839 		}
1840 	}
1841 
1842 	*countp = nsegs;
1843 	return (head);
1844 
1845 error:
1846 	xnf_data_txbuf_free_chain(xnfp, head);
1847 	return (NULL);
1848 }
1849 
1850 static void
1851 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1852     uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1853 {
1854 	if (lso_flags != 0) {
1855 		ASSERT3U(lso_flags, ==, HW_LSO);
1856 		ASSERT3P(head->tx_bdesc, ==, NULL);
1857 
1858 		head->tx_txreq.flags |= NETTXF_extra_info;
1859 		netif_extra_info_t *extra = &head->tx_extra;
1860 		extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1861 		extra->flags = 0;
1862 		extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1863 		extra->u.gso.size = mss;
1864 		extra->u.gso.features = 0;
1865 		extra->u.gso.pad = 0;
1866 	} else if (cksum_flags != 0) {
1867 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1868 		/*
1869 		 * If the local protocol stack requests checksum
1870 		 * offload we set the 'checksum blank' flag,
1871 		 * indicating to the peer that we need the checksum
1872 		 * calculated for us.
1873 		 *
1874 		 * We _don't_ set the validated flag, because we haven't
1875 		 * validated that the data and the checksum match.
1876 		 *
1877 		 * Note: we already called xnf_pseudo_cksum() in
1878 		 * xnf_send(), so we just set the txreq flag here.
1879 		 */
1880 		head->tx_txreq.flags |= NETTXF_csum_blank;
1881 		xnfp->xnf_stat_tx_cksum_deferred++;
1882 	}
1883 }
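/*
 * Illustrative sketch (assumed values, not additional driver logic): for
 * an LSO packet with an MSS of 1448 reported by mac_lso_get(), the head
 * request and its extra slot end up carrying roughly:
 *
 *	head->tx_txreq.flags |= NETTXF_extra_info;
 *	head->tx_extra.type = XEN_NETIF_EXTRA_TYPE_GSO;
 *	head->tx_extra.u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
 *	head->tx_extra.u.gso.size = 1448;	-- the MSS
 *
 * i.e. the backend performs the TCP segmentation, using gso.size as the
 * per-segment payload size.
 */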
1884 
1885 /*
1886  * Send packet mp. Called by the MAC framework.
1887  */
1888 static mblk_t *
1889 xnf_send(void *arg, mblk_t *mp)
1890 {
1891 	xnf_t *xnfp = arg;
1892 	xnf_txbuf_t *head;
1893 	mblk_t *ml;
1894 	int length;
1895 	int pages, chunks, slots, slots_free;
1896 	uint32_t cksum_flags, lso_flags, mss;
1897 	boolean_t pulledup = B_FALSE;
1898 	boolean_t force_copy = B_FALSE;
1899 
1900 	ASSERT3P(mp->b_next, ==, NULL);
1901 
1902 	mutex_enter(&xnfp->xnf_txlock);
1903 
1904 	/*
1905 	 * Wait until we are connected to the backend.
1906 	 */
1907 	while (!xnfp->xnf_connected)
1908 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1909 
1910 	/*
1911 	 * To simplify logic and be in sync with the rescheduling mechanism,
1912 	 * we require the maximum amount of slots that could be used by a
1913 	 * transaction to be free before proceeding. The only downside of doing
1914 	 * this is that it slightly reduces the effective size of the ring.
1915 	 */
1916 	slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1917 	if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1918 		/*
1919 		 * We need to ask for a re-schedule later as the ring is full.
1920 		 */
1921 		mutex_enter(&xnfp->xnf_schedlock);
1922 		xnfp->xnf_need_sched = B_TRUE;
1923 		mutex_exit(&xnfp->xnf_schedlock);
1924 
1925 		xnfp->xnf_stat_tx_defer++;
1926 		mutex_exit(&xnfp->xnf_txlock);
1927 		return (mp);
1928 	}
1929 
1930 	/*
1931 	 * Get hw offload parameters.
1932 	 * This must be done before pulling up the mp as those parameters
1933 	 * are not copied over.
1934 	 */
1935 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1936 	mac_lso_get(mp, &mss, &lso_flags);
1937 
1938 	/*
1939 	 * XXX: fix MAC framework so that we can advertise support for
1940 	 * partial checksum for IPv4 only. This way we won't need to calculate
1941 	 * the pseudo header checksum ourselves.
1942 	 */
1943 	if (cksum_flags != 0) {
1944 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1945 		(void) xnf_pseudo_cksum(mp);
1946 	}
1947 
1948 pulledup:
1949 	for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1950 	    ml = ml->b_cont, chunks++) {
1951 		pages += xnf_mblk_pages(ml);
1952 		length += MBLKL(ml);
1953 	}
1954 	DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1955 	DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1956 
1957 	/*
1958 	 * If the ethernet header crosses a page boundary the packet
1959 	 * will be dropped by the backend. In practice it seems like
1960 	 * this happens fairly rarely so we'll do nothing unless the
1961 	 * packet is small enough to fit in a look-aside buffer.
1962 	 */
1963 	if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1964 	    sizeof (struct ether_header) > PAGESIZE) {
1965 		xnfp->xnf_stat_tx_eth_hdr_split++;
1966 		if (length <= PAGESIZE)
1967 			force_copy = B_TRUE;
1968 	}
1969 
1970 	if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1971 		/*
1972 		 * If the packet spans several pages and scatter-gather is not
1973 		 * supported then use a look-aside buffer.
1974 		 */
1975 		ASSERT3U(length, <=, PAGESIZE);
1976 		head = xnf_mblk_copy(xnfp, mp);
1977 		if (head == NULL) {
1978 			dev_err(xnfp->xnf_devinfo, CE_WARN,
1979 			    "xnf_mblk_copy() failed");
1980 			goto drop;
1981 		}
1982 	} else {
1983 		/*
1984 		 * There's a limit for how many pages can be passed to the
1985 		 * backend. If we pass that limit, the packet will be dropped
1986 		 * and some backend implementations (e.g. Linux) could even
1987 		 * offline the interface.
1988 		 */
1989 		if (pages > XEN_MAX_TX_DATA_PAGES) {
1990 			if (pulledup) {
1991 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1992 				    "too many pages, even after pullup: %d.",
1993 				    pages);
1994 				goto drop;
1995 			}
1996 
1997 			/*
1998 			 * Defragment packet if it spans too many pages.
1999 			 */
2000 			mblk_t *newmp = msgpullup(mp, -1);
2001 			if (newmp == NULL) {
2002 				dev_err(xnfp->xnf_devinfo, CE_WARN,
2003 				    "msgpullup() failed");
2004 				goto drop;
2005 			}
2006 
2007 			freemsg(mp);
2008 			mp = newmp;
2009 			xnfp->xnf_stat_tx_pullup++;
2010 			pulledup = B_TRUE;
2011 			goto pulledup;
2012 		}
2013 
2014 		head = xnf_mblk_map(xnfp, mp, &slots);
2015 		if (head == NULL)
2016 			goto drop;
2017 
2018 		IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2019 	}
2020 
2021 	/*
2022 	 * Set tx_mp so that mblk is freed when the txbuf chain is freed.
2023 	 */
2024 	head->tx_mp = mp;
2025 
2026 	xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2027 
2028 	/*
2029 	 * The first request must store the total length of the packet.
2030 	 */
2031 	head->tx_txreq.size = length;
2032 
2033 	/*
2034 	 * Push the packet we have prepared into the ring.
2035 	 */
2036 	xnf_tx_push_packet(xnfp, head);
2037 	xnfp->xnf_stat_opackets++;
2038 	xnfp->xnf_stat_obytes += length;
2039 
2040 	mutex_exit(&xnfp->xnf_txlock);
2041 	return (NULL);
2042 
2043 drop:
2044 	freemsg(mp);
2045 	xnfp->xnf_stat_tx_drop++;
2046 	mutex_exit(&xnfp->xnf_txlock);
2047 	return (NULL);
2048 }
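/*
 * A hedged note on the flow-control contract used above (summary only,
 * not extra driver logic): returning the original mblk from xnf_send()
 * tells the MAC layer that the packet was not consumed and that it
 * should hold off until mac_tx_update() is called. That is why the
 * worst-case slot reservation only needs to be re-checked from
 * xnf_intr():
 *
 *	if (xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE) <
 *	    XEN_MAX_SLOTS_PER_TX)
 *		return (mp);		-- back-pressure the stack
 */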
2049 
2050 /*
2051  * Notification of RX packets. Currently no TX-complete interrupt is
2052  * used, as we clean the TX ring lazily.
2053  */
2054 static uint_t
2055 xnf_intr(caddr_t arg)
2056 {
2057 	xnf_t *xnfp = (xnf_t *)arg;
2058 	mblk_t *mp;
2059 	boolean_t need_sched, clean_ring;
2060 
2061 	mutex_enter(&xnfp->xnf_rxlock);
2062 
2063 	/*
2064 	 * Interrupts before we are connected are spurious.
2065 	 */
2066 	if (!xnfp->xnf_connected) {
2067 		mutex_exit(&xnfp->xnf_rxlock);
2068 		xnfp->xnf_stat_unclaimed_interrupts++;
2069 		return (DDI_INTR_UNCLAIMED);
2070 	}
2071 
2072 	/*
2073 	 * Receive side processing.
2074 	 */
2075 	do {
2076 		/*
2077 		 * Collect buffers from the ring.
2078 		 */
2079 		xnf_rx_collect(xnfp);
2080 
2081 		/*
2082 		 * Interrupt me when the next receive buffer is consumed.
2083 		 */
2084 		xnfp->xnf_rx_ring.sring->rsp_event =
2085 		    xnfp->xnf_rx_ring.rsp_cons + 1;
2086 		xen_mb();
2087 
2088 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2089 
2090 	if (xnfp->xnf_rx_new_buffers_posted) {
2091 		boolean_t notify;
2092 
2093 		/*
2094 		 * Indicate to the peer that we have re-filled the
2095 		 * receive ring, if it cares.
2096 		 */
2097 		/* LINTED: constant in conditional context */
2098 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2099 		if (notify)
2100 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
2101 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2102 	}
2103 
2104 	mp = xnfp->xnf_rx_head;
2105 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2106 
2107 	xnfp->xnf_stat_interrupts++;
2108 	mutex_exit(&xnfp->xnf_rxlock);
2109 
2110 	if (mp != NULL)
2111 		mac_rx(xnfp->xnf_mh, NULL, mp);
2112 
2113 	/*
2114 	 * Transmit side processing.
2115 	 *
2116 	 * If a previous transmit attempt failed or we have pending
2117 	 * multicast requests, clean the ring.
2118 	 *
2119 	 * If we previously stalled transmission and cleaning produces
2120 	 * some free slots, tell upstream to attempt sending again.
2121 	 *
2122 	 * The odd style is to avoid acquiring xnf_txlock unless we
2123 	 * will actually look inside the tx machinery.
2124 	 */
2125 	mutex_enter(&xnfp->xnf_schedlock);
2126 	need_sched = xnfp->xnf_need_sched;
2127 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2128 	mutex_exit(&xnfp->xnf_schedlock);
2129 
2130 	if (clean_ring) {
2131 		int free_slots;
2132 
2133 		mutex_enter(&xnfp->xnf_txlock);
2134 		free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2135 
2136 		if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2137 			mutex_enter(&xnfp->xnf_schedlock);
2138 			xnfp->xnf_need_sched = B_FALSE;
2139 			mutex_exit(&xnfp->xnf_schedlock);
2140 
2141 			mac_tx_update(xnfp->xnf_mh);
2142 		}
2143 		mutex_exit(&xnfp->xnf_txlock);
2144 	}
2145 
2146 	return (DDI_INTR_CLAIMED);
2147 }
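/*
 * Hedged aside on the re-arm pattern used above: writing
 * rsp_event = rsp_cons + 1 asks the backend for an event on the next
 * response, but a response may land between the final collection pass
 * and that write. Re-checking RING_HAS_UNCONSUMED_RESPONSES() after the
 * barrier, as the do/while loop does, closes that window:
 *
 *	do {
 *		xnf_rx_collect(xnfp);
 *		ring->sring->rsp_event = ring->rsp_cons + 1;
 *		xen_mb();
 *	} while (RING_HAS_UNCONSUMED_RESPONSES(ring));
 */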
2148 
2149 /*
2150  *  xnf_start() -- start the board receiving and enable interrupts.
2151  */
2152 static int
2153 xnf_start(void *arg)
2154 {
2155 	xnf_t *xnfp = arg;
2156 
2157 	mutex_enter(&xnfp->xnf_rxlock);
2158 	mutex_enter(&xnfp->xnf_txlock);
2159 
2160 	/* Accept packets from above. */
2161 	xnfp->xnf_running = B_TRUE;
2162 
2163 	mutex_exit(&xnfp->xnf_txlock);
2164 	mutex_exit(&xnfp->xnf_rxlock);
2165 
2166 	return (0);
2167 }
2168 
2169 /* xnf_stop() - disable hardware */
2170 static void
2171 xnf_stop(void *arg)
2172 {
2173 	xnf_t *xnfp = arg;
2174 
2175 	mutex_enter(&xnfp->xnf_rxlock);
2176 	mutex_enter(&xnfp->xnf_txlock);
2177 
2178 	xnfp->xnf_running = B_FALSE;
2179 
2180 	mutex_exit(&xnfp->xnf_txlock);
2181 	mutex_exit(&xnfp->xnf_rxlock);
2182 }
2183 
2184 /*
2185  * Hang buffer `bdesc' on the RX ring.
2186  */
2187 static void
2188 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2189 {
2190 	netif_rx_request_t *reqp;
2191 	RING_IDX hang_ix;
2192 
2193 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2194 
2195 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2196 	    xnfp->xnf_rx_ring.req_prod_pvt);
2197 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2198 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2199 
2200 	reqp->id = bdesc->id = hang_ix;
2201 	reqp->gref = bdesc->grant_ref;
2202 
2203 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2204 	xnfp->xnf_rx_ring.req_prod_pvt++;
2205 
2206 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2207 }
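/*
 * Hedged usage note: xnf_rxbuf_hang() only advances the private producer
 * index (req_prod_pvt) and records xnf_rx_new_buffers_posted. The new
 * requests become visible to the backend later, when the caller pushes
 * them, as xnf_intr() does:
 *
 *	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
 *	if (notify)
 *		ec_notify_via_evtchn(xnfp->xnf_evtchn);
 */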
2208 
2209 /*
2210  * Receive an entire packet from the ring, starting from slot *consp.
2211  * prod indicates the slot of the latest response.
2212  * On return, *consp will point to the head of the next packet.
2213  *
2214  * Note: If slot prod was reached before we could gather a full packet, we will
2215  * drop the partial packet; this would most likely indicate a bug in either
2216  * the front-end or the back-end driver.
2217  *
2218  * An rx packet can consist of several fragments and thus span multiple slots.
2219  * Each fragment can contain up to 4k of data.
2220  *
2221  * A typical 9000 MTU packet will look like this:
2222  * +------+---------------------+-------------------+-----------------------+
2223  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2224  * +------+---------------------+-------------------+-----------------------+
2225  * | 1    | netif_rx_response_t | 1st data fragment | more_data             |
2226  * +------+---------------------+-------------------+-----------------------+
2227  * | 2    | netif_rx_response_t | 2nd data fragment | more_data             |
2228  * +------+---------------------+-------------------+-----------------------+
2229  * | 3    | netif_rx_response_t | 3rd data fragment | [none]                |
2230  * +------+---------------------+-------------------+-----------------------+
2231  *
2232  * Fragments are chained by setting NETRXF_more_data in the previous
2233  * response's flags. If there are additional flags, such as
2234  * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2235  * first fragment.
2236  *
2237  * Sometimes extra info can be present. If so, it will follow the first
2238  * fragment, and NETRXF_extra_info flag will be set on the first response.
2239  * If LRO is set on a packet, it will be stored in the extra info. Conforming
2240  * to the spec, extra info can also be chained, but must all be present right
2241  * after the first fragment.
2242  *
2243  * Example of a packet with 2 extra infos:
2244  * +------+---------------------+-------------------+-----------------------+
2245  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2246  * +------+---------------------+-------------------+-----------------------+
2247  * | 1    | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2248  * +------+---------------------+-------------------+-----------------------+
2249  * | 2    | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2250  * +------+---------------------+-------------------+-----------------------+
2251  * | 3    | netif_extra_info_t  | 2nd extra info    | [none]                |
2252  * +------+---------------------+-------------------+-----------------------+
2253  * | 4    | netif_rx_response_t | 2nd data fragment | more_data             |
2254  * +------+---------------------+-------------------+-----------------------+
2255  * | 5    | netif_rx_response_t | 3rd data fragment | more_data             |
2256  * +------+---------------------+-------------------+-----------------------+
2257  * | 6    | netif_rx_response_t | 4th data fragment | [none]                |
2258  * +------+---------------------+-------------------+-----------------------+
2259  *
2260  * In practice, the only extra we expect is for LRO, but only if we advertise
2261  * that we support it to the backend (xnf_enable_lro == TRUE).
2262  */
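/*
 * A hedged reading of the tables above as a parse loop: the flags of the
 * slot just consumed decide what the next slot holds.
 *
 *	extra/more flag set -> next slot is a netif_extra_info_t
 *	more_data set       -> next slot is another data fragment
 *	neither set         -> the packet is complete
 *
 * which is what the is_extra/more_data/more_extra bookkeeping in
 * xnf_rx_one_packet() below implements.
 */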
2263 static int
2264 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2265 {
2266 	mblk_t *head = NULL;
2267 	mblk_t *tail = NULL;
2268 	mblk_t *mp;
2269 	int error = 0;
2270 	RING_IDX cons = *consp;
2271 	netif_extra_info_t lro;
2272 	boolean_t is_lro = B_FALSE;
2273 	boolean_t is_extra = B_FALSE;
2274 
2275 	netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2276 
2277 	boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2278 	boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2279 	boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2280 
2281 	IMPLY(more_data, xnf_enable_rx_sg);
2282 
2283 	while (cons != prod) {
2284 		xnf_buf_t *bdesc;
2285 		int len, off;
2286 		int rxidx = cons & (NET_RX_RING_SIZE - 1);
2287 
2288 		bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2289 		xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2290 
2291 		if (is_extra) {
2292 			netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2293 			/*
2294 			 * The only extra we expect is for LRO, and it should
2295 			 * only be present once.
2296 			 */
2297 			if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2298 			    !is_lro) {
2299 				ASSERT(xnf_enable_lro);
2300 				lro = *extra;
2301 				is_lro = B_TRUE;
2302 				DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2303 			} else {
2304 				dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2305 				    "contains unexpected extra info of type %d",
2306 				    extra->type);
2307 				error = EINVAL;
2308 			}
2309 			more_extra =
2310 			    (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2311 
2312 			goto hang_buf;
2313 		}
2314 
2315 		ASSERT3U(bdesc->id, ==, rsp.id);
2316 
2317 		/*
2318 		 * status stores packet length when >= 0, or errors when < 0.
2319 		 */
2320 		len = rsp.status;
2321 		off = rsp.offset;
2322 		more_data = (rsp.flags & NETRXF_more_data) != 0;
2323 
2324 		/*
2325 		 * sanity checks.
2326 		 */
2327 		if (!xnfp->xnf_running) {
2328 			error = EBUSY;
2329 		} else if (len <= 0) {
2330 			xnfp->xnf_stat_errrx++;
2331 
2332 			switch (len) {
2333 			case 0:
2334 				xnfp->xnf_stat_runt++;
2335 				break;
2336 			case NETIF_RSP_ERROR:
2337 				xnfp->xnf_stat_mac_rcv_error++;
2338 				break;
2339 			case NETIF_RSP_DROPPED:
2340 				xnfp->xnf_stat_norxbuf++;
2341 				break;
2342 			}
2343 			error = EINVAL;
2344 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2345 			dev_err(xnfp->xnf_devinfo, CE_WARN,
2346 			    "Bad rx grant reference, rsp id %d", rsp.id);
2347 			error = EINVAL;
2348 		} else if ((off + len) > PAGESIZE) {
2349 			dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2350 			    "page boundary (offset %d, length %d)", off, len);
2351 			error = EINVAL;
2352 		}
2353 
2354 		if (error != 0) {
2355 			/*
2356 			 * If an error has been detected, we do not attempt
2357 			 * to read the data but we still need to replace
2358 			 * the rx bufs.
2359 			 */
2360 			goto hang_buf;
2361 		}
2362 
2363 		xnf_buf_t *nbuf = NULL;
2364 
2365 		/*
2366 		 * If the packet is below a pre-determined size we will
2367 		 * copy data out of the buf rather than replace it.
2368 		 */
2369 		if (len > xnf_rx_copy_limit)
2370 			nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2371 
2372 		if (nbuf != NULL) {
2373 			mp = desballoc((unsigned char *)bdesc->buf,
2374 			    bdesc->len, 0, &bdesc->free_rtn);
2375 
2376 			if (mp == NULL) {
2377 				xnfp->xnf_stat_rx_desballoc_fail++;
2378 				xnfp->xnf_stat_norxbuf++;
2379 				error = ENOMEM;
2380 				/*
2381 				 * we free the buf we just allocated as we
2382 				 * will re-hang the old buf.
2383 				 */
2384 				xnf_buf_put(xnfp, nbuf, B_FALSE);
2385 				goto hang_buf;
2386 			}
2387 
2388 			mp->b_rptr = mp->b_rptr + off;
2389 			mp->b_wptr = mp->b_rptr + len;
2390 
2391 			/*
2392 			 * Release the grant as the backend doesn't need to
2393 			 * access this buffer anymore and grants are scarce.
2394 			 */
2395 			(void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2396 			    0);
2397 			xnf_gref_put(xnfp, bdesc->grant_ref);
2398 			bdesc->grant_ref = INVALID_GRANT_REF;
2399 
2400 			bdesc = nbuf;
2401 		} else {
2402 			/*
2403 			 * We failed to allocate a new buf or decided to reuse
2404 			 * the old one. In either case we copy the data off it
2405 			 * and put it back into the ring.
2406 			 */
2407 			mp = allocb(len, 0);
2408 			if (mp == NULL) {
2409 				xnfp->xnf_stat_rx_allocb_fail++;
2410 				xnfp->xnf_stat_norxbuf++;
2411 				error = ENOMEM;
2412 				goto hang_buf;
2413 			}
2414 			bcopy(bdesc->buf + off, mp->b_wptr, len);
2415 			mp->b_wptr += len;
2416 		}
2417 
2418 		if (head == NULL)
2419 			head = mp;
2420 		else
2421 			tail->b_cont = mp;
2422 		tail = mp;
2423 
2424 hang_buf:
2425 		/*
2426 		 * No matter what happens, for each response we need to hang
2427 		 * a new buf on the rx ring. Put either the old one, or a new
2428 		 * one if the old one is borrowed by the kernel via desballoc().
2429 		 */
2430 		xnf_rxbuf_hang(xnfp, bdesc);
2431 		cons++;
2432 
2433 		/* next response is an extra */
2434 		is_extra = more_extra;
2435 
2436 		if (!more_data && !more_extra)
2437 			break;
2438 
2439 		/*
2440 		 * Note that since requests and responses are union'd on the
2441 		 * same ring, we copy the response to a local variable instead
2442 		 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2443 		 * overwritten the contents of rsp.
2444 		 */
2445 		rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2446 	}
2447 
2448 	/*
2449 	 * Check that we do not get stuck in a loop.
2450 	 */
2451 	ASSERT3U(*consp, !=, cons);
2452 	*consp = cons;
2453 
2454 	/*
2455 	 * We ran out of responses but the flags indicate there is more data.
2456 	 */
2457 	if (more_data) {
2458 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2459 		error = EINVAL;
2460 	}
2461 	if (more_extra) {
2462 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2463 		    "(extras).");
2464 		error = EINVAL;
2465 	}
2466 
2467 	/*
2468 	 * An error means the packet must be dropped. If we have already formed
2469 	 * a partial packet, then discard it.
2470 	 */
2471 	if (error != 0) {
2472 		if (head != NULL)
2473 			freemsg(head);
2474 		xnfp->xnf_stat_rx_drop++;
2475 		return (error);
2476 	}
2477 
2478 	ASSERT(head != NULL);
2479 
2480 	if (hwcsum) {
2481 		/*
2482 		 * If the peer says that the data has been validated then we
2483 		 * declare that the full checksum has been verified.
2484 		 *
2485 		 * We don't look at the "checksum blank" flag, and hence could
2486 		 * have a packet here that we are asserting is good with
2487 		 * a blank checksum.
2488 		 */
2489 		mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2490 		xnfp->xnf_stat_rx_cksum_no_need++;
2491 	}
2492 
2493 	/* XXX: set lro info for packet once LRO is supported in OS. */
2494 
2495 	*mpp = head;
2496 
2497 	return (0);
2498 }
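/*
 * Hedged sketch of the copy-vs-loan choice made above: packets larger
 * than xnf_rx_copy_limit are loaned to the stack with desballoc(), so
 * the grant is ended immediately and the xnf_buf_t comes back through
 * its free routine once upstream calls freeb():
 *
 *	mp = desballoc((unsigned char *)bdesc->buf, bdesc->len, 0,
 *	    &bdesc->free_rtn);	-- free_func is xnf_buf_recycle()
 *
 * Smaller packets are simply bcopy()'d into a fresh allocb() buffer and
 * the original buf is re-hung on the ring straight away.
 */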
2499 
2500 /*
2501  * Collect packets from the RX ring, storing them in `xnfp' for later use.
2502  */
2503 static void
2504 xnf_rx_collect(xnf_t *xnfp)
2505 {
2506 	RING_IDX prod;
2507 
2508 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2509 
2510 	prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2511 	/*
2512 	 * Ensure we see queued responses up to 'prod'.
2513 	 */
2514 	membar_consumer();
2515 
2516 	while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2517 		mblk_t *mp;
2518 
2519 		/*
2520 		 * Collect a packet.
2521 		 * rsp_cons is updated inside xnf_rx_one_packet().
2522 		 */
2523 		int error = xnf_rx_one_packet(xnfp, prod,
2524 		    &xnfp->xnf_rx_ring.rsp_cons, &mp);
2525 		if (error == 0) {
2526 			xnfp->xnf_stat_ipackets++;
2527 			xnfp->xnf_stat_rbytes += xmsgsize(mp);
2528 
2529 			/*
2530 			 * Append the mblk to the rx list.
2531 			 */
2532 			if (xnfp->xnf_rx_head == NULL) {
2533 				ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2534 				xnfp->xnf_rx_head = mp;
2535 			} else {
2536 				ASSERT(xnfp->xnf_rx_tail != NULL);
2537 				xnfp->xnf_rx_tail->b_next = mp;
2538 			}
2539 			xnfp->xnf_rx_tail = mp;
2540 		}
2541 	}
2542 }
2543 
2544 /*
2545  *  xnf_alloc_dma_resources() -- initialize the driver's DMA structures
2546  */
2547 static int
2548 xnf_alloc_dma_resources(xnf_t *xnfp)
2549 {
2550 	dev_info_t		*devinfo = xnfp->xnf_devinfo;
2551 	size_t			len;
2552 	ddi_dma_cookie_t	dma_cookie;
2553 	uint_t			ncookies;
2554 	int			rc;
2555 	caddr_t			rptr;
2556 
2557 	/*
2558 	 * The code below allocates all the DMA data structures that
2559 	 * need to be released when the driver is detached.
2560 	 *
2561 	 * Allocate page for the transmit descriptor ring.
2562 	 */
2563 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2564 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2565 		goto alloc_error;
2566 
2567 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2568 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2569 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2570 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2571 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2572 		xnfp->xnf_tx_ring_dma_handle = NULL;
2573 		goto alloc_error;
2574 	}
2575 
2576 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2577 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2578 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2579 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2580 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2581 		xnfp->xnf_tx_ring_dma_handle = NULL;
2582 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2583 		if (rc == DDI_DMA_NORESOURCES)
2584 			goto alloc_error;
2585 		else
2586 			goto error;
2587 	}
2588 
2589 	ASSERT(ncookies == 1);
2590 	bzero(rptr, PAGESIZE);
2591 	/* LINTED: constant in conditional context */
2592 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2593 	/* LINTED: constant in conditional context */
2594 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2595 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2596 
2597 	/*
2598 	 * Allocate page for the receive descriptor ring.
2599 	 */
2600 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2601 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2602 		goto alloc_error;
2603 
2604 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2605 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2606 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2607 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2608 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2609 		xnfp->xnf_rx_ring_dma_handle = NULL;
2610 		goto alloc_error;
2611 	}
2612 
2613 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2614 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2615 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2616 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2617 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2618 		xnfp->xnf_rx_ring_dma_handle = NULL;
2619 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2620 		if (rc == DDI_DMA_NORESOURCES)
2621 			goto alloc_error;
2622 		else
2623 			goto error;
2624 	}
2625 
2626 	ASSERT(ncookies == 1);
2627 	bzero(rptr, PAGESIZE);
2628 	/* LINTED: constant in conditional context */
2629 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2630 	/* LINTED: constant in conditional context */
2631 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2632 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2633 
2634 	return (DDI_SUCCESS);
2635 
2636 alloc_error:
2637 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2638 	    ddi_get_instance(xnfp->xnf_devinfo));
2639 error:
2640 	xnf_release_dma_resources(xnfp);
2641 	return (DDI_FAILURE);
2642 }
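/*
 * Hedged summary of the ring setup pattern used above, common to both
 * rings: a page of DMA-able memory is obtained via the usual DDI triple
 * (ddi_dma_alloc_handle -> ddi_dma_mem_alloc -> ddi_dma_addr_bind_handle)
 * and then initialised for the split-ring protocol:
 *
 *	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
 *	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr,
 *	    PAGESIZE);
 *
 * The physical address recorded here is later turned into a grant
 * reference (outside this excerpt) so that the backend can map the same
 * page as its end of the ring.
 */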
2643 
2644 /*
2645  * Release all DMA resources in the opposite order from acquisition
2646  */
2647 static void
2648 xnf_release_dma_resources(xnf_t *xnfp)
2649 {
2650 	int i;
2651 
2652 	/*
2653 	 * Free receive buffers which are currently associated with
2654 	 * descriptors.
2655 	 */
2656 	mutex_enter(&xnfp->xnf_rxlock);
2657 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
2658 		xnf_buf_t *bp;
2659 
2660 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2661 			continue;
2662 		xnfp->xnf_rx_pkt_info[i] = NULL;
2663 		xnf_buf_put(xnfp, bp, B_FALSE);
2664 	}
2665 	mutex_exit(&xnfp->xnf_rxlock);
2666 
2667 	/* Free the receive ring buffer. */
2668 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2669 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2670 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2671 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2672 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2673 	}
2674 	/* Free the transmit ring buffer. */
2675 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2676 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2677 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2678 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2679 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2680 	}
2681 
2682 }
2683 
2684 /*
2685  * Release any packets and associated structures used by the TX ring.
2686  */
2687 static void
2688 xnf_release_mblks(xnf_t *xnfp)
2689 {
2690 	RING_IDX i;
2691 	xnf_txid_t *tidp;
2692 
2693 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2694 	    i < NET_TX_RING_SIZE;
2695 	    i++, tidp++) {
2696 		xnf_txbuf_t *txp = tidp->txbuf;
2697 
2698 		if (txp != NULL) {
2699 			ASSERT(txp->tx_mp != NULL);
2700 			freemsg(txp->tx_mp);
2701 
2702 			xnf_txid_put(xnfp, tidp);
2703 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2704 		}
2705 	}
2706 }
2707 
2708 static int
2709 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2710 {
2711 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2712 	xnf_buf_t *bdesc = buf;
2713 	xnf_t *xnfp = arg;
2714 	ddi_dma_cookie_t dma_cookie;
2715 	uint_t ncookies;
2716 	size_t len;
2717 
2718 	if (kmflag & KM_NOSLEEP)
2719 		ddiflags = DDI_DMA_DONTWAIT;
2720 
2721 	/* Allocate a DMA access handle for the buffer. */
2722 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2723 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2724 		goto failure;
2725 
2726 	/* Allocate DMA-able memory for buffer. */
2727 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2728 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2729 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2730 		goto failure_1;
2731 
2732 	/* Bind to virtual address of buffer to get physical address. */
2733 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2734 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2735 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2736 		goto failure_2;
2737 	ASSERT(ncookies == 1);
2738 
2739 	bdesc->free_rtn.free_func = xnf_buf_recycle;
2740 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2741 	bdesc->xnfp = xnfp;
2742 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2743 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2744 	bdesc->len = dma_cookie.dmac_size;
2745 	bdesc->grant_ref = INVALID_GRANT_REF;
2746 	bdesc->gen = xnfp->xnf_gen;
2747 
2748 	atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2749 
2750 	return (0);
2751 
2752 failure_2:
2753 	ddi_dma_mem_free(&bdesc->acc_handle);
2754 
2755 failure_1:
2756 	ddi_dma_free_handle(&bdesc->dma_handle);
2757 
2758 failure:
2759 
2760 	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2761 	return (-1);
2762 }
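/*
 * Hedged note on the cache pattern: the expensive DMA work (handle,
 * memory, binding) is done once per object in this constructor and only
 * undone in xnf_buf_destructor(), while xnf_buf_get()/xnf_buf_put() do
 * the cheap per-use work of attaching and releasing a grant reference.
 * The cache itself would be wired up along these lines (illustrative
 * only; the real kmem_cache_create() call sits in the attach path
 * outside this excerpt and may use a different cache name):
 *
 *	xnfp->xnf_buf_cache = kmem_cache_create("xnf_buf_cache",
 *	    sizeof (xnf_buf_t), 0, xnf_buf_constructor, xnf_buf_destructor,
 *	    NULL, xnfp, NULL, 0);
 */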
2763 
2764 static void
2765 xnf_buf_destructor(void *buf, void *arg)
2766 {
2767 	xnf_buf_t *bdesc = buf;
2768 	xnf_t *xnfp = arg;
2769 
2770 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
2771 	ddi_dma_mem_free(&bdesc->acc_handle);
2772 	ddi_dma_free_handle(&bdesc->dma_handle);
2773 
2774 	atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2775 }
2776 
2777 static xnf_buf_t *
2778 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2779 {
2780 	grant_ref_t gref;
2781 	xnf_buf_t *bufp;
2782 
2783 	/*
2784 	 * Usually grant references are more scarce than memory, so we
2785 	 * attempt to acquire a grant reference first.
2786 	 */
2787 	gref = xnf_gref_get(xnfp);
2788 	if (gref == INVALID_GRANT_REF)
2789 		return (NULL);
2790 
2791 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2792 	if (bufp == NULL) {
2793 		xnf_gref_put(xnfp, gref);
2794 		return (NULL);
2795 	}
2796 
2797 	ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2798 
2799 	bufp->grant_ref = gref;
2800 
2801 	if (bufp->gen != xnfp->xnf_gen)
2802 		xnf_buf_refresh(bufp);
2803 
2804 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
2805 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2806 	    bufp->buf_mfn, readonly ? 1 : 0);
2807 
2808 	atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2809 
2810 	return (bufp);
2811 }
2812 
2813 static void
2814 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2815 {
2816 	if (bufp->grant_ref != INVALID_GRANT_REF) {
2817 		(void) gnttab_end_foreign_access_ref(
2818 		    bufp->grant_ref, readonly ? 1 : 0);
2819 		xnf_gref_put(xnfp, bufp->grant_ref);
2820 		bufp->grant_ref = INVALID_GRANT_REF;
2821 	}
2822 
2823 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2824 
2825 	atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2826 }
2827 
2828 /*
2829  * Refresh any cached data about a buffer after resume.
2830  */
2831 static void
2832 xnf_buf_refresh(xnf_buf_t *bdesc)
2833 {
2834 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2835 	bdesc->gen = bdesc->xnfp->xnf_gen;
2836 }
2837 
2838 /*
2839  * Streams `freeb' routine for `xnf_buf_t' buffers that have been
2840  * loaned upstream via desballoc().
2841  */
2842 static void
2843 xnf_buf_recycle(xnf_buf_t *bdesc)
2844 {
2845 	xnf_t *xnfp = bdesc->xnfp;
2846 
2847 	xnf_buf_put(xnfp, bdesc, B_TRUE);
2848 }
2849 
2850 static int
2851 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2852 {
2853 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2854 	xnf_txbuf_t *txp = buf;
2855 	xnf_t *xnfp = arg;
2856 
2857 	if (kmflag & KM_NOSLEEP)
2858 		ddiflags = DDI_DMA_DONTWAIT;
2859 
2860 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2861 	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2862 		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2863 		return (-1);
2864 	}
2865 
2866 	return (0);
2867 }
2868 
2869 static void
2870 xnf_tx_buf_destructor(void *buf, void *arg)
2871 {
2872 	_NOTE(ARGUNUSED(arg));
2873 	xnf_txbuf_t *txp = buf;
2874 
2875 	ddi_dma_free_handle(&txp->tx_dma_handle);
2876 }
2877 
2878 /*
2879  * Statistics.
2880  */
2881 static char *xnf_aux_statistics[] = {
2882 	"tx_cksum_deferred",
2883 	"rx_cksum_no_need",
2884 	"interrupts",
2885 	"unclaimed_interrupts",
2886 	"tx_pullup",
2887 	"tx_lookaside",
2888 	"tx_drop",
2889 	"tx_eth_hdr_split",
2890 	"buf_allocated",
2891 	"buf_outstanding",
2892 	"gref_outstanding",
2893 	"gref_failure",
2894 	"gref_peak",
2895 	"rx_allocb_fail",
2896 	"rx_desballoc_fail",
2897 };
2898 
2899 static int
2900 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2901 {
2902 	xnf_t *xnfp;
2903 	kstat_named_t *knp;
2904 
2905 	if (flag != KSTAT_READ)
2906 		return (EACCES);
2907 
2908 	xnfp = ksp->ks_private;
2909 	knp = ksp->ks_data;
2910 
2911 	/*
2912 	 * Assignment order must match that of the names in
2913 	 * xnf_aux_statistics.
2914 	 */
2915 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2916 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2917 
2918 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2919 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2920 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2921 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2922 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2923 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2924 
2925 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2926 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2927 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2928 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2929 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2930 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2931 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2932 
2933 	return (0);
2934 }
2935 
2936 static boolean_t
2937 xnf_kstat_init(xnf_t *xnfp)
2938 {
2939 	int nstat = sizeof (xnf_aux_statistics) /
2940 	    sizeof (xnf_aux_statistics[0]);
2941 	char **cp = xnf_aux_statistics;
2942 	kstat_named_t *knp;
2943 
2944 	/*
2945 	 * Create and initialise kstats.
2946 	 */
2947 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2948 	    ddi_get_instance(xnfp->xnf_devinfo),
2949 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2950 	    nstat, 0)) == NULL)
2951 		return (B_FALSE);
2952 
2953 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2954 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2955 
2956 	knp = xnfp->xnf_kstat_aux->ks_data;
2957 	while (nstat > 0) {
2958 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2959 
2960 		knp++;
2961 		cp++;
2962 		nstat--;
2963 	}
2964 
2965 	kstat_install(xnfp->xnf_kstat_aux);
2966 
2967 	return (B_TRUE);
2968 }
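/*
 * Hedged usage note: the counters published here can be inspected from
 * userland with kstat(1M), e.g. something like
 *
 *	kstat -m xnf -n aux_statistics
 *
 * The names come from xnf_aux_statistics[] and must stay in the same
 * order as the assignments in xnf_kstat_aux_update().
 */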
2969 
2970 static int
2971 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2972 {
2973 	xnf_t *xnfp = arg;
2974 
2975 	mutex_enter(&xnfp->xnf_rxlock);
2976 	mutex_enter(&xnfp->xnf_txlock);
2977 
2978 #define	mac_stat(q, r)				\
2979 	case (MAC_STAT_##q):			\
2980 		*val = xnfp->xnf_stat_##r;	\
2981 		break
2982 
2983 #define	ether_stat(q, r)			\
2984 	case (ETHER_STAT_##q):			\
2985 		*val = xnfp->xnf_stat_##r;	\
2986 		break
2987 
2988 	switch (stat) {
2989 
2990 	mac_stat(IPACKETS, ipackets);
2991 	mac_stat(OPACKETS, opackets);
2992 	mac_stat(RBYTES, rbytes);
2993 	mac_stat(OBYTES, obytes);
2994 	mac_stat(NORCVBUF, norxbuf);
2995 	mac_stat(IERRORS, errrx);
2996 	mac_stat(NOXMTBUF, tx_defer);
2997 
2998 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
2999 	ether_stat(TOOSHORT_ERRORS, runt);
3000 
3001 	/* always claim to be in full duplex mode */
3002 	case ETHER_STAT_LINK_DUPLEX:
3003 		*val = LINK_DUPLEX_FULL;
3004 		break;
3005 
3006 	/* always claim to be at 1Gb/s link speed */
3007 	case MAC_STAT_IFSPEED:
3008 		*val = 1000000000ull;
3009 		break;
3010 
3011 	default:
3012 		mutex_exit(&xnfp->xnf_txlock);
3013 		mutex_exit(&xnfp->xnf_rxlock);
3014 
3015 		return (ENOTSUP);
3016 	}
3017 
3018 #undef mac_stat
3019 #undef ether_stat
3020 
3021 	mutex_exit(&xnfp->xnf_txlock);
3022 	mutex_exit(&xnfp->xnf_rxlock);
3023 
3024 	return (0);
3025 }
3026 
3027 static int
3028 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3029 {
3030 	if (mtu > ETHERMTU) {
3031 		if (!xnf_enable_tx_sg) {
3032 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3033 			    "because scatter-gather is disabled for transmit "
3034 			    "in driver settings", ETHERMTU);
3035 			return (EINVAL);
3036 		} else if (!xnf_enable_rx_sg) {
3037 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3038 			    "because scatter-gather is disabled for receive "
3039 			    "in driver settings", ETHERMTU);
3040 			return (EINVAL);
3041 		} else if (!xnfp->xnf_be_tx_sg) {
3042 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3043 			    "because backend doesn't support scatter-gather",
3044 			    ETHERMTU);
3045 			return (EINVAL);
3046 		}
3047 		if (mtu > XNF_MAXPKT)
3048 			return (EINVAL);
3049 	}
3050 	int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3051 	if (error == 0)
3052 		xnfp->xnf_mtu = mtu;
3053 
3054 	return (error);
3055 }
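/*
 * Hedged administrative example: an MTU change arrives through the MAC
 * property path (xnf_setprop() below), so with both scatter-gather
 * settings enabled and a capable backend something like
 *
 *	dladm set-linkprop -p mtu=9000 xnf0
 *
 * (link name hypothetical) ends up in xnf_change_mtu(xnfp, 9000), which
 * updates the SDU via mac_maxsdu_update() on success.
 */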
3056 
3057 /*ARGSUSED*/
3058 static int
3059 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3060     uint_t prop_val_size, void *prop_val)
3061 {
3062 	xnf_t *xnfp = data;
3063 
3064 	switch (prop_id) {
3065 	case MAC_PROP_MTU:
3066 		ASSERT(prop_val_size >= sizeof (uint32_t));
3067 		bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3068 		break;
3069 	default:
3070 		return (ENOTSUP);
3071 	}
3072 	return (0);
3073 }
3074 
3075 /*ARGSUSED*/
3076 static int
3077 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3078     uint_t prop_val_size, const void *prop_val)
3079 {
3080 	xnf_t *xnfp = data;
3081 	uint32_t new_mtu;
3082 	int error;
3083 
3084 	switch (prop_id) {
3085 	case MAC_PROP_MTU:
3086 		ASSERT(prop_val_size >= sizeof (uint32_t));
3087 		bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3088 		error = xnf_change_mtu(xnfp, new_mtu);
3089 		break;
3090 	default:
3091 		return (ENOTSUP);
3092 	}
3093 
3094 	return (error);
3095 }
3096 
3097 /*ARGSUSED*/
3098 static void
3099 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3100     mac_prop_info_handle_t prop_handle)
3101 {
3102 	switch (prop_id) {
3103 	case MAC_PROP_MTU:
3104 		mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3105 		break;
3106 	default:
3107 		break;
3108 	}
3109 }
3110 
3111 static boolean_t
3112 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3113 {
3114 	xnf_t *xnfp = arg;
3115 
3116 	switch (cap) {
3117 	case MAC_CAPAB_HCKSUM: {
3118 		uint32_t *capab = cap_data;
3119 
3120 		/*
3121 		 * Whilst the flag used to communicate with the IO
3122 		 * domain is called "NETTXF_csum_blank", the checksum
3123 		 * in the packet must contain the pseudo-header
3124 		 * checksum and not zero.
3125 		 *
3126 		 * To help out the IO domain, we might use
3127 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3128 		 * then use checksum offload for IPv6 packets, which
3129 		 * the IO domain can't handle.
3130 		 *
3131 		 * As a result, we declare ourselves capable of
3132 		 * HCKSUM_INET_FULL_V4. This means that we receive
3133 		 * IPv4 packets from the stack with a blank checksum
3134 		 * field and must insert the pseudo-header checksum
3135 		 * before passing the packet to the IO domain.
3136 		 */
3137 		*capab = HCKSUM_INET_FULL_V4;
3138 
3139 		/*
3140 		 * TODO: query the "feature-ipv6-csum-offload" capability.
3141 		 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3142 		 */
3143 
3144 		break;
3145 	}
3146 	case MAC_CAPAB_LSO: {
3147 		if (!xnfp->xnf_be_lso)
3148 			return (B_FALSE);
3149 
3150 		mac_capab_lso_t *lso = cap_data;
3151 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3152 		lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3153 		break;
3154 	}
3155 	default:
3156 		return (B_FALSE);
3157 	}
3158 
3159 	return (B_TRUE);
3160 }
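/*
 * Hedged illustration of the checksum contract described above: because
 * we advertise HCKSUM_INET_FULL_V4, the stack hands us IPv4 packets with
 * a blank L4 checksum and HCK_FULLCKSUM set. xnf_send() then fills in
 * the pseudo-header checksum and marks the request so that the backend
 * completes the sum:
 *
 *	(void) xnf_pseudo_cksum(mp);
 *	head->tx_txreq.flags |= NETTXF_csum_blank;
 */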
3161 
3162 /*
3163  * The state of the peer has changed - react accordingly.
3164  */
3165 static void
3166 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3167     void *arg, void *impl_data)
3168 {
3169 	_NOTE(ARGUNUSED(id, arg));
3170 	xnf_t *xnfp = ddi_get_driver_private(dip);
3171 	XenbusState new_state = *(XenbusState *)impl_data;
3172 
3173 	ASSERT(xnfp != NULL);
3174 
3175 	switch (new_state) {
3176 	case XenbusStateUnknown:
3177 	case XenbusStateInitialising:
3178 	case XenbusStateInitialised:
3179 	case XenbusStateClosing:
3180 	case XenbusStateClosed:
3181 	case XenbusStateReconfiguring:
3182 	case XenbusStateReconfigured:
3183 		break;
3184 
3185 	case XenbusStateInitWait:
3186 		xnf_read_config(xnfp);
3187 
3188 		if (!xnfp->xnf_be_rx_copy) {
3189 			cmn_err(CE_WARN,
3190 			    "The xnf driver requires a dom0 that "
3191 			    "supports 'feature-rx-copy'.");
3192 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
3193 			    XBT_NULL, XenbusStateClosed);
3194 			break;
3195 		}
3196 
3197 		/*
3198 		 * Connect to the backend.
3199 		 */
3200 		xnf_be_connect(xnfp);
3201 
3202 		/*
3203 		 * Our MAC address as discovered by xnf_read_config().
3204 		 */
3205 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3206 
3207 		/*
3208 		 * We do not know if some features such as LSO are supported
3209 		 * until we connect to the backend. We request the MAC layer
3210 		 * to poll our capabilities again.
3211 		 */
3212 		mac_capab_update(xnfp->xnf_mh);
3213 
3214 		break;
3215 
3216 	case XenbusStateConnected:
3217 		mutex_enter(&xnfp->xnf_rxlock);
3218 		mutex_enter(&xnfp->xnf_txlock);
3219 
3220 		xnfp->xnf_connected = B_TRUE;
3221 		/*
3222 		 * Wake up any threads waiting to send data to
3223 		 * backend.
3224 		 */
3225 		cv_broadcast(&xnfp->xnf_cv_state);
3226 
3227 		mutex_exit(&xnfp->xnf_txlock);
3228 		mutex_exit(&xnfp->xnf_rxlock);
3229 
3230 		/*
3231 		 * Kick the peer in case it missed any transmit
3232 		 * requests in the TX ring.
3233 		 */
3234 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
3235 
3236 		/*
3237 		 * There may already be completed receive requests in
3238 		 * the ring sent by backend after it gets connected
3239 		 * but before we see its state change here, so we call
3240 		 * xnf_intr() to handle them, if any.
3241 		 */
3242 		(void) xnf_intr((caddr_t)xnfp);
3243 
3244 		/*
3245 		 * Mark the link up now that we are connected.
3246 		 */
3247 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3248 
3249 		/*
3250 		 * Tell the backend about the multicast addresses in
3251 		 * which we are interested.
3252 		 */
3253 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3254 
3255 		break;
3256 
3257 	default:
3258 		break;
3259 	}
3260 }
3261