xref: /freebsd/sys/dev/xen/netfront/netfront.c (revision 640235e2c2ba32947f7c59d168437ffa1280f1e6)
1 /*-
2  * Copyright (c) 2004-2006 Kip Macy
3  * Copyright (c) 2015 Wei Liu <wei.liu2@citrix.com>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 
34 #include <sys/param.h>
35 #include <sys/sockio.h>
36 #include <sys/limits.h>
37 #include <sys/mbuf.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
43 #include <sys/taskqueue.h>
44 
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/if_arp.h>
48 #include <net/ethernet.h>
49 #include <net/if_media.h>
50 #include <net/bpf.h>
51 #include <net/if_types.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/ip.h>
55 #include <netinet/if_ether.h>
56 #include <netinet/tcp.h>
57 #include <netinet/tcp_lro.h>
58 
59 #include <vm/vm.h>
60 #include <vm/pmap.h>
61 
62 #include <sys/bus.h>
63 
64 #include <xen/xen-os.h>
65 #include <xen/hypervisor.h>
66 #include <xen/xen_intr.h>
67 #include <xen/gnttab.h>
68 #include <xen/interface/memory.h>
69 #include <xen/interface/io/netif.h>
70 #include <xen/xenbus/xenbusvar.h>
71 
72 #include "xenbus_if.h"
73 
74 /* Features supported by all backends.  TSO and LRO can be negotiated */
75 #define XN_CSUM_FEATURES	(CSUM_TCP | CSUM_UDP)
76 
77 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
78 #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
79 
80 #define NET_RX_SLOTS_MIN (XEN_NETIF_NR_SLOTS_MIN + 1)
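/*
 * Worked example (informational only): __RING_SIZE() computes how many
 * request/response entries fit in one page after the shared-ring header
 * and rounds that down to a power of two.  Assuming the usual 4 KiB
 * PAGE_SIZE, both NET_TX_RING_SIZE and NET_RX_RING_SIZE come out to 256
 * slots.
 */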
81 
82 /*
83  * Should the driver do LRO on the RX end?
84  *  This can be toggled on the fly, but the
85  *  interface must be reset (down/up) for it
86  *  to take effect.
87  */
88 static int xn_enable_lro = 1;
89 TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro);
90 
91 /*
92  * Number of pairs of queues.
93  */
94 static unsigned long xn_num_queues = 4;
95 TUNABLE_ULONG("hw.xn.num_queues", &xn_num_queues);
96 
97 /**
98  * \brief The maximum allowed data fragments in a single transmit
99  *        request.
100  *
101  * This limit is imposed by the backend driver.  We assume here that
102  * we are dealing with a Linux driver domain and have set our limit
103  * to mirror the Linux MAX_SKB_FRAGS constant.
104  */
105 #define	MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)
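/*
 * For example, with 4 KiB pages the expression above evaluates to
 * 65536 / 4096 + 2 = 18 fragments per transmit request.
 */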
106 
107 #define RX_COPY_THRESHOLD 256
108 
109 #define net_ratelimit() 0
110 
111 struct netfront_rxq;
112 struct netfront_txq;
113 struct netfront_info;
114 struct netfront_rx_info;
115 
116 static void xn_txeof(struct netfront_txq *);
117 static void xn_rxeof(struct netfront_rxq *);
118 static void xn_alloc_rx_buffers(struct netfront_rxq *);
119 static void xn_alloc_rx_buffers_callout(void *arg);
120 
121 static void xn_release_rx_bufs(struct netfront_rxq *);
122 static void xn_release_tx_bufs(struct netfront_txq *);
123 
124 static void xn_rxq_intr(struct netfront_rxq *);
125 static void xn_txq_intr(struct netfront_txq *);
126 static void xn_intr(void *);
127 static inline int xn_count_frags(struct mbuf *m);
128 static int xn_assemble_tx_request(struct netfront_txq *, struct mbuf *);
129 static int xn_ioctl(struct ifnet *, u_long, caddr_t);
130 static void xn_ifinit_locked(struct netfront_info *);
131 static void xn_ifinit(void *);
132 static void xn_stop(struct netfront_info *);
133 static void xn_query_features(struct netfront_info *np);
134 static int xn_configure_features(struct netfront_info *np);
135 static void netif_free(struct netfront_info *info);
136 static int netfront_detach(device_t dev);
137 
138 static int xn_txq_mq_start_locked(struct netfront_txq *, struct mbuf *);
139 static int xn_txq_mq_start(struct ifnet *, struct mbuf *);
140 
141 static int talk_to_backend(device_t dev, struct netfront_info *info);
142 static int create_netdev(device_t dev);
143 static void netif_disconnect_backend(struct netfront_info *info);
144 static int setup_device(device_t dev, struct netfront_info *info,
145     unsigned long);
146 static int xn_ifmedia_upd(struct ifnet *ifp);
147 static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
148 
149 static int xn_connect(struct netfront_info *);
150 static void xn_kick_rings(struct netfront_info *);
151 
152 static int xn_get_responses(struct netfront_rxq *,
153     struct netfront_rx_info *, RING_IDX, RING_IDX *,
154     struct mbuf **);
155 
156 #define virt_to_mfn(x) (vtophys(x) >> PAGE_SHIFT)
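/*
 * Note: vtophys() returns the physical address backing a kernel virtual
 * address; shifting it right by PAGE_SHIFT yields the frame number that
 * is handed to the grant-table and ring-setup code below.
 */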
157 
158 #define INVALID_P2M_ENTRY (~0UL)
159 
160 struct xn_rx_stats
161 {
162 	u_long	rx_packets;	/* total packets received	*/
163 	u_long	rx_bytes;	/* total bytes received 	*/
164 	u_long	rx_errors;	/* bad packets received		*/
165 };
166 
167 struct xn_tx_stats
168 {
169 	u_long	tx_packets;	/* total packets transmitted	*/
170 	u_long	tx_bytes;	/* total bytes transmitted	*/
171 	u_long	tx_errors;	/* packet transmit problems	*/
172 };
173 
174 #define XN_QUEUE_NAME_LEN  8	/* xn{t,r}x_%u, allow for two digits */
175 struct netfront_rxq {
176 	struct netfront_info 	*info;
177 	u_int			id;
178 	char			name[XN_QUEUE_NAME_LEN];
179 	struct mtx		lock;
180 
181 	int			ring_ref;
182 	netif_rx_front_ring_t 	ring;
183 	xen_intr_handle_t	xen_intr_handle;
184 
185 	grant_ref_t 		gref_head;
186 	grant_ref_t 		grant_ref[NET_RX_RING_SIZE + 1];
187 
188 	struct mbuf		*mbufs[NET_RX_RING_SIZE + 1];
189 
190 	struct lro_ctrl		lro;
191 
192 	struct callout		rx_refill;
193 
194 	struct xn_rx_stats	stats;
195 };
196 
197 struct netfront_txq {
198 	struct netfront_info 	*info;
199 	u_int 			id;
200 	char			name[XN_QUEUE_NAME_LEN];
201 	struct mtx		lock;
202 
203 	int			ring_ref;
204 	netif_tx_front_ring_t	ring;
205 	xen_intr_handle_t 	xen_intr_handle;
206 
207 	grant_ref_t		gref_head;
208 	grant_ref_t		grant_ref[NET_TX_RING_SIZE + 1];
209 
210 	struct mbuf		*mbufs[NET_TX_RING_SIZE + 1];
211 	int			mbufs_cnt;
212 	struct buf_ring		*br;
213 
214 	struct taskqueue 	*tq;
215 	struct task       	defrtask;
216 
217 	bool			full;
218 
219 	struct xn_tx_stats	stats;
220 };
221 
222 struct netfront_info {
223 	struct ifnet 		*xn_ifp;
224 
225 	struct mtx   		sc_lock;
226 
227 	u_int  num_queues;
228 	struct netfront_rxq 	*rxq;
229 	struct netfront_txq 	*txq;
230 
231 	u_int			carrier;
232 	u_int			maxfrags;
233 
234 	device_t		xbdev;
235 	uint8_t			mac[ETHER_ADDR_LEN];
236 
237 	int			xn_if_flags;
238 
239 	struct ifmedia		sc_media;
240 
241 	bool			xn_reset;
242 };
243 
244 struct netfront_rx_info {
245 	struct netif_rx_response rx;
246 	struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
247 };
248 
249 #define XN_RX_LOCK(_q)         mtx_lock(&(_q)->lock)
250 #define XN_RX_UNLOCK(_q)       mtx_unlock(&(_q)->lock)
251 
252 #define XN_TX_LOCK(_q)         mtx_lock(&(_q)->lock)
253 #define XN_TX_TRYLOCK(_q)      mtx_trylock(&(_q)->lock)
254 #define XN_TX_UNLOCK(_q)       mtx_unlock(&(_q)->lock)
255 
256 #define XN_LOCK(_sc)           mtx_lock(&(_sc)->sc_lock);
257 #define XN_UNLOCK(_sc)         mtx_unlock(&(_sc)->sc_lock);
258 
259 #define XN_LOCK_ASSERT(_sc)    mtx_assert(&(_sc)->sc_lock, MA_OWNED);
260 #define XN_RX_LOCK_ASSERT(_q)  mtx_assert(&(_q)->lock, MA_OWNED);
261 #define XN_TX_LOCK_ASSERT(_q)  mtx_assert(&(_q)->lock, MA_OWNED);
262 
263 #define netfront_carrier_on(netif)	((netif)->carrier = 1)
264 #define netfront_carrier_off(netif)	((netif)->carrier = 0)
265 #define netfront_carrier_ok(netif)	((netif)->carrier)
266 
267 /* Access macros for acquiring/freeing slots in the per-queue mbufs[] freelists. */
268 
269 static inline void
270 add_id_to_freelist(struct mbuf **list, uintptr_t id)
271 {
272 
273 	KASSERT(id != 0,
274 		("%s: the head item (0) must always be free.", __func__));
275 	list[id] = list[0];
276 	list[0]  = (struct mbuf *)id;
277 }
278 
279 static inline unsigned short
280 get_id_from_freelist(struct mbuf **list)
281 {
282 	uintptr_t id;
283 
284 	id = (uintptr_t)list[0];
285 	KASSERT(id != 0,
286 		("%s: the head item (0) must always remain free.", __func__));
287 	list[0] = list[id];
288 	return (id);
289 }
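
/*
 * Informal sketch of the freelist encoding used above: the per-queue
 * mbufs[] array does double duty.  Slot 0 is the list head, entries
 * whose value is a small integer (at most the ring size) are links to
 * the next free slot, and entries holding real pointers are in-flight
 * mbufs.  Starting from the initial state built in setup_txqs() (slot i
 * links to i + 1), two get_id_from_freelist() calls hand out ids 1 and
 * 2, and a subsequent add_id_to_freelist(list, 1) pushes id 1 back onto
 * the head of the list.
 */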
290 
291 static inline int
292 xn_rxidx(RING_IDX idx)
293 {
294 
295 	return (idx & (NET_RX_RING_SIZE - 1));
296 }
297 
298 static inline struct mbuf *
299 xn_get_rx_mbuf(struct netfront_rxq *rxq, RING_IDX ri)
300 {
301 	int i;
302 	struct mbuf *m;
303 
304 	i = xn_rxidx(ri);
305 	m = rxq->mbufs[i];
306 	rxq->mbufs[i] = NULL;
307 	return (m);
308 }
309 
310 static inline grant_ref_t
311 xn_get_rx_ref(struct netfront_rxq *rxq, RING_IDX ri)
312 {
313 	int i = xn_rxidx(ri);
314 	grant_ref_t ref = rxq->grant_ref[i];
315 
316 	KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n"));
317 	rxq->grant_ref[i] = GRANT_REF_INVALID;
318 	return (ref);
319 }
320 
321 #define IPRINTK(fmt, args...) \
322     printf("[XEN] " fmt, ##args)
323 #ifdef INVARIANTS
324 #define WPRINTK(fmt, args...) \
325     printf("[XEN] " fmt, ##args)
326 #else
327 #define WPRINTK(fmt, args...)
328 #endif
329 #ifdef DEBUG
330 #define DPRINTK(fmt, args...) \
331     printf("[XEN] %s: " fmt, __func__, ##args)
332 #else
333 #define DPRINTK(fmt, args...)
334 #endif
335 
336 /**
337  * Read the 'mac' node at the given device's node in the store, and parse that
338  * as colon-separated octets, placing the result in the given mac array.  mac
339  * must be a preallocated array of length ETHER_ADDR_LEN.
340  * Return 0 on success, or errno on error.
341  */
342 static int
343 xen_net_read_mac(device_t dev, uint8_t mac[])
344 {
345 	int error, i;
346 	char *s, *e, *macstr;
347 	const char *path;
348 
349 	path = xenbus_get_node(dev);
350 	error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr);
351 	if (error == ENOENT) {
352 		/*
353 		 * Deal with missing mac XenStore nodes on devices with
354 		 * HVM emulation (the 'ioemu' configuration attribute)
355 		 * enabled.
356 		 *
357 		 * The HVM emulator may execute in a stub device model
358 		 * domain which lacks the permission, only given to Dom0,
359 		 * to update the guest's XenStore tree.  For this reason,
360 		 * the HVM emulator doesn't even attempt to write the
361 		 * front-side mac node, even when operating in Dom0.
362 		 * However, there should always be a mac listed in the
363 		 * backend tree.  Fall back to this version if our query
364 		 * of the front side XenStore location doesn't find
365 		 * anything.
366 		 */
367 		path = xenbus_get_otherend_path(dev);
368 		error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr);
369 	}
370 	if (error != 0) {
371 		xenbus_dev_fatal(dev, error, "parsing %s/mac", path);
372 		return (error);
373 	}
374 
375 	s = macstr;
376 	for (i = 0; i < ETHER_ADDR_LEN; i++) {
377 		mac[i] = strtoul(s, &e, 16);
378 		if (s == e || (e[0] != ':' && e[0] != 0)) {
379 			free(macstr, M_XENBUS);
380 			return (ENOENT);
381 		}
382 		s = &e[1];
383 	}
384 	free(macstr, M_XENBUS);
385 	return (0);
386 }
387 
388 /**
389  * Entry point to this code when a new device is created.  Allocate the basic
390  * structures and the ring buffers for communication with the backend, and
391  * inform the backend of the appropriate details for those.  Switch to
392  * Connected state.
393  */
394 static int
395 netfront_probe(device_t dev)
396 {
397 
398 	if (xen_hvm_domain() && xen_disable_pv_nics != 0)
399 		return (ENXIO);
400 
401 	if (!strcmp(xenbus_get_type(dev), "vif")) {
402 		device_set_desc(dev, "Virtual Network Interface");
403 		return (0);
404 	}
405 
406 	return (ENXIO);
407 }
408 
409 static int
410 netfront_attach(device_t dev)
411 {
412 	int err;
413 
414 	err = create_netdev(dev);
415 	if (err != 0) {
416 		xenbus_dev_fatal(dev, err, "creating netdev");
417 		return (err);
418 	}
419 
420 	SYSCTL_ADD_INT(device_get_sysctl_ctx(dev),
421 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
422 	    OID_AUTO, "enable_lro", CTLFLAG_RW,
423 	    &xn_enable_lro, 0, "Large Receive Offload");
424 
425 	SYSCTL_ADD_ULONG(device_get_sysctl_ctx(dev),
426 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
427 	    OID_AUTO, "num_queues", CTLFLAG_RD,
428 	    &xn_num_queues, "Number of pairs of queues");
429 
430 	return (0);
431 }
432 
433 static int
434 netfront_suspend(device_t dev)
435 {
436 	struct netfront_info *np = device_get_softc(dev);
437 	u_int i;
438 
439 	for (i = 0; i < np->num_queues; i++) {
440 		XN_RX_LOCK(&np->rxq[i]);
441 		XN_TX_LOCK(&np->txq[i]);
442 	}
443 	netfront_carrier_off(np);
444 	for (i = 0; i < np->num_queues; i++) {
445 		XN_RX_UNLOCK(&np->rxq[i]);
446 		XN_TX_UNLOCK(&np->txq[i]);
447 	}
448 	return (0);
449 }
450 
451 /**
452  * We are reconnecting to the backend, due to a suspend/resume, or a backend
453  * driver restart.  We tear down our netif structure and recreate it, but
454  * leave the device-layer structures intact so that this is transparent to the
455  * rest of the kernel.
456  */
457 static int
458 netfront_resume(device_t dev)
459 {
460 	struct netfront_info *info = device_get_softc(dev);
461 
462 	netif_disconnect_backend(info);
463 	return (0);
464 }
465 
466 static int
467 write_queue_xenstore_keys(device_t dev,
468     struct netfront_rxq *rxq,
469     struct netfront_txq *txq,
470     struct xs_transaction *xst, bool hierarchy)
471 {
472 	int err;
473 	const char *message;
474 	const char *node = xenbus_get_node(dev);
475 	char *path;
476 	size_t path_size;
477 
478 	KASSERT(rxq->id == txq->id, ("Mismatch between RX and TX queue ids"));
479 	/* Split event channel support is not implemented yet. */
480 	KASSERT(rxq->xen_intr_handle == txq->xen_intr_handle,
481 	    ("Split event channels are not supported"));
482 
483 	if (hierarchy) {
484 		path_size = strlen(node) + 10;
485 		path = malloc(path_size, M_DEVBUF, M_WAITOK|M_ZERO);
486 		snprintf(path, path_size, "%s/queue-%u", node, rxq->id);
487 	} else {
488 		path_size = strlen(node) + 1;
489 		path = malloc(path_size, M_DEVBUF, M_WAITOK|M_ZERO);
490 		snprintf(path, path_size, "%s", node);
491 	}
492 
493 	err = xs_printf(*xst, path, "tx-ring-ref","%u", txq->ring_ref);
494 	if (err != 0) {
495 		message = "writing tx ring-ref";
496 		goto error;
497 	}
498 	err = xs_printf(*xst, path, "rx-ring-ref","%u", rxq->ring_ref);
499 	if (err != 0) {
500 		message = "writing rx ring-ref";
501 		goto error;
502 	}
503 	err = xs_printf(*xst, path, "event-channel", "%u",
504 	    xen_intr_port(rxq->xen_intr_handle));
505 	if (err != 0) {
506 		message = "writing event-channel";
507 		goto error;
508 	}
509 
510 	free(path, M_DEVBUF);
511 
512 	return (0);
513 
514 error:
515 	free(path, M_DEVBUF);
516 	xenbus_dev_fatal(dev, err, "%s", message);
517 
518 	return (err);
519 }
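
/*
 * Example of the resulting XenStore layout (the paths are illustrative;
 * the real frontend node is whatever xenbus_get_node() returns, e.g.
 * device/vif/0).  With multiple queues, "hierarchy" is true and the keys
 * land in per-queue subdirectories:
 *
 *	device/vif/0/queue-0/tx-ring-ref   = "<grant ref>"
 *	device/vif/0/queue-0/rx-ring-ref   = "<grant ref>"
 *	device/vif/0/queue-0/event-channel = "<port>"
 *
 * With a single queue the same three keys are written directly under the
 * device node.
 */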
520 
521 /* Common code used when first setting up, and when resuming. */
522 static int
523 talk_to_backend(device_t dev, struct netfront_info *info)
524 {
525 	const char *message;
526 	struct xs_transaction xst;
527 	const char *node = xenbus_get_node(dev);
528 	int err;
529 	unsigned long num_queues, max_queues = 0;
530 	unsigned int i;
531 
532 	err = xen_net_read_mac(dev, info->mac);
533 	if (err != 0) {
534 		xenbus_dev_fatal(dev, err, "parsing %s/mac", node);
535 		goto out;
536 	}
537 
538 	err = xs_scanf(XST_NIL, xenbus_get_otherend_path(info->xbdev),
539 	    "multi-queue-max-queues", NULL, "%lu", &max_queues);
540 	if (err != 0)
541 		max_queues = 1;
542 	num_queues = xn_num_queues;
543 	if (num_queues > max_queues)
544 		num_queues = max_queues;
545 
546 	err = setup_device(dev, info, num_queues);
547 	if (err != 0)
548 		goto out;
549 
550  again:
551 	err = xs_transaction_start(&xst);
552 	if (err != 0) {
553 		xenbus_dev_fatal(dev, err, "starting transaction");
554 		goto free;
555 	}
556 
557 	if (info->num_queues == 1) {
558 		err = write_queue_xenstore_keys(dev, &info->rxq[0],
559 		    &info->txq[0], &xst, false);
560 		if (err != 0)
561 			goto abort_transaction_no_def_error;
562 	} else {
563 		err = xs_printf(xst, node, "multi-queue-num-queues",
564 		    "%u", info->num_queues);
565 		if (err != 0) {
566 			message = "writing multi-queue-num-queues";
567 			goto abort_transaction;
568 		}
569 
570 		for (i = 0; i < info->num_queues; i++) {
571 			err = write_queue_xenstore_keys(dev, &info->rxq[i],
572 			    &info->txq[i], &xst, true);
573 			if (err != 0)
574 				goto abort_transaction_no_def_error;
575 		}
576 	}
577 
578 	err = xs_printf(xst, node, "request-rx-copy", "%u", 1);
579 	if (err != 0) {
580 		message = "writing request-rx-copy";
581 		goto abort_transaction;
582 	}
583 	err = xs_printf(xst, node, "feature-rx-notify", "%d", 1);
584 	if (err != 0) {
585 		message = "writing feature-rx-notify";
586 		goto abort_transaction;
587 	}
588 	err = xs_printf(xst, node, "feature-sg", "%d", 1);
589 	if (err != 0) {
590 		message = "writing feature-sg";
591 		goto abort_transaction;
592 	}
593 	if ((info->xn_ifp->if_capenable & IFCAP_LRO) != 0) {
594 		err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1);
595 		if (err != 0) {
596 			message = "writing feature-gso-tcpv4";
597 			goto abort_transaction;
598 		}
599 	}
600 	if ((info->xn_ifp->if_capenable & IFCAP_RXCSUM) == 0) {
601 		err = xs_printf(xst, node, "feature-no-csum-offload", "%d", 1);
602 		if (err != 0) {
603 			message = "writing feature-no-csum-offload";
604 			goto abort_transaction;
605 		}
606 	}
607 
608 	err = xs_transaction_end(xst, 0);
609 	if (err != 0) {
610 		if (err == EAGAIN)
611 			goto again;
612 		xenbus_dev_fatal(dev, err, "completing transaction");
613 		goto free;
614 	}
615 
616 	return (0);
617 
618  abort_transaction:
619 	xenbus_dev_fatal(dev, err, "%s", message);
620  abort_transaction_no_def_error:
621 	xs_transaction_end(xst, 1);
622  free:
623 	netif_free(info);
624  out:
625 	return (err);
626 }
627 
628 static void
629 xn_rxq_intr(struct netfront_rxq *rxq)
630 {
631 
632 	XN_RX_LOCK(rxq);
633 	xn_rxeof(rxq);
634 	XN_RX_UNLOCK(rxq);
635 }
636 
637 static void
638 xn_txq_start(struct netfront_txq *txq)
639 {
640 	struct netfront_info *np = txq->info;
641 	struct ifnet *ifp = np->xn_ifp;
642 
643 	XN_TX_LOCK_ASSERT(txq);
644 	if (!drbr_empty(ifp, txq->br))
645 		xn_txq_mq_start_locked(txq, NULL);
646 }
647 
648 static void
649 xn_txq_intr(struct netfront_txq *txq)
650 {
651 
652 	XN_TX_LOCK(txq);
653 	if (RING_HAS_UNCONSUMED_RESPONSES(&txq->ring))
654 		xn_txeof(txq);
655 	xn_txq_start(txq);
656 	XN_TX_UNLOCK(txq);
657 }
658 
659 static void
660 xn_txq_tq_deferred(void *xtxq, int pending)
661 {
662 	struct netfront_txq *txq = xtxq;
663 
664 	XN_TX_LOCK(txq);
665 	xn_txq_start(txq);
666 	XN_TX_UNLOCK(txq);
667 }
668 
669 static void
670 disconnect_rxq(struct netfront_rxq *rxq)
671 {
672 
673 	xn_release_rx_bufs(rxq);
674 	gnttab_free_grant_references(rxq->gref_head);
675 	gnttab_end_foreign_access(rxq->ring_ref, NULL);
676 	/*
677 	 * No split event channel support at the moment; the handle will
678 	 * be unbound in the tx path, so there is no need to call
679 	 * xen_intr_unbind here, but we do want to reset the handle to 0.
680 	 */
681 	rxq->xen_intr_handle = 0;
682 }
683 
684 static void
685 destroy_rxq(struct netfront_rxq *rxq)
686 {
687 
688 	callout_drain(&rxq->rx_refill);
689 	free(rxq->ring.sring, M_DEVBUF);
690 }
691 
692 static void
693 destroy_rxqs(struct netfront_info *np)
694 {
695 	int i;
696 
697 	for (i = 0; i < np->num_queues; i++)
698 		destroy_rxq(&np->rxq[i]);
699 
700 	free(np->rxq, M_DEVBUF);
701 	np->rxq = NULL;
702 }
703 
704 static int
705 setup_rxqs(device_t dev, struct netfront_info *info,
706 	   unsigned long num_queues)
707 {
708 	int q, i;
709 	int error;
710 	netif_rx_sring_t *rxs;
711 	struct netfront_rxq *rxq;
712 
713 	info->rxq = malloc(sizeof(struct netfront_rxq) * num_queues,
714 	    M_DEVBUF, M_WAITOK|M_ZERO);
715 
716 	for (q = 0; q < num_queues; q++) {
717 		rxq = &info->rxq[q];
718 
719 		rxq->id = q;
720 		rxq->info = info;
721 		rxq->ring_ref = GRANT_REF_INVALID;
722 		rxq->ring.sring = NULL;
723 		snprintf(rxq->name, XN_QUEUE_NAME_LEN, "xnrx_%u", q);
724 		mtx_init(&rxq->lock, rxq->name, "netfront receive lock",
725 		    MTX_DEF);
726 
727 		for (i = 0; i <= NET_RX_RING_SIZE; i++) {
728 			rxq->mbufs[i] = NULL;
729 			rxq->grant_ref[i] = GRANT_REF_INVALID;
730 		}
731 
732 		/* Start resource allocation. */
733 
734 		if (gnttab_alloc_grant_references(NET_RX_RING_SIZE,
735 		    &rxq->gref_head) != 0) {
736 			device_printf(dev, "failed to allocate rx grant refs\n");
737 			error = ENOMEM;
738 			goto fail;
739 		}
740 
741 		rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF,
742 		    M_WAITOK|M_ZERO);
743 		SHARED_RING_INIT(rxs);
744 		FRONT_RING_INIT(&rxq->ring, rxs, PAGE_SIZE);
745 
746 		error = xenbus_grant_ring(dev, virt_to_mfn(rxs),
747 		    &rxq->ring_ref);
748 		if (error != 0) {
749 			device_printf(dev, "failed to grant rx ring page\n");
750 			goto fail_grant_ring;
751 		}
752 
753 		callout_init(&rxq->rx_refill, 1);
754 	}
755 
756 	return (0);
757 
758 fail_grant_ring:
759 	gnttab_free_grant_references(rxq->gref_head);
760 	free(rxq->ring.sring, M_DEVBUF);
761 fail:
762 	for (; q >= 0; q--) {
763 		disconnect_rxq(&info->rxq[q]);
764 		destroy_rxq(&info->rxq[q]);
765 	}
766 
767 	free(info->rxq, M_DEVBUF);
768 	return (error);
769 }
770 
771 static void
772 disconnect_txq(struct netfront_txq *txq)
773 {
774 
775 	xn_release_tx_bufs(txq);
776 	gnttab_free_grant_references(txq->gref_head);
777 	gnttab_end_foreign_access(txq->ring_ref, NULL);
778 	xen_intr_unbind(&txq->xen_intr_handle);
779 }
780 
781 static void
782 destroy_txq(struct netfront_txq *txq)
783 {
784 
785 	free(txq->ring.sring, M_DEVBUF);
786 	buf_ring_free(txq->br, M_DEVBUF);
787 	taskqueue_drain_all(txq->tq);
788 	taskqueue_free(txq->tq);
789 }
790 
791 static void
792 destroy_txqs(struct netfront_info *np)
793 {
794 	int i;
795 
796 	for (i = 0; i < np->num_queues; i++)
797 		destroy_txq(&np->txq[i]);
798 
799 	free(np->txq, M_DEVBUF);
800 	np->txq = NULL;
801 }
802 
803 static int
804 setup_txqs(device_t dev, struct netfront_info *info,
805 	   unsigned long num_queues)
806 {
807 	int q, i;
808 	int error;
809 	netif_tx_sring_t *txs;
810 	struct netfront_txq *txq;
811 
812 	info->txq = malloc(sizeof(struct netfront_txq) * num_queues,
813 	    M_DEVBUF, M_WAITOK|M_ZERO);
814 
815 	for (q = 0; q < num_queues; q++) {
816 		txq = &info->txq[q];
817 
818 		txq->id = q;
819 		txq->info = info;
820 
821 		txq->ring_ref = GRANT_REF_INVALID;
822 		txq->ring.sring = NULL;
823 
824 		snprintf(txq->name, XN_QUEUE_NAME_LEN, "xntx_%u", q);
825 
826 		mtx_init(&txq->lock, txq->name, "netfront transmit lock",
827 		    MTX_DEF);
828 
829 		for (i = 0; i <= NET_TX_RING_SIZE; i++) {
830 			txq->mbufs[i] = (void *) ((u_long) i+1);
831 			txq->grant_ref[i] = GRANT_REF_INVALID;
832 		}
833 		txq->mbufs[NET_TX_RING_SIZE] = (void *)0;
834 
835 		/* Start resource allocation. */
836 
837 		if (gnttab_alloc_grant_references(NET_TX_RING_SIZE,
838 		    &txq->gref_head) != 0) {
839 			device_printf(dev, "failed to allocate tx grant refs\n");
840 			error = ENOMEM;
841 			goto fail;
842 		}
843 
844 		txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF,
845 		    M_WAITOK|M_ZERO);
846 		SHARED_RING_INIT(txs);
847 		FRONT_RING_INIT(&txq->ring, txs, PAGE_SIZE);
848 
849 		error = xenbus_grant_ring(dev, virt_to_mfn(txs),
850 		    &txq->ring_ref);
851 		if (error != 0) {
852 			device_printf(dev, "failed to grant tx ring\n");
853 			goto fail_grant_ring;
854 		}
855 
856 		txq->br = buf_ring_alloc(NET_TX_RING_SIZE, M_DEVBUF,
857 		    M_WAITOK, &txq->lock);
858 		TASK_INIT(&txq->defrtask, 0, xn_txq_tq_deferred, txq);
859 
860 		txq->tq = taskqueue_create(txq->name, M_WAITOK,
861 		    taskqueue_thread_enqueue, &txq->tq);
862 
863 		error = taskqueue_start_threads(&txq->tq, 1, PI_NET,
864 		    "%s txq %d", device_get_nameunit(dev), txq->id);
865 		if (error != 0) {
866 			device_printf(dev, "failed to start tx taskq %d\n",
867 			    txq->id);
868 			goto fail_start_thread;
869 		}
870 
871 		error = xen_intr_alloc_and_bind_local_port(dev,
872 		    xenbus_get_otherend_id(dev), /* filter */ NULL, xn_intr,
873 		    &info->txq[q], INTR_TYPE_NET | INTR_MPSAFE | INTR_ENTROPY,
874 		    &txq->xen_intr_handle);
875 
876 		if (error != 0) {
877 			device_printf(dev, "xen_intr_alloc_and_bind_local_port failed\n");
878 			goto fail_bind_port;
879 		}
880 	}
881 
882 	return (0);
883 
884 fail_bind_port:
885 	taskqueue_drain_all(txq->tq);
886 fail_start_thread:
887 	buf_ring_free(txq->br, M_DEVBUF);
888 	taskqueue_free(txq->tq);
889 	gnttab_end_foreign_access(txq->ring_ref, NULL);
890 fail_grant_ring:
891 	gnttab_free_grant_references(txq->gref_head);
892 	free(txq->ring.sring, M_DEVBUF);
893 fail:
894 	for (; q >= 0; q--) {
895 		disconnect_txq(&info->txq[q]);
896 		destroy_txq(&info->txq[q]);
897 	}
898 
899 	free(info->txq, M_DEVBUF);
900 	return (error);
901 }
902 
903 static int
904 setup_device(device_t dev, struct netfront_info *info,
905     unsigned long num_queues)
906 {
907 	int error;
908 	int q;
909 
910 	if (info->txq)
911 		destroy_txqs(info);
912 
913 	if (info->rxq)
914 		destroy_rxqs(info);
915 
916 	info->num_queues = 0;
917 
918 	error = setup_rxqs(dev, info, num_queues);
919 	if (error != 0)
920 		goto out;
921 	error = setup_txqs(dev, info, num_queues);
922 	if (error != 0)
923 		goto out;
924 
925 	info->num_queues = num_queues;
926 
927 	/* No split event channel at the moment. */
928 	for (q = 0; q < num_queues; q++)
929 		info->rxq[q].xen_intr_handle = info->txq[q].xen_intr_handle;
930 
931 	return (0);
932 
933 out:
934 	KASSERT(error != 0, ("Error path taken without providing an error code"));
935 	return (error);
936 }
937 
938 #ifdef INET
939 /**
940  * If this interface has an IPv4 address, send an ARP for it. This
941  * helps to get the network going again after migrating hosts.
942  */
943 static void
944 netfront_send_fake_arp(device_t dev, struct netfront_info *info)
945 {
946 	struct ifnet *ifp;
947 	struct ifaddr *ifa;
948 
949 	ifp = info->xn_ifp;
950 	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
951 		if (ifa->ifa_addr->sa_family == AF_INET) {
952 			arp_ifinit(ifp, ifa);
953 		}
954 	}
955 }
956 #endif
957 
958 /**
959  * Callback received when the backend's state changes.
960  */
961 static void
962 netfront_backend_changed(device_t dev, XenbusState newstate)
963 {
964 	struct netfront_info *sc = device_get_softc(dev);
965 
966 	DPRINTK("newstate=%d\n", newstate);
967 
968 	switch (newstate) {
969 	case XenbusStateInitialising:
970 	case XenbusStateInitialised:
971 	case XenbusStateUnknown:
972 	case XenbusStateReconfigured:
973 	case XenbusStateReconfiguring:
974 		break;
975 	case XenbusStateInitWait:
976 		if (xenbus_get_state(dev) != XenbusStateInitialising)
977 			break;
978 		if (xn_connect(sc) != 0)
979 			break;
980 		/* Switch to connected state before kicking the rings. */
981 		xenbus_set_state(sc->xbdev, XenbusStateConnected);
982 		xn_kick_rings(sc);
983 		break;
984 	case XenbusStateClosing:
985 		xenbus_set_state(dev, XenbusStateClosed);
986 		break;
987 	case XenbusStateClosed:
988 		if (sc->xn_reset) {
989 			netif_disconnect_backend(sc);
990 			xenbus_set_state(dev, XenbusStateInitialising);
991 			sc->xn_reset = false;
992 		}
993 		break;
994 	case XenbusStateConnected:
995 #ifdef INET
996 		netfront_send_fake_arp(dev, sc);
997 #endif
998 		break;
999 	}
1000 }
1001 
1002 /**
1003  * \brief Verify that there is sufficient space in the Tx ring
1004  *        buffer for a maximally sized request to be enqueued.
1005  *
1006  * A transmit request requires a transmit descriptor for each packet
1007  * fragment, plus up to 2 entries for "options" (e.g. TSO).
1008  */
1009 static inline int
1010 xn_tx_slot_available(struct netfront_txq *txq)
1011 {
1012 
1013 	return (RING_FREE_REQUESTS(&txq->ring) > (MAX_TX_REQ_FRAGS + 2));
1014 }
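
/*
 * Example of the accounting above: assuming 4 KiB pages, a maximally
 * fragmented request needs MAX_TX_REQ_FRAGS (18) descriptors plus up to
 * 2 "option" slots, so slots are reported as available only while more
 * than 20 request entries remain free on the ring.
 */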
1015 
1016 static void
1017 xn_release_tx_bufs(struct netfront_txq *txq)
1018 {
1019 	int i;
1020 
1021 	for (i = 1; i <= NET_TX_RING_SIZE; i++) {
1022 		struct mbuf *m;
1023 
1024 		m = txq->mbufs[i];
1025 
1026 		/*
1027 		 * We assume that no kernel addresses are
1028 		 * less than NET_TX_RING_SIZE.  Any entry
1029 		 * in the table that is below this number
1030 		 * must be an index from free-list tracking.
1031 		 */
1032 		if (((uintptr_t)m) <= NET_TX_RING_SIZE)
1033 			continue;
1034 		gnttab_end_foreign_access_ref(txq->grant_ref[i]);
1035 		gnttab_release_grant_reference(&txq->gref_head,
1036 		    txq->grant_ref[i]);
1037 		txq->grant_ref[i] = GRANT_REF_INVALID;
1038 		add_id_to_freelist(txq->mbufs, i);
1039 		txq->mbufs_cnt--;
1040 		if (txq->mbufs_cnt < 0) {
1041 			panic("%s: tx_chain_cnt must be >= 0", __func__);
1042 		}
1043 		m_free(m);
1044 	}
1045 }
1046 
1047 static struct mbuf *
1048 xn_alloc_one_rx_buffer(struct netfront_rxq *rxq)
1049 {
1050 	struct mbuf *m;
1051 
1052 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
1053 	if (m == NULL)
1054 		return (NULL);
1055 	m->m_len = m->m_pkthdr.len = MJUMPAGESIZE;
1056 
1057 	return (m);
1058 }
1059 
1060 static void
1061 xn_alloc_rx_buffers(struct netfront_rxq *rxq)
1062 {
1063 	RING_IDX req_prod;
1064 	int notify;
1065 
1066 	XN_RX_LOCK_ASSERT(rxq);
1067 
1068 	if (__predict_false(rxq->info->carrier == 0))
1069 		return;
1070 
1071 	for (req_prod = rxq->ring.req_prod_pvt;
1072 	     req_prod - rxq->ring.rsp_cons < NET_RX_RING_SIZE;
1073 	     req_prod++) {
1074 		struct mbuf *m;
1075 		unsigned short id;
1076 		grant_ref_t ref;
1077 		struct netif_rx_request *req;
1078 		unsigned long pfn;
1079 
1080 		m = xn_alloc_one_rx_buffer(rxq);
1081 		if (m == NULL)
1082 			break;
1083 
1084 		id = xn_rxidx(req_prod);
1085 
1086 		KASSERT(rxq->mbufs[id] == NULL, ("non-NULL xn_rx_chain"));
1087 		rxq->mbufs[id] = m;
1088 
1089 		ref = gnttab_claim_grant_reference(&rxq->gref_head);
1090 		KASSERT(ref != GNTTAB_LIST_END,
1091 		    ("reserved grant references exhausted"));
1092 		rxq->grant_ref[id] = ref;
1093 
1094 		pfn = atop(vtophys(mtod(m, vm_offset_t)));
1095 		req = RING_GET_REQUEST(&rxq->ring, req_prod);
1096 
1097 		gnttab_grant_foreign_access_ref(ref,
1098 		    xenbus_get_otherend_id(rxq->info->xbdev), pfn, 0);
1099 		req->id = id;
1100 		req->gref = ref;
1101 	}
1102 
1103 	rxq->ring.req_prod_pvt = req_prod;
1104 
1105 	/* Not enough requests? Try again later. */
1106 	if (req_prod - rxq->ring.rsp_cons < NET_RX_SLOTS_MIN) {
1107 		callout_reset_curcpu(&rxq->rx_refill, hz/10,
1108 		    xn_alloc_rx_buffers_callout, rxq);
1109 		return;
1110 	}
1111 
1112 	wmb();		/* barrier so the backend sees requests */
1113 
1114 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rxq->ring, notify);
1115 	if (notify)
1116 		xen_intr_signal(rxq->xen_intr_handle);
1117 }
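
/*
 * Refill flow in brief: each new mbuf is granted to the backend and
 * advertised through a request slot; once enough requests are queued the
 * private producer index is published and, if
 * RING_PUSH_REQUESTS_AND_CHECK_NOTIFY indicates the backend may be
 * waiting, an event is sent on the shared event channel.  When fewer
 * than NET_RX_SLOTS_MIN requests could be posted (e.g. due to an mbuf
 * shortage), the callout above retries later instead of notifying.
 */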
1118 
1119 static void xn_alloc_rx_buffers_callout(void *arg)
1120 {
1121 	struct netfront_rxq *rxq;
1122 
1123 	rxq = (struct netfront_rxq *)arg;
1124 	XN_RX_LOCK(rxq);
1125 	xn_alloc_rx_buffers(rxq);
1126 	XN_RX_UNLOCK(rxq);
1127 }
1128 
1129 static void
1130 xn_release_rx_bufs(struct netfront_rxq *rxq)
1131 {
1132 	int i,  ref;
1133 	struct mbuf *m;
1134 
1135 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
1136 		m = rxq->mbufs[i];
1137 
1138 		if (m == NULL)
1139 			continue;
1140 
1141 		ref = rxq->grant_ref[i];
1142 		if (ref == GRANT_REF_INVALID)
1143 			continue;
1144 
1145 		gnttab_end_foreign_access_ref(ref);
1146 		gnttab_release_grant_reference(&rxq->gref_head, ref);
1147 		rxq->mbufs[i] = NULL;
1148 		rxq->grant_ref[i] = GRANT_REF_INVALID;
1149 		m_freem(m);
1150 	}
1151 }
1152 
1153 static void
1154 xn_rxeof(struct netfront_rxq *rxq)
1155 {
1156 	struct ifnet *ifp;
1157 	struct netfront_info *np = rxq->info;
1158 #if (defined(INET) || defined(INET6))
1159 	struct lro_ctrl *lro = &rxq->lro;
1160 #endif
1161 	struct netfront_rx_info rinfo;
1162 	struct netif_rx_response *rx = &rinfo.rx;
1163 	struct netif_extra_info *extras = rinfo.extras;
1164 	RING_IDX i, rp;
1165 	struct mbuf *m;
1166 	struct mbufq mbufq_rxq, mbufq_errq;
1167 	int err, work_to_do;
1168 
1169 	do {
1170 		XN_RX_LOCK_ASSERT(rxq);
1171 		if (!netfront_carrier_ok(np))
1172 			return;
1173 
1174 		/* XXX: there should be some sane limit. */
1175 		mbufq_init(&mbufq_errq, INT_MAX);
1176 		mbufq_init(&mbufq_rxq, INT_MAX);
1177 
1178 		ifp = np->xn_ifp;
1179 
1180 		rp = rxq->ring.sring->rsp_prod;
1181 		rmb();	/* Ensure we see queued responses up to 'rp'. */
1182 
1183 		i = rxq->ring.rsp_cons;
1184 		while ((i != rp)) {
1185 			memcpy(rx, RING_GET_RESPONSE(&rxq->ring, i), sizeof(*rx));
1186 			memset(extras, 0, sizeof(rinfo.extras));
1187 
1188 			m = NULL;
1189 			err = xn_get_responses(rxq, &rinfo, rp, &i, &m);
1190 
1191 			if (__predict_false(err)) {
1192 				if (m)
1193 					(void)mbufq_enqueue(&mbufq_errq, m);
1194 				rxq->stats.rx_errors++;
1195 				continue;
1196 			}
1197 
1198 			m->m_pkthdr.rcvif = ifp;
1199 			if ((rx->flags & NETRXF_data_validated) != 0) {
1200 				/* Tell the stack the checksums are okay */
1201 				/*
1202 				 * XXX this isn't necessarily the case - need to add
1203 				 * check
1204 				 */
1205 
1206 				m->m_pkthdr.csum_flags |=
1207 					(CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID
1208 					    | CSUM_PSEUDO_HDR);
1209 				m->m_pkthdr.csum_data = 0xffff;
1210 			}
1211 			if ((rx->flags & NETRXF_extra_info) != 0 &&
1212 			    (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type ==
1213 			    XEN_NETIF_EXTRA_TYPE_GSO)) {
1214 				m->m_pkthdr.tso_segsz =
1215 				extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].u.gso.size;
1216 				m->m_pkthdr.csum_flags |= CSUM_TSO;
1217 			}
1218 
1219 			rxq->stats.rx_packets++;
1220 			rxq->stats.rx_bytes += m->m_pkthdr.len;
1221 
1222 			(void)mbufq_enqueue(&mbufq_rxq, m);
1223 			rxq->ring.rsp_cons = i;
1224 		}
1225 
1226 		mbufq_drain(&mbufq_errq);
1227 
1228 		/*
1229 		 * Pass the received packets up the stack now that ring
1230 		 * processing for this batch is complete.
1231 		 */
1232 		while ((m = mbufq_dequeue(&mbufq_rxq)) != NULL) {
1233 			if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
1234 
1235 			/* XXX: Do we really need to drop the rx lock? */
1236 			XN_RX_UNLOCK(rxq);
1237 #if (defined(INET) || defined(INET6))
1238 			/* Use LRO if possible */
1239 			if ((ifp->if_capenable & IFCAP_LRO) == 0 ||
1240 			    lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) {
1241 				/*
1242 				 * If LRO fails, pass up to the stack
1243 				 * directly.
1244 				 */
1245 				(*ifp->if_input)(ifp, m);
1246 			}
1247 #else
1248 			(*ifp->if_input)(ifp, m);
1249 #endif
1250 
1251 			XN_RX_LOCK(rxq);
1252 		}
1253 
1254 		rxq->ring.rsp_cons = i;
1255 
1256 #if (defined(INET) || defined(INET6))
1257 		/*
1258 		 * Flush any outstanding LRO work
1259 		 */
1260 		tcp_lro_flush_all(lro);
1261 #endif
1262 
1263 		xn_alloc_rx_buffers(rxq);
1264 
1265 		RING_FINAL_CHECK_FOR_RESPONSES(&rxq->ring, work_to_do);
1266 	} while (work_to_do);
1267 }
1268 
1269 static void
1270 xn_txeof(struct netfront_txq *txq)
1271 {
1272 	RING_IDX i, prod;
1273 	unsigned short id;
1274 	struct ifnet *ifp;
1275 	netif_tx_response_t *txr;
1276 	struct mbuf *m;
1277 	struct netfront_info *np = txq->info;
1278 
1279 	XN_TX_LOCK_ASSERT(txq);
1280 
1281 	if (!netfront_carrier_ok(np))
1282 		return;
1283 
1284 	ifp = np->xn_ifp;
1285 
1286 	do {
1287 		prod = txq->ring.sring->rsp_prod;
1288 		rmb(); /* Ensure we see responses up to 'prod'. */
1289 
1290 		for (i = txq->ring.rsp_cons; i != prod; i++) {
1291 			txr = RING_GET_RESPONSE(&txq->ring, i);
1292 			if (txr->status == NETIF_RSP_NULL)
1293 				continue;
1294 
1295 			if (txr->status != NETIF_RSP_OKAY) {
1296 				printf("%s: WARNING: response is %d!\n",
1297 				       __func__, txr->status);
1298 			}
1299 			id = txr->id;
1300 			m = txq->mbufs[id];
1301 			KASSERT(m != NULL, ("mbuf not found in chain"));
1302 			KASSERT((uintptr_t)m > NET_TX_RING_SIZE,
1303 				("mbuf already on the free list, but we're "
1304 				"trying to free it again!"));
1305 			M_ASSERTVALID(m);
1306 
1307 			/*
1308 			 * Increment packet count if this is the last
1309 			 * mbuf of the chain.
1310 			 */
1311 			if (!m->m_next)
1312 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1313 			if (__predict_false(gnttab_query_foreign_access(
1314 			    txq->grant_ref[id]) != 0)) {
1315 				panic("%s: grant id %u still in use by the "
1316 				    "backend", __func__, id);
1317 			}
1318 			gnttab_end_foreign_access_ref(txq->grant_ref[id]);
1319 			gnttab_release_grant_reference(
1320 				&txq->gref_head, txq->grant_ref[id]);
1321 			txq->grant_ref[id] = GRANT_REF_INVALID;
1322 
1323 			txq->mbufs[id] = NULL;
1324 			add_id_to_freelist(txq->mbufs, id);
1325 			txq->mbufs_cnt--;
1326 			m_free(m);
1327 			/* Only mark the txq active if we've freed up at least one slot to try */
1328 			ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1329 		}
1330 		txq->ring.rsp_cons = prod;
1331 
1332 		/*
1333 		 * Set a new event, then check for race with update of
1334 		 * tx_cons. Note that it is essential to schedule a
1335 		 * callback, no matter how few buffers are pending. Even if
1336 		 * there is space in the transmit ring, higher layers may
1337 		 * be blocked because too much data is outstanding: in such
1338 		 * cases notification from Xen is likely to be the only kick
1339 		 * that we'll get.
1340 		 */
1341 		txq->ring.sring->rsp_event =
1342 		    prod + ((txq->ring.sring->req_prod - prod) >> 1) + 1;
1343 
1344 		mb();
1345 	} while (prod != txq->ring.sring->rsp_prod);
1346 
1347 	if (txq->full &&
1348 	    ((txq->ring.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
1349 		txq->full = false;
1350 		xn_txq_start(txq);
1351 	}
1352 }
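
/*
 * Worked example of the rsp_event heuristic above: if prod is 100 and
 * req_prod is 140, rsp_event is set to 100 + (40 >> 1) + 1 = 121, so the
 * backend will raise the next completion event once roughly half of the
 * currently outstanding requests have been responded to.
 */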
1353 
1354 static void
1355 xn_intr(void *xsc)
1356 {
1357 	struct netfront_txq *txq = xsc;
1358 	struct netfront_info *np = txq->info;
1359 	struct netfront_rxq *rxq = &np->rxq[txq->id];
1360 
1361 	/* kick both tx and rx */
1362 	xn_rxq_intr(rxq);
1363 	xn_txq_intr(txq);
1364 }
1365 
1366 static void
1367 xn_move_rx_slot(struct netfront_rxq *rxq, struct mbuf *m,
1368     grant_ref_t ref)
1369 {
1370 	int new = xn_rxidx(rxq->ring.req_prod_pvt);
1371 
1372 	KASSERT(rxq->mbufs[new] == NULL, ("mbufs != NULL"));
1373 	rxq->mbufs[new] = m;
1374 	rxq->grant_ref[new] = ref;
1375 	RING_GET_REQUEST(&rxq->ring, rxq->ring.req_prod_pvt)->id = new;
1376 	RING_GET_REQUEST(&rxq->ring, rxq->ring.req_prod_pvt)->gref = ref;
1377 	rxq->ring.req_prod_pvt++;
1378 }
1379 
1380 static int
1381 xn_get_extras(struct netfront_rxq *rxq,
1382     struct netif_extra_info *extras, RING_IDX rp, RING_IDX *cons)
1383 {
1384 	struct netif_extra_info *extra;
1385 
1386 	int err = 0;
1387 
1388 	do {
1389 		struct mbuf *m;
1390 		grant_ref_t ref;
1391 
1392 		if (__predict_false(*cons + 1 == rp)) {
1393 			err = EINVAL;
1394 			break;
1395 		}
1396 
1397 		extra = (struct netif_extra_info *)
1398 		RING_GET_RESPONSE(&rxq->ring, ++(*cons));
1399 
1400 		if (__predict_false(!extra->type ||
1401 			extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
1402 			err = EINVAL;
1403 		} else {
1404 			memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
1405 		}
1406 
1407 		m = xn_get_rx_mbuf(rxq, *cons);
1408 		ref = xn_get_rx_ref(rxq,  *cons);
1409 		xn_move_rx_slot(rxq, m, ref);
1410 	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
1411 
1412 	return (err);
1413 }
1414 
1415 static int
1416 xn_get_responses(struct netfront_rxq *rxq,
1417     struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons,
1418     struct mbuf  **list)
1419 {
1420 	struct netif_rx_response *rx = &rinfo->rx;
1421 	struct netif_extra_info *extras = rinfo->extras;
1422 	struct mbuf *m, *m0, *m_prev;
1423 	grant_ref_t ref = xn_get_rx_ref(rxq, *cons);
1424 	RING_IDX ref_cons = *cons;
1425 	int frags = 1;
1426 	int err = 0;
1427 	u_long ret;
1428 
1429 	m0 = m = m_prev = xn_get_rx_mbuf(rxq, *cons);
1430 
1431 	if (rx->flags & NETRXF_extra_info) {
1432 		err = xn_get_extras(rxq, extras, rp, cons);
1433 	}
1434 
1435 	if (m0 != NULL) {
1436 		m0->m_pkthdr.len = 0;
1437 		m0->m_next = NULL;
1438 	}
1439 
1440 	for (;;) {
1441 #if 0
1442 		DPRINTK("rx->status=%hd rx->offset=%hu frags=%u\n",
1443 			rx->status, rx->offset, frags);
1444 #endif
1445 		if (__predict_false(rx->status < 0 ||
1446 			rx->offset + rx->status > PAGE_SIZE)) {
1447 
1448 			xn_move_rx_slot(rxq, m, ref);
1449 			if (m0 == m)
1450 				m0 = NULL;
1451 			m = NULL;
1452 			err = EINVAL;
1453 			goto next_skip_queue;
1454 		}
1455 
1456 		/*
1457 		 * This definitely indicates a bug, either in this driver or in
1458 		 * the backend driver. In the future this should flag the bad
1459 		 * situation to the system controller so it can reboot the backend.
1460 		 */
1461 		if (ref == GRANT_REF_INVALID) {
1462 			printf("%s: Bad rx response id %d.\n", __func__, rx->id);
1463 			err = EINVAL;
1464 			goto next;
1465 		}
1466 
1467 		ret = gnttab_end_foreign_access_ref(ref);
1468 		KASSERT(ret, ("Unable to end access to grant references"));
1469 
1470 		gnttab_release_grant_reference(&rxq->gref_head, ref);
1471 
1472 next:
1473 		if (m == NULL)
1474 			break;
1475 
1476 		m->m_len = rx->status;
1477 		m->m_data += rx->offset;
1478 		m0->m_pkthdr.len += rx->status;
1479 
1480 next_skip_queue:
1481 		if (!(rx->flags & NETRXF_more_data))
1482 			break;
1483 
1484 		if (*cons + frags == rp) {
1485 			if (net_ratelimit())
1486 				WPRINTK("Need more frags\n");
1487 			err = ENOENT;
1488 			printf("%s: cons %u frags %u rp %u, not enough frags\n",
1489 			       __func__, *cons, frags, rp);
1490 			break;
1491 		}
1492 		/*
1493 		 * Note that m can be NULL, if rx->status < 0 or if
1494 		 * rx->offset + rx->status > PAGE_SIZE above.
1495 		 */
1496 		m_prev = m;
1497 
1498 		rx = RING_GET_RESPONSE(&rxq->ring, *cons + frags);
1499 		m = xn_get_rx_mbuf(rxq, *cons + frags);
1500 
1501 		/*
1502 		 * m_prev == NULL can happen if rx->status < 0 or if
1503 		 * rx->offset + rx->status > PAGE_SIZE above.
1504 		 */
1505 		if (m_prev != NULL)
1506 			m_prev->m_next = m;
1507 
1508 		/*
1509 		 * m0 can be NULL if rx->status < 0 or if rx->offset +
1510 		 * rx->status > PAGE_SIZE above.
1511 		 */
1512 		if (m0 == NULL)
1513 			m0 = m;
1514 		m->m_next = NULL;
1515 		ref = xn_get_rx_ref(rxq, *cons + frags);
1516 		ref_cons = *cons + frags;
1517 		frags++;
1518 	}
1519 	*list = m0;
1520 	*cons += frags;
1521 
1522 	return (err);
1523 }
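
/*
 * In short: xn_get_responses() walks one packet's worth of responses,
 * reclaiming the grant reference for every fragment and linking the
 * corresponding mbufs into a chain headed by m0.  NETRXF_more_data marks
 * intermediate fragments, *cons is advanced by the number of ring slots
 * the packet consumed, and on error the partially built chain is still
 * returned in *list so the caller can discard it.
 */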
1524 
1525 /**
1526  * \brief Count the number of fragments in an mbuf chain.
1527  *
1528  * Surprisingly, there isn't an M* macro for this.
1529  */
1530 static inline int
1531 xn_count_frags(struct mbuf *m)
1532 {
1533 	int nfrags;
1534 
1535 	for (nfrags = 0; m != NULL; m = m->m_next)
1536 		nfrags++;
1537 
1538 	return (nfrags);
1539 }
1540 
1541 /**
1542  * Given an mbuf chain, make sure we have enough room and then push
1543  * it onto the transmit ring.
1544  */
1545 static int
1546 xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
1547 {
1548 	struct mbuf *m;
1549 	struct netfront_info *np = txq->info;
1550 	struct ifnet *ifp = np->xn_ifp;
1551 	u_int nfrags;
1552 	int otherend_id;
1553 
1554 	/**
1555 	 * Defragment the mbuf if necessary.
1556 	 */
1557 	nfrags = xn_count_frags(m_head);
1558 
1559 	/*
1560 	 * Check to see whether this request is longer than netback
1561 	 * can handle, and try to defrag it.
1562 	 */
1563 	/**
1564 	 * It is a bit lame, but the netback driver in Linux can't
1565 	 * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of
1566 	 * the Linux network stack.
1567 	 */
1568 	if (nfrags > np->maxfrags) {
1569 		m = m_defrag(m_head, M_NOWAIT);
1570 		if (!m) {
1571 			/*
1572 			 * Defrag failed, so free the mbuf and
1573 			 * therefore drop the packet.
1574 			 */
1575 			m_freem(m_head);
1576 			return (EMSGSIZE);
1577 		}
1578 		m_head = m;
1579 	}
1580 
1581 	/* Determine how many fragments now exist */
1582 	nfrags = xn_count_frags(m_head);
1583 
1584 	/*
1585 	 * Check to see whether the defragmented packet has too many
1586 	 * segments for the Linux netback driver.
1587 	 */
1588 	/**
1589 	 * The FreeBSD TCP stack, with TSO enabled, can produce a chain
1590 	 * of mbufs longer than Linux can handle.  Make sure we don't
1591 	 * pass a too-long chain over to the other side by dropping the
1592 	 * packet.  It doesn't look like there is currently a way to
1593 	 * tell the TCP stack to generate a shorter chain of packets.
1594 	 */
1595 	if (nfrags > MAX_TX_REQ_FRAGS) {
1596 #ifdef DEBUG
1597 		printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback "
1598 		       "won't be able to handle it, dropping\n",
1599 		       __func__, nfrags, MAX_TX_REQ_FRAGS);
1600 #endif
1601 		m_freem(m_head);
1602 		return (EMSGSIZE);
1603 	}
1604 
1605 	/*
1606 	 * This check should be redundant.  We've already verified that we
1607 	 * have enough slots in the ring to handle a packet of maximum
1608 	 * size, and that our packet is less than the maximum size.  Keep
1609 	 * it in here as an assert for now just to make certain that
1610 	 * chain_cnt is accurate.
1611 	 */
1612 	KASSERT((txq->mbufs_cnt + nfrags) <= NET_TX_RING_SIZE,
1613 		("%s: chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE "
1614 		 "(%d)!", __func__, (int) txq->mbufs_cnt,
1615                     (int) nfrags, (int) NET_TX_RING_SIZE));
1616 
1617 	/*
1618 	 * Start packing the mbufs in this chain into
1619 	 * the fragment pointers. Stop when we run out
1620 	 * of fragments or hit the end of the mbuf chain.
1621 	 */
1622 	m = m_head;
1623 	otherend_id = xenbus_get_otherend_id(np->xbdev);
1624 	for (m = m_head; m; m = m->m_next) {
1625 		netif_tx_request_t *tx;
1626 		uintptr_t id;
1627 		grant_ref_t ref;
1628 		u_long mfn; /* XXX Wrong type? */
1629 
1630 		tx = RING_GET_REQUEST(&txq->ring, txq->ring.req_prod_pvt);
1631 		id = get_id_from_freelist(txq->mbufs);
1632 		if (id == 0)
1633 			panic("%s: was allocated the freelist head!\n",
1634 			    __func__);
1635 		txq->mbufs_cnt++;
1636 		if (txq->mbufs_cnt > NET_TX_RING_SIZE)
1637 			panic("%s: tx_chain_cnt must be <= NET_TX_RING_SIZE\n",
1638 			    __func__);
1639 		txq->mbufs[id] = m;
1640 		tx->id = id;
1641 		ref = gnttab_claim_grant_reference(&txq->gref_head);
1642 		KASSERT((short)ref >= 0, ("Negative ref"));
1643 		mfn = virt_to_mfn(mtod(m, vm_offset_t));
1644 		gnttab_grant_foreign_access_ref(ref, otherend_id,
1645 		    mfn, GNTMAP_readonly);
1646 		tx->gref = txq->grant_ref[id] = ref;
1647 		tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1);
1648 		tx->flags = 0;
1649 		if (m == m_head) {
1650 			/*
1651 			 * The first fragment has the entire packet
1652 			 * size, subsequent fragments have just the
1653 			 * fragment size. The backend works out the
1654 			 * true size of the first fragment by
1655 			 * subtracting the sizes of the other
1656 			 * fragments.
1657 			 */
1658 			tx->size = m->m_pkthdr.len;
1659 
1660 			/*
1661 			 * The first fragment contains the checksum flags
1662 			 * and is optionally followed by extra data for
1663 			 * TSO etc.
1664 			 */
1665 			/**
1666 			 * CSUM_TSO requires checksum offloading.
1667 			 * Some versions of FreeBSD fail to
1668 			 * set CSUM_TCP in the CSUM_TSO case,
1669 			 * so we have to test for CSUM_TSO
1670 			 * explicitly.
1671 			 */
1672 			if (m->m_pkthdr.csum_flags
1673 			    & (CSUM_DELAY_DATA | CSUM_TSO)) {
1674 				tx->flags |= (NETTXF_csum_blank
1675 				    | NETTXF_data_validated);
1676 			}
1677 			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1678 				struct netif_extra_info *gso =
1679 					(struct netif_extra_info *)
1680 					RING_GET_REQUEST(&txq->ring,
1681 							 ++txq->ring.req_prod_pvt);
1682 
1683 				tx->flags |= NETTXF_extra_info;
1684 
1685 				gso->u.gso.size = m->m_pkthdr.tso_segsz;
1686 				gso->u.gso.type =
1687 					XEN_NETIF_GSO_TYPE_TCPV4;
1688 				gso->u.gso.pad = 0;
1689 				gso->u.gso.features = 0;
1690 
1691 				gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
1692 				gso->flags = 0;
1693 			}
1694 		} else {
1695 			tx->size = m->m_len;
1696 		}
1697 		if (m->m_next)
1698 			tx->flags |= NETTXF_more_data;
1699 
1700 		txq->ring.req_prod_pvt++;
1701 	}
1702 	BPF_MTAP(ifp, m_head);
1703 
1704 	xn_txeof(txq);
1705 
1706 	txq->stats.tx_bytes += m_head->m_pkthdr.len;
1707 	txq->stats.tx_packets++;
1708 
1709 	return (0);
1710 }
1711 
1712 /* equivalent of network_open() in Linux */
1713 static void
1714 xn_ifinit_locked(struct netfront_info *np)
1715 {
1716 	struct ifnet *ifp;
1717 	int i;
1718 	struct netfront_rxq *rxq;
1719 
1720 	XN_LOCK_ASSERT(np);
1721 
1722 	ifp = np->xn_ifp;
1723 
1724 	if (ifp->if_drv_flags & IFF_DRV_RUNNING || !netfront_carrier_ok(np))
1725 		return;
1726 
1727 	xn_stop(np);
1728 
1729 	for (i = 0; i < np->num_queues; i++) {
1730 		rxq = &np->rxq[i];
1731 		XN_RX_LOCK(rxq);
1732 		xn_alloc_rx_buffers(rxq);
1733 		rxq->ring.sring->rsp_event = rxq->ring.rsp_cons + 1;
1734 		if (RING_HAS_UNCONSUMED_RESPONSES(&rxq->ring))
1735 			xn_rxeof(rxq);
1736 		XN_RX_UNLOCK(rxq);
1737 	}
1738 
1739 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1740 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1741 	if_link_state_change(ifp, LINK_STATE_UP);
1742 }
1743 
1744 static void
1745 xn_ifinit(void *xsc)
1746 {
1747 	struct netfront_info *sc = xsc;
1748 
1749 	XN_LOCK(sc);
1750 	xn_ifinit_locked(sc);
1751 	XN_UNLOCK(sc);
1752 }
1753 
1754 static int
1755 xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1756 {
1757 	struct netfront_info *sc = ifp->if_softc;
1758 	struct ifreq *ifr = (struct ifreq *) data;
1759 	device_t dev;
1760 #ifdef INET
1761 	struct ifaddr *ifa = (struct ifaddr *)data;
1762 #endif
1763 	int mask, error = 0, reinit;
1764 
1765 	dev = sc->xbdev;
1766 
1767 	switch(cmd) {
1768 	case SIOCSIFADDR:
1769 #ifdef INET
1770 		XN_LOCK(sc);
1771 		if (ifa->ifa_addr->sa_family == AF_INET) {
1772 			ifp->if_flags |= IFF_UP;
1773 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1774 				xn_ifinit_locked(sc);
1775 			arp_ifinit(ifp, ifa);
1776 			XN_UNLOCK(sc);
1777 		} else {
1778 			XN_UNLOCK(sc);
1779 #endif
1780 			error = ether_ioctl(ifp, cmd, data);
1781 #ifdef INET
1782 		}
1783 #endif
1784 		break;
1785 	case SIOCSIFMTU:
1786 		ifp->if_mtu = ifr->ifr_mtu;
1787 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1788 		xn_ifinit(sc);
1789 		break;
1790 	case SIOCSIFFLAGS:
1791 		XN_LOCK(sc);
1792 		if (ifp->if_flags & IFF_UP) {
1793 			/*
1794 			 * If only the state of the PROMISC flag changed,
1795 			 * then just use the 'set promisc mode' command
1796 			 * instead of reinitializing the entire NIC. Doing
1797 			 * a full re-init means reloading the firmware and
1798 			 * waiting for it to start up, which may take a
1799 			 * second or two.
1800 			 */
1801 			xn_ifinit_locked(sc);
1802 		} else {
1803 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1804 				xn_stop(sc);
1805 			}
1806 		}
1807 		sc->xn_if_flags = ifp->if_flags;
1808 		XN_UNLOCK(sc);
1809 		break;
1810 	case SIOCSIFCAP:
1811 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
1812 		reinit = 0;
1813 
1814 		if (mask & IFCAP_TXCSUM) {
1815 			ifp->if_capenable ^= IFCAP_TXCSUM;
1816 			ifp->if_hwassist ^= XN_CSUM_FEATURES;
1817 		}
1818 		if (mask & IFCAP_TSO4) {
1819 			ifp->if_capenable ^= IFCAP_TSO4;
1820 			ifp->if_hwassist ^= CSUM_TSO;
1821 		}
1822 
1823 		if (mask & (IFCAP_RXCSUM | IFCAP_LRO)) {
1824 			/* These Rx features require us to renegotiate. */
1825 			reinit = 1;
1826 
1827 			if (mask & IFCAP_RXCSUM)
1828 				ifp->if_capenable ^= IFCAP_RXCSUM;
1829 			if (mask & IFCAP_LRO)
1830 				ifp->if_capenable ^= IFCAP_LRO;
1831 		}
1832 
1833 		if (reinit == 0)
1834 			break;
1835 
1836 		/*
1837 		 * We must reset the interface so the backend picks up the
1838 		 * new features.
1839 		 */
1840 		device_printf(sc->xbdev,
1841 		    "performing interface reset due to feature change\n");
1842 		XN_LOCK(sc);
1843 		netfront_carrier_off(sc);
1844 		sc->xn_reset = true;
1845 		/*
1846 		 * NB: the pending packet queue is not flushed, since
1847 		 * the interface should still support the old options.
1848 		 */
1849 		XN_UNLOCK(sc);
1850 		/*
1851 		 * Delete the xenstore nodes that export features.
1852 		 *
1853 		 * NB: There's a xenbus state called
1854 		 * "XenbusStateReconfiguring", which is what we should set
1855 		 * here. Sadly none of the backends know how to handle it,
1856 		 * and simply disconnect from the frontend, so we will just
1857 		 * switch back to XenbusStateInitialising in order to force
1858 		 * a reconnection.
1859 		 */
1860 		xs_rm(XST_NIL, xenbus_get_node(dev), "feature-gso-tcpv4");
1861 		xs_rm(XST_NIL, xenbus_get_node(dev), "feature-no-csum-offload");
1862 		xenbus_set_state(dev, XenbusStateClosing);
1863 
1864 		/*
1865 		 * Wait for the frontend to reconnect before returning
1866 		 * from the ioctl. 30s should be more than enough for any
1867 		 * sane backend to reconnect.
1868 		 */
1869 		error = tsleep(sc, 0, "xn_rst", 30*hz);
1870 		break;
1871 	case SIOCADDMULTI:
1872 	case SIOCDELMULTI:
1873 		break;
1874 	case SIOCSIFMEDIA:
1875 	case SIOCGIFMEDIA:
1876 		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
1877 		break;
1878 	default:
1879 		error = ether_ioctl(ifp, cmd, data);
1880 	}
1881 
1882 	return (error);
1883 }
1884 
1885 static void
1886 xn_stop(struct netfront_info *sc)
1887 {
1888 	struct ifnet *ifp;
1889 
1890 	XN_LOCK_ASSERT(sc);
1891 
1892 	ifp = sc->xn_ifp;
1893 
1894 	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
1895 	if_link_state_change(ifp, LINK_STATE_DOWN);
1896 }
1897 
1898 static void
1899 xn_rebuild_rx_bufs(struct netfront_rxq *rxq)
1900 {
1901 	int requeue_idx, i;
1902 	grant_ref_t ref;
1903 	netif_rx_request_t *req;
1904 
1905 	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1906 		struct mbuf *m;
1907 		u_long pfn;
1908 
1909 		if (rxq->mbufs[i] == NULL)
1910 			continue;
1911 
1912 		m = rxq->mbufs[requeue_idx] = xn_get_rx_mbuf(rxq, i);
1913 		ref = rxq->grant_ref[requeue_idx] = xn_get_rx_ref(rxq, i);
1914 
1915 		req = RING_GET_REQUEST(&rxq->ring, requeue_idx);
1916 		pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT;
1917 
1918 		gnttab_grant_foreign_access_ref(ref,
1919 		    xenbus_get_otherend_id(rxq->info->xbdev),
1920 		    pfn, 0);
1921 
1922 		req->gref = ref;
1923 		req->id   = requeue_idx;
1924 
1925 		requeue_idx++;
1926 	}
1927 
1928 	rxq->ring.req_prod_pvt = requeue_idx;
1929 }
1930 
1931 /* START of Xenolinux helper functions adapted to FreeBSD */
1932 static int
1933 xn_connect(struct netfront_info *np)
1934 {
1935 	int i, error;
1936 	u_int feature_rx_copy;
1937 	struct netfront_rxq *rxq;
1938 	struct netfront_txq *txq;
1939 
1940 	error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
1941 	    "feature-rx-copy", NULL, "%u", &feature_rx_copy);
1942 	if (error != 0)
1943 		feature_rx_copy = 0;
1944 
1945 	/* We only support rx copy. */
1946 	if (!feature_rx_copy)
1947 		return (EPROTONOSUPPORT);
1948 
1949 	/* Recovery procedure: */
1950 	error = talk_to_backend(np->xbdev, np);
1951 	if (error != 0)
1952 		return (error);
1953 
1954 	/* Step 1: Reinitialise variables. */
1955 	xn_query_features(np);
1956 	xn_configure_features(np);
1957 
1958 	/* Step 2: Release TX buffer */
1959 	for (i = 0; i < np->num_queues; i++) {
1960 		txq = &np->txq[i];
1961 		xn_release_tx_bufs(txq);
1962 	}
1963 
1964 	/* Step 3: Rebuild the RX buffer freelist and the RX ring itself. */
1965 	for (i = 0; i < np->num_queues; i++) {
1966 		rxq = &np->rxq[i];
1967 		xn_rebuild_rx_bufs(rxq);
1968 	}
1969 
1970 	/* Step 4: All public and private state should now be sane.  Get
1971 	 * ready to start sending and receiving packets and give the driver
1972 	 * domain a kick because we've probably just requeued some
1973 	 * packets.
1974 	 */
1975 	netfront_carrier_on(np);
1976 	wakeup(np);
1977 
1978 	return (0);
1979 }
1980 
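/*
 * Prod every queue pair: signal the backend event channel, reap completed TX
 * requests and refill the RX rings.
 */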
1981 static void
1982 xn_kick_rings(struct netfront_info *np)
1983 {
1984 	struct netfront_rxq *rxq;
1985 	struct netfront_txq *txq;
1986 	int i;
1987 
1988 	for (i = 0; i < np->num_queues; i++) {
1989 		txq = &np->txq[i];
1990 		rxq = &np->rxq[i];
1991 		xen_intr_signal(txq->xen_intr_handle);
1992 		XN_TX_LOCK(txq);
1993 		xn_txeof(txq);
1994 		XN_TX_UNLOCK(txq);
1995 		XN_RX_LOCK(rxq);
1996 		xn_alloc_rx_buffers(rxq);
1997 		XN_RX_UNLOCK(rxq);
1998 	}
1999 }
2000 
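/*
 * Read the backend's advertised features from xenstore and translate them
 * into interface capabilities: feature-sg raises the TX fragment limit,
 * feature-gso-tcpv4 enables TSO4/LRO and feature-no-csum-offload disables
 * hardware checksum offload.
 */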
2001 static void
2002 xn_query_features(struct netfront_info *np)
2003 {
2004 	int val;
2005 
2006 	device_printf(np->xbdev, "backend features:");
2007 
2008 	if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2009 		"feature-sg", NULL, "%d", &val) != 0)
2010 		val = 0;
2011 
2012 	np->maxfrags = 1;
2013 	if (val) {
2014 		np->maxfrags = MAX_TX_REQ_FRAGS;
2015 		printf(" feature-sg");
2016 	}
2017 
2018 	if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2019 		"feature-gso-tcpv4", NULL, "%d", &val) != 0)
2020 		val = 0;
2021 
2022 	np->xn_ifp->if_capabilities &= ~(IFCAP_TSO4|IFCAP_LRO);
2023 	if (val) {
2024 		np->xn_ifp->if_capabilities |= IFCAP_TSO4|IFCAP_LRO;
2025 		printf(" feature-gso-tcp4");
2026 	}
2027 
2028 	/*
2029 	 * HW CSUM offload is assumed to be available unless
2030 	 * feature-no-csum-offload is set in xenstore.
2031 	 */
2032 	if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev),
2033 		"feature-no-csum-offload", NULL, "%d", &val) != 0)
2034 		val = 0;
2035 
2036 	np->xn_ifp->if_capabilities |= IFCAP_HWCSUM;
2037 	if (val) {
2038 		np->xn_ifp->if_capabilities &= ~(IFCAP_HWCSUM);
2039 		printf(" feature-no-csum-offload");
2040 	}
2041 
2042 	printf("\n");
2043 }
2044 
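/*
 * Reconcile the enabled capabilities with what the backend currently
 * supports, preserving as much of the previous configuration as possible.
 * LRO state is freed and re-initialised per RX queue whenever it stays
 * enabled.
 */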
2045 static int
2046 xn_configure_features(struct netfront_info *np)
2047 {
2048 	int err, cap_enabled;
2049 #if (defined(INET) || defined(INET6))
2050 	int i;
2051 #endif
2052 	struct ifnet *ifp;
2053 
2054 	ifp = np->xn_ifp;
2055 	err = 0;
2056 
2057 	if ((ifp->if_capenable & ifp->if_capabilities) == ifp->if_capenable) {
2058 		/* Current options are available, no need to do anything. */
2059 		return (0);
2060 	}
2061 
2062 	/* Try to preserve as many options as possible. */
2063 	cap_enabled = ifp->if_capenable;
2064 	ifp->if_capenable = ifp->if_hwassist = 0;
2065 
2066 #if (defined(INET) || defined(INET6))
2067 	if ((cap_enabled & IFCAP_LRO) != 0)
2068 		for (i = 0; i < np->num_queues; i++)
2069 			tcp_lro_free(&np->rxq[i].lro);
2070 	if (xn_enable_lro &&
2071 	    (ifp->if_capabilities & cap_enabled & IFCAP_LRO) != 0) {
2072 		ifp->if_capenable |= IFCAP_LRO;
2073 		for (i = 0; i < np->num_queues; i++) {
2074 			err = tcp_lro_init(&np->rxq[i].lro);
2075 			if (err != 0) {
2076 				device_printf(np->xbdev,
2077 				    "LRO initialization failed\n");
2078 				ifp->if_capenable &= ~IFCAP_LRO;
2079 				break;
2080 			}
2081 			np->rxq[i].lro.ifp = ifp;
2082 		}
2083 	}
2084 	if ((ifp->if_capabilities & cap_enabled & IFCAP_TSO4) != 0) {
2085 		ifp->if_capenable |= IFCAP_TSO4;
2086 		ifp->if_hwassist |= CSUM_TSO;
2087 	}
2088 #endif
2089 	if ((ifp->if_capabilities & cap_enabled & IFCAP_TXCSUM) != 0) {
2090 		ifp->if_capenable |= IFCAP_TXCSUM;
2091 		ifp->if_hwassist |= XN_CSUM_FEATURES;
2092 	}
2093 	if ((ifp->if_capabilities & cap_enabled & IFCAP_RXCSUM) != 0)
2094 		ifp->if_capenable |= IFCAP_RXCSUM;
2095 
2096 	return (err);
2097 }
2098 
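/*
 * Drain the per-queue buf_ring onto the shared TX ring while slots are
 * available, notifying the backend whenever the ring macros request it.
 * Called with the TX queue lock held; a non-NULL mbuf is enqueued first.
 */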
2099 static int
2100 xn_txq_mq_start_locked(struct netfront_txq *txq, struct mbuf *m)
2101 {
2102 	struct netfront_info *np;
2103 	struct ifnet *ifp;
2104 	struct buf_ring *br;
2105 	int error, notify;
2106 
2107 	np = txq->info;
2108 	br = txq->br;
2109 	ifp = np->xn_ifp;
2110 	error = 0;
2111 
2112 	XN_TX_LOCK_ASSERT(txq);
2113 
2114 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2115 	    !netfront_carrier_ok(np)) {
2116 		if (m != NULL)
2117 			error = drbr_enqueue(ifp, br, m);
2118 		return (error);
2119 	}
2120 
2121 	if (m != NULL) {
2122 		error = drbr_enqueue(ifp, br, m);
2123 		if (error != 0)
2124 			return (error);
2125 	}
2126 
2127 	while ((m = drbr_peek(ifp, br)) != NULL) {
2128 		if (!xn_tx_slot_available(txq)) {
2129 			drbr_putback(ifp, br, m);
2130 			break;
2131 		}
2132 
2133 		error = xn_assemble_tx_request(txq, m);
2134 		/* xn_assemble_tx_request always consumes the mbuf. */
2135 		if (error != 0) {
2136 			drbr_advance(ifp, br);
2137 			break;
2138 		}
2139 
2140 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&txq->ring, notify);
2141 		if (notify)
2142 			xen_intr_signal(txq->xen_intr_handle);
2143 
2144 		drbr_advance(ifp, br);
2145 	}
2146 
2147 	if (RING_FULL(&txq->ring))
2148 		txq->full = true;
2149 
2150 	return (0);
2151 }
2152 
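/*
 * if_transmit method: select a TX queue from the mbuf's flow ID (or the
 * current CPU when no hash is present), then either transmit directly if the
 * queue lock is uncontended or defer the work to the queue's taskqueue.
 */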
2153 static int
2154 xn_txq_mq_start(struct ifnet *ifp, struct mbuf *m)
2155 {
2156 	struct netfront_info *np;
2157 	struct netfront_txq *txq;
2158 	int i, npairs, error;
2159 
2160 	np = ifp->if_softc;
2161 	npairs = np->num_queues;
2162 
2163 	if (!netfront_carrier_ok(np))
2164 		return (ENOBUFS);
2165 
2166 	KASSERT(npairs != 0, ("called with 0 available queues"));
2167 
2168 	/* Check if a flow ID is set. */
2169 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2170 		i = m->m_pkthdr.flowid % npairs;
2171 	else
2172 		i = curcpu % npairs;
2173 
2174 	txq = &np->txq[i];
2175 
2176 	if (XN_TX_TRYLOCK(txq) != 0) {
2177 		error = xn_txq_mq_start_locked(txq, m);
2178 		XN_TX_UNLOCK(txq);
2179 	} else {
2180 		error = drbr_enqueue(ifp, txq->br, m);
2181 		taskqueue_enqueue(txq->tq, &txq->defrtask);
2182 	}
2183 
2184 	return (error);
2185 }
2186 
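/*
 * if_qflush method: free every mbuf still queued on the per-queue buf_rings
 * before flushing the generic interface queue.
 */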
2187 static void
2188 xn_qflush(struct ifnet *ifp)
2189 {
2190 	struct netfront_info *np;
2191 	struct netfront_txq *txq;
2192 	struct mbuf *m;
2193 	int i;
2194 
2195 	np = ifp->if_softc;
2196 
2197 	for (i = 0; i < np->num_queues; i++) {
2198 		txq = &np->txq[i];
2199 
2200 		XN_TX_LOCK(txq);
2201 		while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
2202 			m_freem(m);
2203 		XN_TX_UNLOCK(txq);
2204 	}
2205 
2206 	if_qflush(ifp);
2207 }
2208 
2209 /**
2210  * Create a network device.
2211  * @param dev  Newbus device representing this virtual NIC.
2212  */
2213 int
2214 create_netdev(device_t dev)
2215 {
2216 	struct netfront_info *np;
2217 	int err;
2218 	struct ifnet *ifp;
2219 
2220 	np = device_get_softc(dev);
2221 
2222 	np->xbdev         = dev;
2223 
2224 	mtx_init(&np->sc_lock, "xnsc", "netfront softc lock", MTX_DEF);
2225 
2226 	ifmedia_init(&np->sc_media, 0, xn_ifmedia_upd, xn_ifmedia_sts);
2227 	ifmedia_add(&np->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL);
2228 	ifmedia_set(&np->sc_media, IFM_ETHER|IFM_MANUAL);
2229 
2230 	err = xen_net_read_mac(dev, np->mac);
2231 	if (err != 0)
2232 		goto error;
2233 
2234 	/* Set up ifnet structure */
2235 	ifp = np->xn_ifp = if_alloc(IFT_ETHER);
2236 	ifp->if_softc = np;
2237 	if_initname(ifp, "xn", device_get_unit(dev));
2238 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2239 	ifp->if_ioctl = xn_ioctl;
2240 
2241 	ifp->if_transmit = xn_txq_mq_start;
2242 	ifp->if_qflush = xn_qflush;
2243 
2244 	ifp->if_init = xn_ifinit;
2245 
2246 	ifp->if_hwassist = XN_CSUM_FEATURES;
2247 	/* Enable all supported features at device creation. */
2248 	ifp->if_capenable = ifp->if_capabilities =
2249 	    IFCAP_HWCSUM|IFCAP_TSO4|IFCAP_LRO;
2250 	ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
2251 	ifp->if_hw_tsomaxsegcount = MAX_TX_REQ_FRAGS;
2252 	ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2253 
2254 	ether_ifattach(ifp, np->mac);
2255 	netfront_carrier_off(np);
2256 
2257 	return (0);
2258 
2259 error:
2260 	KASSERT(err != 0, ("Error path with no error code specified"));
2261 	return (err);
2262 }
2263 
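/*
 * Newbus detach method: tear down the interface and all per-queue state via
 * netif_free().
 */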
2264 static int
2265 netfront_detach(device_t dev)
2266 {
2267 	struct netfront_info *info = device_get_softc(dev);
2268 
2269 	DPRINTK("%s\n", xenbus_get_node(dev));
2270 
2271 	netif_free(info);
2272 
2273 	return (0);
2274 }
2275 
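/*
 * Undo create_netdev(): stop the interface, disconnect from the backend,
 * detach and free the ifnet, release the queue arrays and remove the media
 * entries.
 */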
2276 static void
2277 netif_free(struct netfront_info *np)
2278 {
2279 
2280 	XN_LOCK(np);
2281 	xn_stop(np);
2282 	XN_UNLOCK(np);
2283 	netif_disconnect_backend(np);
2284 	ether_ifdetach(np->xn_ifp);
2285 	free(np->rxq, M_DEVBUF);
2286 	free(np->txq, M_DEVBUF);
2287 	if_free(np->xn_ifp);
2288 	np->xn_ifp = NULL;
2289 	ifmedia_removeall(&np->sc_media);
2290 }
2291 
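/*
 * Mark the carrier as down while holding every queue lock, then let
 * disconnect_rxq()/disconnect_txq() tear down the individual queues.
 */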
2292 static void
2293 netif_disconnect_backend(struct netfront_info *np)
2294 {
2295 	u_int i;
2296 
2297 	for (i = 0; i < np->num_queues; i++) {
2298 		XN_RX_LOCK(&np->rxq[i]);
2299 		XN_TX_LOCK(&np->txq[i]);
2300 	}
2301 	netfront_carrier_off(np);
2302 	for (i = 0; i < np->num_queues; i++) {
2303 		XN_RX_UNLOCK(&np->rxq[i]);
2304 		XN_TX_UNLOCK(&np->txq[i]);
2305 	}
2306 
2307 	for (i = 0; i < np->num_queues; i++) {
2308 		disconnect_rxq(&np->rxq[i]);
2309 		disconnect_txq(&np->txq[i]);
2310 	}
2311 }
2312 
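/*
 * Media handling is a stub: the paravirtualised link has no selectable
 * media, so changes are accepted silently and the status is always reported
 * as an active manual Ethernet link.
 */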
2313 static int
2314 xn_ifmedia_upd(struct ifnet *ifp)
2315 {
2316 
2317 	return (0);
2318 }
2319 
2320 static void
2321 xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
2322 {
2323 
2324 	ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE;
2325 	ifmr->ifm_active = IFM_ETHER|IFM_MANUAL;
2326 }
2327 
2328 /* ** Driver registration ** */
2329 static device_method_t netfront_methods[] = {
2330 	/* Device interface */
2331 	DEVMETHOD(device_probe,         netfront_probe),
2332 	DEVMETHOD(device_attach,        netfront_attach),
2333 	DEVMETHOD(device_detach,        netfront_detach),
2334 	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
2335 	DEVMETHOD(device_suspend,       netfront_suspend),
2336 	DEVMETHOD(device_resume,        netfront_resume),
2337 
2338 	/* Xenbus interface */
2339 	DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed),
2340 
2341 	DEVMETHOD_END
2342 };
2343 
2344 static driver_t netfront_driver = {
2345 	"xn",
2346 	netfront_methods,
2347 	sizeof(struct netfront_info),
2348 };
2349 devclass_t netfront_devclass;
2350 
2351 DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, NULL,
2352     NULL);
2353