xref: /linux/drivers/net/xen-netfront.c (revision 2b8232ce512105e28453f301d1510de8363bccd1)
1 /*
2  * Virtual network driver for conversing with remote driver backends.
3  *
4  * Copyright (c) 2002-2005, K A Fraser
5  * Copyright (c) 2005, XenSource Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License version 2
9  * as published by the Free Software Foundation; or, when distributed
10  * separately from the Linux kernel or incorporated into other
11  * software packages, subject to the following license:
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this source file (the "Software"), to deal in the Software without
15  * restriction, including without limitation the rights to use, copy, modify,
16  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17  * and to permit persons to whom the Software is furnished to do so, subject to
18  * the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29  * IN THE SOFTWARE.
30  */
31 
32 #include <linux/module.h>
33 #include <linux/kernel.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/skbuff.h>
37 #include <linux/ethtool.h>
38 #include <linux/if_ether.h>
39 #include <linux/tcp.h>
40 #include <linux/udp.h>
41 #include <linux/moduleparam.h>
42 #include <linux/mm.h>
43 #include <net/ip.h>
44 
45 #include <xen/xenbus.h>
46 #include <xen/events.h>
47 #include <xen/page.h>
48 #include <xen/grant_table.h>
49 
50 #include <xen/interface/io/netif.h>
51 #include <xen/interface/memory.h>
52 #include <xen/interface/grant_table.h>
53 
54 static struct ethtool_ops xennet_ethtool_ops;
55 
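/*
 * Per-skb receive state stashed in skb->cb: the page the backend filled and
 * the offset of the data within it, carried from xennet_poll() to
 * handle_incoming_queue().
 */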
56 struct netfront_cb {
57 	struct page *page;
58 	unsigned offset;
59 };
60 
61 #define NETFRONT_SKB_CB(skb)	((struct netfront_cb *)((skb)->cb))
62 
63 #define RX_COPY_THRESHOLD 256
64 
65 #define GRANT_INVALID_REF	0
66 
67 #define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
68 #define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
69 #define TX_MAX_TARGET min_t(int, NET_TX_RING_SIZE, 256)
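/* With 4 KiB pages both shared rings work out to 256 slots each. */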
70 
71 struct netfront_info {
72 	struct list_head list;
73 	struct net_device *netdev;
74 
75 	struct napi_struct napi;
76 
77 	struct xen_netif_tx_front_ring tx;
78 	struct xen_netif_rx_front_ring rx;
79 
80 	spinlock_t   tx_lock;
81 	spinlock_t   rx_lock;
82 
83 	unsigned int evtchn;
84 
85 	/* Receive-ring batched refills. */
86 #define RX_MIN_TARGET 8
87 #define RX_DFL_MIN_TARGET 64
88 #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
89 	unsigned rx_min_target, rx_max_target, rx_target;
90 	struct sk_buff_head rx_batch;
91 
92 	struct timer_list rx_refill_timer;
93 
94 	/*
95 	 * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
96 	 * are linked from tx_skb_freelist through skb_entry.link.
97 	 *
98 	 *  NB. Freelist index entries are always going to be less than
99 	 *  PAGE_OFFSET, whereas pointers to skbs will always be equal to or
100 	 *  greater than PAGE_OFFSET: we use this property to distinguish
101 	 *  them.
102 	 */
103 	union skb_entry {
104 		struct sk_buff *skb;
105 		unsigned link;
106 	} tx_skbs[NET_TX_RING_SIZE];
107 	grant_ref_t gref_tx_head;
108 	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
109 	unsigned tx_skb_freelist;
110 
111 	struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
112 	grant_ref_t gref_rx_head;
113 	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
114 
115 	struct xenbus_device *xbdev;
116 	int tx_ring_ref;
117 	int rx_ring_ref;
118 
119 	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
120 	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
121 	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
122 };
123 
124 struct netfront_rx_info {
125 	struct xen_netif_rx_response rx;
126 	struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
127 };
128 
129 /*
130  * Helpers for acquiring/freeing slots in tx_skbs[].
131  */
132 
133 static void add_id_to_freelist(unsigned *head, union skb_entry *list,
134 			       unsigned short id)
135 {
136 	list[id].link = *head;
137 	*head = id;
138 }
139 
140 static unsigned short get_id_from_freelist(unsigned *head,
141 					   union skb_entry *list)
142 {
143 	unsigned int id = *head;
144 	*head = list[id].link;
145 	return id;
146 }
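
/*
 * Illustrative sketch only (not used by the driver): since a free entry
 * stores a small link index while a busy entry stores an skb pointer, the
 * state of a slot can be recovered by comparing against PAGE_OFFSET, as
 * described in the struct netfront_info comment above.
 */
static inline int xennet_tx_slot_is_free(const union skb_entry *ent)
{
	return (unsigned long)ent->skb < PAGE_OFFSET;
}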
147 
148 static int xennet_rxidx(RING_IDX idx)
149 {
150 	return idx & (NET_RX_RING_SIZE - 1);
151 }
152 
153 static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
154 					 RING_IDX ri)
155 {
156 	int i = xennet_rxidx(ri);
157 	struct sk_buff *skb = np->rx_skbs[i];
158 	np->rx_skbs[i] = NULL;
159 	return skb;
160 }
161 
162 static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
163 					    RING_IDX ri)
164 {
165 	int i = xennet_rxidx(ri);
166 	grant_ref_t ref = np->grant_rx_ref[i];
167 	np->grant_rx_ref[i] = GRANT_INVALID_REF;
168 	return ref;
169 }
170 
171 #ifdef CONFIG_SYSFS
172 static int xennet_sysfs_addif(struct net_device *netdev);
173 static void xennet_sysfs_delif(struct net_device *netdev);
174 #else /* !CONFIG_SYSFS */
175 #define xennet_sysfs_addif(dev) (0)
176 #define xennet_sysfs_delif(dev) do { } while (0)
177 #endif
178 
179 static int xennet_can_sg(struct net_device *dev)
180 {
181 	return dev->features & NETIF_F_SG;
182 }
183 
184 
185 static void rx_refill_timeout(unsigned long data)
186 {
187 	struct net_device *dev = (struct net_device *)data;
188 	struct netfront_info *np = netdev_priv(dev);
189 	netif_rx_schedule(dev, &np->napi);
190 }
191 
192 static int netfront_tx_slot_available(struct netfront_info *np)
193 {
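	/*
	 * Leave room for a worst-case packet: up to MAX_SKB_FRAGS page
	 * fragments plus slots for the linear header and a possible
	 * extra-info (GSO) request.
	 */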
194 	return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
195 		(TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
196 }
197 
198 static void xennet_maybe_wake_tx(struct net_device *dev)
199 {
200 	struct netfront_info *np = netdev_priv(dev);
201 
202 	if (unlikely(netif_queue_stopped(dev)) &&
203 	    netfront_tx_slot_available(np) &&
204 	    likely(netif_running(dev)))
205 		netif_wake_queue(dev);
206 }
207 
208 static void xennet_alloc_rx_buffers(struct net_device *dev)
209 {
210 	unsigned short id;
211 	struct netfront_info *np = netdev_priv(dev);
212 	struct sk_buff *skb;
213 	struct page *page;
214 	int i, batch_target, notify;
215 	RING_IDX req_prod = np->rx.req_prod_pvt;
216 	grant_ref_t ref;
217 	unsigned long pfn;
218 	void *vaddr;
219 	struct xen_netif_rx_request *req;
220 
221 	if (unlikely(!netif_carrier_ok(dev)))
222 		return;
223 
224 	/*
225 	 * Allocate skbuffs greedily, even though we batch updates to the
226 	 * receive ring. This creates a less bursty demand on the memory
227 	 * allocator, so should reduce the chance of failed allocation requests
228 	 * both for ourself and for other kernel subsystems.
229 	 */
230 	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
231 	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
232 		skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD,
233 					 GFP_ATOMIC | __GFP_NOWARN);
234 		if (unlikely(!skb))
235 			goto no_skb;
236 
237 		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
238 		if (!page) {
239 			kfree_skb(skb);
240 no_skb:
241 			/* Any skbuffs queued for refill? Force them out. */
242 			if (i != 0)
243 				goto refill;
244 			/* Could not allocate any skbuffs. Try again later. */
245 			mod_timer(&np->rx_refill_timer,
246 				  jiffies + (HZ/10));
247 			break;
248 		}
249 
250 		skb_shinfo(skb)->frags[0].page = page;
251 		skb_shinfo(skb)->nr_frags = 1;
252 		__skb_queue_tail(&np->rx_batch, skb);
253 	}
254 
255 	/* Is the batch large enough to be worthwhile? */
256 	if (i < (np->rx_target/2)) {
257 		if (req_prod > np->rx.sring->req_prod)
258 			goto push;
259 		return;
260 	}
261 
262 	/* Adjust our fill target if we risked running out of buffers. */
263 	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
264 	    ((np->rx_target *= 2) > np->rx_max_target))
265 		np->rx_target = np->rx_max_target;
266 
267  refill:
268 	for (i = 0; ; i++) {
269 		skb = __skb_dequeue(&np->rx_batch);
270 		if (skb == NULL)
271 			break;
272 
273 		skb->dev = dev;
274 
275 		id = xennet_rxidx(req_prod + i);
276 
277 		BUG_ON(np->rx_skbs[id]);
278 		np->rx_skbs[id] = skb;
279 
280 		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
281 		BUG_ON((signed short)ref < 0);
282 		np->grant_rx_ref[id] = ref;
283 
284 		pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
285 		vaddr = page_address(skb_shinfo(skb)->frags[0].page);
286 
287 		req = RING_GET_REQUEST(&np->rx, req_prod + i);
288 		gnttab_grant_foreign_access_ref(ref,
289 						np->xbdev->otherend_id,
290 						pfn_to_mfn(pfn),
291 						0);
292 
293 		req->id = id;
294 		req->gref = ref;
295 	}
296 
297 	wmb();		/* barrier so backend sees requests */
298 
299 	/* Above is a suitable barrier to ensure backend will see requests. */
300 	np->rx.req_prod_pvt = req_prod + i;
301  push:
302 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
303 	if (notify)
304 		notify_remote_via_irq(np->netdev->irq);
305 }
306 
307 static int xennet_open(struct net_device *dev)
308 {
309 	struct netfront_info *np = netdev_priv(dev);
310 
311 	napi_enable(&np->napi);
312 
313 	spin_lock_bh(&np->rx_lock);
314 	if (netif_carrier_ok(dev)) {
315 		xennet_alloc_rx_buffers(dev);
316 		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
317 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
318 			netif_rx_schedule(dev, &np->napi);
319 	}
320 	spin_unlock_bh(&np->rx_lock);
321 
322 	xennet_maybe_wake_tx(dev);
323 
324 	return 0;
325 }
326 
327 static void xennet_tx_buf_gc(struct net_device *dev)
328 {
329 	RING_IDX cons, prod;
330 	unsigned short id;
331 	struct netfront_info *np = netdev_priv(dev);
332 	struct sk_buff *skb;
333 
334 	BUG_ON(!netif_carrier_ok(dev));
335 
336 	do {
337 		prod = np->tx.sring->rsp_prod;
338 		rmb(); /* Ensure we see responses up to 'rp'. */
339 
340 		for (cons = np->tx.rsp_cons; cons != prod; cons++) {
341 			struct xen_netif_tx_response *txrsp;
342 
343 			txrsp = RING_GET_RESPONSE(&np->tx, cons);
344 			if (txrsp->status == NETIF_RSP_NULL)
345 				continue;
346 
347 			id  = txrsp->id;
348 			skb = np->tx_skbs[id].skb;
349 			if (unlikely(gnttab_query_foreign_access(
350 				np->grant_tx_ref[id]) != 0)) {
351 				printk(KERN_ALERT "xennet_tx_buf_gc: warning "
352 				       "-- grant still in use by backend "
353 				       "domain.\n");
354 				BUG();
355 			}
356 			gnttab_end_foreign_access_ref(
357 				np->grant_tx_ref[id], GNTMAP_readonly);
358 			gnttab_release_grant_reference(
359 				&np->gref_tx_head, np->grant_tx_ref[id]);
360 			np->grant_tx_ref[id] = GRANT_INVALID_REF;
361 			add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
362 			dev_kfree_skb_irq(skb);
363 		}
364 
365 		np->tx.rsp_cons = prod;
366 
367 		/*
368 		 * Set a new event, then check for race with update of tx_cons.
369 		 * Note that it is essential to schedule a callback, no matter
370 		 * how few buffers are pending. Even if there is space in the
371 		 * transmit ring, higher layers may be blocked because too much
372 		 * data is outstanding: in such cases notification from Xen is
373 		 * likely to be the only kick that we'll get.
374 		 */
375 		np->tx.sring->rsp_event =
376 			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
377 		mb();		/* update shared area */
378 	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
379 
380 	xennet_maybe_wake_tx(dev);
381 }
382 
383 static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
384 			      struct xen_netif_tx_request *tx)
385 {
386 	struct netfront_info *np = netdev_priv(dev);
387 	char *data = skb->data;
388 	unsigned long mfn;
389 	RING_IDX prod = np->tx.req_prod_pvt;
390 	int frags = skb_shinfo(skb)->nr_frags;
391 	unsigned int offset = offset_in_page(data);
392 	unsigned int len = skb_headlen(skb);
393 	unsigned int id;
394 	grant_ref_t ref;
395 	int i;
396 
397 	/* While the header overlaps a page boundary (including being
398 	   larger than a page), split it into page-sized chunks. */
399 	while (len > PAGE_SIZE - offset) {
400 		tx->size = PAGE_SIZE - offset;
401 		tx->flags |= NETTXF_more_data;
402 		len -= tx->size;
403 		data += tx->size;
404 		offset = 0;
405 
406 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
407 		np->tx_skbs[id].skb = skb_get(skb);
408 		tx = RING_GET_REQUEST(&np->tx, prod++);
409 		tx->id = id;
410 		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
411 		BUG_ON((signed short)ref < 0);
412 
413 		mfn = virt_to_mfn(data);
414 		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
415 						mfn, GNTMAP_readonly);
416 
417 		tx->gref = np->grant_tx_ref[id] = ref;
418 		tx->offset = offset;
419 		tx->size = len;
420 		tx->flags = 0;
421 	}
422 
423 	/* Grant backend access to each skb fragment page. */
424 	for (i = 0; i < frags; i++) {
425 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
426 
427 		tx->flags |= NETTXF_more_data;
428 
429 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
430 		np->tx_skbs[id].skb = skb_get(skb);
431 		tx = RING_GET_REQUEST(&np->tx, prod++);
432 		tx->id = id;
433 		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
434 		BUG_ON((signed short)ref < 0);
435 
436 		mfn = pfn_to_mfn(page_to_pfn(frag->page));
437 		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
438 						mfn, GNTMAP_readonly);
439 
440 		tx->gref = np->grant_tx_ref[id] = ref;
441 		tx->offset = frag->page_offset;
442 		tx->size = frag->size;
443 		tx->flags = 0;
444 	}
445 
446 	np->tx.req_prod_pvt = prod;
447 }
448 
449 static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
450 {
451 	unsigned short id;
452 	struct netfront_info *np = netdev_priv(dev);
453 	struct xen_netif_tx_request *tx;
454 	struct xen_netif_extra_info *extra;
455 	char *data = skb->data;
456 	RING_IDX i;
457 	grant_ref_t ref;
458 	unsigned long mfn;
459 	int notify;
460 	int frags = skb_shinfo(skb)->nr_frags;
461 	unsigned int offset = offset_in_page(data);
462 	unsigned int len = skb_headlen(skb);
463 
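	/*
	 * The linear header may straddle page boundaries; count one slot for
	 * each page it touches in addition to the page fragments.
	 */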
464 	frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
465 	if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
466 		printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
467 		       frags);
468 		dump_stack();
469 		goto drop;
470 	}
471 
472 	spin_lock_irq(&np->tx_lock);
473 
474 	if (unlikely(!netif_carrier_ok(dev) ||
475 		     (frags > 1 && !xennet_can_sg(dev)) ||
476 		     netif_needs_gso(dev, skb))) {
477 		spin_unlock_irq(&np->tx_lock);
478 		goto drop;
479 	}
480 
481 	i = np->tx.req_prod_pvt;
482 
483 	id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
484 	np->tx_skbs[id].skb = skb;
485 
486 	tx = RING_GET_REQUEST(&np->tx, i);
487 
488 	tx->id   = id;
489 	ref = gnttab_claim_grant_reference(&np->gref_tx_head);
490 	BUG_ON((signed short)ref < 0);
491 	mfn = virt_to_mfn(data);
492 	gnttab_grant_foreign_access_ref(
493 		ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
494 	tx->gref = np->grant_tx_ref[id] = ref;
495 	tx->offset = offset;
496 	tx->size = len;
497 	extra = NULL;
498 
499 	tx->flags = 0;
500 	if (skb->ip_summed == CHECKSUM_PARTIAL)
501 		/* local packet? */
502 		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
503 	else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
504 		/* remote but checksummed. */
505 		tx->flags |= NETTXF_data_validated;
506 
507 	if (skb_shinfo(skb)->gso_size) {
508 		struct xen_netif_extra_info *gso;
509 
510 		gso = (struct xen_netif_extra_info *)
511 			RING_GET_REQUEST(&np->tx, ++i);
512 
513 		if (extra)
514 			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
515 		else
516 			tx->flags |= NETTXF_extra_info;
517 
518 		gso->u.gso.size = skb_shinfo(skb)->gso_size;
519 		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
520 		gso->u.gso.pad = 0;
521 		gso->u.gso.features = 0;
522 
523 		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
524 		gso->flags = 0;
525 		extra = gso;
526 	}
527 
528 	np->tx.req_prod_pvt = i + 1;
529 
530 	xennet_make_frags(skb, dev, tx);
531 	tx->size = skb->len;
532 
533 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
534 	if (notify)
535 		notify_remote_via_irq(np->netdev->irq);
536 
537 	dev->stats.tx_bytes += skb->len;
538 	dev->stats.tx_packets++;
539 
540 	/* Note: It is not safe to access skb after xennet_tx_buf_gc()! */
541 	xennet_tx_buf_gc(dev);
542 
543 	if (!netfront_tx_slot_available(np))
544 		netif_stop_queue(dev);
545 
546 	spin_unlock_irq(&np->tx_lock);
547 
548 	return 0;
549 
550  drop:
551 	dev->stats.tx_dropped++;
552 	dev_kfree_skb(skb);
553 	return 0;
554 }
555 
556 static int xennet_close(struct net_device *dev)
557 {
558 	struct netfront_info *np = netdev_priv(dev);
559 	netif_stop_queue(np->netdev);
560 	napi_disable(&np->napi);
561 	return 0;
562 }
563 
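/*
 * Re-post an unconsumed receive buffer (skb plus its still-active grant) at
 * the ring's next free request slot so the backend can fill it again.
 */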
564 static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
565 				grant_ref_t ref)
566 {
567 	int new = xennet_rxidx(np->rx.req_prod_pvt);
568 
569 	BUG_ON(np->rx_skbs[new]);
570 	np->rx_skbs[new] = skb;
571 	np->grant_rx_ref[new] = ref;
572 	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
573 	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
574 	np->rx.req_prod_pvt++;
575 }
576 
577 static int xennet_get_extras(struct netfront_info *np,
578 			     struct xen_netif_extra_info *extras,
579 			     RING_IDX rp)
580 
581 {
582 	struct xen_netif_extra_info *extra;
583 	struct device *dev = &np->netdev->dev;
584 	RING_IDX cons = np->rx.rsp_cons;
585 	int err = 0;
586 
587 	do {
588 		struct sk_buff *skb;
589 		grant_ref_t ref;
590 
591 		if (unlikely(cons + 1 == rp)) {
592 			if (net_ratelimit())
593 				dev_warn(dev, "Missing extra info\n");
594 			err = -EBADR;
595 			break;
596 		}
597 
598 		extra = (struct xen_netif_extra_info *)
599 			RING_GET_RESPONSE(&np->rx, ++cons);
600 
601 		if (unlikely(!extra->type ||
602 			     extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
603 			if (net_ratelimit())
604 				dev_warn(dev, "Invalid extra type: %d\n",
605 					extra->type);
606 			err = -EINVAL;
607 		} else {
608 			memcpy(&extras[extra->type - 1], extra,
609 			       sizeof(*extra));
610 		}
611 
612 		skb = xennet_get_rx_skb(np, cons);
613 		ref = xennet_get_rx_ref(np, cons);
614 		xennet_move_rx_slot(np, skb, ref);
615 	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
616 
617 	np->rx.rsp_cons = cons;
618 	return err;
619 }
620 
621 static int xennet_get_responses(struct netfront_info *np,
622 				struct netfront_rx_info *rinfo, RING_IDX rp,
623 				struct sk_buff_head *list)
624 {
625 	struct xen_netif_rx_response *rx = &rinfo->rx;
626 	struct xen_netif_extra_info *extras = rinfo->extras;
627 	struct device *dev = &np->netdev->dev;
628 	RING_IDX cons = np->rx.rsp_cons;
629 	struct sk_buff *skb = xennet_get_rx_skb(np, cons);
630 	grant_ref_t ref = xennet_get_rx_ref(np, cons);
631 	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
632 	int frags = 1;
633 	int err = 0;
634 	unsigned long ret;
635 
636 	if (rx->flags & NETRXF_extra_info) {
637 		err = xennet_get_extras(np, extras, rp);
638 		cons = np->rx.rsp_cons;
639 	}
640 
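	/*
	 * Walk the chain of responses making up this packet, taking ownership
	 * of each granted page; on error the slot is recycled onto the ring
	 * via xennet_move_rx_slot().
	 */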
641 	for (;;) {
642 		if (unlikely(rx->status < 0 ||
643 			     rx->offset + rx->status > PAGE_SIZE)) {
644 			if (net_ratelimit())
645 				dev_warn(dev, "rx->offset: %x, size: %u\n",
646 					 rx->offset, rx->status);
647 			xennet_move_rx_slot(np, skb, ref);
648 			err = -EINVAL;
649 			goto next;
650 		}
651 
652 		/*
653 		 * This definitely indicates a bug, either in this driver or in
654 		 * the backend driver. In future this should flag the bad
655 	 * situation to the system controller to reboot the backend.
656 		 */
657 		if (ref == GRANT_INVALID_REF) {
658 			if (net_ratelimit())
659 				dev_warn(dev, "Bad rx response id %d.\n",
660 					 rx->id);
661 			err = -EINVAL;
662 			goto next;
663 		}
664 
665 		ret = gnttab_end_foreign_access_ref(ref, 0);
666 		BUG_ON(!ret);
667 
668 		gnttab_release_grant_reference(&np->gref_rx_head, ref);
669 
670 		__skb_queue_tail(list, skb);
671 
672 next:
673 		if (!(rx->flags & NETRXF_more_data))
674 			break;
675 
676 		if (cons + frags == rp) {
677 			if (net_ratelimit())
678 				dev_warn(dev, "Need more frags\n");
679 			err = -ENOENT;
680 			break;
681 		}
682 
683 		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
684 		skb = xennet_get_rx_skb(np, cons + frags);
685 		ref = xennet_get_rx_ref(np, cons + frags);
686 		frags++;
687 	}
688 
689 	if (unlikely(frags > max)) {
690 		if (net_ratelimit())
691 			dev_warn(dev, "Too many frags\n");
692 		err = -E2BIG;
693 	}
694 
695 	if (unlikely(err))
696 		np->rx.rsp_cons = cons + frags;
697 
698 	return err;
699 }
700 
701 static int xennet_set_skb_gso(struct sk_buff *skb,
702 			      struct xen_netif_extra_info *gso)
703 {
704 	if (!gso->u.gso.size) {
705 		if (net_ratelimit())
706 			printk(KERN_WARNING "GSO size must not be zero.\n");
707 		return -EINVAL;
708 	}
709 
710 	/* Currently only TCPv4 S.O. is supported. */
711 	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
712 		if (net_ratelimit())
713 			printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
714 		return -EINVAL;
715 	}
716 
717 	skb_shinfo(skb)->gso_size = gso->u.gso.size;
718 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
719 
720 	/* Header must be checked, and gso_segs computed. */
721 	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
722 	skb_shinfo(skb)->gso_segs = 0;
723 
724 	return 0;
725 }
726 
727 static RING_IDX xennet_fill_frags(struct netfront_info *np,
728 				  struct sk_buff *skb,
729 				  struct sk_buff_head *list)
730 {
731 	struct skb_shared_info *shinfo = skb_shinfo(skb);
732 	int nr_frags = shinfo->nr_frags;
733 	RING_IDX cons = np->rx.rsp_cons;
734 	skb_frag_t *frag = shinfo->frags + nr_frags;
735 	struct sk_buff *nskb;
736 
737 	while ((nskb = __skb_dequeue(list))) {
738 		struct xen_netif_rx_response *rx =
739 			RING_GET_RESPONSE(&np->rx, ++cons);
740 
741 		frag->page = skb_shinfo(nskb)->frags[0].page;
742 		frag->page_offset = rx->offset;
743 		frag->size = rx->status;
744 
745 		skb->data_len += rx->status;
746 
747 		skb_shinfo(nskb)->nr_frags = 0;
748 		kfree_skb(nskb);
749 
750 		frag++;
751 		nr_frags++;
752 	}
753 
754 	shinfo->nr_frags = nr_frags;
755 	return cons;
756 }
757 
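/*
 * A packet flagged NETRXF_csum_blank arrives with no transport checksum.
 * Locate the TCP/UDP header and set csum_start/csum_offset so the checksum
 * can be completed later (CHECKSUM_PARTIAL handling).
 */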
758 static int skb_checksum_setup(struct sk_buff *skb)
759 {
760 	struct iphdr *iph;
761 	unsigned char *th;
762 	int err = -EPROTO;
763 
764 	if (skb->protocol != htons(ETH_P_IP))
765 		goto out;
766 
767 	iph = (void *)skb->data;
768 	th = skb->data + 4 * iph->ihl;
769 	if (th >= skb_tail_pointer(skb))
770 		goto out;
771 
772 	skb->csum_start = th - skb->head;
773 	switch (iph->protocol) {
774 	case IPPROTO_TCP:
775 		skb->csum_offset = offsetof(struct tcphdr, check);
776 		break;
777 	case IPPROTO_UDP:
778 		skb->csum_offset = offsetof(struct udphdr, check);
779 		break;
780 	default:
781 		if (net_ratelimit())
782 			printk(KERN_ERR "Attempting to checksum a non-"
783 			       "TCP/UDP packet, dropping a protocol"
784 			       " %d packet", iph->protocol);
785 		goto out;
786 	}
787 
788 	if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
789 		goto out;
790 
791 	err = 0;
792 
793 out:
794 	return err;
795 }
796 
797 static int handle_incoming_queue(struct net_device *dev,
798 				 struct sk_buff_head *rxq)
799 {
800 	int packets_dropped = 0;
801 	struct sk_buff *skb;
802 
803 	while ((skb = __skb_dequeue(rxq)) != NULL) {
804 		struct page *page = NETFRONT_SKB_CB(skb)->page;
805 		void *vaddr = page_address(page);
806 		unsigned offset = NETFRONT_SKB_CB(skb)->offset;
807 
808 		memcpy(skb->data, vaddr + offset,
809 		       skb_headlen(skb));
810 
811 		if (page != skb_shinfo(skb)->frags[0].page)
812 			__free_page(page);
813 
814 		/* Ethernet work: Delayed to here as it peeks the header. */
815 		skb->protocol = eth_type_trans(skb, dev);
816 
817 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
818 			if (skb_checksum_setup(skb)) {
819 				kfree_skb(skb);
820 				packets_dropped++;
821 				dev->stats.rx_errors++;
822 				continue;
823 			}
824 		}
825 
826 		dev->stats.rx_packets++;
827 		dev->stats.rx_bytes += skb->len;
828 
829 		/* Pass it up. */
830 		netif_receive_skb(skb);
831 		dev->last_rx = jiffies;
832 	}
833 
834 	return packets_dropped;
835 }
836 
837 static int xennet_poll(struct napi_struct *napi, int budget)
838 {
839 	struct netfront_info *np = container_of(napi, struct netfront_info, napi);
840 	struct net_device *dev = np->netdev;
841 	struct sk_buff *skb;
842 	struct netfront_rx_info rinfo;
843 	struct xen_netif_rx_response *rx = &rinfo.rx;
844 	struct xen_netif_extra_info *extras = rinfo.extras;
845 	RING_IDX i, rp;
846 	int work_done;
847 	struct sk_buff_head rxq;
848 	struct sk_buff_head errq;
849 	struct sk_buff_head tmpq;
850 	unsigned long flags;
851 	unsigned int len;
852 	int err;
853 
854 	spin_lock(&np->rx_lock);
855 
856 	if (unlikely(!netif_carrier_ok(dev))) {
857 		spin_unlock(&np->rx_lock);
858 		return 0;
859 	}
860 
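	/*
	 * tmpq collects the responses belonging to one packet, rxq holds
	 * completed packets awaiting delivery, and errq holds skbs to discard.
	 */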
861 	skb_queue_head_init(&rxq);
862 	skb_queue_head_init(&errq);
863 	skb_queue_head_init(&tmpq);
864 
865 	rp = np->rx.sring->rsp_prod;
866 	rmb(); /* Ensure we see queued responses up to 'rp'. */
867 
868 	i = np->rx.rsp_cons;
869 	work_done = 0;
870 	while ((i != rp) && (work_done < budget)) {
871 		memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
872 		memset(extras, 0, sizeof(rinfo.extras));
873 
874 		err = xennet_get_responses(np, &rinfo, rp, &tmpq);
875 
876 		if (unlikely(err)) {
877 err:
878 			while ((skb = __skb_dequeue(&tmpq)))
879 				__skb_queue_tail(&errq, skb);
880 			dev->stats.rx_errors++;
881 			i = np->rx.rsp_cons;
882 			continue;
883 		}
884 
885 		skb = __skb_dequeue(&tmpq);
886 
887 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
888 			struct xen_netif_extra_info *gso;
889 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
890 
891 			if (unlikely(xennet_set_skb_gso(skb, gso))) {
892 				__skb_queue_head(&tmpq, skb);
893 				np->rx.rsp_cons += skb_queue_len(&tmpq);
894 				goto err;
895 			}
896 		}
897 
898 		NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
899 		NETFRONT_SKB_CB(skb)->offset = rx->offset;
900 
901 		len = rx->status;
902 		if (len > RX_COPY_THRESHOLD)
903 			len = RX_COPY_THRESHOLD;
904 		skb_put(skb, len);
905 
906 		if (rx->status > len) {
907 			skb_shinfo(skb)->frags[0].page_offset =
908 				rx->offset + len;
909 			skb_shinfo(skb)->frags[0].size = rx->status - len;
910 			skb->data_len = rx->status - len;
911 		} else {
912 			skb_shinfo(skb)->frags[0].page = NULL;
913 			skb_shinfo(skb)->nr_frags = 0;
914 		}
915 
916 		i = xennet_fill_frags(np, skb, &tmpq);
917 
918 		/*
919 		 * Truesize approximates the size of true data plus
920 		 * any supervisor overheads. Adding hypervisor
921 		 * overheads has been shown to significantly reduce
922 		 * achievable bandwidth with the default receive
923 		 * buffer size. It is therefore not wise to account
924 		 * for it here.
925 		 *
926 		 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
927 		 * to RX_COPY_THRESHOLD + the supervisor
928 		 * overheads. Here, we add the size of the data pulled
929 		 * in xennet_fill_frags().
930 		 *
931 		 * We also adjust for any unused space in the main
932 		 * data area by subtracting (RX_COPY_THRESHOLD -
933 		 * len). This is especially important with drivers
934 		 * which split incoming packets into header and data,
935 		 * using only 66 bytes of the main data area (see the
936 		 * e1000 driver, for example).  On such systems,
937 		 * without this last adjustment, our achievable
938 		 * receive throughput using the standard receive
939 		 * buffer size was cut by 25%(!!!).
940 		 */
941 		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
942 		skb->len += skb->data_len;
943 
944 		if (rx->flags & NETRXF_csum_blank)
945 			skb->ip_summed = CHECKSUM_PARTIAL;
946 		else if (rx->flags & NETRXF_data_validated)
947 			skb->ip_summed = CHECKSUM_UNNECESSARY;
948 
949 		__skb_queue_tail(&rxq, skb);
950 
951 		np->rx.rsp_cons = ++i;
952 		work_done++;
953 	}
954 
955 	while ((skb = __skb_dequeue(&errq)))
956 		kfree_skb(skb);
957 
958 	work_done -= handle_incoming_queue(dev, &rxq);
959 
960 	/* If we get a callback with very few responses, reduce fill target. */
961 	/* NB. Note exponential increase, linear decrease. */
962 	if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
963 	     ((3*np->rx_target) / 4)) &&
964 	    (--np->rx_target < np->rx_min_target))
965 		np->rx_target = np->rx_min_target;
966 
967 	xennet_alloc_rx_buffers(dev);
968 
969 	if (work_done < budget) {
970 		int more_to_do = 0;
971 
972 		local_irq_save(flags);
973 
974 		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
975 		if (!more_to_do)
976 			__netif_rx_complete(dev, napi);
977 
978 		local_irq_restore(flags);
979 	}
980 
981 	spin_unlock(&np->rx_lock);
982 
983 	return work_done;
984 }
985 
986 static int xennet_change_mtu(struct net_device *dev, int mtu)
987 {
988 	int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
989 
990 	if (mtu > max)
991 		return -EINVAL;
992 	dev->mtu = mtu;
993 	return 0;
994 }
995 
996 static void xennet_release_tx_bufs(struct netfront_info *np)
997 {
998 	struct sk_buff *skb;
999 	int i;
1000 
1001 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
1002 		/* Skip over entries which are actually freelist references */
1003 		if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
1004 			continue;
1005 
1006 		skb = np->tx_skbs[i].skb;
1007 		gnttab_end_foreign_access_ref(np->grant_tx_ref[i],
1008 					      GNTMAP_readonly);
1009 		gnttab_release_grant_reference(&np->gref_tx_head,
1010 					       np->grant_tx_ref[i]);
1011 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
1012 		add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i);
1013 		dev_kfree_skb_irq(skb);
1014 	}
1015 }
1016 
1017 static void xennet_release_rx_bufs(struct netfront_info *np)
1018 {
1019 	struct mmu_update      *mmu = np->rx_mmu;
1020 	struct multicall_entry *mcl = np->rx_mcl;
1021 	struct sk_buff_head free_list;
1022 	struct sk_buff *skb;
1023 	unsigned long mfn;
1024 	int xfer = 0, noxfer = 0, unused = 0;
1025 	int id, ref;
1026 
1027 	dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
1028 			 __func__);
1029 	return;
1030 
1031 	skb_queue_head_init(&free_list);
1032 
1033 	spin_lock_bh(&np->rx_lock);
1034 
1035 	for (id = 0; id < NET_RX_RING_SIZE; id++) {
1036 		ref = np->grant_rx_ref[id];
1037 		if (ref == GRANT_INVALID_REF) {
1038 			unused++;
1039 			continue;
1040 		}
1041 
1042 		skb = np->rx_skbs[id];
1043 		mfn = gnttab_end_foreign_transfer_ref(ref);
1044 		gnttab_release_grant_reference(&np->gref_rx_head, ref);
1045 		np->grant_rx_ref[id] = GRANT_INVALID_REF;
1046 
1047 		if (0 == mfn) {
1048 			skb_shinfo(skb)->nr_frags = 0;
1049 			dev_kfree_skb(skb);
1050 			noxfer++;
1051 			continue;
1052 		}
1053 
1054 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1055 			/* Remap the page. */
1056 			struct page *page = skb_shinfo(skb)->frags[0].page;
1057 			unsigned long pfn = page_to_pfn(page);
1058 			void *vaddr = page_address(page);
1059 
1060 			MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
1061 						mfn_pte(mfn, PAGE_KERNEL),
1062 						0);
1063 			mcl++;
1064 			mmu->ptr = ((u64)mfn << PAGE_SHIFT)
1065 				| MMU_MACHPHYS_UPDATE;
1066 			mmu->val = pfn;
1067 			mmu++;
1068 
1069 			set_phys_to_machine(pfn, mfn);
1070 		}
1071 		__skb_queue_tail(&free_list, skb);
1072 		xfer++;
1073 	}
1074 
1075 	dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
1076 		 __func__, xfer, noxfer, unused);
1077 
1078 	if (xfer) {
1079 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1080 			/* Do all the remapping work and M2P updates. */
1081 			MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
1082 					 0, DOMID_SELF);
1083 			mcl++;
1084 			HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
1085 		}
1086 	}
1087 
1088 	while ((skb = __skb_dequeue(&free_list)) != NULL)
1089 		dev_kfree_skb(skb);
1090 
1091 	spin_unlock_bh(&np->rx_lock);
1092 }
1093 
1094 static void xennet_uninit(struct net_device *dev)
1095 {
1096 	struct netfront_info *np = netdev_priv(dev);
1097 	xennet_release_tx_bufs(np);
1098 	xennet_release_rx_bufs(np);
1099 	gnttab_free_grant_references(np->gref_tx_head);
1100 	gnttab_free_grant_references(np->gref_rx_head);
1101 }
1102 
1103 static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
1104 {
1105 	int i, err;
1106 	struct net_device *netdev;
1107 	struct netfront_info *np;
1108 
1109 	netdev = alloc_etherdev(sizeof(struct netfront_info));
1110 	if (!netdev) {
1111 		printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
1112 		       __func__);
1113 		return ERR_PTR(-ENOMEM);
1114 	}
1115 
1116 	np                   = netdev_priv(netdev);
1117 	np->xbdev            = dev;
1118 
1119 	spin_lock_init(&np->tx_lock);
1120 	spin_lock_init(&np->rx_lock);
1121 
1122 	skb_queue_head_init(&np->rx_batch);
1123 	np->rx_target     = RX_DFL_MIN_TARGET;
1124 	np->rx_min_target = RX_DFL_MIN_TARGET;
1125 	np->rx_max_target = RX_MAX_TARGET;
1126 
1127 	init_timer(&np->rx_refill_timer);
1128 	np->rx_refill_timer.data = (unsigned long)netdev;
1129 	np->rx_refill_timer.function = rx_refill_timeout;
1130 
1131 	/* Initialise tx_skbs as a free chain containing every entry. */
1132 	np->tx_skb_freelist = 0;
1133 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
1134 		np->tx_skbs[i].link = i+1;
1135 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
1136 	}
1137 
1138 	/* Clear out rx_skbs */
1139 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
1140 		np->rx_skbs[i] = NULL;
1141 		np->grant_rx_ref[i] = GRANT_INVALID_REF;
1142 	}
1143 
1144 	/* A grant for every tx ring slot */
1145 	if (gnttab_alloc_grant_references(TX_MAX_TARGET,
1146 					  &np->gref_tx_head) < 0) {
1147 		printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
1148 		err = -ENOMEM;
1149 		goto exit;
1150 	}
1151 	/* A grant for every rx ring slot */
1152 	if (gnttab_alloc_grant_references(RX_MAX_TARGET,
1153 					  &np->gref_rx_head) < 0) {
1154 		printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
1155 		err = -ENOMEM;
1156 		goto exit_free_tx;
1157 	}
1158 
1159 	netdev->open            = xennet_open;
1160 	netdev->hard_start_xmit = xennet_start_xmit;
1161 	netdev->stop            = xennet_close;
1162 	netif_napi_add(netdev, &np->napi, xennet_poll, 64);
1163 	netdev->uninit          = xennet_uninit;
1164 	netdev->change_mtu	= xennet_change_mtu;
1165 	netdev->features        = NETIF_F_IP_CSUM;
1166 
1167 	SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
1168 	SET_NETDEV_DEV(netdev, &dev->dev);
1169 
1170 	np->netdev = netdev;
1171 
1172 	netif_carrier_off(netdev);
1173 
1174 	return netdev;
1175 
1176  exit_free_tx:
1177 	gnttab_free_grant_references(np->gref_tx_head);
1178  exit:
1179 	free_netdev(netdev);
1180 	return ERR_PTR(err);
1181 }
1182 
1183 /**
1184  * Entry point to this code when a new device is created.  Allocate the basic
1185  * structures and the ring buffers for communication with the backend, and
1186  * inform the backend of the appropriate details for those.
1187  */
1188 static int __devinit netfront_probe(struct xenbus_device *dev,
1189 				    const struct xenbus_device_id *id)
1190 {
1191 	int err;
1192 	struct net_device *netdev;
1193 	struct netfront_info *info;
1194 
1195 	netdev = xennet_create_dev(dev);
1196 	if (IS_ERR(netdev)) {
1197 		err = PTR_ERR(netdev);
1198 		xenbus_dev_fatal(dev, err, "creating netdev");
1199 		return err;
1200 	}
1201 
1202 	info = netdev_priv(netdev);
1203 	dev->dev.driver_data = info;
1204 
1205 	err = register_netdev(info->netdev);
1206 	if (err) {
1207 		printk(KERN_WARNING "%s: register_netdev err=%d\n",
1208 		       __func__, err);
1209 		goto fail;
1210 	}
1211 
1212 	err = xennet_sysfs_addif(info->netdev);
1213 	if (err) {
1214 		unregister_netdev(info->netdev);
1215 		printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
1216 		       __func__, err);
1217 		goto fail;
1218 	}
1219 
1220 	return 0;
1221 
1222  fail:
1223 	free_netdev(netdev);
1224 	dev->dev.driver_data = NULL;
1225 	return err;
1226 }
1227 
1228 static void xennet_end_access(int ref, void *page)
1229 {
1230 	/* This frees the page as a side-effect */
1231 	if (ref != GRANT_INVALID_REF)
1232 		gnttab_end_foreign_access(ref, 0, (unsigned long)page);
1233 }
1234 
1235 static void xennet_disconnect_backend(struct netfront_info *info)
1236 {
1237 	/* Stop old i/f to prevent errors whilst we rebuild the state. */
1238 	spin_lock_bh(&info->rx_lock);
1239 	spin_lock_irq(&info->tx_lock);
1240 	netif_carrier_off(info->netdev);
1241 	spin_unlock_irq(&info->tx_lock);
1242 	spin_unlock_bh(&info->rx_lock);
1243 
1244 	if (info->netdev->irq)
1245 		unbind_from_irqhandler(info->netdev->irq, info->netdev);
1246 	info->evtchn = info->netdev->irq = 0;
1247 
1248 	/* End access and free the pages */
1249 	xennet_end_access(info->tx_ring_ref, info->tx.sring);
1250 	xennet_end_access(info->rx_ring_ref, info->rx.sring);
1251 
1252 	info->tx_ring_ref = GRANT_INVALID_REF;
1253 	info->rx_ring_ref = GRANT_INVALID_REF;
1254 	info->tx.sring = NULL;
1255 	info->rx.sring = NULL;
1256 }
1257 
1258 /**
1259  * We are reconnecting to the backend, due to a suspend/resume, or a backend
1260  * driver restart.  We tear down our netif structure and recreate it, but
1261  * leave the device-layer structures intact so that this is transparent to the
1262  * rest of the kernel.
1263  */
1264 static int netfront_resume(struct xenbus_device *dev)
1265 {
1266 	struct netfront_info *info = dev->dev.driver_data;
1267 
1268 	dev_dbg(&dev->dev, "%s\n", dev->nodename);
1269 
1270 	xennet_disconnect_backend(info);
1271 	return 0;
1272 }
1273 
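/* Parse the device's xenstore "mac" node ("xx:xx:xx:xx:xx:xx") into mac[]. */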
1274 static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
1275 {
1276 	char *s, *e, *macstr;
1277 	int i;
1278 
1279 	macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
1280 	if (IS_ERR(macstr))
1281 		return PTR_ERR(macstr);
1282 
1283 	for (i = 0; i < ETH_ALEN; i++) {
1284 		mac[i] = simple_strtoul(s, &e, 16);
1285 		if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
1286 			kfree(macstr);
1287 			return -ENOENT;
1288 		}
1289 		s = e+1;
1290 	}
1291 
1292 	kfree(macstr);
1293 	return 0;
1294 }
1295 
1296 static irqreturn_t xennet_interrupt(int irq, void *dev_id)
1297 {
1298 	struct net_device *dev = dev_id;
1299 	struct netfront_info *np = netdev_priv(dev);
1300 	unsigned long flags;
1301 
1302 	spin_lock_irqsave(&np->tx_lock, flags);
1303 
1304 	if (likely(netif_carrier_ok(dev))) {
1305 		xennet_tx_buf_gc(dev);
1306 		/* Under tx_lock: protects access to rx shared-ring indexes. */
1307 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
1308 			netif_rx_schedule(dev, &np->napi);
1309 	}
1310 
1311 	spin_unlock_irqrestore(&np->tx_lock, flags);
1312 
1313 	return IRQ_HANDLED;
1314 }
1315 
1316 static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
1317 {
1318 	struct xen_netif_tx_sring *txs;
1319 	struct xen_netif_rx_sring *rxs;
1320 	int err;
1321 	struct net_device *netdev = info->netdev;
1322 
1323 	info->tx_ring_ref = GRANT_INVALID_REF;
1324 	info->rx_ring_ref = GRANT_INVALID_REF;
1325 	info->rx.sring = NULL;
1326 	info->tx.sring = NULL;
1327 	netdev->irq = 0;
1328 
1329 	err = xen_net_read_mac(dev, netdev->dev_addr);
1330 	if (err) {
1331 		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
1332 		goto fail;
1333 	}
1334 
1335 	txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
1336 	if (!txs) {
1337 		err = -ENOMEM;
1338 		xenbus_dev_fatal(dev, err, "allocating tx ring page");
1339 		goto fail;
1340 	}
1341 	SHARED_RING_INIT(txs);
1342 	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
1343 
1344 	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
1345 	if (err < 0) {
1346 		free_page((unsigned long)txs);
1347 		goto fail;
1348 	}
1349 
1350 	info->tx_ring_ref = err;
1351 	rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
1352 	if (!rxs) {
1353 		err = -ENOMEM;
1354 		xenbus_dev_fatal(dev, err, "allocating rx ring page");
1355 		goto fail;
1356 	}
1357 	SHARED_RING_INIT(rxs);
1358 	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
1359 
1360 	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
1361 	if (err < 0) {
1362 		free_page((unsigned long)rxs);
1363 		goto fail;
1364 	}
1365 	info->rx_ring_ref = err;
1366 
1367 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
1368 	if (err)
1369 		goto fail;
1370 
1371 	err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
1372 					IRQF_SAMPLE_RANDOM, netdev->name,
1373 					netdev);
1374 	if (err < 0)
1375 		goto fail;
1376 	netdev->irq = err;
1377 	return 0;
1378 
1379  fail:
1380 	return err;
1381 }
1382 
1383 /* Common code used when first setting up, and when resuming. */
1384 static int talk_to_backend(struct xenbus_device *dev,
1385 			   struct netfront_info *info)
1386 {
1387 	const char *message;
1388 	struct xenbus_transaction xbt;
1389 	int err;
1390 
1391 	/* Create shared ring, alloc event channel. */
1392 	err = setup_netfront(dev, info);
1393 	if (err)
1394 		goto out;
1395 
1396 again:
1397 	err = xenbus_transaction_start(&xbt);
1398 	if (err) {
1399 		xenbus_dev_fatal(dev, err, "starting transaction");
1400 		goto destroy_ring;
1401 	}
1402 
1403 	err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
1404 			    info->tx_ring_ref);
1405 	if (err) {
1406 		message = "writing tx ring-ref";
1407 		goto abort_transaction;
1408 	}
1409 	err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
1410 			    info->rx_ring_ref);
1411 	if (err) {
1412 		message = "writing rx ring-ref";
1413 		goto abort_transaction;
1414 	}
1415 	err = xenbus_printf(xbt, dev->nodename,
1416 			    "event-channel", "%u", info->evtchn);
1417 	if (err) {
1418 		message = "writing event-channel";
1419 		goto abort_transaction;
1420 	}
1421 
1422 	err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
1423 			    1);
1424 	if (err) {
1425 		message = "writing request-rx-copy";
1426 		goto abort_transaction;
1427 	}
1428 
1429 	err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
1430 	if (err) {
1431 		message = "writing feature-rx-notify";
1432 		goto abort_transaction;
1433 	}
1434 
1435 	err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
1436 	if (err) {
1437 		message = "writing feature-sg";
1438 		goto abort_transaction;
1439 	}
1440 
1441 	err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
1442 	if (err) {
1443 		message = "writing feature-gso-tcpv4";
1444 		goto abort_transaction;
1445 	}
1446 
1447 	err = xenbus_transaction_end(xbt, 0);
1448 	if (err) {
1449 		if (err == -EAGAIN)
1450 			goto again;
1451 		xenbus_dev_fatal(dev, err, "completing transaction");
1452 		goto destroy_ring;
1453 	}
1454 
1455 	return 0;
1456 
1457  abort_transaction:
1458 	xenbus_transaction_end(xbt, 1);
1459 	xenbus_dev_fatal(dev, err, "%s", message);
1460  destroy_ring:
1461 	xennet_disconnect_backend(info);
1462  out:
1463 	return err;
1464 }
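
/*
 * For illustration only (values are invented): after a successful
 * transaction the keys written above leave the frontend's xenstore
 * directory looking roughly like
 *
 *	<nodename>/tx-ring-ref       = "8"
 *	<nodename>/rx-ring-ref       = "9"
 *	<nodename>/event-channel     = "17"
 *	<nodename>/request-rx-copy   = "1"
 *	<nodename>/feature-rx-notify = "1"
 *	<nodename>/feature-sg        = "1"
 *	<nodename>/feature-gso-tcpv4 = "1"
 */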
1465 
1466 static int xennet_set_sg(struct net_device *dev, u32 data)
1467 {
1468 	if (data) {
1469 		struct netfront_info *np = netdev_priv(dev);
1470 		int val;
1471 
1472 		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
1473 				 "%d", &val) < 0)
1474 			val = 0;
1475 		if (!val)
1476 			return -ENOSYS;
1477 	} else if (dev->mtu > ETH_DATA_LEN)
1478 		dev->mtu = ETH_DATA_LEN;
1479 
1480 	return ethtool_op_set_sg(dev, data);
1481 }
1482 
1483 static int xennet_set_tso(struct net_device *dev, u32 data)
1484 {
1485 	if (data) {
1486 		struct netfront_info *np = netdev_priv(dev);
1487 		int val;
1488 
1489 		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1490 				 "feature-gso-tcpv4", "%d", &val) < 0)
1491 			val = 0;
1492 		if (!val)
1493 			return -ENOSYS;
1494 	}
1495 
1496 	return ethtool_op_set_tso(dev, data);
1497 }
1498 
1499 static void xennet_set_features(struct net_device *dev)
1500 {
1501 	/* Turn off all GSO bits except ROBUST. */
1502 	dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
1503 	dev->features |= NETIF_F_GSO_ROBUST;
1504 	xennet_set_sg(dev, 0);
1505 
1506 	/* We need checksum offload to enable scatter/gather and TSO. */
1507 	if (!(dev->features & NETIF_F_IP_CSUM))
1508 		return;
1509 
1510 	if (!xennet_set_sg(dev, 1))
1511 		xennet_set_tso(dev, 1);
1512 }
1513 
1514 static int xennet_connect(struct net_device *dev)
1515 {
1516 	struct netfront_info *np = netdev_priv(dev);
1517 	int i, requeue_idx, err;
1518 	struct sk_buff *skb;
1519 	grant_ref_t ref;
1520 	struct xen_netif_rx_request *req;
1521 	unsigned int feature_rx_copy;
1522 
1523 	err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1524 			   "feature-rx-copy", "%u", &feature_rx_copy);
1525 	if (err != 1)
1526 		feature_rx_copy = 0;
1527 
1528 	if (!feature_rx_copy) {
1529 		dev_info(&dev->dev,
1530 			 "backend does not support copying receive path");
1531 		return -ENODEV;
1532 	}
1533 
1534 	err = talk_to_backend(np->xbdev, np);
1535 	if (err)
1536 		return err;
1537 
1538 	xennet_set_features(dev);
1539 
1540 	spin_lock_bh(&np->rx_lock);
1541 	spin_lock_irq(&np->tx_lock);
1542 
1543 	/* Step 1: Discard all pending TX packet fragments. */
1544 	xennet_release_tx_bufs(np);
1545 
1546 	/* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
1547 	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1548 		if (!np->rx_skbs[i])
1549 			continue;
1550 
1551 		skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
1552 		ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
1553 		req = RING_GET_REQUEST(&np->rx, requeue_idx);
1554 
1555 		gnttab_grant_foreign_access_ref(
1556 			ref, np->xbdev->otherend_id,
1557 			pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
1558 					       frags->page)),
1559 			0);
1560 		req->gref = ref;
1561 		req->id   = requeue_idx;
1562 
1563 		requeue_idx++;
1564 	}
1565 
1566 	np->rx.req_prod_pvt = requeue_idx;
1567 
1568 	/*
1569 	 * Step 3: All public and private state should now be sane.  Get
1570 	 * ready to start sending and receiving packets and give the driver
1571 	 * domain a kick because we've probably just requeued some
1572 	 * packets.
1573 	 */
1574 	netif_carrier_on(np->netdev);
1575 	notify_remote_via_irq(np->netdev->irq);
1576 	xennet_tx_buf_gc(dev);
1577 	xennet_alloc_rx_buffers(dev);
1578 
1579 	spin_unlock_irq(&np->tx_lock);
1580 	spin_unlock_bh(&np->rx_lock);
1581 
1582 	return 0;
1583 }
1584 
1585 /**
1586  * Callback received when the backend's state changes.
1587  */
1588 static void backend_changed(struct xenbus_device *dev,
1589 			    enum xenbus_state backend_state)
1590 {
1591 	struct netfront_info *np = dev->dev.driver_data;
1592 	struct net_device *netdev = np->netdev;
1593 
1594 	dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
1595 
1596 	switch (backend_state) {
1597 	case XenbusStateInitialising:
1598 	case XenbusStateInitialised:
1599 	case XenbusStateConnected:
1600 	case XenbusStateUnknown:
1601 	case XenbusStateClosed:
1602 		break;
1603 
1604 	case XenbusStateInitWait:
1605 		if (dev->state != XenbusStateInitialising)
1606 			break;
1607 		if (xennet_connect(netdev) != 0)
1608 			break;
1609 		xenbus_switch_state(dev, XenbusStateConnected);
1610 		break;
1611 
1612 	case XenbusStateClosing:
1613 		xenbus_frontend_closed(dev);
1614 		break;
1615 	}
1616 }
1617 
1618 static struct ethtool_ops xennet_ethtool_ops =
1619 {
1620 	.set_tx_csum = ethtool_op_set_tx_csum,
1621 	.set_sg = xennet_set_sg,
1622 	.set_tso = xennet_set_tso,
1623 	.get_link = ethtool_op_get_link,
1624 };
1625 
1626 #ifdef CONFIG_SYSFS
1627 static ssize_t show_rxbuf_min(struct device *dev,
1628 			      struct device_attribute *attr, char *buf)
1629 {
1630 	struct net_device *netdev = to_net_dev(dev);
1631 	struct netfront_info *info = netdev_priv(netdev);
1632 
1633 	return sprintf(buf, "%u\n", info->rx_min_target);
1634 }
1635 
1636 static ssize_t store_rxbuf_min(struct device *dev,
1637 			       struct device_attribute *attr,
1638 			       const char *buf, size_t len)
1639 {
1640 	struct net_device *netdev = to_net_dev(dev);
1641 	struct netfront_info *np = netdev_priv(netdev);
1642 	char *endp;
1643 	unsigned long target;
1644 
1645 	if (!capable(CAP_NET_ADMIN))
1646 		return -EPERM;
1647 
1648 	target = simple_strtoul(buf, &endp, 0);
1649 	if (endp == buf)
1650 		return -EBADMSG;
1651 
1652 	if (target < RX_MIN_TARGET)
1653 		target = RX_MIN_TARGET;
1654 	if (target > RX_MAX_TARGET)
1655 		target = RX_MAX_TARGET;
1656 
1657 	spin_lock_bh(&np->rx_lock);
1658 	if (target > np->rx_max_target)
1659 		np->rx_max_target = target;
1660 	np->rx_min_target = target;
1661 	if (target > np->rx_target)
1662 		np->rx_target = target;
1663 
1664 	xennet_alloc_rx_buffers(netdev);
1665 
1666 	spin_unlock_bh(&np->rx_lock);
1667 	return len;
1668 }
1669 
1670 static ssize_t show_rxbuf_max(struct device *dev,
1671 			      struct device_attribute *attr, char *buf)
1672 {
1673 	struct net_device *netdev = to_net_dev(dev);
1674 	struct netfront_info *info = netdev_priv(netdev);
1675 
1676 	return sprintf(buf, "%u\n", info->rx_max_target);
1677 }
1678 
1679 static ssize_t store_rxbuf_max(struct device *dev,
1680 			       struct device_attribute *attr,
1681 			       const char *buf, size_t len)
1682 {
1683 	struct net_device *netdev = to_net_dev(dev);
1684 	struct netfront_info *np = netdev_priv(netdev);
1685 	char *endp;
1686 	unsigned long target;
1687 
1688 	if (!capable(CAP_NET_ADMIN))
1689 		return -EPERM;
1690 
1691 	target = simple_strtoul(buf, &endp, 0);
1692 	if (endp == buf)
1693 		return -EBADMSG;
1694 
1695 	if (target < RX_MIN_TARGET)
1696 		target = RX_MIN_TARGET;
1697 	if (target > RX_MAX_TARGET)
1698 		target = RX_MAX_TARGET;
1699 
1700 	spin_lock_bh(&np->rx_lock);
1701 	if (target < np->rx_min_target)
1702 		np->rx_min_target = target;
1703 	np->rx_max_target = target;
1704 	if (target < np->rx_target)
1705 		np->rx_target = target;
1706 
1707 	xennet_alloc_rx_buffers(netdev);
1708 
1709 	spin_unlock_bh(&np->rx_lock);
1710 	return len;
1711 }
1712 
1713 static ssize_t show_rxbuf_cur(struct device *dev,
1714 			      struct device_attribute *attr, char *buf)
1715 {
1716 	struct net_device *netdev = to_net_dev(dev);
1717 	struct netfront_info *info = netdev_priv(netdev);
1718 
1719 	return sprintf(buf, "%u\n", info->rx_target);
1720 }
1721 
1722 static struct device_attribute xennet_attrs[] = {
1723 	__ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
1724 	__ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
1725 	__ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
1726 };
1727 
1728 static int xennet_sysfs_addif(struct net_device *netdev)
1729 {
1730 	int i;
1731 	int err;
1732 
1733 	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
1734 		err = device_create_file(&netdev->dev,
1735 					   &xennet_attrs[i]);
1736 		if (err)
1737 			goto fail;
1738 	}
1739 	return 0;
1740 
1741  fail:
1742 	while (--i >= 0)
1743 		device_remove_file(&netdev->dev, &xennet_attrs[i]);
1744 	return err;
1745 }
1746 
1747 static void xennet_sysfs_delif(struct net_device *netdev)
1748 {
1749 	int i;
1750 
1751 	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
1752 		device_remove_file(&netdev->dev, &xennet_attrs[i]);
1753 }
1754 
1755 #endif /* CONFIG_SYSFS */
1756 
1757 static struct xenbus_device_id netfront_ids[] = {
1758 	{ "vif" },
1759 	{ "" }
1760 };
1761 
1762 
1763 static int __devexit xennet_remove(struct xenbus_device *dev)
1764 {
1765 	struct netfront_info *info = dev->dev.driver_data;
1766 
1767 	dev_dbg(&dev->dev, "%s\n", dev->nodename);
1768 
1769 	unregister_netdev(info->netdev);
1770 
1771 	xennet_disconnect_backend(info);
1772 
1773 	del_timer_sync(&info->rx_refill_timer);
1774 
1775 	xennet_sysfs_delif(info->netdev);
1776 
1777 	free_netdev(info->netdev);
1778 
1779 	return 0;
1780 }
1781 
1782 static struct xenbus_driver netfront = {
1783 	.name = "vif",
1784 	.owner = THIS_MODULE,
1785 	.ids = netfront_ids,
1786 	.probe = netfront_probe,
1787 	.remove = __devexit_p(xennet_remove),
1788 	.resume = netfront_resume,
1789 	.otherend_changed = backend_changed,
1790 };
1791 
1792 static int __init netif_init(void)
1793 {
1794 	if (!is_running_on_xen())
1795 		return -ENODEV;
1796 
1797 	if (is_initial_xendomain())
1798 		return 0;
1799 
1800 	printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
1801 
1802 	return xenbus_register_frontend(&netfront);
1803 }
1804 module_init(netif_init);
1805 
1806 
1807 static void __exit netif_exit(void)
1808 {
1809 	if (is_initial_xendomain())
1810 		return;
1811 
1812 	return xenbus_unregister_driver(&netfront);
1813 }
1814 module_exit(netif_exit);
1815 
1816 MODULE_DESCRIPTION("Xen virtual network device frontend");
1817 MODULE_LICENSE("GPL");
1818