xref: /linux/drivers/net/xen-netfront.c (revision c28054d4b31d78272f65c0d11db0796f50fb9569)
1 /*
2  * Virtual network driver for conversing with remote driver backends.
3  *
4  * Copyright (c) 2002-2005, K A Fraser
5  * Copyright (c) 2005, XenSource Ltd
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License version 2
9  * as published by the Free Software Foundation; or, when distributed
10  * separately from the Linux kernel or incorporated into other
11  * software packages, subject to the following license:
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this source file (the "Software"), to deal in the Software without
15  * restriction, including without limitation the rights to use, copy, modify,
16  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
17  * and to permit persons to whom the Software is furnished to do so, subject to
18  * the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
29  * IN THE SOFTWARE.
30  */
31 
32 #include <linux/module.h>
33 #include <linux/kernel.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/skbuff.h>
37 #include <linux/ethtool.h>
38 #include <linux/if_ether.h>
39 #include <linux/tcp.h>
40 #include <linux/udp.h>
41 #include <linux/moduleparam.h>
42 #include <linux/mm.h>
43 #include <net/ip.h>
44 
45 #include <xen/xenbus.h>
46 #include <xen/events.h>
47 #include <xen/page.h>
48 #include <xen/grant_table.h>
49 
50 #include <xen/interface/io/netif.h>
51 #include <xen/interface/memory.h>
52 #include <xen/interface/grant_table.h>
53 
54 static struct ethtool_ops xennet_ethtool_ops;
55 
56 struct netfront_cb {
57 	struct page *page;
58 	unsigned offset;
59 };
60 
61 #define NETFRONT_SKB_CB(skb)	((struct netfront_cb *)((skb)->cb))
62 
/*
 * Size of the linear area allocated for each receive skb, and the
 * maximum number of bytes of a received packet copied into it.
 */
#define RX_COPY_THRESHOLD 256

/* Value stored in a grant-reference slot that currently holds no grant. */
#define GRANT_INVALID_REF	0

/* Number of request/response slots that fit in a one-page shared ring. */
#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)

/*
 * Upper bound on outstanding transmit requests.  This limits the TX
 * ring, so it is derived from the TX ring size (not the RX ring size).
 */
#define TX_MAX_TARGET min_t(int, NET_TX_RING_SIZE, 256)
70 
71 struct netfront_info {
72 	struct list_head list;
73 	struct net_device *netdev;
74 
75 	struct net_device_stats stats;
76 
77 	struct xen_netif_tx_front_ring tx;
78 	struct xen_netif_rx_front_ring rx;
79 
80 	spinlock_t   tx_lock;
81 	spinlock_t   rx_lock;
82 
83 	unsigned int evtchn;
84 
85 	/* Receive-ring batched refills. */
86 #define RX_MIN_TARGET 8
87 #define RX_DFL_MIN_TARGET 64
88 #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
89 	unsigned rx_min_target, rx_max_target, rx_target;
90 	struct sk_buff_head rx_batch;
91 
92 	struct timer_list rx_refill_timer;
93 
94 	/*
95 	 * {tx,rx}_skbs store outstanding skbuffs. Free tx_skb entries
96 	 * are linked from tx_skb_freelist through skb_entry.link.
97 	 *
98 	 *  NB. Freelist index entries are always going to be less than
99 	 *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
100 	 *  greater than PAGE_OFFSET: we use this property to distinguish
101 	 *  them.
102 	 */
103 	union skb_entry {
104 		struct sk_buff *skb;
105 		unsigned link;
106 	} tx_skbs[NET_TX_RING_SIZE];
107 	grant_ref_t gref_tx_head;
108 	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
109 	unsigned tx_skb_freelist;
110 
111 	struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
112 	grant_ref_t gref_rx_head;
113 	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
114 
115 	struct xenbus_device *xbdev;
116 	int tx_ring_ref;
117 	int rx_ring_ref;
118 
119 	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
120 	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
121 	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
122 };
123 
124 struct netfront_rx_info {
125 	struct xen_netif_rx_response rx;
126 	struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
127 };
128 
129 /*
130  * Helper functions for acquiring and freeing slots in tx_skbs[].
131  */
132 
133 static void add_id_to_freelist(unsigned *head, union skb_entry *list,
134 			       unsigned short id)
135 {
136 	list[id].link = *head;
137 	*head = id;
138 }
139 
140 static unsigned short get_id_from_freelist(unsigned *head,
141 					   union skb_entry *list)
142 {
143 	unsigned int id = *head;
144 	*head = list[id].link;
145 	return id;
146 }
147 
148 static int xennet_rxidx(RING_IDX idx)
149 {
150 	return idx & (NET_RX_RING_SIZE - 1);
151 }
152 
153 static struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
154 					 RING_IDX ri)
155 {
156 	int i = xennet_rxidx(ri);
157 	struct sk_buff *skb = np->rx_skbs[i];
158 	np->rx_skbs[i] = NULL;
159 	return skb;
160 }
161 
162 static grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
163 					    RING_IDX ri)
164 {
165 	int i = xennet_rxidx(ri);
166 	grant_ref_t ref = np->grant_rx_ref[i];
167 	np->grant_rx_ref[i] = GRANT_INVALID_REF;
168 	return ref;
169 }
170 
/*
 * sysfs attribute registration.  Without CONFIG_SYSFS the hooks become
 * no-op inline stubs, so callers need no #ifdefs and their arguments
 * are still type-checked.
 */
#ifdef CONFIG_SYSFS
static int xennet_sysfs_addif(struct net_device *netdev);
static void xennet_sysfs_delif(struct net_device *netdev);
#else /* !CONFIG_SYSFS */
static inline int xennet_sysfs_addif(struct net_device *netdev)
{
	return 0;
}

static inline void xennet_sysfs_delif(struct net_device *netdev)
{
}
#endif
178 
179 static int xennet_can_sg(struct net_device *dev)
180 {
181 	return dev->features & NETIF_F_SG;
182 }
183 
184 
/*
 * Timer callback for rx_refill_timer.  @data carries the net_device
 * pointer; all we do is schedule the device for polling again.
 */
static void rx_refill_timeout(unsigned long data)
{
	netif_rx_schedule((struct net_device *)data);
}
190 
191 static int netfront_tx_slot_available(struct netfront_info *np)
192 {
193 	return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
194 		(TX_MAX_TARGET - MAX_SKB_FRAGS - 2));
195 }
196 
/*
 * Restart the transmit queue if it is stopped, ring space is available
 * again and the interface is running.
 */
static void xennet_maybe_wake_tx(struct net_device *dev)
{
	struct netfront_info *np = netdev_priv(dev);

	if (likely(!netif_queue_stopped(dev)))
		return;

	if (!netfront_tx_slot_available(np))
		return;

	if (unlikely(!netif_running(dev)))
		return;

	netif_wake_queue(dev);
}
206 
207 static void xennet_alloc_rx_buffers(struct net_device *dev)
208 {
209 	unsigned short id;
210 	struct netfront_info *np = netdev_priv(dev);
211 	struct sk_buff *skb;
212 	struct page *page;
213 	int i, batch_target, notify;
214 	RING_IDX req_prod = np->rx.req_prod_pvt;
215 	struct xen_memory_reservation reservation;
216 	grant_ref_t ref;
217 	unsigned long pfn;
218 	void *vaddr;
219 	int nr_flips;
220 	struct xen_netif_rx_request *req;
221 
222 	if (unlikely(!netif_carrier_ok(dev)))
223 		return;
224 
225 	/*
226 	 * Allocate skbuffs greedily, even though we batch updates to the
227 	 * receive ring. This creates a less bursty demand on the memory
228 	 * allocator, so should reduce the chance of failed allocation requests
229 	 * both for ourself and for other kernel subsystems.
230 	 */
231 	batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
232 	for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
233 		skb = __netdev_alloc_skb(dev, RX_COPY_THRESHOLD,
234 					 GFP_ATOMIC | __GFP_NOWARN);
235 		if (unlikely(!skb))
236 			goto no_skb;
237 
238 		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
239 		if (!page) {
240 			kfree_skb(skb);
241 no_skb:
242 			/* Any skbuffs queued for refill? Force them out. */
243 			if (i != 0)
244 				goto refill;
245 			/* Could not allocate any skbuffs. Try again later. */
246 			mod_timer(&np->rx_refill_timer,
247 				  jiffies + (HZ/10));
248 			break;
249 		}
250 
251 		skb_shinfo(skb)->frags[0].page = page;
252 		skb_shinfo(skb)->nr_frags = 1;
253 		__skb_queue_tail(&np->rx_batch, skb);
254 	}
255 
256 	/* Is the batch large enough to be worthwhile? */
257 	if (i < (np->rx_target/2)) {
258 		if (req_prod > np->rx.sring->req_prod)
259 			goto push;
260 		return;
261 	}
262 
263 	/* Adjust our fill target if we risked running out of buffers. */
264 	if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
265 	    ((np->rx_target *= 2) > np->rx_max_target))
266 		np->rx_target = np->rx_max_target;
267 
268  refill:
269 	for (nr_flips = i = 0; ; i++) {
270 		skb = __skb_dequeue(&np->rx_batch);
271 		if (skb == NULL)
272 			break;
273 
274 		skb->dev = dev;
275 
276 		id = xennet_rxidx(req_prod + i);
277 
278 		BUG_ON(np->rx_skbs[id]);
279 		np->rx_skbs[id] = skb;
280 
281 		ref = gnttab_claim_grant_reference(&np->gref_rx_head);
282 		BUG_ON((signed short)ref < 0);
283 		np->grant_rx_ref[id] = ref;
284 
285 		pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
286 		vaddr = page_address(skb_shinfo(skb)->frags[0].page);
287 
288 		req = RING_GET_REQUEST(&np->rx, req_prod + i);
289 		gnttab_grant_foreign_access_ref(ref,
290 						np->xbdev->otherend_id,
291 						pfn_to_mfn(pfn),
292 						0);
293 
294 		req->id = id;
295 		req->gref = ref;
296 	}
297 
298 	if (nr_flips != 0) {
299 		reservation.extent_start = np->rx_pfn_array;
300 		reservation.nr_extents   = nr_flips;
301 		reservation.extent_order = 0;
302 		reservation.address_bits = 0;
303 		reservation.domid        = DOMID_SELF;
304 
305 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
306 			/* After all PTEs have been zapped, flush the TLB. */
307 			np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
308 				UVMF_TLB_FLUSH|UVMF_ALL;
309 
310 			/* Give away a batch of pages. */
311 			np->rx_mcl[i].op = __HYPERVISOR_memory_op;
312 			np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
313 			np->rx_mcl[i].args[1] = (unsigned long)&reservation;
314 
315 			/* Zap PTEs and give away pages in one big
316 			 * multicall. */
317 			(void)HYPERVISOR_multicall(np->rx_mcl, i+1);
318 
319 			/* Check return status of HYPERVISOR_memory_op(). */
320 			if (unlikely(np->rx_mcl[i].result != i))
321 				panic("Unable to reduce memory reservation\n");
322 		} else {
323 			if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
324 						 &reservation) != i)
325 				panic("Unable to reduce memory reservation\n");
326 		}
327 	} else {
328 		wmb();		/* barrier so backend seens requests */
329 	}
330 
331 	/* Above is a suitable barrier to ensure backend will see requests. */
332 	np->rx.req_prod_pvt = req_prod + i;
333  push:
334 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
335 	if (notify)
336 		notify_remote_via_irq(np->netdev->irq);
337 }
338 
339 static int xennet_open(struct net_device *dev)
340 {
341 	struct netfront_info *np = netdev_priv(dev);
342 
343 	memset(&np->stats, 0, sizeof(np->stats));
344 
345 	spin_lock_bh(&np->rx_lock);
346 	if (netif_carrier_ok(dev)) {
347 		xennet_alloc_rx_buffers(dev);
348 		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
349 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
350 			netif_rx_schedule(dev);
351 	}
352 	spin_unlock_bh(&np->rx_lock);
353 
354 	xennet_maybe_wake_tx(dev);
355 
356 	return 0;
357 }
358 
359 static void xennet_tx_buf_gc(struct net_device *dev)
360 {
361 	RING_IDX cons, prod;
362 	unsigned short id;
363 	struct netfront_info *np = netdev_priv(dev);
364 	struct sk_buff *skb;
365 
366 	BUG_ON(!netif_carrier_ok(dev));
367 
368 	do {
369 		prod = np->tx.sring->rsp_prod;
370 		rmb(); /* Ensure we see responses up to 'rp'. */
371 
372 		for (cons = np->tx.rsp_cons; cons != prod; cons++) {
373 			struct xen_netif_tx_response *txrsp;
374 
375 			txrsp = RING_GET_RESPONSE(&np->tx, cons);
376 			if (txrsp->status == NETIF_RSP_NULL)
377 				continue;
378 
379 			id  = txrsp->id;
380 			skb = np->tx_skbs[id].skb;
381 			if (unlikely(gnttab_query_foreign_access(
382 				np->grant_tx_ref[id]) != 0)) {
383 				printk(KERN_ALERT "xennet_tx_buf_gc: warning "
384 				       "-- grant still in use by backend "
385 				       "domain.\n");
386 				BUG();
387 			}
388 			gnttab_end_foreign_access_ref(
389 				np->grant_tx_ref[id], GNTMAP_readonly);
390 			gnttab_release_grant_reference(
391 				&np->gref_tx_head, np->grant_tx_ref[id]);
392 			np->grant_tx_ref[id] = GRANT_INVALID_REF;
393 			add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, id);
394 			dev_kfree_skb_irq(skb);
395 		}
396 
397 		np->tx.rsp_cons = prod;
398 
399 		/*
400 		 * Set a new event, then check for race with update of tx_cons.
401 		 * Note that it is essential to schedule a callback, no matter
402 		 * how few buffers are pending. Even if there is space in the
403 		 * transmit ring, higher layers may be blocked because too much
404 		 * data is outstanding: in such cases notification from Xen is
405 		 * likely to be the only kick that we'll get.
406 		 */
407 		np->tx.sring->rsp_event =
408 			prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
409 		mb();		/* update shared area */
410 	} while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
411 
412 	xennet_maybe_wake_tx(dev);
413 }
414 
415 static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
416 			      struct xen_netif_tx_request *tx)
417 {
418 	struct netfront_info *np = netdev_priv(dev);
419 	char *data = skb->data;
420 	unsigned long mfn;
421 	RING_IDX prod = np->tx.req_prod_pvt;
422 	int frags = skb_shinfo(skb)->nr_frags;
423 	unsigned int offset = offset_in_page(data);
424 	unsigned int len = skb_headlen(skb);
425 	unsigned int id;
426 	grant_ref_t ref;
427 	int i;
428 
429 	/* While the header overlaps a page boundary (including being
430 	   larger than a page), split it it into page-sized chunks. */
431 	while (len > PAGE_SIZE - offset) {
432 		tx->size = PAGE_SIZE - offset;
433 		tx->flags |= NETTXF_more_data;
434 		len -= tx->size;
435 		data += tx->size;
436 		offset = 0;
437 
438 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
439 		np->tx_skbs[id].skb = skb_get(skb);
440 		tx = RING_GET_REQUEST(&np->tx, prod++);
441 		tx->id = id;
442 		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
443 		BUG_ON((signed short)ref < 0);
444 
445 		mfn = virt_to_mfn(data);
446 		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
447 						mfn, GNTMAP_readonly);
448 
449 		tx->gref = np->grant_tx_ref[id] = ref;
450 		tx->offset = offset;
451 		tx->size = len;
452 		tx->flags = 0;
453 	}
454 
455 	/* Grant backend access to each skb fragment page. */
456 	for (i = 0; i < frags; i++) {
457 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
458 
459 		tx->flags |= NETTXF_more_data;
460 
461 		id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
462 		np->tx_skbs[id].skb = skb_get(skb);
463 		tx = RING_GET_REQUEST(&np->tx, prod++);
464 		tx->id = id;
465 		ref = gnttab_claim_grant_reference(&np->gref_tx_head);
466 		BUG_ON((signed short)ref < 0);
467 
468 		mfn = pfn_to_mfn(page_to_pfn(frag->page));
469 		gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
470 						mfn, GNTMAP_readonly);
471 
472 		tx->gref = np->grant_tx_ref[id] = ref;
473 		tx->offset = frag->page_offset;
474 		tx->size = frag->size;
475 		tx->flags = 0;
476 	}
477 
478 	np->tx.req_prod_pvt = prod;
479 }
480 
481 static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
482 {
483 	unsigned short id;
484 	struct netfront_info *np = netdev_priv(dev);
485 	struct xen_netif_tx_request *tx;
486 	struct xen_netif_extra_info *extra;
487 	char *data = skb->data;
488 	RING_IDX i;
489 	grant_ref_t ref;
490 	unsigned long mfn;
491 	int notify;
492 	int frags = skb_shinfo(skb)->nr_frags;
493 	unsigned int offset = offset_in_page(data);
494 	unsigned int len = skb_headlen(skb);
495 
496 	frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
497 	if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
498 		printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
499 		       frags);
500 		dump_stack();
501 		goto drop;
502 	}
503 
504 	spin_lock_irq(&np->tx_lock);
505 
506 	if (unlikely(!netif_carrier_ok(dev) ||
507 		     (frags > 1 && !xennet_can_sg(dev)) ||
508 		     netif_needs_gso(dev, skb))) {
509 		spin_unlock_irq(&np->tx_lock);
510 		goto drop;
511 	}
512 
513 	i = np->tx.req_prod_pvt;
514 
515 	id = get_id_from_freelist(&np->tx_skb_freelist, np->tx_skbs);
516 	np->tx_skbs[id].skb = skb;
517 
518 	tx = RING_GET_REQUEST(&np->tx, i);
519 
520 	tx->id   = id;
521 	ref = gnttab_claim_grant_reference(&np->gref_tx_head);
522 	BUG_ON((signed short)ref < 0);
523 	mfn = virt_to_mfn(data);
524 	gnttab_grant_foreign_access_ref(
525 		ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
526 	tx->gref = np->grant_tx_ref[id] = ref;
527 	tx->offset = offset;
528 	tx->size = len;
529 	extra = NULL;
530 
531 	tx->flags = 0;
532 	if (skb->ip_summed == CHECKSUM_PARTIAL)
533 		/* local packet? */
534 		tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
535 	else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
536 		/* remote but checksummed. */
537 		tx->flags |= NETTXF_data_validated;
538 
539 	if (skb_shinfo(skb)->gso_size) {
540 		struct xen_netif_extra_info *gso;
541 
542 		gso = (struct xen_netif_extra_info *)
543 			RING_GET_REQUEST(&np->tx, ++i);
544 
545 		if (extra)
546 			extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
547 		else
548 			tx->flags |= NETTXF_extra_info;
549 
550 		gso->u.gso.size = skb_shinfo(skb)->gso_size;
551 		gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
552 		gso->u.gso.pad = 0;
553 		gso->u.gso.features = 0;
554 
555 		gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
556 		gso->flags = 0;
557 		extra = gso;
558 	}
559 
560 	np->tx.req_prod_pvt = i + 1;
561 
562 	xennet_make_frags(skb, dev, tx);
563 	tx->size = skb->len;
564 
565 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
566 	if (notify)
567 		notify_remote_via_irq(np->netdev->irq);
568 
569 	xennet_tx_buf_gc(dev);
570 
571 	if (!netfront_tx_slot_available(np))
572 		netif_stop_queue(dev);
573 
574 	spin_unlock_irq(&np->tx_lock);
575 
576 	np->stats.tx_bytes += skb->len;
577 	np->stats.tx_packets++;
578 
579 	return 0;
580 
581  drop:
582 	np->stats.tx_dropped++;
583 	dev_kfree_skb(skb);
584 	return 0;
585 }
586 
587 static int xennet_close(struct net_device *dev)
588 {
589 	struct netfront_info *np = netdev_priv(dev);
590 	netif_stop_queue(np->netdev);
591 	return 0;
592 }
593 
594 static struct net_device_stats *xennet_get_stats(struct net_device *dev)
595 {
596 	struct netfront_info *np = netdev_priv(dev);
597 	return &np->stats;
598 }
599 
600 static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
601 				grant_ref_t ref)
602 {
603 	int new = xennet_rxidx(np->rx.req_prod_pvt);
604 
605 	BUG_ON(np->rx_skbs[new]);
606 	np->rx_skbs[new] = skb;
607 	np->grant_rx_ref[new] = ref;
608 	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new;
609 	RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref;
610 	np->rx.req_prod_pvt++;
611 }
612 
613 static int xennet_get_extras(struct netfront_info *np,
614 			     struct xen_netif_extra_info *extras,
615 			     RING_IDX rp)
616 
617 {
618 	struct xen_netif_extra_info *extra;
619 	struct device *dev = &np->netdev->dev;
620 	RING_IDX cons = np->rx.rsp_cons;
621 	int err = 0;
622 
623 	do {
624 		struct sk_buff *skb;
625 		grant_ref_t ref;
626 
627 		if (unlikely(cons + 1 == rp)) {
628 			if (net_ratelimit())
629 				dev_warn(dev, "Missing extra info\n");
630 			err = -EBADR;
631 			break;
632 		}
633 
634 		extra = (struct xen_netif_extra_info *)
635 			RING_GET_RESPONSE(&np->rx, ++cons);
636 
637 		if (unlikely(!extra->type ||
638 			     extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
639 			if (net_ratelimit())
640 				dev_warn(dev, "Invalid extra type: %d\n",
641 					extra->type);
642 			err = -EINVAL;
643 		} else {
644 			memcpy(&extras[extra->type - 1], extra,
645 			       sizeof(*extra));
646 		}
647 
648 		skb = xennet_get_rx_skb(np, cons);
649 		ref = xennet_get_rx_ref(np, cons);
650 		xennet_move_rx_slot(np, skb, ref);
651 	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
652 
653 	np->rx.rsp_cons = cons;
654 	return err;
655 }
656 
657 static int xennet_get_responses(struct netfront_info *np,
658 				struct netfront_rx_info *rinfo, RING_IDX rp,
659 				struct sk_buff_head *list)
660 {
661 	struct xen_netif_rx_response *rx = &rinfo->rx;
662 	struct xen_netif_extra_info *extras = rinfo->extras;
663 	struct device *dev = &np->netdev->dev;
664 	RING_IDX cons = np->rx.rsp_cons;
665 	struct sk_buff *skb = xennet_get_rx_skb(np, cons);
666 	grant_ref_t ref = xennet_get_rx_ref(np, cons);
667 	int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
668 	int frags = 1;
669 	int err = 0;
670 	unsigned long ret;
671 
672 	if (rx->flags & NETRXF_extra_info) {
673 		err = xennet_get_extras(np, extras, rp);
674 		cons = np->rx.rsp_cons;
675 	}
676 
677 	for (;;) {
678 		if (unlikely(rx->status < 0 ||
679 			     rx->offset + rx->status > PAGE_SIZE)) {
680 			if (net_ratelimit())
681 				dev_warn(dev, "rx->offset: %x, size: %u\n",
682 					 rx->offset, rx->status);
683 			xennet_move_rx_slot(np, skb, ref);
684 			err = -EINVAL;
685 			goto next;
686 		}
687 
688 		/*
689 		 * This definitely indicates a bug, either in this driver or in
690 		 * the backend driver. In future this should flag the bad
691 		 * situation to the system controller to reboot the backed.
692 		 */
693 		if (ref == GRANT_INVALID_REF) {
694 			if (net_ratelimit())
695 				dev_warn(dev, "Bad rx response id %d.\n",
696 					 rx->id);
697 			err = -EINVAL;
698 			goto next;
699 		}
700 
701 		ret = gnttab_end_foreign_access_ref(ref, 0);
702 		BUG_ON(!ret);
703 
704 		gnttab_release_grant_reference(&np->gref_rx_head, ref);
705 
706 		__skb_queue_tail(list, skb);
707 
708 next:
709 		if (!(rx->flags & NETRXF_more_data))
710 			break;
711 
712 		if (cons + frags == rp) {
713 			if (net_ratelimit())
714 				dev_warn(dev, "Need more frags\n");
715 			err = -ENOENT;
716 			break;
717 		}
718 
719 		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
720 		skb = xennet_get_rx_skb(np, cons + frags);
721 		ref = xennet_get_rx_ref(np, cons + frags);
722 		frags++;
723 	}
724 
725 	if (unlikely(frags > max)) {
726 		if (net_ratelimit())
727 			dev_warn(dev, "Too many frags\n");
728 		err = -E2BIG;
729 	}
730 
731 	if (unlikely(err))
732 		np->rx.rsp_cons = cons + frags;
733 
734 	return err;
735 }
736 
737 static int xennet_set_skb_gso(struct sk_buff *skb,
738 			      struct xen_netif_extra_info *gso)
739 {
740 	if (!gso->u.gso.size) {
741 		if (net_ratelimit())
742 			printk(KERN_WARNING "GSO size must not be zero.\n");
743 		return -EINVAL;
744 	}
745 
746 	/* Currently only TCPv4 S.O. is supported. */
747 	if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
748 		if (net_ratelimit())
749 			printk(KERN_WARNING "Bad GSO type %d.\n", gso->u.gso.type);
750 		return -EINVAL;
751 	}
752 
753 	skb_shinfo(skb)->gso_size = gso->u.gso.size;
754 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
755 
756 	/* Header must be checked, and gso_segs computed. */
757 	skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
758 	skb_shinfo(skb)->gso_segs = 0;
759 
760 	return 0;
761 }
762 
763 static RING_IDX xennet_fill_frags(struct netfront_info *np,
764 				  struct sk_buff *skb,
765 				  struct sk_buff_head *list)
766 {
767 	struct skb_shared_info *shinfo = skb_shinfo(skb);
768 	int nr_frags = shinfo->nr_frags;
769 	RING_IDX cons = np->rx.rsp_cons;
770 	skb_frag_t *frag = shinfo->frags + nr_frags;
771 	struct sk_buff *nskb;
772 
773 	while ((nskb = __skb_dequeue(list))) {
774 		struct xen_netif_rx_response *rx =
775 			RING_GET_RESPONSE(&np->rx, ++cons);
776 
777 		frag->page = skb_shinfo(nskb)->frags[0].page;
778 		frag->page_offset = rx->offset;
779 		frag->size = rx->status;
780 
781 		skb->data_len += rx->status;
782 
783 		skb_shinfo(nskb)->nr_frags = 0;
784 		kfree_skb(nskb);
785 
786 		frag++;
787 		nr_frags++;
788 	}
789 
790 	shinfo->nr_frags = nr_frags;
791 	return cons;
792 }
793 
794 static int skb_checksum_setup(struct sk_buff *skb)
795 {
796 	struct iphdr *iph;
797 	unsigned char *th;
798 	int err = -EPROTO;
799 
800 	if (skb->protocol != htons(ETH_P_IP))
801 		goto out;
802 
803 	iph = (void *)skb->data;
804 	th = skb->data + 4 * iph->ihl;
805 	if (th >= skb_tail_pointer(skb))
806 		goto out;
807 
808 	skb->csum_start = th - skb->head;
809 	switch (iph->protocol) {
810 	case IPPROTO_TCP:
811 		skb->csum_offset = offsetof(struct tcphdr, check);
812 		break;
813 	case IPPROTO_UDP:
814 		skb->csum_offset = offsetof(struct udphdr, check);
815 		break;
816 	default:
817 		if (net_ratelimit())
818 			printk(KERN_ERR "Attempting to checksum a non-"
819 			       "TCP/UDP packet, dropping a protocol"
820 			       " %d packet", iph->protocol);
821 		goto out;
822 	}
823 
824 	if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
825 		goto out;
826 
827 	err = 0;
828 
829 out:
830 	return err;
831 }
832 
833 static int handle_incoming_queue(struct net_device *dev,
834 				  struct sk_buff_head *rxq)
835 {
836 	struct netfront_info *np = netdev_priv(dev);
837 	int packets_dropped = 0;
838 	struct sk_buff *skb;
839 
840 	while ((skb = __skb_dequeue(rxq)) != NULL) {
841 		struct page *page = NETFRONT_SKB_CB(skb)->page;
842 		void *vaddr = page_address(page);
843 		unsigned offset = NETFRONT_SKB_CB(skb)->offset;
844 
845 		memcpy(skb->data, vaddr + offset,
846 		       skb_headlen(skb));
847 
848 		if (page != skb_shinfo(skb)->frags[0].page)
849 			__free_page(page);
850 
851 		/* Ethernet work: Delayed to here as it peeks the header. */
852 		skb->protocol = eth_type_trans(skb, dev);
853 
854 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
855 			if (skb_checksum_setup(skb)) {
856 				kfree_skb(skb);
857 				packets_dropped++;
858 				np->stats.rx_errors++;
859 				continue;
860 			}
861 		}
862 
863 		np->stats.rx_packets++;
864 		np->stats.rx_bytes += skb->len;
865 
866 		/* Pass it up. */
867 		netif_receive_skb(skb);
868 		dev->last_rx = jiffies;
869 	}
870 
871 	return packets_dropped;
872 }
873 
874 static int xennet_poll(struct net_device *dev, int *pbudget)
875 {
876 	struct netfront_info *np = netdev_priv(dev);
877 	struct sk_buff *skb;
878 	struct netfront_rx_info rinfo;
879 	struct xen_netif_rx_response *rx = &rinfo.rx;
880 	struct xen_netif_extra_info *extras = rinfo.extras;
881 	RING_IDX i, rp;
882 	int work_done, budget, more_to_do = 1;
883 	struct sk_buff_head rxq;
884 	struct sk_buff_head errq;
885 	struct sk_buff_head tmpq;
886 	unsigned long flags;
887 	unsigned int len;
888 	int err;
889 
890 	spin_lock(&np->rx_lock);
891 
892 	if (unlikely(!netif_carrier_ok(dev))) {
893 		spin_unlock(&np->rx_lock);
894 		return 0;
895 	}
896 
897 	skb_queue_head_init(&rxq);
898 	skb_queue_head_init(&errq);
899 	skb_queue_head_init(&tmpq);
900 
901 	budget = *pbudget;
902 	if (budget > dev->quota)
903 		budget = dev->quota;
904 	rp = np->rx.sring->rsp_prod;
905 	rmb(); /* Ensure we see queued responses up to 'rp'. */
906 
907 	i = np->rx.rsp_cons;
908 	work_done = 0;
909 	while ((i != rp) && (work_done < budget)) {
910 		memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
911 		memset(extras, 0, sizeof(rinfo.extras));
912 
913 		err = xennet_get_responses(np, &rinfo, rp, &tmpq);
914 
915 		if (unlikely(err)) {
916 err:
917 			while ((skb = __skb_dequeue(&tmpq)))
918 				__skb_queue_tail(&errq, skb);
919 			np->stats.rx_errors++;
920 			i = np->rx.rsp_cons;
921 			continue;
922 		}
923 
924 		skb = __skb_dequeue(&tmpq);
925 
926 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
927 			struct xen_netif_extra_info *gso;
928 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
929 
930 			if (unlikely(xennet_set_skb_gso(skb, gso))) {
931 				__skb_queue_head(&tmpq, skb);
932 				np->rx.rsp_cons += skb_queue_len(&tmpq);
933 				goto err;
934 			}
935 		}
936 
937 		NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
938 		NETFRONT_SKB_CB(skb)->offset = rx->offset;
939 
940 		len = rx->status;
941 		if (len > RX_COPY_THRESHOLD)
942 			len = RX_COPY_THRESHOLD;
943 		skb_put(skb, len);
944 
945 		if (rx->status > len) {
946 			skb_shinfo(skb)->frags[0].page_offset =
947 				rx->offset + len;
948 			skb_shinfo(skb)->frags[0].size = rx->status - len;
949 			skb->data_len = rx->status - len;
950 		} else {
951 			skb_shinfo(skb)->frags[0].page = NULL;
952 			skb_shinfo(skb)->nr_frags = 0;
953 		}
954 
955 		i = xennet_fill_frags(np, skb, &tmpq);
956 
957 		/*
958 		 * Truesize approximates the size of true data plus
959 		 * any supervisor overheads. Adding hypervisor
960 		 * overheads has been shown to significantly reduce
961 		 * achievable bandwidth with the default receive
962 		 * buffer size. It is therefore not wise to account
963 		 * for it here.
964 		 *
965 		 * After alloc_skb(RX_COPY_THRESHOLD), truesize is set
966 		 * to RX_COPY_THRESHOLD + the supervisor
967 		 * overheads. Here, we add the size of the data pulled
968 		 * in xennet_fill_frags().
969 		 *
970 		 * We also adjust for any unused space in the main
971 		 * data area by subtracting (RX_COPY_THRESHOLD -
972 		 * len). This is especially important with drivers
973 		 * which split incoming packets into header and data,
974 		 * using only 66 bytes of the main data area (see the
975 		 * e1000 driver for example.)  On such systems,
976 		 * without this last adjustement, our achievable
977 		 * receive throughout using the standard receive
978 		 * buffer size was cut by 25%(!!!).
979 		 */
980 		skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
981 		skb->len += skb->data_len;
982 
983 		if (rx->flags & NETRXF_csum_blank)
984 			skb->ip_summed = CHECKSUM_PARTIAL;
985 		else if (rx->flags & NETRXF_data_validated)
986 			skb->ip_summed = CHECKSUM_UNNECESSARY;
987 
988 		__skb_queue_tail(&rxq, skb);
989 
990 		np->rx.rsp_cons = ++i;
991 		work_done++;
992 	}
993 
994 	while ((skb = __skb_dequeue(&errq)))
995 		kfree_skb(skb);
996 
997 	work_done -= handle_incoming_queue(dev, &rxq);
998 
999 	/* If we get a callback with very few responses, reduce fill target. */
1000 	/* NB. Note exponential increase, linear decrease. */
1001 	if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
1002 	     ((3*np->rx_target) / 4)) &&
1003 	    (--np->rx_target < np->rx_min_target))
1004 		np->rx_target = np->rx_min_target;
1005 
1006 	xennet_alloc_rx_buffers(dev);
1007 
1008 	*pbudget   -= work_done;
1009 	dev->quota -= work_done;
1010 
1011 	if (work_done < budget) {
1012 		local_irq_save(flags);
1013 
1014 		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
1015 		if (!more_to_do)
1016 			__netif_rx_complete(dev);
1017 
1018 		local_irq_restore(flags);
1019 	}
1020 
1021 	spin_unlock(&np->rx_lock);
1022 
1023 	return more_to_do;
1024 }
1025 
1026 static int xennet_change_mtu(struct net_device *dev, int mtu)
1027 {
1028 	int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
1029 
1030 	if (mtu > max)
1031 		return -EINVAL;
1032 	dev->mtu = mtu;
1033 	return 0;
1034 }
1035 
1036 static void xennet_release_tx_bufs(struct netfront_info *np)
1037 {
1038 	struct sk_buff *skb;
1039 	int i;
1040 
1041 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
1042 		/* Skip over entries which are actually freelist references */
1043 		if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
1044 			continue;
1045 
1046 		skb = np->tx_skbs[i].skb;
1047 		gnttab_end_foreign_access_ref(np->grant_tx_ref[i],
1048 					      GNTMAP_readonly);
1049 		gnttab_release_grant_reference(&np->gref_tx_head,
1050 					       np->grant_tx_ref[i]);
1051 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
1052 		add_id_to_freelist(&np->tx_skb_freelist, np->tx_skbs, i);
1053 		dev_kfree_skb_irq(skb);
1054 	}
1055 }
1056 
1057 static void xennet_release_rx_bufs(struct netfront_info *np)
1058 {
1059 	struct mmu_update      *mmu = np->rx_mmu;
1060 	struct multicall_entry *mcl = np->rx_mcl;
1061 	struct sk_buff_head free_list;
1062 	struct sk_buff *skb;
1063 	unsigned long mfn;
1064 	int xfer = 0, noxfer = 0, unused = 0;
1065 	int id, ref;
1066 
1067 	dev_warn(&np->netdev->dev, "%s: fix me for copying receiver.\n",
1068 			 __func__);
1069 	return;
1070 
1071 	skb_queue_head_init(&free_list);
1072 
1073 	spin_lock_bh(&np->rx_lock);
1074 
1075 	for (id = 0; id < NET_RX_RING_SIZE; id++) {
1076 		ref = np->grant_rx_ref[id];
1077 		if (ref == GRANT_INVALID_REF) {
1078 			unused++;
1079 			continue;
1080 		}
1081 
1082 		skb = np->rx_skbs[id];
1083 		mfn = gnttab_end_foreign_transfer_ref(ref);
1084 		gnttab_release_grant_reference(&np->gref_rx_head, ref);
1085 		np->grant_rx_ref[id] = GRANT_INVALID_REF;
1086 
1087 		if (0 == mfn) {
1088 			skb_shinfo(skb)->nr_frags = 0;
1089 			dev_kfree_skb(skb);
1090 			noxfer++;
1091 			continue;
1092 		}
1093 
1094 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1095 			/* Remap the page. */
1096 			struct page *page = skb_shinfo(skb)->frags[0].page;
1097 			unsigned long pfn = page_to_pfn(page);
1098 			void *vaddr = page_address(page);
1099 
1100 			MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
1101 						mfn_pte(mfn, PAGE_KERNEL),
1102 						0);
1103 			mcl++;
1104 			mmu->ptr = ((u64)mfn << PAGE_SHIFT)
1105 				| MMU_MACHPHYS_UPDATE;
1106 			mmu->val = pfn;
1107 			mmu++;
1108 
1109 			set_phys_to_machine(pfn, mfn);
1110 		}
1111 		__skb_queue_tail(&free_list, skb);
1112 		xfer++;
1113 	}
1114 
1115 	dev_info(&np->netdev->dev, "%s: %d xfer, %d noxfer, %d unused\n",
1116 		 __func__, xfer, noxfer, unused);
1117 
1118 	if (xfer) {
1119 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1120 			/* Do all the remapping work and M2P updates. */
1121 			MULTI_mmu_update(mcl, np->rx_mmu, mmu - np->rx_mmu,
1122 					 0, DOMID_SELF);
1123 			mcl++;
1124 			HYPERVISOR_multicall(np->rx_mcl, mcl - np->rx_mcl);
1125 		}
1126 	}
1127 
1128 	while ((skb = __skb_dequeue(&free_list)) != NULL)
1129 		dev_kfree_skb(skb);
1130 
1131 	spin_unlock_bh(&np->rx_lock);
1132 }
1133 
1134 static void xennet_uninit(struct net_device *dev)
1135 {
1136 	struct netfront_info *np = netdev_priv(dev);
1137 	xennet_release_tx_bufs(np);
1138 	xennet_release_rx_bufs(np);
1139 	gnttab_free_grant_references(np->gref_tx_head);
1140 	gnttab_free_grant_references(np->gref_rx_head);
1141 }
1142 
1143 static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev)
1144 {
1145 	int i, err;
1146 	struct net_device *netdev;
1147 	struct netfront_info *np;
1148 
1149 	netdev = alloc_etherdev(sizeof(struct netfront_info));
1150 	if (!netdev) {
1151 		printk(KERN_WARNING "%s> alloc_etherdev failed.\n",
1152 		       __func__);
1153 		return ERR_PTR(-ENOMEM);
1154 	}
1155 
1156 	np                   = netdev_priv(netdev);
1157 	np->xbdev            = dev;
1158 
1159 	spin_lock_init(&np->tx_lock);
1160 	spin_lock_init(&np->rx_lock);
1161 
1162 	skb_queue_head_init(&np->rx_batch);
1163 	np->rx_target     = RX_DFL_MIN_TARGET;
1164 	np->rx_min_target = RX_DFL_MIN_TARGET;
1165 	np->rx_max_target = RX_MAX_TARGET;
1166 
1167 	init_timer(&np->rx_refill_timer);
1168 	np->rx_refill_timer.data = (unsigned long)netdev;
1169 	np->rx_refill_timer.function = rx_refill_timeout;
1170 
1171 	/* Initialise tx_skbs as a free chain containing every entry. */
1172 	np->tx_skb_freelist = 0;
1173 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
1174 		np->tx_skbs[i].link = i+1;
1175 		np->grant_tx_ref[i] = GRANT_INVALID_REF;
1176 	}
1177 
1178 	/* Clear out rx_skbs */
1179 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
1180 		np->rx_skbs[i] = NULL;
1181 		np->grant_rx_ref[i] = GRANT_INVALID_REF;
1182 	}
1183 
1184 	/* A grant for every tx ring slot */
1185 	if (gnttab_alloc_grant_references(TX_MAX_TARGET,
1186 					  &np->gref_tx_head) < 0) {
1187 		printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n");
1188 		err = -ENOMEM;
1189 		goto exit;
1190 	}
1191 	/* A grant for every rx ring slot */
1192 	if (gnttab_alloc_grant_references(RX_MAX_TARGET,
1193 					  &np->gref_rx_head) < 0) {
1194 		printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n");
1195 		err = -ENOMEM;
1196 		goto exit_free_tx;
1197 	}
1198 
1199 	netdev->open            = xennet_open;
1200 	netdev->hard_start_xmit = xennet_start_xmit;
1201 	netdev->stop            = xennet_close;
1202 	netdev->get_stats       = xennet_get_stats;
1203 	netdev->poll            = xennet_poll;
1204 	netdev->uninit          = xennet_uninit;
1205 	netdev->change_mtu	= xennet_change_mtu;
1206 	netdev->weight          = 64;
1207 	netdev->features        = NETIF_F_IP_CSUM;
1208 
1209 	SET_ETHTOOL_OPS(netdev, &xennet_ethtool_ops);
1210 	SET_MODULE_OWNER(netdev);
1211 	SET_NETDEV_DEV(netdev, &dev->dev);
1212 
1213 	np->netdev = netdev;
1214 
1215 	netif_carrier_off(netdev);
1216 
1217 	return netdev;
1218 
1219  exit_free_tx:
1220 	gnttab_free_grant_references(np->gref_tx_head);
1221  exit:
1222 	free_netdev(netdev);
1223 	return ERR_PTR(err);
1224 }
1225 
1226 /**
1227  * Entry point to this code when a new device is created.  Allocate the basic
1228  * structures and the ring buffers for communication with the backend, and
1229  * inform the backend of the appropriate details for those.
1230  */
1231 static int __devinit netfront_probe(struct xenbus_device *dev,
1232 				    const struct xenbus_device_id *id)
1233 {
1234 	int err;
1235 	struct net_device *netdev;
1236 	struct netfront_info *info;
1237 
1238 	netdev = xennet_create_dev(dev);
1239 	if (IS_ERR(netdev)) {
1240 		err = PTR_ERR(netdev);
1241 		xenbus_dev_fatal(dev, err, "creating netdev");
1242 		return err;
1243 	}
1244 
1245 	info = netdev_priv(netdev);
1246 	dev->dev.driver_data = info;
1247 
1248 	err = register_netdev(info->netdev);
1249 	if (err) {
1250 		printk(KERN_WARNING "%s: register_netdev err=%d\n",
1251 		       __func__, err);
1252 		goto fail;
1253 	}
1254 
1255 	err = xennet_sysfs_addif(info->netdev);
1256 	if (err) {
1257 		unregister_netdev(info->netdev);
1258 		printk(KERN_WARNING "%s: add sysfs failed err=%d\n",
1259 		       __func__, err);
1260 		goto fail;
1261 	}
1262 
1263 	return 0;
1264 
1265  fail:
1266 	free_netdev(netdev);
1267 	dev->dev.driver_data = NULL;
1268 	return err;
1269 }
1270 
1271 static void xennet_end_access(int ref, void *page)
1272 {
1273 	/* This frees the page as a side-effect */
1274 	if (ref != GRANT_INVALID_REF)
1275 		gnttab_end_foreign_access(ref, 0, (unsigned long)page);
1276 }
1277 
1278 static void xennet_disconnect_backend(struct netfront_info *info)
1279 {
1280 	/* Stop old i/f to prevent errors whilst we rebuild the state. */
1281 	spin_lock_bh(&info->rx_lock);
1282 	spin_lock_irq(&info->tx_lock);
1283 	netif_carrier_off(info->netdev);
1284 	spin_unlock_irq(&info->tx_lock);
1285 	spin_unlock_bh(&info->rx_lock);
1286 
1287 	if (info->netdev->irq)
1288 		unbind_from_irqhandler(info->netdev->irq, info->netdev);
1289 	info->evtchn = info->netdev->irq = 0;
1290 
1291 	/* End access and free the pages */
1292 	xennet_end_access(info->tx_ring_ref, info->tx.sring);
1293 	xennet_end_access(info->rx_ring_ref, info->rx.sring);
1294 
1295 	info->tx_ring_ref = GRANT_INVALID_REF;
1296 	info->rx_ring_ref = GRANT_INVALID_REF;
1297 	info->tx.sring = NULL;
1298 	info->rx.sring = NULL;
1299 }
1300 
1301 /**
1302  * We are reconnecting to the backend, due to a suspend/resume, or a backend
1303  * driver restart.  We tear down our netif structure and recreate it, but
1304  * leave the device-layer structures intact so that this is transparent to the
1305  * rest of the kernel.
1306  */
1307 static int netfront_resume(struct xenbus_device *dev)
1308 {
1309 	struct netfront_info *info = dev->dev.driver_data;
1310 
1311 	dev_dbg(&dev->dev, "%s\n", dev->nodename);
1312 
1313 	xennet_disconnect_backend(info);
1314 	return 0;
1315 }
1316 
1317 static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
1318 {
1319 	char *s, *e, *macstr;
1320 	int i;
1321 
1322 	macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
1323 	if (IS_ERR(macstr))
1324 		return PTR_ERR(macstr);
1325 
1326 	for (i = 0; i < ETH_ALEN; i++) {
1327 		mac[i] = simple_strtoul(s, &e, 16);
1328 		if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
1329 			kfree(macstr);
1330 			return -ENOENT;
1331 		}
1332 		s = e+1;
1333 	}
1334 
1335 	kfree(macstr);
1336 	return 0;
1337 }
1338 
1339 static irqreturn_t xennet_interrupt(int irq, void *dev_id)
1340 {
1341 	struct net_device *dev = dev_id;
1342 	struct netfront_info *np = netdev_priv(dev);
1343 	unsigned long flags;
1344 
1345 	spin_lock_irqsave(&np->tx_lock, flags);
1346 
1347 	if (likely(netif_carrier_ok(dev))) {
1348 		xennet_tx_buf_gc(dev);
1349 		/* Under tx_lock: protects access to rx shared-ring indexes. */
1350 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
1351 			netif_rx_schedule(dev);
1352 	}
1353 
1354 	spin_unlock_irqrestore(&np->tx_lock, flags);
1355 
1356 	return IRQ_HANDLED;
1357 }
1358 
1359 static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
1360 {
1361 	struct xen_netif_tx_sring *txs;
1362 	struct xen_netif_rx_sring *rxs;
1363 	int err;
1364 	struct net_device *netdev = info->netdev;
1365 
1366 	info->tx_ring_ref = GRANT_INVALID_REF;
1367 	info->rx_ring_ref = GRANT_INVALID_REF;
1368 	info->rx.sring = NULL;
1369 	info->tx.sring = NULL;
1370 	netdev->irq = 0;
1371 
1372 	err = xen_net_read_mac(dev, netdev->dev_addr);
1373 	if (err) {
1374 		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
1375 		goto fail;
1376 	}
1377 
1378 	txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_KERNEL);
1379 	if (!txs) {
1380 		err = -ENOMEM;
1381 		xenbus_dev_fatal(dev, err, "allocating tx ring page");
1382 		goto fail;
1383 	}
1384 	SHARED_RING_INIT(txs);
1385 	FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
1386 
1387 	err = xenbus_grant_ring(dev, virt_to_mfn(txs));
1388 	if (err < 0) {
1389 		free_page((unsigned long)txs);
1390 		goto fail;
1391 	}
1392 
1393 	info->tx_ring_ref = err;
1394 	rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_KERNEL);
1395 	if (!rxs) {
1396 		err = -ENOMEM;
1397 		xenbus_dev_fatal(dev, err, "allocating rx ring page");
1398 		goto fail;
1399 	}
1400 	SHARED_RING_INIT(rxs);
1401 	FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
1402 
1403 	err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
1404 	if (err < 0) {
1405 		free_page((unsigned long)rxs);
1406 		goto fail;
1407 	}
1408 	info->rx_ring_ref = err;
1409 
1410 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
1411 	if (err)
1412 		goto fail;
1413 
1414 	err = bind_evtchn_to_irqhandler(info->evtchn, xennet_interrupt,
1415 					IRQF_SAMPLE_RANDOM, netdev->name,
1416 					netdev);
1417 	if (err < 0)
1418 		goto fail;
1419 	netdev->irq = err;
1420 	return 0;
1421 
1422  fail:
1423 	return err;
1424 }
1425 
1426 /* Common code used when first setting up, and when resuming. */
1427 static int talk_to_backend(struct xenbus_device *dev,
1428 			   struct netfront_info *info)
1429 {
1430 	const char *message;
1431 	struct xenbus_transaction xbt;
1432 	int err;
1433 
1434 	/* Create shared ring, alloc event channel. */
1435 	err = setup_netfront(dev, info);
1436 	if (err)
1437 		goto out;
1438 
1439 again:
1440 	err = xenbus_transaction_start(&xbt);
1441 	if (err) {
1442 		xenbus_dev_fatal(dev, err, "starting transaction");
1443 		goto destroy_ring;
1444 	}
1445 
1446 	err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref", "%u",
1447 			    info->tx_ring_ref);
1448 	if (err) {
1449 		message = "writing tx ring-ref";
1450 		goto abort_transaction;
1451 	}
1452 	err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref", "%u",
1453 			    info->rx_ring_ref);
1454 	if (err) {
1455 		message = "writing rx ring-ref";
1456 		goto abort_transaction;
1457 	}
1458 	err = xenbus_printf(xbt, dev->nodename,
1459 			    "event-channel", "%u", info->evtchn);
1460 	if (err) {
1461 		message = "writing event-channel";
1462 		goto abort_transaction;
1463 	}
1464 
1465 	err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
1466 			    1);
1467 	if (err) {
1468 		message = "writing request-rx-copy";
1469 		goto abort_transaction;
1470 	}
1471 
1472 	err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
1473 	if (err) {
1474 		message = "writing feature-rx-notify";
1475 		goto abort_transaction;
1476 	}
1477 
1478 	err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
1479 	if (err) {
1480 		message = "writing feature-sg";
1481 		goto abort_transaction;
1482 	}
1483 
1484 	err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1);
1485 	if (err) {
1486 		message = "writing feature-gso-tcpv4";
1487 		goto abort_transaction;
1488 	}
1489 
1490 	err = xenbus_transaction_end(xbt, 0);
1491 	if (err) {
1492 		if (err == -EAGAIN)
1493 			goto again;
1494 		xenbus_dev_fatal(dev, err, "completing transaction");
1495 		goto destroy_ring;
1496 	}
1497 
1498 	return 0;
1499 
1500  abort_transaction:
1501 	xenbus_transaction_end(xbt, 1);
1502 	xenbus_dev_fatal(dev, err, "%s", message);
1503  destroy_ring:
1504 	xennet_disconnect_backend(info);
1505  out:
1506 	return err;
1507 }
1508 
1509 static int xennet_set_sg(struct net_device *dev, u32 data)
1510 {
1511 	if (data) {
1512 		struct netfront_info *np = netdev_priv(dev);
1513 		int val;
1514 
1515 		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg",
1516 				 "%d", &val) < 0)
1517 			val = 0;
1518 		if (!val)
1519 			return -ENOSYS;
1520 	} else if (dev->mtu > ETH_DATA_LEN)
1521 		dev->mtu = ETH_DATA_LEN;
1522 
1523 	return ethtool_op_set_sg(dev, data);
1524 }
1525 
1526 static int xennet_set_tso(struct net_device *dev, u32 data)
1527 {
1528 	if (data) {
1529 		struct netfront_info *np = netdev_priv(dev);
1530 		int val;
1531 
1532 		if (xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1533 				 "feature-gso-tcpv4", "%d", &val) < 0)
1534 			val = 0;
1535 		if (!val)
1536 			return -ENOSYS;
1537 	}
1538 
1539 	return ethtool_op_set_tso(dev, data);
1540 }
1541 
1542 static void xennet_set_features(struct net_device *dev)
1543 {
1544 	/* Turn off all GSO bits except ROBUST. */
1545 	dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
1546 	dev->features |= NETIF_F_GSO_ROBUST;
1547 	xennet_set_sg(dev, 0);
1548 
1549 	/* We need checksum offload to enable scatter/gather and TSO. */
1550 	if (!(dev->features & NETIF_F_IP_CSUM))
1551 		return;
1552 
1553 	if (!xennet_set_sg(dev, 1))
1554 		xennet_set_tso(dev, 1);
1555 }
1556 
1557 static int xennet_connect(struct net_device *dev)
1558 {
1559 	struct netfront_info *np = netdev_priv(dev);
1560 	int i, requeue_idx, err;
1561 	struct sk_buff *skb;
1562 	grant_ref_t ref;
1563 	struct xen_netif_rx_request *req;
1564 	unsigned int feature_rx_copy;
1565 
1566 	err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
1567 			   "feature-rx-copy", "%u", &feature_rx_copy);
1568 	if (err != 1)
1569 		feature_rx_copy = 0;
1570 
1571 	if (!feature_rx_copy) {
1572 		dev_info(&dev->dev,
1573 			 "backend does not support copying recieve path");
1574 		return -ENODEV;
1575 	}
1576 
1577 	err = talk_to_backend(np->xbdev, np);
1578 	if (err)
1579 		return err;
1580 
1581 	xennet_set_features(dev);
1582 
1583 	spin_lock_bh(&np->rx_lock);
1584 	spin_lock_irq(&np->tx_lock);
1585 
1586 	/* Step 1: Discard all pending TX packet fragments. */
1587 	xennet_release_tx_bufs(np);
1588 
1589 	/* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
1590 	for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
1591 		if (!np->rx_skbs[i])
1592 			continue;
1593 
1594 		skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
1595 		ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
1596 		req = RING_GET_REQUEST(&np->rx, requeue_idx);
1597 
1598 		gnttab_grant_foreign_access_ref(
1599 			ref, np->xbdev->otherend_id,
1600 			pfn_to_mfn(page_to_pfn(skb_shinfo(skb)->
1601 					       frags->page)),
1602 			0);
1603 		req->gref = ref;
1604 		req->id   = requeue_idx;
1605 
1606 		requeue_idx++;
1607 	}
1608 
1609 	np->rx.req_prod_pvt = requeue_idx;
1610 
1611 	/*
1612 	 * Step 3: All public and private state should now be sane.  Get
1613 	 * ready to start sending and receiving packets and give the driver
1614 	 * domain a kick because we've probably just requeued some
1615 	 * packets.
1616 	 */
1617 	netif_carrier_on(np->netdev);
1618 	notify_remote_via_irq(np->netdev->irq);
1619 	xennet_tx_buf_gc(dev);
1620 	xennet_alloc_rx_buffers(dev);
1621 
1622 	spin_unlock_irq(&np->tx_lock);
1623 	spin_unlock_bh(&np->rx_lock);
1624 
1625 	return 0;
1626 }
1627 
1628 /**
1629  * Callback received when the backend's state changes.
1630  */
1631 static void backend_changed(struct xenbus_device *dev,
1632 			    enum xenbus_state backend_state)
1633 {
1634 	struct netfront_info *np = dev->dev.driver_data;
1635 	struct net_device *netdev = np->netdev;
1636 
1637 	dev_dbg(&dev->dev, "%s\n", xenbus_strstate(backend_state));
1638 
1639 	switch (backend_state) {
1640 	case XenbusStateInitialising:
1641 	case XenbusStateInitialised:
1642 	case XenbusStateConnected:
1643 	case XenbusStateUnknown:
1644 	case XenbusStateClosed:
1645 		break;
1646 
1647 	case XenbusStateInitWait:
1648 		if (dev->state != XenbusStateInitialising)
1649 			break;
1650 		if (xennet_connect(netdev) != 0)
1651 			break;
1652 		xenbus_switch_state(dev, XenbusStateConnected);
1653 		break;
1654 
1655 	case XenbusStateClosing:
1656 		xenbus_frontend_closed(dev);
1657 		break;
1658 	}
1659 }
1660 
1661 static struct ethtool_ops xennet_ethtool_ops =
1662 {
1663 	.get_tx_csum = ethtool_op_get_tx_csum,
1664 	.set_tx_csum = ethtool_op_set_tx_csum,
1665 	.get_sg = ethtool_op_get_sg,
1666 	.set_sg = xennet_set_sg,
1667 	.get_tso = ethtool_op_get_tso,
1668 	.set_tso = xennet_set_tso,
1669 	.get_link = ethtool_op_get_link,
1670 };
1671 
1672 #ifdef CONFIG_SYSFS
1673 static ssize_t show_rxbuf_min(struct device *dev,
1674 			      struct device_attribute *attr, char *buf)
1675 {
1676 	struct net_device *netdev = to_net_dev(dev);
1677 	struct netfront_info *info = netdev_priv(netdev);
1678 
1679 	return sprintf(buf, "%u\n", info->rx_min_target);
1680 }
1681 
1682 static ssize_t store_rxbuf_min(struct device *dev,
1683 			       struct device_attribute *attr,
1684 			       const char *buf, size_t len)
1685 {
1686 	struct net_device *netdev = to_net_dev(dev);
1687 	struct netfront_info *np = netdev_priv(netdev);
1688 	char *endp;
1689 	unsigned long target;
1690 
1691 	if (!capable(CAP_NET_ADMIN))
1692 		return -EPERM;
1693 
1694 	target = simple_strtoul(buf, &endp, 0);
1695 	if (endp == buf)
1696 		return -EBADMSG;
1697 
1698 	if (target < RX_MIN_TARGET)
1699 		target = RX_MIN_TARGET;
1700 	if (target > RX_MAX_TARGET)
1701 		target = RX_MAX_TARGET;
1702 
1703 	spin_lock_bh(&np->rx_lock);
1704 	if (target > np->rx_max_target)
1705 		np->rx_max_target = target;
1706 	np->rx_min_target = target;
1707 	if (target > np->rx_target)
1708 		np->rx_target = target;
1709 
1710 	xennet_alloc_rx_buffers(netdev);
1711 
1712 	spin_unlock_bh(&np->rx_lock);
1713 	return len;
1714 }
1715 
1716 static ssize_t show_rxbuf_max(struct device *dev,
1717 			      struct device_attribute *attr, char *buf)
1718 {
1719 	struct net_device *netdev = to_net_dev(dev);
1720 	struct netfront_info *info = netdev_priv(netdev);
1721 
1722 	return sprintf(buf, "%u\n", info->rx_max_target);
1723 }
1724 
1725 static ssize_t store_rxbuf_max(struct device *dev,
1726 			       struct device_attribute *attr,
1727 			       const char *buf, size_t len)
1728 {
1729 	struct net_device *netdev = to_net_dev(dev);
1730 	struct netfront_info *np = netdev_priv(netdev);
1731 	char *endp;
1732 	unsigned long target;
1733 
1734 	if (!capable(CAP_NET_ADMIN))
1735 		return -EPERM;
1736 
1737 	target = simple_strtoul(buf, &endp, 0);
1738 	if (endp == buf)
1739 		return -EBADMSG;
1740 
1741 	if (target < RX_MIN_TARGET)
1742 		target = RX_MIN_TARGET;
1743 	if (target > RX_MAX_TARGET)
1744 		target = RX_MAX_TARGET;
1745 
1746 	spin_lock_bh(&np->rx_lock);
1747 	if (target < np->rx_min_target)
1748 		np->rx_min_target = target;
1749 	np->rx_max_target = target;
1750 	if (target < np->rx_target)
1751 		np->rx_target = target;
1752 
1753 	xennet_alloc_rx_buffers(netdev);
1754 
1755 	spin_unlock_bh(&np->rx_lock);
1756 	return len;
1757 }
1758 
1759 static ssize_t show_rxbuf_cur(struct device *dev,
1760 			      struct device_attribute *attr, char *buf)
1761 {
1762 	struct net_device *netdev = to_net_dev(dev);
1763 	struct netfront_info *info = netdev_priv(netdev);
1764 
1765 	return sprintf(buf, "%u\n", info->rx_target);
1766 }
1767 
1768 static struct device_attribute xennet_attrs[] = {
1769 	__ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
1770 	__ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
1771 	__ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
1772 };
1773 
1774 static int xennet_sysfs_addif(struct net_device *netdev)
1775 {
1776 	int i;
1777 	int err;
1778 
1779 	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) {
1780 		err = device_create_file(&netdev->dev,
1781 					   &xennet_attrs[i]);
1782 		if (err)
1783 			goto fail;
1784 	}
1785 	return 0;
1786 
1787  fail:
1788 	while (--i >= 0)
1789 		device_remove_file(&netdev->dev, &xennet_attrs[i]);
1790 	return err;
1791 }
1792 
1793 static void xennet_sysfs_delif(struct net_device *netdev)
1794 {
1795 	int i;
1796 
1797 	for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++)
1798 		device_remove_file(&netdev->dev, &xennet_attrs[i]);
1799 }
1800 
1801 #endif /* CONFIG_SYSFS */
1802 
1803 static struct xenbus_device_id netfront_ids[] = {
1804 	{ "vif" },
1805 	{ "" }
1806 };
1807 
1808 
1809 static int __devexit xennet_remove(struct xenbus_device *dev)
1810 {
1811 	struct netfront_info *info = dev->dev.driver_data;
1812 
1813 	dev_dbg(&dev->dev, "%s\n", dev->nodename);
1814 
1815 	unregister_netdev(info->netdev);
1816 
1817 	xennet_disconnect_backend(info);
1818 
1819 	del_timer_sync(&info->rx_refill_timer);
1820 
1821 	xennet_sysfs_delif(info->netdev);
1822 
1823 	free_netdev(info->netdev);
1824 
1825 	return 0;
1826 }
1827 
1828 static struct xenbus_driver netfront = {
1829 	.name = "vif",
1830 	.owner = THIS_MODULE,
1831 	.ids = netfront_ids,
1832 	.probe = netfront_probe,
1833 	.remove = __devexit_p(xennet_remove),
1834 	.resume = netfront_resume,
1835 	.otherend_changed = backend_changed,
1836 };
1837 
1838 static int __init netif_init(void)
1839 {
1840 	if (!is_running_on_xen())
1841 		return -ENODEV;
1842 
1843 	if (is_initial_xendomain())
1844 		return 0;
1845 
1846 	printk(KERN_INFO "Initialising Xen virtual ethernet driver.\n");
1847 
1848 	return xenbus_register_frontend(&netfront);
1849 }
1850 module_init(netif_init);
1851 
1852 
1853 static void __exit netif_exit(void)
1854 {
1855 	if (is_initial_xendomain())
1856 		return;
1857 
1858 	return xenbus_unregister_driver(&netfront);
1859 }
1860 module_exit(netif_exit);
1861 
1862 MODULE_DESCRIPTION("Xen virtual network device frontend");
1863 MODULE_LICENSE("GPL");
1864