xref: /linux/net/core/dev.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16. Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
168 
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * See, for example usages, register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
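
/*
 * Illustrative sketch of the pure-reader pattern described above (guarded by
 * "#if 0", not part of the build): walk one namespace's device list under
 * dev_base_lock only.  "use_device" is a hypothetical helper; the device
 * pointers are only guaranteed valid while the lock is held, unless
 * dev_hold() is taken on a device before the lock is dropped.
 */
#if 0
static void example_walk_devices(struct net *net)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		use_device(dev);
	read_unlock(&dev_base_lock);
}
#endif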
195 
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
212 {
213 	struct net *net = dev_net(dev);
214 
215 	ASSERT_RTNL();
216 
217 	write_lock_bh(&dev_base_lock);
218 	list_add_tail(&dev->dev_list, &net->dev_base_head);
219 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
220 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 	ASSERT_RTNL();
229 
230 	/* Unlink dev from the device chain */
231 	write_lock_bh(&dev_base_lock);
232 	list_del(&dev->dev_list);
233 	hlist_del(&dev->name_hlist);
234 	hlist_del(&dev->index_hlist);
235 	write_unlock_bh(&dev_base_lock);
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 }
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 }
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers that mangle input packets
345  *	MUST BE last in the hash buckets, and checking of protocol handlers
346  *	MUST start from the promiscuous ptype_all chain in net_bh.
347  *	This is true today; do not change it.
348  *	Explanation: if a packet-mangling protocol handler were first in
349  *	the list, it could not tell that the packet is cloned and must be
350  *	copied before being written, so it would modify the clone and
351  *	subsequent readers would get a broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep, therefore it cannot
364  *	guarantee that all CPUs that are in the middle of receiving packets
365  *	will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
381 
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPUs have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432 	__dev_remove_pack(pt);
433 
434 	synchronize_net();
435 }
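
/*
 * Illustrative sketch (guarded by "#if 0", not part of the build): a minimal
 * ETH_P_ALL tap registered with dev_add_pack() and torn down with
 * dev_remove_pack().  The handler and the init/exit wrappers are hypothetical
 * names; the packet_type fields and handler signature are the real ones.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The skb may be shared with other handlers; drop our reference. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* lands on the ptype_all chain */
	.func	= example_tap_rcv,
};

static int __init example_tap_init(void)
{
	dev_add_pack(&example_tap);
	return 0;
}

static void __exit example_tap_exit(void)
{
	dev_remove_pack(&example_tap);	/* sleeps in synchronize_net() */
}
#endif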
436 
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds a new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine for
453  *	all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot time settings for the device.
478  *	The settings found are applied to the device and used
479  *	later during device probing.
480  *	Returns 0 if no settings are found, 1 if they are.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot time settings for the base address of the device.
507  *	Returns the configured base address if one was recorded, 1 if the
508  *	device is already registered (so it should not be probed), and
509  *	0 if no settings are found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If the device is already registered then return a base of 1
521 	 * to indicate that this interface should not be probed.
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves boot-time configured settings for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
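
/*
 * Illustrative example (hypothetical values): booting with
 * "netdev=9,0x300,0xd0000,0xd8000,eth0" records irq 9, I/O base 0x300 and
 * the given memory window under the name "eth0"; netdev_boot_setup_check()
 * then applies them when a device of that name is probed.
 */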
560 
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
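
/*
 * Illustrative sketch (guarded by "#if 0", not part of the build) of the
 * lookup/put pattern for the refcounted lookups in this file: the device
 * returned by dev_get_by_name() cannot go away until dev_put() drops the
 * reference.  The interface name used here is a hypothetical example.
 */
#if 0
static void example_lookup(void)
{
	struct net_device *dev = dev_get_by_name(&init_net, "eth0");

	if (!dev)
		return;
	printk(KERN_DEBUG "%s has ifindex %d\n", dev->name, dev->ifindex);
	dev_put(dev);
}
#endif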
615 
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns a pointer to the device,
622  *	or %NULL if it is not found. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns a pointer to the device,
648  *	or %NULL if it is not found. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns a pointer to the
672  *	device, or %NULL if it is not found. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count increased
674  *	and the caller must therefore be careful about locking.
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *
730  *	Search for any interface with the given flags. Returns a pointer to
731  *	the device, or %NULL if none is found. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names in order
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
777 
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" - it will try to find a suitable
785  *	id. It scans the list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
794 {
795 	int i = 0;
796 	const char *p;
797 	const int max_netdevices = 8*PAGE_SIZE;
798 	unsigned long *inuse;
799 	struct net_device *d;
800 
801 	p = strnchr(name, IFNAMSIZ-1, '%');
802 	if (p) {
803 		/*
804 		 * Verify the string as this thing may have come from
805 		 * the user.  There must be exactly one "%d" and no other "%"
806 		 * characters.
807 		 */
808 		if (p[1] != 'd' || strchr(p + 2, '%'))
809 			return -EINVAL;
810 
811 		/* Use one page as a bit array of possible slots */
812 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
813 		if (!inuse)
814 			return -ENOMEM;
815 
816 		for_each_netdev(net, d) {
817 			if (!sscanf(d->name, name, &i))
818 				continue;
819 			if (i < 0 || i >= max_netdevices)
820 				continue;
821 
822 			/*  avoid cases where sscanf is not exact inverse of printf */
823 			snprintf(buf, IFNAMSIZ, name, i);
824 			if (!strncmp(buf, d->name, IFNAMSIZ))
825 				set_bit(i, inuse);
826 		}
827 
828 		i = find_first_zero_bit(inuse, max_netdevices);
829 		free_page((unsigned long) inuse);
830 	}
831 
832 	snprintf(buf, IFNAMSIZ, name, i);
833 	if (!__dev_get_by_name(net, buf))
834 		return i;
835 
836 	/* It is possible to run out of possible slots
837 	 * when the name is long and there isn't enough space left
838 	 * for the digits, or if all bits are used.
839 	 */
840 	return -ENFILE;
841 }
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" - it will try to find a suitable
849  *	id. It scans the list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
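
/*
 * Illustrative example: dev_alloc_name(dev, "eth%d") picks the lowest free
 * unit, so with eth0 and eth1 already registered it writes "eth2" into
 * dev->name and returns 2; a negative errno is returned on failure.
 */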
870 
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *
877  *	Change name of a device, can pass format strings "eth%d".
878  *	Change the name of a device. A format string such as "eth%d"
879  *	may be passed for wildcarding.
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	}
907 	else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from @alias
952  *
953  *	Set ifalias for a device.
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
985 void netdev_features_change(struct net_device *dev)
986 {
987 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
988 }
989 EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Enable NET_DMA
1092 		 */
1093 		net_dmaengine_get();
1094 
1095 		/*
1096 		 *	Initialize multicasting status
1097 		 */
1098 		dev_set_rx_mode(dev);
1099 
1100 		/*
1101 		 *	Wakeup transmit queue engine
1102 		 */
1103 		dev_activate(dev);
1104 
1105 		/*
1106 		 *	... and announce new interface.
1107 		 */
1108 		call_netdevice_notifiers(NETDEV_UP, dev);
1109 	}
1110 
1111 	return ret;
1112 }
1113 
1114 /**
1115  *	dev_close - shutdown an interface.
1116  *	@dev: device to shutdown
1117  *
1118  *	This function moves an active device into down state. A
1119  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1120  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1121  *	chain.
1122  */
1123 int dev_close(struct net_device *dev)
1124 {
1125 	const struct net_device_ops *ops = dev->netdev_ops;
1126 	ASSERT_RTNL();
1127 
1128 	might_sleep();
1129 
1130 	if (!(dev->flags & IFF_UP))
1131 		return 0;
1132 
1133 	/*
1134 	 *	Tell people we are going down, so that they can
1135 	 *	prepare for it while the device is still operating.
1136 	 */
1137 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1138 
1139 	clear_bit(__LINK_STATE_START, &dev->state);
1140 
1141 	/* Synchronize to the scheduled poll. We cannot touch the poll list,
1142 	 * it may even be on a different cpu. So just clear netif_running().
1143 	 *
1144 	 * dev->stop() will invoke napi_disable() on all of its
1145 	 * napi_struct instances on this device.
1146 	 */
1147 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1148 
1149 	dev_deactivate(dev);
1150 
1151 	/*
1152 	 *	Call the device-specific close. This cannot fail and is
1153 	 *	only done if the device is UP.
1154 	 *
1155 	 *	We allow it to be called even after a DETACH hot-plug
1156 	 *	event.
1157 	 */
1158 	if (ops->ndo_stop)
1159 		ops->ndo_stop(dev);
1160 
1161 	/*
1162 	 *	Device is now down.
1163 	 */
1164 
1165 	dev->flags &= ~IFF_UP;
1166 
1167 	/*
1168 	 * Tell people we are down
1169 	 */
1170 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1171 
1172 	/*
1173 	 *	Shutdown NET_DMA
1174 	 */
1175 	net_dmaengine_put();
1176 
1177 	return 0;
1178 }
1179 
1180 
1181 /**
1182  *	dev_disable_lro - disable Large Receive Offload on a device
1183  *	@dev: device
1184  *
1185  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186  *	called under RTNL.  This is needed if received packets may be
1187  *	forwarded to another interface.
1188  */
1189 void dev_disable_lro(struct net_device *dev)
1190 {
1191 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192 	    dev->ethtool_ops->set_flags) {
1193 		u32 flags = dev->ethtool_ops->get_flags(dev);
1194 		if (flags & ETH_FLAG_LRO) {
1195 			flags &= ~ETH_FLAG_LRO;
1196 			dev->ethtool_ops->set_flags(dev, flags);
1197 		}
1198 	}
1199 	WARN_ON(dev->features & NETIF_F_LRO);
1200 }
1201 EXPORT_SYMBOL(dev_disable_lro);
1202 
1203 
1204 static int dev_boot_phase = 1;
1205 
1206 /*
1207  *	Device change register/unregister. These are not inline or static
1208  *	as we export them to the world.
1209  */
1210 
1211 /**
1212  *	register_netdevice_notifier - register a network notifier block
1213  *	@nb: notifier
1214  *
1215  *	Register a notifier to be called when network device events occur.
1216  *	The notifier passed is linked into the kernel structures and must
1217  *	not be reused until it has been unregistered. A negative errno code
1218  *	is returned on a failure.
1219  *
1220  * 	When registered, all registration and up events are replayed
1221  *	to the new notifier so that it gets a race-free
1222  *	view of the network device list.
1223  */
1224 
1225 int register_netdevice_notifier(struct notifier_block *nb)
1226 {
1227 	struct net_device *dev;
1228 	struct net_device *last;
1229 	struct net *net;
1230 	int err;
1231 
1232 	rtnl_lock();
1233 	err = raw_notifier_chain_register(&netdev_chain, nb);
1234 	if (err)
1235 		goto unlock;
1236 	if (dev_boot_phase)
1237 		goto unlock;
1238 	for_each_net(net) {
1239 		for_each_netdev(net, dev) {
1240 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241 			err = notifier_to_errno(err);
1242 			if (err)
1243 				goto rollback;
1244 
1245 			if (!(dev->flags & IFF_UP))
1246 				continue;
1247 
1248 			nb->notifier_call(nb, NETDEV_UP, dev);
1249 		}
1250 	}
1251 
1252 unlock:
1253 	rtnl_unlock();
1254 	return err;
1255 
1256 rollback:
1257 	last = dev;
1258 	for_each_net(net) {
1259 		for_each_netdev(net, dev) {
1260 			if (dev == last)
1261 				break;
1262 
1263 			if (dev->flags & IFF_UP) {
1264 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266 			}
1267 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268 		}
1269 	}
1270 
1271 	raw_notifier_chain_unregister(&netdev_chain, nb);
1272 	goto unlock;
1273 }
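
/*
 * Illustrative sketch (guarded by "#if 0", not part of the build): a minimal
 * notifier block as register_netdevice_notifier() expects it.  The data
 * pointer handed to the callback is the struct net_device itself; the block
 * and function names are hypothetical.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		printk(KERN_INFO "example: %s is up\n", dev->name);
		break;
	case NETDEV_DOWN:
		printk(KERN_INFO "example: %s is down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};
#endif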
1274 
1275 /**
1276  *	unregister_netdevice_notifier - unregister a network notifier block
1277  *	@nb: notifier
1278  *
1279  *	Unregister a notifier previously registered by
1280  *	register_netdevice_notifier(). The notifier is unlinked from the
1281  *	kernel structures and may then be reused. A negative errno code
1282  *	is returned on a failure.
1283  */
1284 
1285 int unregister_netdevice_notifier(struct notifier_block *nb)
1286 {
1287 	int err;
1288 
1289 	rtnl_lock();
1290 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1291 	rtnl_unlock();
1292 	return err;
1293 }
1294 
1295 /**
1296  *	call_netdevice_notifiers - call all network notifier blocks
1297  *      @val: value passed unmodified to notifier function
1298  *      @dev: net_device pointer passed unmodified to notifier function
1299  *
1300  *	Call all network notifier blocks.  Parameters and return value
1301  *	are as for raw_notifier_call_chain().
1302  */
1303 
1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1305 {
1306 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1307 }
1308 
1309 /* When > 0 there are consumers of rx skb time stamps */
1310 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311 
1312 void net_enable_timestamp(void)
1313 {
1314 	atomic_inc(&netstamp_needed);
1315 }
1316 
1317 void net_disable_timestamp(void)
1318 {
1319 	atomic_dec(&netstamp_needed);
1320 }
1321 
1322 static inline void net_timestamp(struct sk_buff *skb)
1323 {
1324 	if (atomic_read(&netstamp_needed))
1325 		__net_timestamp(skb);
1326 	else
1327 		skb->tstamp.tv64 = 0;
1328 }
1329 
1330 /*
1331  *	Support routine. Sends outgoing frames to any network
1332  *	taps currently in use.
1333  */
1334 
1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336 {
1337 	struct packet_type *ptype;
1338 
1339 	net_timestamp(skb);
1340 
1341 	rcu_read_lock();
1342 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343 		/* Never send packets back to the socket
1344 		 * they originated from - MvS (miquels@drinkel.ow.org)
1345 		 */
1346 		if ((ptype->dev == dev || !ptype->dev) &&
1347 		    (ptype->af_packet_priv == NULL ||
1348 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1350 			if (!skb2)
1351 				break;
1352 
1353 			/* The network header should already be set
1354 			   correctly by the sender, so the check below is
1355 			   just protection against buggy protocols.
1356 			 */
1357 			skb_reset_mac_header(skb2);
1358 
1359 			if (skb_network_header(skb2) < skb2->data ||
1360 			    skb2->network_header > skb2->tail) {
1361 				if (net_ratelimit())
1362 					printk(KERN_CRIT "protocol %04x is "
1363 					       "buggy, dev %s\n",
1364 					       ntohs(skb2->protocol), dev->name);
1365 				skb_reset_network_header(skb2);
1366 			}
1367 
1368 			skb2->transport_header = skb2->network_header;
1369 			skb2->pkt_type = PACKET_OUTGOING;
1370 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371 		}
1372 	}
1373 	rcu_read_unlock();
1374 }
1375 
1376 
1377 static inline void __netif_reschedule(struct Qdisc *q)
1378 {
1379 	struct softnet_data *sd;
1380 	unsigned long flags;
1381 
1382 	local_irq_save(flags);
1383 	sd = &__get_cpu_var(softnet_data);
1384 	q->next_sched = sd->output_queue;
1385 	sd->output_queue = q;
1386 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1387 	local_irq_restore(flags);
1388 }
1389 
1390 void __netif_schedule(struct Qdisc *q)
1391 {
1392 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1393 		__netif_reschedule(q);
1394 }
1395 EXPORT_SYMBOL(__netif_schedule);
1396 
1397 void dev_kfree_skb_irq(struct sk_buff *skb)
1398 {
1399 	if (atomic_dec_and_test(&skb->users)) {
1400 		struct softnet_data *sd;
1401 		unsigned long flags;
1402 
1403 		local_irq_save(flags);
1404 		sd = &__get_cpu_var(softnet_data);
1405 		skb->next = sd->completion_queue;
1406 		sd->completion_queue = skb;
1407 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1408 		local_irq_restore(flags);
1409 	}
1410 }
1411 EXPORT_SYMBOL(dev_kfree_skb_irq);
1412 
1413 void dev_kfree_skb_any(struct sk_buff *skb)
1414 {
1415 	if (in_irq() || irqs_disabled())
1416 		dev_kfree_skb_irq(skb);
1417 	else
1418 		dev_kfree_skb(skb);
1419 }
1420 EXPORT_SYMBOL(dev_kfree_skb_any);
1421 
1422 
1423 /**
1424  * netif_device_detach - mark device as removed
1425  * @dev: network device
1426  *
1427  * Mark device as removed from system and therefore no longer available.
1428  */
1429 void netif_device_detach(struct net_device *dev)
1430 {
1431 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432 	    netif_running(dev)) {
1433 		netif_stop_queue(dev);
1434 	}
1435 }
1436 EXPORT_SYMBOL(netif_device_detach);
1437 
1438 /**
1439  * netif_device_attach - mark device as attached
1440  * @dev: network device
1441  *
1442  * Mark device as attached to the system and restart it if needed.
1443  */
1444 void netif_device_attach(struct net_device *dev)
1445 {
1446 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447 	    netif_running(dev)) {
1448 		netif_wake_queue(dev);
1449 		__netdev_watchdog_up(dev);
1450 	}
1451 }
1452 EXPORT_SYMBOL(netif_device_attach);
1453 
1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455 {
1456 	return ((features & NETIF_F_GEN_CSUM) ||
1457 		((features & NETIF_F_IP_CSUM) &&
1458 		 protocol == htons(ETH_P_IP)) ||
1459 		((features & NETIF_F_IPV6_CSUM) &&
1460 		 protocol == htons(ETH_P_IPV6)) ||
1461 		((features & NETIF_F_FCOE_CRC) &&
1462 		 protocol == htons(ETH_P_FCOE)));
1463 }
1464 
1465 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1466 {
1467 	if (can_checksum_protocol(dev->features, skb->protocol))
1468 		return true;
1469 
1470 	if (skb->protocol == htons(ETH_P_8021Q)) {
1471 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1472 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1473 					  veh->h_vlan_encapsulated_proto))
1474 			return true;
1475 	}
1476 
1477 	return false;
1478 }
1479 
1480 /*
1481  * Invalidate hardware checksum when packet is to be mangled, and
1482  * complete checksum manually on outgoing path.
1483  */
1484 int skb_checksum_help(struct sk_buff *skb)
1485 {
1486 	__wsum csum;
1487 	int ret = 0, offset;
1488 
1489 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1490 		goto out_set_summed;
1491 
1492 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1493 		/* Let GSO fix up the checksum. */
1494 		goto out_set_summed;
1495 	}
1496 
1497 	offset = skb->csum_start - skb_headroom(skb);
1498 	BUG_ON(offset >= skb_headlen(skb));
1499 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1500 
1501 	offset += skb->csum_offset;
1502 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1503 
1504 	if (skb_cloned(skb) &&
1505 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1506 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1507 		if (ret)
1508 			goto out;
1509 	}
1510 
1511 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1512 out_set_summed:
1513 	skb->ip_summed = CHECKSUM_NONE;
1514 out:
1515 	return ret;
1516 }
1517 
1518 /**
1519  *	skb_gso_segment - Perform segmentation on skb.
1520  *	@skb: buffer to segment
1521  *	@features: features for the output path (see dev->features)
1522  *
1523  *	This function segments the given skb and returns a list of segments.
1524  *
1525  *	It may return NULL if the skb requires no segmentation.  This is
1526  *	only possible when GSO is used for verifying header integrity.
1527  */
1528 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1529 {
1530 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1531 	struct packet_type *ptype;
1532 	__be16 type = skb->protocol;
1533 	int err;
1534 
1535 	skb_reset_mac_header(skb);
1536 	skb->mac_len = skb->network_header - skb->mac_header;
1537 	__skb_pull(skb, skb->mac_len);
1538 
1539 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1540 		struct net_device *dev = skb->dev;
1541 		struct ethtool_drvinfo info = {};
1542 
1543 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1544 			dev->ethtool_ops->get_drvinfo(dev, &info);
1545 
1546 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1547 			"ip_summed=%d",
1548 		     info.driver, dev ? dev->features : 0L,
1549 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1550 		     skb->len, skb->data_len, skb->ip_summed);
1551 
1552 		if (skb_header_cloned(skb) &&
1553 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1554 			return ERR_PTR(err);
1555 	}
1556 
1557 	rcu_read_lock();
1558 	list_for_each_entry_rcu(ptype,
1559 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1560 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1561 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1562 				err = ptype->gso_send_check(skb);
1563 				segs = ERR_PTR(err);
1564 				if (err || skb_gso_ok(skb, features))
1565 					break;
1566 				__skb_push(skb, (skb->data -
1567 						 skb_network_header(skb)));
1568 			}
1569 			segs = ptype->gso_segment(skb, features);
1570 			break;
1571 		}
1572 	}
1573 	rcu_read_unlock();
1574 
1575 	__skb_push(skb, skb->data - skb_mac_header(skb));
1576 
1577 	return segs;
1578 }
1579 
1580 EXPORT_SYMBOL(skb_gso_segment);
1581 
1582 /* Take action when hardware reception checksum errors are detected. */
1583 #ifdef CONFIG_BUG
1584 void netdev_rx_csum_fault(struct net_device *dev)
1585 {
1586 	if (net_ratelimit()) {
1587 		printk(KERN_ERR "%s: hw csum failure.\n",
1588 			dev ? dev->name : "<unknown>");
1589 		dump_stack();
1590 	}
1591 }
1592 EXPORT_SYMBOL(netdev_rx_csum_fault);
1593 #endif
1594 
1595 /* Actually, we should eliminate this check as soon as we know that:
1596  * 1. An IOMMU is present and allows all of the memory to be mapped.
1597  * 2. No high memory really exists on this machine.
1598  */
1599 
1600 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1601 {
1602 #ifdef CONFIG_HIGHMEM
1603 	int i;
1604 
1605 	if (dev->features & NETIF_F_HIGHDMA)
1606 		return 0;
1607 
1608 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1609 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1610 			return 1;
1611 
1612 #endif
1613 	return 0;
1614 }
1615 
1616 struct dev_gso_cb {
1617 	void (*destructor)(struct sk_buff *skb);
1618 };
1619 
1620 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1621 
1622 static void dev_gso_skb_destructor(struct sk_buff *skb)
1623 {
1624 	struct dev_gso_cb *cb;
1625 
1626 	do {
1627 		struct sk_buff *nskb = skb->next;
1628 
1629 		skb->next = nskb->next;
1630 		nskb->next = NULL;
1631 		kfree_skb(nskb);
1632 	} while (skb->next);
1633 
1634 	cb = DEV_GSO_CB(skb);
1635 	if (cb->destructor)
1636 		cb->destructor(skb);
1637 }
1638 
1639 /**
1640  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1641  *	@skb: buffer to segment
1642  *
1643  *	This function segments the given skb and stores the list of segments
1644  *	in skb->next.
1645  */
1646 static int dev_gso_segment(struct sk_buff *skb)
1647 {
1648 	struct net_device *dev = skb->dev;
1649 	struct sk_buff *segs;
1650 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1651 					 NETIF_F_SG : 0);
1652 
1653 	segs = skb_gso_segment(skb, features);
1654 
1655 	/* Verifying header integrity only. */
1656 	if (!segs)
1657 		return 0;
1658 
1659 	if (IS_ERR(segs))
1660 		return PTR_ERR(segs);
1661 
1662 	skb->next = segs;
1663 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1664 	skb->destructor = dev_gso_skb_destructor;
1665 
1666 	return 0;
1667 }
1668 
1669 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1670 			struct netdev_queue *txq)
1671 {
1672 	const struct net_device_ops *ops = dev->netdev_ops;
1673 	int rc;
1674 
1675 	if (likely(!skb->next)) {
1676 		if (!list_empty(&ptype_all))
1677 			dev_queue_xmit_nit(skb, dev);
1678 
1679 		if (netif_needs_gso(dev, skb)) {
1680 			if (unlikely(dev_gso_segment(skb)))
1681 				goto out_kfree_skb;
1682 			if (skb->next)
1683 				goto gso;
1684 		}
1685 
1686 		rc = ops->ndo_start_xmit(skb, dev);
1687 		/*
1688 		 * TODO: if skb_orphan() was called by
1689 		 * dev->hard_start_xmit() (for example, the unmodified
1690 		 * igb driver does that; bnx2 doesn't), then
1691 		 * skb_tx_software_timestamp() will be unable to send
1692 		 * back the time stamp.
1693 		 *
1694 		 * How can this be prevented? Always create another
1695 		 * reference to the socket before calling
1696 		 * dev->hard_start_xmit()? Prevent that skb_orphan()
1697 		 * does anything in dev->hard_start_xmit() by clearing
1698 		 * the skb destructor before the call and restoring it
1699 		 * afterwards, then doing the skb_orphan() ourselves?
1700 		 */
1701 		return rc;
1702 	}
1703 
1704 gso:
1705 	do {
1706 		struct sk_buff *nskb = skb->next;
1707 
1708 		skb->next = nskb->next;
1709 		nskb->next = NULL;
1710 		rc = ops->ndo_start_xmit(nskb, dev);
1711 		if (unlikely(rc)) {
1712 			nskb->next = skb->next;
1713 			skb->next = nskb;
1714 			return rc;
1715 		}
1716 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1717 			return NETDEV_TX_BUSY;
1718 	} while (skb->next);
1719 
1720 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1721 
1722 out_kfree_skb:
1723 	kfree_skb(skb);
1724 	return 0;
1725 }
1726 
1727 static u32 skb_tx_hashrnd;
1728 
1729 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1730 {
1731 	u32 hash;
1732 
1733 	if (skb_rx_queue_recorded(skb)) {
1734 		hash = skb_get_rx_queue(skb);
1735 	} else if (skb->sk && skb->sk->sk_hash) {
1736 		hash = skb->sk->sk_hash;
1737 	} else
1738 		hash = skb->protocol;
1739 
1740 	hash = jhash_1word(hash, skb_tx_hashrnd);
1741 
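	/* Scale the 32-bit hash onto [0, real_num_tx_queues) without a modulus. */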
1742 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1743 }
1744 EXPORT_SYMBOL(skb_tx_hash);
1745 
1746 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1747 					struct sk_buff *skb)
1748 {
1749 	const struct net_device_ops *ops = dev->netdev_ops;
1750 	u16 queue_index = 0;
1751 
1752 	if (ops->ndo_select_queue)
1753 		queue_index = ops->ndo_select_queue(dev, skb);
1754 	else if (dev->real_num_tx_queues > 1)
1755 		queue_index = skb_tx_hash(dev, skb);
1756 
1757 	skb_set_queue_mapping(skb, queue_index);
1758 	return netdev_get_tx_queue(dev, queue_index);
1759 }
1760 
1761 /**
1762  *	dev_queue_xmit - transmit a buffer
1763  *	@skb: buffer to transmit
1764  *
1765  *	Queue a buffer for transmission to a network device. The caller must
1766  *	have set the device and priority and built the buffer before calling
1767  *	this function. The function can be called from an interrupt.
1768  *
1769  *	A negative errno code is returned on a failure. A success does not
1770  *	guarantee the frame will be transmitted as it may be dropped due
1771  *	to congestion or traffic shaping.
1772  *
1773  * -----------------------------------------------------------------------------------
1774  *      I notice this method can also return errors from the queue disciplines,
1775  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1776  *      be positive.
1777  *
1778  *      Regardless of the return value, the skb is consumed, so it is currently
1779  *      difficult to retry a send to this method.  (You can bump the ref count
1780  *      before sending to hold a reference for retry if you are careful.)
1781  *
1782  *      When calling this method, interrupts MUST be enabled.  This is because
1783  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1784  *          --BLG
1785  */
1786 int dev_queue_xmit(struct sk_buff *skb)
1787 {
1788 	struct net_device *dev = skb->dev;
1789 	struct netdev_queue *txq;
1790 	struct Qdisc *q;
1791 	int rc = -ENOMEM;
1792 
1793 	/* GSO will handle the following emulations directly. */
1794 	if (netif_needs_gso(dev, skb))
1795 		goto gso;
1796 
1797 	if (skb_shinfo(skb)->frag_list &&
1798 	    !(dev->features & NETIF_F_FRAGLIST) &&
1799 	    __skb_linearize(skb))
1800 		goto out_kfree_skb;
1801 
1802 	/* A fragmented skb is linearized if the device does not support SG,
1803 	 * or if at least one of the fragments is in highmem and the device
1804 	 * does not support DMA from it.
1805 	 */
1806 	if (skb_shinfo(skb)->nr_frags &&
1807 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1808 	    __skb_linearize(skb))
1809 		goto out_kfree_skb;
1810 
1811 	/* If packet is not checksummed and device does not support
1812 	 * checksumming for this protocol, complete checksumming here.
1813 	 */
1814 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1815 		skb_set_transport_header(skb, skb->csum_start -
1816 					      skb_headroom(skb));
1817 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1818 			goto out_kfree_skb;
1819 	}
1820 
1821 gso:
1822 	/* Disable soft irqs for various locks below. Also
1823 	 * stops preemption for RCU.
1824 	 */
1825 	rcu_read_lock_bh();
1826 
1827 	txq = dev_pick_tx(dev, skb);
1828 	q = rcu_dereference(txq->qdisc);
1829 
1830 #ifdef CONFIG_NET_CLS_ACT
1831 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1832 #endif
1833 	if (q->enqueue) {
1834 		spinlock_t *root_lock = qdisc_lock(q);
1835 
1836 		spin_lock(root_lock);
1837 
1838 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1839 			kfree_skb(skb);
1840 			rc = NET_XMIT_DROP;
1841 		} else {
1842 			rc = qdisc_enqueue_root(skb, q);
1843 			qdisc_run(q);
1844 		}
1845 		spin_unlock(root_lock);
1846 
1847 		goto out;
1848 	}
1849 
1850 	/* The device has no queue. Common case for software devices:
1851 	   loopback, all sorts of tunnels...
1852 
1853 	   Really, it is unlikely that netif_tx_lock protection is necessary
1854 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
1855 	   counters.)
1856 	   However, it is possible that they rely on the protection
1857 	   we provide here.
1858 
1859 	   Check this and take the lock. It is not prone to deadlocks.
1860 	   Either that, or shoot the noqueue qdisc entirely - even simpler 8)
1861 	 */
1862 	if (dev->flags & IFF_UP) {
1863 		int cpu = smp_processor_id(); /* ok because BHs are off */
1864 
1865 		if (txq->xmit_lock_owner != cpu) {
1866 
1867 			HARD_TX_LOCK(dev, txq, cpu);
1868 
1869 			if (!netif_tx_queue_stopped(txq)) {
1870 				rc = 0;
1871 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1872 					HARD_TX_UNLOCK(dev, txq);
1873 					goto out;
1874 				}
1875 			}
1876 			HARD_TX_UNLOCK(dev, txq);
1877 			if (net_ratelimit())
1878 				printk(KERN_CRIT "Virtual device %s asks to "
1879 				       "queue packet!\n", dev->name);
1880 		} else {
			/* Recursion detected! It is possible,
			 * unfortunately. */
1883 			if (net_ratelimit())
1884 				printk(KERN_CRIT "Dead loop on virtual device "
1885 				       "%s, fix it urgently!\n", dev->name);
1886 		}
1887 	}
1888 
1889 	rc = -ENETDOWN;
1890 	rcu_read_unlock_bh();
1891 
1892 out_kfree_skb:
1893 	kfree_skb(skb);
1894 	return rc;
1895 out:
1896 	rcu_read_unlock_bh();
1897 	return rc;
1898 }
1899 
1900 
1901 /*=======================================================================
1902 			Receiver routines
1903   =======================================================================*/
1904 
1905 int netdev_max_backlog __read_mostly = 1000;
1906 int netdev_budget __read_mostly = 300;
1907 int weight_p __read_mostly = 64;            /* old backlog weight */
1908 
1909 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1910 
1911 
1912 /**
1913  *	netif_rx	-	post buffer to the network code
1914  *	@skb: buffer to post
1915  *
1916  *	This function receives a packet from a device driver and queues it for
1917  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1918  *	may be dropped during processing for congestion control or by the
1919  *	protocol layers.
1920  *
1921  *	return values:
1922  *	NET_RX_SUCCESS	(no congestion)
1923  *	NET_RX_DROP     (packet was dropped)
1924  *
1925  */
1926 
1927 int netif_rx(struct sk_buff *skb)
1928 {
1929 	struct softnet_data *queue;
1930 	unsigned long flags;
1931 
1932 	/* if netpoll wants it, pretend we never saw it */
1933 	if (netpoll_rx(skb))
1934 		return NET_RX_DROP;
1935 
1936 	if (!skb->tstamp.tv64)
1937 		net_timestamp(skb);
1938 
1939 	/*
1940 	 * The code is rearranged so that the path is the most
1941 	 * short when CPU is congested, but is still operating.
1942 	 */
1943 	local_irq_save(flags);
1944 	queue = &__get_cpu_var(softnet_data);
1945 
1946 	__get_cpu_var(netdev_rx_stat).total++;
1947 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1948 		if (queue->input_pkt_queue.qlen) {
1949 enqueue:
1950 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1951 			local_irq_restore(flags);
1952 			return NET_RX_SUCCESS;
1953 		}
1954 
1955 		napi_schedule(&queue->backlog);
1956 		goto enqueue;
1957 	}
1958 
1959 	__get_cpu_var(netdev_rx_stat).dropped++;
1960 	local_irq_restore(flags);
1961 
1962 	kfree_skb(skb);
1963 	return NET_RX_DROP;
1964 }
1965 
1966 int netif_rx_ni(struct sk_buff *skb)
1967 {
1968 	int err;
1969 
1970 	preempt_disable();
1971 	err = netif_rx(skb);
1972 	if (local_softirq_pending())
1973 		do_softirq();
1974 	preempt_enable();
1975 
1976 	return err;
1977 }
1978 
1979 EXPORT_SYMBOL(netif_rx_ni);
1980 
1981 static void net_tx_action(struct softirq_action *h)
1982 {
1983 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
1984 
1985 	if (sd->completion_queue) {
1986 		struct sk_buff *clist;
1987 
1988 		local_irq_disable();
1989 		clist = sd->completion_queue;
1990 		sd->completion_queue = NULL;
1991 		local_irq_enable();
1992 
1993 		while (clist) {
1994 			struct sk_buff *skb = clist;
1995 			clist = clist->next;
1996 
1997 			WARN_ON(atomic_read(&skb->users));
1998 			__kfree_skb(skb);
1999 		}
2000 	}
2001 
2002 	if (sd->output_queue) {
2003 		struct Qdisc *head;
2004 
2005 		local_irq_disable();
2006 		head = sd->output_queue;
2007 		sd->output_queue = NULL;
2008 		local_irq_enable();
2009 
2010 		while (head) {
2011 			struct Qdisc *q = head;
2012 			spinlock_t *root_lock;
2013 
2014 			head = head->next_sched;
2015 
2016 			root_lock = qdisc_lock(q);
2017 			if (spin_trylock(root_lock)) {
2018 				smp_mb__before_clear_bit();
2019 				clear_bit(__QDISC_STATE_SCHED,
2020 					  &q->state);
2021 				qdisc_run(q);
2022 				spin_unlock(root_lock);
2023 			} else {
2024 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2025 					      &q->state)) {
2026 					__netif_reschedule(q);
2027 				} else {
2028 					smp_mb__before_clear_bit();
2029 					clear_bit(__QDISC_STATE_SCHED,
2030 						  &q->state);
2031 				}
2032 			}
2033 		}
2034 	}
2035 }
2036 
2037 static inline int deliver_skb(struct sk_buff *skb,
2038 			      struct packet_type *pt_prev,
2039 			      struct net_device *orig_dev)
2040 {
2041 	atomic_inc(&skb->users);
2042 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2043 }
2044 
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
/* These hooks are defined here for ATM */
2047 struct net_bridge;
2048 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2049 						unsigned char *addr);
2050 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2051 
2052 /*
2053  * If bridge module is loaded call bridging hook.
2054  *  returns NULL if packet was consumed.
2055  */
2056 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2057 					struct sk_buff *skb) __read_mostly;
2058 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2059 					    struct packet_type **pt_prev, int *ret,
2060 					    struct net_device *orig_dev)
2061 {
2062 	struct net_bridge_port *port;
2063 
2064 	if (skb->pkt_type == PACKET_LOOPBACK ||
2065 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2066 		return skb;
2067 
2068 	if (*pt_prev) {
2069 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2070 		*pt_prev = NULL;
2071 	}
2072 
2073 	return br_handle_frame_hook(port, skb);
2074 }
2075 #else
2076 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2077 #endif
2078 
2079 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2080 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2081 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2082 
2083 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2084 					     struct packet_type **pt_prev,
2085 					     int *ret,
2086 					     struct net_device *orig_dev)
2087 {
2088 	if (skb->dev->macvlan_port == NULL)
2089 		return skb;
2090 
2091 	if (*pt_prev) {
2092 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2093 		*pt_prev = NULL;
2094 	}
2095 	return macvlan_handle_frame_hook(skb);
2096 }
2097 #else
2098 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2099 #endif
2100 
2101 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we currently pay for a few
 * useless instructions (a compare and two extra stores) when ingress
 * is not configured but CONFIG_NET_CLS_ACT is.
 * NOTE: This doesn't remove any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
2110 static int ing_filter(struct sk_buff *skb)
2111 {
2112 	struct net_device *dev = skb->dev;
2113 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2114 	struct netdev_queue *rxq;
2115 	int result = TC_ACT_OK;
2116 	struct Qdisc *q;
2117 
2118 	if (MAX_RED_LOOP < ttl++) {
		printk(KERN_WARNING
		       "Redir loop detected, dropping packet (%d->%d)\n",
2121 		       skb->iif, dev->ifindex);
2122 		return TC_ACT_SHOT;
2123 	}
2124 
2125 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2126 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2127 
2128 	rxq = &dev->rx_queue;
2129 
2130 	q = rxq->qdisc;
2131 	if (q != &noop_qdisc) {
2132 		spin_lock(qdisc_lock(q));
2133 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2134 			result = qdisc_enqueue_root(skb, q);
2135 		spin_unlock(qdisc_lock(q));
2136 	}
2137 
2138 	return result;
2139 }
2140 
2141 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2142 					 struct packet_type **pt_prev,
2143 					 int *ret, struct net_device *orig_dev)
2144 {
2145 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2146 		goto out;
2147 
2148 	if (*pt_prev) {
2149 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2150 		*pt_prev = NULL;
2151 	} else {
2152 		/* Huh? Why does turning on AF_PACKET affect this? */
2153 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2154 	}
2155 
2156 	switch (ing_filter(skb)) {
2157 	case TC_ACT_SHOT:
2158 	case TC_ACT_STOLEN:
2159 		kfree_skb(skb);
2160 		return NULL;
2161 	}
2162 
2163 out:
2164 	skb->tc_verd = 0;
2165 	return skb;
2166 }
2167 #endif
2168 
2169 /*
2170  * 	netif_nit_deliver - deliver received packets to network taps
2171  * 	@skb: buffer
2172  *
2173  * 	This function is used to deliver incoming packets to network
2174  * 	taps. It should be used when the normal netif_receive_skb path
2175  * 	is bypassed, for example because of VLAN acceleration.
2176  */
2177 void netif_nit_deliver(struct sk_buff *skb)
2178 {
2179 	struct packet_type *ptype;
2180 
2181 	if (list_empty(&ptype_all))
2182 		return;
2183 
2184 	skb_reset_network_header(skb);
2185 	skb_reset_transport_header(skb);
2186 	skb->mac_len = skb->network_header - skb->mac_header;
2187 
2188 	rcu_read_lock();
2189 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2190 		if (!ptype->dev || ptype->dev == skb->dev)
2191 			deliver_skb(skb, ptype, skb->dev);
2192 	}
2193 	rcu_read_unlock();
2194 }
2195 
2196 /**
2197  *	netif_receive_skb - process receive buffer from network
2198  *	@skb: buffer to process
2199  *
2200  *	netif_receive_skb() is the main receive data processing function.
2201  *	It always succeeds. The buffer may be dropped during processing
2202  *	for congestion control or by the protocol layers.
2203  *
2204  *	This function may only be called from softirq context and interrupts
2205  *	should be enabled.
2206  *
2207  *	Return values (usually ignored):
2208  *	NET_RX_SUCCESS: no congestion
2209  *	NET_RX_DROP: packet was dropped
2210  */
2211 int netif_receive_skb(struct sk_buff *skb)
2212 {
2213 	struct packet_type *ptype, *pt_prev;
2214 	struct net_device *orig_dev;
2215 	struct net_device *null_or_orig;
2216 	int ret = NET_RX_DROP;
2217 	__be16 type;
2218 
2219 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2220 		return NET_RX_SUCCESS;
2221 
2222 	/* if we've gotten here through NAPI, check netpoll */
2223 	if (netpoll_receive_skb(skb))
2224 		return NET_RX_DROP;
2225 
2226 	if (!skb->tstamp.tv64)
2227 		net_timestamp(skb);
2228 
2229 	if (!skb->iif)
2230 		skb->iif = skb->dev->ifindex;
2231 
2232 	null_or_orig = NULL;
2233 	orig_dev = skb->dev;
2234 	if (orig_dev->master) {
2235 		if (skb_bond_should_drop(skb))
2236 			null_or_orig = orig_dev; /* deliver only exact match */
2237 		else
2238 			skb->dev = orig_dev->master;
2239 	}
2240 
2241 	__get_cpu_var(netdev_rx_stat).total++;
2242 
2243 	skb_reset_network_header(skb);
2244 	skb_reset_transport_header(skb);
2245 	skb->mac_len = skb->network_header - skb->mac_header;
2246 
2247 	pt_prev = NULL;
2248 
2249 	rcu_read_lock();
2250 
2251 #ifdef CONFIG_NET_CLS_ACT
2252 	if (skb->tc_verd & TC_NCLS) {
2253 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2254 		goto ncls;
2255 	}
2256 #endif
2257 
2258 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2259 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2260 		    ptype->dev == orig_dev) {
2261 			if (pt_prev)
2262 				ret = deliver_skb(skb, pt_prev, orig_dev);
2263 			pt_prev = ptype;
2264 		}
2265 	}
2266 
2267 #ifdef CONFIG_NET_CLS_ACT
2268 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2269 	if (!skb)
2270 		goto out;
2271 ncls:
2272 #endif
2273 
2274 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2275 	if (!skb)
2276 		goto out;
2277 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2278 	if (!skb)
2279 		goto out;
2280 
2281 	skb_orphan(skb);
2282 
2283 	type = skb->protocol;
2284 	list_for_each_entry_rcu(ptype,
2285 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2286 		if (ptype->type == type &&
2287 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2288 		     ptype->dev == orig_dev)) {
2289 			if (pt_prev)
2290 				ret = deliver_skb(skb, pt_prev, orig_dev);
2291 			pt_prev = ptype;
2292 		}
2293 	}
2294 
2295 	if (pt_prev) {
2296 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2297 	} else {
2298 		kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
		 */
2302 		ret = NET_RX_DROP;
2303 	}
2304 
2305 out:
2306 	rcu_read_unlock();
2307 	return ret;
2308 }
2309 
2310 /* Network device is going away, flush any packets still pending  */
2311 static void flush_backlog(void *arg)
2312 {
2313 	struct net_device *dev = arg;
2314 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2315 	struct sk_buff *skb, *tmp;
2316 
2317 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2318 		if (skb->dev == dev) {
2319 			__skb_unlink(skb, &queue->input_pkt_queue);
2320 			kfree_skb(skb);
2321 		}
2322 }
2323 
2324 static int napi_gro_complete(struct sk_buff *skb)
2325 {
2326 	struct packet_type *ptype;
2327 	__be16 type = skb->protocol;
2328 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2329 	int err = -ENOENT;
2330 
2331 	if (NAPI_GRO_CB(skb)->count == 1)
2332 		goto out;
2333 
2334 	rcu_read_lock();
2335 	list_for_each_entry_rcu(ptype, head, list) {
2336 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2337 			continue;
2338 
2339 		err = ptype->gro_complete(skb);
2340 		break;
2341 	}
2342 	rcu_read_unlock();
2343 
2344 	if (err) {
2345 		WARN_ON(&ptype->list == head);
2346 		kfree_skb(skb);
2347 		return NET_RX_SUCCESS;
2348 	}
2349 
2350 out:
2351 	skb_shinfo(skb)->gso_size = 0;
2352 	return netif_receive_skb(skb);
2353 }
2354 
2355 void napi_gro_flush(struct napi_struct *napi)
2356 {
2357 	struct sk_buff *skb, *next;
2358 
2359 	for (skb = napi->gro_list; skb; skb = next) {
2360 		next = skb->next;
2361 		skb->next = NULL;
2362 		napi_gro_complete(skb);
2363 	}
2364 
2365 	napi->gro_count = 0;
2366 	napi->gro_list = NULL;
2367 }
2368 EXPORT_SYMBOL(napi_gro_flush);
2369 
2370 void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
2371 {
2372 	unsigned int offset = skb_gro_offset(skb);
2373 
2374 	hlen += offset;
2375 	if (hlen <= skb_headlen(skb))
2376 		return skb->data + offset;
2377 
2378 	if (unlikely(!skb_shinfo(skb)->nr_frags ||
2379 		     skb_shinfo(skb)->frags[0].size <=
2380 		     hlen - skb_headlen(skb) ||
2381 		     PageHighMem(skb_shinfo(skb)->frags[0].page)))
2382 		return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
2383 
2384 	return page_address(skb_shinfo(skb)->frags[0].page) +
2385 	       skb_shinfo(skb)->frags[0].page_offset +
2386 	       offset - skb_headlen(skb);
2387 }
2388 EXPORT_SYMBOL(skb_gro_header);
2389 
2390 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2391 {
2392 	struct sk_buff **pp = NULL;
2393 	struct packet_type *ptype;
2394 	__be16 type = skb->protocol;
2395 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2396 	int same_flow;
2397 	int mac_len;
2398 	int ret;
2399 
2400 	if (!(skb->dev->features & NETIF_F_GRO))
2401 		goto normal;
2402 
2403 	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2404 		goto normal;
2405 
2406 	rcu_read_lock();
2407 	list_for_each_entry_rcu(ptype, head, list) {
2408 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2409 			continue;
2410 
2411 		skb_set_network_header(skb, skb_gro_offset(skb));
2412 		mac_len = skb->network_header - skb->mac_header;
2413 		skb->mac_len = mac_len;
2414 		NAPI_GRO_CB(skb)->same_flow = 0;
2415 		NAPI_GRO_CB(skb)->flush = 0;
2416 		NAPI_GRO_CB(skb)->free = 0;
2417 
2418 		pp = ptype->gro_receive(&napi->gro_list, skb);
2419 		break;
2420 	}
2421 	rcu_read_unlock();
2422 
2423 	if (&ptype->list == head)
2424 		goto normal;
2425 
2426 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2427 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2428 
2429 	if (pp) {
2430 		struct sk_buff *nskb = *pp;
2431 
2432 		*pp = nskb->next;
2433 		nskb->next = NULL;
2434 		napi_gro_complete(nskb);
2435 		napi->gro_count--;
2436 	}
2437 
2438 	if (same_flow)
2439 		goto ok;
2440 
2441 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2442 		goto normal;
2443 
2444 	napi->gro_count++;
2445 	NAPI_GRO_CB(skb)->count = 1;
2446 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2447 	skb->next = napi->gro_list;
2448 	napi->gro_list = skb;
2449 	ret = GRO_HELD;
2450 
2451 pull:
2452 	if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
2453 		if (napi->gro_list == skb)
2454 			napi->gro_list = skb->next;
2455 		ret = GRO_DROP;
2456 	}
2457 
2458 ok:
2459 	return ret;
2460 
2461 normal:
2462 	ret = GRO_NORMAL;
2463 	goto pull;
2464 }
2465 EXPORT_SYMBOL(dev_gro_receive);
2466 
2467 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2468 {
2469 	struct sk_buff *p;
2470 
2471 	if (netpoll_rx_on(skb))
2472 		return GRO_NORMAL;
2473 
2474 	for (p = napi->gro_list; p; p = p->next) {
2475 		NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
2476 			skb_mac_header(p), skb_gro_mac_header(skb));
2477 		NAPI_GRO_CB(p)->flush = 0;
2478 	}
2479 
2480 	return dev_gro_receive(napi, skb);
2481 }
2482 
2483 int napi_skb_finish(int ret, struct sk_buff *skb)
2484 {
2485 	int err = NET_RX_SUCCESS;
2486 
2487 	switch (ret) {
2488 	case GRO_NORMAL:
2489 		return netif_receive_skb(skb);
2490 
2491 	case GRO_DROP:
2492 		err = NET_RX_DROP;
2493 		/* fall through */
2494 
2495 	case GRO_MERGED_FREE:
2496 		kfree_skb(skb);
2497 		break;
2498 	}
2499 
2500 	return err;
2501 }
2502 EXPORT_SYMBOL(napi_skb_finish);
2503 
2504 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2505 {
2506 	skb_gro_reset_offset(skb);
2507 
2508 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2509 }
2510 EXPORT_SYMBOL(napi_gro_receive);
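
/*
 * Sketch of a NAPI ->poll() handler using the GRO entry point above.  The
 * example_gro_* names and the fetch_rx callback stand in for a real
 * driver's private state and descriptor-ring handling.
 */
struct example_gro_priv {
	struct napi_struct napi;
	/* hypothetical hook returning the next completed receive skb */
	struct sk_buff *(*fetch_rx)(struct example_gro_priv *p);
};

static inline int example_gro_poll(struct napi_struct *napi, int budget)
{
	struct example_gro_priv *p =
		container_of(napi, struct example_gro_priv, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = p->fetch_rx(p);

		if (!skb)
			break;

		/* Let GRO merge same-flow packets before the stack sees them. */
		napi_gro_receive(napi, skb);
		work_done++;
	}

	/* Leave polling mode only if the ring drained before the budget. */
	if (work_done < budget)
		napi_complete(napi);

	return work_done;
}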
2511 
2512 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2513 {
2514 	__skb_pull(skb, skb_headlen(skb));
2515 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2516 
2517 	napi->skb = skb;
2518 }
2519 EXPORT_SYMBOL(napi_reuse_skb);
2520 
2521 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2522 				  struct napi_gro_fraginfo *info)
2523 {
2524 	struct net_device *dev = napi->dev;
2525 	struct sk_buff *skb = napi->skb;
2526 	struct ethhdr *eth;
2527 	skb_frag_t *frag;
2528 	int i;
2529 
2530 	napi->skb = NULL;
2531 
2532 	if (!skb) {
2533 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2534 		if (!skb)
2535 			goto out;
2536 
2537 		skb_reserve(skb, NET_IP_ALIGN);
2538 	}
2539 
2540 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2541 	frag = &info->frags[info->nr_frags - 1];
2542 
2543 	for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
2544 		skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
2545 				   frag->size);
2546 		frag++;
2547 	}
2548 	skb_shinfo(skb)->nr_frags = info->nr_frags;
2549 
2550 	skb->data_len = info->len;
2551 	skb->len += info->len;
2552 	skb->truesize += info->len;
2553 
2554 	skb_reset_mac_header(skb);
2555 	skb_gro_reset_offset(skb);
2556 
2557 	eth = skb_gro_header(skb, sizeof(*eth));
2558 	if (!eth) {
2559 		napi_reuse_skb(napi, skb);
2560 		skb = NULL;
2561 		goto out;
2562 	}
2563 
2564 	skb_gro_pull(skb, sizeof(*eth));
2565 
2566 	/*
2567 	 * This works because the only protocols we care about don't require
2568 	 * special handling.  We'll fix it up properly at the end.
2569 	 */
2570 	skb->protocol = eth->h_proto;
2571 
2572 	skb->ip_summed = info->ip_summed;
2573 	skb->csum = info->csum;
2574 
2575 out:
2576 	return skb;
2577 }
2578 EXPORT_SYMBOL(napi_fraginfo_skb);
2579 
2580 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2581 {
2582 	int err = NET_RX_SUCCESS;
2583 
2584 	switch (ret) {
2585 	case GRO_NORMAL:
2586 	case GRO_HELD:
2587 		skb->protocol = eth_type_trans(skb, napi->dev);
2588 
2589 		if (ret == GRO_NORMAL)
2590 			return netif_receive_skb(skb);
2591 
2592 		skb_gro_pull(skb, -ETH_HLEN);
2593 		break;
2594 
2595 	case GRO_DROP:
2596 		err = NET_RX_DROP;
2597 		/* fall through */
2598 
2599 	case GRO_MERGED_FREE:
2600 		napi_reuse_skb(napi, skb);
2601 		break;
2602 	}
2603 
2604 	return err;
2605 }
2606 EXPORT_SYMBOL(napi_frags_finish);
2607 
2608 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2609 {
2610 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2611 
2612 	if (!skb)
2613 		return NET_RX_DROP;
2614 
2615 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2616 }
2617 EXPORT_SYMBOL(napi_gro_frags);
2618 
2619 static int process_backlog(struct napi_struct *napi, int quota)
2620 {
2621 	int work = 0;
2622 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2623 	unsigned long start_time = jiffies;
2624 
2625 	napi->weight = weight_p;
2626 	do {
2627 		struct sk_buff *skb;
2628 
2629 		local_irq_disable();
2630 		skb = __skb_dequeue(&queue->input_pkt_queue);
2631 		if (!skb) {
2632 			__napi_complete(napi);
2633 			local_irq_enable();
2634 			break;
2635 		}
2636 		local_irq_enable();
2637 
2638 		netif_receive_skb(skb);
2639 	} while (++work < quota && jiffies == start_time);
2640 
2641 	return work;
2642 }
2643 
2644 /**
2645  * __napi_schedule - schedule for receive
2646  * @n: entry to schedule
2647  *
2648  * The entry's receive function will be scheduled to run
2649  */
2650 void __napi_schedule(struct napi_struct *n)
2651 {
2652 	unsigned long flags;
2653 
2654 	local_irq_save(flags);
2655 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2656 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2657 	local_irq_restore(flags);
2658 }
2659 EXPORT_SYMBOL(__napi_schedule);
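
/*
 * Sketch of the interrupt-side half of NAPI (hypothetical example_irq_*
 * names): the hard IRQ handler masks further RX interrupts and schedules
 * the napi_struct, and the ->poll() callback then runs from
 * net_rx_action() in softirq context.
 */
struct example_irq_priv {
	struct napi_struct napi;
	/* device-specific hook that masks the device's RX interrupts */
	void (*mask_rx_irqs)(struct example_irq_priv *p);
};

static inline irqreturn_t example_rx_interrupt(int irq, void *dev_id)
{
	struct example_irq_priv *p = dev_id;

	/* Stop the device from raising more RX interrupts ... */
	p->mask_rx_irqs(p);

	/* ... and defer the actual packet work to softirq context. */
	napi_schedule(&p->napi);

	return IRQ_HANDLED;
}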
2660 
2661 void __napi_complete(struct napi_struct *n)
2662 {
2663 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2664 	BUG_ON(n->gro_list);
2665 
2666 	list_del(&n->poll_list);
2667 	smp_mb__before_clear_bit();
2668 	clear_bit(NAPI_STATE_SCHED, &n->state);
2669 }
2670 EXPORT_SYMBOL(__napi_complete);
2671 
2672 void napi_complete(struct napi_struct *n)
2673 {
2674 	unsigned long flags;
2675 
2676 	/*
2677 	 * don't let napi dequeue from the cpu poll list
	 * just in case it's running on a different cpu
2679 	 */
2680 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2681 		return;
2682 
2683 	napi_gro_flush(n);
2684 	local_irq_save(flags);
2685 	__napi_complete(n);
2686 	local_irq_restore(flags);
2687 }
2688 EXPORT_SYMBOL(napi_complete);
2689 
2690 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2691 		    int (*poll)(struct napi_struct *, int), int weight)
2692 {
2693 	INIT_LIST_HEAD(&napi->poll_list);
2694 	napi->gro_count = 0;
2695 	napi->gro_list = NULL;
2696 	napi->skb = NULL;
2697 	napi->poll = poll;
2698 	napi->weight = weight;
2699 	list_add(&napi->dev_list, &dev->napi_list);
2700 	napi->dev = dev;
2701 #ifdef CONFIG_NETPOLL
2702 	spin_lock_init(&napi->poll_lock);
2703 	napi->poll_owner = -1;
2704 #endif
2705 	set_bit(NAPI_STATE_SCHED, &napi->state);
2706 }
2707 EXPORT_SYMBOL(netif_napi_add);
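
/*
 * Sketch of how a driver typically wires NAPI up (hypothetical
 * example_drv_* names): netif_napi_add() registers the context with a
 * ->poll() handler and weight at probe time, napi_enable() arms it from
 * the open path, and napi_disable() quiesces it again before teardown.
 */
struct example_drv_priv {
	struct napi_struct napi;
};

static inline void example_drv_setup_napi(struct net_device *dev,
					  int (*poll)(struct napi_struct *, int))
{
	struct example_drv_priv *p = netdev_priv(dev);

	/* 64 is the conventional weight for Ethernet drivers. */
	netif_napi_add(dev, &p->napi, poll, 64);
}

static inline void example_drv_open_napi(struct net_device *dev)
{
	struct example_drv_priv *p = netdev_priv(dev);

	napi_enable(&p->napi);	/* before enabling RX interrupts */
}

static inline void example_drv_stop_napi(struct net_device *dev)
{
	struct example_drv_priv *p = netdev_priv(dev);

	napi_disable(&p->napi);	/* waits for a running ->poll() to finish */
}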
2708 
2709 void netif_napi_del(struct napi_struct *napi)
2710 {
2711 	struct sk_buff *skb, *next;
2712 
2713 	list_del_init(&napi->dev_list);
2714 	kfree_skb(napi->skb);
2715 
2716 	for (skb = napi->gro_list; skb; skb = next) {
2717 		next = skb->next;
2718 		skb->next = NULL;
2719 		kfree_skb(skb);
2720 	}
2721 
2722 	napi->gro_list = NULL;
2723 	napi->gro_count = 0;
2724 }
2725 EXPORT_SYMBOL(netif_napi_del);
2726 
2727 
2728 static void net_rx_action(struct softirq_action *h)
2729 {
2730 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2731 	unsigned long time_limit = jiffies + 2;
2732 	int budget = netdev_budget;
2733 	void *have;
2734 
2735 	local_irq_disable();
2736 
2737 	while (!list_empty(list)) {
2738 		struct napi_struct *n;
2739 		int work, weight;
2740 
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which allows
		 * an average latency of 1.5/HZ.
		 */
2745 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2746 			goto softnet_break;
2747 
2748 		local_irq_enable();
2749 
2750 		/* Even though interrupts have been re-enabled, this
2751 		 * access is safe because interrupts can only add new
2752 		 * entries to the tail of this list, and only ->poll()
2753 		 * calls can remove this head entry from the list.
2754 		 */
2755 		n = list_entry(list->next, struct napi_struct, poll_list);
2756 
2757 		have = netpoll_poll_lock(n);
2758 
2759 		weight = n->weight;
2760 
2761 		/* This NAPI_STATE_SCHED test is for avoiding a race
2762 		 * with netpoll's poll_napi().  Only the entity which
2763 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2764 		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
2766 		 */
2767 		work = 0;
2768 		if (test_bit(NAPI_STATE_SCHED, &n->state))
2769 			work = n->poll(n, weight);
2770 
2771 		WARN_ON_ONCE(work > weight);
2772 
2773 		budget -= work;
2774 
2775 		local_irq_disable();
2776 
2777 		/* Drivers must not modify the NAPI state if they
2778 		 * consume the entire weight.  In such cases this code
2779 		 * still "owns" the NAPI instance and therefore can
2780 		 * move the instance around on the list at-will.
2781 		 */
2782 		if (unlikely(work == weight)) {
2783 			if (unlikely(napi_disable_pending(n)))
2784 				__napi_complete(n);
2785 			else
2786 				list_move_tail(&n->poll_list, list);
2787 		}
2788 
2789 		netpoll_poll_unlock(have);
2790 	}
2791 out:
2792 	local_irq_enable();
2793 
2794 #ifdef CONFIG_NET_DMA
2795 	/*
2796 	 * There may not be any more sk_buffs coming right now, so push
2797 	 * any pending DMA copies to hardware
2798 	 */
2799 	dma_issue_pending_all();
2800 #endif
2801 
2802 	return;
2803 
2804 softnet_break:
2805 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2806 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2807 	goto out;
2808 }
2809 
static gifconf_func_t *gifconf_list[NPROTO];
2811 
2812 /**
2813  *	register_gifconf	-	register a SIOCGIF handler
2814  *	@family: Address family
2815  *	@gifconf: Function handler
2816  *
2817  *	Register protocol dependent address dumping routines. The handler
2818  *	that is passed must not be freed or reused until it has been replaced
2819  *	by another handler.
2820  */
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2822 {
2823 	if (family >= NPROTO)
2824 		return -EINVAL;
2825 	gifconf_list[family] = gifconf;
2826 	return 0;
2827 }
2828 
2829 
2830 /*
2831  *	Map an interface index to its name (SIOCGIFNAME)
2832  */
2833 
2834 /*
2835  *	We need this ioctl for efficient implementation of the
2836  *	if_indextoname() function required by the IPv6 API.  Without
2837  *	it, we would have to search all the interfaces to find a
2838  *	match.  --pb
2839  */
2840 
2841 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2842 {
2843 	struct net_device *dev;
2844 	struct ifreq ifr;
2845 
2846 	/*
2847 	 *	Fetch the caller's info block.
2848 	 */
2849 
2850 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2851 		return -EFAULT;
2852 
2853 	read_lock(&dev_base_lock);
2854 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2855 	if (!dev) {
2856 		read_unlock(&dev_base_lock);
2857 		return -ENODEV;
2858 	}
2859 
2860 	strcpy(ifr.ifr_name, dev->name);
2861 	read_unlock(&dev_base_lock);
2862 
2863 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2864 		return -EFAULT;
2865 	return 0;
2866 }
2867 
2868 /*
2869  *	Perform a SIOCGIFCONF call. This structure will change
2870  *	size eventually, and there is nothing I can do about it.
2871  *	Thus we will need a 'compatibility mode'.
2872  */
2873 
2874 static int dev_ifconf(struct net *net, char __user *arg)
2875 {
2876 	struct ifconf ifc;
2877 	struct net_device *dev;
2878 	char __user *pos;
2879 	int len;
2880 	int total;
2881 	int i;
2882 
2883 	/*
2884 	 *	Fetch the caller's info block.
2885 	 */
2886 
2887 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2888 		return -EFAULT;
2889 
2890 	pos = ifc.ifc_buf;
2891 	len = ifc.ifc_len;
2892 
2893 	/*
2894 	 *	Loop over the interfaces, and write an info block for each.
2895 	 */
2896 
2897 	total = 0;
2898 	for_each_netdev(net, dev) {
2899 		for (i = 0; i < NPROTO; i++) {
2900 			if (gifconf_list[i]) {
2901 				int done;
2902 				if (!pos)
2903 					done = gifconf_list[i](dev, NULL, 0);
2904 				else
2905 					done = gifconf_list[i](dev, pos + total,
2906 							       len - total);
2907 				if (done < 0)
2908 					return -EFAULT;
2909 				total += done;
2910 			}
2911 		}
2912 	}
2913 
2914 	/*
2915 	 *	All done.  Write the updated control block back to the caller.
2916 	 */
2917 	ifc.ifc_len = total;
2918 
2919 	/*
2920 	 * 	Both BSD and Solaris return 0 here, so we do too.
2921 	 */
2922 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2923 }
2924 
2925 #ifdef CONFIG_PROC_FS
2926 /*
2927  *	This is invoked by the /proc filesystem handler to display a device
2928  *	in detail.
2929  */
2930 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2931 	__acquires(dev_base_lock)
2932 {
2933 	struct net *net = seq_file_net(seq);
2934 	loff_t off;
2935 	struct net_device *dev;
2936 
2937 	read_lock(&dev_base_lock);
2938 	if (!*pos)
2939 		return SEQ_START_TOKEN;
2940 
2941 	off = 1;
2942 	for_each_netdev(net, dev)
2943 		if (off++ == *pos)
2944 			return dev;
2945 
2946 	return NULL;
2947 }
2948 
2949 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2950 {
2951 	struct net *net = seq_file_net(seq);
2952 	++*pos;
2953 	return v == SEQ_START_TOKEN ?
2954 		first_net_device(net) : next_net_device((struct net_device *)v);
2955 }
2956 
2957 void dev_seq_stop(struct seq_file *seq, void *v)
2958 	__releases(dev_base_lock)
2959 {
2960 	read_unlock(&dev_base_lock);
2961 }
2962 
2963 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2964 {
2965 	const struct net_device_stats *stats = dev_get_stats(dev);
2966 
2967 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2968 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2969 		   dev->name, stats->rx_bytes, stats->rx_packets,
2970 		   stats->rx_errors,
2971 		   stats->rx_dropped + stats->rx_missed_errors,
2972 		   stats->rx_fifo_errors,
2973 		   stats->rx_length_errors + stats->rx_over_errors +
2974 		    stats->rx_crc_errors + stats->rx_frame_errors,
2975 		   stats->rx_compressed, stats->multicast,
2976 		   stats->tx_bytes, stats->tx_packets,
2977 		   stats->tx_errors, stats->tx_dropped,
2978 		   stats->tx_fifo_errors, stats->collisions,
2979 		   stats->tx_carrier_errors +
2980 		    stats->tx_aborted_errors +
2981 		    stats->tx_window_errors +
2982 		    stats->tx_heartbeat_errors,
2983 		   stats->tx_compressed);
2984 }
2985 
2986 /*
2987  *	Called from the PROCfs module. This now uses the new arbitrary sized
2988  *	/proc/net interface to create /proc/net/dev
2989  */
2990 static int dev_seq_show(struct seq_file *seq, void *v)
2991 {
2992 	if (v == SEQ_START_TOKEN)
2993 		seq_puts(seq, "Inter-|   Receive                            "
2994 			      "                    |  Transmit\n"
2995 			      " face |bytes    packets errs drop fifo frame "
2996 			      "compressed multicast|bytes    packets errs "
2997 			      "drop fifo colls carrier compressed\n");
2998 	else
2999 		dev_seq_printf_stats(seq, v);
3000 	return 0;
3001 }
3002 
3003 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3004 {
3005 	struct netif_rx_stats *rc = NULL;
3006 
3007 	while (*pos < nr_cpu_ids)
3008 		if (cpu_online(*pos)) {
3009 			rc = &per_cpu(netdev_rx_stat, *pos);
3010 			break;
3011 		} else
3012 			++*pos;
3013 	return rc;
3014 }
3015 
3016 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3017 {
3018 	return softnet_get_online(pos);
3019 }
3020 
3021 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3022 {
3023 	++*pos;
3024 	return softnet_get_online(pos);
3025 }
3026 
3027 static void softnet_seq_stop(struct seq_file *seq, void *v)
3028 {
3029 }
3030 
3031 static int softnet_seq_show(struct seq_file *seq, void *v)
3032 {
3033 	struct netif_rx_stats *s = v;
3034 
3035 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3036 		   s->total, s->dropped, s->time_squeeze, 0,
3037 		   0, 0, 0, 0, /* was fastroute */
3038 		   s->cpu_collision );
3039 	return 0;
3040 }
3041 
3042 static const struct seq_operations dev_seq_ops = {
3043 	.start = dev_seq_start,
3044 	.next  = dev_seq_next,
3045 	.stop  = dev_seq_stop,
3046 	.show  = dev_seq_show,
3047 };
3048 
3049 static int dev_seq_open(struct inode *inode, struct file *file)
3050 {
3051 	return seq_open_net(inode, file, &dev_seq_ops,
3052 			    sizeof(struct seq_net_private));
3053 }
3054 
3055 static const struct file_operations dev_seq_fops = {
3056 	.owner	 = THIS_MODULE,
3057 	.open    = dev_seq_open,
3058 	.read    = seq_read,
3059 	.llseek  = seq_lseek,
3060 	.release = seq_release_net,
3061 };
3062 
3063 static const struct seq_operations softnet_seq_ops = {
3064 	.start = softnet_seq_start,
3065 	.next  = softnet_seq_next,
3066 	.stop  = softnet_seq_stop,
3067 	.show  = softnet_seq_show,
3068 };
3069 
3070 static int softnet_seq_open(struct inode *inode, struct file *file)
3071 {
3072 	return seq_open(file, &softnet_seq_ops);
3073 }
3074 
3075 static const struct file_operations softnet_seq_fops = {
3076 	.owner	 = THIS_MODULE,
3077 	.open    = softnet_seq_open,
3078 	.read    = seq_read,
3079 	.llseek  = seq_lseek,
3080 	.release = seq_release,
3081 };
3082 
3083 static void *ptype_get_idx(loff_t pos)
3084 {
3085 	struct packet_type *pt = NULL;
3086 	loff_t i = 0;
3087 	int t;
3088 
3089 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3090 		if (i == pos)
3091 			return pt;
3092 		++i;
3093 	}
3094 
3095 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3096 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3097 			if (i == pos)
3098 				return pt;
3099 			++i;
3100 		}
3101 	}
3102 	return NULL;
3103 }
3104 
3105 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3106 	__acquires(RCU)
3107 {
3108 	rcu_read_lock();
3109 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3110 }
3111 
3112 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3113 {
3114 	struct packet_type *pt;
3115 	struct list_head *nxt;
3116 	int hash;
3117 
3118 	++*pos;
3119 	if (v == SEQ_START_TOKEN)
3120 		return ptype_get_idx(0);
3121 
3122 	pt = v;
3123 	nxt = pt->list.next;
3124 	if (pt->type == htons(ETH_P_ALL)) {
3125 		if (nxt != &ptype_all)
3126 			goto found;
3127 		hash = 0;
3128 		nxt = ptype_base[0].next;
3129 	} else
3130 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3131 
3132 	while (nxt == &ptype_base[hash]) {
3133 		if (++hash >= PTYPE_HASH_SIZE)
3134 			return NULL;
3135 		nxt = ptype_base[hash].next;
3136 	}
3137 found:
3138 	return list_entry(nxt, struct packet_type, list);
3139 }
3140 
3141 static void ptype_seq_stop(struct seq_file *seq, void *v)
3142 	__releases(RCU)
3143 {
3144 	rcu_read_unlock();
3145 }
3146 
3147 static int ptype_seq_show(struct seq_file *seq, void *v)
3148 {
3149 	struct packet_type *pt = v;
3150 
3151 	if (v == SEQ_START_TOKEN)
3152 		seq_puts(seq, "Type Device      Function\n");
3153 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3154 		if (pt->type == htons(ETH_P_ALL))
3155 			seq_puts(seq, "ALL ");
3156 		else
3157 			seq_printf(seq, "%04x", ntohs(pt->type));
3158 
3159 		seq_printf(seq, " %-8s %pF\n",
3160 			   pt->dev ? pt->dev->name : "", pt->func);
3161 	}
3162 
3163 	return 0;
3164 }
3165 
3166 static const struct seq_operations ptype_seq_ops = {
3167 	.start = ptype_seq_start,
3168 	.next  = ptype_seq_next,
3169 	.stop  = ptype_seq_stop,
3170 	.show  = ptype_seq_show,
3171 };
3172 
3173 static int ptype_seq_open(struct inode *inode, struct file *file)
3174 {
3175 	return seq_open_net(inode, file, &ptype_seq_ops,
3176 			sizeof(struct seq_net_private));
3177 }
3178 
3179 static const struct file_operations ptype_seq_fops = {
3180 	.owner	 = THIS_MODULE,
3181 	.open    = ptype_seq_open,
3182 	.read    = seq_read,
3183 	.llseek  = seq_lseek,
3184 	.release = seq_release_net,
3185 };
3186 
3187 
3188 static int __net_init dev_proc_net_init(struct net *net)
3189 {
3190 	int rc = -ENOMEM;
3191 
3192 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3193 		goto out;
3194 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3195 		goto out_dev;
3196 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3197 		goto out_softnet;
3198 
3199 	if (wext_proc_init(net))
3200 		goto out_ptype;
3201 	rc = 0;
3202 out:
3203 	return rc;
3204 out_ptype:
3205 	proc_net_remove(net, "ptype");
3206 out_softnet:
3207 	proc_net_remove(net, "softnet_stat");
3208 out_dev:
3209 	proc_net_remove(net, "dev");
3210 	goto out;
3211 }
3212 
3213 static void __net_exit dev_proc_net_exit(struct net *net)
3214 {
3215 	wext_proc_exit(net);
3216 
3217 	proc_net_remove(net, "ptype");
3218 	proc_net_remove(net, "softnet_stat");
3219 	proc_net_remove(net, "dev");
3220 }
3221 
3222 static struct pernet_operations __net_initdata dev_proc_ops = {
3223 	.init = dev_proc_net_init,
3224 	.exit = dev_proc_net_exit,
3225 };
3226 
3227 static int __init dev_proc_init(void)
3228 {
3229 	return register_pernet_subsys(&dev_proc_ops);
3230 }
3231 #else
3232 #define dev_proc_init() 0
3233 #endif	/* CONFIG_PROC_FS */
3234 
3235 
3236 /**
3237  *	netdev_set_master	-	set up master/slave pair
3238  *	@slave: slave device
3239  *	@master: new master device
3240  *
3241  *	Changes the master device of the slave. Pass %NULL to break the
3242  *	bonding. The caller must hold the RTNL semaphore. On a failure
3243  *	a negative errno code is returned. On success the reference counts
3244  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3245  *	function returns zero.
3246  */
3247 int netdev_set_master(struct net_device *slave, struct net_device *master)
3248 {
3249 	struct net_device *old = slave->master;
3250 
3251 	ASSERT_RTNL();
3252 
3253 	if (master) {
3254 		if (old)
3255 			return -EBUSY;
3256 		dev_hold(master);
3257 	}
3258 
3259 	slave->master = master;
3260 
3261 	synchronize_net();
3262 
3263 	if (old)
3264 		dev_put(old);
3265 
3266 	if (master)
3267 		slave->flags |= IFF_SLAVE;
3268 	else
3269 		slave->flags &= ~IFF_SLAVE;
3270 
3271 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3272 	return 0;
3273 }
3274 
3275 static void dev_change_rx_flags(struct net_device *dev, int flags)
3276 {
3277 	const struct net_device_ops *ops = dev->netdev_ops;
3278 
3279 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3280 		ops->ndo_change_rx_flags(dev, flags);
3281 }
3282 
3283 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3284 {
3285 	unsigned short old_flags = dev->flags;
3286 	uid_t uid;
3287 	gid_t gid;
3288 
3289 	ASSERT_RTNL();
3290 
3291 	dev->flags |= IFF_PROMISC;
3292 	dev->promiscuity += inc;
3293 	if (dev->promiscuity == 0) {
3294 		/*
3295 		 * Avoid overflow.
		 * If inc causes overflow, leave promisc untouched and return an error.
3297 		 */
3298 		if (inc < 0)
3299 			dev->flags &= ~IFF_PROMISC;
3300 		else {
3301 			dev->promiscuity -= inc;
3302 			printk(KERN_WARNING "%s: promiscuity touches roof, "
3303 				"set promiscuity failed, promiscuity feature "
3304 				"of device might be broken.\n", dev->name);
3305 			return -EOVERFLOW;
3306 		}
3307 	}
3308 	if (dev->flags != old_flags) {
3309 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3310 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3311 							       "left");
3312 		if (audit_enabled) {
3313 			current_uid_gid(&uid, &gid);
3314 			audit_log(current->audit_context, GFP_ATOMIC,
3315 				AUDIT_ANOM_PROMISCUOUS,
3316 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3317 				dev->name, (dev->flags & IFF_PROMISC),
3318 				(old_flags & IFF_PROMISC),
3319 				audit_get_loginuid(current),
3320 				uid, gid,
3321 				audit_get_sessionid(current));
3322 		}
3323 
3324 		dev_change_rx_flags(dev, IFF_PROMISC);
3325 	}
3326 	return 0;
3327 }
3328 
3329 /**
3330  *	dev_set_promiscuity	- update promiscuity count on a device
3331  *	@dev: device
3332  *	@inc: modifier
3333  *
3334  *	Add or remove promiscuity from a device. While the count in the device
3335  *	remains above zero the interface remains promiscuous. Once it hits zero
3336  *	the device reverts back to normal filtering operation. A negative inc
3337  *	value is used to drop promiscuity on the device.
3338  *	Return 0 if successful or a negative errno code on error.
3339  */
3340 int dev_set_promiscuity(struct net_device *dev, int inc)
3341 {
3342 	unsigned short old_flags = dev->flags;
3343 	int err;
3344 
3345 	err = __dev_set_promiscuity(dev, inc);
3346 	if (err < 0)
3347 		return err;
3348 	if (dev->flags != old_flags)
3349 		dev_set_rx_mode(dev);
3350 	return err;
3351 }
3352 
3353 /**
3354  *	dev_set_allmulti	- update allmulti count on a device
3355  *	@dev: device
3356  *	@inc: modifier
3357  *
3358  *	Add or remove reception of all multicast frames to a device. While the
3359  *	count in the device remains above zero the interface remains listening
3360  *	to all interfaces. Once it hits zero the device reverts back to normal
3361  *	filtering operation. A negative @inc value is used to drop the counter
3362  *	when releasing a resource needing all multicasts.
3363  *	Return 0 if successful or a negative errno code on error.
3364  */
3365 
3366 int dev_set_allmulti(struct net_device *dev, int inc)
3367 {
3368 	unsigned short old_flags = dev->flags;
3369 
3370 	ASSERT_RTNL();
3371 
3372 	dev->flags |= IFF_ALLMULTI;
3373 	dev->allmulti += inc;
3374 	if (dev->allmulti == 0) {
3375 		/*
3376 		 * Avoid overflow.
		 * If inc causes overflow, leave allmulti untouched and return an error.
3378 		 */
3379 		if (inc < 0)
3380 			dev->flags &= ~IFF_ALLMULTI;
3381 		else {
3382 			dev->allmulti -= inc;
3383 			printk(KERN_WARNING "%s: allmulti touches roof, "
3384 				"set allmulti failed, allmulti feature of "
3385 				"device might be broken.\n", dev->name);
3386 			return -EOVERFLOW;
3387 		}
3388 	}
3389 	if (dev->flags ^ old_flags) {
3390 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3391 		dev_set_rx_mode(dev);
3392 	}
3393 	return 0;
3394 }
3395 
3396 /*
3397  *	Upload unicast and multicast address lists to device and
3398  *	configure RX filtering. When the device doesn't support unicast
3399  *	filtering it is put in promiscuous mode while unicast addresses
3400  *	are present.
3401  */
3402 void __dev_set_rx_mode(struct net_device *dev)
3403 {
3404 	const struct net_device_ops *ops = dev->netdev_ops;
3405 
3406 	/* dev_open will call this function so the list will stay sane. */
3407 	if (!(dev->flags&IFF_UP))
3408 		return;
3409 
3410 	if (!netif_device_present(dev))
3411 		return;
3412 
3413 	if (ops->ndo_set_rx_mode)
3414 		ops->ndo_set_rx_mode(dev);
3415 	else {
		/* Unicast address changes may only happen under the rtnl,
3417 		 * therefore calling __dev_set_promiscuity here is safe.
3418 		 */
3419 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3420 			__dev_set_promiscuity(dev, 1);
3421 			dev->uc_promisc = 1;
3422 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3423 			__dev_set_promiscuity(dev, -1);
3424 			dev->uc_promisc = 0;
3425 		}
3426 
3427 		if (ops->ndo_set_multicast_list)
3428 			ops->ndo_set_multicast_list(dev);
3429 	}
3430 }
3431 
3432 void dev_set_rx_mode(struct net_device *dev)
3433 {
3434 	netif_addr_lock_bh(dev);
3435 	__dev_set_rx_mode(dev);
3436 	netif_addr_unlock_bh(dev);
3437 }
3438 
3439 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3440 		      void *addr, int alen, int glbl)
3441 {
3442 	struct dev_addr_list *da;
3443 
3444 	for (; (da = *list) != NULL; list = &da->next) {
3445 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3446 		    alen == da->da_addrlen) {
3447 			if (glbl) {
3448 				int old_glbl = da->da_gusers;
3449 				da->da_gusers = 0;
3450 				if (old_glbl == 0)
3451 					break;
3452 			}
3453 			if (--da->da_users)
3454 				return 0;
3455 
3456 			*list = da->next;
3457 			kfree(da);
3458 			(*count)--;
3459 			return 0;
3460 		}
3461 	}
3462 	return -ENOENT;
3463 }
3464 
3465 int __dev_addr_add(struct dev_addr_list **list, int *count,
3466 		   void *addr, int alen, int glbl)
3467 {
3468 	struct dev_addr_list *da;
3469 
3470 	for (da = *list; da != NULL; da = da->next) {
3471 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3472 		    da->da_addrlen == alen) {
3473 			if (glbl) {
3474 				int old_glbl = da->da_gusers;
3475 				da->da_gusers = 1;
3476 				if (old_glbl)
3477 					return 0;
3478 			}
3479 			da->da_users++;
3480 			return 0;
3481 		}
3482 	}
3483 
3484 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3485 	if (da == NULL)
3486 		return -ENOMEM;
3487 	memcpy(da->da_addr, addr, alen);
3488 	da->da_addrlen = alen;
3489 	da->da_users = 1;
3490 	da->da_gusers = glbl ? 1 : 0;
3491 	da->next = *list;
3492 	*list = da;
3493 	(*count)++;
3494 	return 0;
3495 }
3496 
3497 /**
3498  *	dev_unicast_delete	- Release secondary unicast address.
3499  *	@dev: device
3500  *	@addr: address to delete
3501  *	@alen: length of @addr
3502  *
3503  *	Release reference to a secondary unicast address and remove it
3504  *	from the device if the reference count drops to zero.
3505  *
3506  * 	The caller must hold the rtnl_mutex.
3507  */
3508 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3509 {
3510 	int err;
3511 
3512 	ASSERT_RTNL();
3513 
3514 	netif_addr_lock_bh(dev);
3515 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3516 	if (!err)
3517 		__dev_set_rx_mode(dev);
3518 	netif_addr_unlock_bh(dev);
3519 	return err;
3520 }
3521 EXPORT_SYMBOL(dev_unicast_delete);
3522 
3523 /**
3524  *	dev_unicast_add		- add a secondary unicast address
3525  *	@dev: device
3526  *	@addr: address to add
3527  *	@alen: length of @addr
3528  *
3529  *	Add a secondary unicast address to the device or increase
3530  *	the reference count if it already exists.
3531  *
3532  *	The caller must hold the rtnl_mutex.
3533  */
3534 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3535 {
3536 	int err;
3537 
3538 	ASSERT_RTNL();
3539 
3540 	netif_addr_lock_bh(dev);
3541 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3542 	if (!err)
3543 		__dev_set_rx_mode(dev);
3544 	netif_addr_unlock_bh(dev);
3545 	return err;
3546 }
3547 EXPORT_SYMBOL(dev_unicast_add);
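
/*
 * Sketch of taking and releasing a secondary unicast address reference,
 * e.g. for an extra filter entry.  The address bytes are invented and both
 * calls run under rtnl_lock() as the comments above require.
 */
static inline int example_add_secondary_mac(struct net_device *dev)
{
	/* Locally administered, purely hypothetical address. */
	u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, addr, ETH_ALEN);
	if (err)
		goto out;

	/* ... use the address; later, drop the reference again ... */
	err = dev_unicast_delete(dev, addr, ETH_ALEN);
out:
	rtnl_unlock();
	return err;
}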
3548 
3549 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3550 		    struct dev_addr_list **from, int *from_count)
3551 {
3552 	struct dev_addr_list *da, *next;
3553 	int err = 0;
3554 
3555 	da = *from;
3556 	while (da != NULL) {
3557 		next = da->next;
3558 		if (!da->da_synced) {
3559 			err = __dev_addr_add(to, to_count,
3560 					     da->da_addr, da->da_addrlen, 0);
3561 			if (err < 0)
3562 				break;
3563 			da->da_synced = 1;
3564 			da->da_users++;
3565 		} else if (da->da_users == 1) {
3566 			__dev_addr_delete(to, to_count,
3567 					  da->da_addr, da->da_addrlen, 0);
3568 			__dev_addr_delete(from, from_count,
3569 					  da->da_addr, da->da_addrlen, 0);
3570 		}
3571 		da = next;
3572 	}
3573 	return err;
3574 }
3575 
3576 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3577 		       struct dev_addr_list **from, int *from_count)
3578 {
3579 	struct dev_addr_list *da, *next;
3580 
3581 	da = *from;
3582 	while (da != NULL) {
3583 		next = da->next;
3584 		if (da->da_synced) {
3585 			__dev_addr_delete(to, to_count,
3586 					  da->da_addr, da->da_addrlen, 0);
3587 			da->da_synced = 0;
3588 			__dev_addr_delete(from, from_count,
3589 					  da->da_addr, da->da_addrlen, 0);
3590 		}
3591 		da = next;
3592 	}
3593 }
3594 
3595 /**
3596  *	dev_unicast_sync - Synchronize device's unicast list to another device
3597  *	@to: destination device
3598  *	@from: source device
3599  *
3600  *	Add newly added addresses to the destination device and release
3601  *	addresses that have no users left. The source device must be
3602  *	locked by netif_tx_lock_bh.
3603  *
3604  *	This function is intended to be called from the dev->set_rx_mode
3605  *	function of layered software devices.
3606  */
3607 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3608 {
3609 	int err = 0;
3610 
3611 	netif_addr_lock_bh(to);
3612 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3613 			      &from->uc_list, &from->uc_count);
3614 	if (!err)
3615 		__dev_set_rx_mode(to);
3616 	netif_addr_unlock_bh(to);
3617 	return err;
3618 }
3619 EXPORT_SYMBOL(dev_unicast_sync);
3620 
3621 /**
3622  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3623  *	@to: destination device
3624  *	@from: source device
3625  *
3626  *	Remove all addresses that were added to the destination device by
3627  *	dev_unicast_sync(). This function is intended to be called from the
3628  *	dev->stop function of layered software devices.
3629  */
3630 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3631 {
3632 	netif_addr_lock_bh(from);
3633 	netif_addr_lock(to);
3634 
3635 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3636 			  &from->uc_list, &from->uc_count);
3637 	__dev_set_rx_mode(to);
3638 
3639 	netif_addr_unlock(to);
3640 	netif_addr_unlock_bh(from);
3641 }
3642 EXPORT_SYMBOL(dev_unicast_unsync);
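
/*
 * Sketch of how a stacked device keeps a lower device's unicast filter in
 * sync, in the spirit of the VLAN and macvlan drivers.  The upper/lower
 * naming and the hook placement are illustrative.
 */
static inline void example_upper_set_rx_mode(struct net_device *upper,
					     struct net_device *lower)
{
	/* From the upper device's set_rx_mode: push new addresses down. */
	dev_unicast_sync(lower, upper);
}

static inline void example_upper_stop(struct net_device *upper,
				      struct net_device *lower)
{
	/* From the upper device's stop path: remove what we synced above. */
	dev_unicast_unsync(lower, upper);
}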
3643 
3644 static void __dev_addr_discard(struct dev_addr_list **list)
3645 {
3646 	struct dev_addr_list *tmp;
3647 
3648 	while (*list != NULL) {
3649 		tmp = *list;
3650 		*list = tmp->next;
3651 		if (tmp->da_users > tmp->da_gusers)
3652 			printk("__dev_addr_discard: address leakage! "
3653 			       "da_users=%d\n", tmp->da_users);
3654 		kfree(tmp);
3655 	}
3656 }
3657 
3658 static void dev_addr_discard(struct net_device *dev)
3659 {
3660 	netif_addr_lock_bh(dev);
3661 
3662 	__dev_addr_discard(&dev->uc_list);
3663 	dev->uc_count = 0;
3664 
3665 	__dev_addr_discard(&dev->mc_list);
3666 	dev->mc_count = 0;
3667 
3668 	netif_addr_unlock_bh(dev);
3669 }
3670 
3671 /**
3672  *	dev_get_flags - get flags reported to userspace
3673  *	@dev: device
3674  *
3675  *	Get the combination of flag bits exported through APIs to userspace.
3676  */
3677 unsigned dev_get_flags(const struct net_device *dev)
3678 {
3679 	unsigned flags;
3680 
3681 	flags = (dev->flags & ~(IFF_PROMISC |
3682 				IFF_ALLMULTI |
3683 				IFF_RUNNING |
3684 				IFF_LOWER_UP |
3685 				IFF_DORMANT)) |
3686 		(dev->gflags & (IFF_PROMISC |
3687 				IFF_ALLMULTI));
3688 
3689 	if (netif_running(dev)) {
3690 		if (netif_oper_up(dev))
3691 			flags |= IFF_RUNNING;
3692 		if (netif_carrier_ok(dev))
3693 			flags |= IFF_LOWER_UP;
3694 		if (netif_dormant(dev))
3695 			flags |= IFF_DORMANT;
3696 	}
3697 
3698 	return flags;
3699 }
3700 
3701 /**
3702  *	dev_change_flags - change device settings
3703  *	@dev: device
3704  *	@flags: device state flags
3705  *
3706  *	Change settings on device based state flags. The flags are
3707  *	in the userspace exported format.
3708  */
3709 int dev_change_flags(struct net_device *dev, unsigned flags)
3710 {
3711 	int ret, changes;
3712 	int old_flags = dev->flags;
3713 
3714 	ASSERT_RTNL();
3715 
3716 	/*
3717 	 *	Set the flags on our device.
3718 	 */
3719 
3720 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3721 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3722 			       IFF_AUTOMEDIA)) |
3723 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3724 				    IFF_ALLMULTI));
3725 
3726 	/*
3727 	 *	Load in the correct multicast list now the flags have changed.
3728 	 */
3729 
3730 	if ((old_flags ^ flags) & IFF_MULTICAST)
3731 		dev_change_rx_flags(dev, IFF_MULTICAST);
3732 
3733 	dev_set_rx_mode(dev);
3734 
3735 	/*
3736 	 *	Have we downed the interface. We handle IFF_UP ourselves
3737 	 *	according to user attempts to set it, rather than blindly
3738 	 *	setting it.
3739 	 */
3740 
3741 	ret = 0;
3742 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3743 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3744 
3745 		if (!ret)
3746 			dev_set_rx_mode(dev);
3747 	}
3748 
3749 	if (dev->flags & IFF_UP &&
3750 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3751 					  IFF_VOLATILE)))
3752 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3753 
3754 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3755 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3756 		dev->gflags ^= IFF_PROMISC;
3757 		dev_set_promiscuity(dev, inc);
3758 	}
3759 
	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC when
	   IFF_ALLMULTI is requested, without asking us and without reporting.
	 */
3764 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3765 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3766 		dev->gflags ^= IFF_ALLMULTI;
3767 		dev_set_allmulti(dev, inc);
3768 	}
3769 
3770 	/* Exclude state transition flags, already notified */
3771 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3772 	if (changes)
3773 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3774 
3775 	return ret;
3776 }
3777 
3778 /**
3779  *	dev_set_mtu - Change maximum transfer unit
3780  *	@dev: device
3781  *	@new_mtu: new transfer unit
3782  *
3783  *	Change the maximum transfer size of the network device.
3784  */
3785 int dev_set_mtu(struct net_device *dev, int new_mtu)
3786 {
3787 	const struct net_device_ops *ops = dev->netdev_ops;
3788 	int err;
3789 
3790 	if (new_mtu == dev->mtu)
3791 		return 0;
3792 
3793 	/*	MTU must be positive.	 */
3794 	if (new_mtu < 0)
3795 		return -EINVAL;
3796 
3797 	if (!netif_device_present(dev))
3798 		return -ENODEV;
3799 
3800 	err = 0;
3801 	if (ops->ndo_change_mtu)
3802 		err = ops->ndo_change_mtu(dev, new_mtu);
3803 	else
3804 		dev->mtu = new_mtu;
3805 
3806 	if (!err && dev->flags & IFF_UP)
3807 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3808 	return err;
3809 }
3810 
3811 /**
3812  *	dev_set_mac_address - Change Media Access Control Address
3813  *	@dev: device
3814  *	@sa: new address
3815  *
3816  *	Change the hardware (MAC) address of the device
3817  */
3818 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3819 {
3820 	const struct net_device_ops *ops = dev->netdev_ops;
3821 	int err;
3822 
3823 	if (!ops->ndo_set_mac_address)
3824 		return -EOPNOTSUPP;
3825 	if (sa->sa_family != dev->type)
3826 		return -EINVAL;
3827 	if (!netif_device_present(dev))
3828 		return -ENODEV;
3829 	err = ops->ndo_set_mac_address(dev, sa);
3830 	if (!err)
3831 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3832 	return err;
3833 }
3834 
3835 /*
3836  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3837  */
3838 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3839 {
3840 	int err;
3841 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3842 
3843 	if (!dev)
3844 		return -ENODEV;
3845 
3846 	switch (cmd) {
3847 		case SIOCGIFFLAGS:	/* Get interface flags */
3848 			ifr->ifr_flags = dev_get_flags(dev);
3849 			return 0;
3850 
3851 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3852 					   (currently unused) */
3853 			ifr->ifr_metric = 0;
3854 			return 0;
3855 
3856 		case SIOCGIFMTU:	/* Get the MTU of a device */
3857 			ifr->ifr_mtu = dev->mtu;
3858 			return 0;
3859 
3860 		case SIOCGIFHWADDR:
3861 			if (!dev->addr_len)
3862 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3863 			else
3864 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3865 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3866 			ifr->ifr_hwaddr.sa_family = dev->type;
3867 			return 0;
3868 
3869 		case SIOCGIFSLAVE:
3870 			err = -EINVAL;
3871 			break;
3872 
3873 		case SIOCGIFMAP:
3874 			ifr->ifr_map.mem_start = dev->mem_start;
3875 			ifr->ifr_map.mem_end   = dev->mem_end;
3876 			ifr->ifr_map.base_addr = dev->base_addr;
3877 			ifr->ifr_map.irq       = dev->irq;
3878 			ifr->ifr_map.dma       = dev->dma;
3879 			ifr->ifr_map.port      = dev->if_port;
3880 			return 0;
3881 
3882 		case SIOCGIFINDEX:
3883 			ifr->ifr_ifindex = dev->ifindex;
3884 			return 0;
3885 
3886 		case SIOCGIFTXQLEN:
3887 			ifr->ifr_qlen = dev->tx_queue_len;
3888 			return 0;
3889 
3890 		default:
3891 			/* dev_ioctl() should ensure this case
3892 			 * is never reached
3893 			 */
3894 			WARN_ON(1);
3895 			err = -EINVAL;
3896 			break;
3897 
3898 	}
3899 	return err;
3900 }
3901 
3902 /*
3903  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3904  */
3905 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3906 {
3907 	int err;
3908 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3909 	const struct net_device_ops *ops;
3910 
3911 	if (!dev)
3912 		return -ENODEV;
3913 
3914 	ops = dev->netdev_ops;
3915 
3916 	switch (cmd) {
3917 		case SIOCSIFFLAGS:	/* Set interface flags */
3918 			return dev_change_flags(dev, ifr->ifr_flags);
3919 
3920 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3921 					   (currently unused) */
3922 			return -EOPNOTSUPP;
3923 
3924 		case SIOCSIFMTU:	/* Set the MTU of a device */
3925 			return dev_set_mtu(dev, ifr->ifr_mtu);
3926 
3927 		case SIOCSIFHWADDR:
3928 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3929 
3930 		case SIOCSIFHWBROADCAST:
3931 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3932 				return -EINVAL;
3933 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3934 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3935 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3936 			return 0;
3937 
3938 		case SIOCSIFMAP:
3939 			if (ops->ndo_set_config) {
3940 				if (!netif_device_present(dev))
3941 					return -ENODEV;
3942 				return ops->ndo_set_config(dev, &ifr->ifr_map);
3943 			}
3944 			return -EOPNOTSUPP;
3945 
3946 		case SIOCADDMULTI:
3947 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3948 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3949 				return -EINVAL;
3950 			if (!netif_device_present(dev))
3951 				return -ENODEV;
3952 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3953 					  dev->addr_len, 1);
3954 
3955 		case SIOCDELMULTI:
3956 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3957 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3958 				return -EINVAL;
3959 			if (!netif_device_present(dev))
3960 				return -ENODEV;
3961 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3962 					     dev->addr_len, 1);
3963 
3964 		case SIOCSIFTXQLEN:
3965 			if (ifr->ifr_qlen < 0)
3966 				return -EINVAL;
3967 			dev->tx_queue_len = ifr->ifr_qlen;
3968 			return 0;
3969 
3970 		case SIOCSIFNAME:
3971 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3972 			return dev_change_name(dev, ifr->ifr_newname);
3973 
3974 		/*
3975 		 *	Unknown or private ioctl
3976 		 */
3977 
3978 		default:
3979 			if ((cmd >= SIOCDEVPRIVATE &&
3980 			    cmd <= SIOCDEVPRIVATE + 15) ||
3981 			    cmd == SIOCBONDENSLAVE ||
3982 			    cmd == SIOCBONDRELEASE ||
3983 			    cmd == SIOCBONDSETHWADDR ||
3984 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3985 			    cmd == SIOCBONDINFOQUERY ||
3986 			    cmd == SIOCBONDCHANGEACTIVE ||
3987 			    cmd == SIOCGMIIPHY ||
3988 			    cmd == SIOCGMIIREG ||
3989 			    cmd == SIOCSMIIREG ||
3990 			    cmd == SIOCBRADDIF ||
3991 			    cmd == SIOCBRDELIF ||
3992 			    cmd == SIOCSHWTSTAMP ||
3993 			    cmd == SIOCWANDEV) {
3994 				err = -EOPNOTSUPP;
3995 				if (ops->ndo_do_ioctl) {
3996 					if (netif_device_present(dev))
3997 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3998 					else
3999 						err = -ENODEV;
4000 				}
4001 			} else
4002 				err = -EINVAL;
4003 
4004 	}
4005 	return err;
4006 }
4007 
4008 /*
4009  *	This function handles all "interface"-type I/O control requests. The actual
4010  *	'doing' part of this is dev_ifsioc above.
4011  */
4012 
4013 /**
4014  *	dev_ioctl	-	network device ioctl
4015  *	@net: the applicable net namespace
4016  *	@cmd: command to issue
4017  *	@arg: pointer to a struct ifreq in user space
4018  *
4019  *	Issue ioctl functions to devices. This is normally called by the
4020  *	user space syscall interfaces but can sometimes be useful for
4021  *	other purposes. The return value is the return from the syscall if
4022  *	positive or a negative errno code on error.
4023  */
4024 
4025 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4026 {
4027 	struct ifreq ifr;
4028 	int ret;
4029 	char *colon;
4030 
4031 	/* One special case: SIOCGIFCONF takes an ifconf argument
4032 	   and requires a shared lock, because it sleeps writing
4033 	   to user space.
4034 	 */
4035 
4036 	if (cmd == SIOCGIFCONF) {
4037 		rtnl_lock();
4038 		ret = dev_ifconf(net, (char __user *) arg);
4039 		rtnl_unlock();
4040 		return ret;
4041 	}
4042 	if (cmd == SIOCGIFNAME)
4043 		return dev_ifname(net, (struct ifreq __user *)arg);
4044 
4045 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4046 		return -EFAULT;
4047 
4048 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4049 
4050 	colon = strchr(ifr.ifr_name, ':');
4051 	if (colon)
4052 		*colon = 0;
4053 
4054 	/*
4055 	 *	See which interface the caller is talking about.
4056 	 */
4057 
4058 	switch (cmd) {
4059 		/*
4060 		 *	These ioctl calls:
4061 		 *	- can be done by all.
4062 		 *	- atomic and do not require locking.
4063 		 *	- return a value
4064 		 */
4065 		case SIOCGIFFLAGS:
4066 		case SIOCGIFMETRIC:
4067 		case SIOCGIFMTU:
4068 		case SIOCGIFHWADDR:
4069 		case SIOCGIFSLAVE:
4070 		case SIOCGIFMAP:
4071 		case SIOCGIFINDEX:
4072 		case SIOCGIFTXQLEN:
4073 			dev_load(net, ifr.ifr_name);
4074 			read_lock(&dev_base_lock);
4075 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4076 			read_unlock(&dev_base_lock);
4077 			if (!ret) {
4078 				if (colon)
4079 					*colon = ':';
4080 				if (copy_to_user(arg, &ifr,
4081 						 sizeof(struct ifreq)))
4082 					ret = -EFAULT;
4083 			}
4084 			return ret;
4085 
4086 		case SIOCETHTOOL:
4087 			dev_load(net, ifr.ifr_name);
4088 			rtnl_lock();
4089 			ret = dev_ethtool(net, &ifr);
4090 			rtnl_unlock();
4091 			if (!ret) {
4092 				if (colon)
4093 					*colon = ':';
4094 				if (copy_to_user(arg, &ifr,
4095 						 sizeof(struct ifreq)))
4096 					ret = -EFAULT;
4097 			}
4098 			return ret;
4099 
4100 		/*
4101 		 *	These ioctl calls:
4102 		 *	- require superuser power.
4103 		 *	- require strict serialization.
4104 		 *	- return a value
4105 		 */
4106 		case SIOCGMIIPHY:
4107 		case SIOCGMIIREG:
4108 		case SIOCSIFNAME:
4109 			if (!capable(CAP_NET_ADMIN))
4110 				return -EPERM;
4111 			dev_load(net, ifr.ifr_name);
4112 			rtnl_lock();
4113 			ret = dev_ifsioc(net, &ifr, cmd);
4114 			rtnl_unlock();
4115 			if (!ret) {
4116 				if (colon)
4117 					*colon = ':';
4118 				if (copy_to_user(arg, &ifr,
4119 						 sizeof(struct ifreq)))
4120 					ret = -EFAULT;
4121 			}
4122 			return ret;
4123 
4124 		/*
4125 		 *	These ioctl calls:
4126 		 *	- require superuser power.
4127 		 *	- require strict serialization.
4128 		 *	- do not return a value
4129 		 */
4130 		case SIOCSIFFLAGS:
4131 		case SIOCSIFMETRIC:
4132 		case SIOCSIFMTU:
4133 		case SIOCSIFMAP:
4134 		case SIOCSIFHWADDR:
4135 		case SIOCSIFSLAVE:
4136 		case SIOCADDMULTI:
4137 		case SIOCDELMULTI:
4138 		case SIOCSIFHWBROADCAST:
4139 		case SIOCSIFTXQLEN:
4140 		case SIOCSMIIREG:
4141 		case SIOCBONDENSLAVE:
4142 		case SIOCBONDRELEASE:
4143 		case SIOCBONDSETHWADDR:
4144 		case SIOCBONDCHANGEACTIVE:
4145 		case SIOCBRADDIF:
4146 		case SIOCBRDELIF:
4147 		case SIOCSHWTSTAMP:
4148 			if (!capable(CAP_NET_ADMIN))
4149 				return -EPERM;
4150 			/* fall through */
4151 		case SIOCBONDSLAVEINFOQUERY:
4152 		case SIOCBONDINFOQUERY:
4153 			dev_load(net, ifr.ifr_name);
4154 			rtnl_lock();
4155 			ret = dev_ifsioc(net, &ifr, cmd);
4156 			rtnl_unlock();
4157 			return ret;
4158 
4159 		case SIOCGIFMEM:
4160 			/* Get the per device memory space. We can add this but
4161 			 * currently do not support it */
4162 		case SIOCSIFMEM:
4163 			/* Set the per device memory buffer space.
4164 			 * Not applicable in our case */
4165 		case SIOCSIFLINK:
4166 			return -EINVAL;
4167 
4168 		/*
4169 		 *	Unknown or private ioctl.
4170 		 */
4171 		default:
4172 			if (cmd == SIOCWANDEV ||
4173 			    (cmd >= SIOCDEVPRIVATE &&
4174 			     cmd <= SIOCDEVPRIVATE + 15)) {
4175 				dev_load(net, ifr.ifr_name);
4176 				rtnl_lock();
4177 				ret = dev_ifsioc(net, &ifr, cmd);
4178 				rtnl_unlock();
4179 				if (!ret && copy_to_user(arg, &ifr,
4180 							 sizeof(struct ifreq)))
4181 					ret = -EFAULT;
4182 				return ret;
4183 			}
4184 			/* Take care of Wireless Extensions */
4185 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4186 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4187 			return -EINVAL;
4188 	}
4189 }
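
/*
 * Illustrative sketch of the user-space side of this interface (comment
 * only, not built here): reading an interface's MTU through SIOCGIFMTU on
 * an ordinary socket, which arrives at dev_ioctl()/dev_ifsioc_locked()
 * above.  Error handling is omitted for brevity.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("eth0 mtu is %d\n", ifr.ifr_mtu);
 *	close(fd);
 */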
4190 
4191 
4192 /**
4193  *	dev_new_index	-	allocate an ifindex
4194  *	@net: the applicable net namespace
4195  *
4196  *	Returns a suitable unique value for a new device interface
4197  *	number.  The caller must hold the rtnl semaphore or the
4198  *	dev_base_lock to be sure it remains unique.
4199  */
4200 static int dev_new_index(struct net *net)
4201 {
4202 	static int ifindex;
4203 	for (;;) {
4204 		if (++ifindex <= 0)
4205 			ifindex = 1;
4206 		if (!__dev_get_by_index(net, ifindex))
4207 			return ifindex;
4208 	}
4209 }
4210 
4211 /* Delayed registration/unregistration */
4212 static LIST_HEAD(net_todo_list);
4213 
4214 static void net_set_todo(struct net_device *dev)
4215 {
4216 	list_add_tail(&dev->todo_list, &net_todo_list);
4217 }
4218 
4219 static void rollback_registered(struct net_device *dev)
4220 {
4221 	BUG_ON(dev_boot_phase);
4222 	ASSERT_RTNL();
4223 
4224 	/* Some devices call this without having registered, to unwind a failed initialization. */
4225 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4226 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4227 				  "was registered\n", dev->name, dev);
4228 
4229 		WARN_ON(1);
4230 		return;
4231 	}
4232 
4233 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4234 
4235 	/* If device is running, close it first. */
4236 	dev_close(dev);
4237 
4238 	/* And unlink it from device chain. */
4239 	unlist_netdevice(dev);
4240 
4241 	dev->reg_state = NETREG_UNREGISTERING;
4242 
4243 	synchronize_net();
4244 
4245 	/* Shutdown queueing discipline. */
4246 	dev_shutdown(dev);
4247 
4248 
4249 	/* Notify protocols, that we are about to destroy
4250 	   this device. They should clean all the things.
4251 	*/
4252 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4253 
4254 	/*
4255 	 *	Flush the unicast and multicast chains
4256 	 */
4257 	dev_addr_discard(dev);
4258 
4259 	if (dev->netdev_ops->ndo_uninit)
4260 		dev->netdev_ops->ndo_uninit(dev);
4261 
4262 	/* Notifier chain MUST detach us from master device. */
4263 	WARN_ON(dev->master);
4264 
4265 	/* Remove entries from kobject tree */
4266 	netdev_unregister_kobject(dev);
4267 
4268 	synchronize_net();
4269 
4270 	dev_put(dev);
4271 }
4272 
4273 static void __netdev_init_queue_locks_one(struct net_device *dev,
4274 					  struct netdev_queue *dev_queue,
4275 					  void *_unused)
4276 {
4277 	spin_lock_init(&dev_queue->_xmit_lock);
4278 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4279 	dev_queue->xmit_lock_owner = -1;
4280 }
4281 
4282 static void netdev_init_queue_locks(struct net_device *dev)
4283 {
4284 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4285 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4286 }
4287 
4288 unsigned long netdev_fix_features(unsigned long features, const char *name)
4289 {
4290 	/* Fix illegal SG+CSUM combinations. */
4291 	if ((features & NETIF_F_SG) &&
4292 	    !(features & NETIF_F_ALL_CSUM)) {
4293 		if (name)
4294 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4295 			       "checksum feature.\n", name);
4296 		features &= ~NETIF_F_SG;
4297 	}
4298 
4299 	/* TSO requires that SG is present as well. */
4300 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4301 		if (name)
4302 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4303 			       "SG feature.\n", name);
4304 		features &= ~NETIF_F_TSO;
4305 	}
4306 
4307 	if (features & NETIF_F_UFO) {
4308 		if (!(features & NETIF_F_GEN_CSUM)) {
4309 			if (name)
4310 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4311 				       "since no NETIF_F_HW_CSUM feature.\n",
4312 				       name);
4313 			features &= ~NETIF_F_UFO;
4314 		}
4315 
4316 		if (!(features & NETIF_F_SG)) {
4317 			if (name)
4318 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4319 				       "since no NETIF_F_SG feature.\n", name);
4320 			features &= ~NETIF_F_UFO;
4321 		}
4322 	}
4323 
4324 	return features;
4325 }
4326 EXPORT_SYMBOL(netdev_fix_features);
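
/*
 * Illustrative sketch (comment only, not built here): a caller assembling a
 * feature set can pass it through netdev_fix_features() to drop invalid
 * combinations.  Here NETIF_F_SG is dropped because no checksum feature was
 * requested, and NETIF_F_TSO is then dropped because SG is gone.  Passing
 * NULL instead of dev->name suppresses the notices.
 *
 *	unsigned long features = NETIF_F_SG | NETIF_F_TSO;
 *
 *	features = netdev_fix_features(features, dev->name);
 *	(features is now 0)
 */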
4327 
4328 /* Some devices need to (re-)set their netdev_ops inside
4329  * ->init() or similar.  If that happens, we have to setup
4330  * the compat pointers again.
4331  */
4332 void netdev_resync_ops(struct net_device *dev)
4333 {
4334 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4335 	const struct net_device_ops *ops = dev->netdev_ops;
4336 
4337 	dev->init = ops->ndo_init;
4338 	dev->uninit = ops->ndo_uninit;
4339 	dev->open = ops->ndo_open;
4340 	dev->change_rx_flags = ops->ndo_change_rx_flags;
4341 	dev->set_rx_mode = ops->ndo_set_rx_mode;
4342 	dev->set_multicast_list = ops->ndo_set_multicast_list;
4343 	dev->set_mac_address = ops->ndo_set_mac_address;
4344 	dev->validate_addr = ops->ndo_validate_addr;
4345 	dev->do_ioctl = ops->ndo_do_ioctl;
4346 	dev->set_config = ops->ndo_set_config;
4347 	dev->change_mtu = ops->ndo_change_mtu;
4348 	dev->neigh_setup = ops->ndo_neigh_setup;
4349 	dev->tx_timeout = ops->ndo_tx_timeout;
4350 	dev->get_stats = ops->ndo_get_stats;
4351 	dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4352 	dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4353 	dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4354 #ifdef CONFIG_NET_POLL_CONTROLLER
4355 	dev->poll_controller = ops->ndo_poll_controller;
4356 #endif
4357 #endif
4358 }
4359 EXPORT_SYMBOL(netdev_resync_ops);
4360 
4361 /**
4362  *	register_netdevice	- register a network device
4363  *	@dev: device to register
4364  *
4365  *	Take a completed network device structure and add it to the kernel
4366  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4367  *	chain. 0 is returned on success. A negative errno code is returned
4368  *	on a failure to set up the device, or if the name is a duplicate.
4369  *
4370  *	Callers must hold the rtnl semaphore. You may want
4371  *	register_netdev() instead of this.
4372  *
4373  *	BUGS:
4374  *	The locking appears insufficient to guarantee two parallel registers
4375  *	will not get the same name.
4376  */
4377 
4378 int register_netdevice(struct net_device *dev)
4379 {
4380 	struct hlist_head *head;
4381 	struct hlist_node *p;
4382 	int ret;
4383 	struct net *net = dev_net(dev);
4384 
4385 	BUG_ON(dev_boot_phase);
4386 	ASSERT_RTNL();
4387 
4388 	might_sleep();
4389 
4390 	/* When net_device's are persistent, this will be fatal. */
4391 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4392 	BUG_ON(!net);
4393 
4394 	spin_lock_init(&dev->addr_list_lock);
4395 	netdev_set_addr_lockdep_class(dev);
4396 	netdev_init_queue_locks(dev);
4397 
4398 	dev->iflink = -1;
4399 
4400 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4401 	/* Netdevice_ops API compatibility support.
4402 	 * This is temporary until all network devices are converted.
4403 	 */
4404 	if (dev->netdev_ops) {
4405 		netdev_resync_ops(dev);
4406 	} else {
4407 		char drivername[64];
4408 		pr_info("%s (%s): not using net_device_ops yet\n",
4409 			dev->name, netdev_drivername(dev, drivername, 64));
4410 
4411 		/* This works only because net_device_ops and the
4412 		   compatibility structure have the same layout. */
4413 		dev->netdev_ops = (void *) &(dev->init);
4414 	}
4415 #endif
4416 
4417 	/* Init, if this function is available */
4418 	if (dev->netdev_ops->ndo_init) {
4419 		ret = dev->netdev_ops->ndo_init(dev);
4420 		if (ret) {
4421 			if (ret > 0)
4422 				ret = -EIO;
4423 			goto out;
4424 		}
4425 	}
4426 
4427 	if (!dev_valid_name(dev->name)) {
4428 		ret = -EINVAL;
4429 		goto err_uninit;
4430 	}
4431 
4432 	dev->ifindex = dev_new_index(net);
4433 	if (dev->iflink == -1)
4434 		dev->iflink = dev->ifindex;
4435 
4436 	/* Check for existence of name */
4437 	head = dev_name_hash(net, dev->name);
4438 	hlist_for_each(p, head) {
4439 		struct net_device *d
4440 			= hlist_entry(p, struct net_device, name_hlist);
4441 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4442 			ret = -EEXIST;
4443 			goto err_uninit;
4444 		}
4445 	}
4446 
4447 	/* Fix illegal checksum combinations */
4448 	if ((dev->features & NETIF_F_HW_CSUM) &&
4449 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4450 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4451 		       dev->name);
4452 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4453 	}
4454 
4455 	if ((dev->features & NETIF_F_NO_CSUM) &&
4456 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4457 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4458 		       dev->name);
4459 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4460 	}
4461 
4462 	dev->features = netdev_fix_features(dev->features, dev->name);
4463 
4464 	/* Enable software GSO if SG is supported. */
4465 	if (dev->features & NETIF_F_SG)
4466 		dev->features |= NETIF_F_GSO;
4467 
4468 	netdev_initialize_kobject(dev);
4469 	ret = netdev_register_kobject(dev);
4470 	if (ret)
4471 		goto err_uninit;
4472 	dev->reg_state = NETREG_REGISTERED;
4473 
4474 	/*
4475 	 *	Default initial state at registration is that the
4476 	 *	device is present.
4477 	 */
4478 
4479 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4480 
4481 	dev_init_scheduler(dev);
4482 	dev_hold(dev);
4483 	list_netdevice(dev);
4484 
4485 	/* Notify protocols, that a new device appeared. */
4486 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4487 	ret = notifier_to_errno(ret);
4488 	if (ret) {
4489 		rollback_registered(dev);
4490 		dev->reg_state = NETREG_UNREGISTERED;
4491 	}
4492 
4493 out:
4494 	return ret;
4495 
4496 err_uninit:
4497 	if (dev->netdev_ops->ndo_uninit)
4498 		dev->netdev_ops->ndo_uninit(dev);
4499 	goto out;
4500 }
4501 
4502 /**
4503  *	init_dummy_netdev	- init a dummy network device for NAPI
4504  *	@dev: device to init
4505  *
4506  *	This takes a network device structure and initializes the minimum
4507  *	number of fields so it can be used to schedule NAPI polls without
4508  *	registering a full blown interface. This is to be used by drivers
4509  *	that need to tie several hardware interfaces to a single NAPI
4510  *	poll scheduler due to HW limitations.
4511  */
4512 int init_dummy_netdev(struct net_device *dev)
4513 {
4514 	/* Clear everything. Note we don't initialize spinlocks
4515 	 * as they aren't supposed to be taken by any of the
4516 	 * NAPI code and this dummy netdev is supposed to be
4517 	 * only ever used for NAPI polls
4518 	 */
4519 	memset(dev, 0, sizeof(struct net_device));
4520 
4521 	/* make sure we BUG if trying to hit standard
4522 	 * register/unregister code path
4523 	 */
4524 	dev->reg_state = NETREG_DUMMY;
4525 
4526 	/* initialize the ref count */
4527 	atomic_set(&dev->refcnt, 1);
4528 
4529 	/* NAPI wants this */
4530 	INIT_LIST_HEAD(&dev->napi_list);
4531 
4532 	/* a dummy interface is started by default */
4533 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4534 	set_bit(__LINK_STATE_START, &dev->state);
4535 
4536 	return 0;
4537 }
4538 EXPORT_SYMBOL_GPL(init_dummy_netdev);
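
/*
 * Illustrative sketch (comment only, not built here): a driver whose
 * hardware exposes several functions behind one interrupt can attach its
 * NAPI context to a dummy netdev instead of a registered one.  The
 * hypothetical_* names are made up for illustration.
 *
 *	struct hypothetical_adapter {
 *		struct net_device napi_dev;	(dummy, never registered)
 *		struct napi_struct napi;
 *	};
 *
 *	static void hypothetical_setup_napi(struct hypothetical_adapter *ad)
 *	{
 *		init_dummy_netdev(&ad->napi_dev);
 *		netif_napi_add(&ad->napi_dev, &ad->napi,
 *			       hypothetical_poll, 64);
 *		napi_enable(&ad->napi);
 *	}
 */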
4539 
4540 
4541 /**
4542  *	register_netdev	- register a network device
4543  *	@dev: device to register
4544  *
4545  *	Take a completed network device structure and add it to the kernel
4546  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4547  *	chain. 0 is returned on success. A negative errno code is returned
4548  *	on a failure to set up the device, or if the name is a duplicate.
4549  *
4550  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4551  *	and expands the device name if you passed a format string to
4552  *	alloc_netdev.
4553  */
4554 int register_netdev(struct net_device *dev)
4555 {
4556 	int err;
4557 
4558 	rtnl_lock();
4559 
4560 	/*
4561 	 * If the name is a format string the caller wants us to do a
4562 	 * name allocation.
4563 	 */
4564 	if (strchr(dev->name, '%')) {
4565 		err = dev_alloc_name(dev, dev->name);
4566 		if (err < 0)
4567 			goto out;
4568 	}
4569 
4570 	err = register_netdevice(dev);
4571 out:
4572 	rtnl_unlock();
4573 	return err;
4574 }
4575 EXPORT_SYMBOL(register_netdev);
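
/*
 * Illustrative sketch (comment only, not built here): the usual probe-time
 * pattern around register_netdev(), with hypothetical_* names made up for
 * illustration.  On registration failure the device must be released with
 * free_netdev(), never kfree().
 *
 *	static int hypothetical_probe(void)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct hypothetical_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *
 *		dev->netdev_ops = &hypothetical_netdev_ops;
 *		err = register_netdev(dev);
 *		if (err) {
 *			free_netdev(dev);
 *			return err;
 *		}
 *		return 0;
 *	}
 */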
4576 
4577 /*
4578  * netdev_wait_allrefs - wait until all references are gone.
4579  *
4580  * This is called when unregistering network devices.
4581  *
4582  * Any protocol or device that holds a reference should register
4583  * for netdevice notification, and cleanup and put back the
4584  * reference if they receive an UNREGISTER event.
4585  * We can get stuck here if buggy protocols don't correctly
4586  * call dev_put.
4587  */
4588 static void netdev_wait_allrefs(struct net_device *dev)
4589 {
4590 	unsigned long rebroadcast_time, warning_time;
4591 
4592 	rebroadcast_time = warning_time = jiffies;
4593 	while (atomic_read(&dev->refcnt) != 0) {
4594 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4595 			rtnl_lock();
4596 
4597 			/* Rebroadcast unregister notification */
4598 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4599 
4600 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4601 				     &dev->state)) {
4602 				/* We must not have linkwatch events
4603 				 * pending on unregister. If this
4604 				 * happens, we simply run the queue
4605 				 * unscheduled, resulting in a noop
4606 				 * for this device.
4607 				 */
4608 				linkwatch_run_queue();
4609 			}
4610 
4611 			__rtnl_unlock();
4612 
4613 			rebroadcast_time = jiffies;
4614 		}
4615 
4616 		msleep(250);
4617 
4618 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4619 			printk(KERN_EMERG "unregister_netdevice: "
4620 			       "waiting for %s to become free. Usage "
4621 			       "count = %d\n",
4622 			       dev->name, atomic_read(&dev->refcnt));
4623 			warning_time = jiffies;
4624 		}
4625 	}
4626 }
4627 
4628 /* The sequence is:
4629  *
4630  *	rtnl_lock();
4631  *	...
4632  *	register_netdevice(x1);
4633  *	register_netdevice(x2);
4634  *	...
4635  *	unregister_netdevice(y1);
4636  *	unregister_netdevice(y2);
4637  *      ...
4638  *	rtnl_unlock();
4639  *	free_netdev(y1);
4640  *	free_netdev(y2);
4641  *
4642  * We are invoked by rtnl_unlock().
4643  * This allows us to deal with problems:
4644  * 1) We can delete sysfs objects which invoke hotplug
4645  *    without deadlocking with linkwatch via keventd.
4646  * 2) Since we run with the RTNL semaphore not held, we can sleep
4647  *    safely in order to wait for the netdev refcnt to drop to zero.
4648  *
4649  * We must not return until all unregister events added during
4650  * the interval the lock was held have been completed.
4651  */
4652 void netdev_run_todo(void)
4653 {
4654 	struct list_head list;
4655 
4656 	/* Snapshot list, allow later requests */
4657 	list_replace_init(&net_todo_list, &list);
4658 
4659 	__rtnl_unlock();
4660 
4661 	while (!list_empty(&list)) {
4662 		struct net_device *dev
4663 			= list_entry(list.next, struct net_device, todo_list);
4664 		list_del(&dev->todo_list);
4665 
4666 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4667 			printk(KERN_ERR "network todo '%s' but state %d\n",
4668 			       dev->name, dev->reg_state);
4669 			dump_stack();
4670 			continue;
4671 		}
4672 
4673 		dev->reg_state = NETREG_UNREGISTERED;
4674 
4675 		on_each_cpu(flush_backlog, dev, 1);
4676 
4677 		netdev_wait_allrefs(dev);
4678 
4679 		/* paranoia */
4680 		BUG_ON(atomic_read(&dev->refcnt));
4681 		WARN_ON(dev->ip_ptr);
4682 		WARN_ON(dev->ip6_ptr);
4683 		WARN_ON(dev->dn_ptr);
4684 
4685 		if (dev->destructor)
4686 			dev->destructor(dev);
4687 
4688 		/* Free network device */
4689 		kobject_put(&dev->dev.kobj);
4690 	}
4691 }
4692 
4693 /**
4694  *	dev_get_stats	- get network device statistics
4695  *	@dev: device to get statistics from
4696  *
4697  *	Get network statistics from device. The device driver may provide
4698  *	its own method by setting dev->netdev_ops->get_stats; otherwise
4699  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
4700  */
4701 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4702 {
4703 	const struct net_device_ops *ops = dev->netdev_ops;
4704 
4705 	if (ops->ndo_get_stats)
4706 		return ops->ndo_get_stats(dev);
4707 	else
4708 		return &dev->stats;
4709 }
4710 EXPORT_SYMBOL(dev_get_stats);
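
/*
 * Illustrative sketch (comment only, not built here): a driver that keeps
 * some counters in hardware can hook ->ndo_get_stats and fold them into
 * dev->stats before returning it; drivers without such a hook simply let
 * dev_get_stats() return &dev->stats directly.  The hypothetical_* names
 * are made up for illustration.
 *
 *	static struct net_device_stats *hypothetical_get_stats(struct net_device *dev)
 *	{
 *		struct hypothetical_priv *priv = netdev_priv(dev);
 *
 *		dev->stats.rx_missed_errors += hypothetical_read_rx_missed(priv);
 *		return &dev->stats;
 *	}
 */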
4711 
4712 static void netdev_init_one_queue(struct net_device *dev,
4713 				  struct netdev_queue *queue,
4714 				  void *_unused)
4715 {
4716 	queue->dev = dev;
4717 }
4718 
4719 static void netdev_init_queues(struct net_device *dev)
4720 {
4721 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4722 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4723 	spin_lock_init(&dev->tx_global_lock);
4724 }
4725 
4726 /**
4727  *	alloc_netdev_mq - allocate network device
4728  *	@sizeof_priv:	size of private data to allocate space for
4729  *	@name:		device name format string
4730  *	@setup:		callback to initialize device
4731  *	@queue_count:	the number of subqueues to allocate
4732  *
4733  *	Allocates a struct net_device with private data area for driver use
4734  *	and performs basic initialization.  Also allocates subqueue structs
4735  *	for each queue on the device at the end of the netdevice.
4736  */
4737 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4738 		void (*setup)(struct net_device *), unsigned int queue_count)
4739 {
4740 	struct netdev_queue *tx;
4741 	struct net_device *dev;
4742 	size_t alloc_size;
4743 	void *p;
4744 
4745 	BUG_ON(strlen(name) >= sizeof(dev->name));
4746 
4747 	alloc_size = sizeof(struct net_device);
4748 	if (sizeof_priv) {
4749 		/* ensure 32-byte alignment of private area */
4750 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4751 		alloc_size += sizeof_priv;
4752 	}
4753 	/* ensure 32-byte alignment of whole construct */
4754 	alloc_size += NETDEV_ALIGN_CONST;
4755 
4756 	p = kzalloc(alloc_size, GFP_KERNEL);
4757 	if (!p) {
4758 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4759 		return NULL;
4760 	}
4761 
4762 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4763 	if (!tx) {
4764 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4765 		       "tx qdiscs.\n");
4766 		kfree(p);
4767 		return NULL;
4768 	}
4769 
4770 	dev = (struct net_device *)
4771 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4772 	dev->padded = (char *)dev - (char *)p;
4773 	dev_net_set(dev, &init_net);
4774 
4775 	dev->_tx = tx;
4776 	dev->num_tx_queues = queue_count;
4777 	dev->real_num_tx_queues = queue_count;
4778 
4779 	dev->gso_max_size = GSO_MAX_SIZE;
4780 
4781 	netdev_init_queues(dev);
4782 
4783 	INIT_LIST_HEAD(&dev->napi_list);
4784 	setup(dev);
4785 	strcpy(dev->name, name);
4786 	return dev;
4787 }
4788 EXPORT_SYMBOL(alloc_netdev_mq);
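
/*
 * Illustrative sketch (comment only, not built here): allocating an
 * Ethernet-style device with four transmit queues.  "hq%d" and the
 * hypothetical_* names are made up for illustration; the format string is
 * expanded later by register_netdev()/dev_alloc_name().
 *
 *	struct net_device *dev;
 *	struct hypothetical_priv *priv;
 *
 *	dev = alloc_netdev_mq(sizeof(struct hypothetical_priv), "hq%d",
 *			      ether_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 */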
4789 
4790 /**
4791  *	free_netdev - free network device
4792  *	@dev: device
4793  *
4794  *	This function does the last stage of destroying an allocated device
4795  * 	interface. The reference to the device object is released.
4796  *	If this is the last reference then it will be freed.
4797  */
4798 void free_netdev(struct net_device *dev)
4799 {
4800 	struct napi_struct *p, *n;
4801 
4802 	release_net(dev_net(dev));
4803 
4804 	kfree(dev->_tx);
4805 
4806 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4807 		netif_napi_del(p);
4808 
4809 	/*  Compatibility with error handling in drivers */
4810 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4811 		kfree((char *)dev - dev->padded);
4812 		return;
4813 	}
4814 
4815 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4816 	dev->reg_state = NETREG_RELEASED;
4817 
4818 	/* will free via device release */
4819 	put_device(&dev->dev);
4820 }
4821 
4822 /**
4823  *	synchronize_net -  Synchronize with packet receive processing
4824  *
4825  *	Wait for packets currently being received to be done.
4826  *	Does not block later packets from starting.
4827  */
4828 void synchronize_net(void)
4829 {
4830 	might_sleep();
4831 	synchronize_rcu();
4832 }
4833 
4834 /**
4835  *	unregister_netdevice - remove device from the kernel
4836  *	@dev: device
4837  *
4838  *	This function shuts down a device interface and removes it
4839  *	from the kernel tables.
4840  *
4841  *	Callers must hold the rtnl semaphore.  You may want
4842  *	unregister_netdev() instead of this.
4843  */
4844 
4845 void unregister_netdevice(struct net_device *dev)
4846 {
4847 	ASSERT_RTNL();
4848 
4849 	rollback_registered(dev);
4850 	/* Finish processing unregister after unlock */
4851 	net_set_todo(dev);
4852 }
4853 
4854 /**
4855  *	unregister_netdev - remove device from the kernel
4856  *	@dev: device
4857  *
4858  *	This function shuts down a device interface and removes it
4859  *	from the kernel tables.
4860  *
4861  *	This is just a wrapper for unregister_netdevice that takes
4862  *	the rtnl semaphore.  In general you want to use this and not
4863  *	unregister_netdevice.
4864  */
4865 void unregister_netdev(struct net_device *dev)
4866 {
4867 	rtnl_lock();
4868 	unregister_netdevice(dev);
4869 	rtnl_unlock();
4870 }
4871 
4872 EXPORT_SYMBOL(unregister_netdev);
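
/*
 * Illustrative sketch (comment only, not built here): the matching teardown
 * for the probe pattern shown after register_netdev() above.  The
 * hypothetical_* name is made up for illustration.
 *
 *	static void hypothetical_remove(struct net_device *dev)
 *	{
 *		unregister_netdev(dev);
 *		free_netdev(dev);
 *	}
 */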
4873 
4874 /**
4875  *	dev_change_net_namespace - move device to a different network namespace
4876  *	@dev: device
4877  *	@net: network namespace
4878  *	@pat: If not NULL name pattern to try if the current device name
4879  *	      is already taken in the destination network namespace.
4880  *
4881  *	This function shuts down a device interface and moves it
4882  *	to a new network namespace. On success 0 is returned, on
4883  *	a failure a negative errno code is returned.
4884  *
4885  *	Callers must hold the rtnl semaphore.
4886  */
4887 
4888 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4889 {
4890 	char buf[IFNAMSIZ];
4891 	const char *destname;
4892 	int err;
4893 
4894 	ASSERT_RTNL();
4895 
4896 	/* Don't allow namespace local devices to be moved. */
4897 	err = -EINVAL;
4898 	if (dev->features & NETIF_F_NETNS_LOCAL)
4899 		goto out;
4900 
4901 #ifdef CONFIG_SYSFS
4902 	/* Don't allow real devices to be moved when sysfs
4903 	 * is enabled.
4904 	 */
4905 	err = -EINVAL;
4906 	if (dev->dev.parent)
4907 		goto out;
4908 #endif
4909 
4910 	/* Ensure the device has been registered */
4911 	err = -EINVAL;
4912 	if (dev->reg_state != NETREG_REGISTERED)
4913 		goto out;
4914 
4915 	/* Get out if there is nothing to do */
4916 	err = 0;
4917 	if (net_eq(dev_net(dev), net))
4918 		goto out;
4919 
4920 	/* Pick the destination device name, and ensure
4921 	 * we can use it in the destination network namespace.
4922 	 */
4923 	err = -EEXIST;
4924 	destname = dev->name;
4925 	if (__dev_get_by_name(net, destname)) {
4926 		/* We get here if we can't use the current device name */
4927 		if (!pat)
4928 			goto out;
4929 		if (!dev_valid_name(pat))
4930 			goto out;
4931 		if (strchr(pat, '%')) {
4932 			if (__dev_alloc_name(net, pat, buf) < 0)
4933 				goto out;
4934 			destname = buf;
4935 		} else
4936 			destname = pat;
4937 		if (__dev_get_by_name(net, destname))
4938 			goto out;
4939 	}
4940 
4941 	/*
4942 	 * And now a mini version of register_netdevice and unregister_netdevice.
4943 	 */
4944 
4945 	/* If device is running close it first. */
4946 	dev_close(dev);
4947 
4948 	/* And unlink it from device chain */
4949 	err = -ENODEV;
4950 	unlist_netdevice(dev);
4951 
4952 	synchronize_net();
4953 
4954 	/* Shutdown queueing discipline. */
4955 	dev_shutdown(dev);
4956 
4957 	/* Notify protocols, that we are about to destroy
4958 	   this device. They should clean all the things.
4959 	*/
4960 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4961 
4962 	/*
4963 	 *	Flush the unicast and multicast chains
4964 	 */
4965 	dev_addr_discard(dev);
4966 
4967 	netdev_unregister_kobject(dev);
4968 
4969 	/* Actually switch the network namespace */
4970 	dev_net_set(dev, net);
4971 
4972 	/* Assign the new device name */
4973 	if (destname != dev->name)
4974 		strcpy(dev->name, destname);
4975 
4976 	/* If there is an ifindex conflict assign a new one */
4977 	if (__dev_get_by_index(net, dev->ifindex)) {
4978 		int iflink = (dev->iflink == dev->ifindex);
4979 		dev->ifindex = dev_new_index(net);
4980 		if (iflink)
4981 			dev->iflink = dev->ifindex;
4982 	}
4983 
4984 	/* Fixup kobjects */
4985 	err = netdev_register_kobject(dev);
4986 	WARN_ON(err);
4987 
4988 	/* Add the device back in the hashes */
4989 	list_netdevice(dev);
4990 
4991 	/* Notify protocols, that a new device appeared. */
4992 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4993 
4994 	synchronize_net();
4995 	err = 0;
4996 out:
4997 	return err;
4998 }
4999 
5000 static int dev_cpu_callback(struct notifier_block *nfb,
5001 			    unsigned long action,
5002 			    void *ocpu)
5003 {
5004 	struct sk_buff **list_skb;
5005 	struct Qdisc **list_net;
5006 	struct sk_buff *skb;
5007 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5008 	struct softnet_data *sd, *oldsd;
5009 
5010 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5011 		return NOTIFY_OK;
5012 
5013 	local_irq_disable();
5014 	cpu = smp_processor_id();
5015 	sd = &per_cpu(softnet_data, cpu);
5016 	oldsd = &per_cpu(softnet_data, oldcpu);
5017 
5018 	/* Find end of our completion_queue. */
5019 	list_skb = &sd->completion_queue;
5020 	while (*list_skb)
5021 		list_skb = &(*list_skb)->next;
5022 	/* Append completion queue from offline CPU. */
5023 	*list_skb = oldsd->completion_queue;
5024 	oldsd->completion_queue = NULL;
5025 
5026 	/* Find end of our output_queue. */
5027 	list_net = &sd->output_queue;
5028 	while (*list_net)
5029 		list_net = &(*list_net)->next_sched;
5030 	/* Append output queue from offline CPU. */
5031 	*list_net = oldsd->output_queue;
5032 	oldsd->output_queue = NULL;
5033 
5034 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5035 	local_irq_enable();
5036 
5037 	/* Process offline CPU's input_pkt_queue */
5038 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5039 		netif_rx(skb);
5040 
5041 	return NOTIFY_OK;
5042 }
5043 
5044 
5045 /**
5046  *	netdev_increment_features - increment feature set by one
5047  *	@all: current feature set
5048  *	@one: new feature set
5049  *	@mask: mask feature set
5050  *
5051  *	Computes a new feature set after adding a device with feature set
5052  *	@one to the master device with current feature set @all.  Will not
5053  *	enable anything that is off in @mask. Returns the new feature set.
5054  */
5055 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5056 					unsigned long mask)
5057 {
5058 	/* If device needs checksumming, downgrade to it. */
5059 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5060 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5061 	else if (mask & NETIF_F_ALL_CSUM) {
5062 		/* If one device supports v4/v6 checksumming, set for all. */
5063 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5064 		    !(all & NETIF_F_GEN_CSUM)) {
5065 			all &= ~NETIF_F_ALL_CSUM;
5066 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5067 		}
5068 
5069 		/* If one device supports hw checksumming, set for all. */
5070 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5071 			all &= ~NETIF_F_ALL_CSUM;
5072 			all |= NETIF_F_HW_CSUM;
5073 		}
5074 	}
5075 
5076 	one |= NETIF_F_ALL_CSUM;
5077 
5078 	one |= all & NETIF_F_ONE_FOR_ALL;
5079 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5080 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5081 
5082 	return all;
5083 }
5084 EXPORT_SYMBOL(netdev_increment_features);
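
/*
 * Illustrative sketch (comment only, not built here): a master device such
 * as a bridge or bond recomputes its feature set by folding each slave's
 * features in with netdev_increment_features() and then sanitizing the
 * result with netdev_fix_features().  The slave list iteration below is
 * pseudo-code for whatever list the hypothetical master keeps.
 *
 *	unsigned long features, mask;
 *
 *	features = mask = master->features;
 *	features &= ~NETIF_F_ONE_FOR_ALL;
 *	list_for_each_entry(slave, &hypothetical_slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features, mask);
 *	master->features = netdev_fix_features(features, master->name);
 */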
5085 
5086 static struct hlist_head *netdev_create_hash(void)
5087 {
5088 	int i;
5089 	struct hlist_head *hash;
5090 
5091 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5092 	if (hash != NULL)
5093 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5094 			INIT_HLIST_HEAD(&hash[i]);
5095 
5096 	return hash;
5097 }
5098 
5099 /* Initialize per network namespace state */
5100 static int __net_init netdev_init(struct net *net)
5101 {
5102 	INIT_LIST_HEAD(&net->dev_base_head);
5103 
5104 	net->dev_name_head = netdev_create_hash();
5105 	if (net->dev_name_head == NULL)
5106 		goto err_name;
5107 
5108 	net->dev_index_head = netdev_create_hash();
5109 	if (net->dev_index_head == NULL)
5110 		goto err_idx;
5111 
5112 	return 0;
5113 
5114 err_idx:
5115 	kfree(net->dev_name_head);
5116 err_name:
5117 	return -ENOMEM;
5118 }
5119 
5120 /**
5121  *	netdev_drivername - network driver for the device
5122  *	@dev: network device
5123  *	@buffer: buffer for resulting name
5124  *	@len: size of buffer
5125  *
5126  *	Determine network driver for device.
5127  */
5128 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5129 {
5130 	const struct device_driver *driver;
5131 	const struct device *parent;
5132 
5133 	if (len <= 0 || !buffer)
5134 		return buffer;
5135 	buffer[0] = 0;
5136 
5137 	parent = dev->dev.parent;
5138 
5139 	if (!parent)
5140 		return buffer;
5141 
5142 	driver = parent->driver;
5143 	if (driver && driver->name)
5144 		strlcpy(buffer, driver->name, len);
5145 	return buffer;
5146 }
5147 
5148 static void __net_exit netdev_exit(struct net *net)
5149 {
5150 	kfree(net->dev_name_head);
5151 	kfree(net->dev_index_head);
5152 }
5153 
5154 static struct pernet_operations __net_initdata netdev_net_ops = {
5155 	.init = netdev_init,
5156 	.exit = netdev_exit,
5157 };
5158 
5159 static void __net_exit default_device_exit(struct net *net)
5160 {
5161 	struct net_device *dev;
5162 	/*
5163 	 * Push all migratable network devices back to the
5164 	 * initial network namespace
5165 	 */
5166 	rtnl_lock();
5167 restart:
5168 	for_each_netdev(net, dev) {
5169 		int err;
5170 		char fb_name[IFNAMSIZ];
5171 
5172 		/* Ignore unmoveable devices (i.e. loopback) */
5173 		if (dev->features & NETIF_F_NETNS_LOCAL)
5174 			continue;
5175 
5176 		/* Delete virtual devices */
5177 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5178 			dev->rtnl_link_ops->dellink(dev);
5179 			goto restart;
5180 		}
5181 
5182 		/* Push remaining network devices to init_net */
5183 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5184 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5185 		if (err) {
5186 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5187 				__func__, dev->name, err);
5188 			BUG();
5189 		}
5190 		goto restart;
5191 	}
5192 	rtnl_unlock();
5193 }
5194 
5195 static struct pernet_operations __net_initdata default_device_ops = {
5196 	.exit = default_device_exit,
5197 };
5198 
5199 /*
5200  *	Initialize the DEV module. At boot time this walks the device list and
5201  *	unhooks any devices that fail to initialise (normally hardware not
5202  *	present) and leaves us with a valid list of present and active devices.
5203  *
5204  */
5205 
5206 /*
5207  *       This is called single threaded during boot, so no need
5208  *       to take the rtnl semaphore.
5209  */
5210 static int __init net_dev_init(void)
5211 {
5212 	int i, rc = -ENOMEM;
5213 
5214 	BUG_ON(!dev_boot_phase);
5215 
5216 	if (dev_proc_init())
5217 		goto out;
5218 
5219 	if (netdev_kobject_init())
5220 		goto out;
5221 
5222 	INIT_LIST_HEAD(&ptype_all);
5223 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5224 		INIT_LIST_HEAD(&ptype_base[i]);
5225 
5226 	if (register_pernet_subsys(&netdev_net_ops))
5227 		goto out;
5228 
5229 	/*
5230 	 *	Initialise the packet receive queues.
5231 	 */
5232 
5233 	for_each_possible_cpu(i) {
5234 		struct softnet_data *queue;
5235 
5236 		queue = &per_cpu(softnet_data, i);
5237 		skb_queue_head_init(&queue->input_pkt_queue);
5238 		queue->completion_queue = NULL;
5239 		INIT_LIST_HEAD(&queue->poll_list);
5240 
5241 		queue->backlog.poll = process_backlog;
5242 		queue->backlog.weight = weight_p;
5243 		queue->backlog.gro_list = NULL;
5244 		queue->backlog.gro_count = 0;
5245 	}
5246 
5247 	dev_boot_phase = 0;
5248 
5249 	/* The loopback device is special: if any other network device
5250 	 * is present in a network namespace, the loopback device must
5251 	 * be present too. Since we now dynamically allocate and free the
5252 	 * loopback device, ensure this invariant is maintained by
5253 	 * keeping the loopback device the first device on the
5254 	 * list of network devices, so that it is the first device
5255 	 * that appears and the last network device
5256 	 * that disappears.
5257 	 */
5258 	if (register_pernet_device(&loopback_net_ops))
5259 		goto out;
5260 
5261 	if (register_pernet_device(&default_device_ops))
5262 		goto out;
5263 
5264 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5265 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5266 
5267 	hotcpu_notifier(dev_cpu_callback, 0);
5268 	dst_init();
5269 	dev_mcast_init();
5270 	rc = 0;
5271 out:
5272 	return rc;
5273 }
5274 
5275 subsys_initcall(net_dev_init);
5276 
5277 static int __init initialize_hashrnd(void)
5278 {
5279 	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5280 	return 0;
5281 }
5282 
5283 late_initcall_sync(initialize_hashrnd);
5284 
5285 EXPORT_SYMBOL(__dev_get_by_index);
5286 EXPORT_SYMBOL(__dev_get_by_name);
5287 EXPORT_SYMBOL(__dev_remove_pack);
5288 EXPORT_SYMBOL(dev_valid_name);
5289 EXPORT_SYMBOL(dev_add_pack);
5290 EXPORT_SYMBOL(dev_alloc_name);
5291 EXPORT_SYMBOL(dev_close);
5292 EXPORT_SYMBOL(dev_get_by_flags);
5293 EXPORT_SYMBOL(dev_get_by_index);
5294 EXPORT_SYMBOL(dev_get_by_name);
5295 EXPORT_SYMBOL(dev_open);
5296 EXPORT_SYMBOL(dev_queue_xmit);
5297 EXPORT_SYMBOL(dev_remove_pack);
5298 EXPORT_SYMBOL(dev_set_allmulti);
5299 EXPORT_SYMBOL(dev_set_promiscuity);
5300 EXPORT_SYMBOL(dev_change_flags);
5301 EXPORT_SYMBOL(dev_set_mtu);
5302 EXPORT_SYMBOL(dev_set_mac_address);
5303 EXPORT_SYMBOL(free_netdev);
5304 EXPORT_SYMBOL(netdev_boot_setup_check);
5305 EXPORT_SYMBOL(netdev_set_master);
5306 EXPORT_SYMBOL(netdev_state_change);
5307 EXPORT_SYMBOL(netif_receive_skb);
5308 EXPORT_SYMBOL(netif_rx);
5309 EXPORT_SYMBOL(register_gifconf);
5310 EXPORT_SYMBOL(register_netdevice);
5311 EXPORT_SYMBOL(register_netdevice_notifier);
5312 EXPORT_SYMBOL(skb_checksum_help);
5313 EXPORT_SYMBOL(synchronize_net);
5314 EXPORT_SYMBOL(unregister_netdevice);
5315 EXPORT_SYMBOL(unregister_netdevice_notifier);
5316 EXPORT_SYMBOL(net_enable_timestamp);
5317 EXPORT_SYMBOL(net_disable_timestamp);
5318 EXPORT_SYMBOL(dev_get_flags);
5319 
5320 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5321 EXPORT_SYMBOL(br_handle_frame_hook);
5322 EXPORT_SYMBOL(br_fdb_get_hook);
5323 EXPORT_SYMBOL(br_fdb_put_hook);
5324 #endif
5325 
5326 EXPORT_SYMBOL(dev_load);
5327 
5328 EXPORT_PER_CPU_SYMBOL(softnet_data);
5329